diff --git a/build.sh b/build.sh index bfb679b0..6f605797 100755 --- a/build.sh +++ b/build.sh @@ -3,7 +3,7 @@ cflags+="-Wall -O3 -g -std=gnu11 -fno-strict-aliasing -Isrc -Ilib/font_renderer" cflags+=" $(pkg-config --cflags lua5.2) $(sdl2-config --cflags)" lflags="-static-libgcc -static-libstdc++" -for package in libagg freetype2 lua5.2 x11; do +for package in libagg freetype2 lua5.2 x11 libpcre2-8; do lflags+=" $(pkg-config --libs $package)" done lflags+=" $(sdl2-config --libs) -lm" diff --git a/data/core/commands/findreplace.lua b/data/core/commands/findreplace.lua index 937c410a..af60f33f 100644 --- a/data/core/commands/findreplace.lua +++ b/data/core/commands/findreplace.lua @@ -90,6 +90,7 @@ local function has_selection() and core.active_view.doc:has_selection() end + command.add(has_selection, { ["find-replace:select-next"] = function() local l1, c1, l2, c2 = doc():get_selection(true) @@ -107,9 +108,9 @@ command.add("core.docview", { end) end, - ["find-replace:find-pattern"] = function() - find("Find Text Pattern", function(doc, line, col, text) - local opt = { wrap = true, no_case = true, pattern = true } + ["find-replace:find-regex"] = function() + find("Find Text Regex", function(doc, line, col, text) + local opt = { wrap = true, no_case = true, regex = true } return search.find(doc, line, col, text, opt) end) end, @@ -144,9 +145,10 @@ command.add("core.docview", { end) end, - ["find-replace:replace-pattern"] = function() - replace("Pattern", "", function(text, old, new) - return text:gsub(old, new) + ["find-replace:replace-regex"] = function() + replace("Regex", "", function(text, old, new) + local re = regex.compile(old) + return regex.gsub(re, text, new) end) end, diff --git a/data/core/doc/search.lua b/data/core/doc/search.lua index fe57523e..04090673 100644 --- a/data/core/doc/search.lua +++ b/data/core/doc/search.lua @@ -15,12 +15,8 @@ local function init_args(doc, line, col, text, opt) opt = opt or default_opt line, col = doc:sanitize_position(line, 
-- Searches `doc` forward from (line, col) for `text`.
--
-- Returns line1, col1, line2, col2 for the first match, or nothing when the
-- text is not found.
--
-- Options (all optional fields of `opt`):
--   regex   -- treat `text` as a regular expression; it is compiled once per
--              call, with the "i" flag when `no_case` is set
--   no_case -- for plain searches, compare against the lower-cased line text
--   wrap    -- when nothing is found, search once more from the top
function search.find(doc, line, col, text, opt)
  doc, line, col, text, opt = init_args(doc, line, col, text, opt)

  local compiled
  if opt.regex then
    compiled = regex.compile(text, opt.no_case and "i" or "")
  end

  local from = col
  for row = line, #doc.lines do
    local haystack = doc.lines[row]
    if opt.regex then
      local first, past = compiled:cmatch(haystack, from)
      if first then
        return row, first, row, past
      end
    else
      if opt.no_case then
        haystack = haystack:lower()
      end
      -- Fourth argument `true` requests a plain (non-pattern) search.
      local first, last = haystack:find(text, from, true)
      if first then
        return row, first, row, last + 1
      end
    end
    -- Only the first line honours the starting column.
    from = 1
  end

  if opt.wrap then
    -- The retry options omit `wrap`, so the search restarts at most once.
    local retry_opt = { no_case = opt.no_case, regex = opt.regex }
    return search.find(doc, 1, 1, text, retry_opt)
  end
end
-- `regex` is expected to be the global table registered by the native regex
-- module; compiled patterns carry it as their metatable. Resolving missing
-- keys through it lets callers write pattern:gsub(str) in addition to
-- regex.gsub(pattern, str).
regex.__index = function(_, key) return regex[key] end

-- Matches a pattern against `string`.
--
-- pattern_string: pattern source, or an already compiled pattern (a table).
-- offset:         1-based byte index to start matching at (default 1).
-- options:        match option bits such as regex.ANCHORED (default 0).
--
-- Returns whatever regex.cmatch returns: pairs of indices for the whole match
-- followed by each capture group, or nothing when there is no match.
regex.match = function(pattern_string, string, offset, options)
  local pattern = type(pattern_string) == "table" and
    pattern_string or regex.compile(pattern_string)
  -- cmatch type-checks its numeric arguments and rejects nil, so the
  -- defaults have to be filled in here.
  return regex.cmatch(pattern, string, offset or 1, options or 0)
end

-- Expands \1 .. \9 in `template` using the capture indices of one match.
-- `indices` is the list returned by regex.cmatch: start / one-past-end pairs,
-- whole match first. \0, references to groups the pattern does not have, and
-- groups that did not take part in the match all expand to nothing.
local function expand_replacement(template, str, indices)
  -- A function replacement is used on purpose: its result is inserted
  -- verbatim, so "%" or "\N" inside captured text is never reinterpreted.
  return (template:gsub("\\(%d)", function(digit)
    local group = tonumber(digit)
    local first, past = indices[group * 2 + 1], indices[group * 2 + 2]
    if group == 0 or not first or first < 1 then
      return ""
    end
    return str:sub(first, past - 1)
  end))
end

-- Replaces every match of a pattern in `str`.
--
-- pattern_string: pattern source, or an already compiled pattern (a table).
-- str:            subject text.
-- replacement:    replacement text; \1 .. \9 insert the matching capture
--                 group, any other \digit is removed.
--
-- Returns the rewritten string and the number of replacements made.
--
-- Matching always runs against the full subject with a moving start offset
-- (never against a sliced remainder), so anchors and lookbehind see the real
-- surrounding text. The offsets reported by the matcher are used as-is.
--
-- As before, replacement stops after the first empty match; the rest of the
-- subject is appended unchanged.
regex.gsub = function(pattern_string, str, replacement)
  local pattern = type(pattern_string) == "table" and
    pattern_string or regex.compile(pattern_string)
  local pieces, n, offset = {}, 0, 1
  while true do
    local indices = { regex.cmatch(pattern, str, offset) }
    local first, past = indices[1], indices[2]
    if not first then break end
    n = n + 1
    -- Untouched text between the previous match and this one.
    pieces[#pieces + 1] = str:sub(offset, first - 1)
    pieces[#pieces + 1] = expand_replacement(replacement, str, indices)
    offset = past
    if first == past then break end
  end
  pieces[#pieces + 1] = str:sub(offset)
  return table.concat(pieces), n
end
+ if count % 2 == 0 then break end + end + until not res[1] or not close or not target[3] + return unpack(res) end while i <= #text do -- continue trying to match the end pattern of a pair if we have a state set if current_pattern_idx > 0 then local p = current_syntax.patterns[current_pattern_idx] - local s, e = find_non_escaped(text, p.pattern[2], i, p.pattern[3]) + local s, e = find_text(text, p, i, false, true) local cont = true -- If we're in subsyntax mode, always check to see if we end our syntax -- first, before the found delimeter, as ending the subsyntax takes -- precedence over ending the delimiter in the subsyntax. if subsyntax_info then - local ss, se = find_non_escaped( - text, - subsyntax_info.pattern[2], - i, - subsyntax_info.pattern[3] - ) + local ss, se = find_text(text, subsyntax_info, i, false, true) -- If we find that we end the subsyntax before the -- delimiter, push the token, and signal we shouldn't -- treat the bit after as a token to be normally parsed @@ -202,12 +197,7 @@ function tokenizer.tokenize(incoming_syntax, text, state) -- we're ending early in the middle of a delimiter, or -- just normally, upon finding a token. if subsyntax_info then - local s, e = find_non_escaped( - text, - "^" .. subsyntax_info.pattern[2], - i, - nil - ) + local s, e = find_text(text, subsyntax_info, i, true, true) if s then push_token(res, subsyntax_info.type, text:sub(i, e)) -- On finding unescaped delimiter, pop it. @@ -219,16 +209,12 @@ function tokenizer.tokenize(incoming_syntax, text, state) -- find matching pattern local matched = false for n, p in ipairs(current_syntax.patterns) do - local pattern = (type(p.pattern) == "table") and p.pattern[1] or p.pattern - local find_results = { text:find("^" .. 
pattern, i) } - local start, fin = find_results[1], find_results[2] - - if start then + local find_results = { find_text(text, p, i, true, false) } + if find_results[1] then -- matched pattern; make and add tokens push_tokens(res, current_syntax, p, text, find_results) - -- update state if this was a start|end pattern pair - if type(p.pattern) == "table" then + if type(p.pattern or p.regex) == "table" then -- If we have a subsyntax, push that onto the subsyntax stack. if p.syntax then push_subsyntax(p, n) @@ -236,9 +222,8 @@ function tokenizer.tokenize(incoming_syntax, text, state) set_subsyntax_pattern_idx(n) end end - -- move cursor past this token - i = fin + 1 + i = find_results[2] + 1 matched = true break end diff --git a/data/plugins/projectsearch.lua b/data/plugins/projectsearch.lua index 69a27094..45399ed0 100644 --- a/data/plugins/projectsearch.lua +++ b/data/plugins/projectsearch.lua @@ -237,9 +237,12 @@ command.add(nil, { end) end, - ["project-search:find-pattern"] = function() - core.command_view:enter("Find Pattern In Project", function(text) - begin_search(text, function(line_text) return line_text:find(text) end) + ["project-search:find-regex"] = function() + core.command_view:enter("Find Regex In Project", function(text) + local re = regex.compile(text, "i") + begin_search(text, function(line_text) + return regex.cmatch(re, line_text) + end) end) end, diff --git a/meson.build b/meson.build index 9b1ab84d..5a218df7 100644 --- a/meson.build +++ b/meson.build @@ -9,6 +9,7 @@ libm = cc.find_library('m', required : false) libdl = cc.find_library('dl', required : false) libx11 = dependency('x11', required : false) lua_dep = dependency('lua5.2', required : false) +pcre2_dep = dependency('libpcre2-8') if not lua_dep.found() lua_subproject = subproject('lua', default_options: ['shared=false', 'use_readline=false', 'app=false']) diff --git a/src/api/api.c b/src/api/api.c index 34067a9c..5ea2e782 100644 --- a/src/api/api.c +++ b/src/api/api.c @@ -3,11 +3,13 @@ 
int luaopen_system(lua_State *L); int luaopen_renderer(lua_State *L); +int luaopen_regex(lua_State *L); static const luaL_Reg libs[] = { { "system", luaopen_system }, { "renderer", luaopen_renderer }, + { "regex", luaopen_regex }, { NULL, NULL } }; diff --git a/src/api/regex.c b/src/api/regex.c new file mode 100644 index 00000000..5245d8c2 --- /dev/null +++ b/src/api/regex.c @@ -0,0 +1,115 @@ +#include "api.h" + +#define PCRE2_CODE_UNIT_WIDTH 8 + +#include +#include + +static int f_pcre_gc(lua_State* L) { + lua_rawgeti(L, -1, 1); + pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1); + if (re) + pcre2_code_free(re); + return 0; +} + +static int f_pcre_compile(lua_State *L) { + size_t len; + PCRE2_SIZE errorOffset; + int errorNumber; + int pattern = PCRE2_UTF; + const char* str = luaL_checklstring(L, 1, &len); + if (lua_gettop(L) > 1) { + const char* options = luaL_checkstring(L, 2); + if (strstr(options,"i")) + pattern |= PCRE2_CASELESS; + if (strstr(options,"m")) + pattern |= PCRE2_MULTILINE; + if (strstr(options,"s")) + pattern |= PCRE2_DOTALL; + } + pcre2_code* re = pcre2_compile( + (PCRE2_SPTR)str, + len, + pattern, + &errorNumber, + &errorOffset, + NULL + ); + if (re) { + lua_newtable(L); + lua_pushlightuserdata(L, re); + lua_rawseti(L, -2, 1); + luaL_setmetatable(L, "regex"); + return 1; + } + PCRE2_UCHAR buffer[256]; + pcre2_get_error_message(errorNumber, buffer, sizeof(buffer)); + luaL_error(L, "regex compilation failed at offset %d: %s", + (int)errorOffset, buffer); + return 0; +} + +// Takes string, compiled regex, returns list of indices of matched groups +// (including the whole match), if a match was found. 
+static int f_pcre_match(lua_State *L) { + size_t len, offset = 1, opts = 0; + luaL_checktype(L, 1, LUA_TTABLE); + const char* str = luaL_checklstring(L, 2, &len); + if (lua_gettop(L) > 2) + offset = luaL_checknumber(L, 3); + if (lua_gettop(L) > 3) + opts = luaL_checknumber(L, 4); + lua_rawgeti(L, 1, 1); + pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1); + pcre2_match_data* md = pcre2_match_data_create_from_pattern(re, NULL); + int rc = pcre2_match(re, (PCRE2_SPTR)str, len, offset - 1, opts, md, NULL); + if (rc < 0) { + pcre2_match_data_free(md); + if (rc != PCRE2_ERROR_NOMATCH) + luaL_error(L, "regex matching error %d", rc); + return 0; + } + PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(md); + if (ovector[0] > ovector[1]) { + /* We must guard against patterns such as /(?=.\K)/ that use \K in an + assertion to set the start of a match later than its end. In the editor, + we just detect this case and give up. */ + luaL_error(L, "regex matching error: \\K was used in an assertion to " + " set the match start after its end"); + pcre2_match_data_free(md); + return 0; + } + for (int i = 0; i < rc*2; i++) + lua_pushnumber(L, ovector[i]+1); + pcre2_match_data_free(md); + return rc*2; +} + +static const luaL_Reg lib[] = { + { "compile", f_pcre_compile }, + { "cmatch", f_pcre_match }, + { "__gc", f_pcre_gc }, + { NULL, NULL } +}; + +int luaopen_regex(lua_State *L) { + luaL_newlib(L, lib); + lua_pushliteral(L, "regex"); + lua_setfield(L, -2, "__name"); + lua_pushvalue(L, -1); + lua_setfield(L, LUA_REGISTRYINDEX, "regex"); + lua_pushnumber(L, PCRE2_ANCHORED); + lua_setfield(L, -2, "ANCHORED"); + lua_pushnumber(L, PCRE2_ANCHORED) ; + lua_setfield(L, -2, "ENDANCHORED"); + lua_pushnumber(L, PCRE2_NOTBOL); + lua_setfield(L, -2, "NOTBOL"); + lua_pushnumber(L, PCRE2_NOTEOL); + lua_setfield(L, -2, "NOTEOL"); + lua_pushnumber(L, PCRE2_NOTEMPTY); + lua_setfield(L, -2, "NOTEMPTY"); + lua_pushnumber(L, PCRE2_NOTEMPTY_ATSTART); + lua_setfield(L, -2, "NOTEMPTY_ATSTART"); + return 
1; +} diff --git a/src/meson.build b/src/meson.build index 881014be..faa1a8ea 100644 --- a/src/meson.build +++ b/src/meson.build @@ -3,6 +3,7 @@ lite_sources = [ 'api/cp_replace.c', 'api/renderer.c', 'api/renderer_font.c', + 'api/regex.c', 'api/system.c', 'renderer.c', 'renwindow.c', @@ -18,7 +19,7 @@ endif executable('lite', lite_sources + lite_rc, include_directories: [lite_include, font_renderer_include], - dependencies: [lua_dep, sdl_dep, libm, libdl, libx11], + dependencies: [lua_dep, sdl_dep, pcre2_dep, libm, libdl, libx11], c_args: lite_cargs, link_with: libfontrenderer, link_args: lite_link_args,