Add PCRE to support regular expressions

Use regular expressions instead of Lua patterns for find and replace editor commands.

Syntax files can now use regex, or Lua patterns as before, keeping backward compatibility for plugins.
Author: Adam, 2021-06-02 15:27:00 -04:00 (committed by GitHub)
parent ea5e9b0ce5
commit 248d70a8ca
11 changed files with 257 additions and 70 deletions
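For illustration of the syntax-file change described above, a hypothetical definition mixing both rule forms (the pattern/regex field names follow the tokenizer changes below; the file type and the rules themselves are made up):

local syntax = require "core.syntax"
syntax.add {
  files = "%.example$",
  patterns = {
    { pattern = "%-%-.-\n",        type = "comment" },  -- Lua pattern, works as before
    { regex = "-?\\d+",            type = "number"  },  -- new: a PCRE2 regex
    { regex = { '"', '"', '\\' },  type = "string"  },  -- regex start/end pair, '\' as escape
  },
  symbols = {},
}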


@@ -3,7 +3,7 @@
cflags+="-Wall -O3 -g -std=gnu11 -fno-strict-aliasing -Isrc -Ilib/font_renderer"
cflags+=" $(pkg-config --cflags lua5.2) $(sdl2-config --cflags)"
lflags="-static-libgcc -static-libstdc++"
for package in libagg freetype2 lua5.2 x11; do
for package in libagg freetype2 lua5.2 x11 libpcre2-8; do
lflags+=" $(pkg-config --libs $package)"
done
lflags+=" $(sdl2-config --libs) -lm"


@@ -90,6 +90,7 @@ local function has_selection()
and core.active_view.doc:has_selection()
end
command.add(has_selection, {
["find-replace:select-next"] = function()
local l1, c1, l2, c2 = doc():get_selection(true)
@@ -107,9 +108,9 @@ command.add("core.docview", {
end)
end,
["find-replace:find-pattern"] = function()
find("Find Text Pattern", function(doc, line, col, text)
local opt = { wrap = true, no_case = true, pattern = true }
["find-replace:find-regex"] = function()
find("Find Text Regex", function(doc, line, col, text)
local opt = { wrap = true, no_case = true, regex = true }
return search.find(doc, line, col, text, opt)
end)
end,
@@ -144,9 +145,10 @@ command.add("core.docview", {
end)
end,
["find-replace:replace-pattern"] = function()
replace("Pattern", "", function(text, old, new)
return text:gsub(old, new)
["find-replace:replace-regex"] = function()
replace("Regex", "", function(text, old, new)
local re = regex.compile(old)
return regex.gsub(re, text, new)
end)
end,


@@ -15,12 +15,8 @@ local function init_args(doc, line, col, text, opt)
opt = opt or default_opt
line, col = doc:sanitize_position(line, col)
if opt.no_case then
if opt.pattern then
text = text:gsub("%%?.", pattern_lower)
else
text = text:lower()
end
if opt.no_case and not opt.regex then
text = text:lower()
end
return doc, line, col, text, opt
@@ -30,20 +26,32 @@ end
function search.find(doc, line, col, text, opt)
doc, line, col, text, opt = init_args(doc, line, col, text, opt)
local re
if opt.regex then
re = regex.compile(text, opt.no_case and "i" or "")
end
for line = line, #doc.lines do
local line_text = doc.lines[line]
if opt.no_case then
line_text = line_text:lower()
if opt.regex then
local s, e = re:cmatch(line_text, col)
if s then
return line, s, line, e
end
col = 1
else
if opt.no_case then
line_text = line_text:lower()
end
local s, e = line_text:find(text, col, true)
if s then
return line, s, line, e + 1
end
col = 1
end
local s, e = line_text:find(text, col, not opt.pattern)
if s then
return line, s, line, e + 1
end
col = 1
end
if opt.wrap then
opt = { no_case = opt.no_case, pattern = opt.pattern }
opt = { no_case = opt.no_case, regex = opt.regex }
return search.find(doc, 1, 1, text, opt)
end
end
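A note on the index conventions the two branches above reconcile (a sketch; offsets assume the cmatch binding added in src/api/regex.c): regex matches come back 1-based with an exclusive end, while string.find returns an inclusive end, which is why only the plain branch adds 1.

local re = regex.compile("wor\\w+")
print(re:cmatch("hello world"))                  --> 7  12  (end is one past the match)
print(("hello world"):find("world", 1, true))    --> 7  11  (inclusive end, hence the e + 1)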


@@ -1,4 +1,5 @@
require "core.strict"
require "core.regex"
local common = require "core.common"
local config = require "core.config"
local style = require "core.style"

data/core/regex.lua (new file, 69 lines)

@@ -0,0 +1,69 @@
-- So that in addition to regex.gsub(pattern, string), we can also do
-- pattern:gsub(string).
regex.__index = function(table, key) return regex[key]; end
regex.match = function(pattern_string, string, offset, options)
local pattern = type(pattern_string) == "table" and
pattern_string or regex.compile(pattern_string)
return regex.cmatch(pattern, string, offset, options)
end
-- Will iterate back through any UTF-8 bytes so that we don't replace bits
-- mid character.
local function previous_character(str, index)
local byte
repeat
index = index - 1
byte = string.byte(str, index)
until byte < 128 or byte >= 192
return index
end
-- Moves to the end of the identified character.
local function end_character(str, index)
local byte = string.byte(str, index + 1)
while byte and byte >= 128 and byte < 192 do
index = index + 1
byte = string.byte(str, index + 1)
end
return index
end
-- Build off matching. For now, only support basic replacements, but capture
-- groupings should be doable. We can even have custom group replacements and
-- transformations and stuff in lua. Currently, this takes group replacements
-- as \1 - \9.
-- Should work on UTF-8 text.
regex.gsub = function(pattern_string, str, replacement)
local pattern = type(pattern_string) == "table" and
pattern_string or regex.compile(pattern_string)
local result, indices = ""
local n = 0
repeat
indices = { regex.cmatch(pattern, str) }
if #indices > 0 then
n = n + 1
local currentReplacement = replacement
if #indices > 2 then
for i = 1, (#indices/2 - 1) do
currentReplacement = string.gsub(
currentReplacement,
"\\" .. i,
str:sub(indices[i*2+1], end_character(str,indices[i*2+2]-1))
)
end
end
currentReplacement = string.gsub(currentReplacement, "\\%d", "")
if indices[1] > 1 then
result = result ..
str:sub(1, previous_character(str, indices[1])) .. currentReplacement
else
result = result .. currentReplacement
end
str = str:sub(indices[2])
end
until #indices == 0 or indices[1] == indices[2]
return result .. str, n
end
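A short usage sketch for the module above (outputs assume the PCRE2 binding in src/api/regex.c; the sample strings are invented):

-- \1-\9 in the replacement are filled from the captures of each match
local re = regex.compile("(\\d+)-(\\d+)")
print(regex.gsub(re, "range 10-20.", "\\2..\\1"))     --> range 20..10.  1
-- via the __index metamethod, a compiled pattern can also be used method-style
print(re:gsub("pages 5-7, 9-12.", "\\1 to \\2"))      --> pages 5 to 7, 9 to 12.  2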


@@ -48,29 +48,6 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
end
local function is_escaped(text, idx, esc)
local byte = esc:byte()
local count = 0
for i = idx - 1, 1, -1 do
if text:byte(i) ~= byte then break end
count = count + 1
end
return count % 2 == 1
end
local function find_non_escaped(text, pattern, offset, esc)
while true do
local s, e = text:find(pattern, offset)
if not s then break end
if esc and is_escaped(text, s, esc) then
offset = e + 1
else
return s, e
end
end
end
-- State is a 32-bit number that is four separate bytes, illustrating how many
different delimiters we have open, and which subsyntaxes we have active.
-- At most, there are 3 subsyntaxes active at the same time. Beyond that,
@@ -155,26 +132,44 @@ function tokenizer.tokenize(incoming_syntax, text, state)
set_subsyntax_pattern_idx(0)
current_syntax, subsyntax_info, current_pattern_idx, current_level =
retrieve_syntax_state(incoming_syntax, state)
end
local function find_text(text, p, offset, at_start, close)
local target, res = p.pattern or p.regex, { 1, offset - 1 }
local code = type(target) == "table" and target[close and 2 or 1] or target
if p.regex and type(p.regex) ~= "table" then
p._regex = p._regex or regex.compile(p.regex)
code = p._regex
end
repeat
res = p.pattern and { text:find(at_start and "^" .. code or code, res[2]+1) }
or { regex.match(code, text, res[2]+1, at_start and regex.ANCHORED or 0) }
if res[1] and close and target[3] then
local count = 0
for i = res[1] - 1, 1, -1 do
if text:byte(i) ~= target[3]:byte() then break end
count = count + 1
end
-- Check to see if the escaped character is there,
-- and if it is not itself escaped.
if count % 2 == 0 then break end
end
until not res[1] or not close or not target[3]
return unpack(res)
end
while i <= #text do
-- continue trying to match the end pattern of a pair if we have a state set
if current_pattern_idx > 0 then
local p = current_syntax.patterns[current_pattern_idx]
local s, e = find_non_escaped(text, p.pattern[2], i, p.pattern[3])
local s, e = find_text(text, p, i, false, true)
local cont = true
-- If we're in subsyntax mode, always check to see if we end our syntax
first, before the found delimiter, as ending the subsyntax takes
-- precedence over ending the delimiter in the subsyntax.
if subsyntax_info then
local ss, se = find_non_escaped(
text,
subsyntax_info.pattern[2],
i,
subsyntax_info.pattern[3]
)
local ss, se = find_text(text, subsyntax_info, i, false, true)
-- If we find that we end the subsyntax before the
-- delimiter, push the token, and signal we shouldn't
-- treat the bit after as a token to be normally parsed
@@ -202,12 +197,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
-- we're ending early in the middle of a delimiter, or
-- just normally, upon finding a token.
if subsyntax_info then
local s, e = find_non_escaped(
text,
"^" .. subsyntax_info.pattern[2],
i,
nil
)
local s, e = find_text(text, subsyntax_info, i, true, true)
if s then
push_token(res, subsyntax_info.type, text:sub(i, e))
-- On finding unescaped delimiter, pop it.
@@ -219,16 +209,12 @@ function tokenizer.tokenize(incoming_syntax, text, state)
-- find matching pattern
local matched = false
for n, p in ipairs(current_syntax.patterns) do
local pattern = (type(p.pattern) == "table") and p.pattern[1] or p.pattern
local find_results = { text:find("^" .. pattern, i) }
local start, fin = find_results[1], find_results[2]
if start then
local find_results = { find_text(text, p, i, true, false) }
if find_results[1] then
-- matched pattern; make and add tokens
push_tokens(res, current_syntax, p, text, find_results)
-- update state if this was a start|end pattern pair
if type(p.pattern) == "table" then
if type(p.pattern or p.regex) == "table" then
-- If we have a subsyntax, push that onto the subsyntax stack.
if p.syntax then
push_subsyntax(p, n)
@@ -236,9 +222,8 @@ function tokenizer.tokenize(incoming_syntax, text, state)
set_subsyntax_pattern_idx(n)
end
end
-- move cursor past this token
i = fin + 1
i = find_results[2] + 1
matched = true
break
end
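The find_text helper above anchors regex rules at the current offset with regex.ANCHORED; the equivalent call from plugin code would look like this (a sketch; offsets are 1-based per the binding below):

local s, e = regex.match("[a-z]+", "123abc", 4, regex.ANCHORED)   -- s = 4, e = 7
local none = regex.match("[a-z]+", "123abc", 1, regex.ANCHORED)   -- nil: nothing anchored at '1'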


@@ -237,9 +237,12 @@ command.add(nil, {
end)
end,
["project-search:find-pattern"] = function()
core.command_view:enter("Find Pattern In Project", function(text)
begin_search(text, function(line_text) return line_text:find(text) end)
["project-search:find-regex"] = function()
core.command_view:enter("Find Regex In Project", function(text)
local re = regex.compile(text, "i")
begin_search(text, function(line_text)
return regex.cmatch(re, line_text)
end)
end)
end,


@@ -9,6 +9,7 @@ libm = cc.find_library('m', required : false)
libdl = cc.find_library('dl', required : false)
libx11 = dependency('x11', required : false)
lua_dep = dependency('lua5.2', required : false)
pcre2_dep = dependency('libpcre2-8')
if not lua_dep.found()
lua_subproject = subproject('lua', default_options: ['shared=false', 'use_readline=false', 'app=false'])


@@ -3,11 +3,13 @@
int luaopen_system(lua_State *L);
int luaopen_renderer(lua_State *L);
int luaopen_regex(lua_State *L);
static const luaL_Reg libs[] = {
{ "system", luaopen_system },
{ "renderer", luaopen_renderer },
{ "regex", luaopen_regex },
{ NULL, NULL }
};

src/api/regex.c (new file, 115 lines)

@@ -0,0 +1,115 @@
#include "api.h"
#define PCRE2_CODE_UNIT_WIDTH 8
#include <string.h>
#include <pcre2.h>
static int f_pcre_gc(lua_State* L) {
lua_rawgeti(L, -1, 1);
pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1);
if (re)
pcre2_code_free(re);
return 0;
}
static int f_pcre_compile(lua_State *L) {
size_t len;
PCRE2_SIZE errorOffset;
int errorNumber;
int pattern = PCRE2_UTF;
const char* str = luaL_checklstring(L, 1, &len);
if (lua_gettop(L) > 1) {
const char* options = luaL_checkstring(L, 2);
if (strstr(options,"i"))
pattern |= PCRE2_CASELESS;
if (strstr(options,"m"))
pattern |= PCRE2_MULTILINE;
if (strstr(options,"s"))
pattern |= PCRE2_DOTALL;
}
pcre2_code* re = pcre2_compile(
(PCRE2_SPTR)str,
len,
pattern,
&errorNumber,
&errorOffset,
NULL
);
if (re) {
lua_newtable(L);
lua_pushlightuserdata(L, re);
lua_rawseti(L, -2, 1);
luaL_setmetatable(L, "regex");
return 1;
}
PCRE2_UCHAR buffer[256];
pcre2_get_error_message(errorNumber, buffer, sizeof(buffer));
luaL_error(L, "regex compilation failed at offset %d: %s",
(int)errorOffset, buffer);
return 0;
}
// Takes string, compiled regex, returns list of indices of matched groups
// (including the whole match), if a match was found.
static int f_pcre_match(lua_State *L) {
size_t len, offset = 1, opts = 0;
luaL_checktype(L, 1, LUA_TTABLE);
const char* str = luaL_checklstring(L, 2, &len);
if (lua_gettop(L) > 2)
offset = luaL_checknumber(L, 3);
if (lua_gettop(L) > 3)
opts = luaL_checknumber(L, 4);
lua_rawgeti(L, 1, 1);
pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1);
pcre2_match_data* md = pcre2_match_data_create_from_pattern(re, NULL);
int rc = pcre2_match(re, (PCRE2_SPTR)str, len, offset - 1, opts, md, NULL);
if (rc < 0) {
pcre2_match_data_free(md);
if (rc != PCRE2_ERROR_NOMATCH)
luaL_error(L, "regex matching error %d", rc);
return 0;
}
PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(md);
if (ovector[0] > ovector[1]) {
/* We must guard against patterns such as /(?=.\K)/ that use \K in an
assertion to set the start of a match later than its end. In the editor,
we just detect this case and give up. */
luaL_error(L, "regex matching error: \\K was used in an assertion to "
" set the match start after its end");
pcre2_match_data_free(md);
return 0;
}
for (int i = 0; i < rc*2; i++)
lua_pushnumber(L, ovector[i]+1);
pcre2_match_data_free(md);
return rc*2;
}
static const luaL_Reg lib[] = {
{ "compile", f_pcre_compile },
{ "cmatch", f_pcre_match },
{ "__gc", f_pcre_gc },
{ NULL, NULL }
};
int luaopen_regex(lua_State *L) {
luaL_newlib(L, lib);
lua_pushliteral(L, "regex");
lua_setfield(L, -2, "__name");
lua_pushvalue(L, -1);
lua_setfield(L, LUA_REGISTRYINDEX, "regex");
lua_pushnumber(L, PCRE2_ANCHORED);
lua_setfield(L, -2, "ANCHORED");
lua_pushnumber(L, PCRE2_ENDANCHORED);
lua_setfield(L, -2, "ENDANCHORED");
lua_pushnumber(L, PCRE2_NOTBOL);
lua_setfield(L, -2, "NOTBOL");
lua_pushnumber(L, PCRE2_NOTEOL);
lua_setfield(L, -2, "NOTEOL");
lua_pushnumber(L, PCRE2_NOTEMPTY);
lua_setfield(L, -2, "NOTEMPTY");
lua_pushnumber(L, PCRE2_NOTEMPTY_ATSTART);
lua_setfield(L, -2, "NOTEMPTY_ATSTART");
return 1;
}
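An example of the option letters f_pcre_compile recognises ("i", "m", "s") and the index pairs cmatch pushes, in the spirit of the project-search:find-regex command above (a sketch; the sample text is made up):

local re = regex.compile("todo|fixme", "i")    -- "i" maps to PCRE2_CASELESS
print(regex.cmatch(re, "-- FIXME: remove"))    --> 4  9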


@@ -3,6 +3,7 @@ lite_sources = [
'api/cp_replace.c',
'api/renderer.c',
'api/renderer_font.c',
'api/regex.c',
'api/system.c',
'renderer.c',
'renwindow.c',
@@ -18,7 +19,7 @@ endif
executable('lite',
lite_sources + lite_rc,
include_directories: [lite_include, font_renderer_include],
dependencies: [lua_dep, sdl_dep, libm, libdl, libx11],
dependencies: [lua_dep, sdl_dep, pcre2_dep, libm, libdl, libx11],
c_args: lite_cargs,
link_with: libfontrenderer,
link_args: lite_link_args,