Add PCRE to support regular expressions
Use regular expressions instead of Lua patterns for the find and replace editor commands. Syntax files can now use regexes, or Lua patterns as before, keeping backward compatibility for plugins.
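For illustration only (not part of the commit): with the tokenizer changes below, a syntax definition can keep its existing Lua-pattern rules or give a rule a `regex` key instead. The language and rules here are made up; `core.syntax` is the existing plugin API.

local syntax = require "core.syntax"

-- hypothetical syntax rules: `pattern` entries behave as before,
-- `regex` entries go through the new PCRE2-backed regex module
syntax.add {
  files = { "%.example$" },
  patterns = {
    { pattern = "//.-\n",            type = "comment" },  -- Lua pattern, unchanged
    { regex = "-?\\d+(\\.\\d+)?",    type = "number"  },  -- PCRE2 regex
    { regex = { '"', '"', '\\' },    type = "string"  },  -- start/end pair with escape char
  },
  symbols = {},
}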
This commit is contained in: parent ea5e9b0ce5, commit 248d70a8ca
build.sh

@@ -3,7 +3,7 @@
 cflags+="-Wall -O3 -g -std=gnu11 -fno-strict-aliasing -Isrc -Ilib/font_renderer"
 cflags+=" $(pkg-config --cflags lua5.2) $(sdl2-config --cflags)"
 lflags="-static-libgcc -static-libstdc++"
-for package in libagg freetype2 lua5.2 x11; do
+for package in libagg freetype2 lua5.2 x11 libpcre2-8; do
 lflags+=" $(pkg-config --libs $package)"
 done
 lflags+=" $(sdl2-config --libs) -lm"

(find and replace commands, Lua)

@@ -90,6 +90,7 @@ local function has_selection()
      and core.active_view.doc:has_selection()
 end


 command.add(has_selection, {
   ["find-replace:select-next"] = function()
     local l1, c1, l2, c2 = doc():get_selection(true)

@@ -107,9 +108,9 @@ command.add("core.docview", {
     end)
   end,

-  ["find-replace:find-pattern"] = function()
-    find("Find Text Pattern", function(doc, line, col, text)
-      local opt = { wrap = true, no_case = true, pattern = true }
+  ["find-replace:find-regex"] = function()
+    find("Find Text Regex", function(doc, line, col, text)
+      local opt = { wrap = true, no_case = true, regex = true }
       return search.find(doc, line, col, text, opt)
     end)
   end,

@@ -144,9 +145,10 @@ command.add("core.docview", {
     end)
   end,

-  ["find-replace:replace-pattern"] = function()
-    replace("Pattern", "", function(text, old, new)
-      return text:gsub(old, new)
+  ["find-replace:replace-regex"] = function()
+    replace("Regex", "", function(text, old, new)
+      local re = regex.compile(old)
+      return regex.gsub(re, text, new)
     end)
   end,

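A rough sketch (not part of the diff) of what the new replace-regex command does with its input, using the regex module added later in this commit; `\1`–`\9` in the replacement refer to capture groups, as regex.gsub documents below:

-- sketch: the replace("Regex", ...) callback boils down to this
local re = regex.compile("(\\d+)-(\\d+)")            -- PCRE2 pattern with two groups
local replaced, n = regex.gsub(re, "pages 10-20", "\\2..\\1")
-- replaced == "pages 20..10", n == 1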
(document search module, Lua)

@@ -15,13 +15,9 @@ local function init_args(doc, line, col, text, opt)
   opt = opt or default_opt
   line, col = doc:sanitize_position(line, col)

-  if opt.no_case then
-    if opt.pattern then
-      text = text:gsub("%%?.", pattern_lower)
-    else
+  if opt.no_case and not opt.regex then
     text = text:lower()
-    end
   end

   return doc, line, col, text, opt
 end

@@ -30,20 +26,32 @@ end
 function search.find(doc, line, col, text, opt)
   doc, line, col, text, opt = init_args(doc, line, col, text, opt)

+  local re
+  if opt.regex then
+    re = regex.compile(text, opt.no_case and "i" or "")
+  end
   for line = line, #doc.lines do
     local line_text = doc.lines[line]
+    if opt.regex then
+      local s, e = re:cmatch(line_text, col)
+      if s then
+        return line, s, line, e
+      end
+      col = 1
+    else
       if opt.no_case then
         line_text = line_text:lower()
       end
-      local s, e = line_text:find(text, col, not opt.pattern)
+      local s, e = line_text:find(text, col, true)
       if s then
         return line, s, line, e + 1
       end
       col = 1
+    end
   end

   if opt.wrap then
-    opt = { no_case = opt.no_case, pattern = opt.pattern }
+    opt = { no_case = opt.no_case, regex = opt.regex }
     return search.find(doc, 1, 1, text, opt)
   end
 end

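For the regex branch of search.find above, a minimal sketch (not part of the diff) of the per-line step; offsets returned by cmatch are 1-based and the end is one past the match, which is why this branch returns `e` where the plain branch returns `e + 1`:

local re = regex.compile("fo+bar", "i")      -- "i" is added when opt.no_case is set
local s, e = re:cmatch("xx FOOOBAR yy", 1)   -- start searching at column 1
-- s == 4, e == 11 (the match spans columns 4..10)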
(core init, Lua)

@@ -1,4 +1,5 @@
 require "core.strict"
+require "core.regex"
 local common = require "core.common"
 local config = require "core.config"
 local style = require "core.style"

(new file: regex module, Lua)

@@ -0,0 +1,69 @@
+
+-- So that in addition to regex.gsub(pattern, string), we can also do
+-- pattern:gsub(string).
+regex.__index = function(table, key) return regex[key]; end
+
+regex.match = function(pattern_string, string, offset, options)
+  local pattern = type(pattern_string) == "table" and
+    pattern_string or regex.compile(pattern_string)
+  return regex.cmatch(pattern, string, offset, options)
+end
+
+-- Will iterate back through any UTF-8 bytes so that we don't replace bits
+-- mid character.
+local function previous_character(str, index)
+  local byte
+  repeat
+    index = index - 1
+    byte = string.byte(str, index)
+  until byte < 128 or byte >= 192
+  return index
+end
+
+-- Moves to the end of the identified character.
+local function end_character(str, index)
+  local byte = string.byte(str, index + 1)
+  while byte >= 128 and byte < 192 do
+    index = index + 1
+    byte = string.byte(str, index + 1)
+  end
+  return index
+end
+
+-- Build off matching. For now, only support basic replacements, but capture
+-- groupings should be doable. We can even have custom group replacements and
+-- transformations and stuff in lua. Currently, this takes group replacements
+-- as \1 - \9.
+-- Should work on UTF-8 text.
+regex.gsub = function(pattern_string, str, replacement)
+  local pattern = type(pattern_string) == "table" and
+    pattern_string or regex.compile(pattern_string)
+  local result, indices = ""
+  local n = 0
+  repeat
+    indices = { regex.cmatch(pattern, str) }
+    if #indices > 0 then
+      n = n + 1
+      local currentReplacement = replacement
+      if #indices > 2 then
+        for i = 1, (#indices/2 - 1) do
+          currentReplacement = string.gsub(
+            currentReplacement,
+            "\\" .. i,
+            str:sub(indices[i*2+1], end_character(str,indices[i*2+2]-1))
+          )
+        end
+      end
+      currentReplacement = string.gsub(currentReplacement, "\\%d", "")
+      if indices[1] > 1 then
+        result = result ..
+          str:sub(1, previous_character(str, indices[1])) .. currentReplacement
+      else
+        result = result .. currentReplacement
+      end
+      str = str:sub(indices[2])
+    end
+  until #indices == 0 or indices[1] == indices[2]
+  return result .. str, n
+end

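A small usage sketch for the module above (not part of the commit); regex.match auto-compiles string patterns, and the `__index` line makes colon calls on compiled patterns work:

local s, e = regex.match("\\bworld\\b", "hello world", 1, 0)  -- string pattern compiled on the fly
-- s == 7, e == 12 (end is one past the last character)
local re = regex.compile("l+", "i")
print(re:cmatch("heLLo"))   --> 3  5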
(syntax tokenizer, Lua)

@@ -48,29 +48,6 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
 end


-local function is_escaped(text, idx, esc)
-  local byte = esc:byte()
-  local count = 0
-  for i = idx - 1, 1, -1 do
-    if text:byte(i) ~= byte then break end
-    count = count + 1
-  end
-  return count % 2 == 1
-end
-
-
-local function find_non_escaped(text, pattern, offset, esc)
-  while true do
-    local s, e = text:find(pattern, offset)
-    if not s then break end
-    if esc and is_escaped(text, s, esc) then
-      offset = e + 1
-    else
-      return s, e
-    end
-  end
-end
-
 -- State is a 32-bit number that is four separate bytes, illustrating how many
 -- different delimiters we have open, and which subsyntaxes we have active.
 -- At most, there are 3 subsyntaxes active at the same time. Beyond that,

@@ -155,26 +132,44 @@ function tokenizer.tokenize(incoming_syntax, text, state)
     set_subsyntax_pattern_idx(0)
     current_syntax, subsyntax_info, current_pattern_idx, current_level =
       retrieve_syntax_state(incoming_syntax, state)
   end

+  local function find_text(text, p, offset, at_start, close)
+    local target, res = p.pattern or p.regex, { 1, offset - 1 }, p.regex
+    local code = type(target) == "table" and target[close and 2 or 1] or target
+    if p.regex and type(p.regex) ~= "table" then
+      p._regex = p._regex or regex.compile(p.regex)
+      code = p._regex
+    end
+    repeat
+      res = p.pattern and { text:find(at_start and "^" .. code or code, res[2]+1) }
+        or { regex.match(code, text, res[2]+1, at_start and regex.ANCHORED or 0) }
+      if res[1] and close and target[3] then
+        local count = 0
+        for i = res[1] - 1, 1, -1 do
+          if text:byte(i) ~= target[3]:byte() then break end
+          count = count + 1
+        end
+        -- Check to see if the escaped character is there,
+        -- and if it is not itself escaped.
+        if count % 2 == 0 then break end
+      end
+    until not res[1] or not close or not target[3]
+    return unpack(res)
+  end
+
   while i <= #text do
     -- continue trying to match the end pattern of a pair if we have a state set
     if current_pattern_idx > 0 then
       local p = current_syntax.patterns[current_pattern_idx]
-      local s, e = find_non_escaped(text, p.pattern[2], i, p.pattern[3])
+      local s, e = find_text(text, p, i, false, true)

       local cont = true
       -- If we're in subsyntax mode, always check to see if we end our syntax
       -- first, before the found delimiter, as ending the subsyntax takes
       -- precedence over ending the delimiter in the subsyntax.
       if subsyntax_info then
-        local ss, se = find_non_escaped(
-          text,
-          subsyntax_info.pattern[2],
-          i,
-          subsyntax_info.pattern[3]
-        )
+        local ss, se = find_text(text, subsyntax_info, i, false, true)
         -- If we find that we end the subsyntax before the
         -- delimiter, push the token, and signal we shouldn't
         -- treat the bit after as a token to be normally parsed

@@ -202,12 +197,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
       -- we're ending early in the middle of a delimiter, or
       -- just normally, upon finding a token.
       if subsyntax_info then
-        local s, e = find_non_escaped(
-          text,
-          "^" .. subsyntax_info.pattern[2],
-          i,
-          nil
-        )
+        local s, e = find_text(text, subsyntax_info, i, true, true)
         if s then
           push_token(res, subsyntax_info.type, text:sub(i, e))
           -- On finding unescaped delimiter, pop it.

@@ -219,16 +209,12 @@ function tokenizer.tokenize(incoming_syntax, text, state)
       -- find matching pattern
       local matched = false
       for n, p in ipairs(current_syntax.patterns) do
-        local pattern = (type(p.pattern) == "table") and p.pattern[1] or p.pattern
-        local find_results = { text:find("^" .. pattern, i) }
-        local start, fin = find_results[1], find_results[2]
-
-        if start then
+        local find_results = { find_text(text, p, i, true, false) }
+        if find_results[1] then
           -- matched pattern; make and add tokens
           push_tokens(res, current_syntax, p, text, find_results)

           -- update state if this was a start|end pattern pair
-          if type(p.pattern) == "table" then
+          if type(p.pattern or p.regex) == "table" then
             -- If we have a subsyntax, push that onto the subsyntax stack.
             if p.syntax then
               push_subsyntax(p, n)

@@ -236,9 +222,8 @@ function tokenizer.tokenize(incoming_syntax, text, state)
             set_subsyntax_pattern_idx(n)
           end
         end

         -- move cursor past this token
-        i = fin + 1
+        i = find_results[2] + 1
         matched = true
         break
       end

(project search plugin, Lua)

@@ -237,9 +237,12 @@ command.add(nil, {
     end)
   end,

-  ["project-search:find-pattern"] = function()
-    core.command_view:enter("Find Pattern In Project", function(text)
-      begin_search(text, function(line_text) return line_text:find(text) end)
+  ["project-search:find-regex"] = function()
+    core.command_view:enter("Find Regex In Project", function(text)
+      local re = regex.compile(text, "i")
+      begin_search(text, function(line_text)
+        return regex.cmatch(re, line_text)
+      end)
     end)
   end,

(meson build: top-level dependencies)

@@ -9,6 +9,7 @@ libm = cc.find_library('m', required : false)
 libdl = cc.find_library('dl', required : false)
 libx11 = dependency('x11', required : false)
 lua_dep = dependency('lua5.2', required : false)
+pcre2_dep = dependency('libpcre2-8')

 if not lua_dep.found()
   lua_subproject = subproject('lua', default_options: ['shared=false', 'use_readline=false', 'app=false'])

(Lua API registration, C)

@@ -3,11 +3,13 @@

 int luaopen_system(lua_State *L);
 int luaopen_renderer(lua_State *L);
+int luaopen_regex(lua_State *L);


 static const luaL_Reg libs[] = {
   { "system", luaopen_system },
   { "renderer", luaopen_renderer },
+  { "regex", luaopen_regex },
   { NULL, NULL }
 };

(new file: PCRE2 regex binding, C)

@@ -0,0 +1,115 @@
+#include "api.h"
+
+#define PCRE2_CODE_UNIT_WIDTH 8
+
+#include <string.h>
+#include <pcre2.h>
+
+static int f_pcre_gc(lua_State* L) {
+  lua_rawgeti(L, -1, 1);
+  pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1);
+  if (re)
+    pcre2_code_free(re);
+  return 0;
+}
+
+static int f_pcre_compile(lua_State *L) {
+  size_t len;
+  PCRE2_SIZE errorOffset;
+  int errorNumber;
+  int pattern = PCRE2_UTF;
+  const char* str = luaL_checklstring(L, 1, &len);
+  if (lua_gettop(L) > 1) {
+    const char* options = luaL_checkstring(L, 2);
+    if (strstr(options,"i"))
+      pattern |= PCRE2_CASELESS;
+    if (strstr(options,"m"))
+      pattern |= PCRE2_MULTILINE;
+    if (strstr(options,"s"))
+      pattern |= PCRE2_DOTALL;
+  }
+  pcre2_code* re = pcre2_compile(
+    (PCRE2_SPTR)str,
+    len,
+    pattern,
+    &errorNumber,
+    &errorOffset,
+    NULL
+  );
+  if (re) {
+    lua_newtable(L);
+    lua_pushlightuserdata(L, re);
+    lua_rawseti(L, -2, 1);
+    luaL_setmetatable(L, "regex");
+    return 1;
+  }
+  PCRE2_UCHAR buffer[256];
+  pcre2_get_error_message(errorNumber, buffer, sizeof(buffer));
+  luaL_error(L, "regex compilation failed at offset %d: %s",
+    (int)errorOffset, buffer);
+  return 0;
+}
+
+// Takes string, compiled regex, returns list of indices of matched groups
+// (including the whole match), if a match was found.
+static int f_pcre_match(lua_State *L) {
+  size_t len, offset = 1, opts = 0;
+  luaL_checktype(L, 1, LUA_TTABLE);
+  const char* str = luaL_checklstring(L, 2, &len);
+  if (lua_gettop(L) > 2)
+    offset = luaL_checknumber(L, 3);
+  if (lua_gettop(L) > 3)
+    opts = luaL_checknumber(L, 4);
+  lua_rawgeti(L, 1, 1);
+  pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1);
+  pcre2_match_data* md = pcre2_match_data_create_from_pattern(re, NULL);
+  int rc = pcre2_match(re, (PCRE2_SPTR)str, len, offset - 1, opts, md, NULL);
+  if (rc < 0) {
+    pcre2_match_data_free(md);
+    if (rc != PCRE2_ERROR_NOMATCH)
+      luaL_error(L, "regex matching error %d", rc);
+    return 0;
+  }
+  PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(md);
+  if (ovector[0] > ovector[1]) {
+    /* We must guard against patterns such as /(?=.\K)/ that use \K in an
+    assertion to set the start of a match later than its end. In the editor,
+    we just detect this case and give up. */
+    pcre2_match_data_free(md);
+    luaL_error(L, "regex matching error: \\K was used in an assertion to "
+      " set the match start after its end");
+    return 0;
+  }
+  for (int i = 0; i < rc*2; i++)
+    lua_pushnumber(L, ovector[i]+1);
+  pcre2_match_data_free(md);
+  return rc*2;
+}
+
+static const luaL_Reg lib[] = {
+  { "compile", f_pcre_compile },
+  { "cmatch", f_pcre_match },
+  { "__gc", f_pcre_gc },
+  { NULL, NULL }
+};
+
+int luaopen_regex(lua_State *L) {
+  luaL_newlib(L, lib);
+  lua_pushliteral(L, "regex");
+  lua_setfield(L, -2, "__name");
+  lua_pushvalue(L, -1);
+  lua_setfield(L, LUA_REGISTRYINDEX, "regex");
+  lua_pushnumber(L, PCRE2_ANCHORED);
+  lua_setfield(L, -2, "ANCHORED");
+  lua_pushnumber(L, PCRE2_ENDANCHORED);
+  lua_setfield(L, -2, "ENDANCHORED");
+  lua_pushnumber(L, PCRE2_NOTBOL);
+  lua_setfield(L, -2, "NOTBOL");
+  lua_pushnumber(L, PCRE2_NOTEOL);
+  lua_setfield(L, -2, "NOTEOL");
+  lua_pushnumber(L, PCRE2_NOTEMPTY);
+  lua_setfield(L, -2, "NOTEMPTY");
+  lua_pushnumber(L, PCRE2_NOTEMPTY_ATSTART);
+  lua_setfield(L, -2, "NOTEMPTY_ATSTART");
+  return 1;
+}

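From Lua, cmatch returns 1-based, end-exclusive offset pairs for the whole match and then for each capture group, and the numeric constants registered above (ANCHORED, NOTBOL, ...) can be passed as the fourth argument. A quick sketch, not part of the commit:

local re = regex.compile("(\\w+)@(\\w+)")
print(regex.cmatch(re, "user@host"))
--> 1  10  1  5  6  10      (whole match, then each group)
print(regex.cmatch(re, "  user@host", 3, regex.ANCHORED))
--> 3  12  3  7  8  12      (search starts at offset 3; ANCHORED pins the match there)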
(meson build: lite sources and executable)

@@ -3,6 +3,7 @@ lite_sources = [
     'api/cp_replace.c',
     'api/renderer.c',
     'api/renderer_font.c',
+    'api/regex.c',
     'api/system.c',
     'renderer.c',
     'renwindow.c',

@@ -18,7 +19,7 @@ endif
 executable('lite',
     lite_sources + lite_rc,
     include_directories: [lite_include, font_renderer_include],
-    dependencies: [lua_dep, sdl_dep, libm, libdl, libx11],
+    dependencies: [lua_dep, sdl_dep, pcre2_dep, libm, libdl, libx11],
     c_args: lite_cargs,
     link_with: libfontrenderer,
     link_args: lite_link_args,