core: ported regex.gsub to faster native version (#1233)

* added regex.gmatch iterator and other fixes
* fixed issues reported by Guldoman
* push strings with fixed len just in case for binary safety
* added limit to regex.gsub and use pushinteger
* added description to regex.gsub limits param
* replaced substitutions regex description for correctness
* ignore negative limits on regex.gsub
This commit is contained in:
Jefferson González 2022-12-20 17:46:37 -04:00 committed by GitHub
parent 5b5b5fd3e3
commit 3c64c32379
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 301 additions and 66 deletions

View File

@ -216,7 +216,7 @@ command.add("core.docview!", {
return text:gsub(old:gsub("%W", "%%%1"), new:gsub("%%", "%%%%"), nil)
end
local result, matches = regex.gsub(regex.compile(old, "m"), text, new)
return result, #matches
return result, matches
end)
end,

View File

@ -80,63 +80,3 @@ regex.find = function(pattern, str, offset, options)
end
return table.unpack(out)
end
-- Walks backwards from `index` and returns the position of the first byte of
-- the preceding UTF-8 character, i.e. the nearest earlier byte that is not a
-- continuation byte (continuation bytes are in the range 128..191).
--
-- If the start of the string is reached without finding such a byte (for
-- example the text begins with stray continuation bytes) 0 is returned
-- instead of raising a "compare nil with number" error.
local function previous_character(str, index)
  local byte
  repeat
    index = index - 1
    byte = string.byte(str, index)
    -- `byte` is nil once index drops below 1; stop there.
  until not byte or byte < 128 or byte >= 192
  return index
end
-- Returns the position of the last byte of the UTF-8 character that `index`
-- belongs to, by skipping forward over any continuation bytes (128..191)
-- that directly follow it.
local function end_character(str, index)
  local last = index
  while true do
    local following = string.byte(str, last + 1)
    -- Stop at the end of the string or at the first non-continuation byte.
    if not following or following < 128 or following >= 192 then
      return last
    end
    last = last + 1
  end
end
-- Replaces the matches of a regex in `str` with `replacement`.
--
-- `pattern_string` is either an already compiled regex (a table) or a pattern
-- string that is compiled here. Inside `replacement`, \1 - \9 are substituted
-- with the text of the corresponding capture group; references to groups
-- that did not take part in the match are removed.
--
-- Matching stops after the first empty match, the remainder of `str` is
-- appended untouched.
--
-- Returns three values:
--   * the resulting string,
--   * a list with the offsets returned by regex.cmatch for every match,
--   * a list of { start, start + #replacement } pairs, one per match.
regex.gsub = function(pattern_string, str, replacement)
  local pattern = type(pattern_string) == "table" and
    pattern_string or regex.compile(pattern_string)
  local result, indices = {}
  local matches, replacements = {}, {}
  -- 1 is the first valid position for both str:sub and regex.cmatch.
  local offset = 1
  repeat
    indices = { regex.cmatch(pattern, str, offset) }
    if #indices > 0 then
      table.insert(matches, indices)
      -- Collect the text of every capture group of this match.
      local captures = {}
      for i = 1, (#indices/2 - 1) do
        captures[i] = str:sub(indices[i*2+1], end_character(str, indices[i*2+2]-1))
      end
      -- Expand \1 - \9 in a single pass using a function as replacement, so
      -- the captured text is inserted literally: a '%' inside a capture is
      -- not interpreted by string.gsub and an expanded capture is never
      -- scanned again for further \N references.
      local currentReplacement = string.gsub(replacement, "\\(%d)", function(digit)
        return captures[tonumber(digit)] or ""
      end)
      table.insert(replacements, { indices[1], #currentReplacement+indices[1] })
      -- Keep every byte between the previous match and this one. The text
      -- ends on the byte right before the match start; cutting at the start
      -- of the previous character would drop the continuation bytes of a
      -- multibyte character that precedes the match.
      table.insert(result, str:sub(offset, indices[1] - 1) .. currentReplacement)
      offset = indices[2]
    end
  until #indices == 0 or indices[1] == indices[2]
  return table.concat(result) .. str:sub(offset), matches, replacements
end

View File

@ -41,7 +41,7 @@ regex.NOTEMPTY_ATSTART = 0x00000008
---@param pattern string
---@param options? regex.modifiers A string of one or more pattern modifiers.
---
---@return regex|nil regex Ready to use regular expression object or nil on error.
---@return regex? regex Ready to use regular expression object or nil on error.
---@return string? error The error message if compiling the pattern failed.
function regex.compile(pattern, options) end
@ -53,8 +53,42 @@ function regex.compile(pattern, options) end
---@param options? integer A bit field of matching options, eg:
---regex.NOTBOL | regex.NOTEMPTY
---
---@return integer ... list List of offsets where a match was found.
---@return integer? ... List of offsets where a match was found.
function regex:cmatch(subject, offset, options) end
---
---Returns an iterator function that, each time it is called, returns the
---next captures from `pattern` over the string subject. When the pattern
---has no capture groups the whole match is returned instead.
---
---Example:
---```lua
--- s = "hello world hello world"
--- for hello, world in regex.gmatch("(hello)\\s+(world)", s) do
--- print(hello .. " " .. world)
--- end
---```
---
---@param pattern regex|string A compiled regex object or a pattern string.
---@param subject string The text to search.
---@param offset? integer Position where the search starts (defaults to 1),
---a negative value counts from the end of the subject.
---
---@return fun():string, ...
function regex.gmatch(pattern, subject, offset) end
---
---Replaces the matched pattern globally on the subject with the given
---replacement, supports named captures ((?'name'<pattern>), ${name}) and
---$[1-9][0-9]* substitutions. Raises an error when failing to compile the
---pattern or by a substitution mistake.
---
---@param pattern regex|string A compiled regex object or a pattern string.
---@param subject string The text where the replacements are performed.
---@param replacement string The substitution text.
---@param limit? integer Limits the number of substitutions that will be done,
---when omitted, 0 or negative every match is replaced.
---
---@return string? replaced_subject
---@return integer? total_replacements
function regex.gsub(pattern, subject, replacement, limit) end
return regex

View File

@ -4,6 +4,128 @@
#include <string.h>
#include <pcre2.h>
#include <stdbool.h>
/* State shared between successive calls of the regex.gmatch iterator. It is
   stored in a Lua userdata and attached to the iterator closure. */
typedef struct RegexState {
/* pattern used for every match attempt */
pcre2_code* re;
/* match data block created from `re`, reused on every call */
pcre2_match_data* match_data;
/* text being searched; the Lua string is kept alive as a closure upvalue */
const char* subject;
/* length of `subject` in bytes */
size_t subject_len;
/* zero based byte offset where the next match attempt starts */
size_t offset;
/* true when `re` was compiled for this iterator and has to be freed by it */
bool regex_compiled;
/* false once no further match attempts should be made */
bool found;
} RegexState;
/* Obtains the pattern given as first Lua argument.
 *
 * A table argument is treated as an already compiled regex object whose
 * element [1] holds the pcre2_code userdata; that pointer is returned as is
 * and *should_free is set to false. Any other argument is read as a pattern
 * string, compiled with PCRE2_UTF (and JIT compiled when available) and
 * *should_free is set to true so the caller knows it has to release it.
 *
 * Raises a Lua error when the pattern fails to compile. */
static pcre2_code* regex_get_pattern(lua_State *L, bool* should_free) {
  *should_free = false;

  /* already compiled regex object: hand out the stored pointer */
  if (lua_type(L, 1) == LUA_TTABLE) {
    lua_rawgeti(L, 1, 1);
    pcre2_code* stored = (pcre2_code*)lua_touserdata(L, -1);
    lua_pop(L, 1);
    return stored;
  }

  /* pattern string: compile it on the fly */
  size_t source_len = 0;
  const char* source = luaL_checklstring(L, 1, &source_len);

  int error_code;
  PCRE2_SIZE error_position;
  pcre2_code* compiled = pcre2_compile(
    (PCRE2_SPTR)source,
    source_len, PCRE2_UTF,
    &error_code, &error_position, NULL
  );

  if (compiled == NULL) {
    PCRE2_UCHAR message[256];
    pcre2_get_error_message(error_code, message, sizeof(message));
    luaL_error(
      L, "regex pattern error at offset %d: %s",
      (int)error_position, message
    );
    return NULL;
  }

  if (pcre2_config(PCRE2_CONFIG_JIT, NULL) == 1)
    pcre2_jit_compile(compiled, PCRE2_JIT_COMPLETE);

  *should_free = true;
  return compiled;
}
static int regex_gmatch_iterator(lua_State *L) {
RegexState *state = (RegexState*)lua_touserdata(L, lua_upvalueindex(3));
if (state->found) {
int rc = pcre2_match(
state->re,
(PCRE2_SPTR)state->subject, state->subject_len,
state->offset, 0, state->match_data, NULL
);
if (rc < 0) {
if (rc != PCRE2_ERROR_NOMATCH) {
PCRE2_UCHAR buffer[120];
pcre2_get_error_message(rc, buffer, sizeof(buffer));
luaL_error(L, "regex matching error %d: %s", rc, buffer);
}
goto clean;
} else {
size_t ovector_count = pcre2_get_ovector_count(state->match_data);
if (ovector_count > 0) {
PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(state->match_data);
if (ovector[0] > ovector[1]) {
/* We must guard against patterns such as /(?=.\K)/ that use \K in an
assertion to set the start of a match later than its end. In the editor,
we just detect this case and give up. */
luaL_error(L, "regex matching error: \\K was used in an assertion to "
" set the match start after its end");
goto clean;
}
int index = 0;
if (ovector_count > 1) index = 2;
int total = 0;
int total_results = ovector_count * 2;
size_t last_offset = 0;
for (int i = index; i < total_results; i+=2) {
lua_pushlstring(L, state->subject+ovector[i], ovector[i+1] - ovector[i]);
last_offset = ovector[i+1];
total++;
}
if (last_offset - 1 < state->subject_len)
state->offset = last_offset;
else
state->found = false;
return total;
} else {
state->found = false;
}
}
}
clean:
if (state->regex_compiled) pcre2_code_free(state->re);
pcre2_match_data_free(state->match_data);
return 0; /* not found */
}
/* Converts a Lua style position (1 based, negative values count from the end
 * of the string) into a 1 based position inside a string of `len` bytes.
 *
 *   pos > len   -> len + 1 (one past the last byte)
 *   pos > 0     -> pos
 *   pos == 0    -> 1
 *   pos < -len  -> 1
 *   otherwise   -> len + pos + 1
 *
 * The result is always in the range [1, len + 1], so callers that subtract 1
 * to obtain a zero based offset and then compute `len - offset` can not
 * underflow or point past the end of the string. */
static size_t regex_offset_relative(lua_Integer pos, size_t len) {
  if (pos > 0) {
    /* clip to one past the end, same as an empty remainder */
    if ((lua_Unsigned)pos > (lua_Unsigned)len)
      return len + 1;
    return (size_t)pos;
  }
  else if (pos == 0)
    return 1;
  else if (pos < -(lua_Integer)len) /* inverted comparison */
    return 1; /* clip to 1 */
  else return len - (size_t)(-pos) + 1;
}
static int f_pcre_gc(lua_State* L) {
lua_rawgeti(L, -1, 1);
@ -56,19 +178,21 @@ static int f_pcre_compile(lua_State *L) {
// (including the whole match), if a match was found.
static int f_pcre_match(lua_State *L) {
size_t len, offset = 1, opts = 0;
luaL_checktype(L, 1, LUA_TTABLE);
bool regex_compiled = false;
pcre2_code* re = regex_get_pattern(L, &regex_compiled);
if (!re) return 0 ;
const char* str = luaL_checklstring(L, 2, &len);
if (lua_gettop(L) > 2)
offset = luaL_checknumber(L, 3);
offset = regex_offset_relative(luaL_checknumber(L, 3), len);
offset -= 1;
len -= offset;
if (lua_gettop(L) > 3)
opts = luaL_checknumber(L, 4);
lua_rawgeti(L, 1, 1);
pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1);
pcre2_match_data* md = pcre2_match_data_create_from_pattern(re, NULL);
int rc = pcre2_match(re, (PCRE2_SPTR)&str[offset], len, 0, opts, md, NULL);
if (rc < 0) {
if (regex_compiled) pcre2_code_free(re);
pcre2_match_data_free(md);
if (rc != PCRE2_ERROR_NOMATCH) {
PCRE2_UCHAR buffer[120];
@ -84,18 +208,155 @@ static int f_pcre_match(lua_State *L) {
we just detect this case and give up. */
luaL_error(L, "regex matching error: \\K was used in an assertion to "
" set the match start after its end");
if (regex_compiled) pcre2_code_free(re);
pcre2_match_data_free(md);
return 0;
}
for (int i = 0; i < rc*2; i++)
lua_pushinteger(L, ovector[i]+offset+1);
if (regex_compiled) pcre2_code_free(re);
pcre2_match_data_free(md);
return rc*2;
}
/* regex.gmatch(pattern, subject [, offset])
 *
 * Returns an iterator closure (regex_gmatch_iterator) with three upvalues:
 * the pattern argument, the subject string and a RegexState userdata.
 *
 * All arguments are validated and the userdata is allocated BEFORE the
 * pattern is compiled, because those steps can raise a Lua error and a
 * pattern compiled earlier would be leaked by the non local exit. */
static int f_pcre_gmatch(lua_State *L) {
  /* pattern param: a table (compiled regex) or something usable as string */
  if (lua_type(L, 1) != LUA_TTABLE) luaL_checklstring(L, 1, NULL);

  /* subject param */
  size_t subject_len = 0;
  const char* subject = luaL_checklstring(L, 2, &subject_len);

  /* offset param */
  size_t offset = regex_offset_relative(
    (lua_Integer)luaL_optnumber(L, 3, 1), subject_len
  ) - 1;

  /* keep strings on closure to avoid being collected */
  lua_settop(L, 2);

  RegexState *state;
  state = (RegexState*)lua_newuserdatauv(L, sizeof(RegexState), 0);
  memset(state, 0, sizeof(RegexState));

  /* leaves the stack balanced, the userdata stays on top */
  bool regex_compiled = false;
  pcre2_code* re = regex_get_pattern(L, &regex_compiled);
  if (!re) return 0;

  pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(re, NULL);
  if (!match_data) {
    if (regex_compiled) pcre2_code_free(re);
    return luaL_error(L, "regex matching error: not enough memory");
  }

  state->re = re;
  state->match_data = match_data;
  state->subject = subject;
  state->subject_len = subject_len;
  state->offset = offset;
  state->found = true;
  state->regex_compiled = regex_compiled;

  lua_pushcclosure(L, regex_gmatch_iterator, 3);
  return 1;
}
/* regex.gsub(pattern, subject, replacement [, limit])
 *
 * Replaces the matches of `pattern` in `subject` using pcre2_substitute with
 * extended replacement syntax. Returns the resulting string and the number
 * of replacements done; when nothing was replaced the original subject and 0
 * are returned.
 *
 * limit == 0 (also used for omitted or negative values) replaces every match
 * with a single PCRE2_SUBSTITUTE_GLOBAL call. A positive limit replaces one
 * match per call, feeding the output of each call as subject of the next one
 * until the limit is reached or nothing else matches.
 *
 * Raises a Lua error when the pattern can not be compiled or when
 * pcre2_substitute reports an error (including running out of memory). */
static int f_pcre_gsub(lua_State *L) {
  size_t subject_len = 0, replacement_len = 0;

  /* Validate all arguments before compiling the pattern: these checks can
     raise a Lua error and a pattern compiled earlier would be leaked. */
  if (lua_type(L, 1) != LUA_TTABLE) luaL_checklstring(L, 1, NULL);
  const char* subject = luaL_checklstring(L, 2, &subject_len);
  const char* replacement = luaL_checklstring(L, 3, &replacement_len);
  lua_Integer limit = luaL_optinteger(L, 4, 0);
  if (limit < 0) limit = 0;

  bool regex_compiled = false;
  pcre2_code* re = regex_get_pattern(L, &regex_compiled);
  if (!re) return 0;

  pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(re, NULL);

  size_t buffer_size = 1024;
  char *output = (char *)malloc(buffer_size);

  /* Heap buffer currently used as subject (limited path only). The subject
     given by Lua is never owned by us. */
  char *owned_subject = NULL;

  uint32_t options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED;
  if (limit == 0) options |= PCRE2_SUBSTITUTE_GLOBAL;

  int rc = 0;               /* result of the last pcre2_substitute call */
  lua_Integer replaced = 0; /* replacements done on the limited path */
  size_t offset = 0;
  PCRE2_SIZE outlen = 0;

  bool running = true;
  if (!match_data || !output) {
    rc = PCRE2_ERROR_NOMEMORY;
    running = false;
  }

  while (running) {
    outlen = buffer_size;
    rc = pcre2_substitute(
      re,
      (PCRE2_SPTR)subject, subject_len,
      offset, options,
      match_data, NULL,
      (PCRE2_SPTR)replacement, replacement_len,
      (PCRE2_UCHAR*)output, &outlen
    );

    /* Output buffer too small: outlen holds the needed size, grow and retry.
       If growing fails rc stays PCRE2_ERROR_NOMEMORY and is reported. */
    if (rc == PCRE2_ERROR_NOMEMORY && outlen > buffer_size) {
      char *grown = (char *)realloc(output, outlen);
      if (!grown) break;
      output = grown;
      buffer_size = outlen;
      continue;
    }

    /* rc < 0:      error, reported after the cleanup below.
       limit == 0:  PCRE2_SUBSTITUTE_GLOBAL code path (fastest), one call.
       rc == 0:     limited path with no further match, output holds an
                    unmodified copy of the current subject. */
    if (rc <= 0 || limit == 0) break;

    /* non PCRE2_SUBSTITUTE_GLOBAL with limit code path (slower) */
    replaced++;

    /* position right after the inserted replacement inside `output` */
    PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(match_data);
    offset = ovector[1] + outlen - subject_len;

    /* the previous intermediate subject is not needed anymore */
    free(owned_subject);
    owned_subject = NULL;

    if (replaced == limit || offset > outlen) break;

    /* the output becomes the subject of the next round */
    owned_subject = output;
    subject = output;
    subject_len = outlen;
    output = (char *)malloc(buffer_size);
    if (!output) {
      rc = PCRE2_ERROR_NOMEMORY;
      break;
    }
  }

  int return_count = 0;
  if (rc >= 0) {
    lua_Integer total = (limit == 0) ? (lua_Integer)rc : replaced;
    if (total > 0)
      lua_pushlstring(L, (const char*) output, outlen);
    else
      lua_pushlstring(L, subject, subject_len);
    lua_pushinteger(L, total);
    return_count = 2;
  }

  free(output);
  free(owned_subject);
  if (match_data) pcre2_match_data_free(match_data);
  if (regex_compiled)
    pcre2_code_free(re);

  if (rc < 0) {
    PCRE2_UCHAR errmsg[256];
    pcre2_get_error_message(rc, errmsg, sizeof(errmsg));
    return luaL_error(L, "regex substitute error: %s", errmsg);
  }

  return return_count;
}
/* Maps the names visible from Lua to their C implementations; note that
   "cmatch" is implemented by f_pcre_match. The { NULL, NULL } entry
   terminates the list. */
static const luaL_Reg lib[] = {
{ "compile", f_pcre_compile },
{ "cmatch", f_pcre_match },
{ "gmatch", f_pcre_gmatch },
{ "gsub", f_pcre_gsub },
{ "__gc", f_pcre_gc },
{ NULL, NULL }
};