core: ported regex.gsub to faster native version (#1233)
* added regex.gmatch iterator and other fixes * fixed issues reported by Guldoman * push strings with fixed len just in case for binary safety * added limit to regex.gsub and use pushinteger * added description to regex.gsub limits param * replaced substitutions regex description for correctness * ignore negative limits on regex.gsub
This commit is contained in:
parent
5b5b5fd3e3
commit
3c64c32379
|
@ -216,7 +216,7 @@ command.add("core.docview!", {
|
|||
return text:gsub(old:gsub("%W", "%%%1"), new:gsub("%%", "%%%%"), nil)
|
||||
end
|
||||
local result, matches = regex.gsub(regex.compile(old, "m"), text, new)
|
||||
return result, #matches
|
||||
return result, matches
|
||||
end)
|
||||
end,
|
||||
|
||||
|
|
|
@ -80,63 +80,3 @@ regex.find = function(pattern, str, offset, options)
|
|||
end
|
||||
return table.unpack(out)
|
||||
end
|
||||
|
||||
-- Will iterate back through any UTF-8 bytes so that we don't replace bits
|
||||
-- mid character.
|
||||
-- Walks backwards from `index` to the first byte of the preceding UTF-8
-- character, so that callers never cut a multi-byte sequence in half.
--
-- Continuation bytes (128-191) are skipped; the scan stops on an ASCII byte
-- or on a lead byte (>= 192). Returns 0 when the start of the string is
-- reached without finding such a byte (e.g. `index <= 1` or malformed input
-- made only of continuation bytes) instead of raising an error by comparing
-- nil with a number.
local function previous_character(str, index)
  local byte
  repeat
    index = index - 1
    -- string.byte(str, 0) yields nothing and negative positions wrap around
    -- to the end of the string, so stop explicitly at the beginning.
    if index < 1 then return 0 end
    byte = string.byte(str, index)
  until not byte or byte < 128 or byte >= 192
  return index
end
|
||||
|
||||
-- Moves to the end of the identified character.
|
||||
-- Advances `index` over every UTF-8 continuation byte (128-191) that directly
-- follows it and returns the position of the last one, or `index` unchanged
-- when the next byte is missing or is not a continuation byte.
local function end_character(str, index)
  while true do
    local next_byte = string.byte(str, index + 1)
    if not next_byte or next_byte < 128 or next_byte >= 192 then
      return index
    end
    index = index + 1
  end
end
|
||||
|
||||
-- Build off matching. For now, only support basic replacements, but capture
|
||||
-- groupings should be doable. We can even have custom group replacements and
|
||||
-- transformations and stuff in lua. Currently, this takes group replacements
|
||||
-- as \1 - \9.
|
||||
-- Should work on UTF-8 text.
|
||||
-- Replaces the matches of `pattern_string` found in `str` with `replacement`.
--
-- `pattern_string` is either an already compiled regex (a table) or a pattern
-- string that gets compiled here. Capture groups are referenced from
-- `replacement` as \1 - \9; references left over after the existing groups
-- were substituted are removed. Scanning stops after the first empty match.
--
-- Returns three values: the resulting string, the list of index tuples as
-- returned by regex.cmatch for every match, and for every replacement a
-- `{ match_start, match_start + #replacement_text }` pair.
regex.gsub = function(pattern_string, str, replacement)
  local pattern = type(pattern_string) == "table" and
    pattern_string or regex.compile(pattern_string)
  local result, indices = {}
  local matches, replacements = {}, {}
  local offset = 0
  repeat
    indices = { regex.cmatch(pattern, str, offset) }
    if #indices > 0 then
      table.insert(matches, indices)
      local currentReplacement = replacement
      if #indices > 2 then
        for i = 1, (#indices/2 - 1) do
          local capture = str:sub(
            indices[i*2+1], end_character(str, indices[i*2+2]-1)
          )
          currentReplacement = string.gsub(
            currentReplacement,
            "\\" .. i,
            -- The captured text is inserted literally: '%' is magic in a
            -- string.gsub replacement string and must be escaped. The extra
            -- parentheses drop gsub's second return value.
            (capture:gsub("%%", "%%%%"))
          )
        end
      end
      currentReplacement = string.gsub(currentReplacement, "\\%d", "")
      table.insert(replacements, { indices[1], #currentReplacement+indices[1] })
      -- regex.cmatch reports 1-based byte positions, so the untouched text in
      -- front of the match ends exactly at indices[1] - 1. Stepping back a
      -- whole character here would drop the trailing bytes of a multi-byte
      -- character that directly precedes the match.
      table.insert(result, str:sub(offset, indices[1] - 1) .. currentReplacement)
      offset = indices[2]
    end
  until #indices == 0 or indices[1] == indices[2]
  return table.concat(result) .. str:sub(offset), matches, replacements
end
|
||||
|
||||
|
|
|
@ -41,7 +41,7 @@ regex.NOTEMPTY_ATSTART = 0x00000008
|
|||
---@param pattern string
|
||||
---@param options? regex.modifiers A string of one or more pattern modifiers.
|
||||
---
|
||||
---@return regex|nil regex Ready to use regular expression object or nil on error.
|
||||
---@return regex? regex Ready to use regular expression object or nil on error.
|
||||
---@return string? error The error message if compiling the pattern failed.
|
||||
function regex.compile(pattern, options) end
|
||||
|
||||
|
@ -53,8 +53,42 @@ function regex.compile(pattern, options) end
|
|||
---@param options? integer A bit field of matching options, eg:
|
||||
---regex.NOTBOL | regex.NOTEMPTY
|
||||
---
|
||||
---@return integer ... list List of offsets where a match was found.
|
||||
---@return integer? ... List of offsets where a match was found.
|
||||
function regex:cmatch(subject, offset, options) end
|
||||
|
||||
---
---Returns an iterator function that, each time it is called, returns the
---next captures from `pattern` over the string subject. When the pattern
---has no capture groups the whole match is returned instead.
---
---Example:
---```lua
---    local s = "hello world hello world"
---    for hello, world in regex.gmatch("(hello)\\s+(world)", s) do
---        print(hello .. " " .. world)
---    end
---```
---
---@param pattern regex|string A compiled regex or a pattern string.
---@param subject string
---@param offset? integer Position to start matching from, defaults to 1. Negative values count from the end of the subject.
---
---@return fun():string, ...
function regex.gmatch(pattern, subject, offset) end
|
||||
|
||||
---
---Substitutes the matches of the pattern found in the subject with the given
---replacement text. The replacement can refer to numbered groups using
---$[1-9][0-9]* and to named groups (declared as (?'name'<pattern>)) using
---${name}. An error is raised if the pattern can not be compiled or if the
---substitution itself is invalid.
---
---@param pattern regex|string
---@param subject string
---@param replacement string
---@param limit? integer Maximum amount of substitutions to perform, when omitted, zero or negative every match is replaced.
---
---@return string? replaced_subject
---@return integer? total_replacements
function regex.gsub(pattern, subject, replacement, limit) end
|
||||
|
||||
|
||||
return regex
|
||||
|
|
267
src/api/regex.c
267
src/api/regex.c
|
@ -4,6 +4,128 @@
|
|||
|
||||
#include <string.h>
|
||||
#include <pcre2.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
typedef struct RegexState {
|
||||
pcre2_code* re;
|
||||
pcre2_match_data* match_data;
|
||||
const char* subject;
|
||||
size_t subject_len;
|
||||
size_t offset;
|
||||
bool regex_compiled;
|
||||
bool found;
|
||||
} RegexState;
|
||||
|
||||
/*
 * Resolves the regex given as first Lua argument.
 *
 * - A table is treated as an already compiled regex object: the userdata
 *   stored in its first array slot is returned and `*should_free` is false,
 *   the caller must not free it. NULL is returned if that slot does not hold
 *   a userdata.
 * - Anything else is checked to be a string and compiled with PCRE2_UTF. On
 *   success `*should_free` is set to true and the caller owns the returned
 *   code (pcre2_code_free). A compile failure raises a Lua error.
 */
static pcre2_code* regex_get_pattern(lua_State *L, bool* should_free) {
  *should_free = false;

  if (lua_type(L, 1) == LUA_TTABLE) {
    lua_rawgeti(L, 1, 1);
    pcre2_code* compiled = (pcre2_code*)lua_touserdata(L, -1);
    lua_pop(L, 1);
    return compiled;
  }

  int errornumber;
  PCRE2_SIZE erroroffset;
  size_t pattern_len = 0;
  const char* pattern = luaL_checklstring(L, 1, &pattern_len);

  pcre2_code* re = pcre2_compile(
    (PCRE2_SPTR)pattern,
    pattern_len, PCRE2_UTF,
    &errornumber, &erroroffset, NULL
  );

  if (re == NULL) {
    PCRE2_UCHAR errmsg[256];
    pcre2_get_error_message(errornumber, errmsg, sizeof(errmsg));
    luaL_error(
      L, "regex pattern error at offset %d: %s",
      (int)erroroffset, errmsg
    );
    return NULL; /* not reached, luaL_error does not return */
  }

  /* pcre2_config() stores the answer in its second argument; called with
     NULL it only returns the size needed for the result, so the JIT
     availability has to be read from the output variable. */
  uint32_t jit_available = 0;
  if (pcre2_config(PCRE2_CONFIG_JIT, &jit_available) >= 0 && jit_available == 1) {
    pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
  }

  *should_free = true;
  return re;
}
|
||||
|
||||
static int regex_gmatch_iterator(lua_State *L) {
|
||||
RegexState *state = (RegexState*)lua_touserdata(L, lua_upvalueindex(3));
|
||||
|
||||
if (state->found) {
|
||||
int rc = pcre2_match(
|
||||
state->re,
|
||||
(PCRE2_SPTR)state->subject, state->subject_len,
|
||||
state->offset, 0, state->match_data, NULL
|
||||
);
|
||||
|
||||
if (rc < 0) {
|
||||
if (rc != PCRE2_ERROR_NOMATCH) {
|
||||
PCRE2_UCHAR buffer[120];
|
||||
pcre2_get_error_message(rc, buffer, sizeof(buffer));
|
||||
luaL_error(L, "regex matching error %d: %s", rc, buffer);
|
||||
}
|
||||
goto clean;
|
||||
} else {
|
||||
size_t ovector_count = pcre2_get_ovector_count(state->match_data);
|
||||
if (ovector_count > 0) {
|
||||
PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(state->match_data);
|
||||
if (ovector[0] > ovector[1]) {
|
||||
/* We must guard against patterns such as /(?=.\K)/ that use \K in an
|
||||
assertion to set the start of a match later than its end. In the editor,
|
||||
we just detect this case and give up. */
|
||||
luaL_error(L, "regex matching error: \\K was used in an assertion to "
|
||||
" set the match start after its end");
|
||||
goto clean;
|
||||
}
|
||||
|
||||
int index = 0;
|
||||
if (ovector_count > 1) index = 2;
|
||||
|
||||
int total = 0;
|
||||
int total_results = ovector_count * 2;
|
||||
size_t last_offset = 0;
|
||||
for (int i = index; i < total_results; i+=2) {
|
||||
lua_pushlstring(L, state->subject+ovector[i], ovector[i+1] - ovector[i]);
|
||||
last_offset = ovector[i+1];
|
||||
total++;
|
||||
}
|
||||
|
||||
if (last_offset - 1 < state->subject_len)
|
||||
state->offset = last_offset;
|
||||
else
|
||||
state->found = false;
|
||||
|
||||
return total;
|
||||
} else {
|
||||
state->found = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
clean:
|
||||
if (state->regex_compiled) pcre2_code_free(state->re);
|
||||
pcre2_match_data_free(state->match_data);
|
||||
|
||||
return 0; /* not found */
|
||||
}
|
||||
|
||||
/*
 * Translates a Lua style string position into an absolute 1-based position
 * for a string of `len` bytes.
 *
 * - positive positions are returned as they are, clamped to `len + 1`
 *   (one past the end) so that callers which subtract the result from the
 *   length or index the string with it can not go out of bounds
 * - 0 is treated as 1
 * - negative positions count from the end and are clipped to 1
 */
static size_t regex_offset_relative(lua_Integer pos, size_t len) {
  if (pos > 0) {
    if ((unsigned long long)pos > (unsigned long long)len)
      return len + 1; /* clip to one past the end */
    return (size_t)pos;
  }
  else if (pos == 0)
    return 1;
  else if (pos < -(lua_Integer)len) /* inverted comparison */
    return 1; /* clip to 1 */
  else return len + (size_t)pos + 1;
}
|
||||
|
||||
static int f_pcre_gc(lua_State* L) {
|
||||
lua_rawgeti(L, -1, 1);
|
||||
|
@ -56,19 +178,21 @@ static int f_pcre_compile(lua_State *L) {
|
|||
// (including the whole match), if a match was found.
|
||||
static int f_pcre_match(lua_State *L) {
|
||||
size_t len, offset = 1, opts = 0;
|
||||
luaL_checktype(L, 1, LUA_TTABLE);
|
||||
bool regex_compiled = false;
|
||||
pcre2_code* re = regex_get_pattern(L, ®ex_compiled);
|
||||
if (!re) return 0 ;
|
||||
const char* str = luaL_checklstring(L, 2, &len);
|
||||
if (lua_gettop(L) > 2)
|
||||
offset = luaL_checknumber(L, 3);
|
||||
offset = regex_offset_relative(luaL_checknumber(L, 3), len);
|
||||
offset -= 1;
|
||||
len -= offset;
|
||||
if (lua_gettop(L) > 3)
|
||||
opts = luaL_checknumber(L, 4);
|
||||
lua_rawgeti(L, 1, 1);
|
||||
pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1);
|
||||
pcre2_match_data* md = pcre2_match_data_create_from_pattern(re, NULL);
|
||||
int rc = pcre2_match(re, (PCRE2_SPTR)&str[offset], len, 0, opts, md, NULL);
|
||||
if (rc < 0) {
|
||||
if (regex_compiled) pcre2_code_free(re);
|
||||
pcre2_match_data_free(md);
|
||||
if (rc != PCRE2_ERROR_NOMATCH) {
|
||||
PCRE2_UCHAR buffer[120];
|
||||
|
@ -84,18 +208,155 @@ static int f_pcre_match(lua_State *L) {
|
|||
we just detect this case and give up. */
|
||||
luaL_error(L, "regex matching error: \\K was used in an assertion to "
|
||||
" set the match start after its end");
|
||||
if (regex_compiled) pcre2_code_free(re);
|
||||
pcre2_match_data_free(md);
|
||||
return 0;
|
||||
}
|
||||
for (int i = 0; i < rc*2; i++)
|
||||
lua_pushinteger(L, ovector[i]+offset+1);
|
||||
if (regex_compiled) pcre2_code_free(re);
|
||||
pcre2_match_data_free(md);
|
||||
return rc*2;
|
||||
}
|
||||
|
||||
/*
 * regex.gmatch(pattern, subject [, offset])
 *
 * Returns an iterator closure (regex_gmatch_iterator) with three upvalues:
 * the pattern, the subject and a RegexState userdata. The plain arguments
 * are validated and the state is allocated before the pattern is compiled,
 * so that an argument or allocation error can not leak a freshly compiled
 * regex.
 */
static int f_pcre_gmatch(lua_State *L) {
  /* subject param */
  size_t subject_len = 0;
  const char* subject = luaL_checklstring(L, 2, &subject_len);

  /* offset param */
  size_t offset = regex_offset_relative(
    (lua_Integer)luaL_optnumber(L, 3, 1), subject_len
  ) - 1;
  /* a start offset past the end is an error for pcre2_match */
  if (offset > subject_len) offset = subject_len;

  /* keep strings on closure to avoid being collected */
  lua_settop(L, 2);

  RegexState *state;
  state = (RegexState*)lua_newuserdatauv(L, sizeof(RegexState), 0);

  /* pattern param */
  bool regex_compiled = false;
  pcre2_code* re = regex_get_pattern(L, &regex_compiled);
  if (!re) return 0;

  pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(re, NULL);
  if (match_data == NULL) {
    if (regex_compiled) pcre2_code_free(re);
    return luaL_error(L, "regex matching error: unable to allocate match data");
  }

  state->re = re;
  state->match_data = match_data;
  state->subject = subject;
  state->subject_len = subject_len;
  state->offset = offset;
  state->found = true;
  state->regex_compiled = regex_compiled;

  lua_pushcclosure(L, regex_gmatch_iterator, 3);
  return 1;
}
|
||||
|
||||
/*
 * regex.gsub(pattern, subject, replacement [, limit])
 *
 * Replaces the matches of `pattern` in `subject` using pcre2_substitute()
 * with extended replacement syntax. Returns the resulting string and the
 * number of replacements done, or the unchanged subject and 0 when nothing
 * matched. Raises a Lua error if the substitution fails.
 *
 * Without a limit (omitted, zero or negative) a single global substitution
 * is performed. With a limit, one substitution is done per pass and its
 * output becomes the subject of the next pass, which starts right after the
 * text that was just inserted.
 */
static int f_pcre_gsub(lua_State *L) {
  size_t subject_len = 0, replacement_len = 0;

  /* validate the plain arguments before compiling the pattern, an argument
     error raised afterwards would leak the compiled regex */
  const char* subject = luaL_checklstring(L, 2, &subject_len);
  const char* replacement = luaL_checklstring(L, 3, &replacement_len);
  lua_Integer limit = luaL_optinteger(L, 4, 0);
  if (limit < 0) limit = 0;

  bool regex_compiled = false;
  pcre2_code* re = regex_get_pattern(L, &regex_compiled);
  if (!re) return 0;

  pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(re, NULL);

  size_t buffer_size = 1024;
  char *output = (char *)malloc(buffer_size);
  /* heap buffer currently used as subject on limited passes, NULL while the
     subject is still the string owned by Lua */
  char *previous = NULL;

  int options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED;
  if (limit == 0) options |= PCRE2_SUBSTITUTE_GLOBAL;

  int results_count = 0;
  lua_Integer replaced = 0;
  size_t offset = 0;
  PCRE2_SIZE outlen = buffer_size;

  bool done = false;
  if (match_data == NULL || output == NULL) {
    results_count = PCRE2_ERROR_NOMEMORY;
    done = true;
  }

  while (!done) {
    outlen = buffer_size;
    results_count = pcre2_substitute(
      re,
      (PCRE2_SPTR)subject, subject_len,
      offset, options,
      match_data, NULL,
      (PCRE2_SPTR)replacement, replacement_len,
      (PCRE2_UCHAR*)output, &outlen
    );

    /* output too small: outlen now holds the required size, grow and retry */
    if (results_count == PCRE2_ERROR_NOMEMORY && outlen > buffer_size) {
      char *grown = (char *)realloc(output, outlen);
      if (grown == NULL) break; /* reported as an error below */
      output = grown;
      buffer_size = outlen;
      continue;
    }

    /* PCRE2_SUBSTITUTE_GLOBAL code path (fastest) needs a single call;
       errors and "no (further) match" end the limited code path too */
    if (limit == 0 || results_count <= 0) break;

    /* non PCRE2_SUBSTITUTE_GLOBAL with limit code path (slower) */
    replaced++;
    PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(match_data);

    /* end of the inserted text inside output: end of the match in the
       subject shifted by the amount the text grew or shrank */
    offset = ovector[1] + outlen - subject_len;

    if (replaced == limit) break;

    if (ovector[0] == ovector[1]) {
      /* Empty match: step over one character (and its UTF-8 continuation
         bytes), otherwise the next pass matches at the same spot again. */
      if (offset >= outlen) break;
      offset++;
      while (offset < outlen && ((unsigned char)output[offset] & 0xC0) == 0x80)
        offset++;
    }

    /* the output of this pass is the subject of the next one */
    free(previous);
    previous = output;
    subject = output;
    subject_len = outlen;
    output = (char *)malloc(buffer_size);
    if (output == NULL) {
      results_count = PCRE2_ERROR_NOMEMORY;
      break;
    }
  }

  int return_count = 0;

  if (results_count >= 0) {
    lua_Integer total = (limit == 0) ? (lua_Integer)results_count : replaced;
    if (total > 0)
      lua_pushlstring(L, (const char*) output, outlen);
    else
      lua_pushlstring(L, subject, subject_len); /* still the original subject */
    lua_pushinteger(L, total);
    return_count = 2;
  }

  free(previous);
  free(output);
  pcre2_match_data_free(match_data);
  if (regex_compiled)
    pcre2_code_free(re);

  if (results_count < 0) {
    PCRE2_UCHAR errmsg[256];
    pcre2_get_error_message(results_count, errmsg, sizeof(errmsg));
    return luaL_error(L, "regex substitute error: %s", errmsg);
  }

  return return_count;
}
|
||||
|
||||
static const luaL_Reg lib[] = {
|
||||
{ "compile", f_pcre_compile },
|
||||
{ "cmatch", f_pcre_match },
|
||||
{ "gmatch", f_pcre_gmatch },
|
||||
{ "gsub", f_pcre_gsub },
|
||||
{ "__gc", f_pcre_gc },
|
||||
{ NULL, NULL }
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue