diff --git a/data/core/commands/findreplace.lua b/data/core/commands/findreplace.lua
index 3966dbf4..4a456aae 100644
--- a/data/core/commands/findreplace.lua
+++ b/data/core/commands/findreplace.lua
@@ -216,7 +216,7 @@ command.add("core.docview!", {
         return text:gsub(old:gsub("%W", "%%%1"), new:gsub("%%", "%%%%"), nil)
       end
       local result, matches = regex.gsub(regex.compile(old, "m"), text, new)
-      return result, #matches
+      return result, matches
     end)
   end,
 
diff --git a/data/core/regex.lua b/data/core/regex.lua
index d3a5c943..61089283 100644
--- a/data/core/regex.lua
+++ b/data/core/regex.lua
@@ -80,63 +80,3 @@ regex.find = function(pattern, str, offset, options)
   end
   return table.unpack(out)
 end
-
--- Will iterate back through any UTF-8 bytes so that we don't replace bits
--- mid character.
-local function previous_character(str, index)
-  local byte
-  repeat
-    index = index - 1
-    byte = string.byte(str, index)
-  until byte < 128 or byte >= 192
-  return index
-end
-
--- Moves to the end of the identified character.
-local function end_character(str, index)
-  local byte = string.byte(str, index + 1)
-  while byte and byte >= 128 and byte < 192 do
-    index = index + 1
-    byte = string.byte(str, index + 1)
-  end
-  return index
-end
-
--- Build off matching. For now, only support basic replacements, but capture
--- groupings should be doable. We can even have custom group replacements and
--- transformations and stuff in lua. Currently, this takes group replacements
--- as \1 - \9.
--- Should work on UTF-8 text.
-regex.gsub = function(pattern_string, str, replacement)
-  local pattern = type(pattern_string) == "table" and
-    pattern_string or regex.compile(pattern_string)
-  local result, indices = {}
-  local matches, replacements = {}, {}
-  local offset = 0
-  repeat
-    indices = { regex.cmatch(pattern, str, offset) }
-    if #indices > 0 then
-      table.insert(matches, indices)
-      local currentReplacement = replacement
-      if #indices > 2 then
-        for i = 1, (#indices/2 - 1) do
-          currentReplacement = string.gsub(
-            currentReplacement,
-            "\\" .. i,
-            str:sub(indices[i*2+1], end_character(str,indices[i*2+2]-1))
-          )
-        end
-      end
-      currentReplacement = string.gsub(currentReplacement, "\\%d", "")
-      table.insert(replacements, { indices[1], #currentReplacement+indices[1] })
-      if indices[1] > 1 then
-        table.insert(result, str:sub(offset, previous_character(str, indices[1])) .. currentReplacement)
-      else
-        table.insert(result, currentReplacement)
-      end
-      offset = indices[2]
-    end
-  until #indices == 0 or indices[1] == indices[2]
-  return table.concat(result) .. str:sub(offset), matches, replacements
-end
-
diff --git a/docs/api/regex.lua b/docs/api/regex.lua
index 2792f4f6..0ff66479 100644
--- a/docs/api/regex.lua
+++ b/docs/api/regex.lua
@@ -41,7 +41,7 @@ regex.NOTEMPTY_ATSTART = 0x00000008
 ---@param pattern string
 ---@param options? regex.modifiers A string of one or more pattern modifiers.
 ---
----@return regex|nil regex Ready to use regular expression object or nil on error.
+---@return regex? regex Ready to use regular expression object or nil on error.
 ---@return string? error The error message if compiling the pattern failed.
 function regex.compile(pattern, options) end
 
@@ -53,8 +53,42 @@ function regex.compile(pattern, options) end
 ---@param options? integer A bit field of matching options, eg:
 ---regex.NOTBOL | regex.NOTEMPTY
 ---
----@return integer ... list List of offsets where a match was found.
+---@return integer? ... List of offsets where a match was found.
 function regex:cmatch(subject, offset, options) end
 
+---
+---Returns an iterator function that, each time it is called, returns the
+---next captures from `pattern` over the string subject.
+---
+---Example:
+---```lua
+---    local s = "hello world hello world"
+---    for hello, world in regex.gmatch("(hello)\\s+(world)", s) do
+---        print(hello .. " " .. world)
+---    end
+---```
+---
+---@param pattern regex|string
+---@param subject string
+---@param offset? integer
+---
+---@return fun():string, ...
+function regex.gmatch(pattern, subject, offset) end
+
+---
+---Replaces the matched pattern globally on the subject with the given
+---replacement, supports named captures ((?'name'...), ${name}) and
+---$[1-9][0-9]* substitutions. Raises an error when failing to compile the
+---pattern or by a substitution mistake.
+---
+---@param pattern regex|string
+---@param subject string
+---@param replacement string
+---@param limit? integer Limits the number of substitutions that will be done.
+---
+---@return string? replaced_subject
+---@return integer? total_replacements
+function regex.gsub(pattern, subject, replacement, limit) end
+
 
 return regex
diff --git a/src/api/regex.c b/src/api/regex.c
index d23eaf71..8e525048 100644
--- a/src/api/regex.c
+++ b/src/api/regex.c
@@ -4,6 +4,178 @@
 
 #include <string.h>
 #include <pcre2.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+/* Registry name of the metatable that finalizes abandoned gmatch states. */
+#define REGEX_STATE_MT "regex_gmatch_state"
+
+/* State kept by the closure that regex.gmatch returns. */
+typedef struct RegexState {
+  pcre2_code* re;               /* pattern used for every match attempt */
+  pcre2_match_data* match_data; /* reused by every match attempt */
+  const char* subject;          /* Lua string pinned as closure upvalue */
+  size_t subject_len;
+  size_t offset;                /* 0-based byte offset of the next attempt */
+  bool regex_compiled;          /* true when re was compiled here and is owned */
+  bool found;                   /* false once the iteration is over */
+} RegexState;
+
+/*
+ * Returns the pcre2 pattern for argument 1: the userdata stored at index 1 of
+ * a table, or a pattern compiled from a string, in which case *should_free is
+ * set and the caller must release it with pcre2_code_free(). Raises a Lua
+ * error when the string does not compile.
+ */
+static pcre2_code* regex_get_pattern(lua_State *L, bool* should_free) {
+  pcre2_code* re = NULL;
+  *should_free = false;
+
+  if (lua_type(L, 1) == LUA_TTABLE) {
+    lua_rawgeti(L, 1, 1);
+    re = (pcre2_code*)lua_touserdata(L, -1);
+    lua_settop(L, -2);
+  } else {
+    int errornumber;
+    PCRE2_SIZE erroroffset;
+    size_t pattern_len = 0;
+    const char* pattern = luaL_checklstring(L, 1, &pattern_len);
+
+    re = pcre2_compile(
+      (PCRE2_SPTR)pattern,
+      pattern_len, PCRE2_UTF,
+      &errornumber, &erroroffset, NULL
+    );
+
+    if (re == NULL) {
+      PCRE2_UCHAR errmsg[256];
+      pcre2_get_error_message(errornumber, errmsg, sizeof(errmsg));
+      luaL_error(
+        L, "regex pattern error at offset %d: %s",
+        (int)erroroffset, errmsg
+      );
+      return NULL;
+    }
+
+    /* pcre2_config() reports the setting through its second argument; with
+       NULL it only returns the size needed to hold the value */
+    uint32_t jit_available = 0;
+    pcre2_config(PCRE2_CONFIG_JIT, &jit_available);
+    if (jit_available) {
+      pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
+    }
+
+    *should_free = true;
+  }
+
+  return re;
+}
+
+/* Frees whatever the state still owns; safe to call more than once. */
+static void regex_state_release(RegexState *state) {
+  if (state->regex_compiled) pcre2_code_free(state->re);
+  pcre2_match_data_free(state->match_data);
+  state->re = NULL;
+  state->match_data = NULL;
+  state->regex_compiled = false;
+  state->found = false;
+}
+
+/* __gc of the iterator state, for loops abandoned before exhaustion. */
+static int regex_state_gc(lua_State *L) {
+  RegexState *state = (RegexState*)lua_touserdata(L, 1);
+  if (state) regex_state_release(state);
+  return 0;
+}
+
+/*
+ * Iterator returned by regex.gmatch; upvalue 3 holds its RegexState. Each
+ * call pushes the capture groups of the next match, or the whole match when
+ * the pattern has no groups, and returns nothing once matching fails.
+ */
+static int regex_gmatch_iterator(lua_State *L) {
+  RegexState *state = (RegexState*)lua_touserdata(L, lua_upvalueindex(3));
+
+  if (!state->found) {
+    regex_state_release(state);
+    return 0; /* not found */
+  }
+
+  int rc = pcre2_match(
+    state->re,
+    (PCRE2_SPTR)state->subject, state->subject_len,
+    state->offset, 0, state->match_data, NULL
+  );
+
+  if (rc < 0) {
+    /* release first: luaL_error() does not return */
+    regex_state_release(state);
+    if (rc != PCRE2_ERROR_NOMATCH) {
+      PCRE2_UCHAR buffer[120];
+      pcre2_get_error_message(rc, buffer, sizeof(buffer));
+      return luaL_error(L, "regex matching error %d: %s", rc, buffer);
+    }
+    return 0; /* not found */
+  }
+
+  uint32_t ovector_count = pcre2_get_ovector_count(state->match_data);
+  PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(state->match_data);
+  if (ovector[0] > ovector[1]) {
+    /* We must guard against patterns such as /(?=.\K)/ that use \K in an
+       assertion to set the start of a match later than its end. In the editor,
+       we just detect this case and give up. */
+    regex_state_release(state);
+    return luaL_error(L, "regex matching error: \\K was used in an assertion to "
+    " set the match start after its end");
+  }
+
+  /* pair 0 is the whole match; it is skipped when the pattern has groups */
+  uint32_t first = ovector_count > 1 ? 1 : 0;
+  luaL_checkstack(L, (int)(ovector_count - first), "too many captures");
+
+  int total = 0;
+  for (uint32_t i = first; i < ovector_count; i++) {
+    PCRE2_SIZE start = ovector[i * 2], end = ovector[i * 2 + 1];
+    if (start == PCRE2_UNSET || end < start) {
+      /* the group took no part in this match */
+      lua_pushliteral(L, "");
+    } else {
+      lua_pushlstring(L, state->subject + start, end - start);
+    }
+    total++;
+  }
+
+  /* resume after the whole match rather than after the last capture */
+  size_t next = ovector[1];
+  if (ovector[0] == ovector[1]) {
+    /* empty match: step over one UTF-8 character so the loop advances */
+    next++;
+    while (next < state->subject_len
+           && ((unsigned char)state->subject[next] & 0xC0) == 0x80)
+      next++;
+  }
+
+  if (next <= state->subject_len)
+    state->offset = next;
+  else
+    state->found = false;
+
+  return total;
+}
+
+/*
+ * Turns a 1-based Lua position, negative values counting from the end, into
+ * a 1-based position clamped to the range [1, len + 1].
+ */
+static size_t regex_offset_relative(lua_Integer pos, size_t len) {
+  if (pos > 0) /* clamp so callers never start past the end of the subject */
+    return (size_t)pos > len ? len + 1 : (size_t)pos;
+  else if (pos == 0)
+    return 1;
+  else if (pos < -(lua_Integer)len)
+    return 1; /* clip to 1 */
+  else return len + (size_t)pos + 1;
+}
 
 static int f_pcre_gc(lua_State* L) {
   lua_rawgeti(L, -1, 1);
@@ -56,19 +228,22 @@ static int f_pcre_compile(lua_State *L) {
 // (including the whole match), if a match was found.
 static int f_pcre_match(lua_State *L) {
   size_t len, offset = 1, opts = 0;
-  luaL_checktype(L, 1, LUA_TTABLE);
+  bool regex_compiled = false;
   const char* str = luaL_checklstring(L, 2, &len);
   if (lua_gettop(L) > 2)
-    offset = luaL_checknumber(L, 3);
+    offset = regex_offset_relative((lua_Integer)luaL_checknumber(L, 3), len);
   offset -= 1;
   len -= offset;
   if (lua_gettop(L) > 3)
     opts = luaL_checknumber(L, 4);
-  lua_rawgeti(L, 1, 1);
-  pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1);
+  /* Fetch the pattern last so that an argument error above cannot leak a
+     pattern compiled from a string. */
+  pcre2_code* re = regex_get_pattern(L, &regex_compiled);
+  if (!re) return 0;
   pcre2_match_data* md = pcre2_match_data_create_from_pattern(re, NULL);
   int rc = pcre2_match(re, (PCRE2_SPTR)&str[offset], len, 0, opts, md, NULL);
   if (rc < 0) {
+    if (regex_compiled) pcre2_code_free(re);
     pcre2_match_data_free(md);
     if (rc != PCRE2_ERROR_NOMATCH) {
       PCRE2_UCHAR buffer[120];
@@ -84,18 +259,192 @@ static int f_pcre_match(lua_State *L) {
        we just detect this case and give up. */
+    /* Release everything before raising: luaL_error() does not return. */
+    if (regex_compiled) pcre2_code_free(re);
+    pcre2_match_data_free(md);
     luaL_error(L, "regex matching error: \\K was used in an assertion to "
     " set the match start after its end");
-    pcre2_match_data_free(md);
     return 0;
   }
   for (int i = 0; i < rc*2; i++)
     lua_pushinteger(L, ovector[i]+offset+1);
+  if (regex_compiled) pcre2_code_free(re);
   pcre2_match_data_free(md);
   return rc*2;
 }
 
+/*
+ * regex.gmatch(pattern, subject [, offset]): returns an iterator closure over
+ * the matches of pattern in subject, searching from the 1-based offset.
+ */
+static int f_pcre_gmatch(lua_State *L) {
+  /* subject param */
+  size_t subject_len = 0;
+  const char* subject = luaL_checklstring(L, 2, &subject_len);
+
+  /* offset param */
+  size_t offset = regex_offset_relative(
+    (lua_Integer)luaL_optnumber(L, 3, 1), subject_len
+  ) - 1;
+
+  /* keep strings on closure to avoid being collected */
+  lua_settop(L, 2);
+
+  RegexState *state;
+  state = (RegexState*)lua_newuserdatauv(L, sizeof(RegexState), 0);
+  state->re = NULL;
+  state->match_data = NULL;
+  state->subject = subject;
+  state->subject_len = subject_len;
+  state->offset = offset;
+  state->regex_compiled = false;
+  state->found = false;
+
+  /* Attach the finalizer before the state owns anything, so that errors
+     raised below and loops abandoned early still release it. */
+  if (luaL_newmetatable(L, REGEX_STATE_MT)) {
+    lua_pushcfunction(L, regex_state_gc);
+    lua_setfield(L, -2, "__gc");
+  }
+  lua_setmetatable(L, -2);
+
+  /* pattern param */
+  state->re = regex_get_pattern(L, &state->regex_compiled);
+  if (!state->re) return 0;
+
+  state->match_data = pcre2_match_data_create_from_pattern(state->re, NULL);
+  if (!state->match_data)
+    return luaL_error(L, "regex matching error: not enough memory");
+  state->found = true;
+
+  lua_pushcclosure(L, regex_gmatch_iterator, 3);
+  return 1;
+}
+
+/*
+ * regex.gsub(pattern, subject, replacement [, limit]): returns the subject
+ * with the matches replaced, followed by the number of replacements made.
+ * Every match is replaced unless limit is greater than zero. Raises a Lua
+ * error when the substitution fails.
+ */
+static int f_pcre_gsub(lua_State *L) {
+  size_t subject_len = 0, replacement_len = 0;
+
+  /* Check the plain arguments before fetching the pattern so that an argument
+     error cannot leak a pattern compiled from a string. */
+  const char* subject = luaL_checklstring(L, 2, &subject_len);
+  const char* replacement = luaL_checklstring(L, 3, &replacement_len);
+  int limit = (int)luaL_optinteger(L, 4, 0);
+  if (limit < 0) limit = 0;
+
+  bool regex_compiled = false;
+  pcre2_code* re = regex_get_pattern(L, &regex_compiled);
+  if (!re) return 0;
+
+  pcre2_match_data* match_data = pcre2_match_data_create_from_pattern(re, NULL);
+
+  size_t buffer_size = 1024;
+  char *output = (char *)malloc(buffer_size);
+  /* result of the previous limited pass while it serves as the subject */
+  char *owned_subject = NULL;
+
+  int options = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED;
+  if (limit == 0) options |= PCRE2_SUBSTITUTE_GLOBAL;
+
+  int results_count = 0;
+  int limit_count = 0;
+  bool done = false;
+  size_t offset = 0;
+  PCRE2_SIZE outlen = buffer_size;
+
+  if (!match_data || !output) {
+    results_count = PCRE2_ERROR_NOMEMORY;
+    done = true;
+  }
+
+  while (!done) {
+    outlen = buffer_size;
+    results_count = pcre2_substitute(
+      re,
+      (PCRE2_SPTR)subject, subject_len,
+      offset, options,
+      match_data, NULL,
+      (PCRE2_SPTR)replacement, replacement_len,
+      (PCRE2_UCHAR*)output, &outlen
+    );
+
+    if (results_count == PCRE2_ERROR_NOMEMORY && outlen > buffer_size) {
+      /* output buffer too small: outlen now holds the size that is needed */
+      char *grown = (char *)realloc(output, outlen);
+      if (grown) {
+        output = grown;
+        buffer_size = outlen;
+      } else {
+        done = true;
+      }
+    } else if (limit == 0 || results_count <= 0) {
+      /* PCRE2_SUBSTITUTE_GLOBAL code path (fastest), or a limited pass that
+         found nothing more or failed; errors stay negative so they are
+         reported below instead of being mistaken for a count */
+      if (results_count == 0) results_count = limit_count;
+      done = true;
+    } else {
+      /* non PCRE2_SUBSTITUTE_GLOBAL with limit code path (slower) */
+      limit_count++;
+      PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(match_data);
+      if (outlen > subject_len) {
+        offset = ovector[1] + (outlen - subject_len);
+      } else {
+        offset = ovector[1] - (subject_len - outlen);
+      }
+      if (limit_count == limit || offset-1 == outlen) {
+        done = true;
+        results_count = limit_count;
+      } else {
+        /* the text produced so far is the subject of the next pass */
+        free(owned_subject);
+        owned_subject = output;
+        subject = output;
+        subject_len = outlen;
+        output = (char *)malloc(buffer_size);
+        if (!output) {
+          results_count = PCRE2_ERROR_NOMEMORY;
+          done = true;
+        }
+      }
+    }
+  }
+
+  int return_count = 0;
+
+  if (results_count > 0) {
+    lua_pushlstring(L, (const char*) output, outlen);
+    lua_pushinteger(L, results_count);
+    return_count = 2;
+  } else if (results_count == 0) {
+    lua_pushlstring(L, subject, subject_len);
+    lua_pushinteger(L, 0);
+    return_count = 2;
+  }
+
+  free(output);
+  free(owned_subject);
+  pcre2_match_data_free(match_data);
+  if (regex_compiled)
+    pcre2_code_free(re);
+
+  if (results_count < 0) {
+    PCRE2_UCHAR errmsg[256];
+    pcre2_get_error_message(results_count, errmsg, sizeof(errmsg));
+    return luaL_error(L, "regex substitute error: %s", errmsg);
+  }
+
+  return return_count;
+}
+
 static const luaL_Reg lib[] = {
   { "compile", f_pcre_compile },
   { "cmatch",  f_pcre_match },
+  { "gmatch",  f_pcre_gmatch },
+  { "gsub",    f_pcre_gsub },
   { "__gc",    f_pcre_gc },
   { NULL,      NULL }
 };