From fbb893c6b1b1c17f0ad2d755e2a5f128ecbc17ce Mon Sep 17 00:00:00 2001 From: Guldoman Date: Thu, 3 Mar 2022 22:09:48 +0100 Subject: [PATCH 1/2] Fix `^` regex matching when using an offset Before, if `offset > 1` was used, a pattern starting with `^` would always fail to match, because `^` only matches at the true beginning of the subject string, which lies before the offset. Now we force the beginning of the string to be the one specified by the offset. --- src/api/regex.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/api/regex.c b/src/api/regex.c index 08f0f142..6a0aac7a 100644 --- a/src/api/regex.c +++ b/src/api/regex.c @@ -60,12 +60,14 @@ static int f_pcre_match(lua_State *L) { const char* str = luaL_checklstring(L, 2, &len); if (lua_gettop(L) > 2) offset = luaL_checknumber(L, 3); + offset -= 1; + len -= offset; if (lua_gettop(L) > 3) opts = luaL_checknumber(L, 4); lua_rawgeti(L, 1, 1); pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1); pcre2_match_data* md = pcre2_match_data_create_from_pattern(re, NULL); - int rc = pcre2_match(re, (PCRE2_SPTR)str, len, offset - 1, opts, md, NULL); + int rc = pcre2_match(re, (PCRE2_SPTR)&str[offset], len, 0, opts, md, NULL); if (rc < 0) { pcre2_match_data_free(md); if (rc != PCRE2_ERROR_NOMATCH) { @@ -86,7 +88,7 @@ static int f_pcre_match(lua_State *L) { return 0; } for (int i = 0; i < rc*2; i++) - lua_pushnumber(L, ovector[i]+1); + lua_pushnumber(L, ovector[i]+offset+1); pcre2_match_data_free(md); return rc*2; } From caefc9112ae033751f05383080e120d59409efaa Mon Sep 17 00:00:00 2001 From: Guldoman Date: Fri, 4 Mar 2022 11:27:01 +0100 Subject: [PATCH 2/2] Force syntax patterns starting with `^` to match with the whole line Before, syntax patterns/regexes that started with `^` didn't have the desired effect of matching only at the start of the line. Now those patterns are tried only when the search begins at the start of the line. 
--- data/core/tokenizer.lua | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua index 5fd8c69f..bb3faa03 100644 --- a/data/core/tokenizer.lua +++ b/data/core/tokenizer.lua @@ -136,20 +136,42 @@ function tokenizer.tokenize(incoming_syntax, text, state) end local function find_text(text, p, offset, at_start, close) - local target, res = p.pattern or p.regex, { 1, offset - 1 }, p.regex - local code = type(target) == "table" and target[close and 2 or 1] or target + local target, res = p.pattern or p.regex, { 1, offset - 1 } + local p_idx = close and 2 or 1 + local code = type(target) == "table" and target[p_idx] or target + + if p.whole_line == nil then p.whole_line = { } end + if p.whole_line[p_idx] == nil then + -- Match patterns that start with '^' + p.whole_line[p_idx] = code:match("^%^") and true or false + if p.whole_line[p_idx] then + -- Remove '^' from the beginning of the pattern + if type(target) == "table" then + target[p_idx] = code:sub(2) + else + p.pattern = p.pattern and code:sub(2) + p.regex = p.regex and code:sub(2) + end + end + end + if p.regex and type(p.regex) ~= "table" then p._regex = p._regex or regex.compile(p.regex) code = p._regex - end + end + repeat local next = res[2] + 1 + -- If the pattern contained '^', allow matching only the whole line + if p.whole_line[p_idx] and next > 1 then + return + end -- go to the start of the next utf-8 character while text:byte(next) and common.is_utf8_cont(text, next) do next = next + 1 end - res = p.pattern and { text:find(at_start and "^" .. code or code, next) } - or { regex.match(code, text, next, at_start and regex.ANCHORED or 0) } + res = p.pattern and { text:find((at_start or p.whole_line[p_idx]) and "^" .. 
code or code, next) } + or { regex.match(code, text, next, (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) } if res[1] and close and target[3] then local count = 0 for i = res[1] - 1, 1, -1 do