Merge pull request #612 from Guldoman/fix_regex

Fix regex in tokenizer
2021-10-22 21:44:44 +02:00 · 2021-10-22 21:44:44 +02:00 · 6f732f67f9
parent ddb6196e9e 8a516d35ce
commit 6f732f67f9
4 changed files with 17 additions and 8 deletions
--- a/data/core/common.lua
+++ b/data/core/common.lua
@@ -1,8 +1,8 @@
 local common = {}
-function common.is_utf8_cont(char)
+function common.is_utf8_cont(s, offset)
-  local byte = char:byte()
+  local byte = s:byte(offset or 1)
  return byte >= 0x80 and byte < 0xc0
 end
--- a/data/core/regex.lua
+++ b/data/core/regex.lua
@@ -1,4 +1,3 @@
 -- So that in addition to regex.gsub(pattern, string), we can also do
 -- pattern:gsub(string).
 regex.__index = function(table, key) return regex[key]; end
@@ -6,7 +5,8 @@ regex.__index = function(table, key) return regex[key]; end
 regex.match = function(pattern_string, string, offset, options)
  local pattern = type(pattern_string) == "table" and
    pattern_string or regex.compile(pattern_string)
-  return regex.cmatch(pattern, string, offset or 1, options or 0)
+  local s, e = regex.cmatch(pattern, string, offset or 1, options or 0)
  return s, e and e - 1
 end
 -- Will iterate back through any UTF-8 bytes so that we don't replace bits
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -1,4 +1,5 @@
 local syntax = require "core.syntax"
 local common = require "core.common"
 local tokenizer = {}
@@ -142,8 +143,13 @@ function tokenizer.tokenize(incoming_syntax, text, state)
      code = p._regex
    end    
    repeat
-      res = p.pattern and { text:find(at_start and "^" .. code or code, res[2]+1) } 
+      local next = res[2] + 1
-        or { regex.match(code, text, res[2]+1, at_start and regex.ANCHORED or 0) }
+      -- go to the start of the next utf-8 character
      while common.is_utf8_cont(text, next) do
        next = next + 1
      end
      res = p.pattern and { text:find(at_start and "^" .. code or code, next) }
        or { regex.match(code, text, next, at_start and regex.ANCHORED or 0) }
      if res[1] and close and target[3] then
        local count = 0
        for i = res[1] - 1, 1, -1 do
--- a/src/api/regex.c
+++ b/src/api/regex.c
@@ -68,8 +68,11 @@ static int f_pcre_match(lua_State *L) {
  int rc = pcre2_match(re, (PCRE2_SPTR)str, len, offset - 1, opts, md, NULL);
  if (rc < 0) {
    pcre2_match_data_free(md);
-    if (rc != PCRE2_ERROR_NOMATCH)
+    if (rc != PCRE2_ERROR_NOMATCH) {
-      luaL_error(L, "regex matching error %d", rc);
+      PCRE2_UCHAR buffer[120];
      pcre2_get_error_message(rc, buffer, sizeof(buffer));
      luaL_error(L, "regex matching error %d: %s", rc, buffer);
    }
    return 0;
  }
  PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(md);