diff --git a/data/core/common.lua b/data/core/common.lua index 99e6ee5a..ab21b758 100644 --- a/data/core/common.lua +++ b/data/core/common.lua @@ -1,8 +1,8 @@ local common = {} -function common.is_utf8_cont(char) - local byte = char:byte() +function common.is_utf8_cont(s, offset) + local byte = s:byte(offset or 1) return byte >= 0x80 and byte < 0xc0 end diff --git a/data/core/regex.lua b/data/core/regex.lua index 69203cbd..637d23fd 100644 --- a/data/core/regex.lua +++ b/data/core/regex.lua @@ -1,4 +1,3 @@ - -- So that in addition to regex.gsub(pattern, string), we can also do -- pattern:gsub(string). regex.__index = function(table, key) return regex[key]; end @@ -6,7 +5,8 @@ regex.__index = function(table, key) return regex[key]; end regex.match = function(pattern_string, string, offset, options) local pattern = type(pattern_string) == "table" and pattern_string or regex.compile(pattern_string) - return regex.cmatch(pattern, string, offset or 1, options or 0) + local s, e = regex.cmatch(pattern, string, offset or 1, options or 0) + return s, e and e - 1 end -- Will iterate back through any UTF-8 bytes so that we don't replace bits diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua index a20dba5e..08a5ea31 100644 --- a/data/core/tokenizer.lua +++ b/data/core/tokenizer.lua @@ -1,4 +1,5 @@ local syntax = require "core.syntax" +local common = require "core.common" local tokenizer = {} @@ -142,8 +143,13 @@ function tokenizer.tokenize(incoming_syntax, text, state) code = p._regex end repeat - res = p.pattern and { text:find(at_start and "^" .. code or code, res[2]+1) } - or { regex.match(code, text, res[2]+1, at_start and regex.ANCHORED or 0) } + local next = res[2] + 1 + -- go to the start of the next utf-8 character + while common.is_utf8_cont(text, next) do + next = next + 1 + end + res = p.pattern and { text:find(at_start and "^" .. code or code, next) } + or { regex.match(code, text, next, at_start and regex.ANCHORED or 0) } if res[1] and close and target[3] then local count = 0 for i = res[1] - 1, 1, -1 do diff --git a/src/api/regex.c b/src/api/regex.c index 1043b1c5..9f6bd3ee 100644 --- a/src/api/regex.c +++ b/src/api/regex.c @@ -68,8 +68,11 @@ static int f_pcre_match(lua_State *L) { int rc = pcre2_match(re, (PCRE2_SPTR)str, len, offset - 1, opts, md, NULL); if (rc < 0) { pcre2_match_data_free(md); - if (rc != PCRE2_ERROR_NOMATCH) - luaL_error(L, "regex matching error %d", rc); + if (rc != PCRE2_ERROR_NOMATCH) { + PCRE2_UCHAR buffer[120]; + pcre2_get_error_message(rc, buffer, sizeof(buffer)); + luaL_error(L, "regex matching error %d: %s", rc, buffer); + } return 0; } PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(md);