diff --git a/data/core/regex.lua b/data/core/regex.lua
index 637d23fd..fa85d56c 100644
--- a/data/core/regex.lua
+++ b/data/core/regex.lua
@@ -5,8 +5,9 @@ regex.__index = function(table, key) return regex[key]; end
 
 regex.match = function(pattern_string, string, offset, options)
   local pattern = type(pattern_string) == "table" and pattern_string or regex.compile(pattern_string)
-  local s, e = regex.cmatch(pattern, string, offset or 1, options or 0)
-  return s, e and e - 1
+  local res = { regex.cmatch(pattern, string, offset or 1, options or 0) }
+  res[2] = res[2] and res[2] - 1
+  return table.unpack(res)
 end
 
 -- Will iterate back through any UTF-8 bytes so that we don't replace bits
diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index ebe550ff..555d60b5 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -1,9 +1,12 @@
+local core = require "core"
 local syntax = require "core.syntax"
 local common = require "core.common"
 
 local tokenizer = {}
+local bad_patterns = {}
 
 local function push_token(t, type, text)
+  type = type or "normal"
   local prev_type = t[#t-1]
   local prev_text = t[#t]
   if prev_type and (prev_type == type or prev_text:ufind("^%s*$")) then
@@ -173,6 +176,20 @@ function tokenizer.tokenize(incoming_syntax, text, state)
         or { regex.match(code, text, text:ucharpos(next), (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) }
       if p.regex and #res > 0 then -- set correct utf8 len for regex result
         res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1
+        -- `regex.match` returns group results as a series of `begin, end`;
+        -- we only want `begin`s
+        if #res >= 3 then
+          res[3] = res[1] + string.ulen(text:sub(res[1], res[3])) - 1
+        end
+        for i=1,(#res-3) do
+          local curr = i + 3
+          local from = i * 2 + 3
+          if from < #res then
+            res[curr] = res[1] + string.ulen(text:sub(res[1], res[from])) - 1
+          else
+            res[curr] = nil
+          end
+        end
         res[1] = next
       end
       if res[1] and close and target[3] then
@@ -242,6 +259,15 @@ function tokenizer.tokenize(incoming_syntax, text, state)
     local matched = false
     for n, p in ipairs(current_syntax.patterns) do
       local find_results = { find_text(text, p, i, true, false) }
+      if #find_results - 1 > #p.type then
+        if not bad_patterns[current_syntax] then
+          bad_patterns[current_syntax] = { }
+        end
+        if not bad_patterns[current_syntax][n] then
+          bad_patterns[current_syntax][n] = true
+          core.error("Malformed pattern #%d in %s language plugin", n, current_syntax.name or "unnamed")
+        end
+      end
       if find_results[1] then
         -- matched pattern; make and add tokens
         push_tokens(res, current_syntax, p, text, find_results)
diff --git a/src/api/regex.c b/src/api/regex.c
index 6a0aac7a..d23eaf71 100644
--- a/src/api/regex.c
+++ b/src/api/regex.c
@@ -88,7 +88,7 @@ static int f_pcre_match(lua_State *L) {
     return 0;
   }
   for (int i = 0; i < rc*2; i++)
-    lua_pushnumber(L, ovector[i]+offset+1);
+    lua_pushinteger(L, ovector[i]+offset+1);
   pcre2_match_data_free(md);
   return rc*2;
 }
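
Note on the `regex.match` change: `regex.cmatch` returns a flat list of `begin, end` byte offsets for the whole match followed by one pair per capture group. The wrapper previously dropped everything after the first pair; it now forwards the full list, still adjusting only the whole-match end to be inclusive (and the C side now pushes these offsets as integers rather than floats). A minimal sketch of the new call shape, assuming the editor runtime where `regex` is the C-provided global module; the pattern and subject are illustrative, not taken from this diff:

    -- s, e bound the whole match (e is inclusive after this patch);
    -- each capture group then contributes its own begin/end pair,
    -- with group ends passed through from cmatch unadjusted.
    local s, e, g1_begin, g1_end, g2_begin, g2_end =
        regex.match([[(\w+)-(\w+)]], "foo-bar")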
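
The tokenizer changes build on this: the UTF-8 fix-up loop compacts the group pairs down to their `begin` offsets, the new `#find_results - 1 > #p.type` check reports (once per syntax and pattern index, via `core.error`) patterns whose results outnumber their `type` entries, and the `type = type or "normal"` default in `push_token` keeps tokenization alive when a span has no corresponding type. A hypothetical syntax definition the check would flag; the language name, file pattern, and rule are made up for illustration:

    -- find_results here has 4 entries (start, end, two captures), so the
    -- check's #find_results - 1 = 3 exceeds #p.type = 1, and this logs:
    -- "Malformed pattern #1 in Example language plugin"
    syntax.add {
      name = "Example",
      files = { "%.example$" },
      patterns = {
        { pattern = "(%w+)(=)", type = { "keyword" } },
      },
      symbols = {},
    }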