From 7ac776bef66de7732d1b3075b5d21198a3f7e353 Mon Sep 17 00:00:00 2001
From: Guldoman
Date: Tue, 31 May 2022 01:59:14 +0200
Subject: [PATCH] Fix UTF-8 matches in regex group `tokenizer`

---
 data/core/tokenizer.lua | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index 3d935cae..6f3515b4 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -173,18 +173,21 @@ function tokenizer.tokenize(incoming_syntax, text, state)
         or { regex.match(code, text, text:ucharpos(next), (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) }
       if p.regex and #res > 0 then -- set correct utf8 len for regex result
         res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1
-        res[1] = next
         -- `regex.match` returns group results as a series of `begin, end`
         -- we only want `begin`s
+        if #res >= 3 then
+          res[3] = res[1] + string.ulen(text:sub(res[1], res[3])) - 1
+        end
         for i=1,(#res-3) do
           local curr = i + 3
           local from = i * 2 + 3
           if from < #res then
-            res[curr] = string.uoffset(text, res[from])
+            res[curr] = res[1] + string.ulen(text:sub(res[1], res[from])) - 1
           else
             res[curr] = nil
           end
         end
+        res[1] = next
       end
       if res[1] and close and target[3] then
         local count = 0
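
For reference, the conversion applied to each group offset above follows the same idiom already used for res[2]: a byte offset reported by the matcher is rewritten as start + (UTF-8 character count of the covered bytes) - 1. Below is a minimal standalone sketch of that arithmetic in stock Lua 5.3+, where utf8.len stands in for lite-xl's string.ulen; the helper name to_char_end is illustrative only and not part of this patch.

    -- Convert a byte-indexed span end into a character-based end,
    -- mirroring `res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1`.
    local function to_char_end(text, start, byte_end)
      -- number of UTF-8 characters covered by the byte span
      local char_count = utf8.len(text:sub(start, byte_end))
      return start + char_count - 1
    end

    local text = "héllo wörld"
    -- a byte-oriented matcher reporting "héllo" would give bytes 1..6 ("é" is 2 bytes)
    print(to_char_end(text, 1, 6)) --> 5, i.e. "héllo" is 5 characters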