Fix UTF-8 matches in regex group `tokenizer`

This commit is contained in:
Guldoman 2022-05-31 01:59:14 +02:00
parent 2a41002355
commit 7ac776bef6
No known key found for this signature in database
GPG Key ID: EA928C8BDA1A8825
1 changed file with 5 additions and 2 deletions

View File

@@ -173,18 +173,21 @@ function tokenizer.tokenize(incoming_syntax, text, state)
or { regex.match(code, text, text:ucharpos(next), (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) } or { regex.match(code, text, text:ucharpos(next), (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) }
if p.regex and #res > 0 then -- set correct utf8 len for regex result if p.regex and #res > 0 then -- set correct utf8 len for regex result
res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1 res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1
res[1] = next
-- `regex.match` returns group results as a series of `begin, end` -- `regex.match` returns group results as a series of `begin, end`
-- we only want `begin`s -- we only want `begin`s
if #res >= 3 then
res[3] = res[1] + string.ulen(text:sub(res[1], res[3])) - 1
end
for i=1,(#res-3) do for i=1,(#res-3) do
local curr = i + 3 local curr = i + 3
local from = i * 2 + 3 local from = i * 2 + 3
if from < #res then if from < #res then
res[curr] = string.uoffset(text, res[from]) res[curr] = res[1] + string.ulen(text:sub(res[1], res[from])) - 1
else else
res[curr] = nil res[curr] = nil
end end
end end
res[1] = next
end end
if res[1] and close and target[3] then if res[1] and close and target[3] then
local count = 0 local count = 0