Correctly identify the start of the next character in `tokenizer`

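When moving to the next character, we have to consider that the current
one might be multi-byte: the search must resume at the first byte of the
next character, not at a UTF-8 continuation byte of the current one.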
This commit is contained in:
Guldoman 2021-10-11 22:37:31 +02:00
parent 1872e82141
commit 8a516d35ce
No known key found for this signature in database
GPG Key ID: C08A498EC7F1AFDD
1 changed file with 8 additions and 2 deletions

View File

@ -1,4 +1,5 @@
local syntax = require "core.syntax" local syntax = require "core.syntax"
local common = require "core.common"
local tokenizer = {} local tokenizer = {}
@ -142,8 +143,13 @@ function tokenizer.tokenize(incoming_syntax, text, state)
code = p._regex code = p._regex
end end
repeat repeat
res = p.pattern and { text:find(at_start and "^" .. code or code, res[2]+1) } local next = res[2] + 1
or { regex.match(code, text, res[2]+1, at_start and regex.ANCHORED or 0) } -- go to the start of the next utf-8 character
while common.is_utf8_cont(text, next) do
next = next + 1
end
res = p.pattern and { text:find(at_start and "^" .. code or code, next) }
or { regex.match(code, text, next, at_start and regex.ANCHORED or 0) }
if res[1] and close and target[3] then if res[1] and close and target[3] then
local count = 0 local count = 0
for i = res[1] - 1, 1, -1 do for i = res[1] - 1, 1, -1 do