diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index d95baeb1..57c17a0b 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -237,8 +237,13 @@ function tokenizer.tokenize(incoming_syntax, text, state)
 
     -- consume character if we didn't match
     if not matched then
-      push_token(res, "normal", text:sub(i, i))
-      i = i + 1
+      local n = 0
+      -- reach the next character
+      while text:byte(i + n + 1) and common.is_utf8_cont(text, i + n + 1) do
+        n = n + 1
+      end
+      push_token(res, "normal", text:sub(i, i + n))
+      i = i + n + 1
    end
  end
 
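For context: before this change, the fallback path of tokenizer.tokenize consumed exactly one byte per iteration, so a multi-byte UTF-8 character was split across several "normal" tokens. The patched loop instead counts the continuation bytes that follow position i and emits the whole character as a single token. Below is a minimal sketch of the helper the patch relies on, assuming common.is_utf8_cont simply tests for a UTF-8 continuation byte (bit pattern 10xxxxxx, i.e. values 0x80-0xBF); the actual implementation lives in data/core/common.lua.

-- Sketch (assumption): in UTF-8, every byte of a multi-byte character
-- after the first has the bit pattern 10xxxxxx, i.e. a value in the
-- range 0x80-0xBF. We assume common.is_utf8_cont checks exactly that.
local function is_utf8_cont(s, offset)
  local byte = s:byte(offset or 1)
  return byte >= 0x80 and byte < 0xC0
end

-- Example: "é" (U+00E9) is encoded as the two bytes 0xC3 0xA9, so the
-- loop in the patch groups both bytes into one "normal" token.
local e_acute = string.char(0xC3, 0xA9)
assert(not is_utf8_cont(e_acute, 1)) -- 0xC3 is a lead byte
assert(is_utf8_cont(e_acute, 2))     -- 0xA9 continues the character

Because the loop only needs to recognize continuation bytes, the tokenizer can keep characters intact without pulling in a full UTF-8 decoder.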