From 29318be9c71e1be290e7507e9f8b1c9445aad1b0 Mon Sep 17 00:00:00 2001 From: Guldoman Date: Sat, 11 Dec 2021 03:43:33 +0100 Subject: [PATCH] Consume unmatched character correctly We must consume the whole UTF-8 character, not just a single byte. --- data/core/tokenizer.lua | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua index d95baeb1..57c17a0b 100644 --- a/data/core/tokenizer.lua +++ b/data/core/tokenizer.lua @@ -237,8 +237,13 @@ function tokenizer.tokenize(incoming_syntax, text, state) -- consume character if we didn't match if not matched then - push_token(res, "normal", text:sub(i, i)) - i = i + 1 + local n = 0 + -- reach the next character + while text:byte(i + n + 1) and common.is_utf8_cont(text, i + n + 1) do + n = n + 1 + end + push_token(res, "normal", text:sub(i, i + n)) + i = i + n + 1 end end