Consume unmatched character correctly

We must consume the whole UTF-8 character, not just a single byte.
2021-12-11 03:43:33 +01:00 · 2021-12-11 03:43:33 +01:00 · 29318be9c7
parent 37c00c877a
commit 29318be9c7
1 changed file with 7 additions and 2 deletions
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -237,8 +237,13 @@ function tokenizer.tokenize(incoming_syntax, text, state)

    -- consume character if we didn't match
    if not matched then
-      push_token(res, "normal", text:sub(i, i))
-      i = i + 1
+      local n = 0
+      -- reach the next character
+      while text:byte(i + n + 1) and common.is_utf8_cont(text, i + n + 1) do
+        n = n + 1
+      end
+      push_token(res, "normal", text:sub(i, i + n))
+      i = i + n + 1
    end
  end