From 29318be9c71e1be290e7507e9f8b1c9445aad1b0 Mon Sep 17 00:00:00 2001
From: Guldoman <giulio.lettieri@gmail.com>
Date: Sat, 11 Dec 2021 03:43:33 +0100
Subject: [PATCH] Consume unmatched character correctly

We must consume the whole UTF-8 character, not just a single byte.
---
 data/core/tokenizer.lua | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index d95baeb1..57c17a0b 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -237,8 +237,13 @@ function tokenizer.tokenize(incoming_syntax, text, state)
 
     -- consume character if we didn't match
     if not matched then
-      push_token(res, "normal", text:sub(i, i))
-      i = i + 1
+      local n = 0
+      -- count the UTF-8 continuation bytes after byte i so the whole character is consumed
+      while text:byte(i + n + 1) and common.is_utf8_cont(text, i + n + 1) do
+        n = n + 1
+      end
+      push_token(res, "normal", text:sub(i, i + n))
+      i = i + n + 1
     end
   end