From 8a516d35ce4aeecc9f7b2879028a89f9d8816d11 Mon Sep 17 00:00:00 2001
From: Guldoman <giulio.lettieri@gmail.com>
Date: Mon, 11 Oct 2021 22:37:31 +0200
Subject: [PATCH] Correctly identify the start of the next character in
 `tokenizer`

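When advancing to the next character, we have to take into account that
the current one might be multi-byte: skip any UTF-8 continuation bytes so
that the next match attempt starts on a character boundary.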
---
 data/core/tokenizer.lua | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index a20dba5e..08a5ea31 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -1,4 +1,5 @@
 local syntax = require "core.syntax"
+local common = require "core.common"
 
 local tokenizer = {}
 
@@ -142,8 +143,13 @@ function tokenizer.tokenize(incoming_syntax, text, state)
       code = p._regex
     end    
     repeat
-      res = p.pattern and { text:find(at_start and "^" .. code or code, res[2]+1) } 
-        or { regex.match(code, text, res[2]+1, at_start and regex.ANCHORED or 0) }
+      local next = res[2] + 1
+      -- go to the start of the next utf-8 character
+      while common.is_utf8_cont(text, next) do
+        next = next + 1
+      end
+      res = p.pattern and { text:find(at_start and "^" .. code or code, next) }
+        or { regex.match(code, text, next, at_start and regex.ANCHORED or 0) }
       if res[1] and close and target[3] then
         local count = 0
         for i = res[1] - 1, 1, -1 do