From 3a71528087206cdd084e62424ad6539490fbf12d Mon Sep 17 00:00:00 2001
From: Guldoman <giulio.lettieri@gmail.com>
Date: Mon, 11 Oct 2021 22:18:02 +0200
Subject: [PATCH 1/4] Allow specifying offset for `common.is_utf8_cont`

---
 data/core/common.lua | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data/core/common.lua b/data/core/common.lua
index 1a1b22cd..0d360640 100644
--- a/data/core/common.lua
+++ b/data/core/common.lua
@@ -1,8 +1,8 @@
 local common = {}
 
 
-function common.is_utf8_cont(char)
-  local byte = char:byte()
+function common.is_utf8_cont(s, offset)
+  local byte = s:byte(offset or 1) or 0 -- no byte at offset => not a continuation
   return byte >= 0x80 and byte < 0xc0
 end
 

From 038e335c8c1813b257e132a30f00a4d60ee0b153 Mon Sep 17 00:00:00 2001
From: Guldoman <giulio.lettieri@gmail.com>
Date: Mon, 11 Oct 2021 22:20:44 +0200
Subject: [PATCH 2/4] Show error message when `pcre2_match` fails

---
 src/api/regex.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/api/regex.c b/src/api/regex.c
index 1043b1c5..9f6bd3ee 100644
--- a/src/api/regex.c
+++ b/src/api/regex.c
@@ -68,8 +68,11 @@ static int f_pcre_match(lua_State *L) {
   int rc = pcre2_match(re, (PCRE2_SPTR)str, len, offset - 1, opts, md, NULL);
   if (rc < 0) {
     pcre2_match_data_free(md);
-    if (rc != PCRE2_ERROR_NOMATCH)
-      luaL_error(L, "regex matching error %d", rc);
+    if (rc != PCRE2_ERROR_NOMATCH) {
+      PCRE2_UCHAR buffer[120];
+      pcre2_get_error_message(rc, buffer, sizeof(buffer));
+      luaL_error(L, "regex matching error %d: %s", rc, buffer);
+    }
     return 0;
   }
   PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(md);

From 1872e8214137b02e2d7c905d267b3d76297d6087 Mon Sep 17 00:00:00 2001
From: Guldoman <giulio.lettieri@gmail.com>
Date: Mon, 11 Oct 2021 22:32:50 +0200
Subject: [PATCH 3/4] Make `regex.match` return the appropriate `end` index

This makes its behavior similar to `string.find`.
---
 data/core/regex.lua | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data/core/regex.lua b/data/core/regex.lua
index 69203cbd..637d23fd 100644
--- a/data/core/regex.lua
+++ b/data/core/regex.lua
@@ -1,4 +1,3 @@
-
 -- So that in addition to regex.gsub(pattern, string), we can also do
 -- pattern:gsub(string).
 regex.__index = function(table, key) return regex[key]; end
@@ -6,7 +5,8 @@ regex.__index = function(table, key) return regex[key]; end
 regex.match = function(pattern_string, string, offset, options)
   local pattern = type(pattern_string) == "table" and
     pattern_string or regex.compile(pattern_string)
-  return regex.cmatch(pattern, string, offset or 1, options or 0)
+  local function fix_end(s, e, ...) return s, e and e - 1, ... end -- inclusive end, keep extras
+  return fix_end(regex.cmatch(pattern, string, offset or 1, options or 0))
 end
 
 -- Will iterate back through any UTF-8 bytes so that we don't replace bits

From 8a516d35ce4aeecc9f7b2879028a89f9d8816d11 Mon Sep 17 00:00:00 2001
From: Guldoman <giulio.lettieri@gmail.com>
Date: Mon, 11 Oct 2021 22:37:31 +0200
Subject: [PATCH 4/4] Correctly identify the start of the next character in
 `tokenizer`

When moving to the next character, we have to consider that the current
one might be multi-byte.
---
 data/core/tokenizer.lua | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index a20dba5e..08a5ea31 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -1,4 +1,5 @@
 local syntax = require "core.syntax"
+local common = require "core.common"
 
 local tokenizer = {}
 
@@ -142,8 +143,13 @@ function tokenizer.tokenize(incoming_syntax, text, state)
       code = p._regex
     end    
     repeat
-      res = p.pattern and { text:find(at_start and "^" .. code or code, res[2]+1) } 
-        or { regex.match(code, text, res[2]+1, at_start and regex.ANCHORED or 0) }
+      local next_start = res[2] + 1
+      -- skip utf-8 continuation bytes; `byte` is nil past the end of text
+      while text:byte(next_start) and common.is_utf8_cont(text, next_start) do
+        next_start = next_start + 1
+      end
+      res = p.pattern and { text:find(at_start and "^" .. code or code, next_start) }
+        or { regex.match(code, text, next_start, at_start and regex.ANCHORED or 0) }
       if res[1] and close and target[3] then
         local count = 0
         for i = res[1] - 1, 1, -1 do