From 74f7389caca79b78cd392931069bd998e3b791ea Mon Sep 17 00:00:00 2001
From: Guldoman
Date: Sat, 28 May 2022 01:20:41 +0200
Subject: [PATCH 1/5] Make regex API return integers

---
 src/api/regex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/api/regex.c b/src/api/regex.c
index 6a0aac7a..d23eaf71 100644
--- a/src/api/regex.c
+++ b/src/api/regex.c
@@ -88,7 +88,7 @@ static int f_pcre_match(lua_State *L) {
     return 0;
   }
   for (int i = 0; i < rc*2; i++)
-    lua_pushnumber(L, ovector[i]+offset+1);
+    lua_pushinteger(L, ovector[i]+offset+1);
   pcre2_match_data_free(md);
   return rc*2;
 }

From 14be51b1eca87dc1a97249d04984c2ce38926fa1 Mon Sep 17 00:00:00 2001
From: Guldoman
Date: Sat, 28 May 2022 01:21:41 +0200
Subject: [PATCH 2/5] Make `regex.match` return all the results

---
 data/core/regex.lua | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/data/core/regex.lua b/data/core/regex.lua
index 637d23fd..fa85d56c 100644
--- a/data/core/regex.lua
+++ b/data/core/regex.lua
@@ -5,8 +5,9 @@ regex.__index = function(table, key) return regex[key]; end
 
 regex.match = function(pattern_string, string, offset, options)
   local pattern = type(pattern_string) == "table" and pattern_string or regex.compile(pattern_string)
-  local s, e = regex.cmatch(pattern, string, offset or 1, options or 0)
-  return s, e and e - 1
+  local res = { regex.cmatch(pattern, string, offset or 1, options or 0) }
+  res[2] = res[2] and res[2] - 1
+  return table.unpack(res)
 end
 
 -- Will iterate back through any UTF-8 bytes so that we don't replace bits
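With patch 2 applied, `regex.match` unpacks every `begin, end` offset pair
that `regex.cmatch` produces — one pair for the whole match, then one pair
per capture group — instead of only the first pair. A minimal usage sketch
(not part of the patches; the values are illustrative and assume the global
`regex` table lite-xl exposes; note that only the whole-match end is shifted
back to an inclusive index, while group offsets keep `cmatch`'s raw
one-past-the-end convention):

  -- Whole-match bounds first, then one `begin, end` pair per group.
  local s, e, g1s, g1e = regex.match("(he)llo", "hello, world")
  -- s = 1, e = 5      inclusive bounds of "hello"
  -- g1s = 1, g1e = 3  raw bounds of the "(he)" group, one past the end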
From 2a41002355181399e993a5395c28d8db024cb161 Mon Sep 17 00:00:00 2001
From: Guldoman
Date: Sat, 28 May 2022 01:38:22 +0200
Subject: [PATCH 3/5] Allow using regex groups to split tokens

Before, this was only supported by Lua patterns.
This expects the regex to use the same syntax used for patterns.
That is, the token should be split by empty groups.
---
 data/core/tokenizer.lua | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index ebe550ff..3d935cae 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -174,6 +174,17 @@ function tokenizer.tokenize(incoming_syntax, text, state)
       if p.regex and #res > 0 then -- set correct utf8 len for regex result
         res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1
         res[1] = next
+        -- `regex.match` returns group results as a series of `begin, end`
+        -- we only want `begin`s
+        for i=1,(#res-3) do
+          local curr = i + 3
+          local from = i * 2 + 3
+          if from < #res then
+            res[curr] = string.uoffset(text, res[from])
+          else
+            res[curr] = nil
+          end
+        end
       end
       if res[1] and close and target[3] then
         local count = 0

From 7ac776bef66de7732d1b3075b5d21198a3f7e353 Mon Sep 17 00:00:00 2001
From: Guldoman
Date: Tue, 31 May 2022 01:59:14 +0200
Subject: [PATCH 4/5] Fix UTF-8 matches in regex group `tokenizer`

---
 data/core/tokenizer.lua | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index 3d935cae..6f3515b4 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -173,18 +173,21 @@ function tokenizer.tokenize(incoming_syntax, text, state)
         or { regex.match(code, text, text:ucharpos(next), (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) }
       if p.regex and #res > 0 then -- set correct utf8 len for regex result
         res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1
-        res[1] = next
         -- `regex.match` returns group results as a series of `begin, end`
         -- we only want `begin`s
+        if #res >= 3 then
+          res[3] = res[1] + string.ulen(text:sub(res[1], res[3])) - 1
+        end
         for i=1,(#res-3) do
           local curr = i + 3
           local from = i * 2 + 3
           if from < #res then
-            res[curr] = string.uoffset(text, res[from])
+            res[curr] = res[1] + string.ulen(text:sub(res[1], res[from])) - 1
           else
             res[curr] = nil
           end
         end
+        res[1] = next
       end
       if res[1] and close and target[3] then
         local count = 0
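Together, patches 3 and 4 let regex-based patterns split a match into several
tokens using the convention Lua patterns already follow: empty groups mark
the split points, and the tokenizer keeps only each group's `begin` offset
(converted to a UTF-8-aware index by patch 4). A hypothetical plugin fragment
showing the expected shape (the name, file extension, pattern, and token
types are invented for illustration, not taken from any real plugin):

  local syntax = require "core.syntax"

  syntax.add {
    name = "Example",
    files = { "%.example$" },
    patterns = {
      -- The two empty groups `()` split the match into three tokens:
      -- "local" as keyword, the whitespace as normal, the name as symbol.
      { regex = [[local()\s+()[a-zA-Z_]\w*]], type = { "keyword", "normal", "symbol" } },
    },
    symbols = {},
  }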
From d8efb1ab53c7e6414d78230219f5ae6655b8b9b0 Mon Sep 17 00:00:00 2001
From: Guldoman
Date: Tue, 31 May 2022 02:03:42 +0200
Subject: [PATCH 5/5] Show error if language plugin pattern has mismatching
 number of groups

The number of results from a pattern with groups must never be greater
than the number of token types for that pattern.

Also if a token type was undefined, it's now pushed as a `normal` one.
---
 data/core/tokenizer.lua | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index 6f3515b4..555d60b5 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -1,9 +1,12 @@
+local core = require "core"
 local syntax = require "core.syntax"
 local common = require "core.common"
 
 local tokenizer = {}
+local bad_patterns = {}
 
 local function push_token(t, type, text)
+  type = type or "normal"
   local prev_type = t[#t-1]
   local prev_text = t[#t]
   if prev_type and (prev_type == type or prev_text:ufind("^%s*$")) then
@@ -256,6 +259,15 @@ function tokenizer.tokenize(incoming_syntax, text, state)
     local matched = false
     for n, p in ipairs(current_syntax.patterns) do
      local find_results = { find_text(text, p, i, true, false) }
+      if #find_results - 1 > #p.type then
+        if not bad_patterns[current_syntax] then
+          bad_patterns[current_syntax] = { }
+        end
+        if not bad_patterns[current_syntax][n] then
+          bad_patterns[current_syntax][n] = true
+          core.error("Malformed pattern #%d in %s language plugin", n, current_syntax.name or "unnamed")
+        end
+      end
       if find_results[1] then
         -- matched pattern; make and add tokens
         push_tokens(res, current_syntax, p, text, find_results)
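Patch 5's check compares the number of match results (minus the whole-match
entry) against the number of declared token types, and reports each offending
pattern only once per syntax thanks to the `bad_patterns` cache. A
hypothetical malformed fragment that would now be flagged (again, the names
are invented for illustration):

  syntax.add {
    name = "Example",
    files = { "%.example$" },
    patterns = {
      -- Two empty groups produce three token slices but only two types are
      -- declared, so the tokenizer logs
      --   Malformed pattern #1 in Example language plugin
      -- once, and `push_token` falls back to "normal" for the missing type.
      { regex = [[local()\s+()\w+]], type = { "keyword", "symbol" } },
    },
    symbols = {},
  }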