From 2a41002355181399e993a5395c28d8db024cb161 Mon Sep 17 00:00:00 2001
From: Guldoman <giulio.lettieri@gmail.com>
Date: Sat, 28 May 2022 01:38:22 +0200
Subject: [PATCH] Allow using regex groups to split tokens

Previously, splitting tokens by groups was only supported for Lua patterns.

This expects the regex to follow the same convention used for patterns.
That is, the token is split at each empty group.
---
 data/core/tokenizer.lua | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index ebe550ff..3d935cae 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -174,6 +174,17 @@ function tokenizer.tokenize(incoming_syntax, text, state)
       if p.regex and #res > 0 then -- set correct utf8 len for regex result
         res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1
         res[1] = next
+        -- `regex.match` returns group results as a series of `begin, end`
+        -- we only want `begin`s
+        for i=1,(#res-3) do
+          local curr = i + 3
+          local from = i * 2 + 3
+          if from < #res then
+            res[curr] = string.uoffset(text, res[from])
+          else
+            res[curr] = nil
+          end
+        end
       end
       if res[1] and close and target[3] then
         local count = 0