Allow using regex groups to split tokens

Previously, this was supported only for Lua patterns. The regex is expected to follow the same convention used for patterns: the token is split at the positions marked by empty groups.
2022-05-28 01:38:22 +02:00 · 2022-05-28 01:38:22 +02:00 · 2a41002355
parent 14be51b1ec
commit 2a41002355
1 changed file with 11 additions and 0 deletions
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -174,6 +174,17 @@ function tokenizer.tokenize(incoming_syntax, text, state)
      if p.regex and #res > 0 then -- set correct utf8 len for regex result
        res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1
        res[1] = next
+        -- `regex.match` returns group results as a series of `begin, end`
+        -- we only want `begin`s
+        for i=1,(#res-3) do
+          local curr = i + 3
+          local from = i * 2 + 3
+          if from < #res then
+            res[curr] = string.uoffset(text, res[from])
+          else
+            res[curr] = nil
+          end
+        end
      end
      if res[1] and close and target[3] then
        local count = 0