From 2a41002355181399e993a5395c28d8db024cb161 Mon Sep 17 00:00:00 2001 From: Guldoman Date: Sat, 28 May 2022 01:38:22 +0200 Subject: [PATCH] Allow using regex groups to split tokens Before, this was only supported by Lua patterns. This expects the regex to use the same syntax used for patterns. That is, the token should be split by empty groups. --- data/core/tokenizer.lua | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua index ebe550ff..3d935cae 100644 --- a/data/core/tokenizer.lua +++ b/data/core/tokenizer.lua @@ -174,6 +174,17 @@ function tokenizer.tokenize(incoming_syntax, text, state) if p.regex and #res > 0 then -- set correct utf8 len for regex result res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1 res[1] = next + -- `regex.match` returns group results as a series of `begin, end` + -- we only want `begin`s + for i=1,(#res-3) do + local curr = i + 3 + local from = i * 2 + 3 + if from < #res then + res[curr] = string.uoffset(text, res[from]) + else + res[curr] = nil + end + end end if res[1] and close and target[3] then local count = 0