From 74f7389caca79b78cd392931069bd998e3b791ea Mon Sep 17 00:00:00 2001
From: Guldoman
Date: Sat, 28 May 2022 01:20:41 +0200
Subject: [PATCH 1/5] Make regex API return integers

---
 src/api/regex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/api/regex.c b/src/api/regex.c
index 6a0aac7a..d23eaf71 100644
--- a/src/api/regex.c
+++ b/src/api/regex.c
@@ -88,7 +88,7 @@ static int f_pcre_match(lua_State *L) {
     return 0;
   }
   for (int i = 0; i < rc*2; i++)
-    lua_pushnumber(L, ovector[i]+offset+1);
+    lua_pushinteger(L, ovector[i]+offset+1);
   pcre2_match_data_free(md);
   return rc*2;
 }

From 14be51b1eca87dc1a97249d04984c2ce38926fa1 Mon Sep 17 00:00:00 2001
From: Guldoman
Date: Sat, 28 May 2022 01:21:41 +0200
Subject: [PATCH 2/5] Make `regex.match` return all the results

---
 data/core/regex.lua | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/data/core/regex.lua b/data/core/regex.lua
index 637d23fd..fa85d56c 100644
--- a/data/core/regex.lua
+++ b/data/core/regex.lua
@@ -5,8 +5,9 @@ regex.__index = function(table, key) return regex[key]; end
 
 regex.match = function(pattern_string, string, offset, options)
   local pattern = type(pattern_string) == "table" and pattern_string or regex.compile(pattern_string)
-  local s, e = regex.cmatch(pattern, string, offset or 1, options or 0)
-  return s, e and e - 1
+  local res = { regex.cmatch(pattern, string, offset or 1, options or 0) }
+  res[2] = res[2] and res[2] - 1
+  return table.unpack(res)
 end
 
 -- Will iterate back through any UTF-8 bytes so that we don't replace bits
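With patch 2 applied, `regex.match` unpacks every `begin, end` offset pair
that `regex.cmatch` produces — one pair for the whole match, then one pair
per capture group — instead of only the first pair. A minimal usage sketch
(not part of the patches; the values are illustrative and assume the global
`regex` table lite-xl exposes; note that only the whole-match end is shifted
back to an inclusive index, while group offsets keep `cmatch`'s raw
one-past-the-end convention):

  -- Whole-match bounds first, then one `begin, end` pair per group.
  local s, e, g1s, g1e = regex.match("(he)llo", "hello, world")
  -- s = 1, e = 5      inclusive bounds of "hello"
  -- g1s = 1, g1e = 3  raw bounds of the "(he)" group, one past the end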
From 2a41002355181399e993a5395c28d8db024cb161 Mon Sep 17 00:00:00 2001
From: Guldoman
Date: Sat, 28 May 2022 01:38:22 +0200
Subject: [PATCH 3/5] Allow using regex groups to split tokens

Before, this was only supported by Lua patterns.
This expects the regex to use the same syntax used for patterns.
That is, the token should be split by empty groups.
---
 data/core/tokenizer.lua | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index ebe550ff..3d935cae 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -174,6 +174,17 @@ function tokenizer.tokenize(incoming_syntax, text, state)
       if p.regex and #res > 0 then -- set correct utf8 len for regex result
         res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1
         res[1] = next
+        -- `regex.match` returns group results as a series of `begin, end`
+        -- we only want `begin`s
+        for i=1,(#res-3) do
+          local curr = i + 3
+          local from = i * 2 + 3
+          if from < #res then
+            res[curr] = string.uoffset(text, res[from])
+          else
+            res[curr] = nil
+          end
+        end
       end
       if res[1] and close and target[3] then
         local count = 0

From 7ac776bef66de7732d1b3075b5d21198a3f7e353 Mon Sep 17 00:00:00 2001
From: Guldoman
Date: Tue, 31 May 2022 01:59:14 +0200
Subject: [PATCH 4/5] Fix UTF-8 matches in regex group `tokenizer`

---
 data/core/tokenizer.lua | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index 3d935cae..6f3515b4 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -173,18 +173,21 @@ function tokenizer.tokenize(incoming_syntax, text, state)
         or { regex.match(code, text, text:ucharpos(next), (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) }
       if p.regex and #res > 0 then -- set correct utf8 len for regex result
         res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1
-        res[1] = next
         -- `regex.match` returns group results as a series of `begin, end`
         -- we only want `begin`s
+        if #res >= 3 then
+          res[3] = res[1] + string.ulen(text:sub(res[1], res[3])) - 1
+        end
         for i=1,(#res-3) do
           local curr = i + 3
           local from = i * 2 + 3
           if from < #res then
-            res[curr] = string.uoffset(text, res[from])
+            res[curr] = res[1] + string.ulen(text:sub(res[1], res[from])) - 1
           else
             res[curr] = nil
           end
         end
+        res[1] = next
       end
       if res[1] and close and target[3] then
         local count = 0
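Together, patches 3 and 4 let regex-based patterns split a match into several
tokens using the convention Lua patterns already follow: empty groups mark
the split points, and the tokenizer keeps only each group's `begin` offset
(converted to a UTF-8-aware index by patch 4). A hypothetical plugin fragment
showing the expected shape (the name, file extension, pattern, and token
types are invented for illustration, not taken from any real plugin):

  local syntax = require "core.syntax"

  syntax.add {
    name = "Example",
    files = { "%.example$" },
    patterns = {
      -- The two empty groups `()` split the match into three tokens:
      -- "local" as keyword, the whitespace as normal, the name as symbol.
      { regex = [[local()\s+()[a-zA-Z_]\w*]], type = { "keyword", "normal", "symbol" } },
    },
    symbols = {},
  }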
From d8efb1ab53c7e6414d78230219f5ae6655b8b9b0 Mon Sep 17 00:00:00 2001
From: Guldoman
Date: Tue, 31 May 2022 02:03:42 +0200
Subject: [PATCH 5/5] Show error if language plugin pattern has mismatching
 number of groups

The number of results from a pattern with groups must never be greater
than the number of token types for that pattern.

Also if a token type was undefined, it's now pushed as a `normal` one.
---
 data/core/tokenizer.lua | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index 6f3515b4..555d60b5 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -1,9 +1,12 @@
+local core = require "core"
 local syntax = require "core.syntax"
 local common = require "core.common"
 
 local tokenizer = {}
+local bad_patterns = {}
 
 local function push_token(t, type, text)
+  type = type or "normal"
   local prev_type = t[#t-1]
   local prev_text = t[#t]
   if prev_type and (prev_type == type or prev_text:ufind("^%s*$")) then
@@ -256,6 +259,15 @@ function tokenizer.tokenize(incoming_syntax, text, state)
     local matched = false
     for n, p in ipairs(current_syntax.patterns) do
      local find_results = { find_text(text, p, i, true, false) }
+      if #find_results - 1 > #p.type then
+        if not bad_patterns[current_syntax] then
+          bad_patterns[current_syntax] = { }
+        end
+        if not bad_patterns[current_syntax][n] then
+          bad_patterns[current_syntax][n] = true
+          core.error("Malformed pattern #%d in %s language plugin", n, current_syntax.name or "unnamed")
+        end
+      end
       if find_results[1] then
         -- matched pattern; make and add tokens
         push_tokens(res, current_syntax, p, text, find_results)
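Patch 5's check compares the number of match results (minus the whole-match
entry) against the number of declared token types, and reports each offending
pattern only once per syntax thanks to the `bad_patterns` cache. A
hypothetical malformed fragment that would now be flagged (again, the names
are invented for illustration):

  syntax.add {
    name = "Example",
    files = { "%.example$" },
    patterns = {
      -- Two empty groups produce three token slices but only two types are
      -- declared, so the tokenizer logs
      --   Malformed pattern #1 in Example language plugin
      -- once, and `push_token` falls back to "normal" for the missing type.
      { regex = [[local()\s+()\w+]], type = { "keyword", "symbol" } },
    },
    symbols = {},
  }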