From d9925b7d447d1b6df5935c88f3caac35ee7f729a Mon Sep 17 00:00:00 2001
From: Guldoman
Date: Mon, 6 Feb 2023 20:24:40 +0100
Subject: [PATCH] Allow groups to be used in end delimiter patterns in tokenizer (#1317)

* Allow empty groups as first match in tokenizer

* Avoid pushing tokens with empty strings

* Allow groups to be used in end delimiter in tokenizer

* Use the first entry of the type table for the middle part of a subsyntax

This applies to delimited matches with a table for `type` and without a
`syntax` field.

* Match only once if using `at_start` in tokenizer `find_text`

* Check if match is escaped in the "close" case too

Also allow continuing matching if the match was escaped.
---
 data/core/tokenizer.lua | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index b2126124..0a3b58b1 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -5,6 +5,7 @@ local tokenizer = {}
 local bad_patterns = {}
 
 local function push_token(t, type, text)
+  if not text or #text == 0 then return end
   type = type or "normal"
   local prev_type = t[#t-1]
   local prev_text = t[#t]
@@ -26,11 +27,8 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
     -- Each position spans characters from i_n to ((i_n+1) - 1), to form
     -- consecutive spans of text.
     --
-    -- If i_1 is not equal to start, start is automatically inserted at
-    -- that index.
-    if find_results[3] ~= find_results[1] then
-      table.insert(find_results, 3, find_results[1])
-    end
+    -- Insert the start index at i_1 to make iterating easier
+    table.insert(find_results, 3, find_results[1])
     -- Copy the ending index to the end of the table, so that an ending index
     -- always follows a starting index after position 3 in the table.
     table.insert(find_results, find_results[2] + 1)
@@ -40,8 +38,10 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
       local fin = find_results[i + 1] - 1
       local type = pattern.type[i - 2]
         -- ↑ (i - 2) to convert from [3; n] to [1; n]
-      local text = full_text:usub(start, fin)
-      push_token(t, syn.symbols[text] or type, text)
+      if fin >= start then
+        local text = full_text:usub(start, fin)
+        push_token(t, syn.symbols[text] or type, text)
+      end
     end
   else
     local start, fin = find_results[1], find_results[2]
@@ -224,6 +224,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
         res[1] = char_pos_1
         res[2] = char_pos_2
       end
+      if not res[1] then return end
       if res[1] and target[3] then
         -- Check to see if the escaped character is there,
         -- and if it is not itself escaped.
@@ -235,12 +236,12 @@ function tokenizer.tokenize(incoming_syntax, text, state)
         if count % 2 == 0 then
           -- The match is not escaped, so confirm it
           break
-        elseif not close then
-          -- The *open* match is escaped, so avoid it
-          return
+        else
+          -- The match is escaped, so avoid it
+          res[1] = false
         end
       end
-    until not res[1] or not close or not target[3]
+    until at_start or not close or not target[3]
     return table.unpack(res)
   end
 
@@ -250,6 +251,9 @@ function tokenizer.tokenize(incoming_syntax, text, state)
     if current_pattern_idx > 0 then
       local p = current_syntax.patterns[current_pattern_idx]
       local s, e = find_text(text, p, i, false, true)
+      -- Use the first token type specified in the type table for the "middle"
+      -- part of the subsyntax.
+      local token_type = type(p.type) == "table" and p.type[1] or p.type
       local cont = true
       -- If we're in subsyntax mode, always check to see if we end our syntax
       -- here, and if we do, return early.
@@ -262,7 +266,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
         -- treat the bit after as a token to be normally parsed
         -- (as it's the syntax delimiter).
         if ss and (s == nil or ss < s) then
-          push_token(res, p.type, text:usub(i, ss - 1))
+          push_token(res, token_type, text:usub(i, ss - 1))
           i = ss
           cont = false
         end
@@ -271,11 +275,11 @@ function tokenizer.tokenize(incoming_syntax, text, state)
       -- continue on as normal.
       if cont then
         if s then
-          push_token(res, p.type, text:usub(i, e))
+          push_token(res, token_type, text:usub(i, e))
           set_subsyntax_pattern_idx(0)
           i = e + 1
         else
-          push_token(res, p.type, text:usub(i))
+          push_token(res, token_type, text:usub(i))
           break
         end
       end
@@ -284,9 +288,10 @@ function tokenizer.tokenize(incoming_syntax, text, state)
     -- we're ending early in the middle of a delimiter, or
     -- just normally, upon finding a token.
     while subsyntax_info do
-      local s, e = find_text(text, subsyntax_info, i, true, true)
+      local find_results = { find_text(text, subsyntax_info, i, true, true) }
+      local s, e = find_results[1], find_results[2]
       if s then
-        push_token(res, subsyntax_info.type, text:usub(i, e))
+        push_tokens(res, current_syntax, subsyntax_info, text, find_results)
         -- On finding unescaped delimiter, pop it.
         pop_subsyntax()
         i = e + 1
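
To illustrate the kind of rule this enables, here is a hypothetical sketch in the
usual lite-xl `syntax.add` pattern format; the delimiters, capture placement, and
token types below are placeholders, not taken from this patch or from any bundled
language plugin. A delimited pattern may now carry a capture in its closing
delimiter together with a `type` table and no `syntax` field: `push_tokens`
treats the capture positions as boundaries of consecutive spans of the close
match, and the body of the region takes the first entry of the table.

  -- Hypothetical rule sketch: a region delimited by "<<<" and ">>>".
  {
    -- The empty capture () in the end delimiter marks a span boundary,
    -- which this patch now allows in close patterns.
    pattern = { "<<<", "()>>>" },
    -- type[1] ("string") is used for the middle of the region; later entries
    -- type the spans delimited by the captures in the closing match. Empty
    -- spans are dropped by the new checks in push_token and push_tokens.
    type = { "string", "operator" },
  },

Rules with a plain string `type` behave as before, since `token_type` falls back
to `p.type` whenever it is not a table.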