Allow groups to be used in end delimiter patterns in tokenizer (#1317)

* Allow empty groups as first match in tokenizer
* Avoid pushing tokens with empty strings
* Allow groups to be used in end delimiter in tokenizer
* Use the first entry of the type table for the middle part of a subsyntax
This applies to delimited matches with a table for `type` and without a
`syntax` field.
* Match only once if using `at_start` in tokenizer `find_text`
* Check if match is escaped in the "close" case too
Also allow continuing matching if the match was escaped.
This commit is contained in:
Guldoman 2023-02-06 20:24:40 +01:00 committed by George Sokianos
parent a0c05791b1
commit 5f24108772
1 changed files with 54 additions and 82 deletions

View File

@ -5,6 +5,7 @@ local tokenizer = {}
local bad_patterns = {}
local function push_token(t, type, text)
if not text or #text == 0 then return end
type = type or "normal"
local prev_type = t[#t-1]
local prev_text = t[#t]
@ -26,11 +27,8 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
-- Each position spans characters from i_n to ((i_n+1) - 1), to form
-- consecutive spans of text.
--
-- If i_1 is not equal to start, start is automatically inserted at
-- that index.
if find_results[3] ~= find_results[1] then
table.insert(find_results, 3, find_results[1])
end
-- Insert the start index at i_1 to make iterating easier
table.insert(find_results, 3, find_results[1])
-- Copy the ending index to the end of the table, so that an ending index
-- always follows a starting index after position 3 in the table.
table.insert(find_results, find_results[2] + 1)
@ -40,8 +38,10 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
local fin = find_results[i + 1] - 1
local type = pattern.type[i - 2]
-- ↑ (i - 2) to convert from [3; n] to [1; n]
local text = full_text:usub(start, fin)
push_token(t, syn.symbols[text] or type, text)
if fin >= start then
local text = full_text:usub(start, fin)
push_token(t, syn.symbols[text] or type, text)
end
end
else
local start, fin = find_results[1], find_results[2]
@ -224,6 +224,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
res[1] = char_pos_1
res[2] = char_pos_2
end
if not res[1] then return end
if res[1] and target[3] then
-- Check to see if the escaped character is there,
-- and if it is not itself escaped.
@ -235,50 +236,39 @@ function tokenizer.tokenize(incoming_syntax, text, state)
if count % 2 == 0 then
-- The match is not escaped, so confirm it
break
elseif not close then
-- The *open* match is escaped, so avoid it
return
else
-- The match is escaped, so avoid it
res[1] = false
end
end
until not res[1] or not close or not target[3]
until at_start or not close or not target[3]
return table.unpack(res)
end
local text_len = text:ulen()
if text_len ~= nil then
while i <= text_len do
-- continue trying to match the end pattern of a pair if we have a state set
if current_pattern_idx > 0 then
local p = current_syntax.patterns[current_pattern_idx]
local s, e = find_text(text, p, i, false, true)
while i <= text_len do
-- continue trying to match the end pattern of a pair if we have a state set
if current_pattern_idx > 0 then
local p = current_syntax.patterns[current_pattern_idx]
local s, e = find_text(text, p, i, false, true)
-- Use the first token type specified in the type table for the "middle"
-- part of the subsyntax.
local token_type = type(p.type) == "table" and p.type[1] or p.type
local cont = true
-- If we're in subsyntax mode, always check to see if we end our syntax
-- first, before the found delimeter, as ending the subsyntax takes
-- precedence over ending the delimiter in the subsyntax.
if subsyntax_info then
local ss, se = find_text(text, subsyntax_info, i, false, true)
-- If we find that we end the subsyntax before the
-- delimiter, push the token, and signal we shouldn't
-- treat the bit after as a token to be normally parsed
-- (as it's the syntax delimiter).
if ss and (s == nil or ss < s) then
push_token(res, p.type, text:usub(i, ss - 1))
i = ss
cont = false
end
end
-- If we don't have any concerns about syntax delimiters,
-- continue on as normal.
if cont then
if s then
push_token(res, p.type, text:usub(i, e))
set_subsyntax_pattern_idx(0)
i = e + 1
else
push_token(res, p.type, text:usub(i))
break
end
local cont = true
-- If we're in subsyntax mode, always check to see if we end our syntax
-- first, before the found delimeter, as ending the subsyntax takes
-- precedence over ending the delimiter in the subsyntax.
if subsyntax_info then
local ss, se = find_text(text, subsyntax_info, i, false, true)
-- If we find that we end the subsyntax before the
-- delimiter, push the token, and signal we shouldn't
-- treat the bit after as a token to be normally parsed
-- (as it's the syntax delimiter).
if ss and (s == nil or ss < s) then
push_token(res, token_type, text:usub(i, ss - 1))
i = ss
cont = false
end
end
-- General end of syntax check. Applies in the case where
@ -287,48 +277,30 @@ function tokenizer.tokenize(incoming_syntax, text, state)
if subsyntax_info then
local s, e = find_text(text, subsyntax_info, i, true, true)
if s then
push_token(res, subsyntax_info.type, text:usub(i, e))
-- On finding unescaped delimiter, pop it.
pop_subsyntax()
push_token(res, token_type, text:usub(i, e))
set_subsyntax_pattern_idx(0)
i = e + 1
end
end
-- find matching pattern
local matched = false
for n, p in ipairs(current_syntax.patterns) do
local find_results = { find_text(text, p, i, true, false) }
if find_results[1] then
local type_is_table = type(p.type) == "table"
local n_types = type_is_table and #p.type or 1
if #find_results == 2 and type_is_table then
report_bad_pattern(core.warn, current_syntax, n,
"Token type is a table, but a string was expected.")
p.type = p.type[1]
elseif #find_results - 1 > n_types then
report_bad_pattern(core.error, current_syntax, n,
"Not enough token types: got %d needed %d.", n_types, #find_results - 1)
elseif #find_results - 1 < n_types then
report_bad_pattern(core.warn, current_syntax, n,
"Too many token types: got %d needed %d.", n_types, #find_results - 1)
end
-- matched pattern; make and add tokens
push_tokens(res, current_syntax, p, text, find_results)
-- update state if this was a start|end pattern pair
if type(p.pattern or p.regex) == "table" then
-- If we have a subsyntax, push that onto the subsyntax stack.
if p.syntax then
push_subsyntax(p, n)
else
set_subsyntax_pattern_idx(n)
end
end
-- move cursor past this token
i = find_results[2] + 1
matched = true
else
push_token(res, token_type, text:usub(i))
break
end
end
end
-- General end of syntax check. Applies in the case where
-- we're ending early in the middle of a delimiter, or
-- just normally, upon finding a token.
while subsyntax_info do
local find_results = { find_text(text, subsyntax_info, i, true, true) }
local s, e = find_results[1], find_results[2]
if s then
push_tokens(res, current_syntax, subsyntax_info, text, find_results)
-- On finding unescaped delimiter, pop it.
pop_subsyntax()
i = e + 1
else
break
end
end
-- consume character if we didn't match
if not matched then