diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua index 89364f28..0b9b4ac6 100644 --- a/data/core/tokenizer.lua +++ b/data/core/tokenizer.lua @@ -225,97 +225,99 @@ function tokenizer.tokenize(incoming_syntax, text, state) end local text_len = text:ulen() - while i <= text_len do - -- continue trying to match the end pattern of a pair if we have a state set - if current_pattern_idx > 0 then - local p = current_syntax.patterns[current_pattern_idx] - local s, e = find_text(text, p, i, false, true) + if text_len ~= nil then + while i <= text_len do + -- continue trying to match the end pattern of a pair if we have a state set + if current_pattern_idx > 0 then + local p = current_syntax.patterns[current_pattern_idx] + local s, e = find_text(text, p, i, false, true) - local cont = true - -- If we're in subsyntax mode, always check to see if we end our syntax - -- first, before the found delimeter, as ending the subsyntax takes - -- precedence over ending the delimiter in the subsyntax. - if subsyntax_info then - local ss, se = find_text(text, subsyntax_info, i, false, true) - -- If we find that we end the subsyntax before the - -- delimiter, push the token, and signal we shouldn't - -- treat the bit after as a token to be normally parsed - -- (as it's the syntax delimiter). - if ss and (s == nil or ss < s) then - push_token(res, p.type, text:usub(i, ss - 1)) - i = ss - cont = false + local cont = true + -- If we're in subsyntax mode, always check to see if we end our syntax + -- first, before the found delimeter, as ending the subsyntax takes + -- precedence over ending the delimiter in the subsyntax. + if subsyntax_info then + local ss, se = find_text(text, subsyntax_info, i, false, true) + -- If we find that we end the subsyntax before the + -- delimiter, push the token, and signal we shouldn't + -- treat the bit after as a token to be normally parsed + -- (as it's the syntax delimiter). + if ss and (s == nil or ss < s) then + push_token(res, p.type, text:usub(i, ss - 1)) + i = ss + cont = false + end + end + -- If we don't have any concerns about syntax delimiters, + -- continue on as normal. + if cont then + if s then + push_token(res, p.type, text:usub(i, e)) + set_subsyntax_pattern_idx(0) + i = e + 1 + else + push_token(res, p.type, text:usub(i)) + break + end end end - -- If we don't have any concerns about syntax delimiters, - -- continue on as normal. - if cont then + -- General end of syntax check. Applies in the case where + -- we're ending early in the middle of a delimiter, or + -- just normally, upon finding a token. + if subsyntax_info then + local s, e = find_text(text, subsyntax_info, i, true, true) if s then - push_token(res, p.type, text:usub(i, e)) - set_subsyntax_pattern_idx(0) + push_token(res, subsyntax_info.type, text:usub(i, e)) + -- On finding unescaped delimiter, pop it. + pop_subsyntax() i = e + 1 - else - push_token(res, p.type, text:usub(i)) + end + end + + -- find matching pattern + local matched = false + for n, p in ipairs(current_syntax.patterns) do + local find_results = { find_text(text, p, i, true, false) } + if find_results[1] then + local type_is_table = type(p.type) == "table" + local n_types = type_is_table and #p.type or 1 + if #find_results == 2 and type_is_table then + report_bad_pattern(core.warn, current_syntax, n, + "Token type is a table, but a string was expected.") + p.type = p.type[1] + elseif #find_results - 1 > n_types then + report_bad_pattern(core.error, current_syntax, n, + "Not enough token types: got %d needed %d.", n_types, #find_results - 1) + elseif #find_results - 1 < n_types then + report_bad_pattern(core.warn, current_syntax, n, + "Too many token types: got %d needed %d.", n_types, #find_results - 1) + end + -- matched pattern; make and add tokens + push_tokens(res, current_syntax, p, text, find_results) + -- update state if this was a start|end pattern pair + if type(p.pattern or p.regex) == "table" then + -- If we have a subsyntax, push that onto the subsyntax stack. + if p.syntax then + push_subsyntax(p, n) + else + set_subsyntax_pattern_idx(n) + end + end + -- move cursor past this token + i = find_results[2] + 1 + matched = true break end end - end - -- General end of syntax check. Applies in the case where - -- we're ending early in the middle of a delimiter, or - -- just normally, upon finding a token. - if subsyntax_info then - local s, e = find_text(text, subsyntax_info, i, true, true) - if s then - push_token(res, subsyntax_info.type, text:usub(i, e)) - -- On finding unescaped delimiter, pop it. - pop_subsyntax() - i = e + 1 - end - end - -- find matching pattern - local matched = false - for n, p in ipairs(current_syntax.patterns) do - local find_results = { find_text(text, p, i, true, false) } - if find_results[1] then - local type_is_table = type(p.type) == "table" - local n_types = type_is_table and #p.type or 1 - if #find_results == 2 and type_is_table then - report_bad_pattern(core.warn, current_syntax, n, - "Token type is a table, but a string was expected.") - p.type = p.type[1] - elseif #find_results - 1 > n_types then - report_bad_pattern(core.error, current_syntax, n, - "Not enough token types: got %d needed %d.", n_types, #find_results - 1) - elseif #find_results - 1 < n_types then - report_bad_pattern(core.warn, current_syntax, n, - "Too many token types: got %d needed %d.", n_types, #find_results - 1) - end - -- matched pattern; make and add tokens - push_tokens(res, current_syntax, p, text, find_results) - -- update state if this was a start|end pattern pair - if type(p.pattern or p.regex) == "table" then - -- If we have a subsyntax, push that onto the subsyntax stack. - if p.syntax then - push_subsyntax(p, n) - else - set_subsyntax_pattern_idx(n) - end - end - -- move cursor past this token - i = find_results[2] + 1 - matched = true - break + -- consume character if we didn't match + if not matched then + push_token(res, "normal", text:usub(i, i)) + i = i + 1 end end - - -- consume character if we didn't match - if not matched then - push_token(res, "normal", text:usub(i, i)) - i = i + 1 - end end - + return res, state end