Allow groups to be used in end delimiter patterns in tokenizer (#1317)

* Allow empty groups as first match in tokenizer
* Avoid pushing tokens with empty strings
* Allow groups to be used in end delimiter in tokenizer
* Use the first entry of the type table for the middle part of a subsyntax
  This applies to delimited matches with a table for `type` and without a `syntax` field.
* Match only once if using `at_start` in tokenizer `find_text`
* Check if match is escaped in the "close" case too
  Also allow continuing matching if the match was escaped.
2023-02-06 20:24:40 +01:00 · 2023-02-06 20:24:40 +01:00 · 5f24108772
parent a0c05791b1
commit 5f24108772
1 changed files with 54 additions and 82 deletions
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@ -5,6 +5,7 @@ local tokenizer = {}
 local bad_patterns = {}

 local function push_token(t, type, text)
+  if not text or #text == 0 then return end
  type = type or "normal"
  local prev_type = t[#t-1]
  local prev_text = t[#t]
@ -26,11 +27,8 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
    -- Each position spans characters from i_n to ((i_n+1) - 1), to form
    -- consecutive spans of text.
    --
-    -- If i_1 is not equal to start, start is automatically inserted at
-    -- that index.
-    if find_results[3] ~= find_results[1] then
-      table.insert(find_results, 3, find_results[1])
-    end
+    -- Insert the start index at i_1 to make iterating easier
+    table.insert(find_results, 3, find_results[1])
    -- Copy the ending index to the end of the table, so that an ending index
    -- always follows a starting index after position 3 in the table.
    table.insert(find_results, find_results[2] + 1)
@ -40,8 +38,10 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
      local fin = find_results[i + 1] - 1
      local type = pattern.type[i - 2]
        -- ↑ (i - 2) to convert from [3; n] to [1; n]
-      local text = full_text:usub(start, fin)
-      push_token(t, syn.symbols[text] or type, text)
+      if fin >= start then
+        local text = full_text:usub(start, fin)
+        push_token(t, syn.symbols[text] or type, text)
+      end
    end
  else
    local start, fin = find_results[1], find_results[2]
@ -224,6 +224,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
        res[1] = char_pos_1
        res[2] = char_pos_2
      end
+      if not res[1] then return end
      if res[1] and target[3] then
        -- Check to see if the escaped character is there,
        -- and if it is not itself escaped.
@ -235,50 +236,39 @@ function tokenizer.tokenize(incoming_syntax, text, state)
        if count % 2 == 0 then
          -- The match is not escaped, so confirm it
          break
-        elseif not close then
-          -- The *open* match is escaped, so avoid it
-          return
+        else
+          -- The match is escaped, so avoid it
+          res[1] = false
        end
      end
-    until not res[1] or not close or not target[3]
+    until at_start or not close or not target[3]
    return table.unpack(res)
  end

  local text_len = text:ulen()
-  if text_len ~= nil then
-    while i <= text_len do
-      -- continue trying to match the end pattern of a pair if we have a state set
-      if current_pattern_idx > 0 then
-        local p = current_syntax.patterns[current_pattern_idx]
-        local s, e = find_text(text, p, i, false, true)
+  while i <= text_len do
+    -- continue trying to match the end pattern of a pair if we have a state set
+    if current_pattern_idx > 0 then
+      local p = current_syntax.patterns[current_pattern_idx]
+      local s, e = find_text(text, p, i, false, true)
+      -- Use the first token type specified in the type table for the "middle"
+      -- part of the subsyntax.
+      local token_type = type(p.type) == "table" and p.type[1] or p.type

-        local cont = true
-        -- If we're in subsyntax mode, always check to see if we end our syntax
-        -- first, before the found delimeter, as ending the subsyntax takes
-        -- precedence over ending the delimiter in the subsyntax.
-        if subsyntax_info then
-          local ss, se = find_text(text, subsyntax_info, i, false, true)
-          -- If we find that we end the subsyntax before the
-          -- delimiter, push the token, and signal we shouldn't
-          -- treat the bit after as a token to be normally parsed
-          -- (as it's the syntax delimiter).
-          if ss and (s == nil or ss < s) then
-            push_token(res, p.type, text:usub(i, ss - 1))
-            i = ss
-            cont = false
-          end
-        end
-        -- If we don't have any concerns about syntax delimiters,
-        -- continue on as normal.
-        if cont then
-          if s then
-            push_token(res, p.type, text:usub(i, e))
-            set_subsyntax_pattern_idx(0)
-            i = e + 1
-          else
-            push_token(res, p.type, text:usub(i))
-            break
-          end
+      local cont = true
+      -- If we're in subsyntax mode, always check to see if we end our syntax
+      -- first, before the found delimeter, as ending the subsyntax takes
+      -- precedence over ending the delimiter in the subsyntax.
+      if subsyntax_info then
+        local ss, se = find_text(text, subsyntax_info, i, false, true)
+        -- If we find that we end the subsyntax before the
+        -- delimiter, push the token, and signal we shouldn't
+        -- treat the bit after as a token to be normally parsed
+        -- (as it's the syntax delimiter).
+        if ss and (s == nil or ss < s) then
+          push_token(res, token_type, text:usub(i, ss - 1))
+          i = ss
+          cont = false
        end
      end
      -- General end of syntax check. Applies in the case where
@ -287,48 +277,30 @@ function tokenizer.tokenize(incoming_syntax, text, state)
      if subsyntax_info then
        local s, e = find_text(text, subsyntax_info, i, true, true)
        if s then
-          push_token(res, subsyntax_info.type, text:usub(i, e))
-          -- On finding unescaped delimiter, pop it.
-          pop_subsyntax()
+          push_token(res, token_type, text:usub(i, e))
+          set_subsyntax_pattern_idx(0)
          i = e + 1
-        end
-      end
-
-      -- find matching pattern
-      local matched = false
-      for n, p in ipairs(current_syntax.patterns) do
-        local find_results = { find_text(text, p, i, true, false) }
-        if find_results[1] then
-          local type_is_table = type(p.type) == "table"
-          local n_types = type_is_table and #p.type or 1
-          if #find_results == 2 and type_is_table then
-            report_bad_pattern(core.warn, current_syntax, n,
-              "Token type is a table, but a string was expected.")
-            p.type = p.type[1]
-          elseif #find_results - 1 > n_types then
-            report_bad_pattern(core.error, current_syntax, n,
-              "Not enough token types: got %d needed %d.", n_types, #find_results - 1)
-          elseif #find_results - 1 < n_types then
-            report_bad_pattern(core.warn, current_syntax, n,
-              "Too many token types: got %d needed %d.", n_types, #find_results - 1)
-          end
-          -- matched pattern; make and add tokens
-          push_tokens(res, current_syntax, p, text, find_results)
-          -- update state if this was a start|end pattern pair
-          if type(p.pattern or p.regex) == "table" then
-            -- If we have a subsyntax, push that onto the subsyntax stack.
-            if p.syntax then
-              push_subsyntax(p, n)
-            else
-              set_subsyntax_pattern_idx(n)
-            end
-          end
-          -- move cursor past this token
-          i = find_results[2] + 1
-          matched = true
+        else
+          push_token(res, token_type, text:usub(i))
          break
        end
      end
+    end
+    -- General end of syntax check. Applies in the case where
+    -- we're ending early in the middle of a delimiter, or
+    -- just normally, upon finding a token.
+    while subsyntax_info do
+      local find_results = { find_text(text, subsyntax_info, i, true, true) }
+      local s, e = find_results[1], find_results[2]
+      if s then
+        push_tokens(res, current_syntax, subsyntax_info, text, find_results)
+        -- On finding unescaped delimiter, pop it.
+        pop_subsyntax()
+        i = e + 1
+      else
+        break
+      end
+    end

      -- consume character if we didn't match
      if not matched then