tokenizer: remove the limit of 3 subsyntaxes depth (#1186)

* tokenizer: remove the limit of 3 subsyntaxes depth Make the state a string of bytes instead of a 32bits integer to be able to have deeper subsyntax support. Fixes issues with syntax files like the one for PHP that was already hitting more than 3 subsyntaxes depth. * remove unnecesary call to set_subsyntax_pattern_idx * fixed wrong word on comments
2022-11-03 18:56:20 -04:00 · 2022-11-03 18:56:20 -04:00 · b8a4f729df
parent 03cc5ffcd1
commit b8a4f729df
1 changed files with 41 additions and 25 deletions
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@ -1,6 +1,5 @@
 local core = require "core"
 local syntax = require "core.syntax"
-local common = require "core.common"

 local tokenizer = {}
 local bad_patterns = {}
@ -51,31 +50,37 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
  end
 end

+-- State is a string of bytes, where the count of bytes represents the depth
+-- of the subsyntax we are currently in. Each individual byte represents the
+-- index of the pattern for the current subsyntax in relation to its parent
+-- syntax. Using a string of bytes allows us to have as many subsyntaxes as
+-- bytes can be stored on a string while keeping some level of performance in
+-- comparison to a Lua table. The only limitation is that a syntax would not
+-- be able to contain more than 255 patterns.
+--
+-- Lets say a state contains 2 bytes byte #1 with value `3` and byte #2 with
+-- a value of `5`. This would mean that on the parent syntax at index `3` a
+-- pattern subsyntax that matched current text was found, then inside that
+-- subsyntax another subsyntax pattern at index `5` that matched current text
+-- was also found.

-- State is a 32-bit number that is four separate bytes, illustrating how many
-- differnet delimiters we have open, and which subsyntaxes we have active.
-- At most, there are 3 subsyntaxes active at the same time. Beyond that,
-- does not support further highlighting.
+-- Calling `push_subsyntax` appends the current subsyntax pattern index to the
+-- state and increases the stack depth. Calling `pop_subsyntax` clears the
+-- last appended subsyntax and decreases the stack.

-- You can think of it as a maximum 4 integer (0-255) stack. It always has
-- 1 integer in it. Calling `push_subsyntax` increases the stack depth. Calling
-- `pop_subsyntax` decreases it. The integers represent the index of a pattern
-- that we're following in the syntax. The top of the stack can be any valid
-- pattern index, any integer lower in the stack must represent a pattern that
-- specifies a subsyntax.
-
-- If you do not have subsyntaxes in your syntax, the three most
-- singificant numbers will always be 0, the stack will only ever be length 1
-- and the state variable will only ever range from 0-255.
 local function retrieve_syntax_state(incoming_syntax, state)
  local current_syntax, subsyntax_info, current_pattern_idx, current_level =
-    incoming_syntax, nil, state, 0
-  if state > 0 and (state > 255 or current_syntax.patterns[state].syntax) then
-    -- If we have higher bits, then decode them one at a time, and find which
+    incoming_syntax, nil, state:byte(1) or 0, 1
+  if
+    current_pattern_idx > 0
+    and
+    current_syntax.patterns[current_pattern_idx]
+  then
+    -- If the state is not empty we iterate over each byte, and find which
    -- syntax we're using. Rather than walking the bytes, and calling into
    -- `syntax` each time, we could probably cache this in a single table.
-    for i = 0, 2 do
-      local target = bit32.extract(state, i*8, 8)
+    for i = 1, #state do
+      local target = state:byte(i)
      if target ~= 0 then
        if current_syntax.patterns[target].syntax then
          subsyntax_info = current_syntax.patterns[target]
@ -107,7 +112,7 @@ end

 ---@param incoming_syntax table
 ---@param text string
---@param state integer
+---@param state string
 function tokenizer.tokenize(incoming_syntax, text, state)
  local res = {}
  local i = 1
@ -116,9 +121,9 @@ function tokenizer.tokenize(incoming_syntax, text, state)
    return { "normal", text }
  end

-  state = state or 0
+  state = state or ""
  -- incoming_syntax    : the parent syntax of the file.
-  -- state              : a 32-bit number representing syntax state (see above)
+  -- state              : a string of bytes representing syntax state (see above)

  -- current_syntax     : the syntax we're currently in.
  -- subsyntax_info     : info about the delimiters of this subsyntax.
@ -130,7 +135,18 @@ function tokenizer.tokenize(incoming_syntax, text, state)
  -- Should be used to set the state variable. Don't modify it directly.
  local function set_subsyntax_pattern_idx(pattern_idx)
    current_pattern_idx = pattern_idx
-    state = bit32.replace(state, pattern_idx, current_level*8, 8)
+    local state_len = #state
+    if current_level > state_len then
+      state = state .. string.char(pattern_idx)
+    elseif state_len == 1 then
+      state = string.char(pattern_idx)
+    else
+      state = ("%s%s%s"):format(
+        state:sub(1,current_level-1),
+        string.char(pattern_idx),
+        state:sub(current_level+1)
+      )
+    end
  end


@ -144,8 +160,8 @@ function tokenizer.tokenize(incoming_syntax, text, state)
  end

  local function pop_subsyntax()
-    set_subsyntax_pattern_idx(0)
    current_level = current_level - 1
+    state = string.sub(state, 1, current_level)
    set_subsyntax_pattern_idx(0)
    current_syntax, subsyntax_info, current_pattern_idx, current_level =
      retrieve_syntax_state(incoming_syntax, state)