From 949692860e777ff6be0ddc013806f20302e7ffea Mon Sep 17 00:00:00 2001 From: Adam Date: Thu, 20 May 2021 15:58:27 -0400 Subject: [PATCH] Tokenizer cleanup (#198) * Cleaned up tokenizer to make subsyntax operations more clear. * Explanatory comments. * Made it so push_subsyntax could be safely called elsewhere. * Unified terminology. * Minor bug fix. * State is an incredibly vaguely named variable. Changed convention to represent what it actually is. * Also changed function name. * Fixed bug. --- data/core/tokenizer.lua | 99 +++++++++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 28 deletions(-) diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua index 6d51928c..83e0e665 100644 --- a/data/core/tokenizer.lua +++ b/data/core/tokenizer.lua @@ -74,9 +74,20 @@ end -- State is a 32-bit number that is four separate bytes, illustrating how many -- differnet delimiters we have open, and which subsyntaxes we have active. -- At most, there are 3 subsyntaxes active at the same time. Beyond that, --- does not support further highlighting. +-- does not support further highlighting. + +-- You can think of it as a maximum 4 integer (0-255) stack. It always has +-- 1 integer in it. Calling `push_subsyntax` increases the stack depth. Calling +-- `pop_subsyntax` decreases it. The integers represent the index of a pattern +-- that we're following in the syntax. The top of the stack can be any valid +-- pattern index, any integer lower in the stack must represent a pattern that +-- specifies a subsyntax. + +-- If you do not have subsyntaxes in your syntax, the three most +-- significant numbers will always be 0, the stack will only ever be length 1 +-- and the state variable will only ever range from 0-255. 
local function retrieve_syntax_state(incoming_syntax, state) - local current_syntax, subsyntax_info, current_state, current_level = + local current_syntax, subsyntax_info, current_pattern_idx, current_level = incoming_syntax, nil, state, 0 if state > 0 and (state > 255 or current_syntax.patterns[state].syntax) then -- If we have higher bits, then decode them one at a time, and find which @@ -89,10 +100,10 @@ local function retrieve_syntax_state(incoming_syntax, state) subsyntax_info = current_syntax.patterns[target] current_syntax = type(subsyntax_info.syntax) == "table" and subsyntax_info.syntax or syntax.get(subsyntax_info.syntax) - current_state = 0 + current_pattern_idx = 0 current_level = i+1 else - current_state = target + current_pattern_idx = target break end else @@ -100,7 +111,7 @@ local function retrieve_syntax_state(incoming_syntax, state) end end end - return current_syntax, subsyntax_info, current_state, current_level + return current_syntax, subsyntax_info, current_pattern_idx, current_level end function tokenizer.tokenize(incoming_syntax, text, state) @@ -112,17 +123,51 @@ function tokenizer.tokenize(incoming_syntax, text, state) end state = state or 0 - local current_syntax, subsyntax_info, current_state, current_level = + -- incoming_syntax : the parent syntax of the file. + -- state : a 32-bit number representing syntax state (see above) + + -- current_syntax : the syntax we're currently in. + -- subsyntax_info : info about the delimiters of this subsyntax. + -- current_pattern_idx: the index of the pattern we're on for this syntax. + -- current_level : how many subsyntaxes deep we are. + local current_syntax, subsyntax_info, current_pattern_idx, current_level = retrieve_syntax_state(incoming_syntax, state) + + -- Should be used to set the state variable. Don't modify it directly. 
+ local function set_subsyntax_pattern_idx(pattern_idx) + current_pattern_idx = pattern_idx + state = bit32.replace(state, pattern_idx, current_level*8, 8) + end + + + local function push_subsyntax(entering_syntax, pattern_idx) + set_subsyntax_pattern_idx(pattern_idx) + current_level = current_level + 1 + subsyntax_info = entering_syntax + current_syntax = type(entering_syntax.syntax) == "table" and + entering_syntax.syntax or syntax.get(entering_syntax.syntax) + current_pattern_idx = 0 + end + + local function pop_subsyntax() + set_subsyntax_pattern_idx(0) + current_level = current_level - 1 + set_subsyntax_pattern_idx(0) + current_syntax, subsyntax_info, current_pattern_idx, current_level = + retrieve_syntax_state(incoming_syntax, state) + + end + + while i <= #text do -- continue trying to match the end pattern of a pair if we have a state set - if current_state > 0 then - local p = current_syntax.patterns[current_state] + if current_pattern_idx > 0 then + local p = current_syntax.patterns[current_pattern_idx] local s, e = find_non_escaped(text, p.pattern[2], i, p.pattern[3]) local cont = true -- If we're in subsyntax mode, always check to see if we end our syntax - -- first. + -- first, before the found delimiter, as ending the subsyntax takes + -- precedence over ending the delimiter in the subsyntax. if subsyntax_info then local ss, se = find_non_escaped( text, @@ -130,17 +175,22 @@ function tokenizer.tokenize(incoming_syntax, text, state) i, subsyntax_info.pattern[3] ) + -- If we find that we end the subsyntax before the + -- delimiter, push the token, and signal we shouldn't + -- treat the bit after as a token to be normally parsed + -- (as it's the syntax delimiter). if ss and (s == nil or ss < s) then push_token(res, p.type, text:sub(i, ss - 1)) i = ss cont = false end end + -- If we don't have any concerns about syntax delimiters, + -- continue on as normal. 
if cont then if s then push_token(res, p.type, text:sub(i, e)) - current_state = 0 - state = bit32.replace(state, 0, current_level*8, 8) + set_subsyntax_pattern_idx(0) i = e + 1 else push_token(res, p.type, text:sub(i)) @@ -148,7 +198,9 @@ function tokenizer.tokenize(incoming_syntax, text, state) end end end - -- Check for end of syntax. + -- General end of syntax check. Applies in the case where + -- we're ending early in the middle of a delimiter, or + -- just normally, upon finding a token. if subsyntax_info then local s, e = find_non_escaped( text, @@ -158,11 +210,8 @@ function tokenizer.tokenize(incoming_syntax, text, state) ) if s then push_token(res, subsyntax_info.type, text:sub(i, e)) - current_level = current_level - 1 - -- Zero out the state above us, as well as our new current state. - state = bit32.replace(state, 0, current_level*8, 16) - current_syntax, subsyntax_info, current_state, current_level = - retrieve_syntax_state(incoming_syntax, state) + -- On finding unescaped delimiter, pop it. + pop_subsyntax() i = e + 1 end end @@ -180,20 +229,14 @@ function tokenizer.tokenize(incoming_syntax, text, state) -- update state if this was a start|end pattern pair if type(p.pattern) == "table" then - state = bit32.replace(state, n, current_level*8, 8) - -- If we've found a new subsyntax, bump our level, and set the - -- appropriate variables. + -- If we have a subsyntax, push that onto the subsyntax stack. if p.syntax then - current_level = current_level + 1 - subsyntax_info = p - current_syntax = type(p.syntax) == "table" and - p.syntax or syntax.get(p.syntax) - current_state = 0 - else - current_state = n + push_subsyntax(p, n) + else + set_subsyntax_pattern_idx(n) end end - + -- move cursor past this token i = fin + 1 matched = true