Nested Syntax Highlighting (#160)

2021-05-01 05:45:30 -04:00 · 2021-05-01 05:45:30 -04:00 · 3fe6665b9a
parent a72431ace7
commit 3fe6665b9a
3 changed files with 136 additions and 17 deletions
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@ -1,5 +1,6 @@
-local tokenizer = {}
+local syntax = require "core.syntax"

+local tokenizer = {}

 local function push_token(t, type, text)
  local prev_type = t[#t-1]
@ -37,45 +38,127 @@ local function find_non_escaped(text, pattern, offset, esc)
  end
 end

+-- State is a 32-bit number that is four separate bytes, illustrating how many
+-- different delimiters we have open, and which subsyntaxes we have active.
+-- At most, there are 3 subsyntaxes active at the same time. Beyond that, the
+-- tokenizer does not support further highlighting.
+local function retrieve_syntax_state(incoming_syntax, state)
+  local current_syntax, subsyntax_info, current_state, current_level = 
+    incoming_syntax, nil, state, 0
+  if state > 0 and (state > 255 or current_syntax.patterns[state].syntax) then
+    -- If we have higher bits, then decode them one at a time, and find which 
+    -- syntax we're using. Rather than walking the bytes, and calling into 
+    -- `syntax` each time, we could probably cache this in a single table.
+    for i=0,2 do
+      local target = bit32.extract(state, i*8, 8)
+      if target ~= 0 then
+        if current_syntax.patterns[target].syntax then
+          subsyntax_info = current_syntax.patterns[target]
+          current_syntax = type(subsyntax_info.syntax) == "table" and 
+            subsyntax_info.syntax or syntax.get(subsyntax_info.syntax)
+          current_state = 0
+          current_level = i+1
+        else
+          current_state = target
+          break
+        end
+      else      
+        break
+      end
+    end
+  end
+  return current_syntax, subsyntax_info, current_state, current_level
+end

-function tokenizer.tokenize(syntax, text, state)
+function tokenizer.tokenize(incoming_syntax, text, state)
  local res = {}
  local i = 1

-  if #syntax.patterns == 0 then
+  if #incoming_syntax.patterns == 0 then
    return { "normal", text }
  end
-
+ 
+  state = state or 0
+  local current_syntax, subsyntax_info, current_state, current_level = 
+    retrieve_syntax_state(incoming_syntax, state)
  while i <= #text do
    -- continue trying to match the end pattern of a pair if we have a state set
-    if state then
-      local p = syntax.patterns[state]
+    if current_state > 0 then
+      local p = current_syntax.patterns[current_state]
      local s, e = find_non_escaped(text, p.pattern[2], i, p.pattern[3])
-
+      
+      local cont = true
+      -- If we're in subsyntax mode, always check to see if we end our syntax
+      -- first.
+      if subsyntax_info then
+        local ss, se = find_non_escaped(
+	  text, 
+	  subsyntax_info.pattern[2], 
+	  i, 
+	  subsyntax_info.pattern[3]
+	)
+        if ss and (s == nil or ss < s) then
+          push_token(res, p.type, text:sub(i, ss - 1))          
+          i = ss
+          cont = false
+        end
+      end
+      if cont then
+        if s then
+          push_token(res, p.type, text:sub(i, e))
+          current_state = 0
+          state = bit32.replace(state, 0, current_level*8, 8)
+          i = e + 1
+        else
+          push_token(res, p.type, text:sub(i))
+          break
+        end
+      end
+    end
+    -- Check for end of syntax.
+    if subsyntax_info then
+      local s, e = find_non_escaped(
+        text, 
+	"^" .. subsyntax_info.pattern[2], 
+	i, 
+	nil
+      )
      if s then
-        push_token(res, p.type, text:sub(i, e))
-        state = nil
+        push_token(res, subsyntax_info.type, text:sub(i, e))
+        current_level = current_level - 1
+        -- Zero out the state above us, as well as our new current state.
+        state = bit32.replace(state, 0, current_level*8, 16)
+        current_syntax, subsyntax_info, current_state, current_level = 
+          retrieve_syntax_state(incoming_syntax, state)
        i = e + 1
-      else
-        push_token(res, p.type, text:sub(i))
-        break
      end
    end

    -- find matching pattern
    local matched = false
-    for n, p in ipairs(syntax.patterns) do
+    for n, p in ipairs(current_syntax.patterns) do
      local pattern = (type(p.pattern) == "table") and p.pattern[1] or p.pattern
      local s, e = text:find("^" .. pattern, i)

      if s then
        -- matched pattern; make and add token
        local t = text:sub(s, e)
-        push_token(res, syntax.symbols[t] or p.type, t)
-
+        
+        push_token(res, current_syntax.symbols[t] or p.type, t)
        -- update state if this was a start|end pattern pair
        if type(p.pattern) == "table" then
-          state = n
+          state = bit32.replace(state, n, current_level*8, 8)
+          -- If we've found a new subsyntax, bump our level, and set the 
+	  -- appropriate variables.
+          if p.syntax then
+            current_level = current_level + 1
+            subsyntax_info = p
+            current_syntax = type(p.syntax) == "table" and 
+	      p.syntax or syntax.get(p.syntax)
+            current_state = 0
+          else        
+            current_state = n
+          end
        end

        -- move cursor past this token
--- a/data/plugins/language_html.lua
+++ b/data/plugins/language_html.lua
@ -0,0 +1,36 @@
+-- lite-xl 1.16
+local syntax = require "core.syntax"
+
+syntax.add {
+  files = { "%.html?$" },
+  patterns = {
+    { 
+      pattern = { "<script type=['\"]%a+/javascript['\"]>", "</script>" },
+      syntax = ".js", 
+      type = "function" 
+    },
+    { 
+      pattern = { "<script>", "</script>" },
+      syntax = ".js",
+      type = "function"
+    },
+    { 
+      pattern = { "<style[^>]*>", "</style>" },
+      syntax = ".css",
+      type = "function"
+    },
+    { pattern = { "<!%-%-", "%-%->" },     type = "comment"  },
+    { pattern = { '%f[^>][^<]', '%f[<]' }, type = "normal"   },
+    { pattern = { '"', '"', '\\' },        type = "string"   },
+    { pattern = { "'", "'", '\\' },        type = "string"   },
+    { pattern = "0x[%da-fA-F]+",           type = "number"   },
+    { pattern = "-?%d+[%d%.]*f?",          type = "number"   },
+    { pattern = "-?%.?%d+f?",              type = "number"   },
+    { pattern = "%f[^<]![%a_][%w_]*",      type = "keyword2" },
+    { pattern = "%f[^<][%a_][%w_]*",       type = "function" },
+    { pattern = "%f[^<]/[%a_][%w_]*",      type = "function" },
+    { pattern = "[%a_][%w_]*",             type = "keyword"  },
+    { pattern = "[/<>=]",                  type = "operator" },
+  },
+  symbols = {},
+}
--- a/data/plugins/language_xml.lua
+++ b/data/plugins/language_xml.lua
@ -2,7 +2,7 @@
 local syntax = require "core.syntax"

 syntax.add {
-  files = { "%.xml$", "%.html?$" },
+  files = { "%.xml$" },
  headers = "<%?xml",
  patterns = {
    { pattern = { "<!%-%-", "%-%->" },     type = "comment"  },