Nested Syntax Highlighting (#160)

2021-05-01 05:45:30 -04:00 · 2021-05-01 05:45:30 -04:00 · 3fe6665b9a
parent a72431ace7
commit 3fe6665b9a
3 changed files with 136 additions and 17 deletions
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@ -1,5 +1,6 @@
-local tokenizer = {}
+local syntax = require "core.syntax"
 local tokenizer = {}
 local function push_token(t, type, text)
  local prev_type = t[#t-1]
@ -37,45 +38,127 @@ local function find_non_escaped(text, pattern, offset, esc)
  end
 end
 -- State is a 32-bit number made up of four separate bytes. Each byte holds a
 -- pattern index, recording which delimiter pair we have open at that nesting
 -- level, and which subsyntaxes we have active.
 -- At most, there are 3 subsyntaxes active at the same time. Beyond that, the
 -- tokenizer does not support further highlighting.
 -- Decodes a packed tokenizer state into the syntax that is currently active.
 --   incoming_syntax: the top-level syntax table (must have a `patterns` list).
 --   state: packed state number; the low three bytes are read, one per level.
 -- Returns four values:
 --   1. the syntax table that is active at the innermost decoded level,
 --   2. the pattern entry that opened the innermost subsyntax (nil if none),
 --   3. the index of the open non-subsyntax pattern in that syntax (0 if none),
 --   4. the number of subsyntax levels that were entered.
 local function retrieve_syntax_state(incoming_syntax, state)
  local active_syntax, opener = incoming_syntax, nil
  local open_pattern, depth = state, 0

  -- Decoding is only needed when more than one byte is in use, or when the
  -- single byte refers to a pattern that opens a subsyntax.
  local needs_decoding = state > 0
    and (state > 255 or active_syntax.patterns[state].syntax)
  if not needs_decoding then
    return active_syntax, opener, open_pattern, depth
  end

  -- Walk the bytes from the lowest upwards. Rather than walking the bytes and
  -- calling into `syntax` each time, this could probably be cached in a table.
  for level = 0, 2 do
    local index = bit32.extract(state, level * 8, 8)
    if index == 0 then
      -- Nothing recorded at this level; stop descending.
      break
    end
    local entry = active_syntax.patterns[index]
    if not entry.syntax then
      -- An ordinary delimiter pair is open in the active syntax.
      open_pattern = index
      break
    end
    -- This entry opens a subsyntax: step into it. The `syntax` field is
    -- either the syntax table itself or a key handed to `syntax.get`.
    opener = entry
    if type(entry.syntax) == "table" then
      active_syntax = entry.syntax
    else
      active_syntax = syntax.get(entry.syntax)
    end
    open_pattern = 0
    depth = level + 1
  end

  return active_syntax, opener, open_pattern, depth
 end
-function tokenizer.tokenize(syntax, text, state)
+function tokenizer.tokenize(incoming_syntax, text, state)
  local res = {}
  local i = 1
-  if #syntax.patterns == 0 then
+  if #incoming_syntax.patterns == 0 then
    return { "normal", text }
  end
  state = state or 0
  local current_syntax, subsyntax_info, current_state, current_level = 
    retrieve_syntax_state(incoming_syntax, state)
  while i <= #text do
    -- continue trying to match the end pattern of a pair if we have a state set
-    if state then
+    if current_state > 0 then
-      local p = syntax.patterns[state]
+      local p = current_syntax.patterns[current_state]
      local s, e = find_non_escaped(text, p.pattern[2], i, p.pattern[3])
      local cont = true
      -- If we're in subsyntax mode, always check to see if we end our syntax
      -- first.
      if subsyntax_info then
        local ss, se = find_non_escaped(
 	  text, 
 	  subsyntax_info.pattern[2], 
 	  i, 
 	  subsyntax_info.pattern[3]
 	)
        if ss and (s == nil or ss < s) then
          push_token(res, p.type, text:sub(i, ss - 1))          
          i = ss
          cont = false
        end
      end
      if cont then
        if s then
          push_token(res, p.type, text:sub(i, e))
          current_state = 0
          state = bit32.replace(state, 0, current_level*8, 8)
          i = e + 1
        else
          push_token(res, p.type, text:sub(i))
          break
        end
      end
    end
    -- Check for end of syntax.
    if subsyntax_info then
      local s, e = find_non_escaped(
        text, 
 	"^" .. subsyntax_info.pattern[2], 
 	i, 
 	nil
      )
      if s then
-        push_token(res, p.type, text:sub(i, e))
+        push_token(res, subsyntax_info.type, text:sub(i, e))
-        state = nil
+        current_level = current_level - 1
        -- Zero out the state above us, as well as our new current state.
        state = bit32.replace(state, 0, current_level*8, 16)
        current_syntax, subsyntax_info, current_state, current_level = 
          retrieve_syntax_state(incoming_syntax, state)
        i = e + 1
      else
        push_token(res, p.type, text:sub(i))
        break
      end
    end
    -- find matching pattern
    local matched = false
-    for n, p in ipairs(syntax.patterns) do
+    for n, p in ipairs(current_syntax.patterns) do
      local pattern = (type(p.pattern) == "table") and p.pattern[1] or p.pattern
      local s, e = text:find("^" .. pattern, i)
      if s then
        -- matched pattern; make and add token
        local t = text:sub(s, e)
        push_token(res, syntax.symbols[t] or p.type, t)
        push_token(res, current_syntax.symbols[t] or p.type, t)
        -- update state if this was a start|end pattern pair
        if type(p.pattern) == "table" then
-          state = n
+          state = bit32.replace(state, n, current_level*8, 8)
          -- If we've found a new subsyntax, bump our level, and set the 
 	  -- appropriate variables.
          if p.syntax then
            current_level = current_level + 1
            subsyntax_info = p
            current_syntax = type(p.syntax) == "table" and 
 	      p.syntax or syntax.get(p.syntax)
            current_state = 0
          else        
            current_state = n
          end
        end
        -- move cursor past this token
--- a/data/plugins/language_html.lua
+++ b/data/plugins/language_html.lua
@ -0,0 +1,36 @@
 -- lite-xl 1.16
 local syntax = require "core.syntax"
 -- Registers the HTML syntax definition. Entries that carry a `syntax` field
 -- are start/end delimiter pairs whose contents are highlighted with another
 -- syntax; the tokenizer passes a string `syntax` value to `syntax.get`.
 syntax.add {
  -- Lua pattern for names ending in ".htm" or ".html"
  -- (presumably matched against the file name -- confirm in core.syntax).
  files = { "%.html?$" },
  -- The tokenizer walks this list in order, anchoring each pattern at the
  -- current position, so earlier entries take priority over later ones.
  patterns = {
    -- <script> with an explicit "<letters>/javascript" type attribute:
    -- contents use the syntax registered for ".js"; `type` is the token type
    -- given to the opening and closing tags themselves.
    { 
      pattern = { "<script type=['\"]%a+/javascript['\"]>", "</script>" },
      syntax = ".js", 
      type = "function" 
    },
    -- Bare <script> with no attributes: same handling as above.
    { 
      pattern = { "<script>", "</script>" },
      syntax = ".js",
      type = "function"
    },
    -- <style ...> with any attributes: contents use the ".css" syntax.
    { 
      pattern = { "<style[^>]*>", "</style>" },
      syntax = ".css",
      type = "function"
    },
    -- HTML comments: <!-- ... -->
    { pattern = { "<!%-%-", "%-%->" },     type = "comment"  },
    -- Text between tags: starts at a non-"<" character directly after a ">"
    -- and runs until just before the next "<".
    { pattern = { '%f[^>][^<]', '%f[<]' }, type = "normal"   },
    -- Double- and single-quoted strings, with backslash as the escape char.
    { pattern = { '"', '"', '\\' },        type = "string"   },
    { pattern = { "'", "'", '\\' },        type = "string"   },
    -- Numbers: hexadecimal, then decimal forms with an optional "f" suffix.
    { pattern = "0x[%da-fA-F]+",           type = "number"   },
    { pattern = "-?%d+[%d%.]*f?",          type = "number"   },
    { pattern = "-?%.?%d+f?",              type = "number"   },
    -- "!name" directly after "<" (e.g. <!DOCTYPE).
    { pattern = "%f[^<]![%a_][%w_]*",      type = "keyword2" },
    -- Identifier directly after "<": an opening tag name.
    { pattern = "%f[^<][%a_][%w_]*",       type = "function" },
    -- "/name" directly after "<": a closing tag name.
    { pattern = "%f[^<]/[%a_][%w_]*",      type = "function" },
    -- Any other identifier (e.g. attribute names).
    { pattern = "[%a_][%w_]*",             type = "keyword"  },
    -- Tag punctuation and attribute assignment.
    { pattern = "[/<>=]",                  type = "operator" },
  },
  -- The tokenizer looks matched text up here to override a token's type;
  -- no overrides are defined for HTML.
  symbols = {},
 }
--- a/data/plugins/language_xml.lua
+++ b/data/plugins/language_xml.lua
@ -2,7 +2,7 @@
 local syntax = require "core.syntax"
 syntax.add {
-  files = { "%.xml$", "%.html?$" },
+  files = { "%.xml$" },
  headers = "<%?xml",
  patterns = {
    { pattern = { "<!%-%-", "%-%->" },     type = "comment"  },