diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua index 98aafc71..f3070995 100644 --- a/data/core/tokenizer.lua +++ b/data/core/tokenizer.lua @@ -1,5 +1,6 @@ -local tokenizer = {} +local syntax = require "core.syntax" +local tokenizer = {} local function push_token(t, type, text) local prev_type = t[#t-1] @@ -37,45 +38,127 @@ local function find_non_escaped(text, pattern, offset, esc) end end +-- State is a 32-bit number made up of four separate bytes; each byte is a +-- pattern index recording which delimiter or subsyntax is open at that level. +-- At most, there are 3 subsyntaxes active at the same time. Beyond that, +-- further highlighting is not supported. +local function retrieve_syntax_state(incoming_syntax, state) + local current_syntax, subsyntax_info, current_state, current_level = + incoming_syntax, nil, state, 0 + if state > 0 and (state > 255 or current_syntax.patterns[state].syntax) then + -- If we have higher bits, then decode them one at a time, and find which + -- syntax we're using. Rather than walking the bytes, and calling into + -- `syntax` each time, we could probably cache this in a single table. 
+ for i=0,2 do + local target = bit32.extract(state, i*8, 8) + if target ~= 0 then + if current_syntax.patterns[target].syntax then + subsyntax_info = current_syntax.patterns[target] + current_syntax = type(subsyntax_info.syntax) == "table" and + subsyntax_info.syntax or syntax.get(subsyntax_info.syntax) + current_state = 0 + current_level = i+1 + else + current_state = target + break + end + else + break + end + end + end + return current_syntax, subsyntax_info, current_state, current_level +end -function tokenizer.tokenize(syntax, text, state) +function tokenizer.tokenize(incoming_syntax, text, state) local res = {} local i = 1 - if #syntax.patterns == 0 then + if #incoming_syntax.patterns == 0 then return { "normal", text } end - + + state = state or 0 + local current_syntax, subsyntax_info, current_state, current_level = + retrieve_syntax_state(incoming_syntax, state) while i <= #text do -- continue trying to match the end pattern of a pair if we have a state set - if state then - local p = syntax.patterns[state] + if current_state > 0 then + local p = current_syntax.patterns[current_state] local s, e = find_non_escaped(text, p.pattern[2], i, p.pattern[3]) - + + local cont = true + -- If we're in subsyntax mode, always check to see if we end our syntax + -- first. + if subsyntax_info then + local ss, se = find_non_escaped( + text, + subsyntax_info.pattern[2], + i, + subsyntax_info.pattern[3] + ) + if ss and (s == nil or ss < s) then + push_token(res, p.type, text:sub(i, ss - 1)) + i = ss + cont = false + end + end + if cont then + if s then + push_token(res, p.type, text:sub(i, e)) + current_state = 0 + state = bit32.replace(state, 0, current_level*8, 8) + i = e + 1 + else + push_token(res, p.type, text:sub(i)) + break + end + end + end + -- Check for end of syntax. + if subsyntax_info then + local s, e = find_non_escaped( + text, + "^" .. 
subsyntax_info.pattern[2], + i, + nil + ) if s then - push_token(res, p.type, text:sub(i, e)) - state = nil + push_token(res, subsyntax_info.type, text:sub(i, e)) + current_level = current_level - 1 + -- Zero out the state above us, as well as our new current state. + state = bit32.replace(state, 0, current_level*8, 16) + current_syntax, subsyntax_info, current_state, current_level = + retrieve_syntax_state(incoming_syntax, state) i = e + 1 - else - push_token(res, p.type, text:sub(i)) - break end end -- find matching pattern local matched = false - for n, p in ipairs(syntax.patterns) do + for n, p in ipairs(current_syntax.patterns) do local pattern = (type(p.pattern) == "table") and p.pattern[1] or p.pattern local s, e = text:find("^" .. pattern, i) if s then -- matched pattern; make and add token local t = text:sub(s, e) - push_token(res, syntax.symbols[t] or p.type, t) - + + push_token(res, current_syntax.symbols[t] or p.type, t) -- update state if this was a start|end pattern pair if type(p.pattern) == "table" then - state = n + state = bit32.replace(state, n, current_level*8, 8) + -- If we've found a new subsyntax, bump our level, and set the + -- appropriate variables. 
+ if p.syntax then + current_level = current_level + 1 + subsyntax_info = p + current_syntax = type(p.syntax) == "table" and + p.syntax or syntax.get(p.syntax) + current_state = 0 + else + current_state = n + end end -- move cursor past this token diff --git a/data/plugins/language_html.lua b/data/plugins/language_html.lua new file mode 100644 index 00000000..d5ef59d8 --- /dev/null +++ b/data/plugins/language_html.lua @@ -0,0 +1,36 @@ +-- lite-xl 1.16 +local syntax = require "core.syntax" + +syntax.add { + files = { "%.html?$" }, + patterns = { + { + pattern = { "" }, + syntax = ".js", + type = "function" + }, + { + pattern = { "" }, + syntax = ".js", + type = "function" + }, + { + pattern = { "]*>", "" }, + syntax = ".css", + type = "function" + }, + { pattern = { "" }, type = "comment" }, + { pattern = { '%f[^>][^<]', '%f[<]' }, type = "normal" }, + { pattern = { '"', '"', '\\' }, type = "string" }, + { pattern = { "'", "'", '\\' }, type = "string" }, + { pattern = "0x[%da-fA-F]+", type = "number" }, + { pattern = "-?%d+[%d%.]*f?", type = "number" }, + { pattern = "-?%.?%d+f?", type = "number" }, + { pattern = "%f[^<]![%a_][%w_]*", type = "keyword2" }, + { pattern = "%f[^<][%a_][%w_]*", type = "function" }, + { pattern = "%f[^<]/[%a_][%w_]*", type = "function" }, + { pattern = "[%a_][%w_]*", type = "keyword" }, + { pattern = "[/<>=]", type = "operator" }, + }, + symbols = {}, +} diff --git a/data/plugins/language_xml.lua b/data/plugins/language_xml.lua index 8687ab6a..b87fcc56 100644 --- a/data/plugins/language_xml.lua +++ b/data/plugins/language_xml.lua @@ -2,7 +2,7 @@ local syntax = require "core.syntax" syntax.add { - files = { "%.xml$", "%.html?$" }, + files = { "%.xml$" }, headers = "<%?xml", patterns = { { pattern = { "" }, type = "comment" },