2022-05-31 02:03:42 +02:00
|
|
|
local core = require "core"
|
2021-05-01 11:45:30 +02:00
|
|
|
local syntax = require "core.syntax"
|
2021-10-11 22:37:31 +02:00
|
|
|
local common = require "core.common"
|
2019-12-28 12:16:32 +01:00
|
|
|
|
2021-05-01 11:45:30 +02:00
|
|
|
-- Public module table.
local tokenizer = {}

-- Remembers which pattern indexes have already been reported as malformed,
-- keyed first by syntax table and then by pattern index, so each broken
-- pattern is only logged once (see the check inside tokenizer.tokenize).
local bad_patterns = {}
|
2019-12-28 12:16:32 +01:00
|
|
|
|
|
|
|
-- Append a (type, text) pair to the flat token list `t`.
-- Consecutive tokens are merged when the previous one has the same type,
-- or when the previous token is nothing but whitespace.
local function push_token(t, tok_type, text)
  tok_type = tok_type or "normal"
  local n = #t
  local last_type, last_text = t[n - 1], t[n]
  if last_type and (last_type == tok_type or last_text:ufind("^%s*$")) then
    -- Fold into the previous entry instead of growing the list.
    t[n - 1] = tok_type
    t[n] = last_text .. text
  else
    t[n + 1] = tok_type
    t[n + 2] = text
  end
end
|
|
|
|
|
|
|
|
|
2021-05-19 22:35:28 +02:00
|
|
|
-- Turn one `find_text` result into token(s) appended to `t`.
-- `find_results` is { start, end, group_1, group_2, ... }. With capture
-- groups present, `pattern.type` is expected to be a table holding one type
-- per consecutive span; otherwise `pattern.type` is used for the whole match.
-- A matched text found in `syn.symbols` overrides the pattern's type.
-- NOTE(review): this mutates `find_results` in place — callers must not
-- reuse the table afterwards.
local function push_tokens(t, syn, pattern, full_text, find_results)
  if #find_results > 2 then
    -- We do some manipulation with find_results so that it's arranged
    -- like this:
    -- { start, end, i_1, i_2, i_3, …, i_last }
    -- Each position spans characters from i_n to ((i_n+1) - 1), to form
    -- consecutive spans of text.
    --
    -- If i_1 is not equal to start, start is automatically inserted at
    -- that index.
    if find_results[3] ~= find_results[1] then
      table.insert(find_results, 3, find_results[1])
    end
    -- Copy the ending index to the end of the table, so that an ending index
    -- always follows a starting index after position 3 in the table.
    table.insert(find_results, find_results[2] + 1)
    -- Then, we just iterate over our modified table.
    for i = 3, #find_results - 1 do
      local start = find_results[i]
      local fin = find_results[i + 1] - 1
      local type = pattern.type[i - 2]
      -- ↑ (i - 2) to convert from [3; n] to [1; n]
      local text = full_text:usub(start, fin)
      push_token(t, syn.symbols[text] or type, text)
    end
  else
    -- No capture groups: one token spanning the whole match.
    local start, fin = find_results[1], find_results[2]
    local text = full_text:usub(start, fin)
    push_token(t, syn.symbols[text] or pattern.type, text)
  end
end
|
|
|
|
|
|
|
|
|
2021-05-18 17:52:18 +02:00
|
|
|
-- State is a 32-bit number that is four separate bytes, illustrating how many
|
|
|
|
-- different delimiters we have open, and which subsyntaxes we have active.
|
|
|
|
-- At most, there are 3 subsyntaxes active at the same time. Beyond that,
-- the tokenizer does not support further highlighting.
|
2021-05-20 21:58:27 +02:00
|
|
|
|
|
|
|
-- You can think of it as a maximum 4 integer (0-255) stack. It always has
|
|
|
|
-- 1 integer in it. Calling `push_subsyntax` increases the stack depth. Calling
|
|
|
|
-- `pop_subsyntax` decreases it. The integers represent the index of a pattern
|
2022-04-26 15:42:02 +02:00
|
|
|
-- that we're following in the syntax. The top of the stack can be any valid
|
2021-05-20 21:58:27 +02:00
|
|
|
-- pattern index, any integer lower in the stack must represent a pattern that
|
|
|
|
-- specifies a subsyntax.
|
|
|
|
|
|
|
|
-- If you do not have subsyntaxes in your syntax, the three most
|
|
|
|
-- significant numbers will always be 0, the stack will only ever be length 1
|
|
|
|
-- and the state variable will only ever range from 0-255.
|
2021-05-01 11:45:30 +02:00
|
|
|
-- Decode the packed 32-bit `state` (documented above) back into the
-- concrete syntax data it refers to: the syntax we are inside, the
-- delimiter pattern that opened it (if any), the active pattern index,
-- and how many subsyntaxes deep we are.
local function retrieve_syntax_state(incoming_syntax, state)
  local active_syntax = incoming_syntax
  local delim_info = nil
  local pattern_idx = state
  local depth = 0
  if state > 0 and (state > 255 or active_syntax.patterns[state].syntax) then
    -- If we have higher bits, then decode them one at a time, and find which
    -- syntax we're using. Rather than walking the bytes, and calling into
    -- `syntax` each time, we could probably cache this in a single table.
    for level = 0, 2 do
      local idx = bit32.extract(state, level * 8, 8)
      if idx == 0 then break end
      local candidate = active_syntax.patterns[idx]
      if not candidate.syntax then
        -- Plain delimiter pattern: this is the top of the stack.
        pattern_idx = idx
        break
      end
      -- Subsyntax pattern: descend into the referenced syntax, which may be
      -- an inline table or the name of a registered syntax.
      delim_info = candidate
      active_syntax = type(candidate.syntax) == "table" and
        candidate.syntax or syntax.get(candidate.syntax)
      pattern_idx = 0
      depth = level + 1
    end
  end
  return active_syntax, delim_info, pattern_idx, depth
end
|
2019-12-28 12:16:32 +01:00
|
|
|
|
2022-04-26 15:42:02 +02:00
|
|
|
---Split `text` into a flat list of (type, text) token pairs.
---Tokenization resumes across lines: pass the `state` returned for one
---line as the `state` argument when tokenizing the next.
---@param incoming_syntax table
---@param text string
---@param state integer
---@return table # flat token list: { type1, text1, type2, text2, ... }
---@return integer # 32-bit syntax state to feed into the next call
function tokenizer.tokenize(incoming_syntax, text, state)
  local res = {}
  local i = 1

  -- A syntax with no patterns cannot classify anything; emit a single token.
  if #incoming_syntax.patterns == 0 then
    return { "normal", text }
  end

  state = state or 0
  -- incoming_syntax : the parent syntax of the file.
  -- state : a 32-bit number representing syntax state (see above)

  -- current_syntax : the syntax we're currently in.
  -- subsyntax_info : info about the delimiters of this subsyntax.
  -- current_pattern_idx: the index of the pattern we're on for this syntax.
  -- current_level : how many subsyntaxes deep we are.
  local current_syntax, subsyntax_info, current_pattern_idx, current_level =
    retrieve_syntax_state(incoming_syntax, state)

  -- Should be used to set the state variable. Don't modify it directly.
  -- Writes `pattern_idx` into the byte of `state` for the current level.
  local function set_subsyntax_pattern_idx(pattern_idx)
    current_pattern_idx = pattern_idx
    state = bit32.replace(state, pattern_idx, current_level*8, 8)
  end

  -- Enter the subsyntax declared by pattern `entering_syntax` (which sits at
  -- `pattern_idx` in the current syntax), growing the state stack by one.
  local function push_subsyntax(entering_syntax, pattern_idx)
    set_subsyntax_pattern_idx(pattern_idx)
    current_level = current_level + 1
    subsyntax_info = entering_syntax
    -- `syntax` field may be an inline table or a registered syntax name.
    current_syntax = type(entering_syntax.syntax) == "table" and
      entering_syntax.syntax or syntax.get(entering_syntax.syntax)
    current_pattern_idx = 0
  end

  -- Leave the current subsyntax: clear this level's byte, step up one level,
  -- clear that byte too, then re-derive the bookkeeping variables from the
  -- remaining state.
  local function pop_subsyntax()
    set_subsyntax_pattern_idx(0)
    current_level = current_level - 1
    set_subsyntax_pattern_idx(0)
    current_syntax, subsyntax_info, current_pattern_idx, current_level =
      retrieve_syntax_state(incoming_syntax, state)
  end

  -- Find the next match of pattern `p` in `text`, searching from character
  -- offset `offset`. `at_start` anchors the match at `offset`; `close`
  -- selects the closing half of an { open, close, [escape] } pattern table.
  -- Returns the same values as string.ufind (start, end, captures...), or
  -- nothing when there is no (unescaped) match.
  local function find_text(text, p, offset, at_start, close)
    local target, res = p.pattern or p.regex, { 1, offset - 1 }
    local p_idx = close and 2 or 1
    local code = type(target) == "table" and target[p_idx] or target

    -- Lazily detect (and strip) a leading '^', caching the result in
    -- p.whole_line so the check runs once per pattern half.
    if p.whole_line == nil then p.whole_line = { } end
    if p.whole_line[p_idx] == nil then
      -- Match patterns that start with '^'
      p.whole_line[p_idx] = code:umatch("^%^") and true or false
      if p.whole_line[p_idx] then
        -- Remove '^' from the beginning of the pattern
        if type(target) == "table" then
          target[p_idx] = code:usub(2)
        else
          p.pattern = p.pattern and code:usub(2)
          p.regex = p.regex and code:usub(2)
        end
      end
    end

    -- Compile regex patterns once and cache the compiled form on `p`.
    if p.regex and type(p.regex) ~= "table" then
      p._regex = p._regex or regex.compile(p.regex)
      code = p._regex
    end

    repeat
      local next = res[2] + 1 -- NOTE: shadows the global `next` in this scope
      -- If the pattern contained '^', allow matching only the whole line
      if p.whole_line[p_idx] and next > 1 then
        return
      end
      res = p.pattern and { text:ufind((at_start or p.whole_line[p_idx]) and "^" .. code or code, next) }
        or { regex.match(code, text, text:ucharpos(next), (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) }
      if p.regex and #res > 0 then -- set correct utf8 len for regex result
        -- regex.match works in byte offsets; convert to character offsets.
        local char_pos_1 = string.ulen(text:sub(1, res[1]))
        local char_pos_2 = char_pos_1 + string.ulen(text:sub(res[1], res[2])) - 1
        -- `regex.match` returns group results as a series of `begin, end`
        -- we only want `begin`s
        if #res >= 3 then
          res[3] = char_pos_1 + string.ulen(text:sub(res[1], res[3])) - 1
        end
        for i=1,(#res-3) do
          local curr = i + 3
          local from = i * 2 + 3
          if from < #res then
            res[curr] = char_pos_1 + string.ulen(text:sub(res[1], res[from])) - 1
          else
            res[curr] = nil
          end
        end
        res[1] = char_pos_1
        res[2] = char_pos_2
      end
      -- target[3], when present, is the escape character for the pair.
      if res[1] and target[3] then
        -- Check to see if the escaped character is there,
        -- and if it is not itself escaped.
        local count = 0
        for i = res[1] - 1, 1, -1 do
          if text:ubyte(i) ~= target[3]:ubyte() then break end
          count = count + 1
        end
        if count % 2 == 0 then
          -- The match is not escaped, so confirm it
          break
        elseif not close then
          -- The *open* match is escaped, so avoid it
          return
        end
      end
    until not res[1] or not close or not target[3]
    return table.unpack(res)
  end

  local text_len = text:ulen()
  while i <= text_len do
    -- continue trying to match the end pattern of a pair if we have a state set
    if current_pattern_idx > 0 then
      local p = current_syntax.patterns[current_pattern_idx]
      local s, e = find_text(text, p, i, false, true)

      local cont = true
      -- If we're in subsyntax mode, always check to see if we end our syntax
      -- first, before the found delimeter, as ending the subsyntax takes
      -- precedence over ending the delimiter in the subsyntax.
      if subsyntax_info then
        local ss, se = find_text(text, subsyntax_info, i, false, true)
        -- If we find that we end the subsyntax before the
        -- delimiter, push the token, and signal we shouldn't
        -- treat the bit after as a token to be normally parsed
        -- (as it's the syntax delimiter).
        if ss and (s == nil or ss < s) then
          push_token(res, p.type, text:usub(i, ss - 1))
          i = ss
          cont = false
        end
      end
      -- If we don't have any concerns about syntax delimiters,
      -- continue on as normal.
      if cont then
        if s then
          push_token(res, p.type, text:usub(i, e))
          set_subsyntax_pattern_idx(0)
          i = e + 1
        else
          -- No closing delimiter on this line: the rest of the line
          -- belongs to the still-open pattern.
          push_token(res, p.type, text:usub(i))
          break
        end
      end
    end
    -- General end of syntax check. Applies in the case where
    -- we're ending early in the middle of a delimiter, or
    -- just normally, upon finding a token.
    if subsyntax_info then
      local s, e = find_text(text, subsyntax_info, i, true, true)
      if s then
        push_token(res, subsyntax_info.type, text:usub(i, e))
        -- On finding unescaped delimiter, pop it.
        pop_subsyntax()
        i = e + 1
      end
    end

    -- find matching pattern
    local matched = false
    for n, p in ipairs(current_syntax.patterns) do
      local find_results = { find_text(text, p, i, true, false) }
      -- A pattern producing more result positions than it declares types
      -- for is malformed; warn once per (syntax, pattern index).
      if #find_results - 1 > #p.type then
        if not bad_patterns[current_syntax] then
          bad_patterns[current_syntax] = { }
        end
        if not bad_patterns[current_syntax][n] then
          bad_patterns[current_syntax][n] = true
          core.error("Malformed pattern #%d in %s language plugin", n, current_syntax.name or "unnamed")
        end
      end
      if find_results[1] then
        -- matched pattern; make and add tokens
        push_tokens(res, current_syntax, p, text, find_results)
        -- update state if this was a start|end pattern pair
        if type(p.pattern or p.regex) == "table" then
          -- If we have a subsyntax, push that onto the subsyntax stack.
          if p.syntax then
            push_subsyntax(p, n)
          else
            set_subsyntax_pattern_idx(n)
          end
        end
        -- move cursor past this token
        i = find_results[2] + 1
        matched = true
        break
      end
    end

    -- consume character if we didn't match
    if not matched then
      push_token(res, "normal", text:usub(i, i))
      i = i + 1
    end
  end

  return res, state
end
|
|
|
|
|
|
|
|
|
|
|
|
-- Stateless iterator step over a flat token list: given the previous
-- position `i`, yields (position, type, text) for the next pair, or
-- nothing once the list is exhausted.
local function iter(t, i)
  local pos = i + 2
  local tok_type = t[pos]
  if not tok_type then return end
  return pos, tok_type, t[pos + 1]
end
|
|
|
|
|
2020-05-07 22:14:46 +02:00
|
|
|
-- Iterate over a flat token list produced by tokenizer.tokenize.
-- Usage: for idx, type, text in tokenizer.each_token(tokens) do ... end
-- The -1 start value makes `iter`'s first step land on index 1.
function tokenizer.each_token(t)
  return iter, t, -1
end
|
|
|
|
|
|
|
|
|
2020-05-07 22:14:46 +02:00
|
|
|
return tokenizer
|