tokenizer: remove the limit of 3 subsyntaxes depth (#1186)
* tokenizer: remove the limit of 3 subsyntaxes depth

  Make the state a string of bytes instead of a 32-bit integer to allow
  deeper subsyntax support. This fixes issues with syntax files like the
  one for PHP, which was already hitting more than 3 levels of subsyntax
  depth.

* remove unnecessary call to set_subsyntax_pattern_idx

* fix wrong word in comments
parent 03cc5ffcd1
commit b8a4f729df
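The change described in the commit message, restated as a minimal standalone Lua sketch (the `push`/`pop` helpers below are illustrative only, not the functions from this patch): each byte of the state string records the pattern index that opened one subsyntax level, so the depth is simply `#state` and is no longer capped at 3.

    local state = ""                          -- empty state: no subsyntax open

    local function push(pattern_idx)          -- enter a subsyntax
      state = state .. string.char(pattern_idx)
    end

    local function pop()                      -- leave the innermost subsyntax
      state = state:sub(1, #state - 1)
    end

    push(3)           -- parent syntax entered the subsyntax pattern at index 3
    push(5)           -- that subsyntax entered its own subsyntax pattern at index 5
    push(7); push(9)  -- deeper than the old 3-level limit, now fine
    assert(#state == 4)
    assert(state:byte(2) == 5)
    pop()
    assert(#state == 3)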
@@ -1,6 +1,5 @@
 local core = require "core"
 local syntax = require "core.syntax"
-local common = require "core.common"
 
 local tokenizer = {}
 local bad_patterns = {}
@@ -51,31 +50,37 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
   end
 end
 
--- State is a 32-bit number that is four separate bytes, illustrating how many
--- differnet delimiters we have open, and which subsyntaxes we have active.
--- At most, there are 3 subsyntaxes active at the same time. Beyond that,
--- does not support further highlighting.
---
--- You can think of it as a maximum 4 integer (0-255) stack. It always has
--- 1 integer in it. Calling `push_subsyntax` increases the stack depth. Calling
--- `pop_subsyntax` decreases it. The integers represent the index of a pattern
--- that we're following in the syntax. The top of the stack can be any valid
--- pattern index, any integer lower in the stack must represent a pattern that
--- specifies a subsyntax.
---
--- If you do not have subsyntaxes in your syntax, the three most
--- singificant numbers will always be 0, the stack will only ever be length 1
--- and the state variable will only ever range from 0-255.
+-- State is a string of bytes, where the count of bytes represents the depth
+-- of the subsyntax we are currently in. Each individual byte represents the
+-- index of the pattern for the current subsyntax in relation to its parent
+-- syntax. Using a string of bytes allows us to have as many subsyntaxes as
+-- bytes can be stored on a string while keeping some level of performance in
+-- comparison to a Lua table. The only limitation is that a syntax would not
+-- be able to contain more than 255 patterns.
+--
+-- Lets say a state contains 2 bytes byte #1 with value `3` and byte #2 with
+-- a value of `5`. This would mean that on the parent syntax at index `3` a
+-- pattern subsyntax that matched current text was found, then inside that
+-- subsyntax another subsyntax pattern at index `5` that matched current text
+-- was also found.
+--
+-- Calling `push_subsyntax` appends the current subsyntax pattern index to the
+-- state and increases the stack depth. Calling `pop_subsyntax` clears the
+-- last appended subsyntax and decreases the stack.
 local function retrieve_syntax_state(incoming_syntax, state)
   local current_syntax, subsyntax_info, current_pattern_idx, current_level =
-    incoming_syntax, nil, state, 0
-  if state > 0 and (state > 255 or current_syntax.patterns[state].syntax) then
-    -- If we have higher bits, then decode them one at a time, and find which
+    incoming_syntax, nil, state:byte(1) or 0, 1
+  if
+    current_pattern_idx > 0
+    and
+    current_syntax.patterns[current_pattern_idx]
+  then
+    -- If the state is not empty we iterate over each byte, and find which
     -- syntax we're using. Rather than walking the bytes, and calling into
     -- `syntax` each time, we could probably cache this in a single table.
-    for i = 0, 2 do
-      local target = bit32.extract(state, i*8, 8)
+    for i = 1, #state do
+      local target = state:byte(i)
       if target ~= 0 then
         if current_syntax.patterns[target].syntax then
           subsyntax_info = current_syntax.patterns[target]
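The loop above resolves the active syntax by walking the state one byte at a time. A simplified sketch of that idea, using made-up nested syntax tables rather than lite-xl's real `core.syntax` objects (in the real code, `pattern.syntax` is resolved through the syntax module):

    -- Hypothetical syntax tables, only to show how the bytes are followed.
    local css  = { patterns = {} }
    local html = { patterns = { [5] = { syntax = css } } }
    local php  = { patterns = { [3] = { syntax = html } } }

    local function resolve(parent, state)
      local current = parent
      for i = 1, #state do
        local target = state:byte(i)            -- pattern index entered at depth i
        local pattern = current.patterns[target]
        if not (pattern and pattern.syntax) then break end
        current = pattern.syntax                -- descend into the subsyntax
      end
      return current
    end

    assert(resolve(php, string.char(3, 5)) == css)  -- byte 3 -> html, byte 5 -> css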
@@ -107,7 +112,7 @@ end
 
 ---@param incoming_syntax table
 ---@param text string
----@param state integer
+---@param state string
 function tokenizer.tokenize(incoming_syntax, text, state)
   local res = {}
   local i = 1
@@ -116,9 +121,9 @@ function tokenizer.tokenize(incoming_syntax, text, state)
     return { "normal", text }
   end
 
-  state = state or 0
+  state = state or ""
   -- incoming_syntax : the parent syntax of the file.
-  -- state : a 32-bit number representing syntax state (see above)
+  -- state : a string of bytes representing syntax state (see above)
 
   -- current_syntax : the syntax we're currently in.
   -- subsyntax_info : info about the delimiters of this subsyntax.
@@ -130,7 +135,18 @@ function tokenizer.tokenize(incoming_syntax, text, state)
   -- Should be used to set the state variable. Don't modify it directly.
   local function set_subsyntax_pattern_idx(pattern_idx)
     current_pattern_idx = pattern_idx
-    state = bit32.replace(state, pattern_idx, current_level*8, 8)
+    local state_len = #state
+    if current_level > state_len then
+      state = state .. string.char(pattern_idx)
+    elseif state_len == 1 then
+      state = string.char(pattern_idx)
+    else
+      state = ("%s%s%s"):format(
+        state:sub(1,current_level-1),
+        string.char(pattern_idx),
+        state:sub(current_level+1)
+      )
+    end
   end
 
 
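The replacement body above writes the pattern index into the byte at position `current_level`, growing the string when pushing past its end. Restated as a standalone helper (illustrative only, assuming the same three branches as the patch):

    local function set_byte(state, level, idx)
      if level > #state then
        return state .. string.char(idx)        -- growing: append a new level
      elseif #state == 1 then
        return string.char(idx)                 -- single-byte fast path
      end
      -- general case: splice the byte at `level`, keeping the rest intact
      return state:sub(1, level - 1) .. string.char(idx) .. state:sub(level + 1)
    end

    assert(set_byte("", 1, 3) == string.char(3))
    assert(set_byte(string.char(3), 1, 7) == string.char(7))
    assert(set_byte(string.char(3, 5), 2, 9) == string.char(3, 9))

The state stays an immutable Lua string, which is what the new comment block means by keeping some level of performance compared to a table, at the cost of a 255-pattern limit per syntax.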
@@ -144,8 +160,8 @@ function tokenizer.tokenize(incoming_syntax, text, state)
   end
 
   local function pop_subsyntax()
-    set_subsyntax_pattern_idx(0)
     current_level = current_level - 1
+    state = string.sub(state, 1, current_level)
     set_subsyntax_pattern_idx(0)
     current_syntax, subsyntax_info, current_pattern_idx, current_level =
       retrieve_syntax_state(incoming_syntax, state)
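In `pop_subsyntax`, leaving a subsyntax is now just truncating the state to the new level before resetting the pattern index, which appears to be why one of the two `set_subsyntax_pattern_idx(0)` calls (the "unnecessary call" from the commit message) could be dropped. A tiny illustrative check:

    local state = string.char(3, 5, 7)      -- three nested subsyntaxes
    local current_level = #state
    current_level = current_level - 1
    state = string.sub(state, 1, current_level)
    assert(state == string.char(3, 5))      -- innermost level discarded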