tokenizer: remove the 3-subsyntax depth limit (#1186)

* tokenizer: remove the 3-subsyntax depth limit

Make the state a string of bytes instead of a 32-bit integer so that
deeper subsyntax nesting is supported. Fixes issues with syntax files
like the one for PHP, which was already exceeding a depth of 3 subsyntaxes.
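
As an illustration (a minimal sketch, not code from this patch; the helper
names are made up), a byte-string state can grow to any depth, while the
previous 32-bit integer only had room for a few byte-sized slots:

local function push_level(state, pattern_idx)
  -- one byte per nesting level; the depth is simply #state
  return state .. string.char(pattern_idx)
end

local function pop_level(state)
  -- drop the deepest level
  return state:sub(1, #state - 1)
end

local state = ""
state = push_level(state, 3) -- entered a subsyntax via pattern #3
state = push_level(state, 5) -- then pattern #5 inside it
state = push_level(state, 7) -- a 4th, 5th, ... level works just as well
state = pop_level(state)     -- back up one level
for i = 1, #state do print(i, state:byte(i)) end -- prints: 1 3, 2 5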

* remove unnecessary call to set_subsyntax_pattern_idx

* fixed wrong word in comments
Jefferson González 2022-11-03 18:56:20 -04:00 committed by GitHub
parent 03cc5ffcd1
commit b8a4f729df
1 changed file with 41 additions and 25 deletions


@@ -1,6 +1,5 @@
local core = require "core"
local syntax = require "core.syntax"
local common = require "core.common"
local tokenizer = {}
local bad_patterns = {}
@@ -51,31 +50,37 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
end
end
-- State is a string of bytes, where the count of bytes represents the depth
-- of the subsyntax we are currently in. Each individual byte represents the
-- index of the pattern for the current subsyntax in relation to its parent
-- syntax. Using a string of bytes allows us to have as many subsyntaxes as
-- bytes can be stored in a string, while keeping reasonable performance
-- compared to using a Lua table. The only limitation is that a syntax cannot
-- contain more than 255 patterns.
--
-- Let's say a state contains 2 bytes: byte #1 with a value of `3` and byte #2
-- with a value of `5`. This means that in the parent syntax the pattern at
-- index `3` is a subsyntax pattern that matched the current text, and that
-- inside that subsyntax the pattern at index `5` is another subsyntax pattern
-- that also matched the current text.
-- State is a 32-bit number that is four separate bytes, illustrating how many
-- different delimiters we have open, and which subsyntaxes we have active.
-- At most, there are 3 subsyntaxes active at the same time. Beyond that,
-- further highlighting is not supported.
-- Calling `push_subsyntax` appends the current subsyntax pattern index to the
-- state and increases the stack depth. Calling `pop_subsyntax` clears the
-- last appended subsyntax and decreases the stack.
-- You can think of it as a stack of at most 4 integers (0-255). It always has
-- 1 integer in it. Calling `push_subsyntax` increases the stack depth. Calling
-- `pop_subsyntax` decreases it. The integers represent the index of a pattern
-- that we're following in the syntax. The top of the stack can be any valid
-- pattern index, any integer lower in the stack must represent a pattern that
-- specifies a subsyntax.
-- If you do not have subsyntaxes in your syntax, the three most
-- significant numbers will always be 0, the stack will only ever be length 1
-- and the state variable will only ever range from 0-255.
local function retrieve_syntax_state(incoming_syntax, state)
local current_syntax, subsyntax_info, current_pattern_idx, current_level =
incoming_syntax, nil, state, 0
if state > 0 and (state > 255 or current_syntax.patterns[state].syntax) then
-- If we have higher bits, then decode them one at a time, and find which
incoming_syntax, nil, state:byte(1) or 0, 1
if
current_pattern_idx > 0
and
current_syntax.patterns[current_pattern_idx]
then
-- If the state is not empty, we iterate over each byte and find which
-- syntax we're using. Rather than walking the bytes, and calling into
-- `syntax` each time, we could probably cache this in a single table.
for i = 0, 2 do
local target = bit32.extract(state, i*8, 8)
for i = 1, #state do
local target = state:byte(i)
if target ~= 0 then
if current_syntax.patterns[target].syntax then
subsyntax_info = current_syntax.patterns[target]
@@ -107,7 +112,7 @@ end
---@param incoming_syntax table
---@param text string
---@param state integer
---@param state string
function tokenizer.tokenize(incoming_syntax, text, state)
local res = {}
local i = 1
@@ -116,9 +121,9 @@ function tokenizer.tokenize(incoming_syntax, text, state)
return { "normal", text }
end
state = state or 0
state = state or ""
-- incoming_syntax : the parent syntax of the file.
-- state : a 32-bit number representing syntax state (see above)
-- state : a string of bytes representing syntax state (see above)
-- current_syntax : the syntax we're currently in.
-- subsyntax_info : info about the delimiters of this subsyntax.
@@ -130,7 +135,18 @@ function tokenizer.tokenize(incoming_syntax, text, state)
-- Should be used to set the state variable. Don't modify it directly.
local function set_subsyntax_pattern_idx(pattern_idx)
current_pattern_idx = pattern_idx
state = bit32.replace(state, pattern_idx, current_level*8, 8)
local state_len = #state
if current_level > state_len then
state = state .. string.char(pattern_idx)
elseif state_len == 1 then
state = string.char(pattern_idx)
else
state = ("%s%s%s"):format(
state:sub(1,current_level-1),
string.char(pattern_idx),
state:sub(current_level+1)
)
end
end
@@ -144,8 +160,8 @@ end
end
local function pop_subsyntax()
set_subsyntax_pattern_idx(0)
current_level = current_level - 1
state = string.sub(state, 1, current_level)
set_subsyntax_pattern_idx(0)
current_syntax, subsyntax_info, current_pattern_idx, current_level =
retrieve_syntax_state(incoming_syntax, state)
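
To tie the pieces together, here is a rough standalone sketch (not taken from
the patch; `toy_syntax` and `decode` are made-up stand-ins for the real
core.syntax structures) of how a byte-string state built by
`set_subsyntax_pattern_idx` can be walked the way `retrieve_syntax_state`
does, descending one subsyntax level per byte:

-- Simplified pattern table: pattern #3 opens a subsyntax whose pattern #5
-- opens another subsyntax; the real core.syntax format has more fields.
local toy_syntax = {
  patterns = {
    [3] = { syntax = { patterns = {
      [5] = { syntax = { patterns = {} } }
    } } }
  }
}

local function decode(root, state)
  local current = root
  for i = 1, #state do
    local idx = state:byte(i)
    if idx == 0 then break end
    local p = current.patterns[idx]
    if not (p and p.syntax) then break end
    current = p.syntax -- descend one subsyntax level per byte
  end
  return current
end

-- string.char(3, 5) is the two-byte state from the comment's example above
local deepest = decode(toy_syntax, string.char(3, 5))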