From b8a4f729dfc88af71c860d7cc8b86ef3cb50e72d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jefferson=20Gonz=C3=A1lez?= Date: Thu, 3 Nov 2022 18:56:20 -0400 Subject: [PATCH] tokenizer: remove the limit of 3 subsyntaxes depth (#1186) * tokenizer: remove the limit of 3 subsyntaxes depth Make the state a string of bytes instead of a 32bits integer to be able to have deeper subsyntax support. Fixes issues with syntax files like the one for PHP that was already hitting more than 3 subsyntaxes depth. * remove unnecesary call to set_subsyntax_pattern_idx * fixed wrong word on comments --- data/core/tokenizer.lua | 66 +++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua index 89364f28..dd1f2e2c 100644 --- a/data/core/tokenizer.lua +++ b/data/core/tokenizer.lua @@ -1,6 +1,5 @@ local core = require "core" local syntax = require "core.syntax" -local common = require "core.common" local tokenizer = {} local bad_patterns = {} @@ -51,31 +50,37 @@ local function push_tokens(t, syn, pattern, full_text, find_results) end end +-- State is a string of bytes, where the count of bytes represents the depth +-- of the subsyntax we are currently in. Each individual byte represents the +-- index of the pattern for the current subsyntax in relation to its parent +-- syntax. Using a string of bytes allows us to have as many subsyntaxes as +-- bytes can be stored on a string while keeping some level of performance in +-- comparison to a Lua table. The only limitation is that a syntax would not +-- be able to contain more than 255 patterns. +-- +-- Lets say a state contains 2 bytes byte #1 with value `3` and byte #2 with +-- a value of `5`. This would mean that on the parent syntax at index `3` a +-- pattern subsyntax that matched current text was found, then inside that +-- subsyntax another subsyntax pattern at index `5` that matched current text +-- was also found. --- State is a 32-bit number that is four separate bytes, illustrating how many --- differnet delimiters we have open, and which subsyntaxes we have active. --- At most, there are 3 subsyntaxes active at the same time. Beyond that, --- does not support further highlighting. +-- Calling `push_subsyntax` appends the current subsyntax pattern index to the +-- state and increases the stack depth. Calling `pop_subsyntax` clears the +-- last appended subsyntax and decreases the stack. --- You can think of it as a maximum 4 integer (0-255) stack. It always has --- 1 integer in it. Calling `push_subsyntax` increases the stack depth. Calling --- `pop_subsyntax` decreases it. The integers represent the index of a pattern --- that we're following in the syntax. The top of the stack can be any valid --- pattern index, any integer lower in the stack must represent a pattern that --- specifies a subsyntax. - --- If you do not have subsyntaxes in your syntax, the three most --- singificant numbers will always be 0, the stack will only ever be length 1 --- and the state variable will only ever range from 0-255. local function retrieve_syntax_state(incoming_syntax, state) local current_syntax, subsyntax_info, current_pattern_idx, current_level = - incoming_syntax, nil, state, 0 - if state > 0 and (state > 255 or current_syntax.patterns[state].syntax) then - -- If we have higher bits, then decode them one at a time, and find which + incoming_syntax, nil, state:byte(1) or 0, 1 + if + current_pattern_idx > 0 + and + current_syntax.patterns[current_pattern_idx] + then + -- If the state is not empty we iterate over each byte, and find which -- syntax we're using. Rather than walking the bytes, and calling into -- `syntax` each time, we could probably cache this in a single table. - for i = 0, 2 do - local target = bit32.extract(state, i*8, 8) + for i = 1, #state do + local target = state:byte(i) if target ~= 0 then if current_syntax.patterns[target].syntax then subsyntax_info = current_syntax.patterns[target] @@ -107,7 +112,7 @@ end ---@param incoming_syntax table ---@param text string ----@param state integer +---@param state string function tokenizer.tokenize(incoming_syntax, text, state) local res = {} local i = 1 @@ -116,9 +121,9 @@ function tokenizer.tokenize(incoming_syntax, text, state) return { "normal", text } end - state = state or 0 + state = state or "" -- incoming_syntax : the parent syntax of the file. - -- state : a 32-bit number representing syntax state (see above) + -- state : a string of bytes representing syntax state (see above) -- current_syntax : the syntax we're currently in. -- subsyntax_info : info about the delimiters of this subsyntax. @@ -130,7 +135,18 @@ function tokenizer.tokenize(incoming_syntax, text, state) -- Should be used to set the state variable. Don't modify it directly. local function set_subsyntax_pattern_idx(pattern_idx) current_pattern_idx = pattern_idx - state = bit32.replace(state, pattern_idx, current_level*8, 8) + local state_len = #state + if current_level > state_len then + state = state .. string.char(pattern_idx) + elseif state_len == 1 then + state = string.char(pattern_idx) + else + state = ("%s%s%s"):format( + state:sub(1,current_level-1), + string.char(pattern_idx), + state:sub(current_level+1) + ) + end end @@ -144,8 +160,8 @@ function tokenizer.tokenize(incoming_syntax, text, state) end local function pop_subsyntax() - set_subsyntax_pattern_idx(0) current_level = current_level - 1 + state = string.sub(state, 1, current_level) set_subsyntax_pattern_idx(0) current_syntax, subsyntax_info, current_pattern_idx, current_level = retrieve_syntax_state(incoming_syntax, state)