Add utf8 support to tokenizer (#945)

* add utf8 support to tokenizer

* wrap utf8 functions in string table using a 'u' prefix

* document new utf8 functions
Authored by Jefferson González on 2022-04-26 09:42:02 -04:00; committed by GitHub
parent 7dd83bb737
commit e572c58f24
9 changed files with 5431 additions and 26 deletions
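
For orientation, a minimal usage sketch (not part of the commit) of what the 'u'-prefixed wrappers enable once data/core/utf8string.lua has been loaded; the values shown are illustrative and assume the lua-utf8 style API documented below:

```lua
-- Assumes core.utf8string has already been required: it simply copies the
-- utf8.* functions into the string table, so they become available as
-- methods on any string value alongside the byte-based originals.
local s = "día"
print(s:len())    -- 4: byte length of the raw string
print(s:ulen())   -- 3: number of UTF-8 characters
print(s:uupper()) -- "DÍA": uppercasing that also handles non-ASCII characters
```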

data/core/start.lua

@@ -35,6 +35,8 @@ table.unpack = table.unpack or unpack
bit32 = bit32 or require "core.bit"
+require "core.utf8string"
-- Because AppImages change the working directory before running the executable,
-- we need to change it back to the original one.
-- https://github.com/AppImage/AppImageKit/issues/172

data/core/tokenizer.lua

@@ -6,7 +6,7 @@ local tokenizer = {}
local function push_token(t, type, text)
local prev_type = t[#t-1]
local prev_text = t[#t]
-if prev_type and (prev_type == type or prev_text:find("^%s*$")) then
+if prev_type and (prev_type == type or prev_text:ufind("^%s*$")) then
t[#t-1] = type
t[#t] = prev_text .. text
else
@@ -38,12 +38,12 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
local fin = find_results[i + 1] - 1
local type = pattern.type[i - 2]
-- ↑ (i - 2) to convert from [3; n] to [1; n]
-local text = full_text:sub(start, fin)
+local text = full_text:usub(start, fin)
push_token(t, syn.symbols[text] or type, text)
end
else
local start, fin = find_results[1], find_results[2]
-local text = full_text:sub(start, fin)
+local text = full_text:usub(start, fin)
push_token(t, syn.symbols[text] or pattern.type, text)
end
end
@@ -52,12 +52,12 @@ end
-- State is a 32-bit number made up of four separate bytes, recording how many
-- different delimiters we have open, and which subsyntaxes we have active.
-- At most, there are 3 subsyntaxes active at the same time. Beyond that, the
-- tokenizer does not support further highlighting.
-- You can think of it as a stack of at most 4 integers (0-255). It always has
-- 1 integer in it. Calling `push_subsyntax` increases the stack depth. Calling
-- `pop_subsyntax` decreases it. The integers represent the index of a pattern
-- that we're following in the syntax. The top of the stack can be any valid
-- pattern index; any integer lower in the stack must represent a pattern that
-- specifies a subsyntax.
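
The comment above describes how `state` packs one pattern index per nesting level. A small illustrative sketch (not from the commit) of that packing, using the same bit32 calls the tokenizer relies on; the indices and locals here are made up:

```lua
local bit32 = require "core.bit"  -- the pure-Lua bit32 shim used by the editor

local state, current_level = 0, 0

-- Mirrors set_subsyntax_pattern_idx below: one byte of state per nesting level.
local function set_pattern_idx(idx)
  state = bit32.replace(state, idx, current_level * 8, 8)
end

set_pattern_idx(5)                 -- outermost syntax is following pattern 5
current_level = current_level + 1  -- what push_subsyntax does when a subsyntax opens
set_pattern_idx(12)                -- the open subsyntax is following pattern 12

print(bit32.extract(state, 0, 8))  --> 5
print(bit32.extract(state, 8, 8))  --> 12
```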
@@ -92,6 +92,9 @@ local function retrieve_syntax_state(incoming_syntax, state)
return current_syntax, subsyntax_info, current_pattern_idx, current_level
end
+---@param incoming_syntax table
+---@param text string
+---@param state integer
function tokenizer.tokenize(incoming_syntax, text, state)
local res = {}
local i = 1
@@ -102,22 +105,22 @@ function tokenizer.tokenize(incoming_syntax, text, state)
state = state or 0
-- incoming_syntax : the parent syntax of the file.
-- state : a 32-bit number representing syntax state (see above)
-- current_syntax : the syntax we're currently in.
-- subsyntax_info : info about the delimiters of this subsyntax.
-- current_pattern_idx: the index of the pattern we're on for this syntax.
-- current_level : how many subsyntaxes deep we are.
local current_syntax, subsyntax_info, current_pattern_idx, current_level =
retrieve_syntax_state(incoming_syntax, state)
-- Should be used to set the state variable. Don't modify it directly.
local function set_subsyntax_pattern_idx(pattern_idx)
current_pattern_idx = pattern_idx
state = bit32.replace(state, pattern_idx, current_level*8, 8)
end
local function push_subsyntax(entering_syntax, pattern_idx)
set_subsyntax_pattern_idx(pattern_idx)
current_level = current_level + 1
@@ -126,15 +129,15 @@ function tokenizer.tokenize(incoming_syntax, text, state)
entering_syntax.syntax or syntax.get(entering_syntax.syntax)
current_pattern_idx = 0
end
local function pop_subsyntax()
set_subsyntax_pattern_idx(0)
current_level = current_level - 1
set_subsyntax_pattern_idx(0)
current_syntax, subsyntax_info, current_pattern_idx, current_level =
retrieve_syntax_state(incoming_syntax, state)
end
local function find_text(text, p, offset, at_start, close)
local target, res = p.pattern or p.regex, { 1, offset - 1 }
local p_idx = close and 2 or 1
@@ -143,14 +146,14 @@ function tokenizer.tokenize(incoming_syntax, text, state)
if p.whole_line == nil then p.whole_line = { } end
if p.whole_line[p_idx] == nil then
-- Match patterns that start with '^'
-p.whole_line[p_idx] = code:match("^%^") and true or false
+p.whole_line[p_idx] = code:umatch("^%^") and true or false
if p.whole_line[p_idx] then
-- Remove '^' from the beginning of the pattern
if type(target) == "table" then
-target[p_idx] = code:sub(2)
+target[p_idx] = code:usub(2)
else
-p.pattern = p.pattern and code:sub(2)
-p.regex = p.regex and code:sub(2)
+p.pattern = p.pattern and code:usub(2)
+p.regex = p.regex and code:usub(2)
end
end
end
@@ -170,7 +173,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
while text:byte(next) and common.is_utf8_cont(text, next) do
next = next + 1
end
-res = p.pattern and { text:find((at_start or p.whole_line[p_idx]) and "^" .. code or code, next) }
+res = p.pattern and { text:ufind((at_start or p.whole_line[p_idx]) and "^" .. code or code, next) }
or { regex.match(code, text, next, (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) }
if res[1] and close and target[3] then
local count = 0
@@ -185,7 +188,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
until not res[1] or not close or not target[3]
return table.unpack(res)
end
while i <= #text do
-- continue trying to match the end pattern of a pair if we have a state set
if current_pattern_idx > 0 then
@@ -198,12 +201,12 @@ function tokenizer.tokenize(incoming_syntax, text, state)
-- precedence over ending the delimiter in the subsyntax.
if subsyntax_info then
local ss, se = find_text(text, subsyntax_info, i, false, true)
-- If we find that we end the subsyntax before the
-- delimiter, push the token, and signal we shouldn't
-- treat the bit after as a token to be normally parsed
-- (as it's the syntax delimiter).
if ss and (s == nil or ss < s) then
-push_token(res, p.type, text:sub(i, ss - 1))
+push_token(res, p.type, text:usub(i, ss - 1))
i = ss
cont = false
end
@@ -212,11 +215,11 @@ function tokenizer.tokenize(incoming_syntax, text, state)
-- continue on as normal.
if cont then
if s then
-push_token(res, p.type, text:sub(i, e))
+push_token(res, p.type, text:usub(i, e))
set_subsyntax_pattern_idx(0)
i = e + 1
else
-push_token(res, p.type, text:sub(i))
+push_token(res, p.type, text:usub(i))
break
end
end
@@ -227,7 +230,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
if subsyntax_info then
local s, e = find_text(text, subsyntax_info, i, true, true)
if s then
-push_token(res, subsyntax_info.type, text:sub(i, e))
+push_token(res, subsyntax_info.type, text:usub(i, e))
-- On finding unescaped delimiter, pop it.
pop_subsyntax()
i = e + 1
@@ -246,7 +249,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
-- If we have a subsyntax, push that onto the subsyntax stack.
if p.syntax then
push_subsyntax(p, n)
else
set_subsyntax_pattern_idx(n)
end
end
@@ -264,7 +267,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
while text:byte(i + n + 1) and common.is_utf8_cont(text, i + n + 1) do
n = n + 1
end
push_token(res, "normal", text:sub(i, i + n))
push_token(res, "normal", text:usub(i, i + n))
i = i + n + 1
end
end
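
Since push_token at the top of the file appends a token type followed by its text, the result is a flat list of alternating type/text entries that callers walk two slots at a time. A hypothetical usage sketch (not part of the commit), assuming tokenize returns that list plus the final state and that core.syntax can resolve a highlighter for the buffer:

```lua
local syntax = require "core.syntax"
local tokenizer = require "core.tokenizer"

-- Hypothetical file name and contents, used only to pick a syntax definition.
local text = 'local café = "naïve"'
local syn = syntax.get("example.lua", text)

local tokens, state = tokenizer.tokenize(syn, text)

-- Flat list: tokens[i] is a token type ("keyword", "string", ...) and
-- tokens[i + 1] is the matched text.
for i = 1, #tokens, 2 do
  print(tokens[i], tokens[i + 1])
end
```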

data/core/utf8string.lua (new file, 30 lines)

@@ -0,0 +1,30 @@
--------------------------------------------------------------------------------
-- inject utf8 functions to strings
--------------------------------------------------------------------------------
string.ubyte = utf8.byte
string.uchar = utf8.char
string.ufind = utf8.find
string.ugmatch = utf8.gmatch
string.ugsub = utf8.gsub
string.ulen = utf8.len
string.ulower = utf8.lower
string.umatch = utf8.match
string.ureverse = utf8.reverse
string.usub = utf8.sub
string.uupper = utf8.upper
string.uescape = utf8.escape
string.ucharpos = utf8.charpos
string.unext = utf8.next
string.uinsert = utf8.insert
string.uremove = utf8.remove
string.uwidth = utf8.width
string.uwidthindex = utf8.widthindex
string.utitle = utf8.title
string.ufold = utf8.fold
string.uncasecmp = utf8.ncasecmp
string.uoffset = utf8.offset
string.ucodepoint = utf8.codepoint
string.ucodes = utf8.codes

docs/api/string.lua (new file, 165 lines)

@@ -0,0 +1,165 @@
---@meta
---UTF-8 equivalent of string.byte
---@param s string
---@param i? integer
---@param j? integer
---@return integer
---@return ...
function string.ubyte(s, i, j) end
---UTF-8 equivalent of string.char
---@param byte integer
---@param ... integer
---@return string
---@return ...
function string.uchar(byte, ...) end
---UTF-8 equivalent of string.find
---@param s string
---@param pattern string
---@param init? integer
---@param plain? boolean
---@return integer start
---@return integer end
---@return ... captured
function string.ufind(s, pattern, init, plain) end
---UTF-8 equivalent of string.gmatch
---@param s string
---@param pattern string
---@param init? integer
---@return fun():string, ...
function string.ugmatch(s, pattern, init) end
---UTF-8 equivalent of string.gsub
---@param s string
---@param pattern string
---@param repl string|table|function
---@param n integer
---@return string
---@return integer count
function string.ugsub(s, pattern, repl, n) end
---UTF-8 equivalent of string.len
---@param s string
---@return integer
function string.ulen(s) end
---UTF-8 equivalent of string.lower
---@param s string
---@return string
function string.ulower(s) end
---UTF-8 equivalent of string.match
---@param s string
---@param pattern string
---@param init? integer
---@return string | number captured
function string.umatch(s, pattern, init) end
---UTF-8 equivalent of string.reverse
---@param s string
---@return string
function string.ureverse(s) end
---UTF-8 equivalent of string.sub
---@param s string
---@param i integer
---@param j? integer
---@return string
function string.usub(s, i, j) end
---UTF-8 equivalent of string.upper
---@param s string
---@return string
function string.uupper(s) end
---Equivalent to utf8.escape()
---@param s string
---@return string utf8_string
function string.uescape(s) end
---Equivalent to utf8.charpos()
---@param s string
---@param charpos? integer
---@param index? integer
---@return integer charpos
---@return integer codepoint
function string.ucharpos(s, charpos, index) end
---Equivalent to utf8.next()
---@param s string
---@param charpos? integer
---@param index? integer
---@return integer charpos
---@return integer codepoint
function string.unext(s, charpos, index) end
---Equivalent to utf8.insert()
---@param s string
---@param idx? integer
---@param substring string
---@return string new_string
function string.uinsert(s, idx, substring) end
---Equivalent to utf8.remove()
---@param s string
---@param start? integer
---@param stop? integer
---@return string new_string
function string.uremove(s, start, stop) end
---Equivalent to utf8.width()
---@param s string
---@param ambi_is_double? boolean
---@param default_width? integer
---@return integer width
function string.uwidth(s, ambi_is_double, default_width) end
---Equivalent to utf8.widthindex()
---@param s string
---@param location integer
---@param ambi_is_double? boolean
---@param default_width? integer
---@return integer idx
---@return integer offset
---@return integer width
function string.uwidthindex(s, location, ambi_is_double, default_width) end
---Equivalent to utf8.title()
---@param s string
---@return string new_string
function string.utitle(s) end
---Equivalent to utf8.fold()
---@param s string
---@return string new_string
function string.ufold(s) end
---Equivalent to utf8.ncasecmp()
---@param a string
---@param b string
---@return integer result
function string.uncasecmp(a, b) end
---Equivalent to utf8.offset()
---@param s string
---@param n integer
---@param i? integer
---@return integer position_in_bytes
function string.uoffset(s, n, i) end
---Equivalent to utf8.codepoint()
---@param s string
---@param i? integer
---@param j? integer
---@return integer code
---@return ...
function string.ucodepoint(s, i, j) end
---Equivalent to utf8.codes()
---@param s string
---@return fun():integer, integer
function string.ucodes(s) end

docs/api/utf8.lua (new file, 187 lines)

@@ -0,0 +1,187 @@
---@meta
---UTF-8 equivalent of string.byte
---@param s string
---@param i? integer
---@param j? integer
---@return integer
---@return ...
function utf8.byte(s, i, j) end
---UTF-8 equivalent of string.char
---@param byte integer
---@param ... integer
---@return string
---@return ...
function utf8.char(byte, ...) end
---UTF-8 equivalent of string.find
---@param s string
---@param pattern string
---@param init? integer
---@param plain? boolean
---@return integer start
---@return integer end
---@return ... captured
function utf8.find(s, pattern, init, plain) end
---UTF-8 equivalent of string.gmatch
---@param s string
---@param pattern string
---@param init? integer
---@return fun():string, ...
function utf8.gmatch(s, pattern, init) end
---UTF-8 equivalent of string.gsub
---@param s string
---@param pattern string
---@param repl string|table|function
---@param n integer
---@return string
---@return integer count
function utf8.gsub(s, pattern, repl, n) end
---UTF-8 equivalent of string.len
---@param s string
---@return integer
function utf8.len(s) end
---UTF-8 equivalent of string.lower
---@param s string
---@return string
function utf8.lower(s) end
---UTF-8 equivalent of string.match
---@param s string
---@param pattern string
---@param init? integer
---@return string | number captured
function utf8.match(s, pattern, init) end
---UTF-8 equivalent of string.reverse
---@param s string
---@return string
function utf8.reverse(s) end
---UTF-8 equivalent of string.sub
---@param s string
---@param i integer
---@param j? integer
---@return string
function utf8.sub(s, i, j) end
---UTF-8 equivalent of string.upper
---@param s string
---@return string
function utf8.upper(s) end
---Escape a string to a UTF-8 format string. It supports several escape formats:
---* %ddd - where ddd is a decimal number of any length: converts the Unicode code point into UTF-8.
---* %{ddd} - same as %ddd but with brackets around the number.
---* %uddd - same as %ddd, where u stands for Unicode.
---* %u{ddd} - same as %{ddd}.
---* %xhhh - hexadecimal version of %ddd.
---* %x{hhh} - same as %xhhh.
---* %? - where '?' stands for any other character: escapes that character.
---Example:
---```lua
---local u = utf8.escape
---print(u"%123%u123%{123}%u{123}%xABC%x{ABC}")
---print(u"%%123%?%d%%u")
---```
---@param s string
---@return string utf8_string
function utf8.escape(s) end
---Convert a UTF-8 character position to a byte offset. If only index is given,
---return the byte offset of that UTF-8 character index. If both charpos and
---index are given, a new charpos is calculated by adding/subtracting the UTF-8
---character index to/from the current charpos. In all cases it returns the new
---character position and the code point (a number) at that position.
---@param s string
---@param charpos? integer
---@param index? integer
---@return integer charpos
---@return integer codepoint
function utf8.charpos(s, charpos, index) end
---Iterate through the UTF-8 string s. If only s is given, it can be used as an iterator:
---```lua
--- for pos, code in utf8.next, "utf8-string" do
--- -- ...
--- end
---```
---If only charpos is given, return the next byte offset in the string. If both
---charpos and index are given, a new charpos is calculated by adding/subtracting
---the UTF-8 character offset to/from the current charpos. In all cases it returns
---the new character position (in bytes) and the code point (a number) at that position.
---@param s string
---@param charpos? integer
---@param index? integer
---@return integer charpos
---@return integer codepoint
function utf8.next(s, charpos, index) end
---Insert a substring into s. If idx is given, insert the substring before the
---character at that index, otherwise the substring is appended to s. idx can be negative.
---@param s string
---@param idx? integer
---@param substring string
---@return string new_string
function utf8.insert(s, idx, substring) end
---Delete a substring in s. If neither start nor stop is given, delete the last
---UTF-8 character in s; if only start is given, delete characters from start to
---the end of s. If stop is also given, delete characters from start to stop
---(including both start and stop). start and stop can be negative.
---@param s string
---@param start? integer
---@param stop? integer
---@return string new_string
function utf8.remove(s, start, stop) end
---Calculate the width of UTF-8 string s. If ambi_is_double is given, ambiguous-width
---characters count as width 2, otherwise as width 1. Fullwidth/doublewidth
---characters count as width 2 and all other characters as width 1. If default_width
---is given, it is used as the width of unprintable characters, which are usually
---displayed with a non-character mark. If s is a code point, return the width of
---that code point.
---@param s string
---@param ambi_is_double? boolean
---@param default_width? integer
---@return integer width
function utf8.width(s, ambi_is_double, default_width) end
---Return the character index at the given location in string s. This is the reverse
---operation of utf8.width(). It returns the index of the character at that location
---and its offset in the UTF-8 encoding; e.g. if the cursor is on the second column
---(the middle) of a wide character, the offset will be 2. The width of the character
---at idx is also returned.
---@param s string
---@param location integer
---@param ambi_is_double? boolean
---@param default_width? integer
---@return integer idx
---@return integer offset
---@return integer width
function utf8.widthindex(s, location, ambi_is_double, default_width) end
---Convert UTF-8 string s to title case, used for case-insensitive comparison. If s
---is a number, it is treated as a code point and the converted code point
---(a number) is returned. utf8.lower/utf8.upper have the same extension.
---@param s string
---@return string new_string
function utf8.title(s) end
---Convert UTF-8 string s to folded case, used for case-insensitive comparison. If s
---is a number, it is treated as a code point and the converted code point
---(a number) is returned. utf8.lower/utf8.upper have the same extension.
---@param s string
---@return string new_string
function utf8.fold(s) end
---Compare a and b ignoring case: -1 means a < b, 0 means a == b and 1 means a > b.
---@param a string
---@param b string
---@return integer result
function utf8.ncasecmp(a, b) end
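
A brief usage sketch (not part of the commit) exercising a few of the documented functions, assuming the library is loaded as a global `utf8` table, as the native module registration shown further below suggests:

```lua
-- Iterate code points together with their byte positions.
for pos, code in utf8.codes("héllo") do
  print(pos, utf8.char(code))
end

print(utf8.len("héllo"))        --> 5 characters (the byte length is 6)
print(utf8.width("漢字"))        --> 4: two double-width characters
print(utf8.ncasecmp("Ä", "ä"))  --> 0: equal when case is ignored
```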

src/api/api.c

@@ -5,6 +5,7 @@ int luaopen_renderer(lua_State *L);
int luaopen_regex(lua_State *L);
int luaopen_process(lua_State *L);
int luaopen_dirmonitor(lua_State* L);
+int luaopen_utf8(lua_State* L);
static const luaL_Reg libs[] = {
{ "system", luaopen_system },
@@ -12,6 +13,7 @@ static const luaL_Reg libs[] = {
{ "regex", luaopen_regex },
{ "process", luaopen_process },
{ "dirmonitor", luaopen_dirmonitor },
{ "utf8", luaopen_utf8 },
{ NULL, NULL }
};

src/api/utf8.c (new file, 1305 lines; diff suppressed because it is too large)

src/meson.build

@@ -4,6 +4,7 @@ lite_sources = [
'api/regex.c',
'api/system.c',
'api/process.c',
+'api/utf8.c',
'renderer.c',
'renwindow.c',
'rencache.c',

src/unidata.h (new file, 3710 lines; diff suppressed because it is too large)