Add utf8 support to tokenizer (#945)

* add utf8 support to tokenizer

* wrap utf8 functions in string table using a 'u' prefix

* document new utf8 functions
Authored by Jefferson González on 2022-04-26 09:42:02 -04:00; committed by GitHub
parent 7dd83bb737
commit e572c58f24
9 changed files with 5431 additions and 26 deletions
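
For orientation, a minimal usage sketch (not part of the commit) of what the 'u'-prefixed wrappers enable once data/core/utf8string.lua has been loaded; the values shown are illustrative and assume the lua-utf8 style API documented below:

```lua
-- Assumes core.utf8string has already been required: it simply copies the
-- utf8.* functions into the string table, so they become available as
-- methods on any string value alongside the byte-based originals.
local s = "día"
print(s:len())    -- 4: byte length of the raw string
print(s:ulen())   -- 3: number of UTF-8 characters
print(s:uupper()) -- "DÍA": uppercasing that also handles non-ASCII characters
```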

data/core/start.lua

@@ -35,6 +35,8 @@ table.unpack = table.unpack or unpack
bit32 = bit32 or require "core.bit"
+require "core.utf8string"
-- Because AppImages change the working directory before running the executable,
-- we need to change it back to the original one.
-- https://github.com/AppImage/AppImageKit/issues/172

data/core/tokenizer.lua

@@ -6,7 +6,7 @@ local tokenizer = {}
local function push_token(t, type, text)
local prev_type = t[#t-1]
local prev_text = t[#t]
-if prev_type and (prev_type == type or prev_text:find("^%s*$")) then
+if prev_type and (prev_type == type or prev_text:ufind("^%s*$")) then
t[#t-1] = type
t[#t] = prev_text .. text
else
@@ -38,12 +38,12 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
local fin = find_results[i + 1] - 1
local type = pattern.type[i - 2]
-- ↑ (i - 2) to convert from [3; n] to [1; n]
-local text = full_text:sub(start, fin)
+local text = full_text:usub(start, fin)
push_token(t, syn.symbols[text] or type, text)
end
else
local start, fin = find_results[1], find_results[2]
-local text = full_text:sub(start, fin)
+local text = full_text:usub(start, fin)
push_token(t, syn.symbols[text] or pattern.type, text)
end
end
@@ -52,12 +52,12 @@ end
-- State is a 32-bit number made up of four separate bytes, recording how many
-- different delimiters we have open, and which subsyntaxes we have active.
-- At most, there are 3 subsyntaxes active at the same time. Beyond that, the
-- tokenizer does not support further highlighting.
-- You can think of it as a stack of at most 4 integers (0-255). It always has
-- 1 integer in it. Calling `push_subsyntax` increases the stack depth. Calling
-- `pop_subsyntax` decreases it. The integers represent the index of a pattern
-- that we're following in the syntax. The top of the stack can be any valid
-- pattern index; any integer lower in the stack must represent a pattern that
-- specifies a subsyntax.
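
The comment above describes how `state` packs one pattern index per nesting level. A small illustrative sketch (not from the commit) of that packing, using the same bit32 calls the tokenizer relies on; the indices and locals here are made up:

```lua
local bit32 = require "core.bit"  -- the pure-Lua bit32 shim used by the editor

local state, current_level = 0, 0

-- Mirrors set_subsyntax_pattern_idx below: one byte of state per nesting level.
local function set_pattern_idx(idx)
  state = bit32.replace(state, idx, current_level * 8, 8)
end

set_pattern_idx(5)                 -- outermost syntax is following pattern 5
current_level = current_level + 1  -- what push_subsyntax does when a subsyntax opens
set_pattern_idx(12)                -- the open subsyntax is following pattern 12

print(bit32.extract(state, 0, 8))  --> 5
print(bit32.extract(state, 8, 8))  --> 12
```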
@@ -92,6 +92,9 @@ local function retrieve_syntax_state(incoming_syntax, state)
return current_syntax, subsyntax_info, current_pattern_idx, current_level
end
+---@param incoming_syntax table
+---@param text string
+---@param state integer
function tokenizer.tokenize(incoming_syntax, text, state)
local res = {}
local i = 1
@@ -102,22 +105,22 @@ function tokenizer.tokenize(incoming_syntax, text, state)
state = state or 0
-- incoming_syntax : the parent syntax of the file.
-- state : a 32-bit number representing syntax state (see above)
-- current_syntax : the syntax we're currently in.
-- subsyntax_info : info about the delimiters of this subsyntax.
-- current_pattern_idx: the index of the pattern we're on for this syntax.
-- current_level : how many subsyntaxes deep we are.
local current_syntax, subsyntax_info, current_pattern_idx, current_level =
retrieve_syntax_state(incoming_syntax, state)
-- Should be used to set the state variable. Don't modify it directly.
local function set_subsyntax_pattern_idx(pattern_idx)
current_pattern_idx = pattern_idx
state = bit32.replace(state, pattern_idx, current_level*8, 8)
end
local function push_subsyntax(entering_syntax, pattern_idx)
set_subsyntax_pattern_idx(pattern_idx)
current_level = current_level + 1
@@ -126,15 +129,15 @@ function tokenizer.tokenize(incoming_syntax, text, state)
entering_syntax.syntax or syntax.get(entering_syntax.syntax)
current_pattern_idx = 0
end
local function pop_subsyntax()
set_subsyntax_pattern_idx(0)
current_level = current_level - 1
set_subsyntax_pattern_idx(0)
current_syntax, subsyntax_info, current_pattern_idx, current_level =
retrieve_syntax_state(incoming_syntax, state)
end
local function find_text(text, p, offset, at_start, close)
local target, res = p.pattern or p.regex, { 1, offset - 1 }
local p_idx = close and 2 or 1
@@ -143,14 +146,14 @@ function tokenizer.tokenize(incoming_syntax, text, state)
if p.whole_line == nil then p.whole_line = { } end
if p.whole_line[p_idx] == nil then
-- Match patterns that start with '^'
-p.whole_line[p_idx] = code:match("^%^") and true or false
+p.whole_line[p_idx] = code:umatch("^%^") and true or false
if p.whole_line[p_idx] then
-- Remove '^' from the beginning of the pattern
if type(target) == "table" then
-target[p_idx] = code:sub(2)
+target[p_idx] = code:usub(2)
else
-p.pattern = p.pattern and code:sub(2)
-p.regex = p.regex and code:sub(2)
+p.pattern = p.pattern and code:usub(2)
+p.regex = p.regex and code:usub(2)
end
end
end
@@ -170,7 +173,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
while text:byte(next) and common.is_utf8_cont(text, next) do
next = next + 1
end
-res = p.pattern and { text:find((at_start or p.whole_line[p_idx]) and "^" .. code or code, next) }
+res = p.pattern and { text:ufind((at_start or p.whole_line[p_idx]) and "^" .. code or code, next) }
or { regex.match(code, text, next, (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) }
if res[1] and close and target[3] then
local count = 0
@@ -185,7 +188,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
until not res[1] or not close or not target[3]
return table.unpack(res)
end
while i <= #text do
-- continue trying to match the end pattern of a pair if we have a state set
if current_pattern_idx > 0 then
@@ -198,12 +201,12 @@ function tokenizer.tokenize(incoming_syntax, text, state)
-- precedence over ending the delimiter in the subsyntax.
if subsyntax_info then
local ss, se = find_text(text, subsyntax_info, i, false, true)
-- If we find that we end the subsyntax before the
-- delimiter, push the token, and signal we shouldn't
-- treat the bit after as a token to be normally parsed
-- (as it's the syntax delimiter).
if ss and (s == nil or ss < s) then
-push_token(res, p.type, text:sub(i, ss - 1))
+push_token(res, p.type, text:usub(i, ss - 1))
i = ss
cont = false
end
@@ -212,11 +215,11 @@ function tokenizer.tokenize(incoming_syntax, text, state)
-- continue on as normal.
if cont then
if s then
-push_token(res, p.type, text:sub(i, e))
+push_token(res, p.type, text:usub(i, e))
set_subsyntax_pattern_idx(0)
i = e + 1
else
-push_token(res, p.type, text:sub(i))
+push_token(res, p.type, text:usub(i))
break
end
end
@@ -227,7 +230,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
if subsyntax_info then
local s, e = find_text(text, subsyntax_info, i, true, true)
if s then
-push_token(res, subsyntax_info.type, text:sub(i, e))
+push_token(res, subsyntax_info.type, text:usub(i, e))
-- On finding unescaped delimiter, pop it.
pop_subsyntax()
i = e + 1
@@ -246,7 +249,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
-- If we have a subsyntax, push that onto the subsyntax stack.
if p.syntax then
push_subsyntax(p, n)
else
set_subsyntax_pattern_idx(n)
end
end
@@ -264,7 +267,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
while text:byte(i + n + 1) and common.is_utf8_cont(text, i + n + 1) do
n = n + 1
end
push_token(res, "normal", text:sub(i, i + n))
push_token(res, "normal", text:usub(i, i + n))
i = i + n + 1
end
end
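
Since push_token at the top of the file appends a token type followed by its text, the result is a flat list of alternating type/text entries that callers walk two slots at a time. A hypothetical usage sketch (not part of the commit), assuming tokenize returns that list plus the final state and that core.syntax can resolve a highlighter for the buffer:

```lua
local syntax = require "core.syntax"
local tokenizer = require "core.tokenizer"

-- Hypothetical file name and contents, used only to pick a syntax definition.
local text = 'local café = "naïve"'
local syn = syntax.get("example.lua", text)

local tokens, state = tokenizer.tokenize(syn, text)

-- Flat list: tokens[i] is a token type ("keyword", "string", ...) and
-- tokens[i + 1] is the matched text.
for i = 1, #tokens, 2 do
  print(tokens[i], tokens[i + 1])
end
```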

data/core/utf8string.lua (new file, 30 lines)

@@ -0,0 +1,30 @@
--------------------------------------------------------------------------------
-- inject utf8 functions to strings
--------------------------------------------------------------------------------
string.ubyte = utf8.byte
string.uchar = utf8.char
string.ufind = utf8.find
string.ugmatch = utf8.gmatch
string.ugsub = utf8.gsub
string.ulen = utf8.len
string.ulower = utf8.lower
string.umatch = utf8.match
string.ureverse = utf8.reverse
string.usub = utf8.sub
string.uupper = utf8.upper
string.uescape = utf8.escape
string.ucharpos = utf8.charpos
string.unext = utf8.next
string.uinsert = utf8.insert
string.uremove = utf8.remove
string.uwidth = utf8.width
string.uwidthindex = utf8.widthindex
string.utitle = utf8.title
string.ufold = utf8.fold
string.uncasecmp = utf8.ncasecmp
string.uoffset = utf8.offset
string.ucodepoint = utf8.codepoint
string.ucodes = utf8.codes

docs/api/string.lua (new file, 165 lines)

@@ -0,0 +1,165 @@
---@meta
---UTF-8 equivalent of string.byte
---@param s string
---@param i? integer
---@param j? integer
---@return integer
---@return ...
function string.ubyte(s, i, j) end
---UTF-8 equivalent of string.char
---@param byte integer
---@param ... integer
---@return string
---@return ...
function string.uchar(byte, ...) end
---UTF-8 equivalent of string.find
---@param s string
---@param pattern string
---@param init? integer
---@param plain? boolean
---@return integer start
---@return integer end
---@return ... captured
function string.ufind(s, pattern, init, plain) end
---UTF-8 equivalent of string.gmatch
---@param s string
---@param pattern string
---@param init? integer
---@return fun():string, ...
function string.ugmatch(s, pattern, init) end
---UTF-8 equivalent of string.gsub
---@param s string
---@param pattern string
---@param repl string|table|function
---@param n integer
---@return string
---@return integer count
function string.ugsub(s, pattern, repl, n) end
---UTF-8 equivalent of string.len
---@param s string
---@return integer
function string.ulen(s) end
---UTF-8 equivalent of string.lower
---@param s string
---@return string
function string.ulower(s) end
---UTF-8 equivalent of string.match
---@param s string
---@param pattern string
---@param init? integer
---@return string | number captured
function string.umatch(s, pattern, init) end
---UTF-8 equivalent of string.reverse
---@param s string
---@return string
function string.ureverse(s) end
---UTF-8 equivalent of string.sub
---@param s string
---@param i integer
---@param j? integer
---@return string
function string.usub(s, i, j) end
---UTF-8 equivalent of string.upper
---@param s string
---@return string
function string.uupper(s) end
---Equivalent to utf8.escape()
---@param s string
---@return string utf8_string
function string.uescape(s) end
---Equivalent to utf8.charpos()
---@param s string
---@param charpos? integer
---@param index? integer
---@return integer charpos
---@return integer codepoint
function string.ucharpos(s, charpos, index) end
---Equivalent to utf8.next()
---@param s string
---@param charpos? integer
---@param index? integer
---@return integer charpos
---@return integer codepoint
function string.unext(s, charpos, index) end
---Equivalent to utf8.insert()
---@param s string
---@param idx? integer
---@param substring string
---@return string new_string
function string.uinsert(s, idx, substring) end
---Equivalent to utf8.remove()
---@param s string
---@param start? integer
---@param stop? integer
---@return string new_string
function string.uremove(s, start, stop) end
---Equivalent to utf8.width()
---@param s string
---@param ambi_is_double? boolean
---@param default_width? integer
---@return integer width
function string.uwidth(s, ambi_is_double, default_width) end
---Equivalent to utf8.widthindex()
---@param s string
---@param location integer
---@param ambi_is_double? boolean
---@param default_width? integer
---@return integer idx
---@return integer offset
---@return integer width
function string.uwidthindex(s, location, ambi_is_double, default_width) end
---Equivalent to utf8.title()
---@param s string
---@return string new_string
function string.utitle(s) end
---Equivalent to utf8.fold()
---@param s string
---@return string new_string
function string.ufold(s) end
---Equivalent to utf8.ncasecmp()
---@param a string
---@param b string
---@return integer result
function string.uncasecmp(a, b) end
---Equivalent to utf8.offset()
---@param s string
---@param n integer
---@param i? integer
---@return integer position_in_bytes
function string.uoffset(s, n, i) end
---Equivalent to utf8.codepoint()
---@param s string
---@param i? integer
---@param j? integer
---@return integer code
---@return ...
function string.ucodepoint(s, i, j) end
---Equivalent to utf8.codes()
---@param s string
---@return fun():integer, integer
function string.ucodes(s) end

docs/api/utf8.lua (new file, 187 lines)

@@ -0,0 +1,187 @@
---@meta
---UTF-8 equivalent of string.byte
---@param s string
---@param i? integer
---@param j? integer
---@return integer
---@return ...
function utf8.byte(s, i, j) end
---UTF-8 equivalent of string.char
---@param byte integer
---@param ... integer
---@return string
---@return ...
function utf8.char(byte, ...) end
---UTF-8 equivalent of string.find
---@param s string
---@param pattern string
---@param init? integer
---@param plain? boolean
---@return integer start
---@return integer end
---@return ... captured
function utf8.find(s, pattern, init, plain) end
---UTF-8 equivalent of string.gmatch
---@param s string
---@param pattern string
---@param init? integer
---@return fun():string, ...
function utf8.gmatch(s, pattern, init) end
---UTF-8 equivalent of string.gsub
---@param s string
---@param pattern string
---@param repl string|table|function
---@param n integer
---@return string
---@return integer count
function utf8.gsub(s, pattern, repl, n) end
---UTF-8 equivalent of string.len
---@param s string
---@return integer
function utf8.len(s) end
---UTF-8 equivalent of string.lower
---@param s string
---@return string
function utf8.lower(s) end
---UTF-8 equivalent of string.match
---@param s string
---@param pattern string
---@param init? integer
---@return string | number captured
function utf8.match(s, pattern, init) end
---UTF-8 equivalent of string.reverse
---@param s string
---@return string
function utf8.reverse(s) end
---UTF-8 equivalent of string.sub
---@param s string
---@param i integer
---@param j? integer
---@return string
function utf8.sub(s, i, j) end
---UTF-8 equivalent of string.upper
---@param s string
---@return string
function utf8.upper(s) end
---Escape a string to a UTF-8 format string. It supports several escape formats:
---* %ddd - where ddd is a decimal number of any length: converts the Unicode code point into UTF-8.
---* %{ddd} - same as %ddd but with brackets around the number.
---* %uddd - same as %ddd, where u stands for Unicode.
---* %u{ddd} - same as %{ddd}.
---* %xhhh - hexadecimal version of %ddd.
---* %x{hhh} - same as %xhhh.
---* %? - where '?' stands for any other character: escapes that character.
---Example:
---```lua
---local u = utf8.escape
---print(u"%123%u123%{123}%u{123}%xABC%x{ABC}")
---print(u"%%123%?%d%%u")
---```
---@param s string
---@return string utf8_string
function utf8.escape(s) end
---Convert a UTF-8 character position to a byte offset. If only index is given,
---return the byte offset of that UTF-8 character index. If both charpos and
---index are given, a new charpos is calculated by adding/subtracting the UTF-8
---character index to/from the current charpos. In all cases it returns the new
---character position and the code point (a number) at that position.
---@param s string
---@param charpos? integer
---@param index? integer
---@return integer charpos
---@return integer codepoint
function utf8.charpos(s, charpos, index) end
---Iterate through the UTF-8 string s. If only s is given, it can be used as an iterator:
---```lua
--- for pos, code in utf8.next, "utf8-string" do
--- -- ...
--- end
---```
---If only charpos is given, return the next byte offset in the string. If both
---charpos and index are given, a new charpos is calculated by adding/subtracting
---the UTF-8 character offset to/from the current charpos. In all cases it returns
---the new character position (in bytes) and the code point (a number) at that position.
---@param s string
---@param charpos? integer
---@param index? integer
---@return integer charpos
---@return integer codepoint
function utf8.next(s, charpos, index) end
---Insert a substring into s. If idx is given, insert the substring before the
---character at that index, otherwise the substring is appended to s. idx can be negative.
---@param s string
---@param idx? integer
---@param substring string
---@return string new_string
function utf8.insert(s, idx, substring) end
---Delete a substring in s. If neither start nor stop is given, delete the last
---UTF-8 character in s; if only start is given, delete characters from start to
---the end of s. If stop is also given, delete characters from start to stop
---(including both start and stop). start and stop can be negative.
---@param s string
---@param start? integer
---@param stop? integer
---@return string new_string
function utf8.remove(s, start, stop) end
---Calculate the width of UTF-8 string s. If ambi_is_double is given, ambiguous-width
---characters count as width 2, otherwise as width 1. Fullwidth/doublewidth
---characters count as width 2 and all other characters as width 1. If default_width
---is given, it is used as the width of unprintable characters, which are usually
---displayed with a non-character mark. If s is a code point, return the width of
---that code point.
---@param s string
---@param ambi_is_double? boolean
---@param default_width? integer
---@return integer width
function utf8.width(s, ambi_is_double, default_width) end
---Return the character index at the given location in string s. This is the reverse
---operation of utf8.width(). It returns the index of the character at that location
---and its offset in the UTF-8 encoding; e.g. if the cursor is on the second column
---(the middle) of a wide character, the offset will be 2. The width of the character
---at idx is also returned.
---@param s string
---@param location integer
---@param ambi_is_double? boolean
---@param default_width? integer
---@return integer idx
---@return integer offset
---@return integer width
function utf8.widthindex(s, location, ambi_is_double, default_width) end
---Convert UTF-8 string s to title case, used for case-insensitive comparison. If s
---is a number, it is treated as a code point and the converted code point
---(a number) is returned. utf8.lower/utf8.upper have the same extension.
---@param s string
---@return string new_string
function utf8.title(s) end
---Convert UTF-8 string s to folded case, used for case-insensitive comparison. If s
---is a number, it is treated as a code point and the converted code point
---(a number) is returned. utf8.lower/utf8.upper have the same extension.
---@param s string
---@return string new_string
function utf8.fold(s) end
---Compare a and b ignoring case: -1 means a < b, 0 means a == b and 1 means a > b.
---@param a string
---@param b string
---@return integer result
function utf8.ncasecmp(a, b) end
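
A brief usage sketch (not part of the commit) exercising a few of the documented functions, assuming the library is loaded as a global `utf8` table, as the native module registration shown further below suggests:

```lua
-- Iterate code points together with their byte positions.
for pos, code in utf8.codes("héllo") do
  print(pos, utf8.char(code))
end

print(utf8.len("héllo"))        --> 5 characters (the byte length is 6)
print(utf8.width("漢字"))        --> 4: two double-width characters
print(utf8.ncasecmp("Ä", "ä"))  --> 0: equal when case is ignored
```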

src/api/api.c

@@ -5,6 +5,7 @@ int luaopen_renderer(lua_State *L);
int luaopen_regex(lua_State *L);
int luaopen_process(lua_State *L);
int luaopen_dirmonitor(lua_State* L);
+int luaopen_utf8(lua_State* L);
static const luaL_Reg libs[] = {
{ "system", luaopen_system },
@@ -12,6 +13,7 @@ static const luaL_Reg libs[] = {
{ "regex", luaopen_regex },
{ "process", luaopen_process },
{ "dirmonitor", luaopen_dirmonitor },
{ "utf8", luaopen_utf8 },
{ NULL, NULL }
};

src/api/utf8.c (new file, 1305 lines; diff suppressed because it is too large)

src/meson.build

@@ -4,6 +4,7 @@ lite_sources = [
'api/regex.c',
'api/system.c',
'api/process.c',
+'api/utf8.c',
'renderer.c',
'renwindow.c',
'rencache.c',

src/unidata.h (new file, 3710 lines; diff suppressed because it is too large)