support for multiple groups in one pattern (#196)

2021-05-19 22:35:28 +02:00 · 2021-05-19 22:35:28 +02:00 · 86a7037ed9
commit 86a7037ed9
parent ba4fbde33d
2 changed files with 54 additions and 21 deletions
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@ -15,6 +15,39 @@ local function push_token(t, type, text)
 end


+-- Pushes the token(s) produced by one successful pattern match onto `t`.
+--
+--   t             token list, forwarded to push_token
+--   syn           syntax table; syn.symbols[text] overrides the pattern type
+--   pattern       pattern entry; `type` is either a single string, or a
+--                 table holding one type per span when the pattern contains
+--                 position captures `()`
+--   full_text     the text the match was made against
+--   find_results  packed results of string.find:
+--                 { start, end, capture_1, capture_2, … }
+--                 (this table is modified in place)
+local function push_tokens(t, syn, pattern, full_text, find_results)
+  if #find_results > 2 then
+    -- We do some manipulation with find_results so that it's arranged
+    -- like this:
+    -- { start, end, i_1, i_2, i_3, …, i_last }
+    -- Each position spans characters from i_n to ((i_n+1) - 1), to form
+    -- consecutive spans of text.
+    --
+    -- If i_1 is not equal to start, start is inserted at that index so
+    -- that the first span begins at the start of the match.
+    if find_results[3] ~= find_results[1] then
+      table.insert(find_results, 3, find_results[1])
+    end
+    -- Copy the ending index to the end of the table, so that an ending index
+    -- always follows a starting index after position 3 in the table.
+    table.insert(find_results, find_results[2] + 1)
+    -- A plain string type (no per-span table) applies to every span;
+    -- indexing the string itself would silently yield nil.
+    local types = pattern.type
+    local per_span = type(types) == "table"
+    -- Then, we just iterate over our modified table.
+    for i = 3, #find_results - 1 do
+      local start = find_results[i]
+      local fin = find_results[i + 1] - 1
+      -- Two captures at the same position (or a capture at end + 1) give
+      -- an empty span; skip it instead of pushing an empty token.
+      if fin >= start then
+        local token_type = types
+        if per_span then
+          -- (i - 2) converts from [3; n] to [1; n]
+          token_type = types[i - 2]
+        end
+        local text = full_text:sub(start, fin)
+        push_token(t, syn.symbols[text] or token_type, text)
+      end
+    end
+  else
+    -- No captures: the whole match becomes a single token.
+    local start, fin = find_results[1], find_results[2]
+    local text = full_text:sub(start, fin)
+    push_token(t, syn.symbols[text] or pattern.type, text)
+  end
+end
+
+
 local function is_escaped(text, idx, esc)
  local byte = esc:byte()
  local count = 0
@ -49,7 +82,7 @@ local function retrieve_syntax_state(incoming_syntax, state)
    -- If we have higher bits, then decode them one at a time, and find which
    -- syntax we're using. Rather than walking the bytes, and calling into
    -- `syntax` each time, we could probably cache this in a single table.
-    for i=0,2 do
+    for i = 0, 2 do
      local target = bit32.extract(state, i*8, 8)
      if target ~= 0 then
        if current_syntax.patterns[target].syntax then
@ -138,13 +171,13 @@ function tokenizer.tokenize(incoming_syntax, text, state)
    local matched = false
    for n, p in ipairs(current_syntax.patterns) do
      local pattern = (type(p.pattern) == "table") and p.pattern[1] or p.pattern
-      local s, e = text:find("^" .. pattern, i)
+      local find_results = { text:find("^" .. pattern, i) }
+      local start, fin = find_results[1], find_results[2]

-      if s then
-        -- matched pattern; make and add token
-        local t = text:sub(s, e)
+      if start then
+        -- matched pattern; make and add tokens
+        push_tokens(res, current_syntax, p, text, find_results)

-        push_token(res, current_syntax.symbols[t] or p.type, t)
        -- update state if this was a start|end pattern pair
        if type(p.pattern) == "table" then
          state = bit32.replace(state, n, current_level*8, 8)
@ -162,7 +195,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
        end

        -- move cursor past this token
-        i = e + 1
+        i = fin + 1
        matched = true
        break
      end
--- a/data/plugins/language_c.lua
+++ b/data/plugins/language_c.lua
@ -5,17 +5,20 @@ syntax.add {
  files = { "%.c$", "%.h$", "%.inl$", "%.cpp$", "%.hpp$" },
  comment = "//",
  patterns = {
-    { pattern = "//.-\n",               type = "comment"  },
-    { pattern = { "/%*", "%*/" },       type = "comment"  },
-    { pattern = { "#", "[^\\]\n" },     type = "comment"  },
-    { pattern = { '"', '"', '\\' },     type = "string"   },
-    { pattern = { "'", "'", '\\' },     type = "string"   },
-    { pattern = "-?0x%x+",              type = "number"   },
-    { pattern = "-?%d+[%d%.eE]*f?",     type = "number"   },
-    { pattern = "-?%.?%d+f?",           type = "number"   },
-    { pattern = "[%+%-=/%*%^%%<>!~|&]", type = "operator" },
-    { pattern = "[%a_][%w_]*%f[(]",     type = "function" },
-    { pattern = "[%a_][%w_]*",          type = "symbol"   },
+    { pattern = "//.-\n",                type = "comment" },
+    { pattern = { "/%*", "%*/" },        type = "comment" },
+    { pattern = { '"', '"', '\\' },      type = "string"  },
+    { pattern = { "'", "'", '\\' },      type = "string"  },
+    { pattern = "0x%x+",                 type = "number"  },
+    { pattern = "%d+[%d%.eE]*f?",        type = "number"  },
+    { pattern = "%.?%d+f?",              type = "number"  },
+    { pattern = "[%+%-=/%*%^%%<>!~|&]",  type = "operator" },
+    { pattern = "struct%s()[%a_][%w_]*", type = {"keyword", "keyword2"} },
+    { pattern = "union%s()[%a_][%w_]*",  type = {"keyword", "keyword2"} },
+    { pattern = "[%a_][%w_]*%f[(]",      type = "function" },
+    { pattern = "[%a_][%w_]*",           type = "symbol" },
+    { pattern = "#include%s()<.->",      type = {"keyword", "string"} },
+    { pattern = "#[%a_][%w_]*",          type = "keyword" },
  },
  symbols = {
    ["if"]       = "keyword",
@ -29,8 +32,6 @@ syntax.add {
    ["continue"] = "keyword",
    ["return"]   = "keyword",
    ["goto"]     = "keyword",
-    ["struct"]   = "keyword",
-    ["union"]    = "keyword",
    ["typedef"]  = "keyword",
    ["enum"]     = "keyword",
    ["extern"]   = "keyword",
@ -42,7 +43,6 @@ syntax.add {
    ["case"]     = "keyword",
    ["default"]  = "keyword",
    ["auto"]     = "keyword",
-    ["const"]    = "keyword",
    ["void"]     = "keyword",
    ["int"]      = "keyword2",
    ["short"]    = "keyword2",