Merge pull request #907 from jgmdev/PR/less-hacky-tokenizer-fix

syntax: remove pattern re-ordering on optimization
2022-03-29 22:26:50 -04:00 · 2022-03-29 22:26:50 -04:00 · 693bf2cf29
parent 7372d2f82d b0c005a5ac
commit 693bf2cf29
2 changed files with 45 additions and 59 deletions
--- a/data/core/syntax.lua
+++ b/data/core/syntax.lua
@ -7,47 +7,22 @@ local plain_text_syntax = { name = "Plain Text", patterns = {}, symbols = {} }


 function syntax.add(t)
-  -- the rule %s+ gives us a performance gain for the tokenizer in lines with
-  -- long amounts of consecutive spaces, to not affect other patterns we
-  -- insert it after any rule that starts with spaces to prevent conflicts
+  if type(t.space_handling) ~= "boolean" then t.space_handling = true end
+
  if t.patterns then
-    local temp_patterns = {}
-    ::pattern_remove_loop::
-    for pos, pattern in ipairs(t.patterns) do
-      local pattern_str = ""
-      local ptype = pattern.pattern
-        and "pattern" or (pattern.regex and "regex" or nil)
-      if ptype then
-        if type(pattern[ptype]) == "table" then
-          pattern_str = pattern[ptype][1]
-        else
-          pattern_str = pattern[ptype]
+    -- the rule %s+ gives us a performance gain for the tokenizer in lines with
+    -- long runs of consecutive spaces; plugins where it causes conflicts can
+    -- disable it by declaring the table property: space_handling = false
+    if t.space_handling then
+      table.insert(t.patterns, { pattern = "%s+", type = "normal" })
    end
-        if (ptype == "pattern" and(
-            pattern_str:find("^%^?%%s")
-            or
-            pattern_str:find("^%^?%s")
-          ))
-          or
-          (ptype == "regex" and (
-            pattern_str:find("^%^?\\s")
-            or
-            pattern_str:find("^%^?%s")
-          ))
-        then
-          table.insert(temp_patterns, table.remove(t.patterns, pos))
-          -- since we are removing from iterated table we need to start
-          -- from the beginning again to prevent any issues
-          goto pattern_remove_loop
-        end
-      end
-    end
-    for pos, pattern in ipairs(temp_patterns) do
-      table.insert(t.patterns, pos, pattern)
-    end
-    local pos = 1
-    if #temp_patterns > 0 then pos = #temp_patterns+1 end
-    table.insert(t.patterns, pos, { pattern = "%s+", type = "normal" })
+
+    -- this rule gives us an additional performance gain by matching every word
+    -- that was not matched by the syntax patterns as a single token. This keeps
+    -- the tokenizer from iterating over each character individually, which is a
+    -- lot slower because the iteration occurs in Lua instead of C and, on top of
+    -- that, every pattern is tried against each single char (same as spaces)
+    table.insert(t.patterns, { pattern = "%w+%f[%s]", type = "normal" })
  end

  table.insert(syntax.items, t)
--- a/data/plugins/language_md.lua
+++ b/data/plugins/language_md.lua
@ -31,7 +31,35 @@ syntax.add {
  name = "Markdown",
  files = { "%.md$", "%.markdown$" },
  block_comment = { "<!--", "-->" },
+  space_handling = false, -- turn off this feature to handle it ourselves
  patterns = {
+  ---- Patterns that require spaces at the start are placed first to optimize
+  ---- matching speed; the %s+ optimization is applied immediately after them
+    -- bullets
+    { pattern = "^%s*%*%s",                 type = "number" },
+    { pattern = "^%s*%-%s",                 type = "number" },
+    { pattern = "^%s*%+%s",                 type = "number" },
+    -- numbered bullet
+    { pattern = "^%s*[0-9]+[%.%)]%s",       type = "number" },
+    -- blockquote
+    { pattern = "^%s*>+%s",                 type = "string" },
+    -- alternative bold italic formats
+    { pattern = { "%s___", "___%f[%s]" },   type = "markdown_bold_italic" },
+    { pattern = { "%s__", "__%f[%s]" },     type = "markdown_bold" },
+    { pattern = { "%s_[%S]", "_%f[%s]" },   type = "markdown_italic" },
+    -- reference links
+    {
+      pattern = "^%s*%[%^()["..in_squares_match.."]+()%]: ",
+      type = { "function", "number", "function" }
+    },
+    {
+      pattern = "^%s*%[%^?()["..in_squares_match.."]+()%]:%s+.+\n",
+      type = { "function", "number", "function" }
+    },
+    -- optimization
+    { pattern = "%s+",                      type = "normal" },
+
+
  ---- HTML rules imported and adapted from language_html
  ---- to not conflict with markdown rules
    -- Inline JS and CSS
@ -129,14 +157,6 @@ syntax.add {
    { pattern = "^%-%-%-+\n",               type = "comment" },
    { pattern = "^%*%*%*+\n",               type = "comment" },
    { pattern = "^___+\n",                  type = "comment" },
-    -- bullets
-    { pattern = "^%s*%*%s",                 type = "number" },
-    { pattern = "^%s*%-%s",                 type = "number" },
-    { pattern = "^%s*%+%s",                 type = "number" },
-    -- numbered bullet
-    { pattern = "^%s*[0-9]+[%.%)]%s",       type = "number" },
-    -- blockquote
-    { pattern = "^%s*>+%s",                 type = "string" },
    -- bold and italic
    { pattern = { "%*%*%*%S", "%*%*%*" },   type = "markdown_bold_italic" },
    { pattern = { "%*%*%S", "%*%*" },       type = "markdown_bold" },
@ -149,9 +169,6 @@ syntax.add {
    { pattern = "^___[%s%p%w]+___%s" ,      type = "markdown_bold_italic" },
    { pattern = "^__[%s%p%w]+__%s" ,        type = "markdown_bold" },
    { pattern = "^_[%s%p%w]+_%s" ,          type = "markdown_italic" },
-    { pattern = { "%s___", "___%f[%s]" },   type = "markdown_bold_italic" },
-    { pattern = { "%s__", "__%f[%s]" },     type = "markdown_bold" },
-    { pattern = { "%s_[%S]", "_%f[%s]" },   type = "markdown_italic" },
    -- heading with custom id
    {
      pattern = "^#+%s[%w%s%p]+(){()#[%w%-]+()}",
@ -186,14 +203,6 @@ syntax.add {
      pattern = "%[()["..in_squares_match.."]+()%] *()%[()["..in_squares_match.."]+()%]",
      type = { "function", "string", "function", "function", "number", "function" }
    },
-    {
-      pattern = "^%s*%[%^()["..in_squares_match.."]+()%]: ",
-      type = { "function", "number", "function" }
-    },
-    {
-      pattern = "^%s*%[%^?()["..in_squares_match.."]+()%]:%s+.+\n",
-      type = { "function", "number", "function" }
-    },
    {
      pattern = "!?%[%^?()["..in_squares_match.."]+()%]",
      type = { "function", "number", "function" }
@ -204,7 +213,9 @@ syntax.add {
      type = "function"
    },
    { pattern = "<https?://%S+>",           type = "function" },
-    { pattern = "https?://%S+",             type = "function" }
+    { pattern = "https?://%S+",             type = "function" },
+    -- optimize consecutive dashes used in tables
+    { pattern = "%-+",                      type = "normal" },
  },
  symbols = { },
 }