From 248d70a8ca5d8bf516ca6809fde0d500b9a7221f Mon Sep 17 00:00:00 2001
From: Adam
Date: Wed, 2 Jun 2021 15:27:00 -0400
Subject: [PATCH] Add PCRE to support regular expressions

Use regular expressions instead of Lua patterns for the find and replace
editor commands. Syntax files can now use either regex or Lua patterns;
Lua patterns keep working as before, preserving backward compatibility
for plugins.
---
 build.sh                           |   2 +-
 data/core/commands/findreplace.lua |  14 ++--
 data/core/doc/search.lua           |  36 +++++----
 data/core/init.lua                 |   1 +
 data/core/regex.lua                |  69 +++++++++++++++++
 data/core/tokenizer.lua            |  75 ++++++++----------
 data/plugins/projectsearch.lua     |   9 ++-
 meson.build                        |   1 +
 src/api/api.c                      |   2 +
 src/api/regex.c                    | 115 +++++++++++++++++++++++++++++
 src/meson.build                    |   3 +-
 11 files changed, 257 insertions(+), 70 deletions(-)
 create mode 100644 data/core/regex.lua
 create mode 100644 src/api/regex.c

diff --git a/build.sh b/build.sh
index bfb679b0..6f605797 100755
--- a/build.sh
+++ b/build.sh
@@ -3,7 +3,7 @@
 cflags+="-Wall -O3 -g -std=gnu11 -fno-strict-aliasing -Isrc -Ilib/font_renderer"
 cflags+=" $(pkg-config --cflags lua5.2) $(sdl2-config --cflags)"
 lflags="-static-libgcc -static-libstdc++"
-for package in libagg freetype2 lua5.2 x11; do
+for package in libagg freetype2 lua5.2 x11 libpcre2-8; do
   lflags+=" $(pkg-config --libs $package)"
 done
 lflags+=" $(sdl2-config --libs) -lm"
diff --git a/data/core/commands/findreplace.lua b/data/core/commands/findreplace.lua
index 937c410a..af60f33f 100644
--- a/data/core/commands/findreplace.lua
+++ b/data/core/commands/findreplace.lua
@@ -90,6 +90,7 @@ local function has_selection()
     and core.active_view.doc:has_selection()
 end
 
+
 command.add(has_selection, {
   ["find-replace:select-next"] = function()
     local l1, c1, l2, c2 = doc():get_selection(true)
@@ -107,9 +108,9 @@ command.add("core.docview", {
     end)
   end,
 
-  ["find-replace:find-pattern"] = function()
-    find("Find Text Pattern", function(doc, line, col, text)
-      local opt = { wrap = true, no_case = true, pattern = true }
+  ["find-replace:find-regex"] = function()
+    find("Find Text Regex", function(doc, line, col, text)
+      local opt = { wrap = true, no_case = true, regex = true }
       return search.find(doc, line, col, text, opt)
     end)
   end,
@@ -144,9 +145,10 @@ command.add("core.docview", {
     end)
  end,
 
-  ["find-replace:replace-pattern"] = function()
-    replace("Pattern", "", function(text, old, new)
-      return text:gsub(old, new)
+  ["find-replace:replace-regex"] = function()
+    replace("Regex", "", function(text, old, new)
+      local re = regex.compile(old)
+      return regex.gsub(re, text, new)
     end)
   end,
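
Roughly, the two renamed commands drive the layers below as follows (an
illustrative sketch, not part of the patch; doc, line, col and old_text
stand for the current document, cursor position and selected text, and the
query strings are made up):

    -- "find-replace:find-regex" builds an options table and defers to search.find
    local opt = { wrap = true, no_case = true, regex = true }
    local l1, c1, l2, c2 = search.find(doc, line, col, "TODO|FIXME", opt)

    -- "find-replace:replace-regex" compiles the query once, then substitutes
    local re = regex.compile("lite")
    local new_text, n = regex.gsub(re, old_text, "lite v2")
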
diff --git a/data/core/doc/search.lua b/data/core/doc/search.lua
index fe57523e..04090673 100644
--- a/data/core/doc/search.lua
+++ b/data/core/doc/search.lua
@@ -15,12 +15,8 @@ local function init_args(doc, line, col, text, opt)
   opt = opt or default_opt
   line, col = doc:sanitize_position(line, col)
 
-  if opt.no_case then
-    if opt.pattern then
-      text = text:gsub("%%?.", pattern_lower)
-    else
-      text = text:lower()
-    end
+  if opt.no_case and not opt.regex then
+    text = text:lower()
   end
 
   return doc, line, col, text, opt
@@ -30,20 +26,32 @@ end
 
 function search.find(doc, line, col, text, opt)
   doc, line, col, text, opt = init_args(doc, line, col, text, opt)
+  local re
+  if opt.regex then
+    re = regex.compile(text, opt.no_case and "i" or "")
+  end
   for line = line, #doc.lines do
     local line_text = doc.lines[line]
-    if opt.no_case then
-      line_text = line_text:lower()
+    if opt.regex then
+      local s, e = re:cmatch(line_text, col)
+      if s then
+        return line, s, line, e
+      end
+      col = 1
+    else
+      if opt.no_case then
+        line_text = line_text:lower()
+      end
+      local s, e = line_text:find(text, col, true)
+      if s then
+        return line, s, line, e + 1
+      end
+      col = 1
     end
-    local s, e = line_text:find(text, col, not opt.pattern)
-    if s then
-      return line, s, line, e + 1
-    end
-    col = 1
   end
 
   if opt.wrap then
-    opt = { no_case = opt.no_case, pattern = opt.pattern }
+    opt = { no_case = opt.no_case, regex = opt.regex }
     return search.find(doc, 1, 1, text, opt)
   end
 end
diff --git a/data/core/init.lua b/data/core/init.lua
index cc4b46d8..a3e6eba6 100644
--- a/data/core/init.lua
+++ b/data/core/init.lua
@@ -1,4 +1,5 @@
 require "core.strict"
+require "core.regex"
 local common = require "core.common"
 local config = require "core.config"
 local style = require "core.style"
diff --git a/data/core/regex.lua b/data/core/regex.lua
new file mode 100644
index 00000000..a360f0a9
--- /dev/null
+++ b/data/core/regex.lua
@@ -0,0 +1,69 @@
+
+-- So that in addition to regex.gsub(pattern, string), we can also do
+-- pattern:gsub(string).
+regex.__index = function(table, key) return regex[key]; end
+
+regex.match = function(pattern_string, string, offset, options)
+  local pattern = type(pattern_string) == "table" and
+    pattern_string or regex.compile(pattern_string)
+  return regex.cmatch(pattern, string, offset, options)
+end
+
+-- Will iterate back through any UTF-8 bytes so that we don't replace bits
+-- mid character.
+local function previous_character(str, index)
+  local byte
+  repeat
+    index = index - 1
+    byte = string.byte(str, index)
+  until byte < 128 or byte >= 192
+  return index
+end
+
+-- Moves to the end of the identified character.
+local function end_character(str, index)
+  local byte = string.byte(str, index + 1)
+  while byte >= 128 and byte < 192 do
+    index = index + 1
+    byte = string.byte(str, index + 1)
+  end
+  return index
+end
+
+-- Build off matching. For now, only support basic replacements, but capture
+-- groupings should be doable. We can even have custom group replacements and
+-- transformations and stuff in lua. Currently, this takes group replacements
+-- as \1 - \9.
+-- Should work on UTF-8 text.
+regex.gsub = function(pattern_string, str, replacement)
+  local pattern = type(pattern_string) == "table" and
+    pattern_string or regex.compile(pattern_string)
+  local result, indices = ""
+  local n = 0
+  repeat
+    indices = { regex.cmatch(pattern, str) }
+    if #indices > 0 then
+      n = n + 1
+      local currentReplacement = replacement
+      if #indices > 2 then
+        for i = 1, (#indices/2 - 1) do
+          currentReplacement = string.gsub(
+            currentReplacement,
+            "\\" .. i,
+            str:sub(indices[i*2+1], end_character(str, indices[i*2+2]-1))
+          )
+        end
+      end
+      currentReplacement = string.gsub(currentReplacement, "\\%d", "")
+      if indices[1] > 1 then
+        result = result ..
+          str:sub(1, previous_character(str, indices[1])) .. currentReplacement
+      else
+        result = result .. currentReplacement
+      end
+      str = str:sub(indices[2])
+    end
+  until #indices == 0 or indices[1] == indices[2]
+  return result .. str, n
+end
+
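
To make the group-replacement convention above concrete, here is a small
illustrative call (not part of the patch): \1 through \9 in the replacement
string are substituted with the corresponding capture groups, and the two
UTF-8 helpers keep the splice points on character boundaries.

    -- swap key and value in every "key=value;" pair
    local out, n = regex.gsub("(\\w+)=(\\w+);", "a=1;b=2;", "\\2=\\1;")
    -- out == "1=a;2=b;", n == 2
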
diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index 83e0e665..a20dba5e 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -48,29 +48,6 @@ local function push_tokens(t, syn, pattern, full_text, find_results)
 end
 
 
-local function is_escaped(text, idx, esc)
-  local byte = esc:byte()
-  local count = 0
-  for i = idx - 1, 1, -1 do
-    if text:byte(i) ~= byte then break end
-    count = count + 1
-  end
-  return count % 2 == 1
-end
-
-
-local function find_non_escaped(text, pattern, offset, esc)
-  while true do
-    local s, e = text:find(pattern, offset)
-    if not s then break end
-    if esc and is_escaped(text, s, esc) then
-      offset = e + 1
-    else
-      return s, e
-    end
-  end
-end
-
 -- State is a 32-bit number that is four separate bytes, illustrating how many
 -- differnet delimiters we have open, and which subsyntaxes we have active.
 -- At most, there are 3 subsyntaxes active at the same time. Beyond that,
@@ -155,26 +132,44 @@ function tokenizer.tokenize(incoming_syntax, text, state)
     set_subsyntax_pattern_idx(0)
     current_syntax, subsyntax_info, current_pattern_idx, current_level =
       retrieve_syntax_state(incoming_syntax, state)
   end
 
+  local function find_text(text, p, offset, at_start, close)
+    local target, res = p.pattern or p.regex, { 1, offset - 1 }, p.regex
+    local code = type(target) == "table" and target[close and 2 or 1] or target
+    if p.regex and type(p.regex) ~= "table" then
+      p._regex = p._regex or regex.compile(p.regex)
+      code = p._regex
+    end
+    repeat
+      res = p.pattern and { text:find(at_start and "^" .. code or code, res[2]+1) }
+        or { regex.match(code, text, res[2]+1, at_start and regex.ANCHORED or 0) }
+      if res[1] and close and target[3] then
+        local count = 0
+        for i = res[1] - 1, 1, -1 do
+          if text:byte(i) ~= target[3]:byte() then break end
+          count = count + 1
+        end
+        -- Check to see if the escaped character is there,
+        -- and if it is not itself escaped.
+        if count % 2 == 0 then break end
+      end
+    until not res[1] or not close or not target[3]
+    return unpack(res)
+  end
+
   while i <= #text do
     -- continue trying to match the end pattern of a pair if we have a state set
     if current_pattern_idx > 0 then
       local p = current_syntax.patterns[current_pattern_idx]
-      local s, e = find_non_escaped(text, p.pattern[2], i, p.pattern[3])
+      local s, e = find_text(text, p, i, false, true)
       local cont = true
       -- If we're in subsyntax mode, always check to see if we end our syntax
       -- first, before the found delimeter, as ending the subsyntax takes
       -- precedence over ending the delimiter in the subsyntax.
       if subsyntax_info then
-        local ss, se = find_non_escaped(
-          text,
-          subsyntax_info.pattern[2],
-          i,
-          subsyntax_info.pattern[3]
-        )
+        local ss, se = find_text(text, subsyntax_info, i, false, true)
         -- If we find that we end the subsyntax before the
         -- delimiter, push the token, and signal we shouldn't
         -- treat the bit after as a token to be normally parsed
@@ -202,12 +197,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
       -- we're ending early in the middle of a delimiter, or
       -- just normally, upon finding a token.
       if subsyntax_info then
-        local s, e = find_non_escaped(
-          text,
-          "^" .. subsyntax_info.pattern[2],
-          i,
-          nil
-        )
+        local s, e = find_text(text, subsyntax_info, i, true, true)
         if s then
           push_token(res, subsyntax_info.type, text:sub(i, e))
           -- On finding unescaped delimiter, pop it.
@@ -219,16 +209,12 @@ function tokenizer.tokenize(incoming_syntax, text, state)
     -- find matching pattern
     local matched = false
     for n, p in ipairs(current_syntax.patterns) do
-      local pattern = (type(p.pattern) == "table") and p.pattern[1] or p.pattern
-      local find_results = { text:find("^" .. pattern, i) }
-      local start, fin = find_results[1], find_results[2]
-
-      if start then
+      local find_results = { find_text(text, p, i, true, false) }
+      if find_results[1] then
         -- matched pattern; make and add tokens
         push_tokens(res, current_syntax, p, text, find_results)
 
-        -- update state if this was a start|end pattern pair
-        if type(p.pattern) == "table" then
+        if type(p.pattern or p.regex) == "table" then
          -- If we have a subsyntax, push that onto the subsyntax stack.
          if p.syntax then
            push_subsyntax(p, n)
@@ -236,9 +222,8 @@ function tokenizer.tokenize(incoming_syntax, text, state)
          else
            set_subsyntax_pattern_idx(n)
          end
        end
-        -- move cursor past this token
-        i = fin + 1
+        i = find_results[2] + 1
        matched = true
        break
      end
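
With find_text in place, a syntax definition can mix the two pattern styles.
The language below is hypothetical and only illustrates the rule shapes the
tokenizer now accepts: a plain Lua pattern, a single regex, or a
{ start, stop, escape } regex pair.

    local syntax = require "core.syntax"

    syntax.add {
      files = { "%.example$" },
      comment = "//",
      patterns = {
        { regex   = "//.*",             type = "comment" },
        { regex   = { '"', '"', '\\' }, type = "string"  },
        { regex   = "-?\\d+",           type = "number"  },
        { pattern = "[%a_][%w_]*",      type = "symbol"  },
      },
      symbols = {},
    }
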
diff --git a/data/plugins/projectsearch.lua b/data/plugins/projectsearch.lua
index 69a27094..45399ed0 100644
--- a/data/plugins/projectsearch.lua
+++ b/data/plugins/projectsearch.lua
@@ -237,9 +237,12 @@ command.add(nil, {
     end)
   end,
 
-  ["project-search:find-pattern"] = function()
-    core.command_view:enter("Find Pattern In Project", function(text)
-      begin_search(text, function(line_text) return line_text:find(text) end)
+  ["project-search:find-regex"] = function()
+    core.command_view:enter("Find Regex In Project", function(text)
+      local re = regex.compile(text, "i")
+      begin_search(text, function(line_text)
+        return regex.cmatch(re, line_text)
+      end)
     end)
   end,
 
diff --git a/meson.build b/meson.build
index 9b1ab84d..5a218df7 100644
--- a/meson.build
+++ b/meson.build
@@ -9,6 +9,7 @@ libm = cc.find_library('m', required : false)
 libdl = cc.find_library('dl', required : false)
 libx11 = dependency('x11', required : false)
 lua_dep = dependency('lua5.2', required : false)
+pcre2_dep = dependency('libpcre2-8')
 
 if not lua_dep.found()
     lua_subproject = subproject('lua', default_options: ['shared=false', 'use_readline=false', 'app=false'])
diff --git a/src/api/api.c b/src/api/api.c
index 34067a9c..5ea2e782 100644
--- a/src/api/api.c
+++ b/src/api/api.c
@@ -3,11 +3,13 @@
 int luaopen_system(lua_State *L);
 int luaopen_renderer(lua_State *L);
+int luaopen_regex(lua_State *L);
 
 static const luaL_Reg libs[] = {
   { "system", luaopen_system },
   { "renderer", luaopen_renderer },
+  { "regex", luaopen_regex },
   { NULL, NULL }
 };
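
Once luaopen_regex is registered here, the module implemented in
src/api/regex.c below becomes available to Lua as a global `regex` table
(like the existing `system` and `renderer` modules), and core.regex extends
it with match and gsub. A short illustrative session, with made-up patterns:

    -- compile() raises on a bad pattern, so wrap untrusted input in pcall
    local ok, err = pcall(regex.compile, "([a-z]+")   -- unbalanced parenthesis

    -- "i", "m" and "s" map to PCRE2_CASELESS, PCRE2_MULTILINE and PCRE2_DOTALL
    local re = regex.compile("foo", "i")
    local s, e = regex.cmatch(re, "FOOBAR")           -- 1, 4: e is one past the match
    local s2 = regex.cmatch(re, "barFOO", 4, regex.ANCHORED)  -- anchored match at offset 4
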
diff --git a/src/api/regex.c b/src/api/regex.c
new file mode 100644
index 00000000..5245d8c2
--- /dev/null
+++ b/src/api/regex.c
@@ -0,0 +1,115 @@
+#include "api.h"
+
+#define PCRE2_CODE_UNIT_WIDTH 8
+
+#include <string.h>
+#include <pcre2.h>
+
+static int f_pcre_gc(lua_State* L) {
+  lua_rawgeti(L, -1, 1);
+  pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1);
+  if (re)
+    pcre2_code_free(re);
+  return 0;
+}
+
+static int f_pcre_compile(lua_State *L) {
+  size_t len;
+  PCRE2_SIZE errorOffset;
+  int errorNumber;
+  int pattern = PCRE2_UTF;
+  const char* str = luaL_checklstring(L, 1, &len);
+  if (lua_gettop(L) > 1) {
+    const char* options = luaL_checkstring(L, 2);
+    if (strstr(options,"i"))
+      pattern |= PCRE2_CASELESS;
+    if (strstr(options,"m"))
+      pattern |= PCRE2_MULTILINE;
+    if (strstr(options,"s"))
+      pattern |= PCRE2_DOTALL;
+  }
+  pcre2_code* re = pcre2_compile(
+    (PCRE2_SPTR)str,
+    len,
+    pattern,
+    &errorNumber,
+    &errorOffset,
+    NULL
+  );
+  if (re) {
+    lua_newtable(L);
+    lua_pushlightuserdata(L, re);
+    lua_rawseti(L, -2, 1);
+    luaL_setmetatable(L, "regex");
+    return 1;
+  }
+  PCRE2_UCHAR buffer[256];
+  pcre2_get_error_message(errorNumber, buffer, sizeof(buffer));
+  luaL_error(L, "regex compilation failed at offset %d: %s",
+    (int)errorOffset, buffer);
+  return 0;
+}
+
+// Takes string, compiled regex, returns list of indices of matched groups
+// (including the whole match), if a match was found.
+static int f_pcre_match(lua_State *L) {
+  size_t len, offset = 1, opts = 0;
+  luaL_checktype(L, 1, LUA_TTABLE);
+  const char* str = luaL_checklstring(L, 2, &len);
+  if (lua_gettop(L) > 2)
+    offset = luaL_checknumber(L, 3);
+  if (lua_gettop(L) > 3)
+    opts = luaL_checknumber(L, 4);
+  lua_rawgeti(L, 1, 1);
+  pcre2_code* re = (pcre2_code*)lua_touserdata(L, -1);
+  pcre2_match_data* md = pcre2_match_data_create_from_pattern(re, NULL);
+  int rc = pcre2_match(re, (PCRE2_SPTR)str, len, offset - 1, opts, md, NULL);
+  if (rc < 0) {
+    pcre2_match_data_free(md);
+    if (rc != PCRE2_ERROR_NOMATCH)
+      luaL_error(L, "regex matching error %d", rc);
+    return 0;
+  }
+  PCRE2_SIZE* ovector = pcre2_get_ovector_pointer(md);
+  if (ovector[0] > ovector[1]) {
+    /* We must guard against patterns such as /(?=.\K)/ that use \K in an
+    assertion to set the start of a match later than its end. In the editor,
+    we just detect this case and give up. */
+    pcre2_match_data_free(md);
+    luaL_error(L, "regex matching error: \\K was used in an assertion to "
+      "set the match start after its end");
+    return 0;
+  }
+  for (int i = 0; i < rc*2; i++)
+    lua_pushnumber(L, ovector[i]+1);
+  pcre2_match_data_free(md);
+  return rc*2;
+}
+
+static const luaL_Reg lib[] = {
+  { "compile", f_pcre_compile },
+  { "cmatch", f_pcre_match },
+  { "__gc", f_pcre_gc },
+  { NULL, NULL }
+};
+
+int luaopen_regex(lua_State *L) {
+  luaL_newlib(L, lib);
+  lua_pushliteral(L, "regex");
+  lua_setfield(L, -2, "__name");
+  lua_pushvalue(L, -1);
+  lua_setfield(L, LUA_REGISTRYINDEX, "regex");
+  lua_pushnumber(L, PCRE2_ANCHORED);
+  lua_setfield(L, -2, "ANCHORED");
+  lua_pushnumber(L, PCRE2_ENDANCHORED);
+  lua_setfield(L, -2, "ENDANCHORED");
+  lua_pushnumber(L, PCRE2_NOTBOL);
+  lua_setfield(L, -2, "NOTBOL");
+  lua_pushnumber(L, PCRE2_NOTEOL);
+  lua_setfield(L, -2, "NOTEOL");
+  lua_pushnumber(L, PCRE2_NOTEMPTY);
+  lua_setfield(L, -2, "NOTEMPTY");
+  lua_pushnumber(L, PCRE2_NOTEMPTY_ATSTART);
+  lua_setfield(L, -2, "NOTEMPTY_ATSTART");
+  return 1;
+}
diff --git a/src/meson.build b/src/meson.build
index 881014be..faa1a8ea 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -3,6 +3,7 @@ lite_sources = [
     'api/cp_replace.c',
     'api/renderer.c',
     'api/renderer_font.c',
+    'api/regex.c',
     'api/system.c',
     'renderer.c',
     'renwindow.c',
@@ -18,7 +19,7 @@ endif
 executable('lite',
     lite_sources + lite_rc,
     include_directories: [lite_include, font_renderer_include],
-    dependencies: [lua_dep, sdl_dep, libm, libdl, libx11],
+    dependencies: [lua_dep, sdl_dep, pcre2_dep, libm, libdl, libx11],
     c_args: lite_cargs,
     link_with: libfontrenderer,
     link_args: lite_link_args,
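
A quick way to sanity-check the result once the editor links against
libpcre2-8 is to run a few assertions from the Lua side, for example from a
scratch plugin (a hypothetical smoke test, not part of the patch):

    local re = regex.compile("l+")
    assert(select(1, regex.cmatch(re, "hello")) == 3)
    local out, n = regex.gsub(re, "hello world", "L")
    assert(out == "heLo worLd" and n == 2)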