Merge pull request #999 from Guldoman/tokenizer_regex_groups

Allow regexes in `tokenizer` to split tokens with groups

commit bd742d5b3d
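With this change, a language plugin can use a regex with capture groups to split a single match into several tokens, pairing the resulting spans with a table of token types. As a rough sketch of the pattern shape this enables (the regex, the type names, and the exact span-to-type pairing are illustrative, not taken from this commit):

  -- hypothetical plugin pattern: the text before the first group and each
  -- group's span become separate tokens, one entry in `type` per span
  { regex = [[#include\s+(<)([^>]*>)]], type = { "keyword", "operator", "string" } }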
data/core/regex.lua

@@ -5,8 +5,9 @@ regex.__index = function(table, key) return regex[key]; end
 regex.match = function(pattern_string, string, offset, options)
   local pattern = type(pattern_string) == "table" and
     pattern_string or regex.compile(pattern_string)
-  local s, e = regex.cmatch(pattern, string, offset or 1, options or 0)
-  return s, e and e - 1
+  local res = { regex.cmatch(pattern, string, offset or 1, options or 0) }
+  res[2] = res[2] and res[2] - 1
+  return table.unpack(res)
 end
 
 -- Will iterate back through any UTF-8 bytes so that we don't replace bits
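`regex.match` previously returned only the bounds of the whole match. Collecting everything `regex.cmatch` returns and unpacking it forwards any capture-group offsets as additional return values; only the whole-match end is made inclusive by the `- 1`. A minimal sketch of the new call shape (the example strings are illustrative, not from this commit):

  -- whole-match bounds come first, then begin/end offsets for each group;
  -- group ends stay one past the last character, as cmatch reports them
  local s, e, g_begin, g_end = regex.match("(\\d+)px", "width: 10px")
  -- s = 8, e = 11 ("10px"); g_begin = 8, g_end = 10 ("10")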
data/core/tokenizer.lua

@@ -1,9 +1,12 @@
+local core = require "core"
 local syntax = require "core.syntax"
 local common = require "core.common"
 
 local tokenizer = {}
+local bad_patterns = {}
 
 local function push_token(t, type, text)
   type = type or "normal"
   local prev_type = t[#t-1]
   local prev_text = t[#t]
   if prev_type and (prev_type == type or prev_text:ufind("^%s*$")) then
@@ -173,6 +176,20 @@ function tokenizer.tokenize(incoming_syntax, text, state)
       or { regex.match(code, text, text:ucharpos(next), (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) }
     if p.regex and #res > 0 then -- set correct utf8 len for regex result
       res[2] = res[1] + string.ulen(text:sub(res[1], res[2])) - 1
+      -- `regex.match` returns group results as a series of `begin, end`;
+      -- we only want `begin`s
+      if #res >= 3 then
+        res[3] = res[1] + string.ulen(text:sub(res[1], res[3])) - 1
+      end
+      for i=1,(#res-3) do
+        local curr = i + 3
+        local from = i * 2 + 3
+        if from < #res then
+          res[curr] = res[1] + string.ulen(text:sub(res[1], res[from])) - 1
+        else
+          res[curr] = nil
+        end
+      end
       res[1] = next
     end
     if res[1] and close and target[3] then
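PCRE reports byte offsets while the tokenizer counts UTF-8 characters, so each group's `begin` offset is remapped through `string.ulen` over the matched prefix, and the interleaved `end` offsets are dropped by shifting the remaining begins down into `res[3]`, `res[4]`, and so on. In isolation, the per-offset conversion the loop applies looks like this (helper name hypothetical; `string.ulen` is lite-xl's UTF-8 string extension):

  -- map a byte offset `b`, at or after the match start `s`, to a
  -- character offset, mirroring the expression used in the loop above
  local function byte_to_char_offset(text, s, b)
    return s + string.ulen(text:sub(s, b)) - 1
  end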
@@ -242,6 +259,15 @@ function tokenizer.tokenize(incoming_syntax, text, state)
       local matched = false
       for n, p in ipairs(current_syntax.patterns) do
         local find_results = { find_text(text, p, i, true, false) }
+        if #find_results - 1 > #p.type then
+          if not bad_patterns[current_syntax] then
+            bad_patterns[current_syntax] = { }
+          end
+          if not bad_patterns[current_syntax][n] then
+            bad_patterns[current_syntax][n] = true
+            core.error("Malformed pattern #%d in %s language plugin", n, current_syntax.name or "unnamed")
+          end
+        end
         if find_results[1] then
           -- matched pattern; make and add tokens
           push_tokens(res, current_syntax, p, text, find_results)
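The check flags a pattern as malformed when a match yields more result offsets than there are declared token types, and `bad_patterns` ensures `core.error` fires only once per offending pattern rather than on every tokenized line. A hypothetical pattern that would trip it:

  -- hypothetical malformed pattern: three capture groups but only one
  -- declared type, so #find_results - 1 > #p.type and core.error fires once
  { regex = [[(\w+)(\s*)(=)]], type = { "keyword" } }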
|
@ -88,7 +88,7 @@ static int f_pcre_match(lua_State *L) {
|
|||
return 0;
|
||||
}
|
||||
for (int i = 0; i < rc*2; i++)
|
||||
lua_pushnumber(L, ovector[i]+offset+1);
|
||||
lua_pushinteger(L, ovector[i]+offset+1);
|
||||
pcre2_match_data_free(md);
|
||||
return rc*2;
|
||||
}
|
||||
|
|
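Switching from `lua_pushnumber` to `lua_pushinteger` makes the offsets arrive Lua-side as integers instead of floats, which is observable on Lua 5.3 and later where the two numeric subtypes are distinguishable. A small sanity check of the difference (a sketch assuming Lua 5.3+ semantics, not a test from this commit):

  -- after the change, match offsets report as integers
  local s = regex.match("b", "abc")
  assert(math.type(s) == "integer")  -- would be "float" with lua_pushnumber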