Add `regex.find_offsets`, `regex.find`, improve `regex.match` (#1232)
`regex.match` now behaves like `string.match`. This required changes in the `tokenizer` and in the `detectindent` plugin.
This commit is contained in:
parent
e13f265fac
commit
9d48441685
|
@ -2,14 +2,85 @@
|
||||||
-- pattern:gsub(string).
|
-- pattern:gsub(string).
|
||||||
regex.__index = function(table, key) return regex[key]; end
|
regex.__index = function(table, key) return regex[key]; end
|
||||||
|
|
||||||
regex.match = function(pattern_string, string, offset, options)
|
---Looks for the first match of `pattern` in the string `str`.
|
||||||
local pattern = type(pattern_string) == "table" and
|
---If it finds a match, it returns the indices of `str` where this occurrence
|
||||||
pattern_string or regex.compile(pattern_string)
|
---starts and ends; otherwise, it returns `nil`.
|
||||||
local res = { regex.cmatch(pattern, string, offset or 1, options or 0) }
|
---If the pattern has captures, the captured start and end indexes are returned,
|
||||||
res[2] = res[2] and res[2] - 1
|
---after the two initial ones.
|
||||||
|
---
|
||||||
|
---@param pattern string|table The regex pattern to use, either as a simple string or precompiled.
|
||||||
|
---@param str string The string to search for valid matches.
|
||||||
|
---@param offset? integer The position on the subject to start searching.
|
||||||
|
---@param options? integer A bit field of matching options, eg: regex.NOTBOL | regex.NOTEMPTY
|
||||||
|
---
|
||||||
|
---@return integer? start Offset where the first match was found; `nil` if no match.
|
||||||
|
---@return integer? end Offset where the first match ends; `nil` if no match.
|
||||||
|
---@return integer? ... #Start and end offsets of each captured group.
|
||||||
|
regex.find_offsets = function(pattern, str, offset, options)
  if type(pattern) ~= "table" then
    -- `regex.compile` returns `nil` plus an error message when the pattern
    -- is invalid; raise that message at the caller's position instead of
    -- handing `nil` to `regex.cmatch`.
    local compiled, err = regex.compile(pattern)
    if not compiled then
      error("regex.find_offsets: " .. tostring(err), 2)
    end
    pattern = compiled
  end
  local res = { regex.cmatch(pattern, str, offset or 1, options or 0) }
  -- Reduce every end offset by 1 (presumably `cmatch` reports end offsets
  -- one past the match -- confirm against the native implementation).
  for i = 2, #res, 2 do
    res[i] = res[i] - 1
  end
  return table.unpack(res)
end
|
||||||
|
|
||||||
|
---Regex counterpart of `string.match`.
---Searches `str` for the first occurrence of `pattern`.
---When the pattern has no capture groups the matched text is returned;
---when it has capture groups only the captures are returned.
---Nothing is returned if there is no match.
---Captures are produced by `regex.find`, so an empty capture is reported as
---its offset rather than as a string.
---
---@param pattern string|table The regex pattern to use, either as a simple string or precompiled.
---@param str string The string to search for valid matches.
---@param offset? integer The position on the subject to start searching.
---@param options? integer A bit field of matching options, eg: regex.NOTBOL | regex.NOTEMPTY
---
---@return (string|integer)? ... #The captures, or the whole matched text if the pattern has no capture groups; an empty capture is given as its offset.
regex.match = function(pattern, str, offset, options)
  local found = { regex.find(pattern, str, offset, options) }
  local count = #found
  if count == 0 then
    return
  end
  if count <= 2 then
    -- Only the start/end pair came back: return the matched text itself.
    return string.sub(str, found[1], found[2])
  end
  -- Everything after the start/end pair is a capture.
  return table.unpack(found, 3)
end
|
||||||
|
|
||||||
|
---Regex counterpart of `string.find`.
---Searches `str` for the first occurrence of `pattern`.
---On success the start and end indices of the match are returned, followed
---by the text of every capture group; nothing is returned if there is no match.
---A capture of size 0 is reported as its offset instead of a string.
---
---@param pattern string|table The regex pattern to use, either as a simple string or precompiled.
---@param str string The string to search for valid matches.
---@param offset? integer The position on the subject to start searching.
---@param options? integer A bit field of matching options, eg: regex.NOTBOL | regex.NOTEMPTY
---
---@return integer? start Offset where the first match was found; nothing if no match.
---@return integer? end Offset where the first match ends; nothing if no match.
---@return (string|integer)? ... #Captured strings; a capture of size 0 is given as its offset instead.
regex.find = function(pattern, str, offset, options)
  local offsets = { regex.find_offsets(pattern, str, offset, options) }
  if #offsets == 0 then return end
  -- The first pair delimits the whole match and is passed through unchanged.
  local results = { offsets[1], offsets[2] }
  -- The remaining values are (start, end) pairs, one per capture group.
  for i = 3, #offsets, 2 do
    local from, to = offsets[i], offsets[i + 1]
    -- Like in string.find, a group of size 0 yields its index. `from` is a
    -- number and therefore always truthy, so the and/or form is safe here.
    results[#results + 1] = (from > to) and from or string.sub(str, from, to)
  end
  return table.unpack(results)
end
|
||||||
|
|
||||||
-- Will iterate back through any UTF-8 bytes so that we don't replace bits
|
-- Will iterate back through any UTF-8 bytes so that we don't replace bits
|
||||||
-- mid character.
|
-- mid character.
|
||||||
local function previous_character(str, index)
|
local function previous_character(str, index)
|
||||||
|
|
|
@ -214,23 +214,12 @@ function tokenizer.tokenize(incoming_syntax, text, state)
|
||||||
return
|
return
|
||||||
end
|
end
|
||||||
res = p.pattern and { text:ufind((at_start or p.whole_line[p_idx]) and "^" .. code or code, next) }
|
res = p.pattern and { text:ufind((at_start or p.whole_line[p_idx]) and "^" .. code or code, next) }
|
||||||
or { regex.match(code, text, text:ucharpos(next), (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) }
|
or { regex.find(code, text, text:ucharpos(next), (at_start or p.whole_line[p_idx]) and regex.ANCHORED or 0) }
|
||||||
if p.regex and #res > 0 then -- set correct utf8 len for regex result
|
if p.regex and #res > 0 then -- set correct utf8 len for regex result
|
||||||
local char_pos_1 = string.ulen(text:sub(1, res[1]))
|
local char_pos_1 = res[1] > next and string.ulen(text:sub(1, res[1])) or next
|
||||||
local char_pos_2 = char_pos_1 + string.ulen(text:sub(res[1], res[2])) - 1
|
local char_pos_2 = string.ulen(text:sub(1, res[2]))
|
||||||
-- `regex.match` returns group results as a series of `begin, end`
|
for i=3,#res do
|
||||||
-- we only want `begin`s
|
res[i] = string.ulen(text:sub(1, res[i] - 1)) + 1
|
||||||
if #res >= 3 then
|
|
||||||
res[3] = char_pos_1 + string.ulen(text:sub(res[1], res[3])) - 1
|
|
||||||
end
|
|
||||||
for i=1,(#res-3) do
|
|
||||||
local curr = i + 3
|
|
||||||
local from = i * 2 + 3
|
|
||||||
if from < #res then
|
|
||||||
res[curr] = char_pos_1 + string.ulen(text:sub(res[1], res[from])) - 1
|
|
||||||
else
|
|
||||||
res[curr] = nil
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
res[1] = char_pos_1
|
res[1] = char_pos_1
|
||||||
res[2] = char_pos_2
|
res[2] = char_pos_2
|
||||||
|
|
|
@ -190,11 +190,11 @@ local function get_non_empty_lines(syntax, lines)
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
if comment[3] then
|
if comment[3] then
|
||||||
local start, ending = regex.match(
|
local start, ending = regex.find_offsets(
|
||||||
comment[2], line, 1, regex.ANCHORED
|
comment[2], line, 1, regex.ANCHORED
|
||||||
)
|
)
|
||||||
if start then
|
if start then
|
||||||
if not regex.match(
|
if not regex.find_offsets(
|
||||||
comment[3], line, ending+1, regex.ANCHORED
|
comment[3], line, ending+1, regex.ANCHORED
|
||||||
)
|
)
|
||||||
then
|
then
|
||||||
|
@ -204,7 +204,7 @@ local function get_non_empty_lines(syntax, lines)
|
||||||
end
|
end
|
||||||
break
|
break
|
||||||
end
|
end
|
||||||
elseif regex.match(comment[2], line, 1, regex.ANCHORED) then
|
elseif regex.find_offsets(comment[2], line, 1, regex.ANCHORED) then
|
||||||
is_comment = true
|
is_comment = true
|
||||||
break
|
break
|
||||||
end
|
end
|
||||||
|
@ -214,7 +214,7 @@ local function get_non_empty_lines(syntax, lines)
|
||||||
is_comment = true
|
is_comment = true
|
||||||
inside_comment = false
|
inside_comment = false
|
||||||
end_pattern = nil
|
end_pattern = nil
|
||||||
elseif end_regex and regex.match(end_regex, line) then
|
elseif end_regex and regex.find_offsets(end_regex, line) then
|
||||||
is_comment = true
|
is_comment = true
|
||||||
inside_comment = false
|
inside_comment = false
|
||||||
end_regex = nil
|
end_regex = nil
|
||||||
|
|
|
@ -41,8 +41,8 @@ regex.NOTEMPTY_ATSTART = 0x00000008
|
||||||
---@param pattern string
|
---@param pattern string
|
||||||
---@param options? regex.modifiers A string of one or more pattern modifiers.
|
---@param options? regex.modifiers A string of one or more pattern modifiers.
|
||||||
---
|
---
|
||||||
---@return regex|string regex Ready to use regular expression object or error
|
---@return regex|nil regex Ready to use regular expression object or nil on error.
|
||||||
---message if compiling the pattern failed.
|
---@return string? error The error message if compiling the pattern failed.
|
||||||
function regex.compile(pattern, options) end
|
function regex.compile(pattern, options) end
|
||||||
|
|
||||||
---
|
---
|
||||||
|
@ -53,7 +53,7 @@ function regex.compile(pattern, options) end
|
||||||
---@param options? integer A bit field of matching options, eg:
|
---@param options? integer A bit field of matching options, eg:
|
||||||
---regex.NOTBOL | regex.NOTEMPTY
|
---regex.NOTBOL | regex.NOTEMPTY
|
||||||
---
|
---
|
||||||
---@return table<integer, integer> list List of offsets where a match was found.
|
---@return integer ... list List of offsets where a match was found.
|
||||||
function regex:cmatch(subject, offset, options) end
|
function regex:cmatch(subject, offset, options) end
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue