local tokenizer = {}


-- Appends a token, stored as a (type, text) pair of slots, to the flat
-- list `t`. A token with the same type as the previous one, or one that
-- follows a whitespace-only token, is merged into the previous entry to
-- keep the list short.
local function push_token(t, type, text)
  local prev_type = t[#t-1]
  local prev_text = t[#t]
  if prev_type and (prev_type == type or prev_text:find("^%s*$")) then
    t[#t-1] = type
    t[#t] = prev_text .. text
  else
    table.insert(t, type)
    table.insert(t, text)
  end
end
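-- Sketch of the resulting flat layout (the type strings are illustrative):
--   local t = {}
--   push_token(t, "keyword", "local")
--   push_token(t, "keyword", " ")  -- same type as before, so it is merged
--   -- t == { "keyword", "local " }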


-- Reports whether the character at `idx` is escaped by counting the run of
-- escape characters `esc` immediately before it; an odd count means escaped
-- (an even count means the escape characters pair up and escape each other).
local function is_escaped(text, idx, esc)
  local byte = esc:byte()
  local count = 0
  for i = idx - 1, 1, -1 do
    if text:byte(i) ~= byte then break end
    count = count + 1
  end
  return count % 2 == 1
end
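-- For example, with backslash as the escape character:
--   is_escaped([[a\"]], 3, "\\")   --> true  (one backslash before the quote)
--   is_escaped([[a\\"]], 4, "\\")  --> false (the backslashes escape each other)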


-- Works like string.find, but skips matches whose first character is
-- escaped by `esc`; returns nil when no unescaped match remains.
local function find_non_escaped(text, pattern, offset, esc)
  while true do
    local s, e = text:find(pattern, offset)
    if not s then break end
    if esc and is_escaped(text, s, esc) then
      offset = e + 1
    else
      return s, e
    end
  end
end
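-- For example, finding the first unescaped double quote:
--   find_non_escaped([[\"x"]], '"', 1, "\\")  --> 4, 4
-- (the quote at position 2 is skipped because it is preceded by a backslash)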


-- Splits `text` into a flat list of alternating token-type / token-text
-- values according to the rules in `syntax`. `state` is the index of an
-- unterminated start/end pattern pair (e.g. a block comment) carried over
-- from the previous line; the updated state is returned with the tokens.
function tokenizer.tokenize(syntax, text, state)
  local res = {}
  local i = 1

  if #syntax.patterns == 0 then
    return { "normal", text }
  end

  while i <= #text do
    -- continue trying to match the end pattern of a pair if we have a state set
    if state then
      local p = syntax.patterns[state]
      local s, e = find_non_escaped(text, p.pattern[2], i, p.pattern[3])

      if s then
        push_token(res, p.type, text:sub(i, e))
        state = nil
        i = e + 1
      else
        push_token(res, p.type, text:sub(i))
        break
      end
    end

    -- find matching pattern
    local matched = false
    for n, p in ipairs(syntax.patterns) do
      local pattern = (type(p.pattern) == "table") and p.pattern[1] or p.pattern
      local s, e = text:find("^" .. pattern, i)

      if s then
        -- matched pattern; make and add token
        local t = text:sub(s, e)
        push_token(res, syntax.symbols[t] or p.type, t)

        -- update state if this was a start|end pattern pair
        if type(p.pattern) == "table" then
          state = n
        end

        -- move cursor past this token
        i = e + 1
        matched = true
        break
      end
    end

    -- consume character if we didn't match
    if not matched then
      push_token(res, "normal", text:sub(i, i))
      i = i + 1
    end
  end

  return res, state
end
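-- A minimal usage sketch; the shape of the `syntax` table is inferred from
-- the fields read above (`patterns`, `symbols`, per-pattern `type`):
--
--   local syntax = {
--     symbols  = { ["local"] = "keyword" },
--     patterns = {
--       { pattern = { '"', '"', '\\' }, type = "string" },
--       { pattern = "[%a_][%w_]*",      type = "symbol" },
--     },
--   }
--   local tokens = tokenizer.tokenize(syntax, "local x")
--   -- tokens == { "keyword", "local", "symbol", " x" }
--   -- (the space merges into the next token via push_token)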


-- Stateless iterator over a flat token list: advances two slots at a time
-- and yields the index along with the token's type and text.
local function iter(t, i)
  i = i + 2
  local type, text = t[i], t[i+1]
  if type then
    return i, type, text
  end
end

function tokenizer.each_token(t)
  return iter, t, -1
end
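-- Usage sketch:
--   for _, type, text in tokenizer.each_token(tokens) do
--     print(type, text)
--   end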


return tokenizer