lite/data/core/tokenizer.lua

-- Module table: the public functions are attached to it below and it is
-- returned at the end of the file.
local tokenizer = {}


-- Appends a token to the flat list `t`, which stores tokens as alternating
-- entries: kind, text, kind, text, ...
-- When the last stored token has the same kind, or its text consists only of
-- whitespace, no new entry is added: the last token takes the new kind and
-- the new text is appended to it.
local function push_token(t, kind, text)
  local n = #t
  local last_kind, last_text = t[n - 1], t[n]
  local mergeable = last_kind
    and (last_kind == kind or last_text:find("^%s*$"))

  if not mergeable then
    -- start a new (kind, text) pair at the end of the list
    t[#t + 1] = kind
    t[#t + 1] = text
    return
  end

  -- fold the new text into the previous token
  t[n - 1] = kind
  t[n] = last_text .. text
end


-- Reports whether the character at `idx` in `text` is escaped, i.e. whether
-- it is directly preceded by an odd number of consecutive escape characters.
-- Only the first byte of `esc` is used as the escape character.
local function is_escaped(text, idx, esc)
  local esc_byte = esc:byte()
  -- walk backwards over the run of escape bytes that ends just before idx
  local pos = idx - 1
  while pos >= 1 and text:byte(pos) == esc_byte do
    pos = pos - 1
  end
  local run_length = (idx - 1) - pos
  return run_length % 2 == 1
end


-- Finds the first match of the Lua pattern `pattern` in `text`, starting the
-- search at `offset`, that is not preceded by an odd run of the escape
-- character `esc`. When `esc` is nil/false the first match is returned as-is.
-- Returns the start and end indices of the match, or nothing if there is no
-- such match.
local function find_non_escaped(text, pattern, offset, esc)
  while true do
    local s, e = text:find(pattern, offset)
    if not s then return end
    if not (esc and is_escaped(text, s, esc)) then
      return s, e
    end
    -- The match is escaped: resume the search after it. An empty match has
    -- e == s - 1, so stepping from `e` alone would restart at `s` and find
    -- the same match forever; step from `s` in that case.
    if e >= s then
      offset = e + 1
    else
      offset = s + 1
    end
    -- Nothing can match beyond the position just past the end of the text.
    -- (Some Lua versions clamp an out-of-range start index instead of
    -- failing, so stop explicitly.)
    if offset > #text + 1 then return end
  end
end


-- Splits `text` into tokens according to `syntax`.
--
-- syntax: table with a `patterns` list and a `symbols` lookup table. Each
--         entry of `patterns` has a `type` and a `pattern`; `pattern` is
--         either a Lua pattern string, or a table { start, end, escape }
--         describing a start/end pair (escape is optional).
-- text:   the string to tokenize.
-- state:  index into `syntax.patterns` of a pair that is already open when
--         `text` begins, or nil.
--
-- Returns a flat list of alternating token types and token texts
-- ({ type, text, type, text, ... }) followed by the new state: the index of
-- the pair still open at the end of `text`, or nil. When the syntax has no
-- patterns the whole text is returned as a single "normal" token and no
-- state is returned.
function tokenizer.tokenize(syntax, text, state)
  local res = {}
  local i = 1
  local len = #text

  if #syntax.patterns == 0 then
    return { "normal", text }
  end

  while i <= len do
    if state then
      -- inside a start/end pair: look for its (non-escaped) end pattern
      local p = syntax.patterns[state]
      local s, e = find_non_escaped(text, p.pattern[2], i, p.pattern[3])

      if not s then
        -- pair stays open: the rest of the text belongs to it
        push_token(res, p.type, text:sub(i))
        break
      end

      push_token(res, p.type, text:sub(i, e))
      state = nil
      -- The loop condition is re-checked before any pattern matching, so a
      -- pair closing exactly at the end of the text does not produce a
      -- trailing empty "normal" token.
      i = e + 1
    else
      -- find the first pattern matching at the cursor
      local matched = false
      for n, p in ipairs(syntax.patterns) do
        local is_pair = type(p.pattern) == "table"
        local pattern = is_pair and p.pattern[1] or p.pattern
        local s, e = text:find("^" .. pattern, i)

        -- An empty match of a plain pattern would not advance the cursor
        -- and would repeat forever, so it is treated as no match. Pairs are
        -- still accepted because entering the pair state makes progress.
        if s and (e >= s or is_pair) then
          -- matched pattern; make and add token
          local t = text:sub(s, e)
          push_token(res, syntax.symbols[t] or p.type, t)

          -- remember the open pair so its end pattern is searched next
          if is_pair then
            state = n
          end

          -- move cursor past this token
          i = e + 1
          matched = true
          break
        end
      end

      -- consume a single character if nothing matched
      if not matched then
        push_token(res, "normal", text:sub(i, i))
        i = i + 1
      end
    end
  end

  return res, state
end


-- Stateless iterator step over a flat token list laid out as
-- { type, text, type, text, ... }. Given the index `i` of the previous pair
-- it returns the index, type and text of the next pair, or nothing when no
-- type is stored at the next position.
local function iter(t, i)
  local idx = i + 2
  local kind = t[idx]
  if not kind then
    return
  end
  return idx, kind, t[idx + 1]
end

-- Returns the iterator triplet for a generic `for` over the flat token list
-- `t`; each step yields the pair's index, its type and its text.
function tokenizer.each_token(t)
  -- iter adds 2 before reading, so starting at -1 makes the first pair
  -- come from index 1
  local start = -1
  return iter, t, start
end


return tokenizer
Moved highlighter code from `DocView` to `Doc` * Only one highlighter state is kept per-document as opposed to one per-docview * Fixes a bug with retaining older highlighter state as a DocView wasn't able to detect lines changing above its viewport * Renames `highlighter` module to more descriptive `tokenizer` 2020-05-07 22:14:46 +02:00			`local tokenizer = {}`
Initial commit 2019-12-28 12:16:32 +01:00

			`local function push_token(t, type, text)`
			`local prev_type = t[#t-1]`
			`local prev_text = t[#t]`
			`if prev_type and (prev_type == type or prev_text:find("^%s*$")) then`
			`t[#t-1] = type`
			`t[#t] = prev_text .. text`
			`else`
			`table.insert(t, type)`
			`table.insert(t, text)`
			`end`
			`end`


			`local function is_escaped(text, idx, esc)`
			`local byte = esc:byte()`
			`local count = 0`
			`for i = idx - 1, 1, -1 do`
			`if text:byte(i) ~= byte then break end`
			`count = count + 1`
			`end`
			`return count % 2 == 1`
			`end`


			`local function find_non_escaped(text, pattern, offset, esc)`
			`while true do`
			`local s, e = text:find(pattern, offset)`
			`if not s then break end`
			`if esc and is_escaped(text, s, esc) then`
			`offset = e + 1`
			`else`
			`return s, e`
			`end`
			`end`
			`end`


Moved highlighter code from `DocView` to `Doc` * Only one highlighter state is kept per-document as opposed to one per-docview * Fixes a bug with retaining older highlighter state as a DocView wasn't able to detect lines changing above its viewport * Renames `highlighter` module to more descriptive `tokenizer` 2020-05-07 22:14:46 +02:00			`function tokenizer.tokenize(syntax, text, state)`
Initial commit 2019-12-28 12:16:32 +01:00			`local res = {}`
			`local i = 1`

Made tokenizer skip parsing process on plain-text files This, along with the earlier rencache changes should resolve #64 2020-05-14 11:08:12 +02:00			`if #syntax.patterns == 0 then`
			`return { "normal", text }`
			`end`

Initial commit 2019-12-28 12:16:32 +01:00			`while i <= #text do`
			`-- continue trying to match the end pattern of a pair if we have a state set`
			`if state then`
			`local p = syntax.patterns[state]`
			`local s, e = find_non_escaped(text, p.pattern[2], i, p.pattern[3])`

			`if s then`
			`push_token(res, p.type, text:sub(i, e))`
			`state = nil`
			`i = e + 1`
			`else`
			`push_token(res, p.type, text:sub(i))`
			`break`
			`end`
			`end`

			`-- find matching pattern`
			`local matched = false`
			`for n, p in ipairs(syntax.patterns) do`
			`local pattern = (type(p.pattern) == "table") and p.pattern[1] or p.pattern`
			`local s, e = text:find("^" .. pattern, i)`

			`if s then`
			`-- matched pattern; make and add token`
			`local t = text:sub(s, e)`
			`push_token(res, syntax.symbols[t] or p.type, t)`

			`-- update state if this was a start\|end pattern pair`
			`if type(p.pattern) == "table" then`
			`state = n`
			`end`

			`-- move cursor past this token`
			`i = e + 1`
			`matched = true`
			`break`
			`end`
			`end`

			`-- consume character if we didn't match`
			`if not matched then`
			`push_token(res, "normal", text:sub(i, i))`
			`i = i + 1`
			`end`
			`end`

			`return res, state`
			`end`


			`local function iter(t, i)`
			`i = i + 2`
			`local type, text = t[i], t[i+1]`
			`if type then`
			`return i, type, text`
			`end`
			`end`

Moved highlighter code from `DocView` to `Doc` * Only one highlighter state is kept per-document as opposed to one per-docview * Fixes a bug with retaining older highlighter state as a DocView wasn't able to detect lines changing above its viewport * Renames `highlighter` module to more descriptive `tokenizer` 2020-05-07 22:14:46 +02:00			`function tokenizer.each_token(t)`
Initial commit 2019-12-28 12:16:32 +01:00			`return iter, t, -1`
			`end`


Moved highlighter code from `DocView` to `Doc` * Only one highlighter state is kept per-document as opposed to one per-docview * Fixes a bug with retaining older highlighter state as a DocView wasn't able to detect lines changing above its viewport * Renames `highlighter` module to more descriptive `tokenizer` 2020-05-07 22:14:46 +02:00			`return tokenizer`