From dcbebef2aba11484d80fd6a7da22ffd0744c61c4 Mon Sep 17 00:00:00 2001 From: jgmdev Date: Tue, 15 Mar 2022 18:14:27 -0400 Subject: [PATCH 1/2] plugin detectindent: fixes and improvements * Improved performance 67x by not using the tokenizer, this means that now opening files or saving them where indentation is re-detected is much faster. * Improved the algorithm to detect the space size. --- data/plugins/detectindent.lua | 310 ++++++++++++++++++++++++++-------- 1 file changed, 239 insertions(+), 71 deletions(-) diff --git a/data/plugins/detectindent.lua b/data/plugins/detectindent.lua index 9ac29882..119601f0 100644 --- a/data/plugins/detectindent.lua +++ b/data/plugins/detectindent.lua @@ -3,93 +3,253 @@ local core = require "core" local command = require "core.command" local common = require "core.common" local config = require "core.config" +local core_syntax = require "core.syntax" local DocView = require "core.docview" local Doc = require "core.doc" -local tokenizer = require "core.tokenizer" local cache = setmetatable({}, { __mode = "k" }) +local comments_cache = {} +local auto_detect_max_lines = 150 -local function add_to_stat(stat, val) - for i = 1, #stat do - if val == stat[i][1] then - stat[i][2] = stat[i][2] + 1 - return - end +local function indent_occurrences_more_than_once(stat, idx) + if stat[idx-1] and stat[idx-1] == stat[idx] then + return true + elseif stat[idx+1] and stat[idx+1] == stat[idx] then + return true end - stat[#stat + 1] = {val, 1} + return false end local function optimal_indent_from_stat(stat) if #stat == 0 then return nil, 0 end - local bins = {} - for k = 1, #stat do - local indent = stat[k][1] + table.sort(stat, function(a, b) return a > b end) + local best_indent = 0 + local best_score = 0 + local count = #stat + for x=1, count do + local indent = stat[x] local score = 0 - local mult_prev, lines_prev - for i = k, #stat do - if stat[i][1] % indent == 0 then - local mult = stat[i][1] / indent - if not mult_prev or (mult_prev + 
1 == mult and lines_prev / stat[i][2] > 0.1) then - -- we add the number of lines to the score only if the previous - -- multiple of "indent" was populated with enough lines. - score = score + stat[i][2] - end - mult_prev, lines_prev = mult, stat[i][2] + for y=1, count do + if y ~= x and stat[y] % indent == 0 then + score = score + 1 + elseif + indent > stat[y] + and + indent_occurrences_more_than_once(stat, y) + then + score = 0 + break end end - bins[#bins + 1] = {indent, score} - end - table.sort(bins, function(a, b) return a[2] > b[2] end) - return bins[1][1], bins[1][2] -end - - --- return nil if it is a comment or blank line or the initial part of the --- line otherwise. --- we don't need to have the whole line to detect indentation. -local function get_first_line_part(tokens) - local i, n = 1, #tokens - while i + 1 <= n do - local ttype, ttext = tokens[i], tokens[i + 1] - if ttype ~= "comment" and ttext:gsub("%s+", "") ~= "" then - return ttext + if score > best_score then + best_indent = indent + best_score = score + end + if score > 0 then + break end - i = i + 2 end + return best_score > 0 and best_indent or nil, best_score end + +local function escape_comment_tokens(token) + local special_chars = "*-%[].()+?^$" + local escaped = "" + for x=1, token:len() do + local found = false + for y=1, special_chars:len() do + if token:sub(x, x) == special_chars:sub(y, y) then + escaped = escaped .. "%" .. token:sub(x, x) + found = true + break + end + end + if not found then + escaped = escaped .. 
token:sub(x, x) + end + end + return escaped +end + + +local function get_comment_patterns(syntax) + if comments_cache[syntax.name] then + if #comments_cache[syntax.name] > 0 then + return comments_cache[syntax.name] + else + return nil + end + end + local comments = {} + for idx=1, #syntax.patterns do + local pattern = syntax.patterns[idx] + local startp = "" + if + type(pattern.type) == "string" + and + (pattern.type == "comment" or pattern.type == "string") + then + local not_is_string = pattern.type ~= "string" + if pattern.pattern then + startp = type(pattern.pattern) == "table" + and pattern.pattern[1] or pattern.pattern + if not_is_string and startp:sub(1, 1) ~= "^" then + startp = "^%s*" .. startp + elseif not_is_string then + startp = "^%s*" .. startp:sub(2, startp:len()) + end + if type(pattern.pattern) == "table" then + table.insert(comments, {"p", startp, pattern.pattern[2]}) + elseif not_is_string then + table.insert(comments, {"p", startp}) + end + elseif pattern.regex then + startp = type(pattern.regex) == "table" + and pattern.regex[1] or pattern.regex + if not_is_string and startp:sub(1, 1) ~= "^" then + startp = "^\\s*" .. startp + elseif not_is_string then + startp = "^\\s*" .. 
startp:sub(2, startp:len()) + end + if type(pattern.regex) == "table" then + table.insert(comments, { + "r", startp, pattern.regex[2] + }) + elseif not_is_string then + table.insert(comments, {"r", startp}) + end + end + elseif pattern.syntax then + local subsyntax = core_syntax.get("file"..pattern.syntax, "") + local sub_comments = get_comment_patterns(subsyntax) + if sub_comments then + for s=1, #sub_comments do + table.insert(comments, sub_comments[s]) + end + end + end + end + if #comments == 0 then + local single_line_comment = syntax.comment + and escape_comment_tokens(syntax.comment) or nil + local block_comment = nil + if syntax.block_comment then + block_comment = { + escape_comment_tokens(syntax.block_comment[1]), + escape_comment_tokens(syntax.block_comment[2]) + } + end + if single_line_comment then + table.insert(comments, {"p", "^%s*" .. single_line_comment}) + end + if block_comment then + table.insert(comments, {"p", "^%s*" .. block_comment[1], block_comment[2]}) + end + end + comments_cache[syntax.name] = comments + if #comments > 0 then + return comments + end + return nil +end + + local function get_non_empty_lines(syntax, lines) return coroutine.wrap(function() - local tokens, state + local comments = get_comment_patterns(syntax) + local i = 0 + local end_regex = nil + local end_pattern = nil + local inside_comment = false for _, line in ipairs(lines) do - tokens, state = tokenizer.tokenize(syntax, line, state) - local line_start = get_first_line_part(tokens) - if line_start then - i = i + 1 - coroutine.yield(i, line_start) + if line:gsub("^%s+", "") ~= "" then + local is_comment = false + if comments then + if not inside_comment then + for c=1, #comments do + local comment = comments[c] + if comment[1] == "p" then + if comment[3] then + local start, ending = line:find(comment[2]) + if start then + if not line:find(comment[3], ending+1) then + is_comment = true + inside_comment = true + end_pattern = comment[3] + end + break + end + elseif 
line:find(comment[2]) then + is_comment = true + break + end + else + if comment[3] then + local start, ending = regex.match( + comment[2], line, 1, regex.ANCHORED + ) + if start then + if not regex.match( + comment[3], line, ending+1, regex.ANCHORED + ) + then + is_comment = true + inside_comment = true + end_regex = comment[3] + end + break + end + elseif regex.match(comment[2], line, 1, regex.ANCHORED) then + is_comment = true + break + end + end + end + elseif end_pattern and line:find(end_pattern) then + is_comment = true + inside_comment = false + end_pattern = nil + elseif end_regex and regex.match(end_regex, line) then + is_comment = true + inside_comment = false + end_regex = nil + end + end + if + not is_comment + and + not inside_comment + then + i = i + 1 + coroutine.yield(i, line) + end end end end) end -local auto_detect_max_lines = 100 - local function detect_indent_stat(doc) local stat = {} local tab_count = 0 + local runs = 1 + local max_lines = auto_detect_max_lines for i, text in get_non_empty_lines(doc.syntax, doc.lines) do - local str = text:match("^ %s+%S") - if str then add_to_stat(stat, #str - 1) end - local str = text:match("^\t+") - if str then tab_count = tab_count + 1 end + local spaces = text:match("^ +") + if spaces then table.insert(stat, spaces:len()) end + local tabs = text:match("^\t+") + if tabs then tab_count = tab_count + 1 end + -- if nothing found for first lines try at least 4 more times + if i == max_lines and runs < 5 and #stat == 0 and tab_count == 0 then + max_lines = max_lines + auto_detect_max_lines + runs = runs + 1 -- Stop parsing when files is very long. Not needed for euristic determination. 
- if i > auto_detect_max_lines then break end + elseif i > max_lines then break end end - table.sort(stat, function(a, b) return a[1] < b[1] end) local indent, score = optimal_indent_from_stat(stat) if tab_count > score then return "hard", config.indent_size, tab_count @@ -101,7 +261,7 @@ end local function update_cache(doc) local type, size, score = detect_indent_stat(doc) - local score_threshold = 4 + local score_threshold = 2 if score < score_threshold then -- use default values type = config.tab_type @@ -130,9 +290,11 @@ end local function set_indent_type(doc, type) local _, indent_size = doc:get_indent_info() - cache[doc] = {type = type, - size = indent_size, - confirmed = true} + cache[doc] = { + type = type, + size = indent_size, + confirmed = true + } doc.indent_info = cache[doc] end @@ -158,9 +320,11 @@ end local function set_indent_size(doc, size) local indent_type = doc:get_indent_info() - cache[doc] = {type = indent_type, - size = size, - confirmed = true} + cache[doc] = { + type = indent_type, + size = size, + confirmed = true + } doc.indent_info = cache[doc] end @@ -168,14 +332,14 @@ local function set_indent_size_command() core.command_view:enter( "Specify indent size for current file", function(value) -- submit - local value = math.floor(tonumber(value)) + value = math.floor(tonumber(value)) local doc = core.active_view.doc set_indent_size(doc, value) end, nil, -- suggest nil, -- cancel function(value) -- validate - local value = tonumber(value) + value = tonumber(value) return value ~= nil and value >= 1 end ) @@ -187,20 +351,24 @@ command.add("core.docview", { ["indent:set-file-indent-size"] = set_indent_size_command }) - -command.add(function() +command.add( + function() return core.active_view:is(DocView) - and cache[core.active_view.doc] - and cache[core.active_view.doc].type == "soft" + and cache[core.active_view.doc] + and cache[core.active_view.doc].type == "soft" end, { - ["indent:switch-file-to-tabs-indentation"] = function() 
set_indent_type(core.active_view.doc, "hard") end + ["indent:switch-file-to-tabs-indentation"] = function() + set_indent_type(core.active_view.doc, "hard") + end }) - -command.add(function() +command.add( + function() return core.active_view:is(DocView) - and cache[core.active_view.doc] - and cache[core.active_view.doc].type == "hard" + and cache[core.active_view.doc] + and cache[core.active_view.doc].type == "hard" end, { - ["indent:switch-file-to-spaces-indentation"] = function() set_indent_type(core.active_view.doc, "soft") end + ["indent:switch-file-to-spaces-indentation"] = function() + set_indent_type(core.active_view.doc, "soft") + end }) From 5830b7d9f0aeb5bfc3eed8011ebcf0d4ca9caf08 Mon Sep 17 00:00:00 2001 From: jgmdev Date: Thu, 17 Mar 2022 00:14:36 -0400 Subject: [PATCH 2/2] plugin detectindent: pre-compile regexes --- data/plugins/detectindent.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/plugins/detectindent.lua b/data/plugins/detectindent.lua index 119601f0..ea9c8b28 100644 --- a/data/plugins/detectindent.lua +++ b/data/plugins/detectindent.lua @@ -116,10 +116,10 @@ local function get_comment_patterns(syntax) end if type(pattern.regex) == "table" then table.insert(comments, { - "r", startp, pattern.regex[2] + "r", regex.compile(startp), regex.compile(pattern.regex[2]) }) elseif not_is_string then - table.insert(comments, {"r", startp}) + table.insert(comments, {"r", regex.compile(startp)}) end end elseif pattern.syntax then