diff --git a/data/core/tokenizer.lua b/data/core/tokenizer.lua
index a4fe2867..6d51928c 100644
--- a/data/core/tokenizer.lua
+++ b/data/core/tokenizer.lua
@@ -15,6 +15,39 @@ local function push_token(t, type, text)
 end
 
 
+local function push_tokens(t, syn, pattern, full_text, find_results)
+  if #find_results > 2 then
+    -- We do some manipulation with find_results so that it's arranged
+    -- like this:
+    -- { start, end, i_1, i_2, i_3, …, i_last }
+    -- Each position spans characters from i_n to ((i_n+1) - 1), to form
+    -- consecutive spans of text.
+    --
+    -- If i_1 is not equal to start, start is automatically inserted at
+    -- that index.
+    if find_results[3] ~= find_results[1] then
+      table.insert(find_results, 3, find_results[1])
+    end
+    -- Copy the ending index to the end of the table, so that an ending index
+    -- always follows a starting index after position 3 in the table.
+    table.insert(find_results, find_results[2] + 1)
+    -- Then, we just iterate over our modified table.
+    for i = 3, #find_results - 1 do
+      local start = find_results[i]
+      local fin = find_results[i + 1] - 1
+      local type = pattern.type[i - 2]
+      -- ↑ (i - 2) to convert from [3; n] to [1; n]
+      local text = full_text:sub(start, fin)
+      push_token(t, syn.symbols[text] or type, text)
+    end
+  else
+    local start, fin = find_results[1], find_results[2]
+    local text = full_text:sub(start, fin)
+    push_token(t, syn.symbols[text] or pattern.type, text)
+  end
+end
+
+
 local function is_escaped(text, idx, esc)
   local byte = esc:byte()
   local count = 0
@@ -49,7 +82,7 @@ local function retrieve_syntax_state(incoming_syntax, state)
     -- If we have higher bits, then decode them one at a time, and find which
     -- syntax we're using. Rather than walking the bytes, and calling into
     -- `syntax` each time, we could probably cache this in a single table.
-    for i=0,2 do
+    for i = 0, 2 do
       local target = bit32.extract(state, i*8, 8)
       if target ~= 0 then
         if current_syntax.patterns[target].syntax then
@@ -138,13 +171,13 @@
     local matched = false
     for n, p in ipairs(current_syntax.patterns) do
       local pattern = (type(p.pattern) == "table") and p.pattern[1] or p.pattern
-      local s, e = text:find("^" .. pattern, i)
+      local find_results = { text:find("^" .. pattern, i) }
+      local start, fin = find_results[1], find_results[2]
 
-      if s then
-        -- matched pattern; make and add token
-        local t = text:sub(s, e)
+      if start then
+        -- matched pattern; make and add tokens
+        push_tokens(res, current_syntax, p, text, find_results)
-        push_token(res, current_syntax.symbols[t] or p.type, t)
 
         -- update state if this was a start|end pattern pair
         if type(p.pattern) == "table" then
           state = bit32.replace(state, n, current_level*8, 8)
@@ -162,7 +195,7 @@ function tokenizer.tokenize(incoming_syntax, text, state)
       end
 
       -- move cursor past this token
-      i = e + 1
+      i = fin + 1
       matched = true
       break
     end
diff --git a/data/plugins/language_c.lua b/data/plugins/language_c.lua
index 1445d067..b311884b 100644
--- a/data/plugins/language_c.lua
+++ b/data/plugins/language_c.lua
@@ -5,17 +5,20 @@ syntax.add {
   files = { "%.c$", "%.h$", "%.inl$", "%.cpp$", "%.hpp$" },
   comment = "//",
   patterns = {
-    { pattern = "//.-\n",               type = "comment"  },
-    { pattern = { "/%*", "%*/" },       type = "comment"  },
-    { pattern = { "#", "[^\\]\n" },     type = "comment"  },
-    { pattern = { '"', '"', '\\' },     type = "string"   },
-    { pattern = { "'", "'", '\\' },     type = "string"   },
-    { pattern = "-?0x%x+",              type = "number"   },
-    { pattern = "-?%d+[%d%.eE]*f?",     type = "number"   },
-    { pattern = "-?%.?%d+f?",           type = "number"   },
-    { pattern = "[%+%-=/%*%^%%<>!~|&]", type = "operator" },
-    { pattern = "[%a_][%w_]*%f[(]",     type = "function" },
-    { pattern = "[%a_][%w_]*",          type = "symbol"   },
+    { pattern = "//.-\n",                type = "comment" },
+    { pattern = { "/%*", "%*/" },        type = "comment" },
+    { pattern = { '"', '"', '\\' },      type = "string" },
+    { pattern = { "'", "'", '\\' },      type = "string" },
+    { pattern = "0x%x+",                 type = "number" },
+    { pattern = "%d+[%d%.eE]*f?",        type = "number" },
+    { pattern = "%.?%d+f?",              type = "number" },
+    { pattern = "[%+%-=/%*%^%%<>!~|&]",  type = "operator" },
+    { pattern = "struct%s()[%a_][%w_]*", type = {"keyword", "keyword2"} },
+    { pattern = "union%s()[%a_][%w_]*",  type = {"keyword", "keyword2"} },
+    { pattern = "[%a_][%w_]*%f[(]",      type = "function" },
+    { pattern = "[%a_][%w_]*",           type = "symbol" },
+    { pattern = "#include%s()<.->",      type = {"keyword", "string"} },
+    { pattern = "#[%a_][%w_]*",          type = "keyword" },
   },
   symbols = {
     ["if"]       = "keyword",
@@ -29,8 +32,6 @@
     ["continue"] = "keyword",
     ["return"]   = "keyword",
     ["goto"]     = "keyword",
-    ["struct"]   = "keyword",
-    ["union"]    = "keyword",
     ["typedef"]  = "keyword",
     ["enum"]     = "keyword",
     ["extern"]   = "keyword",
@@ -42,7 +43,6 @@
     ["case"]     = "keyword",
     ["default"]  = "keyword",
     ["auto"]     = "keyword",
-    ["const"]    = "keyword",
     ["void"]     = "keyword",
     ["int"]      = "keyword2",
     ["short"]    = "keyword2",