From 1c10bb894dc8c374cb9a981e1e2f06c7c72b6713 Mon Sep 17 00:00:00 2001 From: Guldoman Date: Wed, 29 Nov 2023 17:00:09 +0100 Subject: [PATCH] Fix `language_js` regex constant detection (#1581) * Fix `language_js` regex constant detection * Simplify regex constant detection in `language_js` * Add more possessive quantifiers in `language_js` regex constant detection This avoids more catastrophic backtracking cases. * Allow `.` after regex constant in `language_js` --- data/plugins/language_js.lua | 74 ++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 12 deletions(-) diff --git a/data/plugins/language_js.lua b/data/plugins/language_js.lua index f79fece6..307aeecf 100644 --- a/data/plugins/language_js.lua +++ b/data/plugins/language_js.lua @@ -1,24 +1,74 @@ -- mod-version:3 local syntax = require "core.syntax" +-- Regex pattern explanation: +-- This will match / and will look ahead for something that looks like a regex. +-- +-- (?!/) Don't match empty regexes. +-- +-- (?>...) this is using an atomic group to minimize backtracking, as that'd +-- cause "Catastrophic Backtracking" in some cases. +-- +-- [^\\[\/]++ will match anything that isn't an escape, a start of character +-- class or an end of pattern, without backtracking (the second +). +-- +-- \\. will match anything that's escaped. +-- +-- \[(?:[^\\\]]++|\\.)*+\] will match character classes. +-- +-- /[gmiyuvsd]*\s*[\n,;\)\]\}\.]) will match the end of pattern delimiter, optionally +-- followed by pattern options, and anything that can +-- be after a pattern. +-- +-- Demo with some unit tests (click on the Unit Tests entry): https://regex101.com/r/R0w8Qw/1 +-- Note that it has a couple of changes to make it work on that platform. 
+local regex_pattern = { + [=[/(?=(?!/)(?:(?>[^\\[\/]++|\\.|\[(?:[^\\\]]++|\\.)*+\])*+)++/[gmiyuvsd]*\s*[\n,;\)\]\}\.])()]=], + "/()[gmiyuvsd]*", "\\" +} + +-- For the moment let's not actually differentiate the insides of the regex, +-- as this will need new token types... +local inner_regex_syntax = { + patterns = { + { pattern = "%(()%?[:!=><]", type = { "string", "string" } }, + { pattern = "[.?+*%(%)|]", type = "string" }, + { pattern = "{%d*,?%d*}", type = "string" }, + { regex = { [=[\[()\^?]=], [=[(?:\]|(?=\n))()]=], "\\" }, + type = { "string", "string" }, + syntax = { -- Inside character class + patterns = { + { pattern = "\\\\", type = "string" }, + { pattern = "\\%]", type = "string" }, + { pattern = "[^%]\n]", type = "string" } + }, + symbols = {} + } + }, + { regex = "\\/", type = "string" }, + { regex = "[^/\n]", type = "string" }, + }, + symbols = {} +} + syntax.add { name = "JavaScript", files = { "%.js$", "%.json$", "%.cson$", "%.mjs$", "%.cjs$" }, comment = "//", block_comment = { "/*", "*/" }, patterns = { - { pattern = "//.*", type = "comment" }, - { pattern = { "/%*", "%*/" }, type = "comment" }, - { pattern = { '/[^= ]', '/', '\\' },type = "string" }, - { pattern = { '"', '"', '\\' }, type = "string" }, - { pattern = { "'", "'", '\\' }, type = "string" }, - { pattern = { "`", "`", '\\' }, type = "string" }, - { pattern = "0x[%da-fA-F_]+n?", type = "number" }, - { pattern = "-?%d+[%d%.eE_n]*", type = "number" }, - { pattern = "-?%.?%d+", type = "number" }, - { pattern = "[%+%-=/%*%^%%<>!~|&]", type = "operator" }, - { pattern = "[%a_][%w_]*%f[(]", type = "function" }, - { pattern = "[%a_][%w_]*", type = "symbol" }, + { pattern = "//.*", type = "comment" }, + { pattern = { "/%*", "%*/" }, type = "comment" }, + { regex = regex_pattern, syntax = inner_regex_syntax, type = {"string", "string"} }, + { pattern = { '"', '"', '\\' }, type = "string" }, + { pattern = { "'", "'", '\\' }, type = "string" }, + { pattern = { "`", "`", '\\' }, type = 
"string" }, + { pattern = "0x[%da-fA-F_]+n?()%s*()/?", type = {"number", "normal", "operator"} }, + { pattern = "-?%d+[%d%.eE_n]*()%s*()/?", type = {"number", "normal", "operator"} }, + { pattern = "-?%.?%d+()%s*()/?", type = {"number", "normal", "operator"} }, + { pattern = "[%+%-=/%*%^%%<>!~|&]", type = "operator" }, + { pattern = "[%a_][%w_]*%f[(]", type = "function" }, + { pattern = "[%a_][%w_]*()%s*()/?", type = {"symbol", "normal", "operator"} }, }, symbols = { ["async"] = "keyword",