Refactor named group handling by adding a pre-pass that generates a list of
named groups with their numbers before the rest of the compiling code is run.
This has simplified the main compiling code and removed some sources of error.
This commit is contained in:
Philip.Hazel 2015-06-03 16:27:47 +00:00
parent bf492e47f6
commit c9ac9e23ec
11 changed files with 727 additions and 263 deletions

View File

@ -137,6 +137,15 @@ provoke a buffer overflow. This bug was discovered by the LLVM fuzzer.
35. Make pcre2test give an error if a pattern that follows #forbid_utf contains
\P, \p, or \X.
36. The way named subpatterns are handled has been refactored. There is now a
pre-pass over the regex which does nothing other than identify named
subpatterns and count the total captures. This means that information about
named patterns is known before the rest of the compile. In particular, it means
that forward references can be checked as they are encountered. Previously, the
code for handling forward references was contorted and led to several errors in
computing the memory requirements for some patterns, leading to buffer
overflows.
Version 10.10 06-March-2015
---------------------------

File diff suppressed because it is too large Load Diff

View File

@ -164,6 +164,7 @@ static const char compile_error_texts[] =
"missing terminating delimiter for callout with string argument\0"
"unrecognized string delimiter follows (?C\0"
"using \\C is disabled by the application\0"
"(?| and/or (?J: or (?x: parentheses are too deeply nested\0"
;
/* Match-time and UTF error texts are in the same format. */

View File

@ -660,8 +660,9 @@ compiling. */
typedef struct named_group {
PCRE2_SPTR name; /* Points to the name in the pattern */
int length; /* Length of the name */
uint32_t number; /* Group number */
uint16_t length; /* Length of the name */
uint16_t isdup; /* TRUE if a duplicate */
} named_group;
/* Structure for passing "static" information around between the functions

View File

@ -179,6 +179,9 @@
/(*UTF-32)\x{11234}/
abcd\x{11234}pqr
/(*UTF-32)\x{112}/
abcd\x{11234}pqr
/(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I
/(*CRLF)(*UTF32)(*BSR_UNICODE)a\Rb/I

13
testdata/testinput2 vendored
View File

@ -4289,6 +4289,17 @@ a random value. /Ix
/A\8B\9C/
A8B9C
/(?x:((?'a')) # comment (with parentheses) and | vertical
(?-x:#not a comment (?'b')) # this is a comment ()
(?'c')) # not a comment (?'d')/info
/(?|(?'a')(2)(?'b')|(?'a')(?'a')(3))/I,dupnames
A23B
B32A
# These are some patterns that used to cause buffer overflows or other errors
# while compiling.
/.((?2)(?R)|\1|$)()/B
@ -4310,4 +4321,6 @@ a random value. /Ix
/A(?'')Z/
"(?J:(?|(?'R')(\k'R')|((?'R'))))"
# End of testinput2

7
testdata/testinput9 vendored
View File

@ -251,4 +251,11 @@
/[^\s]*\s* [^\W]+\W+ [^\d]*?\d0 [^\d\w]{4,6}?\w*A/B
# Check the absolute limit on nesting (?| etc. This varies with code unit
# width because the workspace is a different number of bytes.
/(?|(?|(?J:(?|(?x:(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|
)))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))
/parens_nest_limit=1000
# End of testinput9

View File

@ -647,6 +647,10 @@ Subject length lower bound = 1
0: \x{11234}
/(*UTF-32)\x{11234}/
Failed: error 134 at offset 17: character code point value in \x{} or \o{} is too large
abcd\x{11234}pqr
/(*UTF-32)\x{112}/
Failed: error 160 at offset 5: (*VERB) not recognized or malformed
abcd\x{11234}pqr

View File

@ -642,6 +642,10 @@ Subject length lower bound = 1
Failed: error 160 at offset 5: (*VERB) not recognized or malformed
abcd\x{11234}pqr
/(*UTF-32)\x{112}/
Failed: error 160 at offset 5: (*VERB) not recognized or malformed
abcd\x{11234}pqr
/(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I
Failed: error 160 at offset 12: (*VERB) not recognized or malformed

44
testdata/testoutput2 vendored
View File

@ -13293,7 +13293,7 @@ Failed: error 150 at offset 4: invalid range in character class
Failed: error 144 at offset 3: group name must start with a non-digit
/(?&1abc)xx(?<1abc>y)/
Failed: error 144 at offset 3: group name must start with a non-digit
Failed: error 144 at offset 13: group name must start with a non-digit
/(?<ab-cd>xx)/
Failed: error 142 at offset 5: syntax error in subpattern name (missing terminator)
@ -13320,10 +13320,10 @@ Failed: error 144 at offset 4: group name must start with a non-digit
Failed: error 144 at offset 3: group name must start with a non-digit
/(?&1abc)xx(?<1abc>y)/
Failed: error 144 at offset 3: group name must start with a non-digit
Failed: error 144 at offset 13: group name must start with a non-digit
/(?P>1abc)xx(?<1abc>y)/
Failed: error 144 at offset 4: group name must start with a non-digit
Failed: error 144 at offset 14: group name must start with a non-digit
/\g'3gh'/
Failed: error 157 at offset 7: \g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number
@ -14347,6 +14347,42 @@ Failed: error 115 at offset 3: reference to non-existent subpattern
/A\8B\9C/
Failed: error 115 at offset 7: reference to non-existent subpattern
A8B9C
/(?x:((?'a')) # comment (with parentheses) and | vertical
(?-x:#not a comment (?'b')) # this is a comment ()
(?'c')) # not a comment (?'d')/info
Capturing subpattern count = 5
Named capturing subpatterns:
a 2
b 3
c 4
d 5
First code unit = '#'
Last code unit = ' '
Subject length lower bound = 32
/(?|(?'a')(2)(?'b')|(?'a')(?'a')(3))/I,dupnames
Capturing subpattern count = 3
Named capturing subpatterns:
a 1
a 2
b 3
Options: dupnames
Starting code units: 2 3
Subject length lower bound = 1
A23B
0: 2
1:
2: 2
3:
B32A
0: 3
1:
2:
3: 3
# These are some patterns that used to cause buffer overflows or other errors
# while compiling.
/.((?2)(?R)|\1|$)()/B
------------------------------------------------------------------
@ -14411,4 +14447,6 @@ Failed: error 125 at offset 72: lookbehind assertion is not fixed length
/A(?'')Z/
Failed: error 162 at offset 4: subpattern name expected
"(?J:(?|(?'R')(\k'R')|((?'R'))))"
# End of testinput2

View File

@ -356,4 +356,12 @@ Failed: error 177 at offset 6: character code point value in \u.... sequence is
End
------------------------------------------------------------------
# Check the absolute limit on nesting (?| etc. This varies with code unit
# width because the workspace is a different number of bytes.
/(?|(?|(?J:(?|(?x:(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|(?|
)))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))
/parens_nest_limit=1000
Failed: error 184 at offset 1540: (?| and/or (?J: or (?x: parentheses are too deeply nested
# End of testinput9