From 1226e2e0d0db961986dddb922c3b65641e705cd8 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Wed, 11 Jan 2017 16:40:35 +0000 Subject: [PATCH] Fix hyphen after \E after POSIX class causing an error. --- ChangeLog | 3 +++ src/pcre2_compile.c | 52 +++++++++++++++++++++++++------------------- testdata/testinput1 | 6 +++++ testdata/testoutput1 | 8 +++++++ 4 files changed, 47 insertions(+), 22 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8f7f3e7..cd7a470 100644 --- a/ChangeLog +++ b/ChangeLog @@ -125,6 +125,9 @@ fully released code, but are noted here for the record. (r) If a character whose code point was greater than 0xffff appeared within a lookbehind that was within another lookbehind, the calculation of the lookbehind length went wrong and could provoke an internal error. + + (t) The sequence \E- or \Q\E- after a POSIX class in a character class caused + an internal error. Now the hyphen is treated as a literal. 4. Back references are now permitted in lookbehind assertions when there are no duplicated group numbers (that is, (?| has not been used), and, if the diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index c2a8688..93326f8 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -3010,6 +3010,14 @@ while (ptr < ptrend) goto FAILED; } + /* Set "a hyphen is not the start of a range" just in case the POSIX + class is followed by \E or \Q\E (possibly repeated - fuzzers do that + kind of thing) and *then* a hyphen. This causes that hyphen to be + treated as a literal. I don't think it's worth setting up special + apparatus to do otherwise. */ + + class_range_state = RANGE_NO; + /* When PCRE2_UCP is set, some of the POSIX classes are converted to use Unicode properties \p or \P or, in one case, \h or \H. The substitutes table has two values per class, containing the type and @@ -4224,10 +4232,10 @@ return 0; /* This function packages up the logic of adding a character or range of characters to a class. 
The character values in the arguments will be within the -valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is -called only from within the "add to class" group of functions, some of which -are recursive and mutually recursive. The external entry point is -add_to_class(). +valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is +called only from within the "add to class" group of functions, some of which +are recursive and mutually recursive. The external entry point is +add_to_class(). Arguments: classbits the bit map for characters < 256 @@ -4242,7 +4250,7 @@ Returns: the number of < 256 characters added */ static unsigned int -add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, +add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, compile_block *cb, uint32_t start, uint32_t end) { uint32_t c; @@ -4307,7 +4315,7 @@ can be used in all cases. */ if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR) end = MAX_NON_UTF_CHAR; - + if (start > cb->class_range_start && end < cb->class_range_end) return n8; /* Use the bitmap for characters < 256. Otherwise use extra data.*/ @@ -4380,8 +4388,8 @@ return n8; /* Number of 8-bit characters */ /* This function is used for adding a list of case-equivalent characters to a class, and also for adding a list of horizontal or vertical whitespace. If the list is in order (which it should be), ranges of characters are detected and -handled appropriately. This function is called (sometimes recursively) only -from within the "add to class" set of functions. The external entry point is +handled appropriately. This function is called (sometimes recursively) only +from within the "add to class" set of functions. The external entry point is add_list_to_class(). 
Arguments: @@ -4399,7 +4407,7 @@ Returns: the number of < 256 characters added */ static unsigned int -add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, +add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except) { unsigned int n8 = 0; @@ -4422,7 +4430,7 @@ return n8; * External entry point for add range to class * *************************************************/ -/* This function sets the overall range so that the internal functions can try +/* This function sets the overall range so that the internal functions can try to avoid duplication when handling case-independence. Arguments: @@ -4451,7 +4459,7 @@ return add_to_class_internal(classbits, uchardptr, options, cb, start, end); * External entry point for add list to class * *************************************************/ -/* This function sets the overall range so that the internal functions can try +/* This function sets the overall range so that the internal functions can try to avoid duplication when handling case-independence. Arguments: @@ -4480,7 +4488,7 @@ while (p[0] < NOTACHAR) { while(p[n+1] == p[0] + n + 1) n++; cb->class_range_start = p[0]; - cb->class_range_end = p[n]; + cb->class_range_end = p[n]; n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]); } p += n + 1; @@ -4736,7 +4744,7 @@ for (;; pptr++) meta = META_CODE(*pptr); meta_arg = META_DATA(*pptr); - + /* If we are in the pre-compile phase, accumulate the length used for the previous cycle of this loop, unless the next item is a quantifier. 
*/ @@ -5148,30 +5156,30 @@ for (;; pptr++) should_flip_negation = TRUE; for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space]; break; - - /* When adding the horizontal or vertical space lists to a class, or - their complements, disable PCRE2_CASELESS, because it justs wastes - time, and in the "not-x" UTF cases can create unwanted duplicates in - the XCLASS list (provoked by characters that have more than one other + + /* When adding the horizontal or vertical space lists to a class, or + their complements, disable PCRE2_CASELESS, because it just wastes + time, and in the "not-x" UTF cases can create unwanted duplicates in + the XCLASS list (provoked by characters that have more than one other case and by both cases being in the same "not-x" sublist). */ case ESC_h: - (void)add_list_to_class(classbits, &class_uchardata, + (void)add_list_to_class(classbits, &class_uchardata, options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR); break; case ESC_H: - (void)add_not_list_to_class(classbits, &class_uchardata, + (void)add_not_list_to_class(classbits, &class_uchardata, options & ~PCRE2_CASELESS, cb, PRIV(hspace_list)); break; case ESC_v: - (void)add_list_to_class(classbits, &class_uchardata, + (void)add_list_to_class(classbits, &class_uchardata, options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR); break; case ESC_V: - (void)add_not_list_to_class(classbits, &class_uchardata, + (void)add_not_list_to_class(classbits, &class_uchardata, options & ~PCRE2_CASELESS, cb, PRIV(vspace_list)); break; diff --git a/testdata/testinput1 b/testdata/testinput1 index 12a7e82..08a9bcc 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -5820,4 +5820,10 @@ ef) x/x,mark /(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/ +/[s[:digit:]\E-H]+/ + s09-H + +/[s[:digit:]\Q\E-H]+/ + s09-H + # End of testinput1 diff --git a/testdata/testoutput1 b/testdata/testoutput1 index ec47dcd..d07b657 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -9297,4 +9297,12 @@ No 
match /(?'c')XX(?'YYYYYYYYYYYYYYYYYYYYYYYCl')/ +/[s[:digit:]\E-H]+/ + s09-H + 0: s09-H + +/[s[:digit:]\Q\E-H]+/ + s09-H + 0: s09-H + # End of testinput1