Fix UCP with [[:<:]] and [[:>:]] bad compile bug.

This commit is contained in:
Philip.Hazel 2015-10-21 11:29:07 +00:00
parent 0fe342cbed
commit 94eda7669a
5 changed files with 63 additions and 25 deletions

View File

@ -206,6 +206,9 @@ very large.
59. Change 55 above introduced a bug by which certain patterns provoked the 59. Change 55 above introduced a bug by which certain patterns provoked the
erroneous error "\ at end of pattern". erroneous error "\ at end of pattern".
60. The special sequences [[:<:]] and [[:>:]] gave rise to incorrect compiling
errors or other strange effects if compiled in UCP mode.
Version 10.20 30-June-2015 Version 10.20 30-June-2015
-------------------------- --------------------------

View File

@ -1618,6 +1618,13 @@ sequences that define a data character are recognised. The isclass argument is
not relevant, but the options argument is the final value of the compiled not relevant, but the options argument is the final value of the compiled
pattern's options. pattern's options.
There is one "trick" case: when a sequence such as [[:>:]] or \s in UCP mode is
processed, it is replaced by a nested alternative sequence. If this contains a
backslash (which it usually does), ptrend does not point to its end - it still
points to the end of the whole pattern. However, we can detect this case
because cb->nestptr[0] will be non-NULL. The nested sequences are all zero-
terminated and there are only ever two levels of nesting.
Arguments: Arguments:
ptrptr points to the input position pointer ptrptr points to the input position pointer
ptrend points to the end of the input ptrend points to the end of the input
@ -1643,10 +1650,14 @@ register uint32_t c, cc;
int escape = 0; int escape = 0;
int i; int i;
/* If backslash is at the end of the string, it's an error. The check must be /* Find the end of a nested insert. */
skipped when processing a nested insertion string during compilation. */
if ((cb == NULL || cb->nestptr == NULL) && ptr >= ptrend) if (cb != NULL && cb->nestptr[0] != NULL)
ptrend = ptr + PRIV(strlen)(ptr);
/* If backslash is at the end of the string, it's an error. */
if (ptr >= ptrend)
{ {
*errorcodeptr = ERR1; *errorcodeptr = ERR1;
return 0; return 0;
@ -3700,13 +3711,14 @@ for (;; ptr++)
c = *ptr; c = *ptr;
/* If we are at the end of a nested substitution, revert to the outer level /* If we are at the end of a nested substitution, revert to the outer level
string. Nesting only happens one level deep, and the inserted string is string. Nesting only happens one or two levels deep, and the inserted string
always zero terminated. */ is always zero terminated. */
if (c == CHAR_NULL && cb->nestptr != NULL) if (c == CHAR_NULL && cb->nestptr[0] != NULL)
{ {
ptr = cb->nestptr; ptr = cb->nestptr[0];
cb->nestptr = NULL; cb->nestptr[0] = cb->nestptr[1];
cb->nestptr[1] = NULL;
c = *ptr; c = *ptr;
} }
@ -3823,7 +3835,7 @@ for (;; ptr++)
/* Fill in length of a previous callout, except when the next thing is a /* Fill in length of a previous callout, except when the next thing is a
quantifier or when processing a property substitution string in UCP mode. */ quantifier or when processing a property substitution string in UCP mode. */
if (!is_quantifier && previous_callout != NULL && cb->nestptr == NULL && if (!is_quantifier && previous_callout != NULL && cb->nestptr[0] == NULL &&
after_manual_callout-- <= 0) after_manual_callout-- <= 0)
{ {
if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
@ -3834,7 +3846,8 @@ for (;; ptr++)
/* Create auto callout, except for quantifiers, or while processing property /* Create auto callout, except for quantifiers, or while processing property
strings that are substituted for \w etc in UCP mode. */ strings that are substituted for \w etc in UCP mode. */
if ((options & PCRE2_AUTO_CALLOUT) != 0 && !is_quantifier && cb->nestptr == NULL) if ((options & PCRE2_AUTO_CALLOUT) != 0 && !is_quantifier &&
cb->nestptr[0] == NULL)
{ {
previous_callout = code; previous_callout = code;
code = auto_callout(code, ptr, cb); code = auto_callout(code, ptr, cb);
@ -3926,13 +3939,15 @@ for (;; ptr++)
In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
used for "start of word" and "end of word". As these are otherwise illegal used for "start of word" and "end of word". As these are otherwise illegal
sequences, we don't break anything by recognizing them. They are replaced sequences, we don't break anything by recognizing them. They are replaced
by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are by \b(?=\w) and \b(?<=\w) respectively. This can only happen at the top
erroneous and are handled by the normal code below. */ nesting level, as no other inserted sequences will contain these oddities.
Sequences like [a[:<:]] are erroneous and are handled by the normal code
below. */
case CHAR_LEFT_SQUARE_BRACKET: case CHAR_LEFT_SQUARE_BRACKET:
if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0) if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
{ {
cb->nestptr = ptr + 7; cb->nestptr[0] = ptr + 7;
ptr = sub_start_of_word; /* Do not combine these statements; clang's */ ptr = sub_start_of_word; /* Do not combine these statements; clang's */
ptr--; /* sanitizer moans about a negative index. */ ptr--; /* sanitizer moans about a negative index. */
continue; continue;
@ -3940,7 +3955,7 @@ for (;; ptr++)
if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0) if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
{ {
cb->nestptr = ptr + 7; cb->nestptr[0] = ptr + 7;
ptr = sub_end_of_word; /* Do not combine these statements; clang's */ ptr = sub_end_of_word; /* Do not combine these statements; clang's */
ptr--; /* sanitizer moans about a negative index. */ ptr--; /* sanitizer moans about a negative index. */
continue; continue;
@ -4125,11 +4140,13 @@ for (;; ptr++)
int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0); int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
/* The posix_substitutes table specifies which POSIX classes can be /* The posix_substitutes table specifies which POSIX classes can be
converted to \p or \P items. */ converted to \p or \P items. This can only happen at top nesting
level, as there will never be a POSIX class in a string that is
substituted for something else. */
if (posix_substitutes[pc] != NULL) if (posix_substitutes[pc] != NULL)
{ {
cb->nestptr = tempptr + 1; cb->nestptr[0] = tempptr + 1;
ptr = posix_substitutes[pc] - 1; ptr = posix_substitutes[pc] - 1;
goto CONTINUE_CLASS; goto CONTINUE_CLASS;
} }
@ -4263,9 +4280,10 @@ for (;; ptr++)
case ESC_DU: /* when PCRE2_UCP is set. We replace the */ case ESC_DU: /* when PCRE2_UCP is set. We replace the */
case ESC_wu: /* escape sequence with an appropriate \p */ case ESC_wu: /* escape sequence with an appropriate \p */
case ESC_WU: /* or \P to test Unicode properties instead */ case ESC_WU: /* or \P to test Unicode properties instead */
case ESC_su: /* of the default ASCII testing. */ case ESC_su: /* of the default ASCII testing. This might be */
case ESC_SU: case ESC_SU: /* a 2nd-level nesting for [[:<:]] or [[:>:]]. */
cb->nestptr = ptr; cb->nestptr[1] = cb->nestptr[0];
cb->nestptr[0] = ptr;
ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */ ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
class_has_8bitchar--; /* Undo! */ class_has_8bitchar--; /* Undo! */
break; break;
@ -4607,10 +4625,11 @@ for (;; ptr++)
CONTINUE_CLASS: CONTINUE_CLASS:
c = *(++ptr); c = *(++ptr);
if (c == 0 && cb->nestptr != NULL) if (c == CHAR_NULL && cb->nestptr[0] != NULL)
{ {
ptr = cb->nestptr; ptr = cb->nestptr[0];
cb->nestptr = NULL; cb->nestptr[0] = cb->nestptr[1];
cb->nestptr[1] = NULL;
c = *(++ptr); c = *(++ptr);
} }
@ -7082,7 +7101,8 @@ for (;; ptr++)
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (escape >= ESC_DU && escape <= ESC_wu) if (escape >= ESC_DU && escape <= ESC_wu)
{ {
cb->nestptr = ptr + 1; /* Where to resume */ cb->nestptr[1] = cb->nestptr[0]; /* Back up if at 2nd level */
cb->nestptr[0] = ptr + 1; /* Where to resume */
ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */ ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
} }
else else
@ -8079,7 +8099,7 @@ cb.bracount = cb.final_bracount = 0;
cb.cx = ccontext; cb.cx = ccontext;
cb.dupnames = FALSE; cb.dupnames = FALSE;
cb.end_pattern = pattern + patlen; cb.end_pattern = pattern + patlen;
cb.nestptr = NULL; cb.nestptr[0] = cb.nestptr[1] = NULL;
cb.external_flags = 0; cb.external_flags = 0;
cb.external_options = options; cb.external_options = options;
cb.had_recurse = FALSE; cb.had_recurse = FALSE;

View File

@ -686,7 +686,7 @@ typedef struct compile_block {
PCRE2_SPTR start_code; /* The start of the compiled code */ PCRE2_SPTR start_code; /* The start of the compiled code */
PCRE2_SPTR start_pattern; /* The start of the pattern */ PCRE2_SPTR start_pattern; /* The start of the pattern */
PCRE2_SPTR end_pattern; /* The end of the pattern */ PCRE2_SPTR end_pattern; /* The end of the pattern */
PCRE2_SPTR nestptr; /* Pointer saved for string substitution */ PCRE2_SPTR nestptr[2]; /* Pointer(s) saved for string substitution */
PCRE2_UCHAR *name_table; /* The name/number table */ PCRE2_UCHAR *name_table; /* The name/number table */
size_t workspace_size; /* Size of workspace */ size_t workspace_size; /* Size of workspace */
uint16_t names_found; /* Number of entries so far */ uint16_t names_found; /* Number of entries so far */

2
testdata/testinput5 vendored
View File

@ -1684,4 +1684,6 @@
\= Expect no match \= Expect no match
123 123
/(*UCP)(*UTF)[[:>:]]X/B
# End of testinput5 # End of testinput5

13
testdata/testoutput5 vendored
View File

@ -4045,4 +4045,17 @@ MK: a\x{12345}b\x{09}(d)c
123 123
No match No match
/(*UCP)(*UTF)[[:>:]]X/B
------------------------------------------------------------------
Bra
\b
AssertB
Reverse
prop Xwd
Ket
X
Ket
End
------------------------------------------------------------------
# End of testinput5 # End of testinput5