Fix UCP with [[:<:]] and [[:>:]] bad compile bug.
This commit is contained in:
parent
0fe342cbed
commit
94eda7669a
|
@ -206,6 +206,9 @@ very large.
|
||||||
59. Change 55 above introduced a bug by which certain patterns provoked the
|
59. Change 55 above introduced a bug by which certain patterns provoked the
|
||||||
erroneous error "\ at end of pattern".
|
erroneous error "\ at end of pattern".
|
||||||
|
|
||||||
|
60. The special sequences [[:<:]] and [[:>:]] gave rise to incorrect compiling
|
||||||
|
errors or other strange effects if compiled in UCP mode.
|
||||||
|
|
||||||
|
|
||||||
Version 10.20 30-June-2015
|
Version 10.20 30-June-2015
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
|
@ -1618,6 +1618,13 @@ sequences that define a data character are recognised. The isclass argument is
|
||||||
not relevant, but the options argument is the final value of the compiled
|
not relevant, but the options argument is the final value of the compiled
|
||||||
pattern's options.
|
pattern's options.
|
||||||
|
|
||||||
|
There is one "trick" case: when a sequence such as [[:>:]] or \s in UCP mode is
|
||||||
|
processed, it is replaced by a nested alternative sequence. If this contains a
|
||||||
|
backslash (which it usually does), ptrend does not point to its end - it still
|
||||||
|
points to the end of the whole pattern. However, we can detect this case
|
||||||
|
because cb->nestptr[0] will be non-NULL. The nested sequences are all zero-
|
||||||
|
terminated and there are only ever two levels of nesting.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
ptrptr points to the input position pointer
|
ptrptr points to the input position pointer
|
||||||
ptrend points to the end of the input
|
ptrend points to the end of the input
|
||||||
|
@ -1643,10 +1650,14 @@ register uint32_t c, cc;
|
||||||
int escape = 0;
|
int escape = 0;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
/* If backslash is at the end of the string, it's an error. The check must be
|
/* Find the end of a nested insert. */
|
||||||
skipped when processing a nested insertion string during compilation. */
|
|
||||||
|
|
||||||
if ((cb == NULL || cb->nestptr == NULL) && ptr >= ptrend)
|
if (cb != NULL && cb->nestptr[0] != NULL)
|
||||||
|
ptrend = ptr + PRIV(strlen)(ptr);
|
||||||
|
|
||||||
|
/* If backslash is at the end of the string, it's an error. */
|
||||||
|
|
||||||
|
if (ptr >= ptrend)
|
||||||
{
|
{
|
||||||
*errorcodeptr = ERR1;
|
*errorcodeptr = ERR1;
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -3700,13 +3711,14 @@ for (;; ptr++)
|
||||||
c = *ptr;
|
c = *ptr;
|
||||||
|
|
||||||
/* If we are at the end of a nested substitution, revert to the outer level
|
/* If we are at the end of a nested substitution, revert to the outer level
|
||||||
string. Nesting only happens one level deep, and the inserted string is
|
string. Nesting only happens one or two levels deep, and the inserted string
|
||||||
always zero terminated. */
|
is always zero terminated. */
|
||||||
|
|
||||||
if (c == CHAR_NULL && cb->nestptr != NULL)
|
if (c == CHAR_NULL && cb->nestptr[0] != NULL)
|
||||||
{
|
{
|
||||||
ptr = cb->nestptr;
|
ptr = cb->nestptr[0];
|
||||||
cb->nestptr = NULL;
|
cb->nestptr[0] = cb->nestptr[1];
|
||||||
|
cb->nestptr[1] = NULL;
|
||||||
c = *ptr;
|
c = *ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3823,7 +3835,7 @@ for (;; ptr++)
|
||||||
/* Fill in length of a previous callout, except when the next thing is a
|
/* Fill in length of a previous callout, except when the next thing is a
|
||||||
quantifier or when processing a property substitution string in UCP mode. */
|
quantifier or when processing a property substitution string in UCP mode. */
|
||||||
|
|
||||||
if (!is_quantifier && previous_callout != NULL && cb->nestptr == NULL &&
|
if (!is_quantifier && previous_callout != NULL && cb->nestptr[0] == NULL &&
|
||||||
after_manual_callout-- <= 0)
|
after_manual_callout-- <= 0)
|
||||||
{
|
{
|
||||||
if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
|
if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
|
||||||
|
@ -3834,7 +3846,8 @@ for (;; ptr++)
|
||||||
/* Create auto callout, except for quantifiers, or while processing property
|
/* Create auto callout, except for quantifiers, or while processing property
|
||||||
strings that are substituted for \w etc in UCP mode. */
|
strings that are substituted for \w etc in UCP mode. */
|
||||||
|
|
||||||
if ((options & PCRE2_AUTO_CALLOUT) != 0 && !is_quantifier && cb->nestptr == NULL)
|
if ((options & PCRE2_AUTO_CALLOUT) != 0 && !is_quantifier &&
|
||||||
|
cb->nestptr[0] == NULL)
|
||||||
{
|
{
|
||||||
previous_callout = code;
|
previous_callout = code;
|
||||||
code = auto_callout(code, ptr, cb);
|
code = auto_callout(code, ptr, cb);
|
||||||
|
@ -3926,13 +3939,15 @@ for (;; ptr++)
|
||||||
In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
|
In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
|
||||||
used for "start of word" and "end of word". As these are otherwise illegal
|
used for "start of word" and "end of word". As these are otherwise illegal
|
||||||
sequences, we don't break anything by recognizing them. They are replaced
|
sequences, we don't break anything by recognizing them. They are replaced
|
||||||
by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
|
by \b(?=\w) and \b(?<=\w) respectively. This can only happen at the top
|
||||||
erroneous and are handled by the normal code below. */
|
nesting level, as no other inserted sequences will contain these oddities.
|
||||||
|
Sequences like [a[:<:]] are erroneous and are handled by the normal code
|
||||||
|
below. */
|
||||||
|
|
||||||
case CHAR_LEFT_SQUARE_BRACKET:
|
case CHAR_LEFT_SQUARE_BRACKET:
|
||||||
if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
|
if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
|
||||||
{
|
{
|
||||||
cb->nestptr = ptr + 7;
|
cb->nestptr[0] = ptr + 7;
|
||||||
ptr = sub_start_of_word; /* Do not combine these statements; clang's */
|
ptr = sub_start_of_word; /* Do not combine these statements; clang's */
|
||||||
ptr--; /* sanitizer moans about a negative index. */
|
ptr--; /* sanitizer moans about a negative index. */
|
||||||
continue;
|
continue;
|
||||||
|
@ -3940,7 +3955,7 @@ for (;; ptr++)
|
||||||
|
|
||||||
if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
|
if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
|
||||||
{
|
{
|
||||||
cb->nestptr = ptr + 7;
|
cb->nestptr[0] = ptr + 7;
|
||||||
ptr = sub_end_of_word; /* Do not combine these statements; clang's */
|
ptr = sub_end_of_word; /* Do not combine these statements; clang's */
|
||||||
ptr--; /* sanitizer moans about a negative index. */
|
ptr--; /* sanitizer moans about a negative index. */
|
||||||
continue;
|
continue;
|
||||||
|
@ -4125,11 +4140,13 @@ for (;; ptr++)
|
||||||
int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
|
int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
|
||||||
|
|
||||||
/* The posix_substitutes table specifies which POSIX classes can be
|
/* The posix_substitutes table specifies which POSIX classes can be
|
||||||
converted to \p or \P items. */
|
converted to \p or \P items. This can only happen at top nesting
|
||||||
|
level, as there will never be a POSIX class in a string that is
|
||||||
|
substituted for something else. */
|
||||||
|
|
||||||
if (posix_substitutes[pc] != NULL)
|
if (posix_substitutes[pc] != NULL)
|
||||||
{
|
{
|
||||||
cb->nestptr = tempptr + 1;
|
cb->nestptr[0] = tempptr + 1;
|
||||||
ptr = posix_substitutes[pc] - 1;
|
ptr = posix_substitutes[pc] - 1;
|
||||||
goto CONTINUE_CLASS;
|
goto CONTINUE_CLASS;
|
||||||
}
|
}
|
||||||
|
@ -4263,9 +4280,10 @@ for (;; ptr++)
|
||||||
case ESC_DU: /* when PCRE2_UCP is set. We replace the */
|
case ESC_DU: /* when PCRE2_UCP is set. We replace the */
|
||||||
case ESC_wu: /* escape sequence with an appropriate \p */
|
case ESC_wu: /* escape sequence with an appropriate \p */
|
||||||
case ESC_WU: /* or \P to test Unicode properties instead */
|
case ESC_WU: /* or \P to test Unicode properties instead */
|
||||||
case ESC_su: /* of the default ASCII testing. */
|
case ESC_su: /* of the default ASCII testing. This might be */
|
||||||
case ESC_SU:
|
case ESC_SU: /* a 2nd-level nesting for [[:<:]] or [[:>:]]. */
|
||||||
cb->nestptr = ptr;
|
cb->nestptr[1] = cb->nestptr[0];
|
||||||
|
cb->nestptr[0] = ptr;
|
||||||
ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
|
ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
|
||||||
class_has_8bitchar--; /* Undo! */
|
class_has_8bitchar--; /* Undo! */
|
||||||
break;
|
break;
|
||||||
|
@ -4607,10 +4625,11 @@ for (;; ptr++)
|
||||||
|
|
||||||
CONTINUE_CLASS:
|
CONTINUE_CLASS:
|
||||||
c = *(++ptr);
|
c = *(++ptr);
|
||||||
if (c == 0 && cb->nestptr != NULL)
|
if (c == CHAR_NULL && cb->nestptr[0] != NULL)
|
||||||
{
|
{
|
||||||
ptr = cb->nestptr;
|
ptr = cb->nestptr[0];
|
||||||
cb->nestptr = NULL;
|
cb->nestptr[0] = cb->nestptr[1];
|
||||||
|
cb->nestptr[1] = NULL;
|
||||||
c = *(++ptr);
|
c = *(++ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7082,7 +7101,8 @@ for (;; ptr++)
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (escape >= ESC_DU && escape <= ESC_wu)
|
if (escape >= ESC_DU && escape <= ESC_wu)
|
||||||
{
|
{
|
||||||
cb->nestptr = ptr + 1; /* Where to resume */
|
cb->nestptr[1] = cb->nestptr[0]; /* Back up if at 2nd level */
|
||||||
|
cb->nestptr[0] = ptr + 1; /* Where to resume */
|
||||||
ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
|
ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -8079,7 +8099,7 @@ cb.bracount = cb.final_bracount = 0;
|
||||||
cb.cx = ccontext;
|
cb.cx = ccontext;
|
||||||
cb.dupnames = FALSE;
|
cb.dupnames = FALSE;
|
||||||
cb.end_pattern = pattern + patlen;
|
cb.end_pattern = pattern + patlen;
|
||||||
cb.nestptr = NULL;
|
cb.nestptr[0] = cb.nestptr[1] = NULL;
|
||||||
cb.external_flags = 0;
|
cb.external_flags = 0;
|
||||||
cb.external_options = options;
|
cb.external_options = options;
|
||||||
cb.had_recurse = FALSE;
|
cb.had_recurse = FALSE;
|
||||||
|
|
|
@ -686,7 +686,7 @@ typedef struct compile_block {
|
||||||
PCRE2_SPTR start_code; /* The start of the compiled code */
|
PCRE2_SPTR start_code; /* The start of the compiled code */
|
||||||
PCRE2_SPTR start_pattern; /* The start of the pattern */
|
PCRE2_SPTR start_pattern; /* The start of the pattern */
|
||||||
PCRE2_SPTR end_pattern; /* The end of the pattern */
|
PCRE2_SPTR end_pattern; /* The end of the pattern */
|
||||||
PCRE2_SPTR nestptr; /* Pointer saved for string substitution */
|
PCRE2_SPTR nestptr[2]; /* Pointer(s) saved for string substitution */
|
||||||
PCRE2_UCHAR *name_table; /* The name/number table */
|
PCRE2_UCHAR *name_table; /* The name/number table */
|
||||||
size_t workspace_size; /* Size of workspace */
|
size_t workspace_size; /* Size of workspace */
|
||||||
uint16_t names_found; /* Number of entries so far */
|
uint16_t names_found; /* Number of entries so far */
|
||||||
|
|
|
@ -1684,4 +1684,6 @@
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
123
|
123
|
||||||
|
|
||||||
|
/(*UCP)(*UTF)[[:>:]]X/B
|
||||||
|
|
||||||
# End of testinput5
|
# End of testinput5
|
||||||
|
|
|
@ -4045,4 +4045,17 @@ MK: a\x{12345}b\x{09}(d)c
|
||||||
123
|
123
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
/(*UCP)(*UTF)[[:>:]]X/B
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
\b
|
||||||
|
AssertB
|
||||||
|
Reverse
|
||||||
|
prop Xwd
|
||||||
|
Ket
|
||||||
|
X
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
# End of testinput5
|
# End of testinput5
|
||||||
|
|
Loading…
Reference in New Issue