Fix stack overflow bug, copying fix from PCRE1.

This commit is contained in:
Philip.Hazel 2014-08-08 15:36:18 +00:00
parent 896e6051ab
commit b7c5d02b3d
3 changed files with 173 additions and 160 deletions

View File

@ -151,7 +151,7 @@ have to check them every time. */
#define REQ_UNSET (-2) /* Not yet found anything */
#define REQ_NONE (-1) /* Found not fixed char */
/* This bit (which is greater than any UTF value) is used to indicate that a
/* This bit (which is greater than any UTF value) is used to indicate that a
variable contains a number of code units instead of an actual code point. */
#define UTF_LENGTH 0x10000000l
@ -305,7 +305,7 @@ static const short int escapes[] = {
#else
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
It runs from 'a' to '9'. */
#define ESCAPES_FIRST CHAR_a
@ -327,7 +327,7 @@ static const short int escapes[] = {
/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
/* F8 */ 0, 0
/* F8 */ 0, 0
};
#endif
@ -556,19 +556,19 @@ static PCRE2_SPTR posix_substitutes[] = {
PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF)
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c must be updated, and a new error text must be added to
easily be tracked. When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c must be updated, and a new error text must be added to
compile_error_texts in pcre2_error.c. */
enum { ERR0 = COMPILE_ERROR_BASE,
ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78 };
enum { ERR0 = COMPILE_ERROR_BASE,
ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78 };
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@ -725,7 +725,7 @@ Returns: nothing
*/
static void
complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr,
complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr,
compile_block *cb)
{
size_t length = ptr - cb->start_pattern - GET(previous_callout, 2);
@ -1161,7 +1161,7 @@ typedef struct recurse_check {
} recurse_check;
static BOOL
could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf,
could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf,
compile_block *cb, recurse_check *recurses)
{
register PCRE2_UCHAR c;
@ -1195,6 +1195,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
if (c == OP_RECURSE)
{
PCRE2_SPTR scode = cb->start_code + GET(code, 1);
PCRE2_SPTR endgroup = scode;
BOOL empty_branch;
/* Test for forward reference or uncompleted reference. This is disabled
@ -1209,20 +1210,16 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
}
/* If we are scanning a completed pattern, there are no forward references
and all groups are complete. We need to detect whether this is a recursive
call, as otherwise there will be an infinite loop. If it is a recursion,
just skip over it. Simple recursions are easily detected. For mutual
recursions we keep a chain on the stack. */
/* If the reference is to a completed group, we need to detect whether this
is a recursive call, as otherwise there will be an infinite loop. If it is
a recursion, just skip over it. Simple recursions are easily detected. For
mutual recursions we keep a chain on the stack. */
do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
if (code >= scode && code <= endgroup) continue; /* Simple recursion */
else
{
recurse_check *r = recurses;
PCRE2_SPTR endgroup = scode;
do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
if (code >= scode && code <= endgroup) continue; /* Simple recursion */
for (r = recurses; r != NULL; r = r->prev)
if (r->group == scode) break;
if (r != NULL) continue; /* Mutual recursion */
@ -1539,7 +1536,7 @@ Returns: TRUE if what is matched could be empty
*/
static BOOL
could_be_empty(PCRE2_SPTR code, PCRE2_SPTR endcode, branch_chain *bcptr,
could_be_empty(PCRE2_SPTR code, PCRE2_SPTR endcode, branch_chain *bcptr,
BOOL utf, compile_block *cb)
{
while (bcptr != NULL && bcptr->current_branch >= code)
@ -1593,7 +1590,7 @@ return 0;
*************************************************/
/* This function is called when a '{' is encountered in a place where it might
start a quantifier. It looks ahead to see if it really is a quantifier, that
start a quantifier. It looks ahead to see if it really is a quantifier, that
is, one of the forms {ddd} {ddd,} or {ddd,ddd} where the ddds are digits.
Argument: pointer to the first char after '{'
@ -1628,7 +1625,7 @@ return (*p == CHAR_RIGHT_CURLY_BRACKET);
positive value for a simple escape such as \d, or 0 for a data character, which
is placed in chptr. A backreference to group n is returned as negative n. On
entry, ptr is pointing at the \. On exit, it points to the final code unit of the
escape sequence.
escape sequence.
Arguments:
ptrptr points to the pattern position pointer
@ -1636,7 +1633,7 @@ Arguments:
errorcodeptr points to the errorcode variable (containing zero)
options the current options bits
isclass TRUE if inside a character class
cb compile data block
cb compile data block
Returns: zero => a data character
positive => a special escape sequence
@ -1669,7 +1666,7 @@ returned immediately. Otherwise further processing is required. */
else if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
{
{
if (i > 0) c = (uint32_t)i; /* Positive is a data character */
else escape = -i; /* Else return a special escape */
}
@ -1695,30 +1692,30 @@ else
/* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated
specially, \u must be followed by four hex digits. Otherwise it is a
lowercase u letter. */
case CHAR_u:
if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else
{
uint32_t xc;
if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
cc = (cc << 4) | xc;
cc = (cc << 4) | xc;
if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */
cc = (cc << 4) | xc;
cc = (cc << 4) | xc;
if ((xc = XDIGIT(ptr[4])) == 0xff) break; /* Not a hex digit */
c = (cc << 4) | xc;
c = (cc << 4) | xc;
ptr += 4;
if (utf)
{
if (c > 0x10ffffU) *errorcodeptr = ERR77;
else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
}
else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
}
break;
case CHAR_U:
/* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an
/* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an
upper case letter. */
if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37;
break;
@ -1892,7 +1889,7 @@ else
case CHAR_o:
if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else
if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else
if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else
{
ptr += 2;
c = 0;
@ -1936,7 +1933,7 @@ else
if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */
c = (cc << 4) | xc;
ptr += 2;
ptr += 2;
} /* End PCRE2_ALT_BSUX handling */
/* Handle \x in Perl's style. \x{ddd} is a character number which can be
@ -1955,10 +1952,10 @@ else
{
*errorcodeptr = ERR78;
break;
}
}
c = 0;
overflow = FALSE;
while ((cc = XDIGIT(*ptr)) != 0xff)
{
ptr++;
@ -1971,7 +1968,7 @@ else
{
overflow = TRUE;
break;
}
}
}
if (overflow)
@ -1999,10 +1996,10 @@ else
c = 0;
if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
ptr++;
c = cc;
c = cc;
if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */
ptr++;
c = (c << 4) | cc;
c = (c << 4) | cc;
} /* End of \xdd handling */
} /* End of Perl-style \x handling */
break;
@ -2033,9 +2030,9 @@ else
#endif
break;
/* Any other alphanumeric following \ is an error. Perl gives an error only
if in warning mode, but PCRE doesn't have a warning mode. */
/* Any other alphanumeric following \ is an error. Perl gives an error only
if in warning mode, but PCRE doesn't have a warning mode. */
default:
*errorcodeptr = ERR3;
break;
@ -2080,7 +2077,7 @@ Arguments:
ptypeptr an unsigned int that is set to the type value
pdataptr an unsigned int that is set to the detailed property value
errorcodeptr the error code variable
cb the compile data
cb the compile data
Returns: TRUE if the type value was found, or FALSE for an invalid type
*/
@ -2126,7 +2123,7 @@ else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
name[0] = c;
name[1] = 0;
}
else goto ERROR_RETURN;
else goto ERROR_RETURN;
*ptrptr = ptr;
@ -2179,13 +2176,13 @@ Returns: pointer to '}' on success;
current ptr on error, with errorcodeptr set non-zero
*/
static PCRE2_SPTR
static PCRE2_SPTR
read_repeat_counts(PCRE2_SPTR p, int *minp, int *maxp, int *errorcodeptr)
{
int min = 0;
int max = -1;
while (IS_DIGIT(*p))
while (IS_DIGIT(*p))
{
min = min * 10 + (int)(*p++ - CHAR_0);
if (min > 65535)
@ -2193,14 +2190,14 @@ while (IS_DIGIT(*p))
*errorcodeptr = ERR5;
return p;
}
}
}
if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
{
if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
{
max = 0;
while(IS_DIGIT(*p))
while(IS_DIGIT(*p))
{
max = max * 10 + (int)(*p++ - CHAR_0);
if (max > 65535)
@ -2208,7 +2205,7 @@ if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
*errorcodeptr = ERR5;
return p;
}
}
}
if (max < min)
{
*errorcodeptr = ERR4;
@ -2242,13 +2239,13 @@ Arguments:
Returns: pointer to the opcode for the bracket, or NULL if not found
*/
PCRE2_SPTR
PCRE2_SPTR
PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number)
{
for (;;)
{
register PCRE2_UCHAR c = *code;
if (c == OP_END) return NULL;
/* XCLASS is used for classes that cannot be represented just by a bit
@ -2377,7 +2374,7 @@ Arguments:
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
*/
static PCRE2_SPTR
static PCRE2_SPTR
find_recurse(PCRE2_SPTR code, BOOL utf)
{
for (;;)
@ -2845,7 +2842,7 @@ if (start <= 0xff) start = 0xff + 1;
if (end >= start)
{
PCRE2_UCHAR *uchardata = *uchardptr;
#ifdef SUPPORT_UTF
if ((options & PCRE2_UTF) != 0)
{
@ -3276,11 +3273,11 @@ for (;; ptr++)
{
/* ===================================================================*/
/* The branch terminates at string end or | or ) */
case CHAR_NULL:
if (ptr < cb->end_pattern) goto NORMAL_CHAR; /* Zero data character */
/* Fall through */
if (ptr < cb->end_pattern) goto NORMAL_CHAR; /* Zero data character */
/* Fall through */
case CHAR_VERTICAL_LINE:
case CHAR_RIGHT_PARENTHESIS:
*firstcuptr = firstcu;
@ -3309,7 +3306,7 @@ for (;; ptr++)
previous = NULL;
if ((options & PCRE2_MULTILINE) != 0)
{
if (firstcuflags == REQ_UNSET)
if (firstcuflags == REQ_UNSET)
zerofirstcuflags = firstcuflags = REQ_NONE;
*code++ = OP_CIRCM;
}
@ -3346,11 +3343,11 @@ for (;; ptr++)
opcode is compiled. It may optionally have a bit map for characters < 256,
but those above are explicitly listed afterwards. A flag byte tells
whether the bitmap is present, and whether this is a negated class or not.
An isolated ']' character is not treated specially, so is just another data
character. In earlier versions of PCRE that used the original API there was
a "JavaScript compatibility mode" in which it gave an error. However,
JavaScript itself has changed in this respect so there is no longer any
JavaScript itself has changed in this respect so there is no longer any
need for this special handling.
In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
@ -3460,7 +3457,7 @@ for (;; ptr++)
memset(classbits, 0, 32 * sizeof(uint8_t));
/* Process characters until ] is reached. As the test is at the end of the
/* Process characters until ] is reached. As the test is at the end of the
loop, an initial ] is taken as a data character. At the start of the loop,
c contains the first code unit of the character. If it is zero, check for
the end of the pattern, to allow binary zero as data. */
@ -3468,13 +3465,13 @@ for (;; ptr++)
for(;;)
{
PCRE2_SPTR oldptr;
if (c == CHAR_NULL && ptr >= cb->end_pattern)
{
*errorcodeptr = ERR6; /* Missing terminating ']' */
goto FAILED;
}
#ifdef SUPPORT_UTF
if (utf && HAS_EXTRALEN(c))
{ /* Braces are required because the */
@ -3680,7 +3677,7 @@ for (;; ptr++)
}
else if (escape == ESC_E) goto CONTINUE_CLASS; /* Ignore orphan \E */
else /* Handle \d-type escapes */
else /* Handle \d-type escapes */
{
register const uint8_t *cbits = cb->cbits;
/* Every class contains at least two < 256 characters. */
@ -3773,17 +3770,17 @@ for (;; ptr++)
xclass_has_prop = TRUE;
class_has_8bitchar--; /* Undo! */
}
break;
#endif
break;
#endif
/* Unrecognized escapes are faulted. */
default:
*errorcodeptr = ERR7;
goto FAILED;
}
/* Handled \d-type escape */
goto CONTINUE_CLASS;
}
@ -3976,7 +3973,7 @@ for (;; ptr++)
/* For a single, positive character, get the value into mcbuffer, and
then we can handle this with the normal one-character code. */
mclength = PUTCHAR(c, mcbuffer);
goto ONE_CHAR;
} /* End of 1-char optimization */
@ -3986,8 +3983,8 @@ for (;; ptr++)
class_has_8bitchar +=
add_to_class(classbits, &class_uchardata, options, cb, c, c);
/* Continue to the next character in the class. Closing square bracket
/* Continue to the next character in the class. Closing square bracket
not within \Q..\E ends the class. A NULL character terminates a
nested substitution string, but may be a data character in the main
pattern (tested at the start of this loop). */
@ -3998,9 +3995,9 @@ for (;; ptr++)
{
ptr = nestptr;
nestptr = NULL;
c = *(++ptr);
}
if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
c = *(++ptr);
}
if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
} /* End of main class-processing loop */
/* We will need an XCLASS if data has been placed in class_uchardata. In
@ -4281,16 +4278,16 @@ for (;; ptr++)
prop_type = previous[1];
prop_value = previous[2];
}
else
else
{
/* Come here from just above with a character in c */
OUTPUT_SINGLE_REPEAT:
prop_type = prop_value = -1;
}
}
/* At this point we either have prop_type == prop_value == -1 and either
a code point or a character type that is not OP_[NOT]PROP in c, or we
have OP_[NOT]PROP in c and prop_type/prop_value not negative. */
a code point or a character type that is not OP_[NOT]PROP in c, or we
have OP_[NOT]PROP in c and prop_type/prop_value not negative. */
oldcode = code; /* Save where we were */
code = previous; /* Usually overwrite previous item */
@ -4343,16 +4340,16 @@ for (;; ptr++)
{
*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
PUT2INC(code, 0, repeat_min);
/* Unless repeat_max equals repeat_min, fill in the data for EXACT, and
then generate the second opcode. In UTF mode, multi-code-unit
characters have their length in c, with the UTF_LENGTH bit as a flag,
and the code units in utf_units. For a repeated Unicode property match,
there are two extra values that define the required property, and c
never has the UTF_LENGTH bit set. */
if (repeat_max != repeat_min)
{
{
#ifdef MAYBE_UTF_MULTI
if (utf && (c & UTF_LENGTH) != 0)
{
@ -4360,7 +4357,7 @@ for (;; ptr++)
code += c & 7;
}
else
#endif
#endif
{
*code++ = c;
if (prop_type >= 0)
@ -4369,7 +4366,7 @@ for (;; ptr++)
*code++ = prop_value;
}
}
/* Now set up the following opcode */
if (repeat_max < 0) *code++ = OP_STAR + repeat_type; else
@ -4385,7 +4382,7 @@ for (;; ptr++)
PUT2INC(code, 0, repeat_max);
}
}
}
}
}
/* Fill in the character or character type for the final opcode. */
@ -4405,7 +4402,7 @@ for (;; ptr++)
*code++ = prop_type;
*code++ = prop_value;
}
}
}
}
/* If previous was a character class or a back reference, we put the repeat
@ -4562,7 +4559,7 @@ for (;; ptr++)
just adjust the length as if we had. Do some paranoid checks for
potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
integer type when available, otherwise double. */
if (lengthptr != NULL)
{
size_t delta = (repeat_min - 1)*length_prevgroup;
@ -4822,7 +4819,7 @@ for (;; ptr++)
}
}
/* If previous is OP_FAIL, it was generated by an empty class []
/* If previous is OP_FAIL, it was generated by an empty class []
(PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
generated, that is by (*FAIL) or (?!), set previous to NULL, which gives a
"nothing to repeat" error above. We can just ignore the repeat in empty
@ -5231,7 +5228,7 @@ for (;; ptr++)
ptr++;
}
namelen = (int)(ptr - name);
if (lengthptr != NULL && (options & PCRE2_DUPNAMES) != 0)
if (lengthptr != NULL && (options & PCRE2_DUPNAMES) != 0)
*lengthptr += IMM2_SIZE;
}
@ -5297,7 +5294,7 @@ for (;; ptr++)
(slot+IMM2_SIZE)[namelen] != 0) break;
count++;
}
if (count > 1)
{
PUT2(code, 2+LINK_SIZE, offset);
@ -5552,7 +5549,7 @@ for (;; ptr++)
if (cb->names_found >= cb->named_group_list_size)
{
int newsize = cb->named_group_list_size * 2;
named_group *newspace =
named_group *newspace =
cb->cx->memctl.malloc(newsize * sizeof(named_group),
cb->cx->memctl.memory_data);
if (newspace == NULL)
@ -5646,7 +5643,7 @@ for (;; ptr++)
/* Count named back references. */
if (!is_recurse) cb->namedrefcount++;
/* If duplicate names are permitted, we have to allow for a named
reference to a duplicated name (this cannot be determined until the
second pass). This needs an extra 16-bit data item. */
@ -5701,7 +5698,7 @@ for (;; ptr++)
count++;
cslot += cb->name_entry_size;
}
if (count > 1)
{
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
@ -6403,7 +6400,7 @@ for (;; ptr++)
/* We have a data character whose value is in c. In UTF-8 mode it may have
a value > 127. We set its representation in the length/buffer, and then
handle it as a data character. */
mclength = PUTCHAR(c, mcbuffer);
goto ONE_CHAR;
@ -6536,15 +6533,15 @@ Arguments:
errorcodeptr -> pointer to error code variable
lookbehind TRUE if this is a lookbehind assertion
reset_bracount TRUE to reset the count for each branch
skipunits skip this many code units at start (for brackets and OP_COND)
skipunits skip this many code units at start (for brackets and OP_COND)
cond_depth depth of nesting for conditional subpatterns
firstcuptr place to put the first required code unit
firstcuflagsptr place to put the first code unit flags, or a negative number
reqcuptr place to put the last required code unit
reqcuflagsptr place to put the last required code unit flags, or a negative number
bcptr pointer to the chain of currently open branches
cb points to the data block with tables pointers etc.
lengthptr NULL during the real compile phase
firstcuptr place to put the first required code unit
firstcuflagsptr place to put the first code unit flags, or a negative number
reqcuptr place to put the last required code unit
reqcuflagsptr place to put the last required code unit flags, or a negative number
bcptr pointer to the chain of currently open branches
cb points to the data block with tables pointers etc.
lengthptr NULL during the real compile phase
points to length accumulator during pre-compile phase
Returns: TRUE on success
@ -6554,7 +6551,7 @@ static BOOL
compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, PCRE2_SPTR *ptrptr,
int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipunits,
int cond_depth, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
compile_block *cb, size_t *lengthptr)
{
PCRE2_SPTR ptr = *ptrptr;
@ -6687,7 +6684,7 @@ for (;;)
previously no reqcu, it takes on the value of the old firstcu. */
if (firstcuflags >= 0 &&
(firstcuflags != branchfirstcuflags ||
(firstcuflags != branchfirstcuflags ||
firstcu != branchfirstcu))
{
if (reqcuflags < 0)
@ -6701,7 +6698,7 @@ for (;;)
/* If we (now or from before) have no firstcu, a firstcu from the
branch becomes a reqcu if there isn't a branch reqcu. */
if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
branchreqcuflags < 0)
{
branchreqcu = branchfirstcu;
@ -6852,7 +6849,7 @@ for (;;)
bc.current_branch = last_branch = code;
code += 1 + LINK_SIZE;
}
/* Advance past the vertical bar */
ptr++;
@ -6994,7 +6991,7 @@ Returns: TRUE or FALSE
*/
static BOOL
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
int atomcount)
{
do {
@ -7102,7 +7099,7 @@ follow. However, if we end up without a first code unit setting for an
unanchored pattern, it is worth scanning the regex to see if there is an
initial asserted first code unit. If all branches start with the same asserted
code unit, or with a non-conditional bracket all of whose alternatives start
with the same asserted code unit (recurse ad lib), then we return that code
with the same asserted code unit (recurse ad lib), then we return that code
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
REQ_NONE in the flags.
@ -7146,7 +7143,7 @@ do {
d = find_firstassertedcu(scode, &dflags, op == OP_ASSERT);
if (dflags < 0)
return 0;
if (cflags < 0) { c = d; cflags = dflags; }
if (cflags < 0) { c = d; cflags = dflags; }
else if (c != d || cflags != dflags) return 0;
break;
@ -7254,7 +7251,7 @@ Arguments:
patlen the length of the pattern, or < 0 for zero-terminated
options option bits
errorptr pointer to errorcode
erroroffset pointer to error offset
erroroffset pointer to error offset
ccontext points to a compile context or is NULL
Returns: pointer to compiled data block, or NULL on error,
@ -7328,7 +7325,7 @@ if (ccontext == NULL)
PRIV(compile_context_init)(&default_context, TRUE);
ccontext = &default_context;
}
/* A negative pattern length means "zero-terminated". Otherwise, we make
a copy of the pattern and add a zero. */
@ -7350,7 +7347,7 @@ if (patlen < 0) patlen = PRIV(strlen)(pattern); else
copied_pattern[patlen] = 0;
pattern = copied_pattern;
}
/* ------------ Initialize the "static" compile data -------------- */
@ -7407,7 +7404,7 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
{
pso *p = pso_list + i;
if (PRIV(strncmp_c8)(ptr+skipatstart+2, (char *)(p->name), p->length) == 0)
{
uint32_t c, pp;
@ -7436,17 +7433,17 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
c = c*10 + ptr[pp++] - CHAR_0;
}
if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
{
errorcode = ERR60;
errorcode = ERR60;
goto HAD_ERROR;
}
}
if (p->type == PSO_LIMM) limit_match = c;
else limit_recursion = c;
skipatstart += pp - skipatstart;
break;
}
break; /* Out of the table scan loop */
break; /* Out of the table scan loop */
}
}
if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */
@ -7480,16 +7477,16 @@ if (utf)
if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
(errorcode = PRIV(valid_utf)(pattern, -1, erroroffset)) != 0)
goto HAD_ERROR;
}
}
/* Check UCP lockout. */
if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
(PCRE2_UCP|PCRE2_NEVER_UCP))
{
errorcode = ERR75;
goto HAD_ERROR;
}
}
/* Process the BSR setting. */
@ -7529,7 +7526,7 @@ switch(newline)
errorcode = ERR56;
goto HAD_ERROR;
}
/* Pretend to compile the pattern while actually just accumulating the amount
of memory required in the 'length' variable. This behaviour is triggered by
passing a non-NULL final argument to compile_regex(). We pass a block of
@ -7541,7 +7538,7 @@ On error, errorcode will be set non-zero, so we don't need to look at the
result of the function. The initial options have been put into the cb block so
that they can be changed if an option setting is found within the regex right
at the beginning. Bringing initial option settings outside can help speed up
starting point checks. We still have to pass a separate options variable (the
starting point checks. We still have to pass a separate options variable (the
first argument) because that may change as the pattern is processed. */
code = cworkspace;
@ -7550,14 +7547,14 @@ code = cworkspace;
(void)compile_regex(cb.external_options, &code, &ptr, &errorcode, FALSE,
FALSE, 0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
&cb, &length);
if (errorcode != 0) goto HAD_ERROR;
if (length > MAX_PATTERN_SIZE)
{
errorcode = ERR20;
goto HAD_ERROR;
}
/* If there are groups with duplicate names and there are also references by
name, we must allow for the possibility of named references to duplicated
groups. These require an extra data item each. */
@ -7570,7 +7567,7 @@ the compiled pattern and names table. Integer overflow should no longer be
possible because nowadays we limit the maximum value of cb.names_found and
cb.name_entry_size. */
re_blocksize = sizeof(pcre2_real_code) +
re_blocksize = sizeof(pcre2_real_code) +
CU2BYTES(length + cb.names_found * cb.name_entry_size);
re = (pcre2_real_code *)
ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
@ -7595,17 +7592,17 @@ re->first_codeunit = 0;
re->last_codeunit = 0;
re->bsr_convention = bsr;
re->newline_convention = newline;
re->max_lookbehind =
re->max_lookbehind =
re->minlength = 0;
re->top_bracket = 0;
re->top_backref = 0;
re->name_entry_size = cb.name_entry_size;
re->name_count = cb.names_found;
/* The basic block is immediately followed by the name table, and the compiled
/* The basic block is immediately followed by the name table, and the compiled
code follows after that. */
codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
re->name_entry_size * re->name_count;
@ -7646,7 +7643,7 @@ cb.check_lookbehind = FALSE;
cb.open_caps = NULL;
/* If any named groups were found, create the name/number table from the list
created in the first pass. If the list was longer than the in-stack list, free
created in the first pass. If the list was longer than the in-stack list, free
the heap memory. */
if (cb.names_found > 0)
@ -7726,7 +7723,7 @@ if (cb.hwm > cb.start_workspace)
NULL to indicate that forward references have been filled in. */
if (cb.workspace_size > COMPILE_WORK_SIZE)
ccontext->memctl.free((void *)cb.start_workspace,
ccontext->memctl.free((void *)cb.start_workspace,
ccontext->memctl.memory_data);
cb.start_workspace = NULL;
@ -7744,9 +7741,9 @@ function call. */
if ((options & PCRE2_NO_AUTO_POSSESS) == 0)
{
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
PRIV(auto_possessify)(temp, utf, &cb);
}
}
/* If there were any lookbehind assertions that contained OP_RECURSE
(recursions or subroutine calls), a flag is set for them to be checked here,
@ -7800,7 +7797,7 @@ if (errorcode != 0)
re = NULL;
*errorptr = errorcode;
*erroroffset = (int)(ptr - pattern);
goto EXIT;
goto EXIT;
}
/* Successful compile. If the anchored option was not passed, set it if
@ -7809,9 +7806,9 @@ or anything else, such as starting with non-atomic .* when DOTALL is set and
there are no occurrences of *PRUNE or *SKIP. */
if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
is_anchored(codestart, 0, &cb, 0))
is_anchored(codestart, 0, &cb, 0))
re->overall_options |= PCRE2_ANCHORED;
/* If the pattern is still not anchored and we do not have a first code unit,
see if there is one that is asserted (these are not saved during the compile
because they can cause conflicts with actual literals that follow). */
@ -7820,14 +7817,14 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0)
{
if (firstcuflags < 0)
firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE);
/* Save the data for a first code unit. */
if (firstcuflags >= 0)
{
re->first_codeunit = firstcu;
re->flags |= PCRE2_FIRSTSET;
/* Handle caseless first code units. */
if ((firstcuflags & REQ_CASELESS) != 0)
@ -7836,20 +7833,20 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0)
{
if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
}
/* The first code unit is > 128 in UTF mode, or > 255 otherwise. In
8-bit UTF mode, codepoints in the range 128-255 are introductory code
points and cannot have another case. In 16-bit and 32-bit modes, we can
/* The first code unit is > 128 in UTF mode, or > 255 otherwise. In
8-bit UTF mode, codepoints in the range 128-255 are introductory code
points and cannot have another case. In 16-bit and 32-bit modes, we can
check wide characters when UTF (and therefore UCP) is supported. */
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 8
else if (firstcu <= MAX_UTF_CODE_POINT &&
else if (firstcu <= MAX_UTF_CODE_POINT &&
UCD_OTHERCASE(firstcu) != firstcu)
re->flags |= PCRE2_FIRSTCASELESS;
#endif
#endif
}
}
/* When there is no first code unit, see if we can set the PCRE2_STARTLINE
flag. This is helpful for multiline matches when all branches start with ^
and also when all branches start with non-atomic .* for non-DOTALL matches
@ -7857,19 +7854,19 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0)
else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE;
}
/* Handle the "required code unit", if one is set. In the case of an anchored
/* Handle the "required code unit", if one is set. In the case of an anchored
pattern, do this only if it follows a variable length item in the pattern. */
if (reqcuflags >= 0 &&
((re->overall_options & PCRE2_ANCHORED) == 0 ||
((re->overall_options & PCRE2_ANCHORED) == 0 ||
(reqcuflags & REQ_VARY) != 0))
{
re->last_codeunit = reqcu;
re->flags |= PCRE2_LASTSET;
/* Handle caseless required code units as for first code units (above). */
if ((reqcuflags & REQ_CASELESS) != 0)
{
if (reqcu < 128 || (!utf && reqcu < 255))
@ -7897,14 +7894,14 @@ do
}
while (*codestart == OP_ALT);
/* Finally, study the compiled pattern to set up information such as a bitmap
/* Finally, study the compiled pattern to set up information such as a bitmap
of starting code units and a minimum matching length. */
if (PRIV(study)(re) != 0)
{
errorcode = ERR31;
goto HAD_ERROR;
}
goto HAD_ERROR;
}
/* Control ends up here in all cases. If memory was obtained for a
zero-terminated copy of the pattern, remember to free it before returning. */

6
testdata/testinput1 vendored
View File

@ -4912,6 +4912,12 @@
/((?(R1)a+|(?1)b))/
aaaabcde
/((?(R)a|(?1)))*/
aaa
/((?(R)a|(?1)))+/
aaa
/a(*:any
name)/mark

10
testdata/testoutput1 vendored
View File

@ -8199,6 +8199,16 @@ MK: M
aaaabcde
0: aaaab
1: aaaab
/((?(R)a|(?1)))*/
aaa
0: aaa
1: a
/((?(R)a|(?1)))+/
aaa
0: aaa
1: a
/a(*:any
name)/mark