diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 4157a8a..4437af2 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -151,7 +151,7 @@ have to check them every time. */ #define REQ_UNSET (-2) /* Not yet found anything */ #define REQ_NONE (-1) /* Found not fixed char */ -/* This bit (which is greater than any UTF value) is used to indicate that a +/* This bit (which is greater than any UTF value) is used to indicate that a variable contains a number of code units instead of an actual code point. */ #define UTF_LENGTH 0x10000000l @@ -305,7 +305,7 @@ static const short int escapes[] = { #else -/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. +/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. It runs from 'a' to '9'. */ #define ESCAPES_FIRST CHAR_a @@ -327,7 +327,7 @@ static const short int escapes[] = { /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, -/* F8 */ 0, 0 +/* F8 */ 0, 0 }; #endif @@ -556,19 +556,19 @@ static PCRE2_SPTR posix_substitutes[] = { PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF) /* Compile time error code numbers. They are given names so that they can more -easily be tracked. When a new number is added, the tables called eint1 and -eint2 in pcre2posix.c must be updated, and a new error text must be added to +easily be tracked. When a new number is added, the tables called eint1 and +eint2 in pcre2posix.c must be updated, and a new error text must be added to compile_error_texts in pcre2_error.c. */ -enum { ERR0 = COMPILE_ERROR_BASE, - ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10, - ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, - ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30, - ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40, - ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, - ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, - ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, - ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78 }; +enum { ERR0 = COMPILE_ERROR_BASE, + ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10, + ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, + ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30, + ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40, + ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, + ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, + ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, + ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78 }; /* This is a table of start-of-pattern options such as (*UTF) and settings such as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward @@ -725,7 +725,7 @@ Returns: nothing */ static void -complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr, +complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr, compile_block *cb) { size_t length = ptr - cb->start_pattern - GET(previous_callout, 2); @@ -1161,7 +1161,7 @@ typedef struct recurse_check { } recurse_check; static BOOL -could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf, +could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf, compile_block *cb, recurse_check *recurses) { register PCRE2_UCHAR c; @@ -1195,6 +1195,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); if (c == OP_RECURSE) { PCRE2_SPTR scode = cb->start_code + GET(code, 1); + PCRE2_SPTR endgroup = scode; BOOL empty_branch; /* Test for forward reference or uncompleted reference. This is disabled @@ -1209,20 +1210,16 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ } - /* If we are scanning a completed pattern, there are no forward references - and all groups are complete. We need to detect whether this is a recursive - call, as otherwise there will be an infinite loop. If it is a recursion, - just skip over it. Simple recursions are easily detected. For mutual - recursions we keep a chain on the stack. */ + /* If the reference is to a completed group, we need to detect whether this + is a recursive call, as otherwise there will be an infinite loop. If it is + a recursion, just skip over it. Simple recursions are easily detected. For + mutual recursions we keep a chain on the stack. */ + do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT); + if (code >= scode && code <= endgroup) continue; /* Simple recursion */ else { recurse_check *r = recurses; - PCRE2_SPTR endgroup = scode; - - do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT); - if (code >= scode && code <= endgroup) continue; /* Simple recursion */ - for (r = recurses; r != NULL; r = r->prev) if (r->group == scode) break; if (r != NULL) continue; /* Mutual recursion */ @@ -1539,7 +1536,7 @@ Returns: TRUE if what is matched could be empty */ static BOOL -could_be_empty(PCRE2_SPTR code, PCRE2_SPTR endcode, branch_chain *bcptr, +could_be_empty(PCRE2_SPTR code, PCRE2_SPTR endcode, branch_chain *bcptr, BOOL utf, compile_block *cb) { while (bcptr != NULL && bcptr->current_branch >= code) @@ -1593,7 +1590,7 @@ return 0; *************************************************/ /* This function is called when a '{' is encountered in a place where it might -start a quantifier. It looks ahead to see if it really is a quantifier, that +start a quantifier. It looks ahead to see if it really is a quantifier, that is, one of the forms {ddd} {ddd,} or {ddd,ddd} where the ddds are digits. Argument: pointer to the first char after '{' @@ -1628,7 +1625,7 @@ return (*p == CHAR_RIGHT_CURLY_BRACKET); positive value for a simple escape such as \d, or 0 for a data character, which is placed in chptr. A backreference to group n is returned as negative n. On entry, ptr is pointing at the \. On exit, it points the final code unit of the -escape sequence. +escape sequence. Arguments: ptrptr points to the pattern position pointer @@ -1636,7 +1633,7 @@ Arguments: errorcodeptr points to the errorcode variable (containing zero) options the current options bits isclass TRUE if inside a character class - cb compile data block + cb compile data block Returns: zero => a data character positive => a special escape sequence @@ -1669,7 +1666,7 @@ returned immediately. Otherwise further processing is required. */ else if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */ else if ((i = escapes[c - ESCAPES_FIRST]) != 0) - { + { if (i > 0) c = (uint32_t)i; /* Positive is a data character */ else escape = -i; /* Else return a special escape */ } @@ -1695,30 +1692,30 @@ else /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated specially, \u must be followed by four hex digits. Otherwise it is a lowercase u letter. */ - + case CHAR_u: if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else { uint32_t xc; if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ - cc = (cc << 4) | xc; + cc = (cc << 4) | xc; if ((xc = XDIGIT(ptr[3])) == 0xff) break; /* Not a hex digit */ - cc = (cc << 4) | xc; + cc = (cc << 4) | xc; if ((xc = XDIGIT(ptr[4])) == 0xff) break; /* Not a hex digit */ - c = (cc << 4) | xc; + c = (cc << 4) | xc; ptr += 4; if (utf) { if (c > 0x10ffffU) *errorcodeptr = ERR77; - else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; + else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; } - else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77; + else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77; } break; case CHAR_U: - /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an + /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an upper case letter. */ if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; break; @@ -1892,7 +1889,7 @@ else case CHAR_o: if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else - if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else + if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else { ptr += 2; c = 0; @@ -1936,7 +1933,7 @@ else if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ if ((xc = XDIGIT(ptr[2])) == 0xff) break; /* Not a hex digit */ c = (cc << 4) | xc; - ptr += 2; + ptr += 2; } /* End PCRE2_ALT_BSUX handling */ /* Handle \x in Perl's style. \x{ddd} is a character number which can be @@ -1955,10 +1952,10 @@ else { *errorcodeptr = ERR78; break; - } + } c = 0; overflow = FALSE; - + while ((cc = XDIGIT(*ptr)) != 0xff) { ptr++; @@ -1971,7 +1968,7 @@ else { overflow = TRUE; break; - } + } } if (overflow) @@ -1999,10 +1996,10 @@ else c = 0; if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ ptr++; - c = cc; + c = cc; if ((cc = XDIGIT(ptr[1])) == 0xff) break; /* Not a hex digit */ ptr++; - c = (c << 4) | cc; + c = (c << 4) | cc; } /* End of \xdd handling */ } /* End of Perl-style \x handling */ break; @@ -2033,9 +2030,9 @@ else #endif break; - /* Any other alphanumeric following \ is an error. Perl gives an error only - if in warning mode, but PCRE doesn't have a warning mode. */ - + /* Any other alphanumeric following \ is an error. Perl gives an error only + if in warning mode, but PCRE doesn't have a warning mode. */ + default: *errorcodeptr = ERR3; break; @@ -2080,7 +2077,7 @@ Arguments: ptypeptr an unsigned int that is set to the type value pdataptr an unsigned int that is set to the detailed property value errorcodeptr the error code variable - cb the compile data + cb the compile data Returns: TRUE if the type value was found, or FALSE for an invalid type */ @@ -2126,7 +2123,7 @@ else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0) name[0] = c; name[1] = 0; } -else goto ERROR_RETURN; +else goto ERROR_RETURN; *ptrptr = ptr; @@ -2179,13 +2176,13 @@ Returns: pointer to '}' on success; current ptr on error, with errorcodeptr set non-zero */ -static PCRE2_SPTR +static PCRE2_SPTR read_repeat_counts(PCRE2_SPTR p, int *minp, int *maxp, int *errorcodeptr) { int min = 0; int max = -1; -while (IS_DIGIT(*p)) +while (IS_DIGIT(*p)) { min = min * 10 + (int)(*p++ - CHAR_0); if (min > 65535) @@ -2193,14 +2190,14 @@ while (IS_DIGIT(*p)) *errorcodeptr = ERR5; return p; } - } + } if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else { if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) { max = 0; - while(IS_DIGIT(*p)) + while(IS_DIGIT(*p)) { max = max * 10 + (int)(*p++ - CHAR_0); if (max > 65535) @@ -2208,7 +2205,7 @@ if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else *errorcodeptr = ERR5; return p; } - } + } if (max < min) { *errorcodeptr = ERR4; @@ -2242,13 +2239,13 @@ Arguments: Returns: pointer to the opcode for the bracket, or NULL if not found */ -PCRE2_SPTR +PCRE2_SPTR PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number) { for (;;) { register PCRE2_UCHAR c = *code; - + if (c == OP_END) return NULL; /* XCLASS is used for classes that cannot be represented just by a bit @@ -2377,7 +2374,7 @@ Arguments: Returns: pointer to the opcode for OP_RECURSE, or NULL if not found */ -static PCRE2_SPTR +static PCRE2_SPTR find_recurse(PCRE2_SPTR code, BOOL utf) { for (;;) @@ -2845,7 +2842,7 @@ if (start <= 0xff) start = 0xff + 1; if (end >= start) { PCRE2_UCHAR *uchardata = *uchardptr; - + #ifdef SUPPORT_UTF if ((options & PCRE2_UTF) != 0) { @@ -3276,11 +3273,11 @@ for (;; ptr++) { /* ===================================================================*/ /* The branch terminates at string end or | or ) */ - + case CHAR_NULL: - if (ptr < cb->end_pattern) goto NORMAL_CHAR; /* Zero data character */ - /* Fall through */ - + if (ptr < cb->end_pattern) goto NORMAL_CHAR; /* Zero data character */ + /* Fall through */ + case CHAR_VERTICAL_LINE: case CHAR_RIGHT_PARENTHESIS: *firstcuptr = firstcu; @@ -3309,7 +3306,7 @@ for (;; ptr++) previous = NULL; if ((options & PCRE2_MULTILINE) != 0) { - if (firstcuflags == REQ_UNSET) + if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE; *code++ = OP_CIRCM; } @@ -3346,11 +3343,11 @@ for (;; ptr++) opcode is compiled. It may optionally have a bit map for characters < 256, but those above are are explicitly listed afterwards. A flag byte tells whether the bitmap is present, and whether this is a negated class or not. - + An isolated ']' character is not treated specially, so is just another data character. In earlier versions of PCRE that used the original API there was a "JavaScript compatibility mode" in which it gave an error. However, - JavaScript itself has changed in this respect so there is no longer any + JavaScript itself has changed in this respect so there is no longer any need for this special handling. In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is @@ -3460,7 +3457,7 @@ for (;; ptr++) memset(classbits, 0, 32 * sizeof(uint8_t)); - /* Process characters until ] is reached. As the test is at the end of the + /* Process characters until ] is reached. As the test is at the end of the loop, an initial ] is taken as a data character. At the start of the loop, c contains the first code unit of the character. If it is zero, check for the end of the pattern, to allow binary zero as data. */ @@ -3468,13 +3465,13 @@ for (;; ptr++) for(;;) { PCRE2_SPTR oldptr; - + if (c == CHAR_NULL && ptr >= cb->end_pattern) { *errorcodeptr = ERR6; /* Missing terminating ']' */ goto FAILED; } - + #ifdef SUPPORT_UTF if (utf && HAS_EXTRALEN(c)) { /* Braces are required because the */ @@ -3680,7 +3677,7 @@ for (;; ptr++) } else if (escape == ESC_E) goto CONTINUE_CLASS; /* Ignore orphan \E */ - else /* Handle \d-type escapes */ + else /* Handle \d-type escapes */ { register const uint8_t *cbits = cb->cbits; /* Every class contains at least two < 256 characters. */ @@ -3773,17 +3770,17 @@ for (;; ptr++) xclass_has_prop = TRUE; class_has_8bitchar--; /* Undo! */ } - break; -#endif + break; +#endif /* Unrecognized escapes are faulted. */ default: *errorcodeptr = ERR7; goto FAILED; } - + /* Handled \d-type escape */ - + goto CONTINUE_CLASS; } @@ -3976,7 +3973,7 @@ for (;; ptr++) /* For a single, positive character, get the value into mcbuffer, and then we can handle this with the normal one-character code. */ - + mclength = PUTCHAR(c, mcbuffer); goto ONE_CHAR; } /* End of 1-char optimization */ @@ -3986,8 +3983,8 @@ for (;; ptr++) class_has_8bitchar += add_to_class(classbits, &class_uchardata, options, cb, c, c); - - /* Continue to the next character in the class. Closing square bracket + + /* Continue to the next character in the class. Closing square bracket not within \Q..\E ends the class. A NULL character terminates a nested substitution string, but may be a data character in the main pattern (tested at the start of this loop). */ @@ -3998,9 +3995,9 @@ for (;; ptr++) { ptr = nestptr; nestptr = NULL; - c = *(++ptr); - } - if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break; + c = *(++ptr); + } + if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break; } /* End of main class-processing loop */ /* We will need an XCLASS if data has been placed in class_uchardata. In @@ -4281,16 +4278,16 @@ for (;; ptr++) prop_type = previous[1]; prop_value = previous[2]; } - else + else { /* Come here from just above with a character in c */ OUTPUT_SINGLE_REPEAT: prop_type = prop_value = -1; - } - + } + /* At this point we either have prop_type == prop_value == -1 and either - a code point or a character type that is not OP_[NOT]PROP in c, or we - have OP_[NOT]PROP in c and prop_type/prop_value not negative. */ + a code point or a character type that is not OP_[NOT]PROP in c, or we + have OP_[NOT]PROP in c and prop_type/prop_value not negative. */ oldcode = code; /* Save where we were */ code = previous; /* Usually overwrite previous item */ @@ -4343,16 +4340,16 @@ for (;; ptr++) { *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ PUT2INC(code, 0, repeat_min); - + /* Unless repeat_max equals repeat_min, fill in the data for EXACT, and then generate the second opcode. In UTF mode, multi-code-unit characters have their length in c, with the UTF_LENGTH bit as a flag, and the code units in utf_units. For a repeated Unicode property match, there are two extra values that define the required property, and c never has the UTF_LENGTH bit set. */ - + if (repeat_max != repeat_min) - { + { #ifdef MAYBE_UTF_MULTI if (utf && (c & UTF_LENGTH) != 0) { @@ -4360,7 +4357,7 @@ for (;; ptr++) code += c & 7; } else -#endif +#endif { *code++ = c; if (prop_type >= 0) @@ -4369,7 +4366,7 @@ for (;; ptr++) *code++ = prop_value; } } - + /* Now set up the following opcode */ if (repeat_max < 0) *code++ = OP_STAR + repeat_type; else @@ -4385,7 +4382,7 @@ for (;; ptr++) PUT2INC(code, 0, repeat_max); } } - } + } } /* Fill in the character or character type for the final opcode. */ @@ -4405,7 +4402,7 @@ for (;; ptr++) *code++ = prop_type; *code++ = prop_value; } - } + } } /* If previous was a character class or a back reference, we put the repeat @@ -4562,7 +4559,7 @@ for (;; ptr++) just adjust the length as if we had. Do some paranoid checks for potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit integer type when available, otherwise double. */ - + if (lengthptr != NULL) { size_t delta = (repeat_min - 1)*length_prevgroup; @@ -4822,7 +4819,7 @@ for (;; ptr++) } } - /* If previous is OP_FAIL, it was generated by an empty class [] + /* If previous is OP_FAIL, it was generated by an empty class [] (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be generated, that is by (*FAIL) or (?!), set previous to NULL, which gives a "nothing to repeat" error above. We can just ignore the repeat in empty @@ -5231,7 +5228,7 @@ for (;; ptr++) ptr++; } namelen = (int)(ptr - name); - if (lengthptr != NULL && (options & PCRE2_DUPNAMES) != 0) + if (lengthptr != NULL && (options & PCRE2_DUPNAMES) != 0) *lengthptr += IMM2_SIZE; } @@ -5297,7 +5294,7 @@ for (;; ptr++) (slot+IMM2_SIZE)[namelen] != 0) break; count++; } - + if (count > 1) { PUT2(code, 2+LINK_SIZE, offset); @@ -5552,7 +5549,7 @@ for (;; ptr++) if (cb->names_found >= cb->named_group_list_size) { int newsize = cb->named_group_list_size * 2; - named_group *newspace = + named_group *newspace = cb->cx->memctl.malloc(newsize * sizeof(named_group), cb->cx->memctl.memory_data); if (newspace == NULL) @@ -5646,7 +5643,7 @@ for (;; ptr++) /* Count named back references. */ if (!is_recurse) cb->namedrefcount++; - + /* If duplicate names are permitted, we have to allow for a named reference to a duplicated name (this cannot be determined until the second pass). This needs an extra 16-bit data item. */ @@ -5701,7 +5698,7 @@ for (;; ptr++) count++; cslot += cb->name_entry_size; } - + if (count > 1) { if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; @@ -6403,7 +6400,7 @@ for (;; ptr++) /* We have a data character whose value is in c. In UTF-8 mode it may have a value > 127. We set its representation in the length/buffer, and then handle it as a data character. */ - + mclength = PUTCHAR(c, mcbuffer); goto ONE_CHAR; @@ -6536,15 +6533,15 @@ Arguments: errorcodeptr -> pointer to error code variable lookbehind TRUE if this is a lookbehind assertion reset_bracount TRUE to reset the count for each branch - skipunits skip this many code units at start (for brackets and OP_COND) + skipunits skip this many code units at start (for brackets and OP_COND) cond_depth depth of nesting for conditional subpatterns - firstcuptr place to put the first required code unit - firstcuflagsptr place to put the first code unit flags, or a negative number - reqcuptr place to put the last required code unit - reqcuflagsptr place to put the last required code unit flags, or a negative number - bcptr pointer to the chain of currently open branches - cb points to the data block with tables pointers etc. - lengthptr NULL during the real compile phase + firstcuptr place to put the first required code unit + firstcuflagsptr place to put the first code unit flags, or a negative number + reqcuptr place to put the last required code unit + reqcuflagsptr place to put the last required code unit flags, or a negative number + bcptr pointer to the chain of currently open branches + cb points to the data block with tables pointers etc. + lengthptr NULL during the real compile phase points to length accumulator during pre-compile phase Returns: TRUE on success @@ -6554,7 +6551,7 @@ static BOOL compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, PCRE2_SPTR *ptrptr, int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipunits, int cond_depth, uint32_t *firstcuptr, int32_t *firstcuflagsptr, - uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr, + uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr, compile_block *cb, size_t *lengthptr) { PCRE2_SPTR ptr = *ptrptr; @@ -6687,7 +6684,7 @@ for (;;) previously no reqcu, it takes on the value of the old firstcu. */ if (firstcuflags >= 0 && - (firstcuflags != branchfirstcuflags || + (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)) { if (reqcuflags < 0) @@ -6701,7 +6698,7 @@ for (;;) /* If we (now or from before) have no firstcu, a firstcu from the branch becomes a reqcu if there isn't a branch reqcu. */ - if (firstcuflags < 0 && branchfirstcuflags >= 0 && + if (firstcuflags < 0 && branchfirstcuflags >= 0 && branchreqcuflags < 0) { branchreqcu = branchfirstcu; @@ -6852,7 +6849,7 @@ for (;;) bc.current_branch = last_branch = code; code += 1 + LINK_SIZE; } - + /* Advance past the vertical bar */ ptr++; @@ -6994,7 +6991,7 @@ Returns: TRUE or FALSE */ static BOOL -is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, +is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, int atomcount) { do { @@ -7102,7 +7099,7 @@ follow. However, if we end up without a first code unit setting for an unanchored pattern, it is worth scanning the regex to see if there is an initial asserted first code unit. If all branches start with the same asserted code unit, or with a non-conditional bracket all of whose alternatives start -with the same asserted code unit (recurse ad lib), then we return that code +with the same asserted code unit (recurse ad lib), then we return that code unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with REQ_NONE in the flags. @@ -7146,7 +7143,7 @@ do { d = find_firstassertedcu(scode, &dflags, op == OP_ASSERT); if (dflags < 0) return 0; - if (cflags < 0) { c = d; cflags = dflags; } + if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0; break; @@ -7254,7 +7251,7 @@ Arguments: patlen the length of the pattern, or < 0 for zero-terminated options option bits errorptr pointer to errorcode - erroroffset pointer to error offset + erroroffset pointer to error offset ccontext points to a compile context or is NULL Returns: pointer to compiled data block, or NULL on error, @@ -7328,7 +7325,7 @@ if (ccontext == NULL) PRIV(compile_context_init)(&default_context, TRUE); ccontext = &default_context; } - + /* A negative pattern length means "zero-terminated". Otherwise, we make a copy of the pattern and add a zero. */ @@ -7350,7 +7347,7 @@ if (patlen < 0) patlen = PRIV(strlen)(pattern); else copied_pattern[patlen] = 0; pattern = copied_pattern; } - + /* ------------ Initialize the "static" compile data -------------- */ @@ -7407,7 +7404,7 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++) { pso *p = pso_list + i; - + if (PRIV(strncmp_c8)(ptr+skipatstart+2, (char *)(p->name), p->length) == 0) { uint32_t c, pp; @@ -7436,17 +7433,17 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */ c = c*10 + ptr[pp++] - CHAR_0; } - if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS) + if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS) { - errorcode = ERR60; + errorcode = ERR60; goto HAD_ERROR; - } + } if (p->type == PSO_LIMM) limit_match = c; else limit_recursion = c; skipatstart += pp - skipatstart; break; } - break; /* Out of the table scan loop */ + break; /* Out of the table scan loop */ } } if (i >= sizeof(pso_list)/sizeof(pso)) break; /* Out of pso loop */ @@ -7480,16 +7477,16 @@ if (utf) if ((options & PCRE2_NO_UTF_CHECK) == 0 && (errorcode = PRIV(valid_utf)(pattern, -1, erroroffset)) != 0) goto HAD_ERROR; - } - + } + /* Check UCP lockout. */ -if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) == +if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) == (PCRE2_UCP|PCRE2_NEVER_UCP)) { errorcode = ERR75; goto HAD_ERROR; - } + } /* Process the BSR setting. */ @@ -7529,7 +7526,7 @@ switch(newline) errorcode = ERR56; goto HAD_ERROR; } - + /* Pretend to compile the pattern while actually just accumulating the amount of memory required in the 'length' variable. This behaviour is triggered by passing a non-NULL final argument to compile_regex(). We pass a block of @@ -7541,7 +7538,7 @@ On error, errorcode will be set non-zero, so we don't need to look at the result of the function. The initial options have been put into the cb block so that they can be changed if an option setting is found within the regex right at the beginning. Bringing initial option settings outside can help speed up -starting point checks. We still have to pass a separate options variable (the +starting point checks. We still have to pass a separate options variable (the first argument) because that may change as the pattern is processed. */ code = cworkspace; @@ -7550,14 +7547,14 @@ code = cworkspace; (void)compile_regex(cb.external_options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, &length); - + if (errorcode != 0) goto HAD_ERROR; if (length > MAX_PATTERN_SIZE) { errorcode = ERR20; goto HAD_ERROR; } - + /* If there are groups with duplicate names and there are also references by name, we must allow for the possibility of named references to duplicated groups. These require an extra data item each. */ @@ -7570,7 +7567,7 @@ the compiled pattern and names table. Integer overflow should no longer be possible because nowadays we limit the maximum value of cb.names_found and cb.name_entry_size. */ -re_blocksize = sizeof(pcre2_real_code) + +re_blocksize = sizeof(pcre2_real_code) + CU2BYTES(length + cb.names_found * cb.name_entry_size); re = (pcre2_real_code *) ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data); @@ -7595,17 +7592,17 @@ re->first_codeunit = 0; re->last_codeunit = 0; re->bsr_convention = bsr; re->newline_convention = newline; -re->max_lookbehind = +re->max_lookbehind = re->minlength = 0; re->top_bracket = 0; re->top_backref = 0; re->name_entry_size = cb.name_entry_size; re->name_count = cb.names_found; -/* The basic block is immediately followed by the name table, and the compiled +/* The basic block is immediately followed by the name table, and the compiled code follows after that. */ -codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) + +codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) + re->name_entry_size * re->name_count; @@ -7646,7 +7643,7 @@ cb.check_lookbehind = FALSE; cb.open_caps = NULL; /* If any named groups were found, create the name/number table from the list -created in the first pass. If the list was longer than the in-stack list, free +created in the first pass. If the list was longer than the in-stack list, free the heap memory. */ if (cb.names_found > 0) @@ -7726,7 +7723,7 @@ if (cb.hwm > cb.start_workspace) NULL to indicate that forward references have been filled in. */ if (cb.workspace_size > COMPILE_WORK_SIZE) - ccontext->memctl.free((void *)cb.start_workspace, + ccontext->memctl.free((void *)cb.start_workspace, ccontext->memctl.memory_data); cb.start_workspace = NULL; @@ -7744,9 +7741,9 @@ function call. */ if ((options & PCRE2_NO_AUTO_POSSESS) == 0) { - PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; + PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; PRIV(auto_possessify)(temp, utf, &cb); - } + } /* If there were any lookbehind assertions that contained OP_RECURSE (recursions or subroutine calls), a flag is set for them to be checked here, @@ -7800,7 +7797,7 @@ if (errorcode != 0) re = NULL; *errorptr = errorcode; *erroroffset = (int)(ptr - pattern); - goto EXIT; + goto EXIT; } /* Successful compile. If the anchored option was not passed, set it if @@ -7809,9 +7806,9 @@ or anything else, such as starting with non-atomic .* when DOTALL is set and there are no occurrences of *PRUNE or *SKIP. */ if ((re->overall_options & PCRE2_ANCHORED) == 0 && - is_anchored(codestart, 0, &cb, 0)) + is_anchored(codestart, 0, &cb, 0)) re->overall_options |= PCRE2_ANCHORED; - + /* If the pattern is still not anchored and we do not have a first code unit, see if there is one that is asserted (these are not saved during the compile because they can cause conflicts with actual literals that follow). */ @@ -7820,14 +7817,14 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0) { if (firstcuflags < 0) firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE); - + /* Save the data for a first code unit. */ if (firstcuflags >= 0) { re->first_codeunit = firstcu; re->flags |= PCRE2_FIRSTSET; - + /* Handle caseless first code units. */ if ((firstcuflags & REQ_CASELESS) != 0) @@ -7836,20 +7833,20 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0) { if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS; } - - /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In - 8-bit UTF mode, codepoints in the range 128-255 are introductory code - points and cannot have another case. In 16-bit and 32-bit modes, we can + + /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In + 8-bit UTF mode, codepoints in the range 128-255 are introductory code + points and cannot have another case. In 16-bit and 32-bit modes, we can check wide characters when UTF (and therefore UCP) is supported. */ - + #if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 8 - else if (firstcu <= MAX_UTF_CODE_POINT && + else if (firstcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(firstcu) != firstcu) re->flags |= PCRE2_FIRSTCASELESS; -#endif +#endif } } - + /* When there is no first code unit, see if we can set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all branches start with ^ and also when all branches start with non-atomic .* for non-DOTALL matches @@ -7857,19 +7854,19 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0) else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE; } - -/* Handle the "required code unit", if one is set. In the case of an anchored + +/* Handle the "required code unit", if one is set. In the case of an anchored pattern, do this only if it follows a variable length item in the pattern. */ if (reqcuflags >= 0 && - ((re->overall_options & PCRE2_ANCHORED) == 0 || + ((re->overall_options & PCRE2_ANCHORED) == 0 || (reqcuflags & REQ_VARY) != 0)) { re->last_codeunit = reqcu; re->flags |= PCRE2_LASTSET; - + /* Handle caseless required code units as for first code units (above). */ - + if ((reqcuflags & REQ_CASELESS) != 0) { if (reqcu < 128 || (!utf && reqcu < 255)) @@ -7897,14 +7894,14 @@ do } while (*codestart == OP_ALT); -/* Finally, study the compiled pattern to set up information such as a bitmap +/* Finally, study the compiled pattern to set up information such as a bitmap of starting code units and a minimum matching length. */ if (PRIV(study)(re) != 0) { errorcode = ERR31; - goto HAD_ERROR; - } + goto HAD_ERROR; + } /* Control ends up here in all cases. If memory was obtained for a zero-terminated copy of the pattern, remember to free it before returning. */ diff --git a/testdata/testinput1 b/testdata/testinput1 index b8c51fd..dfed98f 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -4912,6 +4912,12 @@ /((?(R1)a+|(?1)b))/ aaaabcde + +/((?(R)a|(?1)))*/ + aaa + +/((?(R)a|(?1)))+/ + aaa /a(*:any name)/mark diff --git a/testdata/testoutput1 b/testdata/testoutput1 index c29b586..2059825 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -8199,6 +8199,16 @@ MK: M aaaabcde 0: aaaab 1: aaaab + +/((?(R)a|(?1)))*/ + aaa + 0: aaa + 1: a + +/((?(R)a|(?1)))+/ + aaa + 0: aaa + 1: a /a(*:any name)/mark