diff --git a/ChangeLog b/ChangeLog index ae26ad8..b797c53 100644 --- a/ChangeLog +++ b/ChangeLog @@ -205,6 +205,11 @@ JIT. subjects from 1000 to 2000 for 8-bit searches, since they use memchr() and are much faster. +46. Arrange for anchored patterns to record and use "first code unit" data, +because this can give a fast "no match" without searching for a "required code +unit". Previously only non-anchored patterns did this. + + Version 10.23 14-February-2017 ------------------------------ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index a8801b3..c4aa14e 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -9632,14 +9632,19 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 && is_anchored(codestart, 0, &cb, 0, FALSE)) re->overall_options |= PCRE2_ANCHORED; -/* If the pattern is still not anchored and we do not have a first code unit, -see if there is one that is asserted (these are not saved during the compile -because they can cause conflicts with actual literals that follow). This code -need not be obeyed if PCRE2_NO_START_OPTIMIZE is set, as the data it would -create will not be used. */ +/* Set up the first code unit or startline flag, the required code unit, and +then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE +is set, as the data it would create will not be used. Note that a first code +unit (but not the startline flag) is useful for anchored patterns because it +can still give a quick "no match" and also avoid searching for a last code +unit. */ -if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0) +if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) { + /* If we do not have a first code unit, see if there is one that is asserted + (these are not saved during the compile because they can cause conflicts with + actual literals that follow). 
*/ + if (firstcuflags < 0) firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE); @@ -9672,52 +9677,50 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0) } } - /* When there is no first code unit, see if we can set the PCRE2_STARTLINE - flag. This is helpful for multiline matches when all branches start with ^ - and also when all branches start with non-atomic .* for non-DOTALL matches - when *PRUNE and SKIP are not present. (There is an option that disables this - case.) */ + /* When there is no first code unit, for non-anchored patterns, see if we can + set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all + branches start with ^ and also when all branches start with non-atomic .* for + non-DOTALL matches when *PRUNE and SKIP are not present. (There is an option + that disables this case.) */ - else if (is_startline(codestart, 0, &cb, 0, FALSE)) + else if ((re->overall_options & PCRE2_ANCHORED) == 0 && + is_startline(codestart, 0, &cb, 0, FALSE)) re->flags |= PCRE2_STARTLINE; - } -/* Handle the "required code unit", if one is set. In the case of an anchored -pattern, do this only if it follows a variable length item in the pattern. -Again, skip this if PCRE2_NO_START_OPTIMIZE is set. */ + /* Handle the "required code unit", if one is set. In the case of an anchored + pattern, do this only if it follows a variable length item in the pattern. */ -if (reqcuflags >= 0 && - ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0 || - (reqcuflags & REQ_VARY) != 0)) - { - re->last_codeunit = reqcu; - re->flags |= PCRE2_LASTSET; - - /* Handle caseless required code units as for first code units (above). 
*/ - - if ((reqcuflags & REQ_CASELESS) != 0) + if (reqcuflags >= 0 && + ((re->overall_options & PCRE2_ANCHORED) == 0 || + (reqcuflags & REQ_VARY) != 0)) { - if (reqcu < 128 || (!utf && reqcu < 255)) + re->last_codeunit = reqcu; + re->flags |= PCRE2_LASTSET; + + /* Handle caseless required code units as for first code units (above). */ + + if ((reqcuflags & REQ_CASELESS) != 0) { - if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; - } + if (reqcu < 128 || (!utf && reqcu < 255)) + { + if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; + } #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu) - re->flags |= PCRE2_LASTCASELESS; + else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu) + re->flags |= PCRE2_LASTCASELESS; #endif + } } - } -/* Finally, unless PCRE2_NO_START_OPTIMIZE is set, study the compiled pattern -to set up information such as a bitmap of starting code units and a minimum -matching length. */ + /* Finally, study the compiled pattern to set up information such as a bitmap + of starting code units and a minimum matching length. */ -if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 && - PRIV(study)(re) != 0) - { - errorcode = ERR31; - goto HAD_CB_ERROR; - } + if (PRIV(study)(re) != 0) + { + errorcode = ERR31; + goto HAD_CB_ERROR; + } + } /* End of start-of-match optimizations. */ /* Control ends up here in all cases. When running under valgrind, make a pattern's terminating zero defined again. If memory was obtained for the parsed diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index 1fd1550..7fe6dfe 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -3341,34 +3341,27 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) } #endif /* SUPPORT_UNICODE */ -/* Set up the first code unit to match, if available. 
The first_codeunit value -is never set for an anchored regular expression, but the anchoring may be -forced at run time, so we have to test for anchoring. The first code unit may -be unset for an unanchored pattern, of course. If there's no first code unit -there may be a bitmap of possible first characters. */ +/* Set up the first code unit to match, if available. If there's no first code +unit there may be a bitmap of possible first characters. */ -if (!anchored) +if ((re->flags & PCRE2_FIRSTSET) != 0) { - if ((re->flags & PCRE2_FIRSTSET) != 0) + has_first_cu = TRUE; + first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); + if ((re->flags & PCRE2_FIRSTCASELESS) != 0) { - has_first_cu = TRUE; - first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); - if ((re->flags & PCRE2_FIRSTCASELESS) != 0) - { - first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu); + first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && first_cu > 127) - first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); + if (utf && first_cu > 127) + first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); #endif - } } - else - if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) - start_bits = re->start_bitmap; } +else + if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) + start_bits = re->start_bitmap; -/* For anchored or unanchored matches, there may be a "last known required -character" set. */ +/* There may be a "last known required code unit" set. */ if ((re->flags & PCRE2_LASTSET) != 0) { @@ -3414,8 +3407,8 @@ for (;;) /* If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. That is, the match must be before or at the first newline. Implement this by temporarily adjusting end_subject so that - we stop the optimization scans at a newline. If the match fails at the - newline, later code breaks this loop. 
*/ + we stop the optimization scans for a first code unit at a newline. If the + match fails at the newline, later code breaks this loop. */ if (firstline) { @@ -3434,70 +3427,138 @@ for (;;) while (t < mb->end_subject && !IS_NEWLINE(t)) t++; end_subject = t; } + + /* Anchored: check the first code unit if one is recorded. This may seem + pointless but it can help in detecting a no match case without scanning for + the required code unit. */ - /* Advance to a unique first code unit if there is one. */ - - if (has_first_cu) + if (anchored) { - PCRE2_UCHAR smc; - if (first_cu != first_cu2) - while (start_match < end_subject && - (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2) - start_match++; - else - while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu) - start_match++; - } - - /* Or to just after a linebreak for a multiline match */ - - else if (startline) - { - if (start_match > mb->start_subject + start_offset) + if (has_first_cu || start_bits != NULL) { -#ifdef SUPPORT_UNICODE - if (utf) + BOOL ok = start_match < end_subject; + if (ok) { - while (start_match < end_subject && !WAS_NEWLINE(start_match)) + PCRE2_UCHAR c = UCHAR21TEST(start_match); + ok = has_first_cu && (c == first_cu || c == first_cu2); + if (!ok && start_bits != NULL) { - start_match++; - ACROSSCHAR(start_match < end_subject, *start_match, - start_match++); +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (c > 255) c = 255; +#endif + ok = (start_bits[c/8] & (1 << (c&7))) != 0; } } - else -#endif - while (start_match < end_subject && !WAS_NEWLINE(start_match)) - start_match++; - - /* If we have just passed a CR and the newline option is ANY or - ANYCRLF, and we are now at a LF, advance the match position by one more - code unit. 
*/ - - if (start_match[-1] == CHAR_CR && - (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && - start_match < end_subject && - UCHAR21TEST(start_match) == CHAR_NL) - start_match++; + if (!ok) break; } } - /* Or to a non-unique first code unit if any have been identified. The - bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all - code units greater than 254 set the 255 bit. */ + /* Not anchored. Advance to a unique first code unit if there is one. In + 8-bit mode, the use of memchr() gives a big speed up, even though we have + to call it twice in caseless mode, in order to find the earliest occurrence + of the character in either of its cases. */ - else if (start_bits != NULL) + else { - while (start_match < end_subject) + if (has_first_cu) { - uint32_t c = UCHAR21TEST(start_match); + if (first_cu != first_cu2) /* Caseless */ + { #if PCRE2_CODE_UNIT_WIDTH != 8 - if (c > 255) c = 255; + PCRE2_UCHAR smc; + while (start_match < end_subject && + (smc = UCHAR21TEST(start_match)) != first_cu && + smc != first_cu2) + start_match++; +#else /* 8-bit code units */ + PCRE2_SPTR pp1 = + memchr(start_match, first_cu, end_subject-start_match); + PCRE2_SPTR pp2 = + memchr(start_match, first_cu2, end_subject-start_match); + if (pp1 == NULL) + start_match = (pp2 == NULL)? end_subject : pp2; + else + start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; #endif - if ((start_bits[c/8] & (1 << (c&7))) != 0) break; - start_match++; + } + + /* The caseful case */ + + else + { +#if PCRE2_CODE_UNIT_WIDTH != 8 + while (start_match < end_subject && UCHAR21TEST(start_match) != + first_cu) + start_match++; +#else + start_match = memchr(start_match, first_cu, end_subject - start_match); + if (start_match == NULL) start_match = end_subject; +#endif + } + + /* If we can't find the first code unit, break the bumpalong loop, + to force a match failure, except when doing partial matching, when we + let the next cycle run at the end of the subject. 
To see why, consider + the pattern /(?<=abc)def/, which partially matches "abc", even though + the string does not contain the starting character "d". */ + + if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 && + start_match >= end_subject) + break; } - } + + /* If there's no first code unit, advance to just after a linebreak for a + multiline match if required. */ + + else if (startline) + { + if (start_match > mb->start_subject + start_offset) + { +#ifdef SUPPORT_UNICODE + if (utf) + { + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + { + start_match++; + ACROSSCHAR(start_match < end_subject, *start_match, + start_match++); + } + } + else +#endif + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + start_match++; + + /* If we have just passed a CR and the newline option is ANY or + ANYCRLF, and we are now at a LF, advance the match position by one + more code unit. */ + + if (start_match[-1] == CHAR_CR && + (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && + start_match < end_subject && + UCHAR21TEST(start_match) == CHAR_NL) + start_match++; + } + } + + /* If there's no first code unit or a requirement for a multiline line + start, advance to a non-unique first code unit if any have been + identified. The bitmap contains only 256 bits. When code units are 16 or + 32 bits wide, all code units greater than 254 set the 255 bit. */ + + else if (start_bits != NULL) + { + while (start_match < end_subject) + { + uint32_t c = UCHAR21TEST(start_match); +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (c > 255) c = 255; +#endif + if ((start_bits[c/8] & (1 << (c&7))) != 0) break; + start_match++; + } + } + } /* End of first code unit handling */ /* Restore fudged end_subject */ diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 890b961..2461da1 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -270,7 +270,7 @@ pcre2_callout_block cb; *lengthptr = (*Fecode == OP_CALLOUT)? 
PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE); - + if (mb->callout == NULL) return 0; /* No callout function provided */ /* The original matching code (pre 10.30) worked directly with the ovector @@ -279,11 +279,11 @@ ovector is in the backtracking frame, it no longer needs to reserve space for the overall match offsets (which would waste space in the frame). For backward compatibility, however, we pass capture_top and offset_vector to the callout as if for the extended ovector, and we ensure that the first two slots are unset -by preserving and restoring their current contents. Picky compilers complain if -references such as Fovector[-2] are use directly, so we set up a separate +by preserving and restoring their current contents. Picky compilers complain if +references such as Fovector[-2] are used directly, so we set up a separate pointer. */ -callout_ovector = (PCRE2_SIZE *)(Fovector) - 2; +callout_ovector = (PCRE2_SIZE *)(Fovector) - 2; cb.version = 1; cb.capture_top = (uint32_t)Foffset_top/2 + 1; @@ -935,8 +935,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode); /* ===================================================================== */ /* Match a single character, caselessly. If we are at the end of the - subject, give up immediately. We get here only when the pattern character - has at most one other case. Characters with more than two cases are coded + subject, give up immediately. We get here only when the pattern character + has at most one other case. Characters with more than two cases are coded as OP_PROP with the pseudo-property PT_CLIST. */ case OP_CHARI: @@ -954,7 +954,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); GETCHARLEN(fc, Fecode, Flength); /* If the pattern character's value is < 128, we know that its other case - (if any) is also < 128 (and therefore only one code unit long in all + (if any) is also < 128 (and therefore only one code unit long in all code-unit widths), so we can use the fast lookup table. 
We checked above that there is at least one character left in the subject. */ @@ -966,7 +966,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); Feptr++; } - /* Otherwise we must pick up the subject character and use Unicode + /* Otherwise we must pick up the subject character and use Unicode property support to test its other case. Note that we cannot use the value of "Flength" to check for sufficient bytes left, because the other case of the character may have more or fewer code units. */ @@ -3056,7 +3056,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } Feptr += Lmin; break; - + /* This OP_ANYBYTE case will never be reached because \C gets turned into OP_ALLANY in non-UTF mode. Cut out the code so that coverage reports don't complain about it's never being used. */ @@ -5352,8 +5352,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode); (char *)assert_accept_frame + offsetof(heapframe, ovector), assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); Foffset_top = assert_accept_frame->offset_top; - - /* Fall through */ + + /* Fall through */ /* In the case of a match, the captures have already been put into the current frame. */ @@ -5650,7 +5650,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH); if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS; - /* Fall through */ + /* Fall through */ /* Unconditional end of subject assertion (\z) */ case OP_EOD: @@ -6280,7 +6280,7 @@ The last of these is changed within the match() function if the frame vector has to be expanded. We therefore put it into the match block so that it is correct when calling match() more than once for non-anchored patterns. 
*/ -frame_size = offsetof(heapframe, ovector) + +frame_size = offsetof(heapframe, ovector) + re->top_bracket * 2 * sizeof(PCRE2_SIZE); /* Limits set in the pattern override the match context only if they are @@ -6333,33 +6333,26 @@ mb->lcc = re->tables + lcc_offset; mb->fcc = re->tables + fcc_offset; mb->ctypes = re->tables + ctypes_offset; -/* Set up the first code unit to match, if available. The first_codeunit value -is never set for an anchored regular expression, but the anchoring may be -forced at run time, so we have to test for anchoring. The first code unit may -be unset for an unanchored pattern, of course. If there's no first code unit -there may be a bitmap of possible first characters. */ +/* Set up the first code unit to match, if available. If there's no first code +unit there may be a bitmap of possible first characters. */ -if (!anchored) +if ((re->flags & PCRE2_FIRSTSET) != 0) { - if ((re->flags & PCRE2_FIRSTSET) != 0) + has_first_cu = TRUE; + first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); + if ((re->flags & PCRE2_FIRSTCASELESS) != 0) { - has_first_cu = TRUE; - first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit); - if ((re->flags & PCRE2_FIRSTCASELESS) != 0) - { - first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); + first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu); + if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu); #endif - } } - else - if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) - start_bits = re->start_bitmap; } +else + if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0) + start_bits = re->start_bitmap; -/* For anchored or unanchored matches, there may be a "last known required -character" set. */ +/* There may also be a "last known required character" set. 
*/ if ((re->flags & PCRE2_LASTSET) != 0) { @@ -6398,8 +6391,8 @@ for(;;) /* If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. That is, the match must be before or at the first newline. Implement this by temporarily adjusting end_subject so that - we stop the optimization scans at a newline. If the match fails at the - newline, later code breaks this loop. */ + we stop the optimization scans for a first code unit at a newline. If the + match fails at the newline, later code breaks this loop. */ if (firstline) { @@ -6419,107 +6412,143 @@ for(;;) end_subject = t; } - /* Advance to a unique first code unit if there is one. In 8-bit mode, the - use of memchr() gives a big speed up, even though we have to call it twice - in caseless mode, in order to find the first occurrence of the character in - either of its cases. */ + /* Anchored: check the first code unit if one is recorded. This may seem + pointless but it can help in detecting a no match case without scanning for + the required code unit. */ - if (has_first_cu) + if (anchored) { - if (first_cu != first_cu2) /* Caseless */ + if (has_first_cu || start_bits != NULL) { -#if PCRE2_CODE_UNIT_WIDTH != 8 - PCRE2_UCHAR smc; - while (start_match < end_subject && - (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2) - start_match++; -#else /* 8-bit code units */ - PCRE2_SPTR pp1 = memchr(start_match, first_cu, end_subject-start_match); - PCRE2_SPTR pp2 = memchr(start_match, first_cu2, end_subject-start_match); - if (pp1 == NULL) - start_match = (pp2 == NULL)? end_subject : pp2; - else - start_match = (pp2 == NULL || pp1 < pp2)? 
pp1 : pp2; -#endif - } - - /* The caseful case */ - - else - { -#if PCRE2_CODE_UNIT_WIDTH != 8 - while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu) - start_match++; -#else - start_match = memchr(start_match, first_cu, end_subject - start_match); - if (start_match == NULL) start_match = end_subject; -#endif - } - - /* If we can't find the required code unit, break the bumpalong loop, to - force a match failure, except when doing partial matching, when we let - the next cycle run at the end of the subject. To see why, consider the - pattern /(?<=abc)def/, which partially matches "abc", even though the - string does not contain the starting character "d". */ - - if (!mb->partial && start_match >= end_subject) - { - rc = MATCH_NOMATCH; - break; - } - } - - /* If there's no first code unit, advance to just after a linebreak for a - multiline match if required. */ - - else if (startline) - { - if (start_match > mb->start_subject + start_offset) - { -#ifdef SUPPORT_UNICODE - if (utf) + BOOL ok = start_match < end_subject; + if (ok) { - while (start_match < end_subject && !WAS_NEWLINE(start_match)) + PCRE2_UCHAR c = UCHAR21TEST(start_match); + ok = has_first_cu && (c == first_cu || c == first_cu2); + if (!ok && start_bits != NULL) { - start_match++; - ACROSSCHAR(start_match < end_subject, *start_match, - start_match++); +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (c > 255) c = 255; +#endif + ok = (start_bits[c/8] & (1 << (c&7))) != 0; } } - else -#endif - while (start_match < end_subject && !WAS_NEWLINE(start_match)) - start_match++; - - /* If we have just passed a CR and the newline option is ANY or - ANYCRLF, and we are now at a LF, advance the match position by one more - code unit. 
*/ - - if (start_match[-1] == CHAR_CR && - (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && - start_match < end_subject && - UCHAR21TEST(start_match) == CHAR_NL) - start_match++; + if (!ok) + { + rc = MATCH_NOMATCH; + break; + } } } - /* If there's no first code unit or a requirement for a multiline line - start, advance to a non-unique first code unit if any have been identified. - The bitmap contains only 256 bits. When code units are 16 or 32 bits wide, - all code units greater than 254 set the 255 bit. */ + /* Not anchored. Advance to a unique first code unit if there is one. In + 8-bit mode, the use of memchr() gives a big speed up, even though we have + to call it twice in caseless mode, in order to find the earliest occurrence + of the character in either of its cases. */ - else if (start_bits != NULL) + else { - while (start_match < end_subject) + if (has_first_cu) { - uint32_t c = UCHAR21TEST(start_match); + if (first_cu != first_cu2) /* Caseless */ + { #if PCRE2_CODE_UNIT_WIDTH != 8 - if (c > 255) c = 255; + PCRE2_UCHAR smc; + while (start_match < end_subject && + (smc = UCHAR21TEST(start_match)) != first_cu && + smc != first_cu2) + start_match++; +#else /* 8-bit code units */ + PCRE2_SPTR pp1 = + memchr(start_match, first_cu, end_subject-start_match); + PCRE2_SPTR pp2 = + memchr(start_match, first_cu2, end_subject-start_match); + if (pp1 == NULL) + start_match = (pp2 == NULL)? end_subject : pp2; + else + start_match = (pp2 == NULL || pp1 < pp2)? 
pp1 : pp2; #endif - if ((start_bits[c/8] & (1 << (c&7))) != 0) break; - start_match++; + } + + /* The caseful case */ + + else + { +#if PCRE2_CODE_UNIT_WIDTH != 8 + while (start_match < end_subject && UCHAR21TEST(start_match) != + first_cu) + start_match++; +#else + start_match = memchr(start_match, first_cu, end_subject - start_match); + if (start_match == NULL) start_match = end_subject; +#endif + } + + /* If we can't find the first code unit, break the bumpalong loop, + to force a match failure, except when doing partial matching, when we + let the next cycle run at the end of the subject. To see why, consider + the pattern /(?<=abc)def/, which partially matches "abc", even though + the string does not contain the starting character "d". */ + + if (!mb->partial && start_match >= end_subject) + { + rc = MATCH_NOMATCH; + break; + } } - } + + /* If there's no first code unit, advance to just after a linebreak for a + multiline match if required. */ + + else if (startline) + { + if (start_match > mb->start_subject + start_offset) + { +#ifdef SUPPORT_UNICODE + if (utf) + { + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + { + start_match++; + ACROSSCHAR(start_match < end_subject, *start_match, + start_match++); + } + } + else +#endif + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + start_match++; + + /* If we have just passed a CR and the newline option is ANY or + ANYCRLF, and we are now at a LF, advance the match position by one + more code unit. */ + + if (start_match[-1] == CHAR_CR && + (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) && + start_match < end_subject && + UCHAR21TEST(start_match) == CHAR_NL) + start_match++; + } + } + + /* If there's no first code unit or a requirement for a multiline line + start, advance to a non-unique first code unit if any have been + identified. The bitmap contains only 256 bits. When code units are 16 or + 32 bits wide, all code units greater than 254 set the 255 bit. 
*/ + + else if (start_bits != NULL) + { + while (start_match < end_subject) + { + uint32_t c = UCHAR21TEST(start_match); +#if PCRE2_CODE_UNIT_WIDTH != 8 + if (c > 255) c = 255; +#endif + if ((start_bits[c/8] & (1 << (c&7))) != 0) break; + start_match++; + } + } + } /* End first code unit handling */ /* Restore fudged end_subject */ diff --git a/src/pcre2_study.c b/src/pcre2_study.c index bfd651a..b926867 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -799,7 +799,7 @@ if (caseless) if (c > 0xff) SET_BIT(0xff); else SET_BIT(c); #endif } - else + else #endif /* SUPPORT_UNICODE */ /* Not UTF */ @@ -953,7 +953,6 @@ do case OP_ALLANY: case OP_ANY: case OP_ANYBYTE: - case OP_CIRC: case OP_CIRCM: case OP_CLOSE: case OP_COMMIT: @@ -1021,6 +1020,13 @@ do case OP_THEN_ARG: return SSB_FAIL; + /* OP_CIRC happens only at the start of an anchored branch (multiline ^ + uses OP_CIRCM). Skip over it. */ + + case OP_CIRC: + tcode += PRIV(OP_lengths)[OP_CIRC]; + break; + /* A "real" property test implies no starting bits, but the fake property PT_CLIST identifies a list of characters. These lists are short, as they are used for characters with more than one "other case", so there is no @@ -1450,7 +1456,7 @@ do #endif /* It seems that the fall through comment must be outside the #ifdef if it is to avoid the gcc compiler warning. */ - + /* Fall through */ /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are @@ -1579,12 +1585,11 @@ BOOL utf = (re->overall_options & PCRE2_UTF) != 0; code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) + re->name_entry_size * re->name_count; -/* For an anchored pattern, or an unanchored pattern that has a first code -unit, or a multiline pattern that matches only at "line start", there is no -point in seeking a list of starting code units. */ +/* For a pattern that has a first code unit, or a multiline pattern that +matches only at "line start", there is no point in seeking a list of starting +code units. 
*/ -if ((re->overall_options & PCRE2_ANCHORED) == 0 && - (re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0) +if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0) { int rc = set_start_bits(re, code, utf); if (rc == SSB_UNKNOWN) return 1; diff --git a/testdata/testinput10 b/testdata/testinput10 index da36e52..2892b42 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -466,5 +466,14 @@ /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes \x{dfff}\x{df01}\=no_utf_check + +# This has different starting code units in 8-bit mode. + +/^[^ab]/IB,utf + c + \x{ff} + \x{100} +\= Expect no match + aaa # End of testinput10 diff --git a/testdata/testinput12 b/testdata/testinput12 index 482c151..09df9fa 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -373,4 +373,13 @@ /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes \x{dfff}\x{df01}\=no_utf_check +# This has different starting code units in 8-bit mode. + +/^[^ab]/IB,utf + c + \x{ff} + \x{100} +\= Expect no match + aaa + # End of testinput12 diff --git a/testdata/testinput2 b/testdata/testinput2 index 77b0a1a..a22a6ca 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -5256,6 +5256,9 @@ a)"xI XAB /^(?!A(?C1)B)C/ + ABC\=callout_error=1,no_jit + +/^(?!A(?C1)B)C/no_start_optimize ABC\=callout_error=1 /^(?(?!A(?C1)B)C)/ diff --git a/testdata/testinput5 b/testdata/testinput5 index 3931b6c..83e7081 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -120,13 +120,6 @@ \x{ff} \x{100} -/^[^ab]/IB,utf - c - \x{ff} - \x{100} -\= Expect no match - aaa - /\x{100}*(\d+|"(?1)")/utf 1234 "1234" @@ -190,7 +183,10 @@ /\w/utf \x{100}X -/^\ሴ/IB,utf +# Use no_start_optimize because the first code unit is different in 8-bit from +# the wider modes. 
+ +/^\ሴ/IB,utf,no_start_optimize /()()()()()()()()()() ()()()()()()()()()() diff --git a/testdata/testoutput10 b/testdata/testoutput10 index f5910ee..f6aeeb9 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -1585,5 +1585,39 @@ Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP), /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes \x{dfff}\x{df01}\=no_utf_check 0: \x{dfff}\x{df01} + +# This has different starting code units in 8-bit mode. + +/^[^ab]/IB,utf +------------------------------------------------------------------ + Bra + ^ + [\x00-`c-\xff] (neg) + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Compile options: utf +Overall options: anchored utf +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 + 5 6 7 8 9 : ; < = > ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y + Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f + \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 + \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf + \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee + \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd + \xfe \xff +Subject length lower bound = 1 + c + 0: c + \x{ff} + 0: \x{ff} + \x{100} + 0: \x{100} +\= Expect no match + aaa +No match # End of testinput10 diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index 2ea6eb0..1078042 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -1433,4 +1433,42 @@ Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowe Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode \x{dfff}\x{df01}\=no_utf_check +# This has different starting code units in 8-bit mode. + +/^[^ab]/IB,utf +------------------------------------------------------------------ + Bra + ^ + [\x00-`c-\xff] (neg) + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Compile options: utf +Overall options: anchored utf +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 + 5 6 7 8 9 : ; < = > ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y + Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f + \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e + \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d + \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac + \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb + \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca + \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 + \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 + \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 + \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff +Subject length lower bound = 1 + c + 0: c + \x{ff} + 0: \x{ff} + \x{100} + 0: \x{100} +\= Expect no match + aaa +No match + # End of testinput12 diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index b171379..25848ea 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1425,4 +1425,42 @@ No match \x{dfff}\x{df01}\=no_utf_check 0: \x{dfff}\x{df01} +# This has different starting code units in 8-bit mode. + +/^[^ab]/IB,utf +------------------------------------------------------------------ + Bra + ^ + [\x00-`c-\xff] (neg) + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Compile options: utf +Overall options: anchored utf +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 + 5 6 7 8 9 : ; < = > ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y + Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f + \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e + \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d + \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac + \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb + \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca + \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 + \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 + \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 + \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff +Subject length lower bound = 1 + c + 0: c + \x{ff} + 0: \x{ff} + \x{100} + 0: \x{100} +\= Expect no match + aaa +No match + # End of testinput12 diff --git a/testdata/testoutput17 b/testdata/testoutput17 index 75dce10..a0606a7 100644 --- a/testdata/testoutput17 +++ b/testdata/testoutput17 @@ -368,6 +368,7 @@ No match Capturing subpattern count = 0 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 6 JIT compilation was successful #pop jitverify @@ -379,6 +380,7 @@ JIT compilation was successful Capturing subpattern count = 0 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 6 JIT compilation was successful #save testsaved1 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 5db311c..ef71e50 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -72,6 +72,7 @@ No match Capturing subpattern count = 0 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 3 abc 0: abc @@ -110,6 +111,7 @@ Subject length lower bound = 2 Capturing subpattern count = 0 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 3 abc 
0: abc @@ -339,6 +341,7 @@ Subject length lower bound = 19 /the quick brown fox/I,anchored Capturing subpattern count = 0 Options: anchored +First code unit = 't' Subject length lower bound = 19 the quick brown fox 0: the quick brown fox @@ -351,6 +354,7 @@ Failed: error 111 at offset 4: unrecognized character after (? or (?- /^abc|def/I Capturing subpattern count = 0 +Starting code units: a d Subject length lower bound = 3 abcdef 0: abc @@ -495,12 +499,14 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +First code unit = '1' Subject length lower bound = 4 /(^b|(?i)^d)/I Capturing subpattern count = 1 Compile options: Overall options: anchored +Starting code units: D b d Subject length lower bound = 1 /(?s).*/I @@ -624,6 +630,7 @@ Capturing subpattern count = 0 Max lookbehind = 1 Compile options: multiline Overall options: anchored multiline +First code unit = 'a' Subject length lower bound = 3 /^abc/Im @@ -637,6 +644,7 @@ Subject length lower bound = 3 Capturing subpattern count = 5 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 3 aaaaabbbbbcccccdef 0: aaaaabbbbbcccccdef @@ -808,6 +816,7 @@ Capturing subpattern count = 1 Max back reference = 1 Compile options: Overall options: anchored +Starting code units: a Subject length lower bound = 4 \= Expect no match aaaa @@ -1004,6 +1013,7 @@ Subject length lower bound = 16 Capturing subpattern count = 3 Compile options: Overall options: anchored +Starting code units: a b Subject length lower bound = 4 adef\=get=1,get=2,get=3,get=4,getall 0: adef @@ -1042,6 +1052,7 @@ Get substring 4 failed (-49): unknown substring Capturing subpattern count = 0 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 7 abc\00def\=copy=0,getall 0: abc\x00def @@ -1227,6 +1238,7 @@ Subject length lower bound = 3 Capturing subpattern count = 0 Compile options: Overall options: anchored +First code 
unit = 'i' Subject length lower bound = 3 ississippi 0: iss @@ -1286,6 +1298,7 @@ Capturing subpattern count = 0 Contains explicit CR or LF match Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 3 ab\nab\ncd 0: ab\x0a @@ -1776,6 +1789,8 @@ Subject length lower bound = 2 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P + Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z Subject length lower bound = 1 /^[[:^alnum:]]/IB @@ -1789,6 +1804,18 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = > + ? @ [ \ ] ^ _ ` { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 + \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 + \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 + \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 + \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 + \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 + \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 + \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 + \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /^[[:alpha:]]/IB @@ -1802,6 +1829,8 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z + a b c d e f g h i j k l m n o p q r s t u v w x y z Subject length lower bound = 1 /^[[:^alpha:]]/IB @@ -1815,6 
+1844,19 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 + 5 6 7 8 9 : ; < = > ? @ [ \ ] ^ _ ` { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 + \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 + \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 + \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 + \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 + \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf + \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde + \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed + \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc + \xfd \xfe \xff Subject length lower bound = 1 /[_[:alpha:]]/I @@ -1834,6 +1876,12 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 + 5 6 7 8 9 : ; < = > ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y + Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ + \x7f Subject length lower bound = 1 /^[[:^ascii:]]/IB @@ -1847,6 +1895,15 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a + \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 + \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 + \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 + \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 + \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 + \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 + \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 + \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /^[[:blank:]]/IB @@ -1860,6 +1917,7 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: \x09 \x20 Subject length lower bound = 1 /^[[:^blank:]]/IB @@ -1873,6 +1931,20 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0a \x0b + \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a + \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 + : ; < = > ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ + _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 + \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f + \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e + \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad + \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc + \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb + \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda + \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 + \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 + \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /[\n\x0b\x0c\x0d[:blank:]]/I @@ -1892,6 +1964,9 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x7f Subject length lower bound = 1 /^[[:digit:]]/IB @@ -1905,6 +1980,7 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: 0 1 2 3 4 5 6 7 8 9 Subject length lower bound = 1 /^[[:graph:]]/IB @@ -1918,6 +1994,9 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : + ; < = > ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ + ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ Subject length lower bound = 1 /^[[:lower:]]/IB @@ -1931,6 +2010,7 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: a b c d e f g h i j k l m n o p q r s t u v w x y z Subject length lower bound = 1 /^[[:print:]]/IB @@ -1944,6 +2024,9 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 + 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] + ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ Subject length lower bound = 1 /^[[:punct:]]/IB @@ -1957,6 +2040,8 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: ! " # $ % & ' ( ) * + , - . / : ; < = > ? 
@ [ \ ] ^ + _ ` { | } ~ Subject length lower bound = 1 /^[[:space:]]/IB @@ -1970,6 +2055,7 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: \x09 \x0a \x0b \x0c \x0d \x20 Subject length lower bound = 1 /^[[:upper:]]/IB @@ -1983,6 +2069,7 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z Subject length lower bound = 1 /^[[:xdigit:]]/IB @@ -1996,6 +2083,7 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F a b c d e f Subject length lower bound = 1 /^[[:word:]]/IB @@ -2009,6 +2097,8 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P + Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z Subject length lower bound = 1 /^[[:^cntrl:]]/IB @@ -2022,6 +2112,18 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 + 9 : ; < = > ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] + ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x80 \x81 + \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 + \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f + \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae + \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd + \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc + \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb + \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea + \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 + \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /^[12[:^digit:]]/IB @@ -2035,6 +2137,20 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 1 2 : ; < + = > ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a + b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 + \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 + \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 + \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf + \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe + \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd + \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc + \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb + \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa + \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /^[[:^blank:]]/IB @@ -2048,6 +2164,20 @@ Subject length lower bound = 1 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0a \x0b + \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a + \x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 + : ; < = > ? 
@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ + _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 + \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f + \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e + \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad + \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc + \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb + \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda + \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 + \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 + \xf9 \xfa \xfb \xfc \xfd \xfe \xff Subject length lower bound = 1 /[01[:alpha:]%]/IB @@ -2418,6 +2548,7 @@ Subject length lower bound = 4 Capturing subpattern count = 2 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 1 aba 0: aba @@ -2428,6 +2559,7 @@ Subject length lower bound = 1 Capturing subpattern count = 2 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 aabbaa 0: aabbaa @@ -2438,6 +2570,7 @@ Subject length lower bound = 2 Capturing subpattern count = 2 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 aabbaa 0: aabbaa @@ -2448,6 +2581,7 @@ Subject length lower bound = 2 Capturing subpattern count = 2 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 aabbaa 0: aabbaa @@ -2458,6 +2592,7 @@ Subject length lower bound = 2 Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 aabbaa 0: aabbaa @@ -2467,6 +2602,7 @@ Subject length lower bound = 2 Capturing subpattern count = 3 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower 
bound = 2 aabbaa 0: aabbaa @@ -2478,6 +2614,7 @@ Subject length lower bound = 2 Capturing subpattern count = 2 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 aabbaa 0: aabbaa @@ -2488,6 +2625,7 @@ Subject length lower bound = 2 Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 aabbaa 0: aabbaa @@ -2497,6 +2635,7 @@ Subject length lower bound = 2 Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 aabbbaa 0: aabbbaa @@ -2506,6 +2645,7 @@ Subject length lower bound = 2 Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 aabbbaa 0: aabbbaa @@ -2515,6 +2655,7 @@ Subject length lower bound = 2 Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 aabbaa 0: aabbaa @@ -2524,6 +2665,7 @@ Subject length lower bound = 2 Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 aabbbaa 0: aabbbaa @@ -2533,6 +2675,7 @@ Subject length lower bound = 2 Capturing subpattern count = 3 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 aabbbaa 0: aabbbaa @@ -2544,6 +2687,7 @@ Subject length lower bound = 2 Capturing subpattern count = 3 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 aabbbbaa 0: aabbbbaa @@ -3052,6 +3196,7 @@ Subject length lower bound = 3 Capturing subpattern count = 5 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 3 /^x(?U)a+b/IB @@ -3067,6 +3212,7 @@ Subject length lower bound = 3 Capturing subpattern count = 0 Compile options: Overall options: anchored +First code unit = 'x' Last code unit = 'b' Subject 
length lower bound = 3 @@ -3085,6 +3231,7 @@ Subject length lower bound = 3 Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = 'x' Last code unit = 'b' Subject length lower bound = 3 @@ -3725,6 +3872,7 @@ Subject length lower bound = 3 Capturing subpattern count = 0 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 3 /(?C)a|b/I @@ -3785,6 +3933,7 @@ No match Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = '>' Last code unit = '<' Subject length lower bound = 10 >abc>123 Overall options: anchored +Starting code units: ( - 0 1 2 3 4 5 6 7 8 9 Subject length lower bound = 1 12 0: 12 @@ -3854,6 +4004,7 @@ No match Capturing subpattern count = 2 Compile options: Overall options: anchored +First code unit = 'x' Subject length lower bound = 3 xyz 0: xyz @@ -3913,6 +4064,7 @@ Failed: error 114 at offset 10: missing closing parenthesis Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 9 abcdefabc 0: abcdefabc @@ -3922,6 +4074,7 @@ Subject length lower bound = 9 Capturing subpattern count = 1 Compile options: Overall options: anchored +Starting code units: a b c Subject length lower bound = 2 a=a 0: a=a @@ -3937,6 +4090,7 @@ Subject length lower bound = 2 Capturing subpattern count = 2 Compile options: Overall options: anchored +Starting code units: a b c Subject length lower bound = 2 a=a 0: a=a @@ -5173,6 +5327,7 @@ No match Capturing subpattern count = 3 Compile options: Overall options: anchored +Starting code units: 0 1 2 3 4 5 6 7 8 9 Last code unit = '/' Subject length lower bound = 6 13/05/04\=ps @@ -5270,6 +5425,7 @@ Partial match: c12 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: 0 1 2 3 4 5 6 7 8 9 Last code unit = 'X' Subject length lower bound = 4 1\=ps @@ -5643,6 +5799,7 @@ Named capturing subpatterns: A 3 
Compile options: dupnames Overall options: anchored dupnames +First code unit = 'a' Subject length lower bound = 2 a1b\=copy=A 0: a1 @@ -5680,6 +5837,7 @@ Named capturing subpatterns: A 2 Compile options: dupnames Overall options: anchored dupnames +First code unit = 'a' Subject length lower bound = 2 ab\=copy=A 0: ab @@ -5693,6 +5851,7 @@ Named capturing subpatterns: A 1 A 2 Options: dupnames +Starting code units: a c Subject length lower bound = 2 ab\=copy=A 0: ab @@ -5711,6 +5870,7 @@ Named capturing subpatterns: A 3 A 4 Options: dupnames +Starting code units: a c Subject length lower bound = 2 cdefgh\=copy=A 0: cdefgh @@ -5727,6 +5887,7 @@ Named capturing subpatterns: A 3 Compile options: dupnames Overall options: anchored dupnames +First code unit = 'a' Subject length lower bound = 2 a1b\=get=A 0: a1 @@ -5754,6 +5915,7 @@ Named capturing subpatterns: A 2 Compile options: dupnames Overall options: anchored dupnames +First code unit = 'a' Subject length lower bound = 2 ab\=get=A 0: ab @@ -5767,6 +5929,7 @@ Named capturing subpatterns: A 1 A 2 Options: dupnames +Starting code units: a c Subject length lower bound = 2 ab\=get=A 0: ab @@ -5785,6 +5948,7 @@ Named capturing subpatterns: A 3 A 4 Options: dupnames +Starting code units: a c Subject length lower bound = 2 cdefgh\=get=A 0: cdefgh @@ -5802,6 +5966,7 @@ Named capturing subpatterns: Compile options: Overall options: anchored Duplicate name status changes +First code unit = 'a' Subject length lower bound = 2 a1b\=copy=A 0: a1 @@ -5832,6 +5997,7 @@ Named capturing subpatterns: Compile options: Overall options: anchored Duplicate name status changes +First code unit = 'a' Subject length lower bound = 6 a bc d\=copy=A,copy=B,copy=C 0: a bc d @@ -6233,6 +6399,7 @@ Subject length lower bound = 4 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: a b Last code unit = 'b' Subject length lower bound = 2 @@ -6249,6 +6416,7 @@ Subject length lower bound = 2 Capturing 
subpattern count = 0 Compile options: Overall options: anchored +Starting code units: a b Last code unit = 'b' Subject length lower bound = 2 @@ -6265,6 +6433,7 @@ Subject length lower bound = 2 Capturing subpattern count = 0 Compile options: Overall options: anchored +Starting code units: a b Last code unit = 'b' Subject length lower bound = 2 @@ -6281,6 +6450,7 @@ Subject length lower bound = 2 Capturing subpattern count = 0 Compile options: Overall options: anchored +First code unit = 'a' Last code unit = 'A' Subject length lower bound = 3 aaaA5 @@ -6302,6 +6472,7 @@ No match Capturing subpattern count = 0 Compile options: caseless Overall options: anchored caseless +Starting code units: A a Last code unit = 'A' (caseless) Subject length lower bound = 2 aaaA5 @@ -9540,6 +9711,7 @@ Subject length lower bound = 2 Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = 'F' Last code unit = ':' Subject length lower bound = 22 @@ -9691,6 +9863,7 @@ Named capturing subpatterns: D 1 Compile options: dupnames extended Overall options: anchored dupnames extended +Starting code units: a e Subject length lower bound = 2 abcdX 0: abcdX @@ -10445,12 +10618,14 @@ Failed: error 125 at offset 0: lookbehind assertion is not fixed length Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 /(^ab)++/I Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 /(^ab|^)+/I @@ -10471,12 +10646,14 @@ Subject length lower bound = 0 Capturing subpattern count = 0 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 /(?:^ab)++/I Capturing subpattern count = 0 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 2 /(?:^ab|^)+/I @@ -11586,6 +11763,7 @@ Subject length lower bound = 2 Capturing subpattern count = 0 Compile 
options: dotall Overall options: anchored dotall +First code unit = 'a' Subject length lower bound = 2 /.*?a(*SKIP)b/I @@ -11608,6 +11786,7 @@ Subject length lower bound = 2 Capturing subpattern count = 0 Compile options: dotall Overall options: anchored dotall +First code unit = 'a' Subject length lower bound = 2 /(?>.*?)(?<=(abcd)|(wxyz))/I @@ -13375,7 +13554,6 @@ Subject length lower bound = 1 /(|ab)*?d/I,no_start_optimize Capturing subpattern count = 1 Options: no_start_optimize -Last code unit = 'd' Subject length lower bound = 0 abd 0: abd @@ -13641,12 +13819,14 @@ get substring list failed (-2): partial match Capturing subpattern count = 0 Compile options: Overall options: anchored +First code unit = 'a' Subject length lower bound = 3 /^abc/info,no_dotstar_anchor Capturing subpattern count = 0 Compile options: no_dotstar_anchor Overall options: anchored no_dotstar_anchor +First code unit = 'a' Subject length lower bound = 3 /.*\d/info,auto_callout @@ -14684,6 +14864,7 @@ Capturing subpattern count = 2 Max back reference = 1 Compile options: Overall options: anchored +First code unit = 'o' Last code unit = '}' Subject length lower bound = 65535 @@ -15607,6 +15788,7 @@ No match Capturing subpattern count = 1 Compile options: Overall options: anchored +First code unit = 'b' Subject length lower bound = 2 /(a){0}.*bc/sI @@ -15885,6 +16067,10 @@ No match No match /^(?!A(?C1)B)C/ + ABC\=callout_error=1,no_jit +No match + +/^(?!A(?C1)B)C/no_start_optimize ABC\=callout_error=1 --->ABC 1 ^^ B diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 619942c..ff438e6 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -194,6 +194,7 @@ Subject length lower bound = 3 Capturing subpattern count = 0 Compile options: utf Overall options: anchored utf +Starting code units: a b Subject length lower bound = 1 bar 0: b @@ -205,28 +206,6 @@ No match \x{100} No match -/^[^ab]/IB,utf ------------------------------------------------------------------- - Bra - ^ - 
[\x00-`c-\xff] (neg) - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Compile options: utf -Overall options: anchored utf -Subject length lower bound = 1 - c - 0: c - \x{ff} - 0: \x{ff} - \x{100} - 0: \x{100} -\= Expect no match - aaa -No match - /\x{100}*(\d+|"(?1)")/utf 1234 0: 1234 @@ -479,7 +458,10 @@ Subject length lower bound = 0 \x{100}X 0: X -/^\ሴ/IB,utf +# Use no_start_optimize because the first code unit is different in 8-bit from +# the wider modes. + +/^\ሴ/IB,utf,no_start_optimize ------------------------------------------------------------------ Bra ^ @@ -488,9 +470,9 @@ Subject length lower bound = 0 End ------------------------------------------------------------------ Capturing subpattern count = 0 -Compile options: utf -Overall options: anchored utf -Subject length lower bound = 1 +Compile options: no_start_optimize utf +Overall options: anchored no_start_optimize utf +Subject length lower bound = 0 /()()()()()()()()()() ()()()()()()()()()()