Allow anchored patterns to use "first code unit" optimization.

This commit is contained in:
Philip.Hazel 2017-06-30 16:00:33 +00:00
parent cc089cf971
commit b7d5cee61f
15 changed files with 673 additions and 273 deletions

View File

@ -205,6 +205,11 @@ JIT.
subjects from 1000 to 2000 for 8-bit searches, since they use memchr() and are
much faster.
46. Arrange for anchored patterns to record and use "first code unit" data,
because this can give a fast "no match" without searching for a "required code
unit". Previously only non-anchored patterns did this.
Version 10.23 14-February-2017
------------------------------

View File

@ -9632,14 +9632,19 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
is_anchored(codestart, 0, &cb, 0, FALSE))
re->overall_options |= PCRE2_ANCHORED;
/* If the pattern is still not anchored and we do not have a first code unit,
see if there is one that is asserted (these are not saved during the compile
because they can cause conflicts with actual literals that follow). This code
need not be obeyed if PCRE2_NO_START_OPTIMIZE is set, as the data it would
create will not be used. */
/* Set up the first code unit or startline flag, the required code unit, and
then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
is set, as the data it would create will not be used. Note that a first code
unit (but not the startline flag) is useful for anchored patterns because it
can still give a quick "no match" and also avoid searching for a last code
unit. */
if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
{
/* If we do not have a first code unit, see if there is one that is asserted
(these are not saved during the compile because they can cause conflicts with
actual literals that follow). */
if (firstcuflags < 0)
firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE);
@ -9672,52 +9677,50 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
}
}
/* When there is no first code unit, see if we can set the PCRE2_STARTLINE
flag. This is helpful for multiline matches when all branches start with ^
and also when all branches start with non-atomic .* for non-DOTALL matches
when *PRUNE and SKIP are not present. (There is an option that disables this
case.) */
/* When there is no first code unit, for non-anchored patterns, see if we can
set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
branches start with ^ and also when all branches start with non-atomic .* for
non-DOTALL matches when (*PRUNE) and (*SKIP) are not present. (There is an
option that disables this case.) */
else if (is_startline(codestart, 0, &cb, 0, FALSE))
else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
is_startline(codestart, 0, &cb, 0, FALSE))
re->flags |= PCRE2_STARTLINE;
}
/* Handle the "required code unit", if one is set. In the case of an anchored
pattern, do this only if it follows a variable length item in the pattern.
Again, skip this if PCRE2_NO_START_OPTIMIZE is set. */
/* Handle the "required code unit", if one is set. In the case of an anchored
pattern, do this only if it follows a variable length item in the pattern. */
if (reqcuflags >= 0 &&
((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0 ||
(reqcuflags & REQ_VARY) != 0))
{
re->last_codeunit = reqcu;
re->flags |= PCRE2_LASTSET;
/* Handle caseless required code units as for first code units (above). */
if ((reqcuflags & REQ_CASELESS) != 0)
if (reqcuflags >= 0 &&
((re->overall_options & PCRE2_ANCHORED) == 0 ||
(reqcuflags & REQ_VARY) != 0))
{
if (reqcu < 128 || (!utf && reqcu < 255))
re->last_codeunit = reqcu;
re->flags |= PCRE2_LASTSET;
/* Handle caseless required code units as for first code units (above). */
if ((reqcuflags & REQ_CASELESS) != 0)
{
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
}
if (reqcu < 128 || (!utf && reqcu < 255))
{
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
}
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
#endif
}
}
}
/* Finally, unless PCRE2_NO_START_OPTIMIZE is set, study the compiled pattern
to set up information such as a bitmap of starting code units and a minimum
matching length. */
/* Finally, study the compiled pattern to set up information such as a bitmap
of starting code units and a minimum matching length. */
if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
PRIV(study)(re) != 0)
{
errorcode = ERR31;
goto HAD_CB_ERROR;
}
if (PRIV(study)(re) != 0)
{
errorcode = ERR31;
goto HAD_CB_ERROR;
}
} /* End of start-of-match optimizations. */
/* Control ends up here in all cases. When running under valgrind, make a
pattern's terminating zero defined again. If memory was obtained for the parsed

View File

@ -3341,34 +3341,27 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
}
#endif /* SUPPORT_UNICODE */
/* Set up the first code unit to match, if available. The first_codeunit value
is never set for an anchored regular expression, but the anchoring may be
forced at run time, so we have to test for anchoring. The first code unit may
be unset for an unanchored pattern, of course. If there's no first code unit
there may be a bitmap of possible first characters. */
/* Set up the first code unit to match, if available. If there's no first code
unit there may be a bitmap of possible first characters. */
if (!anchored)
if ((re->flags & PCRE2_FIRSTSET) != 0)
{
if ((re->flags & PCRE2_FIRSTSET) != 0)
has_first_cu = TRUE;
first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
has_first_cu = TRUE;
first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && first_cu > 127)
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
if (utf && first_cu > 127)
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
#endif
}
}
else
if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
start_bits = re->start_bitmap;
}
else
if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
start_bits = re->start_bitmap;
/* For anchored or unanchored matches, there may be a "last known required
character" set. */
/* There may be a "last known required code unit" set. */
if ((re->flags & PCRE2_LASTSET) != 0)
{
@ -3414,8 +3407,8 @@ for (;;)
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the
first newline. Implement this by temporarily adjusting end_subject so that
we stop the optimization scans at a newline. If the match fails at the
newline, later code breaks this loop. */
we stop the optimization scans for a first code unit at a newline. If the
match fails at the newline, later code breaks this loop. */
if (firstline)
{
@ -3434,70 +3427,138 @@ for (;;)
while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
end_subject = t;
}
/* Anchored: check the first code unit if one is recorded. This may seem
pointless but it can help in detecting a no match case without scanning for
the required code unit. */
/* Advance to a unique first code unit if there is one. */
if (has_first_cu)
if (anchored)
{
PCRE2_UCHAR smc;
if (first_cu != first_cu2)
while (start_match < end_subject &&
(smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
start_match++;
else
while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
start_match++;
}
/* Or to just after a linebreak for a multiline match */
else if (startline)
{
if (start_match > mb->start_subject + start_offset)
if (has_first_cu || start_bits != NULL)
{
#ifdef SUPPORT_UNICODE
if (utf)
BOOL ok = start_match < end_subject;
if (ok)
{
while (start_match < end_subject && !WAS_NEWLINE(start_match))
PCRE2_UCHAR c = UCHAR21TEST(start_match);
ok = has_first_cu && (c == first_cu || c == first_cu2);
if (!ok && start_bits != NULL)
{
start_match++;
ACROSSCHAR(start_match < end_subject, *start_match,
start_match++);
#if PCRE2_CODE_UNIT_WIDTH != 8
if (c > 255) c = 255;
#endif
ok = (start_bits[c/8] & (1 << (c&7))) != 0;
}
}
else
#endif
while (start_match < end_subject && !WAS_NEWLINE(start_match))
start_match++;
/* If we have just passed a CR and the newline option is ANY or
ANYCRLF, and we are now at a LF, advance the match position by one more
code unit. */
if (start_match[-1] == CHAR_CR &&
(mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
start_match < end_subject &&
UCHAR21TEST(start_match) == CHAR_NL)
start_match++;
if (!ok) break;
}
}
/* Or to a non-unique first code unit if any have been identified. The
bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all
code units greater than 254 set the 255 bit. */
/* Not anchored. Advance to a unique first code unit if there is one. In
8-bit mode, the use of memchr() gives a big speed up, even though we have
to call it twice in caseless mode, in order to find the earliest occurrence
of the character in either of its cases. */
else if (start_bits != NULL)
else
{
while (start_match < end_subject)
if (has_first_cu)
{
uint32_t c = UCHAR21TEST(start_match);
if (first_cu != first_cu2) /* Caseless */
{
#if PCRE2_CODE_UNIT_WIDTH != 8
if (c > 255) c = 255;
PCRE2_UCHAR smc;
while (start_match < end_subject &&
(smc = UCHAR21TEST(start_match)) != first_cu &&
smc != first_cu2)
start_match++;
#else /* 8-bit code units */
PCRE2_SPTR pp1 =
memchr(start_match, first_cu, end_subject-start_match);
PCRE2_SPTR pp2 =
memchr(start_match, first_cu2, end_subject-start_match);
if (pp1 == NULL)
start_match = (pp2 == NULL)? end_subject : pp2;
else
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
#endif
if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
start_match++;
}
/* The caseful case */
else
{
#if PCRE2_CODE_UNIT_WIDTH != 8
while (start_match < end_subject && UCHAR21TEST(start_match) !=
first_cu)
start_match++;
#else
start_match = memchr(start_match, first_cu, end_subject - start_match);
if (start_match == NULL) start_match = end_subject;
#endif
}
/* If we can't find the required first code unit, break the bumpalong loop,
to force a match failure, except when doing partial matching, when we
let the next cycle run at the end of the subject. To see why, consider
the pattern /(?<=abc)def/, which partially matches "abc", even though
the string does not contain the starting character "d". */
if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
start_match >= end_subject)
break;
}
}
/* If there's no first code unit, advance to just after a linebreak for a
multiline match if required. */
else if (startline)
{
if (start_match > mb->start_subject + start_offset)
{
#ifdef SUPPORT_UNICODE
if (utf)
{
while (start_match < end_subject && !WAS_NEWLINE(start_match))
{
start_match++;
ACROSSCHAR(start_match < end_subject, *start_match,
start_match++);
}
}
else
#endif
while (start_match < end_subject && !WAS_NEWLINE(start_match))
start_match++;
/* If we have just passed a CR and the newline option is ANY or
ANYCRLF, and we are now at a LF, advance the match position by one
more code unit. */
if (start_match[-1] == CHAR_CR &&
(mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
start_match < end_subject &&
UCHAR21TEST(start_match) == CHAR_NL)
start_match++;
}
}
/* If there's no first code unit or a requirement for a multiline line
start, advance to a non-unique first code unit if any have been
identified. The bitmap contains only 256 bits. When code units are 16 or
32 bits wide, all code units greater than 254 set the 255 bit. */
else if (start_bits != NULL)
{
while (start_match < end_subject)
{
uint32_t c = UCHAR21TEST(start_match);
#if PCRE2_CODE_UNIT_WIDTH != 8
if (c > 255) c = 255;
#endif
if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
start_match++;
}
}
} /* End of first code unit handling */
/* Restore fudged end_subject */

View File

@ -270,7 +270,7 @@ pcre2_callout_block cb;
*lengthptr = (*Fecode == OP_CALLOUT)?
PRIV(OP_lengths)[OP_CALLOUT] : GET(Fecode, 1 + 2*LINK_SIZE);
if (mb->callout == NULL) return 0; /* No callout function provided */
/* The original matching code (pre 10.30) worked directly with the ovector
@ -279,11 +279,11 @@ ovector is in the backtracking frame, it no longer needs to reserve space for
the overall match offsets (which would waste space in the frame). For backward
compatibility, however, we pass capture_top and offset_vector to the callout as
if for the extended ovector, and we ensure that the first two slots are unset
by preserving and restoring their current contents. Picky compilers complain if
references such as Fovector[-2] are used directly, so we set up a separate
by preserving and restoring their current contents. Picky compilers complain if
references such as Fovector[-2] are used directly, so we set up a separate
pointer. */
callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
callout_ovector = (PCRE2_SIZE *)(Fovector) - 2;
cb.version = 1;
cb.capture_top = (uint32_t)Foffset_top/2 + 1;
@ -935,8 +935,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
/* ===================================================================== */
/* Match a single character, caselessly. If we are at the end of the
subject, give up immediately. We get here only when the pattern character
has at most one other case. Characters with more than two cases are coded
subject, give up immediately. We get here only when the pattern character
has at most one other case. Characters with more than two cases are coded
as OP_PROP with the pseudo-property PT_CLIST. */
case OP_CHARI:
@ -954,7 +954,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
GETCHARLEN(fc, Fecode, Flength);
/* If the pattern character's value is < 128, we know that its other case
(if any) is also < 128 (and therefore only one code unit long in all
(if any) is also < 128 (and therefore only one code unit long in all
code-unit widths), so we can use the fast lookup table. We checked above
that there is at least one character left in the subject. */
@ -966,7 +966,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
Feptr++;
}
/* Otherwise we must pick up the subject character and use Unicode
/* Otherwise we must pick up the subject character and use Unicode
property support to test its other case. Note that we cannot use the
value of "Flength" to check for sufficient bytes left, because the other
case of the character may have more or fewer code units. */
@ -3056,7 +3056,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
Feptr += Lmin;
break;
/* This OP_ANYBYTE case will never be reached because \C gets turned
into OP_ALLANY in non-UTF mode. Cut out the code so that coverage
reports don't complain about its never being used. */
@ -5352,8 +5352,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
(char *)assert_accept_frame + offsetof(heapframe, ovector),
assert_accept_frame->offset_top * sizeof(PCRE2_SIZE));
Foffset_top = assert_accept_frame->offset_top;
/* Fall through */
/* Fall through */
/* In the case of a match, the captures have already been put into
the current frame. */
@ -5650,7 +5650,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if ((mb->moptions & PCRE2_NOTEOL) != 0) RRETURN(MATCH_NOMATCH);
if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
/* Fall through */
/* Fall through */
/* Unconditional end of subject assertion (\z) */
case OP_EOD:
@ -6280,7 +6280,7 @@ The last of these is changed within the match() function if the frame vector
has to be expanded. We therefore put it into the match block so that it is
correct when calling match() more than once for non-anchored patterns. */
frame_size = offsetof(heapframe, ovector) +
frame_size = offsetof(heapframe, ovector) +
re->top_bracket * 2 * sizeof(PCRE2_SIZE);
/* Limits set in the pattern override the match context only if they are
@ -6333,33 +6333,26 @@ mb->lcc = re->tables + lcc_offset;
mb->fcc = re->tables + fcc_offset;
mb->ctypes = re->tables + ctypes_offset;
/* Set up the first code unit to match, if available. The first_codeunit value
is never set for an anchored regular expression, but the anchoring may be
forced at run time, so we have to test for anchoring. The first code unit may
be unset for an unanchored pattern, of course. If there's no first code unit
there may be a bitmap of possible first characters. */
/* Set up the first code unit to match, if available. If there's no first code
unit there may be a bitmap of possible first characters. */
if (!anchored)
if ((re->flags & PCRE2_FIRSTSET) != 0)
{
if ((re->flags & PCRE2_FIRSTSET) != 0)
has_first_cu = TRUE;
first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
has_first_cu = TRUE;
first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
#endif
}
}
else
if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
start_bits = re->start_bitmap;
}
else
if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
start_bits = re->start_bitmap;
/* For anchored or unanchored matches, there may be a "last known required
character" set. */
/* There may also be a "last known required code unit" set. */
if ((re->flags & PCRE2_LASTSET) != 0)
{
@ -6398,8 +6391,8 @@ for(;;)
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the
first newline. Implement this by temporarily adjusting end_subject so that
we stop the optimization scans at a newline. If the match fails at the
newline, later code breaks this loop. */
we stop the optimization scans for a first code unit at a newline. If the
match fails at the newline, later code breaks this loop. */
if (firstline)
{
@ -6419,107 +6412,143 @@ for(;;)
end_subject = t;
}
/* Advance to a unique first code unit if there is one. In 8-bit mode, the
use of memchr() gives a big speed up, even though we have to call it twice
in caseless mode, in order to find the first occurrence of the character in
either of its cases. */
/* Anchored: check the first code unit if one is recorded. This may seem
pointless but it can help in detecting a no match case without scanning for
the required code unit. */
if (has_first_cu)
if (anchored)
{
if (first_cu != first_cu2) /* Caseless */
if (has_first_cu || start_bits != NULL)
{
#if PCRE2_CODE_UNIT_WIDTH != 8
PCRE2_UCHAR smc;
while (start_match < end_subject &&
(smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
start_match++;
#else /* 8-bit code units */
PCRE2_SPTR pp1 = memchr(start_match, first_cu, end_subject-start_match);
PCRE2_SPTR pp2 = memchr(start_match, first_cu2, end_subject-start_match);
if (pp1 == NULL)
start_match = (pp2 == NULL)? end_subject : pp2;
else
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
#endif
}
/* The caseful case */
else
{
#if PCRE2_CODE_UNIT_WIDTH != 8
while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
start_match++;
#else
start_match = memchr(start_match, first_cu, end_subject - start_match);
if (start_match == NULL) start_match = end_subject;
#endif
}
/* If we can't find the required code unit, break the bumpalong loop, to
force a match failure, except when doing partial matching, when we let
the next cycle run at the end of the subject. To see why, consider the
pattern /(?<=abc)def/, which partially matches "abc", even though the
string does not contain the starting character "d". */
if (!mb->partial && start_match >= end_subject)
{
rc = MATCH_NOMATCH;
break;
}
}
/* If there's no first code unit, advance to just after a linebreak for a
multiline match if required. */
else if (startline)
{
if (start_match > mb->start_subject + start_offset)
{
#ifdef SUPPORT_UNICODE
if (utf)
BOOL ok = start_match < end_subject;
if (ok)
{
while (start_match < end_subject && !WAS_NEWLINE(start_match))
PCRE2_UCHAR c = UCHAR21TEST(start_match);
ok = has_first_cu && (c == first_cu || c == first_cu2);
if (!ok && start_bits != NULL)
{
start_match++;
ACROSSCHAR(start_match < end_subject, *start_match,
start_match++);
#if PCRE2_CODE_UNIT_WIDTH != 8
if (c > 255) c = 255;
#endif
ok = (start_bits[c/8] & (1 << (c&7))) != 0;
}
}
else
#endif
while (start_match < end_subject && !WAS_NEWLINE(start_match))
start_match++;
/* If we have just passed a CR and the newline option is ANY or
ANYCRLF, and we are now at a LF, advance the match position by one more
code unit. */
if (start_match[-1] == CHAR_CR &&
(mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
start_match < end_subject &&
UCHAR21TEST(start_match) == CHAR_NL)
start_match++;
if (!ok)
{
rc = MATCH_NOMATCH;
break;
}
}
}
/* If there's no first code unit or a requirement for a multiline line
start, advance to a non-unique first code unit if any have been identified.
The bitmap contains only 256 bits. When code units are 16 or 32 bits wide,
all code units greater than 254 set the 255 bit. */
/* Not anchored. Advance to a unique first code unit if there is one. In
8-bit mode, the use of memchr() gives a big speed up, even though we have
to call it twice in caseless mode, in order to find the earliest occurrence
of the character in either of its cases. */
else if (start_bits != NULL)
else
{
while (start_match < end_subject)
if (has_first_cu)
{
uint32_t c = UCHAR21TEST(start_match);
if (first_cu != first_cu2) /* Caseless */
{
#if PCRE2_CODE_UNIT_WIDTH != 8
if (c > 255) c = 255;
PCRE2_UCHAR smc;
while (start_match < end_subject &&
(smc = UCHAR21TEST(start_match)) != first_cu &&
smc != first_cu2)
start_match++;
#else /* 8-bit code units */
PCRE2_SPTR pp1 =
memchr(start_match, first_cu, end_subject-start_match);
PCRE2_SPTR pp2 =
memchr(start_match, first_cu2, end_subject-start_match);
if (pp1 == NULL)
start_match = (pp2 == NULL)? end_subject : pp2;
else
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
#endif
if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
start_match++;
}
/* The caseful case */
else
{
#if PCRE2_CODE_UNIT_WIDTH != 8
while (start_match < end_subject && UCHAR21TEST(start_match) !=
first_cu)
start_match++;
#else
start_match = memchr(start_match, first_cu, end_subject - start_match);
if (start_match == NULL) start_match = end_subject;
#endif
}
/* If we can't find the required first code unit, break the bumpalong loop,
to force a match failure, except when doing partial matching, when we
let the next cycle run at the end of the subject. To see why, consider
the pattern /(?<=abc)def/, which partially matches "abc", even though
the string does not contain the starting character "d". */
if (!mb->partial && start_match >= end_subject)
{
rc = MATCH_NOMATCH;
break;
}
}
}
/* If there's no first code unit, advance to just after a linebreak for a
multiline match if required. */
else if (startline)
{
if (start_match > mb->start_subject + start_offset)
{
#ifdef SUPPORT_UNICODE
if (utf)
{
while (start_match < end_subject && !WAS_NEWLINE(start_match))
{
start_match++;
ACROSSCHAR(start_match < end_subject, *start_match,
start_match++);
}
}
else
#endif
while (start_match < end_subject && !WAS_NEWLINE(start_match))
start_match++;
/* If we have just passed a CR and the newline option is ANY or
ANYCRLF, and we are now at a LF, advance the match position by one
more code unit. */
if (start_match[-1] == CHAR_CR &&
(mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
start_match < end_subject &&
UCHAR21TEST(start_match) == CHAR_NL)
start_match++;
}
}
/* If there's no first code unit or a requirement for a multiline line
start, advance to a non-unique first code unit if any have been
identified. The bitmap contains only 256 bits. When code units are 16 or
32 bits wide, all code units greater than 254 set the 255 bit. */
else if (start_bits != NULL)
{
while (start_match < end_subject)
{
uint32_t c = UCHAR21TEST(start_match);
#if PCRE2_CODE_UNIT_WIDTH != 8
if (c > 255) c = 255;
#endif
if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
start_match++;
}
}
} /* End first code unit handling */
/* Restore fudged end_subject */

View File

@ -799,7 +799,7 @@ if (caseless)
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
#endif
}
else
else
#endif /* SUPPORT_UNICODE */
/* Not UTF */
@ -953,7 +953,6 @@ do
case OP_ALLANY:
case OP_ANY:
case OP_ANYBYTE:
case OP_CIRC:
case OP_CIRCM:
case OP_CLOSE:
case OP_COMMIT:
@ -1021,6 +1020,13 @@ do
case OP_THEN_ARG:
return SSB_FAIL;
/* OP_CIRC happens only at the start of an anchored branch (multiline ^
uses OP_CIRCM). Skip over it. */
case OP_CIRC:
tcode += PRIV(OP_lengths)[OP_CIRC];
break;
/* A "real" property test implies no starting bits, but the fake property
PT_CLIST identifies a list of characters. These lists are short, as they
are used for characters with more than one "other case", so there is no
@ -1450,7 +1456,7 @@ do
#endif
/* It seems that the fall through comment must be outside the #ifdef if
it is to avoid the gcc compiler warning. */
/* Fall through */
/* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
@ -1579,12 +1585,11 @@ BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
re->name_entry_size * re->name_count;
/* For an anchored pattern, or an unanchored pattern that has a first code
unit, or a multiline pattern that matches only at "line start", there is no
point in seeking a list of starting code units. */
/* For a pattern that has a first code unit, or a multiline pattern that
matches only at "line start", there is no point in seeking a list of starting
code units. */
if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
(re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
{
int rc = set_start_bits(re, code, utf);
if (rc == SSB_UNKNOWN) return 1;

View File

@ -466,5 +466,14 @@
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
\x{dfff}\x{df01}\=no_utf_check
# This has different starting code units in 8-bit mode.
/^[^ab]/IB,utf
c
\x{ff}
\x{100}
\= Expect no match
aaa
# End of testinput10

View File

@ -373,4 +373,13 @@
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
\x{dfff}\x{df01}\=no_utf_check
# This has different starting code units in 8-bit mode.
/^[^ab]/IB,utf
c
\x{ff}
\x{100}
\= Expect no match
aaa
# End of testinput12

3
testdata/testinput2 vendored
View File

@ -5256,6 +5256,9 @@ a)"xI
XAB
/^(?!A(?C1)B)C/
ABC\=callout_error=1,no_jit
/^(?!A(?C1)B)C/no_start_optimize
ABC\=callout_error=1
/^(?(?!A(?C1)B)C)/

12
testdata/testinput5 vendored
View File

@ -120,13 +120,6 @@
\x{ff}
\x{100}
/^[^ab]/IB,utf
c
\x{ff}
\x{100}
\= Expect no match
aaa
/\x{100}*(\d+|"(?1)")/utf
1234
"1234"
@ -190,7 +183,10 @@
/\w/utf
\x{100}X
/^\ሴ/IB,utf
# Use no_start_optimize because the first code unit is different in 8-bit from
# the wider modes.
/^\ሴ/IB,utf,no_start_optimize
/()()()()()()()()()()
()()()()()()()()()()

34
testdata/testoutput10 vendored
View File

@ -1585,5 +1585,39 @@ Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP),
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
\x{dfff}\x{df01}\=no_utf_check
0: \x{dfff}\x{df01}
# This has different starting code units in 8-bit mode.
/^[^ab]/IB,utf
------------------------------------------------------------------
Bra
^
[\x00-`c-\xff] (neg)
Ket
End
------------------------------------------------------------------
Capturing subpattern count = 0
Compile options: utf
Overall options: anchored utf
Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
\x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
\x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y
Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f
\xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0
\xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf
\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee
\xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd
\xfe \xff
Subject length lower bound = 1
c
0: c
\x{ff}
0: \x{ff}
\x{100}
0: \x{100}
\= Expect no match
aaa
No match
# End of testinput10

View File

@ -1433,4 +1433,42 @@ Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowe
Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
\x{dfff}\x{df01}\=no_utf_check
# This has different starting code units in 8-bit mode.
/^[^ab]/IB,utf
------------------------------------------------------------------
Bra
^
[\x00-`c-\xff] (neg)
Ket
End
------------------------------------------------------------------
Capturing subpattern count = 0
Compile options: utf
Overall options: anchored utf
Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
\x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
\x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y
Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f
\x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e
\x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d
\x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac
\xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb
\xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca
\xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9
\xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8
\xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7
\xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1
c
0: c
\x{ff}
0: \x{ff}
\x{100}
0: \x{100}
\= Expect no match
aaa
No match
# End of testinput12

View File

@ -1425,4 +1425,42 @@ No match
\x{dfff}\x{df01}\=no_utf_check
0: \x{dfff}\x{df01}
# This has different starting code units in 8-bit mode.
/^[^ab]/IB,utf
------------------------------------------------------------------
Bra
^
[\x00-`c-\xff] (neg)
Ket
End
------------------------------------------------------------------
Capturing subpattern count = 0
Compile options: utf
Overall options: anchored utf
Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
\x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
\x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y
Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f
\x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e
\x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d
\x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac
\xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb
\xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca
\xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9
\xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8
\xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7
\xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1
c
0: c
\x{ff}
0: \x{ff}
\x{100}
0: \x{100}
\= Expect no match
aaa
No match
# End of testinput12

View File

@ -368,6 +368,7 @@ No match
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 6
JIT compilation was successful
#pop jitverify
@ -379,6 +380,7 @@ JIT compilation was successful
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 6
JIT compilation was successful
#save testsaved1

188
testdata/testoutput2 vendored
View File

@ -72,6 +72,7 @@ No match
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 3
abc
0: abc
@ -110,6 +111,7 @@ Subject length lower bound = 2
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 3
abc
0: abc
@ -339,6 +341,7 @@ Subject length lower bound = 19
/the quick brown fox/I,anchored
Capturing subpattern count = 0
Options: anchored
First code unit = 't'
Subject length lower bound = 19
the quick brown fox
0: the quick brown fox
@ -351,6 +354,7 @@ Failed: error 111 at offset 4: unrecognized character after (? or (?-
/^abc|def/I
Capturing subpattern count = 0
Starting code units: a d
Subject length lower bound = 3
abcdef
0: abc
@ -495,12 +499,14 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = '1'
Subject length lower bound = 4
/(^b|(?i)^d)/I
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
Starting code units: D b d
Subject length lower bound = 1
/(?s).*/I
@ -624,6 +630,7 @@ Capturing subpattern count = 0
Max lookbehind = 1
Compile options: multiline
Overall options: anchored multiline
First code unit = 'a'
Subject length lower bound = 3
/^abc/Im
@ -637,6 +644,7 @@ Subject length lower bound = 3
Capturing subpattern count = 5
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 3
aaaaabbbbbcccccdef
0: aaaaabbbbbcccccdef
@ -808,6 +816,7 @@ Capturing subpattern count = 1
Max back reference = 1
Compile options: <none>
Overall options: anchored
Starting code units: a
Subject length lower bound = 4
\= Expect no match
aaaa
@ -1004,6 +1013,7 @@ Subject length lower bound = 16
Capturing subpattern count = 3
Compile options: <none>
Overall options: anchored
Starting code units: a b
Subject length lower bound = 4
adef\=get=1,get=2,get=3,get=4,getall
0: adef
@ -1042,6 +1052,7 @@ Get substring 4 failed (-49): unknown substring
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 7
abc\00def\=copy=0,getall
0: abc\x00def
@ -1227,6 +1238,7 @@ Subject length lower bound = 3
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = 'i'
Subject length lower bound = 3
ississippi
0: iss
@ -1286,6 +1298,7 @@ Capturing subpattern count = 0
Contains explicit CR or LF match
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 3
ab\nab\ncd
0: ab\x0a
@ -1776,6 +1789,8 @@ Subject length lower bound = 2
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P
Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z
Subject length lower bound = 1
/^[[:^alnum:]]/IB
@ -1789,6 +1804,18 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
\x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
\x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = >
? @ [ \ ] ^ _ ` { | } ~ \x7f \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88
\x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97
\x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6
\xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5
\xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4
\xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3
\xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2
\xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1
\xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1
/^[[:alpha:]]/IB
@ -1802,6 +1829,8 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
a b c d e f g h i j k l m n o p q r s t u v w x y z
Subject length lower bound = 1
/^[[:^alpha:]]/IB
@ -1815,6 +1844,19 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
\x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
\x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
5 6 7 8 9 : ; < = > ? @ [ \ ] ^ _ ` { | } ~ \x7f \x80 \x81 \x82 \x83 \x84
\x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93
\x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2
\xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1
\xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0
\xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf
\xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde
\xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed
\xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc
\xfd \xfe \xff
Subject length lower bound = 1
/[_[:alpha:]]/I
@ -1834,6 +1876,12 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
\x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
\x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4
5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y
Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
\x7f
Subject length lower bound = 1
/^[[:^ascii:]]/IB
@ -1847,6 +1895,15 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a
\x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99
\x9a \x9b \x9c \x9d \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8
\xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7
\xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6
\xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5
\xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4
\xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3
\xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1
/^[[:blank:]]/IB
@ -1860,6 +1917,7 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: \x09 \x20
Subject length lower bound = 1
/^[[:^blank:]]/IB
@ -1873,6 +1931,20 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0a \x0b
\x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a
\x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9
: ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^
_ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80
\x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f
\x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e
\x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad
\xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc
\xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb
\xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda
\xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9
\xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8
\xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1
/[\n\x0b\x0c\x0d[:blank:]]/I
@ -1892,6 +1964,9 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
\x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
\x1a \x1b \x1c \x1d \x1e \x1f \x7f
Subject length lower bound = 1
/^[[:digit:]]/IB
@ -1905,6 +1980,7 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: 0 1 2 3 4 5 6 7 8 9
Subject length lower bound = 1
/^[[:graph:]]/IB
@ -1918,6 +1994,9 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 :
; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
Subject length lower bound = 1
/^[[:lower:]]/IB
@ -1931,6 +2010,7 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: a b c d e f g h i j k l m n o p q r s t u v w x y z
Subject length lower bound = 1
/^[[:print:]]/IB
@ -1944,6 +2024,9 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8
9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ]
^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
Subject length lower bound = 1
/^[[:punct:]]/IB
@ -1957,6 +2040,8 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^
_ ` { | } ~
Subject length lower bound = 1
/^[[:space:]]/IB
@ -1970,6 +2055,7 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: \x09 \x0a \x0b \x0c \x0d \x20
Subject length lower bound = 1
/^[[:upper:]]/IB
@ -1983,6 +2069,7 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
Subject length lower bound = 1
/^[[:xdigit:]]/IB
@ -1996,6 +2083,7 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F a b c d e f
Subject length lower bound = 1
/^[[:word:]]/IB
@ -2009,6 +2097,8 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P
Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z
Subject length lower bound = 1
/^[[:^cntrl:]]/IB
@ -2022,6 +2112,18 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8
9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ]
^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x80 \x81
\x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90
\x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f
\xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae
\xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd
\xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc
\xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb
\xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea
\xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9
\xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1
/^[12[:^digit:]]/IB
@ -2035,6 +2137,20 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
\x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
\x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 1 2 : ; <
= > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a
b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82
\x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91
\x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa0
\xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf
\xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe
\xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd
\xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc
\xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb
\xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa
\xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1
/^[[:^blank:]]/IB
@ -2048,6 +2164,20 @@ Subject length lower bound = 1
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0a \x0b
\x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a
\x1b \x1c \x1d \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9
: ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^
_ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80
\x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f
\x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e
\x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad
\xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc
\xbd \xbe \xbf \xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb
\xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda
\xdb \xdc \xdd \xde \xdf \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9
\xea \xeb \xec \xed \xee \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8
\xf9 \xfa \xfb \xfc \xfd \xfe \xff
Subject length lower bound = 1
/[01[:alpha:]%]/IB
@ -2418,6 +2548,7 @@ Subject length lower bound = 4
Capturing subpattern count = 2
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 1
aba
0: aba
@ -2428,6 +2559,7 @@ Subject length lower bound = 1
Capturing subpattern count = 2
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbaa
0: aabbaa
@ -2438,6 +2570,7 @@ Subject length lower bound = 2
Capturing subpattern count = 2
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbaa
0: aabbaa
@ -2448,6 +2581,7 @@ Subject length lower bound = 2
Capturing subpattern count = 2
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbaa
0: aabbaa
@ -2458,6 +2592,7 @@ Subject length lower bound = 2
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbaa
0: aabbaa
@ -2467,6 +2602,7 @@ Subject length lower bound = 2
Capturing subpattern count = 3
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbaa
0: aabbaa
@ -2478,6 +2614,7 @@ Subject length lower bound = 2
Capturing subpattern count = 2
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbaa
0: aabbaa
@ -2488,6 +2625,7 @@ Subject length lower bound = 2
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbaa
0: aabbaa
@ -2497,6 +2635,7 @@ Subject length lower bound = 2
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbbaa
0: aabbbaa
@ -2506,6 +2645,7 @@ Subject length lower bound = 2
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbbaa
0: aabbbaa
@ -2515,6 +2655,7 @@ Subject length lower bound = 2
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbaa
0: aabbaa
@ -2524,6 +2665,7 @@ Subject length lower bound = 2
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbbaa
0: aabbbaa
@ -2533,6 +2675,7 @@ Subject length lower bound = 2
Capturing subpattern count = 3
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbbaa
0: aabbbaa
@ -2544,6 +2687,7 @@ Subject length lower bound = 2
Capturing subpattern count = 3
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
aabbbbaa
0: aabbbbaa
@ -3052,6 +3196,7 @@ Subject length lower bound = 3
Capturing subpattern count = 5
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 3
/^x(?U)a+b/IB
@ -3067,6 +3212,7 @@ Subject length lower bound = 3
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = 'x'
Last code unit = 'b'
Subject length lower bound = 3
@ -3085,6 +3231,7 @@ Subject length lower bound = 3
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = 'x'
Last code unit = 'b'
Subject length lower bound = 3
@ -3725,6 +3872,7 @@ Subject length lower bound = 3
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 3
/(?C)a|b/I
@ -3785,6 +3933,7 @@ No match
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = '>'
Last code unit = '<'
Subject length lower bound = 10
>abc>123<xyz<
@ -3835,6 +3984,7 @@ Subject length lower bound = 2
Capturing subpattern count = 2
Compile options: <none>
Overall options: anchored
Starting code units: ( - 0 1 2 3 4 5 6 7 8 9
Subject length lower bound = 1
12
0: 12
@ -3854,6 +4004,7 @@ No match
Capturing subpattern count = 2
Compile options: <none>
Overall options: anchored
First code unit = 'x'
Subject length lower bound = 3
xyz
0: xyz
@ -3913,6 +4064,7 @@ Failed: error 114 at offset 10: missing closing parenthesis
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 9
abcdefabc
0: abcdefabc
@ -3922,6 +4074,7 @@ Subject length lower bound = 9
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
Starting code units: a b c
Subject length lower bound = 2
a=a
0: a=a
@ -3937,6 +4090,7 @@ Subject length lower bound = 2
Capturing subpattern count = 2
Compile options: <none>
Overall options: anchored
Starting code units: a b c
Subject length lower bound = 2
a=a
0: a=a
@ -5173,6 +5327,7 @@ No match
Capturing subpattern count = 3
Compile options: <none>
Overall options: anchored
Starting code units: 0 1 2 3 4 5 6 7 8 9
Last code unit = '/'
Subject length lower bound = 6
13/05/04\=ps
@ -5270,6 +5425,7 @@ Partial match: c12
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: 0 1 2 3 4 5 6 7 8 9
Last code unit = 'X'
Subject length lower bound = 4
1\=ps
@ -5643,6 +5799,7 @@ Named capturing subpatterns:
A 3
Compile options: dupnames
Overall options: anchored dupnames
First code unit = 'a'
Subject length lower bound = 2
a1b\=copy=A
0: a1
@ -5680,6 +5837,7 @@ Named capturing subpatterns:
A 2
Compile options: dupnames
Overall options: anchored dupnames
First code unit = 'a'
Subject length lower bound = 2
ab\=copy=A
0: ab
@ -5693,6 +5851,7 @@ Named capturing subpatterns:
A 1
A 2
Options: dupnames
Starting code units: a c
Subject length lower bound = 2
ab\=copy=A
0: ab
@ -5711,6 +5870,7 @@ Named capturing subpatterns:
A 3
A 4
Options: dupnames
Starting code units: a c
Subject length lower bound = 2
cdefgh\=copy=A
0: cdefgh
@ -5727,6 +5887,7 @@ Named capturing subpatterns:
A 3
Compile options: dupnames
Overall options: anchored dupnames
First code unit = 'a'
Subject length lower bound = 2
a1b\=get=A
0: a1
@ -5754,6 +5915,7 @@ Named capturing subpatterns:
A 2
Compile options: dupnames
Overall options: anchored dupnames
First code unit = 'a'
Subject length lower bound = 2
ab\=get=A
0: ab
@ -5767,6 +5929,7 @@ Named capturing subpatterns:
A 1
A 2
Options: dupnames
Starting code units: a c
Subject length lower bound = 2
ab\=get=A
0: ab
@ -5785,6 +5948,7 @@ Named capturing subpatterns:
A 3
A 4
Options: dupnames
Starting code units: a c
Subject length lower bound = 2
cdefgh\=get=A
0: cdefgh
@ -5802,6 +5966,7 @@ Named capturing subpatterns:
Compile options: <none>
Overall options: anchored
Duplicate name status changes
First code unit = 'a'
Subject length lower bound = 2
a1b\=copy=A
0: a1
@ -5832,6 +5997,7 @@ Named capturing subpatterns:
Compile options: <none>
Overall options: anchored
Duplicate name status changes
First code unit = 'a'
Subject length lower bound = 6
a bc d\=copy=A,copy=B,copy=C
0: a bc d
@ -6233,6 +6399,7 @@ Subject length lower bound = 4
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: a b
Last code unit = 'b'
Subject length lower bound = 2
@ -6249,6 +6416,7 @@ Subject length lower bound = 2
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: a b
Last code unit = 'b'
Subject length lower bound = 2
@ -6265,6 +6433,7 @@ Subject length lower bound = 2
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
Starting code units: a b
Last code unit = 'b'
Subject length lower bound = 2
@ -6281,6 +6450,7 @@ Subject length lower bound = 2
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Last code unit = 'A'
Subject length lower bound = 3
aaaA5
@ -6302,6 +6472,7 @@ No match
Capturing subpattern count = 0
Compile options: caseless
Overall options: anchored caseless
Starting code units: A a
Last code unit = 'A' (caseless)
Subject length lower bound = 2
aaaA5
@ -9540,6 +9711,7 @@ Subject length lower bound = 2
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = 'F'
Last code unit = ':'
Subject length lower bound = 22
@ -9691,6 +9863,7 @@ Named capturing subpatterns:
D 1
Compile options: dupnames extended
Overall options: anchored dupnames extended
Starting code units: a e
Subject length lower bound = 2
abcdX
0: abcdX
@ -10445,12 +10618,14 @@ Failed: error 125 at offset 0: lookbehind assertion is not fixed length
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
/(^ab)++/I
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
/(^ab|^)+/I
@ -10471,12 +10646,14 @@ Subject length lower bound = 0
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
/(?:^ab)++/I
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 2
/(?:^ab|^)+/I
@ -11586,6 +11763,7 @@ Subject length lower bound = 2
Capturing subpattern count = 0
Compile options: dotall
Overall options: anchored dotall
First code unit = 'a'
Subject length lower bound = 2
/.*?a(*SKIP)b/I
@ -11608,6 +11786,7 @@ Subject length lower bound = 2
Capturing subpattern count = 0
Compile options: dotall
Overall options: anchored dotall
First code unit = 'a'
Subject length lower bound = 2
/(?>.*?)(?<=(abcd)|(wxyz))/I
@ -13375,7 +13554,6 @@ Subject length lower bound = 1
/(|ab)*?d/I,no_start_optimize
Capturing subpattern count = 1
Options: no_start_optimize
Last code unit = 'd'
Subject length lower bound = 0
abd
0: abd
@ -13641,12 +13819,14 @@ get substring list failed (-2): partial match
Capturing subpattern count = 0
Compile options: <none>
Overall options: anchored
First code unit = 'a'
Subject length lower bound = 3
/^abc/info,no_dotstar_anchor
Capturing subpattern count = 0
Compile options: no_dotstar_anchor
Overall options: anchored no_dotstar_anchor
First code unit = 'a'
Subject length lower bound = 3
/.*\d/info,auto_callout
@ -14684,6 +14864,7 @@ Capturing subpattern count = 2
Max back reference = 1
Compile options: <none>
Overall options: anchored
First code unit = 'o'
Last code unit = '}'
Subject length lower bound = 65535
@ -15607,6 +15788,7 @@ No match
Capturing subpattern count = 1
Compile options: <none>
Overall options: anchored
First code unit = 'b'
Subject length lower bound = 2
/(a){0}.*bc/sI
@ -15885,6 +16067,10 @@ No match
No match
/^(?!A(?C1)B)C/
ABC\=callout_error=1,no_jit
No match
/^(?!A(?C1)B)C/no_start_optimize
ABC\=callout_error=1
--->ABC
1 ^^ B

34
testdata/testoutput5 vendored
View File

@ -194,6 +194,7 @@ Subject length lower bound = 3
Capturing subpattern count = 0
Compile options: utf
Overall options: anchored utf
Starting code units: a b
Subject length lower bound = 1
bar
0: b
@ -205,28 +206,6 @@ No match
\x{100}
No match
/^[^ab]/IB,utf
------------------------------------------------------------------
Bra
^
[\x00-`c-\xff] (neg)
Ket
End
------------------------------------------------------------------
Capturing subpattern count = 0
Compile options: utf
Overall options: anchored utf
Subject length lower bound = 1
c
0: c
\x{ff}
0: \x{ff}
\x{100}
0: \x{100}
\= Expect no match
aaa
No match
/\x{100}*(\d+|"(?1)")/utf
1234
0: 1234
@ -479,7 +458,10 @@ Subject length lower bound = 0
\x{100}X
0: X
/^\ሴ/IB,utf
# Use no_start_optimize because the first code unit is different in 8-bit from
# the wider modes.
/^\ሴ/IB,utf,no_start_optimize
------------------------------------------------------------------
Bra
^
@ -488,9 +470,9 @@ Subject length lower bound = 0
End
------------------------------------------------------------------
Capturing subpattern count = 0
Compile options: utf
Overall options: anchored utf
Subject length lower bound = 1
Compile options: no_start_optimize utf
Overall options: anchored no_start_optimize utf
Subject length lower bound = 0
/()()()()()()()()()()
()()()()()()()()()()