Refactor match_ref() and fix UTF-8 caseless bug.
This commit is contained in:
parent
fa3b6bb251
commit
cfb78cbd53
19
ChangeLog
19
ChangeLog
|
@ -5,11 +5,14 @@ Version 10.0 xx-xxxx-2014
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
Version 10.0 is the first release of PCRE2, a revised API for the PCRE library.
|
Version 10.0 is the first release of PCRE2, a revised API for the PCRE library.
|
||||||
Changes prior to 10.0 are logged in the ChangeLog file for the old API.
|
Changes prior to 10.0 are logged in the ChangeLog file for the old API, up to
|
||||||
|
item 20 for release 8.36.
|
||||||
|
|
||||||
The code of the library was heavily revised as part of the new API
|
The code of the library was heavily revised as part of the new API
|
||||||
implementation. Details of each and every modification were not individually
|
implementation. Details of each and every modification were not individually
|
||||||
logged. In addition to the API changes, the following changes were made:
|
logged. In addition to the API changes, the following changes were made. They
|
||||||
|
are either new functionality, or bugs that were fixed after the code had been
|
||||||
|
forked.
|
||||||
|
|
||||||
1. The test program, now called pcre2test, was re-specified and almost
|
1. The test program, now called pcre2test, was re-specified and almost
|
||||||
completely re-written. Its input is not compatible with input for pcretest.
|
completely re-written. Its input is not compatible with input for pcretest.
|
||||||
|
@ -23,4 +26,16 @@ not writing the function calls themselves, it is possible to check the PCRE2
|
||||||
version by matching a pattern such as /(?(VERSION>=10.0)yes|no)/ against a
|
version by matching a pattern such as /(?(VERSION>=10.0)yes|no)/ against a
|
||||||
string such as "yesno".
|
string such as "yesno".
|
||||||
|
|
||||||
|
4. There are case-equivalent Unicode characters whose encodings use different
|
||||||
|
numbers of code units in UTF-8. U+023A and U+2C65 are one example. (It is
|
||||||
|
theoretically possible for this to happen in UTF-16 too.) If a backreference to
|
||||||
|
a group containing one of these characters was greedily repeated, and during
|
||||||
|
the match a backtrack occurred, the subject might be backtracked by the wrong
|
||||||
|
number of code units. For example, if /^(\x{23a})\1*(.)/ is matched caselessly
|
||||||
|
(and in UTF-8 mode) against "\x{23a}\x{2c65}\x{2c65}\x{2c65}", group 2 should
|
||||||
|
capture the final character, which is the three bytes E2, B1, and A5 in UTF-8.
|
||||||
|
Incorrect backtracking meant that group 2 captured only the last two bytes.
|
||||||
|
This bug has been fixed; the new code is slower, but it is used only when the
|
||||||
|
strings matched by the repetition are not all the same length.
|
||||||
|
|
||||||
****
|
****
|
||||||
|
|
|
@ -122,39 +122,54 @@ ovector length is always a multiple of 3. */
|
||||||
* Match a back-reference *
|
* Match a back-reference *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* Normally, if a back reference hasn't been set, the length that is passed is
|
/* This function is called only when it is known that the offset lies within
|
||||||
negative, so the match always fails. However, in JavaScript compatibility mode,
|
the offsets that have so far been used in the match. Note that in caseless
|
||||||
the length passed is zero. Note that in caseless UTF-8 mode, the number of
|
UTF-8 mode, the number of subject bytes matched may be different to the number
|
||||||
subject bytes matched may be different to the number of reference bytes.
|
of reference bytes. (In theory this could also happen in UTF-16 mode, but it
|
||||||
|
seems unlikely.)
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
offset index into the offset vector
|
offset index into the offset vector
|
||||||
|
offset_top top of the used offset vector
|
||||||
eptr pointer into the subject
|
eptr pointer into the subject
|
||||||
length length of reference to be matched (number of code units)
|
|
||||||
mb points to match block
|
mb points to match block
|
||||||
caseless TRUE if caseless
|
caseless TRUE if caseless
|
||||||
|
lengthptr pointer for returning the length matched
|
||||||
|
|
||||||
Returns: >= 0 the number of subject code units matched
|
Returns: = 0 successful match; number of code units matched is set
|
||||||
-1 no match
|
< 0 no match
|
||||||
-2 partial match; always given if at end subject
|
> 0 partial match
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static int
|
static int
|
||||||
match_ref(int offset, register PCRE2_SPTR eptr, int length, match_block *mb,
|
match_ref(PCRE2_SIZE offset, PCRE2_SIZE offset_top, register PCRE2_SPTR eptr,
|
||||||
BOOL caseless)
|
match_block *mb, BOOL caseless, PCRE2_SIZE *lengthptr)
|
||||||
{
|
{
|
||||||
PCRE2_SPTR eptr_start = eptr;
|
|
||||||
register PCRE2_SPTR p = mb->start_subject + mb->ovector[offset];
|
|
||||||
#if defined SUPPORT_UTF
|
#if defined SUPPORT_UTF
|
||||||
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
|
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Always fail if reference not set (unless PCRE2_MATCH_UNSET_BACKREF is set,
|
register PCRE2_SPTR p;
|
||||||
in which case the length is passed as zero). */
|
PCRE2_SIZE length;
|
||||||
|
PCRE2_SPTR eptr_start = eptr;
|
||||||
|
|
||||||
if (length < 0) return -1;
|
/* Deal with an unset group. The default is no match, but there is an option to
|
||||||
|
match an empty string. */
|
||||||
|
|
||||||
/* Separate the caseless and UTF case for speed. */
|
if (offset >= offset_top || mb->ovector[offset] == PCRE2_UNSET)
|
||||||
|
{
|
||||||
|
if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
|
||||||
|
{
|
||||||
|
*lengthptr = 0;
|
||||||
|
return 0; /* Match */
|
||||||
|
}
|
||||||
|
else return -1; /* No match */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Separate the caseless and UTF cases for speed. */
|
||||||
|
|
||||||
|
p = mb->start_subject + mb->ovector[offset];
|
||||||
|
length = mb->ovector[offset+1] - mb->ovector[offset];
|
||||||
|
|
||||||
if (caseless)
|
if (caseless)
|
||||||
{
|
{
|
||||||
|
@ -175,7 +190,7 @@ if (caseless)
|
||||||
{
|
{
|
||||||
uint32_t c, d;
|
uint32_t c, d;
|
||||||
const ucd_record *ur;
|
const ucd_record *ur;
|
||||||
if (eptr >= mb->end_subject) return -2; /* Partial match */
|
if (eptr >= mb->end_subject) return 1; /* Partial match */
|
||||||
GETCHARINC(c, eptr);
|
GETCHARINC(c, eptr);
|
||||||
GETCHARINC(d, p);
|
GETCHARINC(d, p);
|
||||||
ur = GET_UCD(d);
|
ur = GET_UCD(d);
|
||||||
|
@ -184,7 +199,7 @@ if (caseless)
|
||||||
const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
|
const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
if (c < *pp) return -1;
|
if (c < *pp) return -1; /* No match */
|
||||||
if (c == *pp++) break;
|
if (c == *pp++) break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -199,29 +214,31 @@ if (caseless)
|
||||||
while (length-- > 0)
|
while (length-- > 0)
|
||||||
{
|
{
|
||||||
uint32_t cc, cp;
|
uint32_t cc, cp;
|
||||||
if (eptr >= mb->end_subject) return -2; /* Partial match */
|
if (eptr >= mb->end_subject) return 1; /* Partial match */
|
||||||
cc = UCHAR21TEST(eptr);
|
cc = UCHAR21TEST(eptr);
|
||||||
cp = UCHAR21TEST(p);
|
cp = UCHAR21TEST(p);
|
||||||
if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc)) return -1;
|
if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))
|
||||||
|
return -1; /* No match */
|
||||||
p++;
|
p++;
|
||||||
eptr++;
|
eptr++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* In the caseful case, we can just compare the bytes, whether or not we
|
/* In the caseful case, we can just compare the code units, whether or not we
|
||||||
are in UTF-8 mode. */
|
are in UTF mode. */
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
while (length-- > 0)
|
while (length-- > 0)
|
||||||
{
|
{
|
||||||
if (eptr >= mb->end_subject) return -2; /* Partial match */
|
if (eptr >= mb->end_subject) return 1; /* Partial match */
|
||||||
if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
|
if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* No match */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return (int)(eptr - eptr_start);
|
*lengthptr = eptr - eptr_start;
|
||||||
|
return 0; /* Match */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -350,6 +367,7 @@ typedef struct heapframe {
|
||||||
|
|
||||||
eptrblock *Xeptrb;
|
eptrblock *Xeptrb;
|
||||||
|
|
||||||
|
PCRE2_SIZE Xlength;
|
||||||
PCRE2_SIZE Xoffset;
|
PCRE2_SIZE Xoffset;
|
||||||
PCRE2_SIZE Xoffset_top;
|
PCRE2_SIZE Xoffset_top;
|
||||||
PCRE2_SIZE Xsave_offset1, Xsave_offset2, Xsave_offset3;
|
PCRE2_SIZE Xsave_offset1, Xsave_offset2, Xsave_offset3;
|
||||||
|
@ -370,7 +388,6 @@ typedef struct heapframe {
|
||||||
int Xcodelink;
|
int Xcodelink;
|
||||||
int Xctype;
|
int Xctype;
|
||||||
int Xfi;
|
int Xfi;
|
||||||
int Xlength;
|
|
||||||
int Xmax;
|
int Xmax;
|
||||||
int Xmin;
|
int Xmin;
|
||||||
int Xwhere; /* Where to jump back to */
|
int Xwhere; /* Where to jump back to */
|
||||||
|
@ -425,7 +442,7 @@ Arguments:
|
||||||
callpat the recursion point in the pattern
|
callpat the recursion point in the pattern
|
||||||
mstart pointer to the current match start position (can be modified
|
mstart pointer to the current match start position (can be modified
|
||||||
by encountering \K)
|
by encountering \K)
|
||||||
offset_top current top pointer
|
offset_top current top pointer (highest ovector offset used + 1)
|
||||||
mb pointer to "static" info block for the match
|
mb pointer to "static" info block for the match
|
||||||
eptrb pointer to chain of blocks containing eptr at start of
|
eptrb pointer to chain of blocks containing eptr at start of
|
||||||
brackets - for testing for empty matches
|
brackets - for testing for empty matches
|
||||||
|
@ -529,7 +546,7 @@ Arguments:
|
||||||
ecode pointer to current position in compiled code
|
ecode pointer to current position in compiled code
|
||||||
mstart pointer to the current match start position (can be modified
|
mstart pointer to the current match start position (can be modified
|
||||||
by encountering \K)
|
by encountering \K)
|
||||||
offset_top current top pointer
|
offset_top current top pointer (highest ovector offset used + 1)
|
||||||
mb pointer to "static" info block for the match
|
mb pointer to "static" info block for the match
|
||||||
eptrb pointer to chain of blocks containing eptr at start of
|
eptrb pointer to chain of blocks containing eptr at start of
|
||||||
brackets - for testing for empty matches
|
brackets - for testing for empty matches
|
||||||
|
@ -659,6 +676,7 @@ PCRE2_SPTR pp;
|
||||||
PCRE2_SPTR prev;
|
PCRE2_SPTR prev;
|
||||||
PCRE2_SPTR saved_eptr;
|
PCRE2_SPTR saved_eptr;
|
||||||
|
|
||||||
|
PCRE2_SIZE length;
|
||||||
PCRE2_SIZE offset;
|
PCRE2_SIZE offset;
|
||||||
PCRE2_SIZE save_offset1, save_offset2, save_offset3;
|
PCRE2_SIZE save_offset1, save_offset2, save_offset3;
|
||||||
|
|
||||||
|
@ -676,7 +694,6 @@ PCRE2_UCHAR occhars[6];
|
||||||
|
|
||||||
int codelink;
|
int codelink;
|
||||||
int ctype;
|
int ctype;
|
||||||
int length;
|
|
||||||
int max;
|
int max;
|
||||||
int min;
|
int min;
|
||||||
|
|
||||||
|
@ -697,8 +714,8 @@ the alternative names that are used. */
|
||||||
#define cbegroup condition
|
#define cbegroup condition
|
||||||
#define code_offset codelink
|
#define code_offset codelink
|
||||||
#define condassert condition
|
#define condassert condition
|
||||||
#define matched_once prev_is_word
|
|
||||||
#define foc number
|
#define foc number
|
||||||
|
#define matched_once prev_is_word
|
||||||
#define save_mark data
|
#define save_mark data
|
||||||
|
|
||||||
/* These statements are here to stop the compiler complaining about uninitialized
|
/* These statements are here to stop the compiler complaining about uninitialized
|
||||||
|
@ -2671,22 +2688,7 @@ for (;;)
|
||||||
|
|
||||||
|
|
||||||
/* Match a back reference, possibly repeatedly. Look past the end of the
|
/* Match a back reference, possibly repeatedly. Look past the end of the
|
||||||
item to see if there is repeat information following. The code is similar
|
item to see if there is repeat information following.
|
||||||
to that for character classes, but repeated for efficiency. Then obey
|
|
||||||
similar code to character type repeats - written out again for speed.
|
|
||||||
However, if the referenced string is the empty string, always treat
|
|
||||||
it as matched, any number of times (otherwise there could be infinite
|
|
||||||
loops). If the reference is unset, there are two possibilities:
|
|
||||||
|
|
||||||
(a) In the default, Perl-compatible state, set the length negative;
|
|
||||||
this ensures that every attempt at a match fails. We can't just fail
|
|
||||||
here, because of the possibility of quantifiers with zero minima.
|
|
||||||
|
|
||||||
(b) If the JavaScript compatibility flag is set, set the length to zero
|
|
||||||
so that the back reference matches an empty string.
|
|
||||||
|
|
||||||
Otherwise, set the length to the length of what was matched by the
|
|
||||||
referenced subpattern.
|
|
||||||
|
|
||||||
The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
|
The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
|
||||||
or to a non-duplicated named group. For a duplicated named group, OP_DNREF
|
or to a non-duplicated named group. For a duplicated named group, OP_DNREF
|
||||||
|
@ -2701,20 +2703,14 @@ for (;;)
|
||||||
PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;
|
PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;
|
||||||
ecode += 1 + 2*IMM2_SIZE;
|
ecode += 1 + 2*IMM2_SIZE;
|
||||||
|
|
||||||
/* Setting the default length first and initializing 'offset' avoids
|
/* Initializing 'offset' avoids a compiler warning in the REF_REPEAT
|
||||||
compiler warnings in the REF_REPEAT code. */
|
code. */
|
||||||
|
|
||||||
length = ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)? 0 : -1;
|
|
||||||
offset = 0;
|
offset = 0;
|
||||||
|
|
||||||
while (count-- > 0)
|
while (count-- > 0)
|
||||||
{
|
{
|
||||||
offset = GET2(slot, 0) << 1;
|
offset = GET2(slot, 0) << 1;
|
||||||
if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET)
|
if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET) break;
|
||||||
{
|
|
||||||
length = mb->ovector[offset+1] - mb->ovector[offset];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
slot += mb->name_entry_size;
|
slot += mb->name_entry_size;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2725,10 +2721,6 @@ for (;;)
|
||||||
caseless = op == OP_REFI;
|
caseless = op == OP_REFI;
|
||||||
offset = GET2(ecode, 1) << 1; /* Doubled ref number */
|
offset = GET2(ecode, 1) << 1; /* Doubled ref number */
|
||||||
ecode += 1 + IMM2_SIZE;
|
ecode += 1 + IMM2_SIZE;
|
||||||
if (offset >= offset_top || mb->ovector[offset] == PCRE2_UNSET)
|
|
||||||
length = ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)? 0 : -1;
|
|
||||||
else
|
|
||||||
length = mb->ovector[offset+1] - mb->ovector[offset];
|
|
||||||
|
|
||||||
/* Set up for repetition, or handle the non-repeated case */
|
/* Set up for repetition, or handle the non-repeated case */
|
||||||
|
|
||||||
|
@ -2758,24 +2750,34 @@ for (;;)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default: /* No repeat follows */
|
default: /* No repeat follows */
|
||||||
if ((length = match_ref(offset, eptr, length, mb, caseless)) < 0)
|
|
||||||
{
|
{
|
||||||
if (length == -2) eptr = mb->end_subject; /* Partial match */
|
int rc = match_ref(offset, offset_top, eptr, mb, caseless, &length);
|
||||||
|
if (rc != 0)
|
||||||
|
{
|
||||||
|
if (rc > 0) eptr = mb->end_subject; /* Partial match */
|
||||||
CHECK_PARTIAL();
|
CHECK_PARTIAL();
|
||||||
RRETURN(MATCH_NOMATCH);
|
RRETURN(MATCH_NOMATCH);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
eptr += length;
|
eptr += length;
|
||||||
continue; /* With the main loop */
|
continue; /* With the main loop */
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Handle repeated back references. If the length of the reference is
|
/* Handle repeated back references. If a set group has length zero, just
|
||||||
zero, just continue with the main loop. If the length is negative, it
|
continue with the main loop, because it matches however many times. For an
|
||||||
means the reference is unset in non-Java-compatible mode. If the minimum is
|
unset reference, in non-match-unset-backref mode, if the minimum is
|
||||||
zero, we can continue at the same level without recursion. For any other
|
zero, we can continue at the same level without recursion. For any other
|
||||||
minimum, carrying on will result in NOMATCH. */
|
minimum, carrying on will result in NOMATCH. */
|
||||||
|
|
||||||
if (length == 0) continue;
|
if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET)
|
||||||
if (length < 0 && min == 0) continue;
|
{
|
||||||
|
if (mb->ovector[offset] == mb->ovector[offset + 1]) continue;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (min == 0 && (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) == 0)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
/* First, ensure the minimum number of matches are present. We get back
|
/* First, ensure the minimum number of matches are present. We get back
|
||||||
the length of the reference string explicitly rather than passing the
|
the length of the reference string explicitly rather than passing the
|
||||||
|
@ -2783,10 +2785,11 @@ for (;;)
|
||||||
|
|
||||||
for (i = 1; i <= min; i++)
|
for (i = 1; i <= min; i++)
|
||||||
{
|
{
|
||||||
int slength;
|
PCRE2_SIZE slength;
|
||||||
if ((slength = match_ref(offset, eptr, length, mb, caseless)) < 0)
|
int rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
|
||||||
|
if (rc != 0)
|
||||||
{
|
{
|
||||||
if (slength == -2) eptr = mb->end_subject; /* Partial match */
|
if (rc > 0) eptr = mb->end_subject; /* Partial match */
|
||||||
CHECK_PARTIAL();
|
CHECK_PARTIAL();
|
||||||
RRETURN(MATCH_NOMATCH);
|
RRETURN(MATCH_NOMATCH);
|
||||||
}
|
}
|
||||||
|
@ -2804,13 +2807,15 @@ for (;;)
|
||||||
{
|
{
|
||||||
for (fi = min;; fi++)
|
for (fi = min;; fi++)
|
||||||
{
|
{
|
||||||
int slength;
|
int rc;
|
||||||
|
PCRE2_SIZE slength;
|
||||||
RMATCH(eptr, ecode, offset_top, mb, eptrb, RM14);
|
RMATCH(eptr, ecode, offset_top, mb, eptrb, RM14);
|
||||||
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
||||||
if (fi >= max) RRETURN(MATCH_NOMATCH);
|
if (fi >= max) RRETURN(MATCH_NOMATCH);
|
||||||
if ((slength = match_ref(offset, eptr, length, mb, caseless)) < 0)
|
rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
|
||||||
|
if (rc != 0)
|
||||||
{
|
{
|
||||||
if (slength == -2) eptr = mb->end_subject; /* Partial match */
|
if (rc > 0) eptr = mb->end_subject; /* Partial match */
|
||||||
CHECK_PARTIAL();
|
CHECK_PARTIAL();
|
||||||
RRETURN(MATCH_NOMATCH);
|
RRETURN(MATCH_NOMATCH);
|
||||||
}
|
}
|
||||||
|
@ -2819,20 +2824,26 @@ for (;;)
|
||||||
/* Control never gets here */
|
/* Control never gets here */
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If maximizing, find the longest string and work backwards */
|
/* If maximizing, find the longest string and work backwards, as long as
|
||||||
|
the matched lengths for each iteration are the same. */
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
BOOL samelengths = TRUE;
|
||||||
pp = eptr;
|
pp = eptr;
|
||||||
|
length = mb->ovector[offset+1] - mb->ovector[offset];
|
||||||
|
|
||||||
for (i = min; i < max; i++)
|
for (i = min; i < max; i++)
|
||||||
{
|
{
|
||||||
int slength;
|
PCRE2_SIZE slength;
|
||||||
if ((slength = match_ref(offset, eptr, length, mb, caseless)) < 0)
|
int rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
|
||||||
|
|
||||||
|
if (rc != 0)
|
||||||
{
|
{
|
||||||
/* Can't use CHECK_PARTIAL because we don't want to update eptr in
|
/* Can't use CHECK_PARTIAL because we don't want to update eptr in
|
||||||
the soft partial matching case. */
|
the soft partial matching case. */
|
||||||
|
|
||||||
if (slength == -2 && mb->partial != 0 &&
|
if (rc > 0 && mb->partial != 0 &&
|
||||||
mb->end_subject > mb->start_used_ptr)
|
mb->end_subject > mb->start_used_ptr)
|
||||||
{
|
{
|
||||||
mb->hitend = TRUE;
|
mb->hitend = TRUE;
|
||||||
|
@ -2840,15 +2851,49 @@ for (;;)
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (slength != length) samelengths = FALSE;
|
||||||
eptr += slength;
|
eptr += slength;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* If the length matched for each repetition is the same as the length of
|
||||||
|
the captured group, we can easily work backwards. This is the normal
|
||||||
|
case. In caseless UTF-8 mode, though, there are pairs of case-equivalent
|
||||||
|
characters whose lengths (in terms of code units) differ. However, this
|
||||||
|
is very rare, so we handle it by re-matching fewer and fewer times. */
|
||||||
|
|
||||||
|
if (samelengths)
|
||||||
|
{
|
||||||
while (eptr >= pp)
|
while (eptr >= pp)
|
||||||
{
|
{
|
||||||
RMATCH(eptr, ecode, offset_top, mb, eptrb, RM15);
|
RMATCH(eptr, ecode, offset_top, mb, eptrb, RM15);
|
||||||
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
||||||
eptr -= length;
|
eptr -= length;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The rare case of non-matching lengths. Re-scan the repetition for each
|
||||||
|
iteration. We know that match_ref() will succeed every time. */
|
||||||
|
|
||||||
|
else
|
||||||
|
{
|
||||||
|
max = i;
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
RMATCH(eptr, ecode, offset_top, mb, eptrb, RM15);
|
||||||
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
||||||
|
if (eptr == pp) break; /* Failed after minimal repetition */
|
||||||
|
eptr = pp;
|
||||||
|
max--;
|
||||||
|
for (i = min; i < max; i++)
|
||||||
|
{
|
||||||
|
PCRE2_SIZE slength;
|
||||||
|
(void)match_ref(offset, offset_top, eptr, mb, caseless, &slength);
|
||||||
|
eptr += slength;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
RRETURN(MATCH_NOMATCH);
|
RRETURN(MATCH_NOMATCH);
|
||||||
}
|
}
|
||||||
/* Control never gets here */
|
/* Control never gets here */
|
||||||
|
@ -3223,7 +3268,7 @@ for (;;)
|
||||||
length = 1;
|
length = 1;
|
||||||
ecode++;
|
ecode++;
|
||||||
GETCHARLEN(fc, ecode, length);
|
GETCHARLEN(fc, ecode, length);
|
||||||
if (length > mb->end_subject - eptr)
|
if (length > (PCRE2_SIZE)(mb->end_subject - eptr))
|
||||||
{
|
{
|
||||||
CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
|
CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
|
||||||
RRETURN(MATCH_NOMATCH);
|
RRETURN(MATCH_NOMATCH);
|
||||||
|
|
|
@ -2195,4 +2195,25 @@
|
||||||
/^s?c/im,utf
|
/^s?c/im,utf
|
||||||
scat
|
scat
|
||||||
|
|
||||||
|
# The next four tests are for repeated caseless back references when the
|
||||||
|
# code unit length of the matched text is different to that of the original
|
||||||
|
# group in the UTF-8 case.
|
||||||
|
|
||||||
|
/^(\x{23a})\1*(.)/i,utf
|
||||||
|
\x{23a}\x{23a}\x{23a}\x{23a}
|
||||||
|
\x{23a}\x{2c65}\x{2c65}\x{2c65}
|
||||||
|
\x{23a}\x{23a}\x{2c65}\x{23a}
|
||||||
|
|
||||||
|
/^(\x{23a})\1*(..)/i,utf
|
||||||
|
\x{23a}\x{2c65}\x{2c65}\x{2c65}
|
||||||
|
\x{23a}\x{23a}\x{2c65}\x{23a}
|
||||||
|
|
||||||
|
/^(\x{23a})\1*(...)/i,utf
|
||||||
|
\x{23a}\x{2c65}\x{2c65}\x{2c65}
|
||||||
|
\x{23a}\x{23a}\x{2c65}\x{23a}
|
||||||
|
|
||||||
|
/^(\x{23a})\1*(....)/i,utf
|
||||||
|
\x{23a}\x{2c65}\x{2c65}\x{2c65}
|
||||||
|
\x{23a}\x{23a}\x{2c65}\x{23a}
|
||||||
|
|
||||||
# End of testinput4
|
# End of testinput4
|
||||||
|
|
|
@ -3691,4 +3691,48 @@ No match
|
||||||
scat
|
scat
|
||||||
0: sc
|
0: sc
|
||||||
|
|
||||||
|
# The next four tests are for repeated caseless back references when the
|
||||||
|
# code unit length of the matched text is different to that of the original
|
||||||
|
# group in the UTF-8 case.
|
||||||
|
|
||||||
|
/^(\x{23a})\1*(.)/i,utf
|
||||||
|
\x{23a}\x{23a}\x{23a}\x{23a}
|
||||||
|
0: \x{23a}\x{23a}\x{23a}\x{23a}
|
||||||
|
1: \x{23a}
|
||||||
|
2: \x{23a}
|
||||||
|
\x{23a}\x{2c65}\x{2c65}\x{2c65}
|
||||||
|
0: \x{23a}\x{2c65}\x{2c65}\x{2c65}
|
||||||
|
1: \x{23a}
|
||||||
|
2: \x{2c65}
|
||||||
|
\x{23a}\x{23a}\x{2c65}\x{23a}
|
||||||
|
0: \x{23a}\x{23a}\x{2c65}\x{23a}
|
||||||
|
1: \x{23a}
|
||||||
|
2: \x{23a}
|
||||||
|
|
||||||
|
/^(\x{23a})\1*(..)/i,utf
|
||||||
|
\x{23a}\x{2c65}\x{2c65}\x{2c65}
|
||||||
|
0: \x{23a}\x{2c65}\x{2c65}\x{2c65}
|
||||||
|
1: \x{23a}
|
||||||
|
2: \x{2c65}\x{2c65}
|
||||||
|
\x{23a}\x{23a}\x{2c65}\x{23a}
|
||||||
|
0: \x{23a}\x{23a}\x{2c65}\x{23a}
|
||||||
|
1: \x{23a}
|
||||||
|
2: \x{2c65}\x{23a}
|
||||||
|
|
||||||
|
/^(\x{23a})\1*(...)/i,utf
|
||||||
|
\x{23a}\x{2c65}\x{2c65}\x{2c65}
|
||||||
|
0: \x{23a}\x{2c65}\x{2c65}\x{2c65}
|
||||||
|
1: \x{23a}
|
||||||
|
2: \x{2c65}\x{2c65}\x{2c65}
|
||||||
|
\x{23a}\x{23a}\x{2c65}\x{23a}
|
||||||
|
0: \x{23a}\x{23a}\x{2c65}\x{23a}
|
||||||
|
1: \x{23a}
|
||||||
|
2: \x{23a}\x{2c65}\x{23a}
|
||||||
|
|
||||||
|
/^(\x{23a})\1*(....)/i,utf
|
||||||
|
\x{23a}\x{2c65}\x{2c65}\x{2c65}
|
||||||
|
No match
|
||||||
|
\x{23a}\x{23a}\x{2c65}\x{23a}
|
||||||
|
No match
|
||||||
|
|
||||||
# End of testinput4
|
# End of testinput4
|
||||||
|
|
Loading…
Reference in New Issue