Refactor match_ref() and fix UTF-8 caseless bug.

This commit is contained in:
Philip.Hazel 2014-08-27 16:59:56 +00:00
parent fa3b6bb251
commit cfb78cbd53
4 changed files with 224 additions and 99 deletions

View File

@ -5,11 +5,14 @@ Version 10.0 xx-xxxx-2014
-------------------------
Version 10.0 is the first release of PCRE2, a revised API for the PCRE library.
Changes prior to 10.0 are logged in the ChangeLog file for the old API.
Changes prior to 10.0 are logged in the ChangeLog file for the old API, up to
item 20 for release 8.36.
The code of the library was heavily revised as part of the new API
implementation. Details of each and every modification were not individually
logged. In addition to the API changes, the following changes were made:
logged. In addition to the API changes, the following changes were made. They
are either new functionality, or bugs that were fixed after the code had been
forked.
1. The test program, now called pcre2test, was re-specified and almost
completely re-written. Its input is not compatible with input for pcretest.
@ -23,4 +26,16 @@ not writing the function calls themselves, it is possible to check the PCRE2
version by matching a pattern such as /(?(VERSION>=10.0)yes|no)/ against a
string such as "yesno".
4. There are case-equivalent Unicode characters whose encodings use different
numbers of code units in UTF-8. U+023A and U+2C65 are one example. (It is
theoretically possible for this to happen in UTF-16 too.) If a backreference to
a group containing one of these characters was greedily repeated, and during
the match a backtrack occurred, the subject might be backtracked by the wrong
number of code units. For example, if /^(\x{23a})\1*(.)/ is matched caselessly
(and in UTF-8 mode) against "\x{23a}\x{2c65}\x{2c65}\x{2c65}", group 2 should
capture the final character, which is the three bytes E2, B1, and A5 in UTF-8.
Incorrect backtracking meant that group 2 captured only the last two bytes.
This bug has been fixed; the new code is slower, but it is used only when the
strings matched by the repetition are not all the same length.
****

View File

@ -122,39 +122,54 @@ ovector length is always a multiple of 3. */
* Match a back-reference *
*************************************************/
/* Normally, if a back reference hasn't been set, the length that is passed is
negative, so the match always fails. However, in JavaScript compatibility mode,
the length passed is zero. Note that in caseless UTF-8 mode, the number of
subject bytes matched may be different to the number of reference bytes.
/* This function is called only when it is known that the offset lies within
the offsets that have so far been used in the match. Note that in caseless
UTF-8 mode, the number of subject bytes matched may be different to the number
of reference bytes. (In theory this could also happen in UTF-16 mode, but it
seems unlikely.)
Arguments:
offset index into the offset vector
offset_top top of the used offset vector
eptr pointer into the subject
length length of reference to be matched (number of code units)
mb points to match block
caseless TRUE if caseless
lengthptr pointer for returning the length matched
Returns: >= 0 the number of subject code units matched
-1 no match
-2 partial match; always given if at end subject
Returns: = 0 successful match; number of code units matched is set
< 0 no match
> 0 partial match
*/
static int
match_ref(int offset, register PCRE2_SPTR eptr, int length, match_block *mb,
BOOL caseless)
match_ref(PCRE2_SIZE offset, PCRE2_SIZE offset_top, register PCRE2_SPTR eptr,
match_block *mb, BOOL caseless, PCRE2_SIZE *lengthptr)
{
PCRE2_SPTR eptr_start = eptr;
register PCRE2_SPTR p = mb->start_subject + mb->ovector[offset];
#if defined SUPPORT_UTF
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
#endif
/* Always fail if reference not set (unless PCRE2_MATCH_UNSET_BACKREF is set,
in which case the length is passed as zero). */
register PCRE2_SPTR p;
PCRE2_SIZE length;
PCRE2_SPTR eptr_start = eptr;
if (length < 0) return -1;
/* Deal with an unset group. The default is no match, but there is an option to
match an empty string. */
/* Separate the caseless and UTF case for speed. */
if (offset >= offset_top || mb->ovector[offset] == PCRE2_UNSET)
{
if ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)
{
*lengthptr = 0;
return 0; /* Match */
}
else return -1; /* No match */
}
/* Separate the caseless and UTF cases for speed. */
p = mb->start_subject + mb->ovector[offset];
length = mb->ovector[offset+1] - mb->ovector[offset];
if (caseless)
{
@ -175,7 +190,7 @@ if (caseless)
{
uint32_t c, d;
const ucd_record *ur;
if (eptr >= mb->end_subject) return -2; /* Partial match */
if (eptr >= mb->end_subject) return 1; /* Partial match */
GETCHARINC(c, eptr);
GETCHARINC(d, p);
ur = GET_UCD(d);
@ -184,7 +199,7 @@ if (caseless)
const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset;
for (;;)
{
if (c < *pp) return -1;
if (c < *pp) return -1; /* No match */
if (c == *pp++) break;
}
}
@ -199,29 +214,31 @@ if (caseless)
while (length-- > 0)
{
uint32_t cc, cp;
if (eptr >= mb->end_subject) return -2; /* Partial match */
if (eptr >= mb->end_subject) return 1; /* Partial match */
cc = UCHAR21TEST(eptr);
cp = UCHAR21TEST(p);
if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc)) return -1;
if (TABLE_GET(cp, mb->lcc, cp) != TABLE_GET(cc, mb->lcc, cc))
return -1; /* No match */
p++;
eptr++;
}
}
}
/* In the caseful case, we can just compare the bytes, whether or not we
are in UTF-8 mode. */
/* In the caseful case, we can just compare the code units, whether or not we
are in UTF mode. */
else
{
while (length-- > 0)
{
if (eptr >= mb->end_subject) return -2; /* Partial match */
if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
if (eptr >= mb->end_subject) return 1; /* Partial match */
if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /*No match */
}
}
return (int)(eptr - eptr_start);
*lengthptr = eptr - eptr_start;
return 0; /* Match */
}
@ -350,6 +367,7 @@ typedef struct heapframe {
eptrblock *Xeptrb;
PCRE2_SIZE Xlength;
PCRE2_SIZE Xoffset;
PCRE2_SIZE Xoffset_top;
PCRE2_SIZE Xsave_offset1, Xsave_offset2, Xsave_offset3;
@ -370,7 +388,6 @@ typedef struct heapframe {
int Xcodelink;
int Xctype;
int Xfi;
int Xlength;
int Xmax;
int Xmin;
int Xwhere; /* Where to jump back to */
@ -425,7 +442,7 @@ Arguments:
callpat the recursion point in the pattern
mstart pointer to the current match start position (can be modified
by encountering \K)
offset_top current top pointer
offset_top current top pointer (highest ovector offset used + 1)
mb pointer to "static" info block for the match
eptrb pointer to chain of blocks containing eptr at start of
brackets - for testing for empty matches
@ -529,7 +546,7 @@ Arguments:
ecode pointer to current position in compiled code
mstart pointer to the current match start position (can be modified
by encountering \K)
offset_top current top pointer
offset_top current top pointer (highest ovector offset used + 1)
mb pointer to "static" info block for the match
eptrb pointer to chain of blocks containing eptr at start of
brackets - for testing for empty matches
@ -659,6 +676,7 @@ PCRE2_SPTR pp;
PCRE2_SPTR prev;
PCRE2_SPTR saved_eptr;
PCRE2_SIZE length;
PCRE2_SIZE offset;
PCRE2_SIZE save_offset1, save_offset2, save_offset3;
@ -676,7 +694,6 @@ PCRE2_UCHAR occhars[6];
int codelink;
int ctype;
int length;
int max;
int min;
@ -693,13 +710,13 @@ of the local variables that are used only in localised parts of the code, but
still need to be preserved over recursive calls of match(). These macros define
the alternative names that are used. */
#define allow_zero cur_is_word
#define cbegroup condition
#define code_offset codelink
#define condassert condition
#define matched_once prev_is_word
#define foc number
#define save_mark data
#define allow_zero cur_is_word
#define cbegroup condition
#define code_offset codelink
#define condassert condition
#define foc number
#define matched_once prev_is_word
#define save_mark data
/* These statements are here to stop the compiler complaining about uninitialized
variables. */
@ -2671,23 +2688,8 @@ for (;;)
/* Match a back reference, possibly repeatedly. Look past the end of the
item to see if there is repeat information following. The code is similar
to that for character classes, but repeated for efficiency. Then obey
similar code to character type repeats - written out again for speed.
However, if the referenced string is the empty string, always treat
it as matched, any number of times (otherwise there could be infinite
loops). If the reference is unset, there are two possibilities:
(a) In the default, Perl-compatible state, set the length negative;
this ensures that every attempt at a match fails. We can't just fail
here, because of the possibility of quantifiers with zero minima.
(b) If the JavaScript compatibility flag is set, set the length to zero
so that the back reference matches an empty string.
Otherwise, set the length to the length of what was matched by the
referenced subpattern.
item to see if there is repeat information following.
The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
or to a non-duplicated named group. For a duplicated named group, OP_DNREF
and OP_DNREFI are used. In this case we must scan the list of groups to
@ -2701,20 +2703,14 @@ for (;;)
PCRE2_SPTR slot = mb->name_table + GET2(ecode, 1) * mb->name_entry_size;
ecode += 1 + 2*IMM2_SIZE;
/* Setting the default length first and initializing 'offset' avoids
compiler warnings in the REF_REPEAT code. */
length = ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)? 0 : -1;
/* Initializing 'offset' avoids a compiler warning in the REF_REPEAT
code. */
offset = 0;
while (count-- > 0)
{
offset = GET2(slot, 0) << 1;
if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET)
{
length = mb->ovector[offset+1] - mb->ovector[offset];
break;
}
if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET) break;
slot += mb->name_entry_size;
}
}
@ -2725,11 +2721,7 @@ for (;;)
caseless = op == OP_REFI;
offset = GET2(ecode, 1) << 1; /* Doubled ref number */
ecode += 1 + IMM2_SIZE;
if (offset >= offset_top || mb->ovector[offset] == PCRE2_UNSET)
length = ((mb->poptions & PCRE2_MATCH_UNSET_BACKREF) != 0)? 0 : -1;
else
length = mb->ovector[offset+1] - mb->ovector[offset];
/* Set up for repetition, or handle the non-repeated case */
REF_REPEAT:
@ -2757,25 +2749,35 @@ for (;;)
ecode += 1 + 2 * IMM2_SIZE;
break;
default: /* No repeat follows */
if ((length = match_ref(offset, eptr, length, mb, caseless)) < 0)
{
if (length == -2) eptr = mb->end_subject; /* Partial match */
CHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
default: /* No repeat follows */
{
int rc = match_ref(offset, offset_top, eptr, mb, caseless, &length);
if (rc != 0)
{
if (rc > 0) eptr = mb->end_subject; /* Partial match */
CHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
}
eptr += length;
continue; /* With the main loop */
}
/* Handle repeated back references. If the length of the reference is
zero, just continue with the main loop. If the length is negative, it
means the reference is unset in non-Java-compatible mode. If the minimum is
/* Handle repeated back references. If a set group has length zero, just
continue with the main loop, because it matches however many times. For an
unset reference, in non-match-unset-backref mode, if the minimum is
zero, we can continue at the same level without recursion. For any other
minimum, carrying on will result in NOMATCH. */
if (length == 0) continue;
if (length < 0 && min == 0) continue;
if (offset < offset_top && mb->ovector[offset] != PCRE2_UNSET)
{
if (mb->ovector[offset] == mb->ovector[offset + 1]) continue;
}
else
{
if (min == 0 && (mb->poptions & PCRE2_MATCH_UNSET_BACKREF) == 0)
continue;
}
/* First, ensure the minimum number of matches are present. We get back
the length of the reference string explicitly rather than passing the
@ -2783,10 +2785,11 @@ for (;;)
for (i = 1; i <= min; i++)
{
int slength;
if ((slength = match_ref(offset, eptr, length, mb, caseless)) < 0)
PCRE2_SIZE slength;
int rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
if (rc != 0)
{
if (slength == -2) eptr = mb->end_subject; /* Partial match */
if (rc > 0) eptr = mb->end_subject; /* Partial match */
CHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
@ -2804,13 +2807,15 @@ for (;;)
{
for (fi = min;; fi++)
{
int slength;
int rc;
PCRE2_SIZE slength;
RMATCH(eptr, ecode, offset_top, mb, eptrb, RM14);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max) RRETURN(MATCH_NOMATCH);
if ((slength = match_ref(offset, eptr, length, mb, caseless)) < 0)
rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
if (rc != 0)
{
if (slength == -2) eptr = mb->end_subject; /* Partial match */
if (rc > 0) eptr = mb->end_subject; /* Partial match */
CHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
@ -2819,20 +2824,26 @@ for (;;)
/* Control never gets here */
}
/* If maximizing, find the longest string and work backwards */
/* If maximizing, find the longest string and work backwards, as long as
the matched lengths for each iteration are the same. */
else
{
BOOL samelengths = TRUE;
pp = eptr;
length = mb->ovector[offset+1] - mb->ovector[offset];
for (i = min; i < max; i++)
{
int slength;
if ((slength = match_ref(offset, eptr, length, mb, caseless)) < 0)
PCRE2_SIZE slength;
int rc = match_ref(offset, offset_top, eptr, mb, caseless, &slength);
if (rc != 0)
{
/* Can't use CHECK_PARTIAL because we don't want to update eptr in
the soft partial matching case. */
if (slength == -2 && mb->partial != 0 &&
if (rc > 0 && mb->partial != 0 &&
mb->end_subject > mb->start_used_ptr)
{
mb->hitend = TRUE;
@ -2840,15 +2851,49 @@ for (;;)
}
break;
}
if (slength != length) samelengths = FALSE;
eptr += slength;
}
while (eptr >= pp)
{
RMATCH(eptr, ecode, offset_top, mb, eptrb, RM15);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr -= length;
/* If the length matched for each repetition is the same as the length of
the captured group, we can easily work backwards. This is the normal
case. However, in caseless UTF-8 mode there are pairs of case-equivalent
characters whose lengths (in terms of code units) differ. Because this
is very rare, we handle it by re-matching fewer and fewer times. */
if (samelengths)
{
while (eptr >= pp)
{
RMATCH(eptr, ecode, offset_top, mb, eptrb, RM15);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr -= length;
}
}
/* The rare case of non-matching lengths. Re-scan the repetition for each
iteration. We know that match_ref() will succeed every time. */
else
{
max = i;
for (;;)
{
RMATCH(eptr, ecode, offset_top, mb, eptrb, RM15);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr == pp) break; /* Failed after minimal repetition */
eptr = pp;
max--;
for (i = min; i < max; i++)
{
PCRE2_SIZE slength;
(void)match_ref(offset, offset_top, eptr, mb, caseless, &slength);
eptr += slength;
}
}
}
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
@ -3223,7 +3268,7 @@ for (;;)
length = 1;
ecode++;
GETCHARLEN(fc, ecode, length);
if (length > mb->end_subject - eptr)
if (length > (PCRE2_SIZE)(mb->end_subject - eptr))
{
CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
RRETURN(MATCH_NOMATCH);

21
testdata/testinput4 vendored
View File

@ -2194,5 +2194,26 @@
/^s?c/im,utf
scat
# The next four tests are for repeated caseless back references when the
# code unit length of the matched text is different to that of the original
# group in the UTF-8 case.
/^(\x{23a})\1*(.)/i,utf
\x{23a}\x{23a}\x{23a}\x{23a}
\x{23a}\x{2c65}\x{2c65}\x{2c65}
\x{23a}\x{23a}\x{2c65}\x{23a}
/^(\x{23a})\1*(..)/i,utf
\x{23a}\x{2c65}\x{2c65}\x{2c65}
\x{23a}\x{23a}\x{2c65}\x{23a}
/^(\x{23a})\1*(...)/i,utf
\x{23a}\x{2c65}\x{2c65}\x{2c65}
\x{23a}\x{23a}\x{2c65}\x{23a}
/^(\x{23a})\1*(....)/i,utf
\x{23a}\x{2c65}\x{2c65}\x{2c65}
\x{23a}\x{23a}\x{2c65}\x{23a}
# End of testinput4

44
testdata/testoutput4 vendored
View File

@ -3690,5 +3690,49 @@ No match
/^s?c/im,utf
scat
0: sc
# The next four tests are for repeated caseless back references when the
# code unit length of the matched text is different to that of the original
# group in the UTF-8 case.
/^(\x{23a})\1*(.)/i,utf
\x{23a}\x{23a}\x{23a}\x{23a}
0: \x{23a}\x{23a}\x{23a}\x{23a}
1: \x{23a}
2: \x{23a}
\x{23a}\x{2c65}\x{2c65}\x{2c65}
0: \x{23a}\x{2c65}\x{2c65}\x{2c65}
1: \x{23a}
2: \x{2c65}
\x{23a}\x{23a}\x{2c65}\x{23a}
0: \x{23a}\x{23a}\x{2c65}\x{23a}
1: \x{23a}
2: \x{23a}
/^(\x{23a})\1*(..)/i,utf
\x{23a}\x{2c65}\x{2c65}\x{2c65}
0: \x{23a}\x{2c65}\x{2c65}\x{2c65}
1: \x{23a}
2: \x{2c65}\x{2c65}
\x{23a}\x{23a}\x{2c65}\x{23a}
0: \x{23a}\x{23a}\x{2c65}\x{23a}
1: \x{23a}
2: \x{2c65}\x{23a}
/^(\x{23a})\1*(...)/i,utf
\x{23a}\x{2c65}\x{2c65}\x{2c65}
0: \x{23a}\x{2c65}\x{2c65}\x{2c65}
1: \x{23a}
2: \x{2c65}\x{2c65}\x{2c65}
\x{23a}\x{23a}\x{2c65}\x{23a}
0: \x{23a}\x{23a}\x{2c65}\x{23a}
1: \x{23a}
2: \x{23a}\x{2c65}\x{23a}
/^(\x{23a})\1*(....)/i,utf
\x{23a}\x{2c65}\x{2c65}\x{2c65}
No match
\x{23a}\x{23a}\x{2c65}\x{23a}
No match
# End of testinput4