Fix single-character POSIX class bug in UCP mode.

This commit is contained in:
Philip.Hazel 2015-11-17 17:13:43 +00:00
parent c0d0f2f65e
commit 6650a2fd9a
6 changed files with 74 additions and 11 deletions

View File

@ -314,6 +314,10 @@ with JIT (possibly caused by SSE2?).
94. Support offset_limit in JIT.
95. A sequence such as [[:punct:]b], that is, a POSIX character class followed
by a single ASCII character in a class item, was incorrectly compiled in UCP
mode. The POSIX class got lost, but only if the single character followed it.
Version 10.20 30-June-2015
--------------------------

View File

@ -1352,7 +1352,7 @@ if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0 &&
/* A large and/or complex regex can take too long to process. We have to assume
it can match an empty string. This can happen more often when (?| groups are
present in the pattern and the caching is disabled. Setting the cap at 1100
present in the pattern and the caching is disabled. Setting the cap at 1100
allows the test for more than 1023 capturing patterns to work. */
if ((*countptr)++ > 1100) return CBE_TOOCOMPLICATED;
@ -4729,16 +4729,20 @@ for (;; ptr++)
CLASS_SINGLE_CHARACTER:
if (class_one_char < 2) class_one_char++;
/* If class_one_char is 1, we have the first single character in the
class, and there have been no prior ranges, or XCLASS items generated by
escapes. If this is the final character in the class, we can optimize by
turning the item into a 1-character OP_CHAR[I] if it's positive, or
OP_NOT[I] if it's negative. In the positive case, it can cause firstcu
to be set. Otherwise, there can be no first char if this item is first,
whatever repeat count may follow. In the case of reqcu, save the
previous value for reinstating. */
/* If class_one_char is 1 and xclass_has_prop is false, we have the first
single character in the class, and there have been no prior ranges, or
XCLASS items generated by escapes. If this is the final character in the
class, we can optimize by turning the item into a 1-character OP_CHAR[I]
if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
can cause firstcu to be set. Otherwise, there can be no first char if
this item is first, whatever repeat count may follow. In the case of
reqcu, save the previous value for reinstating. */
if (!inescq && class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
if (!inescq &&
#ifdef SUPPORT_UNICODE
!xclass_has_prop &&
#endif
class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
{
ptr++;
zeroreqcu = reqcu;
@ -7287,7 +7291,7 @@ for (;; ptr++)
else
{
if (escape == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
if (escape == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
cb->max_lookbehind == 0)
cb->max_lookbehind = 1;

4
testdata/testinput2 vendored
View File

@ -4685,4 +4685,8 @@ a)"xI
"(*ANYCRLF)(?m)^(.*[^0-9\r\n].*|)$"g,replace=NaN
15\r\nfoo\r\n20\r\nbar\r\nbaz\r\n\r\n20
/a[[:punct:]b]/bincode
/a[b[:punct:]]/bincode
# End of testinput2

6
testdata/testinput5 vendored
View File

@ -1691,4 +1691,10 @@
/abc\Cdef/info,utf
/a[[:punct:]b]/ucp,bincode
/a[[:punct:]b]/utf,ucp,bincode
/a[b[:punct:]]/utf,ucp,bincode
# End of testinput5

18
testdata/testoutput2 vendored
View File

@ -14888,4 +14888,22 @@ Subject length lower bound = 0
15\r\nfoo\r\n20\r\nbar\r\nbaz\r\n\r\n20
4: 15\x0d\x0aNaN\x0d\x0a20\x0d\x0aNaN\x0d\x0aNaN\x0d\x0aNaN\x0d\x0a20
/a[[:punct:]b]/bincode
------------------------------------------------------------------
Bra
a
[!-/:-@[-`b{-~]
Ket
End
------------------------------------------------------------------
/a[b[:punct:]]/bincode
------------------------------------------------------------------
Bra
a
[!-/:-@[-`b{-~]
Ket
End
------------------------------------------------------------------
# End of testinput2

27
testdata/testoutput5 vendored
View File

@ -4070,4 +4070,31 @@ First code unit = 'a'
Last code unit = 'f'
Subject length lower bound = 0
/a[[:punct:]b]/ucp,bincode
------------------------------------------------------------------
Bra
a
[b[:punct:]]
Ket
End
------------------------------------------------------------------
/a[[:punct:]b]/utf,ucp,bincode
------------------------------------------------------------------
Bra
a
[b[:punct:]]
Ket
End
------------------------------------------------------------------
/a[b[:punct:]]/utf,ucp,bincode
------------------------------------------------------------------
Bra
a
[b[:punct:]]
Ket
End
------------------------------------------------------------------
# End of testinput5