Support \C in lookbehinds and DFA matching in UTF-32 mode.
This commit is contained in:
parent
fdc2becdcd
commit
149456dce0
|
@ -151,6 +151,9 @@ recently added to pcrecpp.cc in PCRE1.
|
||||||
the match extended over a line boundary, as it tried to find more matches "on
|
the match extended over a line boundary, as it tried to find more matches "on
|
||||||
the same line" - but it was already over the end.
|
the same line" - but it was already over the end.
|
||||||
|
|
||||||
|
39. Allow \C in lookbehinds and DFA matching in UTF-32 mode (by converting it
|
||||||
|
to the same opcode that is used for '.' when PCRE2_DOTALL is set).
|
||||||
|
|
||||||
|
|
||||||
Version 10.21 12-January-2016
|
Version 10.21 12-January-2016
|
||||||
-----------------------------
|
-----------------------------
|
||||||
|
|
7
HACKING
7
HACKING
|
@ -228,6 +228,11 @@ OP_ASSERT_ACCEPT is used when (*ACCEPT) is encountered within an assertion.
|
||||||
This ends the assertion, not the entire pattern match. The assertion (?!) is
|
This ends the assertion, not the entire pattern match. The assertion (?!) is
|
||||||
always optimized to OP_FAIL.
|
always optimized to OP_FAIL.
|
||||||
|
|
||||||
|
OP_ALLANY is used for '.' when PCRE2_DOTALL is set. It is also used for \C in
|
||||||
|
non-UTF modes and in UTF-32 mode (since one code unit still equals one
|
||||||
|
character). Another use is for [^] when empty classes are permitted
|
||||||
|
(PCRE2_ALLOW_EMPTY_CLASS is set).
|
||||||
|
|
||||||
|
|
||||||
Backtracking control verbs with optional data
|
Backtracking control verbs with optional data
|
||||||
---------------------------------------------
|
---------------------------------------------
|
||||||
|
@ -601,4 +606,4 @@ not a real opcode, but is used to check that tables indexed by opcode are the
|
||||||
correct length, in order to catch updating errors.
|
correct length, in order to catch updating errors.
|
||||||
|
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
June 2015
|
June 2016
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2PATTERN 3 "13 November 2015" "PCRE2 10.21"
|
.TH PCRE2PATTERN 3 "20 June 2016" "PCRE2 10.22"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||||
|
@ -1256,16 +1256,20 @@ PCRE2 does not allow \eC to appear in lookbehind assertions
|
||||||
.\" </a>
|
.\" </a>
|
||||||
(described below)
|
(described below)
|
||||||
.\"
|
.\"
|
||||||
in a UTF mode, because this would make it impossible to calculate the length of
|
in UTF-8 or UTF-16 modes, because this would make it impossible to calculate
|
||||||
the lookbehind. Neither the alternative matching function
|
the length of the lookbehind. Neither the alternative matching function
|
||||||
\fBpcre2_dfa_match()\fP nor the JIT optimizer support \eC in a UTF mode. The
|
\fBpcre2_dfa_match()\fP nor the JIT optimizer supports \eC in these UTF modes.
|
||||||
former gives a match-time error; the latter fails to optimize and so the match
|
The former gives a match-time error; the latter fails to optimize and so the
|
||||||
is always run using the interpreter.
|
match is always run using the interpreter.
|
||||||
|
.P
|
||||||
|
In the 32-bit library, however, \eC is always supported (when not explicitly
|
||||||
|
locked out) because it always matches a single code unit, whether or not UTF-32
|
||||||
|
is specified.
|
||||||
.P
|
.P
|
||||||
In general, the \eC escape sequence is best avoided. However, one way of using
|
In general, the \eC escape sequence is best avoided. However, one way of using
|
||||||
it that avoids the problem of malformed UTF characters is to use a lookahead to
|
it that avoids the problem of malformed UTF-8 or UTF-16 characters is to use a
|
||||||
check the length of the next character, as in this pattern, which could be used
|
lookahead to check the length of the next character, as in this pattern, which
|
||||||
with a UTF-8 string (ignore white space and line breaks):
|
could be used with a UTF-8 string (ignore white space and line breaks):
|
||||||
.sp
|
.sp
|
||||||
(?| (?=[\ex00-\ex7f])(\eC) |
|
(?| (?=[\ex00-\ex7f])(\eC) |
|
||||||
(?=[\ex80-\ex{7ff}])(\eC)(\eC) |
|
(?=[\ex80-\ex{7ff}])(\eC)(\eC) |
|
||||||
|
@ -3425,6 +3429,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 13 November 2015
|
Last updated: 20 June 2016
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1117,8 +1117,8 @@ for (;;)
|
||||||
cc++;
|
cc++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* The single-byte matcher isn't allowed. This only happens in UTF mode;
|
/* The single-byte matcher isn't allowed. This only happens in UTF-8 or
|
||||||
otherwise \C is coded as OP_ALLANY. */
|
UTF-16 mode; otherwise \C is coded as OP_ALLANY. */
|
||||||
|
|
||||||
case OP_ANYBYTE:
|
case OP_ANYBYTE:
|
||||||
return FFL_BACKSLASHC;
|
return FFL_BACKSLASHC;
|
||||||
|
@ -7420,12 +7420,17 @@ for (;; ptr++)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
/* In non-UTF mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
|
/* In non-UTF mode, and for both 32-bit modes, we turn \C into
|
||||||
so that it works in DFA mode and in lookbehinds. */
|
OP_ALLANY instead of OP_ANYBYTE so that it works in DFA mode and in
|
||||||
|
lookbehinds. */
|
||||||
|
|
||||||
{
|
{
|
||||||
previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
|
previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||||
|
*code++ = (escape == ESC_C)? OP_ALLANY : escape;
|
||||||
|
#else
|
||||||
*code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
|
*code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
|
|
|
@ -106,7 +106,7 @@ static const unsigned char compile_error_texts[] =
|
||||||
"character code point value in \\x{} or \\o{} is too large\0"
|
"character code point value in \\x{} or \\o{} is too large\0"
|
||||||
/* 35 */
|
/* 35 */
|
||||||
"invalid condition (?(0)\0"
|
"invalid condition (?(0)\0"
|
||||||
"\\C is not allowed in a lookbehind assertion\0"
|
"\\C is not allowed in a lookbehind assertion in UTF-" XSTRING(PCRE2_CODE_UNIT_WIDTH) " mode\0"
|
||||||
"PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
|
"PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
|
||||||
"number after (?C is greater than 255\0"
|
"number after (?C is greater than 255\0"
|
||||||
"closing parenthesis for (?C expected\0"
|
"closing parenthesis for (?C expected\0"
|
||||||
|
|
|
@ -6,9 +6,11 @@
|
||||||
/ab\Cde/utf,info
|
/ab\Cde/utf,info
|
||||||
abXde
|
abXde
|
||||||
|
|
||||||
# This should produce an error diagnostic (\C in UTF lookbehind)
|
# This should produce an error diagnostic (\C in UTF lookbehind) in 8-bit and
|
||||||
|
# 16-bit modes, but not in 32-bit mode.
|
||||||
|
|
||||||
/(?<=ab\Cde)X/utf
|
/(?<=ab\Cde)X/utf
|
||||||
|
ab!deXYZ
|
||||||
|
|
||||||
# Autopossessification tests
|
# Autopossessification tests
|
||||||
|
|
||||||
|
|
|
@ -13,10 +13,12 @@ Subject length lower bound = 0
|
||||||
abXde
|
abXde
|
||||||
0: abXde
|
0: abXde
|
||||||
|
|
||||||
# This should produce an error diagnostic (\C in UTF lookbehind)
|
# This should produce an error diagnostic (\C in UTF lookbehind) in 8-bit and
|
||||||
|
# 16-bit modes, but not in 32-bit mode.
|
||||||
|
|
||||||
/(?<=ab\Cde)X/utf
|
/(?<=ab\Cde)X/utf
|
||||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion in UTF-16 mode
|
||||||
|
ab!deXYZ
|
||||||
|
|
||||||
# Autopossessification tests
|
# Autopossessification tests
|
||||||
|
|
||||||
|
|
|
@ -9,14 +9,16 @@ Contains \C
|
||||||
Options: utf
|
Options: utf
|
||||||
First code unit = 'a'
|
First code unit = 'a'
|
||||||
Last code unit = 'e'
|
Last code unit = 'e'
|
||||||
Subject length lower bound = 0
|
Subject length lower bound = 5
|
||||||
abXde
|
abXde
|
||||||
0: abXde
|
0: abXde
|
||||||
|
|
||||||
# This should produce an error diagnostic (\C in UTF lookbehind)
|
# This should produce an error diagnostic (\C in UTF lookbehind) in 8-bit and
|
||||||
|
# 16-bit modes, but not in 32-bit mode.
|
||||||
|
|
||||||
/(?<=ab\Cde)X/utf
|
/(?<=ab\Cde)X/utf
|
||||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
ab!deXYZ
|
||||||
|
0: X
|
||||||
|
|
||||||
# Autopossessification tests
|
# Autopossessification tests
|
||||||
|
|
||||||
|
@ -34,10 +36,10 @@ Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||||
/\C+\X \X+\C/Bx,utf
|
/\C+\X \X+\C/Bx,utf
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
Anybyte+
|
AllAny+
|
||||||
extuni
|
extuni
|
||||||
extuni+
|
extuni+
|
||||||
Anybyte
|
AllAny
|
||||||
Ket
|
Ket
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
|
@ -13,10 +13,12 @@ Subject length lower bound = 0
|
||||||
abXde
|
abXde
|
||||||
0: abXde
|
0: abXde
|
||||||
|
|
||||||
# This should produce an error diagnostic (\C in UTF lookbehind)
|
# This should produce an error diagnostic (\C in UTF lookbehind) in 8-bit and
|
||||||
|
# 16-bit modes, but not in 32-bit mode.
|
||||||
|
|
||||||
/(?<=ab\Cde)X/utf
|
/(?<=ab\Cde)X/utf
|
||||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion in UTF-8 mode
|
||||||
|
ab!deXYZ
|
||||||
|
|
||||||
# Autopossessification tests
|
# Autopossessification tests
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue