diff --git a/ChangeLog b/ChangeLog index de7dc9d..0774840 100644 --- a/ChangeLog +++ b/ChangeLog @@ -151,6 +151,9 @@ recently added to pcrecpp.cc in PCRE1. the match extended over a line boundary, as it tried to find more matches "on the same line" - but it was already over the end. +39. Allow \C in lookbehinds and DFA matching in UTF-32 mode (by converting it +to the same code as '.' when PCRE2_DOTALL is set). + Version 10.21 12-January-2016 ----------------------------- diff --git a/HACKING b/HACKING index 051520c..883aa64 100644 --- a/HACKING +++ b/HACKING @@ -228,6 +228,11 @@ OP_ASSERT_ACCEPT is used when (*ACCEPT) is encountered within an assertion. This ends the assertion, not the entire pattern match. The assertion (?!) is always optimized to OP_FAIL. +OP_ALLANY is used for '.' when PCRE2_DOTALL is set. It is also used for \C in +non-UTF modes and in UTF-32 mode (since one code unit still equals one +character). Another use is for [^] when empty classes are permitted +(PCRE2_ALLOW_EMPTY_CLASS is set). + Backtracking control verbs with optional data --------------------------------------------- @@ -601,4 +606,4 @@ not a real opcode, but is used to check that tables indexed by opcode are the correct length, in order to catch updating errors. Philip Hazel -June 2015 +June 2016 diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3 index 6211e1a..9300509 100644 --- a/doc/pcre2pattern.3 +++ b/doc/pcre2pattern.3 @@ -1,4 +1,4 @@ -.TH PCRE2PATTERN 3 "13 November 2015" "PCRE2 10.21" +.TH PCRE2PATTERN 3 "20 June 2016" "PCRE2 10.22" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION DETAILS" @@ -1256,16 +1256,20 @@ PCRE2 does not allow \eC to appear in lookbehind assertions .\" (described below) .\" -in a UTF mode, because this would make it impossible to calculate the length of -the lookbehind. Neither the alternative matching function -\fBpcre2_dfa_match()\fP nor the JIT optimizer support \eC in a UTF mode. The -former gives a match-time error; the latter fails to optimize and so the match -is always run using the interpreter. +in UTF-8 or UTF-16 modes, because this would make it impossible to calculate +the length of the lookbehind. Neither the alternative matching function +\fBpcre2_dfa_match()\fP nor the JIT optimizer support \eC in these UTF modes. +The former gives a match-time error; the latter fails to optimize and so the +match is always run using the interpreter. +.P +In the 32-bit library, however, \eC is always supported (when not explicitly +locked out) because it always matches a single code unit, whether or not UTF-32 +is specified. .P In general, the \eC escape sequence is best avoided. However, one way of using -it that avoids the problem of malformed UTF characters is to use a lookahead to -check the length of the next character, as in this pattern, which could be used -with a UTF-8 string (ignore white space and line breaks): +it that avoids the problem of malformed UTF-8 or UTF-16 characters is to use a +lookahead to check the length of the next character, as in this pattern, which +could be used with a UTF-8 string (ignore white space and line breaks): .sp (?| (?=[\ex00-\ex7f])(\eC) | (?=[\ex80-\ex{7ff}])(\eC)(\eC) | @@ -3425,6 +3429,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 13 November 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 20 June 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 35f37ba..2696f44 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -1117,8 +1117,8 @@ for (;;) cc++; break; - /* The single-byte matcher isn't allowed. This only happens in UTF mode; - otherwise \C is coded as OP_ALLANY. */ + /* The single-byte matcher isn't allowed. This only happens in UTF-8 or + UTF-16 mode; otherwise \C is coded as OP_ALLANY. */ case OP_ANYBYTE: return FFL_BACKSLASHC; @@ -7420,12 +7420,17 @@ for (;; ptr++) } else #endif - /* In non-UTF mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE - so that it works in DFA mode and in lookbehinds. */ + /* In non-UTF mode, and for both 32-bit modes, we turn \C into + OP_ALLANY instead of OP_ANYBYTE so that it works in DFA mode and in + lookbehinds. */ { previous = (escape > ESC_b && escape < ESC_Z)? code : NULL; +#if PCRE2_CODE_UNIT_WIDTH == 32 + *code++ = (escape == ESC_C)? OP_ALLANY : escape; +#else *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape; +#endif } } continue; diff --git a/src/pcre2_error.c b/src/pcre2_error.c index e7229ac..77fd5f4 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -106,7 +106,7 @@ static const unsigned char compile_error_texts[] = "character code point value in \\x{} or \\o{} is too large\0" /* 35 */ "invalid condition (?(0)\0" - "\\C is not allowed in a lookbehind assertion\0" + "\\C is not allowed in a lookbehind assertion in UTF-" XSTRING(PCRE2_CODE_UNIT_WIDTH) " mode\0" "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0" "number after (?C is greater than 255\0" "closing parenthesis for (?C expected\0" diff --git a/testdata/testinput22 b/testdata/testinput22 index f684cf4..7ada9aa 100644 --- a/testdata/testinput22 +++ b/testdata/testinput22 @@ -6,9 +6,11 @@ /ab\Cde/utf,info abXde -# This should produce an error diagnostic (\C in UTF lookbehind) +# This should produce an error diagnostic (\C in UTF lookbehind) in 8-bit and +# 16-bit modes, but not in 32-bit mode. /(?<=ab\Cde)X/utf + ab!deXYZ # Autopossessification tests diff --git a/testdata/testoutput22-16 b/testdata/testoutput22-16 index ca1fd76..01c9153 100644 --- a/testdata/testoutput22-16 +++ b/testdata/testoutput22-16 @@ -13,10 +13,12 @@ Subject length lower bound = 0 abXde 0: abXde -# This should produce an error diagnostic (\C in UTF lookbehind) +# This should produce an error diagnostic (\C in UTF lookbehind) in 8-bit and +# 16-bit modes, but not in 32-bit mode. /(?<=ab\Cde)X/utf -Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion +Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion in UTF-16 mode + ab!deXYZ # Autopossessification tests diff --git a/testdata/testoutput22-32 b/testdata/testoutput22-32 index 91d0b05..100333f 100644 --- a/testdata/testoutput22-32 +++ b/testdata/testoutput22-32 @@ -9,14 +9,16 @@ Contains \C Options: utf First code unit = 'a' Last code unit = 'e' -Subject length lower bound = 0 +Subject length lower bound = 5 abXde 0: abXde -# This should produce an error diagnostic (\C in UTF lookbehind) +# This should produce an error diagnostic (\C in UTF lookbehind) in 8-bit and +# 16-bit modes, but not in 32-bit mode. /(?<=ab\Cde)X/utf -Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion + ab!deXYZ + 0: X # Autopossessification tests @@ -34,10 +36,10 @@ Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion /\C+\X \X+\C/Bx,utf ------------------------------------------------------------------ Bra - Anybyte+ + AllAny+ extuni extuni+ - Anybyte + AllAny Ket End ------------------------------------------------------------------ diff --git a/testdata/testoutput22-8 b/testdata/testoutput22-8 index acb31d6..4814039 100644 --- a/testdata/testoutput22-8 +++ b/testdata/testoutput22-8 @@ -13,10 +13,12 @@ Subject length lower bound = 0 abXde 0: abXde -# This should produce an error diagnostic (\C in UTF lookbehind) +# This should produce an error diagnostic (\C in UTF lookbehind) in 8-bit and +# 16-bit modes, but not in 32-bit mode. /(?<=ab\Cde)X/utf -Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion +Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion in UTF-8 mode + ab!deXYZ # Autopossessification tests