Implement PCRE2_NEVER_BACKSLASH_C.
This commit is contained in:
parent
3e1748390b
commit
e47a6ebe87
|
@ -85,6 +85,8 @@ a very long time if mutual recursion was present many times in a pattern, for
|
|||
example, /((?2){73}(?2))((?1))/. A better mutual recursion detection method has
|
||||
been implemented. This infelicity was discovered by the LLVM fuzzer.
|
||||
|
||||
21. Implemented PCRE2_NEVER_BACKSLASH_C.
|
||||
|
||||
|
||||
Version 10.10 06-March-2015
|
||||
---------------------------
|
||||
|
|
20
doc/pcre2.3
20
doc/pcre2.3
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2 3 "18 November 2014" "PCRE2 10.00"
|
||||
.TH PCRE2 3 "13 April 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH INTRODUCTION
|
||||
|
@ -103,14 +103,24 @@ lose performance.
|
|||
.P
|
||||
One way of guarding against this possibility is to use the
|
||||
\fBpcre2_pattern_info()\fP function to check the compiled pattern's options for
|
||||
UTF. Alternatively, you can set the PCRE2_NEVER_UTF option at compile time.
|
||||
This causes an compile time error if a pattern contains a UTF-setting sequence.
|
||||
PCRE2_UTF. Alternatively, you can set the PCRE2_NEVER_UTF option when calling
|
||||
\fBpcre2_compile()\fP. This causes a compile-time error if a pattern contains
|
||||
a UTF-setting sequence.
|
||||
.P
|
||||
The use of Unicode properties for character types such as \ed can also be
|
||||
enabled from within the pattern, by specifying "(*UCP)". This feature can be
|
||||
disallowed by setting the PCRE2_NEVER_UCP option.
|
||||
.P
|
||||
If your application is one that supports UTF, be aware that validity checking
|
||||
can take time. If the same data string is to be matched many times, you can use
|
||||
the PCRE2_NO_UTF_CHECK option for the second and subsequent matches to avoid
|
||||
running redundant checks.
|
||||
.P
|
||||
The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
||||
problems, because it may leave the current matching point in the middle of a
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to
|
||||
lock out the use of \eC, causing a compile-time error if it is encountered.
|
||||
.P
|
||||
Another way that performance can be hit is by running a pattern that has a very
|
||||
large search tree against a string that will never match. Nested unlimited
|
||||
repeats in a pattern are a common example. PCRE2 provides some protection
|
||||
|
@ -177,6 +187,6 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 18 November 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
Last updated: 13 April 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_COMPILE 3 "02 January 2015" "PCRE2 10.00"
|
||||
.TH PCRE2_COMPILE 3 "13 April 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -46,6 +46,7 @@ or provide an external function for stack size checking. The option bits are:
|
|||
PCRE2_FIRSTLINE Force matching to be before newline
|
||||
PCRE2_MATCH_UNSET_BACKREF Match unset back references
|
||||
PCRE2_MULTILINE ^ and $ match newlines within data
|
||||
PCRE2_NEVER_BACKSLASH_C Lock out the use of \eC in patterns
|
||||
PCRE2_NEVER_UCP Lock out PCRE2_UCP, e.g. via (*UCP)
|
||||
PCRE2_NEVER_UTF Lock out PCRE2_UTF, e.g. via (*UTF)
|
||||
PCRE2_NO_AUTO_CAPTURE Disable numbered capturing paren-
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "23 March 2015" "PCRE2 10.20"
|
||||
.TH PCRE2API 3 "13 April 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -1149,6 +1149,14 @@ in the subject string, respectively, as well as at the very start and end. This
|
|||
is equivalent to Perl's /m option, and it can be changed within a pattern by a
|
||||
(?m) option setting. If there are no newlines in a subject string, or no
|
||||
occurrences of ^ or $ in a pattern, setting PCRE2_MULTILINE has no effect.
|
||||
.sp
|
||||
PCRE2_NEVER_BACKSLASH_C
|
||||
.sp
|
||||
This option locks out the use of \eC in the pattern that is being compiled.
|
||||
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
||||
it may leave the current matching point in the middle of a multi-code-unit
|
||||
character. This option may be useful in applications that process patterns from
|
||||
external sources.
|
||||
.sp
|
||||
PCRE2_NEVER_UCP
|
||||
.sp
|
||||
|
@ -1156,17 +1164,17 @@ This option locks out the use of Unicode properties for handling \eB, \eb, \eD,
|
|||
\ed, \eS, \es, \eW, \ew, and some of the POSIX character classes, as described
|
||||
for the PCRE2_UCP option below. In particular, it prevents the creator of the
|
||||
pattern from enabling this facility by starting the pattern with (*UCP). This
|
||||
may be useful in applications that process patterns from external sources. The
|
||||
option combination PCRE_UCP and PCRE_NEVER_UCP causes an error.
|
||||
option may be useful in applications that process patterns from external
|
||||
sources. The option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error.
|
||||
.sp
|
||||
PCRE2_NEVER_UTF
|
||||
.sp
|
||||
This option locks out interpretation of the pattern as UTF-8, UTF-16, or
|
||||
UTF-32, depending on which library is in use. In particular, it prevents the
|
||||
creator of the pattern from switching to UTF interpretation by starting the
|
||||
pattern with (*UTF). This may be useful in applications that process patterns
|
||||
from external sources. The combination of PCRE2_UTF and PCRE2_NEVER_UTF causes
|
||||
an error.
|
||||
pattern with (*UTF). This option may be useful in applications that process
|
||||
patterns from external sources. The combination of PCRE2_UTF and
|
||||
PCRE2_NEVER_UTF causes an error.
|
||||
.sp
|
||||
PCRE2_NO_AUTO_CAPTURE
|
||||
.sp
|
||||
|
@ -2919,6 +2927,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 March 2015
|
||||
Last updated: 13 April 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2BUILD 3 "26 January 2015" "PCRE2 10.00"
|
||||
.TH PCRE2BUILD 3 "13 April 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.
|
||||
|
@ -132,6 +132,11 @@ Pattern escapes such as \ed and \ew do not by default make use of Unicode
|
|||
properties. The application can request that they do by setting the PCRE2_UCP
|
||||
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
|
||||
request this by starting with (*UCP).
|
||||
.P
|
||||
The \eC escape sequence, which matches a single code unit, even in a UTF mode,
|
||||
can cause unpredictable behaviour because it may leave the current matching
|
||||
point in the middle of a multi-code-unit character. It can be locked out by
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option.
|
||||
.
|
||||
.
|
||||
.SH "JUST-IN-TIME COMPILER SUPPORT"
|
||||
|
@ -494,6 +499,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 26 January 2015
|
||||
Last updated: 13 April 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "15 March 2015" "PCRE2 10.20"
|
||||
.TH PCRE2PATTERN 3 "13 April 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -1200,12 +1200,15 @@ whether or not a UTF mode is set. In the 8-bit library, one code unit is one
|
|||
byte; in the 16-bit library it is a 16-bit unit; in the 32-bit library it is a
|
||||
32-bit unit. Unlike a dot, \eC always matches line-ending characters. The
|
||||
feature is provided in Perl in order to match individual bytes in UTF-8 mode,
|
||||
but it is unclear how it can usefully be used. Because \eC breaks up characters
|
||||
into individual code units, matching one unit with \eC in a UTF mode means that
|
||||
the rest of the string may start with a malformed UTF character. This has
|
||||
undefined results, because PCRE2 assumes that it is dealing with valid UTF
|
||||
strings (and by default it checks this at the start of processing unless the
|
||||
PCRE2_NO_UTF_CHECK option is used).
|
||||
but it is unclear how it can usefully be used.
|
||||
.P
|
||||
Because \eC breaks up characters into individual code units, matching one unit
|
||||
with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start
|
||||
with a malformed UTF character. This has undefined results, because PCRE2
|
||||
assumes that it is matching character by character in a valid UTF string (by
|
||||
default it checks the subject string's validity at the start of processing
|
||||
unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the
|
||||
use of \eC by setting the PCRE2_NEVER_BACKSLASH_C option.
|
||||
.P
|
||||
PCRE2 does not allow \eC to appear in lookbehind assertions
|
||||
.\" HTML <a href="#lookbehind">
|
||||
|
@ -3329,6 +3332,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 15 March 2015
|
||||
Last updated: 13 April 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2SYNTAX 3 "15 March 2015" "PCRE2 10.20"
|
||||
.TH PCRE2SYNTAX 3 "13 April 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||
|
@ -44,7 +44,7 @@ characters "8" and "9".
|
|||
.sp
|
||||
. any character except newline;
|
||||
in dotall mode, any character whatsoever
|
||||
\eC one data unit, even in UTF mode (best avoided)
|
||||
\eC one code unit, even in UTF mode (best avoided)
|
||||
\ed a decimal digit
|
||||
\eD a character that is not a decimal digit
|
||||
\eh a horizontal white space character
|
||||
|
@ -61,6 +61,10 @@ characters "8" and "9".
|
|||
\eW a "non-word" character
|
||||
\eX a Unicode extended grapheme cluster
|
||||
.sp
|
||||
The application can lock out the use of \eC by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option. This escape is dangerous because it may leave the
|
||||
current matching point in the middle of a UTF-8 or UTF-16 character.
|
||||
.P
|
||||
By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode
|
||||
or in the 16-bit and 32-bit libraries. However, if locale-specific matching is
|
||||
happening, \es and \ew may also match characters with code points in the range
|
||||
|
@ -396,7 +400,9 @@ appear.
|
|||
(*UCP) set PCRE2_UCP (use Unicode properties for \ed etc)
|
||||
.sp
|
||||
Note that LIMIT_MATCH and LIMIT_RECURSION can only reduce the value of the
|
||||
limits set by the caller of pcre2_match(), not increase them.
|
||||
limits set by the caller of pcre2_match(), not increase them. The application
|
||||
can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or
|
||||
PCRE2_NEVER_UCP options, respectively, at compile time.
|
||||
.
|
||||
.
|
||||
.SH "NEWLINE CONVENTION"
|
||||
|
@ -543,6 +549,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 15 March 2015
|
||||
Last updated: 13 April 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "22 March 2015" "PCRE 10.20"
|
||||
.TH PCRE2TEST 1 "13 April 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -447,6 +447,7 @@ for a description of their effects.
|
|||
firstline set PCRE2_FIRSTLINE
|
||||
match_unset_backref set PCRE2_MATCH_UNSET_BACKREF
|
||||
/m multiline set PCRE2_MULTILINE
|
||||
never_backslash_c set PCRE2_NEVER_BACKSLASH_C
|
||||
never_ucp set PCRE2_NEVER_UCP
|
||||
never_utf set PCRE2_NEVER_UTF
|
||||
no_auto_capture set PCRE2_NO_AUTO_CAPTURE
|
||||
|
@ -1443,6 +1444,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 22 March 2015
|
||||
Last updated: 13 April 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -118,6 +118,7 @@ D is inspected during pcre2_dfa_match() execution
|
|||
#define PCRE2_UCP 0x00020000u /* C J M D */
|
||||
#define PCRE2_UNGREEDY 0x00040000u /* C */
|
||||
#define PCRE2_UTF 0x00080000u /* C J M D */
|
||||
#define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
|
|
|
@ -556,9 +556,10 @@ static PCRE2_SPTR posix_substitutes[] = {
|
|||
(PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_AUTO_CALLOUT| \
|
||||
PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
|
||||
PCRE2_EXTENDED|PCRE2_FIRSTLINE|PCRE2_MATCH_UNSET_BACKREF| \
|
||||
PCRE2_MULTILINE|PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \
|
||||
PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \
|
||||
PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF)
|
||||
PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
|
||||
PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
|
||||
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK| \
|
||||
PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF)
|
||||
|
||||
/* Compile time error code numbers. They are given names so that they can more
|
||||
easily be tracked. When a new number is added, the tables called eint1 and
|
||||
|
@ -574,7 +575,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
|
|||
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
|
||||
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
|
||||
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
|
||||
ERR81, ERR82 };
|
||||
ERR81, ERR82, ERR83 };
|
||||
|
||||
/* This is a table of start-of-pattern options such as (*UTF) and settings such
|
||||
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
|
||||
|
@ -6676,6 +6677,14 @@ for (;; ptr++)
|
|||
}
|
||||
#endif
|
||||
|
||||
/* The use of \C can be locked out. */
|
||||
|
||||
else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0)
|
||||
{
|
||||
*errorcodeptr = ERR83;
|
||||
goto FAILED;
|
||||
}
|
||||
|
||||
/* For the rest (including \X when Unicode properties are supported), we
|
||||
can obtain the OP value by negating the escape value in the default
|
||||
situation when PCRE2_UCP is not set. When it *is* set, we substitute
|
||||
|
|
|
@ -84,7 +84,7 @@ static const char compile_error_texts[] =
|
|||
/* 15 */
|
||||
"reference to non-existent subpattern\0"
|
||||
"pattern passed as NULL\0"
|
||||
"unknown compile-time option bit(s)\0"
|
||||
"unrecognised compile-time option bit(s)\0"
|
||||
"missing ) after (?# comment\0"
|
||||
"parentheses are too deeply nested\0"
|
||||
/* 20 */
|
||||
|
@ -163,6 +163,7 @@ static const char compile_error_texts[] =
|
|||
"internal error: unknown opcode in auto_possessify()\0"
|
||||
"missing terminating delimiter for callout with string argument\0"
|
||||
"unrecognized string delimiter follows (?C\0"
|
||||
"using \\C is disabled by the application\0"
|
||||
;
|
||||
|
||||
/* Match-time and UTF error texts are in the same format. */
|
||||
|
|
|
@ -525,6 +525,7 @@ static modstruct modlist[] = {
|
|||
{ "match_unset_backref", MOD_PAT, MOD_OPT, PCRE2_MATCH_UNSET_BACKREF, PO(options) },
|
||||
{ "memory", MOD_PD, MOD_CTL, CTL_MEMORY, PD(control) },
|
||||
{ "multiline", MOD_PATP, MOD_OPT, PCRE2_MULTILINE, PO(options) },
|
||||
{ "never_backslash_c", MOD_PAT, MOD_OPT, PCRE2_NEVER_BACKSLASH_C, PO(options) },
|
||||
{ "never_ucp", MOD_PAT, MOD_OPT, PCRE2_NEVER_UCP, PO(options) },
|
||||
{ "never_utf", MOD_PAT, MOD_OPT, PCRE2_NEVER_UTF, PO(options) },
|
||||
{ "newline", MOD_CTC, MOD_NL, 0, CO(newline_convention) },
|
||||
|
@ -3459,7 +3460,7 @@ static void
|
|||
show_compile_options(uint32_t options, const char *before, const char *after)
|
||||
{
|
||||
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
|
||||
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
before,
|
||||
((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "",
|
||||
((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? " allow_empty_class" : "",
|
||||
|
@ -3473,6 +3474,7 @@ else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
|||
((options & PCRE2_FIRSTLINE) != 0)? " firstline" : "",
|
||||
((options & PCRE2_MATCH_UNSET_BACKREF) != 0)? " match_unset_backref" : "",
|
||||
((options & PCRE2_MULTILINE) != 0)? " multiline" : "",
|
||||
((options & PCRE2_NEVER_BACKSLASH_C) != 0)? " never_backslash_c" : "",
|
||||
((options & PCRE2_NEVER_UCP) != 0)? " never_ucp" : "",
|
||||
((options & PCRE2_NEVER_UTF) != 0)? " never_utf" : "",
|
||||
((options & PCRE2_NO_AUTO_CAPTURE) != 0)? " no_auto_capture" : "",
|
||||
|
|
|
@ -4265,4 +4265,6 @@ a random value. /Ix
|
|||
|
||||
/((?2){73}(?2))((?1))/info
|
||||
|
||||
/ab\Cde/never_backslash_c
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -14293,4 +14293,7 @@ Capturing subpattern count = 2
|
|||
May match empty string
|
||||
Subject length lower bound = 0
|
||||
|
||||
/ab\Cde/never_backslash_c
|
||||
Failed: error 183 at offset 3: using \C is disabled by the application
|
||||
|
||||
# End of testinput2
|
||||
|
|
Loading…
Reference in New Issue