From e47a6ebe8739f0f8abaac84021cf3a4249fba5ac Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Mon, 13 Apr 2015 17:29:05 +0000 Subject: [PATCH] Implement PCRE2_NEVER_BACKSLASH_C. --- ChangeLog | 2 ++ doc/pcre2.3 | 20 +++++++++++++++----- doc/pcre2_compile.3 | 3 ++- doc/pcre2api.3 | 22 +++++++++++++++------- doc/pcre2build.3 | 9 +++++++-- doc/pcre2pattern.3 | 19 +++++++++++-------- doc/pcre2syntax.3 | 14 ++++++++++---- doc/pcre2test.1 | 5 +++-- src/pcre2.h.in | 1 + src/pcre2_compile.c | 17 +++++++++++++---- src/pcre2_error.c | 3 ++- src/pcre2test.c | 4 +++- testdata/testinput2 | 2 ++ testdata/testoutput2 | 3 +++ 14 files changed, 89 insertions(+), 35 deletions(-) diff --git a/ChangeLog b/ChangeLog index f3ae0ed..dec4940 100644 --- a/ChangeLog +++ b/ChangeLog @@ -85,6 +85,8 @@ a very long time if mutual recursion was present many times in a pattern, for example, /((?2){73}(?2))((?1))/. A better mutual recursion detection method has been implemented. This infelicity was discovered by the LLVM fuzzer. +21. Implemented PCRE2_NEVER_BACKSLASH_C. + Version 10.10 06-March-2015 --------------------------- diff --git a/doc/pcre2.3 b/doc/pcre2.3 index dd4e53c..3fc8038 100644 --- a/doc/pcre2.3 +++ b/doc/pcre2.3 @@ -1,4 +1,4 @@ -.TH PCRE2 3 "18 November 2014" "PCRE2 10.00" +.TH PCRE2 3 "13 April 2015" "PCRE2 10.20" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH INTRODUCTION @@ -103,14 +103,24 @@ lose performance. .P One way of guarding against this possibility is to use the \fBpcre2_pattern_info()\fP function to check the compiled pattern's options for -UTF. Alternatively, you can set the PCRE2_NEVER_UTF option at compile time. -This causes an compile time error if a pattern contains a UTF-setting sequence. +PCRE2_UTF. Alternatively, you can set the PCRE2_NEVER_UTF option when calling +\fBpcre2_compile()\fP. This causes a compile-time error if a pattern contains +a UTF-setting sequence. 
+.P +The use of Unicode properties for character types such as \ed can also be +enabled from within the pattern, by specifying "(*UCP)". This feature can be +disallowed by setting the PCRE2_NEVER_UCP option. .P If your application is one that supports UTF, be aware that validity checking can take time. If the same data string is to be matched many times, you can use the PCRE2_NO_UTF_CHECK option for the second and subsequent matches to avoid running redundant checks. .P +The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to +problems, because it may leave the current matching point in the middle of a +multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to +lock out the use of \eC, causing a compile-time error if it is encountered. +.P Another way that performance can be hit is by running a pattern that has a very large search tree against a string that will never match. Nested unlimited repeats in a pattern are a common example. PCRE2 provides some protection @@ -177,6 +187,6 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk. .rs .sp .nf -Last updated: 18 November 2014 -Copyright (c) 1997-2014 University of Cambridge. +Last updated: 13 April 2015 +Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/doc/pcre2_compile.3 b/doc/pcre2_compile.3 index cf0858d..3173804 100644 --- a/doc/pcre2_compile.3 +++ b/doc/pcre2_compile.3 @@ -1,4 +1,4 @@ -.TH PCRE2_COMPILE 3 "02 January 2015" "PCRE2 10.00" +.TH PCRE2_COMPILE 3 "13 April 2015" "PCRE2 10.20" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS @@ -46,6 +46,7 @@ or provide an external function for stack size checking. The option bits are: PCRE2_FIRSTLINE Force matching to be before newline PCRE2_MATCH_UNSET_BACKREF Match unset back references PCRE2_MULTILINE ^ and $ match newlines within data + PCRE2_NEVER_BACKSLASH_C Lock out the use of \C in patterns PCRE2_NEVER_UCP Lock out PCRE2_UCP, e.g. 
via (*UCP) PCRE2_NEVER_UTF Lock out PCRE2_UTF, e.g. via (*UTF) PCRE2_NO_AUTO_CAPTURE Disable numbered capturing paren- diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 39a849b..668db78 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "23 March 2015" "PCRE2 10.20" +.TH PCRE2API 3 "13 April 2015" "PCRE2 10.20" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -1149,6 +1149,14 @@ in the subject string, respectively, as well as at the very start and end. This is equivalent to Perl's /m option, and it can be changed within a pattern by a (?m) option setting. If there are no newlines in a subject string, or no occurrences of ^ or $ in a pattern, setting PCRE2_MULTILINE has no effect. +.sp + PCRE2_NEVER_BACKSLASH_C +.sp +This option locks out the use of \eC in the pattern that is being compiled. +This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because +it may leave the current matching point in the middle of a multi-code-unit +character. This option may be useful in applications that process patterns from +external sources. .sp PCRE2_NEVER_UCP .sp @@ -1156,17 +1164,17 @@ This option locks out the use of Unicode properties for handling \eB, \eb, \eD, \ed, \eS, \es, \eW, \ew, and some of the POSIX character classes, as described for the PCRE2_UCP option below. In particular, it prevents the creator of the pattern from enabling this facility by starting the pattern with (*UCP). This -may be useful in applications that process patterns from external sources. The -option combination PCRE_UCP and PCRE_NEVER_UCP causes an error. +option may be useful in applications that process patterns from external +sources. The option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error. .sp PCRE2_NEVER_UTF .sp This option locks out interpretation of the pattern as UTF-8, UTF-16, or UTF-32, depending on which library is in use. 
In particular, it prevents the creator of the pattern from switching to UTF interpretation by starting the -pattern with (*UTF). This may be useful in applications that process patterns -from external sources. The combination of PCRE2_UTF and PCRE2_NEVER_UTF causes -an error. +pattern with (*UTF). This option may be useful in applications that process +patterns from external sources. The combination of PCRE2_UTF and +PCRE2_NEVER_UTF causes an error. .sp PCRE2_NO_AUTO_CAPTURE .sp @@ -2919,6 +2927,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 23 March 2015 +Last updated: 13 April 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/doc/pcre2build.3 b/doc/pcre2build.3 index 55eab15..e298321 100644 --- a/doc/pcre2build.3 +++ b/doc/pcre2build.3 @@ -1,4 +1,4 @@ -.TH PCRE2BUILD 3 "26 January 2015" "PCRE2 10.00" +.TH PCRE2BUILD 3 "13 April 2015" "PCRE2 10.20" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) . @@ -132,6 +132,11 @@ Pattern escapes such as \ed and \ew do not by default make use of Unicode properties. The application can request that they do by setting the PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also request this by starting with (*UCP). +.P +The \eC escape sequence, which matches a single code unit, even in a UTF mode, +can cause unpredictable behaviour because it may leave the current matching +point in the middle of a multi-code-unit character. It can be locked out by +setting the PCRE2_NEVER_BACKSLASH_C option. . . .SH "JUST-IN-TIME COMPILER SUPPORT" @@ -494,6 +499,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 26 January 2015 +Last updated: 13 April 2015 Copyright (c) 1997-2015 University of Cambridge. 
.fi diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3 index 7c237fa..cd52503 100644 --- a/doc/pcre2pattern.3 +++ b/doc/pcre2pattern.3 @@ -1,4 +1,4 @@ -.TH PCRE2PATTERN 3 "15 March 2015" "PCRE2 10.20" +.TH PCRE2PATTERN 3 "13 April 2015" "PCRE2 10.20" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION DETAILS" @@ -1200,12 +1200,15 @@ whether or not a UTF mode is set. In the 8-bit library, one code unit is one byte; in the 16-bit library it is a 16-bit unit; in the 32-bit library it is a 32-bit unit. Unlike a dot, \eC always matches line-ending characters. The feature is provided in Perl in order to match individual bytes in UTF-8 mode, -but it is unclear how it can usefully be used. Because \eC breaks up characters -into individual code units, matching one unit with \eC in a UTF mode means that -the rest of the string may start with a malformed UTF character. This has -undefined results, because PCRE2 assumes that it is dealing with valid UTF -strings (and by default it checks this at the start of processing unless the -PCRE2_NO_UTF_CHECK option is used). +but it is unclear how it can usefully be used. +.P +Because \eC breaks up characters into individual code units, matching one unit +with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start +with a malformed UTF character. This has undefined results, because PCRE2 +assumes that it is matching character by character in a valid UTF string (by +default it checks the subject string's validity at the start of processing +unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the +use of \eC by setting the PCRE2_NEVER_BACKSLASH_C option. .P PCRE2 does not allow \eC to appear in lookbehind assertions .\" HTML @@ -3329,6 +3332,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 15 March 2015 +Last updated: 13 April 2015 Copyright (c) 1997-2015 University of Cambridge. 
.fi diff --git a/doc/pcre2syntax.3 b/doc/pcre2syntax.3 index cfc6d0f..10a1584 100644 --- a/doc/pcre2syntax.3 +++ b/doc/pcre2syntax.3 @@ -1,4 +1,4 @@ -.TH PCRE2SYNTAX 3 "15 March 2015" "PCRE2 10.20" +.TH PCRE2SYNTAX 3 "13 April 2015" "PCRE2 10.20" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY" @@ -44,7 +44,7 @@ characters "8" and "9". .sp . any character except newline; in dotall mode, any character whatsoever - \eC one data unit, even in UTF mode (best avoided) + \eC one code unit, even in UTF mode (best avoided) \ed a decimal digit \eD a character that is not a decimal digit \eh a horizontal white space character @@ -61,6 +61,10 @@ characters "8" and "9". \eW a "non-word" character \eX a Unicode extended grapheme cluster .sp +The application can lock out the use of \eC by setting the +PCRE2_NEVER_BACKSLASH_C option. Using \eC is dangerous because it may leave the +current matching point in the middle of a UTF-8 or UTF-16 character. +.P By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode or in the 16-bit and 32-bit libraries. However, if locale-specific matching is happening, \es and \ew may also match characters with code points in the range @@ -396,7 +400,9 @@ appear. (*UCP) set PCRE2_UCP (use Unicode properties for \ed etc) .sp Note that LIMIT_MATCH and LIMIT_RECURSION can only reduce the value of the -limits set by the caller of pcre2_match(), not increase them. +limits set by the caller of pcre2_match(), not increase them. The application +can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or +PCRE2_NEVER_UCP options, respectively, at compile time. . . .SH "NEWLINE CONVENTION" @@ -543,6 +549,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 15 March 2015 +Last updated: 13 April 2015 Copyright (c) 1997-2015 University of Cambridge. 
.fi diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index eb2b04a..63e3da6 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "22 March 2015" "PCRE 10.20" +.TH PCRE2TEST 1 "13 April 2015" "PCRE2 10.20" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -447,6 +447,7 @@ for a description of their effects. firstline set PCRE2_FIRSTLINE match_unset_backref set PCRE2_MATCH_UNSET_BACKREF /m multiline set PCRE2_MULTILINE + never_backslash_c set PCRE2_NEVER_BACKSLASH_C never_ucp set PCRE2_NEVER_UCP never_utf set PCRE2_NEVER_UTF no_auto_capture set PCRE2_NO_AUTO_CAPTURE @@ -1443,6 +1444,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 22 March 2015 +Last updated: 13 April 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/src/pcre2.h.in b/src/pcre2.h.in index 79f4d25..a752114 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -118,6 +118,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_UCP 0x00020000u /* C J M D */ #define PCRE2_UNGREEDY 0x00040000u /* C */ #define PCRE2_UTF 0x00080000u /* C J M D */ +#define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */ /* These are for pcre2_jit_compile(). 
*/ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 416f7aa..2276fff 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -556,9 +556,10 @@ static PCRE2_SPTR posix_substitutes[] = { (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_AUTO_CALLOUT| \ PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \ PCRE2_EXTENDED|PCRE2_FIRSTLINE|PCRE2_MATCH_UNSET_BACKREF| \ - PCRE2_MULTILINE|PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \ - PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \ - PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF) + PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \ + PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \ + PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK| \ + PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF) /* Compile time error code numbers. They are given names so that they can more easily be tracked. When a new number is added, the tables called eint1 and @@ -574,7 +575,7 @@ enum { ERR0 = COMPILE_ERROR_BASE, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80, - ERR81, ERR82 }; + ERR81, ERR82, ERR83 }; /* This is a table of start-of-pattern options such as (*UTF) and settings such as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward @@ -6676,6 +6677,14 @@ for (;; ptr++) } #endif + /* The use of \C can be locked out. */ + + else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0) + { + *errorcodeptr = ERR83; + goto FAILED; + } + /* For the rest (including \X when Unicode properties are supported), we can obtain the OP value by negating the escape value in the default situation when PCRE2_UCP is not set. 
When it *is* set, we substitute diff --git a/src/pcre2_error.c b/src/pcre2_error.c index 59d89bd..31a686f 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -84,7 +84,7 @@ static const char compile_error_texts[] = /* 15 */ "reference to non-existent subpattern\0" "pattern passed as NULL\0" - "unknown compile-time option bit(s)\0" + "unrecognised compile-time option bit(s)\0" "missing ) after (?# comment\0" "parentheses are too deeply nested\0" /* 20 */ @@ -163,6 +163,7 @@ static const char compile_error_texts[] = "internal error: unknown opcode in auto_possessify()\0" "missing terminating delimiter for callout with string argument\0" "unrecognized string delimiter follows (?C\0" + "using \\C is disabled by the application\0" ; /* Match-time and UTF error texts are in the same format. */ diff --git a/src/pcre2test.c b/src/pcre2test.c index 22d3681..c7331e2 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -525,6 +525,7 @@ static modstruct modlist[] = { { "match_unset_backref", MOD_PAT, MOD_OPT, PCRE2_MATCH_UNSET_BACKREF, PO(options) }, { "memory", MOD_PD, MOD_CTL, CTL_MEMORY, PD(control) }, { "multiline", MOD_PATP, MOD_OPT, PCRE2_MULTILINE, PO(options) }, + { "never_backslash_c", MOD_PAT, MOD_OPT, PCRE2_NEVER_BACKSLASH_C, PO(options) }, { "never_ucp", MOD_PAT, MOD_OPT, PCRE2_NEVER_UCP, PO(options) }, { "never_utf", MOD_PAT, MOD_OPT, PCRE2_NEVER_UTF, PO(options) }, { "newline", MOD_CTC, MOD_NL, 0, CO(newline_convention) }, @@ -3459,7 +3460,7 @@ static void show_compile_options(uint32_t options, const char *before, const char *after) { if (options == 0) fprintf(outfile, "%s %s", before, after); -else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", +else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", before, ((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "", ((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? 
" allow_empty_class" : "", @@ -3473,6 +3474,7 @@ else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", ((options & PCRE2_FIRSTLINE) != 0)? " firstline" : "", ((options & PCRE2_MATCH_UNSET_BACKREF) != 0)? " match_unset_backref" : "", ((options & PCRE2_MULTILINE) != 0)? " multiline" : "", + ((options & PCRE2_NEVER_BACKSLASH_C) != 0)? " never_backslash_c" : "", ((options & PCRE2_NEVER_UCP) != 0)? " never_ucp" : "", ((options & PCRE2_NEVER_UTF) != 0)? " never_utf" : "", ((options & PCRE2_NO_AUTO_CAPTURE) != 0)? " no_auto_capture" : "", diff --git a/testdata/testinput2 b/testdata/testinput2 index 7b29e1c..2da1204 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4265,4 +4265,6 @@ a random value. /Ix /((?2){73}(?2))((?1))/info +/ab\Cde/never_backslash_c + # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 46adcdd..b09b237 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -14293,4 +14293,7 @@ Capturing subpattern count = 2 May match empty string Subject length lower bound = 0 +/ab\Cde/never_backslash_c +Failed: error 183 at offset 3: using \C is disabled by the application + # End of testinput2