From 54f59d3c050de360c10ac7d1f9733bc50dcb382e Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Fri, 17 Jul 2015 13:41:09 +0000 Subject: [PATCH] Ignore {1} quantifiers. --- ChangeLog | 3 +++ src/pcre2_compile.c | 26 +++++++++++++++----------- testdata/testinput2 | 6 +++++- testdata/testoutput2 | 27 ++++++++++++++++++++++----- 4 files changed, 45 insertions(+), 17 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6b3177e..58ee47f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -37,6 +37,9 @@ overflow. This bug was discovered by Karl Skomski with the LLVM fuzzer. 9. The handling of callouts during the pre-pass for named group identification has been tightened up. +10. The quantifier {1} can be ignored, whether greedy, non-greedy, or +possessive. This is a very minor optimization. + Version 10.20 30-June-2015 -------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 7e0d4fc..8563502 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -3253,11 +3253,11 @@ for (; ptr < cb->end_pattern; ptr++) else top_nest->nest_depth = nest_depth; } break; - + /* Skip over a numerical or string argument for a callout. */ - + case CHAR_C: - ptr += 2; + ptr += 2; if (ptr[1] == CHAR_RIGHT_PARENTHESIS) break; if (IS_DIGIT(ptr[1])) { @@ -3265,14 +3265,14 @@ for (; ptr < cb->end_pattern; ptr++) if (ptr[1] != CHAR_RIGHT_PARENTHESIS) { errorcode = ERR39; - ptr++; + ptr++; goto FAILED; - } + } break; - } + } /* Handle a string argument */ - + ptr++; delimiter = 0; for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) @@ -3302,8 +3302,8 @@ for (; ptr < cb->end_pattern; ptr++) if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2; } while (ptr[0] != delimiter); - break; - + break; + case CHAR_NUMBER_SIGN: ptr += 3; while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; @@ -4719,6 +4719,10 @@ for (;; ptr++) } else repeat_type = greedy_default; + /* If the repeat is {1} we can ignore it. */ + + if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT; + /* If previous was a recursion call, wrap it in atomic brackets so that previous becomes the atomic group. All recursions were so wrapped in the past, but it no longer happens for non-repeated recursions. In fact, the @@ -6113,8 +6117,8 @@ for (;; ptr++) } /* During the pre-compile phase, we parse the string and update the - length. There is no need to generate any code. (In fact, the string - has already been parsed in the pre-pass that looks for named + length. There is no need to generate any code. (In fact, the string + has already been parsed in the pre-pass that looks for named parentheses, but it does no harm to leave this code in.) */ if (lengthptr != NULL) /* Only check the string */ diff --git a/testdata/testinput2 b/testdata/testinput2 index 8ab3043..2958e6d 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -1259,7 +1259,11 @@ /(a(b(?2)c)){0,2}/IB -/[ab]{1}+/IB +/[ab]{1}+/B + +/()(?1){1}/B + +/()(?1)/B /((w\/|-|with)*(free|immediate)*.*?shipping\s*[!.-]*)/Ii Baby Bjorn Active Carrier - With free SHIPPING!! diff --git a/testdata/testoutput2 b/testdata/testoutput2 index c446aaf..4d1a54c 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -4533,16 +4533,33 @@ Capturing subpattern count = 2 May match empty string Subject length lower bound = 0 -/[ab]{1}+/IB +/[ab]{1}+/B ------------------------------------------------------------------ Bra - [ab]{1,1}+ + [ab] + Ket + End +------------------------------------------------------------------ + +/()(?1){1}/B +------------------------------------------------------------------ + Bra + CBra 1 + Ket + Recurse + Ket + End +------------------------------------------------------------------ + +/()(?1)/B +------------------------------------------------------------------ + Bra + CBra 1 + Ket + Recurse Ket End ------------------------------------------------------------------ -Capturing subpattern count = 0 -Starting code units: a b -Subject length lower bound = 1 /((w\/|-|with)*(free|immediate)*.*?shipping\s*[!.-]*)/Ii Capturing subpattern count = 3