From 7bf5d9efd0241832b9071de86bbfe9c8db030d00 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Wed, 11 Nov 2015 09:42:26 +0000 Subject: [PATCH] Small optimizations in pcre2_study.c --- ChangeLog | 2 ++ doc/pcre2api.3 | 11 +++++---- src/pcre2_study.c | 55 ++++++++++++++++++++++++------------------- testdata/testoutput15 | 6 ++--- testdata/testoutput17 | 6 ++--- testdata/testoutput2 | 2 +- 6 files changed, 47 insertions(+), 35 deletions(-) diff --git a/ChangeLog b/ChangeLog index c7b2a96..5ad577b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -282,6 +282,8 @@ a factor of the size of the compiling workspace (it currently is). 81. Check for integer overflow in minimum length calculation and cap it at 65535. +82. Small optimizations in code for finding the minimum matching length. + Version 10.20 30-June-2015 -------------------------- diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 4c15471..9d71c93 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "05 November 2015" "PCRE2 10.21" +.TH PCRE2API 3 "10 November 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -1684,8 +1684,11 @@ value, 0 is returned. .sp PCRE2_INFO_MATCHEMPTY .sp -Return 1 if the pattern can match an empty string, otherwise 0. The third -argument should point to an \fBuint32_t\fP variable. +Return 1 if the pattern might match an empty string, otherwise 0. The third +argument should point to an \fBuint32_t\fP variable. When a pattern contains +recursive subroutine calls it is not always possible to determine whether or +not it can match an empty string. PCRE2 takes a cautious approach and returns 1 +in such cases. .sp PCRE2_INFO_MATCHLIMIT .sp @@ -3084,6 +3087,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 05 November 2015 +Last updated: 10 November 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/src/pcre2_study.c b/src/pcre2_study.c index de5ea92..3c61012 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -104,18 +104,21 @@ recurse_check this_recurse; register int branchlength = 0; register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE; +/* If this is a "could be empty" group, its minimum length is 0. */ + +if (*code >= OP_SBRA && *code <= OP_SCOND) return 0; + +/* Skip over capturing bracket number */ + +if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE; + /* A large and/or complex regex can take too long to process. */ if ((*countptr)++ > 1000) return -1; -/* Skip over capturing bracket number */ - -if (*code == OP_CBRA || *code == OP_SCBRA || - *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE; - -/* Scan along the opcodes for this branch. If we get to the end of the -branch, check the length against that of the other branches. If the accumulated -length passes 16-bits, stop and return it. */ +/* Scan along the opcodes for this branch. If we get to the end of the branch, +check the length against that of the other branches. If the accumulated length +passes 16-bits, stop. */ for (;;) { @@ -1543,24 +1546,28 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 && if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET; } -/* Find the minimum length of subject string. */ +/* Find the minimum length of subject string. If it can match an empty string, +the minimum length is already known. */ -switch(min = find_minlength(re, code, code, utf, NULL, &count)) +if ((re->flags & PCRE2_MATCH_EMPTY) == 0) { - case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */ - break; /* Leave minlength unchanged (will be zero) */ - - case -2: - return 2; /* missing capturing bracket */ - - case -3: - return 3; /* unrecognized opcode */ - - default: - if (min > UINT16_MAX) min = UINT16_MAX; - re->minlength = min; - break; - } + switch(min = find_minlength(re, code, code, utf, NULL, &count)) + { + case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */ + break; /* Leave minlength unchanged (will be zero) */ + + case -2: + return 2; /* missing capturing bracket */ + + case -3: + return 3; /* unrecognized opcode */ + + default: + if (min > UINT16_MAX) min = UINT16_MAX; + re->minlength = min; + break; + } + } return 0; } diff --git a/testdata/testoutput15 b/testdata/testoutput15 index 0dad75a..bb29a49 100644 --- a/testdata/testoutput15 +++ b/testdata/testoutput15 @@ -252,7 +252,7 @@ Failed: error -52: nested recursion at the same subject position /(a|(?R))/I Capturing subpattern count = 1 May match empty string -Subject length lower bound = 1 +Subject length lower bound = 0 abcd 0: a 1: a @@ -262,7 +262,7 @@ Failed: error -52: nested recursion at the same subject position /(ab|(bc|(de|(?R))))/I Capturing subpattern count = 3 May match empty string -Subject length lower bound = 2 +Subject length lower bound = 0 abcd 0: ab 1: ab @@ -272,7 +272,7 @@ Failed: error -52: nested recursion at the same subject position /(ab|(bc|(de|(?1))))/I Capturing subpattern count = 3 May match empty string -Subject length lower bound = 2 +Subject length lower bound = 0 abcd 0: ab 1: ab diff --git a/testdata/testoutput17 b/testdata/testoutput17 index 53c55b8..9227ee6 100644 --- a/testdata/testoutput17 +++ b/testdata/testoutput17 @@ -416,7 +416,7 @@ Failed: error -46: JIT stack limit reached /(a|(?R))/I Capturing subpattern count = 1 May match empty string -Subject length lower bound = 1 +Subject length lower bound = 0 JIT compilation was successful abcd 0: a (JIT) @@ -427,7 +427,7 @@ Failed: error -46: JIT stack limit reached /(ab|(bc|(de|(?R))))/I Capturing subpattern count = 3 May match empty string -Subject length lower bound = 2 +Subject length lower bound = 0 JIT compilation was successful abcd 0: ab (JIT) @@ -438,7 +438,7 @@ Failed: error -46: JIT stack limit reached /(ab|(bc|(de|(?1))))/I Capturing subpattern count = 3 May match empty string -Subject length lower bound = 2 +Subject length lower bound = 0 JIT compilation was successful abcd 0: ab (JIT) diff --git a/testdata/testoutput2 b/testdata/testoutput2 index c149674..2807628 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -3960,7 +3960,7 @@ Subject length lower bound = 3 Capturing subpattern count = 2 Compile options: Overall options: anchored -Subject length lower bound = 3 +Subject length lower bound = 2 a=a 0: a=a 1: a