diff --git a/CMakeLists.txt b/CMakeLists.txt index b76a953..94460d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,7 +67,8 @@ # 2013-10-08 PH got rid of the "source" command, which is a bash-ism (use ".") # 2013-11-05 PH added support for PARENS_NEST_LIMIT # 2014-08-29 PH converted the file for PCRE2 (which has no C++). -# 2015-04024 PH added support for PCRE2_DEBUG +# 2015-04-24 PH added support for PCRE2_DEBUG +# 2015-07-16 PH updated for new pcre2_find_bracket source module PROJECT(PCRE2 C) @@ -390,6 +391,7 @@ SET(PCRE2_SOURCES src/pcre2_context.c src/pcre2_dfa_match.c src/pcre2_error.c + src/pcre2_find_bracket.c src/pcre2_jit_compile.c src/pcre2_maketables.c src/pcre2_match.c diff --git a/ChangeLog b/ChangeLog index 8b3a4f3..f6d4414 100644 --- a/ChangeLog +++ b/ChangeLog @@ -25,6 +25,10 @@ during the scan for named groups, and hence the external # was incorrectly treated as a comment and the invalid (?' at the end of the pattern was not diagnosed. This caused a buffer overflow during the real compile. +7. Moved the pcre2_find_bracket() function from src/pcre2_compile.c into its +own source module to avoid a circular dependency between src/pcre2_compile.c +and src/pcre2_study.c + Version 10.20 30-June-2015 -------------------------- diff --git a/Makefile.am b/Makefile.am index 56f93db..2da886b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -319,6 +319,7 @@ COMMON_SOURCES = \ src/pcre2_context.c \ src/pcre2_dfa_match.c \ src/pcre2_error.c \ + src/pcre2_find_bracket.c \ src/pcre2_internal.h \ src/pcre2_intmodedep.h \ src/pcre2_jit_compile.c \ diff --git a/NON-AUTOTOOLS-BUILD b/NON-AUTOTOOLS-BUILD index d8d9d2b..bdbff78 100644 --- a/NON-AUTOTOOLS-BUILD +++ b/NON-AUTOTOOLS-BUILD @@ -97,6 +97,7 @@ can skip ahead to the CMake section. pcre2_context.c pcre2_dfa_match.c pcre2_error.c + pcre2_find_bracket.c pcre2_jit_compile.c pcre2_maketables.c pcre2_match.c @@ -388,4 +389,4 @@ and executable, is in EBCDIC and native z/OS file formats and this is the recommended download site. ============================= -Last Updated: 15 June 2015 +Last Updated: 16 July 2015 diff --git a/PrepareRelease b/PrepareRelease index f6b138f..9323903 100755 --- a/PrepareRelease +++ b/PrepareRelease @@ -204,6 +204,7 @@ files="\ src/pcre2_context.c \ src/pcre2_dfa_match.c \ src/pcre2_error.c \ + src/pcre2_find_bracket.c \ src/pcre2_internal.h \ src/pcre2_intmodedep.h \ src/pcre2_jit_compile.c \ diff --git a/README b/README index 7367924..955d45a 100644 --- a/README +++ b/README @@ -724,6 +724,7 @@ The distribution should contain the files listed below. src/pcre2_context.c ) src/pcre2_dfa_match.c ) src/pcre2_error.c ) + src/pcre2_find_bracket.c ) src/pcre2_jit_compile.c ) src/pcre2_jit_match.c ) sources for the functions in the library, src/pcre2_jit_misc.c ) and some internal functions that they use @@ -832,4 +833,4 @@ The distribution should contain the files listed below. Philip Hazel Email local part: ph10 Email domain: cam.ac.uk -Last updated: 24 April 2015 +Last updated: 16 July 2015 diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index cc09ebd..84c1e65 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -2333,175 +2333,6 @@ return p; -/************************************************* -* Scan compiled regex for specific bracket * -*************************************************/ - -/* This function scans through a compiled pattern until it finds a -capturing bracket with the given number, or, if the number is negative, an -instance of OP_REVERSE for a lookbehind. The function is global in the C sense -so that it can be called from pcre2_study() when finding the minimum matching -length. - -Arguments: - code points to start of expression - utf TRUE in UTF mode - number the required bracket number or negative to find a lookbehind - -Returns: pointer to the opcode for the bracket, or NULL if not found -*/ - -PCRE2_SPTR -PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number) -{ -for (;;) - { - register PCRE2_UCHAR c = *code; - - if (c == OP_END) return NULL; - - /* XCLASS is used for classes that cannot be represented just by a bit map. - This includes negated single high-valued characters. CALLOUT_STR is used for - callouts with string arguments. In both cases the length in the table is - zero; the actual length is stored in the compiled code. */ - - if (c == OP_XCLASS) code += GET(code, 1); - else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); - - /* Handle recursion */ - - else if (c == OP_REVERSE) - { - if (number < 0) return (PCRE2_UCHAR *)code; - code += PRIV(OP_lengths)[c]; - } - - /* Handle capturing bracket */ - - else if (c == OP_CBRA || c == OP_SCBRA || - c == OP_CBRAPOS || c == OP_SCBRAPOS) - { - int n = (int)GET2(code, 1+LINK_SIZE); - if (n == number) return (PCRE2_UCHAR *)code; - code += PRIV(OP_lengths)[c]; - } - - /* Otherwise, we can get the item's length from the table, except that for - repeated character types, we have to test for \p and \P, which have an extra - two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we - must add in its length. */ - - else - { - switch(c) - { - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - case OP_TYPEPOSSTAR: - case OP_TYPEPOSPLUS: - case OP_TYPEPOSQUERY: - if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; - break; - - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - case OP_TYPEEXACT: - case OP_TYPEPOSUPTO: - if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) - code += 2; - break; - - case OP_MARK: - case OP_PRUNE_ARG: - case OP_SKIP_ARG: - case OP_THEN_ARG: - code += code[1]; - break; - } - - /* Add in the fixed length from the table */ - - code += PRIV(OP_lengths)[c]; - - /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be - followed by a multi-byte character. The length in the table is a minimum, so - we have to arrange to skip the extra bytes. */ - -#ifdef MAYBE_UTF_MULTI - if (utf) switch(c) - { - case OP_CHAR: - case OP_CHARI: - case OP_NOT: - case OP_NOTI: - case OP_EXACT: - case OP_EXACTI: - case OP_NOTEXACT: - case OP_NOTEXACTI: - case OP_UPTO: - case OP_UPTOI: - case OP_NOTUPTO: - case OP_NOTUPTOI: - case OP_MINUPTO: - case OP_MINUPTOI: - case OP_NOTMINUPTO: - case OP_NOTMINUPTOI: - case OP_POSUPTO: - case OP_POSUPTOI: - case OP_NOTPOSUPTO: - case OP_NOTPOSUPTOI: - case OP_STAR: - case OP_STARI: - case OP_NOTSTAR: - case OP_NOTSTARI: - case OP_MINSTAR: - case OP_MINSTARI: - case OP_NOTMINSTAR: - case OP_NOTMINSTARI: - case OP_POSSTAR: - case OP_POSSTARI: - case OP_NOTPOSSTAR: - case OP_NOTPOSSTARI: - case OP_PLUS: - case OP_PLUSI: - case OP_NOTPLUS: - case OP_NOTPLUSI: - case OP_MINPLUS: - case OP_MINPLUSI: - case OP_NOTMINPLUS: - case OP_NOTMINPLUSI: - case OP_POSPLUS: - case OP_POSPLUSI: - case OP_NOTPOSPLUS: - case OP_NOTPOSPLUSI: - case OP_QUERY: - case OP_QUERYI: - case OP_NOTQUERY: - case OP_NOTQUERYI: - case OP_MINQUERY: - case OP_MINQUERYI: - case OP_NOTMINQUERY: - case OP_NOTMINQUERYI: - case OP_POSQUERY: - case OP_POSQUERYI: - case OP_NOTPOSQUERY: - case OP_NOTPOSQUERYI: - if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); - break; - } -#else - (void)(utf); /* Keep compiler happy by referencing function argument */ -#endif /* MAYBE_UTF_MULTI */ - } - } -} - - - /************************************************* * Scan compiled regex for recursion reference * *************************************************/ diff --git a/src/pcre2_find_bracket.c b/src/pcre2_find_bracket.c new file mode 100644 index 0000000..3cdc419 --- /dev/null +++ b/src/pcre2_find_bracket.c @@ -0,0 +1,218 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2015 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + + +/* This module contains a single function that scans through a compiled pattern +until it finds a capturing bracket with the given number, or, if the number is +negative, an instance of OP_REVERSE for a lookbehind. The function is called +from pcre2_compile.c and also from pcre2_study.c when finding the minimum +matching length. */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pcre2_internal.h" + + +/************************************************* +* Scan compiled regex for specific bracket * +*************************************************/ + +/* +Arguments: + code points to start of expression + utf TRUE in UTF mode + number the required bracket number or negative to find a lookbehind + +Returns: pointer to the opcode for the bracket, or NULL if not found +*/ + +PCRE2_SPTR +PRIV(find_bracket)(PCRE2_SPTR code, BOOL utf, int number) +{ +for (;;) + { + register PCRE2_UCHAR c = *code; + + if (c == OP_END) return NULL; + + /* XCLASS is used for classes that cannot be represented just by a bit map. + This includes negated single high-valued characters. CALLOUT_STR is used for + callouts with string arguments. In both cases the length in the table is + zero; the actual length is stored in the compiled code. */ + + if (c == OP_XCLASS) code += GET(code, 1); + else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE); + + /* Handle recursion */ + + else if (c == OP_REVERSE) + { + if (number < 0) return (PCRE2_UCHAR *)code; + code += PRIV(OP_lengths)[c]; + } + + /* Handle capturing bracket */ + + else if (c == OP_CBRA || c == OP_SCBRA || + c == OP_CBRAPOS || c == OP_SCBRAPOS) + { + int n = (int)GET2(code, 1+LINK_SIZE); + if (n == number) return (PCRE2_UCHAR *)code; + code += PRIV(OP_lengths)[c]; + } + + /* Otherwise, we can get the item's length from the table, except that for + repeated character types, we have to test for \p and \P, which have an extra + two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we + must add in its length. */ + + else + { + switch(c) + { + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSPLUS: + case OP_TYPEPOSQUERY: + if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; + break; + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEEXACT: + case OP_TYPEPOSUPTO: + if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) + code += 2; + break; + + case OP_MARK: + case OP_PRUNE_ARG: + case OP_SKIP_ARG: + case OP_THEN_ARG: + code += code[1]; + break; + } + + /* Add in the fixed length from the table */ + + code += PRIV(OP_lengths)[c]; + + /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be + followed by a multi-byte character. The length in the table is a minimum, so + we have to arrange to skip the extra bytes. */ + +#ifdef MAYBE_UTF_MULTI + if (utf) switch(c) + { + case OP_CHAR: + case OP_CHARI: + case OP_NOT: + case OP_NOTI: + case OP_EXACT: + case OP_EXACTI: + case OP_NOTEXACT: + case OP_NOTEXACTI: + case OP_UPTO: + case OP_UPTOI: + case OP_NOTUPTO: + case OP_NOTUPTOI: + case OP_MINUPTO: + case OP_MINUPTOI: + case OP_NOTMINUPTO: + case OP_NOTMINUPTOI: + case OP_POSUPTO: + case OP_POSUPTOI: + case OP_NOTPOSUPTO: + case OP_NOTPOSUPTOI: + case OP_STAR: + case OP_STARI: + case OP_NOTSTAR: + case OP_NOTSTARI: + case OP_MINSTAR: + case OP_MINSTARI: + case OP_NOTMINSTAR: + case OP_NOTMINSTARI: + case OP_POSSTAR: + case OP_POSSTARI: + case OP_NOTPOSSTAR: + case OP_NOTPOSSTARI: + case OP_PLUS: + case OP_PLUSI: + case OP_NOTPLUS: + case OP_NOTPLUSI: + case OP_MINPLUS: + case OP_MINPLUSI: + case OP_NOTMINPLUS: + case OP_NOTMINPLUSI: + case OP_POSPLUS: + case OP_POSPLUSI: + case OP_NOTPOSPLUS: + case OP_NOTPOSPLUSI: + case OP_QUERY: + case OP_QUERYI: + case OP_NOTQUERY: + case OP_NOTQUERYI: + case OP_MINQUERY: + case OP_MINQUERYI: + case OP_NOTMINQUERY: + case OP_NOTMINQUERYI: + case OP_POSQUERY: + case OP_POSQUERYI: + case OP_NOTPOSQUERY: + case OP_NOTPOSQUERYI: + if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); + break; + } +#else + (void)(utf); /* Keep compiler happy by referencing function argument */ +#endif /* MAYBE_UTF_MULTI */ + } + } +} + +/* End of pcre2_find_bracket.c */