From 7858fa702d99267daa7ba772b1235d80849e4ce8 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Tue, 26 Aug 2014 11:46:21 +0000 Subject: [PATCH] Implement (*NOTEMPTY) and (?(VERSION= features. --- src/pcre2.h.in | 4 +- src/pcre2_compile.c | 100 ++++++++++++++++++++++++---- src/pcre2_dfa_match.c | 33 ++++++++-- src/pcre2_error.c | 3 +- src/pcre2_internal.h | 148 +++++++++++++++++++++++------------------- src/pcre2_match.c | 24 ++++++- src/pcre2_printint.c | 8 ++- src/pcre2_study.c | 6 +- testdata/testinput2 | 36 ++++++++++ testdata/testinput6 | 12 ++++ testdata/testoutput2 | 90 +++++++++++++++++++++++-- testdata/testoutput6 | 24 +++++++ 12 files changed, 391 insertions(+), 97 deletions(-) diff --git a/src/pcre2.h.in b/src/pcre2.h.in index 5b14129..fd41f08 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -130,8 +130,8 @@ functions, so take care not to define synonyms by mistake. */ #define PCRE2_NOTBOL 0x00000001u #define PCRE2_NOTEOL 0x00000002u -#define PCRE2_NOTEMPTY 0x00000004u -#define PCRE2_NOTEMPTY_ATSTART 0x00000008u +#define PCRE2_NOTEMPTY 0x00000004u /* ) These two must be kept */ +#define PCRE2_NOTEMPTY_ATSTART 0x00000008u /* ) adjacent to each other. */ #define PCRE2_PARTIAL_SOFT 0x00000010u #define PCRE2_PARTIAL_HARD 0x00000020u diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 5f5b7ea..ec34caa 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -566,7 +566,7 @@ enum { ERR0 = COMPILE_ERROR_BASE, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, - ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78 }; + ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79 }; /* This is a table of start-of-pattern options such as (*UTF) and settings such as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward @@ -574,6 +574,7 @@ compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is generic and always supported. */ enum { PSO_OPT, /* Value is an option bit */ + PSO_FLG, /* Value is a flag bit */ PSO_NL, /* Value is a newline type */ PSO_BSR, /* Value is a \R type */ PSO_LIMM, /* Read integer value for match limit */ @@ -592,6 +593,8 @@ static pso pso_list[] = { { (uint8_t *)STRING_UTFn_RIGHTPAR, PSO_OPT, PCRE2_UTF }, { (uint8_t *)STRING_UTF_RIGHTPAR, 4, PSO_OPT, PCRE2_UTF }, { (uint8_t *)STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP }, + { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET }, + { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,17, PSO_FLG, PCRE2_NE_ATST_SET }, { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS }, { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE }, { (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 }, @@ -854,7 +857,8 @@ for (;;) case OP_CLOSE: case OP_COMMIT: case OP_CREF: - case OP_DEF: + case OP_FALSE: + case OP_TRUE: case OP_DNCREF: case OP_DNRREF: case OP_DOLL: @@ -1118,7 +1122,8 @@ for (;;) case OP_DNCREF: case OP_RREF: case OP_DNRREF: - case OP_DEF: + case OP_FALSE: + case OP_TRUE: code += PRIV(OP_lengths)[*code]; break; @@ -4449,10 +4454,12 @@ for (;; ptr++) PCRE2_UCHAR *bralink = NULL; PCRE2_UCHAR *brazeroptr = NULL; - /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so - we just ignore the repeat. */ + /* Repeating a DEFINE group (or any group where the condition is always + FALSE and there is only one branch) is pointless, but Perl allows the + syntax, so we just ignore the repeat. */ - if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) + if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE && + previous[GET(previous, 1)] != OP_ALT) goto END_REPEAT; /* There is no sense in actually repeating assertions. The only potential @@ -5159,10 +5166,66 @@ for (;; ptr++) namelen = -1; /* => not a name; must set to avoid warning */ name = NULL; /* Always set to avoid warning */ recno = 0; /* Always set to avoid warning */ + + /* Point at character after (?( */ + + ptr++; + /* Check for (?(VERSION[>]=n.m), which is a facility whereby indirect + users of PCRE2 via an application can discover which release of PCRE2 + is being used. */ + + if (PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 && + ptr[7] != CHAR_RIGHT_PARENTHESIS) + { + BOOL ge = FALSE; + int major = 0; + int minor = 0; + + ptr += 7; + if (*ptr == CHAR_GREATER_THAN_SIGN) + { + ge = TRUE; + ptr++; + } + + /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT + references its argument twice. */ + + if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr))) + { + *errorcodeptr = ERR79; + goto FAILED; + } + + while (IS_DIGIT(*ptr)) major = major * 10 + *ptr++ - '0'; + if (*ptr == CHAR_DOT) + { + ptr++; + while (IS_DIGIT(*ptr)) minor = minor * 10 + *ptr++ - '0'; + } + + if (*ptr != CHAR_RIGHT_PARENTHESIS) + { + *errorcodeptr = ERR79; + goto FAILED; + } + + if (ge) + code[1+LINK_SIZE] = ((PCRE2_MAJOR > major) || + (PCRE2_MAJOR == major && PCRE2_MINOR >= minor))? + OP_TRUE : OP_FALSE; + else + code[1+LINK_SIZE] = (PCRE2_MAJOR == major && PCRE2_MINOR == minor)? + OP_TRUE : OP_FALSE; + + ptr++; + skipbytes = 1; + break; /* End of condition processing */ + } + /* Check for a test for recursion in a named group. */ - ptr++; if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND) { terminator = -1; @@ -5338,11 +5401,13 @@ for (;; ptr++) } /* Similarly, check for the (?(DEFINE) "condition", which is always - false. */ + false. During compilation we set OP_DEFINE to distinguish this from + other OP_FALSE conditions so that it can be checked for having only one + branch, but after that the opcode is changed to OP_FALSE. */ else if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0) { - code[1+LINK_SIZE] = OP_DEF; + code[1+LINK_SIZE] = OP_DEFINE; skipbytes = 1; } @@ -6065,16 +6130,18 @@ for (;; ptr++) while (*tc != OP_KET); /* A DEFINE group is never obeyed inline (the "condition" is always - false). It must have only one branch. */ + false). It must have only one branch. Having checked this, change the + opcode to OP_FALSE. */ - if (code[LINK_SIZE+1] == OP_DEF) + if (code[LINK_SIZE+1] == OP_DEFINE) { if (condcount > 1) { *errorcodeptr = ERR54; goto FAILED; } - bravalue = OP_DEF; /* Just a flag to suppress char handling below */ + code[LINK_SIZE+1] = OP_FALSE; + bravalue = OP_DEFINE; /* Just a flag to suppress char handling below */ } /* A "normal" conditional group. If there is just one branch, we must not @@ -6127,7 +6194,7 @@ for (;; ptr++) /* For a DEFINE group, required and first character settings are not relevant. */ - if (bravalue == OP_DEF) break; + if (bravalue == OP_DEFINE) break; /* Handle updating of the required and first characters for other types of group. Update for normal brackets of all kinds, and conditions with two @@ -7011,7 +7078,8 @@ do { case OP_DNCREF: case OP_RREF: case OP_DNRREF: - case OP_DEF: + case OP_FALSE: + case OP_TRUE: return FALSE; default: /* Assertion */ @@ -7413,6 +7481,10 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && case PSO_OPT: cb.external_options |= p->value; break; + + case PSO_FLG: + setflags |= p->value; + break; case PSO_NL: newline = p->value; diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index 323ff33..72925e0 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -177,12 +177,12 @@ static const uint8_t coptable[] = { 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ 0, 0, /* CREF, DNCREF */ 0, 0, /* RREF, DNRREF */ - 0, /* DEF */ + 0, 0, /* FALSE, TRUE */ 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ - 0, 0 /* CLOSE, SKIPZERO */ + 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */ }; /* This table identifies those opcodes that inspect a character. It is used to @@ -249,12 +249,12 @@ static const uint8_t poptable[] = { 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */ 0, 0, /* CREF, DNCREF */ 0, 0, /* RREF, DNRREF */ - 0, /* DEF */ + 0, 0, /* FALSE, TRUE */ 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */ 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */ 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ - 0, 0 /* CLOSE, SKIPZERO */ + 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */ }; /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, @@ -2642,8 +2642,13 @@ for (;;) /* The DEFINE condition is always false */ - if (condcode == OP_DEF) + if (condcode == OP_FALSE) { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); } + + /* There is also an always-true condition */ + + if (condcode == OP_TRUE) + { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); } /* The only supported version of OP_RREF is for the value RREF_ANY, which means "test if in any recursion". We can't test for specifically @@ -3115,6 +3120,24 @@ if (re->magic_number != MAGIC_NUMBER) if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) return PCRE2_ERROR_BADMODE; +/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the +options variable for this function. Users of PCRE2 who are not calling the +function directly would like to have a way of setting these flags, in the same +way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with +constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and +(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be +transferred to the options for this function. The bits are guaranteed to be +adjacent, but do not have the same values. This bit of Boolean trickery assumes +that the match-time bits are not more significant than the flag bits. If by +accident this is not the case, a compile-time division by zero error will +occur. */ + +#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) +#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) +options |= (re->flags & FF) / ((FF & -FF) / (OO & -OO)); +#undef FF +#undef OO + /* A NULL match context means "use a default context" */ if (mcontext == NULL) diff --git a/src/pcre2_error.c b/src/pcre2_error.c index f752b5d..b60841c 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -157,7 +157,8 @@ static const char compile_error_texts[] = "using UCP is disabled by the application\0" "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0" "character code point value in \\u.... sequence is too large\0" - "digits missing in \\x{} or \\o{}\0" + "digits missing in \\x{} or \\o{}\0" + "syntax error in (?(VERSION condition\0" ; /* Match-time and UTF error texts are in the same format. */ diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index f1caeaa..759f3d5 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -223,10 +223,10 @@ else #endif /* not HAVE_MEMMOVE */ #endif /* not VPCOMPAT */ -/* External (in the C sense) functions and tables that are private to the +/* External (in the C sense) functions and tables that are private to the libraries are always referenced using the PRIV macro. This makes it possible for pcre2test.c to include some of the source files from the libraries using a -different PRIV definition to avoid name clashes. It also makes it clear in the +different PRIV definition to avoid name clashes. It also makes it clear in the code that a non-static object is being referenced. */ #ifndef PRIV @@ -387,10 +387,10 @@ other. NOTE: The values also appear in pcre2_jit_compile.c. */ #ifndef EBCDIC -/* Character U+180E (Mongolian Vowel Separator) is not included in the list of -spaces in the Unicode file PropList.txt, and Perl does not recognize it as a +/* Character U+180E (Mongolian Vowel Separator) is not included in the list of +spaces in the Unicode file PropList.txt, and Perl does not recognize it as a space. However, in many other sources it is listed as a space and has been in -PCRE for a long time. */ +PCRE for a long time. */ #define HSPACE_LIST \ CHAR_HT, CHAR_SPACE, 0xa0, \ @@ -532,6 +532,8 @@ bytes in a code unit in that mode. */ #define PCRE2_MATCH_EMPTY 0x00002000 /* pattern can match empty string */ #define PCRE2_BSR_SET 0x00004000 /* BSR was set in the pattern */ #define PCRE2_NL_SET 0x00008000 /* newline was set in the pattern */ +#define PCRE2_NOTEMPTY_SET 0x00010000 /* (*NOTEMPTY) used ) keep */ +#define PCRE2_NE_ATST_SET 0x00020000 /* (*NOTEMPTY_ATSTART) used) together */ #define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32) @@ -895,25 +897,28 @@ a positive value. */ #define STRING_xdigit "xdigit" #define STRING_DEFINE "DEFINE" +#define STRING_VERSION "VERSION" #define STRING_WEIRD_STARTWORD "[:<:]]" #define STRING_WEIRD_ENDWORD "[:>:]]" -#define STRING_CR_RIGHTPAR "CR)" -#define STRING_LF_RIGHTPAR "LF)" -#define STRING_CRLF_RIGHTPAR "CRLF)" -#define STRING_ANY_RIGHTPAR "ANY)" -#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)" -#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)" -#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)" -#define STRING_UTF8_RIGHTPAR "UTF8)" -#define STRING_UTF16_RIGHTPAR "UTF16)" -#define STRING_UTF32_RIGHTPAR "UTF32)" -#define STRING_UTF_RIGHTPAR "UTF)" -#define STRING_UCP_RIGHTPAR "UCP)" -#define STRING_NO_AUTO_POSSESS_RIGHTPAR "NO_AUTO_POSSESS)" -#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)" -#define STRING_LIMIT_MATCH_EQ "LIMIT_MATCH=" -#define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION=" +#define STRING_CR_RIGHTPAR "CR)" +#define STRING_LF_RIGHTPAR "LF)" +#define STRING_CRLF_RIGHTPAR "CRLF)" +#define STRING_ANY_RIGHTPAR "ANY)" +#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)" +#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)" +#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)" +#define STRING_UTF8_RIGHTPAR "UTF8)" +#define STRING_UTF16_RIGHTPAR "UTF16)" +#define STRING_UTF32_RIGHTPAR "UTF32)" +#define STRING_UTF_RIGHTPAR "UTF)" +#define STRING_UCP_RIGHTPAR "UCP)" +#define STRING_NO_AUTO_POSSESS_RIGHTPAR "NO_AUTO_POSSESS)" +#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)" +#define STRING_NOTEMPTY_RIGHTPAR "NOTEMPTY)" +#define STRING_NOTEMPTY_ATSTART_RIGHTPAR "NOTEMPTY_ATSTART)" +#define STRING_LIMIT_MATCH_EQ "LIMIT_MATCH=" +#define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION=" #else /* SUPPORT_UTF */ @@ -1161,25 +1166,28 @@ only. */ #define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t #define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E +#define STRING_VERSION STR_V STR_E STR_R STR_S STR_I STR_O STR_N #define STRING_WEIRD_STARTWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_LESS_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET #define STRING_WEIRD_ENDWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_GREATER_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET -#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS -#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS -#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS -#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS -#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS -#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS -#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS -#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS -#define STRING_UTF16_RIGHTPAR STR_U STR_T STR_F STR_1 STR_6 STR_RIGHT_PARENTHESIS -#define STRING_UTF32_RIGHTPAR STR_U STR_T STR_F STR_3 STR_2 STR_RIGHT_PARENTHESIS -#define STRING_UTF_RIGHTPAR STR_U STR_T STR_F STR_RIGHT_PARENTHESIS -#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS -#define STRING_NO_AUTO_POSSESS_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_A STR_U STR_T STR_O STR_UNDERSCORE STR_P STR_O STR_S STR_S STR_E STR_S STR_S STR_RIGHT_PARENTHESIS -#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS -#define STRING_LIMIT_MATCH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_M STR_A STR_T STR_C STR_H STR_EQUALS_SIGN -#define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN +#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS +#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS +#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS +#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS +#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS +#define STRING_UTF16_RIGHTPAR STR_U STR_T STR_F STR_1 STR_6 STR_RIGHT_PARENTHESIS +#define STRING_UTF32_RIGHTPAR STR_U STR_T STR_F STR_3 STR_2 STR_RIGHT_PARENTHESIS +#define STRING_UTF_RIGHTPAR STR_U STR_T STR_F STR_RIGHT_PARENTHESIS +#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS +#define STRING_NO_AUTO_POSSESS_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_A STR_U STR_T STR_O STR_UNDERSCORE STR_P STR_O STR_S STR_S STR_E STR_S STR_S STR_RIGHT_PARENTHESIS +#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS +#define STRING_NOTEMPTY_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_RIGHT_PARENTHESIS +#define STRING_NOTEMPTY_ATSTART_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_UNDERSCORE STR_A STR_T STR_S STR_T STR_A STR_R STR_T STR_RIGHT_PARENTHESIS +#define STRING_LIMIT_MATCH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_M STR_A STR_T STR_C STR_H STR_EQUALS_SIGN +#define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN #endif /* SUPPORT_UTF */ @@ -1517,39 +1525,47 @@ enum { OP_DNCREF, /* 142 Used to point to duplicate names as a condition */ OP_RREF, /* 143 Used to hold a recursion number as condition */ OP_DNRREF, /* 144 Used to point to duplicate names as a condition */ - OP_DEF, /* 145 The DEFINE condition */ + OP_FALSE, /* 145 Always false (used by DEFINE and VERSION) */ + OP_TRUE, /* 146 Always true (used by VERSION) */ - OP_BRAZERO, /* 146 These two must remain together and in this */ - OP_BRAMINZERO, /* 147 order. */ - OP_BRAPOSZERO, /* 148 */ + OP_BRAZERO, /* 147 These two must remain together and in this */ + OP_BRAMINZERO, /* 148 order. */ + OP_BRAPOSZERO, /* 149 */ /* These are backtracking control verbs */ - OP_MARK, /* 149 always has an argument */ - OP_PRUNE, /* 150 */ - OP_PRUNE_ARG, /* 151 same, but with argument */ - OP_SKIP, /* 152 */ - OP_SKIP_ARG, /* 153 same, but with argument */ - OP_THEN, /* 154 */ - OP_THEN_ARG, /* 155 same, but with argument */ - OP_COMMIT, /* 156 */ + OP_MARK, /* 150 always has an argument */ + OP_PRUNE, /* 151 */ + OP_PRUNE_ARG, /* 152 same, but with argument */ + OP_SKIP, /* 153 */ + OP_SKIP_ARG, /* 154 same, but with argument */ + OP_THEN, /* 155 */ + OP_THEN_ARG, /* 156 same, but with argument */ + OP_COMMIT, /* 157 */ /* These are forced failure and success verbs */ - OP_FAIL, /* 157 */ - OP_ACCEPT, /* 158 */ - OP_ASSERT_ACCEPT, /* 159 Used inside assertions */ - OP_CLOSE, /* 160 Used before OP_ACCEPT to close open captures */ + OP_FAIL, /* 158 */ + OP_ACCEPT, /* 159 */ + OP_ASSERT_ACCEPT, /* 160 Used inside assertions */ + OP_CLOSE, /* 161 Used before OP_ACCEPT to close open captures */ /* This is used to skip a subpattern with a {0} quantifier */ - OP_SKIPZERO, /* 161 */ + OP_SKIPZERO, /* 162 */ + + /* This is used to identify a DEFINE group during compilation so that it can + be checked for having only one branch. It is changed to OP_FALSE before + compilation finishes. */ + + OP_DEFINE, /* 163 */ /* This is not an opcode, but is used to check that tables indexed by opcode are the correct length, in order to catch updating errors - there have been some in the past. */ OP_TABLE_LENGTH + }; /* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro @@ -1594,12 +1610,13 @@ some cases doesn't actually use these names at all). */ "Cond", \ "SBra", "SBraPos", "SCBra", "SCBraPos", \ "SCond", \ - "Cond ref", "Cond dnref", "Cond rec", "Cond dnrec", "Cond def", \ + "Cond ref", "Cond dnref", "Cond rec", "Cond dnrec", \ + "Cond false", "Cond true", \ "Brazero", "Braminzero", "Braposzero", \ "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \ "*THEN", "*THEN", "*COMMIT", "*FAIL", \ "*ACCEPT", "*ASSERT_ACCEPT", \ - "Close", "Skip zero" + "Close", "Skip zero", "Define" /* This macro defines the length of fixed length operations in the compiled @@ -1684,14 +1701,15 @@ in UTF-8 mode. The code that uses this table must know about such things. */ 1+LINK_SIZE, /* SCOND */ \ 1+IMM2_SIZE, 1+2*IMM2_SIZE, /* CREF, DNCREF */ \ 1+IMM2_SIZE, 1+2*IMM2_SIZE, /* RREF, DNRREF */ \ - 1, /* DEF */ \ + 1, 1, /* FALSE, TRUE */ \ 1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \ 3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \ 1, 3, /* SKIP, SKIP_ARG */ \ 1, 3, /* THEN, THEN_ARG */ \ 1, 1, 1, 1, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ \ - 1+IMM2_SIZE, 1 /* CLOSE, SKIPZERO */ - + 1+IMM2_SIZE, 1, /* CLOSE, SKIPZERO */ \ + 1 /* DEFINE */ + /* A magic value for OP_RREF to indicate the "any recursion" condition. */ #define RREF_ANY 0xffff @@ -1757,7 +1775,7 @@ typedef struct { /* ----------------- Items that need PCRE2_CODE_UNIT_WIDTH ----------------- */ -/* When this file is included by pcre2test, PCRE2_CODE_UNIT_WIDTH is not +/* When this file is included by pcre2test, PCRE2_CODE_UNIT_WIDTH is not defined, so the following items are omitted. */ #ifdef PCRE2_CODE_UNIT_WIDTH @@ -1776,11 +1794,11 @@ However, UTF-8 tables are needed only when compiling the 8-bit library. */ #if PCRE2_CODE_UNIT_WIDTH == 8 extern const int PRIV(utf8_table1)[]; -extern const int PRIV(utf8_table1_size); +extern const int PRIV(utf8_table1_size); extern const int PRIV(utf8_table2)[]; extern const int PRIV(utf8_table3)[]; -extern const uint8_t PRIV(utf8_table4)[]; -#endif +extern const uint8_t PRIV(utf8_table4)[]; +#endif #define _pcre2_OP_lengths PCRE2_SUFFIX(_pcre2_OP_lengths_) #define _pcre2_default_tables PCRE2_SUFFIX(_pcre2_default_tables_) @@ -1857,7 +1875,7 @@ is available. */ extern void _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, const compile_block *); extern void _pcre2_compile_context_init(pcre2_compile_context *, BOOL); extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int); -extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, uint32_t *, +extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, uint32_t *, BOOL); extern size_t _pcre2_jit_get_size(void *); extern void _pcre2_match_context_init(pcre2_match_context *, BOOL); @@ -1870,7 +1888,7 @@ extern int _pcre2_strncmp(PCRE2_SPTR, PCRE2_SPTR, size_t); extern int _pcre2_strncmp_c8(PCRE2_SPTR, const char *, size_t); extern int _pcre2_study(pcre2_real_code *); extern int _pcre2_valid_utf(PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE *); -extern BOOL _pcre2_was_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, uint32_t *, +extern BOOL _pcre2_was_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, uint32_t *, BOOL); extern BOOL _pcre2_xclass(uint32_t, PCRE2_SPTR, BOOL); #endif /* PCRE2_CODE_UNIT_WIDTH */ diff --git a/src/pcre2_match.c b/src/pcre2_match.c index ed36dc7..1fc8268 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -1363,8 +1363,12 @@ for (;;) } break; - case OP_DEF: /* DEFINE - always false */ + case OP_FALSE: break; + + case OP_TRUE: + condition = TRUE; + break; /* The condition is an assertion. Call match() to evaluate it - setting mb->match_function_type to MATCH_CONDASSERT causes it to stop at the end @@ -6362,6 +6366,24 @@ if (re->magic_number != MAGIC_NUMBER) if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) return PCRE2_ERROR_BADMODE; + +/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the +options variable for this function. Users of PCRE2 who are not calling the +function directly would like to have a way of setting these flags, in the same +way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with +constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and +(*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be +transferred to the options for this function. The bits are guaranteed to be +adjacent, but do not have the same values. This bit of Boolean trickery assumes +that the match-time bits are not more significant than the flag bits. If by +accident this is not the case, a compile-time division by zero error will +occur. */ + +#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) +#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) +options |= (re->flags & FF) / ((FF & -FF) / (OO & -OO)); +#undef FF +#undef OO /* A NULL match context means "use a default context" */ diff --git a/src/pcre2_printint.c b/src/pcre2_printint.c index add6312..3067b38 100644 --- a/src/pcre2_printint.c +++ b/src/pcre2_printint.c @@ -422,8 +422,12 @@ for(;;) } break; - case OP_DEF: - fprintf(f, " Cond def"); + case OP_FALSE: + fprintf(f, " Cond false"); + break; + + case OP_TRUE: + fprintf(f, " Cond true"); break; case OP_STARI: diff --git a/src/pcre2_study.c b/src/pcre2_study.c index 7ede428..8bd18b4 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -181,7 +181,8 @@ for (;;) case OP_DNCREF: case OP_RREF: case OP_DNRREF: - case OP_DEF: + case OP_FALSE: + case OP_TRUE: case OP_CALLOUT: case OP_SOD: case OP_SOM: @@ -792,7 +793,8 @@ do case OP_COMMIT: case OP_COND: case OP_CREF: - case OP_DEF: + case OP_FALSE: + case OP_TRUE: case OP_DNCREF: case OP_DNREF: case OP_DNREFI: diff --git a/testdata/testinput2 b/testdata/testinput2 index a63220d..bf348c9 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4070,4 +4070,40 @@ a random value. /Ix /abc(?=abcde)(?=ab)/allusedtext abcabcdefg +/a*?b*?/ + ab + +/(*NOTEMPTY)a*?b*?/ + ab + ba + cb + +/(*NOTEMPTY_ATSTART)a*?b*?/aftertext + ab + cdab + +/(?(VERSION>=10.0)yes|no)/I + yesno + +/(?(VERSION=8)yes){3}/BI,aftertext + yesno + +/(?(VERSION=8)yes|no){3}/I + yesnononoyes + ** Failers + yesno + +/(?:(?abc)|xyz)(?(VERSION)yes|no)/I + abcyes + xyzno + ** Failers + abcno + xyzyes + +/(?(VERSION<10)yes|no)/ + +/(?(VERSION>10)yes|no)/ + +/(?(VERSION>=10.0.0)yes|no)/ + # End of testinput2 diff --git a/testdata/testinput6 b/testdata/testinput6 index b5e89ed..925df41 100644 --- a/testdata/testinput6 +++ b/testdata/testinput6 @@ -4798,4 +4798,16 @@ /abc(?=abcde)(?=ab)/allusedtext abcabcdefg +/a*?b*?/ + ab + +/(*NOTEMPTY)a*?b*?/ + ab + ba + cb + +/(*NOTEMPTY_ATSTART)a*?b*?/aftertext + ab + cdab + # End of testinput6 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 2c38319..a10e4dd 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -9357,7 +9357,7 @@ Partial match at offset 3: +ab Recurse Recurse Cond - Cond def + Cond false CBra 1 < [^m] @@ -9379,7 +9379,7 @@ Partial match at offset 3: +ab Recurse Recurse Cond - Cond def + Cond false CBra 1 < [\x00-/:-\xff] (neg) @@ -10095,7 +10095,7 @@ No match Recurse KetRpos Cond - Cond def + Cond false CBra 1 Any Ket @@ -10114,7 +10114,7 @@ No match Recurse KetRmax Cond - Cond def + Cond false CBra 1 Any Ket @@ -11058,7 +11058,7 @@ Matched, but too many substrings ------------------------------------------------------------------ Bra Cond - Cond def + Cond false CBra 1 a Ket @@ -13720,4 +13720,84 @@ No match 0: abcabcde >>>>> +/a*?b*?/ + ab + 0: + +/(*NOTEMPTY)a*?b*?/ + ab + 0: a + ba + 0: b + cb + 0: b + +/(*NOTEMPTY_ATSTART)a*?b*?/aftertext + ab + 0: a + 0+ b + cdab + 0: + 0+ dab + +/(?(VERSION>=10.0)yes|no)/I +Capturing subpattern count = 0 +Subject length lower bound = 2 + yesno + 0: yes + +/(?(VERSION=8)yes){3}/BI,aftertext +------------------------------------------------------------------ + Bra + Cond + Cond false + yes + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +May match empty string +Subject length lower bound = 0 + yesno + 0: + 0+ yesno + +/(?(VERSION=8)yes|no){3}/I +Capturing subpattern count = 0 +Subject length lower bound = 6 + yesnononoyes + 0: nonono + ** Failers +No match + yesno +No match + +/(?:(?abc)|xyz)(?(VERSION)yes|no)/I +Capturing subpattern count = 1 +Named capturing subpatterns: + VERSION 1 +Starting code units: a x +Subject length lower bound = 5 + abcyes + 0: abcyes + 1: abc + xyzno + 0: xyzno + ** Failers +No match + abcno +No match + xyzyes +No match + +/(?(VERSION<10)yes|no)/ +Failed: error 179 at offset 10: syntax error in (?(VERSION condition + +/(?(VERSION>10)yes|no)/ +Failed: error 179 at offset 11: syntax error in (?(VERSION condition + +/(?(VERSION>=10.0.0)yes|no)/ +Failed: error 179 at offset 16: syntax error in (?(VERSION condition + # End of testinput2 diff --git a/testdata/testoutput6 b/testdata/testoutput6 index b705798..57e39c7 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -7689,4 +7689,28 @@ Matched, but offsets vector is too small to show all matches 0: abcabcde >>>>> +/a*?b*?/ + ab + 0: ab + 1: a + 2: + +/(*NOTEMPTY)a*?b*?/ + ab + 0: ab + 1: a + ba + 0: b + cb + 0: b + +/(*NOTEMPTY_ATSTART)a*?b*?/aftertext + ab + 0: ab + 0+ + 1: a + cdab + 0: + 0+ dab + # End of testinput6