Code for callouts with string arguments. Documentation not yet updated.

This commit is contained in:
Philip.Hazel 2015-03-11 17:44:16 +00:00
parent 24189152fe
commit d0cf279d87
18 changed files with 650 additions and 155 deletions

View File

@ -1,6 +1,12 @@
Change Log for PCRE2
--------------------
Version 10.20 xx-xx-2015
------------------------
1. Callouts with string arguments have been added.
Version 10.10 06-March-2015
---------------------------

View File

@ -9,9 +9,9 @@ dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might
dnl be defined as -RC2, for example. For real releases, it should be empty.
m4_define(pcre2_major, [10])
m4_define(pcre2_minor, [10])
m4_define(pcre2_prerelease, [])
m4_define(pcre2_date, [2015-03-06])
m4_define(pcre2_minor, [20])
m4_define(pcre2_prerelease, [-RC1])
m4_define(pcre2_date, [2015-03-11])
# NOTE: The CMakeLists.txt file searches for the above variables in the first
# 50 lines of this file. Please update that if the variables above are moved.

View File

@ -337,6 +337,9 @@ typedef struct pcre2_callout_block { \
PCRE2_SIZE current_position; /* Where we currently are in the subject */ \
PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \
PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \
/* ------------------- Added for Version 1 -------------------------- */ \
PCRE2_SPTR callout_string; /* String compiled into pattern */ \
uint32_t callout_string_length; /* Length of string compiled into pattern */ \
/* ------------------------------------------------------------------ */ \
} pcre2_callout_block;

View File

@ -604,6 +604,12 @@ for(;;)
continue;
}
if (c == OP_CALLOUT_STR)
{
code += GET(code, 1 + 2*LINK_SIZE);
continue;
}
if (c == OP_ALT)
{
do code += GET(code, 1); while (*code == OP_ALT);
@ -1234,6 +1240,10 @@ for (;;)
code += 2;
break;
case OP_CALLOUT_STR:
code += GET(code, 1 + 2*LINK_SIZE);
break;
#ifdef SUPPORT_WIDE_CHARS
case OP_XCLASS:
code += GET(code, 1);

View File

@ -573,7 +573,8 @@ enum { ERR0 = COMPILE_ERROR_BASE,
ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80 };
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
ERR81, ERR82 };
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@ -617,7 +618,6 @@ static pso pso_list[] = {
{ (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR, 12, PSO_BSR, PCRE2_BSR_UNICODE }
};
/* This table is used when converting repeating opcodes into possessified
versions as a result of an explicit possessive quantifier such as ++. A zero
value means there is no possessified version - in those cases the item in
@ -730,11 +730,11 @@ Returns: new code pointer
static PCRE2_UCHAR *
auto_callout(PCRE2_UCHAR *code, PCRE2_SPTR ptr, compile_block *cb)
{
/* NOTE(review): this hunk appears to show both sides of the change with the
add/remove markers stripped. The first five statements look like the previous
version and the last five like its replacement - confirm against the real
file before relying on either sequence. */

/* Apparent previous layout: opcode, callout number (255), pattern offset,
next item length. The pointer is advanced past the first two code units before
the two link-sized fields are written. */
*code++ = OP_CALLOUT;
*code++ = 255;
PUT(code, 0, ptr - cb->start_pattern); /* Pattern offset */
PUT(code, LINK_SIZE, 0); /* Default length */
return code + 2 * LINK_SIZE;
/* Apparent replacement layout: opcode, pattern offset, next item length, and
then the callout number (255) last. The returned pointer is advanced by the
table length for OP_CALLOUT instead of a hand-computed size. */
code[0] = OP_CALLOUT;
PUT(code, 1, ptr - cb->start_pattern); /* Pattern offset */
PUT(code, 1 + LINK_SIZE, 0); /* Default length */
code[1 + 2*LINK_SIZE] = 255;
return code + PRIV(OP_lengths)[OP_CALLOUT];
}
static void
complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr,
compile_block *cb)
{
/* NOTE(review): this hunk appears to show both sides of the change with the
add/remove markers stripped; the two pairs of statements below are
alternatives, not a sequence - confirm against the real file. */

/* Apparent previous version: the pattern offset is read from offset 2 of the
callout item and the length is stored one link further on. */
size_t length = ptr - cb->start_pattern - GET(previous_callout, 2);
PUT(previous_callout, 2 + LINK_SIZE, length);
/* Apparent replacement: the pattern offset is read from offset 1, directly
after the opcode, and the length is stored one link further on. The length is
the distance from the recorded pattern offset to the current pattern
position. */
size_t length = ptr - cb->start_pattern - GET(previous_callout, 1);
PUT(previous_callout, 1 + LINK_SIZE, length);
}
@ -909,6 +909,10 @@ for (;;)
cc += PRIV(OP_lengths)[*cc];
break;
case OP_CALLOUT_STR:
cc += GET(cc, 1 + 2*LINK_SIZE);
break;
/* Handle literal characters */
case OP_CHAR:
@ -1157,6 +1161,10 @@ for (;;)
code += PRIV(OP_lengths)[*code];
break;
case OP_CALLOUT_STR:
code += GET(code, 1 + 2*LINK_SIZE);
break;
default:
return code;
}
@ -2279,11 +2287,13 @@ for (;;)
if (c == OP_END) return NULL;
/* XCLASS is used for classes that cannot be represented just by a bit
map. This includes negated single high-valued characters. The length in
the table is zero; the actual length is stored in the compiled code. */
/* XCLASS is used for classes that cannot be represented just by a bit map.
This includes negated single high-valued characters. CALLOUT_STR is used for
callouts with string arguments. In both cases the length in the table is
zero; the actual length is stored in the compiled code. */
if (c == OP_XCLASS) code += GET(code, 1);
else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
/* Handle recursion */
@ -2442,11 +2452,13 @@ for (;;)
if (c == OP_END) return NULL;
if (c == OP_RECURSE) return code;
/* XCLASS is used for classes that cannot be represented just by a bit
map. This includes negated single high-valued characters. The length in
the table is zero; the actual length is stored in the compiled code. */
/* XCLASS is used for classes that cannot be represented just by a bit map.
This includes negated single high-valued characters. CALLOUT_STR is used for
callouts with string arguments. In both cases the length in the table is
zero; the actual length is stored in the compiled code. */
if (c == OP_XCLASS) code += GET(code, 1);
else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
/* Otherwise, we can get the item's length from the table, except that for
repeated character types, we have to test for \p and \P, which have an extra
@ -5558,30 +5570,124 @@ for (;; ptr++)
/* ------------------------------------------------------------ */
case CHAR_C: /* Callout - may be followed by digits; */
case CHAR_C: /* Callout */
previous_callout = code; /* Save for later completion */
after_manual_callout = 1; /* Skip one item before completing */
*code++ = OP_CALLOUT;
ptr++; /* Character after (?C */
/* A callout may have a string argument, delimited by one of a fixed
number of characters, or an undelimited numerical argument, or no
argument, which is the same as (?C0). Different opcodes are used for
the two cases. */
if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
{
uint32_t delimiter = 0;
for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
{
if (*ptr == PRIV(callout_start_delims)[i])
{
delimiter = PRIV(callout_end_delims)[i];
break;
}
}
if (delimiter == 0)
{
*errorcodeptr = ERR82;
goto FAILED;
}
/* During the pre-compile phase, we parse the string and update the
length. There is no need to generate any code. */
if (lengthptr != NULL) /* Only check the string */
{
PCRE2_SPTR start = ptr;
do
{
if (++ptr >= cb->end_pattern)
{
*errorcodeptr = ERR81;
ptr = start; /* To give a more useful message */
goto FAILED;
}
if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2;
}
while (ptr[0] != delimiter);
/* Start points to the opening delimiter, ptr points to the
closing delimiter. We must allow for including the delimiter and
for the terminating zero. Any doubled delimiters within the string
make this an overestimate, but it is not worth bothering about. */
(*lengthptr) += (ptr - start) + 2 + (1 + 3*LINK_SIZE);
}
/* In the real compile we can copy the string, knowing that it is
syntactically OK. The starting delimiter is included so that the
client can discover it if they want. */
else
{
PCRE2_UCHAR *callout_string = code + (1 + 3*LINK_SIZE);
*callout_string++ = *ptr++;
for(;;)
{
if (*ptr == delimiter)
{
if (ptr[1] == delimiter) ptr++; else break;
}
*callout_string++ = *ptr++;
}
*callout_string++ = CHAR_NULL;
code[0] = OP_CALLOUT_STR;
PUT(code, 1, (int)(ptr + 2 - cb->start_pattern)); /* Next offset */
PUT(code, 1 + LINK_SIZE, 0); /* Default length */
PUT(code, 1 + 2*LINK_SIZE, /* Compute size */
(int)(callout_string - code));
code = callout_string;
}
/* Advance to what should be the closing parenthesis, which is
checked below. */
ptr++;
}
/* Handle a callout with an optional numerical argument, which must be
less than or equal to 255. A missing argument gives 0. */
else
{
int n = 0;
ptr++;
while(IS_DIGIT(*ptr))
code[0] = OP_CALLOUT; /* Numerical callout */
while (IS_DIGIT(*ptr))
{
n = n * 10 + *ptr++ - CHAR_0;
if (*ptr != CHAR_RIGHT_PARENTHESIS)
{
*errorcodeptr = ERR39;
goto FAILED;
if (n > 255)
{
*errorcodeptr = ERR38;
goto FAILED;
}
}
if (n > 255)
{
*errorcodeptr = ERR38;
goto FAILED;
}
*code++ = n;
PUT(code, 0, (int)(ptr - cb->start_pattern + 1)); /* Pattern offset */
PUT(code, LINK_SIZE, 0); /* Default length */
code += 2 * LINK_SIZE;
PUT(code, 1, (int)(ptr - cb->start_pattern + 1)); /* Next offset */
PUT(code, 1 + LINK_SIZE, 0); /* Default length */
code[1 + 2*LINK_SIZE] = n; /* Callout number */
code += PRIV(OP_lengths)[OP_CALLOUT];
}
/* Both formats must have a closing parenthesis */
if (*ptr != CHAR_RIGHT_PARENTHESIS)
{
*errorcodeptr = ERR39;
goto FAILED;
}
/* Callouts cannot be quantified. */
previous = NULL;
continue;
@ -7164,7 +7270,10 @@ do {
if (op == OP_COND)
{
scode += 1 + LINK_SIZE;
if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
switch (*scode)
{
case OP_CREF:

View File

@ -161,6 +161,7 @@ static const uint8_t coptable[] = {
0, /* DNREFI */
0, /* RECURSE */
0, /* CALLOUT */
0, /* CALLOUT_STR */
0, /* Alt */
0, /* Ket */
0, /* KetRmax */
@ -233,6 +234,7 @@ static const uint8_t poptable[] = {
0, /* DNREFI */
0, /* RECURSE */
0, /* CALLOUT */
0, /* CALLOUT_STR */
0, /* Alt */
0, /* Ket */
0, /* KetRmax */
@ -2605,14 +2607,16 @@ for (;;)
is inserted between OP_COND and an assertion condition. This does not
happen for the other conditions. */
if (code[LINK_SIZE+1] == OP_CALLOUT)
if (code[LINK_SIZE + 1] == OP_CALLOUT
|| code[LINK_SIZE + 1] == OP_CALLOUT_STR)
{
unsigned int callout_length = (code[LINK_SIZE + 1] == OP_CALLOUT)
? PRIV(OP_lengths)[OP_CALLOUT] : GET(code, 2 + 3*LINK_SIZE);
rrc = 0;
if (mb->callout != NULL)
{
pcre2_callout_block cb;
cb.version = 0;
cb.callout_number = code[LINK_SIZE+2];
cb.version = 1;
cb.capture_top = 1;
cb.capture_last = 0;
cb.offset_vector = offsets;
@ -2621,13 +2625,28 @@ for (;;)
cb.subject_length = (PCRE2_SIZE)(end_subject - start_subject);
cb.start_match = (PCRE2_SIZE)(current_subject - start_subject);
cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
cb.pattern_position = GET(code, LINK_SIZE + 3);
cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
cb.pattern_position = GET(code, LINK_SIZE + 2);
cb.next_item_length = GET(code, LINK_SIZE + 2 + LINK_SIZE);
if (code[LINK_SIZE + 1] == OP_CALLOUT)
{
cb.callout_number = code[2 + 3*LINK_SIZE];
cb.callout_string = NULL;
cb.callout_string_length = 0;
}
else
{
cb.callout_number = 0;
cb.callout_string = code + (2 + 4*LINK_SIZE) + 1;
cb.callout_string_length =
callout_length - (1 + 3*LINK_SIZE) - 2;
}
if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
return rrc; /* Abandon */
}
if (rrc > 0) break; /* Fail this thread */
code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
code += callout_length; /* Skip callout data */
}
condcode = code[LINK_SIZE+1];
@ -2954,27 +2973,47 @@ for (;;)
/* Handle callouts */
case OP_CALLOUT:
rrc = 0;
if (mb->callout != NULL)
case OP_CALLOUT_STR:
{
pcre2_callout_block cb;
cb.version = 0;
cb.callout_number = code[1];
cb.capture_top = 1;
cb.capture_last = 0;
cb.offset_vector = offsets;
cb.mark = NULL; /* No (*MARK) support */
cb.subject = start_subject;
cb.subject_length = (PCRE2_SIZE)(end_subject - start_subject);
cb.start_match = (PCRE2_SIZE)(current_subject - start_subject);
cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
cb.pattern_position = GET(code, 2);
cb.next_item_length = GET(code, 2 + LINK_SIZE);
if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
return rrc; /* Abandon */
unsigned int callout_length = (*code == OP_CALLOUT)
? PRIV(OP_lengths)[OP_CALLOUT] : GET(code, 1 + 2*LINK_SIZE);
rrc = 0;
if (mb->callout != NULL)
{
pcre2_callout_block cb;
cb.version = 1;
cb.capture_top = 1;
cb.capture_last = 0;
cb.offset_vector = offsets;
cb.mark = NULL; /* No (*MARK) support */
cb.subject = start_subject;
cb.subject_length = (PCRE2_SIZE)(end_subject - start_subject);
cb.start_match = (PCRE2_SIZE)(current_subject - start_subject);
cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
cb.pattern_position = GET(code, 1);
cb.next_item_length = GET(code, 1 + LINK_SIZE);
if (*code == OP_CALLOUT)
{
cb.callout_number = code[1 + 2*LINK_SIZE];
cb.callout_string = NULL;
cb.callout_string_length = 0;
}
else
{
cb.callout_number = 0;
cb.callout_string = code + (1 + 3*LINK_SIZE) + 1;
cb.callout_string_length =
callout_length - (1 + 3*LINK_SIZE) - 2;
}
if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
return rrc; /* Abandon */
}
if (rrc == 0)
{ ADD_ACTIVE(state_offset + callout_length, 0); }
}
if (rrc == 0)
{ ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
break;

View File

@ -161,6 +161,8 @@ static const char compile_error_texts[] =
"syntax error in (?(VERSION condition\0"
/* 80 */
"internal error: unknown opcode in auto_possessify()\0"
"missing terminating delimiter for callout with string argument\0"
"unrecognized string delimiter follows (?C\0"
;
/* Match-time and UTF error texts are in the same format. */

View File

@ -1477,84 +1477,85 @@ enum {
OP_DNREFI, /* 116 Match a duplicate name backref, caselessly */
OP_RECURSE, /* 117 Match a numbered subpattern (possibly recursive) */
OP_CALLOUT, /* 118 Call out to external function if provided */
OP_CALLOUT_STR, /* 119 Call out with string argument */
OP_ALT, /* 119 Start of alternation */
OP_KET, /* 120 End of group that doesn't have an unbounded repeat */
OP_KETRMAX, /* 121 These two must remain together and in this */
OP_KETRMIN, /* 122 order. They are for groups the repeat for ever. */
OP_KETRPOS, /* 123 Possessive unlimited repeat. */
OP_ALT, /* 120 Start of alternation */
OP_KET, /* 121 End of group that doesn't have an unbounded repeat */
OP_KETRMAX, /* 122 These two must remain together and in this */
OP_KETRMIN, /* 123 order. They are for groups the repeat for ever. */
OP_KETRPOS, /* 124 Possessive unlimited repeat. */
/* The assertions must come before BRA, CBRA, ONCE, and COND, and the four
asserts must remain in order. */
OP_REVERSE, /* 124 Move pointer back - used in lookbehind assertions */
OP_ASSERT, /* 125 Positive lookahead */
OP_ASSERT_NOT, /* 126 Negative lookahead */
OP_ASSERTBACK, /* 127 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 128 Negative lookbehind */
OP_REVERSE, /* 125 Move pointer back - used in lookbehind assertions */
OP_ASSERT, /* 126 Positive lookahead */
OP_ASSERT_NOT, /* 127 Negative lookahead */
OP_ASSERTBACK, /* 128 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 129 Negative lookbehind */
/* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately
after the assertions, with ONCE first, as there's a test for >= ONCE for a
subpattern that isn't an assertion. The POS versions must immediately follow
the non-POS versions in each case. */
OP_ONCE, /* 129 Atomic group, contains captures */
OP_ONCE_NC, /* 130 Atomic group containing no captures */
OP_BRA, /* 131 Start of non-capturing bracket */
OP_BRAPOS, /* 132 Ditto, with unlimited, possessive repeat */
OP_CBRA, /* 133 Start of capturing bracket */
OP_CBRAPOS, /* 134 Ditto, with unlimited, possessive repeat */
OP_COND, /* 135 Conditional group */
OP_ONCE, /* 130 Atomic group, contains captures */
OP_ONCE_NC, /* 131 Atomic group containing no captures */
OP_BRA, /* 132 Start of non-capturing bracket */
OP_BRAPOS, /* 133 Ditto, with unlimited, possessive repeat */
OP_CBRA, /* 134 Start of capturing bracket */
OP_CBRAPOS, /* 135 Ditto, with unlimited, possessive repeat */
OP_COND, /* 136 Conditional group */
/* These five must follow the previous five, in the same order. There's a
check for >= SBRA to distinguish the two sets. */
OP_SBRA, /* 136 Start of non-capturing bracket, check empty */
OP_SBRAPOS, /* 137 Ditto, with unlimited, possessive repeat */
OP_SCBRA, /* 138 Start of capturing bracket, check empty */
OP_SCBRAPOS, /* 139 Ditto, with unlimited, possessive repeat */
OP_SCOND, /* 140 Conditional group, check empty */
OP_SBRA, /* 137 Start of non-capturing bracket, check empty */
OP_SBRAPOS, /* 138 Ditto, with unlimited, possessive repeat */
OP_SCBRA, /* 139 Start of capturing bracket, check empty */
OP_SCBRAPOS, /* 140 Ditto, with unlimited, possessive repeat */
OP_SCOND, /* 141 Conditional group, check empty */
/* The next two pairs must (respectively) be kept together. */
OP_CREF, /* 141 Used to hold a capture number as condition */
OP_DNCREF, /* 142 Used to point to duplicate names as a condition */
OP_RREF, /* 143 Used to hold a recursion number as condition */
OP_DNRREF, /* 144 Used to point to duplicate names as a condition */
OP_FALSE, /* 145 Always false (used by DEFINE and VERSION) */
OP_TRUE, /* 146 Always true (used by VERSION) */
OP_CREF, /* 142 Used to hold a capture number as condition */
OP_DNCREF, /* 143 Used to point to duplicate names as a condition */
OP_RREF, /* 144 Used to hold a recursion number as condition */
OP_DNRREF, /* 145 Used to point to duplicate names as a condition */
OP_FALSE, /* 146 Always false (used by DEFINE and VERSION) */
OP_TRUE, /* 147 Always true (used by VERSION) */
OP_BRAZERO, /* 147 These two must remain together and in this */
OP_BRAMINZERO, /* 148 order. */
OP_BRAPOSZERO, /* 149 */
OP_BRAZERO, /* 148 These two must remain together and in this */
OP_BRAMINZERO, /* 149 order. */
OP_BRAPOSZERO, /* 150 */
/* These are backtracking control verbs */
OP_MARK, /* 150 always has an argument */
OP_PRUNE, /* 151 */
OP_PRUNE_ARG, /* 152 same, but with argument */
OP_SKIP, /* 153 */
OP_SKIP_ARG, /* 154 same, but with argument */
OP_THEN, /* 155 */
OP_THEN_ARG, /* 156 same, but with argument */
OP_COMMIT, /* 157 */
OP_MARK, /* 151 always has an argument */
OP_PRUNE, /* 152 */
OP_PRUNE_ARG, /* 153 same, but with argument */
OP_SKIP, /* 154 */
OP_SKIP_ARG, /* 155 same, but with argument */
OP_THEN, /* 156 */
OP_THEN_ARG, /* 157 same, but with argument */
OP_COMMIT, /* 158 */
/* These are forced failure and success verbs */
OP_FAIL, /* 158 */
OP_ACCEPT, /* 159 */
OP_ASSERT_ACCEPT, /* 160 Used inside assertions */
OP_CLOSE, /* 161 Used before OP_ACCEPT to close open captures */
OP_FAIL, /* 159 */
OP_ACCEPT, /* 160 */
OP_ASSERT_ACCEPT, /* 161 Used inside assertions */
OP_CLOSE, /* 162 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */
OP_SKIPZERO, /* 162 */
OP_SKIPZERO, /* 163 */
/* This is used to identify a DEFINE group during compilation so that it can
be checked for having only one branch. It is changed to OP_FALSE before
compilation finishes. */
OP_DEFINE, /* 163 */
OP_DEFINE, /* 164 */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
@ -1598,7 +1599,7 @@ some cases doesn't actually use these names at all). */
"*", "*?", "+", "+?", "?", "??", "{", "{", \
"*+","++", "?+", "{", \
"class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \
"Recurse", "Callout", \
"Recurse", "Callout", "CalloutStr", \
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \
"Reverse", "Assert", "Assert not", "AssertB", "AssertB not", \
"Once", "Once_NC", \
@ -1672,7 +1673,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1+2*IMM2_SIZE, /* DNREF */ \
1+2*IMM2_SIZE, /* DNREFI */ \
1+LINK_SIZE, /* RECURSE */ \
2+2*LINK_SIZE, /* CALLOUT */ \
1+2*LINK_SIZE+1, /* CALLOUT */ \
0, /* CALLOUT_STR - variable length */ \
1+LINK_SIZE, /* Alt */ \
1+LINK_SIZE, /* Ket */ \
1+LINK_SIZE, /* KetRmax */ \
@ -1806,6 +1808,8 @@ extern const uint8_t PRIV(utf8_table4)[];
#endif
#define _pcre2_OP_lengths PCRE2_SUFFIX(_pcre2_OP_lengths_)
#define _pcre2_callout_end_delims PCRE2_SUFFIX(_pcre2_callout_end_delims_)
#define _pcre2_callout_start_delims PCRE2_SUFFIX(_pcre2_callout_start_delims_)
#define _pcre2_default_compile_context PCRE2_SUFFIX(_pcre2_default_compile_context_)
#define _pcre2_default_match_context PCRE2_SUFFIX(_pcre2_default_match_context_)
#define _pcre2_default_tables PCRE2_SUFFIX(_pcre2_default_tables_)
@ -1824,6 +1828,8 @@ extern const uint8_t PRIV(utf8_table4)[];
#define _pcre2_utt_size PCRE2_SUFFIX(_pcre2_utt_size_)
extern const uint8_t PRIV(OP_lengths)[];
extern const uint32_t PRIV(callout_end_delims)[];
extern const uint32_t PRIV(callout_start_delims)[];
extern const pcre2_compile_context PRIV(default_compile_context);
extern const pcre2_match_context PRIV(default_match_context);
extern const uint8_t PRIV(default_tables)[];

View File

@ -771,6 +771,9 @@ switch(*cc)
#endif
return cc + 1;
case OP_CALLOUT_STR:
return cc + GET(cc, 1 + 2*LINK_SIZE);
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
case OP_XCLASS:
return cc + GET(cc, 1);
@ -821,7 +824,7 @@ while (cc < ccend)
case OP_SCOND:
/* Only AUTO_CALLOUT can insert this opcode. We do
not intend to support this case. */
if (cc[1 + LINK_SIZE] == OP_CALLOUT)
if (cc[1 + LINK_SIZE] == OP_CALLOUT || cc[1 + LINK_SIZE] == OP_CALLOUT_STR)
return FALSE;
cc += 1 + LINK_SIZE;
break;
@ -855,12 +858,13 @@ while (cc < ccend)
break;
case OP_CALLOUT:
case OP_CALLOUT_STR:
if (common->capture_last_ptr == 0)
{
common->capture_last_ptr = common->ovector_start;
common->ovector_start += sizeof(sljit_sw);
}
cc += 2 + 2 * LINK_SIZE;
cc += (*cc == OP_CALLOUT) ? PRIV(OP_lengths)[OP_CALLOUT] : GET(cc, 1 + 2*LINK_SIZE);
break;
case OP_THEN_ARG:
@ -6296,7 +6300,7 @@ uint32_t i;
if (arguments->callout == NULL)
return 0;
callout_block->version = 0;
callout_block->version = 1;
/* Offsets in subject. */
callout_block->subject_length = arguments->end - arguments->begin;
@ -6333,6 +6337,10 @@ static SLJIT_INLINE PCRE2_SPTR compile_callout_matchingpath(compiler_common *com
DEFINE_COMPILER;
backtrack_common *backtrack;
sljit_si mov_opcode;
unsigned int callout_length = (*cc == OP_CALLOUT)
? PRIV(OP_lengths)[OP_CALLOUT] : GET(cc, 1 + 2 * LINK_SIZE);
sljit_sw value1;
sljit_sw value2;
PUSH_BACKTRACK(sizeof(backtrack_common), cc, NULL);
@ -6341,7 +6349,8 @@ allocate_stack(common, CALLOUT_ARG_SIZE / sizeof(sljit_sw));
SLJIT_ASSERT(common->capture_last_ptr != 0);
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), common->capture_last_ptr);
OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
OP1(SLJIT_MOV_UI, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_number), SLJIT_IMM, cc[1]);
value1 = (*cc == OP_CALLOUT) ? cc[1 + 2 * LINK_SIZE] : 0;
OP1(SLJIT_MOV_UI, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_number), SLJIT_IMM, value1);
OP1(SLJIT_MOV_UI, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(capture_last), TMP2, 0);
/* These pointer sized fields temporarily store internal variables. */
@ -6352,8 +6361,22 @@ OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(subject), TMP2, 0);
if (common->mark_ptr != 0)
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, mark_ptr));
mov_opcode = (sizeof(PCRE2_SIZE) == 4) ? SLJIT_MOV_UI : SLJIT_MOV;
OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(pattern_position), SLJIT_IMM, GET(cc, 2));
OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(next_item_length), SLJIT_IMM, GET(cc, 2 + LINK_SIZE));
OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(pattern_position), SLJIT_IMM, GET(cc, 1));
OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(next_item_length), SLJIT_IMM, GET(cc, 1 + LINK_SIZE));
if (*cc == OP_CALLOUT)
{
value1 = 0;
value2 = 0;
}
else
{
value1 = (sljit_sw) (cc + (1 + 3*LINK_SIZE) + 1);
value2 = (callout_length - (1 + 3*LINK_SIZE + 2));
}
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_string), SLJIT_IMM, value1);
OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_string_length), SLJIT_IMM, value2);
OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(mark), (common->mark_ptr != 0) ? TMP2 : SLJIT_IMM, 0);
/* Needed to save important temporary registers. */
@ -6372,7 +6395,7 @@ if (common->forced_quit_label == NULL)
add_jump(compiler, &common->forced_quit, JUMP(SLJIT_SIG_LESS));
else
JUMPTO(SLJIT_SIG_LESS, common->forced_quit_label);
return cc + 2 + 2 * LINK_SIZE;
return cc + callout_length;
}
#undef CALLOUT_ARG_SIZE
@ -8377,6 +8400,7 @@ while (cc < ccend)
break;
case OP_CALLOUT:
case OP_CALLOUT_STR:
cc = compile_callout_matchingpath(common, cc, parent);
break;
@ -9561,6 +9585,7 @@ while (current)
break;
case OP_CALLOUT:
case OP_CALLOUT_STR:
case OP_FAIL:
case OP_ACCEPT:
case OP_ASSERT_ACCEPT:

View File

@ -1310,13 +1310,15 @@ for (;;)
/* Because of the way auto-callout works during compile, a callout item is
inserted between OP_COND and an assertion condition. */
if (*ecode == OP_CALLOUT)
if (*ecode == OP_CALLOUT || *ecode == OP_CALLOUT_STR)
{
unsigned int callout_length = (*ecode == OP_CALLOUT)
? PRIV(OP_lengths)[OP_CALLOUT] : GET(ecode, 1 + 2*LINK_SIZE);
if (mb->callout != NULL)
{
pcre2_callout_block cb;
cb.version = 0;
cb.callout_number = ecode[1];
cb.version = 1;
cb.capture_top = offset_top/2;
cb.capture_last = mb->capture_last & CAPLMASK;
cb.offset_vector = mb->ovector;
@ -1325,8 +1327,23 @@ for (;;)
cb.subject_length = (PCRE2_SIZE)(mb->end_subject - mb->start_subject);
cb.start_match = (PCRE2_SIZE)(mstart - mb->start_subject);
cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject);
cb.pattern_position = GET(ecode, 2);
cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
cb.pattern_position = GET(ecode, 1);
cb.next_item_length = GET(ecode, 1 + LINK_SIZE);
if (*ecode == OP_CALLOUT)
{
cb.callout_number = ecode[1 + 2*LINK_SIZE];
cb.callout_string = NULL;
cb.callout_string_length = 0;
}
else
{
cb.callout_number = 0;
cb.callout_string = ecode + (1 + 3*LINK_SIZE) + 1;
cb.callout_string_length =
callout_length - (1 + 3*LINK_SIZE) - 2;
}
if ((rrc = mb->callout(&cb, mb->callout_data)) > 0)
RRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
@ -1335,8 +1352,8 @@ for (;;)
/* Advance ecode past the callout, so it now points to the condition. We
must adjust codelink so that the value of ecode+codelink is unchanged. */
ecode += PRIV(OP_lengths)[OP_CALLOUT];
codelink -= PRIV(OP_lengths)[OP_CALLOUT];
ecode += callout_length;
codelink -= callout_length;
}
/* Test the various possible conditions */
@ -1716,26 +1733,47 @@ for (;;)
function is able to force a failure. */
case OP_CALLOUT:
if (mb->callout != NULL)
case OP_CALLOUT_STR:
{
pcre2_callout_block cb;
cb.version = 0;
cb.callout_number = ecode[1];
cb.capture_top = offset_top/2;
cb.capture_last = mb->capture_last & CAPLMASK;
cb.offset_vector = mb->ovector;
cb.mark = mb->nomatch_mark;
cb.subject = mb->start_subject;
cb.subject_length = (PCRE2_SIZE)(mb->end_subject - mb->start_subject);
cb.start_match = (PCRE2_SIZE)(mstart - mb->start_subject);
cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject);
cb.pattern_position = GET(ecode, 2);
cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
if ((rrc = mb->callout(&cb, mb->callout_data)) > 0)
RRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
unsigned int callout_length = (*ecode == OP_CALLOUT)
? PRIV(OP_lengths)[OP_CALLOUT] : GET(ecode, 1 + 2*LINK_SIZE);
if (mb->callout != NULL)
{
pcre2_callout_block cb;
cb.version = 1;
cb.callout_number = ecode[LINK_SIZE + 1];
cb.capture_top = offset_top/2;
cb.capture_last = mb->capture_last & CAPLMASK;
cb.offset_vector = mb->ovector;
cb.mark = mb->nomatch_mark;
cb.subject = mb->start_subject;
cb.subject_length = (PCRE2_SIZE)(mb->end_subject - mb->start_subject);
cb.start_match = (PCRE2_SIZE)(mstart - mb->start_subject);
cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject);
cb.pattern_position = GET(ecode, 1);
cb.next_item_length = GET(ecode, 1 + LINK_SIZE);
if (*ecode == OP_CALLOUT)
{
cb.callout_number = ecode[1 + 2*LINK_SIZE];
cb.callout_string = NULL;
cb.callout_string_length = 0;
}
else
{
cb.callout_number = 0;
cb.callout_string = ecode + (1 + 3*LINK_SIZE) + 1;
cb.callout_string_length =
callout_length - (1 + 3*LINK_SIZE) - 2;
}
if ((rrc = mb->callout(&cb, mb->callout_data)) > 0)
RRETURN(MATCH_NOMATCH);
if (rrc < 0) RRETURN(rrc);
}
ecode += callout_length;
}
ecode += 2 + 2*LINK_SIZE;
break;
/* Recursion either matches the current regex, or some subexpression. The

View File

@ -305,6 +305,7 @@ for(;;)
{
PCRE2_SPTR ccode;
uint32_t c;
int i;
const char *flag = " ";
unsigned int extra = 0;
@ -594,8 +595,23 @@ for(;;)
goto CLASS_REF_REPEAT;
case OP_CALLOUT:
fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2),
GET(code, 2 + LINK_SIZE));
fprintf(f, " %s %d %d %d", OP_names[*code], code[1 + 2*LINK_SIZE],
GET(code, 1), GET(code, 1 + LINK_SIZE));
break;
case OP_CALLOUT_STR:
c = code[1 + 3*LINK_SIZE];
fprintf(f, " %s %c", OP_names[*code], c);
extra = GET(code, 1 + 2*LINK_SIZE);
print_custring(f, code + 2 + 3*LINK_SIZE);
for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
if (c == PRIV(callout_start_delims)[i])
{
c = PRIV(callout_end_delims)[i];
break;
}
fprintf(f, "%c %d %d", c, GET(code, 1), GET(code, 1 + LINK_SIZE));
break;
case OP_PROP:
@ -611,7 +627,6 @@ for(;;)
case OP_NCLASS:
case OP_XCLASS:
{
int i;
unsigned int min, max;
BOOL printmap;
BOOL invertmap = FALSE;

View File

@ -199,6 +199,10 @@ for (;;)
cc += PRIV(OP_lengths)[*cc];
break;
case OP_CALLOUT_STR:
cc += GET(cc, 1 + 2*LINK_SIZE);
break;
/* Skip over a subpattern that has a {0} or {0,x} quantifier */
case OP_BRAZERO:
@ -935,7 +939,11 @@ do
/* Skip over callout */
case OP_CALLOUT:
tcode += 2 + 2*LINK_SIZE;
tcode += PRIV(OP_lengths)[OP_CALLOUT];
break;
case OP_CALLOUT_STR:
tcode += GET(tcode, 1 + 2*LINK_SIZE);
break;
/* Skip over lookbehind and negative lookahead assertions */

View File

@ -66,6 +66,20 @@ adding to classes. */
const uint32_t PRIV(hspace_list)[] = { HSPACE_LIST };
const uint32_t PRIV(vspace_list)[] = { VSPACE_LIST };
/* These tables are the pairs of delimiters that are valid for callout string
arguments. For each starting delimiter there must be a matching ending
delimiter, which in fact is different only for bracket-like delimiters. */
/* Opening delimiters accepted for a callout string argument. The list is
terminated by a zero entry. */

const uint32_t PRIV(callout_start_delims)[] = {
  CHAR_GRAVE_ACCENT,
  CHAR_APOSTROPHE,
  CHAR_QUOTATION_MARK,
  CHAR_CIRCUMFLEX_ACCENT,
  CHAR_PERCENT_SIGN,
  CHAR_NUMBER_SIGN,
  CHAR_DOLLAR_SIGN,
  CHAR_LEFT_CURLY_BRACKET,
  0
};
const uint32_t PRIV(callout_end_delims[]) = {
CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK,
CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN,
CHAR_DOLLAR_SIGN, CHAR_RIGHT_CURLY_BRACKET, 0 };
/*************************************************
* Tables for UTF-8 support *

View File

@ -4519,9 +4519,9 @@ return capcount;
/* Called from a PCRE2 library as a result of the (?C) item. We print out where
we are in the match. Yield zero unless more callouts than the fail count, or
the callout data is not zero. The only differences in the callout block for
different code unit widths are that the pointers to the subject and the most
recent MARK point to strings of the appropriate width. Casts can be used to
deal with this.
different code unit widths are that the pointers to the subject, the most
recent MARK, and a callout argument string point to strings of the appropriate
width. Casts can be used to deal with this.
Argument: a pointer to a callout block
Return:
@ -4535,11 +4535,31 @@ BOOL utf = (FLD(compiled_code, overall_options) & PCRE2_UTF) != 0;
BOOL callout_capture = (dat_datctl.control & CTL_CALLOUT_CAPTURE) != 0;
FILE *f = (first_callout || callout_capture)? outfile : NULL;
/* For a callout with a string argument, show the string first because there
isn't a tidy way to fit it in the rest of the data. */
if (cb->callout_string != NULL)
{
uint32_t delimiter = CODE_UNIT(cb->callout_string, -1);
fprintf(f, "Callout: %c", delimiter);
PCHARSV(cb->callout_string, 0,
cb->callout_string_length, utf, outfile);
for (i = 0; callout_start_delims[i] != 0; i++)
if (delimiter == callout_start_delims[i])
{
delimiter = callout_end_delims[i];
break;
}
fprintf(outfile, "%c", delimiter);
if (!callout_capture) fprintf(f, "\n");
}
/* Show captured strings if required */
if (callout_capture)
{
fprintf(f, "Callout %d: last capture = %d\n",
cb->callout_number, cb->capture_last);
if (cb->callout_string == NULL) fprintf(f, "Callout %d:", cb->callout_number);
fprintf(f, " last capture = %d\n", cb->capture_last);
for (i = 0; i < cb->capture_top * 2; i += 2)
{
fprintf(f, "%2d: ", i/2);
@ -4553,7 +4573,7 @@ if (callout_capture)
fprintf(f, "\n");
}
}
/* Re-print the subject in canonical form, the first time or if giving full
details. On subsequent calls in the same match, we use pchars just to find the
printed lengths of the substrings. */
@ -4572,19 +4592,22 @@ PCHARSV(cb->subject, cb->current_position,
if (f != NULL) fprintf(f, "\n");
/* Always print appropriate indicators, with callout number if not already
shown. For automatic callouts, show the pattern offset. */
/* For automatic callouts, show the pattern offset. Otherwise, for a numerical
callout whose number has not already been shown with captured strings, show the
number here. A callout with a string argument has been displayed above. */
if (cb->callout_number == 255)
{
fprintf(outfile, "%+3d ", (int)cb->pattern_position);
if (cb->pattern_position > 99) fprintf(outfile, "\n ");
}
else
else
{
if (callout_capture) fprintf(outfile, " ");
if (callout_capture || cb->callout_string != NULL) fprintf(outfile, " ");
else fprintf(outfile, "%3d ", cb->callout_number);
}
/* Now show position indicators */
for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
fprintf(outfile, "^");

28
testdata/testinput2 vendored
View File

@ -4178,4 +4178,32 @@ a random value. /Ix
/((?+1)(\1))/B
# Callouts with string arguments
/a(?C"/
/a(?C"a/
/a(?C"a"/
/a(?C"a"bcde(?C"b")xyz/
/a(?C"a)b""c")/B
/ab(?C" any text with spaces ")cde/B
abcde
12abcde
/^a(b)c(?C1)def/
abcdef
/^a(b)c(?C"AB")def/
abcdef
/^a(b)c(?C1)def/
abcdef\=callout_capture
/^a(b)c(?C{AB})def/B
abcdef\=callout_capture
# End of testinput2

16
testdata/testinput6 vendored
View File

@ -4811,4 +4811,20 @@
/a(b)c(d)/
abc\=ph,copy=0,copy=1,getall
/ab(?C" any text with spaces ")cde/B
abcde
12abcde
/^a(b)c(?C1)def/
abcdef
/^a(b)c(?C"AB")def/
abcdef
/^a(b)c(?C1)def/
abcdef\=callout_capture
/^a(b)c(?C{AB})def/B
abcdef\=callout_capture
# End of testinput6

93
testdata/testoutput2 vendored
View File

@ -3538,7 +3538,7 @@ Subject length lower bound = 2
Failed: error 138 at offset 6: number after (?C is greater than 255
/(?Cab)xx/I
Failed: error 139 at offset 3: closing parenthesis for (?C expected
Failed: error 182 at offset 3: unrecognized string delimiter follows (?C
/(?C12vr)x/I
Failed: error 139 at offset 5: closing parenthesis for (?C expected
@ -13969,4 +13969,95 @@ Matched, but too many substrings
End
------------------------------------------------------------------
# Callouts with string arguments
/a(?C"/
Failed: error 181 at offset 4: missing terminating delimiter for callout with string argument
/a(?C"a/
Failed: error 181 at offset 4: missing terminating delimiter for callout with string argument
/a(?C"a"/
Failed: error 139 at offset 7: closing parenthesis for (?C expected
/a(?C"a"bcde(?C"b")xyz/
Failed: error 139 at offset 7: closing parenthesis for (?C expected
/a(?C"a)b""c")/B
------------------------------------------------------------------
Bra
a
CalloutStr "a)b"c" 13 0
Ket
End
------------------------------------------------------------------
/ab(?C" any text with spaces ")cde/B
------------------------------------------------------------------
Bra
ab
CalloutStr " any text with spaces " 30 1
cde
Ket
End
------------------------------------------------------------------
abcde
Callout: " any text with spaces "
--->abcde
^ ^ c
0: abcde
12abcde
Callout: " any text with spaces "
--->12abcde
^ ^ c
0: abcde
/^a(b)c(?C1)def/
abcdef
--->abcdef
1 ^ ^ d
0: abcdef
1: b
/^a(b)c(?C"AB")def/
abcdef
Callout: "AB"
--->abcdef
^ ^ d
0: abcdef
1: b
/^a(b)c(?C1)def/
abcdef\=callout_capture
Callout 1: last capture = 1
0: <unset>
1: b
--->abcdef
^ ^ d
0: abcdef
1: b
/^a(b)c(?C{AB})def/B
------------------------------------------------------------------
Bra
^
a
CBra 1
b
Ket
c
CalloutStr {AB} 14 1
def
Ket
End
------------------------------------------------------------------
abcdef\=callout_capture
Callout: {AB} last capture = 1
0: <unset>
1: b
--->abcdef
^ ^ d
0: abcdef
1: b
# End of testinput2

62
testdata/testoutput6 vendored
View File

@ -7773,4 +7773,66 @@ Partial match: abc
Copy substring 1 failed (-2): partial match
get substring list failed (-2): partial match
/ab(?C" any text with spaces ")cde/B
------------------------------------------------------------------
Bra
ab
CalloutStr " any text with spaces " 30 1
cde
Ket
End
------------------------------------------------------------------
abcde
Callout: " any text with spaces "
--->abcde
^ ^ c
0: abcde
12abcde
Callout: " any text with spaces "
--->12abcde
^ ^ c
0: abcde
/^a(b)c(?C1)def/
abcdef
--->abcdef
1 ^ ^ d
0: abcdef
/^a(b)c(?C"AB")def/
abcdef
Callout: "AB"
--->abcdef
^ ^ d
0: abcdef
/^a(b)c(?C1)def/
abcdef\=callout_capture
Callout 1: last capture = 0
0:
--->abcdef
^ ^ d
0: abcdef
/^a(b)c(?C{AB})def/B
------------------------------------------------------------------
Bra
^
a
CBra 1
b
Ket
c
CalloutStr {AB} 14 1
def
Ket
End
------------------------------------------------------------------
abcdef\=callout_capture
Callout: {AB} last capture = 0
0:
--->abcdef
^ ^ d
0: abcdef
# End of testinput6