From 15e034c9c29f550f338373fa9d36d524deedad78 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sat, 14 Mar 2015 12:20:18 +0000 Subject: [PATCH] Add string offset within the pattern to the data passed to a callout with a string argument. --- src/pcre2.h.in | 1 + src/pcre2_compile.c | 10 ++++---- src/pcre2_dfa_match.c | 12 ++++++---- src/pcre2_jit_compile.c | 8 +++++-- src/pcre2_match.c | 12 ++++++---- src/pcre2_printint.c | 8 +++---- src/pcre2test.c | 3 ++- testdata/testoutput2 | 52 ++++++++++++++++++++--------------------- testdata/testoutput6 | 28 +++++++++++----------- 9 files changed, 75 insertions(+), 59 deletions(-) diff --git a/src/pcre2.h.in b/src/pcre2.h.in index d2cc3d6..04b82c6 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -338,6 +338,7 @@ typedef struct pcre2_callout_block { \ PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \ PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \ /* ------------------- Added for Version 1 -------------------------- */ \ + PCRE2_SIZE callout_string_offset; /* Offset to string within pattern */ \ PCRE2_SPTR callout_string; /* String compiled into pattern */ \ uint32_t callout_string_length; /* Length of string compiled into pattern */ \ /* ------------------------------------------------------------------ */ \ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index bb4d97a..ee167d4 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -5652,17 +5652,19 @@ for (;; ptr++) for the terminating zero. Any doubled delimiters within the string make this an overestimate, but it is not worth bothering about. */ - (*lengthptr) += (ptr - start) + 2 + (1 + 3*LINK_SIZE); + (*lengthptr) += (ptr - start) + 2 + (1 + 4*LINK_SIZE); } /* In the real compile we can copy the string, knowing that it is syntactically OK. The starting delimiter is included so that the - client can discover it if they want. */ + client can discover it if they want. We also pass the start offset to + help a script language give better error messages. */ else { - PCRE2_UCHAR *callout_string = code + (1 + 3*LINK_SIZE); + PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE); *callout_string++ = *ptr++; + PUT(code, 1 + 3*LINK_SIZE, (int)(ptr - cb->start_pattern)); /* Start offset */ for(;;) { if (*ptr == delimiter) @@ -7302,7 +7304,7 @@ do { scode += 1 + LINK_SIZE; if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT]; - else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE); + else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE); switch (*scode) { diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index be23fc8..d6b29e8 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -2631,15 +2631,17 @@ for (;;) if (code[LINK_SIZE + 1] == OP_CALLOUT) { cb.callout_number = code[2 + 3*LINK_SIZE]; + cb.callout_string_offset = 0; cb.callout_string = NULL; cb.callout_string_length = 0; } else { cb.callout_number = 0; - cb.callout_string = code + (2 + 4*LINK_SIZE) + 1; + cb.callout_string_offset = GET(code, 2 + 4*LINK_SIZE); + cb.callout_string = code + (2 + 5*LINK_SIZE) + 1; cb.callout_string_length = - callout_length - (1 + 3*LINK_SIZE) - 2; + callout_length - (1 + 4*LINK_SIZE) - 2; } if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0) @@ -2997,15 +2999,17 @@ for (;;) if (*code == OP_CALLOUT) { cb.callout_number = code[1 + 2*LINK_SIZE]; + cb.callout_string_offset = 0; cb.callout_string = NULL; cb.callout_string_length = 0; } else { cb.callout_number = 0; - cb.callout_string = code + (1 + 3*LINK_SIZE) + 1; + cb.callout_string_offset = GET(code, 1 + 3*LINK_SIZE); + cb.callout_string = code + (1 + 4*LINK_SIZE) + 1; cb.callout_string_length = - callout_length - (1 + 3*LINK_SIZE) - 2; + callout_length - (1 + 4*LINK_SIZE) - 2; } if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0) diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 33a2e49..8578891 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -6346,6 +6346,7 @@ unsigned int callout_length = (*cc == OP_CALLOUT) ? PRIV(OP_lengths)[OP_CALLOUT] : GET(cc, 1 + 2 * LINK_SIZE); sljit_sw value1; sljit_sw value2; +sljit_sw value3; PUSH_BACKTRACK(sizeof(backtrack_common), cc, NULL); @@ -6373,15 +6374,18 @@ if (*cc == OP_CALLOUT) { value1 = 0; value2 = 0; + value3 = 0; } else { - value1 = (sljit_sw) (cc + (1 + 3*LINK_SIZE) + 1); - value2 = (callout_length - (1 + 3*LINK_SIZE + 2)); + value1 = (sljit_sw) (cc + (1 + 4*LINK_SIZE) + 1); + value2 = (callout_length - (1 + 4*LINK_SIZE + 2)); + value3 = (sljit_sw) (GET(cc, 1 + 3*LINK_SIZE)); } OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_string), SLJIT_IMM, value1); OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_string_length), SLJIT_IMM, value2); +OP1(mov_opcode, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(callout_string_offset), SLJIT_IMM, value3); OP1(SLJIT_MOV, SLJIT_MEM1(STACK_TOP), CALLOUT_ARG_OFFSET(mark), (common->mark_ptr != 0) ? TMP2 : SLJIT_IMM, 0); /* Needed to save important temporary registers. */ diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 119a8f6..acc695a 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -1333,15 +1333,17 @@ for (;;) if (*ecode == OP_CALLOUT) { cb.callout_number = ecode[1 + 2*LINK_SIZE]; + cb.callout_string_offset = 0; cb.callout_string = NULL; cb.callout_string_length = 0; } else { cb.callout_number = 0; - cb.callout_string = ecode + (1 + 3*LINK_SIZE) + 1; + cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE); + cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1; cb.callout_string_length = - callout_length - (1 + 3*LINK_SIZE) - 2; + callout_length - (1 + 4*LINK_SIZE) - 2; } if ((rrc = mb->callout(&cb, mb->callout_data)) > 0) @@ -1757,15 +1759,17 @@ for (;;) if (*ecode == OP_CALLOUT) { cb.callout_number = ecode[1 + 2*LINK_SIZE]; + cb.callout_string_offset = 0; cb.callout_string = NULL; cb.callout_string_length = 0; } else { cb.callout_number = 0; - cb.callout_string = ecode + (1 + 3*LINK_SIZE) + 1; + cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE); + cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1; cb.callout_string_length = - callout_length - (1 + 3*LINK_SIZE) - 2; + callout_length - (1 + 4*LINK_SIZE) - 2; } if ((rrc = mb->callout(&cb, mb->callout_data)) > 0) diff --git a/src/pcre2_printint.c b/src/pcre2_printint.c index 0465359..e0dc33e 100644 --- a/src/pcre2_printint.c +++ b/src/pcre2_printint.c @@ -600,18 +600,18 @@ for(;;) break; case OP_CALLOUT_STR: - c = code[1 + 3*LINK_SIZE]; + c = code[1 + 4*LINK_SIZE]; fprintf(f, " %s %c", OP_names[*code], c); extra = GET(code, 1 + 2*LINK_SIZE); - print_custring(f, code + 2 + 3*LINK_SIZE); - + print_custring(f, code + 2 + 4*LINK_SIZE); for (i = 0; PRIV(callout_start_delims)[i] != 0; i++) if (c == PRIV(callout_start_delims)[i]) { c = PRIV(callout_end_delims)[i]; break; } - fprintf(f, "%c %d %d", c, GET(code, 1), GET(code, 1 + LINK_SIZE)); + fprintf(f, "%c %d %d %d", c, GET(code, 1 + 3*LINK_SIZE), GET(code, 1), + GET(code, 1 + LINK_SIZE)); break; case OP_PROP: diff --git a/src/pcre2test.c b/src/pcre2test.c index 03b0db7..ee2d904 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -4546,7 +4546,8 @@ isn't a tidy way to fit it in the rest of the data. */ if (cb->callout_string != NULL) { uint32_t delimiter = CODE_UNIT(cb->callout_string, -1); - fprintf(outfile, "Callout: %c", delimiter); + fprintf(outfile, "Callout (%lu): %c", + (unsigned long int)cb->callout_string_offset, delimiter); PCHARSV(cb->callout_string, 0, cb->callout_string_length, utf, outfile); for (i = 0; callout_start_delims[i] != 0; i++) diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 20dce48..5e22343 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -13987,7 +13987,7 @@ Failed: error 139 at offset 7: closing parenthesis for (?C expected ------------------------------------------------------------------ Bra a - CalloutStr "a)b"c" 13 0 + CalloutStr "a)b"c" 5 13 0 Ket End ------------------------------------------------------------------ @@ -13996,18 +13996,18 @@ Failed: error 139 at offset 7: closing parenthesis for (?C expected ------------------------------------------------------------------ Bra ab - CalloutStr " any text with spaces " 30 1 + CalloutStr " any text with spaces " 6 30 1 cde Ket End ------------------------------------------------------------------ abcde -Callout: " any text with spaces " +Callout (6): " any text with spaces " --->abcde ^ ^ c 0: abcde 12abcde -Callout: " any text with spaces " +Callout (6): " any text with spaces " --->12abcde ^ ^ c 0: abcde @@ -14021,7 +14021,7 @@ Callout: " any text with spaces " /^a(b)c(?C"AB")def/ abcdef -Callout: "AB" +Callout (10): "AB" --->abcdef ^ ^ d 0: abcdef @@ -14046,13 +14046,13 @@ Callout 1: last capture = 1 b Ket c - CalloutStr {AB} 14 1 + CalloutStr {AB} 10 14 1 def Ket End ------------------------------------------------------------------ abcdef\=callout_capture -Callout: {AB} last capture = 1 +Callout (10): {AB} last capture = 1 0: 1: b --->abcdef @@ -14063,14 +14063,14 @@ Callout: {AB} last capture = 1 /(?C`a``b`)(?C'a''b')(?C"a""b")(?C^a^^b^)(?C%a%%b%)(?C#a##b#)(?C$a$$b$)(?C{a}}b})/B ------------------------------------------------------------------ Bra - CalloutStr `a`b` 10 0 - CalloutStr 'a'b' 20 0 - CalloutStr "a"b" 30 0 - CalloutStr ^a^b^ 40 0 - CalloutStr %a%b% 50 0 - CalloutStr #a#b# 60 0 - CalloutStr $a$b$ 70 0 - CalloutStr {a}b} 80 0 + CalloutStr `a`b` 4 10 0 + CalloutStr 'a'b' 14 20 0 + CalloutStr "a"b" 24 30 0 + CalloutStr ^a^b^ 34 40 0 + CalloutStr %a%b% 44 50 0 + CalloutStr #a#b# 54 60 0 + CalloutStr $a$b$ 64 70 0 + CalloutStr {a}b} 74 80 0 Ket End ------------------------------------------------------------------ @@ -14080,15 +14080,15 @@ Callout: {AB} last capture = 1 Bra Bra a - CalloutStr `code` 14 0 + CalloutStr `code` 8 14 0 Ket Bra a - CalloutStr `code` 14 0 + CalloutStr `code` 8 14 0 Ket Bra a - CalloutStr `code` 14 0 + CalloutStr `code` 8 14 0 Ket Ket End @@ -14124,7 +14124,7 @@ Callout: {AB} last capture = 1 Bra ^ Cond - CalloutStr $abc$ 12 7 + CalloutStr $abc$ 7 12 7 Assert abc Ket @@ -14136,35 +14136,35 @@ Callout: {AB} last capture = 1 End ------------------------------------------------------------------ abcdefg -Callout: $abc$ +Callout (7): $abc$ --->abcdefg ^ (?=abc) 0: abcd xyz123 -Callout: $abc$ +Callout (7): $abc$ --->xyz123 ^ (?=abc) 0: xyz /^ab(?C'first')cd(?C"second")ef/ abcdefg -Callout: 'first' +Callout (7): 'first' --->abcdefg ^ ^ c -Callout: "second" +Callout (20): "second" --->abcdefg ^ ^ e 0: abcdef /(?:a(?C`code`)){3}X/ aaaXY -Callout: `code` +Callout (8): `code` --->aaaXY ^^ ) -Callout: `code` +Callout (8): `code` --->aaaXY ^ ^ ) -Callout: `code` +Callout (8): `code` --->aaaXY ^ ^ ) 0: aaaX diff --git a/testdata/testoutput6 b/testdata/testoutput6 index f05dddf..1470d2c 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -7777,18 +7777,18 @@ get substring list failed (-2): partial match ------------------------------------------------------------------ Bra ab - CalloutStr " any text with spaces " 30 1 + CalloutStr " any text with spaces " 6 30 1 cde Ket End ------------------------------------------------------------------ abcde -Callout: " any text with spaces " +Callout (6): " any text with spaces " --->abcde ^ ^ c 0: abcde 12abcde -Callout: " any text with spaces " +Callout (6): " any text with spaces " --->12abcde ^ ^ c 0: abcde @@ -7801,7 +7801,7 @@ Callout: " any text with spaces " /^a(b)c(?C"AB")def/ abcdef -Callout: "AB" +Callout (10): "AB" --->abcdef ^ ^ d 0: abcdef @@ -7823,13 +7823,13 @@ Callout 1: last capture = 0 b Ket c - CalloutStr {AB} 14 1 + CalloutStr {AB} 10 14 1 def Ket End ------------------------------------------------------------------ abcdef\=callout_capture -Callout: {AB} last capture = 0 +Callout (10): {AB} last capture = 0 0: --->abcdef ^ ^ d @@ -7865,7 +7865,7 @@ Callout: {AB} last capture = 0 Bra ^ Cond - CalloutStr $abc$ 12 7 + CalloutStr $abc$ 7 12 7 Assert abc Ket @@ -7877,35 +7877,35 @@ Callout: {AB} last capture = 0 End ------------------------------------------------------------------ abcdefg -Callout: $abc$ +Callout (7): $abc$ --->abcdefg ^ (?=abc) 0: abcd xyz123 -Callout: $abc$ +Callout (7): $abc$ --->xyz123 ^ (?=abc) 0: xyz /^ab(?C'first')cd(?C"second")ef/ abcdefg -Callout: 'first' +Callout (7): 'first' --->abcdefg ^ ^ c -Callout: "second" +Callout (20): "second" --->abcdefg ^ ^ e 0: abcdef /(?:a(?C`code`)){3}X/ aaaXY -Callout: `code` +Callout (8): `code` --->aaaXY ^^ ) -Callout: `code` +Callout (8): `code` --->aaaXY ^ ^ ) -Callout: `code` +Callout (8): `code` --->aaaXY ^ ^ ) 0: aaaX