diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 9835abc..4da565b 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -485,12 +485,12 @@ typedef struct compiler_common { jump_list *getucdtype; #if PCRE2_CODE_UNIT_WIDTH == 8 jump_list *utfreadchar; - jump_list *utfreadnewline_invalid; jump_list *utfreadtype8; jump_list *utfpeakcharback; #endif #if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 jump_list *utfreadchar_invalid; + jump_list *utfreadnewline_invalid; jump_list *utfmoveback_invalid; jump_list *utfpeakcharback_invalid; #endif @@ -640,7 +640,7 @@ the start pointers when the end of the capturing group has not yet reached. */ { \ c = ptr[1] - 0x80; \ \ - if (ptr[0] >= 0xc0 && ptr[0] <= 0xdf) \ + if (ptr[0] >= 0xc2 && ptr[0] <= 0xdf) \ { \ c |= (ptr[0] - 0xc0) << 6; \ ptr += 2; \ @@ -653,6 +653,11 @@ the start pointers when the end of the capturing group has not yet reached. */ { \ c |= (ptr[0] - 0xe0) << 12; \ ptr += 3; \ + \ + if (c < 0x800 || (c >= 0xd800 && c < 0xe000)) \ + { \ + invalid_action; \ + } \ } \ else if (ptr + 3 < end && ptr[3] >= 0x80 && ptr[3] < 0xc0) \ { \ @@ -663,7 +668,7 @@ the start pointers when the end of the capturing group has not yet reached. */ c |= (ptr[0] - 0xf0) << 18; \ ptr += 4; \ \ - if (c >= 0x110000) \ + if (c >= 0x110000 || c < 0x10000) \ { \ invalid_action; \ } \ @@ -697,7 +702,7 @@ the start pointers when the end of the capturing group has not yet reached. */ { \ c = ptr[-1] - 0x80; \ \ - if (ptr[-2] >= 0xc0 && ptr[-2] <= 0xdf) \ + if (ptr[-2] >= 0xc2 && ptr[-2] <= 0xdf) \ { \ c |= (ptr[-2] - 0xc0) << 6; \ ptr -= 2; \ @@ -710,6 +715,11 @@ the start pointers when the end of the capturing group has not yet reached. 
*/ { \ c |= (ptr[-3] - 0xe0) << 12; \ ptr -= 3; \ + \ + if (c < 0x800 || (c >= 0xd800 && c < 0xe000)) \ + { \ + invalid_action; \ + } \ } \ else if (ptr - 3 > start && ptr[-3] >= 0x80 && ptr[-3] < 0xc0) \ { \ @@ -720,7 +730,7 @@ the start pointers when the end of the capturing group has not yet reached. */ c |= (ptr[-4] - 0xf0) << 18; \ ptr -= 4; \ \ - if (c >= 0x110000) \ + if (c >= 0x110000 || c < 0x10000) \ { \ invalid_action; \ } \ @@ -3343,7 +3353,7 @@ else JUMPHERE(jump); } -static void peek_char(compiler_common *common, sljit_u32 max, sljit_s32 dst, sljit_sw dstw) +static void peek_char(compiler_common *common, sljit_u32 max, sljit_s32 dst, sljit_sw dstw, jump_list **backtracks) { /* Reads the character into TMP1, keeps STR_PTR. Does not check STR_END. TMP2, dst, RETURN_ADDR Destroyed. */ @@ -3355,6 +3365,7 @@ struct sljit_jump *jump; SLJIT_UNUSED_ARG(max); SLJIT_UNUSED_ARG(dst); SLJIT_UNUSED_ARG(dstw); +SLJIT_UNUSED_ARG(backtracks); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); @@ -3369,6 +3380,8 @@ if (common->utf) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); add_jump(compiler, common->invalid_utf ? 
&common->utfreadchar_invalid : &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw); + if (backtracks && common->invalid_utf) + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); JUMPHERE(jump); } #elif PCRE2_CODE_UNIT_WIDTH == 16 @@ -3381,7 +3394,12 @@ if (common->utf) if (common->invalid_utf) { jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); + OP1(SLJIT_MOV, dst, dstw, STR_PTR, 0); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); + OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw); + if (backtracks && common->invalid_utf) + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); } else { @@ -3392,10 +3410,20 @@ if (common->utf) OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00); OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); } + JUMPHERE(jump); } #elif PCRE2_CODE_UNIT_WIDTH == 32 - +if (common->invalid_utf) + { + if (backtracks != NULL) + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); + else + { + OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000); + CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); + } + } #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ #endif /* SUPPORT_UNICODE */ } @@ -3458,7 +3486,8 @@ if (common->utf) JUMPHERE(jump); } #elif PCRE2_CODE_UNIT_WIDTH == 32 - + if (common->invalid_utf) + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ #endif /* SUPPORT_UNICODE */ } @@ -3591,7 +3620,12 @@ if (common->utf) { OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); - add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); + + if (options & READ_CHAR_UTF8_NEWLINE) + add_jump(compiler, 
&common->utfreadnewline_invalid, JUMP(SLJIT_FAST_CALL)); + else + add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); + if (backtracks != NULL) add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); JUMPHERE(jump); @@ -3868,7 +3902,18 @@ if (common->utf) if (common->invalid_utf && !must_be_valid) { OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); + if (backtracks != NULL) + { + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + return; + } + + OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000); + OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_LESS); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2); + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + return; } #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ #endif /* SUPPORT_UNICODE */ @@ -4132,8 +4177,9 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0); static void do_utfreadnewline_invalid(compiler_common *common) { -/* Slow decoding a UTF-8 character. TMP1 contains the first byte -of the character (>= 0xc0). Return char value in TMP1. */ +/* Slow decoding a UTF-8 character, specialized for newlines. +TMP1 contains the first byte of the character (>= 0xc0). Return +char value in TMP1. */ DEFINE_COMPILER; struct sljit_jump *jump; struct sljit_jump *buffer_end_close; @@ -4506,7 +4552,8 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0); static void do_utfreadchar_invalid(compiler_common *common) { /* Slow decoding a UTF-16 character. TMP1 contains the first half -of the character (>= 0xd800). Return char value in TMP1, length in TMP2. */ +of the character (>= 0xd800). Return char value in TMP1. STR_PTR is +undefined for invalid characters. 
*/ DEFINE_COMPILER; struct sljit_jump *exit_invalid[3]; @@ -4534,6 +4581,38 @@ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); } +static void do_utfreadnewline_invalid(compiler_common *common) +{ +/* Slow-path decoder for a UTF-16 character, specialized for newline checks. +On entry TMP1 holds the first code unit (>= 0xd800). On return TMP1 is either +the constant 0x10000 or INVALID_UTF_CHAR, never the decoded value; TMP2 is overwritten. */ + +DEFINE_COMPILER; +struct sljit_jump *exit_invalid[2]; + +sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); + +/* Invalid when no code unit follows the first one (STR_PTR >= STR_END). TMP2 is reloaded below. */ +exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); + +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); +exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00); /* invalid when the first unit is >= 0xdc00 */ + +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xdc00); +OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x400); +OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS); /* TMP2 becomes the LESS flag of (second unit - 0xdc00) compared with 0x400 */ +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x10000); /* fixed result; NOTE(review): presumably sufficient because no newline is >= 0x10000 - confirm */ +OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); /* STR_PTR moves by TMP2 << 1, presumably one 16-bit unit when the flag is set, otherwise it stays put */ + +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(exit_invalid[0]); +JUMPHERE(exit_invalid[1]); +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); +} + static void do_utfmoveback_invalid(compiler_common *common) { /* Goes one character back. */ @@ -6651,7 +6730,7 @@ JUMPHERE(skipread); OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0); check_str_end(common, &skipread_list); -peek_char(common, READ_CHAR_MAX, SLJIT_MEM1(SLJIT_SP), LOCALS1); +peek_char(common, READ_CHAR_MAX, SLJIT_MEM1(SLJIT_SP), LOCALS1, &invalid_utf); /* Testing char type. This is a code duplication. 
*/ #ifdef SUPPORT_UNICODE @@ -8144,7 +8223,7 @@ switch(type) } else { - peek_char(common, common->nlmax, TMP3, 0); + peek_char(common, common->nlmax, TMP3, 0, NULL); check_newlinechar(common, common->nltype, backtracks, FALSE); } JUMPHERE(jump[0]); @@ -8506,7 +8585,7 @@ switch(type) case OP_ALLANY: if (check_str_ptr) detect_partial_match(common, backtracks); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 +#ifdef SUPPORT_UNICODE if (common->utf) { if (common->invalid_utf) @@ -8515,9 +8594,9 @@ switch(type) return cc; } +#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 #if PCRE2_CODE_UNIT_WIDTH == 8 jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0); OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); @@ -8529,12 +8608,12 @@ switch(type) OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); -#endif +#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ JUMPHERE(jump[0]); -#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16] */ return cc; +#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16] */ } -#endif +#endif /* SUPPORT_UNICODE */ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); return cc; @@ -13749,11 +13828,6 @@ if (common->utfreadchar != NULL) set_jumps(common->utfreadchar, LABEL()); do_utfreadchar(common); } -if (common->utfreadnewline_invalid != NULL) - { - set_jumps(common->utfreadnewline_invalid, LABEL()); - do_utfreadnewline_invalid(common); - } if (common->utfreadtype8 != NULL) { set_jumps(common->utfreadtype8, LABEL()); @@ -13771,6 +13845,11 @@ if (common->utfreadchar_invalid != NULL) set_jumps(common->utfreadchar_invalid, LABEL()); do_utfreadchar_invalid(common); } +if (common->utfreadnewline_invalid != NULL) + { + set_jumps(common->utfreadnewline_invalid, 
LABEL()); + do_utfreadnewline_invalid(common); + } if (common->utfmoveback_invalid) { set_jumps(common->utfmoveback_invalid, LABEL()); diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c index 748398b..14b547f 100644 --- a/src/pcre2_jit_test.c +++ b/src/pcre2_jit_test.c @@ -95,6 +95,7 @@ POSSIBILITY OF SUCH DAMAGE. static int regression_tests(void); static int invalid_utf8_regression_tests(void); static int invalid_utf16_regression_tests(void); +static int invalid_utf32_regression_tests(void); int main(void) { @@ -111,8 +112,9 @@ int main(void) return 1; } return regression_tests() - || invalid_utf8_regression_tests() - || invalid_utf16_regression_tests(); + | invalid_utf8_regression_tests() + | invalid_utf16_regression_tests() + | invalid_utf32_regression_tests(); } /* --------------------------------------------------------------------------------------- */ @@ -1917,6 +1919,11 @@ static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cas { UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" }, { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" }, + { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "#" }, + { UDA, CPI, 0, 0, 0, 0, 4, { "[^#]", NULL }, "\xf4\x8f\xbf\xbf" }, + { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xf4\x90\x80\x80" }, + { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xc1\x80" }, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"}, { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"}, { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"}, @@ -2069,53 +2076,65 @@ struct invalid_utf16_regression_test_case { const PCRE2_UCHAR16 *input; }; -static PCRE2_UCHAR16 allany[] = { '.', 0 }; -static PCRE2_UCHAR16 non_word_boundary[] = { '\\', 'B', 0 }; -static PCRE2_UCHAR16 word_boundary[] = { '\\', 'b', 0 }; -static PCRE2_UCHAR16 backreference[] = { '(', '.', ')', '\\', '1', 0 }; 
-static PCRE2_UCHAR16 grapheme[] = { '\\', 'X', 0 }; -static PCRE2_UCHAR16 test1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 }; -static PCRE2_UCHAR16 test2[] = { 0xd800, 0xdc00, '#', 0 }; -static PCRE2_UCHAR16 test3[] = { 0xdbff, 0xdfff, '#', 0 }; -static PCRE2_UCHAR16 test4[] = { 0xd800, 0xdbff, '#', 0 }; -static PCRE2_UCHAR16 test5[] = { '#', 0xd800, '#', 0 }; -static PCRE2_UCHAR16 test6[] = { 'a', 'A', 0xdc28, 0 }; -static PCRE2_UCHAR16 test7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 }; +static PCRE2_UCHAR16 allany16[] = { '.', 0 }; +static PCRE2_UCHAR16 non_word_boundary16[] = { '\\', 'B', 0 }; +static PCRE2_UCHAR16 word_boundary16[] = { '\\', 'b', 0 }; +static PCRE2_UCHAR16 backreference16[] = { '(', '.', ')', '\\', '1', 0 }; +static PCRE2_UCHAR16 grapheme16[] = { '\\', 'X', 0 }; +static PCRE2_UCHAR16 nothashmark16[] = { '[', '^', '#', ']', 0 }; +static PCRE2_UCHAR16 afternl16[] = { '^', '\\', 'W', 0 }; +static PCRE2_UCHAR16 test16_1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 }; +static PCRE2_UCHAR16 test16_2[] = { 0xd800, 0xdc00, '#', 0 }; +static PCRE2_UCHAR16 test16_3[] = { 0xdbff, 0xdfff, '#', 0 }; +static PCRE2_UCHAR16 test16_4[] = { 0xd800, 0xdbff, '#', 0 }; +static PCRE2_UCHAR16 test16_5[] = { '#', 0xd800, '#', 0 }; +static PCRE2_UCHAR16 test16_6[] = { 'a', 'A', 0xdc28, 0 }; +static PCRE2_UCHAR16 test16_7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 }; +static PCRE2_UCHAR16 test16_8[] = { '#', 0xd800, 0xdc00, 0 }; +static PCRE2_UCHAR16 test16_9[] = { ' ', 0x2028, '#', 0 }; +static PCRE2_UCHAR16 test16_10[] = { ' ', 0xdc00, 0xd800, 0x2028, '#', 0 }; static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = { - { UDA, CI, 0, 0, 0, 0, 1, { allany, NULL }, test1 }, - { UDA, CI, 1, 0, 0, 1, 2, { allany, NULL }, test1 }, - { UDA, CI, 2, 0, 0, 2, 3, { allany, NULL }, test1 }, - { UDA, CI, 3, 0, 0, 3, 4, { allany, NULL }, test1 }, - { UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test2 }, - { UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test2 
}, - { UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test2 }, - { UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test3 }, - { UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test3 }, - { UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test3 }, + { UDA, CI, 0, 0, 0, 0, 1, { allany16, NULL }, test16_1 }, + { UDA, CI, 1, 0, 0, 1, 2, { allany16, NULL }, test16_1 }, + { UDA, CI, 2, 0, 0, 2, 3, { allany16, NULL }, test16_1 }, + { UDA, CI, 3, 0, 0, 3, 4, { allany16, NULL }, test16_1 }, + { UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_2 }, + { UDA, CI, 0, 0, 2, -1, -1, { allany16, NULL }, test16_2 }, + { UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_2 }, + { UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_3 }, + { UDA, CI, 0, 0, 2, -1, -1, { allany16, NULL }, test16_3 }, + { UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_3 }, - { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary, NULL }, test1 }, - { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test1 }, - { UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary, NULL }, test1 }, - { UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary, NULL }, test1 }, - { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test2 }, - { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test3 }, - { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test2 }, - { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test3 }, - { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test4 }, - { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test5 }, + { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary16, NULL }, test16_1 }, + { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_1 }, + { UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary16, NULL }, test16_1 }, + { UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary16, NULL }, test16_1 }, + { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_2 }, + { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_3 }, + { UDA, CPI, 2, 
1, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_2 }, + { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_3 }, + { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_4 }, + { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_5 }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference, NULL }, test6 }, - { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference, NULL }, test6 }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference, NULL }, test7 }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference, NULL }, test7 }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference16, NULL }, test16_6 }, + { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference16, NULL }, test16_6 }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference16, NULL }, test16_7 }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference16, NULL }, test16_7 }, - { UDA, CPI, 0, 0, 0, 0, 1, { grapheme, NULL }, test6 }, - { UDA, CPI, 1, 0, 0, 1, 2, { grapheme, NULL }, test6 }, - { UDA, CPI, 2, 0, 0, -1, -1, { grapheme, NULL }, test6 }, - { UDA, CPI, 0, 0, 0, 0, 2, { grapheme, NULL }, test7 }, - { UDA, CPI, 2, 0, 0, 2, 4, { grapheme, NULL }, test7 }, - { UDA, CPI, 1, 0, 0, -1, -1, { grapheme, NULL }, test7 }, + { UDA, CPI, 0, 0, 0, 0, 1, { grapheme16, NULL }, test16_6 }, + { UDA, CPI, 1, 0, 0, 1, 2, { grapheme16, NULL }, test16_6 }, + { UDA, CPI, 2, 0, 0, -1, -1, { grapheme16, NULL }, test16_6 }, + { UDA, CPI, 0, 0, 0, 0, 2, { grapheme16, NULL }, test16_7 }, + { UDA, CPI, 2, 0, 0, 2, 4, { grapheme16, NULL }, test16_7 }, + { UDA, CPI, 1, 0, 0, -1, -1, { grapheme16, NULL }, test16_7 }, + + { UDA, CPI, 0, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 }, + { UDA, CPI, 1, 0, 0, 1, 3, { nothashmark16, NULL }, test16_8 }, + { UDA, CPI, 2, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 }, + + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl16, 
NULL }, test16_9 }, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { afternl16, NULL }, test16_10 }, { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL } }; @@ -2239,4 +2258,178 @@ static int invalid_utf16_regression_tests(void) #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_16 */ +#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_32 + +#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED) +#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF) +#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF) + +struct invalid_utf32_regression_test_case { + int compile_options; + int jit_compile_options; + int start_offset; + int skip_left; + int skip_right; + int match_start; + int match_end; + const PCRE2_UCHAR32 *pattern[2]; + const PCRE2_UCHAR32 *input; +}; + +static PCRE2_UCHAR32 allany32[] = { '.', 0 }; +static PCRE2_UCHAR32 non_word_boundary32[] = { '\\', 'B', 0 }; +static PCRE2_UCHAR32 word_boundary32[] = { '\\', 'b', 0 }; +static PCRE2_UCHAR32 backreference32[] = { '(', '.', ')', '\\', '1', 0 }; +static PCRE2_UCHAR32 grapheme32[] = { '\\', 'X', 0 }; +static PCRE2_UCHAR32 nothashmark32[] = { '[', '^', '#', ']', 0 }; +static PCRE2_UCHAR32 afternl32[] = { '^', '\\', 'W', 0 }; +static PCRE2_UCHAR32 test32_1[] = { 0x10ffff, 0x10ffff, 0x110000, 0x10ffff, 0 }; +static PCRE2_UCHAR32 test32_2[] = { 'a', 'A', 0x110000, 0 }; +static PCRE2_UCHAR32 test32_3[] = { '#', 0x10ffff, 0x110000, 0 }; +static PCRE2_UCHAR32 test32_4[] = { ' ', 0x2028, '#', 0 }; +static PCRE2_UCHAR32 test32_5[] = { ' ', 0x110000, 0x2028, '#', 0 }; + +static struct invalid_utf32_regression_test_case invalid_utf32_regression_test_cases[] = { + { UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_1 }, + { UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_1 }, + + { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_1 }, + { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 }, + { UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, 
word_boundary32 }, test32_1 }, + + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_2 }, + { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_2 }, + + { UDA, CPI, 0, 0, 0, 0, 1, { grapheme32, NULL }, test32_1 }, + { UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_1 }, + + { UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 }, + { UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_3 }, + { UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 }, + + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_4 }, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_5 }, + + { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL } +}; + +#undef UDA +#undef CI +#undef CPI +/* Runs one pattern slot (current->pattern[i]) of a test case through the 32-bit API: compile, JIT-compile, then JIT-match in complete and/or partial-soft mode as selected by jit_compile_options. Returns 1 on success or when the slot is NULL, 0 on any failure. */ +static int run_invalid_utf32_test(struct invalid_utf32_regression_test_case *current, + int pattern_index, int i, pcre2_compile_context_32 *ccontext, pcre2_match_data_32 *mdata) +{ + pcre2_code_32 *code; + int result, errorcode; + PCRE2_SIZE length, erroroffset; + const PCRE2_UCHAR32 *input; + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(mdata); + + if (current->pattern[i] == NULL) + return 1; + + code = pcre2_compile_32(current->pattern[i], PCRE2_ZERO_TERMINATED, + current->compile_options, &errorcode, &erroroffset, ccontext); + + if (!code) { /* NOTE(review): both diagnostics below print a literal :0 slot index even when i is 1 */ + printf("Pattern[%d:0] cannot be compiled. 
Error offset: %d\n", pattern_index, (int)erroroffset); + return 0; + } + + if (pcre2_jit_compile_32(code, current->jit_compile_options) != 0) { + printf("Pattern[%d:0] cannot be compiled by the JIT compiler.\n", pattern_index); + pcre2_code_free_32(code); + return 0; + } + + input = current->input; + length = 0; + + while (*input++ != 0) /* count code units up to the zero terminator */ + length++; + + length -= current->skip_left + current->skip_right; /* subject is the window left after trimming both ends */ + + if (current->jit_compile_options & PCRE2_JIT_COMPLETE) { + result = pcre2_jit_match_32(code, (current->input + current->skip_left), + length, current->start_offset - current->skip_left, 0, mdata, NULL); /* start offset is rebased to the trimmed subject */ + + if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) { + pcre2_code_free_32(code); + return 0; + } + } + + if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) { + result = pcre2_jit_match_32(code, (current->input + current->skip_left), + length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL); + + if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) { + pcre2_code_free_32(code); + return 0; + } + } + + pcre2_code_free_32(code); + return 1; +} +/* Walks invalid_utf32_regression_test_cases until an entry with a NULL pattern[0], running both pattern slots of every case and printing one dot per case. Returns 0 when every case passed, 1 otherwise. */ +static int invalid_utf32_regression_tests(void) +{ + struct invalid_utf32_regression_test_case *current; + pcre2_compile_context_32 *ccontext; + pcre2_match_data_32 *mdata; + int total = 0, successful = 0; + int result; + + printf("\nRunning invalid-utf32 JIT regression tests\n"); + + ccontext = pcre2_compile_context_create_32(NULL); /* NOTE(review): neither create call is checked for a NULL result */ + pcre2_set_newline_32(ccontext, PCRE2_NEWLINE_ANY); + mdata = pcre2_match_data_create_32(4, NULL); + + for (current = invalid_utf32_regression_test_cases; current->pattern[0]; current++) { + /* printf("\nPattern: %s :\n", current->pattern); */ + total++; + + result = 1; /* a case passes only if both slots pass; a NULL second slot counts as a pass */ + if (!run_invalid_utf32_test(current, total - 1, 0, ccontext, mdata)) + result = 0; + if (!run_invalid_utf32_test(current, total - 1, 1, ccontext, mdata)) + 
result = 0; + + if (result) { + successful++; + } + + printf("."); + if ((total % 60) == 0) /* wrap the progress dots every 60 cases */ + printf("\n"); + } + + if ((total % 60) != 0) + printf("\n"); + + pcre2_match_data_free_32(mdata); + pcre2_compile_context_free_32(ccontext); + + if (total == successful) { + printf("\nAll invalid UTF32 JIT regression tests are successfully passed.\n"); + return 0; + } else { /* total is nonzero here, since total != successful implies at least one case ran */ + printf("\nInvalid UTF32 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful); + return 1; + } +} + +#else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_32 */ +/* Stub for builds without Unicode or 32-bit support: runs nothing and returns 0. */ +static int invalid_utf32_regression_tests(void) +{ + return 0; +} + +#endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_32 */ + /* End of pcre2_jit_test.c */