diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 3c614be..ba9d3bf 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -485,6 +485,7 @@ typedef struct compiler_common { jump_list *getucdtype; #if PCRE2_CODE_UNIT_WIDTH == 8 jump_list *utfreadchar; + jump_list *utfreadchar_invalid_precise; jump_list *utfreadtype8; jump_list *utfpeakcharback; #endif @@ -3462,8 +3463,13 @@ if (common->utf) #endif /* SUPPORT_UNICODE */ } -static void read_char_range(compiler_common *common, sljit_u32 min, sljit_u32 max, - jump_list **backtracks, BOOL update_str_ptr) +#define READ_CHAR_UPDATE_STR_PTR 0x1 +#define READ_CHAR_UPDATE_STR_PTR_INVALID 0x2 +#define READ_CHAR_UPDATE_STR_PTR_PRECISE (READ_CHAR_UPDATE_STR_PTR | READ_CHAR_UPDATE_STR_PTR_INVALID) +#define READ_CHAR_VALID_UTF 0x4 + +static void read_char(compiler_common *common, sljit_u32 min, sljit_u32 max, + jump_list **backtracks, sljit_u32 options) { /* Reads the precise value of a character into TMP1, if the character is between min and max (c >= min && c <= max). 
Otherwise it returns with a value @@ -3476,24 +3482,30 @@ struct sljit_jump *jump; struct sljit_jump *jump2; #endif -SLJIT_UNUSED_ARG(update_str_ptr); SLJIT_UNUSED_ARG(min); SLJIT_UNUSED_ARG(max); SLJIT_UNUSED_ARG(backtracks); +SLJIT_UNUSED_ARG(options); SLJIT_ASSERT(min <= max); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 if (common->utf) { - if (max < 128 && !update_str_ptr) return; + if (max < 128 && !(options & READ_CHAR_UPDATE_STR_PTR)) return; - if (common->invalid_utf) + if (common->invalid_utf && !(options & READ_CHAR_VALID_UTF)) { jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80); - add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); + + if (options & READ_CHAR_UPDATE_STR_PTR_INVALID) + add_jump(compiler, &common->utfreadchar_invalid_precise, JUMP(SLJIT_FAST_CALL)); + else + add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); + if (backtracks != NULL) add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); JUMPHERE(jump); @@ -3504,7 +3516,7 @@ if (common->utf) if (min >= 0x10000) { OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xf0); - if (update_str_ptr) + if (options & READ_CHAR_UPDATE_STR_PTR) OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x7); @@ -3516,19 +3528,19 @@ if (common->utf) OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2)); - if (!update_str_ptr) + if (!(options & READ_CHAR_UPDATE_STR_PTR)) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, 
SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); JUMPHERE(jump2); - if (update_str_ptr) + if (options & READ_CHAR_UPDATE_STR_PTR) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0); } else if (min >= 0x800 && max <= 0xffff) { OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xe0); - if (update_str_ptr) + if (options & READ_CHAR_UPDATE_STR_PTR) OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xf); @@ -3536,13 +3548,13 @@ if (common->utf) OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); - if (!update_str_ptr) + if (!(options & READ_CHAR_UPDATE_STR_PTR)) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); JUMPHERE(jump2); - if (update_str_ptr) + if (options & READ_CHAR_UPDATE_STR_PTR) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0); } else if (max >= 0x800) @@ -3557,7 +3569,7 @@ if (common->utf) else { OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - if (!update_str_ptr) + if (!(options & READ_CHAR_UPDATE_STR_PTR)) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); else OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); @@ -3565,39 +3577,37 @@ if (common->utf) OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - if (update_str_ptr) + if (options & READ_CHAR_UPDATE_STR_PTR) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0); } JUMPHERE(jump); } -#endif - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16 +#elif PCRE2_CODE_UNIT_WIDTH == 16 if (common->utf) { - if (max < 0xd800 && 
!update_str_ptr) return; + if (max < 0xd800 && !(options & READ_CHAR_UPDATE_STR_PTR)) return; - if (max >= 0x10000 || common->invalid_utf) + if (common->invalid_utf && !(options & READ_CHAR_VALID_UTF)) { OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); + jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); + add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); + if (backtracks != NULL) + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); + JUMPHERE(jump); + return; + } - if (common->invalid_utf) - { - jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); - add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); - if (backtracks != NULL) - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); - } - else - { - jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800); - /* TMP2 contains the high surrogate. */ - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00); - OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); - } + if (max >= 0x10000) + { + OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); + jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800); + /* TMP2 contains the high surrogate. */ + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); + OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00); + OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); JUMPHERE(jump); return; } @@ -3605,13 +3615,25 @@ if (common->utf) /* Skip low surrogate if necessary. 
*/ OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800); - if (update_str_ptr) + if (options & READ_CHAR_UPDATE_STR_PTR) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); if (max >= 0xd800) OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x10000); JUMPHERE(jump); } -#endif +#elif PCRE2_CODE_UNIT_WIDTH == 32 +if (common->invalid_utf) + { + if (backtracks != NULL) + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); + else + { + OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000); + CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); + } + } +#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ +#endif /* SUPPORT_UNICODE */ } #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 @@ -3646,6 +3668,7 @@ SLJIT_ASSERT(common->utf); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +/* All values > 127 are zero in ctypes. 
*/ OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); if (negated) @@ -3700,14 +3723,15 @@ if (common->utf) OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc0); + OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2); if (common->invalid_utf) - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x1f)); + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe0 - 0xc2)); + OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 2); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80); if (common->invalid_utf) - add_jump(compiler, backtracks, CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x3f)); + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40)); OP2(SLJIT_OR, TMP2, 0, TMP2, 0, TMP1, 0); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); @@ -3718,6 +3742,7 @@ if (common->utf) else if (common->invalid_utf) { add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); + OP1(SLJIT_MOV, TMP2, 0, TMP1, 0); add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); @@ -3970,6 +3995,122 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0); static void do_utfreadchar_invalid(compiler_common *common) { /* Slow decoding a UTF-8 character. TMP1 contains the first byte +of the character (>= 0xc0). Return char value in TMP1. STR_PTR is +undefined for invalid characters. */ +DEFINE_COMPILER; +sljit_s32 i; +struct sljit_jump *jump; +struct sljit_jump *buffer_end_close; +struct sljit_label *three_byte_entry; +struct sljit_label *exit_invalid_label; +struct sljit_jump *exit_invalid[11]; + +sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); + +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc2); + +/* Usually more than 3 characters remained in the subject buffer. 
*/ +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); + +/* Not a valid start of a multi-byte sequence, no more bytes read. */ +exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf5 - 0xc2); + +buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0); + +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3)); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); +exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + +OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); +jump = JUMP(SLJIT_NOT_ZERO); + +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(jump); + +/* Three-byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); +exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); + +OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10000); +jump = JUMP(SLJIT_NOT_ZERO); + +three_byte_entry = LABEL(); + +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x2d800); +exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + +exit_invalid[4] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(jump); + +/* Four-byte sequence. 
*/ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); +exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); + +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc10000); +exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000); + +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(buffer_end_close); +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); +exit_invalid[7] = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0); + +/* Two-byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); +exit_invalid[8] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); + +OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); +jump = JUMP(SLJIT_NOT_ZERO); + +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +/* Three-byte sequence. */ +JUMPHERE(jump); +exit_invalid[9] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); + +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); +exit_invalid[10] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); + +/* One will be subtracted from STR_PTR later. */ +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); + +/* Four byte sequences are not possible. 
*/ +CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x30000, three_byte_entry); + +exit_invalid_label = LABEL(); +for (i = 0; i < 11; i++) + sljit_set_label(exit_invalid[i], exit_invalid_label); + +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); +} + +static void do_utfreadchar_invalid_precise(compiler_common *common) +{ +/* Slow decoding a UTF-8 character. TMP1 contains the first byte of the character (>= 0xc0). Return char value in TMP1. */ DEFINE_COMPILER; struct sljit_jump *jump; @@ -3987,7 +4128,7 @@ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); /* Not a valid start of a multi-byte sequence, no more bytes read. */ -exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf8 - 0xc); +exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf8 - 0xc0); buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0); @@ -4576,7 +4717,7 @@ if ((overall_options & PCRE2_FIRSTLINE) != 0) mainloop = LABEL(); /* Continual stores does not cause data dependency. 
*/ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0); - read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE); + read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_UPDATE_STR_PTR_PRECISE); check_newlinechar(common, common->nltype, &newline, TRUE); CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, mainloop); JUMPHERE(end); @@ -6206,7 +6347,7 @@ move_back(common, NULL, FALSE); loop = LABEL(); common->ff_newline_shortcut = loop; -read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE); +read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_UPDATE_STR_PTR_PRECISE); lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF) foundcr = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); @@ -6451,7 +6592,8 @@ else { move_back(common, &invalid_utf, FALSE); check_start_used_ptr(common); - read_char_range(common, 0, READ_CHAR_MAX, &invalid_utf, TRUE); + /* No need for a precise read since the match fails anyway. */ + read_char(common, 0, READ_CHAR_MAX, &invalid_utf, READ_CHAR_UPDATE_STR_PTR); } /* Testing char type. */ @@ -7394,7 +7536,10 @@ SLJIT_ASSERT(compares > 0); /* We are not necessary in utf mode even in 8 bit mode. */ cc = ccbegin; -read_char_range(common, min, max, ((cc[-1] & XCL_NOT) != 0) ? 
backtracks : NULL, (cc[-1] & XCL_NOT) != 0); +if ((cc[-1] & XCL_NOT) != 0) + read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR); +else + read_char(common, min, max, NULL, 0); if ((cc[-1] & XCL_HASPROP) == 0) { @@ -7920,13 +8065,13 @@ switch(type) } else { - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, STR_PTR, 0); - read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE); + OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0); + read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR); add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, STR_END, 0)); add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL)); sljit_set_current_flags(compiler, SLJIT_SET_Z); add_jump(compiler, backtracks, JUMP(SLJIT_ZERO)); - OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1); + OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0); } JUMPHERE(jump[2]); JUMPHERE(jump[3]); @@ -8325,7 +8470,7 @@ switch(type) case OP_ANY: if (check_str_ptr) detect_partial_match(common, backtracks); - read_char_range(common, common->nlmin, common->nlmax, backtracks, TRUE); + read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR); if (common->nltype == NLTYPE_FIXED && common->newline > 255) { jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff); @@ -8352,7 +8497,7 @@ switch(type) { if (common->invalid_utf) { - read_char_range(common, 0, READ_CHAR_MAX, backtracks, TRUE); + read_char(common, 0, READ_CHAR_MAX, backtracks, READ_CHAR_UPDATE_STR_PTR); return cc; } @@ -8402,7 +8547,7 @@ switch(type) case OP_ANYNL: if (check_str_ptr) detect_partial_match(common, backtracks); - read_char_range(common, common->bsr_nlmin, common->bsr_nlmax, NULL, FALSE); + read_char(common, common->bsr_nlmin, common->bsr_nlmax, NULL, 0); jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); /* We don't need to handle soft partial matching case. 
*/ end_list = NULL; @@ -8425,7 +8570,12 @@ switch(type) case OP_HSPACE: if (check_str_ptr) detect_partial_match(common, backtracks); - read_char_range(common, 0x9, 0x3000, NULL, type == OP_NOT_HSPACE); + + if (type == OP_NOT_HSPACE) + read_char(common, 0x9, 0x3000, backtracks, READ_CHAR_UPDATE_STR_PTR); + else + read_char(common, 0x9, 0x3000, NULL, 0); + add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL)); sljit_set_current_flags(compiler, SLJIT_SET_Z); add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO)); @@ -8435,7 +8585,12 @@ switch(type) case OP_VSPACE: if (check_str_ptr) detect_partial_match(common, backtracks); - read_char_range(common, 0xa, 0x2029, NULL, type == OP_NOT_VSPACE); + + if (type == OP_NOT_VSPACE) + read_char(common, 0xa, 0x2029, backtracks, READ_CHAR_UPDATE_STR_PTR); + else + read_char(common, 0xa, 0x2029, NULL, 0); + add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL)); sljit_set_current_flags(compiler, SLJIT_SET_Z); add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO)); @@ -8477,6 +8632,7 @@ switch(type) #ifdef SUPPORT_UNICODE if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc); #endif + if (common->mode == PCRE2_JIT_COMPLETE && check_str_ptr && (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)) { @@ -8504,12 +8660,13 @@ switch(type) if (type == OP_CHAR || !char_has_othercase(common, cc)) { - read_char_range(common, c, c, NULL, FALSE); + read_char(common, c, c, NULL, 0); add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c)); return cc + length; } + oc = char_othercase(common, c); - read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, NULL, FALSE); + read_char(common, c < oc ? c : oc, c > oc ? 
c : oc, NULL, 0); bit = c ^ oc; if (is_powerof2(bit)) { @@ -8517,6 +8674,7 @@ switch(type) add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit)); return cc + length; } + jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c); add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc)); JUMPHERE(jump[0]); @@ -8533,7 +8691,7 @@ switch(type) { #if PCRE2_CODE_UNIT_WIDTH == 8 c = *cc; - if (c < 128) + if (c < 128 && !common->invalid_utf) { OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); if (type == OP_NOT || !char_has_othercase(common, cc)) @@ -8564,13 +8722,13 @@ switch(type) if (type == OP_NOT || !char_has_othercase(common, cc)) { - read_char_range(common, c, c, NULL, TRUE); + read_char(common, c, c, backtracks, READ_CHAR_UPDATE_STR_PTR); add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c)); } else { oc = char_othercase(common, c); - read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, NULL, TRUE); + read_char(common, c < oc ? c : oc, c > oc ? c : oc, backtracks, READ_CHAR_UPDATE_STR_PTR); bit = c ^ oc; if (is_powerof2(bit)) { @@ -8592,9 +8750,15 @@ switch(type) #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 
127 : 255; - read_char_range(common, 0, bit, NULL, type == OP_NCLASS); + if (type == OP_NCLASS) + read_char(common, 0, bit, backtracks, READ_CHAR_UPDATE_STR_PTR); + else + read_char(common, 0, bit, NULL, 0); #else - read_char_range(common, 0, 255, NULL, type == OP_NCLASS); + if (type == OP_NCLASS) + read_char(common, 0, 255, backtracks, READ_CHAR_UPDATE_STR_PTR); + else + read_char(common, 0, 255, NULL, 0); #endif if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks)) @@ -8788,7 +8952,6 @@ jump_list *no_match = NULL; int source_reg = COUNT_MATCH; int source_end_reg = ARGUMENTS; int char1_reg = STACK_LIMIT; -BOOL saved_invalid_utf; #endif /* SUPPORT_UNICODE */ if (ref) @@ -8830,17 +8993,14 @@ if (common->utf && *cc == OP_REFI) OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0); OP1(SLJIT_MOV, STR_PTR, 0, source_reg, 0); - saved_invalid_utf = common->invalid_utf; - common->invalid_utf = FALSE; - read_char_range(common, 0, READ_CHAR_MAX, NULL, TRUE); - common->invalid_utf = saved_invalid_utf; + read_char(common, 0, READ_CHAR_MAX, NULL, READ_CHAR_UPDATE_STR_PTR | READ_CHAR_VALID_UTF); OP1(SLJIT_MOV, source_reg, 0, STR_PTR, 0); OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0); OP1(SLJIT_MOV, char1_reg, 0, TMP1, 0); /* Read second character. 
*/ - read_char_range(common, 0, READ_CHAR_MAX, &no_match, TRUE); + read_char(common, 0, READ_CHAR_MAX, &no_match, READ_CHAR_UPDATE_STR_PTR); CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop); @@ -13572,6 +13732,11 @@ if (common->utfreadchar != NULL) set_jumps(common->utfreadchar, LABEL()); do_utfreadchar(common); } +if (common->utfreadchar_invalid_precise != NULL) + { + set_jumps(common->utfreadchar_invalid_precise, LABEL()); + do_utfreadchar_invalid_precise(common); + } if (common->utfreadtype8 != NULL) { set_jumps(common->utfreadtype8, LABEL()); diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c index 78fdcca..27553b4 100644 --- a/src/pcre2_jit_test.c +++ b/src/pcre2_jit_test.c @@ -1755,6 +1755,41 @@ static int regression_tests(void) } } +#if defined SUPPORT_UNICODE && (defined SUPPORT_PCRE2_8 || defined SUPPORT_PCRE2_16) + +static int check_invalid_utf_result(int pattern_index, char *type, int result, + int match_start, int match_end, PCRE2_SIZE *ovector) +{ + if (match_start < 0) { + if (result != -1) { + printf("Pattern[%d] %s result is not -1.\n", pattern_index, type); + return 1; + } + return 0; + } + + if (result <= 0) { + printf("Pattern[%d] %s result (%d) is not greater than 0.\n", pattern_index, type, result); + return 1; + } + + if (ovector[0] != (PCRE2_SIZE)match_start) { + printf("Pattern[%d] %s ovector[0] is unexpected (%d instead of %d)\n", + pattern_index, type, (int)ovector[0], match_start); + return 1; + } + + if (ovector[1] != (PCRE2_SIZE)match_end) { + printf("Pattern[%d] %s ovector[1] is unexpected (%d instead of %d)\n", + pattern_index, type, (int)ovector[1], match_end); + return 1; + } + + return 0; +} + +#endif /* SUPPORT_UNICODE && (SUPPORT_PCRE2_8 || SUPPORT_PCRE2_16) */ + #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8 #define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED) @@ -1767,121 +1802,132 @@ struct invalid_utf8_regression_test_case { int start_offset; int skip_left; int skip_right; - int expected_result; + int 
match_start; + int match_end; const char *pattern[2]; const char *input; }; static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = { - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xf4\x8f\xbf\xbf" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xf0\x90\x80\x80" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf4\x90\x80\x80" }, - { UDA, CI, 0, 0, 1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x90\x80\x7f" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x90\x80\xc0" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xef\xbf\xbf#" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xef\xbf\xbf" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xe0\xa0\x80#" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xe0\xa0\x80" }, - { UDA, CI, 0, 0, 2, -1, { ".", NULL }, "\xef\xbf\xbf#" }, - { UDA, CI, 0, 0, 1, -1, { ".", NULL }, "\xef\xbf\xbf" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xef\xbf\x7f#" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xef\xbf\xc0" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x9f\xbf#" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x9f\xbf" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xed\x9f\xbf#" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xed\xa0\x80#" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xee\x80\x80#" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xed\xbf\xbf#" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf##" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf#" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80##" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80#" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x80##" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xdf\xc0##" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x80" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xdf\xc0" }, - { UDA, CI, 
0, 0, 0, -1, { ".", NULL }, "\xc1\xbf##" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xc1\xbf" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\x80###" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\x80" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf8###" }, - { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf8" }, - { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\x7f" }, + { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf4\x8f\xbf\xbf" }, + { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf0\x90\x80\x80" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf4\x90\x80\x80" }, + { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\x7f" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\xc0" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" }, + { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf#" }, + { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf" }, + { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80#" }, + { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80" }, + { UDA, CI, 0, 0, 2, -1, -1, { ".", NULL }, "\xef\xbf\xbf#" }, + { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xef\xbf\xbf" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\x7f#" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\xc0" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf#" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf" }, + { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xed\x9f\xbf#" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xa0\x80#" }, + { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xee\x80\x80#" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xbf\xbf#" }, + { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf##" }, + { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf#" }, + { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf" }, + { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80##" }, + { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80#" }, + { 
UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80##" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0##" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf##" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80###" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8###" }, + { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8" }, + { UDA, CI, 0, 0, 0, 0, 1, { ".", NULL }, "\x7f" }, - { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80#" }, - { UDA, CPI, 4, 1, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf#" }, - { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "#\xef\xbf\xbf#" }, - { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "#\xe0\xa0\x80#" }, - { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf0\x90\x80\x80#" }, - { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf#" }, - { UDA, CPI, 4, 2, 0, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xf0\x80\x80#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xed\xa0\x80#" }, - { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "##\xdf\xbf#" }, - { UDA, CPI, 4, 2, 0, 1, { "\\B", NULL }, 
"##\xdf\xbf#" }, - { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "##\xc2\x80#" }, - { UDA, CPI, 4, 2, 0, 1, { "\\B", NULL }, "##\xc2\x80#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xc1\xbf#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xdf\xc0#" }, - { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xe0\x80#" }, - { UDA, CPI, 4, 2, 0, -1, { "\\B", "\\b" }, "##\xe0\x80#" }, + { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80#" }, + { UDA, CPI, 4, 1, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf#" }, + { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xef\xbf\xbf#" }, + { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xe0\xa0\x80#" }, + { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf0\x90\x80\x80#" }, + { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf#" }, + { UDA, CPI, 4, 2, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xf0\x80\x80#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xed\xa0\x80#" }, + { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xdf\xbf#" }, + { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xdf\xbf#" }, + { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xc2\x80#" }, + { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xc2\x80#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xc1\xbf#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { 
"\\B", "\\b" }, "##\xdf\xc0#" }, + { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80#" }, + { UDA, CPI, 4, 2, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80#" }, - { UDA, CPI, 3, 0, 0, 1, { "\\B", NULL }, "\xef\xbf\xbf#" }, - { UDA, CPI, 3, 0, 0, 1, { "\\B", NULL }, "\xe0\xa0\x80#" }, - { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf#" }, - { UDA, CPI, 3, 1, 0, -1, { "\\B", "\\b" }, "\xef\xbf\xbf#" }, - { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xdf\x80\x80#" }, - { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xef\xbf\xff#" }, - { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xef\xff\xbf#" }, - { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xed\xbf\xbf#" }, + { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xef\xbf\xbf#" }, + { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xe0\xa0\x80#" }, + { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf#" }, + { UDA, CPI, 3, 1, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xbf#" }, + { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\x80\x80#" }, + { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xff#" }, + { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xff\xbf#" }, + { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xed\xbf\xbf#" }, - { UDA, CPI, 2, 0, 0, 1, { "\\B", NULL }, "\xdf\xbf#" }, - { UDA, CPI, 2, 0, 0, 1, { "\\B", NULL }, "\xc2\x80#" }, - { UDA, CPI, 2, 1, 0, -1, { "\\B", "\\b" }, "\xdf\xbf#" }, - { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xc1\xbf#" }, - { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xe0\x80#" }, - { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xdf\xff#" }, - { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xff\xbf#" }, + { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xdf\xbf#" }, + { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xc2\x80#" }, + { UDA, CPI, 2, 1, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xbf#" }, + { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xc1\xbf#" }, + { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x80#" }, + { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, 
"\xdf\xff#" }, + { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xff\xbf#" }, - { UDA, CPI, 1, 0, 0, 1, { "\\B", NULL }, "\x7f#" }, - { UDA, CPI, 1, 0, 0, 1, { "\\B", NULL }, "\x01#" }, - { UDA, CPI, 1, 0, 0, -1, { "\\B", "\\b" }, "\x80#" }, - { UDA, CPI, 1, 0, 0, -1, { "\\B", "\\b" }, "\x80#" }, + { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x7f#" }, + { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x01#" }, + { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80#" }, + { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80#" }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, { "(.)\\1", NULL }, "a\xff" }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "a\xff" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 6, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 8, { "(.)\\1", NULL }, 
"\xf0\x90\x90\x80\xf0\x90\x90\xa8" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" }, - { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "A" }, - { UDA, CPI, 0, 0, 0, -1, { "\\X", NULL }, "\xff" }, - { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xc3\xa1" }, - { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xc3\xa1" }, - { UDA, CPI, 0, 0, 0, -1, { "\\X", NULL }, "\xc3\x7f" }, - { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xe1\xbd\xb8" }, - { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" }, - { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xf0\x90\x90\x80" }, - { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" }, + { UDA, CPI, 0, 0, 0, 0, 1, { "\\X", NULL }, "A" }, + { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xff" }, + { UDA, CPI, 0, 0, 0, 0, 2, { "\\X", NULL }, "\xc3\xa1" }, + { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xc3\xa1" }, + { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xc3\x7f" }, + { UDA, CPI, 0, 0, 0, 0, 3, { "\\X", NULL }, "\xe1\xbd\xb8" }, + { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" }, + { UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" }, + { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" }, - { 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL } + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"}, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"}, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"}, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xc3\x0a#"}, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf1\x0a#"}, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xf2\xbf\x0a#"}, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \xf2\xbf\xbf\x0a#"}, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " 
\xef\x0a#"}, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xef\xbf\x0a#"}, + + { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL } }; #undef UDA @@ -1889,17 +1935,18 @@ static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cas #undef CPI static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *current, - int pattern_index, int i, pcre2_match_data_8 *mdata) + int pattern_index, int i, pcre2_compile_context_8 *ccontext, pcre2_match_data_8 *mdata) { pcre2_code_8 *code; int result, errorcode; PCRE2_SIZE length, erroroffset; + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_8(mdata); if (current->pattern[i] == NULL) return 1; code = pcre2_compile_8((PCRE2_UCHAR8*)current->pattern[i], PCRE2_ZERO_TERMINATED, - current->compile_options, &errorcode, &erroroffset, NULL); + current->compile_options, &errorcode, &erroroffset, ccontext); if (!code) { printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset); @@ -1918,8 +1965,7 @@ static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *curre result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left), length, current->start_offset - current->skip_left, 0, mdata, NULL); - if (result != current->expected_result) { - printf("Pattern[%d:0] match result %d differs from expected %d.\n", pattern_index, result, current->expected_result); + if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) { pcre2_code_free_8(code); return 0; } @@ -1929,8 +1975,7 @@ static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *curre result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left), length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL); - if (result != current->expected_result) { - printf("Pattern[%d:0] partial match result %d differs from expected %d.\n", pattern_index, 
result, current->expected_result); + if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) { pcre2_code_free_8(code); return 0; } @@ -1943,12 +1988,15 @@ static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *curre static int invalid_utf8_regression_tests(void) { struct invalid_utf8_regression_test_case *current; + pcre2_compile_context_8 *ccontext; pcre2_match_data_8 *mdata; int total = 0, successful = 0; int result; printf("\nRunning invalid-utf8 JIT regression tests\n"); + ccontext = pcre2_compile_context_create_8(NULL); + pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY); mdata = pcre2_match_data_create_8(4, NULL); for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) { @@ -1956,9 +2004,9 @@ static int invalid_utf8_regression_tests(void) total++; result = 1; - if (!run_invalid_utf8_test(current, total - 1, 0, mdata)) + if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata)) result = 0; - if (!run_invalid_utf8_test(current, total - 1, 1, mdata)) + if (!run_invalid_utf8_test(current, total - 1, 1, ccontext, mdata)) result = 0; if (result) { @@ -1974,6 +2022,7 @@ static int invalid_utf8_regression_tests(void) printf("\n"); pcre2_match_data_free_8(mdata); + pcre2_compile_context_free_8(ccontext); if (total == successful) { printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n"); @@ -2005,7 +2054,8 @@ struct invalid_utf16_regression_test_case { int start_offset; int skip_left; int skip_right; - int expected_result; + int match_start; + int match_end; const PCRE2_UCHAR16 *pattern[2]; const PCRE2_UCHAR16 *input; }; @@ -2024,41 +2074,41 @@ static PCRE2_UCHAR16 test6[] = { 'a', 'A', 0xdc28, 0 }; static PCRE2_UCHAR16 test7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 }; static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = { - { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test1 }, - { UDA, CI, 1, 0, 
0, 1, { allany, NULL }, test1 }, - { UDA, CI, 2, 0, 0, 1, { allany, NULL }, test1 }, - { UDA, CI, 3, 0, 0, 1, { allany, NULL }, test1 }, - { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test2 }, - { UDA, CI, 0, 0, 2, -1, { allany, NULL }, test2 }, - { UDA, CI, 1, 0, 0, -1, { allany, NULL }, test2 }, - { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test3 }, - { UDA, CI, 0, 0, 2, -1, { allany, NULL }, test3 }, - { UDA, CI, 1, 0, 0, -1, { allany, NULL }, test3 }, + { UDA, CI, 0, 0, 0, 0, 1, { allany, NULL }, test1 }, + { UDA, CI, 1, 0, 0, 1, 2, { allany, NULL }, test1 }, + { UDA, CI, 2, 0, 0, 2, 3, { allany, NULL }, test1 }, + { UDA, CI, 3, 0, 0, 3, 4, { allany, NULL }, test1 }, + { UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test2 }, + { UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test2 }, + { UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test2 }, + { UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test3 }, + { UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test3 }, + { UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test3 }, - { UDA, CPI, 1, 0, 0, 1, { non_word_boundary, NULL }, test1 }, - { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test1 }, - { UDA, CPI, 3, 0, 0, 1, { non_word_boundary, NULL }, test1 }, - { UDA, CPI, 4, 0, 0, 1, { non_word_boundary, NULL }, test1 }, - { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test2 }, - { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test3 }, - { UDA, CPI, 2, 1, 0, -1, { non_word_boundary, word_boundary }, test2 }, - { UDA, CPI, 2, 1, 0, -1, { non_word_boundary, word_boundary }, test3 }, - { UDA, CPI, 2, 0, 0, -1, { non_word_boundary, word_boundary }, test4 }, - { UDA, CPI, 2, 0, 0, -1, { non_word_boundary, word_boundary }, test5 }, + { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary, NULL }, test1 }, + { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test1 }, + { UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary, NULL }, test1 }, + { UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary, NULL }, test1 }, + { UDA, CPI, 2, 0, 0, 2, 2, { 
non_word_boundary, NULL }, test2 }, + { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test3 }, + { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test2 }, + { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test3 }, + { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test4 }, + { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test5 }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { backreference, NULL }, test6 }, - { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, { backreference, NULL }, test6 }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { backreference, NULL }, test7 }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { backreference, NULL }, test7 }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference, NULL }, test6 }, + { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference, NULL }, test6 }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference, NULL }, test7 }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference, NULL }, test7 }, - { UDA, CPI, 0, 0, 0, 1, { grapheme, NULL }, test6 }, - { UDA, CPI, 1, 0, 0, 1, { grapheme, NULL }, test6 }, - { UDA, CPI, 2, 0, 0, -1, { grapheme, NULL }, test6 }, - { UDA, CPI, 0, 0, 0, 1, { grapheme, NULL }, test7 }, - { UDA, CPI, 2, 0, 0, 1, { grapheme, NULL }, test7 }, - { UDA, CPI, 1, 0, 0, -1, { grapheme, NULL }, test7 }, + { UDA, CPI, 0, 0, 0, 0, 1, { grapheme, NULL }, test6 }, + { UDA, CPI, 1, 0, 0, 1, 2, { grapheme, NULL }, test6 }, + { UDA, CPI, 2, 0, 0, -1, -1, { grapheme, NULL }, test6 }, + { UDA, CPI, 0, 0, 0, 0, 2, { grapheme, NULL }, test7 }, + { UDA, CPI, 2, 0, 0, 2, 4, { grapheme, NULL }, test7 }, + { UDA, CPI, 1, 0, 0, -1, -1, { grapheme, NULL }, test7 }, - { 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL } + { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL } }; #undef UDA @@ -2066,18 +2116,19 @@ static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_c #undef CPI static int run_invalid_utf16_test(struct 
invalid_utf16_regression_test_case *current, - int pattern_index, int i, pcre2_match_data_16 *mdata) + int pattern_index, int i, pcre2_compile_context_16 *ccontext, pcre2_match_data_16 *mdata) { pcre2_code_16 *code; int result, errorcode; PCRE2_SIZE length, erroroffset; const PCRE2_UCHAR16 *input; + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_16(mdata); if (current->pattern[i] == NULL) return 1; code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED, - current->compile_options, &errorcode, &erroroffset, NULL); + current->compile_options, &errorcode, &erroroffset, ccontext); if (!code) { printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset); @@ -2102,8 +2153,7 @@ static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *cur result = pcre2_jit_match_16(code, (current->input + current->skip_left), length, current->start_offset - current->skip_left, 0, mdata, NULL); - if (result != current->expected_result) { - printf("Pattern[%d:0] match result %d differs from expected %d.\n", pattern_index, result, current->expected_result); + if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) { pcre2_code_free_16(code); return 0; } @@ -2113,8 +2163,7 @@ static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *cur result = pcre2_jit_match_16(code, (current->input + current->skip_left), length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL); - if (result != current->expected_result) { - printf("Pattern[%d:0] partial match result %d differs from expected %d.\n", pattern_index, result, current->expected_result); + if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) { pcre2_code_free_16(code); return 0; } @@ -2127,12 +2176,15 @@ static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *cur static int 
invalid_utf16_regression_tests(void) { struct invalid_utf16_regression_test_case *current; + pcre2_compile_context_16 *ccontext; pcre2_match_data_16 *mdata; int total = 0, successful = 0; int result; printf("\nRunning invalid-utf16 JIT regression tests\n"); + ccontext = pcre2_compile_context_create_16(NULL); + pcre2_set_newline_16(ccontext, PCRE2_NEWLINE_ANY); mdata = pcre2_match_data_create_16(4, NULL); for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) { @@ -2140,9 +2192,9 @@ static int invalid_utf16_regression_tests(void) total++; result = 1; - if (!run_invalid_utf16_test(current, total - 1, 0, mdata)) + if (!run_invalid_utf16_test(current, total - 1, 0, ccontext, mdata)) result = 0; - if (!run_invalid_utf16_test(current, total - 1, 1, mdata)) + if (!run_invalid_utf16_test(current, total - 1, 1, ccontext, mdata)) result = 0; if (result) { @@ -2158,6 +2210,7 @@ static int invalid_utf16_regression_tests(void) printf("\n"); pcre2_match_data_free_16(mdata); + pcre2_compile_context_free_16(ccontext); if (total == successful) { printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n");