diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic index 3d2feb7..f6ed686 100644 --- a/src/pcre2.h.generic +++ b/src/pcre2.h.generic @@ -164,6 +164,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */ #define PCRE2_JIT_PARTIAL_SOFT 0x00000002u #define PCRE2_JIT_PARTIAL_HARD 0x00000004u +#define PCRE2_JIT_INVALID_UTF 0x00000100u /* These are for pcre2_match(), pcre2_dfa_match(), and pcre2_jit_match(). Note that PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK can also be passed to these diff --git a/src/pcre2.h.in b/src/pcre2.h.in index a9396e0..5bb4bb6 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -164,6 +164,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */ #define PCRE2_JIT_PARTIAL_SOFT 0x00000002u #define PCRE2_JIT_PARTIAL_HARD 0x00000004u +#define PCRE2_JIT_INVALID_UTF 0x00000100u /* These are for pcre2_match(), pcre2_dfa_match(), and pcre2_jit_match(). Note that PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK can also be passed to these diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 32e985b..89d302c 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -477,12 +477,18 @@ typedef struct compiler_common { BOOL alt_circumflex; #ifdef SUPPORT_UNICODE BOOL utf; + BOOL invalid_utf; BOOL use_ucp; jump_list *getucd; #if PCRE2_CODE_UNIT_WIDTH == 8 jump_list *utfreadchar; - jump_list *utfreadchar16; jump_list *utfreadtype8; + jump_list *utfpeakcharback; +#endif +#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 + jump_list *utfreadchar_invalid; + jump_list *utfmoveback_invalid; + jump_list *utfpeakcharback_invalid; #endif #endif /* SUPPORT_UNICODE */ } compiler_common; @@ -616,7 +622,173 @@ the start pointers when the end of the capturing group has not yet reached. */ #define READ_CHAR_MAX 0x7fffffff -#define INVALID_UTF_CHAR 888 +#define INVALID_UTF_CHAR -1 +#define UNASSIGNED_UTF_CHAR 888 + +#if defined SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + +#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \ + { \ + if (ptr[0] <= 0x7f) \ + c = *ptr++; \ + else if (ptr + 1 < end && ptr[1] >= 0x80 && ptr[1] < 0xc0) \ + { \ + c = ptr[1] - 0x80; \ + \ + if (ptr[0] >= 0xc0 && ptr[0] <= 0xdf) \ + { \ + c |= (ptr[0] - 0xc0) << 6; \ + ptr += 2; \ + } \ + else if (ptr + 2 < end && ptr[2] >= 0x80 && ptr[2] < 0xc0) \ + { \ + c = c << 6 | (ptr[2] - 0x80); \ + \ + if (ptr[0] >= 0xe0 && ptr[0] <= 0xef) \ + { \ + c |= (ptr[0] - 0xe0) << 12; \ + ptr += 3; \ + } \ + else if (ptr + 3 < end && ptr[3] >= 0x80 && ptr[3] < 0xc0) \ + { \ + c = c << 6 | (ptr[3] - 0x80); \ + \ + if (ptr[0] >= 0xf0 && ptr[0] <= 0xf4) \ + { \ + c |= (ptr[0] - 0xf0) << 18; \ + ptr += 4; \ + \ + if (c >= 0x110000) \ + { \ + invalid_action; \ + } \ + } \ + else \ + { \ + invalid_action; \ + } \ + } \ + else \ + { \ + invalid_action; \ + } \ + } \ + else \ + { \ + invalid_action; \ + } \ + } \ + else \ + { \ + invalid_action; \ + } \ + } + +#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \ + { \ + if (ptr[-1] <= 0x7f) \ + c = *ptr--; \ + else if (ptr - 1 > start && ptr[-1] >= 0x80 && ptr[-1] < 0xc0) \ + { \ + c = ptr[-1] - 0x80; \ + \ + if (ptr[-2] >= 0xc0 && ptr[-2] <= 0xdf) \ + { \ + c |= (ptr[-2] - 0xc0) << 6; \ + ptr -= 2; \ + } \ + else if (ptr - 2 > start && ptr[-2] >= 0x80 && ptr[-2] < 0xc0) \ + { \ + c = c << 6 | (ptr[-2] - 0x80); \ + \ + if (ptr[-3] >= 0xe0 && ptr[-3] <= 0xef) \ + { \ + c |= (ptr[-3] - 0xe0) << 12; \ + ptr -= 3; \ + } \ + else if (ptr - 3 > start && ptr[-3] >= 0x80 && ptr[-3] < 0xc0) \ + { \ + c = c << 6 | (ptr[-3] - 0x80); \ + \ + if (ptr[-4] >= 0xf0 && ptr[-4] <= 0xf4) \ + { \ + c |= (ptr[-4] - 0xf0) << 18; \ + ptr -= 4; \ + \ + if (c >= 0x110000) \ + { \ + invalid_action; \ + } \ + } \ + else \ + { \ + invalid_action; \ + } \ + } \ + else \ + { \ + invalid_action; \ + } \ + } \ + else \ + { \ + invalid_action; \ + } \ + } \ + else \ + { \ + invalid_action; \ + } \ + } + +#elif PCRE2_CODE_UNIT_WIDTH == 16 + +#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \ + { \ + if (ptr[0] < 0xd800 || ptr[0] >= 0xe000) \ + c = *ptr++; \ + else if (ptr[0] < 0xdc00 && ptr + 1 < end && ptr[1] >= 0xdc00 && ptr[1] < 0xe000) \ + { \ + c = (((ptr[0] - 0xd800) << 10) | (ptr[1] - 0xdc00)) + 0x10000; \ + ptr += 2; \ + } \ + else \ + { \ + invalid_action; \ + } \ + } + +#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \ + { \ + if (ptr[-1] < 0xd800 || ptr[-1] >= 0xe000) \ + c = *ptr--; \ + else if (ptr[-1] >= 0xdc00 && ptr - 1 > start && ptr[-2] >= 0xd800 && ptr[-2] < 0xdc00) \ + { \ + c = (((ptr[-2] - 0xd800) << 10) | (ptr[-1] - 0xdc00)) + 0x10000; \ + ptr -= 2; \ + } \ + else \ + { \ + invalid_action; \ + } \ + } + + +#elif PCRE2_CODE_UNIT_WIDTH == 32 + +#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \ + { \ + if (ptr[0] < 0x110000) \ + c = *ptr++; \ + else \ + { \ + invalid_action; \ + } \ + } + +#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ +#endif /* SUPPORT_UNICODE */ static PCRE2_SPTR bracketend(PCRE2_SPTR cc) { @@ -3159,95 +3331,128 @@ else JUMPHERE(jump); } -static void peek_char(compiler_common *common, sljit_u32 max) +static void peek_char(compiler_common *common, sljit_u32 max, sljit_s32 dst, sljit_sw dstw) { /* Reads the character into TMP1, keeps STR_PTR. -Does not check STR_END. TMP2 Destroyed. */ +Does not check STR_END. TMP2, dst, RETURN_ADDR Destroyed. */ DEFINE_COMPILER; #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 struct sljit_jump *jump; -#endif +#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */ SLJIT_UNUSED_ARG(max); +SLJIT_UNUSED_ARG(dst); +SLJIT_UNUSED_ARG(dstw); -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); + +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 if (common->utf) { if (max < 128) return; - jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0); + jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80); + OP1(SLJIT_MOV, dst, dstw, STR_PTR, 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0); + add_jump(compiler, common->invalid_utf ? &common->utfreadchar_invalid : &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); + OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw); JUMPHERE(jump); } -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */ - -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16 +#elif PCRE2_CODE_UNIT_WIDTH == 16 if (common->utf) { if (max < 0xd800) return; OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); - jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800 - 1); - /* TMP2 contains the high surrogate. */ - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x40); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10); - OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff); - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + + if (common->invalid_utf) + { + jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); + add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); + } + else + { + /* TMP2 contains the high surrogate. */ + jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); + OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10); + OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00); + OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); + } JUMPHERE(jump); } -#endif +#elif PCRE2_CODE_UNIT_WIDTH == 32 + +#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ +#endif /* SUPPORT_UNICODE */ } -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 - -static BOOL is_char7_bitset(const sljit_u8 *bitset, BOOL nclass) +static void peek_char_back(compiler_common *common, sljit_u32 max, jump_list **backtracks) { -/* Tells whether the character codes below 128 are enough -to determine a match. */ -const sljit_u8 value = nclass ? 0xff : 0; -const sljit_u8 *end = bitset + 32; - -bitset += 16; -do - { - if (*bitset++ != value) - return FALSE; - } -while (bitset < end); -return TRUE; -} - -static void read_char7_type(compiler_common *common, BOOL full_read) -{ -/* Reads the precise character type of a character into TMP1, if the character -is less than 128. Otherwise it returns with zero. Does not check STR_END. The -full_read argument tells whether characters above max are accepted or not. */ +/* Reads one character back without moving STR_PTR. TMP2 must +contain the start of the subject buffer. Affects TMP1, TMP2, and RETURN_ADDR. */ DEFINE_COMPILER; + +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 struct sljit_jump *jump; +#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */ -SLJIT_ASSERT(common->utf); +SLJIT_UNUSED_ARG(max); +SLJIT_UNUSED_ARG(backtracks); -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); -OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); - -if (full_read) +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 +if (common->utf) { - jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xc0); - OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); + if (max < 128) return; + + jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80); + if (common->invalid_utf) + { + add_jump(compiler, &common->utfpeakcharback_invalid, JUMP(SLJIT_FAST_CALL)); + if (backtracks != NULL) + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); + } + else + add_jump(compiler, &common->utfpeakcharback, JUMP(SLJIT_FAST_CALL)); JUMPHERE(jump); } +#elif PCRE2_CODE_UNIT_WIDTH == 16 +if (common->utf) + { + if (max < 0xd800) return; + + if (common->invalid_utf) + { + jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xd800); + add_jump(compiler, &common->utfpeakcharback_invalid, JUMP(SLJIT_FAST_CALL)); + if (backtracks != NULL) + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); + } + else + { + OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xdc00); + jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xdc00); + /* TMP2 contains the low surrogate. */ + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); + OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x10000); + OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10); + OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); + } + JUMPHERE(jump); + } +#elif PCRE2_CODE_UNIT_WIDTH == 32 + +#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ +#endif /* SUPPORT_UNICODE */ } -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */ - -static void read_char_range(compiler_common *common, sljit_u32 min, sljit_u32 max, BOOL update_str_ptr) +static void read_char_range(compiler_common *common, sljit_u32 min, sljit_u32 max, + jump_list **backtracks, BOOL update_str_ptr) { /* Reads the precise value of a character into TMP1, if the character is between min and max (c >= min && c <= max). Otherwise it returns with a value @@ -3263,6 +3468,7 @@ struct sljit_jump *jump2; SLJIT_UNUSED_ARG(update_str_ptr); SLJIT_UNUSED_ARG(min); SLJIT_UNUSED_ARG(max); +SLJIT_UNUSED_ARG(backtracks); SLJIT_ASSERT(min <= max); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); @@ -3273,6 +3479,16 @@ if (common->utf) { if (max < 128 && !update_str_ptr) return; + if (common->invalid_utf) + { + jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80); + add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); + if (backtracks != NULL) + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); + JUMPHERE(jump); + return; + } + jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0); if (min >= 0x10000) { @@ -3319,7 +3535,9 @@ if (common->utf) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0); } else if (max >= 0x800) - add_jump(compiler, (max < 0x10000) ? &common->utfreadchar16 : &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); + { + add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); + } else if (max < 128) { OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); @@ -3346,26 +3564,36 @@ if (common->utf) #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16 if (common->utf) { - if (max >= 0x10000) + if (max < 0xd800 && !update_str_ptr) return; + + if (max >= 0x10000 || common->invalid_utf) { OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); - jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800 - 1); - /* TMP2 contains the high surrogate. */ - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); - OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x40); - OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff); - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + + if (common->invalid_utf) + { + jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); + add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); + if (backtracks != NULL) + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); + } + else + { + jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800); + /* TMP2 contains the high surrogate. */ + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); + OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00); + OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); + } JUMPHERE(jump); return; } - if (max < 0xd800 && !update_str_ptr) return; - /* Skip low surrogate if necessary. */ OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); - jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800 - 1); + jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800); if (update_str_ptr) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); if (max >= 0xd800) @@ -3375,12 +3603,62 @@ if (common->utf) #endif } -static SLJIT_INLINE void read_char(compiler_common *common) +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 + +static BOOL is_char7_bitset(const sljit_u8 *bitset, BOOL nclass) { -read_char_range(common, 0, READ_CHAR_MAX, TRUE); +/* Tells whether the character codes below 128 are enough +to determine a match. */ +const sljit_u8 value = nclass ? 0xff : 0; +const sljit_u8 *end = bitset + 32; + +bitset += 16; +do + { + if (*bitset++ != value) + return FALSE; + } +while (bitset < end); +return TRUE; } -static void read_char8_type(compiler_common *common, BOOL update_str_ptr) +static void read_char7_type(compiler_common *common, jump_list **backtracks, BOOL negated) +{ +/* Reads the precise character type of a character into TMP1, if the character +is less than 128. Otherwise it returns with zero. Does not check STR_END. The +full_read argument tells whether characters above max are accepted or not. */ +DEFINE_COMPILER; +struct sljit_jump *jump; + +SLJIT_ASSERT(common->utf); + +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + +OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); + +if (negated) + { + jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x80); + + if (common->invalid_utf) + { + add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); + } + else + { + OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); + } + JUMPHERE(jump); + } +} + +#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */ + +static void read_char8_type(compiler_common *common, jump_list **backtracks, BOOL negated) { /* Reads the character type into TMP1, updates STR_PTR. Does not check STR_END. */ DEFINE_COMPILER; @@ -3391,7 +3669,8 @@ struct sljit_jump *jump; struct sljit_jump *jump2; #endif -SLJIT_UNUSED_ARG(update_str_ptr); +SLJIT_UNUSED_ARG(backtracks); +SLJIT_UNUSED_ARG(negated); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); @@ -3399,18 +3678,37 @@ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 if (common->utf) { - /* This can be an extra read in some situations, but hopefully - it is needed in most cases. */ + /* The result of this read may be unused, but saves an "else" part. */ OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); - jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xc0); - if (!update_str_ptr) + jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x80); + + if (!negated) { + if (common->invalid_utf) + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); - OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); + OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc0); + if (common->invalid_utf) + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x1f)); + OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); - OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); + OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80); + if (common->invalid_utf) + add_jump(compiler, backtracks, CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x3f)); + OP2(SLJIT_OR, TMP2, 0, TMP2, 0, TMP1, 0); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); + jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255); + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); + JUMPHERE(jump2); + } + else if (common->invalid_utf) + { + add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255); OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); @@ -3418,43 +3716,86 @@ if (common->utf) } else add_jump(compiler, &common->utfreadtype8, JUMP(SLJIT_FAST_CALL)); + JUMPHERE(jump); return; } #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */ +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 32 +if (common->invalid_utf && negated) + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x110000)); +#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 32 */ + #if PCRE2_CODE_UNIT_WIDTH != 8 /* The ctypes array contains only 256 values. */ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255); -#endif +#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); #if PCRE2_CODE_UNIT_WIDTH != 8 JUMPHERE(jump); -#endif +#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16 -if (common->utf && update_str_ptr) +if (common->utf && negated) { /* Skip low surrogate if necessary. */ + if (!common->invalid_utf) + { + OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800); + jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + JUMPHERE(jump); + } + OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800); - jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800 - 1); + jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400)); + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); + + OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xdc00); + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400)); + JUMPHERE(jump); } #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16 */ } -static void skip_char_back(compiler_common *common) +static void move_back(compiler_common *common, jump_list **backtracks, BOOL must_be_valid) { -/* Goes one character back. Affects STR_PTR and TMP1. Does not check begin. */ +/* Goes one character back. TMP2 must contain the start of +the subject buffer. Affects STR_PTR and TMP1. Does not modify +STR_PTR for invalid character sequences. */ DEFINE_COMPILER; + +SLJIT_UNUSED_ARG(backtracks); +SLJIT_UNUSED_ARG(must_be_valid); + #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 +struct sljit_jump *jump; +#endif + +#ifdef SUPPORT_UNICODE #if PCRE2_CODE_UNIT_WIDTH == 8 struct sljit_label *label; if (common->utf) { + if (!must_be_valid && common->invalid_utf) + { + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80); + add_jump(compiler, &common->utfmoveback_invalid, JUMP(SLJIT_FAST_CALL)); + if (backtracks != NULL) + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0)); + JUMPHERE(jump); + return; + } + label = LABEL(); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); @@ -3467,6 +3808,18 @@ if (common->utf) { OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + + if (!must_be_valid && common->invalid_utf) + { + OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); + jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000 - 0xd800); + add_jump(compiler, &common->utfmoveback_invalid, JUMP(SLJIT_FAST_CALL)); + if (backtracks != NULL) + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0)); + JUMPHERE(jump); + return; + } + /* Skip low surrogate if necessary. */ OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xdc00); @@ -3475,8 +3828,14 @@ if (common->utf) OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0); return; } -#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16] */ -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */ +#elif PCRE2_CODE_UNIT_WIDTH == 32 +if (common->invalid_utf && !must_be_valid) + { + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); + } +#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ +#endif /* SUPPORT_UNICODE */ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); } @@ -3519,13 +3878,12 @@ else static void do_utfreadchar(compiler_common *common) { /* Fast decoding a UTF-8 character. TMP1 contains the first byte -of the character (>= 0xc0). Return char value in TMP1, length in TMP2. */ +of the character (>= 0xc0). Return char value in TMP1. */ DEFINE_COMPILER; struct sljit_jump *jump; sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); -OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); @@ -3534,13 +3892,12 @@ OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); jump = JUMP(SLJIT_NOT_ZERO); /* Two byte sequence. */ +OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3000); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(2)); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); JUMPHERE(jump); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); -OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x800); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); @@ -3548,55 +3905,18 @@ OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10000); jump = JUMP(SLJIT_NOT_ZERO); /* Three byte sequence. */ +OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0000); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(3)); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); /* Four byte sequence. */ JUMPHERE(jump); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2)); -OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xf0000); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(4)); -sljit_emit_fast_return(compiler, RETURN_ADDR, 0); -} - -static void do_utfreadchar16(compiler_common *common) -{ -/* Fast decoding a UTF-8 character. TMP1 contains the first byte -of the character (>= 0xc0). Return value in TMP1. */ -DEFINE_COMPILER; -struct sljit_jump *jump; - -sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); -OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); - -/* Searching for the first zero. */ -OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); -jump = JUMP(SLJIT_NOT_ZERO); -/* Two byte sequence. */ -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -sljit_emit_fast_return(compiler, RETURN_ADDR, 0); - -JUMPHERE(jump); -OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x400); -OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO); -/* This code runs only in 8 bit mode. No need to shift the value. */ -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); -OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); -OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x800); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -/* Three byte sequence. */ -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); } @@ -3636,8 +3956,476 @@ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); } +static void do_utfreadchar_invalid(compiler_common *common) +{ +/* Slow decoding a UTF-8 character. TMP1 contains the first byte +of the character (>= 0xc0). Return char value in TMP1. */ +DEFINE_COMPILER; +struct sljit_jump *jump; +struct sljit_jump *buffer_end_close; +struct sljit_label *three_byte_entry; +struct sljit_label *exit_invalid_label; +struct sljit_jump *exit_invalid[11]; + +sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); + +/* Continuation byte, or invalid two byte sequence. */ +exit_invalid[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc2); + +/* Usually more than 3 characters remained in the subject buffer. */ +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); + +buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0); + +/* Two-byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3)); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); +exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + +OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); +jump = JUMP(SLJIT_NOT_ZERO); + +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(jump); + +/* Three-byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); +exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); + +OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10000); +jump = JUMP(SLJIT_NOT_ZERO); + +three_byte_entry = LABEL(); + +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x2d800); +exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + +exit_invalid[4] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(jump); + +/* Four-byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); +exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); + +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc10000); +exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000); + +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(buffer_end_close); +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); +exit_invalid[7] = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0); + +/* Two-byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); +exit_invalid[8] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); + +OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800); +jump = JUMP(SLJIT_NOT_ZERO); + +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(jump); + +/* Three-byte sequence. */ +exit_invalid[9] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); + +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); +exit_invalid[10] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); + +/* One will be substracted from STR_PTR later. */ +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); + +/* Four byte sequences are not possible. */ +CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x30000, three_byte_entry); + +exit_invalid_label = LABEL(); +sljit_set_label(exit_invalid[3], exit_invalid_label); +sljit_set_label(exit_invalid[5], exit_invalid_label); +sljit_set_label(exit_invalid[7], exit_invalid_label); +sljit_set_label(exit_invalid[8], exit_invalid_label); + +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + +exit_invalid_label = LABEL(); +sljit_set_label(exit_invalid[0], exit_invalid_label); +sljit_set_label(exit_invalid[4], exit_invalid_label); +sljit_set_label(exit_invalid[6], exit_invalid_label); +sljit_set_label(exit_invalid[9], exit_invalid_label); +sljit_set_label(exit_invalid[10], exit_invalid_label); + +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(exit_invalid[1]); +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + +JUMPHERE(exit_invalid[2]); +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); +} + +static void do_utfmoveback_invalid(compiler_common *common) +{ +/* Goes one character back. */ +DEFINE_COMPILER; +sljit_s32 i; +struct sljit_jump *jump; +struct sljit_jump *buffer_start_close; +struct sljit_label *exit_ok_label; +struct sljit_label *exit_invalid_label; +struct sljit_jump *exit_invalid[7]; + +sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); + +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); +exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xc0); + +/* Two-byte sequence. */ +buffer_start_close = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0); + +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2)); + +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); +jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x20); + +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +/* Three-byte sequence. */ +JUMPHERE(jump); +exit_invalid[1] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, -0x40); + +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); + +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0); +jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x10); + +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +/* Four-byte sequence. */ +JUMPHERE(jump); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0 - 0x80); +exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x40); + +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xf0); +exit_invalid[3] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x05); + +exit_ok_label = LABEL(); +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +/* Two-byte sequence. */ +JUMPHERE(buffer_start_close); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); + +exit_invalid[4] = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0); + +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); + +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); +CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x20, exit_ok_label); + +/* Three-byte sequence. */ +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +exit_invalid[5] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, -0x40); +exit_invalid[6] = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0); + +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); + +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0); +CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x10, exit_ok_label); + +/* Four-byte sequences are not possible. */ + +exit_invalid_label = LABEL(); +sljit_set_label(exit_invalid[5], exit_invalid_label); +sljit_set_label(exit_invalid[6], exit_invalid_label); +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(exit_invalid[4]); +/* -2 + 4 = 2 */ +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); + +exit_invalid_label = LABEL(); +for (i = 0; i < 4; i++) + sljit_set_label(exit_invalid[i], exit_invalid_label); +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(4)); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); +} + +static void do_utfpeakcharback(compiler_common *common) +{ +/* Peak a character back. */ +DEFINE_COMPILER; +struct sljit_jump *jump[2]; + +sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); + +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); +jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x20); + +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3)); +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0); +jump[1] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x10); + +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-4)); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0 - 0x80); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf0); +OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + +JUMPHERE(jump[1]); +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + +JUMPHERE(jump[0]); +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); +} + +static void do_utfpeakcharback_invalid(compiler_common *common) +{ +/* Peak a character back. */ +DEFINE_COMPILER; +sljit_s32 i; +struct sljit_jump *jump[2]; +struct sljit_label *two_byte_entry; +struct sljit_label *three_byte_entry; +struct sljit_label *exit_invalid_label; +struct sljit_jump *exit_invalid[8]; + +sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); + +OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(3)); +exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xc0); +jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0); + +/* Two-byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2); +jump[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x1e); + +two_byte_entry = LABEL(); +OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x02); +OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(jump[1]); +OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2 - 0x80); +exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); +OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + +/* Three-byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3)); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xe0); +jump[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x10); + +three_byte_entry = LABEL(); +OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + +exit_invalid[2] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800); +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); +exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(jump[1]); +OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xe0 - 0x80); +exit_invalid[4] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); +OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + +/* Four-byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-4)); +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf0); +OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 18); +/* ADD is used instead of OR because of the SUB 0x10000 above. */ +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); + +exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(jump[0]); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); +jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0); + +/* Two-byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2); +CMPTO(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x1e, two_byte_entry); + +OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2 - 0x80); +exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40); +OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); + +/* Three-byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3)); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xe0); +CMPTO(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x10, three_byte_entry); + +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(jump[0]); +exit_invalid[7] = CMP(SLJIT_GREATER, TMP2, 0, STR_PTR, 0); + +/* Two-byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2); +CMPTO(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x1e, two_byte_entry); + +exit_invalid_label = LABEL(); +for (i = 0; i < 8; i++) + sljit_set_label(exit_invalid[i], exit_invalid_label); + +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); +} + #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ +#if PCRE2_CODE_UNIT_WIDTH == 16 + +static void do_utfreadchar_invalid(compiler_common *common) +{ +/* Slow decoding a UTF-16 character. TMP1 contains the first half +of the character (>= 0xd800). Return char value in TMP1, length in TMP2. */ +DEFINE_COMPILER; +struct sljit_jump *exit_invalid[3]; + +sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); + +/* TMP2 contains the high surrogate. */ +exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00); +exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); + +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); +OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xdc00); +OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x10000); +exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x400); + +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(exit_invalid[0]); +JUMPHERE(exit_invalid[1]); +JUMPHERE(exit_invalid[2]); +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); +} + +static void do_utfmoveback_invalid(compiler_common *common) +{ +/* Goes one character back. */ +DEFINE_COMPILER; +struct sljit_jump *exit_invalid[3]; + +sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); + +exit_invalid[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x400); +exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0); + +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); +OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800); +exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x400); + +OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(exit_invalid[0]); +JUMPHERE(exit_invalid[1]); +JUMPHERE(exit_invalid[2]); + +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); +} + +static void do_utfpeakcharback_invalid(compiler_common *common) +{ +/* Peak a character back. */ +DEFINE_COMPILER; +struct sljit_jump *jump; +struct sljit_jump *exit_invalid[3]; + +sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); + +jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000); +OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); +exit_invalid[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xdc00); +exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0); + +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00); +OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800); +exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400); +OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); + +JUMPHERE(jump); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(exit_invalid[0]); +JUMPHERE(exit_invalid[1]); +JUMPHERE(exit_invalid[2]); + +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); +} + +#endif /* PCRE2_CODE_UNIT_WIDTH == 16 */ + /* UCD_BLOCK_SIZE must be 128 (see the assert below). */ #define UCD_BLOCK_MASK 127 #define UCD_BLOCK_SHIFT 7 @@ -3653,7 +4441,7 @@ struct sljit_jump *jump; #if defined SLJIT_DEBUG && SLJIT_DEBUG /* dummy_ucd_record */ -const ucd_record *record = GET_UCD(INVALID_UTF_CHAR); +const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR); SLJIT_ASSERT(record->script == ucp_Common && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther); SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0); #endif @@ -3666,7 +4454,7 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); if (!common->utf) { jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, UNASSIGNED_UTF_CHAR); JUMPHERE(jump); } #endif @@ -3733,7 +4521,7 @@ if ((overall_options & PCRE2_FIRSTLINE) != 0) mainloop = LABEL(); /* Continual stores does not cause data dependency. */ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0); - read_char_range(common, common->nlmin, common->nlmax, TRUE); + read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE); check_newlinechar(common, common->nltype, &newline, TRUE); CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, mainloop); JUMPHERE(end); @@ -5358,12 +6146,12 @@ if (common->nltype == NLTYPE_FIXED && common->newline > 255) OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str)); firstchar = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0); -skip_char_back(common); +move_back(common, NULL, FALSE); loop = LABEL(); common->ff_newline_shortcut = loop; -read_char_range(common, common->nlmin, common->nlmax, TRUE); +read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE); lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF) foundcr = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); @@ -5588,21 +6376,28 @@ static void check_wordboundary(compiler_common *common) DEFINE_COMPILER; struct sljit_jump *skipread; jump_list *skipread_list = NULL; +jump_list *invalid_utf = NULL; #if PCRE2_CODE_UNIT_WIDTH != 8 || defined SUPPORT_UNICODE struct sljit_jump *jump; -#endif +#endif /* PCRE2_CODE_UNIT_WIDTH != 8 || SUPPORT_UNICODE */ SLJIT_COMPILE_ASSERT(ctype_word == 0x10, ctype_word_must_be_16); sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0); -/* Get type of the previous char, and put it to LOCALS1. */ +/* Get type of the previous char, and put it to TMP3. */ OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); -OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); -OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, SLJIT_IMM, 0); -skipread = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP1, 0); -skip_char_back(common); -check_start_used_ptr(common); -read_char(common); +OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); +OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, 0); +skipread = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0); + +if (common->mode == PCRE2_JIT_COMPLETE) + peek_char_back(common, READ_CHAR_MAX, &invalid_utf); +else + { + move_back(common, &invalid_utf, FALSE); + check_start_used_ptr(common); + read_char_range(common, 0, READ_CHAR_MAX, &invalid_utf, TRUE); + } /* Testing char type. */ #ifdef SUPPORT_UNICODE @@ -5618,23 +6413,22 @@ if (common->use_ucp) OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ucp_No - ucp_Nd); OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL); JUMPHERE(jump); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP2, 0); + OP1(SLJIT_MOV, TMP3, 0, TMP2, 0); } else -#endif +#endif /* SUPPORT_UNICODE */ { #if PCRE2_CODE_UNIT_WIDTH != 8 jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); #elif defined SUPPORT_UNICODE - /* Here LOCALS1 has already been zeroed. */ + /* Here TMP3 has already been zeroed. */ jump = NULL; if (common->utf) jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes); OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 4 /* ctype_word */); - OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP1, 0); + OP2(SLJIT_AND, TMP3, 0, TMP1, 0, SLJIT_IMM, 1); #if PCRE2_CODE_UNIT_WIDTH != 8 JUMPHERE(jump); #elif defined SUPPORT_UNICODE @@ -5646,7 +6440,7 @@ JUMPHERE(skipread); OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0); check_str_end(common, &skipread_list); -peek_char(common, READ_CHAR_MAX); +peek_char(common, READ_CHAR_MAX, SLJIT_MEM1(SLJIT_SP), LOCALS1); /* Testing char type. This is a code duplication. */ #ifdef SUPPORT_UNICODE @@ -5664,7 +6458,7 @@ if (common->use_ucp) JUMPHERE(jump); } else -#endif +#endif /* SUPPORT_UNICODE */ { #if PCRE2_CODE_UNIT_WIDTH != 8 /* TMP2 may be destroyed by peek_char. */ @@ -5688,8 +6482,23 @@ else } set_jumps(skipread_list, LABEL()); -OP2(SLJIT_XOR | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1); -sljit_emit_fast_return(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0); +OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); +OP2(SLJIT_XOR, TMP2, 0, TMP2, 0, TMP3, 0); +sljit_emit_fast_return(compiler, TMP1, 0); + +#ifdef SUPPORT_UNICODE +if (common->invalid_utf) + { + SLJIT_ASSERT(invalid_utf != NULL); + + set_jumps(invalid_utf, LABEL()); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); + OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, -1); + sljit_emit_fast_return(compiler, TMP1, 0); + return; + } +#endif /* SUPPORT_UNICODE */ +SLJIT_ASSERT(invalid_utf == NULL); } static BOOL optimize_class_ranges(compiler_common *common, const sljit_u8 *bits, BOOL nclass, BOOL invert, jump_list **backtracks) @@ -6237,7 +7046,7 @@ const sljit_u32 *pp; while (src1 < end1) { if (src2 >= end2) - return (PCRE2_SPTR)1; + return (PCRE2_SPTR)0x1; GETCHARINC(c1, src1); GETCHARINC(c2, src2); ur = GET_UCD(c2); @@ -6254,6 +7063,33 @@ while (src1 < end1) return src2; } +static PCRE2_SPTR SLJIT_FUNC do_utf_caselesscmp_invalid(PCRE2_SPTR src1, PCRE2_SPTR src2, PCRE2_SPTR end1, PCRE2_SPTR end2) +{ +/* This function would be ineffective to do in JIT level. */ +sljit_u32 c1, c2; +const ucd_record *ur; +const sljit_u32 *pp; + +while (src1 < end1) + { + if (src2 >= end2) + return (PCRE2_SPTR)0x1; + GETCHARINC(c1, src1); + GETCHARINC_INVALID(c2, src2, end2, return NULL); + ur = GET_UCD(c2); + if (c1 != c2 && c1 != c2 + ur->other_case) + { + pp = PRIV(ucd_caseless_sets) + ur->caseset; + for (;;) + { + if (c1 < *pp) return NULL; + if (c1 == *pp++) break; + } + } + } +return src2; +} + #endif /* SUPPORT_UNICODE */ static PCRE2_SPTR byte_sequence_compare(compiler_common *common, BOOL caseless, PCRE2_SPTR cc, @@ -6561,7 +7397,7 @@ SLJIT_ASSERT(compares > 0); /* We are not necessary in utf mode even in 8 bit mode. */ cc = ccbegin; -read_char_range(common, min, max, (cc[-1] & XCL_NOT) != 0); +read_char_range(common, min, max, ((cc[-1] & XCL_NOT) != 0) ? backtracks : NULL, (cc[-1] & XCL_NOT) != 0); if ((cc[-1] & XCL_HASPROP) == 0) { @@ -6630,7 +7466,7 @@ if (needstype || needsscript) if (!common->utf) { jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, UNASSIGNED_UTF_CHAR); JUMPHERE(jump); } #endif @@ -7020,6 +7856,15 @@ switch(type) case OP_NOT_WORD_BOUNDARY: case OP_WORD_BOUNDARY: add_jump(compiler, &common->wordboundary, JUMP(SLJIT_FAST_CALL)); +#ifdef SUPPORT_UNICODE + if (common->invalid_utf) + { + OP2(SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_SIG_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0); + add_jump(compiler, backtracks, JUMP(SLJIT_SIG_LESS)); + add_jump(compiler, backtracks, JUMP(type == OP_NOT_WORD_BOUNDARY ? SLJIT_NOT_ZERO : SLJIT_ZERO)); + return cc; + } +#endif /* SUPPORT_UNICODE */ sljit_set_current_flags(compiler, SLJIT_SET_Z); add_jump(compiler, backtracks, JUMP(type == OP_NOT_WORD_BOUNDARY ? SLJIT_NOT_ZERO : SLJIT_ZERO)); return cc; @@ -7079,7 +7924,7 @@ switch(type) else { OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, STR_PTR, 0); - read_char_range(common, common->nlmin, common->nlmax, TRUE); + read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE); add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, STR_END, 0)); add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL)); sljit_set_current_flags(compiler, SLJIT_SET_Z); @@ -7143,7 +7988,7 @@ switch(type) } else { - peek_char(common, common->nlmax); + peek_char(common, common->nlmax, TMP3, 0); check_newlinechar(common, common->nltype, backtracks, FALSE); } JUMPHERE(jump[0]); @@ -7158,10 +8003,10 @@ switch(type) return cc; case OP_CIRCM: - OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0); - OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, begin)); - jump[1] = CMP(SLJIT_GREATER, STR_PTR, 0, TMP1, 0); - OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL); + OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0); + OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); + jump[1] = CMP(SLJIT_GREATER, STR_PTR, 0, TMP2, 0); + OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL); add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO32)); jump[0] = JUMP(SLJIT_JUMP); JUMPHERE(jump[1]); @@ -7171,8 +8016,8 @@ switch(type) if (common->nltype == NLTYPE_FIXED && common->newline > 255) { - OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); - add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, TMP1, 0)); + OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); + add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP1, 0, TMP2, 0)); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2)); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1)); add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff)); @@ -7180,8 +8025,7 @@ switch(type) } else { - skip_char_back(common); - read_char_range(common, common->nlmin, common->nlmax, TRUE); + peek_char_back(common, common->nlmax, backtracks); check_newlinechar(common, common->nltype, backtracks, FALSE); } JUMPHERE(jump[0]); @@ -7195,12 +8039,12 @@ switch(type) #ifdef SUPPORT_UNICODE if (common->utf) { - OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); - OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, length); + OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin)); + OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, length); label = LABEL(); - add_jump(compiler, backtracks, CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP3, 0)); - skip_char_back(common); - OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); + add_jump(compiler, backtracks, CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0)); + move_back(common, backtracks, FALSE); + OP2(SLJIT_SUB | SLJIT_SET_Z, TMP3, 0, TMP3, 0, SLJIT_IMM, 1); JUMPTO(SLJIT_NOT_ZERO, label); } else @@ -7225,21 +8069,28 @@ static PCRE2_SPTR SLJIT_FUNC do_extuni_utf(jit_arguments *args, PCRE2_SPTR cc) { PCRE2_SPTR start_subject = args->begin; PCRE2_SPTR end_subject = args->end; -int lgb, rgb, len, ricount; -PCRE2_SPTR prevcc, bptr; +int lgb, rgb, ricount; +PCRE2_SPTR prevcc, startcc, bptr; +BOOL first = TRUE; uint32_t c; prevcc = cc; -GETCHARINC(c, cc); -lgb = UCD_GRAPHBREAK(c); - -while (cc < end_subject) +startcc = NULL; +do { - len = 1; - GETCHARLEN(c, cc, len); + GETCHARINC(c, cc); rgb = UCD_GRAPHBREAK(c); - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + if (first) + { + lgb = rgb; + startcc = cc; + first = FALSE; + continue; + } + + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) + break; /* Not breaking between Regional Indicators is allowed only if there are an even number of preceding RIs. */ @@ -7256,7 +8107,8 @@ while (cc < end_subject) BACKCHAR(bptr); GETCHAR(c, bptr); - if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break; + if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) + break; ricount++; } @@ -7271,14 +8123,80 @@ while (cc < end_subject) lgb != ucp_gbExtended_Pictographic) lgb = rgb; - prevcc = cc; - cc += len; + prevcc = startcc; + startcc = cc; } +while (cc < end_subject); -return cc; +return startcc; } -#endif +static PCRE2_SPTR SLJIT_FUNC do_extuni_utf_invalid(jit_arguments *args, PCRE2_SPTR cc) +{ +PCRE2_SPTR start_subject = args->begin; +PCRE2_SPTR end_subject = args->end; +int lgb, rgb, ricount; +PCRE2_SPTR prevcc, startcc, bptr; +BOOL first = TRUE; +uint32_t c; + +prevcc = cc; +startcc = NULL; +do + { + GETCHARINC_INVALID(c, cc, end_subject, break); + rgb = UCD_GRAPHBREAK(c); + + if (first) + { + lgb = rgb; + startcc = cc; + first = FALSE; + continue; + } + + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) + break; + + /* Not breaking between Regional Indicators is allowed only if there + are an even number of preceding RIs. */ + + if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator) + { + ricount = 0; + bptr = prevcc; + + /* bptr is pointing to the left-hand character */ + while (bptr > start_subject) + { + GETCHARBACK_INVALID(c, bptr, start_subject, break); + + if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) + break; + + ricount++; + } + + if ((ricount & 1) != 0) + break; /* Grapheme break required */ + } + + /* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this + allows any number of them before a following Extended_Pictographic. */ + + if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) || + lgb != ucp_gbExtended_Pictographic) + lgb = rgb; + + prevcc = startcc; + startcc = cc; + } +while (cc < end_subject); + +return startcc; +} + +#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ static PCRE2_SPTR SLJIT_FUNC do_extuni_no_utf(jit_arguments *args, PCRE2_SPTR cc) { @@ -7289,14 +8207,23 @@ PCRE2_SPTR bptr; uint32_t c; GETCHARINC(c, cc); +#if PCRE2_CODE_UNIT_WIDTH == 32 +if (c >= 0x110000) + return NULL; +#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ lgb = UCD_GRAPHBREAK(c); while (cc < end_subject) { c = *cc; +#if PCRE2_CODE_UNIT_WIDTH == 32 + if (c >= 0x110000) + break; +#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ rgb = UCD_GRAPHBREAK(c); - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) + break; /* Not breaking between Regional Indicators is allowed only if there are an even number of preceding RIs. */ @@ -7311,13 +8238,18 @@ while (cc < end_subject) { bptr--; c = *bptr; +#if PCRE2_CODE_UNIT_WIDTH == 32 + if (c >= 0x110000) + break; +#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break; ricount++; } - if ((ricount & 1) != 0) break; /* Grapheme break required */ + if ((ricount & 1) != 0) + break; /* Grapheme break required */ } /* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this @@ -7333,7 +8265,7 @@ while (cc < end_subject) return cc; } -#endif +#endif /* SUPPORT_UNICODE */ static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr) { @@ -7356,10 +8288,10 @@ switch(type) detect_partial_match(common, backtracks); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_digit, FALSE)) - read_char7_type(common, type == OP_NOT_DIGIT); + read_char7_type(common, backtracks, type == OP_NOT_DIGIT); else #endif - read_char8_type(common, type == OP_NOT_DIGIT); + read_char8_type(common, backtracks, type == OP_NOT_DIGIT); /* Flip the starting bit in the negative case. */ OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ctype_digit); add_jump(compiler, backtracks, JUMP(type == OP_DIGIT ? SLJIT_ZERO : SLJIT_NOT_ZERO)); @@ -7371,10 +8303,10 @@ switch(type) detect_partial_match(common, backtracks); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_space, FALSE)) - read_char7_type(common, type == OP_NOT_WHITESPACE); + read_char7_type(common, backtracks, type == OP_NOT_WHITESPACE); else #endif - read_char8_type(common, type == OP_NOT_WHITESPACE); + read_char8_type(common, backtracks, type == OP_NOT_WHITESPACE); OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ctype_space); add_jump(compiler, backtracks, JUMP(type == OP_WHITESPACE ? SLJIT_ZERO : SLJIT_NOT_ZERO)); return cc; @@ -7385,10 +8317,10 @@ switch(type) detect_partial_match(common, backtracks); #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_word, FALSE)) - read_char7_type(common, type == OP_NOT_WORDCHAR); + read_char7_type(common, backtracks, type == OP_NOT_WORDCHAR); else #endif - read_char8_type(common, type == OP_NOT_WORDCHAR); + read_char8_type(common, backtracks, type == OP_NOT_WORDCHAR); OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ctype_word); add_jump(compiler, backtracks, JUMP(type == OP_WORDCHAR ? SLJIT_ZERO : SLJIT_NOT_ZERO)); return cc; @@ -7396,7 +8328,7 @@ switch(type) case OP_ANY: if (check_str_ptr) detect_partial_match(common, backtracks); - read_char_range(common, common->nlmin, common->nlmax, TRUE); + read_char_range(common, common->nlmin, common->nlmax, backtracks, TRUE); if (common->nltype == NLTYPE_FIXED && common->newline > 255) { jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff); @@ -7421,6 +8353,12 @@ switch(type) #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 if (common->utf) { + if (common->invalid_utf) + { + read_char_range(common, 0, READ_CHAR_MAX, backtracks, TRUE); + return cc; + } + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); #if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 @@ -7467,7 +8405,7 @@ switch(type) case OP_ANYNL: if (check_str_ptr) detect_partial_match(common, backtracks); - read_char_range(common, common->bsr_nlmin, common->bsr_nlmax, FALSE); + read_char_range(common, common->bsr_nlmin, common->bsr_nlmax, NULL, FALSE); jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); /* We don't need to handle soft partial matching case. */ end_list = NULL; @@ -7490,7 +8428,7 @@ switch(type) case OP_HSPACE: if (check_str_ptr) detect_partial_match(common, backtracks); - read_char_range(common, 0x9, 0x3000, type == OP_NOT_HSPACE); + read_char_range(common, 0x9, 0x3000, NULL, type == OP_NOT_HSPACE); add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL)); sljit_set_current_flags(compiler, SLJIT_SET_Z); add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO)); @@ -7500,7 +8438,7 @@ switch(type) case OP_VSPACE: if (check_str_ptr) detect_partial_match(common, backtracks); - read_char_range(common, 0xa, 0x2029, type == OP_NOT_VSPACE); + read_char_range(common, 0xa, 0x2029, NULL, type == OP_NOT_VSPACE); add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL)); sljit_set_current_flags(compiler, SLJIT_SET_Z); add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO)); @@ -7516,9 +8454,12 @@ switch(type) #if PCRE2_CODE_UNIT_WIDTH != 32 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, - common->utf ? SLJIT_FUNC_OFFSET(do_extuni_utf) : SLJIT_FUNC_OFFSET(do_extuni_no_utf)); + common->utf ? (common->invalid_utf ? SLJIT_FUNC_OFFSET(do_extuni_utf_invalid) : SLJIT_FUNC_OFFSET(do_extuni_utf)) : SLJIT_FUNC_OFFSET(do_extuni_no_utf)); + if (common->invalid_utf) + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0)); #else sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_extuni_no_utf)); + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0)); #endif OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0); @@ -7566,12 +8507,12 @@ switch(type) if (type == OP_CHAR || !char_has_othercase(common, cc)) { - read_char_range(common, c, c, FALSE); + read_char_range(common, c, c, NULL, FALSE); add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c)); return cc + length; } oc = char_othercase(common, c); - read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, FALSE); + read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, NULL, FALSE); bit = c ^ oc; if (is_powerof2(bit)) { @@ -7626,13 +8567,13 @@ switch(type) if (type == OP_NOT || !char_has_othercase(common, cc)) { - read_char_range(common, c, c, TRUE); + read_char_range(common, c, c, NULL, TRUE); add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c)); } else { oc = char_othercase(common, c); - read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, TRUE); + read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, NULL, TRUE); bit = c ^ oc; if (is_powerof2(bit)) { @@ -7654,9 +8595,9 @@ switch(type) #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 127 : 255; - read_char_range(common, 0, bit, type == OP_NCLASS); + read_char_range(common, 0, bit, NULL, type == OP_NCLASS); #else - read_char_range(common, 0, 255, type == OP_NCLASS); + read_char_range(common, 0, 255, NULL, type == OP_NCLASS); #endif if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks)) @@ -7869,7 +8810,8 @@ if (common->utf && *cc == OP_REFI) /* No free saved registers so save data on stack. */ OP1(SLJIT_MOV, SLJIT_R3, 0, STR_END, 0); - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_utf_caselesscmp)); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), + SLJIT_IMM, (common->invalid_utf) ? SLJIT_FUNC_OFFSET(do_utf_caselesscmp_invalid) : SLJIT_FUNC_OFFSET(do_utf_caselesscmp)); OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0); if (common->mode == PCRE2_JIT_COMPLETE) @@ -10787,14 +11729,14 @@ switch(opcode) if (CURRENT_AS(char_iterator_backtrack)->u.charpos.othercasebit != 0) OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, CURRENT_AS(char_iterator_backtrack)->u.charpos.othercasebit); CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CURRENT_AS(char_iterator_backtrack)->u.charpos.chr, CURRENT_AS(char_iterator_backtrack)->matchingpath); - skip_char_back(common); + move_back(common, NULL, TRUE); CMPTO(SLJIT_GREATER, STR_PTR, 0, TMP2, 0, label); } else { OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, base, offset1); - skip_char_back(common); + move_back(common, NULL, TRUE); OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath); } @@ -12016,6 +12958,9 @@ sljit_emit_fast_return(compiler, TMP2, 0); #undef COMPILE_BACKTRACKINGPATH #undef CURRENT_AS +#define PUBLIC_JIT_COMPILE_CONFIGURATION_OPTIONS \ + (PCRE2_JIT_INVALID_UTF) + static int jit_compile(pcre2_code *code, sljit_u32 mode) { pcre2_real_code *re = (pcre2_real_code *)code; @@ -12052,6 +12997,11 @@ common->re = re; common->name_table = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)); rootbacktrack.cc = common->name_table + re->name_count * re->name_entry_size; +#ifdef SUPPORT_UNICODE +common->invalid_utf = (mode & PCRE2_JIT_INVALID_UTF) != 0; +#endif /* SUPPORT_UNICODE */ +mode &= ~PUBLIC_JIT_COMPILE_CONFIGURATION_OPTIONS; + common->start = rootbacktrack.cc; common->read_only_data_head = NULL; common->fcc = tables + fcc_offset; @@ -12117,6 +13067,8 @@ if (common->utf) common->bsr_nlmax = (CHAR_CR > CHAR_NL) ? CHAR_CR : CHAR_NL; common->bsr_nlmin = (CHAR_CR < CHAR_NL) ? CHAR_CR : CHAR_NL; } +else + common->invalid_utf = FALSE; #endif /* SUPPORT_UNICODE */ ccend = bracketend(common->start); @@ -12557,17 +13509,34 @@ if (common->utfreadchar != NULL) set_jumps(common->utfreadchar, LABEL()); do_utfreadchar(common); } -if (common->utfreadchar16 != NULL) - { - set_jumps(common->utfreadchar16, LABEL()); - do_utfreadchar16(common); - } if (common->utfreadtype8 != NULL) { set_jumps(common->utfreadtype8, LABEL()); do_utfreadtype8(common); } +if (common->utfpeakcharback != NULL) + { + set_jumps(common->utfpeakcharback, LABEL()); + do_utfpeakcharback(common); + } #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ +#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 +if (common->utfreadchar_invalid != NULL) + { + set_jumps(common->utfreadchar_invalid, LABEL()); + do_utfreadchar_invalid(common); + } +if (common->utfmoveback_invalid) + { + set_jumps(common->utfmoveback_invalid, LABEL()); + do_utfmoveback_invalid(common); + } +if (common->utfpeakcharback_invalid) + { + set_jumps(common->utfpeakcharback_invalid, LABEL()); + do_utfpeakcharback_invalid(common); + } +#endif /* PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 */ if (common->getucd != NULL) { set_jumps(common->getucd, LABEL()); @@ -12644,7 +13613,7 @@ Returns: 0: success or (*NOJIT) was used */ #define PUBLIC_JIT_COMPILE_OPTIONS \ - (PCRE2_JIT_COMPLETE|PCRE2_JIT_PARTIAL_SOFT|PCRE2_JIT_PARTIAL_HARD) + (PCRE2_JIT_COMPLETE|PCRE2_JIT_PARTIAL_SOFT|PCRE2_JIT_PARTIAL_HARD|PCRE2_JIT_INVALID_UTF) PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION pcre2_jit_compile(pcre2_code *code, uint32_t options) @@ -12659,6 +13628,7 @@ return PCRE2_ERROR_JIT_BADOPTION; pcre2_real_code *re = (pcre2_real_code *)code; executable_functions *functions; +uint32_t excluded_options; int result; if (code == NULL) @@ -12673,21 +13643,24 @@ functions = (executable_functions *)re->executable_jit; if ((options & PCRE2_JIT_COMPLETE) != 0 && (functions == NULL || functions->executable_funcs[0] == NULL)) { - result = jit_compile(code, PCRE2_JIT_COMPLETE); + excluded_options = (PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_PARTIAL_HARD); + result = jit_compile(code, options & ~excluded_options); if (result != 0) return result; } if ((options & PCRE2_JIT_PARTIAL_SOFT) != 0 && (functions == NULL || functions->executable_funcs[1] == NULL)) { - result = jit_compile(code, PCRE2_JIT_PARTIAL_SOFT); + excluded_options = (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_HARD); + result = jit_compile(code, options & ~excluded_options); if (result != 0) return result; } if ((options & PCRE2_JIT_PARTIAL_HARD) != 0 && (functions == NULL || functions->executable_funcs[2] == NULL)) { - result = jit_compile(code, PCRE2_JIT_PARTIAL_HARD); + excluded_options = (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT); + result = jit_compile(code, options & ~excluded_options); if (result != 0) return result; } diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c index a28e9a0..78fdcca 100644 --- a/src/pcre2_jit_test.c +++ b/src/pcre2_jit_test.c @@ -93,6 +93,8 @@ POSSIBILITY OF SUCH DAMAGE. */ static int regression_tests(void); +static int invalid_utf8_regression_tests(void); +static int invalid_utf16_regression_tests(void); int main(void) { @@ -108,7 +110,9 @@ int main(void) printf("JIT must be enabled to run pcre_jit_test\n"); return 1; } - return regression_tests(); + return regression_tests() + || invalid_utf8_regression_tests() + || invalid_utf16_regression_tests(); } /* --------------------------------------------------------------------------------------- */ @@ -184,7 +188,7 @@ static struct regression_test_case regression_test_cases[] = { { CM, A, 0, 0, "\\Ca", "CDA" }, { M, A, 0, 0 | F_NOMATCH, "\\Cx", "cda" }, { CM, A, 0, 0 | F_NOMATCH, "\\Cx", "CDA" }, -#endif +#endif /* !NEVER_BACKSLASH_C */ { CMUP, A, 0, 0, "\xf0\x90\x90\x80\xf0\x90\x90\xa8", "\xf0\x90\x90\xa8\xf0\x90\x90\x80" }, { CMUP, A, 0, 0, "\xf0\x90\x90\x80{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" }, { CMUP, A, 0, 0, "\xf0\x90\x90\xa8{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" }, @@ -262,6 +266,9 @@ static struct regression_test_case regression_test_cases[] = { { MU, A, 0, 0, "\xc6\x82\xc6\x82|\xc7\x83\xc7\x83|\xc8\x84\xc8\x84", "\xf1\x83\x82\x82\xc8\x84\xc8\x84" }, { U, A, 0, 0, "\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80", "\xdf\xbf\xc2\x80\xe4\x84\x80" }, { U, A, 0, 0, "(?:\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80)#", "\xdf\xbf\xc2\x80#\xe4\x84\x80#" }, + { CM, A, 0, 0, "ab|cd", "CD" }, + { CM, A, 0, 0, "a1277|a1377|bX487", "bx487" }, + { CM, A, 0, 0, "a1277|a1377|bx487", "bX487" }, /* Greedy and non-greedy ? operators. */ { MU, A, 0, 0, "(?:a)?a", "laab" }, @@ -1163,7 +1170,7 @@ static int regression_tests(void) #elif defined SUPPORT_PCRE2_32 PCRE2_UCHAR32 cpu_info[128]; #endif -#if defined SUPPORT_UTF && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2) +#if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2) int return_value; #endif @@ -1331,9 +1338,8 @@ static int regression_tests(void) ovector8_2[i] = -2; } if (re8) { - (void)pcre2_set_match_limit_8(mcontext8, 10000000); return_value8[1] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input), - current->start_offset & OFFSET_MASK, current->match_options, mdata8_2, mcontext8); + current->start_offset & OFFSET_MASK, current->match_options, mdata8_2, NULL); if (pcre2_jit_compile_8(re8, jit_compile_mode)) { printf("\n8 bit: JIT compiler does not support \"%s\"\n", current->pattern); @@ -1376,9 +1382,8 @@ static int regression_tests(void) else length16 = copy_char8_to_char16((PCRE2_SPTR8)current->input, regtest_buf16, REGTEST_MAX_LENGTH16); - (void)pcre2_set_match_limit_16(mcontext16, 10000000); return_value16[1] = pcre2_match_16(re16, regtest_buf16, length16, - current->start_offset & OFFSET_MASK, current->match_options, mdata16_2, mcontext16); + current->start_offset & OFFSET_MASK, current->match_options, mdata16_2, NULL); if (pcre2_jit_compile_16(re16, jit_compile_mode)) { printf("\n16 bit: JIT compiler does not support \"%s\"\n", current->pattern); @@ -1421,9 +1426,8 @@ static int regression_tests(void) else length32 = copy_char8_to_char32((PCRE2_SPTR8)current->input, regtest_buf32, REGTEST_MAX_LENGTH32); - (void)pcre2_set_match_limit_32(mcontext32, 10000000); return_value32[1] = pcre2_match_32(re32, regtest_buf32, length32, - current->start_offset & OFFSET_MASK, current->match_options, mdata32_2, mcontext32); + current->start_offset & OFFSET_MASK, current->match_options, mdata32_2, NULL); if (pcre2_jit_compile_32(re32, jit_compile_mode)) { printf("\n32 bit: JIT compiler does not support \"%s\"\n", current->pattern); @@ -1451,7 +1455,7 @@ static int regression_tests(void) is_successful = 1; if (!(current->start_offset & F_DIFF)) { -#if defined SUPPORT_UTF && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2) +#if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2) if (!(current->start_offset & F_FORCECONV)) { /* All results must be the same. */ @@ -1500,8 +1504,8 @@ static int regression_tests(void) is_successful = 0; } else #endif - if (return_value >= 0 || return_value == PCRE_ERROR_PARTIAL) { - if (return_value == PCRE_ERROR_PARTIAL) { + if (return_value >= 0 || return_value == PCRE2_ERROR_PARTIAL) { + if (return_value == PCRE2_ERROR_PARTIAL) { return_value = 2; } else { return_value *= 2; @@ -1516,20 +1520,20 @@ static int regression_tests(void) return_value32[0] = return_value; #endif /* Transform back the results. */ - if (current->flags & PCRE_UTF8) { + if (current->compile_options & PCRE2_UTF) { #ifdef SUPPORT_PCRE2_16 for (i = 0; i < return_value; ++i) { - if (ovector16_1[i] >= 0) + if (ovector16_1[i] != PCRE2_UNSET) ovector16_1[i] = regtest_offsetmap16[ovector16_1[i]]; - if (ovector16_2[i] >= 0) + if (ovector16_2[i] != PCRE2_UNSET) ovector16_2[i] = regtest_offsetmap16[ovector16_2[i]]; } #endif #ifdef SUPPORT_PCRE2_32 for (i = 0; i < return_value; ++i) { - if (ovector32_1[i] >= 0) + if (ovector32_1[i] != PCRE2_UNSET) ovector32_1[i] = regtest_offsetmap32[ovector32_1[i]]; - if (ovector32_2[i] >= 0) + if (ovector32_2[i] != PCRE2_UNSET) ovector32_2[i] = regtest_offsetmap32[ovector32_2[i]]; } #endif @@ -1539,7 +1543,7 @@ static int regression_tests(void) #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector16_1[i] || ovector8_1[i] != ovector16_2[i]) { printf("\n8 and 16 bit: Ovector[%d] value differs(J8:%d,I8:%d,J16:%d,I16:%d): [%d] '%s' @ '%s' \n", - i, ovector8_1[i], ovector8_2[i], ovector16_1[i], ovector16_2[i], + i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector16_1[i], (int)ovector16_2[i], total, current->pattern, current->input); is_successful = 0; } @@ -1547,7 +1551,7 @@ static int regression_tests(void) #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector32_1[i] || ovector8_1[i] != ovector32_2[i]) { printf("\n8 and 32 bit: Ovector[%d] value differs(J8:%d,I8:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n", - i, ovector8_1[i], ovector8_2[i], ovector32_1[i], ovector32_2[i], + i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector32_1[i], (int)ovector32_2[i], total, current->pattern, current->input); is_successful = 0; } @@ -1555,7 +1559,7 @@ static int regression_tests(void) #if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32 if (ovector16_1[i] != ovector16_2[i] || ovector16_1[i] != ovector32_1[i] || ovector16_1[i] != ovector32_2[i]) { printf("\n16 and 32 bit: Ovector[%d] value differs(J16:%d,I16:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n", - i, ovector16_1[i], ovector16_2[i], ovector32_1[i], ovector32_2[i], + i, (int)ovector16_1[i], (int)ovector16_2[i], (int)ovector32_1[i], (int)ovector32_2[i], total, current->pattern, current->input); is_successful = 0; } @@ -1751,4 +1755,426 @@ static int regression_tests(void) } } +#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8 + +#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED) +#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF) +#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF) + +struct invalid_utf8_regression_test_case { + int compile_options; + int jit_compile_options; + int start_offset; + int skip_left; + int skip_right; + int expected_result; + const char *pattern[2]; + const char *input; +}; + +static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = { + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xf4\x8f\xbf\xbf" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xf0\x90\x80\x80" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf4\x90\x80\x80" }, + { UDA, CI, 0, 0, 1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x90\x80\x7f" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x90\x80\xc0" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xef\xbf\xbf#" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xef\xbf\xbf" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xe0\xa0\x80#" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xe0\xa0\x80" }, + { UDA, CI, 0, 0, 2, -1, { ".", NULL }, "\xef\xbf\xbf#" }, + { UDA, CI, 0, 0, 1, -1, { ".", NULL }, "\xef\xbf\xbf" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xef\xbf\x7f#" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xef\xbf\xc0" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x9f\xbf#" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x9f\xbf" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xed\x9f\xbf#" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xed\xa0\x80#" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xee\x80\x80#" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xed\xbf\xbf#" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf##" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf#" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80##" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80#" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x80##" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xdf\xc0##" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x80" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xdf\xc0" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xc1\xbf##" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xc1\xbf" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\x80###" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\x80" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf8###" }, + { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf8" }, + { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\x7f" }, + + { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80#" }, + { UDA, CPI, 4, 1, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf#" }, + { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "#\xef\xbf\xbf#" }, + { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "#\xe0\xa0\x80#" }, + { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf0\x90\x80\x80#" }, + { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf#" }, + { UDA, CPI, 4, 2, 0, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xf0\x80\x80#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xed\xa0\x80#" }, + { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "##\xdf\xbf#" }, + { UDA, CPI, 4, 2, 0, 1, { "\\B", NULL }, "##\xdf\xbf#" }, + { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "##\xc2\x80#" }, + { UDA, CPI, 4, 2, 0, 1, { "\\B", NULL }, "##\xc2\x80#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xc1\xbf#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xdf\xc0#" }, + { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xe0\x80#" }, + { UDA, CPI, 4, 2, 0, -1, { "\\B", "\\b" }, "##\xe0\x80#" }, + + { UDA, CPI, 3, 0, 0, 1, { "\\B", NULL }, "\xef\xbf\xbf#" }, + { UDA, CPI, 3, 0, 0, 1, { "\\B", NULL }, "\xe0\xa0\x80#" }, + { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf#" }, + { UDA, CPI, 3, 1, 0, -1, { "\\B", "\\b" }, "\xef\xbf\xbf#" }, + { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xdf\x80\x80#" }, + { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xef\xbf\xff#" }, + { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xef\xff\xbf#" }, + { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xed\xbf\xbf#" }, + + { UDA, CPI, 2, 0, 0, 1, { "\\B", NULL }, "\xdf\xbf#" }, + { UDA, CPI, 2, 0, 0, 1, { "\\B", NULL }, "\xc2\x80#" }, + { UDA, CPI, 2, 1, 0, -1, { "\\B", "\\b" }, "\xdf\xbf#" }, + { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xc1\xbf#" }, + { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xe0\x80#" }, + { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xdf\xff#" }, + { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xff\xbf#" }, + + { UDA, CPI, 1, 0, 0, 1, { "\\B", NULL }, "\x7f#" }, + { UDA, CPI, 1, 0, 0, 1, { "\\B", NULL }, "\x01#" }, + { UDA, CPI, 1, 0, 0, -1, { "\\B", "\\b" }, "\x80#" }, + { UDA, CPI, 1, 0, 0, -1, { "\\B", "\\b" }, "\x80#" }, + + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, { "(.)\\1", NULL }, "a\xff" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" }, + + { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "A" }, + { UDA, CPI, 0, 0, 0, -1, { "\\X", NULL }, "\xff" }, + { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xc3\xa1" }, + { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xc3\xa1" }, + { UDA, CPI, 0, 0, 0, -1, { "\\X", NULL }, "\xc3\x7f" }, + { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xe1\xbd\xb8" }, + { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" }, + { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xf0\x90\x90\x80" }, + { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" }, + + { 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL } +}; + +#undef UDA +#undef CI +#undef CPI + +static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *current, + int pattern_index, int i, pcre2_match_data_8 *mdata) +{ + pcre2_code_8 *code; + int result, errorcode; + PCRE2_SIZE length, erroroffset; + + if (current->pattern[i] == NULL) + return 1; + + code = pcre2_compile_8((PCRE2_UCHAR8*)current->pattern[i], PCRE2_ZERO_TERMINATED, + current->compile_options, &errorcode, &erroroffset, NULL); + + if (!code) { + printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset); + return 0; + } + + if (pcre2_jit_compile_8(code, current->jit_compile_options) != 0) { + printf("Pattern[%d:0] cannot be compiled by the JIT compiler.\n", pattern_index); + pcre2_code_free_8(code); + return 0; + } + + length = (PCRE2_SIZE)(strlen(current->input) - current->skip_left - current->skip_right); + + if (current->jit_compile_options & PCRE2_JIT_COMPLETE) { + result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left), + length, current->start_offset - current->skip_left, 0, mdata, NULL); + + if (result != current->expected_result) { + printf("Pattern[%d:0] match result %d differs from expected %d.\n", pattern_index, result, current->expected_result); + pcre2_code_free_8(code); + return 0; + } + } + + if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) { + result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left), + length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL); + + if (result != current->expected_result) { + printf("Pattern[%d:0] partial match result %d differs from expected %d.\n", pattern_index, result, current->expected_result); + pcre2_code_free_8(code); + return 0; + } + } + + pcre2_code_free_8(code); + return 1; +} + +static int invalid_utf8_regression_tests(void) +{ + struct invalid_utf8_regression_test_case *current; + pcre2_match_data_8 *mdata; + int total = 0, successful = 0; + int result; + + printf("\nRunning invalid-utf8 JIT regression tests\n"); + + mdata = pcre2_match_data_create_8(4, NULL); + + for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) { + /* printf("\nPattern: %s :\n", current->pattern); */ + total++; + + result = 1; + if (!run_invalid_utf8_test(current, total - 1, 0, mdata)) + result = 0; + if (!run_invalid_utf8_test(current, total - 1, 1, mdata)) + result = 0; + + if (result) { + successful++; + } + + printf("."); + if ((total % 60) == 0) + printf("\n"); + } + + if ((total % 60) != 0) + printf("\n"); + + pcre2_match_data_free_8(mdata); + + if (total == successful) { + printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n"); + return 0; + } else { + printf("\nInvalid UTF8 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful); + return 1; + } +} + +#else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_8 */ + +static int invalid_utf8_regression_tests(void) +{ + return 0; +} + +#endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_8 */ + +#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_16 + +#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED) +#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF) +#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF) + +struct invalid_utf16_regression_test_case { + int compile_options; + int jit_compile_options; + int start_offset; + int skip_left; + int skip_right; + int expected_result; + const PCRE2_UCHAR16 *pattern[2]; + const PCRE2_UCHAR16 *input; +}; + +static PCRE2_UCHAR16 allany[] = { '.', 0 }; +static PCRE2_UCHAR16 non_word_boundary[] = { '\\', 'B', 0 }; +static PCRE2_UCHAR16 word_boundary[] = { '\\', 'b', 0 }; +static PCRE2_UCHAR16 backreference[] = { '(', '.', ')', '\\', '1', 0 }; +static PCRE2_UCHAR16 grapheme[] = { '\\', 'X', 0 }; +static PCRE2_UCHAR16 test1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 }; +static PCRE2_UCHAR16 test2[] = { 0xd800, 0xdc00, '#', 0 }; +static PCRE2_UCHAR16 test3[] = { 0xdbff, 0xdfff, '#', 0 }; +static PCRE2_UCHAR16 test4[] = { 0xd800, 0xdbff, '#', 0 }; +static PCRE2_UCHAR16 test5[] = { '#', 0xd800, '#', 0 }; +static PCRE2_UCHAR16 test6[] = { 'a', 'A', 0xdc28, 0 }; +static PCRE2_UCHAR16 test7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 }; + +static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = { + { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test1 }, + { UDA, CI, 1, 0, 0, 1, { allany, NULL }, test1 }, + { UDA, CI, 2, 0, 0, 1, { allany, NULL }, test1 }, + { UDA, CI, 3, 0, 0, 1, { allany, NULL }, test1 }, + { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test2 }, + { UDA, CI, 0, 0, 2, -1, { allany, NULL }, test2 }, + { UDA, CI, 1, 0, 0, -1, { allany, NULL }, test2 }, + { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test3 }, + { UDA, CI, 0, 0, 2, -1, { allany, NULL }, test3 }, + { UDA, CI, 1, 0, 0, -1, { allany, NULL }, test3 }, + + { UDA, CPI, 1, 0, 0, 1, { non_word_boundary, NULL }, test1 }, + { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test1 }, + { UDA, CPI, 3, 0, 0, 1, { non_word_boundary, NULL }, test1 }, + { UDA, CPI, 4, 0, 0, 1, { non_word_boundary, NULL }, test1 }, + { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test2 }, + { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test3 }, + { UDA, CPI, 2, 1, 0, -1, { non_word_boundary, word_boundary }, test2 }, + { UDA, CPI, 2, 1, 0, -1, { non_word_boundary, word_boundary }, test3 }, + { UDA, CPI, 2, 0, 0, -1, { non_word_boundary, word_boundary }, test4 }, + { UDA, CPI, 2, 0, 0, -1, { non_word_boundary, word_boundary }, test5 }, + + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { backreference, NULL }, test6 }, + { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, { backreference, NULL }, test6 }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { backreference, NULL }, test7 }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { backreference, NULL }, test7 }, + + { UDA, CPI, 0, 0, 0, 1, { grapheme, NULL }, test6 }, + { UDA, CPI, 1, 0, 0, 1, { grapheme, NULL }, test6 }, + { UDA, CPI, 2, 0, 0, -1, { grapheme, NULL }, test6 }, + { UDA, CPI, 0, 0, 0, 1, { grapheme, NULL }, test7 }, + { UDA, CPI, 2, 0, 0, 1, { grapheme, NULL }, test7 }, + { UDA, CPI, 1, 0, 0, -1, { grapheme, NULL }, test7 }, + + { 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL } +}; + +#undef UDA +#undef CI +#undef CPI + +static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *current, + int pattern_index, int i, pcre2_match_data_16 *mdata) +{ + pcre2_code_16 *code; + int result, errorcode; + PCRE2_SIZE length, erroroffset; + const PCRE2_UCHAR16 *input; + + if (current->pattern[i] == NULL) + return 1; + + code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED, + current->compile_options, &errorcode, &erroroffset, NULL); + + if (!code) { + printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset); + return 0; + } + + if (pcre2_jit_compile_16(code, current->jit_compile_options) != 0) { + printf("Pattern[%d:0] cannot be compiled by the JIT compiler.\n", pattern_index); + pcre2_code_free_16(code); + return 0; + } + + input = current->input; + length = 0; + + while (*input++ != 0) + length++; + + length -= current->skip_left + current->skip_right; + + if (current->jit_compile_options & PCRE2_JIT_COMPLETE) { + result = pcre2_jit_match_16(code, (current->input + current->skip_left), + length, current->start_offset - current->skip_left, 0, mdata, NULL); + + if (result != current->expected_result) { + printf("Pattern[%d:0] match result %d differs from expected %d.\n", pattern_index, result, current->expected_result); + pcre2_code_free_16(code); + return 0; + } + } + + if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) { + result = pcre2_jit_match_16(code, (current->input + current->skip_left), + length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL); + + if (result != current->expected_result) { + printf("Pattern[%d:0] partial match result %d differs from expected %d.\n", pattern_index, result, current->expected_result); + pcre2_code_free_16(code); + return 0; + } + } + + pcre2_code_free_16(code); + return 1; +} + +static int invalid_utf16_regression_tests(void) +{ + struct invalid_utf16_regression_test_case *current; + pcre2_match_data_16 *mdata; + int total = 0, successful = 0; + int result; + + printf("\nRunning invalid-utf16 JIT regression tests\n"); + + mdata = pcre2_match_data_create_16(4, NULL); + + for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) { + /* printf("\nPattern: %s :\n", current->pattern); */ + total++; + + result = 1; + if (!run_invalid_utf16_test(current, total - 1, 0, mdata)) + result = 0; + if (!run_invalid_utf16_test(current, total - 1, 1, mdata)) + result = 0; + + if (result) { + successful++; + } + + printf("."); + if ((total % 60) == 0) + printf("\n"); + } + + if ((total % 60) != 0) + printf("\n"); + + pcre2_match_data_free_16(mdata); + + if (total == successful) { + printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n"); + return 0; + } else { + printf("\nInvalid UTF16 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful); + return 1; + } +} + +#else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_16 */ + +static int invalid_utf16_regression_tests(void) +{ + return 0; +} + +#endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_16 */ + /* End of pcre2_jit_test.c */