Add option bits for read_char in JIT.

This commit is contained in:
Zoltán Herczeg 2018-09-15 12:35:56 +00:00
parent baa91ecc79
commit 142c667bbc
2 changed files with 437 additions and 219 deletions

View File

@ -485,6 +485,7 @@ typedef struct compiler_common {
jump_list *getucdtype; jump_list *getucdtype;
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
jump_list *utfreadchar; jump_list *utfreadchar;
jump_list *utfreadchar_invalid_precise;
jump_list *utfreadtype8; jump_list *utfreadtype8;
jump_list *utfpeakcharback; jump_list *utfpeakcharback;
#endif #endif
@ -3462,8 +3463,13 @@ if (common->utf)
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
} }
static void read_char_range(compiler_common *common, sljit_u32 min, sljit_u32 max, #define READ_CHAR_UPDATE_STR_PTR 0x1
jump_list **backtracks, BOOL update_str_ptr) #define READ_CHAR_UPDATE_STR_PTR_INVALID 0x2
#define READ_CHAR_UPDATE_STR_PTR_PRECISE (READ_CHAR_UPDATE_STR_PTR | READ_CHAR_UPDATE_STR_PTR_INVALID)
#define READ_CHAR_VALID_UTF 0x4
static void read_char(compiler_common *common, sljit_u32 min, sljit_u32 max,
jump_list **backtracks, sljit_u32 options)
{ {
/* Reads the precise value of a character into TMP1, if the character is /* Reads the precise value of a character into TMP1, if the character is
between min and max (c >= min && c <= max). Otherwise it returns with a value between min and max (c >= min && c <= max). Otherwise it returns with a value
@ -3476,24 +3482,30 @@ struct sljit_jump *jump;
struct sljit_jump *jump2; struct sljit_jump *jump2;
#endif #endif
SLJIT_UNUSED_ARG(update_str_ptr);
SLJIT_UNUSED_ARG(min); SLJIT_UNUSED_ARG(min);
SLJIT_UNUSED_ARG(max); SLJIT_UNUSED_ARG(max);
SLJIT_UNUSED_ARG(backtracks); SLJIT_UNUSED_ARG(backtracks);
SLJIT_UNUSED_ARG(options);
SLJIT_ASSERT(min <= max); SLJIT_ASSERT(min <= max);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 #ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf) if (common->utf)
{ {
if (max < 128 && !update_str_ptr) return; if (max < 128 && !(options & READ_CHAR_UPDATE_STR_PTR)) return;
if (common->invalid_utf) if (common->invalid_utf && !(options & READ_CHAR_VALID_UTF))
{ {
jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80); jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80);
if (options & READ_CHAR_UPDATE_STR_PTR_INVALID)
add_jump(compiler, &common->utfreadchar_invalid_precise, JUMP(SLJIT_FAST_CALL));
else
add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
if (backtracks != NULL) if (backtracks != NULL)
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
JUMPHERE(jump); JUMPHERE(jump);
@ -3504,7 +3516,7 @@ if (common->utf)
if (min >= 0x10000) if (min >= 0x10000)
{ {
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xf0); OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xf0);
if (update_str_ptr) if (options & READ_CHAR_UPDATE_STR_PTR)
OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x7); jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x7);
@ -3516,19 +3528,19 @@ if (common->utf)
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2)); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
if (!update_str_ptr) if (!(options & READ_CHAR_UPDATE_STR_PTR))
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
JUMPHERE(jump2); JUMPHERE(jump2);
if (update_str_ptr) if (options & READ_CHAR_UPDATE_STR_PTR)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
} }
else if (min >= 0x800 && max <= 0xffff) else if (min >= 0x800 && max <= 0xffff)
{ {
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xe0); OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xe0);
if (update_str_ptr) if (options & READ_CHAR_UPDATE_STR_PTR)
OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xf); jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xf);
@ -3536,13 +3548,13 @@ if (common->utf)
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
if (!update_str_ptr) if (!(options & READ_CHAR_UPDATE_STR_PTR))
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
JUMPHERE(jump2); JUMPHERE(jump2);
if (update_str_ptr) if (options & READ_CHAR_UPDATE_STR_PTR)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
} }
else if (max >= 0x800) else if (max >= 0x800)
@ -3557,7 +3569,7 @@ if (common->utf)
else else
{ {
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
if (!update_str_ptr) if (!(options & READ_CHAR_UPDATE_STR_PTR))
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
else else
OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
@ -3565,31 +3577,30 @@ if (common->utf)
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
if (update_str_ptr) if (options & READ_CHAR_UPDATE_STR_PTR)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
} }
JUMPHERE(jump); JUMPHERE(jump);
} }
#endif #elif PCRE2_CODE_UNIT_WIDTH == 16
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16
if (common->utf) if (common->utf)
{ {
if (max < 0xd800 && !update_str_ptr) return; if (max < 0xd800 && !(options & READ_CHAR_UPDATE_STR_PTR)) return;
if (max >= 0x10000 || common->invalid_utf) if (common->invalid_utf && !(options & READ_CHAR_VALID_UTF))
{ {
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
if (common->invalid_utf)
{
jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
if (backtracks != NULL) if (backtracks != NULL)
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
JUMPHERE(jump);
return;
} }
else
if (max >= 0x10000)
{ {
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800); jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
/* TMP2 contains the high surrogate. */ /* TMP2 contains the high surrogate. */
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
@ -3597,7 +3608,6 @@ if (common->utf)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00); OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
}
JUMPHERE(jump); JUMPHERE(jump);
return; return;
} }
@ -3605,13 +3615,25 @@ if (common->utf)
/* Skip low surrogate if necessary. */ /* Skip low surrogate if necessary. */
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800); jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
if (update_str_ptr) if (options & READ_CHAR_UPDATE_STR_PTR)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
if (max >= 0xd800) if (max >= 0xd800)
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x10000); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x10000);
JUMPHERE(jump); JUMPHERE(jump);
} }
#endif #elif PCRE2_CODE_UNIT_WIDTH == 32
if (common->invalid_utf)
{
if (backtracks != NULL)
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
else
{
OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
}
}
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
#endif /* SUPPORT_UNICODE */
} }
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
@ -3646,6 +3668,7 @@ SLJIT_ASSERT(common->utf);
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
/* All values > 127 are zero in ctypes. */
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
if (negated) if (negated)
@ -3700,14 +3723,15 @@ if (common->utf)
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc0); OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2);
if (common->invalid_utf) if (common->invalid_utf)
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x1f)); add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe0 - 0xc2));
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80); OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80);
if (common->invalid_utf) if (common->invalid_utf)
add_jump(compiler, backtracks, CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x3f)); add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40));
OP2(SLJIT_OR, TMP2, 0, TMP2, 0, TMP1, 0); OP2(SLJIT_OR, TMP2, 0, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
@ -3718,6 +3742,7 @@ if (common->utf)
else if (common->invalid_utf) else if (common->invalid_utf)
{ {
add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
@ -3970,6 +3995,122 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
static void do_utfreadchar_invalid(compiler_common *common) static void do_utfreadchar_invalid(compiler_common *common)
{ {
/* Slow decoding of a UTF-8 character when the subject may contain invalid
UTF-8. TMP1 contains the first byte of the character (>= 0xc0). Returns the
character value in TMP1, or INVALID_UTF_CHAR when the sequence is rejected.
STR_PTR is undefined for invalid characters.

The callers visible in this file advance STR_PTR past the first byte before
making the fast call, and the offsets used below rely on that. */
DEFINE_COMPILER;
sljit_s32 i;
struct sljit_jump *jump;
struct sljit_jump *buffer_end_close;
struct sljit_label *three_byte_entry;
struct sljit_label *exit_invalid_label;
/* One slot per rejection point; all are bound to exit_invalid_label at the end. */
struct sljit_jump *exit_invalid[11];
sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
/* Bias the first byte by 0xc2 so a single range check covers the lead bytes
accepted here (0xc2 .. 0xf4). */
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc2);
/* Usually more than 3 characters remained in the subject buffer. */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
/* Not a valid start of a multi-byte sequence, no more bytes read. */
/* NOTE(review): presumably an unsigned compare, so first bytes below 0xc2
wrap around and are rejected here as well - confirm against sljit. */
exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf5 - 0xc2);
/* Fewer than three code units follow the first byte: take the careful path. */
buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
/* Second byte. STR_PTR is already three units ahead, hence the -3 offset. */
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3));
/* Undo part of the bias: TMP1 is now (first byte - 0xc0). */
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
/* A continuation byte must lie in 0x80 .. 0xbf. */
OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
/* After the shift, bit 0x800 is bit 5 of (first byte - 0xc0): it is set
when the first byte is 0xe0 or above, i.e. more bytes are needed. */
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
jump = JUMP(SLJIT_NOT_ZERO);
/* Two-byte sequence complete: give back the two extra units. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
JUMPHERE(jump);
/* Three-byte sequence. */
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
/* TMP1 is merged before the range check; on failure it is overwritten with
INVALID_UTF_CHAR at the exit label anyway. */
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
/* Bit 0x10000 is bit 4 of (first byte - 0xc0) after two shifts: set when
the first byte is 0xf0 or above. */
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10000);
jump = JUMP(SLJIT_NOT_ZERO);
/* Also entered from the near-buffer-end path below. */
three_byte_entry = LABEL();
/* TMP1 still carries the 0x20000 left over from the first byte's 0xe0
prefix, so subtracting 0x2d800 leaves (code point - 0xd800); a result
below 0x800 therefore means a surrogate (0xd800 .. 0xdfff). */
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x2d800);
exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
/* Restore the plain code point. */
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
/* Values below 0x800 fit in two bytes: reject the overlong encoding. */
exit_invalid[4] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
JUMPHERE(jump);
/* Four-byte sequence. */
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
/* 0xc10000 removes the 0xc00000 left over from the first byte's 0xf0
prefix plus 0x10000, leaving (code point - 0x10000). The range check then
rejects code points above 0x10ffff and, assuming an unsigned compare,
overlong encodings of code points below 0x10000. */
OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc10000);
exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000);
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
/* Careful path: the subject ends within three code units of the first byte,
so every read is preceded by its own bounds check. TMP1 still holds
(first byte - 0xc2) here. */
JUMPHERE(buffer_end_close);
/* STR_PTR now points one unit past the first byte, i.e. past the second
byte if one exists. */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
exit_invalid[7] = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
/* Two-byte sequence. */
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
exit_invalid[8] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
jump = JUMP(SLJIT_NOT_ZERO);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
/* Three-byte sequence. */
JUMPHERE(jump);
/* The third byte must still be inside the subject. */
exit_invalid[9] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
exit_invalid[10] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
/* One will be subtracted from STR_PTR later (at three_byte_entry). */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
/* Four byte sequences are not possible. */
/* A value of 0x30000 or more means the first byte asked for four bytes;
that case falls through into the invalid exit below. */
CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x30000, three_byte_entry);
/* Common exit for every rejection: report INVALID_UTF_CHAR in TMP1. */
exit_invalid_label = LABEL();
for (i = 0; i < 11; i++)
sljit_set_label(exit_invalid[i], exit_invalid_label);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}
static void do_utfreadchar_invalid_precise(compiler_common *common)
{
/* Slow decoding a UTF-8 character. TMP1 contains the first byte
of the character (>= 0xc0). Return char value in TMP1. */ of the character (>= 0xc0). Return char value in TMP1. */
DEFINE_COMPILER; DEFINE_COMPILER;
struct sljit_jump *jump; struct sljit_jump *jump;
@ -3987,7 +4128,7 @@ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
/* Not a valid start of a multi-byte sequence, no more bytes read. */ /* Not a valid start of a multi-byte sequence, no more bytes read. */
exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf8 - 0xc); exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xf8 - 0xc0);
buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0); buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
@ -4576,7 +4717,7 @@ if ((overall_options & PCRE2_FIRSTLINE) != 0)
mainloop = LABEL(); mainloop = LABEL();
/* Continual stores does not cause data dependency. */ /* Continual stores does not cause data dependency. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0); OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0);
read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE); read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_UPDATE_STR_PTR_PRECISE);
check_newlinechar(common, common->nltype, &newline, TRUE); check_newlinechar(common, common->nltype, &newline, TRUE);
CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, mainloop); CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, mainloop);
JUMPHERE(end); JUMPHERE(end);
@ -6206,7 +6347,7 @@ move_back(common, NULL, FALSE);
loop = LABEL(); loop = LABEL();
common->ff_newline_shortcut = loop; common->ff_newline_shortcut = loop;
read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE); read_char(common, common->nlmin, common->nlmax, NULL, READ_CHAR_UPDATE_STR_PTR_PRECISE);
lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF) if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF)
foundcr = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); foundcr = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
@ -6451,7 +6592,8 @@ else
{ {
move_back(common, &invalid_utf, FALSE); move_back(common, &invalid_utf, FALSE);
check_start_used_ptr(common); check_start_used_ptr(common);
read_char_range(common, 0, READ_CHAR_MAX, &invalid_utf, TRUE); /* No need precise read since match fails anyway. */
read_char(common, 0, READ_CHAR_MAX, &invalid_utf, READ_CHAR_UPDATE_STR_PTR);
} }
/* Testing char type. */ /* Testing char type. */
@ -7394,7 +7536,10 @@ SLJIT_ASSERT(compares > 0);
/* We are not necessary in utf mode even in 8 bit mode. */ /* We are not necessary in utf mode even in 8 bit mode. */
cc = ccbegin; cc = ccbegin;
read_char_range(common, min, max, ((cc[-1] & XCL_NOT) != 0) ? backtracks : NULL, (cc[-1] & XCL_NOT) != 0); if ((cc[-1] & XCL_NOT) != 0)
read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR);
else
read_char(common, min, max, NULL, 0);
if ((cc[-1] & XCL_HASPROP) == 0) if ((cc[-1] & XCL_HASPROP) == 0)
{ {
@ -7920,13 +8065,13 @@ switch(type)
} }
else else
{ {
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, STR_PTR, 0); OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE); read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, STR_END, 0)); add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, STR_END, 0));
add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL)); add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL));
sljit_set_current_flags(compiler, SLJIT_SET_Z); sljit_set_current_flags(compiler, SLJIT_SET_Z);
add_jump(compiler, backtracks, JUMP(SLJIT_ZERO)); add_jump(compiler, backtracks, JUMP(SLJIT_ZERO));
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1); OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
} }
JUMPHERE(jump[2]); JUMPHERE(jump[2]);
JUMPHERE(jump[3]); JUMPHERE(jump[3]);
@ -8325,7 +8470,7 @@ switch(type)
case OP_ANY: case OP_ANY:
if (check_str_ptr) if (check_str_ptr)
detect_partial_match(common, backtracks); detect_partial_match(common, backtracks);
read_char_range(common, common->nlmin, common->nlmax, backtracks, TRUE); read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR);
if (common->nltype == NLTYPE_FIXED && common->newline > 255) if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{ {
jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff); jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
@ -8352,7 +8497,7 @@ switch(type)
{ {
if (common->invalid_utf) if (common->invalid_utf)
{ {
read_char_range(common, 0, READ_CHAR_MAX, backtracks, TRUE); read_char(common, 0, READ_CHAR_MAX, backtracks, READ_CHAR_UPDATE_STR_PTR);
return cc; return cc;
} }
@ -8402,7 +8547,7 @@ switch(type)
case OP_ANYNL: case OP_ANYNL:
if (check_str_ptr) if (check_str_ptr)
detect_partial_match(common, backtracks); detect_partial_match(common, backtracks);
read_char_range(common, common->bsr_nlmin, common->bsr_nlmax, NULL, FALSE); read_char(common, common->bsr_nlmin, common->bsr_nlmax, NULL, 0);
jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
/* We don't need to handle soft partial matching case. */ /* We don't need to handle soft partial matching case. */
end_list = NULL; end_list = NULL;
@ -8425,7 +8570,12 @@ switch(type)
case OP_HSPACE: case OP_HSPACE:
if (check_str_ptr) if (check_str_ptr)
detect_partial_match(common, backtracks); detect_partial_match(common, backtracks);
read_char_range(common, 0x9, 0x3000, NULL, type == OP_NOT_HSPACE);
if (type == OP_NOT_HSPACE)
read_char(common, 0x9, 0x3000, backtracks, READ_CHAR_UPDATE_STR_PTR);
else
read_char(common, 0x9, 0x3000, NULL, 0);
add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL)); add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL));
sljit_set_current_flags(compiler, SLJIT_SET_Z); sljit_set_current_flags(compiler, SLJIT_SET_Z);
add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO)); add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
@ -8435,7 +8585,12 @@ switch(type)
case OP_VSPACE: case OP_VSPACE:
if (check_str_ptr) if (check_str_ptr)
detect_partial_match(common, backtracks); detect_partial_match(common, backtracks);
read_char_range(common, 0xa, 0x2029, NULL, type == OP_NOT_VSPACE);
if (type == OP_NOT_VSPACE)
read_char(common, 0xa, 0x2029, backtracks, READ_CHAR_UPDATE_STR_PTR);
else
read_char(common, 0xa, 0x2029, NULL, 0);
add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL)); add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL));
sljit_set_current_flags(compiler, SLJIT_SET_Z); sljit_set_current_flags(compiler, SLJIT_SET_Z);
add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO)); add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
@ -8477,6 +8632,7 @@ switch(type)
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc); if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc);
#endif #endif
if (common->mode == PCRE2_JIT_COMPLETE && check_str_ptr if (common->mode == PCRE2_JIT_COMPLETE && check_str_ptr
&& (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0)) && (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0))
{ {
@ -8504,12 +8660,13 @@ switch(type)
if (type == OP_CHAR || !char_has_othercase(common, cc)) if (type == OP_CHAR || !char_has_othercase(common, cc))
{ {
read_char_range(common, c, c, NULL, FALSE); read_char(common, c, c, NULL, 0);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c)); add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c));
return cc + length; return cc + length;
} }
oc = char_othercase(common, c); oc = char_othercase(common, c);
read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, NULL, FALSE); read_char(common, c < oc ? c : oc, c > oc ? c : oc, NULL, 0);
bit = c ^ oc; bit = c ^ oc;
if (is_powerof2(bit)) if (is_powerof2(bit))
{ {
@ -8517,6 +8674,7 @@ switch(type)
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit)); add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit));
return cc + length; return cc + length;
} }
jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c); jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc)); add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
JUMPHERE(jump[0]); JUMPHERE(jump[0]);
@ -8533,7 +8691,7 @@ switch(type)
{ {
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
c = *cc; c = *cc;
if (c < 128) if (c < 128 && !common->invalid_utf)
{ {
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
if (type == OP_NOT || !char_has_othercase(common, cc)) if (type == OP_NOT || !char_has_othercase(common, cc))
@ -8564,13 +8722,13 @@ switch(type)
if (type == OP_NOT || !char_has_othercase(common, cc)) if (type == OP_NOT || !char_has_othercase(common, cc))
{ {
read_char_range(common, c, c, NULL, TRUE); read_char(common, c, c, backtracks, READ_CHAR_UPDATE_STR_PTR);
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c)); add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
} }
else else
{ {
oc = char_othercase(common, c); oc = char_othercase(common, c);
read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, NULL, TRUE); read_char(common, c < oc ? c : oc, c > oc ? c : oc, backtracks, READ_CHAR_UPDATE_STR_PTR);
bit = c ^ oc; bit = c ^ oc;
if (is_powerof2(bit)) if (is_powerof2(bit))
{ {
@ -8592,9 +8750,15 @@ switch(type)
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 127 : 255; bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 127 : 255;
read_char_range(common, 0, bit, NULL, type == OP_NCLASS); if (type == OP_NCLASS)
read_char(common, 0, bit, backtracks, READ_CHAR_UPDATE_STR_PTR);
else
read_char(common, 0, bit, NULL, 0);
#else #else
read_char_range(common, 0, 255, NULL, type == OP_NCLASS); if (type == OP_NCLASS)
read_char(common, 0, 255, backtracks, READ_CHAR_UPDATE_STR_PTR);
else
read_char(common, 0, 255, NULL, 0);
#endif #endif
if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks)) if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks))
@ -8788,7 +8952,6 @@ jump_list *no_match = NULL;
int source_reg = COUNT_MATCH; int source_reg = COUNT_MATCH;
int source_end_reg = ARGUMENTS; int source_end_reg = ARGUMENTS;
int char1_reg = STACK_LIMIT; int char1_reg = STACK_LIMIT;
BOOL saved_invalid_utf;
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
if (ref) if (ref)
@ -8830,17 +8993,14 @@ if (common->utf && *cc == OP_REFI)
OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0); OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
OP1(SLJIT_MOV, STR_PTR, 0, source_reg, 0); OP1(SLJIT_MOV, STR_PTR, 0, source_reg, 0);
saved_invalid_utf = common->invalid_utf; read_char(common, 0, READ_CHAR_MAX, NULL, READ_CHAR_UPDATE_STR_PTR | READ_CHAR_VALID_UTF);
common->invalid_utf = FALSE;
read_char_range(common, 0, READ_CHAR_MAX, NULL, TRUE);
common->invalid_utf = saved_invalid_utf;
OP1(SLJIT_MOV, source_reg, 0, STR_PTR, 0); OP1(SLJIT_MOV, source_reg, 0, STR_PTR, 0);
OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0); OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
OP1(SLJIT_MOV, char1_reg, 0, TMP1, 0); OP1(SLJIT_MOV, char1_reg, 0, TMP1, 0);
/* Read second character. */ /* Read second character. */
read_char_range(common, 0, READ_CHAR_MAX, &no_match, TRUE); read_char(common, 0, READ_CHAR_MAX, &no_match, READ_CHAR_UPDATE_STR_PTR);
CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop); CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop);
@ -13572,6 +13732,11 @@ if (common->utfreadchar != NULL)
set_jumps(common->utfreadchar, LABEL()); set_jumps(common->utfreadchar, LABEL());
do_utfreadchar(common); do_utfreadchar(common);
} }
if (common->utfreadchar_invalid_precise != NULL)
{
set_jumps(common->utfreadchar_invalid_precise, LABEL());
do_utfreadchar_invalid_precise(common);
}
if (common->utfreadtype8 != NULL) if (common->utfreadtype8 != NULL)
{ {
set_jumps(common->utfreadtype8, LABEL()); set_jumps(common->utfreadtype8, LABEL());

View File

@ -1755,6 +1755,41 @@ static int regression_tests(void)
} }
} }
#if defined SUPPORT_UNICODE && (defined SUPPORT_PCRE2_8 || defined SUPPORT_PCRE2_16)
/* Checks the outcome of one invalid-UTF regression run against the expected
match span and prints a diagnostic on the first mismatch.

pattern_index  index of the test case, used only in the diagnostics
type           short label for the run being checked, used only in the
               diagnostics; it is never modified, hence const
result         value returned by the match call
match_start    expected ovector[0]; a negative value means that no match
               is expected
match_end      expected ovector[1]; ignored when match_start is negative
ovector        output vector of the match; only read when a match is expected

Returns 0 when the outcome is as expected, 1 otherwise. */
static int check_invalid_utf_result(int pattern_index, const char *type, int result,
	int match_start, int match_end, PCRE2_SIZE *ovector)
{
	if (match_start < 0) {
		/* No match expected: the only acceptable result is -1. The ovector
		is deliberately not inspected in this case. */
		if (result != -1) {
			printf("Pattern[%d] %s result is not -1.\n", pattern_index, type);
			return 1;
		}
		return 0;
	}

	/* A match is expected, so the result must be positive. */
	if (result <= 0) {
		printf("Pattern[%d] %s result (%d) is not greater than 0.\n", pattern_index, type, result);
		return 1;
	}

	/* The overall match must start and end exactly where expected. */
	if (ovector[0] != (PCRE2_SIZE)match_start) {
		printf("Pattern[%d] %s ovector[0] is unexpected (%d instead of %d)\n",
			pattern_index, type, (int)ovector[0], match_start);
		return 1;
	}

	if (ovector[1] != (PCRE2_SIZE)match_end) {
		printf("Pattern[%d] %s ovector[1] is unexpected (%d instead of %d)\n",
			pattern_index, type, (int)ovector[1], match_end);
		return 1;
	}

	return 0;
}
#endif /* SUPPORT_UNICODE && (SUPPORT_PCRE2_8 || SUPPORT_PCRE2_16) */
#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED) #define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
@ -1767,121 +1802,132 @@ struct invalid_utf8_regression_test_case {
int start_offset; int start_offset;
int skip_left; int skip_left;
int skip_right; int skip_right;
int expected_result; int match_start;
int match_end;
const char *pattern[2]; const char *pattern[2];
const char *input; const char *input;
}; };
static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = { static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = {
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xf4\x8f\xbf\xbf" }, { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xf0\x90\x80\x80" }, { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf0\x90\x80\x80" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf4\x90\x80\x80" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf4\x90\x80\x80" },
{ UDA, CI, 0, 0, 1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" }, { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x90\x80\x7f" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\x7f" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x90\x80\xc0" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\xc0" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xef\xbf\xbf#" }, { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf#" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xef\xbf\xbf" }, { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xe0\xa0\x80#" }, { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80#" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xe0\xa0\x80" }, { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80" },
{ UDA, CI, 0, 0, 2, -1, { ".", NULL }, "\xef\xbf\xbf#" }, { UDA, CI, 0, 0, 2, -1, -1, { ".", NULL }, "\xef\xbf\xbf#" },
{ UDA, CI, 0, 0, 1, -1, { ".", NULL }, "\xef\xbf\xbf" }, { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xef\xbf\xbf" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xef\xbf\x7f#" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\x7f#" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xef\xbf\xc0" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\xc0" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x9f\xbf#" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf#" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x9f\xbf" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xed\x9f\xbf#" }, { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xed\x9f\xbf#" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xed\xa0\x80#" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xa0\x80#" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xee\x80\x80#" }, { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xee\x80\x80#" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xed\xbf\xbf#" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xbf\xbf#" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf##" }, { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf##" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf#" }, { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf#" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf" }, { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80##" }, { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80##" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80#" }, { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80#" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80" }, { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x80##" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80##" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xdf\xc0##" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0##" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x80" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xdf\xc0" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xc1\xbf##" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf##" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xc1\xbf" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\x80###" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80###" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\x80" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf8###" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8###" },
{ UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf8" }, { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8" },
{ UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\x7f" }, { UDA, CI, 0, 0, 0, 0, 1, { ".", NULL }, "\x7f" },
{ UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" }, { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80#" },
{ UDA, CPI, 4, 1, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf#" }, { UDA, CPI, 4, 1, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf#" },
{ UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "#\xef\xbf\xbf#" }, { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xef\xbf\xbf#" },
{ UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "#\xe0\xa0\x80#" }, { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xe0\xa0\x80#" },
{ UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf0\x90\x80\x80#" }, { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf0\x90\x80\x80#" },
{ UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" }, { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf#" },
{ UDA, CPI, 4, 2, 0, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80#" }, { UDA, CPI, 4, 2, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xf0\x80\x80#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xf0\x80\x80#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xed\xa0\x80#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xed\xa0\x80#" },
{ UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "##\xdf\xbf#" }, { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xdf\xbf#" },
{ UDA, CPI, 4, 2, 0, 1, { "\\B", NULL }, "##\xdf\xbf#" }, { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xdf\xbf#" },
{ UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "##\xc2\x80#" }, { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xc2\x80#" },
{ UDA, CPI, 4, 2, 0, 1, { "\\B", NULL }, "##\xc2\x80#" }, { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xc2\x80#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xc1\xbf#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xc1\xbf#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xdf\xc0#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xdf\xc0#" },
{ UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xe0\x80#" }, { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
{ UDA, CPI, 4, 2, 0, -1, { "\\B", "\\b" }, "##\xe0\x80#" }, { UDA, CPI, 4, 2, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
{ UDA, CPI, 3, 0, 0, 1, { "\\B", NULL }, "\xef\xbf\xbf#" }, { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xef\xbf\xbf#" },
{ UDA, CPI, 3, 0, 0, 1, { "\\B", NULL }, "\xe0\xa0\x80#" }, { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xe0\xa0\x80#" },
{ UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf#" }, { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf#" },
{ UDA, CPI, 3, 1, 0, -1, { "\\B", "\\b" }, "\xef\xbf\xbf#" }, { UDA, CPI, 3, 1, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xbf#" },
{ UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xdf\x80\x80#" }, { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\x80\x80#" },
{ UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xef\xbf\xff#" }, { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xff#" },
{ UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xef\xff\xbf#" }, { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xff\xbf#" },
{ UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xed\xbf\xbf#" }, { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xed\xbf\xbf#" },
{ UDA, CPI, 2, 0, 0, 1, { "\\B", NULL }, "\xdf\xbf#" }, { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xdf\xbf#" },
{ UDA, CPI, 2, 0, 0, 1, { "\\B", NULL }, "\xc2\x80#" }, { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xc2\x80#" },
{ UDA, CPI, 2, 1, 0, -1, { "\\B", "\\b" }, "\xdf\xbf#" }, { UDA, CPI, 2, 1, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xbf#" },
{ UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xc1\xbf#" }, { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xc1\xbf#" },
{ UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xe0\x80#" }, { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x80#" },
{ UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xdf\xff#" }, { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xff#" },
{ UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xff\xbf#" }, { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xff\xbf#" },
{ UDA, CPI, 1, 0, 0, 1, { "\\B", NULL }, "\x7f#" }, { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x7f#" },
{ UDA, CPI, 1, 0, 0, 1, { "\\B", NULL }, "\x01#" }, { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x01#" },
{ UDA, CPI, 1, 0, 0, -1, { "\\B", "\\b" }, "\x80#" }, { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80#" },
{ UDA, CPI, 1, 0, 0, -1, { "\\B", "\\b" }, "\x80#" }, { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80#" },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, { "(.)\\1", NULL }, "a\xff" }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "a\xff" },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 6, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 8, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
{ UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "A" }, { UDA, CPI, 0, 0, 0, 0, 1, { "\\X", NULL }, "A" },
{ UDA, CPI, 0, 0, 0, -1, { "\\X", NULL }, "\xff" }, { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xff" },
{ UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xc3\xa1" }, { UDA, CPI, 0, 0, 0, 0, 2, { "\\X", NULL }, "\xc3\xa1" },
{ UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xc3\xa1" }, { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xc3\xa1" },
{ UDA, CPI, 0, 0, 0, -1, { "\\X", NULL }, "\xc3\x7f" }, { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xc3\x7f" },
{ UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xe1\xbd\xb8" }, { UDA, CPI, 0, 0, 0, 0, 3, { "\\X", NULL }, "\xe1\xbd\xb8" },
{ UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" }, { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" },
{ UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xf0\x90\x90\x80" }, { UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" },
{ UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" }, { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
{ 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL } { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"},
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"},
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"},
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xc3\x0a#"},
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf1\x0a#"},
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xf2\xbf\x0a#"},
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \xf2\xbf\xbf\x0a#"},
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xef\x0a#"},
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xef\xbf\x0a#"},
{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
}; };
#undef UDA #undef UDA
@ -1889,17 +1935,18 @@ static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cas
#undef CPI #undef CPI
static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *current, static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *current,
int pattern_index, int i, pcre2_match_data_8 *mdata) int pattern_index, int i, pcre2_compile_context_8 *ccontext, pcre2_match_data_8 *mdata)
{ {
pcre2_code_8 *code; pcre2_code_8 *code;
int result, errorcode; int result, errorcode;
PCRE2_SIZE length, erroroffset; PCRE2_SIZE length, erroroffset;
PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_8(mdata);
if (current->pattern[i] == NULL) if (current->pattern[i] == NULL)
return 1; return 1;
code = pcre2_compile_8((PCRE2_UCHAR8*)current->pattern[i], PCRE2_ZERO_TERMINATED, code = pcre2_compile_8((PCRE2_UCHAR8*)current->pattern[i], PCRE2_ZERO_TERMINATED,
current->compile_options, &errorcode, &erroroffset, NULL); current->compile_options, &errorcode, &erroroffset, ccontext);
if (!code) { if (!code) {
printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset); printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset);
@ -1918,8 +1965,7 @@ static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *curre
result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left), result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left),
length, current->start_offset - current->skip_left, 0, mdata, NULL); length, current->start_offset - current->skip_left, 0, mdata, NULL);
if (result != current->expected_result) { if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) {
printf("Pattern[%d:0] match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
pcre2_code_free_8(code); pcre2_code_free_8(code);
return 0; return 0;
} }
@ -1929,8 +1975,7 @@ static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *curre
result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left), result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left),
length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL); length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);
if (result != current->expected_result) { if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) {
printf("Pattern[%d:0] partial match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
pcre2_code_free_8(code); pcre2_code_free_8(code);
return 0; return 0;
} }
@ -1943,12 +1988,15 @@ static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *curre
static int invalid_utf8_regression_tests(void) static int invalid_utf8_regression_tests(void)
{ {
struct invalid_utf8_regression_test_case *current; struct invalid_utf8_regression_test_case *current;
pcre2_compile_context_8 *ccontext;
pcre2_match_data_8 *mdata; pcre2_match_data_8 *mdata;
int total = 0, successful = 0; int total = 0, successful = 0;
int result; int result;
printf("\nRunning invalid-utf8 JIT regression tests\n"); printf("\nRunning invalid-utf8 JIT regression tests\n");
ccontext = pcre2_compile_context_create_8(NULL);
pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
mdata = pcre2_match_data_create_8(4, NULL); mdata = pcre2_match_data_create_8(4, NULL);
for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) { for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) {
@ -1956,9 +2004,9 @@ static int invalid_utf8_regression_tests(void)
total++; total++;
result = 1; result = 1;
if (!run_invalid_utf8_test(current, total - 1, 0, mdata)) if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
result = 0; result = 0;
if (!run_invalid_utf8_test(current, total - 1, 1, mdata)) if (!run_invalid_utf8_test(current, total - 1, 1, ccontext, mdata))
result = 0; result = 0;
if (result) { if (result) {
@ -1974,6 +2022,7 @@ static int invalid_utf8_regression_tests(void)
printf("\n"); printf("\n");
pcre2_match_data_free_8(mdata); pcre2_match_data_free_8(mdata);
pcre2_compile_context_free_8(ccontext);
if (total == successful) { if (total == successful) {
printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n"); printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n");
@ -2005,7 +2054,8 @@ struct invalid_utf16_regression_test_case {
int start_offset; int start_offset;
int skip_left; int skip_left;
int skip_right; int skip_right;
int expected_result; int match_start;
int match_end;
const PCRE2_UCHAR16 *pattern[2]; const PCRE2_UCHAR16 *pattern[2];
const PCRE2_UCHAR16 *input; const PCRE2_UCHAR16 *input;
}; };
@ -2024,41 +2074,41 @@ static PCRE2_UCHAR16 test6[] = { 'a', 'A', 0xdc28, 0 };
static PCRE2_UCHAR16 test7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 }; static PCRE2_UCHAR16 test7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };
static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = { static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = {
{ UDA, CI, 0, 0, 0, 1, { allany, NULL }, test1 }, { UDA, CI, 0, 0, 0, 0, 1, { allany, NULL }, test1 },
{ UDA, CI, 1, 0, 0, 1, { allany, NULL }, test1 }, { UDA, CI, 1, 0, 0, 1, 2, { allany, NULL }, test1 },
{ UDA, CI, 2, 0, 0, 1, { allany, NULL }, test1 }, { UDA, CI, 2, 0, 0, 2, 3, { allany, NULL }, test1 },
{ UDA, CI, 3, 0, 0, 1, { allany, NULL }, test1 }, { UDA, CI, 3, 0, 0, 3, 4, { allany, NULL }, test1 },
{ UDA, CI, 0, 0, 0, 1, { allany, NULL }, test2 }, { UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test2 },
{ UDA, CI, 0, 0, 2, -1, { allany, NULL }, test2 }, { UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test2 },
{ UDA, CI, 1, 0, 0, -1, { allany, NULL }, test2 }, { UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test2 },
{ UDA, CI, 0, 0, 0, 1, { allany, NULL }, test3 }, { UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test3 },
{ UDA, CI, 0, 0, 2, -1, { allany, NULL }, test3 }, { UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test3 },
{ UDA, CI, 1, 0, 0, -1, { allany, NULL }, test3 }, { UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test3 },
{ UDA, CPI, 1, 0, 0, 1, { non_word_boundary, NULL }, test1 }, { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary, NULL }, test1 },
{ UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test1 }, { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test1 },
{ UDA, CPI, 3, 0, 0, 1, { non_word_boundary, NULL }, test1 }, { UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary, NULL }, test1 },
{ UDA, CPI, 4, 0, 0, 1, { non_word_boundary, NULL }, test1 }, { UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary, NULL }, test1 },
{ UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test2 }, { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test2 },
{ UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test3 }, { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test3 },
{ UDA, CPI, 2, 1, 0, -1, { non_word_boundary, word_boundary }, test2 }, { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test2 },
{ UDA, CPI, 2, 1, 0, -1, { non_word_boundary, word_boundary }, test3 }, { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test3 },
{ UDA, CPI, 2, 0, 0, -1, { non_word_boundary, word_boundary }, test4 }, { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test4 },
{ UDA, CPI, 2, 0, 0, -1, { non_word_boundary, word_boundary }, test5 }, { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test5 },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { backreference, NULL }, test6 }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference, NULL }, test6 },
{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, { backreference, NULL }, test6 }, { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference, NULL }, test6 },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { backreference, NULL }, test7 }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference, NULL }, test7 },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { backreference, NULL }, test7 }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference, NULL }, test7 },
{ UDA, CPI, 0, 0, 0, 1, { grapheme, NULL }, test6 }, { UDA, CPI, 0, 0, 0, 0, 1, { grapheme, NULL }, test6 },
{ UDA, CPI, 1, 0, 0, 1, { grapheme, NULL }, test6 }, { UDA, CPI, 1, 0, 0, 1, 2, { grapheme, NULL }, test6 },
{ UDA, CPI, 2, 0, 0, -1, { grapheme, NULL }, test6 }, { UDA, CPI, 2, 0, 0, -1, -1, { grapheme, NULL }, test6 },
{ UDA, CPI, 0, 0, 0, 1, { grapheme, NULL }, test7 }, { UDA, CPI, 0, 0, 0, 0, 2, { grapheme, NULL }, test7 },
{ UDA, CPI, 2, 0, 0, 1, { grapheme, NULL }, test7 }, { UDA, CPI, 2, 0, 0, 2, 4, { grapheme, NULL }, test7 },
{ UDA, CPI, 1, 0, 0, -1, { grapheme, NULL }, test7 }, { UDA, CPI, 1, 0, 0, -1, -1, { grapheme, NULL }, test7 },
{ 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL } { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
}; };
#undef UDA #undef UDA
@ -2066,18 +2116,19 @@ static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_c
#undef CPI #undef CPI
static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *current, static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *current,
int pattern_index, int i, pcre2_match_data_16 *mdata) int pattern_index, int i, pcre2_compile_context_16 *ccontext, pcre2_match_data_16 *mdata)
{ {
pcre2_code_16 *code; pcre2_code_16 *code;
int result, errorcode; int result, errorcode;
PCRE2_SIZE length, erroroffset; PCRE2_SIZE length, erroroffset;
const PCRE2_UCHAR16 *input; const PCRE2_UCHAR16 *input;
PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_16(mdata);
if (current->pattern[i] == NULL) if (current->pattern[i] == NULL)
return 1; return 1;
code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED, code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED,
current->compile_options, &errorcode, &erroroffset, NULL); current->compile_options, &errorcode, &erroroffset, ccontext);
if (!code) { if (!code) {
printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset); printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset);
@ -2102,8 +2153,7 @@ static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *cur
result = pcre2_jit_match_16(code, (current->input + current->skip_left), result = pcre2_jit_match_16(code, (current->input + current->skip_left),
length, current->start_offset - current->skip_left, 0, mdata, NULL); length, current->start_offset - current->skip_left, 0, mdata, NULL);
if (result != current->expected_result) { if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) {
printf("Pattern[%d:0] match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
pcre2_code_free_16(code); pcre2_code_free_16(code);
return 0; return 0;
} }
@ -2113,8 +2163,7 @@ static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *cur
result = pcre2_jit_match_16(code, (current->input + current->skip_left), result = pcre2_jit_match_16(code, (current->input + current->skip_left),
length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL); length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);
if (result != current->expected_result) { if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) {
printf("Pattern[%d:0] partial match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
pcre2_code_free_16(code); pcre2_code_free_16(code);
return 0; return 0;
} }
@ -2127,12 +2176,15 @@ static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *cur
static int invalid_utf16_regression_tests(void) static int invalid_utf16_regression_tests(void)
{ {
struct invalid_utf16_regression_test_case *current; struct invalid_utf16_regression_test_case *current;
pcre2_compile_context_16 *ccontext;
pcre2_match_data_16 *mdata; pcre2_match_data_16 *mdata;
int total = 0, successful = 0; int total = 0, successful = 0;
int result; int result;
printf("\nRunning invalid-utf16 JIT regression tests\n"); printf("\nRunning invalid-utf16 JIT regression tests\n");
ccontext = pcre2_compile_context_create_16(NULL);
pcre2_set_newline_16(ccontext, PCRE2_NEWLINE_ANY);
mdata = pcre2_match_data_create_16(4, NULL); mdata = pcre2_match_data_create_16(4, NULL);
for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) { for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) {
@ -2140,9 +2192,9 @@ static int invalid_utf16_regression_tests(void)
total++; total++;
result = 1; result = 1;
if (!run_invalid_utf16_test(current, total - 1, 0, mdata)) if (!run_invalid_utf16_test(current, total - 1, 0, ccontext, mdata))
result = 0; result = 0;
if (!run_invalid_utf16_test(current, total - 1, 1, mdata)) if (!run_invalid_utf16_test(current, total - 1, 1, ccontext, mdata))
result = 0; result = 0;
if (result) { if (result) {
@ -2158,6 +2210,7 @@ static int invalid_utf16_regression_tests(void)
printf("\n"); printf("\n");
pcre2_match_data_free_16(mdata); pcre2_match_data_free_16(mdata);
pcre2_compile_context_free_16(ccontext);
if (total == successful) { if (total == successful) {
printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n"); printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n");