Fix more invalid UTF issues revealed by new tests in the JIT compiler.
This commit is contained in:
parent
7035170527
commit
a0188b9ee1
|
@ -485,12 +485,12 @@ typedef struct compiler_common {
|
|||
jump_list *getucdtype;
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
jump_list *utfreadchar;
|
||||
jump_list *utfreadnewline_invalid;
|
||||
jump_list *utfreadtype8;
|
||||
jump_list *utfpeakcharback;
|
||||
#endif
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16
|
||||
jump_list *utfreadchar_invalid;
|
||||
jump_list *utfreadnewline_invalid;
|
||||
jump_list *utfmoveback_invalid;
|
||||
jump_list *utfpeakcharback_invalid;
|
||||
#endif
|
||||
|
@ -640,7 +640,7 @@ the start pointers when the end of the capturing group has not yet reached. */
|
|||
{ \
|
||||
c = ptr[1] - 0x80; \
|
||||
\
|
||||
if (ptr[0] >= 0xc0 && ptr[0] <= 0xdf) \
|
||||
if (ptr[0] >= 0xc2 && ptr[0] <= 0xdf) \
|
||||
{ \
|
||||
c |= (ptr[0] - 0xc0) << 6; \
|
||||
ptr += 2; \
|
||||
|
@ -653,6 +653,11 @@ the start pointers when the end of the capturing group has not yet reached. */
|
|||
{ \
|
||||
c |= (ptr[0] - 0xe0) << 12; \
|
||||
ptr += 3; \
|
||||
\
|
||||
if (c < 0x800 || (c >= 0xd800 && c < 0xe000)) \
|
||||
{ \
|
||||
invalid_action; \
|
||||
} \
|
||||
} \
|
||||
else if (ptr + 3 < end && ptr[3] >= 0x80 && ptr[3] < 0xc0) \
|
||||
{ \
|
||||
|
@ -663,7 +668,7 @@ the start pointers when the end of the capturing group has not yet reached. */
|
|||
c |= (ptr[0] - 0xf0) << 18; \
|
||||
ptr += 4; \
|
||||
\
|
||||
if (c >= 0x110000) \
|
||||
if (c >= 0x110000 || c < 0x10000) \
|
||||
{ \
|
||||
invalid_action; \
|
||||
} \
|
||||
|
@ -697,7 +702,7 @@ the start pointers when the end of the capturing group has not yet reached. */
|
|||
{ \
|
||||
c = ptr[-1] - 0x80; \
|
||||
\
|
||||
if (ptr[-2] >= 0xc0 && ptr[-2] <= 0xdf) \
|
||||
if (ptr[-2] >= 0xc2 && ptr[-2] <= 0xdf) \
|
||||
{ \
|
||||
c |= (ptr[-2] - 0xc0) << 6; \
|
||||
ptr -= 2; \
|
||||
|
@ -710,6 +715,11 @@ the start pointers when the end of the capturing group has not yet reached. */
|
|||
{ \
|
||||
c |= (ptr[-3] - 0xe0) << 12; \
|
||||
ptr -= 3; \
|
||||
\
|
||||
if (c < 0x800 || (c >= 0xd800 && c < 0xe000)) \
|
||||
{ \
|
||||
invalid_action; \
|
||||
} \
|
||||
} \
|
||||
else if (ptr - 3 > start && ptr[-3] >= 0x80 && ptr[-3] < 0xc0) \
|
||||
{ \
|
||||
|
@ -720,7 +730,7 @@ the start pointers when the end of the capturing group has not yet reached. */
|
|||
c |= (ptr[-4] - 0xf0) << 18; \
|
||||
ptr -= 4; \
|
||||
\
|
||||
if (c >= 0x110000) \
|
||||
if (c >= 0x110000 || c < 0x10000) \
|
||||
{ \
|
||||
invalid_action; \
|
||||
} \
|
||||
|
@ -3343,7 +3353,7 @@ else
|
|||
JUMPHERE(jump);
|
||||
}
|
||||
|
||||
static void peek_char(compiler_common *common, sljit_u32 max, sljit_s32 dst, sljit_sw dstw)
|
||||
static void peek_char(compiler_common *common, sljit_u32 max, sljit_s32 dst, sljit_sw dstw, jump_list **backtracks)
|
||||
{
|
||||
/* Reads the character into TMP1, keeps STR_PTR.
|
||||
Does not check STR_END. TMP2, dst, RETURN_ADDR Destroyed. */
|
||||
|
@ -3355,6 +3365,7 @@ struct sljit_jump *jump;
|
|||
SLJIT_UNUSED_ARG(max);
|
||||
SLJIT_UNUSED_ARG(dst);
|
||||
SLJIT_UNUSED_ARG(dstw);
|
||||
SLJIT_UNUSED_ARG(backtracks);
|
||||
|
||||
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
|
||||
|
||||
|
@ -3369,6 +3380,8 @@ if (common->utf)
|
|||
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
|
||||
add_jump(compiler, common->invalid_utf ? &common->utfreadchar_invalid : &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
|
||||
OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw);
|
||||
if (backtracks && common->invalid_utf)
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
|
||||
JUMPHERE(jump);
|
||||
}
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||
|
@ -3381,7 +3394,12 @@ if (common->utf)
|
|||
if (common->invalid_utf)
|
||||
{
|
||||
jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
|
||||
OP1(SLJIT_MOV, dst, dstw, STR_PTR, 0);
|
||||
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
|
||||
add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
|
||||
OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw);
|
||||
if (backtracks && common->invalid_utf)
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -3392,10 +3410,20 @@ if (common->utf)
|
|||
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
|
||||
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
|
||||
}
|
||||
|
||||
JUMPHERE(jump);
|
||||
}
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 32
|
||||
|
||||
if (common->invalid_utf)
|
||||
{
|
||||
if (backtracks != NULL)
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
|
||||
else
|
||||
{
|
||||
OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
|
||||
CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
|
||||
}
|
||||
}
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
}
|
||||
|
@ -3458,7 +3486,8 @@ if (common->utf)
|
|||
JUMPHERE(jump);
|
||||
}
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 32
|
||||
|
||||
if (common->invalid_utf)
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
}
|
||||
|
@ -3591,7 +3620,12 @@ if (common->utf)
|
|||
{
|
||||
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
|
||||
jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
|
||||
add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
|
||||
|
||||
if (options & READ_CHAR_UTF8_NEWLINE)
|
||||
add_jump(compiler, &common->utfreadnewline_invalid, JUMP(SLJIT_FAST_CALL));
|
||||
else
|
||||
add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
|
||||
|
||||
if (backtracks != NULL)
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
|
||||
JUMPHERE(jump);
|
||||
|
@ -3868,7 +3902,18 @@ if (common->utf)
|
|||
if (common->invalid_utf && !must_be_valid)
|
||||
{
|
||||
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
|
||||
if (backtracks != NULL)
|
||||
{
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
|
||||
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
|
||||
return;
|
||||
}
|
||||
|
||||
OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
|
||||
OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_LESS);
|
||||
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
|
||||
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
|
||||
return;
|
||||
}
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
@ -4132,8 +4177,9 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
|
|||
|
||||
static void do_utfreadnewline_invalid(compiler_common *common)
|
||||
{
|
||||
/* Slow decoding a UTF-8 character. TMP1 contains the first byte
|
||||
of the character (>= 0xc0). Return char value in TMP1. */
|
||||
/* Slow decoding a UTF-8 character, specialized for newlines.
|
||||
TMP1 contains the first byte of the character (>= 0xc0). Return
|
||||
char value in TMP1. */
|
||||
DEFINE_COMPILER;
|
||||
struct sljit_jump *jump;
|
||||
struct sljit_jump *buffer_end_close;
|
||||
|
@ -4506,7 +4552,8 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
|
|||
static void do_utfreadchar_invalid(compiler_common *common)
|
||||
{
|
||||
/* Slow decoding a UTF-16 character. TMP1 contains the first half
|
||||
of the character (>= 0xd800). Return char value in TMP1, length in TMP2. */
|
||||
of the character (>= 0xd800). Return char value in TMP1. STR_PTR is
|
||||
undefined for invalid characters. */
|
||||
DEFINE_COMPILER;
|
||||
struct sljit_jump *exit_invalid[3];
|
||||
|
||||
|
@ -4534,6 +4581,38 @@ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
|
|||
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
|
||||
}
|
||||
|
||||
static void do_utfreadnewline_invalid(compiler_common *common)
|
||||
{
|
||||
/* Emits the out-of-line helper that decodes a UTF-16 character when only
newline detection matters. On entry TMP1 contains the first half of the
character (>= 0xd800). On return TMP1 holds 0x10000 when the first half is
below 0xdc00 and at least one more code unit is available, otherwise
INVALID_UTF_CHAR. STR_PTR is advanced past the following code unit only
when that unit lies in 0xdc00..0xdfff. */
|
||||

||||
DEFINE_COMPILER;
|
||||
struct sljit_jump *exit_invalid[2];
|
||||

||||
sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
|
||||

||||
/* The incoming value of TMP2 is never read here: it is overwritten by the
load below. First bail out when no code unit follows the first half. */
|
||||
exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
|
||||

||||
/* Fetch the code unit at STR_PTR (the candidate second half) into TMP2. */
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
|
||||
/* A first half of 0xdc00 or above cannot start a surrogate pair. */
exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00);
|
||||

||||
/* TMP2 becomes 1 when the fetched unit is in 0xdc00..0xdfff, otherwise 0.
NOTE(review): relies on OP_FLAGS storing the LESS condition as 0/1 - confirm. */
OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xdc00);
|
||||
OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x400);
|
||||
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS);
|
||||
/* The decoded value is not computed; a fixed 0x10000 is returned instead
(presumably sufficient because callers only compare against newline
characters - confirm against the call sites). */
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x10000);
|
||||
/* Turn the 0/1 flag into 0 or 2 and add it to STR_PTR, i.e. skip the
second half only when it was in the 0xdc00..0xdfff range. */
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
|
||||
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
|
||||

||||
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
|
||||

||||
/* Shared exit for both rejected cases: report INVALID_UTF_CHAR and leave
STR_PTR unchanged. */
JUMPHERE(exit_invalid[0]);
|
||||
JUMPHERE(exit_invalid[1]);
|
||||
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
|
||||
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
|
||||
}
|
||||
|
||||
static void do_utfmoveback_invalid(compiler_common *common)
|
||||
{
|
||||
/* Goes one character back. */
|
||||
|
@ -6651,7 +6730,7 @@ JUMPHERE(skipread);
|
|||
|
||||
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
|
||||
check_str_end(common, &skipread_list);
|
||||
peek_char(common, READ_CHAR_MAX, SLJIT_MEM1(SLJIT_SP), LOCALS1);
|
||||
peek_char(common, READ_CHAR_MAX, SLJIT_MEM1(SLJIT_SP), LOCALS1, &invalid_utf);
|
||||
|
||||
/* Testing char type. This is a code duplication. */
|
||||
#ifdef SUPPORT_UNICODE
|
||||
|
@ -8144,7 +8223,7 @@ switch(type)
|
|||
}
|
||||
else
|
||||
{
|
||||
peek_char(common, common->nlmax, TMP3, 0);
|
||||
peek_char(common, common->nlmax, TMP3, 0, NULL);
|
||||
check_newlinechar(common, common->nltype, backtracks, FALSE);
|
||||
}
|
||||
JUMPHERE(jump[0]);
|
||||
|
@ -8506,7 +8585,7 @@ switch(type)
|
|||
case OP_ALLANY:
|
||||
if (check_str_ptr)
|
||||
detect_partial_match(common, backtracks);
|
||||
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (common->utf)
|
||||
{
|
||||
if (common->invalid_utf)
|
||||
|
@ -8515,9 +8594,9 @@ switch(type)
|
|||
return cc;
|
||||
}
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16
|
||||
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
|
||||
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
|
||||
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
|
||||
|
@ -8529,12 +8608,12 @@ switch(type)
|
|||
OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
|
||||
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
|
||||
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
|
||||
#endif
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
|
||||
JUMPHERE(jump[0]);
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16] */
|
||||
return cc;
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16] */
|
||||
}
|
||||
#endif
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
|
||||
return cc;
|
||||
|
||||
|
@ -13749,11 +13828,6 @@ if (common->utfreadchar != NULL)
|
|||
set_jumps(common->utfreadchar, LABEL());
|
||||
do_utfreadchar(common);
|
||||
}
|
||||
if (common->utfreadnewline_invalid != NULL)
|
||||
{
|
||||
set_jumps(common->utfreadnewline_invalid, LABEL());
|
||||
do_utfreadnewline_invalid(common);
|
||||
}
|
||||
if (common->utfreadtype8 != NULL)
|
||||
{
|
||||
set_jumps(common->utfreadtype8, LABEL());
|
||||
|
@ -13771,6 +13845,11 @@ if (common->utfreadchar_invalid != NULL)
|
|||
set_jumps(common->utfreadchar_invalid, LABEL());
|
||||
do_utfreadchar_invalid(common);
|
||||
}
|
||||
if (common->utfreadnewline_invalid != NULL)
|
||||
{
|
||||
set_jumps(common->utfreadnewline_invalid, LABEL());
|
||||
do_utfreadnewline_invalid(common);
|
||||
}
|
||||
if (common->utfmoveback_invalid)
|
||||
{
|
||||
set_jumps(common->utfmoveback_invalid, LABEL());
|
||||
|
|
|
@ -95,6 +95,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
static int regression_tests(void);
|
||||
static int invalid_utf8_regression_tests(void);
|
||||
static int invalid_utf16_regression_tests(void);
|
||||
static int invalid_utf32_regression_tests(void);
|
||||
|
||||
int main(void)
|
||||
{
|
||||
|
@ -111,8 +112,9 @@ int main(void)
|
|||
return 1;
|
||||
}
|
||||
return regression_tests()
|
||||
|| invalid_utf8_regression_tests()
|
||||
|| invalid_utf16_regression_tests();
|
||||
| invalid_utf8_regression_tests()
|
||||
| invalid_utf16_regression_tests()
|
||||
| invalid_utf32_regression_tests();
|
||||
}
|
||||
|
||||
/* --------------------------------------------------------------------------------------- */
|
||||
|
@ -1917,6 +1919,11 @@ static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cas
|
|||
{ UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" },
|
||||
{ UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
|
||||
|
||||
{ UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "#" },
|
||||
{ UDA, CPI, 0, 0, 0, 0, 4, { "[^#]", NULL }, "\xf4\x8f\xbf\xbf" },
|
||||
{ UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xf4\x90\x80\x80" },
|
||||
{ UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xc1\x80" },
|
||||
|
||||
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"},
|
||||
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"},
|
||||
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"},
|
||||
|
@ -2069,53 +2076,65 @@ struct invalid_utf16_regression_test_case {
|
|||
const PCRE2_UCHAR16 *input;
|
||||
};
|
||||
|
||||
static PCRE2_UCHAR16 allany[] = { '.', 0 };
|
||||
static PCRE2_UCHAR16 non_word_boundary[] = { '\\', 'B', 0 };
|
||||
static PCRE2_UCHAR16 word_boundary[] = { '\\', 'b', 0 };
|
||||
static PCRE2_UCHAR16 backreference[] = { '(', '.', ')', '\\', '1', 0 };
|
||||
static PCRE2_UCHAR16 grapheme[] = { '\\', 'X', 0 };
|
||||
static PCRE2_UCHAR16 test1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 };
|
||||
static PCRE2_UCHAR16 test2[] = { 0xd800, 0xdc00, '#', 0 };
|
||||
static PCRE2_UCHAR16 test3[] = { 0xdbff, 0xdfff, '#', 0 };
|
||||
static PCRE2_UCHAR16 test4[] = { 0xd800, 0xdbff, '#', 0 };
|
||||
static PCRE2_UCHAR16 test5[] = { '#', 0xd800, '#', 0 };
|
||||
static PCRE2_UCHAR16 test6[] = { 'a', 'A', 0xdc28, 0 };
|
||||
static PCRE2_UCHAR16 test7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };
|
||||
static PCRE2_UCHAR16 allany16[] = { '.', 0 };
|
||||
static PCRE2_UCHAR16 non_word_boundary16[] = { '\\', 'B', 0 };
|
||||
static PCRE2_UCHAR16 word_boundary16[] = { '\\', 'b', 0 };
|
||||
static PCRE2_UCHAR16 backreference16[] = { '(', '.', ')', '\\', '1', 0 };
|
||||
static PCRE2_UCHAR16 grapheme16[] = { '\\', 'X', 0 };
|
||||
static PCRE2_UCHAR16 nothashmark16[] = { '[', '^', '#', ']', 0 };
|
||||
static PCRE2_UCHAR16 afternl16[] = { '^', '\\', 'W', 0 };
|
||||
static PCRE2_UCHAR16 test16_1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 };
|
||||
static PCRE2_UCHAR16 test16_2[] = { 0xd800, 0xdc00, '#', 0 };
|
||||
static PCRE2_UCHAR16 test16_3[] = { 0xdbff, 0xdfff, '#', 0 };
|
||||
static PCRE2_UCHAR16 test16_4[] = { 0xd800, 0xdbff, '#', 0 };
|
||||
static PCRE2_UCHAR16 test16_5[] = { '#', 0xd800, '#', 0 };
|
||||
static PCRE2_UCHAR16 test16_6[] = { 'a', 'A', 0xdc28, 0 };
|
||||
static PCRE2_UCHAR16 test16_7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };
|
||||
static PCRE2_UCHAR16 test16_8[] = { '#', 0xd800, 0xdc00, 0 };
|
||||
static PCRE2_UCHAR16 test16_9[] = { ' ', 0x2028, '#', 0 };
|
||||
static PCRE2_UCHAR16 test16_10[] = { ' ', 0xdc00, 0xd800, 0x2028, '#', 0 };
|
||||
|
||||
static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = {
|
||||
{ UDA, CI, 0, 0, 0, 0, 1, { allany, NULL }, test1 },
|
||||
{ UDA, CI, 1, 0, 0, 1, 2, { allany, NULL }, test1 },
|
||||
{ UDA, CI, 2, 0, 0, 2, 3, { allany, NULL }, test1 },
|
||||
{ UDA, CI, 3, 0, 0, 3, 4, { allany, NULL }, test1 },
|
||||
{ UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test2 },
|
||||
{ UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test2 },
|
||||
{ UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test2 },
|
||||
{ UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test3 },
|
||||
{ UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test3 },
|
||||
{ UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test3 },
|
||||
{ UDA, CI, 0, 0, 0, 0, 1, { allany16, NULL }, test16_1 },
|
||||
{ UDA, CI, 1, 0, 0, 1, 2, { allany16, NULL }, test16_1 },
|
||||
{ UDA, CI, 2, 0, 0, 2, 3, { allany16, NULL }, test16_1 },
|
||||
{ UDA, CI, 3, 0, 0, 3, 4, { allany16, NULL }, test16_1 },
|
||||
{ UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_2 },
|
||||
{ UDA, CI, 0, 0, 2, -1, -1, { allany16, NULL }, test16_2 },
|
||||
{ UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_2 },
|
||||
{ UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_3 },
|
||||
{ UDA, CI, 0, 0, 2, -1, -1, { allany16, NULL }, test16_3 },
|
||||
{ UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_3 },
|
||||
|
||||
{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary, NULL }, test1 },
|
||||
{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test1 },
|
||||
{ UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary, NULL }, test1 },
|
||||
{ UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary, NULL }, test1 },
|
||||
{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test2 },
|
||||
{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test3 },
|
||||
{ UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test2 },
|
||||
{ UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test3 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test4 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test5 },
|
||||
{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary16, NULL }, test16_1 },
|
||||
{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_1 },
|
||||
{ UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary16, NULL }, test16_1 },
|
||||
{ UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary16, NULL }, test16_1 },
|
||||
{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_2 },
|
||||
{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_3 },
|
||||
{ UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_2 },
|
||||
{ UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_3 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_4 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_5 },
|
||||
|
||||
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference, NULL }, test6 },
|
||||
{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference, NULL }, test6 },
|
||||
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference, NULL }, test7 },
|
||||
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference, NULL }, test7 },
|
||||
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference16, NULL }, test16_6 },
|
||||
{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference16, NULL }, test16_6 },
|
||||
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference16, NULL }, test16_7 },
|
||||
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference16, NULL }, test16_7 },
|
||||
|
||||
{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme, NULL }, test6 },
|
||||
{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme, NULL }, test6 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme, NULL }, test6 },
|
||||
{ UDA, CPI, 0, 0, 0, 0, 2, { grapheme, NULL }, test7 },
|
||||
{ UDA, CPI, 2, 0, 0, 2, 4, { grapheme, NULL }, test7 },
|
||||
{ UDA, CPI, 1, 0, 0, -1, -1, { grapheme, NULL }, test7 },
|
||||
{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme16, NULL }, test16_6 },
|
||||
{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme16, NULL }, test16_6 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme16, NULL }, test16_6 },
|
||||
{ UDA, CPI, 0, 0, 0, 0, 2, { grapheme16, NULL }, test16_7 },
|
||||
{ UDA, CPI, 2, 0, 0, 2, 4, { grapheme16, NULL }, test16_7 },
|
||||
{ UDA, CPI, 1, 0, 0, -1, -1, { grapheme16, NULL }, test16_7 },
|
||||
|
||||
{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },
|
||||
{ UDA, CPI, 1, 0, 0, 1, 3, { nothashmark16, NULL }, test16_8 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },
|
||||
|
||||
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl16, NULL }, test16_9 },
|
||||
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { afternl16, NULL }, test16_10 },
|
||||
|
||||
{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
|
||||
};
|
||||
|
@ -2239,4 +2258,178 @@ static int invalid_utf16_regression_tests(void)
|
|||
|
||||
#endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_16 */
|
||||
|
||||
#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_32
|
||||
|
||||
/* Shorthands used by the invalid UTF-32 test table below. */
/* UDA: pattern compile options - UTF mode, dot matches everything, anchored. */
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
|
||||
/* CI: JIT options - complete matching with invalid UTF support. */
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
|
||||
/* CPI: as CI, plus soft partial matching. */
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
|
||||
|
||||
/* One invalid UTF-32 JIT regression case. The table that uses this type is
filled with positional initialisers, so the member order must not change. */
struct invalid_utf32_regression_test_case {
|
||||
/* Options passed to pcre2_compile_32(). */
int compile_options;
|
||||
/* Options passed to pcre2_jit_compile_32(); the PCRE2_JIT_COMPLETE and
PCRE2_JIT_PARTIAL_SOFT bits also select which match modes are run. */
int jit_compile_options;
|
||||
/* Start offset in code units, relative to the untrimmed input. */
int start_offset;
|
||||
/* Number of code units dropped from the beginning of the input. */
int skip_left;
|
||||
/* Number of code units dropped from the end of the input. */
int skip_right;
|
||||
/* Expected match bounds handed to check_invalid_utf_result()
(presumably -1, -1 stands for "no match" - confirm in that helper). */
int match_start;
|
||||
int match_end;
|
||||
/* Zero terminated patterns; pattern[1] may be NULL, in which case it is
skipped by the runner. pattern[0] == NULL terminates the table. */
const PCRE2_UCHAR32 *pattern[2];
|
||||
/* Zero terminated subject string. */
const PCRE2_UCHAR32 *input;
|
||||
};
|
||||
|
||||
/* Zero terminated UTF-32 patterns: . \B \b (.)\1 \X [^#] ^\W */
static PCRE2_UCHAR32 allany32[] = { '.', 0 };
|
||||
static PCRE2_UCHAR32 non_word_boundary32[] = { '\\', 'B', 0 };
|
||||
static PCRE2_UCHAR32 word_boundary32[] = { '\\', 'b', 0 };
|
||||
static PCRE2_UCHAR32 backreference32[] = { '(', '.', ')', '\\', '1', 0 };
|
||||
static PCRE2_UCHAR32 grapheme32[] = { '\\', 'X', 0 };
|
||||
static PCRE2_UCHAR32 nothashmark32[] = { '[', '^', '#', ']', 0 };
|
||||
static PCRE2_UCHAR32 afternl32[] = { '^', '\\', 'W', 0 };
|
||||
/* Zero terminated subjects. 0x110000 is one above the largest Unicode code
point (0x10ffff) and is the invalid code unit in these inputs. */
static PCRE2_UCHAR32 test32_1[] = { 0x10ffff, 0x10ffff, 0x110000, 0x10ffff, 0 };
|
||||
static PCRE2_UCHAR32 test32_2[] = { 'a', 'A', 0x110000, 0 };
|
||||
static PCRE2_UCHAR32 test32_3[] = { '#', 0x10ffff, 0x110000, 0 };
|
||||
/* 0x2028 is used as the newline for the multiline ^\W cases. */
static PCRE2_UCHAR32 test32_4[] = { ' ', 0x2028, '#', 0 };
|
||||
static PCRE2_UCHAR32 test32_5[] = { ' ', 0x110000, 0x2028, '#', 0 };
|
||||
|
||||
/* Invalid UTF-32 cases. Columns, in order: compile options, JIT options,
start offset, skip left, skip right, expected match start, expected match
end, { pattern, alternate pattern or NULL }, input. The all-zero row with a
NULL first pattern terminates the table. */
static struct invalid_utf32_regression_test_case invalid_utf32_regression_test_cases[] = {
|
||||
/* "." at a valid code point and at the 0x110000 code unit. */
{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_1 },
|
||||
{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_1 },
|
||||

||||
/* \B and \b next to the invalid code unit. */
{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_1 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
|
||||
{ UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
|
||||

||||
/* Caseless back reference. */
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_2 },
|
||||
{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_2 },
|
||||

||||
/* \X (extended grapheme cluster). */
{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme32, NULL }, test32_1 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_1 },
|
||||

||||
/* Negated class [^#]. */
{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 },
|
||||
{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_3 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 },
|
||||

||||
/* Unanchored multiline ^\W, without and with an invalid code unit before
the newline. */
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_4 },
|
||||
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_5 },
|
||||

||||
{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
|
||||
};
|
||||
|
||||
#undef UDA
|
||||
#undef CI
|
||||
#undef CPI
|
||||
|
||||
/* Runs one pattern of an invalid UTF-32 regression case.

   current        the test case
   pattern_index  zero based index of the case, used only in diagnostics
   i              which of current->pattern[] to run (0 or 1)
   ccontext       compile context used for pcre2_compile_32()
   mdata          match data whose ovector receives the results

The pattern is compiled, JIT compiled with current->jit_compile_options, and
then matched in complete and/or soft-partial mode depending on which of
PCRE2_JIT_COMPLETE / PCRE2_JIT_PARTIAL_SOFT are set in those options. The
subject is current->input with skip_left code units removed from the front
and skip_right from the back; start_offset is adjusted accordingly.

Returns 1 when pattern[i] is NULL or every requested mode produced the
expected result, 0 otherwise (a diagnostic has been printed). The first
failing mode stops the run. */
static int run_invalid_utf32_test(struct invalid_utf32_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_32 *ccontext, pcre2_match_data_32 *mdata)
{
	pcre2_code_32 *code;
	int result, errorcode;
	int passed = 1;
	PCRE2_SIZE length, erroroffset;
	const PCRE2_UCHAR32 *input;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(mdata);

	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_32(current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (!code) {
		/* Report the real sub-pattern index instead of a hard-coded 0. */
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_32(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		pcre2_code_free_32(code);
		return 0;
	}

	/* Length of the zero terminated input in code units. */
	length = 0;
	for (input = current->input; *input != 0; input++)
		length++;

	/* Trim the requested number of code units from both ends. */
	length -= current->skip_left + current->skip_right;
	input = current->input + current->skip_left;

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_32(code, input, length,
			current->start_offset - current->skip_left, 0, mdata, NULL);

		/* check_invalid_utf_result() returns non-zero on a mismatch. */
		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	if (passed && (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT)) {
		result = pcre2_jit_match_32(code, input, length,
			current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	pcre2_code_free_32(code);
	return passed;
}
|
||||
|
||||
static int invalid_utf32_regression_tests(void)
|
||||
{
|
||||
struct invalid_utf32_regression_test_case *current;
|
||||
pcre2_compile_context_32 *ccontext;
|
||||
pcre2_match_data_32 *mdata;
|
||||
int total = 0, successful = 0;
|
||||
int result;
|
||||
|
||||
printf("\nRunning invalid-utf32 JIT regression tests\n");
|
||||
|
||||
ccontext = pcre2_compile_context_create_32(NULL);
|
||||
pcre2_set_newline_32(ccontext, PCRE2_NEWLINE_ANY);
|
||||
mdata = pcre2_match_data_create_32(4, NULL);
|
||||
|
||||
for (current = invalid_utf32_regression_test_cases; current->pattern[0]; current++) {
|
||||
/* printf("\nPattern: %s :\n", current->pattern); */
|
||||
total++;
|
||||
|
||||
result = 1;
|
||||
if (!run_invalid_utf32_test(current, total - 1, 0, ccontext, mdata))
|
||||
result = 0;
|
||||
if (!run_invalid_utf32_test(current, total - 1, 1, ccontext, mdata))
|
||||
result = 0;
|
||||
|
||||
if (result) {
|
||||
successful++;
|
||||
}
|
||||
|
||||
printf(".");
|
||||
if ((total % 60) == 0)
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
if ((total % 60) != 0)
|
||||
printf("\n");
|
||||
|
||||
pcre2_match_data_free_32(mdata);
|
||||
pcre2_compile_context_free_32(ccontext);
|
||||
|
||||
if (total == successful) {
|
||||
printf("\nAll invalid UTF32 JIT regression tests are successfully passed.\n");
|
||||
return 0;
|
||||
} else {
|
||||
printf("\nInvalid UTF32 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
#else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_32 */
|
||||
|
||||
/* Fallback for builds without Unicode or without the 32-bit library: there
are no invalid UTF-32 cases to run, so no failure is reported. */
static int invalid_utf32_regression_tests(void)
{
	const int no_failures = 0;

	return no_failures;
}
|
||||
|
||||
#endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_32 */
|
||||
|
||||
/* End of pcre2_jit_test.c */
|
||||
|
|
Loading…
Reference in New Issue