Fix more invalid UTF issues revealed by new tests in the JIT compiler.

This commit is contained in:
Zoltán Herczeg 2018-09-17 08:09:51 +00:00
parent 7035170527
commit a0188b9ee1
2 changed files with 341 additions and 69 deletions

View File

@ -485,12 +485,12 @@ typedef struct compiler_common {
jump_list *getucdtype; jump_list *getucdtype;
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
jump_list *utfreadchar; jump_list *utfreadchar;
jump_list *utfreadnewline_invalid;
jump_list *utfreadtype8; jump_list *utfreadtype8;
jump_list *utfpeakcharback; jump_list *utfpeakcharback;
#endif #endif
#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 #if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16
jump_list *utfreadchar_invalid; jump_list *utfreadchar_invalid;
jump_list *utfreadnewline_invalid;
jump_list *utfmoveback_invalid; jump_list *utfmoveback_invalid;
jump_list *utfpeakcharback_invalid; jump_list *utfpeakcharback_invalid;
#endif #endif
@ -640,7 +640,7 @@ the start pointers when the end of the capturing group has not yet reached. */
{ \ { \
c = ptr[1] - 0x80; \ c = ptr[1] - 0x80; \
\ \
if (ptr[0] >= 0xc0 && ptr[0] <= 0xdf) \ if (ptr[0] >= 0xc2 && ptr[0] <= 0xdf) \
{ \ { \
c |= (ptr[0] - 0xc0) << 6; \ c |= (ptr[0] - 0xc0) << 6; \
ptr += 2; \ ptr += 2; \
@ -653,6 +653,11 @@ the start pointers when the end of the capturing group has not yet reached. */
{ \ { \
c |= (ptr[0] - 0xe0) << 12; \ c |= (ptr[0] - 0xe0) << 12; \
ptr += 3; \ ptr += 3; \
\
if (c < 0x800 || (c >= 0xd800 && c < 0xe000)) \
{ \
invalid_action; \
} \
} \ } \
else if (ptr + 3 < end && ptr[3] >= 0x80 && ptr[3] < 0xc0) \ else if (ptr + 3 < end && ptr[3] >= 0x80 && ptr[3] < 0xc0) \
{ \ { \
@ -663,7 +668,7 @@ the start pointers when the end of the capturing group has not yet reached. */
c |= (ptr[0] - 0xf0) << 18; \ c |= (ptr[0] - 0xf0) << 18; \
ptr += 4; \ ptr += 4; \
\ \
if (c >= 0x110000) \ if (c >= 0x110000 || c < 0x10000) \
{ \ { \
invalid_action; \ invalid_action; \
} \ } \
@ -697,7 +702,7 @@ the start pointers when the end of the capturing group has not yet reached. */
{ \ { \
c = ptr[-1] - 0x80; \ c = ptr[-1] - 0x80; \
\ \
if (ptr[-2] >= 0xc0 && ptr[-2] <= 0xdf) \ if (ptr[-2] >= 0xc2 && ptr[-2] <= 0xdf) \
{ \ { \
c |= (ptr[-2] - 0xc0) << 6; \ c |= (ptr[-2] - 0xc0) << 6; \
ptr -= 2; \ ptr -= 2; \
@ -710,6 +715,11 @@ the start pointers when the end of the capturing group has not yet reached. */
{ \ { \
c |= (ptr[-3] - 0xe0) << 12; \ c |= (ptr[-3] - 0xe0) << 12; \
ptr -= 3; \ ptr -= 3; \
\
if (c < 0x800 || (c >= 0xd800 && c < 0xe000)) \
{ \
invalid_action; \
} \
} \ } \
else if (ptr - 3 > start && ptr[-3] >= 0x80 && ptr[-3] < 0xc0) \ else if (ptr - 3 > start && ptr[-3] >= 0x80 && ptr[-3] < 0xc0) \
{ \ { \
@ -720,7 +730,7 @@ the start pointers when the end of the capturing group has not yet reached. */
c |= (ptr[-4] - 0xf0) << 18; \ c |= (ptr[-4] - 0xf0) << 18; \
ptr -= 4; \ ptr -= 4; \
\ \
if (c >= 0x110000) \ if (c >= 0x110000 || c < 0x10000) \
{ \ { \
invalid_action; \ invalid_action; \
} \ } \
@ -3343,7 +3353,7 @@ else
JUMPHERE(jump); JUMPHERE(jump);
} }
static void peek_char(compiler_common *common, sljit_u32 max, sljit_s32 dst, sljit_sw dstw) static void peek_char(compiler_common *common, sljit_u32 max, sljit_s32 dst, sljit_sw dstw, jump_list **backtracks)
{ {
/* Reads the character into TMP1, keeps STR_PTR. /* Reads the character into TMP1, keeps STR_PTR.
Does not check STR_END. TMP2, dst, RETURN_ADDR Destroyed. */ Does not check STR_END. TMP2, dst, RETURN_ADDR Destroyed. */
@ -3355,6 +3365,7 @@ struct sljit_jump *jump;
SLJIT_UNUSED_ARG(max); SLJIT_UNUSED_ARG(max);
SLJIT_UNUSED_ARG(dst); SLJIT_UNUSED_ARG(dst);
SLJIT_UNUSED_ARG(dstw); SLJIT_UNUSED_ARG(dstw);
SLJIT_UNUSED_ARG(backtracks);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
@ -3369,6 +3380,8 @@ if (common->utf)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
add_jump(compiler, common->invalid_utf ? &common->utfreadchar_invalid : &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); add_jump(compiler, common->invalid_utf ? &common->utfreadchar_invalid : &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw); OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw);
if (backtracks && common->invalid_utf)
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
JUMPHERE(jump); JUMPHERE(jump);
} }
#elif PCRE2_CODE_UNIT_WIDTH == 16 #elif PCRE2_CODE_UNIT_WIDTH == 16
@ -3381,7 +3394,12 @@ if (common->utf)
if (common->invalid_utf) if (common->invalid_utf)
{ {
jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
OP1(SLJIT_MOV, dst, dstw, STR_PTR, 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL)); add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw);
if (backtracks && common->invalid_utf)
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
} }
else else
{ {
@ -3392,10 +3410,20 @@ if (common->utf)
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00); OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
} }
JUMPHERE(jump); JUMPHERE(jump);
} }
#elif PCRE2_CODE_UNIT_WIDTH == 32 #elif PCRE2_CODE_UNIT_WIDTH == 32
if (common->invalid_utf)
{
if (backtracks != NULL)
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
else
{
OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
}
}
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
} }
@ -3458,7 +3486,8 @@ if (common->utf)
JUMPHERE(jump); JUMPHERE(jump);
} }
#elif PCRE2_CODE_UNIT_WIDTH == 32 #elif PCRE2_CODE_UNIT_WIDTH == 32
if (common->invalid_utf)
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
} }
@ -3591,7 +3620,12 @@ if (common->utf)
{ {
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
if (options & READ_CHAR_UTF8_NEWLINE)
add_jump(compiler, &common->utfreadnewline_invalid, JUMP(SLJIT_FAST_CALL));
else
add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
if (backtracks != NULL) if (backtracks != NULL)
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR)); add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
JUMPHERE(jump); JUMPHERE(jump);
@ -3868,7 +3902,18 @@ if (common->utf)
if (common->invalid_utf && !must_be_valid) if (common->invalid_utf && !must_be_valid)
{ {
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); if (backtracks != NULL)
{
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
return;
}
OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_LESS);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
return;
} }
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
@ -4132,8 +4177,9 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
static void do_utfreadnewline_invalid(compiler_common *common) static void do_utfreadnewline_invalid(compiler_common *common)
{ {
/* Slow decoding a UTF-8 character. TMP1 contains the first byte /* Slow decoding a UTF-8 character, specialized for newlines.
of the character (>= 0xc0). Return char value in TMP1. */ TMP1 contains the first byte of the character (>= 0xc0). Return
char value in TMP1. */
DEFINE_COMPILER; DEFINE_COMPILER;
struct sljit_jump *jump; struct sljit_jump *jump;
struct sljit_jump *buffer_end_close; struct sljit_jump *buffer_end_close;
@ -4506,7 +4552,8 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
static void do_utfreadchar_invalid(compiler_common *common) static void do_utfreadchar_invalid(compiler_common *common)
{ {
/* Slow decoding a UTF-16 character. TMP1 contains the first half /* Slow decoding a UTF-16 character. TMP1 contains the first half
of the character (>= 0xd800). Return char value in TMP1, length in TMP2. */ of the character (>= 0xd800). Return char value in TMP1. STR_PTR is
undefined for invalid characters. */
DEFINE_COMPILER; DEFINE_COMPILER;
struct sljit_jump *exit_invalid[3]; struct sljit_jump *exit_invalid[3];
@ -4534,6 +4581,38 @@ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0); sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
} }
static void do_utfreadnewline_invalid(compiler_common *common)
{
/* Slow decoding a UTF-16 character, specialized for newlines.
TMP1 contains the first half of the character (>= 0xd800). Return
char value in TMP1. */
/* Emits a fast-call helper with two exits:
  - TMP1 = 0x10000; STR_PTR is additionally advanced by 2 when the code
    unit at STR_PTR lies in 0xdc00..0xdfff, otherwise it is left alone;
  - TMP1 = INVALID_UTF_CHAR when STR_PTR has reached STR_END or when
    TMP1 itself is >= 0xdc00.
The pair is never combined into its real value: a constant 0x10000 is
returned instead. NOTE(review): presumably this is enough because callers
only compare the result against newline characters - confirm. */
DEFINE_COMPILER;
struct sljit_jump *exit_invalid[2];
/* Fast-call prologue; RETURN_ADDR is consumed by the fast returns below. */
sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
/* TMP2 contains the high surrogate. */
/* No code unit is left after the first half: invalid. */
exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
/* Load the following code unit; from here on TMP2 holds that unit. */
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
/* TMP1 is >= 0xd800 on entry, so >= 0xdc00 means it is not in the
0xd800..0xdbff range: invalid. */
exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00);
/* Unsigned range check: TMP2 becomes 1 if the following unit is in
0xdc00..0xdfff, 0 otherwise. */
OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xdc00);
OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x400);
OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS);
/* Fixed result value for this exit. */
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0x10000);
/* Turn the 0/1 flag into 0/2 and add it to STR_PTR (2 is presumably the
byte size of one UTF-16 code unit). */
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
/* Common exit for both invalid cases. */
JUMPHERE(exit_invalid[0]);
JUMPHERE(exit_invalid[1]);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}
static void do_utfmoveback_invalid(compiler_common *common) static void do_utfmoveback_invalid(compiler_common *common)
{ {
/* Goes one character back. */ /* Goes one character back. */
@ -6651,7 +6730,7 @@ JUMPHERE(skipread);
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0); OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
check_str_end(common, &skipread_list); check_str_end(common, &skipread_list);
peek_char(common, READ_CHAR_MAX, SLJIT_MEM1(SLJIT_SP), LOCALS1); peek_char(common, READ_CHAR_MAX, SLJIT_MEM1(SLJIT_SP), LOCALS1, &invalid_utf);
/* Testing char type. This is a code duplication. */ /* Testing char type. This is a code duplication. */
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
@ -8144,7 +8223,7 @@ switch(type)
} }
else else
{ {
peek_char(common, common->nlmax, TMP3, 0); peek_char(common, common->nlmax, TMP3, 0, NULL);
check_newlinechar(common, common->nltype, backtracks, FALSE); check_newlinechar(common, common->nltype, backtracks, FALSE);
} }
JUMPHERE(jump[0]); JUMPHERE(jump[0]);
@ -8506,7 +8585,7 @@ switch(type)
case OP_ALLANY: case OP_ALLANY:
if (check_str_ptr) if (check_str_ptr)
detect_partial_match(common, backtracks); detect_partial_match(common, backtracks);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 #ifdef SUPPORT_UNICODE
if (common->utf) if (common->utf)
{ {
if (common->invalid_utf) if (common->invalid_utf)
@ -8515,9 +8594,9 @@ switch(type)
return cc; return cc;
} }
#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16
#if PCRE2_CODE_UNIT_WIDTH == 8 #if PCRE2_CODE_UNIT_WIDTH == 8
jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0); jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
@ -8529,12 +8608,12 @@ switch(type)
OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL); OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
#endif #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
JUMPHERE(jump[0]); JUMPHERE(jump[0]);
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16] */
return cc; return cc;
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16] */
} }
#endif #endif /* SUPPORT_UNICODE */
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
return cc; return cc;
@ -13749,11 +13828,6 @@ if (common->utfreadchar != NULL)
set_jumps(common->utfreadchar, LABEL()); set_jumps(common->utfreadchar, LABEL());
do_utfreadchar(common); do_utfreadchar(common);
} }
if (common->utfreadnewline_invalid != NULL)
{
set_jumps(common->utfreadnewline_invalid, LABEL());
do_utfreadnewline_invalid(common);
}
if (common->utfreadtype8 != NULL) if (common->utfreadtype8 != NULL)
{ {
set_jumps(common->utfreadtype8, LABEL()); set_jumps(common->utfreadtype8, LABEL());
@ -13771,6 +13845,11 @@ if (common->utfreadchar_invalid != NULL)
set_jumps(common->utfreadchar_invalid, LABEL()); set_jumps(common->utfreadchar_invalid, LABEL());
do_utfreadchar_invalid(common); do_utfreadchar_invalid(common);
} }
if (common->utfreadnewline_invalid != NULL)
{
set_jumps(common->utfreadnewline_invalid, LABEL());
do_utfreadnewline_invalid(common);
}
if (common->utfmoveback_invalid) if (common->utfmoveback_invalid)
{ {
set_jumps(common->utfmoveback_invalid, LABEL()); set_jumps(common->utfmoveback_invalid, LABEL());

View File

@ -95,6 +95,7 @@ POSSIBILITY OF SUCH DAMAGE.
static int regression_tests(void); static int regression_tests(void);
static int invalid_utf8_regression_tests(void); static int invalid_utf8_regression_tests(void);
static int invalid_utf16_regression_tests(void); static int invalid_utf16_regression_tests(void);
static int invalid_utf32_regression_tests(void);
int main(void) int main(void)
{ {
@ -111,8 +112,9 @@ int main(void)
return 1; return 1;
} }
return regression_tests() return regression_tests()
|| invalid_utf8_regression_tests() | invalid_utf8_regression_tests()
|| invalid_utf16_regression_tests(); | invalid_utf16_regression_tests()
| invalid_utf32_regression_tests();
} }
/* --------------------------------------------------------------------------------------- */ /* --------------------------------------------------------------------------------------- */
@ -1917,6 +1919,11 @@ static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cas
{ UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" }, { UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" },
{ UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" }, { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
{ UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "#" },
{ UDA, CPI, 0, 0, 0, 0, 4, { "[^#]", NULL }, "\xf4\x8f\xbf\xbf" },
{ UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xf4\x90\x80\x80" },
{ UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xc1\x80" },
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"}, { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"},
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"}, { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"},
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"}, { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"},
@ -2069,53 +2076,65 @@ struct invalid_utf16_regression_test_case {
const PCRE2_UCHAR16 *input; const PCRE2_UCHAR16 *input;
}; };
static PCRE2_UCHAR16 allany[] = { '.', 0 }; static PCRE2_UCHAR16 allany16[] = { '.', 0 };
static PCRE2_UCHAR16 non_word_boundary[] = { '\\', 'B', 0 }; static PCRE2_UCHAR16 non_word_boundary16[] = { '\\', 'B', 0 };
static PCRE2_UCHAR16 word_boundary[] = { '\\', 'b', 0 }; static PCRE2_UCHAR16 word_boundary16[] = { '\\', 'b', 0 };
static PCRE2_UCHAR16 backreference[] = { '(', '.', ')', '\\', '1', 0 }; static PCRE2_UCHAR16 backreference16[] = { '(', '.', ')', '\\', '1', 0 };
static PCRE2_UCHAR16 grapheme[] = { '\\', 'X', 0 }; static PCRE2_UCHAR16 grapheme16[] = { '\\', 'X', 0 };
static PCRE2_UCHAR16 test1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 }; static PCRE2_UCHAR16 nothashmark16[] = { '[', '^', '#', ']', 0 };
static PCRE2_UCHAR16 test2[] = { 0xd800, 0xdc00, '#', 0 }; static PCRE2_UCHAR16 afternl16[] = { '^', '\\', 'W', 0 };
static PCRE2_UCHAR16 test3[] = { 0xdbff, 0xdfff, '#', 0 }; static PCRE2_UCHAR16 test16_1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 };
static PCRE2_UCHAR16 test4[] = { 0xd800, 0xdbff, '#', 0 }; static PCRE2_UCHAR16 test16_2[] = { 0xd800, 0xdc00, '#', 0 };
static PCRE2_UCHAR16 test5[] = { '#', 0xd800, '#', 0 }; static PCRE2_UCHAR16 test16_3[] = { 0xdbff, 0xdfff, '#', 0 };
static PCRE2_UCHAR16 test6[] = { 'a', 'A', 0xdc28, 0 }; static PCRE2_UCHAR16 test16_4[] = { 0xd800, 0xdbff, '#', 0 };
static PCRE2_UCHAR16 test7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 }; static PCRE2_UCHAR16 test16_5[] = { '#', 0xd800, '#', 0 };
static PCRE2_UCHAR16 test16_6[] = { 'a', 'A', 0xdc28, 0 };
static PCRE2_UCHAR16 test16_7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };
static PCRE2_UCHAR16 test16_8[] = { '#', 0xd800, 0xdc00, 0 };
static PCRE2_UCHAR16 test16_9[] = { ' ', 0x2028, '#', 0 };
static PCRE2_UCHAR16 test16_10[] = { ' ', 0xdc00, 0xd800, 0x2028, '#', 0 };
static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = { static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = {
{ UDA, CI, 0, 0, 0, 0, 1, { allany, NULL }, test1 }, { UDA, CI, 0, 0, 0, 0, 1, { allany16, NULL }, test16_1 },
{ UDA, CI, 1, 0, 0, 1, 2, { allany, NULL }, test1 }, { UDA, CI, 1, 0, 0, 1, 2, { allany16, NULL }, test16_1 },
{ UDA, CI, 2, 0, 0, 2, 3, { allany, NULL }, test1 }, { UDA, CI, 2, 0, 0, 2, 3, { allany16, NULL }, test16_1 },
{ UDA, CI, 3, 0, 0, 3, 4, { allany, NULL }, test1 }, { UDA, CI, 3, 0, 0, 3, 4, { allany16, NULL }, test16_1 },
{ UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test2 }, { UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_2 },
{ UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test2 }, { UDA, CI, 0, 0, 2, -1, -1, { allany16, NULL }, test16_2 },
{ UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test2 }, { UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_2 },
{ UDA, CI, 0, 0, 0, 0, 2, { allany, NULL }, test3 }, { UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_3 },
{ UDA, CI, 0, 0, 2, -1, -1, { allany, NULL }, test3 }, { UDA, CI, 0, 0, 2, -1, -1, { allany16, NULL }, test16_3 },
{ UDA, CI, 1, 0, 0, -1, -1, { allany, NULL }, test3 }, { UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_3 },
{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary, NULL }, test1 }, { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary16, NULL }, test16_1 },
{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test1 }, { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_1 },
{ UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary, NULL }, test1 }, { UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary16, NULL }, test16_1 },
{ UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary, NULL }, test1 }, { UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary16, NULL }, test16_1 },
{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test2 }, { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_2 },
{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary, NULL }, test3 }, { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_3 },
{ UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test2 }, { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_2 },
{ UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary, word_boundary }, test3 }, { UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_3 },
{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test4 }, { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_4 },
{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary, word_boundary }, test5 }, { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_5 },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference, NULL }, test6 }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference16, NULL }, test16_6 },
{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference, NULL }, test6 }, { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference16, NULL }, test16_6 },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference, NULL }, test7 }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference16, NULL }, test16_7 },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference, NULL }, test7 }, { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference16, NULL }, test16_7 },
{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme, NULL }, test6 }, { UDA, CPI, 0, 0, 0, 0, 1, { grapheme16, NULL }, test16_6 },
{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme, NULL }, test6 }, { UDA, CPI, 1, 0, 0, 1, 2, { grapheme16, NULL }, test16_6 },
{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme, NULL }, test6 }, { UDA, CPI, 2, 0, 0, -1, -1, { grapheme16, NULL }, test16_6 },
{ UDA, CPI, 0, 0, 0, 0, 2, { grapheme, NULL }, test7 }, { UDA, CPI, 0, 0, 0, 0, 2, { grapheme16, NULL }, test16_7 },
{ UDA, CPI, 2, 0, 0, 2, 4, { grapheme, NULL }, test7 }, { UDA, CPI, 2, 0, 0, 2, 4, { grapheme16, NULL }, test16_7 },
{ UDA, CPI, 1, 0, 0, -1, -1, { grapheme, NULL }, test7 }, { UDA, CPI, 1, 0, 0, -1, -1, { grapheme16, NULL }, test16_7 },
{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },
{ UDA, CPI, 1, 0, 0, 1, 3, { nothashmark16, NULL }, test16_8 },
{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl16, NULL }, test16_9 },
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { afternl16, NULL }, test16_10 },
{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL } { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
}; };
@ -2239,4 +2258,178 @@ static int invalid_utf16_regression_tests(void)
#endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_16 */ #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_16 */
#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_32
/* Shorthands for the option columns of the UTF-32 test table; they are
   #undef'ed again right after the table.
   UDA: compile options PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED.
   CI:  JIT options PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF.
   CPI: CI plus PCRE2_JIT_PARTIAL_SOFT. */
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
/* One invalid-UTF-32 regression test: up to two patterns run against one
   subject. The table is initialised positionally, so the field order here
   must not change. */
struct invalid_utf32_regression_test_case {
/* Option bits passed to pcre2_compile_32(). */
int compile_options;
/* Option bits passed to pcre2_jit_compile_32(). The runner also tests
   PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_SOFT in this value to decide
   which match modes to execute. */
int jit_compile_options;
/* Start offset counted from the beginning of input; the runner subtracts
   skip_left before passing it to pcre2_jit_match_32(). */
int start_offset;
/* Number of code units cut off the front of input before matching. */
int skip_left;
/* Number of code units cut off the end of input before matching. */
int skip_right;
/* Expected result, handed to check_invalid_utf_result(). The table uses
   -1, -1 for some rows - presumably "no match expected"; confirm against
   check_invalid_utf_result(). */
int match_start;
int match_end;
/* Zero-terminated patterns. pattern[1] may be NULL, in which case the
   second run is skipped; pattern[0] == NULL terminates the table. */
const PCRE2_UCHAR32 *pattern[2];
/* Zero-terminated subject; its length is counted by the runner. */
const PCRE2_UCHAR32 *input;
};
/* Zero-terminated patterns, one 32-bit code unit per pattern character. */
static PCRE2_UCHAR32 allany32[] = { '.', 0 };
static PCRE2_UCHAR32 non_word_boundary32[] = { '\\', 'B', 0 };
static PCRE2_UCHAR32 word_boundary32[] = { '\\', 'b', 0 };
static PCRE2_UCHAR32 backreference32[] = { '(', '.', ')', '\\', '1', 0 };
static PCRE2_UCHAR32 grapheme32[] = { '\\', 'X', 0 };
static PCRE2_UCHAR32 nothashmark32[] = { '[', '^', '#', ']', 0 };
static PCRE2_UCHAR32 afternl32[] = { '^', '\\', 'W', 0 };
/* Zero-terminated subjects. The JIT's UTF-32 invalid-UTF checks compare
   against 0x110000, so 0x110000 is the smallest value treated as invalid
   and 0x10ffff the largest accepted one. 0x2028 is presumably recognised
   as a newline under PCRE2_NEWLINE_ANY, which the test driver selects -
   confirm against the PCRE2 newline documentation. */
static PCRE2_UCHAR32 test32_1[] = { 0x10ffff, 0x10ffff, 0x110000, 0x10ffff, 0 };
static PCRE2_UCHAR32 test32_2[] = { 'a', 'A', 0x110000, 0 };
static PCRE2_UCHAR32 test32_3[] = { '#', 0x10ffff, 0x110000, 0 };
static PCRE2_UCHAR32 test32_4[] = { ' ', 0x2028, '#', 0 };
static PCRE2_UCHAR32 test32_5[] = { ' ', 0x110000, 0x2028, '#', 0 };
/* Columns, in struct field order:
   compile_options, jit_compile_options, start_offset, skip_left, skip_right,
   match_start, match_end, { pattern[0], pattern[1] }, input.
   The final row with NULL patterns terminates the table. */
static struct invalid_utf32_regression_test_case invalid_utf32_regression_test_cases[] = {
{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_1 },
{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_1 },
{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_1 },
{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
{ UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_2 },
{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_2 },
{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme32, NULL }, test32_1 },
{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_1 },
{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 },
{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_3 },
{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 },
/* The two rows below are not anchored and use PCRE2_MULTILINE. */
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_4 },
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_5 },
{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
/* The option shorthands are local to the table above. */
#undef UDA
#undef CI
#undef CPI
/* Runs one pattern of one invalid-UTF-32 test case through the JIT.

   current        the test case (options, patterns, subject, expected result)
   pattern_index  index of the test case, used only in diagnostics
   i              which of current->pattern[] to run (0 or 1)
   ccontext       compile context passed to pcre2_compile_32()
   mdata          match data reused for every match attempt

   Returns 1 when the pattern slot is empty (NULL) or every executed match
   mode produced the expected result, and 0 on a compile failure, a JIT
   compile failure, or a result mismatch. The compiled code is freed on every
   path. */
static int run_invalid_utf32_test(struct invalid_utf32_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_32 *ccontext, pcre2_match_data_32 *mdata)
{
	pcre2_code_32 *code;
	int result, errorcode;
	PCRE2_SIZE length, erroroffset;
	const PCRE2_UCHAR32 *input;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(mdata);

	/* An unused pattern slot counts as a pass. */
	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_32(current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (!code) {
		/* Report the real pattern slot; it used to be hard-coded as 0. */
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_32(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		pcre2_code_free_32(code);
		return 0;
	}

	/* Count the code units of the zero-terminated subject. */
	input = current->input;
	length = 0;

	while (*input++ != 0)
		length++;

	/* The match runs on the subject with skip_left / skip_right units cut off. */
	length -= current->skip_left + current->skip_right;

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_32(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, 0, mdata, NULL);

		/* check_invalid_utf_result() returns non-zero on a mismatch. */
		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector)) {
			pcre2_code_free_32(code);
			return 0;
		}
	}

	if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
		result = pcre2_jit_match_32(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector)) {
			pcre2_code_free_32(code);
			return 0;
		}
	}

	pcre2_code_free_32(code);
	return 1;
}
/* Runs every entry of invalid_utf32_regression_test_cases, printing one dot
   per test case and a summary line at the end.

   Each case is run once per pattern slot and counts as successful only when
   both runs pass. Returns 0 when all cases pass, and 1 when at least one
   fails or the compile context / match data cannot be allocated. */
static int invalid_utf32_regression_tests(void)
{
	struct invalid_utf32_regression_test_case *current;
	pcre2_compile_context_32 *ccontext;
	pcre2_match_data_32 *mdata;
	int total = 0, successful = 0;
	int result;

	printf("\nRunning invalid-utf32 JIT regression tests\n");

	ccontext = pcre2_compile_context_create_32(NULL);
	mdata = pcre2_match_data_create_32(4, NULL);

	/* Both creators return NULL on allocation failure; without this check the
	   NULL pointers would be handed to pcre2_set_newline_32() and to the
	   test runner. */
	if (ccontext == NULL || mdata == NULL) {
		printf("Cannot allocate the compile context or the match data.\n");
		/* The PCRE2 free functions accept NULL. */
		pcre2_match_data_free_32(mdata);
		pcre2_compile_context_free_32(ccontext);
		return 1;
	}

	pcre2_set_newline_32(ccontext, PCRE2_NEWLINE_ANY);

	/* The table is terminated by an entry whose first pattern is NULL. */
	for (current = invalid_utf32_regression_test_cases; current->pattern[0]; current++) {
		total++;

		result = 1;
		if (!run_invalid_utf32_test(current, total - 1, 0, ccontext, mdata))
			result = 0;
		if (!run_invalid_utf32_test(current, total - 1, 1, ccontext, mdata))
			result = 0;

		if (result) {
			successful++;
		}

		/* Progress output, wrapped every 60 test cases. */
		printf(".");
		if ((total % 60) == 0)
			printf("\n");
	}

	if ((total % 60) != 0)
		printf("\n");

	pcre2_match_data_free_32(mdata);
	pcre2_compile_context_free_32(ccontext);

	if (total == successful) {
		printf("\nAll invalid UTF32 JIT regression tests are successfully passed.\n");
		return 0;
	} else {
		printf("\nInvalid UTF32 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
		return 1;
	}
}
#else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_32 */
/* Fallback used when Unicode support or the 32-bit library is not compiled
   in: there is nothing to run, so report "no failures" to the caller. */
static int invalid_utf32_regression_tests(void)
{
	int failed = 0;

	return failed;
}
#endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_32 */
/* End of pcre2_jit_test.c */ /* End of pcre2_jit_test.c */