From 274efb8ded5987cd5ba328e91e1424b6609787f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Herczeg?= Date: Fri, 10 May 2019 13:15:20 +0000 Subject: [PATCH] Improved the invalid utf32 support of the JIT compiler. --- ChangeLog | 3 ++ src/pcre2_jit_compile.c | 87 +++++++++++++++++++++++++++++------------ src/pcre2_jit_test.c | 40 +++++++++++++------ 3 files changed, 91 insertions(+), 39 deletions(-) diff --git a/ChangeLog b/ChangeLog index da4ffb6..48f6b95 100644 --- a/ChangeLog +++ b/ChangeLog @@ -9,6 +9,9 @@ Version 10.34 22-April-2019 check on this was ever implemented. This omission has been rectified; it fixes ClusterFuzz 14376. +2. Improved the invalid utf32 support of the JIT compiler. Now it correctly +detects invalid characters in the 0xd800-0xdfff range. + Version 10.33 16-April-2019 --------------------------- diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 1f21bfb..ae0fbcf 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -696,11 +696,12 @@ the start pointers when the end of the capturing group has not yet reached. */ #define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \ { \ - if (ptr[-1] <= 0x7f) \ - c = *ptr--; \ + c = ptr[-1]; \ + if (c <= 0x7f) \ + ptr--; \ else if (ptr - 1 > start && ptr[-1] >= 0x80 && ptr[-1] < 0xc0) \ { \ - c = ptr[-1] - 0x80; \ + c -= 0x80; \ \ if (ptr[-2] >= 0xc2 && ptr[-2] <= 0xdf) \ { \ @@ -775,11 +776,12 @@ the start pointers when the end of the capturing group has not yet reached. */ #define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \ { \ - if (ptr[-1] < 0xd800 || ptr[-1] >= 0xe000) \ - c = *ptr--; \ - else if (ptr[-1] >= 0xdc00 && ptr - 1 > start && ptr[-2] >= 0xd800 && ptr[-2] < 0xdc00) \ + c = ptr[-1]; \ + if (c < 0xd800 || c >= 0xe000) \ + ptr--; \ + else if (c >= 0xdc00 && ptr - 1 > start && ptr[-2] >= 0xd800 && ptr[-2] < 0xdc00) \ { \ - c = (((ptr[-2] - 0xd800) << 10) | (ptr[-1] - 0xdc00)) + 0x10000; \ + c = (((ptr[-2] - 0xd800) << 10) | (c - 0xdc00)) + 0x10000; \ ptr -= 2; \ } \ else \ @@ -793,7 +795,7 @@ the start pointers when the end of the capturing group has not yet reached. */ #define GETCHARINC_INVALID(c, ptr, end, invalid_action) \ { \ - if (ptr[0] < 0x110000) \ + if (ptr[0] < 0xd800 || (ptr[0] >= 0xe000 && ptr[0] < 0x110000)) \ c = *ptr++; \ else \ { \ @@ -801,6 +803,17 @@ the start pointers when the end of the capturing group has not yet reached. */ } \ } +#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \ + { \ + c = ptr[-1]; \ + if (ptr[-1] < 0xd800 || (ptr[-1] >= 0xe000 && ptr[-1] < 0x110000)) \ + ptr--; \ + else \ + { \ + invalid_action; \ + } \ + } + #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ #endif /* SUPPORT_UNICODE */ @@ -3420,12 +3433,21 @@ if (common->utf) #elif PCRE2_CODE_UNIT_WIDTH == 32 if (common->invalid_utf) { + if (max < 0xd800) return; + if (backtracks != NULL) + { + OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); + add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800)); + } else { + OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000); CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); + OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); + CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); } } #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ @@ -3490,8 +3512,12 @@ if (common->utf) JUMPHERE(jump); } #elif PCRE2_CODE_UNIT_WIDTH == 32 - if (common->invalid_utf) - add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); +if (common->invalid_utf) + { + OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); + add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); + add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800)); + } #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ #endif /* SUPPORT_UNICODE */ } @@ -3677,11 +3703,18 @@ if (common->utf) if (common->invalid_utf) { if (backtracks != NULL) + { + OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000)); + add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800)); + } else { + OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800); OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000); CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); + OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800); + CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR); } } #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ @@ -8402,12 +8435,12 @@ static PCRE2_SPTR SLJIT_FUNC do_extuni_utf(jit_arguments *args, PCRE2_SPTR cc) PCRE2_SPTR start_subject = args->begin; PCRE2_SPTR end_subject = args->end; int lgb, rgb, ricount; -PCRE2_SPTR prevcc, startcc, bptr; +PCRE2_SPTR prevcc, endcc, bptr; BOOL first = TRUE; uint32_t c; prevcc = cc; -startcc = NULL; +endcc = NULL; do { GETCHARINC(c, cc); @@ -8416,7 +8449,7 @@ do if (first) { lgb = rgb; - startcc = cc; + endcc = cc; first = FALSE; continue; } @@ -8455,25 +8488,27 @@ do lgb != ucp_gbExtended_Pictographic) lgb = rgb; - prevcc = startcc; - startcc = cc; + prevcc = endcc; + endcc = cc; } while (cc < end_subject); -return startcc; +return endcc; } +#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ + static PCRE2_SPTR SLJIT_FUNC do_extuni_utf_invalid(jit_arguments *args, PCRE2_SPTR cc) { PCRE2_SPTR start_subject = args->begin; PCRE2_SPTR end_subject = args->end; int lgb, rgb, ricount; -PCRE2_SPTR prevcc, startcc, bptr; +PCRE2_SPTR prevcc, endcc, bptr; BOOL first = TRUE; uint32_t c; prevcc = cc; -startcc = NULL; +endcc = NULL; do { GETCHARINC_INVALID(c, cc, end_subject, break); @@ -8482,7 +8517,7 @@ do if (first) { lgb = rgb; - startcc = cc; + endcc = cc; first = FALSE; continue; } @@ -8520,16 +8555,14 @@ do lgb != ucp_gbExtended_Pictographic) lgb = rgb; - prevcc = startcc; - startcc = cc; + prevcc = endcc; + endcc = cc; } while (cc < end_subject); -return startcc; +return endcc; } -#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ - static PCRE2_SPTR SLJIT_FUNC do_extuni_no_utf(jit_arguments *args, PCRE2_SPTR cc) { PCRE2_SPTR start_subject = args->begin; @@ -8800,8 +8833,10 @@ switch(type) if (common->invalid_utf) add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0)); #else - sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_extuni_no_utf)); - add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0)); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, + common->invalid_utf ? SLJIT_FUNC_OFFSET(do_extuni_utf_invalid) : SLJIT_FUNC_OFFSET(do_extuni_no_utf)); + if (!common->utf || common->invalid_utf) + add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0)); #endif OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0); diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c index fa14329..2247c8c 100644 --- a/src/pcre2_jit_test.c +++ b/src/pcre2_jit_test.c @@ -1770,7 +1770,7 @@ static int regression_tests(void) } } -#if defined SUPPORT_UNICODE && (defined SUPPORT_PCRE2_8 || defined SUPPORT_PCRE2_16 || defined SUPPORT_PCRE2_32) +#if defined SUPPORT_UNICODE static int check_invalid_utf_result(int pattern_index, const char *type, int result, int match_start, int match_end, PCRE2_SIZE *ovector) @@ -1803,7 +1803,7 @@ static int check_invalid_utf_result(int pattern_index, const char *type, int res return 0; } -#endif /* SUPPORT_UNICODE && (SUPPORT_PCRE2_8 || SUPPORT_PCRE2_16 || SUPPORT_PCRE2_32) */ +#endif /* SUPPORT_UNICODE */ #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8 @@ -2314,31 +2314,45 @@ static PCRE2_UCHAR32 grapheme32[] = { '\\', 'X', 0 }; static PCRE2_UCHAR32 nothashmark32[] = { '[', '^', '#', ']', 0 }; static PCRE2_UCHAR32 afternl32[] = { '^', '\\', 'W', 0 }; static PCRE2_UCHAR32 test32_1[] = { 0x10ffff, 0x10ffff, 0x110000, 0x10ffff, 0 }; -static PCRE2_UCHAR32 test32_2[] = { 'a', 'A', 0x110000, 0 }; -static PCRE2_UCHAR32 test32_3[] = { '#', 0x10ffff, 0x110000, 0 }; -static PCRE2_UCHAR32 test32_4[] = { ' ', 0x2028, '#', 0 }; -static PCRE2_UCHAR32 test32_5[] = { ' ', 0x110000, 0x2028, '#', 0 }; +static PCRE2_UCHAR32 test32_2[] = { 0xd7ff, 0xe000, 0xd800, 0xdfff, 0xe000, 0 }; +static PCRE2_UCHAR32 test32_3[] = { 'a', 'A', 0x110000, 0 }; +static PCRE2_UCHAR32 test32_4[] = { '#', 0x10ffff, 0x110000, 0 }; +static PCRE2_UCHAR32 test32_5[] = { ' ', 0x2028, '#', 0 }; +static PCRE2_UCHAR32 test32_6[] = { ' ', 0x110000, 0x2028, '#', 0 }; static struct invalid_utf32_regression_test_case invalid_utf32_regression_test_cases[] = { { UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_1 }, { UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_1 }, + { UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_2 }, + { UDA, CI, 1, 0, 0, 1, 2, { allany32, NULL }, test32_2 }, + { UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_2 }, + { UDA, CI, 3, 0, 0, -1, -1, { allany32, NULL }, test32_2 }, { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_1 }, { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 }, { UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 }, + { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_2 }, + { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 }, + { UDA, CPI, 4, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 }, - { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_2 }, - { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_2 }, + { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_3 }, + { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_3 }, { UDA, CPI, 0, 0, 0, 0, 1, { grapheme32, NULL }, test32_1 }, { UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_1 }, + { UDA, CPI, 1, 0, 0, 1, 2, { grapheme32, NULL }, test32_2 }, + { UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 }, + { UDA, CPI, 3, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 }, + { UDA, CPI, 4, 0, 0, 4, 5, { grapheme32, NULL }, test32_2 }, - { UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 }, - { UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_3 }, - { UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 }, + { UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 }, + { UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_4 }, + { UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 }, + { UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_2 }, + { UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_2 }, - { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_4 }, - { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_5 }, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_5 }, + { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_6 }, { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL } };