Improved the invalid utf32 support of the JIT compiler.
This commit is contained in:
parent
16de9003e5
commit
274efb8ded
|
@ -9,6 +9,9 @@ Version 10.34 22-April-2019
|
|||
check on this was ever implemented. This omission has been rectified; it fixes
|
||||
ClusterFuzz 14376.
|
||||
|
||||
2. Improved the invalid utf32 support of the JIT compiler. Now it correctly
|
||||
detects invalid characters in the 0xd800-0xdfff range.
|
||||
|
||||
|
||||
Version 10.33 16-April-2019
|
||||
---------------------------
|
||||
|
|
|
@ -696,11 +696,12 @@ the start pointers when the end of the capturing group has not yet reached. */
|
|||
|
||||
#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \
|
||||
{ \
|
||||
if (ptr[-1] <= 0x7f) \
|
||||
c = *ptr--; \
|
||||
c = ptr[-1]; \
|
||||
if (c <= 0x7f) \
|
||||
ptr--; \
|
||||
else if (ptr - 1 > start && ptr[-1] >= 0x80 && ptr[-1] < 0xc0) \
|
||||
{ \
|
||||
c = ptr[-1] - 0x80; \
|
||||
c -= 0x80; \
|
||||
\
|
||||
if (ptr[-2] >= 0xc2 && ptr[-2] <= 0xdf) \
|
||||
{ \
|
||||
|
@ -775,11 +776,12 @@ the start pointers when the end of the capturing group has not yet reached. */
|
|||
|
||||
#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \
|
||||
{ \
|
||||
if (ptr[-1] < 0xd800 || ptr[-1] >= 0xe000) \
|
||||
c = *ptr--; \
|
||||
else if (ptr[-1] >= 0xdc00 && ptr - 1 > start && ptr[-2] >= 0xd800 && ptr[-2] < 0xdc00) \
|
||||
c = ptr[-1]; \
|
||||
if (c < 0xd800 || c >= 0xe000) \
|
||||
ptr--; \
|
||||
else if (c >= 0xdc00 && ptr - 1 > start && ptr[-2] >= 0xd800 && ptr[-2] < 0xdc00) \
|
||||
{ \
|
||||
c = (((ptr[-2] - 0xd800) << 10) | (ptr[-1] - 0xdc00)) + 0x10000; \
|
||||
c = (((ptr[-2] - 0xd800) << 10) | (c - 0xdc00)) + 0x10000; \
|
||||
ptr -= 2; \
|
||||
} \
|
||||
else \
|
||||
|
@ -793,7 +795,7 @@ the start pointers when the end of the capturing group has not yet reached. */
|
|||
|
||||
#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \
|
||||
{ \
|
||||
if (ptr[0] < 0x110000) \
|
||||
if (ptr[0] < 0xd800 || (ptr[0] >= 0xe000 && ptr[0] < 0x110000)) \
|
||||
c = *ptr++; \
|
||||
else \
|
||||
{ \
|
||||
|
@ -801,6 +803,17 @@ the start pointers when the end of the capturing group has not yet reached. */
|
|||
} \
|
||||
}
|
||||
|
||||
/* Step back over one character with validity checking (this definition sits
in the last branch of the PCRE2_CODE_UNIT_WIDTH chain; presumably the 32-bit
code unit case - confirm against the enclosing #if, which is not visible here).

The code unit immediately before ptr is always loaded into c. If that value
is outside the 0xd800-0xdfff range and below 0x110000, ptr is moved back by
one code unit. Otherwise invalid_action is executed and ptr is left where it
was; note that c still holds the rejected value in that case.

The start argument is not referenced by this variant; it appears to be kept
so the parameter list matches the other GETCHARBACK_INVALID definitions. */

#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \
|
||||
{ \
|
||||
c = ptr[-1]; \
|
||||
if (ptr[-1] < 0xd800 || (ptr[-1] >= 0xe000 && ptr[-1] < 0x110000)) \
|
||||
ptr--; \
|
||||
else \
|
||||
{ \
|
||||
invalid_action; \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
|
@ -3420,12 +3433,21 @@ if (common->utf)
|
|||
#elif PCRE2_CODE_UNIT_WIDTH == 32
|
||||
if (common->invalid_utf)
|
||||
{
|
||||
if (max < 0xd800) return;
|
||||
|
||||
if (backtracks != NULL)
|
||||
{
|
||||
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800));
|
||||
}
|
||||
else
|
||||
{
|
||||
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
|
||||
OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
|
||||
CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
|
||||
OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
|
||||
CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
|
||||
}
|
||||
}
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
|
||||
|
@ -3490,8 +3512,12 @@ if (common->utf)
|
|||
JUMPHERE(jump);
|
||||
}
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 32
|
||||
if (common->invalid_utf)
|
||||
if (common->invalid_utf)
|
||||
{
|
||||
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800));
|
||||
}
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
}
|
||||
|
@ -3677,11 +3703,18 @@ if (common->utf)
|
|||
if (common->invalid_utf)
|
||||
{
|
||||
if (backtracks != NULL)
|
||||
{
|
||||
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800));
|
||||
}
|
||||
else
|
||||
{
|
||||
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
|
||||
OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x110000);
|
||||
CMOV(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
|
||||
OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
|
||||
CMOV(SLJIT_LESS, TMP1, SLJIT_IMM, INVALID_UTF_CHAR);
|
||||
}
|
||||
}
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
|
||||
|
@ -8402,12 +8435,12 @@ static PCRE2_SPTR SLJIT_FUNC do_extuni_utf(jit_arguments *args, PCRE2_SPTR cc)
|
|||
PCRE2_SPTR start_subject = args->begin;
|
||||
PCRE2_SPTR end_subject = args->end;
|
||||
int lgb, rgb, ricount;
|
||||
PCRE2_SPTR prevcc, startcc, bptr;
|
||||
PCRE2_SPTR prevcc, endcc, bptr;
|
||||
BOOL first = TRUE;
|
||||
uint32_t c;
|
||||
|
||||
prevcc = cc;
|
||||
startcc = NULL;
|
||||
endcc = NULL;
|
||||
do
|
||||
{
|
||||
GETCHARINC(c, cc);
|
||||
|
@ -8416,7 +8449,7 @@ do
|
|||
if (first)
|
||||
{
|
||||
lgb = rgb;
|
||||
startcc = cc;
|
||||
endcc = cc;
|
||||
first = FALSE;
|
||||
continue;
|
||||
}
|
||||
|
@ -8455,25 +8488,27 @@ do
|
|||
lgb != ucp_gbExtended_Pictographic)
|
||||
lgb = rgb;
|
||||
|
||||
prevcc = startcc;
|
||||
startcc = cc;
|
||||
prevcc = endcc;
|
||||
endcc = cc;
|
||||
}
|
||||
while (cc < end_subject);
|
||||
|
||||
return startcc;
|
||||
return endcc;
|
||||
}
|
||||
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
|
||||
|
||||
static PCRE2_SPTR SLJIT_FUNC do_extuni_utf_invalid(jit_arguments *args, PCRE2_SPTR cc)
|
||||
{
|
||||
PCRE2_SPTR start_subject = args->begin;
|
||||
PCRE2_SPTR end_subject = args->end;
|
||||
int lgb, rgb, ricount;
|
||||
PCRE2_SPTR prevcc, startcc, bptr;
|
||||
PCRE2_SPTR prevcc, endcc, bptr;
|
||||
BOOL first = TRUE;
|
||||
uint32_t c;
|
||||
|
||||
prevcc = cc;
|
||||
startcc = NULL;
|
||||
endcc = NULL;
|
||||
do
|
||||
{
|
||||
GETCHARINC_INVALID(c, cc, end_subject, break);
|
||||
|
@ -8482,7 +8517,7 @@ do
|
|||
if (first)
|
||||
{
|
||||
lgb = rgb;
|
||||
startcc = cc;
|
||||
endcc = cc;
|
||||
first = FALSE;
|
||||
continue;
|
||||
}
|
||||
|
@ -8520,16 +8555,14 @@ do
|
|||
lgb != ucp_gbExtended_Pictographic)
|
||||
lgb = rgb;
|
||||
|
||||
prevcc = startcc;
|
||||
startcc = cc;
|
||||
prevcc = endcc;
|
||||
endcc = cc;
|
||||
}
|
||||
while (cc < end_subject);
|
||||
|
||||
return startcc;
|
||||
return endcc;
|
||||
}
|
||||
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
|
||||
|
||||
static PCRE2_SPTR SLJIT_FUNC do_extuni_no_utf(jit_arguments *args, PCRE2_SPTR cc)
|
||||
{
|
||||
PCRE2_SPTR start_subject = args->begin;
|
||||
|
@ -8800,7 +8833,9 @@ switch(type)
|
|||
if (common->invalid_utf)
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
|
||||
#else
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_extuni_no_utf));
|
||||
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM,
|
||||
common->invalid_utf ? SLJIT_FUNC_OFFSET(do_extuni_utf_invalid) : SLJIT_FUNC_OFFSET(do_extuni_no_utf));
|
||||
if (!common->utf || common->invalid_utf)
|
||||
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1770,7 +1770,7 @@ static int regression_tests(void)
|
|||
}
|
||||
}
|
||||
|
||||
#if defined SUPPORT_UNICODE && (defined SUPPORT_PCRE2_8 || defined SUPPORT_PCRE2_16 || defined SUPPORT_PCRE2_32)
|
||||
#if defined SUPPORT_UNICODE
|
||||
|
||||
static int check_invalid_utf_result(int pattern_index, const char *type, int result,
|
||||
int match_start, int match_end, PCRE2_SIZE *ovector)
|
||||
|
@ -1803,7 +1803,7 @@ static int check_invalid_utf_result(int pattern_index, const char *type, int res
|
|||
return 0;
|
||||
}
|
||||
|
||||
#endif /* SUPPORT_UNICODE && (SUPPORT_PCRE2_8 || SUPPORT_PCRE2_16 || SUPPORT_PCRE2_32) */
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
|
||||
#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8
|
||||
|
||||
|
@ -2314,31 +2314,45 @@ static PCRE2_UCHAR32 grapheme32[] = { '\\', 'X', 0 };
|
|||
static PCRE2_UCHAR32 nothashmark32[] = { '[', '^', '#', ']', 0 };
|
||||
static PCRE2_UCHAR32 afternl32[] = { '^', '\\', 'W', 0 };
|
||||
static PCRE2_UCHAR32 test32_1[] = { 0x10ffff, 0x10ffff, 0x110000, 0x10ffff, 0 };
|
||||
static PCRE2_UCHAR32 test32_2[] = { 'a', 'A', 0x110000, 0 };
|
||||
static PCRE2_UCHAR32 test32_3[] = { '#', 0x10ffff, 0x110000, 0 };
|
||||
static PCRE2_UCHAR32 test32_4[] = { ' ', 0x2028, '#', 0 };
|
||||
static PCRE2_UCHAR32 test32_5[] = { ' ', 0x110000, 0x2028, '#', 0 };
|
||||
static PCRE2_UCHAR32 test32_2[] = { 0xd7ff, 0xe000, 0xd800, 0xdfff, 0xe000, 0 };
|
||||
static PCRE2_UCHAR32 test32_3[] = { 'a', 'A', 0x110000, 0 };
|
||||
static PCRE2_UCHAR32 test32_4[] = { '#', 0x10ffff, 0x110000, 0 };
|
||||
static PCRE2_UCHAR32 test32_5[] = { ' ', 0x2028, '#', 0 };
|
||||
static PCRE2_UCHAR32 test32_6[] = { ' ', 0x110000, 0x2028, '#', 0 };
|
||||
|
||||
static struct invalid_utf32_regression_test_case invalid_utf32_regression_test_cases[] = {
|
||||
{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_1 },
|
||||
{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_1 },
|
||||
{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_2 },
|
||||
{ UDA, CI, 1, 0, 0, 1, 2, { allany32, NULL }, test32_2 },
|
||||
{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_2 },
|
||||
{ UDA, CI, 3, 0, 0, -1, -1, { allany32, NULL }, test32_2 },
|
||||
|
||||
{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_1 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
|
||||
{ UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
|
||||
{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_2 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },
|
||||
{ UDA, CPI, 4, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },
|
||||
|
||||
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_2 },
|
||||
{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_2 },
|
||||
{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_3 },
|
||||
{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_3 },
|
||||
|
||||
{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme32, NULL }, test32_1 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_1 },
|
||||
{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme32, NULL }, test32_2 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
|
||||
{ UDA, CPI, 3, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
|
||||
{ UDA, CPI, 4, 0, 0, 4, 5, { grapheme32, NULL }, test32_2 },
|
||||
|
||||
{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 },
|
||||
{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_3 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 },
|
||||
{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
|
||||
{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_4 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
|
||||
{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_2 },
|
||||
{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_2 },
|
||||
|
||||
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_4 },
|
||||
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_5 },
|
||||
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_5 },
|
||||
{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_6 },
|
||||
|
||||
{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue