From bdac9df4cfeaa66c0a76366b4f6ab004edc86048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Herczeg?= Date: Fri, 31 Mar 2017 05:40:37 +0000 Subject: [PATCH] Fix character type detection when 32-bit and UCP are enabled but UTF is not in JIT. --- src/pcre2_jit_compile.c | 31 +++++++++++++++++++++++++++++++ testdata/testinput12 | 2 +- testdata/testoutput12-16 | 2 +- testdata/testoutput12-32 | 2 +- 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 846510a..e93143d 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -588,6 +588,8 @@ the start pointers when the end of the capturing group has not yet reached. */ #define READ_CHAR_MAX 0x7fffffff +#define INVALID_UTF_CHAR 888 + static PCRE2_SPTR bracketend(PCRE2_SPTR cc) { SLJIT_ASSERT((*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NOT) || (*cc >= OP_ONCE && *cc <= OP_SCOND)); @@ -3558,10 +3560,30 @@ static void do_getucd(compiler_common *common) /* Search the UCD record for the character comes in TMP1. Returns chartype in TMP1 and UCD offset in TMP2. */ DEFINE_COMPILER; +#if PCRE2_CODE_UNIT_WIDTH == 32 +struct sljit_jump *jump; +#endif + +#if defined SLJIT_DEBUG && SLJIT_DEBUG +/* dummy_ucd_record */ +const ucd_record *record = GET_UCD(INVALID_UTF_CHAR); +SLJIT_ASSERT(record->script == ucp_Common && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther); +SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0); +#endif SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 8); sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); + +#if PCRE2_CODE_UNIT_WIDTH == 32 +if (!common->utf) + { + jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); + JUMPHERE(jump); + } +#endif + OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK); @@ -5969,6 +5991,15 @@ if (needstype || needsscript) if (needschar && !charsaved) OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0); +#if PCRE2_CODE_UNIT_WIDTH == 32 + if (!common->utf) + { + jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1); + OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR); + JUMPHERE(jump); + } +#endif + OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK); diff --git a/testdata/testinput12 b/testdata/testinput12 index decfe82..cca5dfa 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -361,6 +361,6 @@ /[\s[:^ascii:]]/B,ucp /\pP/ucp - \x{7fffffff}\=no_jit + \x{7fffffff} # End of testinput12 diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index 41e0a48..33b8a33 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -1416,7 +1416,7 @@ No match ------------------------------------------------------------------ /\pP/ucp - \x{7fffffff}\=no_jit + \x{7fffffff} ** Character \x{7fffffff} is greater than 0xffff and UTF-16 mode is not enabled. ** Truncation will probably give the wrong result. No match diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index e9130b9..1abeb59 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1410,7 +1410,7 @@ No match ------------------------------------------------------------------ /\pP/ucp - \x{7fffffff}\=no_jit + \x{7fffffff} No match # End of testinput12