From 435140a0ac6e761cade9cf78dc747ab2c73594af Mon Sep 17 00:00:00 2001 From: Zoltan Herczeg Date: Mon, 3 Jan 2022 16:49:26 +0100 Subject: [PATCH] Fix script extension support on jit (#69) Co-authored-by: Zoltan Herczeg --- src/pcre2_jit_compile.c | 82 ++++++++++++++++++++++++++++++++--------- src/pcre2_jit_test.c | 2 + testdata/testinput4 | 8 ---- testdata/testinput5 | 4 -- testdata/testoutput4 | 8 ---- testdata/testoutput5 | 4 -- 6 files changed, 67 insertions(+), 41 deletions(-) diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 06dc0a5..50f3820 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -7413,14 +7413,18 @@ return cc; static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr); #ifdef SUPPORT_UNICODE -#define XCLASS_SAVE_CHAR 0x01 -#define XCLASS_CHAR_SAVED 0x02 -#define XCLASS_HAS_TYPE 0x04 -#define XCLASS_HAS_SCRIPT 0x08 -#define XCLASS_HAS_SCRIPT_EXTENSION 0x10 -#define XCLASS_HAS_BIDICO 0x20 -#define XCLASS_HAS_BIDICL 0x40 +#define XCLASS_SAVE_CHAR 0x001 +#define XCLASS_CHAR_SAVED 0x002 +#define XCLASS_HAS_TYPE 0x004 +#define XCLASS_HAS_SCRIPT 0x008 +#define XCLASS_HAS_SCRIPT_EXTENSION 0x010 +#define XCLASS_HAS_BIDICO 0x020 +#define XCLASS_HAS_BIDICL 0x040 #define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL) +#define XCLASS_SCRIPT_EXTENSION_NOTPROP 0x080 +#define XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR 0x100 +#define XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0 0x200 + #endif /* SUPPORT_UNICODE */ static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks) @@ -7521,6 +7525,11 @@ while (*cc != XCL_END) case PT_SCX: unicode_status |= XCLASS_HAS_SCRIPT_EXTENSION; + if (cc[-1] == XCL_NOTPROP) + { + unicode_status |= XCLASS_SCRIPT_EXTENSION_NOTPROP; + break; + } compares++; case PT_SC: @@ -7679,14 +7688,19 @@ if (unicode_status & XCLASS_NEEDS_UCD) { SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP); cc++; - if (*cc == PT_SC || *cc == PT_SCX) + switch (*cc) { + case PT_SCX: + if (cc[-1] == XCL_NOTPROP) + break; + + case PT_SC: compares--; invertcmp = (compares == 0 && list != backtracks); if (cc[-1] == XCL_NOTPROP) invertcmp ^= 0x1; - jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]); - add_jump(compiler, compares > 0 ? list : backtracks, jump); + + add_jump(compiler, compares > 0 ? list : backtracks, CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1])); } cc += 2; } @@ -7697,6 +7711,27 @@ if (unicode_status & XCLASS_NEEDS_UCD) if (unicode_status & XCLASS_HAS_SCRIPT_EXTENSION) { + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx)); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2); + + if (unicode_status & XCLASS_SCRIPT_EXTENSION_NOTPROP) + { + if (unicode_status & (XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL | XCLASS_HAS_TYPE)) + { + if (unicode_status & XCLASS_SAVE_CHAR) + { + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, TMP2, 0); + unicode_status |= XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0; + } + else + { + OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP2, 0); + unicode_status |= XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR; + } + } + OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script)); + } + while (*cc != XCL_END) { if (*cc == XCL_SINGLE) @@ -7716,22 +7751,35 @@ if (unicode_status & XCLASS_NEEDS_UCD) cc++; if (*cc == PT_SCX) { - OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx)); - OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2); - OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5))); - OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f)); - compares--; invertcmp = (compares == 0 && list != backtracks); + + jump = NULL; if (cc[-1] == XCL_NOTPROP) + { + jump = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, (int)cc[1]); + if (invertcmp) + { + add_jump(compiler, backtracks, jump); + jump = NULL; + } invertcmp ^= 0x1; - jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); - add_jump(compiler, compares > 0 ? list : backtracks, jump); + } + + OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f)); + add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp)); + + if (jump != NULL) + JUMPHERE(jump); } cc += 2; } } + if (unicode_status & XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0) + OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); + else if (unicode_status & XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR) + OP1(SLJIT_MOV, TMP2, 0, RETURN_ADDR, 0); cc = ccbegin; } diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c index 1f60225..3b57ce2 100644 --- a/src/pcre2_jit_test.c +++ b/src/pcre2_jit_test.c @@ -413,6 +413,8 @@ static struct regression_test_case regression_test_cases[] = { { MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" }, { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" }, { MUP, 0, 0, 0 | F_NOMATCH, "[^\\p{Hangul}\\p{Z}]", " " }, + { MUP, 0, 0, 0, "[\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" }, + { MUP, 0, 0, 0, "[\\x{a92e}\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" }, /* Possible empty brackets. */ { MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" }, diff --git a/testdata/testinput4 b/testdata/testinput4 index 69c5475..654176f 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -1144,8 +1144,6 @@ \= Expect no match X\x{06e9} -#subject no_jit - /^\P{Katakana}+/utf \x{3105} \= Expect no match @@ -1157,8 +1155,6 @@ \x{a014} \x{a4c6} -#subject -no_jit - /^\p{Any}X/utf AXYZ \x{1234}XYZ @@ -1410,8 +1406,6 @@ \x{2116} \x{1D183} -#subject no_jit - /^\p{Inherited}/utf \x{200c} \= Expect no match @@ -1464,8 +1458,6 @@ /\p{sc:katakana}{3,}?/utf \x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC -#subject -no_jit - /\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf \x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}==== diff --git a/testdata/testinput5 b/testdata/testinput5 index 3f62216..2a2e3fa 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -2035,8 +2035,6 @@ # doesn't recognize all these scripts. In time these three tests can be moved # to test 4. -#subject no_jit - /^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+) (\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+) (\p{Zanabazar_Square}+)/x,utf @@ -2085,8 +2083,6 @@ \x{655} \x{1D1AA} -#subject -no_jit - /\N{U+}/ /\N{U+}/utf diff --git a/testdata/testoutput4 b/testdata/testoutput4 index a4d5662..b6798d7 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -1892,8 +1892,6 @@ No match X\x{06e9} No match -#subject no_jit - /^\P{Katakana}+/utf \x{3105} 0: \x{3105} @@ -1910,8 +1908,6 @@ No match \x{a4c6} No match -#subject -no_jit - /^\p{Any}X/utf AXYZ 0: AX @@ -2312,8 +2308,6 @@ No match \x{1D183} 0: \x{1d183} -#subject no_jit - /^\p{Inherited}/utf \x{200c} 0: \x{200c} @@ -2392,8 +2386,6 @@ No match \x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC 0: \x{30a1}\x{30fa}\x{32d0} -#subject -no_jit - /\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf \x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}==== 0: \x{102a4}\x{aa52}\x{a91d}\x{1c46}\x{10283}\x{1092e}\x{1c6b}\x{a93b}\x{a8bf}\x{1ba0}\x{a50a} diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 8382203..9d70a34 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -4599,8 +4599,6 @@ No match # doesn't recognize all these scripts. In time these three tests can be moved # to test 4. -#subject no_jit - /^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+) (\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+) (\p{Zanabazar_Square}+)/x,utf @@ -4742,8 +4740,6 @@ Callout 0: last capture = 1 \x{1D1AA} 0: \x{1d1aa} -#subject -no_jit - /\N{U+}/ Failed: error 193 at offset 2: \N{U+dddd} is supported only in Unicode (UTF) mode