From 6614b281bc24831167fae5170f4b4bfed89ff814 Mon Sep 17 00:00:00 2001 From: Zoltan Herczeg Date: Wed, 29 Dec 2021 16:57:32 +0100 Subject: [PATCH] Implement script extension support in JIT. (#66) Fix incorrect operator in GenerateUcd.py (modulo -> bitwise and) Co-authored-by: Zoltan Herczeg --- maint/GenerateUcd.py | 2 +- src/pcre2_jit_compile.c | 54 ++++++++++++++++++++++++-- src/pcre2_ucd.c | 84 ++++++++++++++++++++--------------------- testdata/testinput4 | 4 -- testdata/testinput5 | 4 -- testdata/testoutput4 | 4 -- testdata/testoutput5 | 4 -- 7 files changed, 93 insertions(+), 63 deletions(-) diff --git a/maint/GenerateUcd.py b/maint/GenerateUcd.py index 34cc57b..08f3949 100755 --- a/maint/GenerateUcd.py +++ b/maint/GenerateUcd.py @@ -780,7 +780,7 @@ for d in script_lists: bitwords = [0] * script_list_item_size for idx in d: - bitwords[idx // 32] |= 1 << (idx % 31) + bitwords[idx // 32] |= 1 << (idx & 31) s = " " for x in bitwords: diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 3fb2731..06dc0a5 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -7417,9 +7417,10 @@ static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHA #define XCLASS_CHAR_SAVED 0x02 #define XCLASS_HAS_TYPE 0x04 #define XCLASS_HAS_SCRIPT 0x08 -#define XCLASS_HAS_BIDICO 0x10 -#define XCLASS_HAS_BIDICL 0x20 -#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL) +#define XCLASS_HAS_SCRIPT_EXTENSION 0x10 +#define XCLASS_HAS_BIDICO 0x20 +#define XCLASS_HAS_BIDICL 0x40 +#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL) #endif /* SUPPORT_UNICODE */ static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks) @@ -7518,6 +7519,10 @@ while (*cc != XCL_END) unicode_status |= XCLASS_HAS_TYPE; break; + case PT_SCX: + unicode_status |= XCLASS_HAS_SCRIPT_EXTENSION; + 
compares++; + case PT_SC: unicode_status |= XCLASS_HAS_SCRIPT; break; @@ -7674,7 +7679,7 @@ if (unicode_status & XCLASS_NEEDS_UCD) { SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP); cc++; - if (*cc == PT_SC) + if (*cc == PT_SC || *cc == PT_SCX) { compares--; invertcmp = (compares == 0 && list != backtracks); @@ -7690,6 +7695,46 @@ if (unicode_status & XCLASS_NEEDS_UCD) cc = ccbegin; } + if (unicode_status & XCLASS_HAS_SCRIPT_EXTENSION) + { + while (*cc != XCL_END) + { + if (*cc == XCL_SINGLE) + { + cc ++; + GETCHARINCTEST(c, cc); + } + else if (*cc == XCL_RANGE) + { + cc ++; + GETCHARINCTEST(c, cc); + GETCHARINCTEST(c, cc); + } + else + { + SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP); + cc++; + if (*cc == PT_SCX) + { + OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx)); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2); + OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5))); + OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f)); + + compares--; + invertcmp = (compares == 0 && list != backtracks); + if (cc[-1] == XCL_NOTPROP) + invertcmp ^= 0x1; + jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); + add_jump(compiler, compares > 0 ? 
list : backtracks, jump); + } + cc += 2; + } + } + + cc = ccbegin; + } + if (unicode_status & (XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)) { OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, bidi)); @@ -7879,6 +7924,7 @@ while (*cc != XCL_END) break; case PT_SC: + case PT_SCX: case PT_BIDICO: case PT_BIDICL: compares++; diff --git a/src/pcre2_ucd.c b/src/pcre2_ucd.c index 96306e7..fe8619b 100644 --- a/src/pcre2_ucd.c +++ b/src/pcre2_ucd.c @@ -172,66 +172,66 @@ const uint32_t PRIV(ucd_script_sets)[] = { 0x00000000u, 0x00000000u, 0x00000000u, 0x00000002u, 0x00000000u, 0x00000000u, 0x00000100u, 0x00000000u, 0x00000000u, - 0x00000000u, 0x00008000u, 0x00000000u, + 0x00000000u, 0x00004000u, 0x00000000u, 0x00000800u, 0x00000000u, 0x00000000u, 0x00004000u, 0x00000000u, 0x00000000u, 0x00100000u, 0x00000000u, 0x00000000u, - 0x00000000u, 0x00000000u, 0x00000004u, + 0x00000000u, 0x00000000u, 0x00000001u, 0x20000000u, 0x00000000u, 0x00000000u, 0x00000021u, 0x00000000u, 0x00000000u, - 0x00000001u, 0x00000001u, 0x00000000u, - 0x00000001u, 0x00000040u, 0x00000000u, + 0x00000001u, 0x40000000u, 0x00000000u, + 0x00000001u, 0x00000020u, 0x00000000u, 0x20000001u, 0x00000000u, 0x00000000u, - 0x00000001u, 0x00000010u, 0x00000000u, + 0x00000001u, 0x00000008u, 0x00000000u, 0x00000102u, 0x00000000u, 0x00000000u, 0x00004004u, 0x00000000u, 0x00000000u, - 0x00000008u, 0x00000200u, 0x00000000u, + 0x00000008u, 0x00000100u, 0x00000000u, 0x00400040u, 0x00000000u, 0x00000000u, 0x00000480u, 0x00000000u, 0x00000000u, 0x00100080u, 0x00000000u, 0x00000000u, - 0x00000080u, 0x00800000u, 0x00000000u, + 0x00000080u, 0x00400000u, 0x00000000u, 0x20000080u, 0x00000000u, 0x00000000u, - 0x00000100u, 0x00010000u, 0x00000000u, - 0x00000100u, 0x00000000u, 0x00000004u, - 0x00000100u, 0x00002000u, 0x00000000u, - 0x00000100u, 0x00000004u, 0x00000000u, + 0x00000100u, 0x00008000u, 0x00000000u, + 0x00000100u, 0x00000000u, 0x00000001u, + 0x00000100u, 0x00001000u, 
0x00000000u, + 0x00000100u, 0x00000002u, 0x00000000u, 0x00100200u, 0x00000000u, 0x00000000u, - 0x00000000u, 0x00010004u, 0x00000000u, - 0x00001000u, 0x00020000u, 0x00000000u, - 0x00002000u, 0x04000000u, 0x00000000u, + 0x00000000u, 0x00008002u, 0x00000000u, + 0x00001000u, 0x00010000u, 0x00000000u, + 0x00002000u, 0x02000000u, 0x00000000u, 0x00104000u, 0x00000000u, 0x00000000u, 0x000a0000u, 0x00000000u, 0x00000000u, - 0x00040000u, 0x00000000u, 0x00000004u, + 0x00040000u, 0x00000000u, 0x00000001u, 0x01100000u, 0x00000000u, 0x00000000u, - 0x00000000u, 0x00200000u, 0x00000020u, - 0x01000000u, 0x00000080u, 0x00000000u, - 0x20000001u, 0x00000010u, 0x00000000u, - 0x00000001u, 0x00000010u, 0x00000008u, - 0x10000002u, 0x00001000u, 0x00000000u, - 0x02000000u, 0x00001002u, 0x00000000u, - 0x00400040u, 0x00000000u, 0x00000010u, - 0x00400040u, 0x00080000u, 0x00000000u, - 0x00040100u, 0x00010000u, 0x00000000u, - 0x00100100u, 0x00010000u, 0x00000000u, + 0x00000000u, 0x00100000u, 0x00000008u, + 0x01000000u, 0x00000040u, 0x00000000u, + 0x20000001u, 0x00000008u, 0x00000000u, + 0x00000001u, 0x00000008u, 0x00000002u, + 0x10000002u, 0x00000800u, 0x00000000u, + 0x02000000u, 0x00000801u, 0x00000000u, + 0x00400040u, 0x00000000u, 0x00000004u, + 0x00400040u, 0x00040000u, 0x00000000u, + 0x00040100u, 0x00008000u, 0x00000000u, + 0x00100100u, 0x00008000u, 0x00000000u, 0x000a4000u, 0x00000000u, 0x00000000u, - 0x02100000u, 0x00000100u, 0x00000000u, - 0x00040102u, 0x00010000u, 0x00000000u, - 0x40010011u, 0x00000000u, 0x00000000u, - 0x00000100u, 0x20100400u, 0x00000000u, + 0x02100000u, 0x00000080u, 0x00000000u, + 0x00040102u, 0x00008000u, 0x00000000u, + 0xc0010010u, 0x00000000u, 0x00000000u, + 0x00000100u, 0x10080200u, 0x00000000u, 0x000ac004u, 0x00000000u, 0x00000000u, - 0x20000001u, 0x00000051u, 0x00000008u, - 0x000ac004u, 0x00000020u, 0x00000000u, - 0x04840100u, 0x0000000cu, 0x00000000u, - 0x20000001u, 0x08000051u, 0x00000008u, - 0x04040102u, 0x02010008u, 0x00000004u, - 0x20000001u, 0x09200803u, 
0x00000020u, - 0x00003100u, 0x22564400u, 0x00000000u, - 0x04943102u, 0x0201000cu, 0x00000000u, - 0x04943102u, 0x0201200cu, 0x00000000u, - 0x00043100u, 0x22564400u, 0x00000004u, - 0x00843100u, 0x22564400u, 0x00000004u, - 0x1c843102u, 0x7215400cu, 0x00000004u, - 0x1ca43102u, 0x7215400cu, 0x00000004u, + 0x20000001u, 0x40000028u, 0x00000002u, + 0x000ac004u, 0x00000010u, 0x00000000u, + 0x04840100u, 0x00000006u, 0x00000000u, + 0x20000001u, 0x44000028u, 0x00000002u, + 0x04040102u, 0x01008004u, 0x00000001u, + 0x20000001u, 0xc4900400u, 0x00000008u, + 0x00003100u, 0x112b2200u, 0x00000000u, + 0x04943102u, 0x01008006u, 0x00000000u, + 0x04943102u, 0x01009006u, 0x00000000u, + 0x00043100u, 0x112b2200u, 0x00000001u, + 0x00843100u, 0x112b2200u, 0x00000001u, + 0x1c843102u, 0x390aa006u, 0x00000001u, + 0x1ca43102u, 0x390aa006u, 0x00000001u, }; /* These are the main two-stage UCD tables. The fields in each record are: diff --git a/testdata/testinput4 b/testdata/testinput4 index 6f424ce..6a2430a 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -1133,8 +1133,6 @@ A\x{300}\x{301}\x{302}BC \x{300} -#subject no_jit - /^\p{Han}+/utf \x{2e81}\x{3007}\x{2f804}\x{31a0} \= Expect no match @@ -1157,8 +1155,6 @@ \x{a014} \x{a4c6} -#subject -no_jit - /^\p{Any}X/utf AXYZ \x{1234}XYZ diff --git a/testdata/testinput5 b/testdata/testinput5 index 6f4948a..33204d6 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -1337,8 +1337,6 @@ # These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE -#subject no_jit - /^[\p{Batak}]/utf \x{1bc0} \x{1bff} @@ -1358,8 +1356,6 @@ \x{85c} \x{85d} -#subject -no_jit - /(\X*)(.)/s,utf A\x{300} diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 5fe1bc8..a4d919e 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -1876,8 +1876,6 @@ No match \x{300} 0: \x{300} -#subject no_jit - /^\p{Han}+/utf \x{2e81}\x{3007}\x{2f804}\x{31a0} 0: \x{2e81}\x{3007}\x{2f804} @@ -1910,8 +1908,6 @@ No match \x{a4c6} No match -#subject 
-no_jit - /^\p{Any}X/utf AXYZ 0: AX diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 9936db3..ab8a185 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -2842,8 +2842,6 @@ No match # These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE -#subject no_jit - /^[\p{Batak}]/utf \x{1bc0} 0: \x{1bc0} @@ -2873,8 +2871,6 @@ No match \x{85d} No match -#subject -no_jit - /(\X*)(.)/s,utf A\x{300} 0: A