Implement script extension support in JIT. (#66)

Fix incorrect operator in GenerateUcd.py (modulo -> bitwise and)

Co-authored-by: Zoltan Herczeg <hzmester@freemail.hu>
This commit is contained in:
Zoltan Herczeg 2021-12-29 16:57:32 +01:00 committed by GitHub
parent afa4756d19
commit 6614b281bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 93 additions and 63 deletions

View File

@ -780,7 +780,7 @@ for d in script_lists:
bitwords = [0] * script_list_item_size
for idx in d:
bitwords[idx // 32] |= 1 << (idx % 31)
bitwords[idx // 32] |= 1 << (idx & 31)
s = " "
for x in bitwords:

View File

@ -7417,9 +7417,10 @@ static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHA
#define XCLASS_CHAR_SAVED 0x02
#define XCLASS_HAS_TYPE 0x04
#define XCLASS_HAS_SCRIPT 0x08
#define XCLASS_HAS_BIDICO 0x10
#define XCLASS_HAS_BIDICL 0x20
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)
#define XCLASS_HAS_SCRIPT_EXTENSION 0x10
#define XCLASS_HAS_BIDICO 0x20
#define XCLASS_HAS_BIDICL 0x40
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)
#endif /* SUPPORT_UNICODE */
static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
@ -7518,6 +7519,10 @@ while (*cc != XCL_END)
unicode_status |= XCLASS_HAS_TYPE;
break;
case PT_SCX:
unicode_status |= XCLASS_HAS_SCRIPT_EXTENSION;
compares++;
case PT_SC:
unicode_status |= XCLASS_HAS_SCRIPT;
break;
@ -7674,7 +7679,7 @@ if (unicode_status & XCLASS_NEEDS_UCD)
{
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
cc++;
if (*cc == PT_SC)
if (*cc == PT_SC || *cc == PT_SCX)
{
compares--;
invertcmp = (compares == 0 && list != backtracks);
@ -7690,6 +7695,46 @@ if (unicode_status & XCLASS_NEEDS_UCD)
cc = ccbegin;
}
if (unicode_status & XCLASS_HAS_SCRIPT_EXTENSION)
{
while (*cc != XCL_END)
{
if (*cc == XCL_SINGLE)
{
cc ++;
GETCHARINCTEST(c, cc);
}
else if (*cc == XCL_RANGE)
{
cc ++;
GETCHARINCTEST(c, cc);
GETCHARINCTEST(c, cc);
}
else
{
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
cc++;
if (*cc == PT_SCX)
{
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)));
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));
compares--;
invertcmp = (compares == 0 && list != backtracks);
if (cc[-1] == XCL_NOTPROP)
invertcmp ^= 0x1;
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
add_jump(compiler, compares > 0 ? list : backtracks, jump);
}
cc += 2;
}
}
cc = ccbegin;
}
if (unicode_status & (XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL))
{
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, bidi));
@ -7879,6 +7924,7 @@ while (*cc != XCL_END)
break;
case PT_SC:
case PT_SCX:
case PT_BIDICO:
case PT_BIDICL:
compares++;

View File

@ -172,66 +172,66 @@ const uint32_t PRIV(ucd_script_sets)[] = {
0x00000000u, 0x00000000u, 0x00000000u,
0x00000002u, 0x00000000u, 0x00000000u,
0x00000100u, 0x00000000u, 0x00000000u,
0x00000000u, 0x00008000u, 0x00000000u,
0x00000000u, 0x00004000u, 0x00000000u,
0x00000800u, 0x00000000u, 0x00000000u,
0x00004000u, 0x00000000u, 0x00000000u,
0x00100000u, 0x00000000u, 0x00000000u,
0x00000000u, 0x00000000u, 0x00000004u,
0x00000000u, 0x00000000u, 0x00000001u,
0x20000000u, 0x00000000u, 0x00000000u,
0x00000021u, 0x00000000u, 0x00000000u,
0x00000001u, 0x00000001u, 0x00000000u,
0x00000001u, 0x00000040u, 0x00000000u,
0x00000001u, 0x40000000u, 0x00000000u,
0x00000001u, 0x00000020u, 0x00000000u,
0x20000001u, 0x00000000u, 0x00000000u,
0x00000001u, 0x00000010u, 0x00000000u,
0x00000001u, 0x00000008u, 0x00000000u,
0x00000102u, 0x00000000u, 0x00000000u,
0x00004004u, 0x00000000u, 0x00000000u,
0x00000008u, 0x00000200u, 0x00000000u,
0x00000008u, 0x00000100u, 0x00000000u,
0x00400040u, 0x00000000u, 0x00000000u,
0x00000480u, 0x00000000u, 0x00000000u,
0x00100080u, 0x00000000u, 0x00000000u,
0x00000080u, 0x00800000u, 0x00000000u,
0x00000080u, 0x00400000u, 0x00000000u,
0x20000080u, 0x00000000u, 0x00000000u,
0x00000100u, 0x00010000u, 0x00000000u,
0x00000100u, 0x00000000u, 0x00000004u,
0x00000100u, 0x00002000u, 0x00000000u,
0x00000100u, 0x00000004u, 0x00000000u,
0x00000100u, 0x00008000u, 0x00000000u,
0x00000100u, 0x00000000u, 0x00000001u,
0x00000100u, 0x00001000u, 0x00000000u,
0x00000100u, 0x00000002u, 0x00000000u,
0x00100200u, 0x00000000u, 0x00000000u,
0x00000000u, 0x00010004u, 0x00000000u,
0x00001000u, 0x00020000u, 0x00000000u,
0x00002000u, 0x04000000u, 0x00000000u,
0x00000000u, 0x00008002u, 0x00000000u,
0x00001000u, 0x00010000u, 0x00000000u,
0x00002000u, 0x02000000u, 0x00000000u,
0x00104000u, 0x00000000u, 0x00000000u,
0x000a0000u, 0x00000000u, 0x00000000u,
0x00040000u, 0x00000000u, 0x00000004u,
0x00040000u, 0x00000000u, 0x00000001u,
0x01100000u, 0x00000000u, 0x00000000u,
0x00000000u, 0x00200000u, 0x00000020u,
0x01000000u, 0x00000080u, 0x00000000u,
0x20000001u, 0x00000010u, 0x00000000u,
0x00000001u, 0x00000010u, 0x00000008u,
0x10000002u, 0x00001000u, 0x00000000u,
0x02000000u, 0x00001002u, 0x00000000u,
0x00400040u, 0x00000000u, 0x00000010u,
0x00400040u, 0x00080000u, 0x00000000u,
0x00040100u, 0x00010000u, 0x00000000u,
0x00100100u, 0x00010000u, 0x00000000u,
0x00000000u, 0x00100000u, 0x00000008u,
0x01000000u, 0x00000040u, 0x00000000u,
0x20000001u, 0x00000008u, 0x00000000u,
0x00000001u, 0x00000008u, 0x00000002u,
0x10000002u, 0x00000800u, 0x00000000u,
0x02000000u, 0x00000801u, 0x00000000u,
0x00400040u, 0x00000000u, 0x00000004u,
0x00400040u, 0x00040000u, 0x00000000u,
0x00040100u, 0x00008000u, 0x00000000u,
0x00100100u, 0x00008000u, 0x00000000u,
0x000a4000u, 0x00000000u, 0x00000000u,
0x02100000u, 0x00000100u, 0x00000000u,
0x00040102u, 0x00010000u, 0x00000000u,
0x40010011u, 0x00000000u, 0x00000000u,
0x00000100u, 0x20100400u, 0x00000000u,
0x02100000u, 0x00000080u, 0x00000000u,
0x00040102u, 0x00008000u, 0x00000000u,
0xc0010010u, 0x00000000u, 0x00000000u,
0x00000100u, 0x10080200u, 0x00000000u,
0x000ac004u, 0x00000000u, 0x00000000u,
0x20000001u, 0x00000051u, 0x00000008u,
0x000ac004u, 0x00000020u, 0x00000000u,
0x04840100u, 0x0000000cu, 0x00000000u,
0x20000001u, 0x08000051u, 0x00000008u,
0x04040102u, 0x02010008u, 0x00000004u,
0x20000001u, 0x09200803u, 0x00000020u,
0x00003100u, 0x22564400u, 0x00000000u,
0x04943102u, 0x0201000cu, 0x00000000u,
0x04943102u, 0x0201200cu, 0x00000000u,
0x00043100u, 0x22564400u, 0x00000004u,
0x00843100u, 0x22564400u, 0x00000004u,
0x1c843102u, 0x7215400cu, 0x00000004u,
0x1ca43102u, 0x7215400cu, 0x00000004u,
0x20000001u, 0x40000028u, 0x00000002u,
0x000ac004u, 0x00000010u, 0x00000000u,
0x04840100u, 0x00000006u, 0x00000000u,
0x20000001u, 0x44000028u, 0x00000002u,
0x04040102u, 0x01008004u, 0x00000001u,
0x20000001u, 0xc4900400u, 0x00000008u,
0x00003100u, 0x112b2200u, 0x00000000u,
0x04943102u, 0x01008006u, 0x00000000u,
0x04943102u, 0x01009006u, 0x00000000u,
0x00043100u, 0x112b2200u, 0x00000001u,
0x00843100u, 0x112b2200u, 0x00000001u,
0x1c843102u, 0x390aa006u, 0x00000001u,
0x1ca43102u, 0x390aa006u, 0x00000001u,
};
/* These are the main two-stage UCD tables. The fields in each record are:

4
testdata/testinput4 vendored
View File

@ -1133,8 +1133,6 @@
A\x{300}\x{301}\x{302}BC
\x{300}
#subject no_jit
/^\p{Han}+/utf
\x{2e81}\x{3007}\x{2f804}\x{31a0}
\= Expect no match
@ -1157,8 +1155,6 @@
\x{a014}
\x{a4c6}
#subject -no_jit
/^\p{Any}X/utf
AXYZ
\x{1234}XYZ

4
testdata/testinput5 vendored
View File

@ -1337,8 +1337,6 @@
# These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE
#subject no_jit
/^[\p{Batak}]/utf
\x{1bc0}
\x{1bff}
@ -1358,8 +1356,6 @@
\x{85c}
\x{85d}
#subject -no_jit
/(\X*)(.)/s,utf
A\x{300}

View File

@ -1876,8 +1876,6 @@ No match
\x{300}
0: \x{300}
#subject no_jit
/^\p{Han}+/utf
\x{2e81}\x{3007}\x{2f804}\x{31a0}
0: \x{2e81}\x{3007}\x{2f804}
@ -1910,8 +1908,6 @@ No match
\x{a4c6}
No match
#subject -no_jit
/^\p{Any}X/utf
AXYZ
0: AX

View File

@ -2842,8 +2842,6 @@ No match
# These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE
#subject no_jit
/^[\p{Batak}]/utf
\x{1bc0}
0: \x{1bc0}
@ -2873,8 +2871,6 @@ No match
\x{85d}
No match
#subject -no_jit
/(\X*)(.)/s,utf
A\x{300}
0: A