Fix script extension support in the JIT compiler (#69)

Co-authored-by: Zoltan Herczeg <hzmester@freemail.hu>
This commit is contained in:
Zoltan Herczeg 2022-01-03 16:49:26 +01:00 committed by GitHub
parent c24047f15d
commit 435140a0ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 67 additions and 41 deletions

View File

@ -7413,14 +7413,18 @@ return cc;
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr); static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr);
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
#define XCLASS_SAVE_CHAR 0x01 #define XCLASS_SAVE_CHAR 0x001
#define XCLASS_CHAR_SAVED 0x02 #define XCLASS_CHAR_SAVED 0x002
#define XCLASS_HAS_TYPE 0x04 #define XCLASS_HAS_TYPE 0x004
#define XCLASS_HAS_SCRIPT 0x08 #define XCLASS_HAS_SCRIPT 0x008
#define XCLASS_HAS_SCRIPT_EXTENSION 0x10 #define XCLASS_HAS_SCRIPT_EXTENSION 0x010
#define XCLASS_HAS_BIDICO 0x20 #define XCLASS_HAS_BIDICO 0x020
#define XCLASS_HAS_BIDICL 0x40 #define XCLASS_HAS_BIDICL 0x040
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL) #define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)
#define XCLASS_SCRIPT_EXTENSION_NOTPROP 0x080
#define XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR 0x100
#define XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0 0x200
#endif /* SUPPORT_UNICODE */ #endif /* SUPPORT_UNICODE */
static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks) static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
@ -7521,6 +7525,11 @@ while (*cc != XCL_END)
case PT_SCX: case PT_SCX:
unicode_status |= XCLASS_HAS_SCRIPT_EXTENSION; unicode_status |= XCLASS_HAS_SCRIPT_EXTENSION;
if (cc[-1] == XCL_NOTPROP)
{
unicode_status |= XCLASS_SCRIPT_EXTENSION_NOTPROP;
break;
}
compares++; compares++;
case PT_SC: case PT_SC:
@ -7679,14 +7688,19 @@ if (unicode_status & XCLASS_NEEDS_UCD)
{ {
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP); SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
cc++; cc++;
if (*cc == PT_SC || *cc == PT_SCX) switch (*cc)
{ {
case PT_SCX:
if (cc[-1] == XCL_NOTPROP)
break;
case PT_SC:
compares--; compares--;
invertcmp = (compares == 0 && list != backtracks); invertcmp = (compares == 0 && list != backtracks);
if (cc[-1] == XCL_NOTPROP) if (cc[-1] == XCL_NOTPROP)
invertcmp ^= 0x1; invertcmp ^= 0x1;
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]);
add_jump(compiler, compares > 0 ? list : backtracks, jump); add_jump(compiler, compares > 0 ? list : backtracks, CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]));
} }
cc += 2; cc += 2;
} }
@ -7697,6 +7711,27 @@ if (unicode_status & XCLASS_NEEDS_UCD)
if (unicode_status & XCLASS_HAS_SCRIPT_EXTENSION) if (unicode_status & XCLASS_HAS_SCRIPT_EXTENSION)
{ {
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
if (unicode_status & XCLASS_SCRIPT_EXTENSION_NOTPROP)
{
if (unicode_status & (XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL | XCLASS_HAS_TYPE))
{
if (unicode_status & XCLASS_SAVE_CHAR)
{
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, TMP2, 0);
unicode_status |= XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0;
}
else
{
OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP2, 0);
unicode_status |= XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR;
}
}
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
}
while (*cc != XCL_END) while (*cc != XCL_END)
{ {
if (*cc == XCL_SINGLE) if (*cc == XCL_SINGLE)
@ -7716,22 +7751,35 @@ if (unicode_status & XCLASS_NEEDS_UCD)
cc++; cc++;
if (*cc == PT_SCX) if (*cc == PT_SCX)
{ {
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)));
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));
compares--; compares--;
invertcmp = (compares == 0 && list != backtracks); invertcmp = (compares == 0 && list != backtracks);
jump = NULL;
if (cc[-1] == XCL_NOTPROP) if (cc[-1] == XCL_NOTPROP)
{
jump = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, (int)cc[1]);
if (invertcmp)
{
add_jump(compiler, backtracks, jump);
jump = NULL;
}
invertcmp ^= 0x1; invertcmp ^= 0x1;
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); }
add_jump(compiler, compares > 0 ? list : backtracks, jump);
OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
if (jump != NULL)
JUMPHERE(jump);
} }
cc += 2; cc += 2;
} }
} }
if (unicode_status & XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0)
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
else if (unicode_status & XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR)
OP1(SLJIT_MOV, TMP2, 0, RETURN_ADDR, 0);
cc = ccbegin; cc = ccbegin;
} }

View File

@ -413,6 +413,8 @@ static struct regression_test_case regression_test_cases[] = {
{ MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" }, { MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
{ PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" }, { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" },
{ MUP, 0, 0, 0 | F_NOMATCH, "[^\\p{Hangul}\\p{Z}]", " " }, { MUP, 0, 0, 0 | F_NOMATCH, "[^\\p{Hangul}\\p{Z}]", " " },
{ MUP, 0, 0, 0, "[\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },
{ MUP, 0, 0, 0, "[\\x{a92e}\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },
/* Possible empty brackets. */ /* Possible empty brackets. */
{ MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" }, { MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },

8
testdata/testinput4 vendored
View File

@ -1144,8 +1144,6 @@
\= Expect no match \= Expect no match
X\x{06e9} X\x{06e9}
#subject no_jit
/^\P{Katakana}+/utf /^\P{Katakana}+/utf
\x{3105} \x{3105}
\= Expect no match \= Expect no match
@ -1157,8 +1155,6 @@
\x{a014} \x{a014}
\x{a4c6} \x{a4c6}
#subject -no_jit
/^\p{Any}X/utf /^\p{Any}X/utf
AXYZ AXYZ
\x{1234}XYZ \x{1234}XYZ
@ -1410,8 +1406,6 @@
\x{2116} \x{2116}
\x{1D183} \x{1D183}
#subject no_jit
/^\p{Inherited}/utf /^\p{Inherited}/utf
\x{200c} \x{200c}
\= Expect no match \= Expect no match
@ -1464,8 +1458,6 @@
/\p{sc:katakana}{3,}?/utf /\p{sc:katakana}{3,}?/utf
\x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC \x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC
#subject -no_jit
/\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf /\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf
\x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}==== \x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}====

4
testdata/testinput5 vendored
View File

@ -2035,8 +2035,6 @@
# doesn't recognize all these scripts. In time these three tests can be moved # doesn't recognize all these scripts. In time these three tests can be moved
# to test 4. # to test 4.
#subject no_jit
/^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+) /^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
(\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+) (\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
(\p{Zanabazar_Square}+)/x,utf (\p{Zanabazar_Square}+)/x,utf
@ -2085,8 +2083,6 @@
\x{655} \x{655}
\x{1D1AA} \x{1D1AA}
#subject -no_jit
/\N{U+}/ /\N{U+}/
/\N{U+}/utf /\N{U+}/utf

View File

@ -1892,8 +1892,6 @@ No match
X\x{06e9} X\x{06e9}
No match No match
#subject no_jit
/^\P{Katakana}+/utf /^\P{Katakana}+/utf
\x{3105} \x{3105}
0: \x{3105} 0: \x{3105}
@ -1910,8 +1908,6 @@ No match
\x{a4c6} \x{a4c6}
No match No match
#subject -no_jit
/^\p{Any}X/utf /^\p{Any}X/utf
AXYZ AXYZ
0: AX 0: AX
@ -2312,8 +2308,6 @@ No match
\x{1D183} \x{1D183}
0: \x{1d183} 0: \x{1d183}
#subject no_jit
/^\p{Inherited}/utf /^\p{Inherited}/utf
\x{200c} \x{200c}
0: \x{200c} 0: \x{200c}
@ -2392,8 +2386,6 @@ No match
\x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC \x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC
0: \x{30a1}\x{30fa}\x{32d0} 0: \x{30a1}\x{30fa}\x{32d0}
#subject -no_jit
/\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf /\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf
\x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}==== \x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}====
0: \x{102a4}\x{aa52}\x{a91d}\x{1c46}\x{10283}\x{1092e}\x{1c6b}\x{a93b}\x{a8bf}\x{1ba0}\x{a50a} 0: \x{102a4}\x{aa52}\x{a91d}\x{1c46}\x{10283}\x{1092e}\x{1c6b}\x{a93b}\x{a8bf}\x{1ba0}\x{a50a}

View File

@ -4599,8 +4599,6 @@ No match
# doesn't recognize all these scripts. In time these three tests can be moved # doesn't recognize all these scripts. In time these three tests can be moved
# to test 4. # to test 4.
#subject no_jit
/^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+) /^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
(\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+) (\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
(\p{Zanabazar_Square}+)/x,utf (\p{Zanabazar_Square}+)/x,utf
@ -4742,8 +4740,6 @@ Callout 0: last capture = 1
\x{1D1AA} \x{1D1AA}
0: \x{1d1aa} 0: \x{1d1aa}
#subject -no_jit
/\N{U+}/ /\N{U+}/
Failed: error 193 at offset 2: \N{U+dddd} is supported only in Unicode (UTF) mode Failed: error 193 at offset 2: \N{U+dddd} is supported only in Unicode (UTF) mode