Fix script extension support on jit (#69)
Co-authored-by: Zoltan Herczeg <hzmester@freemail.hu>
This commit is contained in:
parent
c24047f15d
commit
435140a0ac
|
@ -7413,14 +7413,18 @@ return cc;
|
||||||
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr);
|
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr);
|
||||||
|
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
#define XCLASS_SAVE_CHAR 0x01
|
#define XCLASS_SAVE_CHAR 0x001
|
||||||
#define XCLASS_CHAR_SAVED 0x02
|
#define XCLASS_CHAR_SAVED 0x002
|
||||||
#define XCLASS_HAS_TYPE 0x04
|
#define XCLASS_HAS_TYPE 0x004
|
||||||
#define XCLASS_HAS_SCRIPT 0x08
|
#define XCLASS_HAS_SCRIPT 0x008
|
||||||
#define XCLASS_HAS_SCRIPT_EXTENSION 0x10
|
#define XCLASS_HAS_SCRIPT_EXTENSION 0x010
|
||||||
#define XCLASS_HAS_BIDICO 0x20
|
#define XCLASS_HAS_BIDICO 0x020
|
||||||
#define XCLASS_HAS_BIDICL 0x40
|
#define XCLASS_HAS_BIDICL 0x040
|
||||||
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)
|
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)
|
||||||
|
#define XCLASS_SCRIPT_EXTENSION_NOTPROP 0x080
|
||||||
|
#define XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR 0x100
|
||||||
|
#define XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0 0x200
|
||||||
|
|
||||||
#endif /* SUPPORT_UNICODE */
|
#endif /* SUPPORT_UNICODE */
|
||||||
|
|
||||||
static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
|
static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
|
||||||
|
@ -7521,6 +7525,11 @@ while (*cc != XCL_END)
|
||||||
|
|
||||||
case PT_SCX:
|
case PT_SCX:
|
||||||
unicode_status |= XCLASS_HAS_SCRIPT_EXTENSION;
|
unicode_status |= XCLASS_HAS_SCRIPT_EXTENSION;
|
||||||
|
if (cc[-1] == XCL_NOTPROP)
|
||||||
|
{
|
||||||
|
unicode_status |= XCLASS_SCRIPT_EXTENSION_NOTPROP;
|
||||||
|
break;
|
||||||
|
}
|
||||||
compares++;
|
compares++;
|
||||||
|
|
||||||
case PT_SC:
|
case PT_SC:
|
||||||
|
@ -7679,14 +7688,19 @@ if (unicode_status & XCLASS_NEEDS_UCD)
|
||||||
{
|
{
|
||||||
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
|
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
|
||||||
cc++;
|
cc++;
|
||||||
if (*cc == PT_SC || *cc == PT_SCX)
|
switch (*cc)
|
||||||
{
|
{
|
||||||
|
case PT_SCX:
|
||||||
|
if (cc[-1] == XCL_NOTPROP)
|
||||||
|
break;
|
||||||
|
|
||||||
|
case PT_SC:
|
||||||
compares--;
|
compares--;
|
||||||
invertcmp = (compares == 0 && list != backtracks);
|
invertcmp = (compares == 0 && list != backtracks);
|
||||||
if (cc[-1] == XCL_NOTPROP)
|
if (cc[-1] == XCL_NOTPROP)
|
||||||
invertcmp ^= 0x1;
|
invertcmp ^= 0x1;
|
||||||
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]);
|
|
||||||
add_jump(compiler, compares > 0 ? list : backtracks, jump);
|
add_jump(compiler, compares > 0 ? list : backtracks, CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]));
|
||||||
}
|
}
|
||||||
cc += 2;
|
cc += 2;
|
||||||
}
|
}
|
||||||
|
@ -7697,6 +7711,27 @@ if (unicode_status & XCLASS_NEEDS_UCD)
|
||||||
|
|
||||||
if (unicode_status & XCLASS_HAS_SCRIPT_EXTENSION)
|
if (unicode_status & XCLASS_HAS_SCRIPT_EXTENSION)
|
||||||
{
|
{
|
||||||
|
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
|
||||||
|
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
|
||||||
|
|
||||||
|
if (unicode_status & XCLASS_SCRIPT_EXTENSION_NOTPROP)
|
||||||
|
{
|
||||||
|
if (unicode_status & (XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL | XCLASS_HAS_TYPE))
|
||||||
|
{
|
||||||
|
if (unicode_status & XCLASS_SAVE_CHAR)
|
||||||
|
{
|
||||||
|
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, TMP2, 0);
|
||||||
|
unicode_status |= XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP2, 0);
|
||||||
|
unicode_status |= XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
|
||||||
|
}
|
||||||
|
|
||||||
while (*cc != XCL_END)
|
while (*cc != XCL_END)
|
||||||
{
|
{
|
||||||
if (*cc == XCL_SINGLE)
|
if (*cc == XCL_SINGLE)
|
||||||
|
@ -7716,22 +7751,35 @@ if (unicode_status & XCLASS_NEEDS_UCD)
|
||||||
cc++;
|
cc++;
|
||||||
if (*cc == PT_SCX)
|
if (*cc == PT_SCX)
|
||||||
{
|
{
|
||||||
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
|
|
||||||
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
|
|
||||||
OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)));
|
|
||||||
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));
|
|
||||||
|
|
||||||
compares--;
|
compares--;
|
||||||
invertcmp = (compares == 0 && list != backtracks);
|
invertcmp = (compares == 0 && list != backtracks);
|
||||||
|
|
||||||
|
jump = NULL;
|
||||||
if (cc[-1] == XCL_NOTPROP)
|
if (cc[-1] == XCL_NOTPROP)
|
||||||
|
{
|
||||||
|
jump = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, (int)cc[1]);
|
||||||
|
if (invertcmp)
|
||||||
|
{
|
||||||
|
add_jump(compiler, backtracks, jump);
|
||||||
|
jump = NULL;
|
||||||
|
}
|
||||||
invertcmp ^= 0x1;
|
invertcmp ^= 0x1;
|
||||||
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
|
}
|
||||||
add_jump(compiler, compares > 0 ? list : backtracks, jump);
|
|
||||||
|
OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));
|
||||||
|
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));
|
||||||
|
|
||||||
|
if (jump != NULL)
|
||||||
|
JUMPHERE(jump);
|
||||||
}
|
}
|
||||||
cc += 2;
|
cc += 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (unicode_status & XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0)
|
||||||
|
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
|
||||||
|
else if (unicode_status & XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR)
|
||||||
|
OP1(SLJIT_MOV, TMP2, 0, RETURN_ADDR, 0);
|
||||||
cc = ccbegin;
|
cc = ccbegin;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -413,6 +413,8 @@ static struct regression_test_case regression_test_cases[] = {
|
||||||
{ MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
|
{ MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
|
||||||
{ PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" },
|
{ PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" },
|
||||||
{ MUP, 0, 0, 0 | F_NOMATCH, "[^\\p{Hangul}\\p{Z}]", " " },
|
{ MUP, 0, 0, 0 | F_NOMATCH, "[^\\p{Hangul}\\p{Z}]", " " },
|
||||||
|
{ MUP, 0, 0, 0, "[\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },
|
||||||
|
{ MUP, 0, 0, 0, "[\\x{a92e}\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },
|
||||||
|
|
||||||
/* Possible empty brackets. */
|
/* Possible empty brackets. */
|
||||||
{ MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },
|
{ MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },
|
||||||
|
|
|
@ -1144,8 +1144,6 @@
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
X\x{06e9}
|
X\x{06e9}
|
||||||
|
|
||||||
#subject no_jit
|
|
||||||
|
|
||||||
/^\P{Katakana}+/utf
|
/^\P{Katakana}+/utf
|
||||||
\x{3105}
|
\x{3105}
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
|
@ -1157,8 +1155,6 @@
|
||||||
\x{a014}
|
\x{a014}
|
||||||
\x{a4c6}
|
\x{a4c6}
|
||||||
|
|
||||||
#subject -no_jit
|
|
||||||
|
|
||||||
/^\p{Any}X/utf
|
/^\p{Any}X/utf
|
||||||
AXYZ
|
AXYZ
|
||||||
\x{1234}XYZ
|
\x{1234}XYZ
|
||||||
|
@ -1410,8 +1406,6 @@
|
||||||
\x{2116}
|
\x{2116}
|
||||||
\x{1D183}
|
\x{1D183}
|
||||||
|
|
||||||
#subject no_jit
|
|
||||||
|
|
||||||
/^\p{Inherited}/utf
|
/^\p{Inherited}/utf
|
||||||
\x{200c}
|
\x{200c}
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
|
@ -1464,8 +1458,6 @@
|
||||||
/\p{sc:katakana}{3,}?/utf
|
/\p{sc:katakana}{3,}?/utf
|
||||||
\x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC
|
\x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC
|
||||||
|
|
||||||
#subject -no_jit
|
|
||||||
|
|
||||||
/\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf
|
/\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf
|
||||||
\x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}====
|
\x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}====
|
||||||
|
|
||||||
|
|
|
@ -2035,8 +2035,6 @@
|
||||||
# doesn't recognize all these scripts. In time these three tests can be moved
|
# doesn't recognize all these scripts. In time these three tests can be moved
|
||||||
# to test 4.
|
# to test 4.
|
||||||
|
|
||||||
#subject no_jit
|
|
||||||
|
|
||||||
/^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
|
/^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
|
||||||
(\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
|
(\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
|
||||||
(\p{Zanabazar_Square}+)/x,utf
|
(\p{Zanabazar_Square}+)/x,utf
|
||||||
|
@ -2085,8 +2083,6 @@
|
||||||
\x{655}
|
\x{655}
|
||||||
\x{1D1AA}
|
\x{1D1AA}
|
||||||
|
|
||||||
#subject -no_jit
|
|
||||||
|
|
||||||
/\N{U+}/
|
/\N{U+}/
|
||||||
|
|
||||||
/\N{U+}/utf
|
/\N{U+}/utf
|
||||||
|
|
|
@ -1892,8 +1892,6 @@ No match
|
||||||
X\x{06e9}
|
X\x{06e9}
|
||||||
No match
|
No match
|
||||||
|
|
||||||
#subject no_jit
|
|
||||||
|
|
||||||
/^\P{Katakana}+/utf
|
/^\P{Katakana}+/utf
|
||||||
\x{3105}
|
\x{3105}
|
||||||
0: \x{3105}
|
0: \x{3105}
|
||||||
|
@ -1910,8 +1908,6 @@ No match
|
||||||
\x{a4c6}
|
\x{a4c6}
|
||||||
No match
|
No match
|
||||||
|
|
||||||
#subject -no_jit
|
|
||||||
|
|
||||||
/^\p{Any}X/utf
|
/^\p{Any}X/utf
|
||||||
AXYZ
|
AXYZ
|
||||||
0: AX
|
0: AX
|
||||||
|
@ -2312,8 +2308,6 @@ No match
|
||||||
\x{1D183}
|
\x{1D183}
|
||||||
0: \x{1d183}
|
0: \x{1d183}
|
||||||
|
|
||||||
#subject no_jit
|
|
||||||
|
|
||||||
/^\p{Inherited}/utf
|
/^\p{Inherited}/utf
|
||||||
\x{200c}
|
\x{200c}
|
||||||
0: \x{200c}
|
0: \x{200c}
|
||||||
|
@ -2392,8 +2386,6 @@ No match
|
||||||
\x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC
|
\x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC
|
||||||
0: \x{30a1}\x{30fa}\x{32d0}
|
0: \x{30a1}\x{30fa}\x{32d0}
|
||||||
|
|
||||||
#subject -no_jit
|
|
||||||
|
|
||||||
/\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf
|
/\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf
|
||||||
\x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}====
|
\x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}====
|
||||||
0: \x{102a4}\x{aa52}\x{a91d}\x{1c46}\x{10283}\x{1092e}\x{1c6b}\x{a93b}\x{a8bf}\x{1ba0}\x{a50a}
|
0: \x{102a4}\x{aa52}\x{a91d}\x{1c46}\x{10283}\x{1092e}\x{1c6b}\x{a93b}\x{a8bf}\x{1ba0}\x{a50a}
|
||||||
|
|
|
@ -4599,8 +4599,6 @@ No match
|
||||||
# doesn't recognize all these scripts. In time these three tests can be moved
|
# doesn't recognize all these scripts. In time these three tests can be moved
|
||||||
# to test 4.
|
# to test 4.
|
||||||
|
|
||||||
#subject no_jit
|
|
||||||
|
|
||||||
/^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
|
/^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
|
||||||
(\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
|
(\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
|
||||||
(\p{Zanabazar_Square}+)/x,utf
|
(\p{Zanabazar_Square}+)/x,utf
|
||||||
|
@ -4742,8 +4740,6 @@ Callout 0: last capture = 1
|
||||||
\x{1D1AA}
|
\x{1D1AA}
|
||||||
0: \x{1d1aa}
|
0: \x{1d1aa}
|
||||||
|
|
||||||
#subject -no_jit
|
|
||||||
|
|
||||||
/\N{U+}/
|
/\N{U+}/
|
||||||
Failed: error 193 at offset 2: \N{U+dddd} is supported only in Unicode (UTF) mode
|
Failed: error 193 at offset 2: \N{U+dddd} is supported only in Unicode (UTF) mode
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue