From b29732063b8f124d1270ecfa4772c6ee285f1be6 Mon Sep 17 00:00:00 2001 From: Philip Hazel Date: Tue, 21 Dec 2021 15:39:46 +0000 Subject: [PATCH] Revised script handling (see ChangeLog) --- ChangeLog | 13 + maint/GenerateUtt.py | 3 +- maint/MultiStage2.py | 72 ++-- maint/ucptest.c | 47 ++- maint/ucptestdata/testoutput1 | 10 +- maint/ucptestdata/testoutput2 | 6 +- src/pcre2_auto_possess.c | 44 ++- src/pcre2_compile.c | 78 ++++- src/pcre2_dfa_match.c | 26 ++ src/pcre2_error.c | 2 +- src/pcre2_internal.h | 59 +++- src/pcre2_match.c | 93 +++++- src/pcre2_printint.c | 7 +- src/pcre2_script_run.c | 128 +++---- src/pcre2_tables.c | 324 +++++++++--------- src/pcre2_ucd.c | 287 ++++++++-------- src/pcre2_ucp.h | 5 +- src/pcre2_xclass.c | 10 + testdata/testinput4 | 598 +++++++++++++++++---------------- testdata/testinput5 | 24 +- testdata/testinput7 | 28 ++ testdata/testoutput4 | 610 ++++++++++++++++++---------------- testdata/testoutput5 | 55 ++- testdata/testoutput7 | 42 +++ 24 files changed, 1507 insertions(+), 1064 deletions(-) diff --git a/ChangeLog b/ChangeLog index fb2be23..ccbdefc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -45,6 +45,19 @@ of applications treat NULL/0 in this way. 16. Very minor code speed up for maximizing character property matches. +17. A number of changes to script matching for \p and \P: + + (a) Script extensions for a character are now coded as a bitmap instead of + a list of script numbers, which should be faster and does not need a + loop. + + (b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms + sc and scx). + + (c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being + the same as \p{scx:scriptname} because this change happened in Perl at + release 5.26. + Version 10.39 29-October-2021 ----------------------------- diff --git a/maint/GenerateUtt.py b/maint/GenerateUtt.py index be2f337..2167569 100755 --- a/maint/GenerateUtt.py +++ b/maint/GenerateUtt.py @@ -32,6 +32,7 @@ # Added support for bidi class and bidi control, 06-December-2021 # This also involved lower casing strings and removing underscores, in # accordance with Unicode's "loose matching" rules, which Perl observes. +# Changed default script type from PT_SC to PT_SCX, 18-December-2021 script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \ 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \ @@ -104,7 +105,7 @@ std_bidiclass_names = stdnames(bidiclass_names) # names. We keep both the standardized name and the original, because the # latter is used for the ucp_xx names. -utt_table = list(zip(std_script_names, script_names, ['PT_SC'] * len(script_names))) +utt_table = list(zip(std_script_names, script_names, ['PT_SCX'] * len(script_names))) utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names))) utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names))) utt_table += list(zip(std_bidiclass_names, bidiclass_names, ['PT_BIDICL'] * len(bidiclass_names))) diff --git a/maint/MultiStage2.py b/maint/MultiStage2.py index 4620fd6..c56e8dd 100755 --- a/maint/MultiStage2.py +++ b/maint/MultiStage2.py @@ -100,6 +100,8 @@ # PCRE2-10.39: Updated for Unicode 14.0.0 # 05-December-2021: Added code to scan DerivedBidiClass.txt for bidi class, # and also PropList.txt for the Bidi_Control property +# 19-December-2021: Reworked script extensions lists to be bit maps instead +# of zero-terminated lists of script numbers. # ---------------------------------------------------------------------------- # # @@ -128,11 +130,12 @@ # in script runs all come from the same set. The first element in the vector # contains the number of subsequent elements, which are in ascending order. # -# The ucd_script_sets vector contains lists of script numbers that are the -# Script Extensions properties of certain characters. Each list is terminated -# by zero (ucp_Unknown). A character with more than one script listed for its -# Script Extension property has a negative value in its record. This is the -# negated offset to the start of the relevant list in the ucd_script_sets +# The ucd_script_sets vector contains bitmaps that represent lists of scripts +# for the Script Extensions properties of certain characters. Each bitmap +# consists of a fixed number of unsigned 32-bit numbers, enough to allocate +# a bit for every known script. A character with more than one script listed +# for its Script Extension property has a negative value in its record. This is +# the negated offset to the start of the relevant bitmap in the ucd_script_sets # vector. # # The ucd_records table contains one instance of every unique record that is @@ -186,15 +189,15 @@ # 3 = ucp_gbExtend => Grapheme break property "Extend" # 0 => Not part of a caseless set # 0 => No other case -# -122 => Script Extension list offset = 122 -# 19 = ucp_bidiNSM => Bidi class non-spacing mark +# -228 => Script Extension list offset = 228 +# 13 = ucp_bidiNSM => Bidi class non-spacing mark # 0 => Dummy value, unused at present # -# At offset 101 in the ucd_script_sets vector we find the list 3, 15, 107, 29, -# and terminator 0. This means that this character is expected to be used with -# any of those scripts, which are Bengali, Devanagari, Grantha, and Kannada. +# At offset 228 in the ucd_script_sets vector we find a bitmap with bits 3, 15, +# 29, and 107 set. This means that this character is expected to be used with +# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha. # -# Philip Hazel, last updated 05 December 2021. +# Philip Hazel, last updated 19 December 2021. ############################################################################## @@ -507,7 +510,7 @@ break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend', # BIDI class property names in the DerivedBidiClass.txt file -bidiclass_names = ['AL', 'AN', 'B', 'BN', 'CS', 'EN', 'ES', 'ET', 'FSI', 'L', +bidiclass_names = ['AL', 'AN', 'B', 'BN', 'CS', 'EN', 'ES', 'ET', 'FSI', 'L', 'LRE', 'LRI', 'LRO', 'NSM', 'ON', 'PDF', 'PDI', 'R', 'RLE', 'RLI', 'RLO', 'S', 'WS' ] @@ -574,7 +577,7 @@ file.close() # file, setting 'Unknown' as the default (this will never be a Script Extension # value), then scan it and fill in the default from Scripts. Code added by PH # in October 2018. Positive values are used for just a single script for a -# code point. Negative values are negated offsets in a list of lists of +# code point. Negative values are negated offsets in a list of bitsets of # multiple scripts. Initialize this list with a single entry, as the zeroth # element is never used. @@ -582,9 +585,22 @@ script_lists = [0] script_abbrevs_default = script_abbrevs.index('Zzzz') scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default) +# Scan all characters and set their default script extension to the main +# script. We also have to adjust negative scriptx values, following a change in +# the way these work. They are currently negated offsets into the script_lists +# list, but have to be changed into indices in the new ucd_script_sets vector, +# which has fixed-size entries. We can compute the new offset by counting the +# zeros that precede the current offset. + for i in range(0, MAX_UNICODE): if scriptx[i] == script_abbrevs_default: scriptx[i] = script[i] + elif scriptx[i] < 0: + count = 1 + for j in range(-scriptx[i], 0, -1): + if script_lists[j] == 0: + count += 1 + scriptx[i] = -count * (int(len(script_names)/32) + 1) # With the addition of the Script Extensions field, we needed some padding to # get the Unicode records up to 12 bytes (multiple of 4). Originally this was a @@ -803,18 +819,30 @@ for d in digitsets: count += 1 print("\n};\n") -print("/* This vector is a list of lists of scripts for the Script Extension") -print("property. Each sublist is zero-terminated. */\n") -print("const uint8_t PRIV(ucd_script_sets)[] = {") +print("/* This vector is a list of script bitsets for the Script Extension") +print("property. */\n") +print("const uint32_t PRIV(ucd_script_sets)[] = {") + +bitword_count = len(script_names)/32 + 1 +bitwords = [0] * int(bitword_count) -count = 0 -print(" /* 0 */", end='') for d in script_lists: - print(" %3d," % d, end='') - count += 1 if d == 0: - print("\n /* %3d */" % count, end='') -print("\n};\n") + s = " " + print(" ", end='') + for x in bitwords: + print("%s" %s, end='') + s = ", " + print("0x%08xu" % x, end='') + print(",\n", end='') + bitwords = [0] * int(bitword_count) + + else: + x = int(d/32) + y = int(d%32) + bitwords[x] = bitwords[x] | (1 << y) + +print("};\n") # Output the main UCD tables. diff --git a/maint/ucptest.c b/maint/ucptest.c index 3c62da3..8a9497d 100644 --- a/maint/ucptest.c +++ b/maint/ucptest.c @@ -308,7 +308,7 @@ const ucp_type_table *u; for (i = 0; i < PRIV(utt_size); i++) { u = PRIV(utt) + i; - if (u->type == PT_SC && u->value == script) break; + if (u->type == PT_SCX && u->value == script) break; } if (i < PRIV(utt_size)) @@ -461,12 +461,30 @@ if (scriptx != script) else { const char *sep = ""; + + +/* const uint8_t *p = PRIV(ucd_script_sets) - scriptx; while (*p != 0) { printf("%s%s", sep, get_scriptname(*p++)); sep = ", "; } +*/ + + const uint32_t *p = PRIV(ucd_script_sets) - scriptx; + for (int i = 0; i < ucp_Script_Count; i++) + { + int x = i/32; + int y = i%32; + + if ((p[x] & (1u<type == PT_SC && strcmp(CS(value + offset), + if (u->type == PT_SCX && strcmp(CS(value + offset), PRIV(utt_names) + u->name_offset) == 0) { c = u->value; @@ -686,11 +704,11 @@ for (c = 0; c <= 0x10ffff; c++) if (scriptx_count > 0) { - const uint8_t *char_scriptx = NULL; + const uint32_t *bits_scriptx = NULL; unsigned int found = 0; int scriptx = UCD_SCRIPTX(c); - if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx; + if (scriptx < 0) bits_scriptx = PRIV(ucd_script_sets) - scriptx; for (i = 0; i < scriptx_count; i++) { @@ -704,15 +722,9 @@ for (c = 0; c <= 0x10ffff; c++) else { - const uint8_t *p; - for (p = char_scriptx; *p != 0; p++) - { - if (scriptx_list[i] == *p) - { - found++; - break; - } - } + int x = scriptx_list[i]/32; + int y = scriptx_list[i]%32; + if ((bits_scriptx[x] & (1u<script) == negated; + case PT_SCX: + scriptx = prop->scriptx; + ok = pdata == prop->script || pdata == (unsigned int)scriptx; + if (!ok && scriptx < 0) + ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, pdata) != 0; + return ok == negated; + /* These are specials */ case PT_ALNUM: @@ -253,14 +263,14 @@ switch(ptype) if (c == *p++) return negated; } break; /* Control never reaches here */ - + /* Haven't yet thought these through. */ case PT_BIDICL: return FALSE; - + case PT_BIDICO: - return FALSE; + return FALSE; } return FALSE; diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index dcc809f..31964db 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -2092,6 +2092,7 @@ PCRE2_SIZE i, bot, top; PCRE2_SPTR ptr = *ptrptr; PCRE2_UCHAR name[50]; PCRE2_UCHAR *vptr = NULL; +uint16_t ptscript = PT_NOTSCRIPT; if (ptr >= cb->end_pattern) goto ERROR_RETURN; c = *ptr++; @@ -2118,8 +2119,9 @@ if (c == CHAR_LEFT_CURLY_BRACKET) if (c == CHAR_NUL) goto ERROR_RETURN; if (c == CHAR_RIGHT_CURLY_BRACKET) break; name[i] = tolower(c); - if (c == ':' || c == '=') vptr = name + i; + if ((c == ':' || c == '=') && vptr == NULL) vptr = name + i; } + if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; name[i] = 0; } @@ -2137,25 +2139,56 @@ else goto ERROR_RETURN; *ptrptr = ptr; /* If the property contains ':' or '=' we have class name and value separately -specified. The only case currently supported is Bidi_Class (synonym BC), for -which the property names are "bidi". */ +specified. The following are supported: + + . Bidi_Class (synonym bc), for which the property names are "bidi". + . Script (synonym sc) for which the property name is the script name + . Script_Extensions (synonym scx), ditto + +As this is a small number, we currently just check the names directly. If this +grows, a sorted table and a switch will be neater. + +For both the script properties, set a PT_xxx value so that (1) they can be +distinguished and (2) invalid script names that happen to be the name of +another property can be diagnosed. */ if (vptr != NULL) { - *vptr = 0; /* Terminate property name */ - if (PRIV(strcmp_c8)(name, "bidiclass") != 0 && - PRIV(strcmp_c8)(name, "bc") != 0) + int offset = 0; + PCRE2_UCHAR sname[8]; + + *vptr = 0; /* Terminate property name */ + if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 || + PRIV(strcmp_c8)(name, STRING_bc) == 0) + { + offset = 4; + sname[0] = CHAR_b; + sname[1] = CHAR_i; /* There is no strcpy_c8 function */ + sname[2] = CHAR_d; + sname[3] = CHAR_i; + } + + else if (PRIV(strcmp_c8)(name, STRING_script) == 0 || + PRIV(strcmp_c8)(name, STRING_sc) == 0) + ptscript = PT_SC; + + else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 || + PRIV(strcmp_c8)(name, STRING_scx) == 0) + ptscript = PT_SCX; + + else { *errorcodeptr = ERR47; return FALSE; } - memmove(name + 4, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR)); - name[1] = 'i'; /* Can't use PRIV(strcpy)() because it adds 0 */ - name[2] = 'd'; - name[3] = 'i'; + + /* Adjust the string in name[] as needed */ + + memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR)); + if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR)); } -/* Search for a recognized property name using binary chop. */ +/* Search for a recognized property using binary chop. */ bot = 0; top = PRIV(utt_size); @@ -2165,16 +2198,27 @@ while (bot < top) int r; i = (bot + top) >> 1; r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset); + + /* When a matching property is found, some extra checking is needed when the + \p{xx:yy} syntax is used and xx is either sc or scx. */ + if (r == 0) { - *ptypeptr = PRIV(utt)[i].type; *pdataptr = PRIV(utt)[i].value; + if (vptr == NULL || ptscript == PT_NOTSCRIPT) + *ptypeptr = PRIV(utt)[i].type; + else + { + if (PRIV(utt)[i].type != PT_SCX) break; /* Non-script found */ + *ptypeptr = ptscript; + } return TRUE; } + if (r > 0) bot = i + 1; else top = i; } -*errorcodeptr = ERR47; /* Unrecognized name */ +*errorcodeptr = ERR47; /* Unrecognized property */ return FALSE; ERROR_RETURN: /* Malformed \P or \p */ @@ -5858,7 +5902,7 @@ for (;; pptr++) case ESC_D: should_flip_negation = TRUE; - for (int i = 0; i < 32; i++) + for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]); break; @@ -5868,7 +5912,7 @@ for (;; pptr++) case ESC_W: should_flip_negation = TRUE; - for (int i = 0; i < 32; i++) + for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~cbits[i+cbit_word]); break; @@ -5885,7 +5929,7 @@ for (;; pptr++) case ESC_S: should_flip_negation = TRUE; - for (int i = 0; i < 32; i++) + for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~cbits[i+cbit_space]); break; @@ -6276,7 +6320,7 @@ for (;; pptr++) bravalue = OP_COND; { int count, index; - unsigned int i; + unsigned int i; PCRE2_SPTR name; named_group *ng = cb->named_groups; uint32_t length = *(++pptr); diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index f0570b9..829f84b 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -1193,6 +1193,12 @@ for (;;) OK = prop->script == code[2]; break; + case PT_SCX: + OK = prop->script == code[2] || prop->scriptx == (int)code[2]; + if (!OK && prop->scriptx < 0) + OK = MAPBIT(PRIV(ucd_script_sets) - prop->scriptx, code[2]) != 0; + break; + /* These are specials for combination cases. */ case PT_ALNUM: @@ -1459,6 +1465,12 @@ for (;;) OK = prop->script == code[3]; break; + case PT_SCX: + OK = prop->script == code[3] || prop->scriptx == (int)code[3]; + if (!OK && prop->scriptx < 0) + OK = MAPBIT(PRIV(ucd_script_sets) - prop->scriptx, code[3]) != 0; + break; + /* These are specials for combination cases. */ case PT_ALNUM: @@ -1708,6 +1720,12 @@ for (;;) OK = prop->script == code[3]; break; + case PT_SCX: + OK = prop->script == code[3] || prop->scriptx == (int)code[3]; + if (!OK && prop->scriptx < 0) + OK = MAPBIT(PRIV(ucd_script_sets) - prop->scriptx, code[3]) != 0; + break; + /* These are specials for combination cases. */ case PT_ALNUM: @@ -1982,6 +2000,14 @@ for (;;) OK = prop->script == code[1 + IMM2_SIZE + 2]; break; + case PT_SCX: + OK = prop->script == code[1 + IMM2_SIZE + 2] || + prop->scriptx == (int)code[1 + IMM2_SIZE + 2]; + if (!OK && prop->scriptx < 0) + OK = MAPBIT(PRIV(ucd_script_sets) - prop->scriptx, + code[1 + IMM2_SIZE + 2]) != 0; + break; + /* These are specials for combination cases. */ case PT_ALNUM: diff --git a/src/pcre2_error.c b/src/pcre2_error.c index d3bb466..09904c0 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -119,7 +119,7 @@ static const unsigned char compile_error_texts[] = /* 45 */ "this version of PCRE2 does not have support for \\P, \\p, or \\X\0" "malformed \\P or \\p sequence\0" - "unknown property name after \\P or \\p\0" + "unknown property after \\P or \\p\0" "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " code units)\0" "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" /* 50 */ diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 2901352..7a8efd9 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -954,6 +954,13 @@ a positive value. */ #define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION=" #define STRING_MARK "MARK" +#define STRING_bc "bc" +#define STRING_bidiclass "bidiclass" +#define STRING_sc "sc" +#define STRING_script "script" +#define STRING_scriptextensions "scriptextensions" +#define STRING_scx "scx" + #else /* SUPPORT_UNICODE */ /* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This @@ -1248,28 +1255,39 @@ only. */ #define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN #define STRING_MARK STR_M STR_A STR_R STR_K +#define STRING_bc STR_b STR_c +#define STRING_bidiclass STR_b STR_i STR_d STR_i STR_c STR_l STR_a STR_s STR_s +#define STRING_sc STR_s STR_c +#define STRING_script STR_s STR_c STR_r STR_i STR_p STR_t +#define STRING_scriptextensions STR_s STR_c STR_r STR_i STR_p STR_t STR_e STR_x STR_t STR_e STR_n STR_s STR_i STR_o STR_n STR_s +#define STRING_scx STR_s STR_c STR_x + + #endif /* SUPPORT_UNICODE */ /* -------------------- End of character and string names -------------------*/ /* -------------------- Definitions for compiled patterns -------------------*/ -/* Codes for different types of Unicode property */ +/* Codes for different types of Unicode property. If these definitions are +changed, the autopossessifying table in pcre2_auto_possess.c must be updated to +match. */ #define PT_ANY 0 /* Any property - matches all chars */ #define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ #define PT_GC 2 /* Specified general characteristic (e.g. L) */ #define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */ -#define PT_SC 4 /* Script (e.g. Han) */ -#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */ -#define PT_SPACE 6 /* Perl space - general category Z plus 9,10,12,13 */ -#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */ -#define PT_WORD 8 /* Word - L plus N plus underscore */ -#define PT_CLIST 9 /* Pseudo-property: match character list */ -#define PT_UCNC 10 /* Universal Character nameable character */ -#define PT_BIDICL 11 /* Specified bidi class */ -#define PT_BIDICO 12 /* Bidi control character */ -#define PT_TABSIZE 13 /* Size of square table for autopossessify tests */ +#define PT_SC 4 /* Script only (e.g. Han) */ +#define PT_SCX 5 /* Script extensions (includes SC) */ +#define PT_ALNUM 6 /* Alphanumeric - the union of L and N */ +#define PT_SPACE 7 /* Perl space - general category Z plus 9,10,12,13 */ +#define PT_PXSPACE 8 /* POSIX space - Z plus 9,10,11,12,13 */ +#define PT_WORD 9 /* Word - L plus N plus underscore */ +#define PT_CLIST 10 /* Pseudo-property: match character list */ +#define PT_UCNC 11 /* Universal Character nameable character */ +#define PT_BIDICL 12 /* Specified bidi class */ +#define PT_BIDICO 13 /* Bidi control character */ +#define PT_TABSIZE 14 /* Size of square table for autopossessify tests */ /* The following special properties are used only in XCLASS items, when POSIX classes are specified and PCRE2_UCP is set - in other words, for Unicode @@ -1277,9 +1295,14 @@ handling of these classes. They are not available via the \p or \P escapes like those in the above list, and so they do not take part in the autopossessifying table. */ -#define PT_PXGRAPH 13 /* [:graph:] - characters that mark the paper */ -#define PT_PXPRINT 14 /* [:print:] - [:graph:] plus non-control spaces */ -#define PT_PXPUNCT 15 /* [:punct:] - punctuation characters */ +#define PT_PXGRAPH 14 /* [:graph:] - characters that mark the paper */ +#define PT_PXPRINT 15 /* [:print:] - [:graph:] plus non-control spaces */ +#define PT_PXPUNCT 16 /* [:punct:] - punctuation characters */ + +/* This value is used when parsing \p and \P escapes to indicate that neither +\p{script:...} nor \p{scx:...} has been encountered. */ + +#define PT_NOTSCRIPT 255 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that contain characters with values greater than 255. */ @@ -1826,6 +1849,12 @@ typedef struct { #define UCD_OTHERCASE(ch) ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case))) #define UCD_SCRIPTX(ch) GET_UCD(ch)->scriptx +/* The "scriptx" field, when negative, gives an offset into a vector of 32-bit +words that form a bitmap representing a list of scripts. This macro tests for a +script in the map by number. */ + +#define MAPBIT(map,script) ((map)[(script)/32]&(1u<<((script)%32))) + /* The "bidi" field has the 0x80 bit set if the character has the Bidi_Control property. The remaining bits hold the bidi class, but as there are only 23 classes, we can mask off 5 bits - leaving two free for the future. */ @@ -1916,7 +1945,7 @@ extern const uint32_t PRIV(hspace_list)[]; extern const uint32_t PRIV(vspace_list)[]; extern const uint32_t PRIV(ucd_caseless_sets)[]; extern const uint32_t PRIV(ucd_digit_sets)[]; -extern const uint8_t PRIV(ucd_script_sets)[]; +extern const uint32_t PRIV(ucd_script_sets)[]; extern const ucd_record PRIV(ucd_records)[]; #if PCRE2_CODE_UNIT_WIDTH == 32 extern const ucd_record PRIV(dummy_ucd_record)[]; diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 58fd815..7b519fd 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -160,7 +160,7 @@ enum { RM100=100, RM101 }; enum { RM200=200, RM201, RM202, RM203, RM204, RM205, RM206, RM207, RM208, RM209, RM210, RM211, RM212, RM213, RM214, RM215, RM216, RM217, RM218, RM219, RM220, RM221, RM222, RM223, - RM224 }; + RM224, RM225 }; #endif /* Define short names for general fields in the current backtrack frame, which @@ -2452,6 +2452,17 @@ fprintf(stderr, "++ op=%d\n", *Fecode); RRETURN(MATCH_NOMATCH); break; + case PT_SCX: + { + int scriptx = prop->scriptx; + BOOL ok = Fecode[2] == prop->script || + Fecode[2] == (unsigned int)scriptx; + if (!ok && scriptx < 0) + ok = MAPBIT((PRIV(ucd_script_sets) - scriptx), Fecode[2]) != 0; + if (ok == notmatch) RRETURN(MATCH_NOMATCH); + } + break; + /* These are specials */ case PT_ALNUM: @@ -2713,6 +2724,28 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } break; + case PT_SCX: + for (i = 1; i <= Lmin; i++) + { + BOOL ok; + int scriptx; + const ucd_record *prop; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + prop = GET_UCD(fc); + scriptx = prop->scriptx; + ok = prop->script == Lpropvalue || scriptx == (int)Lpropvalue; + if (!ok && scriptx < 0) + ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, Lpropvalue) != 0; + if (ok == notmatch) + RRETURN(MATCH_NOMATCH); + } + break; + case PT_ALNUM: for (i = 1; i <= Lmin; i++) { @@ -3385,8 +3418,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (Lmin == Lmax) continue; /* If minimizing, we have to test the rest of the pattern before each - subsequent match. This means we cannot use a local "notmatch" variable as - in the other cases. As all 4 temporary 32-bit values in the frame are + subsequent match. This means we cannot use a local "notmatch" variable as + in the other cases. As all 4 temporary 32-bit values in the frame are already in use, just test the type each time. */ if (reptype == REPTYPE_MIN) @@ -3484,6 +3517,31 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } /* Control never gets here */ + case PT_SCX: + for (;;) + { + BOOL ok; + int scriptx; + const ucd_record *prop; + RMATCH(Fecode, RM225); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH); + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + RRETURN(MATCH_NOMATCH); + } + GETCHARINCTEST(fc, Feptr); + prop = GET_UCD(fc); + scriptx = prop->scriptx; + ok = prop->script == Lpropvalue || scriptx == (int)Lpropvalue; + if (!ok && scriptx < 0) + ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, Lpropvalue) != 0; + if (ok == (Lctype == OP_NOTPROP)) + RRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + case PT_ALNUM: for (;;) { @@ -3947,8 +4005,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } /* If maximizing, it is worth using inline code for speed, doing the type - test once at the start (i.e. keep it out of the loops). Once again, - "notmatch" can be an ordinary local variable because the loops do not call + test once at the start (i.e. keep it out of the loops). Once again, + "notmatch" can be an ordinary local variable because the loops do not call RMATCH. */ else @@ -4041,6 +4099,29 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } break; + case PT_SCX: + for (i = Lmin; i < Lmax; i++) + { + BOOL ok; + const ucd_record *prop; + int scriptx; + int len = 1; + if (Feptr >= mb->end_subject) + { + SCHECK_PARTIAL(); + break; + } + GETCHARLENTEST(fc, Feptr, len); + prop = GET_UCD(fc); + scriptx = prop->scriptx; + ok = prop->script == Lpropvalue || scriptx == (int)Lpropvalue; + if (!ok && scriptx < 0) + ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, Lpropvalue) != 0; + if (ok == notmatch) break; + Feptr+= len; + } + break; + case PT_ALNUM: for (i = Lmin; i < Lmax; i++) { @@ -6172,7 +6253,7 @@ switch (Freturn_id) LBL(200) LBL(201) LBL(202) LBL(203) LBL(204) LBL(205) LBL(206) LBL(207) LBL(208) LBL(209) LBL(210) LBL(211) LBL(212) LBL(213) LBL(214) LBL(215) LBL(216) LBL(217) LBL(218) LBL(219) LBL(220) - LBL(221) LBL(222) LBL(223) LBL(224) + LBL(221) LBL(222) LBL(223) LBL(224) LBL(225) #endif default: diff --git a/src/pcre2_printint.c b/src/pcre2_printint.c index 017c6e0..e213b44 100644 --- a/src/pcre2_printint.c +++ b/src/pcre2_printint.c @@ -237,11 +237,15 @@ get_ucpname(unsigned int ptype, unsigned int pvalue) { #ifdef SUPPORT_UNICODE int i; + +if (ptype == PT_SC) ptype = PT_SCX; /* Table has scx values */ for (i = PRIV(utt_size) - 1; i >= 0; i--) { if (ptype == PRIV(utt)[i].type && pvalue == PRIV(utt)[i].value) break; } + return (i >= 0)? PRIV(utt_names) + PRIV(utt)[i].name_offset : "??"; + #else /* No UTF support */ (void)ptype; (void)pvalue; @@ -273,8 +277,9 @@ print_prop(FILE *f, PCRE2_SPTR code, const char *before, const char *after) { if (code[1] != PT_CLIST) { + const char *sc = (code[1] == PT_SC)? "script:" : ""; const char *s = get_ucpname(code[1], code[2]); - fprintf(f, "%s%s %c%s%s", before, OP_names[*code], toupper(s[0]), s+1, after); + fprintf(f, "%s%s %s%c%s%s", before, OP_names[*code], sc, toupper(s[0]), s+1, after); } else { diff --git a/src/pcre2_script_run.c b/src/pcre2_script_run.c index 91a4833..bee312a 100644 --- a/src/pcre2_script_run.c +++ b/src/pcre2_script_run.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2018 University of Cambridge + New API code Copyright (c) 2016-2021 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -77,17 +77,17 @@ records (and is only likely to be a few hundred). */ #define SCRIPT_HANHIRAKATA (-99997) #define SCRIPT_HANBOPOMOFO (-99996) #define SCRIPT_HANHANGUL (-99995) -#define SCRIPT_LIST (-99994) +#define SCRIPT_MAP (-99994) -#define INTERSECTION_LIST_SIZE 50 +#define MAPSIZE (ucp_Script_Count/32 + 1) BOOL PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf) { #ifdef SUPPORT_UNICODE int require_script = SCRIPT_UNSET; -uint8_t intersection_list[INTERSECTION_LIST_SIZE]; -const uint8_t *require_list = NULL; +uint32_t intersection_map[MAPSIZE]; +const uint32_t *require_map = NULL; uint32_t require_digitset = 0; uint32_t c; @@ -197,20 +197,13 @@ for (;;) if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE; break; - /* We have a list of scripts to check that is derived from one or - more previous characters. This is either one of the lists in + /* We have a bitmap of scripts to check that is derived from one or + more previous characters. This is either one of the maps in ucd_script_sets[] (for one previous character) or the intersection of - several lists for multiple characters. */ + several maps for multiple characters. */ - case SCRIPT_LIST: - { - const uint8_t *list; - for (list = require_list; *list != 0; list++) - { - if (*list == scriptx) break; - } - if (*list == 0) return FALSE; - } + case SCRIPT_MAP: + if (MAPBIT(require_map, scriptx) == 0) return FALSE; /* The rest of the string must be in this script, but we have to allow for the Han complications. */ @@ -249,19 +242,18 @@ for (;;) } /* End of handing positive scriptx */ /* If scriptx is negative, this character is a mark-type character that - has a list of permitted scripts. */ + has a list of permitted scripts, which are encoded in a bitmap. */ else { uint32_t chspecial; - const uint8_t *clist, *rlist; - const uint8_t *list = PRIV(ucd_script_sets) - scriptx; + const uint32_t *map = PRIV(ucd_script_sets) - scriptx; switch(require_script) { case SCRIPT_UNSET: - require_list = PRIV(ucd_script_sets) - scriptx; - require_script = SCRIPT_LIST; + require_map = PRIV(ucd_script_sets) - scriptx; + require_script = SCRIPT_MAP; break; /* An inspection of the Unicode 11.0.0 files shows that there are the @@ -282,17 +274,11 @@ for (;;) case SCRIPT_HANPENDING: chspecial = 0; - for (; *list != 0; list++) - { - switch (*list) - { - case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break; - case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break; - case ucp_Katakana: chspecial |= FOUND_KATAKANA; break; - case ucp_Hangul: chspecial |= FOUND_HANGUL; break; - default: break; - } - } + + if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO; + if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA; + if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA; + if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL; if (chspecial == 0) return FALSE; @@ -311,76 +297,44 @@ for (;;) break; case SCRIPT_HANHIRAKATA: - for (; *list != 0; list++) - { - if (*list == ucp_Hiragana || *list == ucp_Katakana) break; - } - if (*list == 0) return FALSE; - break; + if (MAPBIT(map, ucp_Hiragana) != 0) break; + if (MAPBIT(map, ucp_Katakana) != 0) break; + return FALSE; case SCRIPT_HANBOPOMOFO: - for (; *list != 0; list++) - { - if (*list == ucp_Bopomofo) break; - } - if (*list == 0) return FALSE; - break; + if (MAPBIT(map, ucp_Bopomofo) != 0) break; + return FALSE; case SCRIPT_HANHANGUL: - for (; *list != 0; list++) - { - if (*list == ucp_Hangul) break; - } - if (*list == 0) return FALSE; - break; + if (MAPBIT(map, ucp_Hangul) != 0) break; + return FALSE; /* Previously encountered one or more characters that are allowed with a list of scripts. Build the intersection of the required list - with this character's list in intersection_list[]. This code is - written so that it still works OK if the required list is already in - that vector. */ + with this character's list in intersection_map[]. */ - case SCRIPT_LIST: - { - int i = 0; - for (rlist = require_list; *rlist != 0; rlist++) - { - for (clist = list; *clist != 0; clist++) - { - if (*rlist == *clist) - { - intersection_list[i++] = *rlist; - break; - } - } - } - if (i == 0) return FALSE; /* No scripts in common */ + case SCRIPT_MAP: + for (int i = 0; i < MAPSIZE; i++) + intersection_map[i] = require_map[i] & map[i]; + + /* If there's just one script in common, we could set it as the + unique required script. However, in the new bitmap arrangements, + finding the one script is expensive, so leave this out for now. + Otherwise, make the intersection map the required map. */ - /* If there's just one script in common, we can set it as the - unique required script. Otherwise, terminate the intersection list - and make it the required list. */ + /* + if (onescript >= 0) require_script = onescript; + else require_map = intersection_map; + */ - if (i == 1) - { - require_script = intersection_list[0]; - } - else - { - intersection_list[i] = 0; - require_list = intersection_list; - } - } + require_map = intersection_map; break; /* The previously set required script is a single script, not Han-related. Check that it is in this character's list. */ default: - for (; *list != 0; list++) - { - if (*list == require_script) break; - } - if (*list == 0) return FALSE; + if (MAPBIT(map, require_script) == 0) return FALSE; break; } } /* End of handling negative scriptx */ diff --git a/src/pcre2_tables.c b/src/pcre2_tables.c index e74f857..827e1ea 100644 --- a/src/pcre2_tables.c +++ b/src/pcre2_tables.c @@ -710,19 +710,19 @@ const char PRIV(utt_names)[] = STRING_zs0; const ucp_type_table PRIV(utt)[] = { - { 0, PT_SC, ucp_Adlam }, - { 6, PT_SC, ucp_Ahom }, - { 11, PT_SC, ucp_Anatolian_Hieroglyphs }, + { 0, PT_SCX, ucp_Adlam }, + { 6, PT_SCX, ucp_Ahom }, + { 11, PT_SCX, ucp_Anatolian_Hieroglyphs }, { 32, PT_ANY, 0 }, - { 36, PT_SC, ucp_Arabic }, - { 43, PT_SC, ucp_Armenian }, - { 52, PT_SC, ucp_Avestan }, - { 60, PT_SC, ucp_Balinese }, - { 69, PT_SC, ucp_Bamum }, - { 75, PT_SC, ucp_Bassa_Vah }, - { 84, PT_SC, ucp_Batak }, - { 90, PT_SC, ucp_Bengali }, - { 98, PT_SC, ucp_Bhaiksuki }, + { 36, PT_SCX, ucp_Arabic }, + { 43, PT_SCX, ucp_Armenian }, + { 52, PT_SCX, ucp_Avestan }, + { 60, PT_SCX, ucp_Balinese }, + { 69, PT_SCX, ucp_Bamum }, + { 75, PT_SCX, ucp_Bassa_Vah }, + { 84, PT_SCX, ucp_Batak }, + { 90, PT_SCX, ucp_Bengali }, + { 98, PT_SCX, ucp_Bhaiksuki }, { 108, PT_BIDICL, ucp_bidiAL }, { 115, PT_BIDICL, ucp_bidiAN }, { 122, PT_BIDICL, ucp_bidiB }, @@ -748,197 +748,197 @@ const ucp_type_table PRIV(utt)[] = { { 272, PT_BIDICL, ucp_bidiRLO }, { 280, PT_BIDICL, ucp_bidiS }, { 286, PT_BIDICL, ucp_bidiWS }, - { 293, PT_SC, ucp_Bopomofo }, - { 302, PT_SC, ucp_Brahmi }, - { 309, PT_SC, ucp_Braille }, - { 317, PT_SC, ucp_Buginese }, - { 326, PT_SC, ucp_Buhid }, + { 293, PT_SCX, ucp_Bopomofo }, + { 302, PT_SCX, ucp_Brahmi }, + { 309, PT_SCX, ucp_Braille }, + { 317, PT_SCX, ucp_Buginese }, + { 326, PT_SCX, ucp_Buhid }, { 332, PT_GC, ucp_C }, - { 334, PT_SC, ucp_Canadian_Aboriginal }, - { 353, PT_SC, ucp_Carian }, - { 360, PT_SC, ucp_Caucasian_Albanian }, + { 334, PT_SCX, ucp_Canadian_Aboriginal }, + { 353, PT_SCX, ucp_Carian }, + { 360, PT_SCX, ucp_Caucasian_Albanian }, { 378, PT_PC, ucp_Cc }, { 381, PT_PC, ucp_Cf }, - { 384, PT_SC, ucp_Chakma }, - { 391, PT_SC, ucp_Cham }, - { 396, PT_SC, ucp_Cherokee }, - { 405, PT_SC, ucp_Chorasmian }, + { 384, PT_SCX, ucp_Chakma }, + { 391, PT_SCX, ucp_Cham }, + { 396, PT_SCX, ucp_Cherokee }, + { 405, PT_SCX, ucp_Chorasmian }, { 416, PT_PC, ucp_Cn }, { 419, PT_PC, ucp_Co }, - { 422, PT_SC, ucp_Common }, - { 429, PT_SC, ucp_Coptic }, + { 422, PT_SCX, ucp_Common }, + { 429, PT_SCX, ucp_Coptic }, { 436, PT_PC, ucp_Cs }, - { 439, PT_SC, ucp_Cuneiform }, - { 449, PT_SC, ucp_Cypriot }, - { 457, PT_SC, ucp_Cypro_Minoan }, - { 469, PT_SC, ucp_Cyrillic }, - { 478, PT_SC, ucp_Deseret }, - { 486, PT_SC, ucp_Devanagari }, - { 497, PT_SC, ucp_Dives_Akuru }, - { 508, PT_SC, ucp_Dogra }, - { 514, PT_SC, ucp_Duployan }, - { 523, PT_SC, ucp_Egyptian_Hieroglyphs }, - { 543, PT_SC, ucp_Elbasan }, - { 551, PT_SC, ucp_Elymaic }, - { 559, PT_SC, ucp_Ethiopic }, - { 568, PT_SC, ucp_Georgian }, - { 577, PT_SC, ucp_Glagolitic }, - { 588, PT_SC, ucp_Gothic }, - { 595, PT_SC, ucp_Grantha }, - { 603, PT_SC, ucp_Greek }, - { 609, PT_SC, ucp_Gujarati }, - { 618, PT_SC, ucp_Gunjala_Gondi }, - { 631, PT_SC, ucp_Gurmukhi }, - { 640, PT_SC, ucp_Han }, - { 644, PT_SC, ucp_Hangul }, - { 651, PT_SC, ucp_Hanifi_Rohingya }, - { 666, PT_SC, ucp_Hanunoo }, - { 674, PT_SC, ucp_Hatran }, - { 681, PT_SC, ucp_Hebrew }, - { 688, PT_SC, ucp_Hiragana }, - { 697, PT_SC, ucp_Imperial_Aramaic }, - { 713, PT_SC, ucp_Inherited }, - { 723, PT_SC, ucp_Inscriptional_Pahlavi }, - { 744, PT_SC, ucp_Inscriptional_Parthian }, - { 766, PT_SC, ucp_Javanese }, - { 775, PT_SC, ucp_Kaithi }, - { 782, PT_SC, ucp_Kannada }, - { 790, PT_SC, ucp_Katakana }, - { 799, PT_SC, ucp_Kayah_Li }, - { 807, PT_SC, ucp_Kharoshthi }, - { 818, PT_SC, ucp_Khitan_Small_Script }, - { 836, PT_SC, ucp_Khmer }, - { 842, PT_SC, ucp_Khojki }, - { 849, PT_SC, ucp_Khudawadi }, + { 439, PT_SCX, ucp_Cuneiform }, + { 449, PT_SCX, ucp_Cypriot }, + { 457, PT_SCX, ucp_Cypro_Minoan }, + { 469, PT_SCX, ucp_Cyrillic }, + { 478, PT_SCX, ucp_Deseret }, + { 486, PT_SCX, ucp_Devanagari }, + { 497, PT_SCX, ucp_Dives_Akuru }, + { 508, PT_SCX, ucp_Dogra }, + { 514, PT_SCX, ucp_Duployan }, + { 523, PT_SCX, ucp_Egyptian_Hieroglyphs }, + { 543, PT_SCX, ucp_Elbasan }, + { 551, PT_SCX, ucp_Elymaic }, + { 559, PT_SCX, ucp_Ethiopic }, + { 568, PT_SCX, ucp_Georgian }, + { 577, PT_SCX, ucp_Glagolitic }, + { 588, PT_SCX, ucp_Gothic }, + { 595, PT_SCX, ucp_Grantha }, + { 603, PT_SCX, ucp_Greek }, + { 609, PT_SCX, ucp_Gujarati }, + { 618, PT_SCX, ucp_Gunjala_Gondi }, + { 631, PT_SCX, ucp_Gurmukhi }, + { 640, PT_SCX, ucp_Han }, + { 644, PT_SCX, ucp_Hangul }, + { 651, PT_SCX, ucp_Hanifi_Rohingya }, + { 666, PT_SCX, ucp_Hanunoo }, + { 674, PT_SCX, ucp_Hatran }, + { 681, PT_SCX, ucp_Hebrew }, + { 688, PT_SCX, ucp_Hiragana }, + { 697, PT_SCX, ucp_Imperial_Aramaic }, + { 713, PT_SCX, ucp_Inherited }, + { 723, PT_SCX, ucp_Inscriptional_Pahlavi }, + { 744, PT_SCX, ucp_Inscriptional_Parthian }, + { 766, PT_SCX, ucp_Javanese }, + { 775, PT_SCX, ucp_Kaithi }, + { 782, PT_SCX, ucp_Kannada }, + { 790, PT_SCX, ucp_Katakana }, + { 799, PT_SCX, ucp_Kayah_Li }, + { 807, PT_SCX, ucp_Kharoshthi }, + { 818, PT_SCX, ucp_Khitan_Small_Script }, + { 836, PT_SCX, ucp_Khmer }, + { 842, PT_SCX, ucp_Khojki }, + { 849, PT_SCX, ucp_Khudawadi }, { 859, PT_GC, ucp_L }, { 861, PT_LAMP, 0 }, - { 864, PT_SC, ucp_Lao }, - { 868, PT_SC, ucp_Latin }, + { 864, PT_SCX, ucp_Lao }, + { 868, PT_SCX, ucp_Latin }, { 874, PT_LAMP, 0 }, - { 877, PT_SC, ucp_Lepcha }, - { 884, PT_SC, ucp_Limbu }, - { 890, PT_SC, ucp_Linear_A }, - { 898, PT_SC, ucp_Linear_B }, - { 906, PT_SC, ucp_Lisu }, + { 877, PT_SCX, ucp_Lepcha }, + { 884, PT_SCX, ucp_Limbu }, + { 890, PT_SCX, ucp_Linear_A }, + { 898, PT_SCX, ucp_Linear_B }, + { 906, PT_SCX, ucp_Lisu }, { 911, PT_PC, ucp_Ll }, { 914, PT_PC, ucp_Lm }, { 917, PT_PC, ucp_Lo }, { 920, PT_PC, ucp_Lt }, { 923, PT_PC, ucp_Lu }, - { 926, PT_SC, ucp_Lycian }, - { 933, PT_SC, ucp_Lydian }, + { 926, PT_SCX, ucp_Lycian }, + { 933, PT_SCX, ucp_Lydian }, { 940, PT_GC, ucp_M }, - { 942, PT_SC, ucp_Mahajani }, - { 951, PT_SC, ucp_Makasar }, - { 959, PT_SC, ucp_Malayalam }, - { 969, PT_SC, ucp_Mandaic }, - { 977, PT_SC, ucp_Manichaean }, - { 988, PT_SC, ucp_Marchen }, - { 996, PT_SC, ucp_Masaram_Gondi }, + { 942, PT_SCX, ucp_Mahajani }, + { 951, PT_SCX, ucp_Makasar }, + { 959, PT_SCX, ucp_Malayalam }, + { 969, PT_SCX, ucp_Mandaic }, + { 977, PT_SCX, ucp_Manichaean }, + { 988, PT_SCX, ucp_Marchen }, + { 996, PT_SCX, ucp_Masaram_Gondi }, { 1009, PT_PC, ucp_Mc }, { 1012, PT_PC, ucp_Me }, - { 1015, PT_SC, ucp_Medefaidrin }, - { 1027, PT_SC, ucp_Meetei_Mayek }, - { 1039, PT_SC, ucp_Mende_Kikakui }, - { 1052, PT_SC, ucp_Meroitic_Cursive }, - { 1068, PT_SC, ucp_Meroitic_Hieroglyphs }, - { 1088, PT_SC, ucp_Miao }, + { 1015, PT_SCX, ucp_Medefaidrin }, + { 1027, PT_SCX, ucp_Meetei_Mayek }, + { 1039, PT_SCX, ucp_Mende_Kikakui }, + { 1052, PT_SCX, ucp_Meroitic_Cursive }, + { 1068, PT_SCX, ucp_Meroitic_Hieroglyphs }, + { 1088, PT_SCX, ucp_Miao }, { 1093, PT_PC, ucp_Mn }, - { 1096, PT_SC, ucp_Modi }, - { 1101, PT_SC, ucp_Mongolian }, - { 1111, PT_SC, ucp_Mro }, - { 1115, PT_SC, ucp_Multani }, - { 1123, PT_SC, ucp_Myanmar }, + { 1096, PT_SCX, ucp_Modi }, + { 1101, PT_SCX, ucp_Mongolian }, + { 1111, PT_SCX, ucp_Mro }, + { 1115, PT_SCX, ucp_Multani }, + { 1123, PT_SCX, ucp_Myanmar }, { 1131, PT_GC, ucp_N }, - { 1133, PT_SC, ucp_Nabataean }, - { 1143, PT_SC, ucp_Nandinagari }, + { 1133, PT_SCX, ucp_Nabataean }, + { 1143, PT_SCX, ucp_Nandinagari }, { 1155, PT_PC, ucp_Nd }, - { 1158, PT_SC, ucp_Newa }, - { 1163, PT_SC, ucp_New_Tai_Lue }, - { 1173, PT_SC, ucp_Nko }, + { 1158, PT_SCX, ucp_Newa }, + { 1163, PT_SCX, ucp_New_Tai_Lue }, + { 1173, PT_SCX, ucp_Nko }, { 1177, PT_PC, ucp_Nl }, { 1180, PT_PC, ucp_No }, - { 1183, PT_SC, ucp_Nushu }, - { 1189, PT_SC, ucp_Nyiakeng_Puachue_Hmong }, - { 1210, PT_SC, ucp_Ogham }, - { 1216, PT_SC, ucp_Ol_Chiki }, - { 1224, PT_SC, ucp_Old_Hungarian }, - { 1237, PT_SC, ucp_Old_Italic }, - { 1247, PT_SC, ucp_Old_North_Arabian }, - { 1263, PT_SC, ucp_Old_Permic }, - { 1273, PT_SC, ucp_Old_Persian }, - { 1284, PT_SC, ucp_Old_Sogdian }, - { 1295, PT_SC, ucp_Old_South_Arabian }, - { 1311, PT_SC, ucp_Old_Turkic }, - { 1321, PT_SC, ucp_Old_Uyghur }, - { 1331, PT_SC, ucp_Oriya }, - { 1337, PT_SC, ucp_Osage }, - { 1343, PT_SC, ucp_Osmanya }, + { 1183, PT_SCX, ucp_Nushu }, + { 1189, PT_SCX, ucp_Nyiakeng_Puachue_Hmong }, + { 1210, PT_SCX, ucp_Ogham }, + { 1216, PT_SCX, ucp_Ol_Chiki }, + { 1224, PT_SCX, ucp_Old_Hungarian }, + { 1237, PT_SCX, ucp_Old_Italic }, + { 1247, PT_SCX, ucp_Old_North_Arabian }, + { 1263, PT_SCX, ucp_Old_Permic }, + { 1273, PT_SCX, ucp_Old_Persian }, + { 1284, PT_SCX, ucp_Old_Sogdian }, + { 1295, PT_SCX, ucp_Old_South_Arabian }, + { 1311, PT_SCX, ucp_Old_Turkic }, + { 1321, PT_SCX, ucp_Old_Uyghur }, + { 1331, PT_SCX, ucp_Oriya }, + { 1337, PT_SCX, ucp_Osage }, + { 1343, PT_SCX, ucp_Osmanya }, { 1351, PT_GC, ucp_P }, - { 1353, PT_SC, ucp_Pahawh_Hmong }, - { 1365, PT_SC, ucp_Palmyrene }, - { 1375, PT_SC, ucp_Pau_Cin_Hau }, + { 1353, PT_SCX, ucp_Pahawh_Hmong }, + { 1365, PT_SCX, ucp_Palmyrene }, + { 1375, PT_SCX, ucp_Pau_Cin_Hau }, { 1385, PT_PC, ucp_Pc }, { 1388, PT_PC, ucp_Pd }, { 1391, PT_PC, ucp_Pe }, { 1394, PT_PC, ucp_Pf }, - { 1397, PT_SC, ucp_Phags_Pa }, - { 1405, PT_SC, ucp_Phoenician }, + { 1397, PT_SCX, ucp_Phags_Pa }, + { 1405, PT_SCX, ucp_Phoenician }, { 1416, PT_PC, ucp_Pi }, { 1419, PT_PC, ucp_Po }, { 1422, PT_PC, ucp_Ps }, - { 1425, PT_SC, ucp_Psalter_Pahlavi }, - { 1440, PT_SC, ucp_Rejang }, - { 1447, PT_SC, ucp_Runic }, + { 1425, PT_SCX, ucp_Psalter_Pahlavi }, + { 1440, PT_SCX, ucp_Rejang }, + { 1447, PT_SCX, ucp_Runic }, { 1453, PT_GC, ucp_S }, - { 1455, PT_SC, ucp_Samaritan }, - { 1465, PT_SC, ucp_Saurashtra }, + { 1455, PT_SCX, ucp_Samaritan }, + { 1465, PT_SCX, ucp_Saurashtra }, { 1476, PT_PC, ucp_Sc }, - { 1479, PT_SC, ucp_Sharada }, - { 1487, PT_SC, ucp_Shavian }, - { 1495, PT_SC, ucp_Siddham }, - { 1503, PT_SC, ucp_SignWriting }, - { 1515, PT_SC, ucp_Sinhala }, + { 1479, PT_SCX, ucp_Sharada }, + { 1487, PT_SCX, ucp_Shavian }, + { 1495, PT_SCX, ucp_Siddham }, + { 1503, PT_SCX, ucp_SignWriting }, + { 1515, PT_SCX, ucp_Sinhala }, { 1523, PT_PC, ucp_Sk }, { 1526, PT_PC, ucp_Sm }, { 1529, PT_PC, ucp_So }, - { 1532, PT_SC, ucp_Sogdian }, - { 1540, PT_SC, ucp_Sora_Sompeng }, - { 1552, PT_SC, ucp_Soyombo }, - { 1560, PT_SC, ucp_Sundanese }, - { 1570, PT_SC, ucp_Syloti_Nagri }, - { 1582, PT_SC, ucp_Syriac }, - { 1589, PT_SC, ucp_Tagalog }, - { 1597, PT_SC, ucp_Tagbanwa }, - { 1606, PT_SC, ucp_Tai_Le }, - { 1612, PT_SC, ucp_Tai_Tham }, - { 1620, PT_SC, ucp_Tai_Viet }, - { 1628, PT_SC, ucp_Takri }, - { 1634, PT_SC, ucp_Tamil }, - { 1640, PT_SC, ucp_Tangsa }, - { 1647, PT_SC, ucp_Tangut }, - { 1654, PT_SC, ucp_Telugu }, - { 1661, PT_SC, ucp_Thaana }, - { 1668, PT_SC, ucp_Thai }, - { 1673, PT_SC, ucp_Tibetan }, - { 1681, PT_SC, ucp_Tifinagh }, - { 1690, PT_SC, ucp_Tirhuta }, - { 1698, PT_SC, ucp_Toto }, - { 1703, PT_SC, ucp_Ugaritic }, - { 1712, PT_SC, ucp_Unknown }, - { 1720, PT_SC, ucp_Vai }, - { 1724, PT_SC, ucp_Vithkuqi }, - { 1733, PT_SC, ucp_Wancho }, - { 1740, PT_SC, ucp_Warang_Citi }, + { 1532, PT_SCX, ucp_Sogdian }, + { 1540, PT_SCX, ucp_Sora_Sompeng }, + { 1552, PT_SCX, ucp_Soyombo }, + { 1560, PT_SCX, ucp_Sundanese }, + { 1570, PT_SCX, ucp_Syloti_Nagri }, + { 1582, PT_SCX, ucp_Syriac }, + { 1589, PT_SCX, ucp_Tagalog }, + { 1597, PT_SCX, ucp_Tagbanwa }, + { 1606, PT_SCX, ucp_Tai_Le }, + { 1612, PT_SCX, ucp_Tai_Tham }, + { 1620, PT_SCX, ucp_Tai_Viet }, + { 1628, PT_SCX, ucp_Takri }, + { 1634, PT_SCX, ucp_Tamil }, + { 1640, PT_SCX, ucp_Tangsa }, + { 1647, PT_SCX, ucp_Tangut }, + { 1654, PT_SCX, ucp_Telugu }, + { 1661, PT_SCX, ucp_Thaana }, + { 1668, PT_SCX, ucp_Thai }, + { 1673, PT_SCX, ucp_Tibetan }, + { 1681, PT_SCX, ucp_Tifinagh }, + { 1690, PT_SCX, ucp_Tirhuta }, + { 1698, PT_SCX, ucp_Toto }, + { 1703, PT_SCX, ucp_Ugaritic }, + { 1712, PT_SCX, ucp_Unknown }, + { 1720, PT_SCX, ucp_Vai }, + { 1724, PT_SCX, ucp_Vithkuqi }, + { 1733, PT_SCX, ucp_Wancho }, + { 1740, PT_SCX, ucp_Warang_Citi }, { 1751, PT_ALNUM, 0 }, { 1755, PT_PXSPACE, 0 }, { 1759, PT_SPACE, 0 }, { 1763, PT_UCNC, 0 }, { 1767, PT_WORD, 0 }, - { 1771, PT_SC, ucp_Yezidi }, - { 1778, PT_SC, ucp_Yi }, + { 1771, PT_SCX, ucp_Yezidi }, + { 1778, PT_SCX, ucp_Yi }, { 1781, PT_GC, ucp_Z }, - { 1783, PT_SC, ucp_Zanabazar_Square }, + { 1783, PT_SCX, ucp_Zanabazar_Square }, { 1799, PT_PC, ucp_Zl }, { 1802, PT_PC, ucp_Zp }, { 1805, PT_PC, ucp_Zs } diff --git a/src/pcre2_ucd.c b/src/pcre2_ucd.c index f883b7e..f0d7488 100644 --- a/src/pcre2_ucd.c +++ b/src/pcre2_ucd.c @@ -130,66 +130,65 @@ const uint32_t PRIV(ucd_digit_sets)[] = { 0x1e959, 0x1fbf9, }; -/* This vector is a list of lists of scripts for the Script Extension -property. Each sublist is zero-terminated. */ +/* This vector is a list of script bitsets for the Script Extension +property. */ -const uint8_t PRIV(ucd_script_sets)[] = { - /* 0 */ 0, - /* 1 */ 1, 11, 0, - /* 4 */ 1, 144, 0, - /* 7 */ 1, 64, 0, - /* 10 */ 1, 50, 0, - /* 13 */ 1, 56, 0, - /* 16 */ 3, 15, 0, - /* 19 */ 4, 23, 0, - /* 22 */ 6, 84, 0, - /* 25 */ 12, 36, 0, - /* 28 */ 13, 18, 0, - /* 31 */ 13, 34, 0, - /* 34 */ 13, 118, 0, - /* 37 */ 13, 50, 0, - /* 40 */ 15, 107, 0, - /* 43 */ 15, 150, 0, - /* 46 */ 15, 100, 0, - /* 49 */ 15, 54, 0, - /* 52 */ 17, 34, 0, - /* 55 */ 107, 54, 0, - /* 58 */ 21, 108, 0, - /* 61 */ 22, 129, 0, - /* 64 */ 23, 34, 0, - /* 67 */ 27, 30, 0, - /* 70 */ 29, 150, 0, - /* 73 */ 34, 38, 0, - /* 76 */ 112, 158, 0, - /* 79 */ 38, 65, 0, - /* 82 */ 1, 50, 56, 0, - /* 86 */ 1, 56, 156, 0, - /* 90 */ 3, 96, 49, 0, - /* 94 */ 96, 39, 53, 0, - /* 98 */ 157, 12, 36, 0, - /* 102 */ 12, 110, 36, 0, - /* 106 */ 15, 107, 29, 0, - /* 110 */ 15, 107, 34, 0, - /* 114 */ 23, 27, 30, 0, - /* 118 */ 69, 34, 39, 0, - /* 122 */ 3, 15, 107, 29, 0, - /* 127 */ 7, 25, 52, 51, 0, - /* 132 */ 15, 142, 85, 111, 0, - /* 137 */ 4, 24, 23, 27, 30, 0, - /* 143 */ 1, 64, 144, 50, 56, 156, 0, - /* 150 */ 4, 24, 23, 27, 30, 61, 0, - /* 157 */ 15, 29, 37, 44, 54, 55, 0, - /* 164 */ 132, 1, 64, 144, 50, 56, 156, 0, - /* 172 */ 3, 15, 107, 29, 150, 44, 55, 124, 0, - /* 181 */ 132, 1, 95, 112, 158, 121, 144, 148, 50, 0, - /* 191 */ 15, 142, 21, 22, 108, 85, 111, 114, 109, 102, 124, 0, - /* 203 */ 3, 15, 107, 21, 22, 29, 34, 37, 44, 54, 55, 124, 0, - /* 216 */ 3, 15, 107, 21, 22, 29, 34, 37, 44, 100, 54, 55, 124, 0, - /* 230 */ 15, 142, 21, 22, 108, 29, 85, 111, 114, 150, 109, 102, 124, 0, - /* 244 */ 15, 142, 21, 22, 108, 29, 85, 111, 37, 114, 150, 109, 102, 124, 0, - /* 259 */ 3, 15, 142, 143, 138, 107, 21, 22, 29, 111, 37, 150, 44, 109, 48, 49, 102, 54, 55, 124, 0, - /* 280 */ 3, 15, 142, 143, 138, 107, 21, 22, 29, 35, 111, 37, 150, 44, 109, 48, 49, 102, 54, 55, 124, 0, - /* 302 */ +const uint32_t PRIV(ucd_script_sets)[] = { + 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00000802u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00000002u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00010000u, 0x00000000u, + 0x00000002u, 0x00000000u, 0x00000001u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00000002u, 0x00040000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00000002u, 0x01000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00008008u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00800010u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00000040u, 0x00000000u, 0x00100000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00001000u, 0x00000010u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00042000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00002000u, 0x00000004u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00002000u, 0x00000000u, 0x00000000u, 0x00400000u, 0x00000000u, 0x00000000u, + 0x00002000u, 0x00040000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00008000u, 0x00000000u, 0x00000000u, 0x00000800u, 0x00000000u, 0x00000000u, + 0x00008000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00400000u, 0x00000000u, + 0x00008000u, 0x00000000u, 0x00000000u, 0x00000010u, 0x00000000u, 0x00000000u, + 0x00008000u, 0x00400000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00020000u, 0x00000004u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00000000u, 0x00400000u, 0x00000000u, 0x00000800u, 0x00000000u, 0x00000000u, + 0x00200000u, 0x00000000u, 0x00000000u, 0x00001000u, 0x00000000u, 0x00000000u, + 0x00400000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000002u, 0x00000000u, + 0x00800000u, 0x00000004u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x48000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x20000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00400000u, 0x00000000u, + 0x00000000u, 0x00000044u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00000000u, 0x00000000u, 0x00000000u, 0x00010000u, 0x40000000u, 0x00000000u, + 0x00000000u, 0x00000040u, 0x00000002u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00000002u, 0x01040000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00000002u, 0x01000000u, 0x00000000u, 0x00000000u, 0x10000000u, 0x00000000u, + 0x00000008u, 0x00020000u, 0x00000000u, 0x00000001u, 0x00000000u, 0x00000000u, + 0x00000000u, 0x00200080u, 0x00000000u, 0x00000001u, 0x00000000u, 0x00000000u, + 0x00001000u, 0x00000010u, 0x00000000u, 0x00000000u, 0x20000000u, 0x00000000u, + 0x00001000u, 0x00000010u, 0x00000000u, 0x00004000u, 0x00000000u, 0x00000000u, + 0x20008000u, 0x00000000u, 0x00000000u, 0x00000800u, 0x00000000u, 0x00000000u, + 0x00008000u, 0x00000004u, 0x00000000u, 0x00000800u, 0x00000000u, 0x00000000u, + 0x48800000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00000000u, 0x00000084u, 0x00000020u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x20008008u, 0x00000000u, 0x00000000u, 0x00000800u, 0x00000000u, 0x00000000u, + 0x02000080u, 0x00180000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00008000u, 0x00000000u, 0x00200000u, 0x00008000u, 0x00004000u, 0x00000000u, + 0x49800010u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00000002u, 0x01040000u, 0x00000001u, 0x00000000u, 0x10010000u, 0x00000000u, + 0x49800010u, 0x20000000u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x20008000u, 0x00c01020u, 0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u, + 0x00000002u, 0x01040000u, 0x00000001u, 0x00000000u, 0x10010010u, 0x00000000u, + 0x20008008u, 0x00801000u, 0x00000000u, 0x10000800u, 0x00400000u, 0x00000000u, + 0x00000002u, 0x00040000u, 0x80000000u, 0x02010000u, 0x40110010u, 0x00000000u, + 0x00608000u, 0x00000000u, 0x00200000u, 0x1004b040u, 0x00004000u, 0x00000000u, + 0x20608008u, 0x00c01024u, 0x00000000u, 0x10000800u, 0x00000000u, 0x00000000u, + 0x20608008u, 0x00c01024u, 0x00000000u, 0x10000810u, 0x00000000u, 0x00000000u, + 0x20608000u, 0x00000000u, 0x00200000u, 0x1004b040u, 0x00404000u, 0x00000000u, + 0x20608000u, 0x00000020u, 0x00200000u, 0x1004b040u, 0x00404000u, 0x00000000u, + 0x20608008u, 0x00c31020u, 0x00000000u, 0x1000a840u, 0x0040c400u, 0x00000000u, + 0x20608008u, 0x00c31028u, 0x00000000u, 0x1000a840u, 0x0040c400u, 0x00000000u, }; /* These are the main two-stage UCD tables. The fields in each record are: @@ -407,9 +406,9 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 13, 9, 12, 88, 1, 13, 9, 0, }, /* 204 */ { 13, 5, 12, 88, -1, 13, 9, 0, }, /* 205 */ { 13, 26, 12, 0, 0, 13, 9, 0, }, /* 206 */ - { 13, 12, 3, 0, 0, -34, 13, 0, }, /* 207 */ - { 13, 12, 3, 0, 0, -28, 13, 0, }, /* 208 */ - { 28, 12, 3, 0, 0, -31, 13, 0, }, /* 209 */ + { 13, 12, 3, 0, 0, -72, 13, 0, }, /* 207 */ + { 13, 12, 3, 0, 0, -60, 13, 0, }, /* 208 */ + { 28, 12, 3, 0, 0, -66, 13, 0, }, /* 209 */ { 13, 11, 3, 0, 0, 13, 13, 0, }, /* 210 */ { 13, 9, 12, 0, 15, 13, 9, 0, }, /* 211 */ { 13, 5, 12, 0, -15, 13, 9, 0, }, /* 212 */ @@ -432,19 +431,19 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 1, 25, 12, 0, 0, 1, 0, 0, }, /* 229 */ { 1, 21, 12, 0, 0, 1, 7, 0, }, /* 230 */ { 1, 23, 12, 0, 0, 1, 0, 0, }, /* 231 */ - { 10, 21, 12, 0, 0, -143, 4, 0, }, /* 232 */ + { 10, 21, 12, 0, 0, -252, 4, 0, }, /* 232 */ { 1, 21, 12, 0, 0, 1, 0, 0, }, /* 233 */ { 1, 26, 12, 0, 0, 1, 14, 0, }, /* 234 */ { 1, 12, 3, 0, 0, 1, 13, 0, }, /* 235 */ - { 10, 21, 12, 0, 0, -143, 0, 0, }, /* 236 */ - { 1, 1, 2, 0, 0, -82, 128, 0, }, /* 237 */ - { 10, 21, 12, 0, 0, -164, 0, 0, }, /* 238 */ + { 10, 21, 12, 0, 0, -252, 0, 0, }, /* 236 */ + { 1, 1, 2, 0, 0, -168, 128, 0, }, /* 237 */ + { 10, 21, 12, 0, 0, -270, 0, 0, }, /* 238 */ { 1, 7, 12, 0, 0, 1, 0, 0, }, /* 239 */ - { 10, 6, 12, 0, 0, -181, 0, 0, }, /* 240 */ - { 28, 12, 3, 0, 0, -10, 13, 0, }, /* 241 */ - { 1, 13, 12, 0, 0, -86, 1, 0, }, /* 242 */ + { 10, 6, 12, 0, 0, -282, 0, 0, }, /* 240 */ + { 28, 12, 3, 0, 0, -24, 13, 0, }, /* 241 */ + { 1, 13, 12, 0, 0, -174, 1, 0, }, /* 242 */ { 1, 21, 12, 0, 0, 1, 1, 0, }, /* 243 */ - { 1, 21, 12, 0, 0, -4, 0, 0, }, /* 244 */ + { 1, 21, 12, 0, 0, -12, 0, 0, }, /* 244 */ { 1, 6, 12, 0, 0, 1, 0, 0, }, /* 245 */ { 1, 13, 12, 0, 0, 1, 5, 0, }, /* 246 */ { 1, 26, 12, 0, 0, 1, 0, 0, }, /* 247 */ @@ -473,18 +472,18 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 15, 12, 3, 0, 0, 15, 13, 0, }, /* 270 */ { 15, 10, 5, 0, 0, 15, 9, 0, }, /* 271 */ { 15, 7, 12, 0, 0, 15, 9, 0, }, /* 272 */ - { 28, 12, 3, 0, 0, -216, 13, 0, }, /* 273 */ - { 28, 12, 3, 0, 0, -203, 13, 0, }, /* 274 */ - { 10, 21, 12, 0, 0, -259, 9, 0, }, /* 275 */ - { 10, 21, 12, 0, 0, -280, 9, 0, }, /* 276 */ - { 15, 13, 12, 0, 0, -132, 9, 0, }, /* 277 */ + { 28, 12, 3, 0, 0, -300, 13, 0, }, /* 273 */ + { 28, 12, 3, 0, 0, -294, 13, 0, }, /* 274 */ + { 10, 21, 12, 0, 0, -318, 9, 0, }, /* 275 */ + { 10, 21, 12, 0, 0, -324, 9, 0, }, /* 276 */ + { 15, 13, 12, 0, 0, -240, 9, 0, }, /* 277 */ { 15, 21, 12, 0, 0, 15, 9, 0, }, /* 278 */ { 15, 6, 12, 0, 0, 15, 9, 0, }, /* 279 */ { 3, 7, 12, 0, 0, 3, 9, 0, }, /* 280 */ { 3, 12, 3, 0, 0, 3, 13, 0, }, /* 281 */ { 3, 10, 5, 0, 0, 3, 9, 0, }, /* 282 */ { 3, 10, 3, 0, 0, 3, 9, 0, }, /* 283 */ - { 3, 13, 12, 0, 0, -90, 9, 0, }, /* 284 */ + { 3, 13, 12, 0, 0, -180, 9, 0, }, /* 284 */ { 3, 23, 12, 0, 0, 3, 7, 0, }, /* 285 */ { 3, 15, 12, 0, 0, 3, 9, 0, }, /* 286 */ { 3, 26, 12, 0, 0, 3, 9, 0, }, /* 287 */ @@ -492,12 +491,12 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 22, 12, 3, 0, 0, 22, 13, 0, }, /* 289 */ { 22, 10, 5, 0, 0, 22, 9, 0, }, /* 290 */ { 22, 7, 12, 0, 0, 22, 9, 0, }, /* 291 */ - { 22, 13, 12, 0, 0, -61, 9, 0, }, /* 292 */ + { 22, 13, 12, 0, 0, -126, 9, 0, }, /* 292 */ { 22, 21, 12, 0, 0, 22, 9, 0, }, /* 293 */ { 21, 12, 3, 0, 0, 21, 13, 0, }, /* 294 */ { 21, 10, 5, 0, 0, 21, 9, 0, }, /* 295 */ { 21, 7, 12, 0, 0, 21, 9, 0, }, /* 296 */ - { 21, 13, 12, 0, 0, -58, 9, 0, }, /* 297 */ + { 21, 13, 12, 0, 0, -120, 9, 0, }, /* 297 */ { 21, 21, 12, 0, 0, 21, 9, 0, }, /* 298 */ { 21, 23, 12, 0, 0, 21, 7, 0, }, /* 299 */ { 44, 12, 3, 0, 0, 44, 13, 0, }, /* 300 */ @@ -511,9 +510,9 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 54, 7, 12, 0, 0, 54, 9, 0, }, /* 308 */ { 54, 10, 3, 0, 0, 54, 9, 0, }, /* 309 */ { 54, 10, 5, 0, 0, 54, 9, 0, }, /* 310 */ - { 54, 13, 12, 0, 0, -55, 9, 0, }, /* 311 */ - { 54, 15, 12, 0, 0, -55, 9, 0, }, /* 312 */ - { 54, 26, 12, 0, 0, -55, 14, 0, }, /* 313 */ + { 54, 13, 12, 0, 0, -114, 9, 0, }, /* 311 */ + { 54, 15, 12, 0, 0, -114, 9, 0, }, /* 312 */ + { 54, 26, 12, 0, 0, -114, 14, 0, }, /* 313 */ { 54, 26, 12, 0, 0, 54, 14, 0, }, /* 314 */ { 54, 23, 12, 0, 0, 54, 7, 0, }, /* 315 */ { 55, 12, 3, 0, 0, 55, 13, 0, }, /* 316 */ @@ -529,7 +528,7 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 29, 21, 12, 0, 0, 29, 9, 0, }, /* 326 */ { 29, 12, 3, 0, 0, 29, 9, 0, }, /* 327 */ { 29, 10, 3, 0, 0, 29, 9, 0, }, /* 328 */ - { 29, 13, 12, 0, 0, -70, 9, 0, }, /* 329 */ + { 29, 13, 12, 0, 0, -144, 9, 0, }, /* 329 */ { 37, 12, 3, 0, 0, 37, 13, 0, }, /* 330 */ { 37, 10, 5, 0, 0, 37, 9, 0, }, /* 331 */ { 37, 7, 12, 0, 0, 37, 9, 0, }, /* 332 */ @@ -569,13 +568,13 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 39, 10, 12, 0, 0, 39, 9, 0, }, /* 366 */ { 39, 12, 3, 0, 0, 39, 13, 0, }, /* 367 */ { 39, 10, 5, 0, 0, 39, 9, 0, }, /* 368 */ - { 39, 13, 12, 0, 0, -94, 9, 0, }, /* 369 */ + { 39, 13, 12, 0, 0, -186, 9, 0, }, /* 369 */ { 39, 21, 12, 0, 0, 39, 9, 0, }, /* 370 */ { 39, 13, 12, 0, 0, 39, 9, 0, }, /* 371 */ { 39, 26, 12, 0, 0, 39, 9, 0, }, /* 372 */ { 17, 9, 12, 0, 7264, 17, 9, 0, }, /* 373 */ { 17, 5, 12, 0, 3008, 17, 9, 0, }, /* 374 */ - { 10, 21, 12, 0, 0, -52, 9, 0, }, /* 375 */ + { 10, 21, 12, 0, 0, -108, 9, 0, }, /* 375 */ { 17, 6, 12, 0, 0, 17, 9, 0, }, /* 376 */ { 24, 7, 6, 0, 0, 24, 9, 0, }, /* 377 */ { 24, 7, 7, 0, 0, 24, 9, 0, }, /* 378 */ @@ -605,7 +604,7 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 25, 7, 12, 0, 0, 25, 9, 0, }, /* 402 */ { 25, 12, 3, 0, 0, 25, 13, 0, }, /* 403 */ { 25, 10, 5, 0, 0, 25, 9, 0, }, /* 404 */ - { 10, 21, 12, 0, 0, -127, 9, 0, }, /* 405 */ + { 10, 21, 12, 0, 0, -234, 9, 0, }, /* 405 */ { 7, 7, 12, 0, 0, 7, 9, 0, }, /* 406 */ { 7, 12, 3, 0, 0, 7, 13, 0, }, /* 407 */ { 52, 7, 12, 0, 0, 52, 9, 0, }, /* 408 */ @@ -619,7 +618,7 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 32, 13, 12, 0, 0, 32, 9, 0, }, /* 416 */ { 32, 15, 12, 0, 0, 32, 14, 0, }, /* 417 */ { 38, 21, 12, 0, 0, 38, 14, 0, }, /* 418 */ - { 10, 21, 12, 0, 0, -79, 14, 0, }, /* 419 */ + { 10, 21, 12, 0, 0, -162, 14, 0, }, /* 419 */ { 38, 17, 12, 0, 0, 38, 14, 0, }, /* 420 */ { 38, 12, 3, 0, 0, 38, 13, 0, }, /* 421 */ { 38, 1, 2, 0, 0, 38, 3, 0, }, /* 422 */ @@ -685,28 +684,28 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 13, 5, 12, 108, 35267, 13, 9, 0, }, /* 482 */ { 17, 9, 12, 0, -3008, 17, 9, 0, }, /* 483 */ { 76, 21, 12, 0, 0, 76, 9, 0, }, /* 484 */ - { 28, 12, 3, 0, 0, -122, 13, 0, }, /* 485 */ + { 28, 12, 3, 0, 0, -228, 13, 0, }, /* 485 */ { 28, 12, 3, 0, 0, 15, 13, 0, }, /* 486 */ - { 10, 21, 12, 0, 0, -40, 9, 0, }, /* 487 */ - { 28, 12, 3, 0, 0, -16, 13, 0, }, /* 488 */ - { 28, 12, 3, 0, 0, -46, 13, 0, }, /* 489 */ - { 28, 12, 3, 0, 0, -157, 13, 0, }, /* 490 */ - { 10, 10, 5, 0, 0, -16, 9, 0, }, /* 491 */ - { 10, 7, 12, 0, 0, -43, 9, 0, }, /* 492 */ - { 10, 7, 12, 0, 0, -16, 9, 0, }, /* 493 */ + { 10, 21, 12, 0, 0, -84, 9, 0, }, /* 487 */ + { 28, 12, 3, 0, 0, -36, 13, 0, }, /* 488 */ + { 28, 12, 3, 0, 0, -96, 13, 0, }, /* 489 */ + { 28, 12, 3, 0, 0, -264, 13, 0, }, /* 490 */ + { 10, 10, 5, 0, 0, -36, 9, 0, }, /* 491 */ + { 10, 7, 12, 0, 0, -90, 9, 0, }, /* 492 */ + { 10, 7, 12, 0, 0, -36, 9, 0, }, /* 493 */ { 10, 7, 12, 0, 0, 15, 9, 0, }, /* 494 */ - { 10, 7, 12, 0, 0, -172, 9, 0, }, /* 495 */ - { 10, 7, 12, 0, 0, -40, 9, 0, }, /* 496 */ - { 28, 12, 3, 0, 0, -106, 13, 0, }, /* 497 */ + { 10, 7, 12, 0, 0, -276, 9, 0, }, /* 495 */ + { 10, 7, 12, 0, 0, -84, 9, 0, }, /* 496 */ + { 28, 12, 3, 0, 0, -204, 13, 0, }, /* 497 */ { 10, 10, 5, 0, 0, 3, 9, 0, }, /* 498 */ - { 28, 12, 3, 0, 0, -40, 13, 0, }, /* 499 */ + { 28, 12, 3, 0, 0, -84, 13, 0, }, /* 499 */ { 10, 7, 12, 0, 0, 150, 9, 0, }, /* 500 */ { 13, 5, 12, 0, 0, 13, 9, 0, }, /* 501 */ { 13, 6, 12, 0, 0, 13, 9, 0, }, /* 502 */ { 34, 5, 12, 0, 35332, 34, 9, 0, }, /* 503 */ { 34, 5, 12, 0, 3814, 34, 9, 0, }, /* 504 */ { 34, 5, 12, 0, 35384, 34, 9, 0, }, /* 505 */ - { 28, 12, 3, 0, 0, -37, 13, 0, }, /* 506 */ + { 28, 12, 3, 0, 0, -78, 13, 0, }, /* 506 */ { 28, 12, 3, 0, 0, 50, 13, 0, }, /* 507 */ { 34, 9, 12, 92, 1, 34, 9, 0, }, /* 508 */ { 34, 5, 12, 92, -1, 34, 9, 0, }, /* 509 */ @@ -742,7 +741,7 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 10, 1, 2, 0, 0, 10, 143, 0, }, /* 539 */ { 10, 1, 2, 0, 0, 10, 140, 0, }, /* 540 */ { 10, 1, 2, 0, 0, 10, 148, 0, }, /* 541 */ - { 10, 29, 12, 0, 0, -73, 4, 0, }, /* 542 */ + { 10, 29, 12, 0, 0, -150, 4, 0, }, /* 542 */ { 10, 21, 14, 0, 0, 10, 14, 0, }, /* 543 */ { 10, 25, 12, 0, 0, 10, 4, 0, }, /* 544 */ { 0, 2, 2, 0, 0, 0, 3, 0, }, /* 545 */ @@ -751,7 +750,7 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 10, 1, 2, 0, 0, 10, 136, 0, }, /* 548 */ { 10, 1, 2, 0, 0, 10, 144, 0, }, /* 549 */ { 0, 2, 12, 0, 0, 0, 7, 0, }, /* 550 */ - { 28, 12, 3, 0, 0, -110, 13, 0, }, /* 551 */ + { 28, 12, 3, 0, 0, -210, 13, 0, }, /* 551 */ { 10, 9, 12, 0, 0, 10, 9, 0, }, /* 552 */ { 10, 5, 12, 0, 0, 10, 9, 0, }, /* 553 */ { 20, 9, 12, 96, -7517, 20, 9, 0, }, /* 554 */ @@ -793,31 +792,31 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 59, 21, 12, 0, 0, 59, 9, 0, }, /* 590 */ { 59, 12, 3, 0, 0, 59, 13, 0, }, /* 591 */ { 13, 12, 3, 0, 0, 13, 13, 0, }, /* 592 */ - { 10, 21, 12, 0, 0, -28, 14, 0, }, /* 593 */ + { 10, 21, 12, 0, 0, -60, 14, 0, }, /* 593 */ { 23, 26, 12, 0, 0, 23, 14, 0, }, /* 594 */ - { 10, 21, 12, 0, 0, -150, 14, 0, }, /* 595 */ - { 10, 21, 12, 0, 0, -137, 14, 0, }, /* 596 */ + { 10, 21, 12, 0, 0, -258, 14, 0, }, /* 595 */ + { 10, 21, 12, 0, 0, -246, 14, 0, }, /* 596 */ { 23, 6, 12, 0, 0, 23, 9, 0, }, /* 597 */ { 10, 7, 12, 0, 0, 23, 9, 0, }, /* 598 */ { 23, 14, 12, 0, 0, 23, 9, 0, }, /* 599 */ - { 10, 22, 12, 0, 0, -150, 14, 0, }, /* 600 */ - { 10, 18, 12, 0, 0, -150, 14, 0, }, /* 601 */ - { 10, 26, 12, 0, 0, -137, 14, 0, }, /* 602 */ - { 10, 17, 12, 0, 0, -137, 14, 0, }, /* 603 */ - { 10, 22, 12, 0, 0, -137, 14, 0, }, /* 604 */ - { 10, 18, 12, 0, 0, -137, 14, 0, }, /* 605 */ - { 28, 12, 3, 0, 0, -19, 13, 0, }, /* 606 */ + { 10, 22, 12, 0, 0, -258, 14, 0, }, /* 600 */ + { 10, 18, 12, 0, 0, -258, 14, 0, }, /* 601 */ + { 10, 26, 12, 0, 0, -246, 14, 0, }, /* 602 */ + { 10, 17, 12, 0, 0, -246, 14, 0, }, /* 603 */ + { 10, 22, 12, 0, 0, -246, 14, 0, }, /* 604 */ + { 10, 18, 12, 0, 0, -246, 14, 0, }, /* 605 */ + { 28, 12, 3, 0, 0, -42, 13, 0, }, /* 606 */ { 24, 10, 3, 0, 0, 24, 9, 0, }, /* 607 */ - { 10, 17, 14, 0, 0, -137, 14, 0, }, /* 608 */ - { 10, 6, 12, 0, 0, -67, 9, 0, }, /* 609 */ - { 10, 7, 12, 0, 0, -114, 9, 0, }, /* 610 */ - { 10, 21, 14, 0, 0, -114, 14, 0, }, /* 611 */ + { 10, 17, 14, 0, 0, -246, 14, 0, }, /* 608 */ + { 10, 6, 12, 0, 0, -138, 9, 0, }, /* 609 */ + { 10, 7, 12, 0, 0, -216, 9, 0, }, /* 610 */ + { 10, 21, 14, 0, 0, -216, 14, 0, }, /* 611 */ { 10, 26, 12, 0, 0, 23, 14, 0, }, /* 612 */ { 27, 7, 12, 0, 0, 27, 9, 0, }, /* 613 */ - { 28, 12, 3, 0, 0, -67, 13, 0, }, /* 614 */ - { 10, 24, 12, 0, 0, -67, 14, 0, }, /* 615 */ + { 28, 12, 3, 0, 0, -138, 13, 0, }, /* 614 */ + { 10, 24, 12, 0, 0, -138, 14, 0, }, /* 615 */ { 27, 6, 12, 0, 0, 27, 9, 0, }, /* 616 */ - { 10, 17, 12, 0, 0, -67, 14, 0, }, /* 617 */ + { 10, 17, 12, 0, 0, -138, 14, 0, }, /* 617 */ { 30, 7, 12, 0, 0, 30, 9, 0, }, /* 618 */ { 30, 6, 12, 0, 0, 30, 9, 0, }, /* 619 */ { 4, 7, 12, 0, 0, 4, 9, 0, }, /* 620 */ @@ -849,7 +848,7 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 79, 14, 12, 0, 0, 79, 9, 0, }, /* 646 */ { 79, 12, 3, 0, 0, 79, 13, 0, }, /* 647 */ { 79, 21, 12, 0, 0, 79, 9, 0, }, /* 648 */ - { 10, 24, 12, 0, 0, -64, 14, 0, }, /* 649 */ + { 10, 24, 12, 0, 0, -132, 14, 0, }, /* 649 */ { 34, 9, 12, 0, -35332, 34, 9, 0, }, /* 650 */ { 10, 24, 12, 0, 0, 10, 9, 0, }, /* 651 */ { 34, 9, 12, 0, -42280, 34, 9, 0, }, /* 652 */ @@ -869,11 +868,11 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 49, 12, 3, 0, 0, 49, 13, 0, }, /* 666 */ { 49, 10, 5, 0, 0, 49, 9, 0, }, /* 667 */ { 49, 26, 12, 0, 0, 49, 14, 0, }, /* 668 */ - { 10, 15, 12, 0, 0, -244, 9, 0, }, /* 669 */ - { 10, 15, 12, 0, 0, -230, 9, 0, }, /* 670 */ - { 10, 26, 12, 0, 0, -191, 9, 0, }, /* 671 */ - { 10, 23, 12, 0, 0, -191, 7, 0, }, /* 672 */ - { 10, 26, 12, 0, 0, -191, 7, 0, }, /* 673 */ + { 10, 15, 12, 0, 0, -312, 9, 0, }, /* 669 */ + { 10, 15, 12, 0, 0, -306, 9, 0, }, /* 670 */ + { 10, 26, 12, 0, 0, -288, 9, 0, }, /* 671 */ + { 10, 23, 12, 0, 0, -288, 7, 0, }, /* 672 */ + { 10, 26, 12, 0, 0, -288, 7, 0, }, /* 673 */ { 65, 7, 12, 0, 0, 65, 9, 0, }, /* 674 */ { 65, 21, 12, 0, 0, 65, 14, 0, }, /* 675 */ { 75, 10, 5, 0, 0, 75, 9, 0, }, /* 676 */ @@ -881,12 +880,12 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 75, 12, 3, 0, 0, 75, 13, 0, }, /* 678 */ { 75, 21, 12, 0, 0, 75, 9, 0, }, /* 679 */ { 75, 13, 12, 0, 0, 75, 9, 0, }, /* 680 */ - { 15, 12, 3, 0, 0, -16, 13, 0, }, /* 681 */ - { 15, 7, 12, 0, 0, -49, 9, 0, }, /* 682 */ + { 15, 12, 3, 0, 0, -36, 13, 0, }, /* 681 */ + { 15, 7, 12, 0, 0, -102, 9, 0, }, /* 682 */ { 69, 13, 12, 0, 0, 69, 9, 0, }, /* 683 */ { 69, 7, 12, 0, 0, 69, 9, 0, }, /* 684 */ { 69, 12, 3, 0, 0, 69, 13, 0, }, /* 685 */ - { 10, 21, 12, 0, 0, -118, 9, 0, }, /* 686 */ + { 10, 21, 12, 0, 0, -222, 9, 0, }, /* 686 */ { 69, 21, 12, 0, 0, 69, 9, 0, }, /* 687 */ { 74, 7, 12, 0, 0, 74, 9, 0, }, /* 688 */ { 74, 12, 3, 0, 0, 74, 13, 0, }, /* 689 */ @@ -896,7 +895,7 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 84, 10, 5, 0, 0, 84, 9, 0, }, /* 693 */ { 84, 7, 12, 0, 0, 84, 9, 0, }, /* 694 */ { 84, 21, 12, 0, 0, 84, 9, 0, }, /* 695 */ - { 10, 6, 12, 0, 0, -22, 9, 0, }, /* 696 */ + { 10, 6, 12, 0, 0, -48, 9, 0, }, /* 696 */ { 84, 13, 12, 0, 0, 84, 9, 0, }, /* 697 */ { 39, 6, 12, 0, 0, 39, 9, 0, }, /* 698 */ { 68, 7, 12, 0, 0, 68, 9, 0, }, /* 699 */ @@ -921,27 +920,27 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 0, 4, 12, 0, 0, 0, 9, 0, }, /* 718 */ { 0, 3, 12, 0, 0, 0, 9, 0, }, /* 719 */ { 26, 25, 12, 0, 0, 26, 6, 0, }, /* 720 */ - { 10, 18, 12, 0, 0, -7, 14, 0, }, /* 721 */ - { 10, 22, 12, 0, 0, -7, 14, 0, }, /* 722 */ + { 10, 18, 12, 0, 0, -18, 14, 0, }, /* 721 */ + { 10, 22, 12, 0, 0, -18, 14, 0, }, /* 722 */ { 0, 2, 12, 0, 0, 0, 3, 0, }, /* 723 */ - { 1, 7, 12, 0, 0, -13, 0, 0, }, /* 724 */ - { 1, 26, 12, 0, 0, -13, 14, 0, }, /* 725 */ - { 10, 6, 3, 0, 0, -67, 9, 0, }, /* 726 */ + { 1, 7, 12, 0, 0, -30, 0, 0, }, /* 724 */ + { 1, 26, 12, 0, 0, -30, 14, 0, }, /* 725 */ + { 10, 6, 3, 0, 0, -138, 9, 0, }, /* 726 */ { 10, 1, 2, 0, 0, 10, 14, 0, }, /* 727 */ { 36, 7, 12, 0, 0, 36, 9, 0, }, /* 728 */ - { 10, 21, 12, 0, 0, -98, 9, 0, }, /* 729 */ - { 10, 21, 12, 0, 0, -98, 14, 0, }, /* 730 */ - { 10, 21, 12, 0, 0, -25, 9, 0, }, /* 731 */ - { 10, 15, 12, 0, 0, -102, 9, 0, }, /* 732 */ - { 10, 26, 12, 0, 0, -25, 9, 0, }, /* 733 */ + { 10, 21, 12, 0, 0, -192, 9, 0, }, /* 729 */ + { 10, 21, 12, 0, 0, -192, 14, 0, }, /* 730 */ + { 10, 21, 12, 0, 0, -54, 9, 0, }, /* 731 */ + { 10, 15, 12, 0, 0, -198, 9, 0, }, /* 732 */ + { 10, 26, 12, 0, 0, -54, 9, 0, }, /* 733 */ { 20, 14, 12, 0, 0, 20, 14, 0, }, /* 734 */ { 20, 15, 12, 0, 0, 20, 14, 0, }, /* 735 */ { 20, 26, 12, 0, 0, 20, 14, 0, }, /* 736 */ { 20, 26, 12, 0, 0, 20, 9, 0, }, /* 737 */ { 71, 7, 12, 0, 0, 71, 9, 0, }, /* 738 */ { 67, 7, 12, 0, 0, 67, 9, 0, }, /* 739 */ - { 28, 12, 3, 0, 0, -1, 13, 0, }, /* 740 */ - { 10, 15, 12, 0, 0, -1, 5, 0, }, /* 741 */ + { 28, 12, 3, 0, 0, -6, 13, 0, }, /* 740 */ + { 10, 15, 12, 0, 0, -6, 5, 0, }, /* 741 */ { 42, 7, 12, 0, 0, 42, 9, 0, }, /* 742 */ { 42, 15, 12, 0, 0, 42, 9, 0, }, /* 743 */ { 19, 7, 12, 0, 0, 19, 9, 0, }, /* 744 */ @@ -999,7 +998,7 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 112, 12, 3, 0, 0, 112, 13, 0, }, /* 796 */ { 112, 15, 12, 0, 0, 112, 17, 0, }, /* 797 */ { 112, 21, 12, 0, 0, 112, 17, 0, }, /* 798 */ - { 112, 21, 12, 0, 0, -76, 17, 0, }, /* 799 */ + { 112, 21, 12, 0, 0, -156, 17, 0, }, /* 799 */ { 78, 7, 12, 0, 0, 78, 17, 0, }, /* 800 */ { 78, 21, 12, 0, 0, 78, 14, 0, }, /* 801 */ { 83, 7, 12, 0, 0, 83, 17, 0, }, /* 802 */ @@ -1071,11 +1070,11 @@ const ucd_record PRIV(ucd_records)[] = { /* 12576 bytes, record size 12 */ { 109, 10, 5, 0, 0, 109, 9, 0, }, /* 868 */ { 109, 13, 12, 0, 0, 109, 9, 0, }, /* 869 */ { 107, 12, 3, 0, 0, 107, 13, 0, }, /* 870 */ - { 107, 12, 3, 0, 0, -55, 13, 0, }, /* 871 */ + { 107, 12, 3, 0, 0, -114, 13, 0, }, /* 871 */ { 107, 10, 5, 0, 0, 107, 9, 0, }, /* 872 */ - { 107, 10, 5, 0, 0, -55, 9, 0, }, /* 873 */ + { 107, 10, 5, 0, 0, -114, 9, 0, }, /* 873 */ { 107, 7, 12, 0, 0, 107, 9, 0, }, /* 874 */ - { 28, 12, 3, 0, 0, -55, 13, 0, }, /* 875 */ + { 28, 12, 3, 0, 0, -114, 13, 0, }, /* 875 */ { 107, 10, 3, 0, 0, 107, 9, 0, }, /* 876 */ { 135, 7, 12, 0, 0, 135, 9, 0, }, /* 877 */ { 135, 10, 5, 0, 0, 135, 9, 0, }, /* 878 */ diff --git a/src/pcre2_ucp.h b/src/pcre2_ucp.h index d7f7885..c2de91f 100644 --- a/src/pcre2_ucp.h +++ b/src/pcre2_ucp.h @@ -325,7 +325,10 @@ enum { ucp_Old_Uyghur, ucp_Tangsa, ucp_Toto, - ucp_Vithkuqi + ucp_Vithkuqi, + + /* This must be last */ + ucp_Script_Count }; #endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */ diff --git a/src/pcre2_xclass.c b/src/pcre2_xclass.c index 2b5e7cf..c79200d 100644 --- a/src/pcre2_xclass.c +++ b/src/pcre2_xclass.c @@ -134,7 +134,9 @@ while ((t = *data++) != XCL_END) else /* XCL_PROP & XCL_NOTPROP */ { const ucd_record *prop = GET_UCD(c); + int scriptx; BOOL isprop = t == XCL_PROP; + BOOL ok; switch(*data) { @@ -160,6 +162,14 @@ while ((t = *data++) != XCL_END) if ((data[1] == prop->script) == isprop) return !negated; break; + case PT_SCX: + scriptx = prop->scriptx; + ok = data[1] == prop->script || data[1] == (PCRE2_UCHAR)scriptx; + if (!ok && scriptx < 0) + ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, data[1]); + if (ok == isprop) return !negated; + break; + case PT_ALNUM: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L || PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop) diff --git a/testdata/testinput4 b/testdata/testinput4 index 9b4b5c3..1be2af4 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -1,4 +1,4 @@ -# This set of tests is for UTF support, including Unicode properties. The +# This set of tests is for UTF support, including Unicode properties. The # Unicode tests are all compatible with all versions of Perl >= 5.10, but # some of the property tests may differ because of different versions of # Unicode in use by PCRE2 and Perl. @@ -6,7 +6,7 @@ # WARNING: Use only / as the pattern delimiter. Although pcre2test supports # a number of delimiters, all those other than / give problems with the # perltest.sh script. - + #newline_default lf anycrlf any #perltest @@ -694,27 +694,27 @@ /^\d*\w{4}/utf 1234 -\= Expect no match +\= Expect no match 123 /^[^b]*\w{4}/utf aaaa -\= Expect no match +\= Expect no match aaa /^[^b]*\w{4}/i,utf aaaa -\= Expect no match +\= Expect no match aaa /^\x{100}*.{4}/utf \x{100}\x{100}\x{100}\x{100} -\= Expect no match +\= Expect no match \x{100}\x{100}\x{100} /^\x{100}*.{4}/i,utf \x{100}\x{100}\x{100}\x{100} -\= Expect no match +\= Expect no match \x{100}\x{100}\x{100} /^a+[a\x{200}]/utf @@ -725,144 +725,144 @@ /^#[^\x{ffff}]#[^\x{ffff}]#[^\x{ffff}]#/utf #\x{10000}#\x{100}#\x{10ffff}# - -# Unicode property support tests + +# Unicode property support tests /^\pC\pL\pM\pN\pP\pS\pZ\s+/utf,ucp - >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} - + >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} + /^>\pZ+/utf,ucp - >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} - + >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} + /^>[[:space:]]*/utf,ucp - >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} + >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} /^>[[:blank:]]*/utf,ucp - >\x{20}\x{a0}\x{1680}\x{2000}\x{202f}\x{9}\x{b}\x{2028} + >\x{20}\x{a0}\x{1680}\x{2000}\x{202f}\x{9}\x{b}\x{2028} /^[[:alpha:]]*/utf,ucp Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d} @@ -1496,7 +1532,7 @@ Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d}1\x{660}\x{bef}\x{16ee} /^[[:cntrl:]]*/utf,ucp - \x{0}\x{09}\x{1f}\x{7f}\x{9f} + \x{0}\x{09}\x{1f}\x{7f}\x{9f} /^[[:graph:]]*/utf,ucp A\x{a1}\x{a0} @@ -1509,27 +1545,27 @@ /\p{Zs}*?\R/ \= Expect no match - a\xFCb + a\xFCb /\p{Zs}*\R/ -\= Expect no match - a\xFCb +\= Expect no match + a\xFCb /ⱥ/i,utf ⱥ - Ⱥx - Ⱥ + Ⱥx + Ⱥ /[ⱥ]/i,utf ⱥ - Ⱥx - Ⱥ + Ⱥx + Ⱥ /Ⱥ/i,utf Ⱥ ⱥ - -# These are tests for extended grapheme clusters + +# These are tests for extended grapheme clusters /^\X/utf,aftertext G\x{34e}\x{34e}X @@ -1537,7 +1573,7 @@ \x04X \x{1100}X \x{1100}\x{34e}X - \x{1b04}\x{1b04}X + \x{1b04}\x{1b04}X *These match up to the roman letters \x{1111}\x{1111}L,L \x{1111}\x{1111}\x{1169}L,L,V @@ -1577,10 +1613,10 @@ \x0b\x{0711}Control, extend \x09\x{1b04}Control, spacingmark *There are no Prepend characters, so we can't test Prepend, CR - + /^(?>\X{2})X/utf,aftertext \x{1111}\x{ae4c}\x{1111}\x{ae4c}X - + /^\X{2,4}X/utf,aftertext \x{1111}\x{ae4c}\x{1111}\x{ae4c}X \x{1111}\x{ae4c}\x{1111}\x{ae4c}\x{1111}\x{ae4c}X @@ -1613,17 +1649,17 @@ \x{1e9e}\x{00df} /\x{1f88}+/i,utf - \x{1f88}\x{1f80} + \x{1f88}\x{1f80} /[z\x{1f88}]+/i,utf - \x{1f88}\x{1f80} - + \x{1f88}\x{1f80} + # Check a reference with more than one other case -/^(\x{00b5})\1{2}$/i,utf - \x{00b5}\x{039c}\x{03bc} - -# Characters with more than one other case; test in classes +/^(\x{00b5})\1{2}$/i,utf + \x{00b5}\x{039c}\x{03bc} + +# Characters with more than one other case; test in classes /[z\x{00b5}]+/i,utf \x{00b5}\x{039c}\x{03bc} @@ -1731,13 +1767,13 @@ \x{039a}\x{03ba}\x{03f0} /[z\x{03a0}]+/i,utf - \x{03a0}\x{03c0}\x{03d6} + \x{03a0}\x{03c0}\x{03d6} /[z\x{03c0}]+/i,utf - \x{03a0}\x{03c0}\x{03d6} + \x{03a0}\x{03c0}\x{03d6} /[z\x{03d6}]+/i,utf - \x{03a0}\x{03c0}\x{03d6} + \x{03a0}\x{03c0}\x{03d6} /[z\x{03a1}]+/i,utf \x{03a1}\x{03c1}\x{03f1} @@ -1758,13 +1794,13 @@ \x{03A3}\x{03C2}\x{03C3} /[z\x{03a6}]+/i,utf - \x{03a6}\x{03c6}\x{03d5} + \x{03a6}\x{03c6}\x{03d5} /[z\x{03c6}]+/i,utf - \x{03a6}\x{03c6}\x{03d5} + \x{03a6}\x{03c6}\x{03d5} /[z\x{03d5}]+/i,utf - \x{03a6}\x{03c6}\x{03d5} + \x{03a6}\x{03c6}\x{03d5} /[z\x{03c9}]+/i,utf \x{03c9}\x{03a9}\x{2126} @@ -1784,7 +1820,7 @@ /[z\x{1e9b}]+/i,utf \x{1e60}\x{1e61}\x{1e9b} -# Perl 5.12.4 gets these wrong, but 5.15.3 is OK +# Perl 5.12.4 gets these wrong, but 5.15.3 is OK /[z\x{004b}]+/i,utf \x{004b}\x{006b}\x{212a} @@ -1803,28 +1839,28 @@ /[z\x{017f}]+/i,utf \x{0053}\x{0073}\x{017f} - -# -------------------------------------- + +# -------------------------------------- /(ΣΆΜΟΣ) \1/i,utf ΣΆΜΟΣ ΣΆΜΟΣ ΣΆΜΟΣ σάμος σάμος σάμος σάμος σάμοσ - σάμος ΣΆΜΟΣ + σάμος ΣΆΜΟΣ /(σάμος) \1/i,utf ΣΆΜΟΣ ΣΆΜΟΣ ΣΆΜΟΣ σάμος σάμος σάμος σάμος σάμοσ - σάμος ΣΆΜΟΣ + σάμος ΣΆΜΟΣ /(ΣΆΜΟΣ) \1*/i,utf ΣΆΜΟΣ\x20 ΣΆΜΟΣ ΣΆΜΟΣσάμοςσάμος -# Perl matches these +# Perl matches these /\x{00b5}+/i,utf \x{00b5}\x{039c}\x{03bc} @@ -1907,7 +1943,7 @@ /\x{03d0}+/i,utf \x{0392}\x{03b2}\x{03d0} - + /\x{0395}+/i,utf \x{0395}\x{03b5}\x{03f5} @@ -1930,7 +1966,7 @@ /\x{03f4}+/i,utf \x{0398}\x{03b8}\x{03d1}\x{03f4} - + /\x{039a}+/i,utf \x{039a}\x{03ba}\x{03f0} @@ -1940,16 +1976,16 @@ /\x{03f0}+/i,utf \x{039a}\x{03ba}\x{03f0} - + /\x{03a0}+/i,utf - \x{03a0}\x{03c0}\x{03d6} + \x{03a0}\x{03c0}\x{03d6} /\x{03c0}+/i,utf - \x{03a0}\x{03c0}\x{03d6} + \x{03a0}\x{03c0}\x{03d6} /\x{03d6}+/i,utf - \x{03a0}\x{03c0}\x{03d6} + \x{03a0}\x{03c0}\x{03d6} /\x{03a1}+/i,utf @@ -1970,16 +2006,16 @@ /\x{03c3}+/i,utf \x{03A3}\x{03C2}\x{03C3} - + /\x{03a6}+/i,utf - \x{03a6}\x{03c6}\x{03d5} + \x{03a6}\x{03c6}\x{03d5} /\x{03c6}+/i,utf - \x{03a6}\x{03c6}\x{03d5} + \x{03a6}\x{03c6}\x{03d5} /\x{03d5}+/i,utf - \x{03a6}\x{03c6}\x{03d5} + \x{03a6}\x{03c6}\x{03d5} /\x{03c9}+/i,utf @@ -1990,7 +2026,7 @@ /\x{2126}+/i,utf \x{03c9}\x{03a9}\x{2126} - + /\x{1e60}+/i,utf \x{1e60}\x{1e61}\x{1e9b} @@ -2000,22 +2036,22 @@ /\x{1e9b}+/i,utf \x{1e60}\x{1e61}\x{1e9b} - + /\x{1e9e}+/i,utf \x{1e9e}\x{00df} /\x{00df}+/i,utf \x{1e9e}\x{00df} - + /\x{1f88}+/i,utf - \x{1f88}\x{1f80} + \x{1f88}\x{1f80} /\x{1f80}+/i,utf - \x{1f88}\x{1f80} + \x{1f88}\x{1f80} -# Perl 5.12.4 gets these wrong, but 5.15.3 is OK +# Perl 5.12.4 gets these wrong, but 5.15.3 is OK /\x{004b}+/i,utf \x{004b}\x{006b}\x{212a} @@ -2039,12 +2075,12 @@ /^\p{Any}*\d{4}/utf 1234 \= Expect no match - 123 + 123 /^\X*\w{4}/utf 1234 \= Expect no match - 123 + 123 /^A\s+Z/utf,ucp A\x{2005}Z @@ -2086,13 +2122,13 @@ \x{3000} \x{e0002} \x{e001f} - \x{e0080} + \x{e0080} /^[[:print:]]+$/utf,ucp Space: \x{a0} \x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}\x{2005} - \x{2006}\x{2007}\x{2008}\x{2009}\x{200a} - \x{202f}\x{205f} + \x{2006}\x{2007}\x{2008}\x{2009}\x{200a} + \x{202f}\x{205f} \x{3000} Letter:ABC Mark:\x{300}\x{1d172}\x{1d17b} @@ -2120,15 +2156,15 @@ \x{2065} \x{e0002} \x{e001f} - \x{e0080} + \x{e0080} /^[[:punct:]]+$/utf,ucp \$+<=>^`|~ !\"#%&'()*,-./:;?@[\\]_{} - \x{a1}\x{a7} - \x{37e} + \x{a1}\x{a7} + \x{37e} \= Expect no match - abcde + abcde /^[[:^graph:]]+$/utf,ucp \x{09}\x{0a}\x{1D}\x{20}\x{85}\x{a0}\x{1680} @@ -2158,8 +2194,8 @@ \= Expect no match Space: \x{a0} \x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}\x{2005} - \x{2006}\x{2007}\x{2008}\x{2009}\x{200a} - \x{202f}\x{205f} + \x{2006}\x{2007}\x{2008}\x{2009}\x{200a} + \x{202f}\x{205f} \x{3000} Letter:ABC Mark:\x{300}\x{1d172}\x{1d17b} @@ -2180,16 +2216,16 @@ \x{e0020}\x{e0030}\x{e0040}\x{e0050}\x{e0060}\x{e0070}\x{e007f} /^[[:^punct:]]+$/utf,ucp - abcde + abcde \= Expect no match \$+<=>^`|~ !\"#%&'()*,-./:;?@[\\]_{} - \x{a1}\x{a7} - \x{37e} + \x{a1}\x{a7} + \x{37e} /[RST]+/i,utf,ucp Ss\x{17f} - + /[R-T]+/i,utf,ucp Ss\x{17f} @@ -2198,8 +2234,8 @@ /^s?c/im,utf scat - -# The next four tests are for repeated caseless back references when the + +# The next four tests are for repeated caseless back references when the # code unit length of the matched text is different to that of the original # group in the UTF-8 case. @@ -2230,17 +2266,17 @@ x /[[:punct:]]/utf,ucp - \x{b4} + \x{b4} /[[:^ascii:]]/utf,ucp \x{100} \x{200} \x{300} \x{37e} -\= Expect no match +\= Expect no match aa 99 - + /[[:^ascii:]\w]/utf,ucp aa 99 @@ -2262,7 +2298,7 @@ /[^[:ascii:]\W]/utf,ucp \x{100} \x{200} -\= Expect no match +\= Expect no match aa 99 gg @@ -2280,7 +2316,7 @@ /(?=.*b)\pL/ 11bb - + /(?(?=.*b)(?=.*b)\pL|.*c)/ 11bb @@ -2289,15 +2325,15 @@ /^\x{123}+?$/i,utf,no_auto_possess \x{123}\x{122}\x{123} -\= Expect no match +\= Expect no match \x{123}\x{124}\x{123} - + /\N{U+1234}/utf \x{1234} /[\N{U+1234}]/utf \x{1234} - + # Test the full list of Unicode "Pattern White Space" characters that are to # be ignored by /x. The pattern lines below may show up oddly in text editors # or when listed to the screen. Note that characters such as U+2002, which are @@ -2310,8 +2346,8 @@ A\x{2002}B \= Expect no match AB - -# ------- + +# ------- /[^\x{100}-\x{ffff}]*[\x80-\xff]/utf \x{99}\x{99}\x{99} @@ -2321,7 +2357,7 @@ /[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf \x{99}\x{99}\x{99} - + # Script run tests /^(*script_run:.{4})/utf @@ -2334,7 +2370,7 @@ \x{02ea}\x{2e80}\x{2e80}\x{3105} Bopomofo-Sk Han Han Bopomofo \x{3105}\x{2e80}\x{2e80}\x{3105} Bopomofo Han Han Bopomofo \x{0300}cd! Inherited Latin Latin Common - \x{0391}12\x{03a9} Greek Common-digits Greek + \x{0391}12\x{03a9} Greek Common-digits Greek \x{0400}12\x{fe2f} Cyrillic Common-digits Cyrillic \x{0531}12\x{fb17} Armenian Common-digits Armenian \x{0591}12\x{fb4f} Hebrew Common-digits Hebrew @@ -2346,7 +2382,7 @@ \x{3041}12\x{3041} Hiragana Common-digits Hiragana \x{0980}\x{09e6}\x{09e7}\x{0993} Bengali Bengali-digits Bengali !cde Common Latin Latin Latin - A..B Latin Common Common Latin + A..B Latin Common Common Latin 0abc Ascii-digit Latin Latin Latin 1\x{0700}\x{0700}\x{0700} Ascii-digit Syriac x 3 \x{1A80}\x{1A80}\x{1a40}\x{1a41} Tai Tham Hora digits, letters @@ -2355,16 +2391,16 @@ \x{1100}\x{02ea}\x{02ea}\x{02ea} Hangul Bopomofo x3 \x{02ea}\x{02ea}\x{02ea}\x{1100} Bopomofo x3 Hangul \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul - \x{0391}\x{09e6}\x{09e7}\x{03a9} Greek Bengali digits Greek + \x{0391}\x{09e6}\x{09e7}\x{03a9} Greek Bengali digits Greek \x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic \x{0600}\x{0669}7\x{1eef1} Arabic Arabic-digit ascii-digit Arabic - A5\x{ff19}B Latin Common-ascii/notascii-digits Latin + A5\x{ff19}B Latin Common-ascii/notascii-digits Latin \x{0300}cd\x{0391} Inherited Latin Latin Greek !cd\x{0391} Common Latin Latin Greek \x{1A80}\x{1A90}\x{1a40}\x{1a41} Tai Tham Hora digit, Tham digit, letters A\x{1d7ce}\x{1d7ff}B Common fancy-common-2-sets-digits Common \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana - + /^(*sr:.{4}|..)/utf \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana @@ -2388,7 +2424,7 @@ /^(*sr:\x{2e80}*)\x{2e80}/utf \x{2e80}\x{2e80}\x{3105} Han Han Bopomofo - + /^(*sr:.*)Test/utf Test script run on an empty string @@ -2398,7 +2434,7 @@ \x{1A80}\x{1a40}\x{1A90}\x{1a41} Tai Tham Hora digit, letter, Tham digit, letter \= Expect no match \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul - + /^(*sr:\S*)/utf \x{1cf4}\x{20f0}\x{900}\x{11305} [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Gran \x{1cf4}\x{20f0}\x{11305}\x{900} [Dev,Gran,Kan] [Dev,Gran,Lat] Gran Dev @@ -2406,7 +2442,7 @@ \x{1cf4}\x{20f0}ABC [Dev,Gran,Kan] [Dev,Gran,Lat] Lat \x{20f0}ABC [Dev,Gran,Lat] Lat XYZ\x{20f0}ABC Lat [Dev,Gran,Lat] Lat - \x{a36}\x{a33}\x{900} [Dev,...] [Dev,...] Dev + \x{a36}\x{a33}\x{900} [Dev,...] [Dev,...] Dev \x{3001}\x{2e80}\x{3041}\x{30a1} [Bopo, Han, etc] Han Hira Kata \x{3001}\x{30a1}\x{2e80}\x{3041} [Bopo, Han, etc] Kata Han Hira \x{3001}\x{3105}\x{2e80}\x{1101} [Bopo, Han, etc] Bopomofo Han Hangul @@ -2446,20 +2482,20 @@ \x{102e0}\x{06d4}\x{1ee4d} [Arabic Coptic] [Arab Rohingya] Arabic \x{102e0}\x{06d4}\x{2cc9} [Arabic Coptic] [Arab Rohingya] Coptic \x{102e0}\x{06d4}\x{10d30} [Arabic Coptic] [Arab Rohingya] Rohingya - + # Test loop breaking for empty string match /^(*sr:A|)*BCD/utf AABCD ABCD - BCD - -# The use of (*ACCEPT) breaks script run checking + BCD + +# The use of (*ACCEPT) breaks script run checking /^(*sr:.*(*ACCEPT)ZZ)/utf \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul -# ------- +# ------- # Test group names containing non-ASCII letters and digits @@ -2488,12 +2524,12 @@ \xF3aaa\xE4\xEA\xEB\xFEa /Я/i,utf - \x{42f} - \x{44f} + \x{42f} + \x{44f} /(?=Я)/i,utf - \x{42f} - \x{44f} + \x{42f} + \x{44f} # ----------------------------------------------------------------------------- # Tests for bidi control and bidi class properties. @@ -2598,8 +2634,8 @@ -->\x{590}\x{5c6}\x{200f}\x{10805}<-- /\p{bidi class:RLE}+\p{bidi class:RLI}*\p{bidi class:RLO}+/utf - -->\x{202b}\x{2067}\x{202e}<-- - + -->\x{202b}\x{2067}\x{202e}<-- + /\p{bidi class:S}+\p{bidiclass:WS}+/utf -->\x{9}\x{b}\x{1f} \x{c} \x{2000} \x{3000}<-- diff --git a/testdata/testinput5 b/testdata/testinput5 index da7a409..6f4948a 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -1337,6 +1337,8 @@ # These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE +#subject no_jit + /^[\p{Batak}]/utf \x{1bc0} \x{1bff} @@ -1356,6 +1358,8 @@ \x{85c} \x{85d} +#subject -no_jit + /(\X*)(.)/s,utf A\x{300} @@ -2035,6 +2039,8 @@ # doesn't recognize all these scripts. In time these three tests can be moved # to test 4. +#subject no_jit + /^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+) (\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+) (\p{Zanabazar_Square}+)/x,utf @@ -2043,7 +2049,7 @@ /^\x{1E900}\x{104B0}/i,utf \x{1E900}\x{104B0} \x{1E922}\x{104D8} - + /^(?:(\X)(?C))+$/utf \x{1E900}\x{1E924}\x{1E953}\x{11C00}\x{11C2D}\x{11C3E}\x{11C70}\x{11C77}\x{11CAB}\x{11400}\x{1142F}\x{11455}\x{104B0}\x{104D8}\x{104FB}\x{16FE0}\x{18800}\x{18AF2}\x{11D00}\x{11D3A}\x{11D59}\x{16FE1}\x{1B170}\x{1B2FB}\x{11A50}\x{11A58}\x{11AA2}\x{11A00}\x{11A07}\x{11A47}\=callout_capture,callout_no_where @@ -2092,6 +2098,8 @@ \x{655} \x{1D1AA} +#subject -no_jit + /\N{U+}/ /\N{U+}/utf @@ -2192,4 +2200,18 @@ /\p{bidi_control}+\p{L&}/B +/\p{han}/B + +/\p{script:han}/B + +/\p{sc:han}/B + +/\p{script extensions:han}/B + +/\p{scx:han}/B + +# Test error - invalid script name + +/\p{sc:L}/ + # End of testinput5 diff --git a/testdata/testinput7 b/testdata/testinput7 index 6703314..2d90b41 100644 --- a/testdata/testinput7 +++ b/testdata/testinput7 @@ -2203,4 +2203,32 @@ # ----------------------------------------------------------------------------- +/\p{katakana}/utf + \x{30a1} + \x{3001} + +/\p{scx:katakana}/utf + \x{30a1} + \x{3001} + +/\p{script extensions:katakana}/utf + \x{30a1} + \x{3001} + +/\p{sc:katakana}/utf + \x{30a1} +\= Expect no match + \x{3001} + +/\p{script:katakana}/utf + \x{30a1} +\= Expect no match + \x{3001} + +/\p{sc:katakana}{3,}/utf + \x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC + +/\p{sc:katakana}{3,}?/utf + \x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC + # End of testinput7 diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 20b3d43..0c9422d 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -1,4 +1,4 @@ -# This set of tests is for UTF support, including Unicode properties. The +# This set of tests is for UTF support, including Unicode properties. The # Unicode tests are all compatible with all versions of Perl >= 5.10, but # some of the property tests may differ because of different versions of # Unicode in use by PCRE2 and Perl. @@ -6,7 +6,7 @@ # WARNING: Use only / as the pattern delimiter. Although pcre2test supports # a number of delimiters, all those other than / give problems with the # perltest.sh script. - + #newline_default lf anycrlf any #perltest @@ -1183,35 +1183,35 @@ MK: a\x{a3}b /^\d*\w{4}/utf 1234 0: 1234 -\= Expect no match +\= Expect no match 123 No match /^[^b]*\w{4}/utf aaaa 0: aaaa -\= Expect no match +\= Expect no match aaa No match /^[^b]*\w{4}/i,utf aaaa 0: aaaa -\= Expect no match +\= Expect no match aaa No match /^\x{100}*.{4}/utf \x{100}\x{100}\x{100}\x{100} 0: \x{100}\x{100}\x{100}\x{100} -\= Expect no match +\= Expect no match \x{100}\x{100}\x{100} No match /^\x{100}*.{4}/i,utf \x{100}\x{100}\x{100}\x{100} 0: \x{100}\x{100}\x{100}\x{100} -\= Expect no match +\= Expect no match \x{100}\x{100}\x{100} No match @@ -1226,113 +1226,113 @@ No match /^#[^\x{ffff}]#[^\x{ffff}]#[^\x{ffff}]#/utf #\x{10000}#\x{100}#\x{10ffff}# 0: #\x{10000}#\x{100}#\x{10ffff}# - -# Unicode property support tests + +# Unicode property support tests /^\pC\pL\pM\pN\pP\pS\pZ\s+/utf,ucp - >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} + >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} 0: > \x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{09}\x{0b} - + /^>\pZ+/utf,ucp - >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} + >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} 0: > \x{a0}\x{1680}\x{2028}\x{2029}\x{202f} - + /^>[[:space:]]*/utf,ucp - >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} + >\x{20}\x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{9}\x{b} 0: > \x{a0}\x{1680}\x{2028}\x{2029}\x{202f}\x{09}\x{0b} /^>[[:blank:]]*/utf,ucp - >\x{20}\x{a0}\x{1680}\x{2000}\x{202f}\x{9}\x{b}\x{2028} + >\x{20}\x{a0}\x{1680}\x{2000}\x{202f}\x{9}\x{b}\x{2028} 0: > \x{a0}\x{1680}\x{2000}\x{202f}\x{09} /^[[:alpha:]]*/utf,ucp @@ -2446,7 +2494,7 @@ No match 0: Az\x{aa}\x{c0}\x{1c5}\x{2b0}\x{3b6}\x{1d7c9}\x{2fa1d}1\x{660}\x{bef}\x{16ee} /^[[:cntrl:]]*/utf,ucp - \x{0}\x{09}\x{1f}\x{7f}\x{9f} + \x{0}\x{09}\x{1f}\x{7f}\x{9f} 0: \x{00}\x{09}\x{1f}\x{7f}\x{9f} /^[[:graph:]]*/utf,ucp @@ -2463,28 +2511,28 @@ No match /\p{Zs}*?\R/ \= Expect no match - a\xFCb + a\xFCb No match /\p{Zs}*\R/ -\= Expect no match - a\xFCb +\= Expect no match + a\xFCb No match /ⱥ/i,utf ⱥ 0: \x{2c65} - Ⱥx + Ⱥx 0: \x{23a} - Ⱥ + Ⱥ 0: \x{23a} /[ⱥ]/i,utf ⱥ 0: \x{2c65} - Ⱥx + Ⱥx 0: \x{23a} - Ⱥ + Ⱥ 0: \x{23a} /Ⱥ/i,utf @@ -2492,8 +2540,8 @@ No match 0: \x{23a} ⱥ 0: \x{2c65} - -# These are tests for extended grapheme clusters + +# These are tests for extended grapheme clusters /^\X/utf,aftertext G\x{34e}\x{34e}X @@ -2511,7 +2559,7 @@ No match \x{1100}\x{34e}X 0: \x{1100}\x{34e} 0+ X - \x{1b04}\x{1b04}X + \x{1b04}\x{1b04}X 0: \x{1b04}\x{1b04} 0+ X *These match up to the roman letters @@ -2631,12 +2679,12 @@ No match *There are no Prepend characters, so we can't test Prepend, CR 0: * 0+ There are no Prepend characters, so we can't test Prepend, CR - + /^(?>\X{2})X/utf,aftertext \x{1111}\x{ae4c}\x{1111}\x{ae4c}X 0: \x{1111}\x{ae4c}\x{1111}\x{ae4c}X 0+ - + /^\X{2,4}X/utf,aftertext \x{1111}\x{ae4c}\x{1111}\x{ae4c}X 0: \x{1111}\x{ae4c}\x{1111}\x{ae4c}X @@ -2688,21 +2736,21 @@ No match 0: \x{1e9e}\x{df} /\x{1f88}+/i,utf - \x{1f88}\x{1f80} + \x{1f88}\x{1f80} 0: \x{1f88}\x{1f80} /[z\x{1f88}]+/i,utf - \x{1f88}\x{1f80} + \x{1f88}\x{1f80} 0: \x{1f88}\x{1f80} - + # Check a reference with more than one other case -/^(\x{00b5})\1{2}$/i,utf - \x{00b5}\x{039c}\x{03bc} +/^(\x{00b5})\1{2}$/i,utf + \x{00b5}\x{039c}\x{03bc} 0: \x{b5}\x{39c}\x{3bc} 1: \x{b5} - -# Characters with more than one other case; test in classes + +# Characters with more than one other case; test in classes /[z\x{00b5}]+/i,utf \x{00b5}\x{039c}\x{03bc} @@ -2845,15 +2893,15 @@ No match 0: \x{39a}\x{3ba}\x{3f0} /[z\x{03a0}]+/i,utf - \x{03a0}\x{03c0}\x{03d6} + \x{03a0}\x{03c0}\x{03d6} 0: \x{3a0}\x{3c0}\x{3d6} /[z\x{03c0}]+/i,utf - \x{03a0}\x{03c0}\x{03d6} + \x{03a0}\x{03c0}\x{03d6} 0: \x{3a0}\x{3c0}\x{3d6} /[z\x{03d6}]+/i,utf - \x{03a0}\x{03c0}\x{03d6} + \x{03a0}\x{03c0}\x{03d6} 0: \x{3a0}\x{3c0}\x{3d6} /[z\x{03a1}]+/i,utf @@ -2881,15 +2929,15 @@ No match 0: \x{3a3}\x{3c2}\x{3c3} /[z\x{03a6}]+/i,utf - \x{03a6}\x{03c6}\x{03d5} + \x{03a6}\x{03c6}\x{03d5} 0: \x{3a6}\x{3c6}\x{3d5} /[z\x{03c6}]+/i,utf - \x{03a6}\x{03c6}\x{03d5} + \x{03a6}\x{03c6}\x{03d5} 0: \x{3a6}\x{3c6}\x{3d5} /[z\x{03d5}]+/i,utf - \x{03a6}\x{03c6}\x{03d5} + \x{03a6}\x{03c6}\x{03d5} 0: \x{3a6}\x{3c6}\x{3d5} /[z\x{03c9}]+/i,utf @@ -2916,7 +2964,7 @@ No match \x{1e60}\x{1e61}\x{1e9b} 0: \x{1e60}\x{1e61}\x{1e9b} -# Perl 5.12.4 gets these wrong, but 5.15.3 is OK +# Perl 5.12.4 gets these wrong, but 5.15.3 is OK /[z\x{004b}]+/i,utf \x{004b}\x{006b}\x{212a} @@ -2941,8 +2989,8 @@ No match /[z\x{017f}]+/i,utf \x{0053}\x{0073}\x{017f} 0: Ss\x{17f} - -# -------------------------------------- + +# -------------------------------------- /(ΣΆΜΟΣ) \1/i,utf ΣΆΜΟΣ ΣΆΜΟΣ @@ -2957,7 +3005,7 @@ No match σάμος σάμοσ 0: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c3} 1: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} - σάμος ΣΆΜΟΣ + σάμος ΣΆΜΟΣ 0: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3} 1: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} @@ -2974,7 +3022,7 @@ No match σάμος σάμοσ 0: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c3} 1: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} - σάμος ΣΆΜΟΣ + σάμος ΣΆΜΟΣ 0: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3} 1: \x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} @@ -2986,7 +3034,7 @@ No match 0: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3} \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3}\x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2}\x{3c3}\x{3ac}\x{3bc}\x{3bf}\x{3c2} 1: \x{3a3}\x{386}\x{39c}\x{39f}\x{3a3} -# Perl matches these +# Perl matches these /\x{00b5}+/i,utf \x{00b5}\x{039c}\x{03bc} @@ -3094,7 +3142,7 @@ No match /\x{03d0}+/i,utf \x{0392}\x{03b2}\x{03d0} 0: \x{392}\x{3b2}\x{3d0} - + /\x{0395}+/i,utf \x{0395}\x{03b5}\x{03f5} @@ -3124,7 +3172,7 @@ No match /\x{03f4}+/i,utf \x{0398}\x{03b8}\x{03d1}\x{03f4} 0: \x{398}\x{3b8}\x{3d1}\x{3f4} - + /\x{039a}+/i,utf \x{039a}\x{03ba}\x{03f0} @@ -3137,18 +3185,18 @@ No match /\x{03f0}+/i,utf \x{039a}\x{03ba}\x{03f0} 0: \x{39a}\x{3ba}\x{3f0} - + /\x{03a0}+/i,utf - \x{03a0}\x{03c0}\x{03d6} + \x{03a0}\x{03c0}\x{03d6} 0: \x{3a0}\x{3c0}\x{3d6} /\x{03c0}+/i,utf - \x{03a0}\x{03c0}\x{03d6} + \x{03a0}\x{03c0}\x{03d6} 0: \x{3a0}\x{3c0}\x{3d6} /\x{03d6}+/i,utf - \x{03a0}\x{03c0}\x{03d6} + \x{03a0}\x{03c0}\x{03d6} 0: \x{3a0}\x{3c0}\x{3d6} @@ -3176,18 +3224,18 @@ No match /\x{03c3}+/i,utf \x{03A3}\x{03C2}\x{03C3} 0: \x{3a3}\x{3c2}\x{3c3} - + /\x{03a6}+/i,utf - \x{03a6}\x{03c6}\x{03d5} + \x{03a6}\x{03c6}\x{03d5} 0: \x{3a6}\x{3c6}\x{3d5} /\x{03c6}+/i,utf - \x{03a6}\x{03c6}\x{03d5} + \x{03a6}\x{03c6}\x{03d5} 0: \x{3a6}\x{3c6}\x{3d5} /\x{03d5}+/i,utf - \x{03a6}\x{03c6}\x{03d5} + \x{03a6}\x{03c6}\x{03d5} 0: \x{3a6}\x{3c6}\x{3d5} @@ -3202,7 +3250,7 @@ No match /\x{2126}+/i,utf \x{03c9}\x{03a9}\x{2126} 0: \x{3c9}\x{3a9}\x{2126} - + /\x{1e60}+/i,utf \x{1e60}\x{1e61}\x{1e9b} @@ -3215,7 +3263,7 @@ No match /\x{1e9b}+/i,utf \x{1e60}\x{1e61}\x{1e9b} 0: \x{1e60}\x{1e61}\x{1e9b} - + /\x{1e9e}+/i,utf \x{1e9e}\x{00df} @@ -3224,17 +3272,17 @@ No match /\x{00df}+/i,utf \x{1e9e}\x{00df} 0: \x{1e9e}\x{df} - + /\x{1f88}+/i,utf - \x{1f88}\x{1f80} + \x{1f88}\x{1f80} 0: \x{1f88}\x{1f80} /\x{1f80}+/i,utf - \x{1f88}\x{1f80} + \x{1f88}\x{1f80} 0: \x{1f88}\x{1f80} -# Perl 5.12.4 gets these wrong, but 5.15.3 is OK +# Perl 5.12.4 gets these wrong, but 5.15.3 is OK /\x{004b}+/i,utf \x{004b}\x{006b}\x{212a} @@ -3265,14 +3313,14 @@ No match 1234 0: 1234 \= Expect no match - 123 + 123 No match /^\X*\w{4}/utf 1234 0: 1234 \= Expect no match - 123 + 123 No match /^A\s+Z/utf,ucp @@ -3349,7 +3397,7 @@ No match No match \x{e001f} No match - \x{e0080} + \x{e0080} No match /^[[:print:]]+$/utf,ucp @@ -3357,9 +3405,9 @@ No match 0: Space: \x{a0} \x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}\x{2005} 0: \x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}\x{2005} - \x{2006}\x{2007}\x{2008}\x{2009}\x{200a} + \x{2006}\x{2007}\x{2008}\x{2009}\x{200a} 0: \x{2006}\x{2007}\x{2008}\x{2009}\x{200a} - \x{202f}\x{205f} + \x{202f}\x{205f} 0: \x{202f}\x{205f} \x{3000} 0: \x{3000} @@ -3414,7 +3462,7 @@ No match No match \x{e001f} No match - \x{e0080} + \x{e0080} No match /^[[:punct:]]+$/utf,ucp @@ -3422,12 +3470,12 @@ No match 0: $+<=>^`|~ !\"#%&'()*,-./:;?@[\\]_{} 0: !"#%&'()*,-./:;?@[\]_{} - \x{a1}\x{a7} + \x{a1}\x{a7} 0: \x{a1}\x{a7} - \x{37e} + \x{37e} 0: \x{37e} \= Expect no match - abcde + abcde No match /^[[:^graph:]]+$/utf,ucp @@ -3481,9 +3529,9 @@ No match No match \x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}\x{2005} No match - \x{2006}\x{2007}\x{2008}\x{2009}\x{200a} + \x{2006}\x{2007}\x{2008}\x{2009}\x{200a} No match - \x{202f}\x{205f} + \x{202f}\x{205f} No match \x{3000} No match @@ -3523,22 +3571,22 @@ No match No match /^[[:^punct:]]+$/utf,ucp - abcde + abcde 0: abcde \= Expect no match \$+<=>^`|~ No match !\"#%&'()*,-./:;?@[\\]_{} No match - \x{a1}\x{a7} + \x{a1}\x{a7} No match - \x{37e} + \x{37e} No match /[RST]+/i,utf,ucp Ss\x{17f} 0: Ss\x{17f} - + /[R-T]+/i,utf,ucp Ss\x{17f} 0: Ss\x{17f} @@ -3550,8 +3598,8 @@ No match /^s?c/im,utf scat 0: sc - -# The next four tests are for repeated caseless back references when the + +# The next four tests are for repeated caseless back references when the # code unit length of the matched text is different to that of the original # group in the UTF-8 case. @@ -3607,7 +3655,7 @@ No match 0: x /[[:punct:]]/utf,ucp - \x{b4} + \x{b4} No match /[[:^ascii:]]/utf,ucp @@ -3619,12 +3667,12 @@ No match 0: \x{300} \x{37e} 0: \x{37e} -\= Expect no match +\= Expect no match aa No match 99 No match - + /[[:^ascii:]\w]/utf,ucp aa 0: a @@ -3662,7 +3710,7 @@ No match 0: \x{100} \x{200} 0: \x{200} -\= Expect no match +\= Expect no match aa No match 99 @@ -3692,7 +3740,7 @@ No match /(?=.*b)\pL/ 11bb 0: b - + /(?(?=.*b)(?=.*b)\pL|.*c)/ 11bb 0: b @@ -3704,10 +3752,10 @@ No match /^\x{123}+?$/i,utf,no_auto_possess \x{123}\x{122}\x{123} 0: \x{123}\x{122}\x{123} -\= Expect no match +\= Expect no match \x{123}\x{124}\x{123} No match - + /\N{U+1234}/utf \x{1234} 0: \x{1234} @@ -3715,7 +3763,7 @@ No match /[\N{U+1234}]/utf \x{1234} 0: \x{1234} - + # Test the full list of Unicode "Pattern White Space" characters that are to # be ignored by /x. The pattern lines below may show up oddly in text editors # or when listed to the screen. Note that characters such as U+2002, which are @@ -3731,8 +3779,8 @@ No match \= Expect no match AB No match - -# ------- + +# ------- /[^\x{100}-\x{ffff}]*[\x80-\xff]/utf \x{99}\x{99}\x{99} @@ -3745,7 +3793,7 @@ No match /[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf \x{99}\x{99}\x{99} 0: \x{99}\x{99}\x{99} - + # Script run tests /^(*script_run:.{4})/utf @@ -3767,7 +3815,7 @@ No match 0: \x{3105}\x{2e80}\x{2e80}\x{3105} \x{0300}cd! Inherited Latin Latin Common 0: \x{300}cd! - \x{0391}12\x{03a9} Greek Common-digits Greek + \x{0391}12\x{03a9} Greek Common-digits Greek 0: \x{391}12\x{3a9} \x{0400}12\x{fe2f} Cyrillic Common-digits Cyrillic 0: \x{400}12\x{fe2f} @@ -3791,7 +3839,7 @@ No match 0: \x{980}\x{9e6}\x{9e7}\x{993} !cde Common Latin Latin Latin 0: !cde - A..B Latin Common Common Latin + A..B Latin Common Common Latin 0: A..B 0abc Ascii-digit Latin Latin Latin 0: 0abc @@ -3808,13 +3856,13 @@ No match No match \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul No match - \x{0391}\x{09e6}\x{09e7}\x{03a9} Greek Bengali digits Greek + \x{0391}\x{09e6}\x{09e7}\x{03a9} Greek Bengali digits Greek No match \x{0600}7\x{0669}\x{1eef1} Arabic ascii-digit Arabic-digit Arabic No match \x{0600}\x{0669}7\x{1eef1} Arabic Arabic-digit ascii-digit Arabic No match - A5\x{ff19}B Latin Common-ascii/notascii-digits Latin + A5\x{ff19}B Latin Common-ascii/notascii-digits Latin No match \x{0300}cd\x{0391} Inherited Latin Latin Greek No match @@ -3826,7 +3874,7 @@ No match No match \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana No match - + /^(*sr:.{4}|..)/utf \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana 0: \x{2e80}\x{3105} @@ -3858,7 +3906,7 @@ No match /^(*sr:\x{2e80}*)\x{2e80}/utf \x{2e80}\x{2e80}\x{3105} Han Han Bopomofo 0: \x{2e80}\x{2e80} - + /^(*sr:.*)Test/utf Test script run on an empty string 0: Test @@ -3876,7 +3924,7 @@ No match \= Expect no match \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul No match - + /^(*sr:\S*)/utf \x{1cf4}\x{20f0}\x{900}\x{11305} [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Gran 0: \x{1cf4}\x{20f0}\x{900} @@ -3890,7 +3938,7 @@ No match 0: \x{20f0}ABC XYZ\x{20f0}ABC Lat [Dev,Gran,Lat] Lat 0: XYZ\x{20f0}ABC - \x{a36}\x{a33}\x{900} [Dev,...] [Dev,...] Dev + \x{a36}\x{a33}\x{900} [Dev,...] [Dev,...] Dev 0: \x{a36}\x{a33} \x{3001}\x{2e80}\x{3041}\x{30a1} [Bopo, Han, etc] Han Hira Kata 0: \x{3001}\x{2e80}\x{3041}\x{30a1} @@ -3960,7 +4008,7 @@ No match 0: \x{102e0}\x{6d4} \x{102e0}\x{06d4}\x{10d30} [Arabic Coptic] [Arab Rohingya] Rohingya 0: \x{102e0}\x{6d4} - + # Test loop breaking for empty string match /^(*sr:A|)*BCD/utf @@ -3968,16 +4016,16 @@ No match 0: AABCD ABCD 0: ABCD - BCD + BCD 0: BCD - -# The use of (*ACCEPT) breaks script run checking + +# The use of (*ACCEPT) breaks script run checking /^(*sr:.*(*ACCEPT)ZZ)/utf \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul 0: \x{1100}\x{2e80}\x{3041}\x{1101} Hangul Han Hiragana Hangul -# ------- +# ------- # Test group names containing non-ASCII letters and digits @@ -4021,15 +4069,15 @@ No match 0: \xf3aaa\xe4\xea\xeb\xfea /Я/i,utf - \x{42f} + \x{42f} 0: \x{42f} - \x{44f} + \x{44f} 0: \x{44f} /(?=Я)/i,utf - \x{42f} + \x{42f} 0: - \x{44f} + \x{44f} 0: # ----------------------------------------------------------------------------- @@ -4178,9 +4226,9 @@ No match 0: \x{590}\x{5c6}\x{200f}\x{10805} /\p{bidi class:RLE}+\p{bidi class:RLI}*\p{bidi class:RLO}+/utf - -->\x{202b}\x{2067}\x{202e}<-- + -->\x{202b}\x{2067}\x{202e}<-- 0: \x{202b}\x{2067}\x{202e} - + /\p{bidi class:S}+\p{bidiclass:WS}+/utf -->\x{9}\x{b}\x{1f} \x{c} \x{2000} \x{3000}<-- 0: \x{09}\x{0b}\x{1f} \x{0c} \x{2000} \x{3000} diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 3a45eb8..9936db3 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -2842,6 +2842,8 @@ No match # These scripts weren't yet in Perl when I added Unicode 6.0.0 to PCRE +#subject no_jit + /^[\p{Batak}]/utf \x{1bc0} 0: \x{1bc0} @@ -2871,6 +2873,8 @@ No match \x{85d} No match +#subject -no_jit + /(\X*)(.)/s,utf A\x{300} 0: A @@ -4599,6 +4603,8 @@ No match # doesn't recognize all these scripts. In time these three tests can be moved # to test 4. +#subject no_jit + /^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+) (\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+) (\p{Zanabazar_Square}+)/x,utf @@ -4620,7 +4626,7 @@ No match 0: \x{1e900}\x{104b0} \x{1E922}\x{104D8} 0: \x{1e922}\x{104d8} - + /^(?:(\X)(?C))+$/utf \x{1E900}\x{1E924}\x{1E953}\x{11C00}\x{11C2D}\x{11C3E}\x{11C70}\x{11C77}\x{11CAB}\x{11400}\x{1142F}\x{11455}\x{104B0}\x{104D8}\x{104FB}\x{16FE0}\x{18800}\x{18AF2}\x{11D00}\x{11D3A}\x{11D59}\x{16FE1}\x{1B170}\x{1B2FB}\x{11A50}\x{11A58}\x{11AA2}\x{11A00}\x{11A07}\x{11A47}\=callout_capture,callout_no_where Callout 0: last capture = 1 @@ -4755,6 +4761,8 @@ No match \x{1D1AA} 0: \x{1d1aa} +#subject -no_jit + /\N{U+}/ Failed: error 193 at offset 2: \N{U+dddd} is supported only in Unicode (UTF) mode @@ -4967,4 +4975,49 @@ Subject length lower bound = 3 End ------------------------------------------------------------------ +/\p{han}/B +------------------------------------------------------------------ + Bra + prop Han + Ket + End +------------------------------------------------------------------ + +/\p{script:han}/B +------------------------------------------------------------------ + Bra + prop script:Han + Ket + End +------------------------------------------------------------------ + +/\p{sc:han}/B +------------------------------------------------------------------ + Bra + prop script:Han + Ket + End +------------------------------------------------------------------ + +/\p{script extensions:han}/B +------------------------------------------------------------------ + Bra + prop Han + Ket + End +------------------------------------------------------------------ + +/\p{scx:han}/B +------------------------------------------------------------------ + Bra + prop Han + Ket + End +------------------------------------------------------------------ + +# Test error - invalid script name + +/\p{sc:L}/ +Failed: error 147 at offset 8: unknown property after \P or \p + # End of testinput5 diff --git a/testdata/testoutput7 b/testdata/testoutput7 index 329ff31..6e71fc8 100644 --- a/testdata/testoutput7 +++ b/testdata/testoutput7 @@ -3714,4 +3714,46 @@ No match # ----------------------------------------------------------------------------- +/\p{katakana}/utf + \x{30a1} + 0: \x{30a1} + \x{3001} + 0: \x{3001} + +/\p{scx:katakana}/utf + \x{30a1} + 0: \x{30a1} + \x{3001} + 0: \x{3001} + +/\p{script extensions:katakana}/utf + \x{30a1} + 0: \x{30a1} + \x{3001} + 0: \x{3001} + +/\p{sc:katakana}/utf + \x{30a1} + 0: \x{30a1} +\= Expect no match + \x{3001} +No match + +/\p{script:katakana}/utf + \x{30a1} + 0: \x{30a1} +\= Expect no match + \x{3001} +No match + +/\p{sc:katakana}{3,}/utf + \x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC + 0: \x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66} + +/\p{sc:katakana}{3,}?/utf + \x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC + 0: \x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66} + 1: \x{30a1}\x{30fa}\x{32d0}\x{1b122} + 2: \x{30a1}\x{30fa}\x{32d0} + # End of testinput7