From d888d360139a5f3ab1da81d70df0417984da6c1a Mon Sep 17 00:00:00 2001 From: Philip Hazel Date: Fri, 31 Dec 2021 16:06:05 +0000 Subject: [PATCH] Update script run code to work with new script extensions coding --- maint/GenerateUcd.py | 63 +++--- maint/ucptest.c | 40 +--- src/pcre2_internal.h | 5 +- src/pcre2_script_run.c | 445 ++++++++++++++++++----------------------- src/pcre2_ucd.c | 4 +- testdata/testinput4 | 23 ++- testdata/testinput5 | 9 - testdata/testoutput4 | 33 ++- testdata/testoutput5 | 15 -- 9 files changed, 290 insertions(+), 347 deletions(-) diff --git a/maint/GenerateUcd.py b/maint/GenerateUcd.py index 08f3949..f4517f5 100755 --- a/maint/GenerateUcd.py +++ b/maint/GenerateUcd.py @@ -117,8 +117,9 @@ # Conceptually, there is a table of records (of type ucd_record), one for each # Unicode character. Each record contains the script number, script extension # value, character type, grapheme break type, offset to caseless matching set, -# offset to the character's other case, and the bidi class/control. However, a -# real table covering all Unicode characters would be far too big. It can be +# offset to the character's other case, and the bidi class/control. +# +# A real table covering all Unicode characters would be far too big. It can be # efficiently compressed by observing that many characters have the same # record, and many blocks of characters (taking 128 characters in a block) have # the same set of records as other blocks. This leads to a 2-stage lookup @@ -135,13 +136,20 @@ # in script runs all come from the same set. The first element in the vector # contains the number of subsequent elements, which are in ascending order. # +# The lists of scripts in script_names and script_abbrevs are partitioned into +# two groups. Scripts that appear in at least one character's script extension +# list come first, followed by "Unknown" and then all the rest. This sorting is +# done automatically in the GenerateCommon.py script. 
A script's number +# is its index in these lists. +# # The ucd_script_sets vector contains bitmaps that represent lists of scripts -# for the Script Extensions properties of certain characters. Each bitmap -# consists of a fixed number of unsigned 32-bit numbers, enough to allocate -# a bit for every known script. A character with more than one script listed -# for its Script Extension property has a negative value in its record. This is -# the negated offset to the start of the relevant bitmap in the ucd_script_sets -# vector. +# for Script Extensions properties. Each bitmap consists of a fixed number of +# unsigned 32-bit numbers, enough to allocate a bit for every script that is +# used in any character's extension list, that is, enough for every script +# whose number is less than ucp_Unknown. A character's script extension value +# in its ucd record is an offset into the ucd_script_sets vector. The first +# bitmap has no bits set; characters that have no script extensions have zero +# as their script extensions value so that they use this map. # # The ucd_records table contains one instance of every unique record that is # required. 
The ucd_stage1 table is indexed by a character's block number, @@ -157,15 +165,15 @@ # # Example: lowercase "a" (U+0061) is in block 0 # lookup 0 in stage1 table yields 0 -# lookup 97 (0x61) in the first table in stage2 yields 22 -# record 22 is { 34, 5, 12, 0, -32, 34, 2, 0 } -# 34 = ucp_Latin => Latin script +# lookup 97 (0x61) in the first table in stage2 yields 23 +# record 23 is { 20, 5, 12, 0, -32, 0, 9, 0 } +# 20 = ucp_Latin => Latin script # 5 = ucp_Ll => Lower case letter # 12 = ucp_gbOther => Grapheme break property "Other" # 0 => Not part of a caseless set # -32 (-0x20) => Other case is U+0041 -# 34 = ucp_Latin => No special Script Extension property -# 2 = ucp_bidiL => Bidi class left-to-right +# 0 => No special Script Extension property +# 9 = ucp_bidiL => Bidi class left-to-right # 0 => Dummy value, unused at present # # Almost all lowercase latin characters resolve to the same record. One or two @@ -174,35 +182,35 @@ # # Example: hiragana letter A (U+3042) is in block 96 (0x60) # lookup 96 in stage1 table yields 91 -# lookup 66 (0x42) in table 91 in stage2 yields 613 -# record 613 is { 27, 7, 12, 0, 0, 27, 2, 0 } -# 27 = ucp_Hiragana => Hiragana script +# lookup 66 (0x42) in table 91 in stage2 yields 614 +# record 614 is { 17, 7, 12, 0, 0, 0, 9, 0 } +# 17 = ucp_Hiragana => Hiragana script # 7 = ucp_Lo => Other letter # 12 = ucp_gbOther => Grapheme break property "Other" # 0 => Not part of a caseless set # 0 => No other case -# 27 = ucp_Hiragana => No special Script Extension property -# 2 = ucp_bidiL => Bidi class left-to-right +# 0 => No special Script Extension property +# 9 = ucp_bidiL => Bidi class left-to-right # 0 => Dummy value, unused at present # # Example: vedic tone karshana (U+1CD0) is in block 57 (0x39) # lookup 57 in stage1 table yields 55 -# lookup 80 (0x50) in table 55 in stage2 yields 485 -# record 485 is { 28, 12, 3, 0, 0, -122, 19, 0 } -# 28 = ucp_Inherited => Script inherited from predecessor +# lookup 80 (0x50) in table 55 in 
stage2 yields 486 +# record 486 is { 78, 12, 3, 0, 0, 138, 13, 0 } +# 78 = ucp_Inherited => Script inherited from predecessor # 12 = ucp_Mn => Non-spacing mark # 3 = ucp_gbExtend => Grapheme break property "Extend" # 0 => Not part of a caseless set # 0 => No other case -# -228 => Script Extension list offset = 228 +# 138 => Script Extension list offset = 138 # 13 = ucp_bidiNSM => Bidi class non-spacing mark # 0 => Dummy value, unused at present # -# At offset 228 in the ucd_script_sets vector we find a bitmap with bits 3, 15, -# 29, and 107 set. This means that this character is expected to be used with +# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8, +# 18, and 47 set. This means that this character is expected to be used with # any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha. # -# Philip Hazel, last updated 19 December 2021. +# Philip Hazel, last updated 31 December 2021. ############################################################################## @@ -775,7 +783,6 @@ f.write("""\ const uint32_t PRIV(ucd_script_sets)[] = { """) - for d in script_lists: bitwords = [0] * script_list_item_size @@ -797,8 +804,8 @@ f.write("""\ /* These are the main two-stage UCD tables. The fields in each record are: script (8 bits), character type (8 bits), grapheme break property (8 bits), offset to multichar other cases or zero (8 bits), offset to other case or zero -(32 bits, signed), script extension (16 bits, signed), bidi class (8 bits), and -a dummy 8-bit field to make the whole thing a multiple of 4 bytes. */ +(32 bits, signed), script extension (8 bits), bidi class (8 bits), and a dummy +16-bit field to make the whole thing a multiple of 4 bytes. 
*/ \n""") write_records(records, record_size) diff --git a/maint/ucptest.c b/maint/ucptest.c index 5bd5713..b81dab5 100644 --- a/maint/ucptest.c +++ b/maint/ucptest.c @@ -316,7 +316,7 @@ j = 0; for (i = 0; i < PRIV(utt_size); i++) { const ucp_type_table *u = PRIV(utt) + i; - if (u->type == PT_SCX && u->value == script) + if ((u->type == PT_SCX || u->type == PT_SC) && u->value == script) { foundlist[j++] = i; if (j >= 2) break; @@ -479,38 +479,16 @@ if (is_just_one && othercase != c) } } -if (scriptx != script) +if (scriptx != 0) { + const char *sep = ""; + const uint32_t *p = PRIV(ucd_script_sets) + scriptx; printf(", ["); - if (scriptx >= 0) - printf("%s", get_scriptname(scriptx)); - else - { - const char *sep = ""; - - -/* - const uint8_t *p = PRIV(ucd_script_sets) - scriptx; - while (*p != 0) - { - printf("%s%s", sep, get_scriptname(*p++)); - sep = ", "; - } -*/ - - const uint32_t *p = PRIV(ucd_script_sets) - scriptx; - for (int i = 0; i < ucp_Script_Count; i++) - { - int x = i/32; - int y = i%32; - - if ((p[x] & (1u<scriptx /* The "scriptx" field gives an offset into a vector of 32-bit words that -form a bitmap representing a list of scripts. This macro tests for a -script in the map by number. */ +form a bitmap representing a list of scripts. These macros test or set the bit +for a script in the map by number. */ #define MAPBIT(map,script) ((map)[(script)/32]&(1u<<((script)%32))) +#define MAPSET(map,script) ((map)[(script)/32]|=(1u<<((script)%32))) /* The "bidi" field has the 0x80 bit set if the character has the Bidi_Control property. 
The remaining bits hold the bidi class, but as there are only 23 diff --git a/src/pcre2_script_run.c b/src/pcre2_script_run.c index bee312a..c412981 100644 --- a/src/pcre2_script_run.c +++ b/src/pcre2_script_run.c @@ -68,26 +68,26 @@ Arguments: Returns: TRUE if this is a valid script run */ -/* These dummy values must be less than the negation of the largest offset in -the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD -records (and is only likely to be a few hundred). */ +/* These are states in the checking process. */ -#define SCRIPT_UNSET (-99999) -#define SCRIPT_HANPENDING (-99998) -#define SCRIPT_HANHIRAKATA (-99997) -#define SCRIPT_HANBOPOMOFO (-99996) -#define SCRIPT_HANHANGUL (-99995) -#define SCRIPT_MAP (-99994) +enum { SCRIPT_UNSET, /* Requirement as yet unknown */ + SCRIPT_MAP, /* Bitmap contains acceptable scripts */ + SCRIPT_HANPENDING, /* Have had only Han characters */ + SCRIPT_HANHIRAKATA, /* Expect Han, Hiragana, or Katakana */ + SCRIPT_HANBOPOMOFO, /* Expect Han or Bopomofo */ + SCRIPT_HANHANGUL /* Expect Han or Hangul */ + }; -#define MAPSIZE (ucp_Script_Count/32 + 1) +#define UCD_MAPSIZE (ucp_Unknown/32 + 1) +#define FULL_MAPSIZE (ucp_Script_Count/32 + 1) BOOL PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf) { #ifdef SUPPORT_UNICODE -int require_script = SCRIPT_UNSET; -uint32_t intersection_map[MAPSIZE]; -const uint32_t *require_map = NULL; +uint32_t require_state = SCRIPT_UNSET; +uint32_t require_map[FULL_MAPSIZE]; +uint32_t map[FULL_MAPSIZE]; uint32_t require_digitset = 0; uint32_t c; @@ -101,11 +101,17 @@ if (ptr >= endptr) return TRUE; GETCHARINCTEST(c, ptr); if (ptr >= endptr) return TRUE; +/* Initialize the require map. This is a full-size bitmap that has a bit for +every script, as opposed to the maps in ucd_script_sets, which only have bits +for scripts less than ucp_Unknown - those that appear in script extension +lists. 
*/ + +for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0; + /* Scan strings of two or more characters, checking the Unicode characteristics -of each code point. We make use of the Script Extensions property. There is -special code for scripts that can be combined with characters from the Han -Chinese script. This may be used in conjunction with four other scripts in -these combinations: +of each code point. There is special code for scripts that can be combined with +characters from the Han Chinese script. This may be used in conjunction with +four other scripts in these combinations: . Han with Hiragana and Katakana is allowed (for Japanese). . Han with Bopomofo is allowed (for Taiwanese Mandarin). @@ -119,264 +125,207 @@ Hence the SCRIPT_HANPENDING state. */ for (;;) { const ucd_record *ucd = GET_UCD(c); - int32_t scriptx = ucd->scriptx; + uint32_t script = ucd->script; - /* If the script extension is Unknown, the string is not a valid script run. - Such characters can only form script runs of length one. */ + /* If the script is Unknown, the string is not a valid script run. Such + characters can only form script runs of length one (see test above). */ - if (scriptx == ucp_Unknown) return FALSE; + if (script == ucp_Unknown) return FALSE; - /* A character whose script extension is Inherited is always accepted with - any script, and plays no further part in this testing. A character whose - script is Common is always accepted, but must still be tested for a digit - below. The scriptx value at this point is non-zero, because zero is - ucp_Unknown, tested for above. */ + /* A character without any script extensions whose script is Inherited or + Common is always accepted with any script. If there are extensions, the + following processing happens for all scripts. 
*/ - if (scriptx != ucp_Inherited) + if (ucd->scriptx != 0 || (script != ucp_Inherited && script != ucp_Common)) { - if (scriptx != ucp_Common) + BOOL OK; + + /* Set up a full-sized map for this character that can include bits for all + scripts. Copy the scriptx map for this character (which covers those + scripts that appear in script extension lists), set the remaining values to + zero, and then, except for Common or Inherited, add this script's bit to + the map. */ + + memcpy(map, PRIV(ucd_script_sets) + ucd->scriptx, UCD_MAPSIZE * sizeof(uint32_t)); + memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t)); + if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script); + + /* Handle the different checking states */ + + switch(require_state) { - /* If the script extension value is positive, the character is not a mark - that can be used with many scripts. In the simple case we either set or - compare with the required script. However, handling the scripts that can - combine with Han are more complicated, as is the case when the previous - characters have been man-script marks. */ + /* First significant character - it might follow Common or Inherited + characters that do not have any script extensions. */ - if (scriptx > 0) + case SCRIPT_UNSET: + switch(script) { - switch(require_script) - { - /* Either the first significant character (require_script unset) or - after only Han characters. 
*/ + case ucp_Han: + require_state = SCRIPT_HANPENDING; + break; - case SCRIPT_UNSET: - case SCRIPT_HANPENDING: - switch(scriptx) - { - case ucp_Han: - require_script = SCRIPT_HANPENDING; - break; + case ucp_Hiragana: + case ucp_Katakana: + require_state = SCRIPT_HANHIRAKATA; + break; - case ucp_Hiragana: - case ucp_Katakana: - require_script = SCRIPT_HANHIRAKATA; - break; + case ucp_Bopomofo: + require_state = SCRIPT_HANBOPOMOFO; + break; - case ucp_Bopomofo: - require_script = SCRIPT_HANBOPOMOFO; - break; + case ucp_Hangul: + require_state = SCRIPT_HANHANGUL; + break; - case ucp_Hangul: - require_script = SCRIPT_HANHANGUL; - break; + default: + memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t)); + require_state = SCRIPT_MAP; + break; + } + break; - /* Not a Han-related script. If expecting one, fail. Otherise set - the requirement to this script. */ + /* The first significant character was Han. An inspection of the Unicode + 11.0.0 files shows that there are the following types of Script Extension + list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul + scripts: - default: - if (require_script == SCRIPT_HANPENDING) return FALSE; - require_script = scriptx; - break; - } - break; + . Bopomofo + Han + . Han + Hiragana + Katakana + . Hiragana + Katakana + . Bopomofo + Hangul + Han + Hiragana + Katakana - /* Previously encountered one of the "with Han" scripts. Check that - this character is appropriate. */ - - case SCRIPT_HANHIRAKATA: - if (scriptx != ucp_Han && scriptx != ucp_Hiragana && - scriptx != ucp_Katakana) - return FALSE; - break; - - case SCRIPT_HANBOPOMOFO: - if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE; - break; - - case SCRIPT_HANHANGUL: - if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE; - break; - - /* We have a bitmap of scripts to check that is derived from one or - more previous characters. 
This is either one of the maps in - ucd_script_sets[] (for one previous character) or the intersection of - several maps for multiple characters. */ - - case SCRIPT_MAP: - if (MAPBIT(require_map, scriptx) == 0) return FALSE; - - /* The rest of the string must be in this script, but we have to - allow for the Han complications. */ - - switch(scriptx) - { - case ucp_Han: - require_script = SCRIPT_HANPENDING; - break; - - case ucp_Hiragana: - case ucp_Katakana: - require_script = SCRIPT_HANHIRAKATA; - break; - - case ucp_Bopomofo: - require_script = SCRIPT_HANBOPOMOFO; - break; - - case ucp_Hangul: - require_script = SCRIPT_HANHANGUL; - break; - - default: - require_script = scriptx; - break; - } - break; - - /* This is the easy case when a single script is required. */ - - default: - if (scriptx != require_script) return FALSE; - break; - } - } /* End of handing positive scriptx */ - - /* If scriptx is negative, this character is a mark-type character that - has a list of permitted scripts, which are encoded in a bitmap. */ - - else - { - uint32_t chspecial; - const uint32_t *map = PRIV(ucd_script_sets) - scriptx; - - switch(require_script) - { - case SCRIPT_UNSET: - require_map = PRIV(ucd_script_sets) - scriptx; - require_script = SCRIPT_MAP; - break; - - /* An inspection of the Unicode 11.0.0 files shows that there are the - following types of Script Extension list that involve the Han, - Bopomofo, Hiragana, Katakana, and Hangul scripts: - - . Bopomofo + Han - . Han + Hiragana + Katakana - . Hiragana + Katakana - . Bopopmofo + Hangul + Han + Hiragana + Katakana - - The following code tries to make sense of this. */ + The following code tries to make sense of this. 
*/ #define FOUND_BOPOMOFO 1 #define FOUND_HIRAGANA 2 #define FOUND_KATAKANA 4 #define FOUND_HANGUL 8 - case SCRIPT_HANPENDING: - chspecial = 0; - - if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO; - if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA; - if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA; - if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL; - - if (chspecial == 0) return FALSE; - - if (chspecial == FOUND_BOPOMOFO) - { - require_script = SCRIPT_HANBOPOMOFO; - } - else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA)) - { - require_script = SCRIPT_HANHIRAKATA; - } - - /* Otherwise it must be allowed with all of them, so remain in - the pending state. */ - - break; - - case SCRIPT_HANHIRAKATA: - if (MAPBIT(map, ucp_Hiragana) != 0) break; - if (MAPBIT(map, ucp_Katakana) != 0) break; - return FALSE; - - case SCRIPT_HANBOPOMOFO: - if (MAPBIT(map, ucp_Bopomofo) != 0) break; - return FALSE; - - case SCRIPT_HANHANGUL: - if (MAPBIT(map, ucp_Hangul) != 0) break; - return FALSE; - - /* Previously encountered one or more characters that are allowed - with a list of scripts. Build the intersection of the required list - with this character's list in intersection_map[]. */ - - case SCRIPT_MAP: - for (int i = 0; i < MAPSIZE; i++) - intersection_map[i] = require_map[i] & map[i]; - - /* If there's just one script in common, we could set it as the - unique required script. However, in the new bitmap arrangements, - finding the one script is expensive, so leave this out for now. - Otherwise, make the intersection map the required map. */ - - /* - if (onescript >= 0) require_script = onescript; - else require_map = intersection_map; - */ - - require_map = intersection_map; - break; - - /* The previously set required script is a single script, not - Han-related. Check that it is in this character's list. 
*/ - - default: - if (MAPBIT(map, require_script) == 0) return FALSE; - break; - } - } /* End of handling negative scriptx */ - } /* End of checking non-Common character */ - - /* The character is in an acceptable script. We must now ensure that all - decimal digits in the string come from the same set. Some scripts (e.g. - Common, Arabic) have more than one set of decimal digits. This code does - not allow mixing sets, even within the same script. The vector called - PRIV(ucd_digit_sets)[] contains, in its first element, the number of - following elements, and then, in ascending order, the code points of the - '9' characters in every set of 10 digits. Each set is identified by the - offset in the vector of its '9' character. An initial check of the first - value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ - - if (ucd->chartype == ucp_Nd) - { - uint32_t digitset; - - if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else + case SCRIPT_HANPENDING: + if (script != ucp_Han) /* Another Han does nothing */ { - int mid; - int bot = 1; - int top = PRIV(ucd_digit_sets)[0]; - for (;;) + uint32_t chspecial = 0; + + if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO; + if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA; + if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA; + if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL; + + if (chspecial == 0) return FALSE; /* Not allowed with Han */ + + if (chspecial == FOUND_BOPOMOFO) + require_state = SCRIPT_HANBOPOMOFO; + else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA)) + require_state = SCRIPT_HANHIRAKATA; + + /* Otherwise this character must be allowed with all of them, so remain + in the pending state. */ + } + break; + + /* Previously encountered one of the "with Han" scripts. Check that + this character is appropriate. 
*/ + + case SCRIPT_HANHIRAKATA: + if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) + + MAPBIT(map, ucp_Katakana) == 0) return FALSE; + break; + + case SCRIPT_HANBOPOMOFO: + if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE; + break; + + case SCRIPT_HANHANGUL: + if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE; + break; + + /* Previously encountered one or more characters that are allowed with a + list of scripts. */ + + case SCRIPT_MAP: + OK = FALSE; + + for (int i = 0; i < FULL_MAPSIZE; i++) + { + if ((require_map[i] & map[i]) != 0) { - if (top <= bot + 1) /* <= rather than == is paranoia */ - { - digitset = top; - break; - } - mid = (top + bot) / 2; - if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid; + OK = TRUE; + break; } } - /* A required value of 0 means "unset". */ + if (!OK) return FALSE; - if (require_digitset == 0) require_digitset = digitset; - else if (digitset != require_digitset) return FALSE; - } /* End digit handling */ - } /* End checking non-Inherited character */ + /* The rest of the string must be in this script, but we have to + allow for the Han complications. */ + + switch(script) + { + case ucp_Han: + require_state = SCRIPT_HANPENDING; + break; + + case ucp_Hiragana: + case ucp_Katakana: + require_state = SCRIPT_HANHIRAKATA; + break; + + case ucp_Bopomofo: + require_state = SCRIPT_HANBOPOMOFO; + break; + + case ucp_Hangul: + require_state = SCRIPT_HANHANGUL; + break; + + /* Compute the intersection of the required list of scripts and the + allowed scripts for this character. */ + + default: + for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i]; + break; + } + + break; + } + } /* End checking character's script and extensions. */ + + /* The character is in an acceptable script. We must now ensure that all + decimal digits in the string come from the same set. Some scripts (e.g. + Common, Arabic) have more than one set of decimal digits. 
This code does + not allow mixing sets, even within the same script. The vector called + PRIV(ucd_digit_sets)[] contains, in its first element, the number of + following elements, and then, in ascending order, the code points of the + '9' characters in every set of 10 digits. Each set is identified by the + offset in the vector of its '9' character. An initial check of the first + value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ + + if (ucd->chartype == ucp_Nd) + { + uint32_t digitset; + + if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else + { + int mid; + int bot = 1; + int top = PRIV(ucd_digit_sets)[0]; + for (;;) + { + if (top <= bot + 1) /* <= rather than == is paranoia */ + { + digitset = top; + break; + } + mid = (top + bot) / 2; + if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid; + } + } + + /* A required value of 0 means "unset". */ + + if (require_digitset == 0) require_digitset = digitset; + else if (digitset != require_digitset) return FALSE; + } /* End digit handling */ /* If we haven't yet got to the end, pick up the next character. */ diff --git a/src/pcre2_ucd.c b/src/pcre2_ucd.c index fe8619b..206aac9 100644 --- a/src/pcre2_ucd.c +++ b/src/pcre2_ucd.c @@ -237,8 +237,8 @@ const uint32_t PRIV(ucd_script_sets)[] = { /* These are the main two-stage UCD tables. The fields in each record are: script (8 bits), character type (8 bits), grapheme break property (8 bits), offset to multichar other cases or zero (8 bits), offset to other case or zero -(32 bits, signed), script extension (16 bits, signed), bidi class (8 bits), and -a dummy 8-bit field to make the whole thing a multiple of 4 bytes. */ +(32 bits, signed), script extension (8 bits), bidi class (8 bits), and a dummy +16-bit field to make the whole thing a multiple of 4 bytes. 
*/ const ucd_record PRIV(ucd_records)[] = { /* 12588 bytes, record size 12 */ { 73, 0, 2, 0, 0, 0, 3, 256, }, /* 0 */ diff --git a/testdata/testinput4 b/testdata/testinput4 index 6a2430a..69c5475 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -1138,23 +1138,27 @@ \= Expect no match \x{2e7f} -/^\P{Katakana}+/utf - \x{3105} -\= Expect no match - \x{30ff} - /^[\p{Arabic}]/utf \x{06e9} \x{060b} \= Expect no match X\x{06e9} +#subject no_jit + +/^\P{Katakana}+/utf + \x{3105} +\= Expect no match + \x{30ff} + /^[\P{Yi}]/utf \x{2f800} \= Expect no match \x{a014} \x{a4c6} +#subject -no_jit + /^\p{Any}X/utf AXYZ \x{1234}XYZ @@ -2640,4 +2644,13 @@ /[\p{taml}\p{sc:ugar}]+/utf \x{0b82}\x{10380} +/^[\p{sc:Arabic}]/utf +\= Expect no match + \x{650} + \x{651} + \x{652} + \x{653} + \x{654} + \x{655} + # End of testinput4 diff --git a/testdata/testinput5 b/testdata/testinput5 index 33204d6..3f62216 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -2073,15 +2073,6 @@ # More differences from Perl -/^[\p{Arabic}]/utf -\= Expect no match - \x{650} - \x{651} - \x{652} - \x{653} - \x{654} - \x{655} - /^\p{Common}/utf \x{60c} \x{61f} diff --git a/testdata/testoutput4 b/testdata/testoutput4 index a4d919e..a4d5662 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -1883,13 +1883,6 @@ No match \x{2e7f} No match -/^\P{Katakana}+/utf - \x{3105} - 0: \x{3105} -\= Expect no match - \x{30ff} -No match - /^[\p{Arabic}]/utf \x{06e9} 0: \x{6e9} @@ -1899,6 +1892,15 @@ No match X\x{06e9} No match +#subject no_jit + +/^\P{Katakana}+/utf + \x{3105} + 0: \x{3105} +\= Expect no match + \x{30ff} +No match + /^[\P{Yi}]/utf \x{2f800} 0: \x{2f800} @@ -1908,6 +1910,8 @@ No match \x{a4c6} No match +#subject -no_jit + /^\p{Any}X/utf AXYZ 0: AX @@ -4235,4 +4239,19 @@ No match \x{0b82}\x{10380} 0: \x{b82}\x{10380} +/^[\p{sc:Arabic}]/utf +\= Expect no match + \x{650} +No match + \x{651} +No match + \x{652} +No match + \x{653} +No match + \x{654} +No match + \x{655} +No match + 
# End of testinput4 diff --git a/testdata/testoutput5 b/testdata/testoutput5 index ab8a185..8382203 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -4722,21 +4722,6 @@ Callout 0: last capture = 1 # More differences from Perl -/^[\p{Arabic}]/utf -\= Expect no match - \x{650} -No match - \x{651} -No match - \x{652} -No match - \x{653} -No match - \x{654} -No match - \x{655} -No match - /^\p{Common}/utf \x{60c} 0: \x{60c}