Update script run code to work with new script extensions coding

2021-12-31 16:06:05 +00:00 · 2021-12-31 16:06:05 +00:00 · d888d36013
parent 6614b281bc
commit d888d36013
9 changed files with 290 additions and 347 deletions
--- a/maint/GenerateUcd.py
+++ b/maint/GenerateUcd.py
@ -117,8 +117,9 @@
 # Conceptually, there is a table of records (of type ucd_record), one for each
 # Unicode character. Each record contains the script number, script extension
 # value, character type, grapheme break type, offset to caseless matching set,
-# offset to the character's other case, and the bidi class/control. However, a
+# offset to the character's other case, and the bidi class/control. 
-# real table covering all Unicode characters would be far too big. It can be
+#
 # A real table covering all Unicode characters would be far too big. It can be
 # efficiently compressed by observing that many characters have the same
 # record, and many blocks of characters (taking 128 characters in a block) have
 # the same set of records as other blocks. This leads to a 2-stage lookup
@ -135,13 +136,20 @@
 # in script runs all come from the same set. The first element in the vector
 # contains the number of subsequent elements, which are in ascending order.
 #
 # The lists of scripts in script_names and script_abbrevs are partitioned into
 # two groups. Scripts that appear in at least one character's script extension
 # list come first, followed by "Unknown" and then all the rest. This sorting is
 # done automatically in the GenerateCommon.py script. A script's number
 # is its index in these lists.
 #
 # The ucd_script_sets vector contains bitmaps that represent lists of scripts
-# for the Script Extensions properties of certain characters. Each bitmap
+# for Script Extensions properties. Each bitmap consists of a fixed number of
-# consists of a fixed number of unsigned 32-bit numbers, enough to allocate
+# unsigned 32-bit numbers, enough to allocate a bit for every script that is
-# a bit for every known script. A character with more than one script listed
+# used in any character's extension list, that is, enough for every script
-# for its Script Extension property has a negative value in its record. This is
+# whose number is less than ucp_Unknown. A character's script extension value
-# the negated offset to the start of the relevant bitmap in the ucd_script_sets
+# in its ucd record is an offset into the ucd_script_sets vector. The first
-# vector.
+# bitmap has no bits set; characters that have no script extensions have zero
 # as their script extensions value so that they use this map.
 #
 # The ucd_records table contains one instance of every unique record that is
 # required. The ucd_stage1 table is indexed by a character's block number,
@ -157,15 +165,15 @@
 #
 # Example: lowercase "a" (U+0061) is in block 0
 #          lookup 0 in stage1 table yields 0
-#          lookup 97 (0x61) in the first table in stage2 yields 22
+#          lookup 97 (0x61) in the first table in stage2 yields 23
-#          record 22 is { 34, 5, 12, 0, -32, 34, 2, 0 }
+#          record 23 is { 20, 5, 12, 0, -32, 0, 9, 0 }
-#            34 = ucp_Latin   => Latin script
+#            20 = ucp_Latin   => Latin script
 #             5 = ucp_Ll      => Lower case letter
 #            12 = ucp_gbOther => Grapheme break property "Other"
 #             0               => Not part of a caseless set
 #           -32 (-0x20)       => Other case is U+0041
-#            34 = ucp_Latin   => No special Script Extension property
+#             0               => No special Script Extension property
-#             2 = ucp_bidiL   => Bidi class left-to-right
+#             9 = ucp_bidiL   => Bidi class left-to-right
 #             0               => Dummy value, unused at present
 #
 # Almost all lowercase latin characters resolve to the same record. One or two
@ -174,35 +182,35 @@
 #
 # Example: hiragana letter A (U+3042) is in block 96 (0x60)
 #          lookup 96 in stage1 table yields 91
-#          lookup 66 (0x42) in table 91 in stage2 yields 613
+#          lookup 66 (0x42) in table 91 in stage2 yields 614
-#          record 613 is { 27, 7, 12, 0, 0, 27, 2, 0 }
+#          record 614 is { 17, 7, 12, 0, 0, 0, 9, 0 }
-#            27 = ucp_Hiragana => Hiragana script
+#            17 = ucp_Hiragana => Hiragana script
 #             7 = ucp_Lo       => Other letter
 #            12 = ucp_gbOther  => Grapheme break property "Other"
 #             0                => Not part of a caseless set
 #             0                => No other case
-#            27 = ucp_Hiragana => No special Script Extension property
+#             0                => No special Script Extension property
-#             2 = ucp_bidiL    => Bidi class left-to-right
+#             9 = ucp_bidiL    => Bidi class left-to-right
 #             0                => Dummy value, unused at present
 #
 # Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
 #          lookup 57 in stage1 table yields 55
-#          lookup 80 (0x50) in table 55 in stage2 yields 485
+#          lookup 80 (0x50) in table 55 in stage2 yields 486
-#          record 485 is { 28, 12, 3, 0, 0, -122, 19, 0 }
+#          record 486 is { 78, 12, 3, 0, 0, 138, 13, 0 }
-#            28 = ucp_Inherited => Script inherited from predecessor
+#            78 = ucp_Inherited => Script inherited from predecessor
 #            12 = ucp_Mn        => Non-spacing mark
 #             3 = ucp_gbExtend  => Grapheme break property "Extend"
 #             0                 => Not part of a caseless set
 #             0                 => No other case
-#          -228                 => Script Extension list offset = 228
+#           138                 => Script Extension list offset = 138
 #            13 = ucp_bidiNSM   => Bidi class non-spacing mark
 #             0                 => Dummy value, unused at present
 #
-# At offset 228 in the ucd_script_sets vector we find a bitmap with bits 3, 15,
+# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
-# 29, and 107 set. This means that this character is expected to be used with
+# 18, and 47 set. This means that this character is expected to be used with
 # any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
 #
-#  Philip Hazel, last updated 19 December 2021.
+#  Philip Hazel, last updated 31 December 2021.
 ##############################################################################
@ -775,7 +783,6 @@ f.write("""\
 const uint32_t PRIV(ucd_script_sets)[] = {
 """)
 for d in script_lists:
  bitwords = [0] * script_list_item_size
@ -797,8 +804,8 @@ f.write("""\
 /* These are the main two-stage UCD tables. The fields in each record are:
 script (8 bits), character type (8 bits), grapheme break property (8 bits),
 offset to multichar other cases or zero (8 bits), offset to other case or zero
-(32 bits, signed), script extension (16 bits, signed), bidi class (8 bits), and
+(32 bits, signed), script extension (8 bits), bidi class (8 bits), and a dummy
-a dummy 8-bit field to make the whole thing a multiple of 4 bytes. */
+16-bit field to make the whole thing a multiple of 4 bytes. */
 \n""")
 write_records(records, record_size)
--- a/maint/ucptest.c
+++ b/maint/ucptest.c
@ -316,7 +316,7 @@ j = 0;
 for (i = 0; i < PRIV(utt_size); i++)
  {
  const ucp_type_table *u = PRIV(utt) + i;
-  if (u->type == PT_SCX && u->value == script) 
+  if ((u->type == PT_SCX || u->type == PT_SC) && u->value == script) 
    {
    foundlist[j++] = i;
    if (j >= 2) break;
@ -479,38 +479,16 @@ if (is_just_one && othercase != c)
    }
  }
-if (scriptx != script)
+if (scriptx != 0)
  {
  const char *sep = ""; 
  const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
  printf(", [");
-  if (scriptx >= 0)
+  for (int i = 0; i < ucp_Unknown; i++)
-    printf("%s", get_scriptname(scriptx));
+  if (MAPBIT(p, i) != 0)
-  else
+    { 
-    {
+    printf("%s%s", sep, get_scriptname(i));
-    const char *sep = "";
+    sep = ", ";
 /* 
    const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
    while (*p != 0)
      {
      printf("%s%s", sep, get_scriptname(*p++));
      sep = ", ";
      }
 */
    const uint32_t *p = PRIV(ucd_script_sets) - scriptx;
    for (int i = 0; i < ucp_Script_Count; i++)
      {
      int x = i/32;
      int y = i%32;
      if ((p[x] & (1u<<y)) != 0)
        {
        printf("%s%s", sep, get_scriptname(i));
        sep = ", ";
        }
      }  
    }
  printf("]");
  }
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@ -1850,10 +1850,11 @@ typedef struct {
 #define UCD_SCRIPTX(ch)     GET_UCD(ch)->scriptx
 /* The "scriptx" field gives an offset into a vector of 32-bit words that
-form a bitmap representing a list of scripts. This macro tests for a
+form a bitmap representing a list of scripts. These macros test or set the bit
-script in the map by number. */
+for a script in the map by number. */
 #define MAPBIT(map,script) ((map)[(script)/32]&(1u<<((script)%32)))
 #define MAPSET(map,script) ((map)[(script)/32]|=(1u<<((script)%32)))
 /* The "bidi" field has the 0x80 bit set if the character has the Bidi_Control
 property. The remaining bits hold the bidi class, but as there are only 23
--- a/src/pcre2_script_run.c
+++ b/src/pcre2_script_run.c
@ -68,26 +68,26 @@ Arguments:
 Returns:    TRUE if this is a valid script run
 */
-/* These dummy values must be less than the negation of the largest offset in
+/* These are states in the checking process. */
 the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD
 records (and is only likely to be a few hundred). */
-#define SCRIPT_UNSET        (-99999)
+enum { SCRIPT_UNSET,          /* Requirement as yet unknown */
-#define SCRIPT_HANPENDING   (-99998)
+       SCRIPT_MAP,            /* Bitmap contains acceptable scripts */
-#define SCRIPT_HANHIRAKATA  (-99997)
+       SCRIPT_HANPENDING,     /* Have had only Han characters */
-#define SCRIPT_HANBOPOMOFO  (-99996)
+       SCRIPT_HANHIRAKATA,    /* Expect Han or Hirikata */
-#define SCRIPT_HANHANGUL    (-99995)
+       SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */
-#define SCRIPT_MAP          (-99994)
+       SCRIPT_HANHANGUL       /* Expect Han or Hangul */
       };
-#define MAPSIZE (ucp_Script_Count/32 + 1)
+#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
 #define FULL_MAPSIZE (ucp_Script_Count/32 + 1)
 BOOL
 PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
 {
 #ifdef SUPPORT_UNICODE
-int require_script = SCRIPT_UNSET;
+uint32_t require_state = SCRIPT_UNSET;
-uint32_t intersection_map[MAPSIZE];
+uint32_t require_map[FULL_MAPSIZE];
-const uint32_t *require_map = NULL;
+uint32_t map[FULL_MAPSIZE];
 uint32_t require_digitset = 0;
 uint32_t c;
@ -101,11 +101,17 @@ if (ptr >= endptr) return TRUE;
 GETCHARINCTEST(c, ptr);
 if (ptr >= endptr) return TRUE;
 /* Initialize the require map. This is a full-size bitmap that has a bit for
 every script, as opposed to the maps in ucd_script_sets, which only have bits
 for scripts less than ucp_Unknown - those that appear in script extension
 lists. */
 for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
 /* Scan strings of two or more characters, checking the Unicode characteristics
-of each code point. We make use of the Script Extensions property. There is
+of each code point. There is special code for scripts that can be combined with
-special code for scripts that can be combined with characters from the Han
+characters from the Han Chinese script. This may be used in conjunction with
-Chinese script. This may be used in conjunction with four other scripts in
+four other scripts in these combinations:
 these combinations:
 . Han with Hiragana and Katakana is allowed (for Japanese).
 . Han with Bopomofo is allowed (for Taiwanese Mandarin).
@ -119,264 +125,207 @@ Hence the SCRIPT_HANPENDING state. */
 for (;;)
  {
  const ucd_record *ucd = GET_UCD(c);
-  int32_t scriptx = ucd->scriptx;
+  uint32_t script = ucd->script;
-  /* If the script extension is Unknown, the string is not a valid script run.
+  /* If the script is Unknown, the string is not a valid script run. Such
-  Such characters can only form script runs of length one. */
+  characters can only form script runs of length one (see test above). */
-  if (scriptx == ucp_Unknown) return FALSE;
+  if (script == ucp_Unknown) return FALSE;
-  /* A character whose script extension is Inherited is always accepted with
+  /* A character without any script extensions whose script is Inherited or
-  any script, and plays no further part in this testing. A character whose
+  Common is always accepted with any script. If there are extensions, the
-  script is Common is always accepted, but must still be tested for a digit
+  following processing happens for all scripts. */
  below. The scriptx value at this point is non-zero, because zero is
  ucp_Unknown, tested for above. */
-  if (scriptx != ucp_Inherited)
+  if (ucd->scriptx != 0 || (script != ucp_Inherited && script != ucp_Common))
    {
-    if (scriptx != ucp_Common)
+    BOOL OK;
    /* Set up a full-sized map for this character that can include bits for all
    scripts. Copy the scriptx map for this character (which covers those
    scripts that appear in script extension lists), set the remaining values to
    zero, and then, except for Common or Inherited, add this script's bit to
    the map. */
    memcpy(map, PRIV(ucd_script_sets) + ucd->scriptx, UCD_MAPSIZE * sizeof(uint32_t));
    memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
    if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
    /* Handle the different checking states */
    switch(require_state)
      {
-      /* If the script extension value is positive, the character is not a mark
+      /* First significant character - it might follow Common or Inherited
-      that can be used with many scripts. In the simple case we either set or
+      characters that do not have any script extensions. */
      compare with the required script. However, handling the scripts that can
      combine with Han are more complicated, as is the case when the previous
      characters have been man-script marks. */
-      if (scriptx > 0)
+      case SCRIPT_UNSET:
      switch(script)
        {
-        switch(require_script)
+        case ucp_Han:
-          {
+        require_state = SCRIPT_HANPENDING;
-          /* Either the first significant character (require_script unset) or
+        break;
          after only Han characters. */
-          case SCRIPT_UNSET:
+        case ucp_Hiragana:
-          case SCRIPT_HANPENDING:
+        case ucp_Katakana:
-          switch(scriptx)
+        require_state = SCRIPT_HANHIRAKATA;
-            {
+        break;
            case ucp_Han:
            require_script = SCRIPT_HANPENDING;
            break;
-            case ucp_Hiragana:
+        case ucp_Bopomofo:
-            case ucp_Katakana:
+        require_state = SCRIPT_HANBOPOMOFO;
-            require_script = SCRIPT_HANHIRAKATA;
+        break;
            break;
-            case ucp_Bopomofo:
+        case ucp_Hangul:
-            require_script = SCRIPT_HANBOPOMOFO;
+        require_state = SCRIPT_HANHANGUL;
-            break;
+        break;
-            case ucp_Hangul:
+        default:
-            require_script = SCRIPT_HANHANGUL;
+        memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
-            break;
+        require_state = SCRIPT_MAP;
        break;
        }
      break;
-            /* Not a Han-related script. If expecting one, fail. Otherise set
+      /* The first significant character was Han. An inspection of the Unicode
-            the requirement to this script. */
+      11.0.0 files shows that there are the following types of Script Extension
      list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
      scripts:
-            default:
+      . Bopomofo + Han
-            if (require_script == SCRIPT_HANPENDING) return FALSE;
+      . Han + Hiragana + Katakana
-            require_script = scriptx;
+      . Hiragana + Katakana
-            break;
+      . Bopomofo + Hangul + Han + Hiragana + Katakana
            }
          break;
-          /* Previously encountered one of the "with Han" scripts. Check that
+      The following code tries to make sense of this. */
          this character is appropriate. */
          case SCRIPT_HANHIRAKATA:
          if (scriptx != ucp_Han && scriptx != ucp_Hiragana && 
              scriptx != ucp_Katakana)
            return FALSE;
          break;
          case SCRIPT_HANBOPOMOFO:
          if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
          break;
          case SCRIPT_HANHANGUL:
          if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
          break;
          /* We have a bitmap of scripts to check that is derived from one or
          more previous characters. This is either one of the maps in
          ucd_script_sets[] (for one previous character) or the intersection of
          several maps for multiple characters. */
          case SCRIPT_MAP:
          if (MAPBIT(require_map, scriptx) == 0) return FALSE; 
          /* The rest of the string must be in this script, but we have to 
          allow for the Han complications. */
          switch(scriptx)
            {
            case ucp_Han:
            require_script = SCRIPT_HANPENDING;
            break;
            case ucp_Hiragana:
            case ucp_Katakana:
            require_script = SCRIPT_HANHIRAKATA;
            break;
            case ucp_Bopomofo:
            require_script = SCRIPT_HANBOPOMOFO;
            break;
            case ucp_Hangul:
            require_script = SCRIPT_HANHANGUL;
            break;
            default:
            require_script = scriptx;
            break;
            }  
          break;
          /* This is the easy case when a single script is required. */
          default:
          if (scriptx != require_script) return FALSE;
          break;
          }
        }  /* End of handing positive scriptx */
      /* If scriptx is negative, this character is a mark-type character that
      has a list of permitted scripts, which are encoded in a bitmap. */
      else
        {
        uint32_t chspecial;
        const uint32_t *map = PRIV(ucd_script_sets) - scriptx;
        switch(require_script)
          {
          case SCRIPT_UNSET:
          require_map = PRIV(ucd_script_sets) - scriptx;
          require_script = SCRIPT_MAP;
          break;
          /* An inspection of the Unicode 11.0.0 files shows that there are the
          following types of Script Extension list that involve the Han,
          Bopomofo, Hiragana, Katakana, and Hangul scripts:
          . Bopomofo + Han
          . Han + Hiragana + Katakana
          . Hiragana + Katakana
          . Bopomofo + Hangul + Han + Hiragana + Katakana
          The following code tries to make sense of this. */
 #define FOUND_BOPOMOFO 1
 #define FOUND_HIRAGANA 2
 #define FOUND_KATAKANA 4
 #define FOUND_HANGUL   8
-          case SCRIPT_HANPENDING:
+      case SCRIPT_HANPENDING:
-          chspecial = 0;
+      if (script != ucp_Han)   /* Another Han does nothing */
          if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
          if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
          if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
          if (MAPBIT(map, ucp_Hangul) != 0)   chspecial |= FOUND_HANGUL;
           if (chspecial == 0) return FALSE;
           if (chspecial == FOUND_BOPOMOFO)
             {
             require_script = SCRIPT_HANBOPOMOFO;
             }
           else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
             {
             require_script = SCRIPT_HANHIRAKATA;
             }
          /* Otherwise it must be allowed with all of them, so remain in
          the pending state. */
          break;
          case SCRIPT_HANHIRAKATA:
          if (MAPBIT(map, ucp_Hiragana) != 0) break;
          if (MAPBIT(map, ucp_Katakana) != 0) break;
          return FALSE;
          case SCRIPT_HANBOPOMOFO:
          if (MAPBIT(map, ucp_Bopomofo) != 0) break;
          return FALSE;
          case SCRIPT_HANHANGUL:
          if (MAPBIT(map, ucp_Hangul) != 0) break;
          return FALSE;
          /* Previously encountered one or more characters that are allowed
          with a list of scripts. Build the intersection of the required list
          with this character's list in intersection_map[]. */
          case SCRIPT_MAP:
          for (int i = 0; i < MAPSIZE; i++)
            intersection_map[i] = require_map[i] & map[i];
          /* If there's just one script in common, we could set it as the
          unique required script. However, in the new bitmap arrangements, 
          finding the one script is expensive, so leave this out for now.
          Otherwise, make the intersection map the required map. */
          /*
          if (onescript >= 0) require_script = onescript;
            else require_map = intersection_map;
          */   
          require_map = intersection_map;
          break;
          /* The previously set required script is a single script, not
          Han-related. Check that it is in this character's list. */
          default:
          if (MAPBIT(map, require_script) == 0) return FALSE; 
          break;
          }
        }  /* End of handling negative scriptx */
      }    /* End of checking non-Common character */
    /* The character is in an acceptable script. We must now ensure that all
    decimal digits in the string come from the same set. Some scripts (e.g.
    Common, Arabic) have more than one set of decimal digits. This code does
    not allow mixing sets, even within the same script. The vector called
    PRIV(ucd_digit_sets)[] contains, in its first element, the number of
    following elements, and then, in ascending order, the code points of the
    '9' characters in every set of 10 digits. Each set is identified by the
    offset in the vector of its '9' character. An initial check of the first
    value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
    if (ucd->chartype == ucp_Nd)
      {
      uint32_t digitset;
      if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
        {
-        int mid;
+        uint32_t chspecial = 0;
-        int bot = 1;
+
-        int top = PRIV(ucd_digit_sets)[0];
+        if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
-        for (;;)
+        if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
        if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
        if (MAPBIT(map, ucp_Hangul) != 0)   chspecial |= FOUND_HANGUL;
        if (chspecial == 0) return FALSE;   /* Not allowed with Han */
        if (chspecial == FOUND_BOPOMOFO)
          require_state = SCRIPT_HANBOPOMOFO;
        else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
          require_state = SCRIPT_HANHIRAKATA;
        /* Otherwise this character must be allowed with all of them, so remain
        in the pending state. */
        }
      break;
      /* Previously encountered one of the "with Han" scripts. Check that
      this character is appropriate. */
      case SCRIPT_HANHIRAKATA:
      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
          MAPBIT(map, ucp_Katakana) == 0) return FALSE;
      break;
      case SCRIPT_HANBOPOMOFO:
      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
      break;
      case SCRIPT_HANHANGUL:
      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
      break;
      /* Previously encountered one or more characters that are allowed with a
      list of scripts. */
      case SCRIPT_MAP:
      OK = FALSE;
      for (int i = 0; i < FULL_MAPSIZE; i++)
        {
        if ((require_map[i] & map[i]) != 0)
          {
-          if (top <= bot + 1)    /* <= rather than == is paranoia */
+          OK = TRUE;
-            {
+          break;
            digitset = top;
            break;
            }
          mid = (top + bot) / 2;
          if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
          }
        }
-      /* A required value of 0 means "unset". */
+      if (!OK) return FALSE;
-      if (require_digitset == 0) require_digitset = digitset;
+      /* The rest of the string must be in this script, but we have to
-        else if (digitset != require_digitset) return FALSE;
+      allow for the Han complications. */
-      }   /* End digit handling */
+
-    }     /* End checking non-Inherited character */
+      switch(script)
        {
        case ucp_Han:
        require_state = SCRIPT_HANPENDING;
        break;
        case ucp_Hiragana:
        case ucp_Katakana:
        require_state = SCRIPT_HANHIRAKATA;
        break;
        case ucp_Bopomofo:
        require_state = SCRIPT_HANBOPOMOFO;
        break;
        case ucp_Hangul:
        require_state = SCRIPT_HANHANGUL;
        break;
        /* Compute the intersection of the required list of scripts and the
        allowed scripts for this character. */
        default:
        for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
        break;
        }
      break;
      }
    }   /* End checking character's script and extensions. */
  /* The character is in an acceptable script. We must now ensure that all
  decimal digits in the string come from the same set. Some scripts (e.g.
  Common, Arabic) have more than one set of decimal digits. This code does
  not allow mixing sets, even within the same script. The vector called
  PRIV(ucd_digit_sets)[] contains, in its first element, the number of
  following elements, and then, in ascending order, the code points of the
  '9' characters in every set of 10 digits. Each set is identified by the
  offset in the vector of its '9' character. An initial check of the first
  value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
  if (ucd->chartype == ucp_Nd)
    {
    uint32_t digitset;
    if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
      {
      int mid;
      int bot = 1;
      int top = PRIV(ucd_digit_sets)[0];
      for (;;)
        {
        if (top <= bot + 1)    /* <= rather than == is paranoia */
          {
          digitset = top;
          break;
          }
        mid = (top + bot) / 2;
        if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
        }
      }
    /* A required value of 0 means "unset". */
    if (require_digitset == 0) require_digitset = digitset;
      else if (digitset != require_digitset) return FALSE;
    }   /* End digit handling */
  /* If we haven't yet got to the end, pick up the next character. */
--- a/src/pcre2_ucd.c
+++ b/src/pcre2_ucd.c
@ -237,8 +237,8 @@ const uint32_t PRIV(ucd_script_sets)[] = {
 /* These are the main two-stage UCD tables. The fields in each record are:
 script (8 bits), character type (8 bits), grapheme break property (8 bits),
 offset to multichar other cases or zero (8 bits), offset to other case or zero
-(32 bits, signed), script extension (16 bits, signed), bidi class (8 bits), and
+(32 bits, signed), script extension (8 bits), bidi class (8 bits), and a dummy
-a dummy 8-bit field to make the whole thing a multiple of 4 bytes. */
+16-bit field to make the whole thing a multiple of 4 bytes. */
 const ucd_record PRIV(ucd_records)[] = { /* 12588 bytes, record size 12 */
  {    73,      0,      2,      0,      0,      0,      3,    256, }, /*   0 */
--- a/testdata/testinput4
+++ b/testdata/testinput4
@ -1138,23 +1138,27 @@
 \= Expect no match
    \x{2e7f}
 /^\P{Katakana}+/utf
    \x{3105}
 \= Expect no match
    \x{30ff}
 /^[\p{Arabic}]/utf
    \x{06e9}
    \x{060b}
 \= Expect no match
    X\x{06e9}
 #subject no_jit
 /^\P{Katakana}+/utf
    \x{3105}
 \= Expect no match
    \x{30ff}
 /^[\P{Yi}]/utf
    \x{2f800}
 \= Expect no match
    \x{a014}
    \x{a4c6}
 #subject -no_jit     
 /^\p{Any}X/utf
    AXYZ
    \x{1234}XYZ
@ -2640,4 +2644,13 @@
 /[\p{taml}\p{sc:ugar}]+/utf
    \x{0b82}\x{10380}
 /^[\p{sc:Arabic}]/utf
 \= Expect no match
    \x{650}
    \x{651}  
    \x{652}  
    \x{653}  
    \x{654} 
    \x{655} 
 # End of testinput4
--- a/testdata/testinput5
+++ b/testdata/testinput5
@ -2073,15 +2073,6 @@
 # More differences from Perl
 /^[\p{Arabic}]/utf
 \= Expect no match
    \x{650}
    \x{651}  
    \x{652}  
    \x{653}  
    \x{654} 
    \x{655} 
 /^\p{Common}/utf
    \x{60c}
    \x{61f}  
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@ -1883,13 +1883,6 @@ No match
    \x{2e7f}
 No match
 /^\P{Katakana}+/utf
    \x{3105}
 0: \x{3105}
 \= Expect no match
    \x{30ff}
 No match
 /^[\p{Arabic}]/utf
    \x{06e9}
 0: \x{6e9}
@ -1899,6 +1892,15 @@ No match
    X\x{06e9}
 No match
 #subject no_jit
 /^\P{Katakana}+/utf
    \x{3105}
 0: \x{3105}
 \= Expect no match
    \x{30ff}
 No match
 /^[\P{Yi}]/utf
    \x{2f800}
 0: \x{2f800}
@ -1908,6 +1910,8 @@ No match
    \x{a4c6}
 No match
 #subject -no_jit     
 /^\p{Any}X/utf
    AXYZ
 0: AX
@ -4235,4 +4239,19 @@ No match
    \x{0b82}\x{10380}
 0: \x{b82}\x{10380}
 /^[\p{sc:Arabic}]/utf
 \= Expect no match
    \x{650}
 No match
    \x{651}  
 No match
    \x{652}  
 No match
    \x{653}  
 No match
    \x{654} 
 No match
    \x{655} 
 No match
 # End of testinput4
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@ -4722,21 +4722,6 @@ Callout 0: last capture = 1
 # More differences from Perl
 /^[\p{Arabic}]/utf
 \= Expect no match
    \x{650}
 No match
    \x{651}  
 No match
    \x{652}  
 No match
    \x{653}  
 No match
    \x{654} 
 No match
    \x{655} 
 No match
 /^\p{Common}/utf
    \x{60c}
 0: \x{60c}