Update script run code to work with new script extensions coding

2021-12-31 16:06:05 +00:00 · 2021-12-31 16:06:05 +00:00 · d888d36013
parent 6614b281bc
commit d888d36013
9 changed files with 290 additions and 347 deletions
--- a/maint/GenerateUcd.py
+++ b/maint/GenerateUcd.py
@ -117,8 +117,9 @@
 # Conceptually, there is a table of records (of type ucd_record), one for each
 # Unicode character. Each record contains the script number, script extension
 # value, character type, grapheme break type, offset to caseless matching set,
-# offset to the character's other case, and the bidi class/control. However, a
-# real table covering all Unicode characters would be far too big. It can be
+# offset to the character's other case, and the bidi class/control. 
+#
+# A real table covering all Unicode characters would be far too big. It can be
 # efficiently compressed by observing that many characters have the same
 # record, and many blocks of characters (taking 128 characters in a block) have
 # the same set of records as other blocks. This leads to a 2-stage lookup
@ -135,13 +136,20 @@
 # in script runs all come from the same set. The first element in the vector
 # contains the number of subsequent elements, which are in ascending order.
 #
+# The lists of scripts in script_names and script_abbrevs are partitioned into
+# two groups. Scripts that appear in at least one character's script extension
+# list come first, followed by "Unknown" and then all the rest. This sorting is
+# done automatically in the GenerateCommon.py script. A script's number
+# is its index in these lists.
+#
 # The ucd_script_sets vector contains bitmaps that represent lists of scripts
-# for the Script Extensions properties of certain characters. Each bitmap
-# consists of a fixed number of unsigned 32-bit numbers, enough to allocate
-# a bit for every known script. A character with more than one script listed
-# for its Script Extension property has a negative value in its record. This is
-# the negated offset to the start of the relevant bitmap in the ucd_script_sets
-# vector.
+# for Script Extensions properties. Each bitmap consists of a fixed number of
+# unsigned 32-bit numbers, enough to allocate a bit for every script that is
+# used in any character's extension list, that is, enough for every script
+# whose number is less than ucp_Unknown. A character's script extension value
+# in its ucd record is an offset into the ucd_script_sets vector. The first
+# bitmap has no bits set; characters that have no script extensions have zero
+# as their script extensions value so that they use this map.
 #
 # The ucd_records table contains one instance of every unique record that is
 # required. The ucd_stage1 table is indexed by a character's block number,
@ -157,15 +165,15 @@
 #
 # Example: lowercase "a" (U+0061) is in block 0
 #          lookup 0 in stage1 table yields 0
-#          lookup 97 (0x61) in the first table in stage2 yields 22
-#          record 22 is { 34, 5, 12, 0, -32, 34, 2, 0 }
-#            34 = ucp_Latin   => Latin script
+#          lookup 97 (0x61) in the first table in stage2 yields 23
+#          record 23 is { 20, 5, 12, 0, -32, 0, 9, 0 }
+#            20 = ucp_Latin   => Latin script
 #             5 = ucp_Ll      => Lower case letter
 #            12 = ucp_gbOther => Grapheme break property "Other"
 #             0               => Not part of a caseless set
 #           -32 (-0x20)       => Other case is U+0041
-#            34 = ucp_Latin   => No special Script Extension property
-#             2 = ucp_bidiL   => Bidi class left-to-right
+#             0               => No special Script Extension property
+#             9 = ucp_bidiL   => Bidi class left-to-right
 #             0               => Dummy value, unused at present
 #
 # Almost all lowercase latin characters resolve to the same record. One or two
@ -174,35 +182,35 @@
 #
 # Example: hiragana letter A (U+3042) is in block 96 (0x60)
 #          lookup 96 in stage1 table yields 91
-#          lookup 66 (0x42) in table 91 in stage2 yields 613
-#          record 613 is { 27, 7, 12, 0, 0, 27, 2, 0 }
-#            27 = ucp_Hiragana => Hiragana script
+#          lookup 66 (0x42) in table 91 in stage2 yields 614
+#          record 614 is { 17, 7, 12, 0, 0, 0, 9, 0 }
+#            17 = ucp_Hiragana => Hiragana script
 #             7 = ucp_Lo       => Other letter
 #            12 = ucp_gbOther  => Grapheme break property "Other"
 #             0                => Not part of a caseless set
 #             0                => No other case
-#            27 = ucp_Hiragana => No special Script Extension property
-#             2 = ucp_bidiL    => Bidi class left-to-right
+#             0                => No special Script Extension property
+#             9 = ucp_bidiL    => Bidi class left-to-right
 #             0                => Dummy value, unused at present
 #
 # Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
 #          lookup 57 in stage1 table yields 55
-#          lookup 80 (0x50) in table 55 in stage2 yields 485
-#          record 485 is { 28, 12, 3, 0, 0, -122, 19, 0 }
-#            28 = ucp_Inherited => Script inherited from predecessor
+#          lookup 80 (0x50) in table 55 in stage2 yields 486
+#          record 486 is { 78, 12, 3, 0, 0, 138, 13, 0 }
+#            78 = ucp_Inherited => Script inherited from predecessor
 #            12 = ucp_Mn        => Non-spacing mark
 #             3 = ucp_gbExtend  => Grapheme break property "Extend"
 #             0                 => Not part of a caseless set
 #             0                 => No other case
-#          -228                 => Script Extension list offset = 228
+#           138                 => Script Extension list offset = 138
 #            13 = ucp_bidiNSM   => Bidi class non-spacing mark
 #             0                 => Dummy value, unused at present
 #
-# At offset 228 in the ucd_script_sets vector we find a bitmap with bits 3, 15,
-# 29, and 107 set. This means that this character is expected to be used with
+# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
+# 18, and 47 set. This means that this character is expected to be used with
 # any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
 #
-#  Philip Hazel, last updated 19 December 2021.
+#  Philip Hazel, last updated 31 December 2021.
 ##############################################################################


@ -775,7 +783,6 @@ f.write("""\
 const uint32_t PRIV(ucd_script_sets)[] = {
 """)

-
 for d in script_lists:
  bitwords = [0] * script_list_item_size

@ -797,8 +804,8 @@ f.write("""\
 /* These are the main two-stage UCD tables. The fields in each record are:
 script (8 bits), character type (8 bits), grapheme break property (8 bits),
 offset to multichar other cases or zero (8 bits), offset to other case or zero
-(32 bits, signed), script extension (16 bits, signed), bidi class (8 bits), and
-a dummy 8-bit field to make the whole thing a multiple of 4 bytes. */
+(32 bits, signed), script extension (8 bits), bidi class (8 bits), and a dummy
+16-bit field to make the whole thing a multiple of 4 bytes. */
 \n""")

 write_records(records, record_size)
--- a/maint/ucptest.c
+++ b/maint/ucptest.c
@ -316,7 +316,7 @@ j = 0;
 for (i = 0; i < PRIV(utt_size); i++)
  {
  const ucp_type_table *u = PRIV(utt) + i;
-  if (u->type == PT_SCX && u->value == script) 
+  if ((u->type == PT_SCX || u->type == PT_SC) && u->value == script) 
    {
    foundlist[j++] = i;
    if (j >= 2) break;
@ -479,38 +479,16 @@ if (is_just_one && othercase != c)
    }
  }

-if (scriptx != script)
+if (scriptx != 0)
  {
+  const char *sep = ""; 
+  const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
  printf(", [");
-  if (scriptx >= 0)
-    printf("%s", get_scriptname(scriptx));
-  else
-    {
-    const char *sep = "";
-    
-
-/* 
-    const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
-    while (*p != 0)
-      {
-      printf("%s%s", sep, get_scriptname(*p++));
-      sep = ", ";
-      }
-*/
-
-    const uint32_t *p = PRIV(ucd_script_sets) - scriptx;
-    for (int i = 0; i < ucp_Script_Count; i++)
-      {
-      int x = i/32;
-      int y = i%32;
-      
-      if ((p[x] & (1u<<y)) != 0)
-        {
-        printf("%s%s", sep, get_scriptname(i));
-        sep = ", ";
-        }
-      }  
- 
+  for (int i = 0; i < ucp_Unknown; i++)
+  if (MAPBIT(p, i) != 0)
+    { 
+    printf("%s%s", sep, get_scriptname(i));
+    sep = ", ";
    }
  printf("]");
  }
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@ -1850,10 +1850,11 @@ typedef struct {
 #define UCD_SCRIPTX(ch)     GET_UCD(ch)->scriptx

 /* The "scriptx" field gives an offset into a vector of 32-bit words that
-form a bitmap representing a list of scripts. This macro tests for a
-script in the map by number. */
+form a bitmap representing a list of scripts. These macros test or set the bit
+for a script in the map by number. */

 #define MAPBIT(map,script) ((map)[(script)/32]&(1u<<((script)%32)))
+#define MAPSET(map,script) ((map)[(script)/32]|=(1u<<((script)%32)))

 /* The "bidi" field has the 0x80 bit set if the character has the Bidi_Control
 property. The remaining bits hold the bidi class, but as there are only 23
--- a/src/pcre2_script_run.c
+++ b/src/pcre2_script_run.c
@ -68,26 +68,26 @@ Arguments:
 Returns:    TRUE if this is a valid script run
 */

-/* These dummy values must be less than the negation of the largest offset in
-the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD
-records (and is only likely to be a few hundred). */
+/* These are states in the checking process. */

-#define SCRIPT_UNSET        (-99999)
-#define SCRIPT_HANPENDING   (-99998)
-#define SCRIPT_HANHIRAKATA  (-99997)
-#define SCRIPT_HANBOPOMOFO  (-99996)
-#define SCRIPT_HANHANGUL    (-99995)
-#define SCRIPT_MAP          (-99994)
+enum { SCRIPT_UNSET,          /* Requirement as yet unknown */
+       SCRIPT_MAP,            /* Bitmap contains acceptable scripts */
+       SCRIPT_HANPENDING,     /* Have had only Han characters */
+       SCRIPT_HANHIRAKATA,    /* Expect Han or Hiragana/Katakana */
+       SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */
+       SCRIPT_HANHANGUL       /* Expect Han or Hangul */
+       };

-#define MAPSIZE (ucp_Script_Count/32 + 1)
+#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
+#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)

 BOOL
 PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
 {
 #ifdef SUPPORT_UNICODE
-int require_script = SCRIPT_UNSET;
-uint32_t intersection_map[MAPSIZE];
-const uint32_t *require_map = NULL;
+uint32_t require_state = SCRIPT_UNSET;
+uint32_t require_map[FULL_MAPSIZE];
+uint32_t map[FULL_MAPSIZE];
 uint32_t require_digitset = 0;
 uint32_t c;

@ -101,11 +101,17 @@ if (ptr >= endptr) return TRUE;
 GETCHARINCTEST(c, ptr);
 if (ptr >= endptr) return TRUE;

+/* Initialize the require map. This is a full-size bitmap that has a bit for
+every script, as opposed to the maps in ucd_script_sets, which only have bits
+for scripts less than ucp_Unknown - those that appear in script extension
+lists. */
+
+for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
+
 /* Scan strings of two or more characters, checking the Unicode characteristics
-of each code point. We make use of the Script Extensions property. There is
-special code for scripts that can be combined with characters from the Han
-Chinese script. This may be used in conjunction with four other scripts in
-these combinations:
+of each code point. There is special code for scripts that can be combined with
+characters from the Han Chinese script. This may be used in conjunction with
+four other scripts in these combinations:

 . Han with Hiragana and Katakana is allowed (for Japanese).
 . Han with Bopomofo is allowed (for Taiwanese Mandarin).
@ -119,264 +125,207 @@ Hence the SCRIPT_HANPENDING state. */
 for (;;)
  {
  const ucd_record *ucd = GET_UCD(c);
-  int32_t scriptx = ucd->scriptx;
+  uint32_t script = ucd->script;

-  /* If the script extension is Unknown, the string is not a valid script run.
-  Such characters can only form script runs of length one. */
+  /* If the script is Unknown, the string is not a valid script run. Such
+  characters can only form script runs of length one (see test above). */

-  if (scriptx == ucp_Unknown) return FALSE;
+  if (script == ucp_Unknown) return FALSE;

-  /* A character whose script extension is Inherited is always accepted with
-  any script, and plays no further part in this testing. A character whose
-  script is Common is always accepted, but must still be tested for a digit
-  below. The scriptx value at this point is non-zero, because zero is
-  ucp_Unknown, tested for above. */
+  /* A character without any script extensions whose script is Inherited or
+  Common is always accepted with any script. If there are extensions, the
+  following processing happens for all scripts. */

-  if (scriptx != ucp_Inherited)
+  if (ucd->scriptx != 0 || (script != ucp_Inherited && script != ucp_Common))
    {
-    if (scriptx != ucp_Common)
+    BOOL OK;
+
+    /* Set up a full-sized map for this character that can include bits for all
+    scripts. Copy the scriptx map for this character (which covers those
+    scripts that appear in script extension lists), set the remaining values to
+    zero, and then, except for Common or Inherited, add this script's bit to
+    the map. */
+
+    memcpy(map, PRIV(ucd_script_sets) + ucd->scriptx, UCD_MAPSIZE * sizeof(uint32_t));
+    memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
+    if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
+
+    /* Handle the different checking states */
+
+    switch(require_state)
      {
-      /* If the script extension value is positive, the character is not a mark
-      that can be used with many scripts. In the simple case we either set or
-      compare with the required script. However, handling the scripts that can
-      combine with Han are more complicated, as is the case when the previous
-      characters have been man-script marks. */
+      /* First significant character - it might follow Common or Inherited
+      characters that do not have any script extensions. */

-      if (scriptx > 0)
+      case SCRIPT_UNSET:
+      switch(script)
        {
-        switch(require_script)
-          {
-          /* Either the first significant character (require_script unset) or
-          after only Han characters. */
+        case ucp_Han:
+        require_state = SCRIPT_HANPENDING;
+        break;

-          case SCRIPT_UNSET:
-          case SCRIPT_HANPENDING:
-          switch(scriptx)
-            {
-            case ucp_Han:
-            require_script = SCRIPT_HANPENDING;
-            break;
+        case ucp_Hiragana:
+        case ucp_Katakana:
+        require_state = SCRIPT_HANHIRAKATA;
+        break;

-            case ucp_Hiragana:
-            case ucp_Katakana:
-            require_script = SCRIPT_HANHIRAKATA;
-            break;
+        case ucp_Bopomofo:
+        require_state = SCRIPT_HANBOPOMOFO;
+        break;

-            case ucp_Bopomofo:
-            require_script = SCRIPT_HANBOPOMOFO;
-            break;
+        case ucp_Hangul:
+        require_state = SCRIPT_HANHANGUL;
+        break;

-            case ucp_Hangul:
-            require_script = SCRIPT_HANHANGUL;
-            break;
+        default:
+        memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
+        require_state = SCRIPT_MAP;
+        break;
+        }
+      break;

-            /* Not a Han-related script. If expecting one, fail. Otherise set
-            the requirement to this script. */
+      /* The first significant character was Han. An inspection of the Unicode
+      11.0.0 files shows that there are the following types of Script Extension
+      list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
+      scripts:

-            default:
-            if (require_script == SCRIPT_HANPENDING) return FALSE;
-            require_script = scriptx;
-            break;
-            }
-          break;
+      . Bopomofo + Han
+      . Han + Hiragana + Katakana
+      . Hiragana + Katakana
+      . Bopomofo + Hangul + Han + Hiragana + Katakana

-          /* Previously encountered one of the "with Han" scripts. Check that
-          this character is appropriate. */
-
-          case SCRIPT_HANHIRAKATA:
-          if (scriptx != ucp_Han && scriptx != ucp_Hiragana && 
-              scriptx != ucp_Katakana)
-            return FALSE;
-          break;
-
-          case SCRIPT_HANBOPOMOFO:
-          if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
-          break;
-
-          case SCRIPT_HANHANGUL:
-          if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
-          break;
-
-          /* We have a bitmap of scripts to check that is derived from one or
-          more previous characters. This is either one of the maps in
-          ucd_script_sets[] (for one previous character) or the intersection of
-          several maps for multiple characters. */
-
-          case SCRIPT_MAP:
-          if (MAPBIT(require_map, scriptx) == 0) return FALSE; 
-
-          /* The rest of the string must be in this script, but we have to 
-          allow for the Han complications. */
-          
-          switch(scriptx)
-            {
-            case ucp_Han:
-            require_script = SCRIPT_HANPENDING;
-            break;
-
-            case ucp_Hiragana:
-            case ucp_Katakana:
-            require_script = SCRIPT_HANHIRAKATA;
-            break;
-
-            case ucp_Bopomofo:
-            require_script = SCRIPT_HANBOPOMOFO;
-            break;
-
-            case ucp_Hangul:
-            require_script = SCRIPT_HANHANGUL;
-            break;
-
-            default:
-            require_script = scriptx;
-            break;
-            }  
-          break;
-
-          /* This is the easy case when a single script is required. */
-
-          default:
-          if (scriptx != require_script) return FALSE;
-          break;
-          }
-        }  /* End of handing positive scriptx */
-
-      /* If scriptx is negative, this character is a mark-type character that
-      has a list of permitted scripts, which are encoded in a bitmap. */
-
-      else
-        {
-        uint32_t chspecial;
-        const uint32_t *map = PRIV(ucd_script_sets) - scriptx;
-        
-        switch(require_script)
-          {
-          case SCRIPT_UNSET:
-          require_map = PRIV(ucd_script_sets) - scriptx;
-          require_script = SCRIPT_MAP;
-          break;
-
-          /* An inspection of the Unicode 11.0.0 files shows that there are the
-          following types of Script Extension list that involve the Han,
-          Bopomofo, Hiragana, Katakana, and Hangul scripts:
-
-          . Bopomofo + Han
-          . Han + Hiragana + Katakana
-          . Hiragana + Katakana
-          . Bopopmofo + Hangul + Han + Hiragana + Katakana
-
-          The following code tries to make sense of this. */
+      The following code tries to make sense of this. */

 #define FOUND_BOPOMOFO 1
 #define FOUND_HIRAGANA 2
 #define FOUND_KATAKANA 4
 #define FOUND_HANGUL   8

-          case SCRIPT_HANPENDING:
-          chspecial = 0;
-
-          if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
-          if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
-          if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
-          if (MAPBIT(map, ucp_Hangul) != 0)   chspecial |= FOUND_HANGUL;
-
-           if (chspecial == 0) return FALSE;
-
-           if (chspecial == FOUND_BOPOMOFO)
-             {
-             require_script = SCRIPT_HANBOPOMOFO;
-             }
-           else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
-             {
-             require_script = SCRIPT_HANHIRAKATA;
-             }
-
-          /* Otherwise it must be allowed with all of them, so remain in
-          the pending state. */
-
-          break;
-
-          case SCRIPT_HANHIRAKATA:
-          if (MAPBIT(map, ucp_Hiragana) != 0) break;
-          if (MAPBIT(map, ucp_Katakana) != 0) break;
-          return FALSE;
-
-          case SCRIPT_HANBOPOMOFO:
-          if (MAPBIT(map, ucp_Bopomofo) != 0) break;
-          return FALSE;
-
-          case SCRIPT_HANHANGUL:
-          if (MAPBIT(map, ucp_Hangul) != 0) break;
-          return FALSE;
-
-          /* Previously encountered one or more characters that are allowed
-          with a list of scripts. Build the intersection of the required list
-          with this character's list in intersection_map[]. */
-
-          case SCRIPT_MAP:
-          for (int i = 0; i < MAPSIZE; i++)
-            intersection_map[i] = require_map[i] & map[i];
-          
-          /* If there's just one script in common, we could set it as the
-          unique required script. However, in the new bitmap arrangements, 
-          finding the one script is expensive, so leave this out for now.
-          Otherwise, make the intersection map the required map. */
-
-          /*
-          if (onescript >= 0) require_script = onescript;
-            else require_map = intersection_map;
-          */   
-
-          require_map = intersection_map;
-          break;
-
-          /* The previously set required script is a single script, not
-          Han-related. Check that it is in this character's list. */
-
-          default:
-          if (MAPBIT(map, require_script) == 0) return FALSE; 
-          break;
-          }
-        }  /* End of handling negative scriptx */
-      }    /* End of checking non-Common character */
-
-    /* The character is in an acceptable script. We must now ensure that all
-    decimal digits in the string come from the same set. Some scripts (e.g.
-    Common, Arabic) have more than one set of decimal digits. This code does
-    not allow mixing sets, even within the same script. The vector called
-    PRIV(ucd_digit_sets)[] contains, in its first element, the number of
-    following elements, and then, in ascending order, the code points of the
-    '9' characters in every set of 10 digits. Each set is identified by the
-    offset in the vector of its '9' character. An initial check of the first
-    value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
-
-    if (ucd->chartype == ucp_Nd)
-      {
-      uint32_t digitset;
-
-      if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
+      case SCRIPT_HANPENDING:
+      if (script != ucp_Han)   /* Another Han does nothing */
        {
-        int mid;
-        int bot = 1;
-        int top = PRIV(ucd_digit_sets)[0];
-        for (;;)
+        uint32_t chspecial = 0;
+
+        if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
+        if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
+        if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
+        if (MAPBIT(map, ucp_Hangul) != 0)   chspecial |= FOUND_HANGUL;
+
+        if (chspecial == 0) return FALSE;   /* Not allowed with Han */
+
+        if (chspecial == FOUND_BOPOMOFO)
+          require_state = SCRIPT_HANBOPOMOFO;
+        else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
+          require_state = SCRIPT_HANHIRAKATA;
+
+        /* Otherwise this character must be allowed with all of them, so remain
+        in the pending state. */
+        }
+      break;
+
+      /* Previously encountered one of the "with Han" scripts. Check that
+      this character is appropriate. */
+
+      case SCRIPT_HANHIRAKATA:
+      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
+          MAPBIT(map, ucp_Katakana) == 0) return FALSE;
+      break;
+
+      case SCRIPT_HANBOPOMOFO:
+      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
+      break;
+
+      case SCRIPT_HANHANGUL:
+      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
+      break;
+
+      /* Previously encountered one or more characters that are allowed with a
+      list of scripts. */
+
+      case SCRIPT_MAP:
+      OK = FALSE;
+
+      for (int i = 0; i < FULL_MAPSIZE; i++)
+        {
+        if ((require_map[i] & map[i]) != 0)
          {
-          if (top <= bot + 1)    /* <= rather than == is paranoia */
-            {
-            digitset = top;
-            break;
-            }
-          mid = (top + bot) / 2;
-          if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
+          OK = TRUE;
+          break;
          }
        }

-      /* A required value of 0 means "unset". */
+      if (!OK) return FALSE;

-      if (require_digitset == 0) require_digitset = digitset;
-        else if (digitset != require_digitset) return FALSE;
-      }   /* End digit handling */
-    }     /* End checking non-Inherited character */
+      /* The rest of the string must be in this script, but we have to
+      allow for the Han complications. */
+
+      switch(script)
+        {
+        case ucp_Han:
+        require_state = SCRIPT_HANPENDING;
+        break;
+
+        case ucp_Hiragana:
+        case ucp_Katakana:
+        require_state = SCRIPT_HANHIRAKATA;
+        break;
+
+        case ucp_Bopomofo:
+        require_state = SCRIPT_HANBOPOMOFO;
+        break;
+
+        case ucp_Hangul:
+        require_state = SCRIPT_HANHANGUL;
+        break;
+
+        /* Compute the intersection of the required list of scripts and the
+        allowed scripts for this character. */
+
+        default:
+        for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
+        break;
+        }
+
+      break;
+      }
+    }   /* End checking character's script and extensions. */
+
+  /* The character is in an acceptable script. We must now ensure that all
+  decimal digits in the string come from the same set. Some scripts (e.g.
+  Common, Arabic) have more than one set of decimal digits. This code does
+  not allow mixing sets, even within the same script. The vector called
+  PRIV(ucd_digit_sets)[] contains, in its first element, the number of
+  following elements, and then, in ascending order, the code points of the
+  '9' characters in every set of 10 digits. Each set is identified by the
+  offset in the vector of its '9' character. An initial check of the first
+  value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
+
+  if (ucd->chartype == ucp_Nd)
+    {
+    uint32_t digitset;
+
+    if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
+      {
+      int mid;
+      int bot = 1;
+      int top = PRIV(ucd_digit_sets)[0];
+      for (;;)
+        {
+        if (top <= bot + 1)    /* <= rather than == is paranoia */
+          {
+          digitset = top;
+          break;
+          }
+        mid = (top + bot) / 2;
+        if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
+        }
+      }
+
+    /* A required value of 0 means "unset". */
+
+    if (require_digitset == 0) require_digitset = digitset;
+      else if (digitset != require_digitset) return FALSE;
+    }   /* End digit handling */

  /* If we haven't yet got to the end, pick up the next character. */

--- a/src/pcre2_ucd.c
+++ b/src/pcre2_ucd.c
@ -237,8 +237,8 @@ const uint32_t PRIV(ucd_script_sets)[] = {
 /* These are the main two-stage UCD tables. The fields in each record are:
 script (8 bits), character type (8 bits), grapheme break property (8 bits),
 offset to multichar other cases or zero (8 bits), offset to other case or zero
-(32 bits, signed), script extension (16 bits, signed), bidi class (8 bits), and
-a dummy 8-bit field to make the whole thing a multiple of 4 bytes. */
+(32 bits, signed), script extension (8 bits), bidi class (8 bits), and a dummy
+16-bit field to make the whole thing a multiple of 4 bytes. */

 const ucd_record PRIV(ucd_records)[] = { /* 12588 bytes, record size 12 */
  {    73,      0,      2,      0,      0,      0,      3,    256, }, /*   0 */
--- a/testdata/testinput4
+++ b/testdata/testinput4
@ -1138,23 +1138,27 @@
 \= Expect no match
    \x{2e7f}

-/^\P{Katakana}+/utf
-    \x{3105}
-\= Expect no match
-    \x{30ff}
-
 /^[\p{Arabic}]/utf
    \x{06e9}
    \x{060b}
 \= Expect no match
    X\x{06e9}

+#subject no_jit
+
+/^\P{Katakana}+/utf
+    \x{3105}
+\= Expect no match
+    \x{30ff}
+
 /^[\P{Yi}]/utf
    \x{2f800}
 \= Expect no match
    \x{a014}
    \x{a4c6}

+#subject -no_jit     
+
 /^\p{Any}X/utf
    AXYZ
    \x{1234}XYZ
@ -2640,4 +2644,13 @@
 /[\p{taml}\p{sc:ugar}]+/utf
    \x{0b82}\x{10380}

+/^[\p{sc:Arabic}]/utf
+\= Expect no match
+    \x{650}
+    \x{651}  
+    \x{652}  
+    \x{653}  
+    \x{654} 
+    \x{655} 
+    
 # End of testinput4
--- a/testdata/testinput5
+++ b/testdata/testinput5
@ -2073,15 +2073,6 @@
    
 # More differences from Perl

-/^[\p{Arabic}]/utf
-\= Expect no match
-    \x{650}
-    \x{651}  
-    \x{652}  
-    \x{653}  
-    \x{654} 
-    \x{655} 
-    
 /^\p{Common}/utf
    \x{60c}
    \x{61f}  
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@ -1883,13 +1883,6 @@ No match
    \x{2e7f}
 No match

-/^\P{Katakana}+/utf
-    \x{3105}
- 0: \x{3105}
-\= Expect no match
-    \x{30ff}
-No match
-
 /^[\p{Arabic}]/utf
    \x{06e9}
 0: \x{6e9}
@ -1899,6 +1892,15 @@ No match
    X\x{06e9}
 No match

+#subject no_jit
+
+/^\P{Katakana}+/utf
+    \x{3105}
+ 0: \x{3105}
+\= Expect no match
+    \x{30ff}
+No match
+
 /^[\P{Yi}]/utf
    \x{2f800}
 0: \x{2f800}
@ -1908,6 +1910,8 @@ No match
    \x{a4c6}
 No match

+#subject -no_jit     
+
 /^\p{Any}X/utf
    AXYZ
 0: AX
@ -4235,4 +4239,19 @@ No match
    \x{0b82}\x{10380}
 0: \x{b82}\x{10380}

+/^[\p{sc:Arabic}]/utf
+\= Expect no match
+    \x{650}
+No match
+    \x{651}  
+No match
+    \x{652}  
+No match
+    \x{653}  
+No match
+    \x{654} 
+No match
+    \x{655} 
+No match
+    
 # End of testinput4
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@ -4722,21 +4722,6 @@ Callout 0: last capture = 1
    
 # More differences from Perl

-/^[\p{Arabic}]/utf
-\= Expect no match
-    \x{650}
-No match
-    \x{651}  
-No match
-    \x{652}  
-No match
-    \x{653}  
-No match
-    \x{654} 
-No match
-    \x{655} 
-No match
-    
 /^\p{Common}/utf
    \x{60c}
 0: \x{60c}