Unicode properties data records extended to 12 bytes to include a
ScriptExtensions property.
2018-10-06 17:39:52 +00:00 · 2018-10-06 17:39:52 +00:00 · 04ba4bce0f
parent cda4780fb6
commit 04ba4bce0f
8 changed files with 4642 additions and 3752 deletions
--- a/maint/MultiStage2.py
+++ b/maint/MultiStage2.py
@ -61,26 +61,39 @@
 #  property, which is used by PCRE2 as a grapheme breaking property. This was
 #  done when updating to Unicode 11.0.0 (July 2018).
 #
+#  Added code to add a Script Extensions field to records.
+#
 #
 # The main tables generated by this script are used by macros defined in
 # pcre2_internal.h. They look up Unicode character properties using short
 # sequences of code that contains no branches, which makes for greater speed.
 #
 # Conceptually, there is a table of records (of type ucd_record), containing a
-# script number, character type, grapheme break type, offset to caseless
-# matching set, and offset to the character's other case for every character.
-# However, a real table covering all Unicode characters would be far too big.
-# It can be efficiently compressed by observing that many characters have the
-# same record, and many blocks of characters (taking 128 characters in a block)
-# have the same set of records as other blocks. This leads to a 2-stage lookup
-# process.
+# script number, script extension value, character type, grapheme break type,
+# offset to caseless matching set, offset to the character's other case, for
+# every character. However, a real table covering all Unicode characters would
+# be far too big. It can be efficiently compressed by observing that many
+# characters have the same record, and many blocks of characters (taking 128
+# characters in a block) have the same set of records as other blocks. This
+# leads to a 2-stage lookup process.
 #
-# This script constructs four tables. The ucd_caseless_sets table contains
+# This script constructs six tables. The ucd_caseless_sets table contains
 # lists of characters that all match each other caselessly. Each list is
 # in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
 # any valid character. The first list is empty; this is used for characters
 # that are not part of any list.
 #
+# The ucd_digit_sets table contains the code points of the '9' characters in
+# each set of 10 decimal digits in Unicode. This is used to ensure that digits
+# in script runs all come from the same set. The first element in the vector
+# contains the number of subsequent elements, which are in ascending order.
+#
+# The ucd_script_sets vector contains lists of script numbers that are the
+# Script Extensions properties of certain characters. Each list is terminated
+# by zero (ucp_Unknown). A character with more than one script listed for its
+# Script Extensions property has a negative value in the script extension
+# field of its record: the negated offset to the start of the relevant list.
+#
 # The ucd_records table contains one instance of every unique record that is
 # required. The ucd_stage1 table is indexed by a character's block number, and
 # yields what is in effect a "virtual" block number. The ucd_stage2 table is a
@ -117,11 +130,8 @@
 # In these examples, no other blocks resolve to the same "virtual" block, as it
 # happens, but plenty of other blocks do share "virtual" blocks.
 #
-# There is a fourth table, maintained by hand, which translates from the
-# individual character types such as ucp_Cc to the general types like ucp_C.
-#
 #  Philip Hazel, 03 July 2008
-#  Last Updated: 07 July 2018
+#  Last Updated: 03 October 2018
 #
 #
 # 01-March-2010:     Updated list of scripts for Unicode 5.2.0
@ -144,6 +154,7 @@
 # 07-July-2018:      Added code to scan emoji-data.txt for the Extended
 #                      Pictographic property.
 # 01-October-2018:   Added the 'Unknown' script name
+# 03-October-2018:   Added new field for Script Extensions
 ##############################################################################


@ -165,6 +176,32 @@ def get_other_case(chardata):
          return int(chardata[2], 16) - int(chardata[0], 16)
        return 0
        
+# Parse a line of ScriptExtensions.txt; chardata[1] is split on spaces into script abbreviations
+def get_script_extension(chardata):
+        this_script_list = list(chardata[1].split(' '))
+        if len(this_script_list) == 1:
+          return script_abbrevs.index(this_script_list[0])
+        # More than one script: build the list of script numbers, terminated by zero
+        script_numbers = []
+        for d in this_script_list:
+          script_numbers.append(script_abbrevs.index(d))
+        script_numbers.append(0)
+        script_numbers_length = len(script_numbers)
+        # Reuse an identical run already in script_lists (the search starts at offset 1)
+        for i in range(1, len(script_lists) - script_numbers_length + 1):
+          for j in range(0, script_numbers_length):
+            found = True 
+            if script_lists[i+j] != script_numbers[j]:
+              found = False 
+              break
+          if found:
+            return -i
+            
+        # Not found in existing lists: append the new list to script_lists and
+        # return the negated offset of its first element
+        return_value = len(script_lists)
+        script_lists.extend(script_numbers)
+        return -return_value 

 # Read the whole table in memory, setting/checking the Unicode version
 def read_table(file_name, get_value, default_value):
@ -330,24 +367,24 @@ def print_records(records, record_size):
                print(('  {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
        print('};\n')

-script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
- 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
- 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
- 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
- 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
- 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
- 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
+script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal',
+ 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian',
+ 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
+ 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam',
+ 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
+ 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
+ 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
 # New for Unicode 5.0
- 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
+ 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
 # New for Unicode 5.1
- 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
+ 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
 # New for Unicode 5.2
- 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
- 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
- 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
- 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
+ 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
+ 'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
+ 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
+ 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
 # New for Unicode 6.0.0
- 'Batak', 'Brahmi', 'Mandaic', \
+ 'Batak', 'Brahmi', 'Mandaic',
 # New for Unicode 6.1.0
 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
 # New for Unicode 7.0.0
@ -366,6 +403,39 @@ script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille
  'Old_Sogdian', 'Sogdian'
 ]
 
+script_abbrevs = [
+  'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans',
+  'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor',
+  'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr',
+  'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb',
+  'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya',
+  'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale',
+  'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii',
+#New for Unicode 5.0
+  'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx',
+#New for Unicode 5.1
+  'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur',
+  'Sund', 'Vaii',
+#New for Unicode 5.2
+  'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu',
+  'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt',
+#New for Unicode 6.0.0
+  'Batk', 'Brah', 'Mand',
+#New for Unicode 6.1.0
+  'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr',
+#New for Unicode 7.0.0
+  'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj',
+  'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm',
+  'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara',
+#New for Unicode 8.0.0
+  'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw',
+#New for Unicode 10.0.0
+  'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo',
+  'Zanb',
+#New for Unicode 11.0.0
+  'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd'
+  ] 
+
 category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
@ -415,6 +485,28 @@ for line in file:
                break_props[i] = break_property_names.index('Extended_Pictographic')
 file.close()

+# The Script Extensions property default value is the Script value. Parse the
+# file, setting 'Unknown' as the default (this will never be a Script Extension
+# value), then scan the resulting table and replace each remaining default
+# with the code point's Script value. Code added by PH in October 2018.
+# Positive values are used for just a single script for a code point. Negative
+# values are negated offsets in a list of lists of multiple scripts. Initialize
+# this list with a single entry, as the zeroth element is never used.
+
+script_lists = [0]
+script_abbrevs_default = script_abbrevs.index('Zzzz')
+scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default)
+
+for i in range(0, MAX_UNICODE):
+  if scriptx[i] == script_abbrevs_default:
+    scriptx[i] = script[i] 
+
+# With the addition of the new Script Extensions field, we need some padding 
+# to get the Unicode records up to 12 bytes (multiple of 4). Set a value 
+# greater than 255 to make the field 16 bits.
+
+padding_dummy = [0] * MAX_UNICODE
+padding_dummy[0] = 256

 # This block of code was added by PH in September 2012. I am not a Python
 # programmer, so the style is probably dreadful, but it does the job. It scans
@ -427,7 +519,7 @@ file.close()
 # sets only one value, so first we go through the table and set "return"
 # offsets for those that are not already set.

-for c in range(0x10ffff):
+for c in range(MAX_UNICODE):
  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
    other_case[c + other_case[c]] = -other_case[c]

@ -435,7 +527,7 @@ for c in range(0x10ffff):

 sets = []

-for c in range(0x10ffff):
+for c in range(MAX_UNICODE):
  o = c + other_case[c]

  # Trigger when this character's other case does not point back here. We
@ -489,7 +581,7 @@ for s in sets:
 # Combine the tables

 table, records = combine_tables(script, category, break_props,
-  caseless_offsets, other_case)
+  caseless_offsets, other_case, scriptx, padding_dummy)

 record_size, record_struct = get_record_size_struct(list(records.keys()))

@ -537,7 +629,7 @@ print("a comment was received about space saving - maybe the guy linked")
 print("all the modules rather than using a library - so we include a")
 print("condition to cut out the tables when not needed. But don't leave")
 print("a totally empty module because some compilers barf at that.")
-print("Instead, just supply small dummy tables. */")
+print("Instead, just supply some small dummy tables. */")
 print()
 print("#ifndef SUPPORT_UNICODE")
 print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};")
@ -559,6 +651,8 @@ print("  ucp_Cn,         /* type unassigned */")
 print("  ucp_gbOther,    /* grapheme break property */")
 print("  0,              /* case set */")
 print("  0,              /* other case */")
+print("  ucp_Unknown,    /* script extension */")
+print("  0,              /* dummy filler */")
 print("  }};")
 print("#endif")
 print()
@ -609,8 +703,7 @@ digitsets.sort()

 print("/* This table lists the code points for the '9' characters in each")
 print("set of decimal digits. It is used to ensure that all the digits in")
-print("a script run come from the same set. */")
-print()
+print("a script run come from the same set. */\n")
 print("const uint32_t PRIV(ucd_digit_sets)[] = {")

 print("  %d,  /* Number of subsequent values */" % len(digitsets), end='')
@ -621,12 +714,28 @@ for d in digitsets:
    count = 0
  print(" 0x%05x," % d, end='')
  count += 1
-print("\n};")
-print()
+print("\n};\n")
+
+print("/* This vector is a list of lists of scripts for the Script Extension")
+print("property. Each sublist is zero-terminated. */\n")
+print("const uint8_t PRIV(ucd_script_sets)[] = {")
+
+count = 0
+print("  /*   0 */", end='')
+for d in script_lists:
+  print(" %3d," % d, end='')
+  count += 1   
+  if d == 0:
+    print("\n  /* %3d */" % count, end='')  
+print("\n};\n")

 # Output the main UCD tables.

-print("/* These are the main two-stage UCD tables. */\n")
+print("/* These are the main two-stage UCD tables. The fields in each record are:")
+print("script (8 bits), character type (8 bits), grapheme break property (8 bits),")
+print("offset to multichar other cases or zero (8 bits), offset to other case")
+print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy")
+print("16-bit field to make the whole thing a multiple of 4 bytes. */\n")

 print_records(records, record_size)
 print_table(min_stage1, 'PRIV(ucd_stage1)')
--- a/maint/Unicode.tables/ScriptExtensions.txt
+++ b/maint/Unicode.tables/ScriptExtensions.txt
@ -0,0 +1,531 @@
+# ScriptExtensions-11.0.0.txt
+# Date: 2018-02-04, 20:04:00 GMT
+# © 2018 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+#   For documentation, see http://www.unicode.org/reports/tr44/
+#
+# The Script_Extensions property indicates which characters are commonly used
+# with more than one script, but with a limited number of scripts.
+# For each code point, there is one or more property values.  Each such value is a Script property value.
+# For more information, see:
+#   UAX #24, Unicode Script Property: http://www.unicode.org/reports/tr24/
+#     Especially the sections:
+#       http://www.unicode.org/reports/tr24/#Assignment_Script_Values
+#       http://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
+#
+# Each Script_Extensions value in this file consists of a set
+# of one or more abbreviated Script property values. The ordering of the
+# values in that set is not material, but for stability in presentation
+# it is given here as alphabetical.
+#
+# The Script_Extensions values are presented in sorted order in the file.
+# They are sorted first by the number of Script property values in their sets,
+# and then alphabetically by first differing Script property value.
+#
+# Following each distinct Script_Extensions value is the list of code
+# points associated with that value, listed in code point order.
+#
+# All code points not explicitly listed for Script_Extensions
+# have as their value the corresponding Script property value
+#
+# @missing: 0000..10FFFF; <script>
+
+# ================================================
+
+# Property:	Script_Extensions
+
+# ================================================
+
+# Script_Extensions=Beng
+
+1CF7          ; Beng # Mc       VEDIC SIGN ATIKRAMA
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Deva
+
+1CD1          ; Deva # Mn       VEDIC TONE SHARA
+1CD4          ; Deva # Mn       VEDIC SIGN YAJURVEDIC MIDLINE SVARITA
+1CDB          ; Deva # Mn       VEDIC TONE TRIPLE SVARITA
+1CDE..1CDF    ; Deva # Mn   [2] VEDIC TONE TWO DOTS BELOW..VEDIC TONE THREE DOTS BELOW
+1CE2..1CE8    ; Deva # Mn   [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
+1CE9          ; Deva # Lo       VEDIC SIGN ANUSVARA ANTARGOMUKHA
+1CEB..1CEC    ; Deva # Lo   [2] VEDIC SIGN ANUSVARA VAMAGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL
+1CEE..1CF1    ; Deva # Lo   [4] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ANUSVARA UBHAYATO MUKHA
+
+# Total code points: 19
+
+# ================================================
+
+# Script_Extensions=Dupl
+
+1BCA0..1BCA3  ; Dupl # Cf   [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
+
+# Total code points: 4
+
+# ================================================
+
+# Script_Extensions=Grek
+
+0342          ; Grek # Mn       COMBINING GREEK PERISPOMENI
+0345          ; Grek # Mn       COMBINING GREEK YPOGEGRAMMENI
+1DC0..1DC1    ; Grek # Mn   [2] COMBINING DOTTED GRAVE ACCENT..COMBINING DOTTED ACUTE ACCENT
+
+# Total code points: 4
+
+# ================================================
+
+# Script_Extensions=Hani
+
+3006          ; Hani # Lo       IDEOGRAPHIC CLOSING MARK
+303E..303F    ; Hani # So   [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE
+3190..3191    ; Hani # So   [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
+3192..3195    ; Hani # No   [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
+3196..319F    ; Hani # So  [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
+31C0..31E3    ; Hani # So  [36] CJK STROKE T..CJK STROKE Q
+3220..3229    ; Hani # No  [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
+322A..3247    ; Hani # So  [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
+3280..3289    ; Hani # No  [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
+328A..32B0    ; Hani # So  [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
+32C0..32CB    ; Hani # So  [12] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DECEMBER
+3358..3370    ; Hani # So  [25] IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR ZERO..IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR TWENTY-FOUR
+337B..337F    ; Hani # So   [5] SQUARE ERA NAME HEISEI..SQUARE CORPORATION
+33E0..33FE    ; Hani # So  [31] IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY ONE..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
+1D360..1D371  ; Hani # No  [18] COUNTING ROD UNIT DIGIT ONE..COUNTING ROD TENS DIGIT NINE
+1F250..1F251  ; Hani # So   [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
+
+# Total code points: 237
+
+# ================================================
+
+# Script_Extensions=Latn
+
+0363..036F    ; Latn # Mn  [13] COMBINING LATIN SMALL LETTER A..COMBINING LATIN SMALL LETTER X
+
+# Total code points: 13
+
+# ================================================
+
+# Script_Extensions=Arab Copt
+
+102E0         ; Arab Copt # Mn       COPTIC EPACT THOUSANDS MARK
+102E1..102FB  ; Arab Copt # No  [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
+
+# Total code points: 28
+
+# ================================================
+
+# Script_Extensions=Arab Rohg
+
+06D4          ; Arab Rohg # Po       ARABIC FULL STOP
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Arab Syrc
+
+064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
+0670          ; Arab Syrc # Mn       ARABIC LETTER SUPERSCRIPT ALEF
+
+# Total code points: 12
+
+# ================================================
+
+# Script_Extensions=Arab Thaa
+
+0660..0669    ; Arab Thaa # Nd  [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
+FDF2          ; Arab Thaa # Lo       ARABIC LIGATURE ALLAH ISOLATED FORM
+FDFD          ; Arab Thaa # So       ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
+
+# Total code points: 12
+
+# ================================================
+
+# Script_Extensions=Armn Geor
+
+0589          ; Armn Geor # Po       ARMENIAN FULL STOP
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Beng Deva
+
+1CD5..1CD6    ; Beng Deva # Mn   [2] VEDIC TONE YAJURVEDIC AGGRAVATED INDEPENDENT SVARITA..VEDIC TONE YAJURVEDIC INDEPENDENT SVARITA
+1CD8          ; Beng Deva # Mn       VEDIC TONE CANDRA BELOW
+1CE1          ; Beng Deva # Mc       VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA
+1CEA          ; Beng Deva # Lo       VEDIC SIGN ANUSVARA BAHIRGOMUKHA
+1CED          ; Beng Deva # Mn       VEDIC SIGN TIRYAK
+1CF5..1CF6    ; Beng Deva # Lo   [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
+A8F1          ; Beng Deva # Mn       COMBINING DEVANAGARI SIGN AVAGRAHA
+
+# Total code points: 9
+
+# ================================================
+
+# Script_Extensions=Bopo Hani
+
+302A..302D    ; Bopo Hani # Mn   [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
+
+# Total code points: 4
+
+# ================================================
+
+# Script_Extensions=Bugi Java
+
+A9CF          ; Bugi Java # Lm       JAVANESE PANGRANGKEP
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Cprt Linb
+
+10100..10102  ; Cprt Linb # Po   [3] AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MARK
+10137..1013F  ; Cprt Linb # So   [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
+
+# Total code points: 12
+
+# ================================================
+
+# Script_Extensions=Cyrl Glag
+
+0484          ; Cyrl Glag # Mn       COMBINING CYRILLIC PALATALIZATION
+0487          ; Cyrl Glag # Mn       COMBINING CYRILLIC POKRYTIE
+2E43          ; Cyrl Glag # Po       DASH WITH LEFT UPTURN
+A66F          ; Cyrl Glag # Mn       COMBINING CYRILLIC VZMET
+
+# Total code points: 4
+
+# ================================================
+
+# Script_Extensions=Cyrl Latn
+
+0485..0486    ; Cyrl Latn # Mn   [2] COMBINING CYRILLIC DASIA PNEUMATA..COMBINING CYRILLIC PSILI PNEUMATA
+
+# Total code points: 2
+
+# ================================================
+
+# Script_Extensions=Cyrl Perm
+
+0483          ; Cyrl Perm # Mn       COMBINING CYRILLIC TITLO
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Deva Gran
+
+1CD3          ; Deva Gran # Po       VEDIC SIGN NIHSHVASA
+1CF2..1CF3    ; Deva Gran # Mc   [2] VEDIC SIGN ARDHAVISARGA..VEDIC SIGN ROTATED ARDHAVISARGA
+1CF8..1CF9    ; Deva Gran # Mn   [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
+
+# Total code points: 5
+
+# ================================================
+
+# Script_Extensions=Deva Shrd
+
+1CD7          ; Deva Shrd # Mn       VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA
+1CD9          ; Deva Shrd # Mn       VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA SCHROEDER
+1CDC..1CDD    ; Deva Shrd # Mn   [2] VEDIC TONE KATHAKA ANUDATTA..VEDIC TONE DOT BELOW
+1CE0          ; Deva Shrd # Mn       VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
+
+# Total code points: 5
+
+# ================================================
+
+# Script_Extensions=Deva Taml
+
+A8F3          ; Deva Taml # Lo       DEVANAGARI SIGN CANDRABINDU VIRAMA
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Geor Latn
+
+10FB          ; Geor Latn # Po       GEORGIAN PARAGRAPH SEPARATOR
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Gran Taml
+
+0BE6..0BEF    ; Gran Taml # Nd  [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE
+0BF0..0BF2    ; Gran Taml # No   [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND
+0BF3          ; Gran Taml # So       TAMIL DAY SIGN
+11301         ; Gran Taml # Mn       GRANTHA SIGN CANDRABINDU
+11303         ; Gran Taml # Mc       GRANTHA SIGN VISARGA
+1133B..1133C  ; Gran Taml # Mn   [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA
+
+# Total code points: 18
+
+# ================================================
+
+# Script_Extensions=Gujr Khoj
+
+0AE6..0AEF    ; Gujr Khoj # Nd  [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE
+
+# Total code points: 10
+
+# ================================================
+
+# Script_Extensions=Guru Mult
+
+0A66..0A6F    ; Guru Mult # Nd  [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE
+
+# Total code points: 10
+
+# ================================================
+
+# Script_Extensions=Hira Kana
+
+3031..3035    ; Hira Kana # Lm   [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
+3099..309A    ; Hira Kana # Mn   [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+309B..309C    ; Hira Kana # Sk   [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+30A0          ; Hira Kana # Pd       KATAKANA-HIRAGANA DOUBLE HYPHEN
+30FC          ; Hira Kana # Lm       KATAKANA-HIRAGANA PROLONGED SOUND MARK
+FF70          ; Hira Kana # Lm       HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
+FF9E..FF9F    ; Hira Kana # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
+
+# Total code points: 14
+
+# ================================================
+
+# Script_Extensions=Mong Phag
+
+1802..1803    ; Mong Phag # Po   [2] MONGOLIAN COMMA..MONGOLIAN FULL STOP
+1805          ; Mong Phag # Po       MONGOLIAN FOUR DOTS
+
+# Total code points: 3
+
+# ================================================
+
+# Script_Extensions=Arab Syrc Thaa
+
+061C          ; Arab Syrc Thaa # Cf       ARABIC LETTER MARK
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Beng Cakm Sylo
+
+09E6..09EF    ; Beng Cakm Sylo # Nd  [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE
+
+# Total code points: 10
+
+# ================================================
+
+# Script_Extensions=Cakm Mymr Tale
+
+1040..1049    ; Cakm Mymr Tale # Nd  [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE
+
+# Total code points: 10
+
+# ================================================
+
+# Script_Extensions=Cprt Lina Linb
+
+10107..10133  ; Cprt Lina Linb # No  [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND
+
+# Total code points: 45
+
+# ================================================
+
+# Script_Extensions=Deva Gran Knda
+
+1CF4          ; Deva Gran Knda # Mn       VEDIC TONE CANDRA ABOVE
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Deva Gran Latn
+
+20F0          ; Deva Gran Latn # Mn       COMBINING ASTERISK ABOVE
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Hani Hira Kana
+
+303C          ; Hani Hira Kana # Lo       MASU MARK
+303D          ; Hani Hira Kana # Po       PART ALTERNATION MARK
+
+# Total code points: 2
+
+# ================================================
+
+# Script_Extensions=Kali Latn Mymr
+
+A92E          ; Kali Latn Mymr # Po       KAYAH LI SIGN CWI
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Arab Rohg Syrc Thaa
+
+060C          ; Arab Rohg Syrc Thaa # Po       ARABIC COMMA
+061B          ; Arab Rohg Syrc Thaa # Po       ARABIC SEMICOLON
+061F          ; Arab Rohg Syrc Thaa # Po       ARABIC QUESTION MARK
+
+# Total code points: 3
+
+# ================================================
+
+# Script_Extensions=Beng Deva Gran Knda
+
+1CD0          ; Beng Deva Gran Knda # Mn       VEDIC TONE KARSHANA
+1CD2          ; Beng Deva Gran Knda # Mn       VEDIC TONE PRENKHA
+
+# Total code points: 2
+
+# ================================================
+
+# Script_Extensions=Buhd Hano Tagb Tglg
+
+1735..1736    ; Buhd Hano Tagb Tglg # Po   [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
+
+# Total code points: 2
+
+# ================================================
+
+# Script_Extensions=Deva Dogr Kthi Mahj
+
+0966..096F    ; Deva Dogr Kthi Mahj # Nd  [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
+
+# Total code points: 10
+
+# ================================================
+
+# Script_Extensions=Bopo Hang Hani Hira Kana
+
+3003          ; Bopo Hang Hani Hira Kana # Po       DITTO MARK
+3013          ; Bopo Hang Hani Hira Kana # So       GETA MARK
+301C          ; Bopo Hang Hani Hira Kana # Pd       WAVE DASH
+301D          ; Bopo Hang Hani Hira Kana # Ps       REVERSED DOUBLE PRIME QUOTATION MARK
+301E..301F    ; Bopo Hang Hani Hira Kana # Pe   [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
+3030          ; Bopo Hang Hani Hira Kana # Pd       WAVY DASH
+3037          ; Bopo Hang Hani Hira Kana # So       IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
+FE45..FE46    ; Bopo Hang Hani Hira Kana # Po   [2] SESAME DOT..WHITE SESAME DOT
+
+# Total code points: 10
+
+# ================================================
+
+# Script_Extensions=Bopo Hang Hani Hira Kana Yiii
+
+3001..3002    ; Bopo Hang Hani Hira Kana Yiii # Po   [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
+3008          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT ANGLE BRACKET
+3009          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT ANGLE BRACKET
+300A          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT DOUBLE ANGLE BRACKET
+300B          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT DOUBLE ANGLE BRACKET
+300C          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT CORNER BRACKET
+300D          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT CORNER BRACKET
+300E          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT WHITE CORNER BRACKET
+300F          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT WHITE CORNER BRACKET
+3010          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT BLACK LENTICULAR BRACKET
+3011          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT BLACK LENTICULAR BRACKET
+3014          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT TORTOISE SHELL BRACKET
+3015          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT TORTOISE SHELL BRACKET
+3016          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT WHITE LENTICULAR BRACKET
+3017          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT WHITE LENTICULAR BRACKET
+3018          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT WHITE TORTOISE SHELL BRACKET
+3019          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT WHITE TORTOISE SHELL BRACKET
+301A          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT WHITE SQUARE BRACKET
+301B          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT WHITE SQUARE BRACKET
+30FB          ; Bopo Hang Hani Hira Kana Yiii # Po       KATAKANA MIDDLE DOT
+FF61          ; Bopo Hang Hani Hira Kana Yiii # Po       HALFWIDTH IDEOGRAPHIC FULL STOP
+FF62          ; Bopo Hang Hani Hira Kana Yiii # Ps       HALFWIDTH LEFT CORNER BRACKET
+FF63          ; Bopo Hang Hani Hira Kana Yiii # Pe       HALFWIDTH RIGHT CORNER BRACKET
+FF64..FF65    ; Bopo Hang Hani Hira Kana Yiii # Po   [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT
+
+# Total code points: 26
+
+# ================================================
+
+# Script_Extensions=Deva Knda Mlym Orya Taml Telu
+
+1CDA          ; Deva Knda Mlym Orya Taml Telu # Mn       VEDIC TONE DOUBLE SVARITA
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Adlm Arab Mand Mani Phlp Rohg Sogd Syrc
+
+0640          ; Adlm Arab Mand Mani Phlp Rohg Sogd Syrc # Lm       ARABIC TATWEEL
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
+
+A836..A837    ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So   [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
+A838          ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # Sc       NORTH INDIC RUPEE MARK
+A839          ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So       NORTH INDIC QUANTITY MARK
+
+# Total code points: 4
+
+# ================================================
+
+# Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh
+
+0952          ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh # Mn       DEVANAGARI STRESS SIGN ANUDATTA
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Sind Takr Tirh
+
+A833..A835    ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Sind Takr Tirh # No   [3] NORTH INDIC FRACTION ONE SIXTEENTH..NORTH INDIC FRACTION THREE SIXTEENTHS
+
+# Total code points: 3
+
+# ================================================
+
+# Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh
+
+0951          ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh # Mn       DEVANAGARI STRESS SIGN UDATTA
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Sind Takr Tirh
+
+A830..A832    ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Sind Takr Tirh # No   [3] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE QUARTERS
+
+# Total code points: 3
+
+# ================================================
+
+# Script_Extensions=Beng Deva Dogr Gong Gran Gujr Guru Knda Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
+
+0964          ; Beng Deva Dogr Gong Gran Gujr Guru Knda Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po       DEVANAGARI DANDA
+
+# Total code points: 1
+
+# ================================================
+
+# Script_Extensions=Beng Deva Dogr Gong Gran Gujr Guru Knda Limb Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
+
+0965          ; Beng Deva Dogr Gong Gran Gujr Guru Knda Limb Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po       DEVANAGARI DOUBLE DANDA
+
+# Total code points: 1
+
+# EOF
--- a/maint/ucptest.c
+++ b/maint/ucptest.c
@ -9,11 +9,12 @@
     ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
 */

-/* The program expects to read commands on stdin, and it writes output
-to stdout. There is only one command, "findprop", followed by a list of Unicode 
-code points as hex numbers (without any prefixes). The output is one line per 
-character, giving its Unicode properties followed by its other case if there is 
-one. */
+/* If there are arguments, they are a list of hexadecimal code points whose
+properties are to be output. Otherwise, the program expects to read commands on
+stdin, and it writes output to stdout. There is only one command, "findprop",
+followed by a list of Unicode code points as hex numbers (without any
+prefixes). The output is one line per character, giving its Unicode properties
+followed by its other case if there is one. If the character's script
+extension value differs from its script, the script extension names are then
+appended as a comma-separated list in square brackets. */

 #ifdef HAVE_CONFIG_H
 #include "../src/config.h"
@ -46,6 +47,183 @@ one. */



+/*************************************************
+*          Find a script name                    *
+*************************************************/
+
+/* Map a script number to a printable name. This is used by print_prop() both
+for a character's script and for each entry in its script extension list.
+
+Argument:   script   a ucp_xxx script number
+Returns:    pointer to a string literal holding the script name, or "??" if
+              the number is not one of the scripts listed here
+*/
+
+static unsigned char *
+find_script_name(int script)
+{
+switch(script)
+  {
+  default:              return US"??";
+  case ucp_Unknown:     return US"Unknown";
+  case ucp_Arabic:      return US"Arabic";
+  case ucp_Armenian:    return US"Armenian";
+  case ucp_Balinese:    return US"Balinese";
+  case ucp_Bengali:     return US"Bengali";
+  case ucp_Bopomofo:    return US"Bopomofo";
+  case ucp_Braille:     return US"Braille";
+  case ucp_Buginese:    return US"Buginese";
+  case ucp_Buhid:       return US"Buhid";
+  case ucp_Canadian_Aboriginal: return US"Canadian_Aboriginal";
+  case ucp_Cherokee:    return US"Cherokee";
+  case ucp_Common:      return US"Common";
+  case ucp_Coptic:      return US"Coptic";
+  case ucp_Cuneiform:   return US"Cuneiform";
+  case ucp_Cypriot:     return US"Cypriot";
+  case ucp_Cyrillic:    return US"Cyrillic";
+  case ucp_Deseret:     return US"Deseret";
+  case ucp_Devanagari:  return US"Devanagari";
+  case ucp_Ethiopic:    return US"Ethiopic";
+  case ucp_Georgian:    return US"Georgian";
+  case ucp_Glagolitic:  return US"Glagolitic";
+  case ucp_Gothic:      return US"Gothic";
+  case ucp_Greek:       return US"Greek";
+  case ucp_Gujarati:    return US"Gujarati";
+  case ucp_Gurmukhi:    return US"Gurmukhi";
+  case ucp_Han:         return US"Han";
+  case ucp_Hangul:      return US"Hangul";
+  case ucp_Hanunoo:     return US"Hanunoo";
+  case ucp_Hebrew:      return US"Hebrew";
+  case ucp_Hiragana:    return US"Hiragana";
+  case ucp_Inherited:   return US"Inherited";
+  case ucp_Kannada:     return US"Kannada";
+  case ucp_Katakana:    return US"Katakana";
+  case ucp_Kharoshthi:  return US"Kharoshthi";
+  case ucp_Khmer:       return US"Khmer";
+  case ucp_Lao:         return US"Lao";
+  case ucp_Latin:       return US"Latin";
+  case ucp_Limbu:       return US"Limbu";
+  case ucp_Linear_B:    return US"Linear_B";
+  case ucp_Malayalam:   return US"Malayalam";
+  case ucp_Mongolian:   return US"Mongolian";
+  case ucp_Myanmar:     return US"Myanmar";
+  case ucp_New_Tai_Lue: return US"New_Tai_Lue";
+  case ucp_Nko:         return US"Nko";
+  case ucp_Ogham:       return US"Ogham";
+  case ucp_Old_Italic:  return US"Old_Italic";
+  case ucp_Old_Persian: return US"Old_Persian";
+  case ucp_Oriya:       return US"Oriya";
+  case ucp_Osmanya:     return US"Osmanya";
+  case ucp_Phags_Pa:    return US"Phags_Pa";
+  case ucp_Phoenician:  return US"Phoenician";
+  case ucp_Runic:       return US"Runic";
+  case ucp_Shavian:     return US"Shavian";
+  case ucp_Sinhala:     return US"Sinhala";
+  case ucp_Syloti_Nagri: return US"Syloti_Nagri";
+  case ucp_Syriac:      return US"Syriac";
+  case ucp_Tagalog:     return US"Tagalog";
+  case ucp_Tagbanwa:    return US"Tagbanwa";
+  case ucp_Tai_Le:      return US"Tai_Le";
+  case ucp_Tamil:       return US"Tamil";
+  case ucp_Telugu:      return US"Telugu";
+  case ucp_Thaana:      return US"Thaana";
+  case ucp_Thai:        return US"Thai";
+  case ucp_Tibetan:     return US"Tibetan";
+  case ucp_Tifinagh:    return US"Tifinagh";
+  case ucp_Ugaritic:    return US"Ugaritic";
+  case ucp_Yi:          return US"Yi";
+  /* New for Unicode 5.1: */
+  case ucp_Carian:      return US"Carian";
+  case ucp_Cham:        return US"Cham";
+  case ucp_Kayah_Li:    return US"Kayah_Li";
+  case ucp_Lepcha:      return US"Lepcha";
+  case ucp_Lycian:      return US"Lycian";
+  case ucp_Lydian:      return US"Lydian";
+  case ucp_Ol_Chiki:    return US"Ol_Chiki";
+  case ucp_Rejang:      return US"Rejang";
+  case ucp_Saurashtra:  return US"Saurashtra";
+  case ucp_Sundanese:   return US"Sundanese";
+  case ucp_Vai:         return US"Vai";
+  /* New for Unicode 5.2: */
+  case ucp_Avestan:     return US"Avestan";
+  case ucp_Bamum:       return US"Bamum";
+  case ucp_Egyptian_Hieroglyphs: return US"Egyptian_Hieroglyphs";
+  case ucp_Imperial_Aramaic: return US"Imperial_Aramaic";
+  case ucp_Inscriptional_Pahlavi: return US"Inscriptional_Pahlavi";
+  case ucp_Inscriptional_Parthian: return US"Inscriptional_Parthian";
+  case ucp_Javanese:    return US"Javanese";
+  case ucp_Kaithi:      return US"Kaithi";
+  case ucp_Lisu:        return US"Lisu";
+  case ucp_Meetei_Mayek: return US"Meetei_Mayek";
+  case ucp_Old_South_Arabian: return US"Old_South_Arabian";
+  case ucp_Old_Turkic:  return US"Old_Turkic";
+  case ucp_Samaritan:   return US"Samaritan";
+  case ucp_Tai_Tham:    return US"Tai_Tham";
+  case ucp_Tai_Viet:    return US"Tai_Viet";
+  /* New for Unicode 6.0.0 */
+  case ucp_Batak:       return US"Batak";
+  case ucp_Brahmi:      return US"Brahmi";
+  case ucp_Mandaic:     return US"Mandaic";
+
+  /* New for Unicode 6.1.0 */
+  case ucp_Chakma:               return US"Chakma";
+  case ucp_Meroitic_Cursive:     return US"Meroitic_Cursive";
+  case ucp_Meroitic_Hieroglyphs: return US"Meroitic_Hieroglyphs";
+  case ucp_Miao:                 return US"Miao";
+  case ucp_Sharada:              return US"Sharada";
+  case ucp_Sora_Sompeng:         return US"Sora_Sompeng";
+  case ucp_Takri:                return US"Takri";
+
+  /* New for Unicode 7.0.0 */
+  case ucp_Bassa_Vah:          return US"Bassa_Vah";
+  case ucp_Caucasian_Albanian: return US"Caucasian_Albanian";
+  case ucp_Duployan:           return US"Duployan";
+  case ucp_Elbasan:            return US"Elbasan";
+  case ucp_Grantha:            return US"Grantha";
+  case ucp_Khojki:             return US"Khojki";
+  case ucp_Khudawadi:          return US"Khudawadi";
+  case ucp_Linear_A:           return US"Linear_A";
+  case ucp_Mahajani:           return US"Mahajani";
+  case ucp_Manichaean:         return US"Manichaean";
+  case ucp_Mende_Kikakui:      return US"Mende_Kikakui";
+  case ucp_Modi:               return US"Modi";
+  case ucp_Mro:                return US"Mro";
+  case ucp_Nabataean:          return US"Nabataean";
+  case ucp_Old_North_Arabian:  return US"Old_North_Arabian";
+  case ucp_Old_Permic:         return US"Old_Permic";
+  case ucp_Pahawh_Hmong:       return US"Pahawh_Hmong";
+  case ucp_Palmyrene:          return US"Palmyrene";
+  case ucp_Psalter_Pahlavi:    return US"Psalter_Pahlavi";
+  case ucp_Pau_Cin_Hau:        return US"Pau_Cin_Hau";
+  case ucp_Siddham:            return US"Siddham";
+  case ucp_Tirhuta:            return US"Tirhuta";
+  case ucp_Warang_Citi:        return US"Warang_Citi";
+
+  /* New for Unicode 8.0.0 */
+  case ucp_Ahom:                  return US"Ahom";
+  case ucp_Anatolian_Hieroglyphs: return US"Anatolian_Hieroglyphs";
+  case ucp_Hatran:                return US"Hatran";
+  case ucp_Multani:               return US"Multani";
+  case ucp_Old_Hungarian:         return US"Old_Hungarian";
+  case ucp_SignWriting:           return US"SignWriting";
+
+  /* New for Unicode 10.0.0 (no update since 8.0.0) */
+  case ucp_Adlam:               return US"Adlam";
+  case ucp_Bhaiksuki:           return US"Bhaiksuki";
+  case ucp_Marchen:             return US"Marchen";
+  case ucp_Newa:                return US"Newa";
+  case ucp_Osage:               return US"Osage";
+  case ucp_Tangut:              return US"Tangut";
+  case ucp_Masaram_Gondi:       return US"Masaram_Gondi";
+  case ucp_Nushu:               return US"Nushu";
+  case ucp_Soyombo:             return US"Soyombo";
+  case ucp_Zanabazar_Square:    return US"Zanabazar_Square";
+
+  /* New for Unicode 11.0.0 */
+  case ucp_Dogra:               return US"Dogra";
+  case ucp_Gunjala_Gondi:       return US"Gunjala_Gondi";
+  case ucp_Hanifi_Rohingya:     return US"Hanifi_Rohingya";
+  case ucp_Makasar:             return US"Makasar";
+  case ucp_Medefaidrin:         return US"Medefaidrin";
+  case ucp_Old_Sogdian:         return US"Old_Sogdian";
+  case ucp_Sogdian:             return US"Sogdian";
+  }
+}
+
+
+
 /*************************************************
 *      Print Unicode property info for a char    *
 *************************************************/
@ -56,15 +234,17 @@ print_prop(int c)
 int type = UCD_CATEGORY(c);
 int fulltype = UCD_CHARTYPE(c);
 int script = UCD_SCRIPT(c);
+int scriptx = UCD_SCRIPTX(c);
 int gbprop = UCD_GRAPHBREAK(c);
 int othercase = UCD_OTHERCASE(c);
 int caseset = UCD_CASESET(c);

 unsigned char *fulltypename = US"??";
 unsigned char *typename = US"??";
-unsigned char *scriptname = US"??";
 unsigned char *graphbreak = US"??";

+unsigned char *scriptname = find_script_name(script); 
+
 switch (type)
  {
  case ucp_C: typename = US"Control"; break;
@ -132,172 +312,6 @@ switch(gbprop)
  default:                 graphbreak = US"Unknown"; break;  
  }
  
-switch(script)
-  {
-  case ucp_Unknown:     scriptname = US"Unknown"; break; 
-  case ucp_Arabic:      scriptname = US"Arabic"; break;
-  case ucp_Armenian:    scriptname = US"Armenian"; break;
-  case ucp_Balinese:    scriptname = US"Balinese"; break;
-  case ucp_Bengali:     scriptname = US"Bengali"; break;
-  case ucp_Bopomofo:    scriptname = US"Bopomofo"; break;
-  case ucp_Braille:     scriptname = US"Braille"; break;
-  case ucp_Buginese:    scriptname = US"Buginese"; break;
-  case ucp_Buhid:       scriptname = US"Buhid"; break;
-  case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
-  case ucp_Cherokee:    scriptname = US"Cherokee"; break;
-  case ucp_Common:      scriptname = US"Common"; break;
-  case ucp_Coptic:      scriptname = US"Coptic"; break;
-  case ucp_Cuneiform:   scriptname = US"Cuneiform"; break;
-  case ucp_Cypriot:     scriptname = US"Cypriot"; break;
-  case ucp_Cyrillic:    scriptname = US"Cyrillic"; break;
-  case ucp_Deseret:     scriptname = US"Deseret"; break;
-  case ucp_Devanagari:  scriptname = US"Devanagari"; break;
-  case ucp_Ethiopic:    scriptname = US"Ethiopic"; break;
-  case ucp_Georgian:    scriptname = US"Georgian"; break;
-  case ucp_Glagolitic:  scriptname = US"Glagolitic"; break;
-  case ucp_Gothic:      scriptname = US"Gothic"; break;
-  case ucp_Greek:       scriptname = US"Greek"; break;
-  case ucp_Gujarati:    scriptname = US"Gujarati"; break;
-  case ucp_Gurmukhi:    scriptname = US"Gurmukhi"; break;
-  case ucp_Han:         scriptname = US"Han"; break;
-  case ucp_Hangul:      scriptname = US"Hangul"; break;
-  case ucp_Hanunoo:     scriptname = US"Hanunoo"; break;
-  case ucp_Hebrew:      scriptname = US"Hebrew"; break;
-  case ucp_Hiragana:    scriptname = US"Hiragana"; break;
-  case ucp_Inherited:   scriptname = US"Inherited"; break;
-  case ucp_Kannada:     scriptname = US"Kannada"; break;
-  case ucp_Katakana:    scriptname = US"Katakana"; break;
-  case ucp_Kharoshthi:  scriptname = US"Kharoshthi"; break;
-  case ucp_Khmer:       scriptname = US"Khmer"; break;
-  case ucp_Lao:         scriptname = US"Lao"; break;
-  case ucp_Latin:       scriptname = US"Latin"; break;
-  case ucp_Limbu:       scriptname = US"Limbu"; break;
-  case ucp_Linear_B:    scriptname = US"Linear_B"; break;
-  case ucp_Malayalam:   scriptname = US"Malayalam"; break;
-  case ucp_Mongolian:   scriptname = US"Mongolian"; break;
-  case ucp_Myanmar:     scriptname = US"Myanmar"; break;
-  case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
-  case ucp_Nko:         scriptname = US"Nko"; break;
-  case ucp_Ogham:       scriptname = US"Ogham"; break;
-  case ucp_Old_Italic:  scriptname = US"Old_Italic"; break;
-  case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
-  case ucp_Oriya:       scriptname = US"Oriya"; break;
-  case ucp_Osmanya:     scriptname = US"Osmanya"; break;
-  case ucp_Phags_Pa:    scriptname = US"Phags_Pa"; break;
-  case ucp_Phoenician:  scriptname = US"Phoenician"; break;
-  case ucp_Runic:       scriptname = US"Runic"; break;
-  case ucp_Shavian:     scriptname = US"Shavian"; break;
-  case ucp_Sinhala:     scriptname = US"Sinhala"; break;
-  case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
-  case ucp_Syriac:      scriptname = US"Syriac"; break;
-  case ucp_Tagalog:     scriptname = US"Tagalog"; break;
-  case ucp_Tagbanwa:    scriptname = US"Tagbanwa"; break;
-  case ucp_Tai_Le:      scriptname = US"Tai_Le"; break;
-  case ucp_Tamil:       scriptname = US"Tamil"; break;
-  case ucp_Telugu:      scriptname = US"Telugu"; break;
-  case ucp_Thaana:      scriptname = US"Thaana"; break;
-  case ucp_Thai:        scriptname = US"Thai"; break;
-  case ucp_Tibetan:     scriptname = US"Tibetan"; break;
-  case ucp_Tifinagh:    scriptname = US"Tifinagh"; break;
-  case ucp_Ugaritic:    scriptname = US"Ugaritic"; break;
-  case ucp_Yi:          scriptname = US"Yi"; break;
-  /* New for Unicode 5.1: */
-  case ucp_Carian:      scriptname = US"Carian"; break;
-  case ucp_Cham:        scriptname = US"Cham"; break;
-  case ucp_Kayah_Li:    scriptname = US"Kayah_Li"; break;
-  case ucp_Lepcha:      scriptname = US"Lepcha"; break;
-  case ucp_Lycian:      scriptname = US"Lycian"; break;
-  case ucp_Lydian:      scriptname = US"Lydian"; break;
-  case ucp_Ol_Chiki:    scriptname = US"Ol_Chiki"; break;
-  case ucp_Rejang:      scriptname = US"Rejang"; break;
-  case ucp_Saurashtra:  scriptname = US"Saurashtra"; break;
-  case ucp_Sundanese:   scriptname = US"Sundanese"; break;
-  case ucp_Vai:         scriptname = US"Vai"; break;
-  /* New for Unicode 5.2: */
-  case ucp_Avestan:     scriptname = US"Avestan"; break;
-  case ucp_Bamum:       scriptname = US"Bamum"; break;
-  case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
-  case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
-  case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
-  case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
-  case ucp_Javanese:    scriptname = US"Javanese"; break;
-  case ucp_Kaithi:      scriptname = US"Kaithi"; break;
-  case ucp_Lisu:        scriptname = US"Lisu"; break;
-  case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
-  case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
-  case ucp_Old_Turkic:  scriptname = US"Old_Turkic"; break;
-  case ucp_Samaritan:   scriptname = US"Samaritan"; break;
-  case ucp_Tai_Tham:    scriptname = US"Tai_Tham"; break;
-  case ucp_Tai_Viet:    scriptname = US"Tai_Viet"; break;
-  /* New for Unicode 6.0.0 */
-  case ucp_Batak:       scriptname = US"Batak"; break;
-  case ucp_Brahmi:      scriptname = US"Brahmi"; break;
-  case ucp_Mandaic:     scriptname = US"Mandaic"; break;
-
-  /* New for Unicode 6.1.0 */
-  case ucp_Chakma:               scriptname = US"Chakma"; break;
-  case ucp_Meroitic_Cursive:     scriptname = US"Meroitic_Cursive"; break;
-  case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
-  case ucp_Miao:                 scriptname = US"Miao"; break;
-  case ucp_Sharada:              scriptname = US"Sharada"; break;
-  case ucp_Sora_Sompeng:         scriptname = US"Sora Sompent"; break;
-  case ucp_Takri:                scriptname = US"Takri"; break;
-
-  /* New for Unicode 7.0.0 */
-  case ucp_Bassa_Vah:          scriptname = US"Bassa_Vah"; break;
-  case ucp_Caucasian_Albanian: scriptname = US"Caucasian_Albanian"; break;
-  case ucp_Duployan:           scriptname = US"Duployan"; break;
-  case ucp_Elbasan:            scriptname = US"Elbasan"; break;
-  case ucp_Grantha:            scriptname = US"Grantha"; break;
-  case ucp_Khojki:             scriptname = US"Khojki"; break;
-  case ucp_Khudawadi:          scriptname = US"Khudawadi"; break;
-  case ucp_Linear_A:           scriptname = US"Linear_A"; break;
-  case ucp_Mahajani:           scriptname = US"Mahajani"; break;
-  case ucp_Manichaean:         scriptname = US"Manichaean"; break;
-  case ucp_Mende_Kikakui:      scriptname = US"Mende_Kikakui"; break;
-  case ucp_Modi:               scriptname = US"Modi"; break;
-  case ucp_Mro:                scriptname = US"Mro"; break;
-  case ucp_Nabataean:          scriptname = US"Nabataean"; break;
-  case ucp_Old_North_Arabian:  scriptname = US"Old_North_Arabian"; break;
-  case ucp_Old_Permic:         scriptname = US"Old_Permic"; break;
-  case ucp_Pahawh_Hmong:       scriptname = US"Pahawh_Hmong"; break;
-  case ucp_Palmyrene:          scriptname = US"Palmyrene"; break;
-  case ucp_Psalter_Pahlavi:    scriptname = US"Psalter_Pahlavi"; break;
-  case ucp_Pau_Cin_Hau:        scriptname = US"Pau_Cin_Hau"; break;
-  case ucp_Siddham:            scriptname = US"Siddham"; break;
-  case ucp_Tirhuta:            scriptname = US"Tirhuta"; break;
-  case ucp_Warang_Citi:        scriptname = US"Warang_Citi"; break;
-
-  /* New for Unicode 8.0.0 */
-  case ucp_Ahom:                  scriptname = US"Ahom"; break;
-  case ucp_Anatolian_Hieroglyphs: scriptname = US"Anatolian_Hieroglyphs"; break;
-  case ucp_Hatran:                scriptname = US"Hatran"; break;
-  case ucp_Multani:               scriptname = US"Multani"; break;
-  case ucp_Old_Hungarian:         scriptname = US"Old_Hungarian"; break;
-  case ucp_SignWriting:           scriptname = US"SignWriting"; break;
-
-  /* New for Unicode 10.0.0 (no update since 8.0.0) */
-  case ucp_Adlam:               scriptname = US"Adlam"; break;
-  case ucp_Bhaiksuki:           scriptname = US"Bhaiksuki"; break;
-  case ucp_Marchen:             scriptname = US"Marchen"; break;
-  case ucp_Newa:                scriptname = US"Newa"; break;
-  case ucp_Osage:               scriptname = US"Osage"; break;
-  case ucp_Tangut:              scriptname = US"Tangut"; break;
-  case ucp_Masaram_Gondi:       scriptname = US"Masaram_Gondi"; break;
-  case ucp_Nushu:               scriptname = US"Nushu"; break;
-  case ucp_Soyombo:             scriptname = US"Soyombo"; break;
-  case ucp_Zanabazar_Square:    scriptname = US"Zanabazar_Square"; break;
-
-  /* New for Unicode 11.0.0 */ 
-  case ucp_Dogra:               scriptname = US"Dogra"; break; 
-  case ucp_Gunjala_Gondi:       scriptname = US"Gunjala_Gondi"; break; 
-  case ucp_Hanifi_Rohingya:     scriptname = US"Hanifi_Rohingya"; break; 
-  case ucp_Makasar:             scriptname = US"Makasar"; break; 
-  case ucp_Medefaidrin:         scriptname = US"Medefaidrin"; break;
-  case ucp_Old_Sogdian:         scriptname = US"Old_Sogdian"; break; 
-  case ucp_Sogdian:             scriptname = US"Sogdian"; break;
-  }
-
 printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
 if (othercase != c) 
  {
@ -309,6 +323,23 @@ if (othercase != c)
      if (*p != othercase && *p != c) printf(", %04x", *p);
    }   
  } 
+  
+if (scriptx != script)
+  {
+  printf(", ["); 
+  if (scriptx >= 0) printf("%s", find_script_name(scriptx)); else
+    {
+    char *sep = ""; 
+    const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
+    while (*p != 0)
+      {
+      printf("%s%s", sep, find_script_name(*p++));
+      sep = ", "; 
+      }   
+    }  
+  printf("]");
+  } 
+ 
 printf("\n");
 }

@ -319,9 +350,22 @@ printf("\n");
 *************************************************/

 int
-main(void)
+main(int argc, char **argv)
 {
 unsigned char buffer[1024];
+
+if (argc > 1)
+  {
+  int i;
+  for (i = 1; i < argc; i++)
+    {
+    unsigned char *endptr; 
+    int c = strtoul(argv[i], CSS(&endptr), 16);
+    print_prop(c); 
+    }
+  return 0;
+  }    
+
 while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
  {
  unsigned char name[24];
--- a/maint/ucptestdata/testinput1
+++ b/maint/ucptestdata/testinput1
@ -38,3 +38,5 @@ findprop 118a0 11ac7 16ad0
 findprop 11700 14400 108e0 11280 1d800

 findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
+
+findprop  a836  a833  1cf4  20f0  1cd0
--- a/maint/ucptestdata/testoutput1
+++ b/maint/ucptestdata/testoutput1
@ -289,7 +289,7 @@ ffe3 Symbol: Modifier symbol, Common, Other
 ffe4 Symbol: Other symbol, Common, Other
 ffe5 Symbol: Currency symbol, Common, Other
 ffe6 Symbol: Currency symbol, Common, Other
-ffe7 Control: Unassigned, Common, Other
+ffe7 Control: Unassigned, Unknown, Other
 findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
 ffe8 Symbol: Other symbol, Common, Other
 ffe9 Symbol: Mathematical symbol, Common, Other
@ -298,22 +298,22 @@ ffeb Symbol: Mathematical symbol, Common, Other
 ffec Symbol: Mathematical symbol, Common, Other
 ffed Symbol: Other symbol, Common, Other
 ffee Symbol: Other symbol, Common, Other
-ffef Control: Unassigned, Common, Other
+ffef Control: Unassigned, Unknown, Other
 findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
-fff8 Control: Unassigned, Common, Control
+fff8 Control: Unassigned, Unknown, Control
 fff9 Control: Format, Common, Control
 fffa Control: Format, Common, Control
 fffb Control: Format, Common, Control
 fffc Symbol: Other symbol, Common, Other
 fffd Symbol: Other symbol, Common, Other
-fffe Control: Unassigned, Common, Other
-ffff Control: Unassigned, Common, Other
+fffe Control: Unassigned, Unknown, Other
+ffff Control: Unassigned, Unknown, Other
 findprop 10000 10001 e01ef f0000 100000
 10000 Letter: Other letter, Linear_B, Other
 10001 Letter: Other letter, Linear_B, Other
 e01ef Mark: Non-spacing mark, Inherited, Extend
-f0000 Control: Private use, Common, Other
-100000 Control: Private use, Common, Other
+f0000 Control: Private use, Unknown, Other
+100000 Control: Private use, Unknown, Other

 findprop 1b00 12000 7c0 a840 10900
 1b00 Mark: Non-spacing mark, Balinese, Extend
@ -379,3 +379,10 @@ findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
 16e48 Letter: Upper case letter, Medefaidrin, Other, 16e68
 10f27 Letter: Other letter, Old_Sogdian, Other
 10f30 Letter: Other letter, Sogdian, Other
+
+findprop  a836  a833  1cf4  20f0  1cd0
+a836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
+a833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
+1cf4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
+20f0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
+1cd0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@ -1778,6 +1778,8 @@ typedef struct {
  uint8_t gbprop;     /* ucp_gbControl, etc. (grapheme break property) */
  uint8_t caseset;    /* offset to multichar other cases or zero */
  int32_t other_case; /* offset to other case, or zero if none */
+  int16_t scriptx;    /* script extension value */
+  int16_t dummy;      /* spare - to round to multiple of 4 bytes */  
 } ucd_record;

 /* UCD access macros */
@ -1800,6 +1802,7 @@ typedef struct {
 #define UCD_GRAPHBREAK(ch)  GET_UCD(ch)->gbprop
 #define UCD_CASESET(ch)     GET_UCD(ch)->caseset
 #define UCD_OTHERCASE(ch)   ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case)))
+#define UCD_SCRIPTX(ch)     GET_UCD(ch)->scriptx

 /* Header for serialized pcre2 codes. */

@ -1858,6 +1861,7 @@ extern const uint8_t          PRIV(utf8_table4)[];
 #define _pcre2_vspace_list             PCRE2_SUFFIX(_pcre2_vspace_list_)
 #define _pcre2_ucd_caseless_sets       PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
 #define _pcre2_ucd_digit_sets          PCRE2_SUFFIX(_pcre2_ucd_digit_sets_)
+#define _pcre2_ucd_script_sets         PCRE2_SUFFIX(_pcre2_ucd_script_sets_)
 #define _pcre2_ucd_records             PCRE2_SUFFIX(_pcre2_ucd_records_)
 #define _pcre2_ucd_stage1              PCRE2_SUFFIX(_pcre2_ucd_stage1_)
 #define _pcre2_ucd_stage2              PCRE2_SUFFIX(_pcre2_ucd_stage2_)
@ -1880,6 +1884,7 @@ extern const uint32_t                  PRIV(hspace_list)[];
 extern const uint32_t                  PRIV(vspace_list)[];
 extern const uint32_t                  PRIV(ucd_caseless_sets)[];
 extern const uint32_t                  PRIV(ucd_digit_sets)[];
+extern const uint8_t                   PRIV(ucd_script_sets)[];
 extern const ucd_record                PRIV(ucd_records)[];
 #if PCRE2_CODE_UNIT_WIDTH == 32
 extern const ucd_record                PRIV(dummy_ucd_record)[];
--- a/src/pcre2_jit_compile.c
+++ b/src/pcre2_jit_compile.c
@ -4716,11 +4716,11 @@ struct sljit_jump *jump;
 #if defined SLJIT_DEBUG && SLJIT_DEBUG
 /* dummy_ucd_record */
 const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR);
-SLJIT_ASSERT(record->script == ucp_Common && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
+SLJIT_ASSERT(record->script == ucp_Unknown && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
 SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0);
 #endif

-SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 8);
+SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 12);

 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);

@ -4756,11 +4756,11 @@ struct sljit_jump *jump;
 #if defined SLJIT_DEBUG && SLJIT_DEBUG
 /* dummy_ucd_record */
 const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR);
-SLJIT_ASSERT(record->script == ucp_Common && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
+SLJIT_ASSERT(record->script == ucp_Unknown && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
 SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0);
 #endif

-SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 8);
+SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 12);

 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);

@ -4781,8 +4781,19 @@ OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2));
 OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1);
+
+// PH hacking
+//fprintf(stderr, "~~A\n");
+  OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
+  OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
+  OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
+  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
+
 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
-OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
+
+  OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
+
+// OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
 }

@ -7775,8 +7786,18 @@ if (needstype || needsscript)
  /* Before anything else, we deal with scripts. */
  if (needsscript)
    {
+// PH hacking
+//fprintf(stderr, "~~B\n");
+
+      OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
+      OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
+      OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); 
+
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
-    OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
+
+      OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
+     
+    // OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);

    ccbegin = cc;

@ -7820,12 +7841,30 @@ if (needstype || needsscript)
    {
    if (!needschar)
      {
+// PH hacking
+//fprintf(stderr, "~~C\n");
+  OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
+  OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
+  OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); 
+  OP2(SLJIT_ADD, TMP1, 0, TMP2, 0, TMP1, 0);
+
      OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
-      OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
+
+  OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
+
+//      OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
      }
    else
      {
+// PH hacking
+//fprintf(stderr, "~~D\n");
+  OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
+ 
      OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
+      
+  OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
+  OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0);
+ 
      OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
      typereg = RETURN_ADDR;
      }
@ -9155,10 +9194,19 @@ if (common->utf && *cc == OP_REFI)

  CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop);

+// PH hacking
+//fprintf(stderr, "~~E\n");
+
  OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
+
  add_jump(compiler, &common->getucd, JUMP(SLJIT_FAST_CALL));

+    OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
+
  OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
+    
+    OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); 
+ 
  OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records));

  OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(ucd_record, other_case));
--- a/src/pcre2_ucd.c
+++ b/src/pcre2_ucd.c