Unicode properties data records extended to 12 bytes to include a
ScriptExtensions property.
2018-10-06 17:39:52 +00:00 · 2018-10-06 17:39:52 +00:00 · 04ba4bce0f
parent cda4780fb6
commit 04ba4bce0f
8 changed files with 4642 additions and 3752 deletions
--- a/maint/MultiStage2.py
+++ b/maint/MultiStage2.py
@ -61,26 +61,39 @@
 #  property, which is used by PCRE2 as a grapheme breaking property. This was
 #  done when updating to Unicode 11.0.0 (July 2018).
 #
 #  Added code to add a Script Extensions field to records.
 #
 #
 # The main tables generated by this script are used by macros defined in
 # pcre2_internal.h. They look up Unicode character properties using short
 # sequences of code that contain no branches, which makes for greater speed.
 #
 # Conceptually, there is a table of records (of type ucd_record), containing a
-# script number, character type, grapheme break type, offset to caseless
+# script number, script extension value, character type, grapheme break type,
-# matching set, and offset to the character's other case for every character.
+# offset to caseless matching set, offset to the character's other case, for
-# However, a real table covering all Unicode characters would be far too big.
+# every character. However, a real table covering all Unicode characters would
-# It can be efficiently compressed by observing that many characters have the
+# be far too big. It can be efficiently compressed by observing that many
-# same record, and many blocks of characters (taking 128 characters in a block)
+# characters have the same record, and many blocks of characters (taking 128
-# have the same set of records as other blocks. This leads to a 2-stage lookup
+# characters in a block) have the same set of records as other blocks. This
-# process.
+# leads to a 2-stage lookup process.
 #
-# This script constructs four tables. The ucd_caseless_sets table contains
+# This script constructs six tables. The ucd_caseless_sets table contains
 # lists of characters that all match each other caselessly. Each list is
 # in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
 # any valid character. The first list is empty; this is used for characters
 # that are not part of any list.
 #
 # The ucd_digit_sets table contains the code points of the '9' characters in
 # each set of 10 decimal digits in Unicode. This is used to ensure that digits
 # in script runs all come from the same set. The first element in the vector
 # contains the number of subsequent elements, which are in ascending order.
 #
 # The ucd_script_sets vector contains lists of script numbers that are the
 # Script Extensions properties of certain characters. Each list is terminated
 # by zero (ucp_Unknown). A character with more than one script listed for its
 # Script Extension property has a negative value in its record. This is the
 # negated offset to the start of the relevant list.
 #
 # The ucd_records table contains one instance of every unique record that is
 # required. The ucd_stage1 table is indexed by a character's block number, and
 # yields what is in effect a "virtual" block number. The ucd_stage2 table is a
@ -117,11 +130,8 @@
 # In these examples, no other blocks resolve to the same "virtual" block, as it
 # happens, but plenty of other blocks do share "virtual" blocks.
 #
 # There is a further table, maintained by hand, which translates from the
 # individual character types such as ucp_Cc to the general types like ucp_C.
 #
 #  Philip Hazel, 03 July 2008
-#  Last Updated: 07 July 2018
+#  Last Updated: 03 October 2018
 #
 #
 # 01-March-2010:     Updated list of scripts for Unicode 5.2.0
@ -144,6 +154,7 @@
 # 07-July-2018:      Added code to scan emoji-data.txt for the Extended
 #                      Pictographic property.
 # 01-October-2018:   Added the 'Unknown' script name
 # 03-October-2018:   Added new field for Script Extensions
 ##############################################################################
@ -165,6 +176,32 @@ def get_other_case(chardata):
          return int(chardata[2], 16) - int(chardata[0], 16)
        return 0
 # Parse a line of ScriptExtensions.txt
 def get_script_extension(chardata):
        # chardata[1] holds the space-separated script abbreviations of one
        # ScriptExtensions.txt entry. A lone abbreviation is returned as its
        # (non-negative) position in script_abbrevs.
        abbrevs = chardata[1].split(' ')
        if len(abbrevs) == 1:
          return script_abbrevs.index(abbrevs[0])

        # Several scripts: convert them to a zero-terminated run of script
        # numbers, which is the form in which runs are kept in script_lists.
        wanted = [script_abbrevs.index(abbrev) for abbrev in abbrevs]
        wanted.append(0)
        width = len(wanted)

        # Reuse an identical run if one is already stored. Offset 0 is the
        # placeholder element of script_lists, so the scan starts at 1.
        for start in range(1, len(script_lists) - width + 1):
          if script_lists[start:start + width] == wanted:
            return -start

        # Nothing stored matches: append this run and return its negated
        # offset.
        offset = len(script_lists)
        script_lists.extend(wanted)
        return -offset
 # Read the whole table in memory, setting/checking the Unicode version
 def read_table(file_name, get_value, default_value):
@ -330,24 +367,24 @@ def print_records(records, record_size):
                print(('  {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
        print('};\n')
-script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
+script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal',
- 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
+ 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian',
- 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
+ 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
- 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
+ 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam',
- 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
+ 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
- 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
+ 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
- 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
+ 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
 # New for Unicode 5.0
- 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
+ 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
 # New for Unicode 5.1
- 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
+ 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
 # New for Unicode 5.2
- 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
+ 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
- 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
+ 'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
- 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
+ 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
- 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
+ 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
 # New for Unicode 6.0.0
- 'Batak', 'Brahmi', 'Mandaic', \
+ 'Batak', 'Brahmi', 'Mandaic',
 # New for Unicode 6.1.0
 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
 # New for Unicode 7.0.0
@ -366,6 +403,39 @@ script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille
  'Old_Sogdian', 'Sogdian'
 ]
 # Four-letter script codes as they appear in ScriptExtensions.txt. The
 # position of a code in this list is the value get_script_extension() returns
 # for it, and those values end up in the same scriptx table as entries copied
 # from script[]. NOTE(review): the order therefore presumably has to mirror
 # script_names entry for entry - confirm before inserting or reordering.
 script_abbrevs = [
  'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans',
  'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor',
  'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr',
  'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb',
  'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya',
  'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale',
  'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii',
 # New for Unicode 5.0
  'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx',
 # New for Unicode 5.1
  'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur',
  'Sund', 'Vaii',
 # New for Unicode 5.2
  'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu',
  'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt',
 # New for Unicode 6.0.0
  'Batk', 'Brah', 'Mand',
 # New for Unicode 6.1.0
  'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr',
 # New for Unicode 7.0.0
  'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj',
  'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm',
  'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara',
 # New for Unicode 8.0.0
  'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw',
 # New for Unicode 10.0.0
  'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo',
  'Zanb',
 # New for Unicode 11.0.0
  'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd'
  ] 
 # Two-letter character type abbreviations (compare the ucp_Cc style names
 # mentioned in the header comment). NOTE(review): presumably the position of
 # each entry is the numeric type stored in the records - confirm against the
 # code that fills the category table.
 category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
@ -415,6 +485,28 @@ for line in file:
                break_props[i] = break_property_names.index('Extended_Pictographic')
 file.close()
 # The Script Extensions property default value is the Script value. Parse the
 # file, setting 'Unknown' as the default (this will never be a Script Extension
 # value), then scan it and fill in the default from Scripts. Code added by PH
 # in October 2018. Positive values are used for just a single script for a
 # code point. Negative values are negated offsets in a list of lists of
 # multiple scripts. Initialize this list with a single entry, as the zeroth
 # element is never used.
 # Offset 0 of script_lists is a placeholder, so every real list sits at an
 # offset >= 1 and its negation cannot be confused with a script number.
 script_lists = [0]
 # 'Zzzz' (Unknown) is the stand-in for "no entry in ScriptExtensions.txt".
 script_abbrevs_default = script_abbrevs.index('Zzzz')
 # get_script_extension() is handed to read_table() as the value parser; it
 # extends script_lists, which is why that list is created first.
 # NOTE(review): read_table() is assumed to call it once per data line - confirm.
 scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default)
 # Any code point still holding the stand-in falls back to its Script value.
 for i in range(0, MAX_UNICODE):
  if scriptx[i] == script_abbrevs_default:
    scriptx[i] = script[i] 
 # With the addition of the new Script Extensions field, we need some padding
 # to get the Unicode records up to 12 bytes (multiple of 4). Set a value
 # greater than 255 to make the field 16 bits.
 # Only element 0 carries the oversized value; every other code point gets 0.
 padding_dummy = [0] * MAX_UNICODE
 padding_dummy[0] = 256
 # This block of code was added by PH in September 2012. I am not a Python
 # programmer, so the style is probably dreadful, but it does the job. It scans
@ -427,7 +519,7 @@ file.close()
 # sets only one value, so first we go through the table and set "return"
 # offsets for those that are not already set.
-for c in range(0x10ffff):
+for c in range(MAX_UNICODE):
  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
    other_case[c + other_case[c]] = -other_case[c]
@ -435,7 +527,7 @@ for c in range(0x10ffff):
 sets = []
-for c in range(0x10ffff):
+for c in range(MAX_UNICODE):
  o = c + other_case[c]
  # Trigger when this character's other case does not point back here. We
@ -489,7 +581,7 @@ for s in sets:
 # Combine the tables
 table, records = combine_tables(script, category, break_props,
-  caseless_offsets, other_case)
+  caseless_offsets, other_case, scriptx, padding_dummy)
 record_size, record_struct = get_record_size_struct(list(records.keys()))
@ -537,7 +629,7 @@ print("a comment was received about space saving - maybe the guy linked")
 print("all the modules rather than using a library - so we include a")
 print("condition to cut out the tables when not needed. But don't leave")
 print("a totally empty module because some compilers barf at that.")
-print("Instead, just supply small dummy tables. */")
+print("Instead, just supply some small dummy tables. */")
 print()
 print("#ifndef SUPPORT_UNICODE")
 print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};")
@ -559,6 +651,8 @@ print("  ucp_Cn,         /* type unassigned */")
 print("  ucp_gbOther,    /* grapheme break property */")
 print("  0,              /* case set */")
 print("  0,              /* other case */")
 print("  ucp_Unknown,    /* script extension */")
 print("  0,              /* dummy filler */")
 print("  }};")
 print("#endif")
 print()
@ -609,8 +703,7 @@ digitsets.sort()
 print("/* This table lists the code points for the '9' characters in each")
 print("set of decimal digits. It is used to ensure that all the digits in")
-print("a script run come from the same set. */")
+print("a script run come from the same set. */\n")
 print()
 print("const uint32_t PRIV(ucd_digit_sets)[] = {")
 print("  %d,  /* Number of subsequent values */" % len(digitsets), end='')
@ -621,12 +714,28 @@ for d in digitsets:
    count = 0
  print(" 0x%05x," % d, end='')
  count += 1
-print("\n};")
+print("\n};\n")
-print()
+
 # Emit the ucd_script_sets vector: the contents of script_lists, i.e. the
 # placeholder element followed by the zero-terminated runs appended by
 # get_script_extension().
 print("/* This vector is a list of lists of scripts for the Script Extension")
 print("property. Each sublist is zero-terminated. */\n")
 print("const uint8_t PRIV(ucd_script_sets)[] = {")
 # count is the number of elements written so far, which is also the offset
 # of the next element to be written.
 count = 0
 print("  /*   0 */", end='')
 for d in script_lists:
  print(" %3d," % d, end='')
  count += 1   
  # A zero closes a sublist (the placeholder at offset 0 counts as one too):
  # start a new output line labelled with the offset of whatever follows.
  if d == 0:
    print("\n  /* %3d */" % count, end='')  
 # The label written after the final terminator has no elements after it.
 print("\n};\n")
 # Output the main UCD tables.
-print("/* These are the main two-stage UCD tables. */\n")
+print("/* These are the main two-stage UCD tables. The fields in each record are:")
 print("script (8 bits), character type (8 bits), grapheme break property (8 bits),")
 print("offset to multichar other cases or zero (8 bits), offset to other case")
 print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy")
 print("16-bit field to make the whole thing a multiple of 4 bytes. */\n")
 print_records(records, record_size)
 print_table(min_stage1, 'PRIV(ucd_stage1)')
--- a/maint/Unicode.tables/ScriptExtensions.txt
+++ b/maint/Unicode.tables/ScriptExtensions.txt
@ -0,0 +1,531 @@
 # ScriptExtensions-11.0.0.txt
 # Date: 2018-02-04, 20:04:00 GMT
 # © 2018 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 #
 # Unicode Character Database
 #   For documentation, see http://www.unicode.org/reports/tr44/
 #
 # The Script_Extensions property indicates which characters are commonly used
 # with more than one script, but with a limited number of scripts.
 # For each code point, there is one or more property values.  Each such value is a Script property value.
 # For more information, see:
 #   UAX #24, Unicode Script Property: http://www.unicode.org/reports/tr24/
 #     Especially the sections:
 #       http://www.unicode.org/reports/tr24/#Assignment_Script_Values
 #       http://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
 #
 # Each Script_Extensions value in this file consists of a set
 # of one or more abbreviated Script property values. The ordering of the
 # values in that set is not material, but for stability in presentation
 # it is given here as alphabetical.
 #
 # The Script_Extensions values are presented in sorted order in the file.
 # They are sorted first by the number of Script property values in their sets,
 # and then alphabetically by first differing Script property value.
 #
 # Following each distinct Script_Extensions value is the list of code
 # points associated with that value, listed in code point order.
 #
 # All code points not explicitly listed for Script_Extensions
 # have as their value the corresponding Script property value
 #
 # @missing: 0000..10FFFF; <script>
 # ================================================
 # Property:	Script_Extensions
 # ================================================
 # Script_Extensions=Beng
 1CF7          ; Beng # Mc       VEDIC SIGN ATIKRAMA
 # Total code points: 1
 # ================================================
 # Script_Extensions=Deva
 1CD1          ; Deva # Mn       VEDIC TONE SHARA
 1CD4          ; Deva # Mn       VEDIC SIGN YAJURVEDIC MIDLINE SVARITA
 1CDB          ; Deva # Mn       VEDIC TONE TRIPLE SVARITA
 1CDE..1CDF    ; Deva # Mn   [2] VEDIC TONE TWO DOTS BELOW..VEDIC TONE THREE DOTS BELOW
 1CE2..1CE8    ; Deva # Mn   [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
 1CE9          ; Deva # Lo       VEDIC SIGN ANUSVARA ANTARGOMUKHA
 1CEB..1CEC    ; Deva # Lo   [2] VEDIC SIGN ANUSVARA VAMAGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL
 1CEE..1CF1    ; Deva # Lo   [4] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ANUSVARA UBHAYATO MUKHA
 # Total code points: 19
 # ================================================
 # Script_Extensions=Dupl
 1BCA0..1BCA3  ; Dupl # Cf   [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
 # Total code points: 4
 # ================================================
 # Script_Extensions=Grek
 0342          ; Grek # Mn       COMBINING GREEK PERISPOMENI
 0345          ; Grek # Mn       COMBINING GREEK YPOGEGRAMMENI
 1DC0..1DC1    ; Grek # Mn   [2] COMBINING DOTTED GRAVE ACCENT..COMBINING DOTTED ACUTE ACCENT
 # Total code points: 4
 # ================================================
 # Script_Extensions=Hani
 3006          ; Hani # Lo       IDEOGRAPHIC CLOSING MARK
 303E..303F    ; Hani # So   [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE
 3190..3191    ; Hani # So   [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
 3192..3195    ; Hani # No   [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
 3196..319F    ; Hani # So  [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
 31C0..31E3    ; Hani # So  [36] CJK STROKE T..CJK STROKE Q
 3220..3229    ; Hani # No  [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
 322A..3247    ; Hani # So  [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
 3280..3289    ; Hani # No  [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
 328A..32B0    ; Hani # So  [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
 32C0..32CB    ; Hani # So  [12] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DECEMBER
 3358..3370    ; Hani # So  [25] IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR ZERO..IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR TWENTY-FOUR
 337B..337F    ; Hani # So   [5] SQUARE ERA NAME HEISEI..SQUARE CORPORATION
 33E0..33FE    ; Hani # So  [31] IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY ONE..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
 1D360..1D371  ; Hani # No  [18] COUNTING ROD UNIT DIGIT ONE..COUNTING ROD TENS DIGIT NINE
 1F250..1F251  ; Hani # So   [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
 # Total code points: 237
 # ================================================
 # Script_Extensions=Latn
 0363..036F    ; Latn # Mn  [13] COMBINING LATIN SMALL LETTER A..COMBINING LATIN SMALL LETTER X
 # Total code points: 13
 # ================================================
 # Script_Extensions=Arab Copt
 102E0         ; Arab Copt # Mn       COPTIC EPACT THOUSANDS MARK
 102E1..102FB  ; Arab Copt # No  [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
 # Total code points: 28
 # ================================================
 # Script_Extensions=Arab Rohg
 06D4          ; Arab Rohg # Po       ARABIC FULL STOP
 # Total code points: 1
 # ================================================
 # Script_Extensions=Arab Syrc
 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
 0670          ; Arab Syrc # Mn       ARABIC LETTER SUPERSCRIPT ALEF
 # Total code points: 12
 # ================================================
 # Script_Extensions=Arab Thaa
 0660..0669    ; Arab Thaa # Nd  [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
 FDF2          ; Arab Thaa # Lo       ARABIC LIGATURE ALLAH ISOLATED FORM
 FDFD          ; Arab Thaa # So       ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
 # Total code points: 12
 # ================================================
 # Script_Extensions=Armn Geor
 0589          ; Armn Geor # Po       ARMENIAN FULL STOP
 # Total code points: 1
 # ================================================
 # Script_Extensions=Beng Deva
 1CD5..1CD6    ; Beng Deva # Mn   [2] VEDIC TONE YAJURVEDIC AGGRAVATED INDEPENDENT SVARITA..VEDIC TONE YAJURVEDIC INDEPENDENT SVARITA
 1CD8          ; Beng Deva # Mn       VEDIC TONE CANDRA BELOW
 1CE1          ; Beng Deva # Mc       VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA
 1CEA          ; Beng Deva # Lo       VEDIC SIGN ANUSVARA BAHIRGOMUKHA
 1CED          ; Beng Deva # Mn       VEDIC SIGN TIRYAK
 1CF5..1CF6    ; Beng Deva # Lo   [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
 A8F1          ; Beng Deva # Mn       COMBINING DEVANAGARI SIGN AVAGRAHA
 # Total code points: 9
 # ================================================
 # Script_Extensions=Bopo Hani
 302A..302D    ; Bopo Hani # Mn   [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
 # Total code points: 4
 # ================================================
 # Script_Extensions=Bugi Java
 A9CF          ; Bugi Java # Lm       JAVANESE PANGRANGKEP
 # Total code points: 1
 # ================================================
 # Script_Extensions=Cprt Linb
 10100..10102  ; Cprt Linb # Po   [3] AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MARK
 10137..1013F  ; Cprt Linb # So   [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
 # Total code points: 12
 # ================================================
 # Script_Extensions=Cyrl Glag
 0484          ; Cyrl Glag # Mn       COMBINING CYRILLIC PALATALIZATION
 0487          ; Cyrl Glag # Mn       COMBINING CYRILLIC POKRYTIE
 2E43          ; Cyrl Glag # Po       DASH WITH LEFT UPTURN
 A66F          ; Cyrl Glag # Mn       COMBINING CYRILLIC VZMET
 # Total code points: 4
 # ================================================
 # Script_Extensions=Cyrl Latn
 0485..0486    ; Cyrl Latn # Mn   [2] COMBINING CYRILLIC DASIA PNEUMATA..COMBINING CYRILLIC PSILI PNEUMATA
 # Total code points: 2
 # ================================================
 # Script_Extensions=Cyrl Perm
 0483          ; Cyrl Perm # Mn       COMBINING CYRILLIC TITLO
 # Total code points: 1
 # ================================================
 # Script_Extensions=Deva Gran
 1CD3          ; Deva Gran # Po       VEDIC SIGN NIHSHVASA
 1CF2..1CF3    ; Deva Gran # Mc   [2] VEDIC SIGN ARDHAVISARGA..VEDIC SIGN ROTATED ARDHAVISARGA
 1CF8..1CF9    ; Deva Gran # Mn   [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
 # Total code points: 5
 # ================================================
 # Script_Extensions=Deva Shrd
 1CD7          ; Deva Shrd # Mn       VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA
 1CD9          ; Deva Shrd # Mn       VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA SCHROEDER
 1CDC..1CDD    ; Deva Shrd # Mn   [2] VEDIC TONE KATHAKA ANUDATTA..VEDIC TONE DOT BELOW
 1CE0          ; Deva Shrd # Mn       VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
 # Total code points: 5
 # ================================================
 # Script_Extensions=Deva Taml
 A8F3          ; Deva Taml # Lo       DEVANAGARI SIGN CANDRABINDU VIRAMA
 # Total code points: 1
 # ================================================
 # Script_Extensions=Geor Latn
 10FB          ; Geor Latn # Po       GEORGIAN PARAGRAPH SEPARATOR
 # Total code points: 1
 # ================================================
 # Script_Extensions=Gran Taml
 0BE6..0BEF    ; Gran Taml # Nd  [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE
 0BF0..0BF2    ; Gran Taml # No   [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND
 0BF3          ; Gran Taml # So       TAMIL DAY SIGN
 11301         ; Gran Taml # Mn       GRANTHA SIGN CANDRABINDU
 11303         ; Gran Taml # Mc       GRANTHA SIGN VISARGA
 1133B..1133C  ; Gran Taml # Mn   [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA
 # Total code points: 18
 # ================================================
 # Script_Extensions=Gujr Khoj
 0AE6..0AEF    ; Gujr Khoj # Nd  [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE
 # Total code points: 10
 # ================================================
 # Script_Extensions=Guru Mult
 0A66..0A6F    ; Guru Mult # Nd  [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE
 # Total code points: 10
 # ================================================
 # Script_Extensions=Hira Kana
 3031..3035    ; Hira Kana # Lm   [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
 3099..309A    ; Hira Kana # Mn   [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
 309B..309C    ; Hira Kana # Sk   [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
 30A0          ; Hira Kana # Pd       KATAKANA-HIRAGANA DOUBLE HYPHEN
 30FC          ; Hira Kana # Lm       KATAKANA-HIRAGANA PROLONGED SOUND MARK
 FF70          ; Hira Kana # Lm       HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
 FF9E..FF9F    ; Hira Kana # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
 # Total code points: 14
 # ================================================
 # Script_Extensions=Mong Phag
 1802..1803    ; Mong Phag # Po   [2] MONGOLIAN COMMA..MONGOLIAN FULL STOP
 1805          ; Mong Phag # Po       MONGOLIAN FOUR DOTS
 # Total code points: 3
 # ================================================
 # Script_Extensions=Arab Syrc Thaa
 061C          ; Arab Syrc Thaa # Cf       ARABIC LETTER MARK
 # Total code points: 1
 # ================================================
 # Script_Extensions=Beng Cakm Sylo
 09E6..09EF    ; Beng Cakm Sylo # Nd  [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE
 # Total code points: 10
 # ================================================
 # Script_Extensions=Cakm Mymr Tale
 1040..1049    ; Cakm Mymr Tale # Nd  [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE
 # Total code points: 10
 # ================================================
 # Script_Extensions=Cprt Lina Linb
 10107..10133  ; Cprt Lina Linb # No  [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND
 # Total code points: 45
 # ================================================
 # Script_Extensions=Deva Gran Knda
 1CF4          ; Deva Gran Knda # Mn       VEDIC TONE CANDRA ABOVE
 # Total code points: 1
 # ================================================
 # Script_Extensions=Deva Gran Latn
 20F0          ; Deva Gran Latn # Mn       COMBINING ASTERISK ABOVE
 # Total code points: 1
 # ================================================
 # Script_Extensions=Hani Hira Kana
 303C          ; Hani Hira Kana # Lo       MASU MARK
 303D          ; Hani Hira Kana # Po       PART ALTERNATION MARK
 # Total code points: 2
 # ================================================
 # Script_Extensions=Kali Latn Mymr
 A92E          ; Kali Latn Mymr # Po       KAYAH LI SIGN CWI
 # Total code points: 1
 # ================================================
 # Script_Extensions=Arab Rohg Syrc Thaa
 060C          ; Arab Rohg Syrc Thaa # Po       ARABIC COMMA
 061B          ; Arab Rohg Syrc Thaa # Po       ARABIC SEMICOLON
 061F          ; Arab Rohg Syrc Thaa # Po       ARABIC QUESTION MARK
 # Total code points: 3
 # ================================================
 # Script_Extensions=Beng Deva Gran Knda
 1CD0          ; Beng Deva Gran Knda # Mn       VEDIC TONE KARSHANA
 1CD2          ; Beng Deva Gran Knda # Mn       VEDIC TONE PRENKHA
 # Total code points: 2
 # ================================================
 # Script_Extensions=Buhd Hano Tagb Tglg
 1735..1736    ; Buhd Hano Tagb Tglg # Po   [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
 # Total code points: 2
 # ================================================
 # Script_Extensions=Deva Dogr Kthi Mahj
 0966..096F    ; Deva Dogr Kthi Mahj # Nd  [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
 # Total code points: 10
 # ================================================
 # Script_Extensions=Bopo Hang Hani Hira Kana
 3003          ; Bopo Hang Hani Hira Kana # Po       DITTO MARK
 3013          ; Bopo Hang Hani Hira Kana # So       GETA MARK
 301C          ; Bopo Hang Hani Hira Kana # Pd       WAVE DASH
 301D          ; Bopo Hang Hani Hira Kana # Ps       REVERSED DOUBLE PRIME QUOTATION MARK
 301E..301F    ; Bopo Hang Hani Hira Kana # Pe   [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
 3030          ; Bopo Hang Hani Hira Kana # Pd       WAVY DASH
 3037          ; Bopo Hang Hani Hira Kana # So       IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
 FE45..FE46    ; Bopo Hang Hani Hira Kana # Po   [2] SESAME DOT..WHITE SESAME DOT
 # Total code points: 10
 # ================================================
 # Script_Extensions=Bopo Hang Hani Hira Kana Yiii
 3001..3002    ; Bopo Hang Hani Hira Kana Yiii # Po   [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
 3008          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT ANGLE BRACKET
 3009          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT ANGLE BRACKET
 300A          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT DOUBLE ANGLE BRACKET
 300B          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT DOUBLE ANGLE BRACKET
 300C          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT CORNER BRACKET
 300D          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT CORNER BRACKET
 300E          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT WHITE CORNER BRACKET
 300F          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT WHITE CORNER BRACKET
 3010          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT BLACK LENTICULAR BRACKET
 3011          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT BLACK LENTICULAR BRACKET
 3014          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT TORTOISE SHELL BRACKET
 3015          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT TORTOISE SHELL BRACKET
 3016          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT WHITE LENTICULAR BRACKET
 3017          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT WHITE LENTICULAR BRACKET
 3018          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT WHITE TORTOISE SHELL BRACKET
 3019          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT WHITE TORTOISE SHELL BRACKET
 301A          ; Bopo Hang Hani Hira Kana Yiii # Ps       LEFT WHITE SQUARE BRACKET
 301B          ; Bopo Hang Hani Hira Kana Yiii # Pe       RIGHT WHITE SQUARE BRACKET
 30FB          ; Bopo Hang Hani Hira Kana Yiii # Po       KATAKANA MIDDLE DOT
 FF61          ; Bopo Hang Hani Hira Kana Yiii # Po       HALFWIDTH IDEOGRAPHIC FULL STOP
 FF62          ; Bopo Hang Hani Hira Kana Yiii # Ps       HALFWIDTH LEFT CORNER BRACKET
 FF63          ; Bopo Hang Hani Hira Kana Yiii # Pe       HALFWIDTH RIGHT CORNER BRACKET
 FF64..FF65    ; Bopo Hang Hani Hira Kana Yiii # Po   [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT
 # Total code points: 26
 # ================================================
 # Script_Extensions=Deva Knda Mlym Orya Taml Telu
 1CDA          ; Deva Knda Mlym Orya Taml Telu # Mn       VEDIC TONE DOUBLE SVARITA
 # Total code points: 1
 # ================================================
 # Script_Extensions=Adlm Arab Mand Mani Phlp Rohg Sogd Syrc
 0640          ; Adlm Arab Mand Mani Phlp Rohg Sogd Syrc # Lm       ARABIC TATWEEL
 # Total code points: 1
 # ================================================
 # Script_Extensions=Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
 A836..A837    ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So   [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
 A838          ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # Sc       NORTH INDIC RUPEE MARK
 A839          ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So       NORTH INDIC QUANTITY MARK
 # Total code points: 4
 # ================================================
 # Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh
 0952          ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh # Mn       DEVANAGARI STRESS SIGN ANUDATTA
 # Total code points: 1
 # ================================================
 # Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Sind Takr Tirh
 A833..A835    ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Sind Takr Tirh # No   [3] NORTH INDIC FRACTION ONE SIXTEENTH..NORTH INDIC FRACTION THREE SIXTEENTHS
 # Total code points: 3
 # ================================================
 # Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh
 0951          ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh # Mn       DEVANAGARI STRESS SIGN UDATTA
 # Total code points: 1
 # ================================================
 # Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Sind Takr Tirh
 A830..A832    ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Sind Takr Tirh # No   [3] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE QUARTERS
 # Total code points: 3
 # ================================================
 # Script_Extensions=Beng Deva Dogr Gong Gran Gujr Guru Knda Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
 0964          ; Beng Deva Dogr Gong Gran Gujr Guru Knda Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po       DEVANAGARI DANDA
 # Total code points: 1
 # ================================================
 # Script_Extensions=Beng Deva Dogr Gong Gran Gujr Guru Knda Limb Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
 0965          ; Beng Deva Dogr Gong Gran Gujr Guru Knda Limb Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po       DEVANAGARI DOUBLE DANDA
 # Total code points: 1
 # EOF
--- a/maint/ucptest.c
+++ b/maint/ucptest.c
@ -9,11 +9,12 @@
     ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
 */
-/* The program expects to read commands on stdin, and it writes output
+/* If there are arguments, they are a list of hexadecimal code points whose
-to stdout. There is only one command, "findprop", followed by a list of Unicode 
+properties are to be output. Otherwise, the program expects to read commands on
-code points as hex numbers (without any prefixes). The output is one line per 
+stdin, and it writes output to stdout. There is only one command, "findprop",
-character, giving its Unicode properties followed by its other case if there is 
+followed by a list of Unicode code points as hex numbers (without any
-one. */
+prefixes). The output is one line per character, giving its Unicode properties
 followed by its other case if there is one. */
 #ifdef HAVE_CONFIG_H
 #include "../src/config.h"
@ -46,6 +47,183 @@ one. */
 /*************************************************
 *          Find a script name                    *
 *************************************************/
 /* Map a script number onto a printable script name. The names are the
 Unicode long script names with underscores between words, matching the
 spelling of the ucp_xxx identifiers.
 Argument:   script   a ucp_xxx script number
 Returns:    pointer to a constant string holding the script's name, or
             "??" when the number matches none of the cases below
 */
 static unsigned char *
 find_script_name(int script)
 {
 switch(script)
  {
  default:              return US"??";
  case ucp_Unknown:     return US"Unknown";
  case ucp_Arabic:      return US"Arabic";
  case ucp_Armenian:    return US"Armenian";
  case ucp_Balinese:    return US"Balinese";
  case ucp_Bengali:     return US"Bengali";
  case ucp_Bopomofo:    return US"Bopomofo";
  case ucp_Braille:     return US"Braille";
  case ucp_Buginese:    return US"Buginese";
  case ucp_Buhid:       return US"Buhid";
  case ucp_Canadian_Aboriginal: return US"Canadian_Aboriginal";
  case ucp_Cherokee:    return US"Cherokee";
  case ucp_Common:      return US"Common";
  case ucp_Coptic:      return US"Coptic";
  case ucp_Cuneiform:   return US"Cuneiform";
  case ucp_Cypriot:     return US"Cypriot";
  case ucp_Cyrillic:    return US"Cyrillic";
  case ucp_Deseret:     return US"Deseret";
  case ucp_Devanagari:  return US"Devanagari";
  case ucp_Ethiopic:    return US"Ethiopic";
  case ucp_Georgian:    return US"Georgian";
  case ucp_Glagolitic:  return US"Glagolitic";
  case ucp_Gothic:      return US"Gothic";
  case ucp_Greek:       return US"Greek";
  case ucp_Gujarati:    return US"Gujarati";
  case ucp_Gurmukhi:    return US"Gurmukhi";
  case ucp_Han:         return US"Han";
  case ucp_Hangul:      return US"Hangul";
  case ucp_Hanunoo:     return US"Hanunoo";
  case ucp_Hebrew:      return US"Hebrew";
  case ucp_Hiragana:    return US"Hiragana";
  case ucp_Inherited:   return US"Inherited";
  case ucp_Kannada:     return US"Kannada";
  case ucp_Katakana:    return US"Katakana";
  case ucp_Kharoshthi:  return US"Kharoshthi";
  case ucp_Khmer:       return US"Khmer";
  case ucp_Lao:         return US"Lao";
  case ucp_Latin:       return US"Latin";
  case ucp_Limbu:       return US"Limbu";
  case ucp_Linear_B:    return US"Linear_B";
  case ucp_Malayalam:   return US"Malayalam";
  case ucp_Mongolian:   return US"Mongolian";
  case ucp_Myanmar:     return US"Myanmar";
  case ucp_New_Tai_Lue: return US"New_Tai_Lue";
  case ucp_Nko:         return US"Nko";
  case ucp_Ogham:       return US"Ogham";
  case ucp_Old_Italic:  return US"Old_Italic";
  case ucp_Old_Persian: return US"Old_Persian";
  case ucp_Oriya:       return US"Oriya";
  case ucp_Osmanya:     return US"Osmanya";
  case ucp_Phags_Pa:    return US"Phags_Pa";
  case ucp_Phoenician:  return US"Phoenician";
  case ucp_Runic:       return US"Runic";
  case ucp_Shavian:     return US"Shavian";
  case ucp_Sinhala:     return US"Sinhala";
  case ucp_Syloti_Nagri: return US"Syloti_Nagri";
  case ucp_Syriac:      return US"Syriac";
  case ucp_Tagalog:     return US"Tagalog";
  case ucp_Tagbanwa:    return US"Tagbanwa";
  case ucp_Tai_Le:      return US"Tai_Le";
  case ucp_Tamil:       return US"Tamil";
  case ucp_Telugu:      return US"Telugu";
  case ucp_Thaana:      return US"Thaana";
  case ucp_Thai:        return US"Thai";
  case ucp_Tibetan:     return US"Tibetan";
  case ucp_Tifinagh:    return US"Tifinagh";
  case ucp_Ugaritic:    return US"Ugaritic";
  case ucp_Yi:          return US"Yi";
  /* New for Unicode 5.1: */
  case ucp_Carian:      return US"Carian";
  case ucp_Cham:        return US"Cham";
  case ucp_Kayah_Li:    return US"Kayah_Li";
  case ucp_Lepcha:      return US"Lepcha";
  case ucp_Lycian:      return US"Lycian";
  case ucp_Lydian:      return US"Lydian";
  case ucp_Ol_Chiki:    return US"Ol_Chiki";
  case ucp_Rejang:      return US"Rejang";
  case ucp_Saurashtra:  return US"Saurashtra";
  case ucp_Sundanese:   return US"Sundanese";
  case ucp_Vai:         return US"Vai";
  /* New for Unicode 5.2: */
  case ucp_Avestan:     return US"Avestan";
  case ucp_Bamum:       return US"Bamum";
  case ucp_Egyptian_Hieroglyphs: return US"Egyptian_Hieroglyphs";
  case ucp_Imperial_Aramaic: return US"Imperial_Aramaic";
  case ucp_Inscriptional_Pahlavi: return US"Inscriptional_Pahlavi";
  case ucp_Inscriptional_Parthian: return US"Inscriptional_Parthian";
  case ucp_Javanese:    return US"Javanese";
  case ucp_Kaithi:      return US"Kaithi";
  case ucp_Lisu:        return US"Lisu";
  case ucp_Meetei_Mayek: return US"Meetei_Mayek";
  case ucp_Old_South_Arabian: return US"Old_South_Arabian";
  case ucp_Old_Turkic:  return US"Old_Turkic";
  case ucp_Samaritan:   return US"Samaritan";
  case ucp_Tai_Tham:    return US"Tai_Tham";
  case ucp_Tai_Viet:    return US"Tai_Viet";
  /* New for Unicode 6.0.0 */
  case ucp_Batak:       return US"Batak";
  case ucp_Brahmi:      return US"Brahmi";
  case ucp_Mandaic:     return US"Mandaic";
  /* New for Unicode 6.1.0 */
  case ucp_Chakma:               return US"Chakma";
  case ucp_Meroitic_Cursive:     return US"Meroitic_Cursive";
  case ucp_Meroitic_Hieroglyphs: return US"Meroitic_Hieroglyphs";
  case ucp_Miao:                 return US"Miao";
  case ucp_Sharada:              return US"Sharada";
  /* The name was previously misspelt as "Sora Sompent". */
  case ucp_Sora_Sompeng:         return US"Sora_Sompeng";
  case ucp_Takri:                return US"Takri";
  /* New for Unicode 7.0.0 */
  case ucp_Bassa_Vah:          return US"Bassa_Vah";
  case ucp_Caucasian_Albanian: return US"Caucasian_Albanian";
  case ucp_Duployan:           return US"Duployan";
  case ucp_Elbasan:            return US"Elbasan";
  case ucp_Grantha:            return US"Grantha";
  case ucp_Khojki:             return US"Khojki";
  case ucp_Khudawadi:          return US"Khudawadi";
  case ucp_Linear_A:           return US"Linear_A";
  case ucp_Mahajani:           return US"Mahajani";
  case ucp_Manichaean:         return US"Manichaean";
  case ucp_Mende_Kikakui:      return US"Mende_Kikakui";
  case ucp_Modi:               return US"Modi";
  case ucp_Mro:                return US"Mro";
  case ucp_Nabataean:          return US"Nabataean";
  case ucp_Old_North_Arabian:  return US"Old_North_Arabian";
  case ucp_Old_Permic:         return US"Old_Permic";
  case ucp_Pahawh_Hmong:       return US"Pahawh_Hmong";
  case ucp_Palmyrene:          return US"Palmyrene";
  case ucp_Psalter_Pahlavi:    return US"Psalter_Pahlavi";
  case ucp_Pau_Cin_Hau:        return US"Pau_Cin_Hau";
  case ucp_Siddham:            return US"Siddham";
  case ucp_Tirhuta:            return US"Tirhuta";
  case ucp_Warang_Citi:        return US"Warang_Citi";
  /* New for Unicode 8.0.0 */
  case ucp_Ahom:                  return US"Ahom";
  case ucp_Anatolian_Hieroglyphs: return US"Anatolian_Hieroglyphs";
  case ucp_Hatran:                return US"Hatran";
  case ucp_Multani:               return US"Multani";
  case ucp_Old_Hungarian:         return US"Old_Hungarian";
  case ucp_SignWriting:           return US"SignWriting";
  /* New for Unicode 10.0.0 (no update since 8.0.0) */
  case ucp_Adlam:               return US"Adlam";
  case ucp_Bhaiksuki:           return US"Bhaiksuki";
  case ucp_Marchen:             return US"Marchen";
  case ucp_Newa:                return US"Newa";
  case ucp_Osage:               return US"Osage";
  case ucp_Tangut:              return US"Tangut";
  case ucp_Masaram_Gondi:       return US"Masaram_Gondi";
  case ucp_Nushu:               return US"Nushu";
  case ucp_Soyombo:             return US"Soyombo";
  case ucp_Zanabazar_Square:    return US"Zanabazar_Square";
  /* New for Unicode 11.0.0 */
  case ucp_Dogra:               return US"Dogra";
  case ucp_Gunjala_Gondi:       return US"Gunjala_Gondi";
  case ucp_Hanifi_Rohingya:     return US"Hanifi_Rohingya";
  case ucp_Makasar:             return US"Makasar";
  case ucp_Medefaidrin:         return US"Medefaidrin";
  case ucp_Old_Sogdian:         return US"Old_Sogdian";
  case ucp_Sogdian:             return US"Sogdian";
  }
 }
 /*************************************************
 *      Print Unicode property info for a char    *
 *************************************************/
@ -56,15 +234,17 @@ print_prop(int c)
 int type = UCD_CATEGORY(c);
 int fulltype = UCD_CHARTYPE(c);
 int script = UCD_SCRIPT(c);
 int scriptx = UCD_SCRIPTX(c);
 int gbprop = UCD_GRAPHBREAK(c);
 int othercase = UCD_OTHERCASE(c);
 int caseset = UCD_CASESET(c);
 unsigned char *fulltypename = US"??";
 unsigned char *typename = US"??";
 unsigned char *scriptname = US"??";
 unsigned char *graphbreak = US"??";
 unsigned char *scriptname = find_script_name(script); 
 switch (type)
  {
  case ucp_C: typename = US"Control"; break;
@ -132,172 +312,6 @@ switch(gbprop)
  default:                 graphbreak = US"Unknown"; break;  
  }
 switch(script)
  {
  case ucp_Unknown:     scriptname = US"Unknown"; break; 
  case ucp_Arabic:      scriptname = US"Arabic"; break;
  case ucp_Armenian:    scriptname = US"Armenian"; break;
  case ucp_Balinese:    scriptname = US"Balinese"; break;
  case ucp_Bengali:     scriptname = US"Bengali"; break;
  case ucp_Bopomofo:    scriptname = US"Bopomofo"; break;
  case ucp_Braille:     scriptname = US"Braille"; break;
  case ucp_Buginese:    scriptname = US"Buginese"; break;
  case ucp_Buhid:       scriptname = US"Buhid"; break;
  case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
  case ucp_Cherokee:    scriptname = US"Cherokee"; break;
  case ucp_Common:      scriptname = US"Common"; break;
  case ucp_Coptic:      scriptname = US"Coptic"; break;
  case ucp_Cuneiform:   scriptname = US"Cuneiform"; break;
  case ucp_Cypriot:     scriptname = US"Cypriot"; break;
  case ucp_Cyrillic:    scriptname = US"Cyrillic"; break;
  case ucp_Deseret:     scriptname = US"Deseret"; break;
  case ucp_Devanagari:  scriptname = US"Devanagari"; break;
  case ucp_Ethiopic:    scriptname = US"Ethiopic"; break;
  case ucp_Georgian:    scriptname = US"Georgian"; break;
  case ucp_Glagolitic:  scriptname = US"Glagolitic"; break;
  case ucp_Gothic:      scriptname = US"Gothic"; break;
  case ucp_Greek:       scriptname = US"Greek"; break;
  case ucp_Gujarati:    scriptname = US"Gujarati"; break;
  case ucp_Gurmukhi:    scriptname = US"Gurmukhi"; break;
  case ucp_Han:         scriptname = US"Han"; break;
  case ucp_Hangul:      scriptname = US"Hangul"; break;
  case ucp_Hanunoo:     scriptname = US"Hanunoo"; break;
  case ucp_Hebrew:      scriptname = US"Hebrew"; break;
  case ucp_Hiragana:    scriptname = US"Hiragana"; break;
  case ucp_Inherited:   scriptname = US"Inherited"; break;
  case ucp_Kannada:     scriptname = US"Kannada"; break;
  case ucp_Katakana:    scriptname = US"Katakana"; break;
  case ucp_Kharoshthi:  scriptname = US"Kharoshthi"; break;
  case ucp_Khmer:       scriptname = US"Khmer"; break;
  case ucp_Lao:         scriptname = US"Lao"; break;
  case ucp_Latin:       scriptname = US"Latin"; break;
  case ucp_Limbu:       scriptname = US"Limbu"; break;
  case ucp_Linear_B:    scriptname = US"Linear_B"; break;
  case ucp_Malayalam:   scriptname = US"Malayalam"; break;
  case ucp_Mongolian:   scriptname = US"Mongolian"; break;
  case ucp_Myanmar:     scriptname = US"Myanmar"; break;
  case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
  case ucp_Nko:         scriptname = US"Nko"; break;
  case ucp_Ogham:       scriptname = US"Ogham"; break;
  case ucp_Old_Italic:  scriptname = US"Old_Italic"; break;
  case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
  case ucp_Oriya:       scriptname = US"Oriya"; break;
  case ucp_Osmanya:     scriptname = US"Osmanya"; break;
  case ucp_Phags_Pa:    scriptname = US"Phags_Pa"; break;
  case ucp_Phoenician:  scriptname = US"Phoenician"; break;
  case ucp_Runic:       scriptname = US"Runic"; break;
  case ucp_Shavian:     scriptname = US"Shavian"; break;
  case ucp_Sinhala:     scriptname = US"Sinhala"; break;
  case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
  case ucp_Syriac:      scriptname = US"Syriac"; break;
  case ucp_Tagalog:     scriptname = US"Tagalog"; break;
  case ucp_Tagbanwa:    scriptname = US"Tagbanwa"; break;
  case ucp_Tai_Le:      scriptname = US"Tai_Le"; break;
  case ucp_Tamil:       scriptname = US"Tamil"; break;
  case ucp_Telugu:      scriptname = US"Telugu"; break;
  case ucp_Thaana:      scriptname = US"Thaana"; break;
  case ucp_Thai:        scriptname = US"Thai"; break;
  case ucp_Tibetan:     scriptname = US"Tibetan"; break;
  case ucp_Tifinagh:    scriptname = US"Tifinagh"; break;
  case ucp_Ugaritic:    scriptname = US"Ugaritic"; break;
  case ucp_Yi:          scriptname = US"Yi"; break;
  /* New for Unicode 5.1: */
  case ucp_Carian:      scriptname = US"Carian"; break;
  case ucp_Cham:        scriptname = US"Cham"; break;
  case ucp_Kayah_Li:    scriptname = US"Kayah_Li"; break;
  case ucp_Lepcha:      scriptname = US"Lepcha"; break;
  case ucp_Lycian:      scriptname = US"Lycian"; break;
  case ucp_Lydian:      scriptname = US"Lydian"; break;
  case ucp_Ol_Chiki:    scriptname = US"Ol_Chiki"; break;
  case ucp_Rejang:      scriptname = US"Rejang"; break;
  case ucp_Saurashtra:  scriptname = US"Saurashtra"; break;
  case ucp_Sundanese:   scriptname = US"Sundanese"; break;
  case ucp_Vai:         scriptname = US"Vai"; break;
  /* New for Unicode 5.2: */
  case ucp_Avestan:     scriptname = US"Avestan"; break;
  case ucp_Bamum:       scriptname = US"Bamum"; break;
  case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
  case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
  case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
  case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
  case ucp_Javanese:    scriptname = US"Javanese"; break;
  case ucp_Kaithi:      scriptname = US"Kaithi"; break;
  case ucp_Lisu:        scriptname = US"Lisu"; break;
  case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
  case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
  case ucp_Old_Turkic:  scriptname = US"Old_Turkic"; break;
  case ucp_Samaritan:   scriptname = US"Samaritan"; break;
  case ucp_Tai_Tham:    scriptname = US"Tai_Tham"; break;
  case ucp_Tai_Viet:    scriptname = US"Tai_Viet"; break;
  /* New for Unicode 6.0.0 */
  case ucp_Batak:       scriptname = US"Batak"; break;
  case ucp_Brahmi:      scriptname = US"Brahmi"; break;
  case ucp_Mandaic:     scriptname = US"Mandaic"; break;
  /* New for Unicode 6.1.0 */
  case ucp_Chakma:               scriptname = US"Chakma"; break;
  case ucp_Meroitic_Cursive:     scriptname = US"Meroitic_Cursive"; break;
  case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
  case ucp_Miao:                 scriptname = US"Miao"; break;
  case ucp_Sharada:              scriptname = US"Sharada"; break;
  case ucp_Sora_Sompeng:         scriptname = US"Sora Sompent"; break;
  case ucp_Takri:                scriptname = US"Takri"; break;
  /* New for Unicode 7.0.0 */
  case ucp_Bassa_Vah:          scriptname = US"Bassa_Vah"; break;
  case ucp_Caucasian_Albanian: scriptname = US"Caucasian_Albanian"; break;
  case ucp_Duployan:           scriptname = US"Duployan"; break;
  case ucp_Elbasan:            scriptname = US"Elbasan"; break;
  case ucp_Grantha:            scriptname = US"Grantha"; break;
  case ucp_Khojki:             scriptname = US"Khojki"; break;
  case ucp_Khudawadi:          scriptname = US"Khudawadi"; break;
  case ucp_Linear_A:           scriptname = US"Linear_A"; break;
  case ucp_Mahajani:           scriptname = US"Mahajani"; break;
  case ucp_Manichaean:         scriptname = US"Manichaean"; break;
  case ucp_Mende_Kikakui:      scriptname = US"Mende_Kikakui"; break;
  case ucp_Modi:               scriptname = US"Modi"; break;
  case ucp_Mro:                scriptname = US"Mro"; break;
  case ucp_Nabataean:          scriptname = US"Nabataean"; break;
  case ucp_Old_North_Arabian:  scriptname = US"Old_North_Arabian"; break;
  case ucp_Old_Permic:         scriptname = US"Old_Permic"; break;
  case ucp_Pahawh_Hmong:       scriptname = US"Pahawh_Hmong"; break;
  case ucp_Palmyrene:          scriptname = US"Palmyrene"; break;
  case ucp_Psalter_Pahlavi:    scriptname = US"Psalter_Pahlavi"; break;
  case ucp_Pau_Cin_Hau:        scriptname = US"Pau_Cin_Hau"; break;
  case ucp_Siddham:            scriptname = US"Siddham"; break;
  case ucp_Tirhuta:            scriptname = US"Tirhuta"; break;
  case ucp_Warang_Citi:        scriptname = US"Warang_Citi"; break;
  /* New for Unicode 8.0.0 */
  case ucp_Ahom:                  scriptname = US"Ahom"; break;
  case ucp_Anatolian_Hieroglyphs: scriptname = US"Anatolian_Hieroglyphs"; break;
  case ucp_Hatran:                scriptname = US"Hatran"; break;
  case ucp_Multani:               scriptname = US"Multani"; break;
  case ucp_Old_Hungarian:         scriptname = US"Old_Hungarian"; break;
  case ucp_SignWriting:           scriptname = US"SignWriting"; break;
  /* New for Unicode 10.0.0 (no update since 8.0.0) */
  case ucp_Adlam:               scriptname = US"Adlam"; break;
  case ucp_Bhaiksuki:           scriptname = US"Bhaiksuki"; break;
  case ucp_Marchen:             scriptname = US"Marchen"; break;
  case ucp_Newa:                scriptname = US"Newa"; break;
  case ucp_Osage:               scriptname = US"Osage"; break;
  case ucp_Tangut:              scriptname = US"Tangut"; break;
  case ucp_Masaram_Gondi:       scriptname = US"Masaram_Gondi"; break;
  case ucp_Nushu:               scriptname = US"Nushu"; break;
  case ucp_Soyombo:             scriptname = US"Soyombo"; break;
  case ucp_Zanabazar_Square:    scriptname = US"Zanabazar_Square"; break;
  /* New for Unicode 11.0.0 */ 
  case ucp_Dogra:               scriptname = US"Dogra"; break; 
  case ucp_Gunjala_Gondi:       scriptname = US"Gunjala_Gondi"; break; 
  case ucp_Hanifi_Rohingya:     scriptname = US"Hanifi_Rohingya"; break; 
  case ucp_Makasar:             scriptname = US"Makasar"; break; 
  case ucp_Medefaidrin:         scriptname = US"Medefaidrin"; break;
  case ucp_Old_Sogdian:         scriptname = US"Old_Sogdian"; break; 
  case ucp_Sogdian:             scriptname = US"Sogdian"; break;
  }
 printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
 if (othercase != c) 
  {
@ -309,6 +323,23 @@ if (othercase != c)
      if (*p != othercase && *p != c) printf(", %04x", *p);
    }   
  } 
 if (scriptx != script)
  {
  printf(", ["); 
  if (scriptx >= 0) printf("%s", find_script_name(scriptx)); else
    {
    char *sep = ""; 
    const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
    while (*p != 0)
      {
      printf("%s%s", sep, find_script_name(*p++));
      sep = ", "; 
      }   
    }  
  printf("]");
  } 
 printf("\n");
 }
@ -319,9 +350,22 @@ printf("\n");
 *************************************************/
 int
-main(void)
+main(int argc, char **argv)
 {
 unsigned char buffer[1024];
 if (argc > 1)
  {
  int i;
  for (i = 1; i < argc; i++)
    {
    unsigned char *endptr; 
    int c = strtoul(argv[i], CSS(&endptr), 16);
    print_prop(c); 
    }
  return 0;
  }    
 while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
  {
  unsigned char name[24];
--- a/maint/ucptestdata/testinput1
+++ b/maint/ucptestdata/testinput1
@ -38,3 +38,5 @@ findprop 118a0 11ac7 16ad0
 findprop 11700 14400 108e0 11280 1d800
 findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
 findprop  a836  a833  1cf4  20f0  1cd0
--- a/maint/ucptestdata/testoutput1
+++ b/maint/ucptestdata/testoutput1
@ -289,7 +289,7 @@ ffe3 Symbol: Modifier symbol, Common, Other
 ffe4 Symbol: Other symbol, Common, Other
 ffe5 Symbol: Currency symbol, Common, Other
 ffe6 Symbol: Currency symbol, Common, Other
-ffe7 Control: Unassigned, Common, Other
+ffe7 Control: Unassigned, Unknown, Other
 findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
 ffe8 Symbol: Other symbol, Common, Other
 ffe9 Symbol: Mathematical symbol, Common, Other
@ -298,22 +298,22 @@ ffeb Symbol: Mathematical symbol, Common, Other
 ffec Symbol: Mathematical symbol, Common, Other
 ffed Symbol: Other symbol, Common, Other
 ffee Symbol: Other symbol, Common, Other
-ffef Control: Unassigned, Common, Other
+ffef Control: Unassigned, Unknown, Other
 findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
-fff8 Control: Unassigned, Common, Control
+fff8 Control: Unassigned, Unknown, Control
 fff9 Control: Format, Common, Control
 fffa Control: Format, Common, Control
 fffb Control: Format, Common, Control
 fffc Symbol: Other symbol, Common, Other
 fffd Symbol: Other symbol, Common, Other
-fffe Control: Unassigned, Common, Other
+fffe Control: Unassigned, Unknown, Other
-ffff Control: Unassigned, Common, Other
+ffff Control: Unassigned, Unknown, Other
 findprop 10000 10001 e01ef f0000 100000
 10000 Letter: Other letter, Linear_B, Other
 10001 Letter: Other letter, Linear_B, Other
 e01ef Mark: Non-spacing mark, Inherited, Extend
-f0000 Control: Private use, Common, Other
+f0000 Control: Private use, Unknown, Other
-100000 Control: Private use, Common, Other
+100000 Control: Private use, Unknown, Other
 findprop 1b00 12000 7c0 a840 10900
 1b00 Mark: Non-spacing mark, Balinese, Extend
@ -379,3 +379,10 @@ findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
 16e48 Letter: Upper case letter, Medefaidrin, Other, 16e68
 10f27 Letter: Other letter, Old_Sogdian, Other
 10f30 Letter: Other letter, Sogdian, Other
 findprop  a836  a833  1cf4  20f0  1cd0
 a836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
 a833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
 1cf4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
 20f0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
 1cd0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@ -1778,6 +1778,8 @@ typedef struct {
  uint8_t gbprop;     /* ucp_gbControl, etc. (grapheme break property) */
  uint8_t caseset;    /* offset to multichar other cases or zero */
  int32_t other_case; /* offset to other case, or zero if none */
  int16_t scriptx;    /* script extension value */
  int16_t dummy;      /* spare - to round to multiple of 4 bytes */  
 } ucd_record;
 /* UCD access macros */
@ -1800,6 +1802,7 @@ typedef struct {
 #define UCD_GRAPHBREAK(ch)  GET_UCD(ch)->gbprop
 #define UCD_CASESET(ch)     GET_UCD(ch)->caseset
 #define UCD_OTHERCASE(ch)   ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case)))
 #define UCD_SCRIPTX(ch)     GET_UCD(ch)->scriptx
 /* Header for serialized pcre2 codes. */
@ -1858,6 +1861,7 @@ extern const uint8_t          PRIV(utf8_table4)[];
 #define _pcre2_vspace_list             PCRE2_SUFFIX(_pcre2_vspace_list_)
 #define _pcre2_ucd_caseless_sets       PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
 #define _pcre2_ucd_digit_sets          PCRE2_SUFFIX(_pcre2_ucd_digit_sets_)
 #define _pcre2_ucd_script_sets         PCRE2_SUFFIX(_pcre2_ucd_script_sets_)
 #define _pcre2_ucd_records             PCRE2_SUFFIX(_pcre2_ucd_records_)
 #define _pcre2_ucd_stage1              PCRE2_SUFFIX(_pcre2_ucd_stage1_)
 #define _pcre2_ucd_stage2              PCRE2_SUFFIX(_pcre2_ucd_stage2_)
@ -1880,6 +1884,7 @@ extern const uint32_t                  PRIV(hspace_list)[];
 extern const uint32_t                  PRIV(vspace_list)[];
 extern const uint32_t                  PRIV(ucd_caseless_sets)[];
 extern const uint32_t                  PRIV(ucd_digit_sets)[];
 extern const uint8_t                   PRIV(ucd_script_sets)[];
 extern const ucd_record                PRIV(ucd_records)[];
 #if PCRE2_CODE_UNIT_WIDTH == 32
 extern const ucd_record                PRIV(dummy_ucd_record)[];
--- a/src/pcre2_jit_compile.c
+++ b/src/pcre2_jit_compile.c
@ -4716,11 +4716,11 @@ struct sljit_jump *jump;
 #if defined SLJIT_DEBUG && SLJIT_DEBUG
 /* dummy_ucd_record */
 const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR);
-SLJIT_ASSERT(record->script == ucp_Common && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
+SLJIT_ASSERT(record->script == ucp_Unknown && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
 SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0);
 #endif
-SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 8);
+SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 12);
 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
@ -4756,11 +4756,11 @@ struct sljit_jump *jump;
 #if defined SLJIT_DEBUG && SLJIT_DEBUG
 /* dummy_ucd_record */
 const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR);
-SLJIT_ASSERT(record->script == ucp_Common && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
+SLJIT_ASSERT(record->script == ucp_Unknown && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
 SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0);
 #endif
-SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 8);
+SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 12);
 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
@ -4781,8 +4781,19 @@ OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2));
 OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1);
 // PH hacking
 //fprintf(stderr, "~~A\n");
  OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
  OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
  OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
-OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
+
  OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
 // OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
 sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
 }
@ -7775,8 +7786,18 @@ if (needstype || needsscript)
  /* Before anything else, we deal with scripts. */
  if (needsscript)
    {
 // PH hacking
 //fprintf(stderr, "~~B\n");
      OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
      OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
      OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); 
    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
-    OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
+
      OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
    // OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
    ccbegin = cc;
@ -7820,12 +7841,30 @@ if (needstype || needsscript)
    {
    if (!needschar)
      {
 // PH hacking
 //fprintf(stderr, "~~C\n");
  OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
  OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
  OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); 
  OP2(SLJIT_ADD, TMP1, 0, TMP2, 0, TMP1, 0);
      OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
-      OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
+
  OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
 //      OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
      }
    else
      {
 // PH hacking
 //fprintf(stderr, "~~D\n");
  OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
      OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
  OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
  OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0);
      OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
      typereg = RETURN_ADDR;
      }
@ -9155,10 +9194,19 @@ if (common->utf && *cc == OP_REFI)
  CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop);
 // PH hacking
 //fprintf(stderr, "~~E\n");
  OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
  add_jump(compiler, &common->getucd, JUMP(SLJIT_FAST_CALL));
    OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
  OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
    OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); 
  OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records));
  OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(ucd_record, other_case));
--- a/src/pcre2_ucd.c
+++ b/src/pcre2_ucd.c