diff --git a/ChangeLog b/ChangeLog index ccbdefc..048bdfd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -8,10 +8,10 @@ Version 10.40-RC1 xx-xxx-2021 1. Merged patch from @carenas (GitHub #35, 7db87842) to fix pcre2grep incorrect handling of multiple passes. -2. Merged patch from @carenas (GitHub #36, dae47509) to fix portability issue +2. Merged patch from @carenas (GitHub #36, dae47509) to fix portability issue in pcre2grep with buffered fseek(stdin). -3. Merged patch from @carenas (GitHub #37, acc520924) to fix tests when -S is +3. Merged patch from @carenas (GitHub #37, acc520924) to fix tests when -S is not supported. 4. Revert an unintended change in JIT repeat detection. @@ -25,7 +25,7 @@ from pcre2grep. 8. Merged patch from @carenas (GitHub #48) to fix CMake install issue #46. -9. Merged patch from @carenas (GitHub #53) fixing NULL checks in matching and +9. Merged patch from @carenas (GitHub #53) fixing NULL checks in matching and substituting. 10. Add null_subject and null_replacement modifiers to pcre2test. @@ -34,9 +34,9 @@ substituting. 12. Add check for NULL replacement to pcre2_substitute(). -13. For the subject arguments of pcre2_match(), pcre2_dfa_match(), and -pcre2_substitute(), and the replacement argument of the latter, if the pointer -is NULL and the length is zero, treat as an empty string. Apparently a number +13. For the subject arguments of pcre2_match(), pcre2_dfa_match(), and +pcre2_substitute(), and the replacement argument of the latter, if the pointer +is NULL and the length is zero, treat as an empty string. Apparently a number of applications treat NULL/0 in this way. 14. Added support for Bidi_Class and Bidi_Control Unicode properties. @@ -48,15 +48,20 @@ of applications treat NULL/0 in this way. 17. A number of changes to script matching for \p and \P: (a) Script extensions for a character are now coded as a bitmap instead of - a list of script numbers, which should be faster and does not need a + a list of script numbers, which should be faster and does not need a loop. - + (b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms sc and scx). - - (c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being - the same as \p{scx:scriptname} because this change happened in Perl at - release 5.26. + + (c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being + the same as \p{scx:scriptname} because this change happened in Perl at + release 5.26. + +18. The Python scripts in the maint directory have been refactored. There are +now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c +(which is #included by pcre2_tables.c). The data lists that used to be +duplicated are now held in a single common Python module. Version 10.39 29-October-2021 diff --git a/maint/GenerateCommon.py b/maint/GenerateCommon.py new file mode 100644 index 0000000..acf2405 --- /dev/null +++ b/maint/GenerateCommon.py @@ -0,0 +1,278 @@ +#! /usr/bin/python + +# PCRE2 UNICODE PROPERTY SUPPORT +# ------------------------------ + +# This file is a Python module containing common lists and functions for the +# GenerateXXX scripts that create various.c and .h files from Unicode data +# files. It was created as part of a re-organizaton of these scripts in +# December 2021. + + +# --------------------------------------------------------------------------- +# DATA LISTS +# --------------------------------------------------------------------------- + +# The lists of script names and script abbreviations must be kept in step. 
Note +# that the pcre2pattern and pcre2syntax documentation has lists of scripts. + +script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \ + 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \ + 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \ + 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \ + 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \ + 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \ + 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \ + # New for Unicode 5.0 + 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \ + # New for Unicode 5.1 + 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \ + # New for Unicode 5.2 + 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \ + 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \ + 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \ + 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \ + # New for Unicode 6.0.0 + 'Batak', 'Brahmi', 'Mandaic', \ +# New for Unicode 6.1.0 + 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri', +# New for Unicode 7.0.0 + 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi', + 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean', + 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi', + 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi', +# New for Unicode 8.0.0 + 'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian', + 'SignWriting', +# New for Unicode 10.0.0 + 'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi', + 'Nushu', 'Soyombo', 'Zanabazar_Square', +# New for Unicode 11.0.0 + 'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin', + 'Old_Sogdian', 'Sogdian', +# New for Unicode 12.0.0 + 'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho', +# New for Unicode 13.0.0 + 'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi', +# New for Unicode 14.0.0 + 'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi' + ] + +script_abbrevs = [ + 'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans', + 'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor', + 'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr', + 'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb', + 'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya', + 'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale', + 'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii', +#New for Unicode 5.0 + 'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx', +#New for Unicode 5.1 + 'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur', + 'Sund', 'Vaii', +#New for Unicode 5.2 + 'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu', + 'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt', +#New for Unicode 6.0.0 + 'Batk', 'Brah', 'Mand', +#New for Unicode 6.1.0 + 'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr', +#New for Unicode 7.0.0 + 'Bass', 'Aghb', 'Dupl', 
'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj', + 'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm', + 'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara', +#New for Unicode 8.0.0 + 'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw', +#New for Unicode 10.0.0 + 'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo', + 'Zanb', +#New for Unicode 11.0.0 + 'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd', +#New for Unicode 12.0.0 + 'Elym', 'Nand', 'Hmnp', 'Wcho', +#New for Unicode 13.0.0 + 'Chrs', 'Diak', 'Kits', 'Yezi', +#New for Unicode 14.0.0 + 'Cpmn', 'Ougr', 'Tngs', 'Toto', 'Vith' + ] + +# BIDI classes in the DerivedBidiClass.txt file, with comments. + +bidi_classes = [ + 'AL', 'Arabic letter', + 'AN', 'Arabic number', + 'B', 'Paragraph separator', + 'BN', 'Boundary neutral', + 'CS', 'Common separator', + 'EN', 'European number', + 'ES', 'European separator', + 'ET', 'European terminator', + 'FSI', 'First strong isolate', + 'L', 'Left to right', + 'LRE', 'Left to right embedding', + 'LRI', 'Left to right isolate', + 'LRO', 'Left to right override', + 'NSM', 'Non-spacing mark', + 'ON', 'Other neutral', + 'PDF', 'Pop directional format', + 'PDI', 'Pop directional isolate', + 'R', 'Right to left', + 'RLE', 'Right to left embedding', + 'RLI', 'Right to left isolate', + 'RLO', 'Right to left override', + 'S', 'Segment separator', + 'WS', 'White space' + ] + +# Particular category property names, with comments. NOTE: If ever this list +# is changed, the table called "catposstab" in the pcre2_auto_possess.c file +# must be edited to keep in step. + +category_names = [ + 'Cc', 'Control', + 'Cf', 'Format', + 'Cn', 'Unassigned', + 'Co', 'Private use', + 'Cs', 'Surrogate', + 'Ll', 'Lower case letter', + 'Lm', 'Modifier letter', + 'Lo', 'Other letter', + 'Lt', 'Title case letter', + 'Lu', 'Upper case letter', + 'Mc', 'Spacing mark', + 'Me', 'Enclosing mark', + 'Mn', 'Non-spacing mark', + 'Nd', 'Decimal number', + 'Nl', 'Letter number', + 'No', 'Other number', + 'Pc', 'Connector punctuation', + 'Pd', 'Dash punctuation', + 'Pe', 'Close punctuation', + 'Pf', 'Final punctuation', + 'Pi', 'Initial punctuation', + 'Po', 'Other punctuation', + 'Ps', 'Open punctuation', + 'Sc', 'Currency symbol', + 'Sk', 'Modifier symbol', + 'Sm', 'Mathematical symbol', + 'So', 'Other symbol', + 'Zl', 'Line separator', + 'Zp', 'Paragraph separator', + 'Zs', 'Space separator' + ] + +# The Extended_Pictographic property is not found in the file where all the +# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt +# file, but we list it here so that the name has the correct index value. + +break_properties = [ + 'CR', ' 0', + 'LF', ' 1', + 'Control', ' 2', + 'Extend', ' 3', + 'Prepend', ' 4', + 'SpacingMark', ' 5', + 'L', ' 6 Hangul syllable type L', + 'V', ' 7 Hangul syllable type V', + 'T', ' 8 Hangul syllable type T', + 'LV', ' 9 Hangul syllable type LV', + 'LVT', '10 Hangul syllable type LVT', + 'Regional_Indicator', '11', + 'Other', '12', + 'ZWJ', '13', + 'Extended_Pictographic', '14' + ] + + +# --------------------------------------------------------------------------- +# DERIVED LISTS +# --------------------------------------------------------------------------- + +# Create general character property names from the first letters of the +# particular categories. 
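+# For example, 'Ll', 'Lm', 'Lo', 'Lt' and 'Lu' all contribute the single
+# letter 'L', so the code below yields ['C', 'L', 'M', 'N', 'P', 'S', 'Z'].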
+ +gcn_set = set(category_names[i][0] for i in range(0, len(category_names), 2)) +general_category_names = list(gcn_set) +general_category_names.sort() + + +# --------------------------------------------------------------------------- +# FUNCTIONS +# --------------------------------------------------------------------------- + +import sys + +# Open an output file, using the command's argument or a default. Write common +# preliminary header information. + +def open_output(default): + if len(sys.argv) > 2: + print('** Too many arguments: just give a file name') + sys.exit(1) + if len(sys.argv) == 2: + output_name = sys.argv[1] + else: + output_name = default + try: + file = open(output_name, "w") + except IOError: + print ("** Couldn't open %s" % output_name) + sys.exit(1) + + script_name = sys.argv[0] + i = script_name.rfind('/') + if i >= 0: + script_name = script_name[i+1:] + + file.write("""\ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2021 University of Cambridge + +This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY! +""") + + file.write("Instead, modify the maint/%s script and run it to generate\n" + "a new version of this code.\n\n" % script_name) + + file.write("""\ +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ +\n""") + return file + +# End of UcpCommon.py diff --git a/maint/GenerateUcd.py b/maint/GenerateUcd.py new file mode 100755 index 0000000..c9ead06 --- /dev/null +++ b/maint/GenerateUcd.py @@ -0,0 +1,857 @@ +#! 
/usr/bin/python + +# PCRE2 UNICODE PROPERTY SUPPORT +# ------------------------------ +# +# This script generates the pcre2_ucd.c file from Unicode data files. This is +# the compressed Unicode property data used by PCRE2. The script was created in +# December 2021 as part of the Unicode data generation refactoring. It is +# basically a re-working of the MultiStage2.py script that was submitted to the +# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of +# Unicode property support. A number of extensions have since been added. The +# main difference in the 2021 upgrade (apart from comments and layout) is that +# the data tables (e.g. list of script names) are now held in a separate Python +# module that is shared with the other Generate scripts. +# +# This script must be run in the "maint" directory. It requires eight Unicode +# data tables: DerivedBidiClass.txt, DerivedGeneralCategory.txt, +# GraphemeBreakProperty.txt, PropList.txt, Scripts.txt, ScriptExtensions.txt, +# CaseFolding.txt, and emoji-data.txt. These must be in the Unicode.tables +# subdirectory. +# +# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted" +# subdirectory of the Unicode database (UCD) on the Unicode web site; +# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. PropList.txt, +# Scripts.txt, ScriptExtensions.txt, and CaseFolding.txt are directly in the +# UCD directory. +# +# The emoji-data.txt file is found in the "emoji" subdirectory even though it +# is technically part of a different (but coordinated) standard as shown +# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"), +# for example: +# +# http://unicode.org/Public/emoji/13.0/ReadMe.txt +# +# ----------------------------------------------------------------------------- +# Minor modifications made to the original script: +# Added #! line at start +# Removed tabs +# Made it work with Python 2.4 by rewriting two statements that needed 2.5 +# Consequent code tidy +# Adjusted data file names to take from the Unicode.tables directory +# Adjusted global table names by prefixing _pcre_. +# Commented out stuff relating to the casefolding table, which isn't used; +# removed completely in 2012. +# Corrected size calculation +# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed. +# Update for PCRE2: name changes, and SUPPORT_UCP is abolished. +# +# Major modifications made to the original script: +# Added code to add a grapheme break property field to records. +# +# Added code to search for sets of more than two characters that must match +# each other caselessly. A new table is output containing these sets, and +# offsets into the table are added to the main output records. This new +# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer +# used. +# +# Update for Python3: +# . Processed with 2to3, but that didn't fix everything +# . Changed string.strip to str.strip +# . Added encoding='utf-8' to the open() call +# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is +# required and the result of the division is a float +# +# Added code to scan the emoji-data.txt file to find the Extended Pictographic +# property, which is used by PCRE2 as a grapheme breaking property. This was +# done when updating to Unicode 11.0.0 (July 2018). +# +# Added code to add a Script Extensions field to records. This has increased +# their size from 8 to 12 bytes, only 10 of which are currently used. 
+# +# Added code to add a bidi class field to records by scanning the +# DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare +# bytes, so now 11 out of 12 are in use. +# +# 01-March-2010: Updated list of scripts for Unicode 5.2.0 +# 30-April-2011: Updated list of scripts for Unicode 6.0.0 +# July-2012: Updated list of scripts for Unicode 6.1.0 +# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new +# field in the record to hold the value. Luckily, the +# structure had a hole in it, so the resulting table is +# not much bigger than before. +# 18-September-2012: Added code for multiple caseless sets. This uses the +# final hole in the structure. +# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0 +# 13-May-2014: Updated for PCRE2 +# 03-June-2014: Updated for Python 3 +# 20-June-2014: Updated for Unicode 7.0.0 +# 12-August-2014: Updated to put Unicode version into the file +# 19-June-2015: Updated for Unicode 8.0.0 +# 02-July-2017: Updated for Unicode 10.0.0 +# 03-July-2018: Updated for Unicode 11.0.0 +# 07-July-2018: Added code to scan emoji-data.txt for the Extended +# Pictographic property. +# 01-October-2018: Added the 'Unknown' script name +# 03-October-2018: Added new field for Script Extensions +# 27-July-2019: Updated for Unicode 12.1.0 +# 10-March-2020: Updated for Unicode 13.0.0 +# PCRE2-10.39: Updated for Unicode 14.0.0 +# 05-December-2021: Added code to scan DerivedBidiClass.txt for bidi class, +# and also PropList.txt for the Bidi_Control property +# 19-December-2021: Reworked script extensions lists to be bit maps instead +# of zero-terminated lists of script numbers. +# ---------------------------------------------------------------------------- +# +# Changes to the refactored script: +# +# 26-December-2021: Refactoring completed +# +# ---------------------------------------------------------------------------- +# +# +# The main tables generated by this script are used by macros defined in +# pcre2_internal.h. They look up Unicode character properties using short +# sequences of code that contains no branches, which makes for greater speed. +# +# Conceptually, there is a table of records (of type ucd_record), one for each +# Unicode character. Each record contains the script number, script extension +# value, character type, grapheme break type, offset to caseless matching set, +# offset to the character's other case, and the bidi class/control. However, a +# real table covering all Unicode characters would be far too big. It can be +# efficiently compressed by observing that many characters have the same +# record, and many blocks of characters (taking 128 characters in a block) have +# the same set of records as other blocks. This leads to a 2-stage lookup +# process. +# +# This script constructs six tables. The ucd_caseless_sets table contains +# lists of characters that all match each other caselessly. Each list is +# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than +# any valid character. The first list is empty; this is used for characters +# that are not part of any list. +# +# The ucd_digit_sets table contains the code points of the '9' characters in +# each set of 10 decimal digits in Unicode. This is used to ensure that digits +# in script runs all come from the same set. The first element in the vector +# contains the number of subsequent elements, which are in ascending order. 
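+#
+# For example, the ASCII digits U+0030..U+0039 contribute the single entry
+# 0x00039, the code point of '9'. A digit then belongs to the set whose
+# entry is the smallest value in the vector that is not less than the digit
+# itself.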
+#
+# The ucd_script_sets vector contains bitmaps that represent lists of scripts
+# for the Script Extensions properties of certain characters. Each bitmap
+# consists of a fixed number of unsigned 32-bit numbers, enough to allocate
+# a bit for every known script. A character with more than one script listed
+# for its Script Extension property has a negative value in its record. This is
+# the negated offset to the start of the relevant bitmap in the ucd_script_sets
+# vector.
+#
+# The ucd_records table contains one instance of every unique record that is
+# required. The ucd_stage1 table is indexed by a character's block number,
+# which is the character's code point divided by 128, since 128 is the size
+# of each block. The result of a lookup in ucd_stage1 is a "virtual" block
+# number.
+#
+# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
+# the offset of a character within its own block, and the result is the index
+# number of the required record in the ucd_records vector.
+#
+# The following examples are correct for the Unicode 14.0.0 database. Future
+# updates may change the actual lookup values.
+#
+# Example: lowercase "a" (U+0061) is in block 0
+#          lookup 0 in stage1 table yields 0
+#          lookup 97 (0x61) in the first table in stage2 yields 22
+#          record 22 is { 34, 5, 12, 0, -32, 34, 2, 0 }
+#            34 = ucp_Latin   => Latin script
+#             5 = ucp_Ll      => Lower case letter
+#            12 = ucp_gbOther => Grapheme break property "Other"
+#             0               => Not part of a caseless set
+#           -32 (-0x20)       => Other case is U+0041
+#            34 = ucp_Latin   => No special Script Extension property
+#             2 = ucp_bidiL   => Bidi class left-to-right
+#             0               => Dummy value, unused at present
+#
+# Almost all lowercase Latin characters resolve to the same record. One or two
+# are different because they are part of a multi-character caseless set (for
+# example, k, K and the Kelvin symbol are such a set).
+#
+# Example: hiragana letter A (U+3042) is in block 96 (0x60)
+#          lookup 96 in stage1 table yields 91
+#          lookup 66 (0x42) in table 91 in stage2 yields 613
+#          record 613 is { 27, 7, 12, 0, 0, 27, 2, 0 }
+#            27 = ucp_Hiragana => Hiragana script
+#             7 = ucp_Lo       => Other letter
+#            12 = ucp_gbOther  => Grapheme break property "Other"
+#             0                => Not part of a caseless set
+#             0                => No other case
+#            27 = ucp_Hiragana => No special Script Extension property
+#             2 = ucp_bidiL    => Bidi class left-to-right
+#             0                => Dummy value, unused at present
+#
+# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
+#          lookup 57 in stage1 table yields 55
+#          lookup 80 (0x50) in table 55 in stage2 yields 485
+#          record 485 is { 28, 12, 3, 0, 0, -228, 13, 0 }
+#            28 = ucp_Inherited => Script inherited from predecessor
+#            12 = ucp_Mn        => Non-spacing mark
+#             3 = ucp_gbExtend  => Grapheme break property "Extend"
+#             0                 => Not part of a caseless set
+#             0                 => No other case
+#          -228                 => Script Extension list offset = 228
+#            13 = ucp_bidiNSM   => Bidi class non-spacing mark
+#             0                 => Dummy value, unused at present
+#
+# At offset 228 in the ucd_script_sets vector we find a bitmap with bits 3, 15,
+# 29, and 107 set. This means that this character is expected to be used with
+# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
+#
+# Philip Hazel, last updated 19 December 2021.
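+#
+# As an illustration (this function is not part of the script or of its
+# output), the two-stage lookup performed by the macros in pcre2_internal.h
+# corresponds to the following Python, where the three arguments are the
+# generated tables and the block size is 128. It is commented out in the
+# same way as the retained test code further down, and the names are
+# illustrative only:
+#
+# def ucd_lookup(c, stage1, stage2, records):
+#   block_size = 128
+#   vblock = stage1[c // block_size]              # "virtual" block number
+#   index = stage2[vblock * block_size + c % block_size]
+#   return records[index]                         # a ucd_record
+#
+# For lowercase "a" as above: stage1[0] is 0, stage2[0 * 128 + 97] is 22,
+# and record 22 is returned.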
+############################################################################## + + +# Import standard modules + +import re +import string +import sys + +# Import common data lists and functions + +from GenerateCommon import \ + bidi_classes, \ + break_properties, \ + category_names, \ + general_category_names, \ + script_abbrevs, \ + script_names, \ + open_output + +# Some general parameters + +MAX_UNICODE = 0x110000 +NOTACHAR = 0xffffffff + + +# --------------------------------------------------------------------------- +# DEFINE FUNCTIONS +# --------------------------------------------------------------------------- + + +# Parse a line of Scripts.txt, GraphemeBreakProperty.txt, DerivedBidiClass.txt +# or DerivedGeneralCategory.txt + +def make_get_names(enum): + return lambda chardata: enum.index(chardata[1]) + + +# Parse a line of CaseFolding.txt + +def get_other_case(chardata): + if chardata[1] == 'C' or chardata[1] == 'S': + return int(chardata[2], 16) - int(chardata[0], 16) + return 0 + + +# Parse a line of ScriptExtensions.txt + +def get_script_extension(chardata): + this_script_list = list(chardata[1].split(' ')) + if len(this_script_list) == 1: + return script_abbrevs.index(this_script_list[0]) + + script_numbers = [] + for d in this_script_list: + script_numbers.append(script_abbrevs.index(d)) + script_numbers.append(0) + script_numbers_length = len(script_numbers) + + for i in range(1, len(script_lists) - script_numbers_length + 1): + for j in range(0, script_numbers_length): + found = True + if script_lists[i+j] != script_numbers[j]: + found = False + break + if found: + return -i + + # Not found in existing lists + + return_value = len(script_lists) + script_lists.extend(script_numbers) + return -return_value + + +# Read a whole table in memory, setting/checking the Unicode version + +def read_table(file_name, get_value, default_value): + global unicode_version + + f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name) + file_base = f.group(1) + version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$" + file = open(file_name, 'r', encoding='utf-8') + f = re.match(version_pat, file.readline()) + version = f.group(1) + if unicode_version == "": + unicode_version = version + elif unicode_version != version: + print("WARNING: Unicode version differs in %s", file_name, file=sys.stderr) + + table = [default_value] * MAX_UNICODE + for line in file: + line = re.sub(r'#.*', '', line) + chardata = list(map(str.strip, line.split(';'))) + if len(chardata) <= 1: + continue + value = get_value(chardata) + m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0]) + char = int(m.group(1), 16) + if m.group(3) is None: + last = char + else: + last = int(m.group(3), 16) + for i in range(char, last + 1): + # It is important not to overwrite a previously set value because in the + # CaseFolding file there are lines to be ignored (returning the default + # value of 0) which often come after a line which has already set data. 
+ if table[i] == default_value: + table[i] = value + file.close() + return table + + +# Get the smallest possible C language type for the values in a table + +def get_type_size(table): + type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4), + ("signed char", 1), ("int16_t", 2), ("int32_t", 4)] + limits = [(0, 255), (0, 65535), (0, 4294967295), (-128, 127), + (-32768, 32767), (-2147483648, 2147483647)] + minval = min(table) + maxval = max(table) + for num, (minlimit, maxlimit) in enumerate(limits): + if minlimit <= minval and maxval <= maxlimit: + return type_size[num] + raise OverflowError("Too large to fit into C types") + + +# Get the total size of a list of tables + +def get_tables_size(*tables): + total_size = 0 + for table in tables: + type, size = get_type_size(table) + total_size += size * len(table) + return total_size + + +# Compress a table into the two stages + +def compress_table(table, block_size): + blocks = {} # Dictionary for finding identical blocks + stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table) + stage2 = [] # Stage 2 table contains the blocks with property values + table = tuple(table) + for i in range(0, len(table), block_size): + block = table[i:i+block_size] + start = blocks.get(block) + if start is None: + # Allocate a new block + start = len(stage2) / block_size + stage2 += block + blocks[block] = start + stage1.append(start) + return stage1, stage2 + + +# Output a table + +def write_table(table, table_name, block_size = None): + type, size = get_type_size(table) + ELEMS_PER_LINE = 16 + + s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table)) + if block_size: + s += ", block = %d" % block_size + f.write(s + " */\n") + table = tuple(table) + if block_size is None: + fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n" + mult = MAX_UNICODE / len(table) + for i in range(0, len(table), ELEMS_PER_LINE): + f.write(fmt % (table[i:i+ELEMS_PER_LINE] + (int(i * mult),))) + else: + if block_size > ELEMS_PER_LINE: + el = ELEMS_PER_LINE + else: + el = block_size + fmt = "%3d," * el + "\n" + if block_size > ELEMS_PER_LINE: + fmt = fmt * int(block_size / ELEMS_PER_LINE) + for i in range(0, len(table), block_size): + f.write(("\n/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])) + f.write("};\n\n") + + +# Extract the unique combinations of properties into records + +def combine_tables(*tables): + records = {} + index = [] + for t in zip(*tables): + i = records.get(t) + if i is None: + i = records[t] = len(records) + index.append(i) + return index, records + + +# Create a record struct + +def get_record_size_struct(records): + size = 0 + structure = 'typedef struct {\n' + for i in range(len(records[0])): + record_slice = [record[i] for record in records] + slice_type, slice_size = get_type_size(record_slice) + # add padding: round up to the nearest power of slice_size + size = (size + slice_size - 1) & -slice_size + size += slice_size + structure += '%s property_%d;\n' % (slice_type, i) + + # round up to the first item of the next structure in array + record_slice = [record[0] for record in records] + slice_type, slice_size = get_type_size(record_slice) + size = (size + slice_size - 1) & -slice_size + + structure += '} ucd_record;\n*/\n' + return size, structure + + +# Write records + +def write_records(records, record_size): + f.write('const ucd_record PRIV(ucd_records)[] = { ' + \ + '/* %d bytes, record size %d */\n' % (len(records) * record_size, record_size)) + records = 
list(zip(list(records.keys()), list(records.values())))
+  records.sort(key = lambda x: x[1])
+  for i, record in enumerate(records):
+    f.write((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */\n') % (record[0] + (i,)))
+  f.write('};\n\n')
+
+
+# ---------------------------------------------------------------------------
+# This bit of code must have been useful when the original script was being
+# developed. Retain it just in case it is ever needed again.
+
+# def test_record_size():
+#   tests = [ \
+#     ( [(3,), (6,), (6,), (1,)], 1 ), \
+#     ( [(300,), (600,), (600,), (100,)], 2 ), \
+#     ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
+#     ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
+#     ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
+#     ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
+#     ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
+#     ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
+#   ]
+#   for test in tests:
+#     size, struct = get_record_size_struct(test[0])
+#     assert(size == test[1])
+# test_record_size()
+# ---------------------------------------------------------------------------
+
+
+
+# ---------------------------------------------------------------------------
+# MAIN CODE FOR CREATING TABLES
+# ---------------------------------------------------------------------------
+
+unicode_version = ""
+
+# Some of the tables imported from GenerateCommon.py have alternate comment
+# strings for use by GenerateUcpHeader. The comments are not wanted here, so
+# remove them.
+
+bidi_classes = bidi_classes[::2]
+break_properties = break_properties[::2]
+category_names = category_names[::2]
+
+# Create the various tables from Unicode data files
+
+script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
+category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
+break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
+other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
+bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidi_classes), bidi_classes.index('L'))
+
+# The Bidi_Control property is a Y/N value, so needs only one bit. We scan the
+# PropList.txt file and set the 0x80 bit in the bidi_class table.
+
+file = open('Unicode.tables/PropList.txt', 'r', encoding='utf-8')
+for line in file:
+  line = re.sub(r'#.*', '', line)
+  chardata = list(map(str.strip, line.split(';')))
+  if len(chardata) <= 1:
+    continue
+  if chardata[1] != "Bidi_Control":
+    continue
+  m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
+  char = int(m.group(1), 16)
+  if m.group(3) is None:
+    last = char
+  else:
+    last = int(m.group(3), 16)
+  for i in range(char, last + 1):
+    bidi_class[i] |= 0x80
+file.close()
+
+# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
+# we need to find the Extended_Pictographic property for emoji characters. This
+# can be set as an additional grapheme break property, because the default for
+# all the emojis is "other". We scan the emoji-data.txt file and modify the
+# break-props table.
+ +file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8') +for line in file: + line = re.sub(r'#.*', '', line) + chardata = list(map(str.strip, line.split(';'))) + if len(chardata) <= 1: + continue + if chardata[1] != "Extended_Pictographic": + continue + m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0]) + char = int(m.group(1), 16) + if m.group(3) is None: + last = char + else: + last = int(m.group(3), 16) + for i in range(char, last + 1): + if break_props[i] != break_properties.index('Other'): + print("WARNING: Emoji 0x%x has break property %s, not 'Other'", + i, break_properties[break_props[i]], file=sys.stderr) + break_props[i] = break_properties.index('Extended_Pictographic') +file.close() + +# The Script Extensions property default value is the Script value. Parse the +# file, setting 'Unknown' as the default (this will never be a Script Extension +# value), then scan it and fill in the default from Scripts. Code added by PH +# in October 2018. Positive values are used for just a single script for a +# code point. Negative values are negated offsets in a list of bitsets of +# multiple scripts. Initialize this list with a single entry, as the zeroth +# element is never used. + +script_lists = [0] +script_abbrevs_default = script_abbrevs.index('Zzzz') +scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default) + +# Scan all characters and set their default script extension to the main +# script. We also have to adjust negative scriptx values, following a change in +# the way these work. They are currently negated offsets into the script_lists +# list, but have to be changed into indices in the new ucd_script_sets vector, +# which has fixed-size entries. We can compute the new offset by counting the +# zeros that precede the current offset. + +for i in range(0, MAX_UNICODE): + if scriptx[i] == script_abbrevs_default: + scriptx[i] = script[i] + elif scriptx[i] < 0: + count = 1 + for j in range(-scriptx[i], 0, -1): + if script_lists[j] == 0: + count += 1 + scriptx[i] = -count * (int(len(script_names)/32) + 1) + +# With the addition of the Script Extensions field, we needed some padding to +# get the Unicode records up to 12 bytes (multiple of 4). Originally this was a +# 16-bit field and padding_dummy[0] was set to 256 to ensure this, but 8 bits +# are now used for the bidi class, so zero will do. + +padding_dummy = [0] * MAX_UNICODE +padding_dummy[0] = 0 + +# This block of code was added by PH in September 2012. It scans the other_case +# table to find sets of more than two characters that must all match each other +# caselessly. Later in this script a table of these sets is written out. +# However, we have to do this work here in order to compute the offsets in the +# table that are inserted into the main table. + +# The CaseFolding.txt file lists pairs, but the common logic for reading data +# sets only one value, so first we go through the table and set "return" +# offsets for those that are not already set. + +for c in range(MAX_UNICODE): + if other_case[c] != 0 and other_case[c + other_case[c]] == 0: + other_case[c + other_case[c]] = -other_case[c] + +# Now scan again and create equivalence sets. + +caseless_sets = [] + +for c in range(MAX_UNICODE): + o = c + other_case[c] + + # Trigger when this character's other case does not point back here. We + # now have three characters that are case-equivalent. 
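+  #
+  # For example, for the set mentioned above: when c is U+212A (KELVIN SIGN),
+  # o is U+006B (k), but the other case of k points back to U+004B (K), not
+  # to the Kelvin sign, so t becomes K and the set {K, k, KELVIN SIGN} is
+  # created.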
+ + if other_case[o] != -other_case[c]: + t = o + other_case[o] + + # Scan the existing sets to see if any of the three characters are already + # part of a set. If so, unite the existing set with the new set. + + appended = 0 + for s in caseless_sets: + found = 0 + for x in s: + if x == c or x == o or x == t: + found = 1 + + # Add new characters to an existing set + + if found: + found = 0 + for y in [c, o, t]: + for x in s: + if x == y: + found = 1 + if not found: + s.append(y) + appended = 1 + + # If we have not added to an existing set, create a new one. + + if not appended: + caseless_sets.append([c, o, t]) + +# End of loop looking for caseless sets. + +# Now scan the sets and set appropriate offsets for the characters. + +caseless_offsets = [0] * MAX_UNICODE + +offset = 1; +for s in caseless_sets: + for x in s: + caseless_offsets[x] = offset + offset += len(s) + 1 + +# End of block of code for creating offsets for caseless matching sets. + + +# Combine all the tables + +table, records = combine_tables(script, category, break_props, + caseless_offsets, other_case, scriptx, bidi_class, padding_dummy) + +# Find the record size and create a string definition of the structure for +# outputting as a comment. + +record_size, record_struct = get_record_size_struct(list(records.keys())) + +# Find the optimum block size for the two-stage table + +min_size = sys.maxsize +for block_size in [2 ** i for i in range(5,10)]: + size = len(records) * record_size + stage1, stage2 = compress_table(table, block_size) + size += get_tables_size(stage1, stage2) + #print "/* block size %5d => %5d bytes */" % (block_size, size) + if size < min_size: + min_size = size + min_stage1, min_stage2 = stage1, stage2 + min_block_size = block_size + + +# --------------------------------------------------------------------------- +# MAIN CODE FOR WRITING THE OUTPUT FILE +# --------------------------------------------------------------------------- + +# Open the output file (no return on failure). This call also writes standard +# header boilerplate. + +f = open_output("pcre2_ucd.c") + +# Output this file's heading text + +f.write("""\ +/* This file contains tables of Unicode properties that are extracted from +Unicode data files. See the comments at the start of maint/GenerateUcd.py for +details. + +As well as being part of the PCRE2 library, this file is #included by the +pcre2test program, which redefines the PRIV macro to change table names from +_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present, +just one of these tables is actually needed. When compiling the library, some +headers are needed. */ + +#ifndef PCRE2_PCRE2TEST +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#include "pcre2_internal.h" +#endif /* PCRE2_PCRE2TEST */ + +/* The tables herein are needed only when UCP support is built, and in PCRE2 +that happens automatically with UTF support. This module should not be +referenced otherwise, so it should not matter whether it is compiled or not. +However a comment was received about space saving - maybe the guy linked all +the modules rather than using a library - so we include a condition to cut out +the tables when not needed. But don't leave a totally empty module because some +compilers barf at that. Instead, just supply some small dummy tables. 
*/ + +#ifndef SUPPORT_UNICODE +const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0,0 }}; +const uint16_t PRIV(ucd_stage1)[] = {0}; +const uint16_t PRIV(ucd_stage2)[] = {0}; +const uint32_t PRIV(ucd_caseless_sets)[] = {0}; +#else +\n""") + +# --- Output some variable heading stuff --- + +f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size)) +f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version)) + +f.write("""\ +/* When recompiling tables with a new Unicode version, please check the types +in this structure definition with those in pcre2_internal.h (the actual field +names will be different). +\n""") + +f.write(record_struct) + +f.write(""" +/* If the 32-bit library is run in non-32-bit mode, character values greater +than 0x10ffff may be encountered. For these we set up a special record. */ + +#if PCRE2_CODE_UNIT_WIDTH == 32 +const ucd_record PRIV(dummy_ucd_record)[] = {{ + ucp_Unknown, /* script */ + ucp_Cn, /* type unassigned */ + ucp_gbOther, /* grapheme break property */ + 0, /* case set */ + 0, /* other case */ + ucp_Unknown, /* script extension */ + ucp_bidiL, /* bidi class */ + 0 /* dummy filler */ + }}; +#endif +\n""") + +# --- Output the table of caseless character sets --- + +f.write("""\ +/* This table contains lists of characters that are caseless sets of +more than one character. Each list is terminated by NOTACHAR. */ + +const uint32_t PRIV(ucd_caseless_sets)[] = { + NOTACHAR, +""") + +for s in caseless_sets: + s = sorted(s) + for x in s: + f.write(' 0x%04x,' % x) + f.write(' NOTACHAR,\n') +f.write('};\n\n') + +# --- Other tables are not needed by pcre2test --- + +f.write("""\ +/* When #included in pcre2test, we don't need the table of digit sets, nor the +the large main UCD tables. */ + +#ifndef PCRE2_PCRE2TEST +\n""") + +# --- Read Scripts.txt again for the sets of 10 digits. --- + +digitsets = [] +file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8') + +for line in file: + m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line) + if m is None: + continue + first = int(m.group(1),16) + last = int(m.group(2),16) + if ((last - first + 1) % 10) != 0: + f.write("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last), + file=sys.stderr) + while first < last: + digitsets.append(first + 9) + first += 10 +file.close() +digitsets.sort() + +f.write("""\ +/* This table lists the code points for the '9' characters in each set of +decimal digits. It is used to ensure that all the digits in a script run come +from the same set. */ + +const uint32_t PRIV(ucd_digit_sets)[] = { +""") + +f.write(" %d, /* Number of subsequent values */" % len(digitsets)) +count = 8 +for d in digitsets: + if count == 8: + f.write("\n ") + count = 0 + f.write(" 0x%05x," % d) + count += 1 +f.write("\n};\n\n") + +f.write("""\ +/* This vector is a list of script bitsets for the Script Extension property. */ + +const uint32_t PRIV(ucd_script_sets)[] = { +""") + +bitword_count = len(script_names)/32 + 1 +bitwords = [0] * int(bitword_count) + +for d in script_lists: + if d == 0: + s = " " + f.write(" ") + for x in bitwords: + f.write("%s" % s) + s = ", " + f.write("0x%08xu" % x) + f.write(",\n") + bitwords = [0] * int(bitword_count) + + else: + x = int(d/32) + y = int(d%32) + bitwords[x] = bitwords[x] | (1 << y) + +f.write("};\n\n") + +# Output the main UCD tables. + +f.write("""\ +/* These are the main two-stage UCD tables. 
The fields in each record are: +script (8 bits), character type (8 bits), grapheme break property (8 bits), +offset to multichar other cases or zero (8 bits), offset to other case or zero +(32 bits, signed), script extension (16 bits, signed), bidi class (8 bits), and +a dummy 8-bit field to make the whole thing a multiple of 4 bytes. */ +\n""") + +write_records(records, record_size) +write_table(min_stage1, 'PRIV(ucd_stage1)') +write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size) + +f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size) +f.write("""\ +#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h +#endif +#endif /* SUPPORT_UNICODE */ + +#endif /* PCRE2_PCRE2TEST */ + +/* End of pcre2_ucd.c */ +""") + +f.close + +# End diff --git a/maint/GenerateUcpHeader.py b/maint/GenerateUcpHeader.py new file mode 100755 index 0000000..d26293e --- /dev/null +++ b/maint/GenerateUcpHeader.py @@ -0,0 +1,80 @@ +#! /usr/bin/python + +# PCRE2 UNICODE PROPERTY SUPPORT +# ------------------------------ + +# This script generates the pcre2_ucp.h file from Unicode data files. This +# header uses enumerations to give names to Unicode property types and script +# names. + +# This script was created in December 2021 as part of the Unicode data +# generation refactoring. + + +# Import common data lists and functions + +from GenerateCommon import \ + bidi_classes, \ + break_properties, \ + category_names, \ + general_category_names, \ + script_names, \ + open_output + +# Open the output file (no return on failure). This call also writes standard +# header boilerplate. + +f = open_output("pcre2_ucp.h") + +# Output this file's heading text + +f.write("""\ +#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD +#define PCRE2_UCP_H_IDEMPOTENT_GUARD + +/* This file contains definitions of the Unicode property values that are +returned by the UCD access macros and used throughout PCRE2. + +IMPORTANT: The specific values of the first two enums (general and particular +character categories) are assumed by the table called catposstab in the file +pcre2_auto_possess.c. They are unlikely to change, but should be checked after +an update. */ +\n""") + +f.write("/* These are the general character categories. */\n\nenum {\n") +for i in general_category_names: + f.write(" ucp_%s,\n" % i) +f.write("};\n\n") + +f.write("/* These are the particular character categories. */\n\nenum {\n") +for i in range(0, len(category_names), 2): + f.write(" ucp_%s, /* %s */\n" % (category_names[i], category_names[i+1])) +f.write("};\n\n") + +f.write("/* These are the bidi class values. */\n\nenum {\n") +for i in range(0, len(bidi_classes), 2): + sp = ' ' * (4 - len(bidi_classes[i])) + f.write(" ucp_bidi%s,%s /* %s */\n" % (bidi_classes[i], sp, bidi_classes[i+1])) +f.write("};\n\n") + +f.write("/* These are grapheme break properties. The Extended Pictographic " + "property\ncomes from the emoji-data.txt file. */\n\nenum {\n") +for i in range(0, len(break_properties), 2): + sp = ' ' * (21 - len(break_properties[i])) + f.write(" ucp_gb%s,%s /* %s */\n" % (break_properties[i], sp, break_properties[i+1])) +f.write("};\n\n") + +f.write("/* These are the script identifications, additions happen at the end. 
*/\n\nenum {\n") +for i in script_names: + f.write(" ucp_%s,\n" % i) +f.write("\n") + +f.write(" /* This must be last */\n") +f.write(" ucp_Script_Count\n};\n\n") + +f.write("#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */\n\n") +f.write("/* End of pcre2_ucp.h */\n") + +f.close + +# End diff --git a/maint/GenerateUcpTables.py b/maint/GenerateUcpTables.py new file mode 100755 index 0000000..03e9373 --- /dev/null +++ b/maint/GenerateUcpTables.py @@ -0,0 +1,178 @@ +#! /usr/bin/python + +# PCRE2 UNICODE PROPERTY SUPPORT +# ------------------------------ + +# This script generates the pcre2_ucptables.c file, which contains tables for +# recognizing Unicode property names. It is #included by pcre2_tables.c. In +# order to reduce the number of relocations when loading the PCRE2 library, the +# names are held as a single large string, with offsets in the table. This is +# tedious to maintain by hand. Therefore, a script is used to generate the +# table. + +# This script was created in December 2021 based on the previous GenerateUtt +# script, whose output had to be manually edited into pcre2_tables.c. Here is +# the history of the original script: + +# ----------------------------------------------------------------------------- +# Modified by PH 17-March-2009 to generate the more verbose form that works +# for UTF-support in EBCDIC as well as ASCII environments. +# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0. +# Modified by PH 04-May-2010 to add new "X.." special categories. +# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0 +# Modified by ChPe 30-September-2012 to add this note; no other changes were +# necessary for Unicode 6.2.0 support. +# Modfied by PH 26-February-2013 to add the Xuc special category. +# Comment modified by PH 13-May-2014 to update to PCRE2 file names. +# Script updated to Python 3 by running it through the 2to3 converter. +# Added script names for Unicode 7.0.0, 20-June-2014. +# Added script names for Unicode 8.0.0, 19-June-2015. +# Added script names for Unicode 10.0.0, 02-July-2017. +# Added script names for Unicode 11.0.0, 03-July-2018. +# Added 'Unknown' script, 01-October-2018. +# Added script names for Unicode 12.1.0, 27-July-2019. +# Added script names for Unicode 13.0.0, 10-March-2020. +# Added Script names for Unicode 14.0.0, PCRE2-10.39 +# Added support for bidi class and bidi control, 06-December-2021 +# This also involved lower casing strings and removing underscores, in +# accordance with Unicode's "loose matching" rules, which Perl observes. +# Changed default script type from PT_SC to PT_SCX, 18-December-2021 +# ----------------------------------------------------------------------------- + +# Note subsequent changes here: + + +# Import common data lists and functions + +from GenerateCommon import \ + bidi_classes, \ + category_names, \ + general_category_names, \ + script_abbrevs, \ + script_names, \ + open_output + +# Open the output file (no return on failure). This call also writes standard +# header boilerplate. + +f = open_output("pcre2_ucptables.c") + +# The list in bidi_classes contains just the Unicode classes such as AN, LRE, +# etc., along with comments. We need to add "bidi" in front of each value, in +# order to create names that don't clash with other types of property. + +bidi_class_names = [] +for i in range(0, len(bidi_classes), 2): + bidi_class_names.append("bidi" + bidi_classes[i]) + +# Remove the comments from other lists that contain them. 
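+# (category_names alternates names with comment strings, for example 'Cc',
+# 'Control', so the [::2] slice below keeps just the names.)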
+ +category_names = category_names[::2] + +# Create standardized versions of the names by lowercasing and removing +# underscores. + +def stdnames(x): + y = [''] * len(x) + for i in range(len(x)): + y[i] = x[i].lower().replace('_', '') + return y + +std_script_names = stdnames(script_names) +std_category_names = stdnames(category_names) +std_general_category_names = stdnames(general_category_names) +std_bidi_class_names = stdnames(bidi_class_names) + +# Create the table, starting with the Unicode script, category and bidi class +# names. We keep both the standardized name and the original, because the +# latter is used for the ucp_xx names. + +utt_table = list(zip(std_script_names, script_names, ['PT_SCX'] * len(script_names))) +utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names))) +utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names))) +utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names))) + +# Now add specials and synonyms. Note both the standardized and capitalized +# forms are needed. + +utt_table.append(('any', 'Any', 'PT_ANY')) +utt_table.append(('bidic', 'BidiC', 'PT_BIDICO')) +utt_table.append(('bidicontrol', 'Bidi_Control', 'PT_BIDICO')) +utt_table.append(('l&', 'L&', 'PT_LAMP')) +utt_table.append(('lc', 'LC', 'PT_LAMP')) +utt_table.append(('xan', 'Xan', 'PT_ALNUM')) +utt_table.append(('xps', 'Xps', 'PT_PXSPACE')) +utt_table.append(('xsp', 'Xsp', 'PT_SPACE')) +utt_table.append(('xuc', 'Xuc', 'PT_UCNC')) +utt_table.append(('xwd', 'Xwd', 'PT_WORD')) + +# Sort the table. + +utt_table.sort() + +# Output file-specific heading + +f.write("""\ +#ifdef SUPPORT_UNICODE + +/* The PRIV(utt)[] table below translates Unicode property names into type and +code values. It is searched by binary chop, so must be in collating sequence of +name. Originally, the table contained pointers to the name strings in the first +field of each entry. However, that leads to a large number of relocations when +a shared library is dynamically loaded. A significant reduction is made by +putting all the names into a single, large string and using offsets instead. +All letters are lower cased, and underscores are removed, in accordance with +the "loose matching" rules that Unicode advises and Perl uses. */ +\n""") + +# We have to use STR_ macros to define the strings so that it all works in +# UTF-8 mode on EBCDIC platforms. 
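+# For example, for the standardized name "adlam" the loop below generates
+#
+#   #define STRING_adlam0 STR_a STR_d STR_l STR_a STR_m "\0"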
+ +for utt in utt_table: + f.write('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND'))) + for c in utt[0]: + if c == '&': + f.write(' STR_AMPERSAND') + else: + f.write(' STR_%s' % c); + f.write(' "\\0"\n') + +# Output the long string of concatenated names + +f.write('\nconst char PRIV(utt_names)[] =\n'); +last = '' +for utt in utt_table: + if utt == utt_table[-1]: + last = ';' + f.write(' STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), last)) + +# Output the property type table + +f.write('\nconst ucp_type_table PRIV(utt)[] = {\n') +offset = 0 +last = ',' +for utt in utt_table: + if utt[2] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE', + 'PT_SPACE', 'PT_UCNC', 'PT_WORD', 'PT_BIDICO'): + value = '0' + else: + value = 'ucp_' + utt[1] + if utt == utt_table[-1]: + last = '' + f.write(' { %3d, %s, %s }%s\n' % (offset, utt[2], value, last)) + offset += len(utt[0]) + 1 +f.write('};\n\n') + +# Ending text + +f.write("""\ +const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table); + +#endif /* SUPPORT_UNICODE */ + +/* End of pcre2_ucptables.c */ +""") + +f.close + +# End diff --git a/maint/GenerateUtt.py b/maint/GenerateUtt.py deleted file mode 100755 index 2167569..0000000 --- a/maint/GenerateUtt.py +++ /dev/null @@ -1,166 +0,0 @@ -#! /usr/bin/python - -# Generate utt tables. Note: this script has now been converted to Python 3. - -# The source file pcre2_tables.c contains (amongst other things), a table that -# is indexed by script name. In order to reduce the number of relocations when -# loading the library, the names are held as a single large string, with -# offsets in the table. This is tedious to maintain by hand. Therefore, this -# script is used to generate the table. The output is sent to stdout; usually -# that should be directed to a temporary file. Then pcre2_tables.c can be -# edited by replacing the relevant definitions and table therein with the -# temporary file. - -# Modified by PH 17-March-2009 to generate the more verbose form that works -# for UTF-support in EBCDIC as well as ASCII environments. -# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0. -# Modified by PH 04-May-2010 to add new "X.." special categories. -# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0 -# Modified by ChPe 30-September-2012 to add this note; no other changes were -# necessary for Unicode 6.2.0 support. -# Modfied by PH 26-February-2013 to add the Xuc special category. -# Comment modified by PH 13-May-2014 to update to PCRE2 file names. -# Script updated to Python 3 by running it through the 2to3 converter. -# Added script names for Unicode 7.0.0, 20-June-2014. -# Added script names for Unicode 8.0.0, 19-June-2015. -# Added script names for Unicode 10.0.0, 02-July-2017. -# Added script names for Unicode 11.0.0, 03-July-2018. -# Added 'Unknown' script, 01-October-2018. -# Added script names for Unicode 12.1.0, 27-July-2019. -# Added script names for Unicode 13.0.0, 10-March-2020. -# Added Script names for Unicode 14.0.0, PCRE2-10.39 -# Added support for bidi class and bidi control, 06-December-2021 -# This also involved lower casing strings and removing underscores, in -# accordance with Unicode's "loose matching" rules, which Perl observes. 
-# Changed default script type from PT_SC to PT_SCX, 18-December-2021 - -script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \ - 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \ - 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \ - 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \ - 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \ - 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \ - 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \ - # New for Unicode 5.0 - 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \ - # New for Unicode 5.1 - 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \ - # New for Unicode 5.2 - 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \ - 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \ - 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \ - 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \ - # New for Unicode 6.0.0 - 'Batak', 'Brahmi', 'Mandaic', \ -# New for Unicode 6.1.0 - 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri', -# New for Unicode 7.0.0 - 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi', - 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean', - 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi', - 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi', -# New for Unicode 8.0.0 - 'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian', - 'SignWriting', -# New for Unicode 10.0.0 - 'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi', - 'Nushu', 'Soyombo', 'Zanabazar_Square', -# New for Unicode 11.0.0 - 'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin', - 'Old_Sogdian', 'Sogdian', -# New for Unicode 12.0.0 - 'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho', -# New for Unicode 13.0.0 - 'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi', -# New for Unicode 14.0.0 - 'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi' - ] - -category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', - 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', - 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ] - -general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z'] - -bidiclass_names = ['bidiAL', 'bidiAN', 'bidiB', 'bidiBN', 'bidiCS', 'bidiEN', - 'bidiES', 'bidiET', 'bidiFSI', 'bidiL', 'bidiLRE', 'bidiLRI', 'bidiLRO', - 'bidiNSM', 'bidiON', 'bidiPDF', 'bidiPDI', 'bidiR', 'bidiRLE', 'bidiRLI', - 'bidiRLO', 'bidiS', 'bidiWS' ] - -# Create standardized versions of the names by lowercasing and removing -# ampersands. - -def stdnames(x): - y = [''] * len(x) - for i in range(len(x)): - y[i] = x[i].lower().replace('_', '') - return y - -std_script_names = stdnames(script_names) -std_category_names = stdnames(category_names) -std_general_category_names = stdnames(general_category_names) -std_bidiclass_names = stdnames(bidiclass_names) - -# Create the table, starting with the Unicode script, category and bidi class -# names. 
We keep both the standardized name and the original, because the
-# latter is used for the ucp_xx names.

-utt_table = list(zip(std_script_names, script_names, ['PT_SCX'] * len(script_names)))
-utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names)))
-utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
-utt_table += list(zip(std_bidiclass_names, bidiclass_names, ['PT_BIDICL'] * len(bidiclass_names)))
-
-# Now add our own specials and synonyms. Note both the standardized and
-# capitalized forms are needed.
-
-utt_table.append(('any', 'Any', 'PT_ANY'))
-utt_table.append(('bidic', 'BidiC', 'PT_BIDICO'))
-utt_table.append(('bidicontrol', 'Bidi_Control', 'PT_BIDICO'))
-utt_table.append(('l&', 'L&', 'PT_LAMP'))
-utt_table.append(('lc', 'LC', 'PT_LAMP'))
-utt_table.append(('xan', 'Xan', 'PT_ALNUM'))
-utt_table.append(('xps', 'Xps', 'PT_PXSPACE'))
-utt_table.append(('xsp', 'Xsp', 'PT_SPACE'))
-utt_table.append(('xuc', 'Xuc', 'PT_UCNC'))
-utt_table.append(('xwd', 'Xwd', 'PT_WORD'))
-
-# Sort the table.
-
-utt_table.sort()
-
-# We have to use STR_ macros to define the strings so that it all works in
-# UTF-8 mode on EBCDIC platforms.
-
-for utt in utt_table:
-  print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ')
-  for c in utt[0]:
-    if c == '&':
-      print('STR_AMPERSAND', end=' ')
-    else:
-      print('STR_%s' % c, end=' ')
-  print('"\\0"')
-
-# Print the actual table, using the string names
-
-print('')
-print('const char PRIV(utt_names)[] =')
-last = ''
-for utt in utt_table:
-  if utt == utt_table[-1]:
-    last = ';'
-  print('  STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
-
-print('\nconst ucp_type_table PRIV(utt)[] = {')
-offset = 0
-last = ','
-for utt in utt_table:
-  if utt[2] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
-                'PT_SPACE', 'PT_UCNC', 'PT_WORD', 'PT_BIDICO'):
-    value = '0'
-  else:
-    value = 'ucp_' + utt[1]
-  if utt == utt_table[-1]:
-    last = ''
-  print('  { %3d, %s, %s }%s' % (offset, utt[2], value, last))
-  offset += len(utt[0]) + 1
-print('};')
diff --git a/maint/MultiStage2.py b/maint/MultiStage2.py
deleted file mode 100755
index c56e8dd..0000000
--- a/maint/MultiStage2.py
+++ /dev/null
@@ -1,894 +0,0 @@
-#! /usr/bin/python
-
-# Multistage table builder
-# (c) Peter Kankowski, 2008
-
-##############################################################################
-# This script was submitted to the PCRE project by Peter Kankowski as part of
-# the upgrading of Unicode property support. The new code speeds up property
-# matching many times. The script is for the use of PCRE maintainers, to
-# generate the pcre2_ucd.c file that contains a digested form of the Unicode
-# data tables. A number of extensions have been added to the original script.
-#
-# The script has now been upgraded to Python 3 for PCRE2, and should be run in
-# the maint subdirectory, using the command
-#
-# [python3] ./MultiStage2.py >../src/pcre2_ucd.c
-#
-# It requires eight Unicode data tables: DerivedBidiClass.txt,
-# DerivedGeneralCategory.txt, GraphemeBreakProperty.txt, PropList.txt,
-# Scripts.txt, ScriptExtensions.txt, CaseFolding.txt, and emoji-data.txt. These
-# must be in the maint/Unicode.tables subdirectory.
-#
-# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
-# subdirectory of the Unicode database (UCD) on the Unicode web site;
-# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory.
PropList.txt, -# Scripts.txt, ScriptExtensions.txt, and CaseFolding.txt are directly in the -# UCD directory. -# -# The emoji-data.txt file is found in the "emoji" subdirectory even though it -# is technically part of a different (but coordinated) standard as shown -# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"), -# for example: -# -# http://unicode.org/Public/emoji/13.0/ReadMe.txt -# -# ----------------------------------------------------------------------------- -# Minor modifications made to this script: -# Added #! line at start -# Removed tabs -# Made it work with Python 2.4 by rewriting two statements that needed 2.5 -# Consequent code tidy -# Adjusted data file names to take from the Unicode.tables directory -# Adjusted global table names by prefixing _pcre_. -# Commented out stuff relating to the casefolding table, which isn't used; -# removed completely in 2012. -# Corrected size calculation -# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed. -# Update for PCRE2: name changes, and SUPPORT_UCP is abolished. -# -# Major modifications made to this script: -# Added code to add a grapheme break property field to records. -# -# Added code to search for sets of more than two characters that must match -# each other caselessly. A new table is output containing these sets, and -# offsets into the table are added to the main output records. This new -# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer -# used. -# -# Update for Python3: -# . Processed with 2to3, but that didn't fix everything -# . Changed string.strip to str.strip -# . Added encoding='utf-8' to the open() call -# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is -# required and the result of the division is a float -# -# Added code to scan the emoji-data.txt file to find the Extended Pictographic -# property, which is used by PCRE2 as a grapheme breaking property. This was -# done when updating to Unicode 11.0.0 (July 2018). -# -# Added code to add a Script Extensions field to records. This has increased -# their size from 8 to 12 bytes, only 10 of which are currently used. -# -# Added code to add a bidi class field to records by scanning the -# DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare -# bytes, so now 11 out of 12 are in use. -# -# 01-March-2010: Updated list of scripts for Unicode 5.2.0 -# 30-April-2011: Updated list of scripts for Unicode 6.0.0 -# July-2012: Updated list of scripts for Unicode 6.1.0 -# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new -# field in the record to hold the value. Luckily, the -# structure had a hole in it, so the resulting table is -# not much bigger than before. -# 18-September-2012: Added code for multiple caseless sets. This uses the -# final hole in the structure. -# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0 -# 13-May-2014: Updated for PCRE2 -# 03-June-2014: Updated for Python 3 -# 20-June-2014: Updated for Unicode 7.0.0 -# 12-August-2014: Updated to put Unicode version into the file -# 19-June-2015: Updated for Unicode 8.0.0 -# 02-July-2017: Updated for Unicode 10.0.0 -# 03-July-2018: Updated for Unicode 11.0.0 -# 07-July-2018: Added code to scan emoji-data.txt for the Extended -# Pictographic property. 
-# 01-October-2018: Added the 'Unknown' script name
-# 03-October-2018: Added new field for Script Extensions
-# 27-July-2019: Updated for Unicode 12.1.0
-# 10-March-2020: Updated for Unicode 13.0.0
-# PCRE2-10.39: Updated for Unicode 14.0.0
-# 05-December-2021: Added code to scan DerivedBidiClass.txt for bidi class,
-#   and also PropList.txt for the Bidi_Control property
-# 19-December-2021: Reworked script extensions lists to be bit maps instead
-#   of zero-terminated lists of script numbers.
-# ----------------------------------------------------------------------------
-#
-#
-# The main tables generated by this script are used by macros defined in
-# pcre2_internal.h. They look up Unicode character properties using short
-# sequences of code that contain no branches, which makes for greater speed.
-#
-# Conceptually, there is a table of records (of type ucd_record), one for each
-# Unicode character. Each record contains the script number, script extension
-# value, character type, grapheme break type, offset to caseless matching set,
-# offset to the character's other case, and the bidi class/control. However, a
-# real table covering all Unicode characters would be far too big. It can be
-# efficiently compressed by observing that many characters have the same
-# record, and many blocks of characters (taking 128 characters in a block) have
-# the same set of records as other blocks. This leads to a 2-stage lookup
-# process.
-#
-# This script constructs six tables. The ucd_caseless_sets table contains
-# lists of characters that all match each other caselessly. Each list is
-# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
-# any valid character. The first list is empty; this is used for characters
-# that are not part of any list.
-#
-# The ucd_digit_sets table contains the code points of the '9' characters in
-# each set of 10 decimal digits in Unicode. This is used to ensure that digits
-# in script runs all come from the same set. The first element in the vector
-# contains the number of subsequent elements, which are in ascending order.
-#
-# The ucd_script_sets vector contains bitmaps that represent lists of scripts
-# for the Script Extensions properties of certain characters. Each bitmap
-# consists of a fixed number of unsigned 32-bit numbers, enough to allocate
-# a bit for every known script. A character with more than one script listed
-# for its Script Extension property has a negative value in its record. This is
-# the negated offset to the start of the relevant bitmap in the ucd_script_sets
-# vector.
-#
-# The ucd_records table contains one instance of every unique record that is
-# required. The ucd_stage1 table is indexed by a character's block number,
-# which is the character's code point divided by 128, since 128 is the size
-# of each block. The result of a lookup in ucd_stage1 is a "virtual" block
-# number.
-#
-# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
-# the offset of a character within its own block, and the result is the index
-# number of the required record in the ucd_records vector.
-#
-# The following examples are correct for the Unicode 14.0.0 database. Future
-# updates may change the actual lookup values.
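To make the two-stage lookup concrete, here is a minimal Python sketch of the
equivalent computation, assuming the 128-character block size described above;
the function and argument names are invented for illustration and are not the
generator's own (in PCRE2 itself this is done by branch-free macros in
pcre2_internal.h):

  # Sketch of the 2-stage lookup (illustrative names, block size 128).
  def get_ucd_record(cp, stage1, stage2, records, block_size=128):
    virtual = stage1[cp // block_size]                  # "virtual" block number
    index = stage2[virtual * block_size + cp % block_size]
    return records[index]                               # a unique ucd_record

For U+0061 this evaluates stage1[0] and then stage2[0 * 128 + 0x61], which
reproduces the first worked example below.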
-#
-# Example: lowercase "a" (U+0061) is in block 0
-#          lookup 0 in stage1 table yields 0
-#          lookup 97 (0x61) in the first table in stage2 yields 22
-#          record 22 is { 34, 5, 12, 0, -32, 34, 2, 0 }
-#            34 = ucp_Latin   => Latin script
-#             5 = ucp_Ll      => Lower case letter
-#            12 = ucp_gbOther => Grapheme break property "Other"
-#             0               => Not part of a caseless set
-#           -32 (-0x20)       => Other case is U+0041
-#            34 = ucp_Latin   => No special Script Extension property
-#             2 = ucp_bidiL   => Bidi class left-to-right
-#             0               => Dummy value, unused at present
-#
-# Almost all lowercase Latin characters resolve to the same record. One or two
-# are different because they are part of a multi-character caseless set (for
-# example, k, K and the Kelvin symbol are such a set).
-#
-# Example: hiragana letter A (U+3042) is in block 96 (0x60)
-#          lookup 96 in stage1 table yields 91
-#          lookup 66 (0x42) in table 91 in stage2 yields 613
-#          record 613 is { 27, 7, 12, 0, 0, 27, 2, 0 }
-#            27 = ucp_Hiragana => Hiragana script
-#             7 = ucp_Lo       => Other letter
-#            12 = ucp_gbOther  => Grapheme break property "Other"
-#             0                => Not part of a caseless set
-#             0                => No other case
-#            27 = ucp_Hiragana => No special Script Extension property
-#             2 = ucp_bidiL    => Bidi class left-to-right
-#             0                => Dummy value, unused at present
-#
-# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
-#          lookup 57 in stage1 table yields 55
-#          lookup 80 (0x50) in table 55 in stage2 yields 485
-#          record 485 is { 28, 12, 3, 0, 0, -228, 13, 0 }
-#            28 = ucp_Inherited => Script inherited from predecessor
-#            12 = ucp_Mn        => Non-spacing mark
-#             3 = ucp_gbExtend  => Grapheme break property "Extend"
-#             0                 => Not part of a caseless set
-#             0                 => No other case
-#          -228                 => Script Extension list offset = 228
-#            13 = ucp_bidiNSM   => Bidi class non-spacing mark
-#             0                 => Dummy value, unused at present
-#
-# At offset 228 in the ucd_script_sets vector we find a bitmap with bits 3, 15,
-# 29, and 107 set. This means that this character is expected to be used with
-# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
-#
-#  Philip Hazel, last updated 19 December 2021.
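Given that representation, testing whether a particular script belongs to a
character's Script Extensions set reduces to a single bitmap probe. The sketch
below shows the equivalent Python, under the stated assumptions that each
bitmap is a run of 32-bit words and that the negated record value is a word
offset into ucd_script_sets; the names are again invented for illustration:

  # Sketch: is script number 'script' in a record's Script Extensions?
  def has_script(scriptx, script, script_sets):
    if scriptx >= 0:               # a single script is stored directly
      return scriptx == script
    offset = -scriptx              # negative value: negated bitmap offset
    word = script_sets[offset + script // 32]
    return ((word >> (script % 32)) & 1) != 0

For the U+1CD0 example, offset 228 and script 3 (Bengali) probe bit 3 of the
word at index 228, which the text above says is set.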
-##############################################################################
-
-
-import re
-import string
-import sys
-
-MAX_UNICODE = 0x110000
-NOTACHAR = 0xffffffff
-
-
-# Parse a line of Scripts.txt, GraphemeBreakProperty.txt,
-# DerivedBidiClass.txt or DerivedGeneralCategory.txt
-
-def make_get_names(enum):
-  return lambda chardata: enum.index(chardata[1])
-
-# Parse a line of CaseFolding.txt
-
-def get_other_case(chardata):
-  if chardata[1] == 'C' or chardata[1] == 'S':
-    return int(chardata[2], 16) - int(chardata[0], 16)
-  return 0
-
-# Parse a line of ScriptExtensions.txt
-
-def get_script_extension(chardata):
-  this_script_list = list(chardata[1].split(' '))
-  if len(this_script_list) == 1:
-    return script_abbrevs.index(this_script_list[0])
-
-  script_numbers = []
-  for d in this_script_list:
-    script_numbers.append(script_abbrevs.index(d))
-  script_numbers.append(0)
-  script_numbers_length = len(script_numbers)
-
-  for i in range(1, len(script_lists) - script_numbers_length + 1):
-    for j in range(0, script_numbers_length):
-      found = True
-      if script_lists[i+j] != script_numbers[j]:
-        found = False
-        break
-    if found:
-      return -i
-
-  # Not found in existing lists
-
-  return_value = len(script_lists)
-  script_lists.extend(script_numbers)
-  return -return_value
-
-# Read the whole table in memory, setting/checking the Unicode version
-
-def read_table(file_name, get_value, default_value):
-  global unicode_version
-
-  f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
-  file_base = f.group(1)
-  version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
-  file = open(file_name, 'r', encoding='utf-8')
-  f = re.match(version_pat, file.readline())
-  version = f.group(1)
-  if unicode_version == "":
-    unicode_version = version
-  elif unicode_version != version:
-    print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr)
-
-  table = [default_value] * MAX_UNICODE
-  for line in file:
-    line = re.sub(r'#.*', '', line)
-    chardata = list(map(str.strip, line.split(';')))
-    if len(chardata) <= 1:
-      continue
-    value = get_value(chardata)
-    m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
-    char = int(m.group(1), 16)
-    if m.group(3) is None:
-      last = char
-    else:
-      last = int(m.group(3), 16)
-    for i in range(char, last + 1):
-      # It is important not to overwrite a previously set
-      # value because in the CaseFolding file there are lines
-      # to be ignored (returning the default value of 0)
-      # which often come after a line which has already set
-      # data.
-      if table[i] == default_value:
-        table[i] = value
-  file.close()
-  return table
-
-# Get the smallest possible C language type for the values
-def get_type_size(table):
-  type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4),
-    ("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
-  limits = [(0, 255), (0, 65535), (0, 4294967295),
-    (-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
-  minval = min(table)
-  maxval = max(table)
-  for num, (minlimit, maxlimit) in enumerate(limits):
-    if minlimit <= minval and maxval <= maxlimit:
-      return type_size[num]
-  else:
-    raise OverflowError("Too large to fit into C types")
-
-def get_tables_size(*tables):
-  total_size = 0
-  for table in tables:
-    type, size = get_type_size(table)
-    total_size += size * len(table)
-  return total_size
-
-# Compress the table into the two stages
-def compress_table(table, block_size):
-  blocks = {} # Dictionary for finding identical blocks
-  stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
-  stage2 = [] # Stage 2 table contains the blocks with property values
-  table = tuple(table)
-  for i in range(0, len(table), block_size):
-    block = table[i:i+block_size]
-    start = blocks.get(block)
-    if start is None:
-      # Allocate a new block
-      start = len(stage2) // block_size
-      stage2 += block
-      blocks[block] = start
-    stage1.append(start)
-
-  return stage1, stage2
-
-# Print a table
-def print_table(table, table_name, block_size = None):
-  type, size = get_type_size(table)
-  ELEMS_PER_LINE = 16
-
-  s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
-  if block_size:
-    s += ", block = %d" % block_size
-  print(s + " */")
-  table = tuple(table)
-  if block_size is None:
-    fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
-    mult = MAX_UNICODE / len(table)
-    for i in range(0, len(table), ELEMS_PER_LINE):
-      print(fmt % (table[i:i+ELEMS_PER_LINE] +
-        (int(i * mult),)))
-  else:
-    if block_size > ELEMS_PER_LINE:
-      el = ELEMS_PER_LINE
-    else:
-      el = block_size
-    fmt = "%3d," * el + "\n"
-    if block_size > ELEMS_PER_LINE:
-      fmt = fmt * int(block_size / ELEMS_PER_LINE)
-    for i in range(0, len(table), block_size):
-      print(("/* block %d */\n" + fmt) % ((i // block_size,) + table[i:i+block_size]))
-  print("};\n")
-
-# Extract the unique combinations of properties into records
-def combine_tables(*tables):
-  records = {}
-  index = []
-  for t in zip(*tables):
-    i = records.get(t)
-    if i is None:
-      i = records[t] = len(records)
-    index.append(i)
-  return index, records
-
-def get_record_size_struct(records):
-  size = 0
-  structure = '/* When recompiling tables with a new Unicode version, please check the\n' + \
-    'types in this structure definition from pcre2_internal.h (the actual\n' + \
-    'field names will be different):\n\ntypedef struct {\n'
-  for i in range(len(records[0])):
-    record_slice = [record[i] for record in records]
-    slice_type, slice_size = get_type_size(record_slice)
-    # add padding: round up to the nearest multiple of slice_size
-    size = (size + slice_size - 1) & -slice_size
-    size += slice_size
-    structure += '%s property_%d;\n' % (slice_type, i)
-
-  # round up to the first item of the next structure in array
-  record_slice = [record[0] for record in records]
-  slice_type, slice_size = get_type_size(record_slice)
-  size = (size + slice_size - 1) & -slice_size
-
-  structure += '} ucd_record;\n*/\n'
-  return size, structure
-
-def test_record_size():
-  tests = [ \
-    ( [(3,), (6,), (6,), (1,)], 1 ), \
-    ( [(300,), (600,), (600,), (100,)], 2 ), \
-    ( [(25, 3), (6, 6), 
(34, 6), (68, 1)], 2 ), \ - ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \ - ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \ - ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \ - ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \ - ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \ - ] - for test in tests: - size, struct = get_record_size_struct(test[0]) - assert(size == test[1]) - #print struct - -def print_records(records, record_size): - print('const ucd_record PRIV(ucd_records)[] = { ' + \ - '/* %d bytes, record size %d */' % (len(records) * record_size, record_size)) - - records = list(zip(list(records.keys()), list(records.values()))) - records.sort(key = lambda x: x[1]) - for i, record in enumerate(records): - print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))) - print('};\n') - -script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', - 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', - 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', - 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', - 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', - 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', - 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', -# New for Unicode 5.0 - 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', -# New for Unicode 5.1 - 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', -# New for Unicode 5.2 - 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', - 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', - 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', - 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', -# New for Unicode 6.0.0 - 'Batak', 'Brahmi', 'Mandaic', -# New for Unicode 6.1.0 - 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri', -# New for Unicode 7.0.0 - 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi', - 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean', - 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi', - 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi', -# New for Unicode 8.0.0 - 'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian', - 'SignWriting', -# New for Unicode 10.0.0 - 'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi', - 'Nushu', 'Soyombo', 'Zanabazar_Square', -# New for Unicode 11.0.0 - 'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin', - 'Old_Sogdian', 'Sogdian', -# New for Unicode 12.0.0 - 'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho', -# New for Unicode 13.0.0 - 'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi', -# New for Unicode 14.0.0 - 'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi' - ] - -script_abbrevs = [ - 'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans', - 'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor', - 'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr', - 'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb', 
- 'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya', - 'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale', - 'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii', -#New for Unicode 5.0 - 'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx', -#New for Unicode 5.1 - 'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur', - 'Sund', 'Vaii', -#New for Unicode 5.2 - 'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu', - 'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt', -#New for Unicode 6.0.0 - 'Batk', 'Brah', 'Mand', -#New for Unicode 6.1.0 - 'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr', -#New for Unicode 7.0.0 - 'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj', - 'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm', - 'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara', -#New for Unicode 8.0.0 - 'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw', -#New for Unicode 10.0.0 - 'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo', - 'Zanb', -#New for Unicode 11.0.0 - 'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd', -#New for Unicode 12.0.0 - 'Elym', 'Nand', 'Hmnp', 'Wcho', -#New for Unicode 13.0.0 - 'Chrs', 'Diak', 'Kits', 'Yezi', -#New for Unicode 14.0.0 - 'Cpmn', 'Ougr', 'Tngs', 'Toto', 'Vith' - ] - -category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', - 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', - 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ] - -# The Extended_Pictographic property is not found in the file where all the -# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt -# file, but we list it here so that the name has the correct index value. - -break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend', - 'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other', - 'ZWJ', 'Extended_Pictographic' ] - -# BIDI class property names in the DerivedBidiClass.txt file - -bidiclass_names = ['AL', 'AN', 'B', 'BN', 'CS', 'EN', 'ES', 'ET', 'FSI', 'L', - 'LRE', 'LRI', 'LRO', 'NSM', 'ON', 'PDF', 'PDI', 'R', 'RLE', 'RLI', 'RLO', - 'S', 'WS' ] - -# Create the various tables - -test_record_size() -unicode_version = "" - -script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown')) -category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn')) -break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other')) -other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0) -bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidiclass_names), bidiclass_names.index('L')) - -# The Bidi_Control property is a Y/N value, so needs only one bit. We scan the -# PropList.txt file and set 0x80 bit in the bidi_class table. 
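Because the class index and the Bidi_Control flag share one byte, anything
that consumes the table has to mask the flag back out. A minimal sketch of
that unpacking, assuming only the 0x80 convention established by the loop
below (the function name is invented for illustration):

  # Sketch: split a packed bidi byte into (class index, Bidi_Control flag).
  def unpack_bidi(value):
    return value & 0x7f, (value & 0x80) != 0

The loop that follows sets the 0x80 bit for the few characters that
PropList.txt lists with the Bidi_Control property.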
-
-file = open('Unicode.tables/PropList.txt', 'r', encoding='utf-8')
-for line in file:
-  line = re.sub(r'#.*', '', line)
-  chardata = list(map(str.strip, line.split(';')))
-  if len(chardata) <= 1:
-    continue
-  if chardata[1] != "Bidi_Control":
-    continue
-  m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
-  char = int(m.group(1), 16)
-  if m.group(3) is None:
-    last = char
-  else:
-    last = int(m.group(3), 16)
-  for i in range(char, last + 1):
-    bidi_class[i] |= 0x80
-file.close()
-
-# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
-# we need to find the Extended_Pictographic property for emoji characters. This
-# can be set as an additional grapheme break property, because the default for
-# all the emojis is "other". We scan the emoji-data.txt file and modify the
-# break-props table.
-
-file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8')
-for line in file:
-  line = re.sub(r'#.*', '', line)
-  chardata = list(map(str.strip, line.split(';')))
-  if len(chardata) <= 1:
-    continue
-  if chardata[1] != "Extended_Pictographic":
-    continue
-  m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
-  char = int(m.group(1), 16)
-  if m.group(3) is None:
-    last = char
-  else:
-    last = int(m.group(3), 16)
-  for i in range(char, last + 1):
-    if break_props[i] != break_property_names.index('Other'):
-      print("WARNING: Emoji 0x%x has break property %s, not 'Other'" %
-        (i, break_property_names[break_props[i]]), file=sys.stderr)
-    break_props[i] = break_property_names.index('Extended_Pictographic')
-file.close()
-
-# The Script Extensions property default value is the Script value. Parse the
-# file, setting 'Unknown' as the default (this will never be a Script Extension
-# value), then scan it and fill in the default from Scripts. Code added by PH
-# in October 2018. Positive values are used for just a single script for a
-# code point. Negative values are negated offsets in a list of bitsets of
-# multiple scripts. Initialize this list with a single entry, as the zeroth
-# element is never used.
-
-script_lists = [0]
-script_abbrevs_default = script_abbrevs.index('Zzzz')
-scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default)
-
-# Scan all characters and set their default script extension to the main
-# script. We also have to adjust negative scriptx values, following a change in
-# the way these work. They are currently negated offsets into the script_lists
-# list, but have to be changed into indices in the new ucd_script_sets vector,
-# which has fixed-size entries. We can compute the new offset by counting the
-# zeros that precede the current offset.
-
-for i in range(0, MAX_UNICODE):
-  if scriptx[i] == script_abbrevs_default:
-    scriptx[i] = script[i]
-  elif scriptx[i] < 0:
-    count = 1
-    for j in range(-scriptx[i], 0, -1):
-      if script_lists[j] == 0:
-        count += 1
-    scriptx[i] = -count * (int(len(script_names)/32) + 1)
-
-# With the addition of the Script Extensions field, we needed some padding to
-# get the Unicode records up to 12 bytes (multiple of 4). Originally this was a
-# 16-bit field and padding_dummy[0] was set to 256 to ensure this, but 8 bits
-# are now used for the bidi class, so zero will do.
-
-padding_dummy = [0] * MAX_UNICODE
-padding_dummy[0] = 0
-
-# This block of code was added by PH in September 2012. I am not a Python
-# programmer, so the style is probably dreadful, but it does the job.
It scans
-# the other_case table to find sets of more than two characters that must all
-# match each other caselessly. Later in this script a table of these sets is
-# written out. However, we have to do this work here in order to compute the
-# offsets in the table that are inserted into the main table.
-
-# The CaseFolding.txt file lists pairs, but the common logic for reading data
-# sets only one value, so first we go through the table and set "return"
-# offsets for those that are not already set.
-
-for c in range(MAX_UNICODE):
-  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
-    other_case[c + other_case[c]] = -other_case[c]
-
-# Now scan again and create equivalence sets.
-
-sets = []
-
-for c in range(MAX_UNICODE):
-  o = c + other_case[c]
-
-  # Trigger when this character's other case does not point back here. We
-  # now have three characters that are case-equivalent.
-
-  if other_case[o] != -other_case[c]:
-    t = o + other_case[o]
-
-    # Scan the existing sets to see if any of the three characters are already
-    # part of a set. If so, unite the existing set with the new set.
-
-    appended = 0
-    for s in sets:
-      found = 0
-      for x in s:
-        if x == c or x == o or x == t:
-          found = 1
-
-      # Add new characters to an existing set
-
-      if found:
-        for y in [c, o, t]:
-          found = 0
-          for x in s:
-            if x == y:
-              found = 1
-          if not found:
-            s.append(y)
-        appended = 1
-
-    # If we have not added to an existing set, create a new one.
-
-    if not appended:
-      sets.append([c, o, t])
-
-# End of loop looking for caseless sets.
-
-# Now scan the sets and set appropriate offsets for the characters.
-
-caseless_offsets = [0] * MAX_UNICODE
-
-offset = 1
-for s in sets:
-  for x in s:
-    caseless_offsets[x] = offset
-  offset += len(s) + 1
-
-# End of block of code for creating offsets for caseless matching sets.
-
-
-# Combine the tables
-
-table, records = combine_tables(script, category, break_props,
-  caseless_offsets, other_case, scriptx, bidi_class, padding_dummy)
-
-record_size, record_struct = get_record_size_struct(list(records.keys()))
-
-# Find the optimum block size for the two-stage table
-min_size = sys.maxsize
-for block_size in [2 ** i for i in range(5,10)]:
-  size = len(records) * record_size
-  stage1, stage2 = compress_table(table, block_size)
-  size += get_tables_size(stage1, stage2)
-  #print "/* block size %5d => %5d bytes */" % (block_size, size)
-  if size < min_size:
-    min_size = size
-    min_stage1, min_stage2 = stage1, stage2
-    min_block_size = block_size
-
-print("/* This module is generated by the maint/MultiStage2.py script.")
-print("Do not modify it by hand. Instead modify the script and run it")
-print("to regenerate this code.")
-print()
-print("As well as being part of the PCRE2 library, this module is #included")
-print("by the pcre2test program, which redefines the PRIV macro to change")
-print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
-print("with the library. At present, just one of these tables is actually")
-print("needed. */")
-print()
-print("#ifndef PCRE2_PCRE2TEST")
-print()
-print("#ifdef HAVE_CONFIG_H")
-print("#include \"config.h\"")
-print("#endif")
-print()
-print("#include \"pcre2_internal.h\"")
-print()
-print("#endif /* PCRE2_PCRE2TEST */")
-print()
-print("/* Unicode character database. */")
-print("/* This file was autogenerated by the MultiStage2.py script. */")
-print("/* Total size: %d bytes, block size: %d. 
*/" % (min_size, min_block_size))
-print()
-print("/* The tables herein are needed only when UCP support is built,")
-print("and in PCRE2 that happens automatically with UTF support.")
-print("This module should not be referenced otherwise, so")
-print("it should not matter whether it is compiled or not. However")
-print("a comment was received about space saving - maybe the guy linked")
-print("all the modules rather than using a library - so we include a")
-print("condition to cut out the tables when not needed. But don't leave")
-print("a totally empty module because some compilers barf at that.")
-print("Instead, just supply some small dummy tables. */")
-print()
-print("#ifndef SUPPORT_UNICODE")
-print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0,0 }};")
-print("const uint16_t PRIV(ucd_stage1)[] = {0};")
-print("const uint16_t PRIV(ucd_stage2)[] = {0};")
-print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
-print("#else")
-print()
-print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version))
-print()
-print("/* If the 32-bit library is run in non-32-bit mode, character values")
-print("greater than 0x10ffff may be encountered. For these we set up a")
-print("special record. */")
-print()
-print("#if PCRE2_CODE_UNIT_WIDTH == 32")
-print("const ucd_record PRIV(dummy_ucd_record)[] = {{")
-print("  ucp_Unknown,    /* script */")
-print("  ucp_Cn,         /* type unassigned */")
-print("  ucp_gbOther,    /* grapheme break property */")
-print("  0,              /* case set */")
-print("  0,              /* other case */")
-print("  ucp_Unknown,    /* script extension */")
-print("  ucp_bidiL,      /* bidi class */")
-print("  0,              /* dummy filler */")
-print("  }};")
-print("#endif")
-print()
-print(record_struct)
-
-# --- Added by PH: output the table of caseless character sets ---
-
-print("/* This table contains lists of characters that are caseless sets of")
-print("more than one character. Each list is terminated by NOTACHAR. */\n")
-
-print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
-print("  NOTACHAR,")
-for s in sets:
-  s = sorted(s)
-  for x in s:
-    print('  0x%04x,' % x, end=' ')
-  print('  NOTACHAR,')
-print('};')
-print()
-
-# ------
-
-print("/* When #included in pcre2test, we don't need the table of digit")
-print("sets, nor the large main UCD tables. */")
-print()
-print("#ifndef PCRE2_PCRE2TEST")
-print()
-
-# --- Added by PH: read Scripts.txt again for the sets of 10 digits. ---
-
-digitsets = []
-file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')
-
-for line in file:
-  m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
-  if m is None:
-    continue
-  first = int(m.group(1),16)
-  last = int(m.group(2),16)
-  if ((last - first + 1) % 10) != 0:
-    print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
-      file=sys.stderr)
-  while first < last:
-    digitsets.append(first + 9)
-    first += 10
-file.close()
-digitsets.sort()
-
-print("/* This table lists the code points for the '9' characters in each")
-print("set of decimal digits. It is used to ensure that all the digits in")
-print("a script run come from the same set. */\n")
-print("const uint32_t PRIV(ucd_digit_sets)[] = {")
-
-print("  %d,  /* Number of subsequent values */" % len(digitsets), end='')
-count = 8
-for d in digitsets:
-  if count == 8:
-    print("\n ", end='')
-    count = 0
-  print(" 0x%05x," % d, end='')
-  count += 1
-print("\n};\n")
-
-print("/* This vector is a list of script bitsets for the Script Extension")
-print("property. 
*/\n")
-print("const uint32_t PRIV(ucd_script_sets)[] = {")
-
-bitword_count = len(script_names)/32 + 1
-bitwords = [0] * int(bitword_count)
-
-for d in script_lists:
-  if d == 0:
-    s = " "
-    print(" ", end='')
-    for x in bitwords:
-      print("%s" % s, end='')
-      s = ", "
-      print("0x%08xu" % x, end='')
-    print(",\n", end='')
-    bitwords = [0] * int(bitword_count)
-
-  else:
-    x = int(d/32)
-    y = int(d%32)
-    bitwords[x] = bitwords[x] | (1 << y)
-
-print("};\n")
-
-# Output the main UCD tables.
-
-print("/* These are the main two-stage UCD tables. The fields in each record are:")
-print("script (8 bits), character type (8 bits), grapheme break property (8 bits),")
-print("offset to multichar other cases or zero (8 bits), offset to other case")
-print("or zero (32 bits, signed), script extension (16 bits, signed), bidi class")
-print("(8 bits), and a dummy 8-bit field to make the whole thing a multiple")
-print("of 4 bytes. */\n")
-
-print_records(records, record_size)
-print_table(min_stage1, 'PRIV(ucd_stage1)')
-print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
-print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
-print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
-print("#endif")
-print("#endif /* SUPPORT_UNICODE */")
-print()
-print("#endif /* PCRE2_PCRE2TEST */")
-
-
-# This code was part of the original contribution, but is commented out as it
-# was never used. A two-stage table has sufficed.
-
-"""
-
-# Three-stage tables:
-
-# Find the optimum block size for 3-stage table
-min_size = sys.maxint
-for stage3_block in [2 ** i for i in range(2,6)]:
-  stage_i, stage3 = compress_table(table, stage3_block)
-  for stage2_block in [2 ** i for i in range(5,10)]:
-    size = len(records) * 4
-    stage1, stage2 = compress_table(stage_i, stage2_block)
-    size += get_tables_size(stage1, stage2, stage3)
-    # print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
-    if size < min_size:
-      min_size = size
-      min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
-      min_stage2_block, min_stage3_block = stage2_block, stage3_block
-
-print "/* Total size: %d bytes" % min_size */
-print_records(records)
-print_table(min_stage1, 'ucd_stage1')
-print_table(min_stage2, 'ucd_stage2', min_stage2_block)
-print_table(min_stage3, 'ucd_stage3', min_stage3_block)
-
-"""
diff --git a/maint/README b/maint/README
index b1713c9..6500854 100644
--- a/maint/README
+++ b/maint/README
@@ -16,59 +16,66 @@ and also contains some notes for maintainers. Its contents are:
 Files in the maint directory
 ============================
 
-GenerateUtt.py A Python script to generate part of the pcre2_tables.c file
-               that contains Unicode script names in a long string with
-               offsets, which is tedious to maintain by hand.
+GenerateCommon.py
+  A Python module containing data and functions that are used by the other
+  Generate scripts.
 
-ManyConfigTests A shell script that runs "configure, make, test" a number of
-                times with different configuration settings.
+GenerateUcd.py
+  A Python script that generates the file pcre2_ucd.c from GenerateCommon.py
+  and Unicode data files, which are themselves downloaded from the Unicode web
+  site. The generated file contains the tables for a 2-stage lookup of Unicode
+  properties, along with some auxiliary tables.
 
-MultiStage2.py A Python script that generates the file pcre2_ucd.c from eight
-               Unicode data files, which are themselves downloaded from the
-               Unicode web site. Run this script in the "maint" directory.
-               The generated file is written to stdout.
It contains the
-               tables for a 2-stage lookup of Unicode properties, along with
-               some auxiliary tables.
+GenerateUcpHeader.py
+  A Python script that generates the file pcre2_ucp.h from GenerateCommon.py
+  and Unicode data files. The generated file defines constants for various
+  Unicode property values.
+
+GenerateUcpTables.py
+  A Python script that generates the file pcre2_ucptables.c from
+  GenerateCommon.py and Unicode data files. The generated file contains tables
+  for looking up Unicode properties.
+
+ManyConfigTests
+  A shell script that runs "configure, make, test" a number of times with
+  different configuration settings.
 
 pcre2_chartables.c.non-standard
-    This is a set of character tables that came from a Windows
-    system. It has characters greater than 128 that are set as
-    spaces, amongst other things. I kept it so that it can be
-    used for testing from time to time.
+  This is a set of character tables that came from a Windows system. It has
+  characters greater than 128 that are set as spaces, amongst other things. I
+  kept it so that it can be used for testing from time to time.
 
-README This file.
+README
+  This file.
 
-Unicode.tables The files in this directory were downloaded from the Unicode
-               web site. They contain information about Unicode characters
-               and scripts. The ones used by the MultiStage2.py script are
-               CaseFolding.txt, DerivedBidiClass.txt,
-               DerivedGeneralCategory.txt, PropList.txt, Scripts.txt,
-               ScriptExtensions.txt, GraphemeBreakProperty.txt, and
-               emoji-data.txt. I've kept UnicodeData.txt (which is no longer
-               used by the script) because it is useful occasionally for
-               manually looking up the details of certain characters.
-               However, note that character names in this file such as
-               "Arabic sign sanah" do NOT mean that the character is in a
-               particular script (in this case, Arabic). Scripts.txt and
-               ScriptExtensions.txt are where to look for script information.
+Unicode.tables
+  The files in this directory were downloaded from the Unicode web site. They
+  contain information about Unicode characters and scripts, and are used by the
+  Generate scripts. There is also UnicodeData.txt, which is no longer used by
+  any script, but it has been kept because it is useful occasionally for
+  manually looking up the details of certain characters. However, note that
+  character names in this file such as "Arabic sign sanah" do NOT mean that the
+  character is in a particular script (in this case, Arabic). Scripts.txt and
+  ScriptExtensions.txt are where to look for script information.
 
-ucptest.c A short C program for testing the Unicode property macros
-          that do lookups in the pcre2_ucd.c data, mainly useful after
-          rebuilding the Unicode property table. Compile and run this in
-          the "maint" directory (see comments at its head). This program
-          can also be used to find characters with specific properties.
+ucptest.c
+  A short C program for testing the Unicode property macros that do lookups in
+  the pcre2_ucd.c data, mainly useful after rebuilding the Unicode property
+  table. Compile and run this in the "maint" directory (see comments at its
+  head). This program can also be used to find characters with specific
+  properties.
 
-ucptestdata A directory containing four files, testinput{1,2} and
-            testoutput{1,2}, for use in conjunction with the ucptest
-            program.
+ucptestdata
+  A directory containing four files, testinput{1,2} and testoutput{1,2}, for
+  use in conjunction with the ucptest program.
-utf8.c A short, freestanding C program for converting a Unicode code - point into a sequence of bytes in the UTF-8 encoding, and vice - versa. If its argument is a hex number such as 0x1234, it - outputs a list of the equivalent UTF-8 bytes. If its argument - is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it - treats them as a UTF-8 character and outputs the equivalent - code point in hex. See comments at its head for details. +utf8.c + A short, freestanding C program for converting a Unicode code point into a + sequence of bytes in the UTF-8 encoding, and vice versa. If its argument is a + hex number such as 0x1234, it outputs a list of the equivalent UTF-8 bytes. + If its argument is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it + treats them as a UTF-8 character and outputs the equivalent code point in + hex. See comments at its head for details. Updating to a new Unicode release @@ -76,33 +83,43 @@ Updating to a new Unicode release When there is a new release of Unicode, the files in Unicode.tables must be refreshed from the web site. If the new version of Unicode adds new character -scripts, the source file pcre2_ucp.h and both the MultiStage2.py and the -GenerateUtt.py scripts must be edited to add the new names. I have been adding -each new group at the end of the relevant list, with a comment. Note also that -both the pcre2syntax.3 and pcre2pattern.3 man pages contain lists of Unicode -script names. +scripts, the lists in GenerateCommon.py must be updated. I have been adding +each new group at the end of the relevant list, with a comment. -MultiStage2.py has two lists: the full names and the abbreviations that are -found in the ScriptExtensions.txt file. A list of script names and their -abbreviations can be found in the PropertyValueAliases.txt file on the +NOTE: Both the pcre2syntax.3 and pcre2pattern.3 man pages contain lists of +supported Unicode scripts that also have to be updated. These lists are in +alphabetical order. + +There are two lists in GenerateCommon.py: the full names and the abbreviations +that are found in the ScriptExtensions.txt file. A list of script names and +their abbreviations can be found in the PropertyValueAliases.txt file on the Unicode web site. There is also a Wikipedia page that lists them, and notes the Unicode version in which they were introduced: https://en.wikipedia.org/wiki/Unicode_scripts#Table_of_Unicode_scripts -Once the script name lists have been updated, MultiStage2.py can be run to -generate a new version of pcre2_ucd.c, and GenerateUtt.py can be run to -generate the tricky tables for inclusion in pcre2_tables.c (which must be -hand-edited). If MultiStage2.py gives the error "ValueError: list.index(x): x -not in list", the cause is usually a missing (or misspelt) name in one of the -lists of scripts. +Once the script name lists have been updated, the three generator scripts can +be run from within the maint directory. If you get the error "ValueError: +list.index(x): x not in list", the cause is usually a missing (or misspelt) +name in one of the lists. You can give an output file name as an argument, but +by default: -The ucptest program can be compiled and used to check that the new tables in -pcre2_ucd.c work properly, using the data files in ucptestdata to check a -number of test characters. It used to be necessary to update the source -ucptest.c whenever new Unicode scripts were added, but this is no longer -required because that program now uses the lists in the PCRE2 source. 
However, -adding a few tests for new scripts to the files in ucptestdata is a good idea. +GenerateUcd.py creates pcre2_ucd.c ) +GenerateUcpHeader.py creates pcre2_ucp.h ) in the current directory +GenerateUcpTables.py creates pcre2_ucptables.c ) + +These files can be compared against the existing versions in the src directory +to check on any changes before replacing the old files, but you can generate +directly into the final location by running + +./GenerateUcd.py ../src/pcre2_ucd.c +./GenerateUcpHeader.py ../src/pcre2_ucp.h +./GenerateUcpTables.py ../src/pcre2_ucptables.c + +The ucptest program can be compiled and used to check that the new tables work +properly, using the data files in ucptestdata to check a number of test +characters. See the comments at the start of ucptest.c. If there are new +scripts, adding a few tests to the files in ucptestdata is a good idea. Preparing for a PCRE2 release @@ -440,4 +457,4 @@ years. Philip Hazel Email local part: Philip.Hazel Email domain: gmail.com -Last updated: 05 December 2021 +Last updated: 26 December 2021 diff --git a/maint/ucptest.c b/maint/ucptest.c index 8a9497d..fc48a44 100644 --- a/maint/ucptest.c +++ b/maint/ucptest.c @@ -172,7 +172,7 @@ static const unsigned char *gb_names[] = { US"T", US"Hangul syllable type T", US"LV", US"Hangul syllable type LV", US"LVT", US"Hangul syllable type LVT", - US"RegionalIndicator", US"", + US"Regional_Indicator", US"", US"Other", US"", US"ZWJ", US"zero width joiner", US"Extended_Pictographic", US"" @@ -399,7 +399,7 @@ switch(gbprop) case ucp_gbT: graphbreak = US"Hangul syllable type T"; break; case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break; case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break; - case ucp_gbRegionalIndicator: + case ucp_gbRegional_Indicator: graphbreak = US"Regional Indicator"; break; case ucp_gbOther: graphbreak = US"Other"; break; case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break; diff --git a/src/pcre2_extuni.c b/src/pcre2_extuni.c index 5a719e9..b23946b 100644 --- a/src/pcre2_extuni.c +++ b/src/pcre2_extuni.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge + New API code Copyright (c) 2016-2021 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -105,7 +105,7 @@ while (eptr < end_subject) /* Not breaking between Regional Indicators is allowed only if there are an even number of preceding RIs. */ - if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator) + if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) { int ricount = 0; PCRE2_SPTR bptr = eptr - 1; @@ -123,7 +123,7 @@ while (eptr < end_subject) } else c = *bptr; - if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break; + if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break; ricount++; } if ((ricount & 1) != 0) break; /* Grapheme break required */ diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 2749cc6..3fb2731 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -8,7 +8,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel This module by Zoltan Herczeg Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge + New API code Copyright (c) 2016-2021 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -8379,7 +8379,7 @@ do /* Not breaking between Regional Indicators is allowed only if there are an even number of preceding RIs. */ - if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator) + if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) { ricount = 0; bptr = prevcc; @@ -8391,7 +8391,7 @@ do BACKCHAR(bptr); GETCHAR(c, bptr); - if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) + if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break; ricount++; @@ -8447,7 +8447,7 @@ do /* Not breaking between Regional Indicators is allowed only if there are an even number of preceding RIs. */ - if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator) + if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) { ricount = 0; bptr = prevcc; @@ -8457,7 +8457,7 @@ do { GETCHARBACK_INVALID(c, bptr, start_subject, break); - if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) + if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break; ricount++; @@ -8515,7 +8515,7 @@ while (cc < end_subject) /* Not breaking between Regional Indicators is allowed only if there are an even number of preceding RIs. */ - if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator) + if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) { ricount = 0; bptr = cc - 1; @@ -8530,7 +8530,7 @@ while (cc < end_subject) break; #endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ - if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break; + if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break; ricount++; } diff --git a/src/pcre2_tables.c b/src/pcre2_tables.c index de2afe8..ff8f379 100644 --- a/src/pcre2_tables.c +++ b/src/pcre2_tables.c @@ -193,7 +193,7 @@ const uint32_t PRIV(ucp_gbtable)[] = { ESZ|(1u<