356 lines
12 KiB
Python
356 lines
12 KiB
Python
#! /usr/bin/python
|
|
|
|
# PCRE2 UNICODE PROPERTY SUPPORT
|
|
# ------------------------------
|
|
|
|
# This file is a Python module containing common lists and functions for the
# GenerateXXX scripts that create various .c and .h files from Unicode data
# files. It was created as part of a re-organization of these scripts in
# December 2021.
|
|
|
|
|
|
import re
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# DATA LISTS
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# BIDI classes in the DerivedBidiClass.txt file, with comments.
|
|
|
|
# Flat list alternating a BIDI class name with its descriptive comment:
# even indexes hold the class name, odd indexes the comment text.
bidi_classes = [
    field
    for entry in (
        ('AL', 'Arabic letter'),
        ('AN', 'Arabic number'),
        ('B', 'Paragraph separator'),
        ('BN', 'Boundary neutral'),
        ('CS', 'Common separator'),
        ('EN', 'European number'),
        ('ES', 'European separator'),
        ('ET', 'European terminator'),
        ('FSI', 'First strong isolate'),
        ('L', 'Left to right'),
        ('LRE', 'Left to right embedding'),
        ('LRI', 'Left to right isolate'),
        ('LRO', 'Left to right override'),
        ('NSM', 'Non-spacing mark'),
        ('ON', 'Other neutral'),
        ('PDF', 'Pop directional format'),
        ('PDI', 'Pop directional isolate'),
        ('R', 'Right to left'),
        ('RLE', 'Right to left embedding'),
        ('RLI', 'Right to left isolate'),
        ('RLO', 'Right to left override'),
        ('S', 'Segment separator'),
        ('WS', 'White space'),
    )
    for field in entry
]
|
|
|
|
# Particular category property names, with comments. NOTE: If ever this list
|
|
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
|
|
# must be edited to keep in step.
|
|
|
|
# Flat list alternating a two-letter category code with its description:
# even indexes hold the code, odd indexes the comment text.
category_names = [
    field
    for entry in (
        ('Cc', 'Control'),
        ('Cf', 'Format'),
        ('Cn', 'Unassigned'),
        ('Co', 'Private use'),
        ('Cs', 'Surrogate'),
        ('Ll', 'Lower case letter'),
        ('Lm', 'Modifier letter'),
        ('Lo', 'Other letter'),
        ('Lt', 'Title case letter'),
        ('Lu', 'Upper case letter'),
        ('Mc', 'Spacing mark'),
        ('Me', 'Enclosing mark'),
        ('Mn', 'Non-spacing mark'),
        ('Nd', 'Decimal number'),
        ('Nl', 'Letter number'),
        ('No', 'Other number'),
        ('Pc', 'Connector punctuation'),
        ('Pd', 'Dash punctuation'),
        ('Pe', 'Close punctuation'),
        ('Pf', 'Final punctuation'),
        ('Pi', 'Initial punctuation'),
        ('Po', 'Other punctuation'),
        ('Ps', 'Open punctuation'),
        ('Sc', 'Currency symbol'),
        ('Sk', 'Modifier symbol'),
        ('Sm', 'Mathematical symbol'),
        ('So', 'Other symbol'),
        ('Zl', 'Line separator'),
        ('Zp', 'Paragraph separator'),
        ('Zs', 'Space separator'),
    )
    for field in entry
]
|
|
|
|
# The Extended_Pictographic property is not found in the file where all the
|
|
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
|
|
# file, but we list it here so that the name has the correct index value.
|
|
|
|
# Flat list alternating a grapheme break property name with its comment text
# (the index value, optionally followed by a description).
# NOTE(review): the comment strings are reproduced as they appear in the
# reviewed source - confirm their spacing against the repository copy.
break_properties = [
    field
    for entry in (
        ('CR', ' 0'),
        ('LF', ' 1'),
        ('Control', ' 2'),
        ('Extend', ' 3'),
        ('Prepend', ' 4'),
        ('SpacingMark', ' 5'),
        ('L', ' 6 Hangul syllable type L'),
        ('V', ' 7 Hangul syllable type V'),
        ('T', ' 8 Hangul syllable type T'),
        ('LV', ' 9 Hangul syllable type LV'),
        ('LVT', '10 Hangul syllable type LVT'),
        ('Regional_Indicator', '11'),
        ('Other', '12'),
        ('ZWJ', '13'),
        ('Extended_Pictographic', '14'),
    )
    for field in entry
]
|
|
|
|
# List of files from which the names of Boolean properties are obtained, along
|
|
# with a list of regex patterns for properties to be ignored, and a list of
|
|
# extra pattern names to add.
|
|
|
|
# Data files (looked up under Unicode.tables/) that are scanned for Boolean
# property names.
bool_propsfiles = [
    "PropList.txt",
    "DerivedCoreProperties.txt",
    "emoji-data.txt",
]

# Regex patterns, matched at the start of a property name, for names to skip.
bool_propsignore = [r"^Other_", r"^Hyphen$"]

# Names added to the Boolean property list in addition to those read from
# the files.
bool_propsextras = ["ASCII", "Bidi_Mirrored"]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# GET BOOLEAN PROPERTY NAMES
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Get a list of Boolean property names from a number of files.
|
|
|
|
def getbpropslist():
    """Return a sorted list of Boolean property names.

    Each file named in bool_propsfiles is read from the Unicode.tables
    directory. Comments (from '#' to end of line) are removed, the line is
    split on ';', and the second field is taken as the property name. A name
    equal to the one on the previous data line is skipped, so each run of
    identical names contributes one entry. Names matching any pattern in
    bool_propsignore are dropped, the names in bool_propsextras are added,
    and the result is sorted.

    If a file cannot be opened, a message is printed and the process exits
    with status 1.
    """
    # Imported locally: this function is called while the module is still
    # being loaded, before the file-level "import sys" further down has run,
    # so the error path below would otherwise raise NameError.
    import sys

    # Compile the ignore patterns once rather than for every property name.
    ignore_res = [re.compile(pat) for pat in bool_propsignore]

    bplist = []
    bplast = ""

    for filename in bool_propsfiles:
        path = 'Unicode.tables/' + filename
        try:
            # Unicode data files are UTF-8 (emoji-data.txt has non-ASCII
            # comments), so do not depend on the locale's default encoding.
            propsfile = open(path, 'r', encoding='utf-8')
        except IOError:
            print(f"** Couldn't open {path}\n")
            sys.exit(1)

        # The context manager closes the file even if parsing raises.
        with propsfile:
            for line in propsfile:
                line = re.sub(r'#.*', '', line)
                data = [field.strip() for field in line.split(';')]
                if len(data) <= 1 or data[1] == bplast:
                    continue
                # Remember the name even when it is ignored, so the rest of
                # its run is skipped by the comparison above.
                bplast = data[1]
                if not any(pat.match(bplast) is not None for pat in ignore_res):
                    bplist.append(bplast)

    bplist.extend(bool_propsextras)
    bplist.sort()
    return bplist
|
|
|
|
# Build the sorted list of Boolean property names once, when the module is
# loaded.
bool_properties = getbpropslist()

# Number of properties divided by 32, rounded up. Presumably the number of
# 32-bit words needed for one bit per property - confirm against the
# generator scripts.
bool_props_list_item_size = (len(bool_properties) + 31) // 32
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# COLLECTING PROPERTY NAMES AND ALIASES
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Script names. 'Unknown' is seeded first; collect_property_names() appends
# the names found in Scripts.txt.
script_names = ['Unknown']

# Maps a long script or Boolean property name to a tuple of its
# abbreviations; filled in by collect_property_names().
abbreviations = {}
|
|
|
|
def collect_property_names():
    """Fill script_names and abbreviations from the Unicode data files.

    Reads three files from the Unicode.tables directory:

    Scripts.txt: the script name on each data line is appended to
    script_names, skipping a name equal to the one on the previous matching
    line.

    PropertyValueAliases.txt: for each "sc" (script) line,
    abbreviations[long name] is set to an empty tuple when the short and
    long names are identical, otherwise to a tuple holding the short name
    and, when a fourth field is present, that field as well.

    PropertyAliases.txt: for each property whose long name is in
    bool_properties, abbreviations[long name] is set to a tuple holding the
    short name and, when present, the extra alias.

    Both module-level containers are modified in place (never rebound), so
    no "global" declarations are needed.
    """
    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

    last_script_name = ""
    # Unicode data files are UTF-8; do not depend on the locale's encoding.
    with open("Unicode.tables/Scripts.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = names_re.match(line)

            if match_obj is None or match_obj.group(1) == last_script_name:
                continue

            last_script_name = match_obj.group(1)
            script_names.append(last_script_name)

    # Sometimes there is a comment in the line, so splitting around
    # semicolons is not enough.
    value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

    with open("Unicode.tables/PropertyValueAliases.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = value_alias_re.match(line)

            # Only script ("sc") value aliases are of interest here.
            if match_obj is None or match_obj.group(1) != "sc":
                continue

            short_name = match_obj.group(2)
            long_name = match_obj.group(3)
            extra_alias = match_obj.group(4)

            if short_name == long_name:
                abbreviations[long_name] = ()
            elif extra_alias is None:
                abbreviations[long_name] = (short_name,)
            else:
                abbreviations[long_name] = (short_name, extra_alias)

    # We can also collect Boolean property abbreviations into the same
    # dictionary.
    bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')

    # Build the lookup set once; bool_properties itself is a list.
    known_bool_props = set(bool_properties)

    with open("Unicode.tables/PropertyAliases.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = bin_alias_re.match(line)
            if match_obj is None:
                continue

            short_name = match_obj.group(1)
            long_name = match_obj.group(2)
            extra_alias = match_obj.group(3)

            if long_name in known_bool_props:
                if extra_alias is None:
                    abbreviations[long_name] = (short_name,)
                else:
                    abbreviations[long_name] = (short_name, extra_alias)
|
|
|
|
# Populate script_names and abbreviations when the module is loaded.
collect_property_names()
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# REORDERING SCRIPT NAMES
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# One abbreviation per entry of script_names, in the same order; filled in
# by reorder_scripts().
script_abbrevs = []
|
|
|
|
def reorder_scripts():
    """Move the scripts named in ScriptExtensions.txt to the front.

    First appends one abbreviation per entry of script_names to
    script_abbrevs: the first abbreviation recorded in abbreviations, or the
    full name when none is recorded. Then collects every script abbreviation
    listed on a data line of Unicode.tables/ScriptExtensions.txt. Finally
    rebinds script_names and script_abbrevs to reordered lists in which the
    scripts found in that file come first. Relative order inside each of the
    two groups is unchanged.
    """
    global script_names
    global script_abbrevs
    global abbreviations

    for full_name in script_names:
        known = abbreviations[full_name]
        script_abbrevs.append(known[0] if known else full_name)

    # The captured field is a space-separated list of script abbreviations.
    range_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
    extended_script_abbrevs = set()

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            found = range_re.match(line)
            if found is not None:
                extended_script_abbrevs.update(found.group(1).split(" "))

    # sorted() is stable and False sorts before True, so indexes whose
    # abbreviation is in the extensions set come first and each group keeps
    # its original order.
    order = sorted(
        range(len(script_abbrevs)),
        key=lambda idx: script_abbrevs[idx] not in extended_script_abbrevs,
    )

    reordered_names = [script_names[idx] for idx in order]
    reordered_abbrevs = [script_abbrevs[idx] for idx in order]

    script_names = reordered_names
    script_abbrevs = reordered_abbrevs
|
|
|
|
# Reorder script_names and script_abbrevs when the module is loaded.
reorder_scripts()

# Index of 'Unknown' in the reordered list, divided by 32 and rounded up.
# Presumably the number of 32-bit words needed for one bit per script that
# precedes 'Unknown' - confirm against the generator scripts.
script_list_item_size = (script_names.index('Unknown') + 31) // 32
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# DERIVED LISTS
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Create general character property names from the first letters of the
|
|
# particular categories.
|
|
|
|
# category_names alternates code and description, so every second entry
# starting at index 0 is a two-letter code; its first letter names the
# general category.
gcn_set = {code[0] for code in category_names[::2]}
general_category_names = sorted(gcn_set)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# FUNCTIONS
|
|
# ---------------------------------------------------------------------------
|
|
|
|
import sys
|
|
|
|
# Open an output file, using the command's argument or a default. Write common
|
|
# preliminary header information.
|
|
|
|
def open_output(default):
    """Open the output file and write the common generated-file header.

    The file name is the single command-line argument when one is given,
    otherwise `default`. More than one argument, or a failure to open the
    file, prints a message and exits with status 1. The header names the
    running script (sys.argv[0] with anything up to the last '/' removed).

    Returns the open, writable file object; the caller must close it.
    """
    arg_count = len(sys.argv)
    if arg_count > 2:
        print('** Too many arguments: just give a file name')
        sys.exit(1)

    output_name = sys.argv[1] if arg_count == 2 else default

    try:
        file = open(output_name, "w")
    except IOError:
        print(f"** Couldn't open {output_name}")
        sys.exit(1)

    # Keep only what follows the last '/'; the whole string when there is none.
    script_name = sys.argv[0].rpartition('/')[2]

    # NOTE(review): the header text below is reproduced as it appears in the
    # reviewed source - confirm its spacing against the repository copy.
    banner = """\
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2022 University of Cambridge

This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
"""

    regenerate_note = (
        f"Instead, modify the maint/{script_name} script and run it to generate\n"
        "a new version of this code.\n\n"
    )

    licence = """\
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
\n"""

    file.write(banner)
    file.write(regenerate_note)
    file.write(licence)
    return file
|
|
|
|
# End of UcpCommon.py
|