194 lines
7.0 KiB
Python
Executable File
194 lines
7.0 KiB
Python
Executable File
#! /usr/bin/python
|
|
|
|
# PCRE2 UNICODE PROPERTY SUPPORT
|
|
# ------------------------------
|
|
|
|
# This script generates the pcre2_ucptables.c file, which contains tables for
|
|
# recognizing Unicode property names. It is #included by pcre2_tables.c. In
|
|
# order to reduce the number of relocations when loading the PCRE2 library, the
|
|
# names are held as a single large string, with offsets in the table. This is
|
|
# tedious to maintain by hand. Therefore, a script is used to generate the
|
|
# table.
|
|
|
|
# This script was created in December 2021 based on the previous GenerateUtt
|
|
# script, whose output had to be manually edited into pcre2_tables.c. Here is
|
|
# the history of the original script:
|
|
|
|
# -----------------------------------------------------------------------------
|
|
# Modified by PH 17-March-2009 to generate the more verbose form that works
|
|
# for UTF-support in EBCDIC as well as ASCII environments.
|
|
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
|
|
# Modified by PH 04-May-2010 to add new "X.." special categories.
|
|
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
|
|
# Modified by ChPe 30-September-2012 to add this note; no other changes were
|
|
# necessary for Unicode 6.2.0 support.
|
|
# Modified by PH 26-February-2013 to add the Xuc special category.
|
|
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
|
# Script updated to Python 3 by running it through the 2to3 converter.
|
|
# Added script names for Unicode 7.0.0, 20-June-2014.
|
|
# Added script names for Unicode 8.0.0, 19-June-2015.
|
|
# Added script names for Unicode 10.0.0, 02-July-2017.
|
|
# Added script names for Unicode 11.0.0, 03-July-2018.
|
|
# Added 'Unknown' script, 01-October-2018.
|
|
# Added script names for Unicode 12.1.0, 27-July-2019.
|
|
# Added script names for Unicode 13.0.0, 10-March-2020.
|
|
# Added Script names for Unicode 14.0.0, PCRE2-10.39
|
|
# Added support for bidi class and bidi control, 06-December-2021
|
|
# This also involved lower casing strings and removing underscores, in
|
|
# accordance with Unicode's "loose matching" rules, which Perl observes.
|
|
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
|
|
# -----------------------------------------------------------------------------
|
|
#
|
|
# Note subsequent changes here:
|
|
#
|
|
# 27-December-2021: Added support for 4-letter script abbreviations.
|
|
# -----------------------------------------------------------------------------
|
|
|
|
|
|
# Import common data lists and functions
|
|
|
|
from GenerateCommon import \
|
|
bidi_classes, \
|
|
category_names, \
|
|
general_category_names, \
|
|
script_abbrevs, \
|
|
script_names, \
|
|
open_output
|
|
|
|
# Open the output file. open_output() comes from GenerateCommon; it does not
# return on failure, and it also writes the standard header boilerplate.

f = open_output("pcre2_ucptables.c")

# bidi_classes alternates Unicode class values (AN, LRE, etc.) with comments,
# so only every second item, starting with the first, is a name. Each name is
# prefixed with "bidi" so that it cannot clash with the name of another type
# of property.

bidi_class_names = ["bidi" + bidi_class for bidi_class in bidi_classes[::2]]

# category_names is interleaved with comments in the same way; keep only the
# names.

category_names = category_names[::2]
|
|
|
|
# Create standardized versions of the names by lowercasing and removing
|
|
# underscores.
|
|
|
|
def stdnames(x):
    """Return the standardized form of every name in x.

    A name is standardized by converting it to lower case and deleting all
    underscores.

    x: an iterable of strings. It is not modified.

    Returns a new list holding one standardized string per item of x, in the
    same order.
    """
    return [name.lower().replace('_', '') for name in x]
|
|
|
|
# Build the standardized counterpart of each name list. The lists are
# processed in the order shown, and each result is bound to the matching
# std_ name.

(std_script_names,
 std_script_abbrevs,
 std_category_names,
 std_general_category_names,
 std_bidi_class_names) = [
    stdnames(names) for names in (
        script_names,
        script_abbrevs,
        category_names,
        general_category_names,
        bidi_class_names,
    )
]
|
|
|
|
# Start the table with the Unicode script entries. Every entry is a 3-tuple of
# (standardized name, original name, property type). Both forms of the name are
# kept because the original one is used for the ucp_xx names. NOTE: the entries
# for the script abbreviations still carry the full original script names.

script_entries = [
    (std_name, full_name, 'PT_SCX')
    for std_name, full_name in zip(std_script_names, script_names)
]
script_entries.extend(
    (std_abbrev, full_name, 'PT_SCX')
    for std_abbrev, full_name in zip(std_script_abbrevs, script_names)
)

# At least one script abbreviation is the same as the full name of the script,
# so duplicates must be removed. Passing the entries through a set may change
# their order, but that does not matter because the list is sorted later.

utt_table = list(set(script_entries))
|
|
|
|
# Add the remaining property lists. Each standardized name is paired with its
# original name and tagged with the property type of the list it came from.

for std_list, orig_list, prop_type in (
    (std_category_names, category_names, 'PT_PC'),
    (std_general_category_names, general_category_names, 'PT_GC'),
    (std_bidi_class_names, bidi_class_names, 'PT_BIDICL'),
):
    utt_table.extend(
        (std_name, orig_name, prop_type)
        for std_name, orig_name in zip(std_list, orig_list)
    )
|
|
|
|
# Now add specials and synonyms. Both the standardized and the capitalized
# form of each name are needed, so they are spelled out explicitly.

utt_table.extend([
    ('any', 'Any', 'PT_ANY'),
    ('bidic', 'BidiC', 'PT_BIDICO'),
    ('bidicontrol', 'Bidi_Control', 'PT_BIDICO'),
    ('l&', 'L&', 'PT_LAMP'),
    ('lc', 'LC', 'PT_LAMP'),
    ('xan', 'Xan', 'PT_ALNUM'),
    ('xps', 'Xps', 'PT_PXSPACE'),
    ('xsp', 'Xsp', 'PT_SPACE'),
    ('xuc', 'Xuc', 'PT_UCNC'),
    ('xwd', 'Xwd', 'PT_WORD'),
])

# Sort the table. The entries are tuples that start with the standardized
# name, so the table ends up ordered by that name.

utt_table.sort()
|
|
|
|
# Output file-specific heading

f.write("""\
#ifdef SUPPORT_UNICODE

/* The PRIV(utt)[] table below translates Unicode property names into type and
code values. It is searched by binary chop, so must be in collating sequence of
name. Originally, the table contained pointers to the name strings in the first
field of each entry. However, that leads to a large number of relocations when
a shared library is dynamically loaded. A significant reduction is made by
putting all the names into a single, large string and using offsets instead.
All letters are lower cased, and underscores are removed, in accordance with
the "loose matching" rules that Unicode advises and Perl uses. */
\n""")

# The strings have to be defined with STR_ macros so that it all works in
# UTF-8 mode on EBCDIC platforms. One #define is written per table entry: the
# macro is named after the standardized name (with '&' spelled out as
# _AMPERSAND), and its body is one STR_ macro per character followed by a
# "\0" terminator.

for entry in utt_table:
    std_name = entry[0]
    macro_suffix = std_name.replace('&', '_AMPERSAND')
    char_macros = ''.join(
        ' STR_AMPERSAND' if ch == '&' else ' STR_%s' % ch
        for ch in std_name
    )
    f.write('#define STRING_%s0%s "\\0"\n' % (macro_suffix, char_macros))
|
|
|
|
# Output the long string of concatenated names: one STRING_ macro reference
# per table entry, in table order.

f.write('\nconst char PRIV(utt_names)[] =\n')

# 'last' stays empty until the entry equal to the final table entry is
# reached; from then on it holds the ';' that ends the C definition.
last = ''
for entry in utt_table:
    last = ';' if entry == utt_table[-1] else last
    macro_suffix = entry[0].replace('&', '_AMPERSAND')
    f.write(' STRING_%s0%s\n' % (macro_suffix, last))
|
|
|
|
# Output the property type table. Each row holds the offset of the entry's
# name, its property type, and its value.

f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')

# Property types whose rows carry the value 0 rather than a ucp_ name.
valueless_types = ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
                   'PT_SPACE', 'PT_UCNC', 'PT_WORD', 'PT_BIDICO')

offset = 0
last = ','
for entry in utt_table:
    std_name, orig_name, prop_type = entry
    value = '0' if prop_type in valueless_types else 'ucp_' + orig_name

    # Rows are separated by commas; the separator is dropped once the entry
    # equal to the final table entry is reached.
    if entry == utt_table[-1]:
        last = ''

    f.write(' { %3d, %s, %s }%s\n' % (offset, prop_type, value, last))

    # Each name takes up its own length plus one for the "\0" that ends its
    # STRING_ macro.
    offset += len(std_name) + 1

f.write('};\n\n')
|
|
|
|
# Ending text

f.write("""\
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

#endif /* SUPPORT_UNICODE */

/* End of pcre2_ucptables.c */
""")

# Close the output file. The parentheses matter: a bare "f.close" only looks
# the method up and never calls it.

f.close()
|
|
|
|
# End
|