pcre2/maint/GenerateUcpTables.py

#! /usr/bin/python

#                   PCRE2 UNICODE PROPERTY SUPPORT
#                   ------------------------------

# This script generates the pcre2_ucptables.c file, which contains tables for
# recognizing Unicode property names. It is #included by pcre2_tables.c. In
# order to reduce the number of relocations when loading the PCRE2 library, the
# names are held as a single large string, with offsets in the table. This is
# tedious to maintain by hand. Therefore, a script is used to generate the
# table.

# This script was created in December 2021 based on the previous GenerateUtt
# script, whose output had to be manually edited into pcre2_tables.c. Here is
# the history of the original script:

# -----------------------------------------------------------------------------
# Modified by PH 17-March-2009 to generate the more verbose form that works
# for UTF-support in EBCDIC as well as ASCII environments.
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
# Modified by PH 04-May-2010 to add new "X.." special categories.
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
# Modified by ChPe 30-September-2012 to add this note; no other changes were
# necessary for Unicode 6.2.0 support.
# Modified by PH 26-February-2013 to add the Xuc special category.
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
# Script updated to Python 3 by running it through the 2to3 converter.
# Added script names for Unicode 7.0.0, 20-June-2014.
# Added script names for Unicode 8.0.0, 19-June-2015.
# Added script names for Unicode 10.0.0, 02-July-2017.
# Added script names for Unicode 11.0.0, 03-July-2018.
# Added 'Unknown' script, 01-October-2018.
# Added script names for Unicode 12.1.0, 27-July-2019.
# Added script names for Unicode 13.0.0, 10-March-2020.
# Added Script names for Unicode 14.0.0, PCRE2-10.39
# Added support for bidi class and bidi control, 06-December-2021
#   This also involved lower casing strings and removing underscores, in
#   accordance with Unicode's "loose matching" rules, which Perl observes.
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
# -----------------------------------------------------------------------------
#
# Note subsequent changes here:
#
# 27-December-2021: Added support for 4-letter script abbreviations.
# -----------------------------------------------------------------------------


# Import common data lists and functions

from GenerateCommon import \
  bidi_classes, \
  category_names, \
  general_category_names, \
  script_abbrevs, \
  script_names, \
  open_output

# Create the output file. open_output() does not return on failure, and it
# also writes the standard header boilerplate.

f = open_output("pcre2_ucptables.c")

# bidi_classes alternates Unicode class names (AN, LRE, etc.) with comments.
# Take only the class names and put "bidi" in front of each one, so that the
# resulting names cannot clash with names of other types of property.

bidi_class_names = ["bidi" + bidi_class for bidi_class in bidi_classes[::2]]

# category_names also carries a comment after each name; keep just the names.

category_names = category_names[0::2]

# Create standardized versions of the names by lowercasing and removing
# underscores.

def stdnames(x):
  """Return the standardized ("loose matching") forms of the names in x.

  Each name is converted to lower case and has every underscore removed.

  Args:
    x: An iterable of name strings. Any iterable is accepted, not only an
      indexable sequence.

  Returns:
    A new list holding the standardized names, in the same order as x.
  """
  return [name.lower().replace('_', '') for name in x]

std_script_names = stdnames(script_names)
std_script_abbrevs = stdnames(script_abbrevs)
std_category_names = stdnames(category_names)
std_general_category_names = stdnames(general_category_names)
std_bidi_class_names = stdnames(bidi_class_names)

# Each table entry is a (standardized name, original name, property type)
# tuple. The original name is retained because the ucp_xx names are made from
# it. Scripts that come before 'Unknown' in script_names are given the type
# PT_SCX; 'Unknown' and everything after it are given PT_SC.

scx_end = script_names.index('Unknown')
script_types = ['PT_SCX'] * scx_end + ['PT_SC'] * (len(script_names) - scx_end)

# Enter every script under its full name and under its abbreviation. NOTE: an
# abbreviation entry still carries the full original script name. At least one
# abbreviation is the same as the full name of its script, so the entries are
# collected in a set to discard duplicates. The set loses the ordering, which
# is harmless because the whole table is sorted below.

script_entries = set(zip(std_script_names, script_names, script_types))
script_entries.update(zip(std_script_abbrevs, script_names, script_types))
utt_table = list(script_entries)

# Add the remaining property lists

utt_table.extend((std, name, 'PT_PC')
  for std, name in zip(std_category_names, category_names))
utt_table.extend((std, name, 'PT_GC')
  for std, name in zip(std_general_category_names, general_category_names))
utt_table.extend((std, name, 'PT_BIDICL')
  for std, name in zip(std_bidi_class_names, bidi_class_names))

# Now add specials and synonyms. Both the standardized and the capitalized
# form of each name are needed.

utt_table.extend([
  ('any', 'Any', 'PT_ANY'),
  ('bidic', 'BidiC', 'PT_BIDICO'),
  ('bidicontrol', 'Bidi_Control', 'PT_BIDICO'),
  ('l&', 'L&', 'PT_LAMP'),
  ('lc', 'LC', 'PT_LAMP'),
  ('xan', 'Xan', 'PT_ALNUM'),
  ('xps', 'Xps', 'PT_PXSPACE'),
  ('xsp', 'Xsp', 'PT_SPACE'),
  ('xuc', 'Xuc', 'PT_UCNC'),
  ('xwd', 'Xwd', 'PT_WORD'),
])

# Sort the table.

utt_table.sort()

# Output file-specific heading

f.write("""\
#ifdef SUPPORT_UNICODE

/* The PRIV(utt)[] table below translates Unicode property names into type and
code values. It is searched by binary chop, so must be in collating sequence of
name. Originally, the table contained pointers to the name strings in the first
field of each entry. However, that leads to a large number of relocations when
a shared library is dynamically loaded. A significant reduction is made by
putting all the names into a single, large string and using offsets instead.
All letters are lower cased, and underscores are removed, in accordance with
the "loose matching" rules that Unicode advises and Perl uses. */
\n""")

# Emit one #define per table entry. The strings are spelled out character by
# character with STR_ macros so that it all works in UTF-8 mode on EBCDIC
# platforms. An ampersand cannot appear in a macro name, so it is written as
# _AMPERSAND in the STRING_ name and as STR_AMPERSAND in the expansion.

for entry in utt_table:
  std_name = entry[0]
  str_macros = ''.join(
    ' STR_AMPERSAND' if ch == '&' else ' STR_%s' % ch for ch in std_name)
  f.write('#define STRING_%s0%s "\\0"\n' %
    (std_name.replace('&', '_AMPERSAND'), str_macros))

# Output the long string of concatenated names: one STRING_ macro per line, in
# table order. Only the final line carries the terminating semicolon. The
# final line is recognized by its position rather than by comparing entries
# for equality, so a repeated entry can never be terminated early.

f.write('\nconst char PRIV(utt_names)[] =\n')
final_index = len(utt_table) - 1
for index, utt in enumerate(utt_table):
  terminator = ';' if index == final_index else ''
  f.write('  STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), terminator))
  
# Output the property type table. Each row holds the offset of the entry's
# name within the concatenated names string, the property type, and a value.

f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')

# Property types whose value field is written as 0 instead of a ucp_ name.

valueless_types = ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
  'PT_SPACE', 'PT_UCNC', 'PT_WORD', 'PT_BIDICO')

offset = 0
final_index = len(utt_table) - 1
for index, utt in enumerate(utt_table):
  value = '0' if utt[2] in valueless_types else 'ucp_' + utt[1]

  # Every row except the final one is followed by a comma. The final row is
  # recognized by position, not by comparing entries for equality.
  separator = '' if index == final_index else ','
  f.write('  { %3d, %s, %s }%s\n' % (offset, utt[2], value, separator))

  # A name occupies one byte per character plus its "\0" terminator.
  offset += len(utt[0]) + 1
f.write('};\n\n')

# Ending text

f.write("""\
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

#endif /* SUPPORT_UNICODE */

/* End of pcre2_ucptables.c */
""")

# Close the output file. The parentheses are essential: a bare "f.close" only
# looks the method up without calling it, leaving the flush of buffered output
# to happen implicitly at interpreter exit.

f.close()

# End
Refactor Python scripts for generating Unicode property data 2021-12-26 18:49:58 +01:00			`#! /usr/bin/python`

			`# PCRE2 UNICODE PROPERTY SUPPORT`
			`# ------------------------------`

			`# This script generates the pcre2_ucptables.c file, which contains tables for`
			`# recognizing Unicode property names. It is #included by pcre2_tables.c. In`
			`# order to reduce the number of relocations when loading the PCRE2 library, the`
			`# names are held as a single large string, with offsets in the table. This is`
			`# tedious to maintain by hand. Therefore, a script is used to generate the`
			`# table.`

			`# This script was created in December 2021 based on the previous GenerateUtt`
			`# script, whose output had to be manually edited into pcre2_tables.c. Here is`
			`# the history of the original script:`

			`# -----------------------------------------------------------------------------`
			`# Modified by PH 17-March-2009 to generate the more verbose form that works`
			`# for UTF-support in EBCDIC as well as ASCII environments.`
			`# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.`
			`# Modified by PH 04-May-2010 to add new "X.." special categories.`
			`# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0`
			`# Modified by ChPe 30-September-2012 to add this note; no other changes were`
			`# necessary for Unicode 6.2.0 support.`
			`# Modified by PH 26-February-2013 to add the Xuc special category.`
			`# Comment modified by PH 13-May-2014 to update to PCRE2 file names.`
			`# Script updated to Python 3 by running it through the 2to3 converter.`
			`# Added script names for Unicode 7.0.0, 20-June-2014.`
			`# Added script names for Unicode 8.0.0, 19-June-2015.`
			`# Added script names for Unicode 10.0.0, 02-July-2017.`
			`# Added script names for Unicode 11.0.0, 03-July-2018.`
			`# Added 'Unknown' script, 01-October-2018.`
			`# Added script names for Unicode 12.1.0, 27-July-2019.`
			`# Added script names for Unicode 13.0.0, 10-March-2020.`
			`# Added Script names for Unicode 14.0.0, PCRE2-10.39`
			`# Added support for bidi class and bidi control, 06-December-2021`
			`# This also involved lower casing strings and removing underscores, in`
			`# accordance with Unicode's "loose matching" rules, which Perl observes.`
			`# Changed default script type from PT_SC to PT_SCX, 18-December-2021`
			`# -----------------------------------------------------------------------------`
Add support for 4-character script abbreviations 2021-12-28 16:10:12 +01:00			`#`
Refactor Python scripts for generating Unicode property data 2021-12-26 18:49:58 +01:00			`# Note subsequent changes here:`
Add support for 4-character script abbreviations 2021-12-28 16:10:12 +01:00			`#`
			`# 27-December_2021: Added support for 4-letter script abbreviations.`
			`# -----------------------------------------------------------------------------`
Refactor Python scripts for generating Unicode property data 2021-12-26 18:49:58 +01:00

			`# Import common data lists and functions`

			`from GenerateCommon import \`
			`bidi_classes, \`
			`category_names, \`
			`general_category_names, \`
			`script_abbrevs, \`
			`script_names, \`
			`open_output`

			`# Open the output file (no return on failure). This call also writes standard`
			`# header boilerplate.`

			`f = open_output("pcre2_ucptables.c")`

			`# The list in bidi_classes contains just the Unicode classes such as AN, LRE,`
			`# etc., along with comments. We need to add "bidi" in front of each value, in`
			`# order to create names that don't clash with other types of property.`

			`bidi_class_names = []`
			`for i in range(0, len(bidi_classes), 2):`
			`bidi_class_names.append("bidi" + bidi_classes[i])`

			`# Remove the comments from other lists that contain them.`

			`category_names = category_names[::2]`

			`# Create standardized versions of the names by lowercasing and removing`
			`# underscores.`

			`def stdnames(x):`
			`y = [''] * len(x)`
			`for i in range(len(x)):`
			`y[i] = x[i].lower().replace('_', '')`
			`return y`

			`std_script_names = stdnames(script_names)`
Add support for 4-character script abbreviations 2021-12-28 16:10:12 +01:00			`std_script_abbrevs = stdnames(script_abbrevs)`
Refactor Python scripts for generating Unicode property data 2021-12-26 18:49:58 +01:00			`std_category_names = stdnames(category_names)`
			`std_general_category_names = stdnames(general_category_names)`
			`std_bidi_class_names = stdnames(bidi_class_names)`

			`# Create the table, starting with the Unicode script, category and bidi class`
			`# names. We keep both the standardized name and the original, because the`
Add support for 4-character script abbreviations 2021-12-28 16:10:12 +01:00			`# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we`
			`# still use the full original names.`
Refactor Python scripts for generating Unicode property data 2021-12-26 18:49:58 +01:00
Rework script extension handling (#64) Co-authored-by: Zoltan Herczeg <hzmester@freemail.hu> 2021-12-29 10:35:22 +01:00			`scx_end = script_names.index('Unknown')`

			`utt_table = list(zip(std_script_names[0:scx_end], script_names[0:scx_end], ['PT_SCX'] * scx_end))`
			`utt_table += list(zip(std_script_names[scx_end:], script_names[scx_end:], ['PT_SC'] * (len(script_names) - scx_end)))`
			`utt_table += list(zip(std_script_abbrevs[0:scx_end], script_names[0:scx_end], ['PT_SCX'] * scx_end))`
			`utt_table += list(zip(std_script_abbrevs[scx_end:], script_names[scx_end:], ['PT_SC'] * (len(script_names) - scx_end)))`
Add support for 4-character script abbreviations 2021-12-28 16:10:12 +01:00
			`# At least one script abbreviation is the same as the full name of the script,`
			`# so we must remove duplicates. It doesn't matter if this operation changes the`
			`# order, because we are going to sort the list later.`

			`utt_table = list(set(utt_table))`

			`# Add the remaining property lists`

Refactor Python scripts for generating Unicode property data 2021-12-26 18:49:58 +01:00			`utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names)))`
			`utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))`
			`utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))`

			`# Now add specials and synonyms. Note both the standardized and capitalized`
			`# forms are needed.`

			`utt_table.append(('any', 'Any', 'PT_ANY'))`
			`utt_table.append(('bidic', 'BidiC', 'PT_BIDICO'))`
			`utt_table.append(('bidicontrol', 'Bidi_Control', 'PT_BIDICO'))`
			`utt_table.append(('l&', 'L&', 'PT_LAMP'))`
			`utt_table.append(('lc', 'LC', 'PT_LAMP'))`
			`utt_table.append(('xan', 'Xan', 'PT_ALNUM'))`
			`utt_table.append(('xps', 'Xps', 'PT_PXSPACE'))`
			`utt_table.append(('xsp', 'Xsp', 'PT_SPACE'))`
			`utt_table.append(('xuc', 'Xuc', 'PT_UCNC'))`
			`utt_table.append(('xwd', 'Xwd', 'PT_WORD'))`

			`# Sort the table.`

			`utt_table.sort()`

			`# Output file-specific heading`

			`f.write("""\`
			`#ifdef SUPPORT_UNICODE`

			`/* The PRIV(utt)[] table below translates Unicode property names into type and`
			`code values. It is searched by binary chop, so must be in collating sequence of`
			`name. Originally, the table contained pointers to the name strings in the first`
			`field of each entry. However, that leads to a large number of relocations when`
			`a shared library is dynamically loaded. A significant reduction is made by`
			`putting all the names into a single, large string and using offsets instead.`
			`All letters are lower cased, and underscores are removed, in accordance with`
			`the "loose matching" rules that Unicode advises and Perl uses. */`
			`\n""")`

			`# We have to use STR_ macros to define the strings so that it all works in`
			`# UTF-8 mode on EBCDIC platforms.`

			`for utt in utt_table:`
			`f.write('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')))`
			`for c in utt[0]:`
			`if c == '&':`
			`f.write(' STR_AMPERSAND')`
			`else:`
			`f.write(' STR_%s' % c);`
			`f.write(' "\\0"\n')`

			`# Output the long string of concatenated names`

			`f.write('\nconst char PRIV(utt_names)[] =\n');`
			`last = ''`
			`for utt in utt_table:`
			`if utt == utt_table[-1]:`
			`last = ';'`
			`f.write(' STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), last))`

			`# Output the property type table`

			`f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')`
			`offset = 0`
			`last = ','`
			`for utt in utt_table:`
			`if utt[2] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',`
			`'PT_SPACE', 'PT_UCNC', 'PT_WORD', 'PT_BIDICO'):`
			`value = '0'`
			`else:`
			`value = 'ucp_' + utt[1]`
			`if utt == utt_table[-1]:`
			`last = ''`
			`f.write(' { %3d, %s, %s }%s\n' % (offset, utt[2], value, last))`
			`offset += len(utt[0]) + 1`
			`f.write('};\n\n')`

			`# Ending text`

			`f.write("""\`
			`const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);`

			`#endif /* SUPPORT_UNICODE */`

			`/* End of pcre2_ucptables.c */`
			`""")`

			`f.close`

			`# End`