pcre2/maint/GenerateUtt.py

#! /usr/bin/python

# Generate utt tables. Note: this script has now been converted to Python 3.

# The source file pcre2_tables.c contains (amongst other things), a table that
# is indexed by script name. In order to reduce the number of relocations when
# loading the library, the names are held as a single large string, with
# offsets in the table. This is tedious to maintain by hand. Therefore, this
# script is used to generate the table. The output is sent to stdout; usually
# that should be directed to a temporary file. Then pcre2_tables.c can be
# edited by replacing the relevant definitions and table therein with the
# temporary file.

# Modified by PH 17-March-2009 to generate the more verbose form that works
# for UTF-support in EBCDIC as well as ASCII environments.
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
# Modified by PH 04-May-2010 to add new "X.." special categories.
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
# Modified by ChPe 30-September-2012 to add this note; no other changes were
# necessary for Unicode 6.2.0 support.
# Modified by PH 26-February-2013 to add the Xuc special category.
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
# Script updated to Python 3 by running it through the 2to3 converter.
# Added script names for Unicode 7.0.0, 20-June-2014.
# Added script names for Unicode 8.0.0, 19-June-2015.
# Added script names for Unicode 10.0.0, 02-July-2017.
# Added script names for Unicode 11.0.0, 03-July-2018.
# Added 'Unknown' script, 01-October-2018.
# Added script names for Unicode 12.1.0, 27-July-2019.
# Added script names for Unicode 13.0.0, 10-March-2020.
# Added Script names for Unicode 14.0.0, PCRE2-10.39

# Unicode script names known to the generator. Entries are grouped by the
# Unicode release whose update added them to this list; new releases append
# a new group at the end.
script_names = [
    'Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille',
    'Buginese', 'Buhid', 'Canadian_Aboriginal', 'Cherokee', 'Common',
    'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic',
    'Georgian', 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi',
    'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', 'Inherited',
    'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu',
    'Linear_B', 'Malayalam', 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham',
    'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', 'Shavian',
    'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le',
    'Tamil', 'Telugu', 'Thaana', 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic',
    'Yi',

    # New for Unicode 5.0
    'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',

    # New for Unicode 5.1
    'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki',
    'Rejang', 'Saurashtra', 'Sundanese', 'Vai',

    # New for Unicode 5.2
    'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
    'Inscriptional_Pahlavi', 'Inscriptional_Parthian', 'Javanese', 'Kaithi',
    'Lisu', 'Meetei_Mayek', 'Old_South_Arabian', 'Old_Turkic', 'Samaritan',
    'Tai_Tham', 'Tai_Viet',

    # New for Unicode 6.0.0
    'Batak', 'Brahmi', 'Mandaic',

    # New for Unicode 6.1.0
    'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada',
    'Sora_Sompeng', 'Takri',

    # New for Unicode 7.0.0
    'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha',
    'Khojki', 'Khudawadi', 'Linear_A', 'Mahajani', 'Manichaean',
    'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean', 'Old_North_Arabian',
    'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
    'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',

    # New for Unicode 8.0.0
    'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
    'SignWriting',

    # New for Unicode 10.0.0
    'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut',
    'Masaram_Gondi', 'Nushu', 'Soyombo', 'Zanabazar_Square',

    # New for Unicode 11.0.0
    'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
    'Old_Sogdian', 'Sogdian',

    # New for Unicode 12.0.0
    'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',

    # New for Unicode 13.0.0
    'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi',

    # New for Unicode 14.0.0
    'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi',
]

# Two-letter Unicode general category names, one row per leading letter.
category_names = (
    'Cc Cf Cn Co Cs '
    'Ll Lm Lo Lt Lu '
    'Mc Me Mn '
    'Nd Nl No '
    'Pc Pd Pe Pf Pi Po Ps '
    'Sc Sk Sm So '
    'Zl Zp Zs'
).split()

# One-letter names covering each group of the categories above.
general_category_names = list('CLMNPSZ')

# Build the table as (name, property type) pairs, starting with the Unicode
# script names, then the two-letter and one-letter category names.

utt_table = [(name, 'PT_SC') for name in script_names]
utt_table.extend((name, 'PT_PC') for name in category_names)
utt_table.extend((name, 'PT_GC') for name in general_category_names)

# Our own special property names, which are not Unicode scripts or categories.

utt_table.extend([
    ('Any', 'PT_ANY'),
    ('L&',  'PT_LAMP'),
    ('Xan', 'PT_ALNUM'),
    ('Xps', 'PT_PXSPACE'),
    ('Xsp', 'PT_SPACE'),
    ('Xuc', 'PT_UCNC'),
    ('Xwd', 'PT_WORD'),
])

# Put the whole table in sorted order (tuples compare by name first).

utt_table.sort()

# Each name is spelled out as a sequence of STR_ macros, one per character,
# so that the generated strings work in UTF-8 mode on EBCDIC platforms.
# Characters that cannot appear in a macro name get a spelled-out macro.

spelled_out = {'_': 'STR_UNDERSCORE', '&': 'STR_AMPERSAND'}

for entry in utt_table:
    name = entry[0]
    pieces = ['#define STRING_%s0' % (name.replace('&', '_AMPERSAND'))]
    pieces.extend(spelled_out.get(ch, 'STR_%s' % ch) for ch in name)
    # Every string ends with an explicit NUL so the names can be concatenated.
    pieces.append('"\\0"')
    print(' '.join(pieces))

# Emit the single large names string, built from the STRING_ macros defined
# for each entry. The row matching the final table entry carries the ';' that
# closes the C definition. (Before the EBCDIC-compatible modification each
# row was simply the quoted name followed by \0.)

print('')
print('const char PRIV(utt_names)[] =')
final_entry = utt_table[-1] if utt_table else None
terminator = ''
for entry in utt_table:
    if entry == final_entry:
        terminator = ';'
    macro_name = entry[0].replace('&', '_AMPERSAND')
    print('  STRING_%s0%s' % (macro_name, terminator))

# Emit the lookup table itself: one row per name, giving the offset of the
# name within the names string, its property type, and its value.

special_types = frozenset((
    'PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
    'PT_SPACE', 'PT_UCNC', 'PT_WORD',
))

print('\nconst ucp_type_table PRIV(utt)[] = {')
final_entry = utt_table[-1] if utt_table else None
position = 0
separator = ','
for entry in utt_table:
    name = entry[0]
    prop_type = entry[1]
    # The special properties have value 0; all others use the ucp_ constant
    # named after the script or category.
    value = '0' if prop_type in special_types else 'ucp_' + name
    if entry == final_entry:
        separator = ''
    print('  { %3d, %s, %s }%s' % (position, prop_type, value, separator))
    # Advance past this name and its terminating \0 in the names string.
    position += len(name) + 1
print('};')
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00			`#! /usr/bin/python`

Update the two Python maintenance scripts for Python 3. 2014-06-03 18:26:20 +02:00			`# Generate utt tables. Note: this script has now been converted to Python 3.`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00
			`# The source file pcre2_tables.c contains (amongst other things), a table that`
			`# is indexed by script name. In order to reduce the number of relocations when`
			`# loading the library, the names are held as a single large string, with`
			`# offsets in the table. This is tedious to maintain by hand. Therefore, this`
			`# script is used to generate the table. The output is sent to stdout; usually`
			`# that should be directed to a temporary file. Then pcre2_tables.c can be`
			`# edited by replacing the relevant definitions and table therein with the`
			`# temporary file.`

			`# Modified by PH 17-March-2009 to generate the more verbose form that works`
			`# for UTF-support in EBCDIC as well as ASCII environments.`
			`# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.`
			`# Modified by PH 04-May-2010 to add new "X.." special categories.`
			`# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0`
			`# Modified by ChPe 30-September-2012 to add this note; no other changes were`
			`# necessary for Unicode 6.2.0 support.`
			`# Modfied by PH 26-February-2013 to add the Xuc special category.`
			`# Comment modified by PH 13-May-2014 to update to PCRE2 file names.`
Update the two Python maintenance scripts for Python 3. 2014-06-03 18:26:20 +02:00			`# Script updated to Python 3 by running it through the 2to3 converter.`
Update for Unicode 7.0.0 2014-06-20 14:40:32 +02:00			`# Added script names for Unicode 7.0.0, 20-June-2014.`
Update Unicode tables to 8.0.0. 2015-07-17 17:44:51 +02:00			`# Added script names for Unicode 8.0.0, 19-June-2015.`
Update to Unicode 10.0.0 and add callout_no_where to pcre2test to aid testing. 2017-07-02 18:32:01 +02:00			`# Added script names for Unicode 10.0.0, 02-July-2017.`
Update to Unicode 11.0.0 2018-07-07 18:10:29 +02:00			`# Added script names for Unicode 11.0.0, 03-July-2018.`
Basic "script run" implementation. Not yet complete, and not yet documented. 2018-10-02 17:25:58 +02:00			`# Added 'Unknown' script, 01-October-2018.`
Upgrade to Unicode 12.1.0 2019-07-29 17:32:36 +02:00			`# Added script names for Unicode 12.1.0, 27-July-2019.`
Update to Unicode 13.0.0. 2020-03-25 18:18:33 +01:00			`# Added script names for Unicode 13.0.0, 10-March-2020.`
Update to Unicode 14.0.0 (#29) 2021-10-29 15:44:17 +02:00			`# Added Script names for Unicode 14.0.0, PCRE2-10.39`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00
Basic "script run" implementation. Not yet complete, and not yet documented. 2018-10-02 17:25:58 +02:00			`script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00			`'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \`
			`'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \`
			`'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \`
			`'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \`
			`'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \`
			`'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \`
			`# New for Unicode 5.0`
			`'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \`
			`# New for Unicode 5.1`
			`'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \`
			`# New for Unicode 5.2`
			`'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \`
			`'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \`
			`'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \`
			`'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \`
			`# New for Unicode 6.0.0`
			`'Batak', 'Brahmi', 'Mandaic', \`
			`# New for Unicode 6.1.0`
Update for Unicode 7.0.0 2014-06-20 14:40:32 +02:00			`'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',`
			`# New for Unicode 7.0.0`
			`'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',`
			`'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',`
			`'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',`
Update Unicode tables to 8.0.0. 2015-07-17 17:44:51 +02:00			`'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',`
			`# New for Unicode 8.0.0`
			`'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',`
Update to Unicode 10.0.0 and add callout_no_where to pcre2test to aid testing. 2017-07-02 18:32:01 +02:00			`'SignWriting',`
			`# New for Unicode 10.0.0`
			`'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',`
Update to Unicode 11.0.0 2018-07-07 18:10:29 +02:00			`'Nushu', 'Soyombo', 'Zanabazar_Square',`
			`# New for Unicode 11.0.0`
			`'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',`
Upgrade to Unicode 12.1.0 2019-07-29 17:32:36 +02:00			`'Old_Sogdian', 'Sogdian',`
			`# New for Unicode 12.0.0`
Update to Unicode 13.0.0. 2020-03-25 18:18:33 +01:00			`'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',`
			`# New for Unicode 13.0.0`
Update to Unicode 14.0.0 (#29) 2021-10-29 15:44:17 +02:00			`'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi',`
			`# New for Unicode 14.0.0`
			`'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi'`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00			`]`

			`category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',`
			`'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',`
			`'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]`

			`general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']`

			`# First add the Unicode script and category names.`

Update the two Python maintenance scripts for Python 3. 2014-06-03 18:26:20 +02:00			`utt_table = list(zip(script_names, ['PT_SC'] * len(script_names)))`
			`utt_table += list(zip(category_names, ['PT_PC'] * len(category_names)))`
			`utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names)))`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00
			`# Now add our own specials.`

			`utt_table.append(('Any', 'PT_ANY'))`
			`utt_table.append(('L&', 'PT_LAMP'))`
			`utt_table.append(('Xan', 'PT_ALNUM'))`
			`utt_table.append(('Xps', 'PT_PXSPACE'))`
			`utt_table.append(('Xsp', 'PT_SPACE'))`
			`utt_table.append(('Xuc', 'PT_UCNC'))`
			`utt_table.append(('Xwd', 'PT_WORD'))`

			`# Sort the table.`

			`utt_table.sort()`

			`# We have to use STR_ macros to define the strings so that it all works in`
			`# UTF-8 mode on EBCDIC platforms.`

			`for utt in utt_table:`
Update the two Python maintenance scripts for Python 3. 2014-06-03 18:26:20 +02:00			`print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ')`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00			`for c in utt[0]:`
			`if c == '_':`
Update the two Python maintenance scripts for Python 3. 2014-06-03 18:26:20 +02:00			`print('STR_UNDERSCORE', end=' ')`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00			`elif c == '&':`
Update the two Python maintenance scripts for Python 3. 2014-06-03 18:26:20 +02:00			`print('STR_AMPERSAND', end=' ')`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00			`else:`
Update the two Python maintenance scripts for Python 3. 2014-06-03 18:26:20 +02:00			`print('STR_%s' % c, end=' ');`
			`print('"\\0"')`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00
			`# Print the actual table, using the string names`

Update the two Python maintenance scripts for Python 3. 2014-06-03 18:26:20 +02:00			`print('')`
			`print('const char PRIV(utt_names)[] =');`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00			`last = ''`
			`for utt in utt_table:`
			`if utt == utt_table[-1]:`
			`last = ';'`
Update the two Python maintenance scripts for Python 3. 2014-06-03 18:26:20 +02:00			`print(' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00			`# This was how it was done before the EBCDIC-compatible modification.`
			`# print ' "%s\\0"%s' % (utt[0], last)`

Update the two Python maintenance scripts for Python 3. 2014-06-03 18:26:20 +02:00			`print('\nconst ucp_type_table PRIV(utt)[] = {')`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00			`offset = 0`
			`last = ','`
			`for utt in utt_table:`
			`if utt[1] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',`
			`'PT_SPACE', 'PT_UCNC', 'PT_WORD'):`
			`value = '0'`
			`else:`
			`value = 'ucp_' + utt[0]`
			`if utt == utt_table[-1]:`
			`last = ''`
Update the two Python maintenance scripts for Python 3. 2014-06-03 18:26:20 +02:00			`print(' { %3d, %s, %s }%s' % (offset, utt[1], value, last))`
Further work on pcre2test (can now display compiled code). 2014-05-13 13:20:03 +02:00			`offset += len(utt[0]) + 1`
Update the two Python maintenance scripts for Python 3. 2014-06-03 18:26:20 +02:00			`print('};')`