2014-05-13 13:20:03 +02:00
|
|
|
#! /usr/bin/python
|
|
|
|
|
2014-06-03 18:26:20 +02:00
|
|
|
# Generate utt tables. Note: this script has now been converted to Python 3.
|
2014-05-13 13:20:03 +02:00
|
|
|
|
|
|
|
# The source file pcre2_tables.c contains (amongst other things), a table that
|
|
|
|
# is indexed by script name. In order to reduce the number of relocations when
|
|
|
|
# loading the library, the names are held as a single large string, with
|
|
|
|
# offsets in the table. This is tedious to maintain by hand. Therefore, this
|
|
|
|
# script is used to generate the table. The output is sent to stdout; usually
|
|
|
|
# that should be directed to a temporary file. Then pcre2_tables.c can be
|
|
|
|
# edited by replacing the relevant definitions and table therein with the
|
|
|
|
# temporary file.
|
|
|
|
|
|
|
|
# Modified by PH 17-March-2009 to generate the more verbose form that works
|
|
|
|
# for UTF-support in EBCDIC as well as ASCII environments.
|
|
|
|
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
|
|
|
|
# Modified by PH 04-May-2010 to add new "X.." special categories.
|
|
|
|
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
|
|
|
|
# Modified by ChPe 30-September-2012 to add this note; no other changes were
|
|
|
|
# necessary for Unicode 6.2.0 support.
|
|
|
|
# Modified by PH 26-February-2013 to add the Xuc special category.
|
|
|
|
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
2014-06-03 18:26:20 +02:00
|
|
|
# Script updated to Python 3 by running it through the 2to3 converter.
|
2014-06-20 14:40:32 +02:00
|
|
|
# Added script names for Unicode 7.0.0, 20-June-2014.
|
2015-07-17 17:44:51 +02:00
|
|
|
# Added script names for Unicode 8.0.0, 19-June-2015.
|
2017-07-02 18:32:01 +02:00
|
|
|
# Added script names for Unicode 10.0.0, 02-July-2017.
|
2018-07-07 18:10:29 +02:00
|
|
|
# Added script names for Unicode 11.0.0, 03-July-2018.
|
2018-10-02 17:25:58 +02:00
|
|
|
# Added 'Unknown' script, 01-October-2018.
|
2019-07-29 17:32:36 +02:00
|
|
|
# Added script names for Unicode 12.1.0, 27-July-2019.
|
2020-03-25 18:18:33 +01:00
|
|
|
# Added script names for Unicode 13.0.0, 10-March-2020.
|
2021-10-29 15:44:17 +02:00
|
|
|
# Added Script names for Unicode 14.0.0, PCRE2-10.39
|
2021-12-08 16:34:27 +01:00
|
|
|
# Added support for bidi class and bidi control, 06-December-2021
|
|
|
|
# This also involved lower casing strings and removing underscores, in
|
|
|
|
# accordance with Unicode's "loose matching" rules, which Perl observes.
|
2014-05-13 13:20:03 +02:00
|
|
|
|
2018-10-02 17:25:58 +02:00
|
|
|
# Unicode script names, grouped by the Unicode release that introduced them.
# The order within this list is not significant for the output, because the
# combined table is sorted before anything is printed.
script_names = [
    'Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille',
    'Buginese', 'Buhid', 'Canadian_Aboriginal', 'Cherokee', 'Common',
    'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic',
    'Georgian', 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi',
    'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', 'Inherited',
    'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu',
    'Linear_B', 'Malayalam', 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham',
    'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', 'Shavian',
    'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le',
    'Tamil', 'Telugu', 'Thaana', 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic',
    'Yi',
    # New for Unicode 5.0
    'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
    # New for Unicode 5.1
    'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki',
    'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
    # New for Unicode 5.2
    'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
    'Inscriptional_Pahlavi', 'Inscriptional_Parthian', 'Javanese', 'Kaithi',
    'Lisu', 'Meetei_Mayek', 'Old_South_Arabian', 'Old_Turkic', 'Samaritan',
    'Tai_Tham', 'Tai_Viet',
    # New for Unicode 6.0.0
    'Batak', 'Brahmi', 'Mandaic',
    # New for Unicode 6.1.0
    'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada',
    'Sora_Sompeng', 'Takri',
    # New for Unicode 7.0.0
    'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha',
    'Khojki', 'Khudawadi', 'Linear_A', 'Mahajani', 'Manichaean',
    'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean', 'Old_North_Arabian',
    'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
    'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
    # New for Unicode 8.0.0
    'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
    'SignWriting',
    # New for Unicode 10.0.0
    'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut',
    'Masaram_Gondi', 'Nushu', 'Soyombo', 'Zanabazar_Square',
    # New for Unicode 11.0.0
    'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
    'Old_Sogdian', 'Sogdian',
    # New for Unicode 12.0.0
    'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
    # New for Unicode 13.0.0
    'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi',
    # New for Unicode 14.0.0
    'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi',
]
|
|
|
|
|
|
|
|
# Two-letter category names, one row per leading letter.
category_names = [
    'Cc', 'Cf', 'Cn', 'Co', 'Cs',
    'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn',
    'Nd', 'Nl', 'No',
    'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
    'Sc', 'Sk', 'Sm', 'So',
    'Zl', 'Zp', 'Zs',
]

# Single-letter category names.
general_category_names = [
    'C',
    'L',
    'M',
    'N',
    'P',
    'S',
    'Z',
]

# Bidi class names; each carries a 'bidi' prefix.
bidiclass_names = [
    'bidiAL', 'bidiAN', 'bidiB', 'bidiBN', 'bidiCS',
    'bidiEN', 'bidiES', 'bidiET', 'bidiFSI', 'bidiL',
    'bidiLRE', 'bidiLRI', 'bidiLRO', 'bidiNSM', 'bidiON',
    'bidiPDF', 'bidiPDI', 'bidiR', 'bidiRLE', 'bidiRLI',
    'bidiRLO', 'bidiS', 'bidiWS',
]
|
|
|
|
|
|
|
|
# Create standardized versions of the names by lowercasing and removing
|
|
|
|
# underscores.
|
|
|
|
|
|
|
|
def stdnames(x):
    """Return the standardized forms of the names in x.

    Each name is lowercased and has its underscores removed. Other
    characters, such as '&', are left untouched.

    x is any iterable of strings; it is not modified. The result is a new
    list holding the standardized names in the same order.
    """
    return [name.lower().replace('_', '') for name in x]
|
|
|
|
|
|
|
|
# Standardized counterparts of the four name lists, produced in the same
# order as the lists are passed in.
(std_script_names,
 std_category_names,
 std_general_category_names,
 std_bidiclass_names) = map(stdnames, (script_names,
                                       category_names,
                                       general_category_names,
                                       bidiclass_names))
|
|
|
|
|
|
|
|
# Create the table, starting with the Unicode script, category and bidi class
|
|
|
|
# names. We keep both the standardized name and the original, because the
|
|
|
|
# latter is used for the ucp_xx names.
|
|
|
|
|
|
|
|
# Every entry is a (standardized name, original name, property type) tuple.
utt_table = [(std, name, 'PT_SC')
             for std, name in zip(std_script_names, script_names)]
utt_table += [(std, name, 'PT_PC')
              for std, name in zip(std_category_names, category_names)]
utt_table += [(std, name, 'PT_GC')
              for std, name in zip(std_general_category_names,
                                   general_category_names)]
utt_table += [(std, name, 'PT_BIDICL')
              for std, name in zip(std_bidiclass_names, bidiclass_names)]
|
|
|
|
|
|
|
|
# Now add our own specials. Note both the standardized and capitalized forms
|
|
|
|
# are needed.
|
|
|
|
|
|
|
|
# Special entries, in the same (standardized, original, type) layout as the
# rest of the table. 'l&' and 'lc' both map to PT_LAMP.
utt_table += [
    ('any', 'Any', 'PT_ANY'),
    ('bidicontrol', 'Bidi_Control', 'PT_BIDICO'),
    ('l&', 'L&', 'PT_LAMP'),
    ('lc', 'LC', 'PT_LAMP'),
    ('xan', 'Xan', 'PT_ALNUM'),
    ('xps', 'Xps', 'PT_PXSPACE'),
    ('xsp', 'Xsp', 'PT_SPACE'),
    ('xuc', 'Xuc', 'PT_UCNC'),
    ('xwd', 'Xwd', 'PT_WORD'),
]

# Sort the table in place. Tuples compare element by element, so the
# standardized name is the primary sort key.
utt_table.sort()
|
|
|
|
|
|
|
|
# We have to use STR_ macros to define the strings so that it all works in
|
|
|
|
# UTF-8 mode on EBCDIC platforms.
|
|
|
|
|
|
|
|
# Emit one "#define STRING_<name>0 STR_<c> STR_<c> ... "\0"" line per table
# entry. An '&' in the name is spelled out as _AMPERSAND in the macro name
# and as STR_AMPERSAND in the character list.
for utt in utt_table:
    pieces = ['#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND'))]
    pieces.extend('STR_AMPERSAND' if c == '&' else 'STR_%s' % c
                  for c in utt[0])
    pieces.append('"\\0"')
    print(' '.join(pieces))
|
2014-05-13 13:20:03 +02:00
|
|
|
|
|
|
|
# Print the actual table, using the string names
|
|
|
|
|
2014-06-03 18:26:20 +02:00
|
|
|
# Emit the utt_names definition: a blank line, the declaration, then one
# STRING_ macro reference per table entry.
print('\nconst char PRIV(utt_names)[] =')

last = ''
for utt in utt_table:
    # The terminating ';' is attached to the entry equal to the final one.
    last = ';' if utt == utt_table[-1] else last
    macro = 'STRING_' + utt[0].replace('&', '_AMPERSAND') + '0'
    print(' ' + macro + last)
|
2014-05-13 13:20:03 +02:00
|
|
|
|
2014-06-03 18:26:20 +02:00
|
|
|
# Emit the utt table: one { offset, type, value } initializer per entry.
print('\nconst ucp_type_table PRIV(utt)[] = {')

# Property types whose value field is always 0 instead of a ucp_ name.
valueless_types = ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
                   'PT_SPACE', 'PT_UCNC', 'PT_WORD', 'PT_BIDICO')

offset = 0
last = ','
for utt in utt_table:
    std_name, orig_name, prop_type = utt
    value = '0' if prop_type in valueless_types else 'ucp_' + orig_name
    # Every initializer is followed by a comma except the final one.
    if utt == utt_table[-1]:
        last = ''
    print(' { %3d, %s, %s }%s' % (offset, prop_type, value, last))
    # Advance past this name plus one extra character, matching the "\0"
    # that ends each STRING_ definition.
    offset += len(std_name) + 1

print('};')
|