Update the two Python maintenance scripts for Python 3.

This commit is contained in:
Philip.Hazel 2014-06-03 16:26:20 +00:00
parent 1b4bcb79ae
commit 2801d5d132
2 changed files with 104 additions and 95 deletions

View File

@ -1,8 +1,6 @@
#! /usr/bin/python #! /usr/bin/python
# Generate utt tables. Note: this script is written in Python 2 and is # Generate utt tables. Note: this script has now been converted to Python 3.
# incompatible with Python 3. However, the 2to3 conversion script has been
# successfully tested on it.
# The source file pcre2_tables.c contains (amongst other things), a table that # The source file pcre2_tables.c contains (amongst other things), a table that
# is indexed by script name. In order to reduce the number of relocations when # is indexed by script name. In order to reduce the number of relocations when
@ -22,6 +20,7 @@
# necessary for Unicode 6.2.0 support. # necessary for Unicode 6.2.0 support.
# Modified by PH 26-February-2013 to add the Xuc special category. # Modified by PH 26-February-2013 to add the Xuc special category.
# Comment modified by PH 13-May-2014 to update to PCRE2 file names. # Comment modified by PH 13-May-2014 to update to PCRE2 file names.
# Script updated to Python 3 by running it through the 2to3 converter.
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \ script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \ 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
@ -53,9 +52,9 @@ general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']
# First add the Unicode script and category names. # First add the Unicode script and category names.
utt_table = zip(script_names, ['PT_SC'] * len(script_names)) utt_table = list(zip(script_names, ['PT_SC'] * len(script_names)))
utt_table += zip(category_names, ['PT_PC'] * len(category_names)) utt_table += list(zip(category_names, ['PT_PC'] * len(category_names)))
utt_table += zip(general_category_names, ['PT_GC'] * len(general_category_names)) utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names)))
# Now add our own specials. # Now add our own specials.
@ -75,29 +74,29 @@ utt_table.sort()
# UTF-8 mode on EBCDIC platforms. # UTF-8 mode on EBCDIC platforms.
for utt in utt_table: for utt in utt_table:
print '#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ')
for c in utt[0]: for c in utt[0]:
if c == '_': if c == '_':
print 'STR_UNDERSCORE', print('STR_UNDERSCORE', end=' ')
elif c == '&': elif c == '&':
print 'STR_AMPERSAND', print('STR_AMPERSAND', end=' ')
else: else:
print 'STR_%s' % c,; print('STR_%s' % c, end=' ');
print '"\\0"' print('"\\0"')
# Print the actual table, using the string names # Print the actual table, using the string names
print '' print('')
print 'const char PRIV(utt_names)[] ='; print('const char PRIV(utt_names)[] =');
last = '' last = ''
for utt in utt_table: for utt in utt_table:
if utt == utt_table[-1]: if utt == utt_table[-1]:
last = ';' last = ';'
print ' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last) print(' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
# This was how it was done before the EBCDIC-compatible modification. # This was how it was done before the EBCDIC-compatible modification.
# print ' "%s\\0"%s' % (utt[0], last) # print ' "%s\\0"%s' % (utt[0], last)
print '\nconst ucp_type_table PRIV(utt)[] = {' print('\nconst ucp_type_table PRIV(utt)[] = {')
offset = 0 offset = 0
last = ',' last = ','
for utt in utt_table: for utt in utt_table:
@ -108,6 +107,6 @@ for utt in utt_table:
value = 'ucp_' + utt[0] value = 'ucp_' + utt[0]
if utt == utt_table[-1]: if utt == utt_table[-1]:
last = '' last = ''
print ' { %3d, %s, %s }%s' % (offset, utt[1], value, last) print(' { %3d, %s, %s }%s' % (offset, utt[1], value, last))
offset += len(utt[0]) + 1 offset += len(utt[0]) + 1
print '};' print('};')

View File

@ -10,9 +10,10 @@
# generate the pcre_ucd.c file that contains a digested form of the Unicode # generate the pcre_ucd.c file that contains a digested form of the Unicode
# data tables. # data tables.
# #
# The script should be run in the maint subdirectory, using the command # The script has now been upgraded to Python 3 for PCRE2, and should be run in
# the maint subdirectory, using the command
# #
# [python2] ./MultiStage2.py >../src/pcre2_ucd.c # [python3] ./MultiStage2.py >../src/pcre2_ucd.c
# #
# It requires four Unicode data tables, DerivedGeneralCategory.txt, # It requires four Unicode data tables, DerivedGeneralCategory.txt,
# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the # GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
@ -42,6 +43,13 @@
# offsets into the table are added to the main output records. This new # offsets into the table are added to the main output records. This new
# code scans CaseFolding.txt instead of UnicodeData.txt. # code scans CaseFolding.txt instead of UnicodeData.txt.
# #
# Update for Python3:
# . Processed with 2to3, but that didn't fix everything
# . Changed string.strip to str.strip
# . Added encoding='utf-8' to the open() call
# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
# required and the result of the division is a float
#
# The main tables generated by this script are used by macros defined in # The main tables generated by this script are used by macros defined in
# pcre2_internal.h. They look up Unicode character properties using short # pcre2_internal.h. They look up Unicode character properties using short
# sequences of code that contains no branches, which makes for greater speed. # sequences of code that contains no branches, which makes for greater speed.
@ -110,6 +118,7 @@
# final hole in the structure. # final hole in the structure.
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0 # 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
# 13-May-2014: Updated for PCRE2 # 13-May-2014: Updated for PCRE2
# 03-June-2014: Updated for Python 3
############################################################################## ##############################################################################
@ -133,11 +142,11 @@ def get_other_case(chardata):
# Read the whole table in memory # Read the whole table in memory
def read_table(file_name, get_value, default_value): def read_table(file_name, get_value, default_value):
file = open(file_name, 'r') file = open(file_name, 'r', encoding='utf-8')
table = [default_value] * MAX_UNICODE table = [default_value] * MAX_UNICODE
for line in file: for line in file:
line = re.sub(r'#.*', '', line) line = re.sub(r'#.*', '', line)
chardata = map(string.strip, line.split(';')) chardata = list(map(str.strip, line.split(';')))
if len(chardata) <= 1: if len(chardata) <= 1:
continue continue
value = get_value(chardata) value = get_value(chardata)
@ -170,7 +179,7 @@ def get_type_size(table):
if minlimit <= minval and maxval <= maxlimit: if minlimit <= minval and maxval <= maxlimit:
return type_size[num] return type_size[num]
else: else:
raise OverflowError, "Too large to fit into C types" raise OverflowError("Too large to fit into C types")
def get_tables_size(*tables): def get_tables_size(*tables):
total_size = 0 total_size = 0
@ -205,13 +214,13 @@ def print_table(table, table_name, block_size = None):
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table)) s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
if block_size: if block_size:
s += ", block = %d" % block_size s += ", block = %d" % block_size
print s + " */" print(s + " */")
table = tuple(table) table = tuple(table)
if block_size is None: if block_size is None:
fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */" fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
mult = MAX_UNICODE / len(table) mult = MAX_UNICODE / len(table)
for i in range(0, len(table), ELEMS_PER_LINE): for i in range(0, len(table), ELEMS_PER_LINE):
print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,)) print(fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,)))
else: else:
if block_size > ELEMS_PER_LINE: if block_size > ELEMS_PER_LINE:
el = ELEMS_PER_LINE el = ELEMS_PER_LINE
@ -219,10 +228,10 @@ def print_table(table, table_name, block_size = None):
el = block_size el = block_size
fmt = "%3d," * el + "\n" fmt = "%3d," * el + "\n"
if block_size > ELEMS_PER_LINE: if block_size > ELEMS_PER_LINE:
fmt = fmt * (block_size / ELEMS_PER_LINE) fmt = fmt * int(block_size / ELEMS_PER_LINE)
for i in range(0, len(table), block_size): for i in range(0, len(table), block_size):
print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]) print(("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
print "};\n" print("};\n")
# Extract the unique combinations of properties into records # Extract the unique combinations of properties into records
def combine_tables(*tables): def combine_tables(*tables):
@ -241,7 +250,7 @@ def get_record_size_struct(records):
'types in this structure definition from pcre2_internal.h (the actual\n' + \ 'types in this structure definition from pcre2_internal.h (the actual\n' + \
'field names will be different):\n\ntypedef struct {\n' 'field names will be different):\n\ntypedef struct {\n'
for i in range(len(records[0])): for i in range(len(records[0])):
record_slice = map(lambda record: record[i], records) record_slice = [record[i] for record in records]
slice_type, slice_size = get_type_size(record_slice) slice_type, slice_size = get_type_size(record_slice)
# add padding: round up to the nearest power of slice_size # add padding: round up to the nearest power of slice_size
size = (size + slice_size - 1) & -slice_size size = (size + slice_size - 1) & -slice_size
@ -249,7 +258,7 @@ def get_record_size_struct(records):
structure += '%s property_%d;\n' % (slice_type, i) structure += '%s property_%d;\n' % (slice_type, i)
# round up to the first item of the next structure in array # round up to the first item of the next structure in array
record_slice = map(lambda record: record[0], records) record_slice = [record[0] for record in records]
slice_type, slice_size = get_type_size(record_slice) slice_type, slice_size = get_type_size(record_slice)
size = (size + slice_size - 1) & -slice_size size = (size + slice_size - 1) & -slice_size
@ -273,13 +282,14 @@ def test_record_size():
#print struct #print struct
def print_records(records, record_size): def print_records(records, record_size):
print 'const ucd_record PRIV(ucd_records)[] = { ' + \ print('const ucd_record PRIV(ucd_records)[] = { ' + \
'/* %d bytes, record size %d */' % (len(records) * record_size, record_size) '/* %d bytes, record size %d */' % (len(records) * record_size, record_size))
records = zip(records.keys(), records.values())
records.sort(None, lambda x: x[1]) records = list(zip(list(records.keys()), list(records.values())))
records.sort(key = lambda x: x[1])
for i, record in enumerate(records): for i, record in enumerate(records):
print (' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)) print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
print '};\n' print('};\n')
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \ script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \ 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
@ -393,10 +403,10 @@ for s in sets:
table, records = combine_tables(script, category, break_props, table, records = combine_tables(script, category, break_props,
caseless_offsets, other_case) caseless_offsets, other_case)
record_size, record_struct = get_record_size_struct(records.keys()) record_size, record_struct = get_record_size_struct(list(records.keys()))
# Find the optimum block size for the two-stage table # Find the optimum block size for the two-stage table
min_size = sys.maxint min_size = sys.maxsize
for block_size in [2 ** i for i in range(5,10)]: for block_size in [2 ** i for i in range(5,10)]:
size = len(records) * record_size size = len(records) * record_size
stage1, stage2 = compress_table(table, block_size) stage1, stage2 = compress_table(table, block_size)
@ -407,76 +417,76 @@ for block_size in [2 ** i for i in range(5,10)]:
min_stage1, min_stage2 = stage1, stage2 min_stage1, min_stage2 = stage1, stage2
min_block_size = block_size min_block_size = block_size
print "/* This module is generated by the maint/MultiStage2.py script." print("/* This module is generated by the maint/MultiStage2.py script.")
print "Do not modify it by hand. Instead modify the script and run it" print("Do not modify it by hand. Instead modify the script and run it")
print "to regenerate this code." print("to regenerate this code.")
print print()
print "As well as being part of the PCRE2 library, this module is #included" print("As well as being part of the PCRE2 library, this module is #included")
print "by the pcre2test program, which redefines the PRIV macro to change" print("by the pcre2test program, which redefines the PRIV macro to change")
print "table names from _pcre2_xxx to xxxx, thereby avoiding name clashes" print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
print "with the library. At present, just one of these tables is actually" print("with the library. At present, just one of these tables is actually")
print "needed. */" print("needed. */")
print print()
print "#ifndef PCRE2_INCLUDED" print("#ifndef PCRE2_INCLUDED")
print print()
print "#ifdef HAVE_CONFIG_H" print("#ifdef HAVE_CONFIG_H")
print "#include \"config.h\"" print("#include \"config.h\"")
print "#endif" print("#endif")
print print()
print "#include \"pcre2_internal.h\"" print("#include \"pcre2_internal.h\"")
print print()
print "#endif /* PCRE2_INCLUDED */" print("#endif /* PCRE2_INCLUDED */")
print print()
print "/* Unicode character database. */" print("/* Unicode character database. */")
print "/* This file was autogenerated by the MultiStage2.py script. */" print("/* This file was autogenerated by the MultiStage2.py script. */")
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size) print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size))
print print()
print "/* The tables herein are needed only when UCP support is built," print("/* The tables herein are needed only when UCP support is built,")
print "and in PCRE2 that happens automatically with UTF support." print("and in PCRE2 that happens automatically with UTF support.")
print "This module should not be referenced otherwise, so" print("This module should not be referenced otherwise, so")
print "it should not matter whether it is compiled or not. However" print("it should not matter whether it is compiled or not. However")
print "a comment was received about space saving - maybe the guy linked" print("a comment was received about space saving - maybe the guy linked")
print "all the modules rather than using a library - so we include a" print("all the modules rather than using a library - so we include a")
print "condition to cut out the tables when not needed. But don't leave" print("condition to cut out the tables when not needed. But don't leave")
print "a totally empty module because some compilers barf at that." print("a totally empty module because some compilers barf at that.")
print "Instead, just supply small dummy tables. */" print("Instead, just supply small dummy tables. */")
print print()
print "#ifndef SUPPORT_UTF" print("#ifndef SUPPORT_UTF")
print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};" print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};")
print "const uint8_t PRIV(ucd_stage1)[] = {0};" print("const uint8_t PRIV(ucd_stage1)[] = {0};")
print "const uint16_t PRIV(ucd_stage2)[] = {0};" print("const uint16_t PRIV(ucd_stage2)[] = {0};")
print "const uint32_t PRIV(ucd_caseless_sets)[] = {0};" print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
print "#else" print("#else")
print print()
print record_struct print(record_struct)
# --- Added by PH: output the table of caseless character sets --- # --- Added by PH: output the table of caseless character sets ---
print "const uint32_t PRIV(ucd_caseless_sets)[] = {" print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
print " NOTACHAR," print(" NOTACHAR,")
for s in sets: for s in sets:
s = sorted(s) s = sorted(s)
for x in s: for x in s:
print ' 0x%04x,' % x, print(' 0x%04x,' % x, end=' ')
print ' NOTACHAR,' print(' NOTACHAR,')
print '};' print('};')
print print()
# ------ # ------
print "/* When #included in pcre2test, we don't need this large table. */" print("/* When #included in pcre2test, we don't need this large table. */")
print print()
print "#ifndef PCRE2_INCLUDED" print("#ifndef PCRE2_INCLUDED")
print print()
print_records(records, record_size) print_records(records, record_size)
print_table(min_stage1, 'PRIV(ucd_stage1)') print_table(min_stage1, 'PRIV(ucd_stage1)')
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size) print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
print "#if UCD_BLOCK_SIZE != %d" % min_block_size print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
print "#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h" print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
print "#endif" print("#endif")
print "#endif /* SUPPORT_UTF */" print("#endif /* SUPPORT_UTF */")
print print()
print "#endif /* PCRE2_INCLUDED */" print("#endif /* PCRE2_INCLUDED */")
""" """