Update the two Python maintenance scripts for Python 3.
This commit is contained in:
parent
1b4bcb79ae
commit
2801d5d132
|
@ -1,8 +1,6 @@
|
||||||
#! /usr/bin/python
|
#! /usr/bin/python
|
||||||
|
|
||||||
# Generate utt tables. Note: this script is written in Python 2 and is
|
# Generate utt tables. Note: this script has now been converted to Python 3.
|
||||||
# incompatible with Python 3. However, the 2to3 conversion script has been
|
|
||||||
# successfully tested on it.
|
|
||||||
|
|
||||||
# The source file pcre2_tables.c contains (amongst other things), a table that
|
# The source file pcre2_tables.c contains (amongst other things), a table that
|
||||||
# is indexed by script name. In order to reduce the number of relocations when
|
# is indexed by script name. In order to reduce the number of relocations when
|
||||||
|
@ -22,6 +20,7 @@
|
||||||
# necessary for Unicode 6.2.0 support.
|
# necessary for Unicode 6.2.0 support.
|
||||||
# Modified by PH 26-February-2013 to add the Xuc special category.
|
# Modified by PH 26-February-2013 to add the Xuc special category.
|
||||||
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
||||||
|
# Script updated to Python 3 by running it through the 2to3 converter.
|
||||||
|
|
||||||
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
||||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
||||||
|
@ -53,9 +52,9 @@ general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']
|
||||||
|
|
||||||
# First add the Unicode script and category names.
|
# First add the Unicode script and category names.
|
||||||
|
|
||||||
utt_table = zip(script_names, ['PT_SC'] * len(script_names))
|
utt_table = list(zip(script_names, ['PT_SC'] * len(script_names)))
|
||||||
utt_table += zip(category_names, ['PT_PC'] * len(category_names))
|
utt_table += list(zip(category_names, ['PT_PC'] * len(category_names)))
|
||||||
utt_table += zip(general_category_names, ['PT_GC'] * len(general_category_names))
|
utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names)))
|
||||||
|
|
||||||
# Now add our own specials.
|
# Now add our own specials.
|
||||||
|
|
||||||
|
@ -75,29 +74,29 @@ utt_table.sort()
|
||||||
# UTF-8 mode on EBCDIC platforms.
|
# UTF-8 mode on EBCDIC platforms.
|
||||||
|
|
||||||
for utt in utt_table:
|
for utt in utt_table:
|
||||||
print '#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')),
|
print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ')
|
||||||
for c in utt[0]:
|
for c in utt[0]:
|
||||||
if c == '_':
|
if c == '_':
|
||||||
print 'STR_UNDERSCORE',
|
print('STR_UNDERSCORE', end=' ')
|
||||||
elif c == '&':
|
elif c == '&':
|
||||||
print 'STR_AMPERSAND',
|
print('STR_AMPERSAND', end=' ')
|
||||||
else:
|
else:
|
||||||
print 'STR_%s' % c,;
|
print('STR_%s' % c, end=' ');
|
||||||
print '"\\0"'
|
print('"\\0"')
|
||||||
|
|
||||||
# Print the actual table, using the string names
|
# Print the actual table, using the string names
|
||||||
|
|
||||||
print ''
|
print('')
|
||||||
print 'const char PRIV(utt_names)[] =';
|
print('const char PRIV(utt_names)[] =');
|
||||||
last = ''
|
last = ''
|
||||||
for utt in utt_table:
|
for utt in utt_table:
|
||||||
if utt == utt_table[-1]:
|
if utt == utt_table[-1]:
|
||||||
last = ';'
|
last = ';'
|
||||||
print ' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last)
|
print(' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
|
||||||
# This was how it was done before the EBCDIC-compatible modification.
|
# This was how it was done before the EBCDIC-compatible modification.
|
||||||
# print ' "%s\\0"%s' % (utt[0], last)
|
# print ' "%s\\0"%s' % (utt[0], last)
|
||||||
|
|
||||||
print '\nconst ucp_type_table PRIV(utt)[] = {'
|
print('\nconst ucp_type_table PRIV(utt)[] = {')
|
||||||
offset = 0
|
offset = 0
|
||||||
last = ','
|
last = ','
|
||||||
for utt in utt_table:
|
for utt in utt_table:
|
||||||
|
@ -108,6 +107,6 @@ for utt in utt_table:
|
||||||
value = 'ucp_' + utt[0]
|
value = 'ucp_' + utt[0]
|
||||||
if utt == utt_table[-1]:
|
if utt == utt_table[-1]:
|
||||||
last = ''
|
last = ''
|
||||||
print ' { %3d, %s, %s }%s' % (offset, utt[1], value, last)
|
print(' { %3d, %s, %s }%s' % (offset, utt[1], value, last))
|
||||||
offset += len(utt[0]) + 1
|
offset += len(utt[0]) + 1
|
||||||
print '};'
|
print('};')
|
||||||
|
|
|
@ -10,9 +10,10 @@
|
||||||
# generate the pcre2_ucd.c file that contains a digested form of the Unicode
|
# generate the pcre2_ucd.c file that contains a digested form of the Unicode
|
||||||
# data tables.
|
# data tables.
|
||||||
#
|
#
|
||||||
# The script should be run in the maint subdirectory, using the command
|
# The script has now been upgraded to Python 3 for PCRE2, and should be run in
|
||||||
|
# the maint subdirectory, using the command
|
||||||
#
|
#
|
||||||
# [python2] ./MultiStage2.py >../src/pcre2_ucd.c
|
# [python3] ./MultiStage2.py >../src/pcre2_ucd.c
|
||||||
#
|
#
|
||||||
# It requires four Unicode data tables, DerivedGeneralCategory.txt,
|
# It requires four Unicode data tables, DerivedGeneralCategory.txt,
|
||||||
# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
|
# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
|
||||||
|
@ -42,6 +43,13 @@
|
||||||
# offsets into the table are added to the main output records. This new
|
# offsets into the table are added to the main output records. This new
|
||||||
# code scans CaseFolding.txt instead of UnicodeData.txt.
|
# code scans CaseFolding.txt instead of UnicodeData.txt.
|
||||||
#
|
#
|
||||||
|
# Update for Python3:
|
||||||
|
# . Processed with 2to3, but that didn't fix everything
|
||||||
|
# . Changed string.strip to str.strip
|
||||||
|
# . Added encoding='utf-8' to the open() call
|
||||||
|
# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
|
||||||
|
# required and the result of the division is a float
|
||||||
|
#
|
||||||
# The main tables generated by this script are used by macros defined in
|
# The main tables generated by this script are used by macros defined in
|
||||||
# pcre2_internal.h. They look up Unicode character properties using short
|
# pcre2_internal.h. They look up Unicode character properties using short
|
||||||
# sequences of code that contain no branches, which makes for greater speed.
|
# sequences of code that contain no branches, which makes for greater speed.
|
||||||
|
@ -110,6 +118,7 @@
|
||||||
# final hole in the structure.
|
# final hole in the structure.
|
||||||
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
|
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
|
||||||
# 13-May-2014: Updated for PCRE2
|
# 13-May-2014: Updated for PCRE2
|
||||||
|
# 03-June-2014: Updated for Python 3
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
@ -133,11 +142,11 @@ def get_other_case(chardata):
|
||||||
|
|
||||||
# Read the whole table in memory
|
# Read the whole table in memory
|
||||||
def read_table(file_name, get_value, default_value):
|
def read_table(file_name, get_value, default_value):
|
||||||
file = open(file_name, 'r')
|
file = open(file_name, 'r', encoding='utf-8')
|
||||||
table = [default_value] * MAX_UNICODE
|
table = [default_value] * MAX_UNICODE
|
||||||
for line in file:
|
for line in file:
|
||||||
line = re.sub(r'#.*', '', line)
|
line = re.sub(r'#.*', '', line)
|
||||||
chardata = map(string.strip, line.split(';'))
|
chardata = list(map(str.strip, line.split(';')))
|
||||||
if len(chardata) <= 1:
|
if len(chardata) <= 1:
|
||||||
continue
|
continue
|
||||||
value = get_value(chardata)
|
value = get_value(chardata)
|
||||||
|
@ -170,7 +179,7 @@ def get_type_size(table):
|
||||||
if minlimit <= minval and maxval <= maxlimit:
|
if minlimit <= minval and maxval <= maxlimit:
|
||||||
return type_size[num]
|
return type_size[num]
|
||||||
else:
|
else:
|
||||||
raise OverflowError, "Too large to fit into C types"
|
raise OverflowError("Too large to fit into C types")
|
||||||
|
|
||||||
def get_tables_size(*tables):
|
def get_tables_size(*tables):
|
||||||
total_size = 0
|
total_size = 0
|
||||||
|
@ -205,13 +214,13 @@ def print_table(table, table_name, block_size = None):
|
||||||
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
|
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
|
||||||
if block_size:
|
if block_size:
|
||||||
s += ", block = %d" % block_size
|
s += ", block = %d" % block_size
|
||||||
print s + " */"
|
print(s + " */")
|
||||||
table = tuple(table)
|
table = tuple(table)
|
||||||
if block_size is None:
|
if block_size is None:
|
||||||
fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
|
fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
|
||||||
mult = MAX_UNICODE / len(table)
|
mult = MAX_UNICODE / len(table)
|
||||||
for i in range(0, len(table), ELEMS_PER_LINE):
|
for i in range(0, len(table), ELEMS_PER_LINE):
|
||||||
print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
|
print(fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,)))
|
||||||
else:
|
else:
|
||||||
if block_size > ELEMS_PER_LINE:
|
if block_size > ELEMS_PER_LINE:
|
||||||
el = ELEMS_PER_LINE
|
el = ELEMS_PER_LINE
|
||||||
|
@ -219,10 +228,10 @@ def print_table(table, table_name, block_size = None):
|
||||||
el = block_size
|
el = block_size
|
||||||
fmt = "%3d," * el + "\n"
|
fmt = "%3d," * el + "\n"
|
||||||
if block_size > ELEMS_PER_LINE:
|
if block_size > ELEMS_PER_LINE:
|
||||||
fmt = fmt * (block_size / ELEMS_PER_LINE)
|
fmt = fmt * int(block_size / ELEMS_PER_LINE)
|
||||||
for i in range(0, len(table), block_size):
|
for i in range(0, len(table), block_size):
|
||||||
print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
|
print(("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
|
||||||
print "};\n"
|
print("};\n")
|
||||||
|
|
||||||
# Extract the unique combinations of properties into records
|
# Extract the unique combinations of properties into records
|
||||||
def combine_tables(*tables):
|
def combine_tables(*tables):
|
||||||
|
@ -241,7 +250,7 @@ def get_record_size_struct(records):
|
||||||
'types in this structure definition from pcre2_internal.h (the actual\n' + \
|
'types in this structure definition from pcre2_internal.h (the actual\n' + \
|
||||||
'field names will be different):\n\ntypedef struct {\n'
|
'field names will be different):\n\ntypedef struct {\n'
|
||||||
for i in range(len(records[0])):
|
for i in range(len(records[0])):
|
||||||
record_slice = map(lambda record: record[i], records)
|
record_slice = [record[i] for record in records]
|
||||||
slice_type, slice_size = get_type_size(record_slice)
|
slice_type, slice_size = get_type_size(record_slice)
|
||||||
# add padding: round up to the nearest multiple of slice_size
|
# add padding: round up to the nearest multiple of slice_size
|
||||||
size = (size + slice_size - 1) & -slice_size
|
size = (size + slice_size - 1) & -slice_size
|
||||||
|
@ -249,7 +258,7 @@ def get_record_size_struct(records):
|
||||||
structure += '%s property_%d;\n' % (slice_type, i)
|
structure += '%s property_%d;\n' % (slice_type, i)
|
||||||
|
|
||||||
# round up to the first item of the next structure in array
|
# round up to the first item of the next structure in array
|
||||||
record_slice = map(lambda record: record[0], records)
|
record_slice = [record[0] for record in records]
|
||||||
slice_type, slice_size = get_type_size(record_slice)
|
slice_type, slice_size = get_type_size(record_slice)
|
||||||
size = (size + slice_size - 1) & -slice_size
|
size = (size + slice_size - 1) & -slice_size
|
||||||
|
|
||||||
|
@ -273,13 +282,14 @@ def test_record_size():
|
||||||
#print struct
|
#print struct
|
||||||
|
|
||||||
def print_records(records, record_size):
|
def print_records(records, record_size):
|
||||||
print 'const ucd_record PRIV(ucd_records)[] = { ' + \
|
print('const ucd_record PRIV(ucd_records)[] = { ' + \
|
||||||
'/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
|
'/* %d bytes, record size %d */' % (len(records) * record_size, record_size))
|
||||||
records = zip(records.keys(), records.values())
|
|
||||||
records.sort(None, lambda x: x[1])
|
records = list(zip(list(records.keys()), list(records.values())))
|
||||||
|
records.sort(key = lambda x: x[1])
|
||||||
for i, record in enumerate(records):
|
for i, record in enumerate(records):
|
||||||
print (' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
|
print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
|
||||||
print '};\n'
|
print('};\n')
|
||||||
|
|
||||||
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
||||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
||||||
|
@ -393,10 +403,10 @@ for s in sets:
|
||||||
table, records = combine_tables(script, category, break_props,
|
table, records = combine_tables(script, category, break_props,
|
||||||
caseless_offsets, other_case)
|
caseless_offsets, other_case)
|
||||||
|
|
||||||
record_size, record_struct = get_record_size_struct(records.keys())
|
record_size, record_struct = get_record_size_struct(list(records.keys()))
|
||||||
|
|
||||||
# Find the optimum block size for the two-stage table
|
# Find the optimum block size for the two-stage table
|
||||||
min_size = sys.maxint
|
min_size = sys.maxsize
|
||||||
for block_size in [2 ** i for i in range(5,10)]:
|
for block_size in [2 ** i for i in range(5,10)]:
|
||||||
size = len(records) * record_size
|
size = len(records) * record_size
|
||||||
stage1, stage2 = compress_table(table, block_size)
|
stage1, stage2 = compress_table(table, block_size)
|
||||||
|
@ -407,76 +417,76 @@ for block_size in [2 ** i for i in range(5,10)]:
|
||||||
min_stage1, min_stage2 = stage1, stage2
|
min_stage1, min_stage2 = stage1, stage2
|
||||||
min_block_size = block_size
|
min_block_size = block_size
|
||||||
|
|
||||||
print "/* This module is generated by the maint/MultiStage2.py script."
|
print("/* This module is generated by the maint/MultiStage2.py script.")
|
||||||
print "Do not modify it by hand. Instead modify the script and run it"
|
print("Do not modify it by hand. Instead modify the script and run it")
|
||||||
print "to regenerate this code."
|
print("to regenerate this code.")
|
||||||
print
|
print()
|
||||||
print "As well as being part of the PCRE2 library, this module is #included"
|
print("As well as being part of the PCRE2 library, this module is #included")
|
||||||
print "by the pcre2test program, which redefines the PRIV macro to change"
|
print("by the pcre2test program, which redefines the PRIV macro to change")
|
||||||
print "table names from _pcre2_xxx to xxxx, thereby avoiding name clashes"
|
print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
|
||||||
print "with the library. At present, just one of these tables is actually"
|
print("with the library. At present, just one of these tables is actually")
|
||||||
print "needed. */"
|
print("needed. */")
|
||||||
print
|
print()
|
||||||
print "#ifndef PCRE2_INCLUDED"
|
print("#ifndef PCRE2_INCLUDED")
|
||||||
print
|
print()
|
||||||
print "#ifdef HAVE_CONFIG_H"
|
print("#ifdef HAVE_CONFIG_H")
|
||||||
print "#include \"config.h\""
|
print("#include \"config.h\"")
|
||||||
print "#endif"
|
print("#endif")
|
||||||
print
|
print()
|
||||||
print "#include \"pcre2_internal.h\""
|
print("#include \"pcre2_internal.h\"")
|
||||||
print
|
print()
|
||||||
print "#endif /* PCRE2_INCLUDED */"
|
print("#endif /* PCRE2_INCLUDED */")
|
||||||
print
|
print()
|
||||||
print "/* Unicode character database. */"
|
print("/* Unicode character database. */")
|
||||||
print "/* This file was autogenerated by the MultiStage2.py script. */"
|
print("/* This file was autogenerated by the MultiStage2.py script. */")
|
||||||
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
|
print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size))
|
||||||
print
|
print()
|
||||||
print "/* The tables herein are needed only when UCP support is built,"
|
print("/* The tables herein are needed only when UCP support is built,")
|
||||||
print "and in PCRE2 that happens automatically with UTF support."
|
print("and in PCRE2 that happens automatically with UTF support.")
|
||||||
print "This module should not be referenced otherwise, so"
|
print("This module should not be referenced otherwise, so")
|
||||||
print "it should not matter whether it is compiled or not. However"
|
print("it should not matter whether it is compiled or not. However")
|
||||||
print "a comment was received about space saving - maybe the guy linked"
|
print("a comment was received about space saving - maybe the guy linked")
|
||||||
print "all the modules rather than using a library - so we include a"
|
print("all the modules rather than using a library - so we include a")
|
||||||
print "condition to cut out the tables when not needed. But don't leave"
|
print("condition to cut out the tables when not needed. But don't leave")
|
||||||
print "a totally empty module because some compilers barf at that."
|
print("a totally empty module because some compilers barf at that.")
|
||||||
print "Instead, just supply small dummy tables. */"
|
print("Instead, just supply small dummy tables. */")
|
||||||
print
|
print()
|
||||||
print "#ifndef SUPPORT_UTF"
|
print("#ifndef SUPPORT_UTF")
|
||||||
print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};"
|
print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};")
|
||||||
print "const uint8_t PRIV(ucd_stage1)[] = {0};"
|
print("const uint8_t PRIV(ucd_stage1)[] = {0};")
|
||||||
print "const uint16_t PRIV(ucd_stage2)[] = {0};"
|
print("const uint16_t PRIV(ucd_stage2)[] = {0};")
|
||||||
print "const uint32_t PRIV(ucd_caseless_sets)[] = {0};"
|
print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
|
||||||
print "#else"
|
print("#else")
|
||||||
print
|
print()
|
||||||
print record_struct
|
print(record_struct)
|
||||||
|
|
||||||
# --- Added by PH: output the table of caseless character sets ---
|
# --- Added by PH: output the table of caseless character sets ---
|
||||||
|
|
||||||
print "const uint32_t PRIV(ucd_caseless_sets)[] = {"
|
print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
|
||||||
print " NOTACHAR,"
|
print(" NOTACHAR,")
|
||||||
for s in sets:
|
for s in sets:
|
||||||
s = sorted(s)
|
s = sorted(s)
|
||||||
for x in s:
|
for x in s:
|
||||||
print ' 0x%04x,' % x,
|
print(' 0x%04x,' % x, end=' ')
|
||||||
print ' NOTACHAR,'
|
print(' NOTACHAR,')
|
||||||
print '};'
|
print('};')
|
||||||
print
|
print()
|
||||||
|
|
||||||
# ------
|
# ------
|
||||||
|
|
||||||
print "/* When #included in pcre2test, we don't need this large table. */"
|
print("/* When #included in pcre2test, we don't need this large table. */")
|
||||||
print
|
print()
|
||||||
print "#ifndef PCRE2_INCLUDED"
|
print("#ifndef PCRE2_INCLUDED")
|
||||||
print
|
print()
|
||||||
print_records(records, record_size)
|
print_records(records, record_size)
|
||||||
print_table(min_stage1, 'PRIV(ucd_stage1)')
|
print_table(min_stage1, 'PRIV(ucd_stage1)')
|
||||||
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
|
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
|
||||||
print "#if UCD_BLOCK_SIZE != %d" % min_block_size
|
print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
|
||||||
print "#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h"
|
print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
|
||||||
print "#endif"
|
print("#endif")
|
||||||
print "#endif /* SUPPORT_UTF */"
|
print("#endif /* SUPPORT_UTF */")
|
||||||
print
|
print()
|
||||||
print "#endif /* PCRE2_INCLUDED */"
|
print("#endif /* PCRE2_INCLUDED */")
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue