Update the two Python maintenance scripts for Python 3.
This commit is contained in:
parent
1b4bcb79ae
commit
2801d5d132
|
@ -1,8 +1,6 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# Generate utt tables. Note: this script is written in Python 2 and is
|
||||
# incompatible with Python 3. However, the 2to3 conversion script has been
|
||||
# successfully tested on it.
|
||||
# Generate utt tables. Note: this script has now been converted to Python 3.
|
||||
|
||||
# The source file pcre2_tables.c contains (amongst other things), a table that
|
||||
# is indexed by script name. In order to reduce the number of relocations when
|
||||
|
@ -22,6 +20,7 @@
|
|||
# necessary for Unicode 6.2.0 support.
|
||||
# Modified by PH 26-February-2013 to add the Xuc special category.
|
||||
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
||||
# Script updated to Python 3 by running it through the 2to3 converter.
|
||||
|
||||
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
||||
|
@ -53,9 +52,9 @@ general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']
|
|||
|
||||
# First add the Unicode script and category names.
|
||||
|
||||
utt_table = zip(script_names, ['PT_SC'] * len(script_names))
|
||||
utt_table += zip(category_names, ['PT_PC'] * len(category_names))
|
||||
utt_table += zip(general_category_names, ['PT_GC'] * len(general_category_names))
|
||||
utt_table = list(zip(script_names, ['PT_SC'] * len(script_names)))
|
||||
utt_table += list(zip(category_names, ['PT_PC'] * len(category_names)))
|
||||
utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names)))
|
||||
|
||||
# Now add our own specials.
|
||||
|
||||
|
@ -75,29 +74,29 @@ utt_table.sort()
|
|||
# UTF-8 mode on EBCDIC platforms.
|
||||
|
||||
for utt in utt_table:
|
||||
print '#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')),
|
||||
print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ')
|
||||
for c in utt[0]:
|
||||
if c == '_':
|
||||
print 'STR_UNDERSCORE',
|
||||
print('STR_UNDERSCORE', end=' ')
|
||||
elif c == '&':
|
||||
print 'STR_AMPERSAND',
|
||||
print('STR_AMPERSAND', end=' ')
|
||||
else:
|
||||
print 'STR_%s' % c,;
|
||||
print '"\\0"'
|
||||
print('STR_%s' % c, end=' ');
|
||||
print('"\\0"')
|
||||
|
||||
# Print the actual table, using the string names
|
||||
|
||||
print ''
|
||||
print 'const char PRIV(utt_names)[] =';
|
||||
print('')
|
||||
print('const char PRIV(utt_names)[] =');
|
||||
last = ''
|
||||
for utt in utt_table:
|
||||
if utt == utt_table[-1]:
|
||||
last = ';'
|
||||
print ' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last)
|
||||
print(' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
|
||||
# This was how it was done before the EBCDIC-compatible modification.
|
||||
# print ' "%s\\0"%s' % (utt[0], last)
|
||||
|
||||
print '\nconst ucp_type_table PRIV(utt)[] = {'
|
||||
print('\nconst ucp_type_table PRIV(utt)[] = {')
|
||||
offset = 0
|
||||
last = ','
|
||||
for utt in utt_table:
|
||||
|
@ -108,6 +107,6 @@ for utt in utt_table:
|
|||
value = 'ucp_' + utt[0]
|
||||
if utt == utt_table[-1]:
|
||||
last = ''
|
||||
print ' { %3d, %s, %s }%s' % (offset, utt[1], value, last)
|
||||
print(' { %3d, %s, %s }%s' % (offset, utt[1], value, last))
|
||||
offset += len(utt[0]) + 1
|
||||
print '};'
|
||||
print('};')
|
||||
|
|
|
@ -10,9 +10,10 @@
|
|||
# generate the pcre_ucd.c file that contains a digested form of the Unicode
|
||||
# data tables.
|
||||
#
|
||||
# The script should be run in the maint subdirectory, using the command
|
||||
# The script has now been upgraded to Python 3 for PCRE2, and should be run in
|
||||
# the maint subdirectory, using the command
|
||||
#
|
||||
# [python2] ./MultiStage2.py >../src/pcre2_ucd.c
|
||||
# [python3] ./MultiStage2.py >../src/pcre2_ucd.c
|
||||
#
|
||||
# It requires four Unicode data tables, DerivedGeneralCategory.txt,
|
||||
# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
|
||||
|
@ -42,6 +43,13 @@
|
|||
# offsets into the table are added to the main output records. This new
|
||||
# code scans CaseFolding.txt instead of UnicodeData.txt.
|
||||
#
|
||||
# Update for Python3:
|
||||
# . Processed with 2to3, but that didn't fix everything
|
||||
# . Changed string.strip to str.strip
|
||||
# . Added encoding='utf-8' to the open() call
|
||||
# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
|
||||
# required and the result of the division is a float
|
||||
#
|
||||
# The main tables generated by this script are used by macros defined in
|
||||
# pcre2_internal.h. They look up Unicode character properties using short
|
||||
# sequences of code that contain no branches, which makes for greater speed.
|
||||
|
@ -110,6 +118,7 @@
|
|||
# final hole in the structure.
|
||||
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
|
||||
# 13-May-2014: Updated for PCRE2
|
||||
# 03-June-2014: Updated for Python 3
|
||||
##############################################################################
|
||||
|
||||
|
||||
|
@ -133,11 +142,11 @@ def get_other_case(chardata):
|
|||
|
||||
# Read the whole table in memory
|
||||
def read_table(file_name, get_value, default_value):
|
||||
file = open(file_name, 'r')
|
||||
file = open(file_name, 'r', encoding='utf-8')
|
||||
table = [default_value] * MAX_UNICODE
|
||||
for line in file:
|
||||
line = re.sub(r'#.*', '', line)
|
||||
chardata = map(string.strip, line.split(';'))
|
||||
chardata = list(map(str.strip, line.split(';')))
|
||||
if len(chardata) <= 1:
|
||||
continue
|
||||
value = get_value(chardata)
|
||||
|
@ -170,7 +179,7 @@ def get_type_size(table):
|
|||
if minlimit <= minval and maxval <= maxlimit:
|
||||
return type_size[num]
|
||||
else:
|
||||
raise OverflowError, "Too large to fit into C types"
|
||||
raise OverflowError("Too large to fit into C types")
|
||||
|
||||
def get_tables_size(*tables):
|
||||
total_size = 0
|
||||
|
@ -205,13 +214,13 @@ def print_table(table, table_name, block_size = None):
|
|||
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
|
||||
if block_size:
|
||||
s += ", block = %d" % block_size
|
||||
print s + " */"
|
||||
print(s + " */")
|
||||
table = tuple(table)
|
||||
if block_size is None:
|
||||
fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
|
||||
mult = MAX_UNICODE / len(table)
|
||||
for i in range(0, len(table), ELEMS_PER_LINE):
|
||||
print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
|
||||
print(fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,)))
|
||||
else:
|
||||
if block_size > ELEMS_PER_LINE:
|
||||
el = ELEMS_PER_LINE
|
||||
|
@ -219,10 +228,10 @@ def print_table(table, table_name, block_size = None):
|
|||
el = block_size
|
||||
fmt = "%3d," * el + "\n"
|
||||
if block_size > ELEMS_PER_LINE:
|
||||
fmt = fmt * (block_size / ELEMS_PER_LINE)
|
||||
fmt = fmt * int(block_size / ELEMS_PER_LINE)
|
||||
for i in range(0, len(table), block_size):
|
||||
print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
|
||||
print "};\n"
|
||||
print(("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
|
||||
print("};\n")
|
||||
|
||||
# Extract the unique combinations of properties into records
|
||||
def combine_tables(*tables):
|
||||
|
@ -241,7 +250,7 @@ def get_record_size_struct(records):
|
|||
'types in this structure definition from pcre2_internal.h (the actual\n' + \
|
||||
'field names will be different):\n\ntypedef struct {\n'
|
||||
for i in range(len(records[0])):
|
||||
record_slice = map(lambda record: record[i], records)
|
||||
record_slice = [record[i] for record in records]
|
||||
slice_type, slice_size = get_type_size(record_slice)
|
||||
# add padding: round up to the nearest multiple of slice_size
|
||||
size = (size + slice_size - 1) & -slice_size
|
||||
|
@ -249,7 +258,7 @@ def get_record_size_struct(records):
|
|||
structure += '%s property_%d;\n' % (slice_type, i)
|
||||
|
||||
# round up to the first item of the next structure in array
|
||||
record_slice = map(lambda record: record[0], records)
|
||||
record_slice = [record[0] for record in records]
|
||||
slice_type, slice_size = get_type_size(record_slice)
|
||||
size = (size + slice_size - 1) & -slice_size
|
||||
|
||||
|
@ -273,13 +282,14 @@ def test_record_size():
|
|||
#print struct
|
||||
|
||||
def print_records(records, record_size):
|
||||
print 'const ucd_record PRIV(ucd_records)[] = { ' + \
|
||||
'/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
|
||||
records = zip(records.keys(), records.values())
|
||||
records.sort(None, lambda x: x[1])
|
||||
print('const ucd_record PRIV(ucd_records)[] = { ' + \
|
||||
'/* %d bytes, record size %d */' % (len(records) * record_size, record_size))
|
||||
|
||||
records = list(zip(list(records.keys()), list(records.values())))
|
||||
records.sort(key = lambda x: x[1])
|
||||
for i, record in enumerate(records):
|
||||
print (' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
|
||||
print '};\n'
|
||||
print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
|
||||
print('};\n')
|
||||
|
||||
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
||||
|
@ -393,10 +403,10 @@ for s in sets:
|
|||
table, records = combine_tables(script, category, break_props,
|
||||
caseless_offsets, other_case)
|
||||
|
||||
record_size, record_struct = get_record_size_struct(records.keys())
|
||||
record_size, record_struct = get_record_size_struct(list(records.keys()))
|
||||
|
||||
# Find the optimum block size for the two-stage table
|
||||
min_size = sys.maxint
|
||||
min_size = sys.maxsize
|
||||
for block_size in [2 ** i for i in range(5,10)]:
|
||||
size = len(records) * record_size
|
||||
stage1, stage2 = compress_table(table, block_size)
|
||||
|
@ -407,76 +417,76 @@ for block_size in [2 ** i for i in range(5,10)]:
|
|||
min_stage1, min_stage2 = stage1, stage2
|
||||
min_block_size = block_size
|
||||
|
||||
print "/* This module is generated by the maint/MultiStage2.py script."
|
||||
print "Do not modify it by hand. Instead modify the script and run it"
|
||||
print "to regenerate this code."
|
||||
print
|
||||
print "As well as being part of the PCRE2 library, this module is #included"
|
||||
print "by the pcre2test program, which redefines the PRIV macro to change"
|
||||
print "table names from _pcre2_xxx to xxxx, thereby avoiding name clashes"
|
||||
print "with the library. At present, just one of these tables is actually"
|
||||
print "needed. */"
|
||||
print
|
||||
print "#ifndef PCRE2_INCLUDED"
|
||||
print
|
||||
print "#ifdef HAVE_CONFIG_H"
|
||||
print "#include \"config.h\""
|
||||
print "#endif"
|
||||
print
|
||||
print "#include \"pcre2_internal.h\""
|
||||
print
|
||||
print "#endif /* PCRE2_INCLUDED */"
|
||||
print
|
||||
print "/* Unicode character database. */"
|
||||
print "/* This file was autogenerated by the MultiStage2.py script. */"
|
||||
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
|
||||
print
|
||||
print "/* The tables herein are needed only when UCP support is built,"
|
||||
print "and in PCRE2 that happens automatically with UTF support."
|
||||
print "This module should not be referenced otherwise, so"
|
||||
print "it should not matter whether it is compiled or not. However"
|
||||
print "a comment was received about space saving - maybe the guy linked"
|
||||
print "all the modules rather than using a library - so we include a"
|
||||
print "condition to cut out the tables when not needed. But don't leave"
|
||||
print "a totally empty module because some compilers barf at that."
|
||||
print "Instead, just supply small dummy tables. */"
|
||||
print
|
||||
print "#ifndef SUPPORT_UTF"
|
||||
print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};"
|
||||
print "const uint8_t PRIV(ucd_stage1)[] = {0};"
|
||||
print "const uint16_t PRIV(ucd_stage2)[] = {0};"
|
||||
print "const uint32_t PRIV(ucd_caseless_sets)[] = {0};"
|
||||
print "#else"
|
||||
print
|
||||
print record_struct
|
||||
print("/* This module is generated by the maint/MultiStage2.py script.")
|
||||
print("Do not modify it by hand. Instead modify the script and run it")
|
||||
print("to regenerate this code.")
|
||||
print()
|
||||
print("As well as being part of the PCRE2 library, this module is #included")
|
||||
print("by the pcre2test program, which redefines the PRIV macro to change")
|
||||
print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
|
||||
print("with the library. At present, just one of these tables is actually")
|
||||
print("needed. */")
|
||||
print()
|
||||
print("#ifndef PCRE2_INCLUDED")
|
||||
print()
|
||||
print("#ifdef HAVE_CONFIG_H")
|
||||
print("#include \"config.h\"")
|
||||
print("#endif")
|
||||
print()
|
||||
print("#include \"pcre2_internal.h\"")
|
||||
print()
|
||||
print("#endif /* PCRE2_INCLUDED */")
|
||||
print()
|
||||
print("/* Unicode character database. */")
|
||||
print("/* This file was autogenerated by the MultiStage2.py script. */")
|
||||
print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size))
|
||||
print()
|
||||
print("/* The tables herein are needed only when UCP support is built,")
|
||||
print("and in PCRE2 that happens automatically with UTF support.")
|
||||
print("This module should not be referenced otherwise, so")
|
||||
print("it should not matter whether it is compiled or not. However")
|
||||
print("a comment was received about space saving - maybe the guy linked")
|
||||
print("all the modules rather than using a library - so we include a")
|
||||
print("condition to cut out the tables when not needed. But don't leave")
|
||||
print("a totally empty module because some compilers barf at that.")
|
||||
print("Instead, just supply small dummy tables. */")
|
||||
print()
|
||||
print("#ifndef SUPPORT_UTF")
|
||||
print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};")
|
||||
print("const uint8_t PRIV(ucd_stage1)[] = {0};")
|
||||
print("const uint16_t PRIV(ucd_stage2)[] = {0};")
|
||||
print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
|
||||
print("#else")
|
||||
print()
|
||||
print(record_struct)
|
||||
|
||||
# --- Added by PH: output the table of caseless character sets ---
|
||||
|
||||
print "const uint32_t PRIV(ucd_caseless_sets)[] = {"
|
||||
print " NOTACHAR,"
|
||||
print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
|
||||
print(" NOTACHAR,")
|
||||
for s in sets:
|
||||
s = sorted(s)
|
||||
for x in s:
|
||||
print ' 0x%04x,' % x,
|
||||
print ' NOTACHAR,'
|
||||
print '};'
|
||||
print
|
||||
print(' 0x%04x,' % x, end=' ')
|
||||
print(' NOTACHAR,')
|
||||
print('};')
|
||||
print()
|
||||
|
||||
# ------
|
||||
|
||||
print "/* When #included in pcre2test, we don't need this large table. */"
|
||||
print
|
||||
print "#ifndef PCRE2_INCLUDED"
|
||||
print
|
||||
print("/* When #included in pcre2test, we don't need this large table. */")
|
||||
print()
|
||||
print("#ifndef PCRE2_INCLUDED")
|
||||
print()
|
||||
print_records(records, record_size)
|
||||
print_table(min_stage1, 'PRIV(ucd_stage1)')
|
||||
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
|
||||
print "#if UCD_BLOCK_SIZE != %d" % min_block_size
|
||||
print "#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h"
|
||||
print "#endif"
|
||||
print "#endif /* SUPPORT_UTF */"
|
||||
print
|
||||
print "#endif /* PCRE2_INCLUDED */"
|
||||
print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
|
||||
print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
|
||||
print("#endif")
|
||||
print("#endif /* SUPPORT_UTF */")
|
||||
print()
|
||||
print("#endif /* PCRE2_INCLUDED */")
|
||||
|
||||
"""
|
||||
|
||||
|
|
Loading…
Reference in New Issue