Update the two Python maintenance scripts for Python 3.

2014-06-03 16:26:20 +00:00 · 2014-06-03 16:26:20 +00:00 · 2801d5d132
parent 1b4bcb79ae
commit 2801d5d132
2 changed files with 104 additions and 95 deletions
--- a/maint/GenerateUtt.py
+++ b/maint/GenerateUtt.py
@ -1,8 +1,6 @@
 #! /usr/bin/python

-# Generate utt tables. Note: this script is written in Python 2 and is
-# incompatible with Python 3. However, the 2to3 conversion script has been 
-# successfully tested on it.
+# Generate utt tables. Note: this script has now been converted to Python 3.

 # The source file pcre2_tables.c contains (amongst other things), a table that
 # is indexed by script name. In order to reduce the number of relocations when
@ -22,6 +20,7 @@
 # necessary for Unicode 6.2.0 support.
 # Modfied by PH 26-February-2013 to add the Xuc special category.
 # Comment modified by PH 13-May-2014 to update to PCRE2 file names.
+# Script updated to Python 3 by running it through the 2to3 converter.

 script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
@ -53,9 +52,9 @@ general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']

 # First add the Unicode script and category names.

-utt_table  = zip(script_names, ['PT_SC'] * len(script_names))
-utt_table += zip(category_names, ['PT_PC'] * len(category_names))
-utt_table += zip(general_category_names, ['PT_GC'] * len(general_category_names))
+utt_table  = list(zip(script_names, ['PT_SC'] * len(script_names)))
+utt_table += list(zip(category_names, ['PT_PC'] * len(category_names)))
+utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names)))

 # Now add our own specials.

@ -75,29 +74,29 @@ utt_table.sort()
 # UTF-8 mode on EBCDIC platforms.

 for utt in utt_table:
-        print '#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')),
+        print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ')
        for c in utt[0]:
                if c == '_':
-                        print 'STR_UNDERSCORE',
+                        print('STR_UNDERSCORE', end=' ')
                elif c == '&':
-                        print 'STR_AMPERSAND',
+                        print('STR_AMPERSAND', end=' ')
                else:
-                        print 'STR_%s' % c,;
-        print '"\\0"'
+                        print('STR_%s' % c, end=' ');
+        print('"\\0"')

 # Print the actual table, using the string names

-print ''
-print 'const char PRIV(utt_names)[] =';
+print('')
+print('const char PRIV(utt_names)[] =');
 last = ''
 for utt in utt_table:
        if utt == utt_table[-1]:
                last = ';'
-        print '  STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last)
+        print('  STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
 # This was how it was done before the EBCDIC-compatible modification.
 #        print '  "%s\\0"%s' % (utt[0], last)

-print '\nconst ucp_type_table PRIV(utt)[] = {'
+print('\nconst ucp_type_table PRIV(utt)[] = {')
 offset = 0
 last = ','
 for utt in utt_table:
@ -108,6 +107,6 @@ for utt in utt_table:
                value = 'ucp_' + utt[0]
        if utt == utt_table[-1]:
                last = ''
-        print '  { %3d, %s, %s }%s' % (offset, utt[1], value, last)
+        print('  { %3d, %s, %s }%s' % (offset, utt[1], value, last))
        offset += len(utt[0]) + 1
-print '};'
+print('};')
--- a/maint/MultiStage2.py
+++ b/maint/MultiStage2.py
@ -10,9 +10,10 @@
 # generate the pcre_ucd.c file that contains a digested form of the Unicode
 # data tables.
 #
-# The script should be run in the maint subdirectory, using the command
+# The script has now been upgraded to Python 3 for PCRE2, and should be run in 
+# the maint subdirectory, using the command
 #
-# [python2] ./MultiStage2.py >../src/pcre2_ucd.c
+# [python3] ./MultiStage2.py >../src/pcre2_ucd.c
 #
 # It requires four Unicode data tables, DerivedGeneralCategory.txt,
 # GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the 
@ -42,6 +43,13 @@
 #  offsets into the table are added to the main output records. This new
 #  code scans CaseFolding.txt instead of UnicodeData.txt.
 #
+#  Update for Python3:
+#    . Processed with 2to3, but that didn't fix everything
+#    . Changed string.strip to str.strip
+#    . Added encoding='utf-8' to the open() call
+#    . Wrapped block_size/ELEMS_PER_LINE in int() because a string repeat
+#        count must be an int and '/' now returns a float in Python 3
+#
 # The main tables generated by this script are used by macros defined in
 # pcre2_internal.h. They look up Unicode character properties using short 
 # sequences of code that contains no branches, which makes for greater speed.
@ -110,6 +118,7 @@
 #                      final hole in the structure.
 # 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
 # 13-May-2014:       Updated for PCRE2
+# 03-June-2014:      Updated for Python 3
 ##############################################################################


@ -133,11 +142,11 @@ def get_other_case(chardata):

 # Read the whole table in memory
 def read_table(file_name, get_value, default_value):
-        file = open(file_name, 'r')
+        file = open(file_name, 'r', encoding='utf-8')
        table = [default_value] * MAX_UNICODE
        for line in file:
                line = re.sub(r'#.*', '', line)
-                chardata = map(string.strip, line.split(';'))
+                chardata = list(map(str.strip, line.split(';')))
                if len(chardata) <= 1:
                        continue
                value = get_value(chardata)
@ -170,7 +179,7 @@ def get_type_size(table):
                if minlimit <= minval and maxval <= maxlimit:
                        return type_size[num]
        else:
-                raise OverflowError, "Too large to fit into C types"
+                raise OverflowError("Too large to fit into C types")

 def get_tables_size(*tables):
        total_size = 0
@ -205,13 +214,13 @@ def print_table(table, table_name, block_size = None):
        s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
        if block_size:
                s += ", block = %d" % block_size
-        print s + " */"
+        print(s + " */")
        table = tuple(table)
        if block_size is None:
                fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
                mult = MAX_UNICODE / len(table)
                for i in range(0, len(table), ELEMS_PER_LINE):
-                        print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
+                        print(fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,)))
        else:
                if block_size > ELEMS_PER_LINE:
                        el = ELEMS_PER_LINE
@ -219,10 +228,10 @@ def print_table(table, table_name, block_size = None):
                        el = block_size
                fmt = "%3d," * el + "\n"
                if block_size > ELEMS_PER_LINE:
-                        fmt = fmt * (block_size / ELEMS_PER_LINE)
+                        fmt = fmt * int(block_size / ELEMS_PER_LINE)
                for i in range(0, len(table), block_size):
-                        print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
-        print "};\n"
+                        print(("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
+        print("};\n")

 # Extract the unique combinations of properties into records
 def combine_tables(*tables):
@ -241,7 +250,7 @@ def get_record_size_struct(records):
        'types in this structure definition from pcre2_internal.h (the actual\n' + \
        'field names will be different):\n\ntypedef struct {\n'
        for i in range(len(records[0])):
-                record_slice = map(lambda record: record[i], records)
+                record_slice = [record[i] for record in records]
                slice_type, slice_size = get_type_size(record_slice)
                # add padding: round up to the nearest power of slice_size
                size = (size + slice_size - 1) & -slice_size
@ -249,7 +258,7 @@ def get_record_size_struct(records):
                structure += '%s property_%d;\n' % (slice_type, i)
        
        # round up to the first item of the next structure in array
-        record_slice = map(lambda record: record[0], records)
+        record_slice = [record[0] for record in records]
        slice_type, slice_size = get_type_size(record_slice)
        size = (size + slice_size - 1) & -slice_size
        
@ -273,13 +282,14 @@ def test_record_size():
            #print struct

 def print_records(records, record_size):
-        print 'const ucd_record PRIV(ucd_records)[] = { ' + \
-              '/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
-        records = zip(records.keys(), records.values())
-        records.sort(None, lambda x: x[1])
+        print('const ucd_record PRIV(ucd_records)[] = { ' + \
+              '/* %d bytes, record size %d */' % (len(records) * record_size, record_size))
+
+        records = list(zip(list(records.keys()), list(records.values())))
+        records.sort(key = lambda x: x[1])
        for i, record in enumerate(records):
-                print ('  {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
-        print '};\n'
+                print(('  {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
+        print('};\n')

 script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
@ -393,10 +403,10 @@ for s in sets:
 table, records = combine_tables(script, category, break_props, 
  caseless_offsets, other_case)

-record_size, record_struct = get_record_size_struct(records.keys())
+record_size, record_struct = get_record_size_struct(list(records.keys()))

 # Find the optimum block size for the two-stage table
-min_size = sys.maxint
+min_size = sys.maxsize
 for block_size in [2 ** i for i in range(5,10)]:
        size = len(records) * record_size
        stage1, stage2 = compress_table(table, block_size)
@ -407,76 +417,76 @@ for block_size in [2 ** i for i in range(5,10)]:
                min_stage1, min_stage2 = stage1, stage2
                min_block_size = block_size

-print "/* This module is generated by the maint/MultiStage2.py script."
-print "Do not modify it by hand. Instead modify the script and run it"
-print "to regenerate this code."
-print
-print "As well as being part of the PCRE2 library, this module is #included"
-print "by the pcre2test program, which redefines the PRIV macro to change"
-print "table names from _pcre2_xxx to xxxx, thereby avoiding name clashes"
-print "with the library. At present, just one of these tables is actually"
-print "needed. */"
-print
-print "#ifndef PCRE2_INCLUDED"
-print
-print "#ifdef HAVE_CONFIG_H"
-print "#include \"config.h\""
-print "#endif"
-print
-print "#include \"pcre2_internal.h\""
-print
-print "#endif /* PCRE2_INCLUDED */"
-print
-print "/* Unicode character database. */"
-print "/* This file was autogenerated by the MultiStage2.py script. */"
-print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
-print
-print "/* The tables herein are needed only when UCP support is built,"
-print "and in PCRE2 that happens automatically with UTF support." 
-print "This module should not be referenced otherwise, so"
-print "it should not matter whether it is compiled or not. However"
-print "a comment was received about space saving - maybe the guy linked"
-print "all the modules rather than using a library - so we include a"
-print "condition to cut out the tables when not needed. But don't leave"
-print "a totally empty module because some compilers barf at that."
-print "Instead, just supply small dummy tables. */"
-print
-print "#ifndef SUPPORT_UTF"
-print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};"
-print "const uint8_t PRIV(ucd_stage1)[] = {0};"
-print "const uint16_t PRIV(ucd_stage2)[] = {0};"
-print "const uint32_t PRIV(ucd_caseless_sets)[] = {0};"
-print "#else"
-print
-print record_struct
+print("/* This module is generated by the maint/MultiStage2.py script.")
+print("Do not modify it by hand. Instead modify the script and run it")
+print("to regenerate this code.")
+print()
+print("As well as being part of the PCRE2 library, this module is #included")
+print("by the pcre2test program, which redefines the PRIV macro to change")
+print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
+print("with the library. At present, just one of these tables is actually")
+print("needed. */")
+print()
+print("#ifndef PCRE2_INCLUDED")
+print()
+print("#ifdef HAVE_CONFIG_H")
+print("#include \"config.h\"")
+print("#endif")
+print()
+print("#include \"pcre2_internal.h\"")
+print()
+print("#endif /* PCRE2_INCLUDED */")
+print()
+print("/* Unicode character database. */")
+print("/* This file was autogenerated by the MultiStage2.py script. */")
+print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size))
+print()
+print("/* The tables herein are needed only when UCP support is built,")
+print("and in PCRE2 that happens automatically with UTF support.") 
+print("This module should not be referenced otherwise, so")
+print("it should not matter whether it is compiled or not. However")
+print("a comment was received about space saving - maybe the guy linked")
+print("all the modules rather than using a library - so we include a")
+print("condition to cut out the tables when not needed. But don't leave")
+print("a totally empty module because some compilers barf at that.")
+print("Instead, just supply small dummy tables. */")
+print()
+print("#ifndef SUPPORT_UTF")
+print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};")
+print("const uint8_t PRIV(ucd_stage1)[] = {0};")
+print("const uint16_t PRIV(ucd_stage2)[] = {0};")
+print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
+print("#else")
+print()
+print(record_struct)

 # --- Added by PH: output the table of caseless character sets ---

-print "const uint32_t PRIV(ucd_caseless_sets)[] = {"
-print "  NOTACHAR,"
+print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
+print("  NOTACHAR,")
 for s in sets:
  s = sorted(s)
  for x in s:
-    print '  0x%04x,' % x,
-  print '  NOTACHAR,'   
-print '};'
-print
+    print('  0x%04x,' % x, end=' ')
+  print('  NOTACHAR,')   
+print('};')
+print()

 # ------

-print "/* When #included in pcre2test, we don't need this large table. */"
-print
-print "#ifndef PCRE2_INCLUDED"
-print
+print("/* When #included in pcre2test, we don't need this large table. */")
+print()
+print("#ifndef PCRE2_INCLUDED")
+print()
 print_records(records, record_size)
 print_table(min_stage1, 'PRIV(ucd_stage1)')
 print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
-print "#if UCD_BLOCK_SIZE != %d" % min_block_size
-print "#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h"
-print "#endif"
-print "#endif  /* SUPPORT_UTF */"
-print
-print "#endif  /* PCRE2_INCLUDED */"
+print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
+print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
+print("#endif")
+print("#endif  /* SUPPORT_UTF */")
+print()
+print("#endif  /* PCRE2_INCLUDED */")

 """