From 2801d5d13207f198ababb15f28f8bdd43b9606bc Mon Sep 17 00:00:00 2001
From: "Philip.Hazel" <Philip.Hazel@gmail.com>
Date: Tue, 3 Jun 2014 16:26:20 +0000
Subject: [PATCH] Update the two Python maintenance scripts for Python 3.

---
 maint/GenerateUtt.py |  33 +++++----
 maint/MultiStage2.py | 166 +++++++++++++++++++++++--------------------
 2 files changed, 104 insertions(+), 95 deletions(-)

diff --git a/maint/GenerateUtt.py b/maint/GenerateUtt.py
index c9a6a55..81ad20f 100755
--- a/maint/GenerateUtt.py
+++ b/maint/GenerateUtt.py
@@ -1,8 +1,6 @@
 #! /usr/bin/python
 
-# Generate utt tables. Note: this script is written in Python 2 and is
-# incompatible with Python 3. However, the 2to3 conversion script has been 
-# successfully tested on it.
+# Generate utt tables. Note: this script has now been converted to Python 3.
 
 # The source file pcre2_tables.c contains (amongst other things), a table that
 # is indexed by script name. In order to reduce the number of relocations when
@@ -22,6 +20,7 @@
 # necessary for Unicode 6.2.0 support.
 # Modfied by PH 26-February-2013 to add the Xuc special category.
 # Comment modified by PH 13-May-2014 to update to PCRE2 file names.
+# Script updated to Python 3 by running it through the 2to3 converter.
 
 script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
  'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
@@ -53,9 +52,9 @@ general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']
 
 # First add the Unicode script and category names.
 
-utt_table  = zip(script_names, ['PT_SC'] * len(script_names))
-utt_table += zip(category_names, ['PT_PC'] * len(category_names))
-utt_table += zip(general_category_names, ['PT_GC'] * len(general_category_names))
+utt_table  = list(zip(script_names, ['PT_SC'] * len(script_names)))
+utt_table += list(zip(category_names, ['PT_PC'] * len(category_names)))
+utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names)))
 
 # Now add our own specials.
 
@@ -75,29 +74,29 @@ utt_table.sort()
 # UTF-8 mode on EBCDIC platforms.
 
 for utt in utt_table:
-        print '#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')),
+        print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ')
         for c in utt[0]:
                 if c == '_':
-                        print 'STR_UNDERSCORE',
+                        print('STR_UNDERSCORE', end=' ')
                 elif c == '&':
-                        print 'STR_AMPERSAND',
+                        print('STR_AMPERSAND', end=' ')
                 else:
-                        print 'STR_%s' % c,;
-        print '"\\0"'
+                        print('STR_%s' % c, end=' ');
+        print('"\\0"')
 
 # Print the actual table, using the string names
 
-print ''
-print 'const char PRIV(utt_names)[] =';
+print('')
+print('const char PRIV(utt_names)[] =');
 last = ''
 for utt in utt_table:
         if utt == utt_table[-1]:
                 last = ';'
-        print '  STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last)
+        print('  STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
 # This was how it was done before the EBCDIC-compatible modification.
 #        print '  "%s\\0"%s' % (utt[0], last)
 
-print '\nconst ucp_type_table PRIV(utt)[] = {'
+print('\nconst ucp_type_table PRIV(utt)[] = {')
 offset = 0
 last = ','
 for utt in utt_table:
@@ -108,6 +107,6 @@ for utt in utt_table:
                 value = 'ucp_' + utt[0]
         if utt == utt_table[-1]:
                 last = ''
-        print '  { %3d, %s, %s }%s' % (offset, utt[1], value, last)
+        print('  { %3d, %s, %s }%s' % (offset, utt[1], value, last))
         offset += len(utt[0]) + 1
-print '};'
+print('};')
diff --git a/maint/MultiStage2.py b/maint/MultiStage2.py
index bec081f..726fcb6 100755
--- a/maint/MultiStage2.py
+++ b/maint/MultiStage2.py
@@ -10,9 +10,10 @@
 # generate the pcre_ucd.c file that contains a digested form of the Unicode
 # data tables.
 #
-# The script should be run in the maint subdirectory, using the command
+# The script has now been upgraded to Python 3 for PCRE2, and should be run in 
+# the maint subdirectory, using the command
 #
-# [python2] ./MultiStage2.py >../src/pcre2_ucd.c
+# [python3] ./MultiStage2.py >../src/pcre2_ucd.c
 #
 # It requires four Unicode data tables, DerivedGeneralCategory.txt,
 # GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the 
@@ -42,6 +43,13 @@
 #  offsets into the table are added to the main output records. This new
 #  code scans CaseFolding.txt instead of UnicodeData.txt.
 #
+#  Update for Python3:
+#    . Processed with 2to3, but that didn't fix everything
+#    . Changed string.strip to str.strip
+#    . Added encoding='utf-8' to the open() call
+#    . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
+#        required and the result of the division is a float
+#
 # The main tables generated by this script are used by macros defined in
 # pcre2_internal.h. They look up Unicode character properties using short 
 # sequences of code that contains no branches, which makes for greater speed.
@@ -110,6 +118,7 @@
 #                      final hole in the structure.
 # 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
 # 13-May-2014:       Updated for PCRE2
+# 03-June-2014:      Updated for Python 3
 ##############################################################################
 
 
@@ -133,11 +142,11 @@ def get_other_case(chardata):
 
 # Read the whole table in memory
 def read_table(file_name, get_value, default_value):
-        file = open(file_name, 'r')
+        file = open(file_name, 'r', encoding='utf-8')
         table = [default_value] * MAX_UNICODE
         for line in file:
                 line = re.sub(r'#.*', '', line)
-                chardata = map(string.strip, line.split(';'))
+                chardata = list(map(str.strip, line.split(';')))
                 if len(chardata) <= 1:
                         continue
                 value = get_value(chardata)
@@ -170,7 +179,7 @@ def get_type_size(table):
                 if minlimit <= minval and maxval <= maxlimit:
                         return type_size[num]
         else:
-                raise OverflowError, "Too large to fit into C types"
+                raise OverflowError("Too large to fit into C types")
 
 def get_tables_size(*tables):
         total_size = 0
@@ -205,13 +214,13 @@ def print_table(table, table_name, block_size = None):
         s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
         if block_size:
                 s += ", block = %d" % block_size
-        print s + " */"
+        print(s + " */")
         table = tuple(table)
         if block_size is None:
                 fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
                 mult = MAX_UNICODE / len(table)
                 for i in range(0, len(table), ELEMS_PER_LINE):
-                        print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
+                        print(fmt % (table[i:i+ELEMS_PER_LINE] + (int(i * mult),)))  # mult is a float in Python 3; %X needs an int
         else:
                 if block_size > ELEMS_PER_LINE:
                         el = ELEMS_PER_LINE
@@ -219,10 +228,10 @@ def print_table(table, table_name, block_size = None):
                         el = block_size
                 fmt = "%3d," * el + "\n"
                 if block_size > ELEMS_PER_LINE:
-                        fmt = fmt * (block_size / ELEMS_PER_LINE)
+                        fmt = fmt * int(block_size / ELEMS_PER_LINE)
                 for i in range(0, len(table), block_size):
-                        print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
-        print "};\n"
+                        print(("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
+        print("};\n")
 
 # Extract the unique combinations of properties into records
 def combine_tables(*tables):
@@ -241,7 +250,7 @@ def get_record_size_struct(records):
         'types in this structure definition from pcre2_internal.h (the actual\n' + \
         'field names will be different):\n\ntypedef struct {\n'
         for i in range(len(records[0])):
-                record_slice = map(lambda record: record[i], records)
+                record_slice = [record[i] for record in records]
                 slice_type, slice_size = get_type_size(record_slice)
                 # add padding: round up to the nearest power of slice_size
                 size = (size + slice_size - 1) & -slice_size
@@ -249,7 +258,7 @@ def get_record_size_struct(records):
                 structure += '%s property_%d;\n' % (slice_type, i)
         
         # round up to the first item of the next structure in array
-        record_slice = map(lambda record: record[0], records)
+        record_slice = [record[0] for record in records]
         slice_type, slice_size = get_type_size(record_slice)
         size = (size + slice_size - 1) & -slice_size
         
@@ -273,13 +282,14 @@ def test_record_size():
             #print struct
 
 def print_records(records, record_size):
-        print 'const ucd_record PRIV(ucd_records)[] = { ' + \
-              '/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
-        records = zip(records.keys(), records.values())
-        records.sort(None, lambda x: x[1])
+        print('const ucd_record PRIV(ucd_records)[] = { ' + \
+              '/* %d bytes, record size %d */' % (len(records) * record_size, record_size))
+
+        records = list(zip(list(records.keys()), list(records.values())))
+        records.sort(key = lambda x: x[1])
         for i, record in enumerate(records):
-                print ('  {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
-        print '};\n'
+                print(('  {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
+        print('};\n')
 
 script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
  'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
@@ -393,10 +403,10 @@ for s in sets:
 table, records = combine_tables(script, category, break_props, 
   caseless_offsets, other_case)
 
-record_size, record_struct = get_record_size_struct(records.keys())
+record_size, record_struct = get_record_size_struct(list(records.keys()))
 
 # Find the optimum block size for the two-stage table
-min_size = sys.maxint
+min_size = sys.maxsize
 for block_size in [2 ** i for i in range(5,10)]:
         size = len(records) * record_size
         stage1, stage2 = compress_table(table, block_size)
@@ -407,76 +417,76 @@ for block_size in [2 ** i for i in range(5,10)]:
                 min_stage1, min_stage2 = stage1, stage2
                 min_block_size = block_size
 
-print "/* This module is generated by the maint/MultiStage2.py script."
-print "Do not modify it by hand. Instead modify the script and run it"
-print "to regenerate this code."
-print
-print "As well as being part of the PCRE2 library, this module is #included"
-print "by the pcre2test program, which redefines the PRIV macro to change"
-print "table names from _pcre2_xxx to xxxx, thereby avoiding name clashes"
-print "with the library. At present, just one of these tables is actually"
-print "needed. */"
-print
-print "#ifndef PCRE2_INCLUDED"
-print
-print "#ifdef HAVE_CONFIG_H"
-print "#include \"config.h\""
-print "#endif"
-print
-print "#include \"pcre2_internal.h\""
-print
-print "#endif /* PCRE2_INCLUDED */"
-print
-print "/* Unicode character database. */"
-print "/* This file was autogenerated by the MultiStage2.py script. */"
-print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
-print
-print "/* The tables herein are needed only when UCP support is built,"
-print "and in PCRE2 that happens automatically with UTF support." 
-print "This module should not be referenced otherwise, so"
-print "it should not matter whether it is compiled or not. However"
-print "a comment was received about space saving - maybe the guy linked"
-print "all the modules rather than using a library - so we include a"
-print "condition to cut out the tables when not needed. But don't leave"
-print "a totally empty module because some compilers barf at that."
-print "Instead, just supply small dummy tables. */"
-print
-print "#ifndef SUPPORT_UTF"
-print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};"
-print "const uint8_t PRIV(ucd_stage1)[] = {0};"
-print "const uint16_t PRIV(ucd_stage2)[] = {0};"
-print "const uint32_t PRIV(ucd_caseless_sets)[] = {0};"
-print "#else"
-print
-print record_struct
+print("/* This module is generated by the maint/MultiStage2.py script.")
+print("Do not modify it by hand. Instead modify the script and run it")
+print("to regenerate this code.")
+print()
+print("As well as being part of the PCRE2 library, this module is #included")
+print("by the pcre2test program, which redefines the PRIV macro to change")
+print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
+print("with the library. At present, just one of these tables is actually")
+print("needed. */")
+print()
+print("#ifndef PCRE2_INCLUDED")
+print()
+print("#ifdef HAVE_CONFIG_H")
+print("#include \"config.h\"")
+print("#endif")
+print()
+print("#include \"pcre2_internal.h\"")
+print()
+print("#endif /* PCRE2_INCLUDED */")
+print()
+print("/* Unicode character database. */")
+print("/* This file was autogenerated by the MultiStage2.py script. */")
+print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size))
+print()
+print("/* The tables herein are needed only when UCP support is built,")
+print("and in PCRE2 that happens automatically with UTF support.") 
+print("This module should not be referenced otherwise, so")
+print("it should not matter whether it is compiled or not. However")
+print("a comment was received about space saving - maybe the guy linked")
+print("all the modules rather than using a library - so we include a")
+print("condition to cut out the tables when not needed. But don't leave")
+print("a totally empty module because some compilers barf at that.")
+print("Instead, just supply small dummy tables. */")
+print()
+print("#ifndef SUPPORT_UTF")
+print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};")
+print("const uint8_t PRIV(ucd_stage1)[] = {0};")
+print("const uint16_t PRIV(ucd_stage2)[] = {0};")
+print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
+print("#else")
+print()
+print(record_struct)
 
 # --- Added by PH: output the table of caseless character sets ---
 
-print "const uint32_t PRIV(ucd_caseless_sets)[] = {"
-print "  NOTACHAR,"
+print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
+print("  NOTACHAR,")
 for s in sets:
   s = sorted(s)
   for x in s:
-    print '  0x%04x,' % x,
-  print '  NOTACHAR,'   
-print '};'
-print
+    print('  0x%04x,' % x, end=' ')
+  print('  NOTACHAR,')   
+print('};')
+print()
 
 # ------
 
-print "/* When #included in pcre2test, we don't need this large table. */"
-print
-print "#ifndef PCRE2_INCLUDED"
-print
+print("/* When #included in pcre2test, we don't need this large table. */")
+print()
+print("#ifndef PCRE2_INCLUDED")
+print()
 print_records(records, record_size)
 print_table(min_stage1, 'PRIV(ucd_stage1)')
 print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
-print "#if UCD_BLOCK_SIZE != %d" % min_block_size
-print "#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h"
-print "#endif"
-print "#endif  /* SUPPORT_UTF */"
-print
-print "#endif  /* PCRE2_INCLUDED */"
+print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
+print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
+print("#endif")
+print("#endif  /* SUPPORT_UTF */")
+print()
+print("#endif  /* PCRE2_INCLUDED */")
 
 """