Fix psl-make-dafsa to work with python3

2016-11-12 21:10:59 +01:00 · 2016-11-12 21:10:59 +01:00 · 3f276c7d1e
parent 1b36fb0012
commit 3f276c7d1e
1 changed files with 57 additions and 48 deletions
--- a/src/psl-make-dafsa
+++ b/src/psl-make-dafsa
@ -243,6 +243,10 @@ char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x00-0x
                      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  # 0xE0-0xEF
                      4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF

+def to_bytes(n):
+  """Converts an integer value to a bytes object."""
+  return bytes(bytearray((n,)))
+
 def to_dafsa(words, utf_mode):
  """Generates a DAFSA from a word list and returns the source node.

@@ -253,35 +257,34 @@ def to_dafsa(words, utf_mode):
    raise InputError('The domain list must not be empty')
  def to_nodes(word, multibyte_length):
    """Split words into characters"""
-    byte = ord(word[0])
+    byte = ord(word[:1])
    if multibyte_length:
      # Consume next byte in multibyte sequence.
      if byte & 0xC0 != 0x80:
        raise InputError('Invalid UTF-8 multibyte sequence')
-      return chr(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
+      return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
    char_length = char_length_table[byte]
    if char_length == 1:
      # 7-bit printable ASCII.
      if len(word) == 1:
-        return chr(int(word[0], 16) & 0x0F), [None]
-      return word[0], [to_nodes(word[1:], 0)]
+        return to_bytes(int(word[:1], 16) & 0x0F), [None]
+      return word[:1], [to_nodes(word[1:], 0)]
    elif char_length > 1:
      # Leading byte in multibyte sequence.
      if not utf_mode:
        raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
      if len(word) <= char_length:
        raise InputError('Unterminated UTF-8 multibyte sequence')
-      return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
+      return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
    # Unexpected character.
    raise InputError('Domain names must be printable ASCII or UTF-8')

  return [to_nodes(word, 0) for word in words]

-
 def to_words(node):
  """Generates a word list from all paths starting from an internal node."""
  if not node:
-    return ['']
+    return [b'']
  return [(node[0] + word) for child in node[1] for word in to_words(child)]


@@ -348,7 +351,7 @@ def join_suffixes(dafsa):
  """Generates a new DAFSA where nodes that represent the same word lists
  towards the sink are merged.
  """
-  nodemap = {frozenset(('',)): None}
+  nodemap = {frozenset((b'',)): None}

  def join(node):
    """Returns a macthing node. A new node is created if no matching node
@@ -446,7 +449,7 @@ def encode_prefix(label):
  will then be a prefix to the label in the child node.
  """
  assert label
-  return [ord(c) for c in reversed(label)]
+  return [c for c in bytearray(reversed(label))]


 def encode_label(label):
@@ -479,59 +482,61 @@ def encode(dafsa, utf_mode):
  return output


-def to_cxx(data):
+def to_cxx(data, codecs):
  """Generates C++ code from a list of encoded bytes."""
-  text = '/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
-  text += 'The byte array encodes effective tld names. See psl-make-dafsa source for'
-  text += ' documentation.'
-  text += '*/\n\n'
-  text += 'static const unsigned char kDafsa[%s] = {\n' % len(data)
+  text = b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
+  text += b'The byte array encodes effective tld names. See psl-make-dafsa source for'
+  text += b' documentation.'
+  text += b'*/\n\n'
+  text += b'static const unsigned char kDafsa['
+  text += bytes(str(len(data)), **codecs)
+  text += b'] = {\n'
  for i in range(0, len(data), 12):
-    text += '  '
-    text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
-    text += ',\n'
-  text += '};\n'
+    text += b'  '
+    text += bytes(', '.join('0x%02x' % byte for byte in data[i:i + 12]), **codecs)
+    text += b',\n'
+  text += b'};\n'
  return text


-def words_to_whatever(words, converter, utf_mode):
+def words_to_whatever(words, converter, utf_mode, codecs):
  """Generates C++ code from a word list"""
  dafsa = to_dafsa(words, utf_mode)
  for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
    dafsa = fun(dafsa)
-  return converter(encode(dafsa, utf_mode))
+  return converter(encode(dafsa, utf_mode), codecs)


-def words_to_cxx(words, utf_mode):
+def words_to_cxx(words, utf_mode, codecs):
  """Generates C++ code from a word list"""
-  return words_to_whatever(words, to_cxx, utf_mode)
+  return words_to_whatever(words, to_cxx, utf_mode, codecs)


-def words_to_binary(words, utf_mode):
+def words_to_binary(words, utf_mode, codecs):
  """Generates C++ code from a word list"""
-  return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, bytearray, utf_mode)
+  return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs)


-def parse_psl2c(infile, utf_mode):
+def parse_psl2c(infile, utf_mode, codecs):
  """Parses file generated by psl2c and extract strings and return code"""
-  lines = [line.strip() for line in infile]
+  lines = [bytes(line.strip(), **codecs) for line in infile]

  for line in lines:
-    if line[-3:-1] != ', ':
+    if line[-3:-1] != b', ':
      raise InputError('Expected "domainname, <digit>", found "%s"' % line)
    # Technically the DAFSA format could support return values in range [0x00-0x1E],
    # but the values below are the only with a defined meaning.
-    if line[-1] not in '0123456789ABCDEF':
-      raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % line[-1])
+    if line[-1] not in b'0123456789ABCDEF':
+      raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % line[-1:])

 #  with open("gperf.out", 'w') as outfile:
 #    for line in sorted(lines):
 #      outfile.write(line[:-3] + line[-1] + "\n")

-  return [line[:-3] + line[-1] for line in sorted(lines)]
+  return [line[:-3] + line[-1:] for line in sorted(lines)]


-def parse_psl(infile, utf_mode):
+def parse_psl(infile, utf_mode, codecs):
  """Parses PSL file and extract strings and return code"""
  PSL_FLAG_EXCEPTION = (1<<0)
  PSL_FLAG_WILDCARD = (1<<1)
@@ -543,37 +548,37 @@ def parse_psl(infile, utf_mode):
  section = 0

  for line in infile:
-    line = line.strip()
+    line = bytes(line.strip(), **codecs)
    if not line:
      continue

-    if line.startswith("//"):
+    if line.startswith(b'//'):
      if section == 0:
-        if "===BEGIN ICANN DOMAINS===" in line:
+        if b'===BEGIN ICANN DOMAINS===' in line:
          section = PSL_FLAG_ICANN
-        elif section == 0 and "===BEGIN PRIVATE DOMAINS===" in line:
+        elif section == 0 and b'===BEGIN PRIVATE DOMAINS===' in line:
          section = PSL_FLAG_PRIVATE
-      elif section == PSL_FLAG_ICANN and "===END ICANN DOMAINS===" in line:
+      elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line:
        section = 0
-      elif section == PSL_FLAG_PRIVATE and "===END PRIVATE DOMAINS===" in line:
+      elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line:
        section = 0
      continue # skip comments

-    if line[0] == '!':
+    if line[:1] == b'!':
      flags = PSL_FLAG_EXCEPTION | section
      line = line[1:]
-    elif line[0] == '*':
-      if line[1] != '.':
+    elif line[:1] == b'*':
+      if line[1:2] != b'.':
        print('Unsupported kind of rule (ignored): %s' % line)
        continue
      flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
      line = line[2:]
    else:
-      if not '.' in line:
+      if not b'.' in line:
        continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
      flags = PSL_FLAG_PLAIN | section

-    punycode = line.decode('utf-8').encode("idna")
+    punycode = line.decode('utf-8').encode('idna')

    if punycode in psl:
      """Found existing entry:
@@ -596,7 +601,7 @@ def parse_psl(infile, utf_mode):
 #    for (domain, flags) in sorted(psl.iteritems()):
 #      outfile.write(domain + "%X" % (flags & 0x0F) + "\n")

-  return [domain + "%X" % (flags & 0x0F) for (domain, flags) in sorted(psl.iteritems())]
+  return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())]


 def usage():
@@ -620,6 +625,10 @@ def main():
  parser = parse_psl2c
  utf_mode = True

+  codecs = dict()
+  if sys.version_info.major > 2:
+    codecs['encoding'] = 'utf-8'
+
  for arg in sys.argv[1:-2]:
    if arg.startswith('--input-format='):
      value = arg[15:].lower()
@@ -652,11 +661,11 @@ def main():
      usage()

  if sys.argv[-2] == '-':
-    with open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(sys.stdin, utf_mode), utf_mode))
+    with open(sys.argv[-1], 'wb') as outfile:
+      outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
  else:
-    with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(infile, utf_mode), utf_mode))
+    with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'wb') as outfile:
+      outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))

  return 0