Fix psl-make-dafsa to work with python3

This commit is contained in:
Olle Liljenzin 2016-11-12 21:10:59 +01:00
parent 1b36fb0012
commit 3f276c7d1e
1 changed file with 57 additions and 48 deletions

View File

@ -243,6 +243,10 @@ char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
def to_bytes(n):
    """Converts an integer value to a bytes object.

    Args:
      n: integer byte value; bytearray() only accepts values in range(256).

    Returns:
      A bytes object of length one holding that byte value.
    """
    # Go through bytearray so the result is the same on Python 2 and 3:
    # bytes((n,)) would produce the text '(n,)' on Python 2, and bytes(n)
    # would produce n zero bytes on Python 3.
    return bytes(bytearray((n,)))
def to_dafsa(words, utf_mode):
"""Generates a DAFSA from a word list and returns the source node.
@ -253,35 +257,34 @@ def to_dafsa(words, utf_mode):
raise InputError('The domain list must not be empty')
def to_nodes(word, multibyte_length):
    """Split words into characters.

    Recursively turns a word into a chain of (label, [child]) nodes, one
    node per byte. Reads utf_mode from the enclosing scope.

    Args:
      word: bytes object; its final character is parsed as a hexadecimal
        digit and becomes the terminating node.
      multibyte_length: number of bytes still expected in the UTF-8
        multibyte sequence currently being consumed, 0 when not inside one.

    Returns:
      A (label, children) tuple where label is a bytes object of length one.

    Raises:
      InputError: on a malformed or unterminated multibyte sequence, on
        multibyte input when utf_mode is false, or on a byte that
        char_length_table maps to 0.
    """
    # Slice instead of indexing: indexing bytes yields an int on Python 3
    # but a one-character string on Python 2, a slice is bytes on both.
    byte = ord(word[:1])
    if multibyte_length:
        # Consume next byte in multibyte sequence.
        if byte & 0xC0 != 0x80:
            raise InputError('Invalid UTF-8 multibyte sequence')
        return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]

    char_length = char_length_table[byte]
    if char_length == 1:
        # 7-bit printable ASCII.
        if len(word) == 1:
            # Last character: keep only the low four bits of the hex digit.
            return to_bytes(int(word[:1], 16) & 0x0F), [None]
        return word[:1], [to_nodes(word[1:], 0)]
    elif char_length > 1:
        # Leading byte in multibyte sequence.
        if not utf_mode:
            raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
        if len(word) <= char_length:
            raise InputError('Unterminated UTF-8 multibyte sequence')
        # A 0x1F marker node precedes the (masked) leading byte.
        return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]

    # Unexpected character.
    raise InputError('Domain names must be printable ASCII or UTF-8')
return [to_nodes(word, 0) for word in words]
def to_words(node):
    """Generates a word list from all paths starting from an internal node.

    Args:
      node: a (label, children) tuple whose label is a bytes object, or a
        false value (such as None) marking the end of a path.

    Returns:
      A list of bytes objects, one per path; [b''] for a false node.
    """
    if not node:
        # End of path: a single empty suffix lets callers prepend labels.
        return [b'']
    return [(node[0] + word) for child in node[1] for word in to_words(child)]
@ -348,7 +351,7 @@ def join_suffixes(dafsa):
"""Generates a new DAFSA where nodes that represent the same word lists
towards the sink are merged.
"""
nodemap = {frozenset(('',)): None}
nodemap = {frozenset((b'',)): None}
def join(node):
"""Returns a macthing node. A new node is created if no matching node
@ -446,7 +449,7 @@ def encode_prefix(label):
will then be a prefix to the label in the child node.
"""
assert label
return [ord(c) for c in reversed(label)]
return [c for c in bytearray(reversed(label))]
def encode_label(label):
@ -479,59 +482,61 @@ def encode(dafsa, utf_mode):
return output
def to_cxx(data, codecs):
    """Generates C++ code from a list of encoded bytes.

    Args:
      data: sequence of integer byte values to emit as array elements.
      codecs: keyword arguments forwarded to bytes() when converting text,
        e.g. {'encoding': 'utf-8'} on Python 3 or an empty dict on Python 2.

    Returns:
      A bytes object holding the generated source text: a comment header
      followed by the kDafsa array definition, twelve values per row.
    """
    # Collect the pieces and join once instead of growing a bytes object
    # with repeated +=.
    chunks = [
        b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n',
        b'The byte array encodes effective tld names. See psl-make-dafsa source for',
        b' documentation.',
        b'*/\n\n',
        b'static const unsigned char kDafsa[',
        bytes(str(len(data)), **codecs),
        b'] = {\n',
    ]
    for i in range(0, len(data), 12):
        row = ', '.join('0x%02x' % byte for byte in data[i:i + 12])
        chunks.append(b' ')
        chunks.append(bytes(row, **codecs))
        chunks.append(b',\n')
    chunks.append(b'};\n')
    return b''.join(chunks)
def words_to_whatever(words, converter, utf_mode, codecs):
    """Builds a DAFSA from a word list and renders it with converter.

    Args:
      words: word list passed to to_dafsa().
      converter: callable taking (encoded_bytes, codecs) and returning the
        final output.
      utf_mode: passed through to to_dafsa() and encode().
      codecs: keyword arguments for bytes(), passed through to converter.

    Returns:
      Whatever converter returns for the encoded DAFSA.
    """
    dafsa = to_dafsa(words, utf_mode)
    # Apply the graph transforms in this fixed order before encoding.
    for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
        dafsa = fun(dafsa)
    return converter(encode(dafsa, utf_mode), codecs)
def words_to_cxx(words, utf_mode, codecs):
    """Generates C++ code from a word list.

    Args:
      words: word list to encode.
      utf_mode: passed through to the DAFSA builder and encoder.
      codecs: keyword arguments for bytes(), forwarded to to_cxx().

    Returns:
      The output of to_cxx() for the encoded DAFSA.
    """
    return words_to_whatever(words, to_cxx, utf_mode, codecs)
def words_to_binary(words, utf_mode, codecs):
    """Generates binary DAFSA data from a word list.

    Args:
      words: word list to encode.
      utf_mode: passed through to the DAFSA builder and encoder.
      codecs: accepted for a signature parallel to words_to_cxx(); the
        binary converter ignores it.

    Returns:
      The file header followed by the encoded DAFSA bytes.
    """
    # NOTE(review): the header literal is copied verbatim from this view of
    # the file; if whitespace was collapsed in transit, confirm the exact
    # padding against the reader of this format.
    return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs)
def parse_psl2c(infile, utf_mode, codecs):
    """Parses file generated by psl2c and extract strings and return code.

    Args:
      infile: iterable of text lines of the form "domainname, <digit>".
      utf_mode: unused here; kept so all parsers share one signature.
      codecs: keyword arguments forwarded to bytes() when converting each
        line, e.g. {'encoding': 'utf-8'} on Python 3.

    Returns:
      A sorted list of bytes objects, each the domain name immediately
      followed by its single return-code character.

    Raises:
      InputError: if a line does not end in ", <digit>" or the digit is not
        one of 0-9 or A-F.
    """
    lines = [bytes(line.strip(), **codecs) for line in infile]
    for line in lines:
        if line[-3:-1] != b', ':
            raise InputError('Expected "domainname, <digit>", found "%s"' % line)
        # Technically the DAFSA format could support return values in range [0x00-0x1E],
        # but the values below are the only with a defined meaning.
        # Slice rather than index so the operand is a one-byte bytes object
        # on both Python 2 and 3 (indexing bytes yields an int on Python 3).
        if line[-1:] not in b'0123456789ABCDEF':
            raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % line[-1:])
    # Drop the ", " separator: keep the name and append the digit.
    return [line[:-3] + line[-1:] for line in sorted(lines)]
def parse_psl(infile, utf_mode):
def parse_psl(infile, utf_mode, codecs):
"""Parses PSL file and extract strings and return code"""
PSL_FLAG_EXCEPTION = (1<<0)
PSL_FLAG_WILDCARD = (1<<1)
@ -543,37 +548,37 @@ def parse_psl(infile, utf_mode):
section = 0
for line in infile:
line = line.strip()
line = bytes(line.strip(), **codecs)
if not line:
continue
if line.startswith("//"):
if line.startswith(b'//'):
if section == 0:
if "===BEGIN ICANN DOMAINS===" in line:
if b'===BEGIN ICANN DOMAINS===' in line:
section = PSL_FLAG_ICANN
elif section == 0 and "===BEGIN PRIVATE DOMAINS===" in line:
elif section == 0 and b'===BEGIN PRIVATE DOMAINS===' in line:
section = PSL_FLAG_PRIVATE
elif section == PSL_FLAG_ICANN and "===END ICANN DOMAINS===" in line:
elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line:
section = 0
elif section == PSL_FLAG_PRIVATE and "===END PRIVATE DOMAINS===" in line:
elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line:
section = 0
continue # skip comments
if line[0] == '!':
if line[:1] == b'!':
flags = PSL_FLAG_EXCEPTION | section
line = line[1:]
elif line[0] == '*':
if line[1] != '.':
elif line[:1] == b'*':
if line[1:2] != b'.':
print('Unsupported kind of rule (ignored): %s' % line)
continue
flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
line = line[2:]
else:
if not '.' in line:
if not b'.' in line:
continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
flags = PSL_FLAG_PLAIN | section
punycode = line.decode('utf-8').encode("idna")
punycode = line.decode('utf-8').encode('idna')
if punycode in psl:
"""Found existing entry:
@ -596,7 +601,7 @@ def parse_psl(infile, utf_mode):
# for (domain, flags) in sorted(psl.iteritems()):
# outfile.write(domain + "%X" % (flags & 0x0F) + "\n")
return [domain + "%X" % (flags & 0x0F) for (domain, flags) in sorted(psl.iteritems())]
return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())]
def usage():
@ -620,6 +625,10 @@ def main():
parser = parse_psl2c
utf_mode = True
codecs = dict()
if sys.version_info.major > 2:
codecs['encoding'] = 'utf-8'
for arg in sys.argv[1:-2]:
if arg.startswith('--input-format='):
value = arg[15:].lower()
@ -652,11 +661,11 @@ def main():
usage()
if sys.argv[-2] == '-':
with open(sys.argv[-1], 'w') as outfile:
outfile.write(converter(parser(sys.stdin, utf_mode), utf_mode))
with open(sys.argv[-1], 'wb') as outfile:
outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
else:
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
outfile.write(converter(parser(infile, utf_mode), utf_mode))
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'wb') as outfile:
outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))
return 0