Fix psl-make-dafsa to work with python3
This commit is contained in:
parent
1b36fb0012
commit
3f276c7d1e
|
@ -243,6 +243,10 @@ char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x
|
|||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
|
||||
|
||||
def to_bytes(n):
    """Returns a one-byte bytes object holding the integer value n."""
    # Going through bytearray gives the same result on Python 2 and 3;
    # bytes(n) alone would yield n zero bytes on 3 and str(n) on 2.
    single = bytearray(1)
    single[0] = n
    return bytes(single)
|
||||
|
||||
def to_dafsa(words, utf_mode):
|
||||
"""Generates a DAFSA from a word list and returns the source node.
|
||||
|
||||
|
@ -253,35 +257,34 @@ def to_dafsa(words, utf_mode):
|
|||
raise InputError('The domain list must not be empty')
|
||||
def to_nodes(word, multibyte_length):
|
||||
"""Split words into characters"""
|
||||
byte = ord(word[0])
|
||||
byte = ord(word[:1])
|
||||
if multibyte_length:
|
||||
# Consume next byte in multibyte sequence.
|
||||
if byte & 0xC0 != 0x80:
|
||||
raise InputError('Invalid UTF-8 multibyte sequence')
|
||||
return chr(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
|
||||
return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
|
||||
char_length = char_length_table[byte]
|
||||
if char_length == 1:
|
||||
# 7-bit printable ASCII.
|
||||
if len(word) == 1:
|
||||
return chr(int(word[0], 16) & 0x0F), [None]
|
||||
return word[0], [to_nodes(word[1:], 0)]
|
||||
return to_bytes(int(word[:1], 16) & 0x0F), [None]
|
||||
return word[:1], [to_nodes(word[1:], 0)]
|
||||
elif char_length > 1:
|
||||
# Leading byte in multibyte sequence.
|
||||
if not utf_mode:
|
||||
raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
|
||||
if len(word) <= char_length:
|
||||
raise InputError('Unterminated UTF-8 multibyte sequence')
|
||||
return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
|
||||
return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
|
||||
# Unexpected character.
|
||||
raise InputError('Domain names must be printable ASCII or UTF-8')
|
||||
|
||||
return [to_nodes(word, 0) for word in words]
|
||||
|
||||
|
||||
def to_words(node):
    """Generates a word list from all paths starting from an internal node.

    Args:
      node: a (label, children) pair whose label is a bytes object, or a
        false value (the sink) that terminates a path.

    Returns:
      List of bytes objects, one per path from node to the sink.
    """
    if not node:
        # The sink contributes an empty suffix. It must be bytes (not str) so
        # that it concatenates with the bytes labels on Python 3.
        return [b'']
    label, children = node[0], node[1]
    return [label + suffix for child in children for suffix in to_words(child)]
|
||||
|
||||
|
||||
|
@ -348,7 +351,7 @@ def join_suffixes(dafsa):
|
|||
"""Generates a new DAFSA where nodes that represent the same word lists
|
||||
towards the sink are merged.
|
||||
"""
|
||||
nodemap = {frozenset(('',)): None}
|
||||
nodemap = {frozenset((b'',)): None}
|
||||
|
||||
def join(node):
|
||||
"""Returns a matching node. A new node is created if no matching node
|
||||
|
@ -446,7 +449,7 @@ def encode_prefix(label):
|
|||
will then be a prefix to the label in the child node.
|
||||
"""
|
||||
assert label
|
||||
return [ord(c) for c in reversed(label)]
|
||||
return [c for c in bytearray(reversed(label))]
|
||||
|
||||
|
||||
def encode_label(label):
|
||||
|
@ -479,59 +482,61 @@ def encode(dafsa, utf_mode):
|
|||
return output
|
||||
|
||||
|
||||
def to_cxx(data, codecs=None):
    """Generates C++ code from a list of encoded bytes.

    Args:
      data: sequence of integer byte values to emit as array initializers.
      codecs: optional keyword arguments forwarded to bytes() when converting
        generated text ({'encoding': 'utf-8'} on Python 3, empty on Python 2).
        When omitted or empty, the generated text is encoded as ASCII.

    Returns:
      The C++ source text as a bytes object, suitable for a file opened in
      binary mode.
    """
    def as_bytes(text):
        # bytes(str) requires an encoding on Python 3 but takes none on
        # Python 2; everything generated here is plain ASCII either way.
        if codecs:
            return bytes(text, **codecs)
        return text.encode('ascii')

    # Collect the pieces and join once instead of growing a bytes object
    # with repeated '+='.
    parts = [
        b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n',
        b'The byte array encodes effective tld names. See psl-make-dafsa source for',
        b' documentation.',
        b'*/\n\n',
        b'static const unsigned char kDafsa[',
        as_bytes(str(len(data))),
        b'] = {\n',
    ]
    # Emit twelve byte values per output row.
    for i in range(0, len(data), 12):
        row = ', '.join('0x%02x' % byte for byte in data[i:i + 12])
        parts.append(b' ')
        parts.append(as_bytes(row))
        parts.append(b',\n')
    parts.append(b'};\n')
    return b''.join(parts)
|
||||
|
||||
|
||||
def words_to_whatever(words, converter, utf_mode, codecs):
    """Builds a DAFSA from a word list and renders it with a converter.

    Args:
      words: word list handed to to_dafsa().
      converter: callable invoked as converter(encoded, codecs), where encoded
        is the result of encode(); its return value is returned unchanged.
      utf_mode: forwarded to to_dafsa() and encode().
      codecs: passed through untouched as the converter's second argument.

    Returns:
      Whatever converter returns.
    """
    dafsa = to_dafsa(words, utf_mode)
    # Apply the reverse / suffix-join passes twice, then merge labels.
    for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
        dafsa = fun(dafsa)
    return converter(encode(dafsa, utf_mode), codecs)
|
||||
|
||||
|
||||
def words_to_cxx(words, utf_mode, codecs):
    """Generates C++ code from a word list.

    Thin wrapper that runs words_to_whatever() with to_cxx as the converter;
    utf_mode and codecs are forwarded unchanged.
    """
    return words_to_whatever(words, to_cxx, utf_mode, codecs)
|
||||
|
||||
|
||||
def words_to_binary(words, utf_mode, codecs):
    """Generates the binary DAFSA representation from a word list.

    The result is the '.DAFSA@PSL_0' header line followed by the encoded
    DAFSA bytes. codecs is accepted for signature parity with words_to_cxx()
    and is only passed through; the binary converter ignores it.
    """
    def to_raw(encoded, _codecs):
        # The converter protocol passes codecs as a second argument; raw
        # output needs no text encoding.
        return bytearray(encoded)

    # NOTE(review): whitespace inside this header literal may have been
    # collapsed by the diff rendering this was taken from - confirm the exact
    # header bytes against the consumer of the file format.
    return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, to_raw, utf_mode, codecs)
|
||||
|
||||
|
||||
def parse_psl2c(infile, utf_mode, codecs):
    """Parses file generated by psl2c and extracts strings and return codes.

    Args:
      infile: iterable of text lines of the form "domainname, <digit>".
      utf_mode: not used by this parser; accepted so it can be called with
        the same arguments as the other input parser.
      codecs: keyword arguments forwarded to bytes() when converting each
        line ({'encoding': 'utf-8'} on Python 3, empty on Python 2).

    Returns:
      Sorted list of bytes objects, each a domain name with its one-character
      return code appended.

    Raises:
      InputError: if a line does not end in ", <value>" or the value is not
        one of 0-9 / A-F.
    """
    def printable(raw):
        # Decode for messages so Python 3 does not show a b'...' repr.
        return raw.decode('utf-8', 'replace')

    lines = [bytes(line.strip(), **codecs) for line in infile]

    for line in lines:
        if line[-3:-1] != b', ':
            raise InputError('Expected "domainname, <digit>", found "%s"' % printable(line))
        # Technically the DAFSA format could support return values in range [0x00-0x1E],
        # but the values below are the only with a defined meaning.
        # Slice instead of index: indexing bytes yields an int on Python 3 but
        # a one-character string on Python 2, while a slice is bytes on both.
        if line[-1:] not in b'0123456789ABCDEF':
            raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % printable(line[-1:]))

    return [line[:-3] + line[-1:] for line in sorted(lines)]
|
||||
|
||||
|
||||
def parse_psl(infile, utf_mode):
|
||||
def parse_psl(infile, utf_mode, codecs):
|
||||
"""Parses PSL file and extract strings and return code"""
|
||||
PSL_FLAG_EXCEPTION = (1<<0)
|
||||
PSL_FLAG_WILDCARD = (1<<1)
|
||||
|
@ -543,37 +548,37 @@ def parse_psl(infile, utf_mode):
|
|||
section = 0
|
||||
|
||||
for line in infile:
|
||||
line = line.strip()
|
||||
line = bytes(line.strip(), **codecs)
|
||||
if not line:
|
||||
continue
|
||||
|
||||
if line.startswith("//"):
|
||||
if line.startswith(b'//'):
|
||||
if section == 0:
|
||||
if "===BEGIN ICANN DOMAINS===" in line:
|
||||
if b'===BEGIN ICANN DOMAINS===' in line:
|
||||
section = PSL_FLAG_ICANN
|
||||
elif section == 0 and "===BEGIN PRIVATE DOMAINS===" in line:
|
||||
elif section == 0 and b'===BEGIN PRIVATE DOMAINS===' in line:
|
||||
section = PSL_FLAG_PRIVATE
|
||||
elif section == PSL_FLAG_ICANN and "===END ICANN DOMAINS===" in line:
|
||||
elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line:
|
||||
section = 0
|
||||
elif section == PSL_FLAG_PRIVATE and "===END PRIVATE DOMAINS===" in line:
|
||||
elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line:
|
||||
section = 0
|
||||
continue # skip comments
|
||||
|
||||
if line[0] == '!':
|
||||
if line[:1] == b'!':
|
||||
flags = PSL_FLAG_EXCEPTION | section
|
||||
line = line[1:]
|
||||
elif line[0] == '*':
|
||||
if line[1] != '.':
|
||||
elif line[:1] == b'*':
|
||||
if line[1:2] != b'.':
|
||||
print('Unsupported kind of rule (ignored): %s' % line)
|
||||
continue
|
||||
flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
|
||||
line = line[2:]
|
||||
else:
|
||||
if not '.' in line:
|
||||
if not b'.' in line:
|
||||
continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
|
||||
flags = PSL_FLAG_PLAIN | section
|
||||
|
||||
punycode = line.decode('utf-8').encode("idna")
|
||||
punycode = line.decode('utf-8').encode('idna')
|
||||
|
||||
if punycode in psl:
|
||||
"""Found existing entry:
|
||||
|
@ -596,7 +601,7 @@ def parse_psl(infile, utf_mode):
|
|||
# for (domain, flags) in sorted(psl.iteritems()):
|
||||
# outfile.write(domain + "%X" % (flags & 0x0F) + "\n")
|
||||
|
||||
return [domain + "%X" % (flags & 0x0F) for (domain, flags) in sorted(psl.iteritems())]
|
||||
return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())]
|
||||
|
||||
|
||||
def usage():
|
||||
|
@ -620,6 +625,10 @@ def main():
|
|||
parser = parse_psl2c
|
||||
utf_mode = True
|
||||
|
||||
codecs = dict()
|
||||
if sys.version_info.major > 2:
|
||||
codecs['encoding'] = 'utf-8'
|
||||
|
||||
for arg in sys.argv[1:-2]:
|
||||
if arg.startswith('--input-format='):
|
||||
value = arg[15:].lower()
|
||||
|
@ -652,11 +661,11 @@ def main():
|
|||
usage()
|
||||
|
||||
if sys.argv[-2] == '-':
|
||||
with open(sys.argv[-1], 'w') as outfile:
|
||||
outfile.write(converter(parser(sys.stdin, utf_mode), utf_mode))
|
||||
with open(sys.argv[-1], 'wb') as outfile:
|
||||
outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
|
||||
else:
|
||||
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
|
||||
outfile.write(converter(parser(infile, utf_mode), utf_mode))
|
||||
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'wb') as outfile:
|
||||
outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))
|
||||
|
||||
return 0
|
||||
|
||||
|
|
Loading…
Reference in New Issue