Put punycode + UTF-8 rules into DAFSA in utf-8 mode

This commit is contained in:
Tim Rühsen 2016-11-05 11:31:11 +01:00
parent e30e77ef12
commit 3211a66f00
1 changed file with 18 additions and 13 deletions

View File

@ -481,7 +481,7 @@ def encode(dafsa, utf_mode):
def to_cxx(data): def to_cxx(data):
"""Generates C++ code from a list of encoded bytes.""" """Generates C++ code from a list of encoded bytes."""
text = '/* This file is generated. DO NOT EDIT!\n\n' text = '/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
text += 'The byte array encodes effective tld names. See psl-make-dafsa source for' text += 'The byte array encodes effective tld names. See psl-make-dafsa source for'
text += ' documentation.' text += ' documentation.'
text += '*/\n\n' text += '*/\n\n'
@ -512,7 +512,7 @@ def words_to_binary(words, utf_mode):
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray, utf_mode) return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray, utf_mode)
def parse_psl2c(infile): def parse_psl2c(infile, utf_mode):
"""Parses file generated by psl2c and extract strings and return code""" """Parses file generated by psl2c and extract strings and return code"""
lines = [line.strip() for line in infile] lines = [line.strip() for line in infile]
@ -531,7 +531,7 @@ def parse_psl2c(infile):
return [line[:-3] + line[-1] for line in sorted(lines)] return [line[:-3] + line[-1] for line in sorted(lines)]
def parse_psl(infile): def parse_psl(infile, utf_mode):
"""Parses PSL file and extract strings and return code""" """Parses PSL file and extract strings and return code"""
PSL_FLAG_EXCEPTION = (1<<0) PSL_FLAG_EXCEPTION = (1<<0)
PSL_FLAG_WILDCARD = (1<<1) PSL_FLAG_WILDCARD = (1<<1)
@ -573,9 +573,9 @@ def parse_psl(infile):
continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
flags = PSL_FLAG_PLAIN | section flags = PSL_FLAG_PLAIN | section
line = line.decode('utf-8').encode("idna") punycode = line.decode('utf-8').encode("idna")
if line in psl: if punycode in psl:
"""Found existing entry: """Found existing entry:
Combination of exception and plain rule is ambiguous Combination of exception and plain rule is ambiguous
!foo.bar !foo.bar
@ -585,10 +585,12 @@ def parse_psl(infile):
!foo.bar + *.foo.bar !foo.bar + *.foo.bar
foo.bar + *.foo.bar foo.bar + *.foo.bar
""" """
print('Found %s/%X (now %X)' % line, psl[line], flags) print('Found %s/%X (now %X)' % punycode, psl[punycode], flags)
continue continue
if utf_mode:
psl[line] = flags psl[line] = flags
psl[punycode] = flags
# with open("psl.out", 'w') as outfile: # with open("psl.out", 'w') as outfile:
# for (domain, flags) in sorted(psl.iteritems()): # for (domain, flags) in sorted(psl.iteritems()):
@ -602,10 +604,10 @@ def usage():
print('usage: %s [options] infile outfile' % sys.argv[0]) print('usage: %s [options] infile outfile' % sys.argv[0])
print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)') print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
print(' --input-format=psl infile is a Public Suffix List file') print(' --input-format=psl infile is a Public Suffix List file')
print(' --output-format=cxx Write DAFSA as C/C++ code') print(' --output-format=cxx Write DAFSA as C/C++ code (default)')
print(' --output-format=binary Write DAFSA binary data') print(' --output-format=binary Write DAFSA binary data')
print(' --encoding=ascii 7-bit ASCII mode (default)') print(' --encoding=ascii 7-bit ASCII mode')
print(' --encoding=utf-8 UTF-8 mode') print(' --encoding=utf-8 UTF-8 mode (default)')
exit(1) exit(1)
@ -616,7 +618,7 @@ def main():
converter = words_to_cxx converter = words_to_cxx
parser = parse_psl2c parser = parse_psl2c
utf_mode = False utf_mode = True
for arg in sys.argv[1:-2]: for arg in sys.argv[1:-2]:
if arg.startswith('--input-format='): if arg.startswith('--input-format='):
@ -634,6 +636,9 @@ def main():
converter = words_to_binary converter = words_to_binary
elif value == 'cxx': elif value == 'cxx':
converter = words_to_cxx converter = words_to_cxx
else:
print("Unknown output format '%s'" % value)
return 1
elif arg.startswith('--encoding='): elif arg.startswith('--encoding='):
value = arg[11:].lower() value = arg[11:].lower()
if value == 'ascii': if value == 'ascii':
@ -648,10 +653,10 @@ def main():
if sys.argv[-2] == '-': if sys.argv[-2] == '-':
with open(sys.argv[-1], 'w') as outfile: with open(sys.argv[-1], 'w') as outfile:
outfile.write(converter(parser(sys.stdin), utf_mode)) outfile.write(converter(parser(sys.stdin, utf_mode), utf_mode))
else: else:
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile: with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
outfile.write(converter(parser(infile), utf_mode)) outfile.write(converter(parser(infile, utf_mode), utf_mode))
return 0 return 0