From 3211a66f00be12d9f0358b99c6c3187236c6e904 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Sat, 5 Nov 2016 11:31:11 +0100 Subject: [PATCH] Put punycode + UTF-8 rules into DAFSA in utf-8 mode --- src/psl-make-dafsa | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/psl-make-dafsa b/src/psl-make-dafsa index aa9d451..0d4408a 100755 --- a/src/psl-make-dafsa +++ b/src/psl-make-dafsa @@ -481,7 +481,7 @@ def encode(dafsa, utf_mode): def to_cxx(data): """Generates C++ code from a list of encoded bytes.""" - text = '/* This file is generated. DO NOT EDIT!\n\n' + text = '/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n' text += 'The byte array encodes effective tld names. See psl-make-dafsa source for' text += ' documentation.' text += '*/\n\n' @@ -512,7 +512,7 @@ def words_to_binary(words, utf_mode): return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray, utf_mode) -def parse_psl2c(infile): +def parse_psl2c(infile, utf_mode): """Parses file generated by psl2c and extract strings and return code""" lines = [line.strip() for line in infile] @@ -531,7 +531,7 @@ def parse_psl2c(infile): return [line[:-3] + line[-1] for line in sorted(lines)] -def parse_psl(infile): +def parse_psl(infile, utf_mode): """Parses PSL file and extract strings and return code""" PSL_FLAG_EXCEPTION = (1<<0) PSL_FLAG_WILDCARD = (1<<1) @@ -573,9 +573,9 @@ def parse_psl(infile): continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule flags = PSL_FLAG_PLAIN | section - line = line.decode('utf-8').encode("idna") + punycode = line.decode('utf-8').encode("idna") - if line in psl: + if punycode in psl: """Found existing entry: Combination of exception and plain rule is ambiguous !foo.bar @@ -585,10 +585,12 @@ def parse_psl(infile): !foo.bar + *.foo.bar foo.bar + *.foo.bar """ - print('Found %s/%X (now %X)' % line, psl[line], flags) + print('Found %s/%X (now %X)' % punycode, psl[punycode], flags) continue - psl[line] = flags + if utf_mode: + psl[line] = flags + psl[punycode] = flags # with open("psl.out", 'w') as outfile: # for (domain, flags) in sorted(psl.iteritems()): @@ -602,10 +604,10 @@ def usage(): print('usage: %s [options] infile outfile' % sys.argv[0]) print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)') print(' --input-format=psl infile is a Public Suffix List file') - print(' --output-format=cxx Write DAFSA as C/C++ code') + print(' --output-format=cxx Write DAFSA as C/C++ code (default)') print(' --output-format=binary Write DAFSA binary data') - print(' --encoding=ascii 7-bit ASCII mode (default)') - print(' --encoding=utf-8 UTF-8 mode') + print(' --encoding=ascii 7-bit ASCII mode') + print(' --encoding=utf-8 UTF-8 mode (default)') exit(1) @@ -616,7 +618,7 @@ def main(): converter = words_to_cxx parser = parse_psl2c - utf_mode = False + utf_mode = True for arg in sys.argv[1:-2]: if arg.startswith('--input-format='): @@ -634,6 +636,9 @@ def main(): converter = words_to_binary elif value == 'cxx': converter = words_to_cxx + else: + print("Unknown output format '%s'" % value) + return 1 elif arg.startswith('--encoding='): value = arg[11:].lower() if value == 'ascii': @@ -648,10 +653,10 @@ def main(): if sys.argv[-2] == '-': with open(sys.argv[-1], 'w') as outfile: - outfile.write(converter(parser(sys.stdin), utf_mode)) + outfile.write(converter(parser(sys.stdin, utf_mode), utf_mode)) else: with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile: - outfile.write(converter(parser(infile), utf_mode)) + outfile.write(converter(parser(infile, utf_mode), utf_mode)) return 0