diff --git a/src/make_dafsa.py b/src/make_dafsa.py
index c3766ec..0456191 100755
--- a/src/make_dafsa.py
+++ b/src/make_dafsa.py
@@ -206,14 +206,14 @@ def to_dafsa(words):
   """
   if not words:
     raise InputError('The domain list must not be empty')
-  def ToNodes(word):
+  def to_nodes(word):
     """Split words into characters"""
     if not 0x1F < ord(word[0]) < 0x80:
       raise InputError('Domain names must be printable 7-bit ASCII')
     if len(word) == 1:
       return chr(ord(word[0]) & 0x0F), [None]
-    return word[0], [ToNodes(word[1:])]
-  return [ToNodes(word) for word in words]
+    return word[0], [to_nodes(word[1:])]
+  return [to_nodes(word) for word in words]


 def to_words(node):
@@ -254,8 +254,8 @@ def join_labels(dafsa):
   """Generates a new DAFSA where internal nodes are merged if there is a one to
   one connection.
   """
-  parentcount = { id(None): 2 }
-  nodemap = { id(None): None }
+  parentcount = {id(None): 2}
+  nodemap = {id(None): None}

   def count_parents(node):
     """Count incoming references"""
@@ -286,7 +286,7 @@ def join_suffixes(dafsa):
   """Generates a new DAFSA where nodes that represent the same word lists
   towards the sink are merged.
   """
-  nodemap = { frozenset(('',)): None }
+  nodemap = {frozenset(('',)): None}

   def join(node):
     """Returns a macthing node. A new node is created if no matching node
@@ -343,7 +343,7 @@ def encode_links(children, offsets, current):
     return []
   guess = 3 * len(children)
   assert children
-  children = sorted(children, key = lambda x: -offsets[id(x)])
+  children = sorted(children, key=lambda x: -offsets[id(x)])
   while True:
     offset = current + guess
     buf = []
@@ -448,13 +448,10 @@ def words_to_binary(words):
   return words_to_whatever(words, bytearray)


-def parse_gperf(infile):
-  """Parses gperf file and extract strings and return code"""
+def parse_psl2c(infile):
+  """Parses file generated by psl2c and extract strings and return code"""
   lines = [line.strip() for line in infile]
-  # Extract strings after the first '%%' and before the second '%%'.
-  #begin = lines.index('%%') + 1
-  #end = lines.index('%%', begin)
-  #lines = lines[begin:end]
+
   for line in lines:
     if line[-3:-1] != ', ':
       raise InputError('Expected "domainname, ", found "%s"' % line)
@@ -463,27 +460,126 @@ def parse_gperf(infile):
     if line[-1] not in '0123456789ABCDEF':
       raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' %
                        line[-1])
+
+#  with open("gperf.out", 'w') as outfile:
+#    for line in lines:
+#      outfile.write(line[:-3] + line[-1] + "\n")
+
   return [line[:-3] + line[-1] for line in lines]


+def parse_psl(infile):
+  """Parses PSL file and extract strings and return code"""
+  PSL_FLAG_EXCEPTION = (1<<0)
+  PSL_FLAG_WILDCARD = (1<<1)
+  PSL_FLAG_ICANN = (1<<2) # entry of ICANN section
+  PSL_FLAG_PRIVATE = (1<<3) # entry of PRIVATE section
+  PSL_FLAG_PLAIN = (1<<4) #just used for PSL syntax checking
+
+  psl = {}
+  section = 0
+
+  for line in infile:
+    line = line.strip()
+    if not line:
+      continue
+
+    if line.startswith("//"):
+      if section == 0:
+        if "===BEGIN ICANN DOMAINS===" in line:
+          section = PSL_FLAG_ICANN
+        elif section == 0 and "===BEGIN PRIVATE DOMAINS===" in line:
+          section = PSL_FLAG_PRIVATE
+      elif section == PSL_FLAG_ICANN and "===END ICANN DOMAINS===" in line:
+        section = 0
+      elif section == PSL_FLAG_PRIVATE and "===END PRIVATE DOMAINS===" in line:
+        section = 0
+      continue # skip comments
+
+    if line[0] == '!':
+      flags = PSL_FLAG_EXCEPTION | section
+      line = line[1:]
+    elif line[0] == '*':
+      if line[1:2] != '.': # slice: a lone '*' is reported here instead of raising IndexError
+        print 'Unsupported kind of rule (ignored): %s' % line
+        continue
+      flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
+      line = line[2:]
+    else:
+      if '.' not in line:
+        continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
+      flags = PSL_FLAG_PLAIN | section
+
+    line = line.decode('utf-8').encode("idna")
+
+    if line in psl:
+      """Found existing entry:
+         Combination of exception and plain rule is ambiguous
+           !foo.bar
+           foo.bar
+
+         Allowed:
+           !foo.bar
+           *.foo.bar
+           foo.bar
+           *.foo.bar
+      """
+      print('Found %s/%X (now %X)' % (line, psl[line], flags)) # all three values must form one tuple for '%'
+      continue
+
+    psl[line] = flags
+
+#  with open("psl.out", 'w') as outfile:
+#    for (domain, flags) in psl.iteritems():
+#      outfile.write(domain + "%X" % (flags & 0x0F) + "\n")
+
+  return [domain + "%X" % (flags & 0x0F) for (domain, flags) in psl.iteritems()]
+
+
+def usage():
+  """Prints the usage"""
+  print 'usage: %s [options] infile outfile' % sys.argv[0]
+  print ' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)'
+  print ' --input-format=psl infile is a Public Suffix List file'
+  print ' --output-format=cxx Write DAFSA as C/C++ code'
+  print ' --output-format=binary Write DAFSA binary data'
+  sys.exit(1) # sys.exit, not the site-provided exit() helper, which may be absent
+
+
 def main():
+  """Convert PSL file into C or binary DAFSA file"""
   if len(sys.argv) < 3:
-    print('usage: %s [--binary] infile outfile' % sys.argv[0])
-    return 1
+    usage()

-  argpos = 1
   converter = words_to_cxx
+  parser = parse_psl2c

-  if sys.argv[argpos] == '--binary':
-    converter = words_to_binary
-    argpos += 1
+  for arg in sys.argv[1:-2]:
+    if arg.startswith('--input-format='):
+      value = arg[15:].lower()
+      if value == 'psl':
+        parser = parse_psl
+      elif value == 'psl2c':
+        parser = parse_psl2c
+      else:
+        print "Unknown input format '%s'" % value
+        return 1
+    elif arg.startswith('--output-format='):
+      value = arg[16:].lower()
+      if value == 'binary':
+        converter = words_to_binary
+      elif value == 'cxx':
+        converter = words_to_cxx
+      else:
+        print "Unknown output format '%s'" % value
+        return 1
+    else:
+      usage()

-  if sys.argv[argpos] == '-':
-    with open(sys.argv[argpos + 1], 'w') as outfile:
-      outfile.write(converter(parse_gperf(sys.stdin)))
+  if sys.argv[-2] == '-':
+    with open(sys.argv[-1], 'w') as outfile:
+      outfile.write(converter(parser(sys.stdin)))
   else:
-    with open(sys.argv[argpos], 'r') as infile, open(sys.argv[argpos + 1], 'w') as outfile:
-      outfile.write(converter(parse_gperf(infile)))
+    with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
+      outfile.write(converter(parser(infile)))
   return 0


diff --git a/src/psl.c b/src/psl.c
index c320ed1..49bfdbd 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -1170,7 +1170,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)

 		if ((index = _vector_find(psl->suffixes, &suffix)) >= 0) {
 			/* Found existing entry:
-			 * Combination of exception and plain rule is ambigous
+			 * Combination of exception and plain rule is ambiguous
 			 * !foo.bar
 			 * foo.bar
 			 *