Add --input-format and --output-format to make_dafsa.py

This commit is contained in:
Tim Rühsen 2015-12-30 17:52:48 +01:00
parent 355edc152f
commit c699e3c441
2 changed files with 121 additions and 25 deletions

View File

@ -206,14 +206,14 @@ def to_dafsa(words):
"""
if not words:
raise InputError('The domain list must not be empty')
def ToNodes(word):
def to_nodes(word):
"""Split words into characters"""
if not 0x1F < ord(word[0]) < 0x80:
raise InputError('Domain names must be printable 7-bit ASCII')
if len(word) == 1:
return chr(ord(word[0]) & 0x0F), [None]
return word[0], [ToNodes(word[1:])]
return [ToNodes(word) for word in words]
return word[0], [to_nodes(word[1:])]
return [to_nodes(word) for word in words]
def to_words(node):
@ -254,8 +254,8 @@ def join_labels(dafsa):
"""Generates a new DAFSA where internal nodes are merged if there is a one to
one connection.
"""
parentcount = { id(None): 2 }
nodemap = { id(None): None }
parentcount = {id(None): 2}
nodemap = {id(None): None}
def count_parents(node):
"""Count incoming references"""
@ -286,7 +286,7 @@ def join_suffixes(dafsa):
"""Generates a new DAFSA where nodes that represent the same word lists
towards the sink are merged.
"""
nodemap = { frozenset(('',)): None }
nodemap = {frozenset(('',)): None}
def join(node):
"""Returns a macthing node. A new node is created if no matching node
@ -343,7 +343,7 @@ def encode_links(children, offsets, current):
return []
guess = 3 * len(children)
assert children
children = sorted(children, key = lambda x: -offsets[id(x)])
children = sorted(children, key=lambda x: -offsets[id(x)])
while True:
offset = current + guess
buf = []
@ -448,13 +448,10 @@ def words_to_binary(words):
return words_to_whatever(words, bytearray)
def parse_gperf(infile):
"""Parses gperf file and extract strings and return code"""
def parse_psl2c(infile):
"""Parses file generated by psl2c and extract strings and return code"""
lines = [line.strip() for line in infile]
# Extract strings after the first '%%' and before the second '%%'.
#begin = lines.index('%%') + 1
#end = lines.index('%%', begin)
#lines = lines[begin:end]
for line in lines:
if line[-3:-1] != ', ':
raise InputError('Expected "domainname, <digit>", found "%s"' % line)
@ -463,27 +460,126 @@ def parse_gperf(infile):
if line[-1] not in '0123456789ABCDEF':
raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' %
line[-1])
# with open("gperf.out", 'w') as outfile:
# for line in lines:
# outfile.write(line[:-3] + line[-1] + "\n")
return [line[:-3] + line[-1] for line in lines]
def parse_psl(infile):
    """Parse a Public Suffix List file into DAFSA input words.

    Args:
      infile: iterable of lines in PSL syntax (for example an open text file).

    Returns:
      A list with one string per retained rule: the IDNA-encoded domain
      followed by a single uppercase hex digit holding the low four flag bits
      (exception, wildcard, ICANN section, PRIVATE section).

    Blank lines and '//' comments are skipped; comments carrying a
    BEGIN/END section marker switch the section flag applied to the rules
    that follow.  Plain rules without a dot are dropped.  Only the first
    occurrence of a domain is kept; later duplicates are reported on stdout
    and ignored.
    """
    PSL_FLAG_EXCEPTION = 1 << 0
    PSL_FLAG_WILDCARD = 1 << 1
    PSL_FLAG_ICANN = 1 << 2    # entry of ICANN section
    PSL_FLAG_PRIVATE = 1 << 3  # entry of PRIVATE section
    PSL_FLAG_PLAIN = 1 << 4    # only used for PSL syntax checking; masked out on return

    psl = {}
    section = 0

    for line in infile:
        line = line.strip()
        if not line:
            continue

        if line.startswith('//'):
            # Comments are skipped, but section markers inside them are honoured.
            if section == 0:
                if '===BEGIN ICANN DOMAINS===' in line:
                    section = PSL_FLAG_ICANN
                elif '===BEGIN PRIVATE DOMAINS===' in line:
                    section = PSL_FLAG_PRIVATE
            elif section == PSL_FLAG_ICANN and '===END ICANN DOMAINS===' in line:
                section = 0
            elif section == PSL_FLAG_PRIVATE and '===END PRIVATE DOMAINS===' in line:
                section = 0
            continue

        if line[0] == '!':
            flags = PSL_FLAG_EXCEPTION | section
            line = line[1:]
        elif line[0] == '*':
            # Slice instead of index so a lone '*' is reported, not an IndexError.
            if line[1:2] != '.':
                print('Unsupported kind of rule (ignored): %s' % line)
                continue
            flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
            line = line[2:]
        else:
            if '.' not in line:
                # No explicit plain TLD rule is needed; it is already covered
                # by the implicit '*' rule.
                continue
            flags = PSL_FLAG_PLAIN | section

        # Convert to the ASCII (IDNA) form.  Written so that the result is the
        # native `str` type on both Python 2 and Python 3.
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        domain = line.encode('idna')
        if not isinstance(domain, str):
            domain = domain.decode('ascii')

        if domain in psl:
            # Found existing entry:
            #   Combination of exception and plain rule is ambiguous
            #     !foo.bar
            #     foo.bar
            #
            #   Allowed:
            #     !foo.bar + *.foo.bar
            #     foo.bar + *.foo.bar
            #
            # NOTE(review): the flags of the later rule are discarded here, not
            # merged into the existing entry -- confirm this is intended.
            print('Found %s/%X (now %X)' % (domain, psl[domain], flags))
            continue

        psl[domain] = flags

    return ['%s%X' % (domain, flags & 0x0F) for (domain, flags) in psl.items()]
def usage():
    """Print the command-line synopsis to stdout and exit with status 1.

    Raises:
      SystemExit: always, with exit code 1.
    """
    print('usage: %s [options] infile outfile' % sys.argv[0])
    print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
    print(' --input-format=psl infile is a Public Suffix List file')
    print(' --output-format=cxx Write DAFSA as C/C++ code')
    print(' --output-format=binary Write DAFSA binary data')
    # sys.exit() rather than the site-provided exit() helper, which is meant
    # for the interactive prompt and is missing when run with `python -S`.
    sys.exit(1)
def main():
    """Convert a PSL or psl2c-generated file into a C/C++ or binary DAFSA file.

    Command line: ``[options] infile outfile``.  The last two arguments are
    always the input and output file; an input file of '-' reads from stdin.
    Every argument before them must be ``--input-format=psl2c|psl`` or
    ``--output-format=cxx|binary`` (values are case-insensitive).  Without
    options the input is parsed as psl2c output and C/C++ code is written.

    Returns:
      0 on success, 1 when an option names an unknown format.  Too few
      arguments or an unrecognised option call usage(), which exits.
    """
    if len(sys.argv) < 3:
        usage()

    converter = words_to_cxx
    parser = parse_psl2c

    input_opt = '--input-format='
    output_opt = '--output-format='

    for arg in sys.argv[1:-2]:
        if arg.startswith(input_opt):
            value = arg[len(input_opt):].lower()
            if value == 'psl':
                parser = parse_psl
            elif value == 'psl2c':
                parser = parse_psl2c
            else:
                print("Unknown input format '%s'" % value)
                return 1
        elif arg.startswith(output_opt):
            value = arg[len(output_opt):].lower()
            if value == 'binary':
                converter = words_to_binary
            elif value == 'cxx':
                converter = words_to_cxx
            else:
                print("Unknown output format '%s'" % value)
                return 1
        else:
            usage()

    # The binary converter produces raw bytes, so the output file must not be
    # opened in text mode (newline translation / str-only writes).
    out_mode = 'wb' if converter is words_to_binary else 'w'

    if sys.argv[-2] == '-':
        with open(sys.argv[-1], out_mode) as outfile:
            outfile.write(converter(parser(sys.stdin)))
    else:
        with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], out_mode) as outfile:
            outfile.write(converter(parser(infile)))
    return 0

View File

@ -1170,7 +1170,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
if ((index = _vector_find(psl->suffixes, &suffix)) >= 0) {
/* Found existing entry:
* Combination of exception and plain rule is ambigous
* Combination of exception and plain rule is ambiguous
* !foo.bar
* foo.bar
*