Add --input-format and --output-format to make_dafsa.py
This commit is contained in:
parent
355edc152f
commit
c699e3c441
|
@ -206,14 +206,14 @@ def to_dafsa(words):
|
|||
"""
|
||||
if not words:
|
||||
raise InputError('The domain list must not be empty')
|
||||
def ToNodes(word):
|
||||
def to_nodes(word):
|
||||
"""Split words into characters"""
|
||||
if not 0x1F < ord(word[0]) < 0x80:
|
||||
raise InputError('Domain names must be printable 7-bit ASCII')
|
||||
if len(word) == 1:
|
||||
return chr(ord(word[0]) & 0x0F), [None]
|
||||
return word[0], [ToNodes(word[1:])]
|
||||
return [ToNodes(word) for word in words]
|
||||
return word[0], [to_nodes(word[1:])]
|
||||
return [to_nodes(word) for word in words]
|
||||
|
||||
|
||||
def to_words(node):
|
||||
|
@ -254,8 +254,8 @@ def join_labels(dafsa):
|
|||
"""Generates a new DAFSA where internal nodes are merged if there is a one to
|
||||
one connection.
|
||||
"""
|
||||
parentcount = { id(None): 2 }
|
||||
nodemap = { id(None): None }
|
||||
parentcount = {id(None): 2}
|
||||
nodemap = {id(None): None}
|
||||
|
||||
def count_parents(node):
|
||||
"""Count incoming references"""
|
||||
|
@ -286,7 +286,7 @@ def join_suffixes(dafsa):
|
|||
"""Generates a new DAFSA where nodes that represent the same word lists
|
||||
towards the sink are merged.
|
||||
"""
|
||||
nodemap = { frozenset(('',)): None }
|
||||
nodemap = {frozenset(('',)): None}
|
||||
|
||||
def join(node):
|
||||
"""Returns a macthing node. A new node is created if no matching node
|
||||
|
@ -343,7 +343,7 @@ def encode_links(children, offsets, current):
|
|||
return []
|
||||
guess = 3 * len(children)
|
||||
assert children
|
||||
children = sorted(children, key = lambda x: -offsets[id(x)])
|
||||
children = sorted(children, key=lambda x: -offsets[id(x)])
|
||||
while True:
|
||||
offset = current + guess
|
||||
buf = []
|
||||
|
@ -448,13 +448,10 @@ def words_to_binary(words):
|
|||
return words_to_whatever(words, bytearray)
|
||||
|
||||
|
||||
def parse_gperf(infile):
|
||||
"""Parses gperf file and extract strings and return code"""
|
||||
def parse_psl2c(infile):
|
||||
"""Parses file generated by psl2c and extract strings and return code"""
|
||||
lines = [line.strip() for line in infile]
|
||||
# Extract strings after the first '%%' and before the second '%%'.
|
||||
#begin = lines.index('%%') + 1
|
||||
#end = lines.index('%%', begin)
|
||||
#lines = lines[begin:end]
|
||||
|
||||
for line in lines:
|
||||
if line[-3:-1] != ', ':
|
||||
raise InputError('Expected "domainname, <digit>", found "%s"' % line)
|
||||
|
@ -463,27 +460,126 @@ def parse_gperf(infile):
|
|||
if line[-1] not in '0123456789ABCDEF':
|
||||
raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' %
|
||||
line[-1])
|
||||
|
||||
# with open("gperf.out", 'w') as outfile:
|
||||
# for line in lines:
|
||||
# outfile.write(line[:-3] + line[-1] + "\n")
|
||||
|
||||
return [line[:-3] + line[-1] for line in lines]
|
||||
|
||||
|
||||
def parse_psl(infile):
    """Parse a Public Suffix List file into DAFSA input words.

    Args:
        infile: iterable of text lines in Public Suffix List syntax
            (an open file, ``sys.stdin`` or a list of strings).

    Returns:
        A list of strings, one per accepted rule: the IDNA (ASCII) form of
        the domain followed by one upper-case hex digit holding the low
        four flag bits (exception, wildcard, ICANN section, PRIVATE
        section).

    Blank lines and ``//`` comments are skipped; the section marker
    comments switch the ICANN/PRIVATE flag on and off.  Plain rules without
    a dot (bare TLDs) are dropped because the implicit ``*`` rule already
    covers them.  A wildcard rule that is not of the form ``*.suffix`` is
    reported on stdout and ignored.  When a domain occurs more than once,
    the first occurrence wins and the later one is reported on stdout.
    """
    PSL_FLAG_EXCEPTION = (1 << 0)
    PSL_FLAG_WILDCARD = (1 << 1)
    PSL_FLAG_ICANN = (1 << 2)    # entry of ICANN section
    PSL_FLAG_PRIVATE = (1 << 3)  # entry of PRIVATE section
    PSL_FLAG_PLAIN = (1 << 4)    # just used for PSL syntax checking

    psl = {}
    section = 0

    for line in infile:
        line = line.strip()
        if not line:
            continue

        if line.startswith('//'):
            # Comments are skipped, but the section markers live in them.
            if section == 0:
                if '===BEGIN ICANN DOMAINS===' in line:
                    section = PSL_FLAG_ICANN
                elif '===BEGIN PRIVATE DOMAINS===' in line:
                    section = PSL_FLAG_PRIVATE
            elif (section == PSL_FLAG_ICANN and
                  '===END ICANN DOMAINS===' in line):
                section = 0
            elif (section == PSL_FLAG_PRIVATE and
                  '===END PRIVATE DOMAINS===' in line):
                section = 0
            continue

        if line[0] == '!':
            flags = PSL_FLAG_EXCEPTION | section
            line = line[1:]
        elif line[0] == '*':
            # Slice instead of indexing so that a lone '*' is reported as
            # unsupported rather than raising IndexError.
            if line[1:2] != '.':
                print('Unsupported kind of rule (ignored): %s' % line)
                continue
            flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
            line = line[2:]
        else:
            if '.' not in line:
                # No explicit plain TLD rule is needed; it is already
                # covered by the implicit '*' rule.
                continue
            flags = PSL_FLAG_PLAIN | section

        # Convert to the ASCII (IDNA) form.  Byte strings (Python 2 `str`)
        # are decoded from UTF-8 first; the result is always a native `str`.
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        line = str(line.encode('idna').decode('ascii'))

        if line in psl:
            # Found existing entry.
            # The combination of exception and plain rule is ambiguous:
            #     !foo.bar
            #     foo.bar
            # Allowed combinations would be:
            #     !foo.bar + *.foo.bar
            #     foo.bar + *.foo.bar
            # For now the first entry is kept and the later one is only
            # reported.
            print('Found %s/%X (now %X)' % (line, psl[line], flags))
            continue

        psl[line] = flags

    return [domain + '%X' % (flags & 0x0F) for (domain, flags) in psl.items()]
|
||||
|
||||
|
||||
def usage():
    """Print the command-line usage summary to stdout and exit.

    Never returns: the process is terminated with exit status 1 by raising
    SystemExit.
    """
    print('usage: %s [options] infile outfile' % sys.argv[0])
    print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
    print(' --input-format=psl infile is a Public Suffix List file')
    print(' --output-format=cxx Write DAFSA as C/C++ code')
    print(' --output-format=binary Write DAFSA binary data')
    # sys.exit() rather than the bare exit() helper: the latter is injected
    # by the `site` module for interactive use and may be absent.
    sys.exit(1)
|
||||
|
||||
|
||||
def main():
    """Convert a PSL (or psl2c-generated) file into a C/C++ or binary DAFSA.

    Command line: ``[options] infile outfile``.  The last two arguments are
    always the input and output file names; an input name of ``-`` reads
    from stdin.  Everything before them must be one of

        --input-format=psl2c|psl     (default: psl2c)
        --output-format=cxx|binary   (default: cxx)

    Returns:
        0 on success, 1 when an option carries an unknown value.  An
        unrecognised option or too few arguments prints the usage text via
        usage(), which exits the process.
    """
    if len(sys.argv) < 3:
        usage()

    converter = words_to_cxx
    parser = parse_psl2c

    input_opt = '--input-format='
    output_opt = '--output-format='

    for arg in sys.argv[1:-2]:
        if arg.startswith(input_opt):
            value = arg[len(input_opt):].lower()
            if value == 'psl':
                parser = parse_psl
            elif value == 'psl2c':
                parser = parse_psl2c
            else:
                print("Unknown input format '%s'" % value)
                return 1
        elif arg.startswith(output_opt):
            value = arg[len(output_opt):].lower()
            if value == 'binary':
                converter = words_to_binary
            elif value == 'cxx':
                converter = words_to_cxx
            else:
                print("Unknown output format '%s'" % value)
                return 1
        else:
            usage()

    # Parse and convert before touching the output file, so that a failing
    # conversion does not leave a truncated output file behind.
    if sys.argv[-2] == '-':
        data = converter(parser(sys.stdin))
    else:
        with open(sys.argv[-2], 'r') as infile:
            data = converter(parser(infile))

    # words_to_binary produces a bytearray; write it through a binary-mode
    # handle so no newline translation (or text-mode type error) can occur.
    outmode = 'wb' if converter is words_to_binary else 'w'
    with open(sys.argv[-1], outmode) as outfile:
        outfile.write(data)

    return 0
|
||||
|
||||
|
|
Loading…
Reference in New Issue