Put both punycode and UTF-8 rules into the DAFSA in UTF-8 mode
This commit is contained in:
parent
e30e77ef12
commit
3211a66f00
|
@ -481,7 +481,7 @@ def encode(dafsa, utf_mode):
|
|||
|
||||
def to_cxx(data):
|
||||
"""Generates C++ code from a list of encoded bytes."""
|
||||
text = '/* This file is generated. DO NOT EDIT!\n\n'
|
||||
text = '/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
|
||||
text += 'The byte array encodes effective tld names. See psl-make-dafsa source for'
|
||||
text += ' documentation.'
|
||||
text += '*/\n\n'
|
||||
|
@ -512,7 +512,7 @@ def words_to_binary(words, utf_mode):
|
|||
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray, utf_mode)
|
||||
|
||||
|
||||
def parse_psl2c(infile):
|
||||
def parse_psl2c(infile, utf_mode):
|
||||
"""Parses file generated by psl2c and extract strings and return code"""
|
||||
lines = [line.strip() for line in infile]
|
||||
|
||||
|
@ -531,7 +531,7 @@ def parse_psl2c(infile):
|
|||
return [line[:-3] + line[-1] for line in sorted(lines)]
|
||||
|
||||
|
||||
def parse_psl(infile):
|
||||
def parse_psl(infile, utf_mode):
|
||||
"""Parses PSL file and extract strings and return code"""
|
||||
PSL_FLAG_EXCEPTION = (1<<0)
|
||||
PSL_FLAG_WILDCARD = (1<<1)
|
||||
|
@ -573,9 +573,9 @@ def parse_psl(infile):
|
|||
continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
|
||||
flags = PSL_FLAG_PLAIN | section
|
||||
|
||||
line = line.decode('utf-8').encode("idna")
|
||||
punycode = line.decode('utf-8').encode("idna")
|
||||
|
||||
if line in psl:
|
||||
if punycode in psl:
|
||||
"""Found existing entry:
|
||||
Combination of exception and plain rule is ambiguous
|
||||
!foo.bar
|
||||
|
@ -585,10 +585,12 @@ def parse_psl(infile):
|
|||
!foo.bar + *.foo.bar
|
||||
foo.bar + *.foo.bar
|
||||
"""
|
||||
print('Found %s/%X (now %X)' % line, psl[line], flags)
|
||||
print('Found %s/%X (now %X)' % punycode, psl[punycode], flags)
|
||||
continue
|
||||
|
||||
if utf_mode:
|
||||
psl[line] = flags
|
||||
psl[punycode] = flags
|
||||
|
||||
# with open("psl.out", 'w') as outfile:
|
||||
# for (domain, flags) in sorted(psl.iteritems()):
|
||||
|
@ -602,10 +604,10 @@ def usage():
|
|||
print('usage: %s [options] infile outfile' % sys.argv[0])
|
||||
print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
|
||||
print(' --input-format=psl infile is a Public Suffix List file')
|
||||
print(' --output-format=cxx Write DAFSA as C/C++ code')
|
||||
print(' --output-format=cxx Write DAFSA as C/C++ code (default)')
|
||||
print(' --output-format=binary Write DAFSA binary data')
|
||||
print(' --encoding=ascii 7-bit ASCII mode (default)')
|
||||
print(' --encoding=utf-8 UTF-8 mode')
|
||||
print(' --encoding=ascii 7-bit ASCII mode')
|
||||
print(' --encoding=utf-8 UTF-8 mode (default)')
|
||||
exit(1)
|
||||
|
||||
|
||||
|
@ -616,7 +618,7 @@ def main():
|
|||
|
||||
converter = words_to_cxx
|
||||
parser = parse_psl2c
|
||||
utf_mode = False
|
||||
utf_mode = True
|
||||
|
||||
for arg in sys.argv[1:-2]:
|
||||
if arg.startswith('--input-format='):
|
||||
|
@ -634,6 +636,9 @@ def main():
|
|||
converter = words_to_binary
|
||||
elif value == 'cxx':
|
||||
converter = words_to_cxx
|
||||
else:
|
||||
print("Unknown output format '%s'" % value)
|
||||
return 1
|
||||
elif arg.startswith('--encoding='):
|
||||
value = arg[11:].lower()
|
||||
if value == 'ascii':
|
||||
|
@ -648,10 +653,10 @@ def main():
|
|||
|
||||
if sys.argv[-2] == '-':
|
||||
with open(sys.argv[-1], 'w') as outfile:
|
||||
outfile.write(converter(parser(sys.stdin), utf_mode))
|
||||
outfile.write(converter(parser(sys.stdin, utf_mode), utf_mode))
|
||||
else:
|
||||
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
|
||||
outfile.write(converter(parser(infile), utf_mode))
|
||||
outfile.write(converter(parser(infile, utf_mode), utf_mode))
|
||||
|
||||
return 0
|
||||
|
||||
|
|
Loading…
Reference in New Issue