Put punycode + UTF-8 rules into DAFSA in utf-8 mode
This commit is contained in:
parent
e30e77ef12
commit
3211a66f00
|
@ -481,7 +481,7 @@ def encode(dafsa, utf_mode):
|
||||||
|
|
||||||
def to_cxx(data):
|
def to_cxx(data):
|
||||||
"""Generates C++ code from a list of encoded bytes."""
|
"""Generates C++ code from a list of encoded bytes."""
|
||||||
text = '/* This file is generated. DO NOT EDIT!\n\n'
|
text = '/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
|
||||||
text += 'The byte array encodes effective tld names. See psl-make-dafsa source for'
|
text += 'The byte array encodes effective tld names. See psl-make-dafsa source for'
|
||||||
text += ' documentation.'
|
text += ' documentation.'
|
||||||
text += '*/\n\n'
|
text += '*/\n\n'
|
||||||
|
@ -512,7 +512,7 @@ def words_to_binary(words, utf_mode):
|
||||||
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray, utf_mode)
|
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray, utf_mode)
|
||||||
|
|
||||||
|
|
||||||
def parse_psl2c(infile):
|
def parse_psl2c(infile, utf_mode):
|
||||||
"""Parses file generated by psl2c and extract strings and return code"""
|
"""Parses file generated by psl2c and extract strings and return code"""
|
||||||
lines = [line.strip() for line in infile]
|
lines = [line.strip() for line in infile]
|
||||||
|
|
||||||
|
@ -531,7 +531,7 @@ def parse_psl2c(infile):
|
||||||
return [line[:-3] + line[-1] for line in sorted(lines)]
|
return [line[:-3] + line[-1] for line in sorted(lines)]
|
||||||
|
|
||||||
|
|
||||||
def parse_psl(infile):
|
def parse_psl(infile, utf_mode):
|
||||||
"""Parses PSL file and extract strings and return code"""
|
"""Parses PSL file and extract strings and return code"""
|
||||||
PSL_FLAG_EXCEPTION = (1<<0)
|
PSL_FLAG_EXCEPTION = (1<<0)
|
||||||
PSL_FLAG_WILDCARD = (1<<1)
|
PSL_FLAG_WILDCARD = (1<<1)
|
||||||
|
@ -573,9 +573,9 @@ def parse_psl(infile):
|
||||||
continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
|
continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
|
||||||
flags = PSL_FLAG_PLAIN | section
|
flags = PSL_FLAG_PLAIN | section
|
||||||
|
|
||||||
line = line.decode('utf-8').encode("idna")
|
punycode = line.decode('utf-8').encode("idna")
|
||||||
|
|
||||||
if line in psl:
|
if punycode in psl:
|
||||||
"""Found existing entry:
|
"""Found existing entry:
|
||||||
Combination of exception and plain rule is ambiguous
|
Combination of exception and plain rule is ambiguous
|
||||||
!foo.bar
|
!foo.bar
|
||||||
|
@ -585,10 +585,12 @@ def parse_psl(infile):
|
||||||
!foo.bar + *.foo.bar
|
!foo.bar + *.foo.bar
|
||||||
foo.bar + *.foo.bar
|
foo.bar + *.foo.bar
|
||||||
"""
|
"""
|
||||||
print('Found %s/%X (now %X)' % (line, psl[line], flags))
|
print('Found %s/%X (now %X)' % (punycode, psl[punycode], flags))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if utf_mode:
|
||||||
psl[line] = flags
|
psl[line] = flags
|
||||||
|
psl[punycode] = flags
|
||||||
|
|
||||||
# with open("psl.out", 'w') as outfile:
|
# with open("psl.out", 'w') as outfile:
|
||||||
# for (domain, flags) in sorted(psl.iteritems()):
|
# for (domain, flags) in sorted(psl.iteritems()):
|
||||||
|
@ -602,10 +604,10 @@ def usage():
|
||||||
print('usage: %s [options] infile outfile' % sys.argv[0])
|
print('usage: %s [options] infile outfile' % sys.argv[0])
|
||||||
print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
|
print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
|
||||||
print(' --input-format=psl infile is a Public Suffix List file')
|
print(' --input-format=psl infile is a Public Suffix List file')
|
||||||
print(' --output-format=cxx Write DAFSA as C/C++ code')
|
print(' --output-format=cxx Write DAFSA as C/C++ code (default)')
|
||||||
print(' --output-format=binary Write DAFSA binary data')
|
print(' --output-format=binary Write DAFSA binary data')
|
||||||
print(' --encoding=ascii 7-bit ASCII mode (default)')
|
print(' --encoding=ascii 7-bit ASCII mode')
|
||||||
print(' --encoding=utf-8 UTF-8 mode')
|
print(' --encoding=utf-8 UTF-8 mode (default)')
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
@ -616,7 +618,7 @@ def main():
|
||||||
|
|
||||||
converter = words_to_cxx
|
converter = words_to_cxx
|
||||||
parser = parse_psl2c
|
parser = parse_psl2c
|
||||||
utf_mode = False
|
utf_mode = True
|
||||||
|
|
||||||
for arg in sys.argv[1:-2]:
|
for arg in sys.argv[1:-2]:
|
||||||
if arg.startswith('--input-format='):
|
if arg.startswith('--input-format='):
|
||||||
|
@ -634,6 +636,9 @@ def main():
|
||||||
converter = words_to_binary
|
converter = words_to_binary
|
||||||
elif value == 'cxx':
|
elif value == 'cxx':
|
||||||
converter = words_to_cxx
|
converter = words_to_cxx
|
||||||
|
else:
|
||||||
|
print("Unknown output format '%s'" % value)
|
||||||
|
return 1
|
||||||
elif arg.startswith('--encoding='):
|
elif arg.startswith('--encoding='):
|
||||||
value = arg[11:].lower()
|
value = arg[11:].lower()
|
||||||
if value == 'ascii':
|
if value == 'ascii':
|
||||||
|
@ -648,10 +653,10 @@ def main():
|
||||||
|
|
||||||
if sys.argv[-2] == '-':
|
if sys.argv[-2] == '-':
|
||||||
with open(sys.argv[-1], 'w') as outfile:
|
with open(sys.argv[-1], 'w') as outfile:
|
||||||
outfile.write(converter(parser(sys.stdin), utf_mode))
|
outfile.write(converter(parser(sys.stdin, utf_mode), utf_mode))
|
||||||
else:
|
else:
|
||||||
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
|
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
|
||||||
outfile.write(converter(parser(infile), utf_mode))
|
outfile.write(converter(parser(infile, utf_mode), utf_mode))
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue