Added version info to the generated DAFSA.

psl-make-dafsa gained an --encoding mode switch (ascii or utf-8) so that
the old, unversioned format can still be generated for testing.
This commit is contained in:
Olle Liljenzin 2016-11-04 19:43:36 +01:00 committed by Tim Rühsen
parent e03953e27a
commit 8c2bcd5a24
1 changed file with 32 additions and 14 deletions

View File

@ -85,8 +85,13 @@ The generated byte array can described by the following BNF:
| <prefix> <node> | <prefix> <node>
| <end_label> | <end_label>
<dafsa> ::= <source> <graph> ::= <graph>
| <dafsa> <node> | <graph> <node>
<version> ::= <empty> # The DAFSA was generated in ASCII mode.
| < byte value 0x01 > # The DAFSA was generated in UTF-8 mode.
<dafsa> ::= <graph> <version>
Decoding: Decoding:
@ -238,7 +243,7 @@ char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
def to_dafsa(words): def to_dafsa(words, utf_mode):
"""Generates a DAFSA from a word list and returns the source node. """Generates a DAFSA from a word list and returns the source node.
Each word is split into characters so that each character is represented by Each word is split into characters so that each character is represented by
@ -262,6 +267,8 @@ def to_dafsa(words):
return word[0], [to_nodes(word[1:], 0)] return word[0], [to_nodes(word[1:], 0)]
elif char_length > 1: elif char_length > 1:
# Leading byte in multibyte sequence. # Leading byte in multibyte sequence.
if not utf_mode:
raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
if len(word) <= char_length: if len(word) <= char_length:
raise InputError('Unterminated UTF-8 multibyte sequence') raise InputError('Unterminated UTF-8 multibyte sequence')
return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])] return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
@ -451,7 +458,7 @@ def encode_label(label):
return buf return buf
def encode(dafsa): def encode(dafsa, utf_mode):
"""Encodes a DAFSA to a list of bytes""" """Encodes a DAFSA to a list of bytes"""
output = [] output = []
offsets = {} offsets = {}
@ -467,6 +474,8 @@ def encode(dafsa):
output.extend(encode_links(dafsa, offsets, len(output))) output.extend(encode_links(dafsa, offsets, len(output)))
output.reverse() output.reverse()
if utf_mode:
output.append(0x01)
return output return output
@ -485,22 +494,22 @@ def to_cxx(data):
return text return text
def words_to_whatever(words, converter): def words_to_whatever(words, converter, utf_mode):
"""Generates C++ code from a word list""" """Generates C++ code from a word list"""
dafsa = to_dafsa(words) dafsa = to_dafsa(words, utf_mode)
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels): for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
dafsa = fun(dafsa) dafsa = fun(dafsa)
return converter(encode(dafsa)) return converter(encode(dafsa, utf_mode))
def words_to_cxx(words): def words_to_cxx(words, utf_mode):
"""Generates C++ code from a word list""" """Generates C++ code from a word list"""
return words_to_whatever(words, to_cxx) return words_to_whatever(words, to_cxx, utf_mode)
def words_to_binary(words): def words_to_binary(words, utf_mode):
"""Generates C++ code from a word list""" """Generates C++ code from a word list"""
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray) return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray, utf_mode)
def parse_psl2c(infile): def parse_psl2c(infile):
@ -595,6 +604,8 @@ def usage():
print(' --input-format=psl infile is a Public Suffix List file') print(' --input-format=psl infile is a Public Suffix List file')
print(' --output-format=cxx Write DAFSA as C/C++ code') print(' --output-format=cxx Write DAFSA as C/C++ code')
print(' --output-format=binary Write DAFSA binary data') print(' --output-format=binary Write DAFSA binary data')
print(' --encoding=ascii 7-bit ASCII mode (default)')
print(' --encoding=utf-8 UTF-8 mode')
exit(1) exit(1)
@ -605,6 +616,7 @@ def main():
converter = words_to_cxx converter = words_to_cxx
parser = parse_psl2c parser = parse_psl2c
utf_mode = False
for arg in sys.argv[1:-2]: for arg in sys.argv[1:-2]:
if arg.startswith('--input-format='): if arg.startswith('--input-format='):
@ -622,18 +634,24 @@ def main():
converter = words_to_binary converter = words_to_binary
elif value == 'cxx': elif value == 'cxx':
converter = words_to_cxx converter = words_to_cxx
elif arg.startswith('--encoding='):
value = arg[11:].lower()
if value == 'ascii':
utf_mode = False
elif value == 'utf-8':
utf_mode = True
else: else:
print("Unknown output format '%s'" % value) print("Unknown encoding '%s'" % value)
return 1 return 1
else: else:
usage() usage()
if sys.argv[-2] == '-': if sys.argv[-2] == '-':
with open(sys.argv[-1], 'w') as outfile: with open(sys.argv[-1], 'w') as outfile:
outfile.write(converter(parser(sys.stdin))) outfile.write(converter(parser(sys.stdin), utf_mode))
else: else:
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile: with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
outfile.write(converter(parser(infile))) outfile.write(converter(parser(infile), utf_mode))
return 0 return 0