Added version info into generated DAFSA.

psl-make-dafsa got a mode switch so that the old version can be
generated for testing.
This commit is contained in:
Olle Liljenzin 2016-11-04 19:43:36 +01:00 committed by Tim Rühsen
parent e03953e27a
commit 8c2bcd5a24
1 changed files with 32 additions and 14 deletions

View File

@ -85,8 +85,13 @@ The generated byte array can be described by the following BNF:
| <prefix> <node>
| <end_label>
<dafsa> ::= <source>
| <dafsa> <node>
<graph> ::= <source>
| <graph> <node>
<version> ::= <empty> # The DAFSA was generated in ASCII mode.
| < byte value 0x01 > # The DAFSA was generated in UTF-8 mode.
<dafsa> ::= <graph> <version>
Decoding:
@ -238,7 +243,7 @@ char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
def to_dafsa(words):
def to_dafsa(words, utf_mode):
"""Generates a DAFSA from a word list and returns the source node.
Each word is split into characters so that each character is represented by
@ -262,6 +267,8 @@ def to_dafsa(words):
return word[0], [to_nodes(word[1:], 0)]
elif char_length > 1:
# Leading byte in multibyte sequence.
if not utf_mode:
raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
if len(word) <= char_length:
raise InputError('Unterminated UTF-8 multibyte sequence')
return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
@ -451,7 +458,7 @@ def encode_label(label):
return buf
def encode(dafsa):
def encode(dafsa, utf_mode):
"""Encodes a DAFSA to a list of bytes"""
output = []
offsets = {}
@ -467,6 +474,8 @@ def encode(dafsa):
output.extend(encode_links(dafsa, offsets, len(output)))
output.reverse()
if utf_mode:
output.append(0x01)
return output
@ -485,22 +494,22 @@ def to_cxx(data):
return text
def words_to_whatever(words, converter):
def words_to_whatever(words, converter, utf_mode):
"""Generates a DAFSA from a word list and returns the encoded bytes passed through converter"""
dafsa = to_dafsa(words)
dafsa = to_dafsa(words, utf_mode)
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
dafsa = fun(dafsa)
return converter(encode(dafsa))
return converter(encode(dafsa, utf_mode))
def words_to_cxx(words):
def words_to_cxx(words, utf_mode):
"""Generates C++ code from a word list"""
return words_to_whatever(words, to_cxx)
return words_to_whatever(words, to_cxx, utf_mode)
def words_to_binary(words):
def words_to_binary(words, utf_mode):
"""Generates binary DAFSA data, prefixed with the '.DAFSA@PSL_0' header, from a word list"""
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray)
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray, utf_mode)
def parse_psl2c(infile):
@ -595,6 +604,8 @@ def usage():
print(' --input-format=psl infile is a Public Suffix List file')
print(' --output-format=cxx Write DAFSA as C/C++ code')
print(' --output-format=binary Write DAFSA binary data')
print(' --encoding=ascii 7-bit ASCII mode (default)')
print(' --encoding=utf-8 UTF-8 mode')
exit(1)
@ -605,6 +616,7 @@ def main():
converter = words_to_cxx
parser = parse_psl2c
utf_mode = False
for arg in sys.argv[1:-2]:
if arg.startswith('--input-format='):
@ -622,18 +634,24 @@ def main():
converter = words_to_binary
elif value == 'cxx':
converter = words_to_cxx
elif arg.startswith('--encoding='):
value = arg[11:].lower()
if value == 'ascii':
utf_mode = False
elif value == 'utf-8':
utf_mode = True
else:
print("Unknown output format '%s'" % value)
print("Unknown encoding '%s'" % value)
return 1
else:
usage()
if sys.argv[-2] == '-':
with open(sys.argv[-1], 'w') as outfile:
outfile.write(converter(parser(sys.stdin)))
outfile.write(converter(parser(sys.stdin), utf_mode))
else:
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
outfile.write(converter(parser(infile)))
outfile.write(converter(parser(infile), utf_mode))
return 0