Added version info into generated DAFSA.
psl-make-dafsa got a mode switch so that the old version can be generated for testing.
This commit is contained in:
parent
e03953e27a
commit
8c2bcd5a24
|
@ -85,8 +85,13 @@ The generated byte array can be described by the following BNF:
|
|||
| <prefix> <node>
|
||||
| <end_label>
|
||||
|
||||
<dafsa> ::= <source>
|
||||
| <dafsa> <node>
|
||||
<graph> ::= <source>
|
||||
| <graph> <node>
|
||||
|
||||
<version> ::= <empty> # The DAFSA was generated in ASCII mode.
|
||||
| < byte value 0x01 > # The DAFSA was generated in UTF-8 mode.
|
||||
|
||||
<dafsa> ::= <graph> <version>
|
||||
|
||||
Decoding:
|
||||
|
||||
|
@ -238,7 +243,7 @@ char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x
|
|||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
|
||||
|
||||
def to_dafsa(words):
|
||||
def to_dafsa(words, utf_mode):
|
||||
"""Generates a DAFSA from a word list and returns the source node.
|
||||
|
||||
Each word is split into characters so that each character is represented by
|
||||
|
@ -262,6 +267,8 @@ def to_dafsa(words):
|
|||
return word[0], [to_nodes(word[1:], 0)]
|
||||
elif char_length > 1:
|
||||
# Leading byte in multibyte sequence.
|
||||
if not utf_mode:
|
||||
raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
|
||||
if len(word) <= char_length:
|
||||
raise InputError('Unterminated UTF-8 multibyte sequence')
|
||||
return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
|
||||
|
@ -451,7 +458,7 @@ def encode_label(label):
|
|||
return buf
|
||||
|
||||
|
||||
def encode(dafsa):
|
||||
def encode(dafsa, utf_mode):
|
||||
"""Encodes a DAFSA to a list of bytes"""
|
||||
output = []
|
||||
offsets = {}
|
||||
|
@ -467,6 +474,8 @@ def encode(dafsa):
|
|||
|
||||
output.extend(encode_links(dafsa, offsets, len(output)))
|
||||
output.reverse()
|
||||
if utf_mode:
|
||||
output.append(0x01)
|
||||
return output
|
||||
|
||||
|
||||
|
@ -485,22 +494,22 @@ def to_cxx(data):
|
|||
return text
|
||||
|
||||
|
||||
def words_to_whatever(words, converter):
|
||||
def words_to_whatever(words, converter, utf_mode):
|
||||
"""Generates C++ code from a word list"""
|
||||
dafsa = to_dafsa(words)
|
||||
dafsa = to_dafsa(words, utf_mode)
|
||||
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
|
||||
dafsa = fun(dafsa)
|
||||
return converter(encode(dafsa))
|
||||
return converter(encode(dafsa, utf_mode))
|
||||
|
||||
|
||||
def words_to_cxx(words):
|
||||
def words_to_cxx(words, utf_mode):
|
||||
"""Generates C++ code from a word list"""
|
||||
return words_to_whatever(words, to_cxx)
|
||||
return words_to_whatever(words, to_cxx, utf_mode)
|
||||
|
||||
|
||||
def words_to_binary(words):
|
||||
def words_to_binary(words, utf_mode):
|
||||
"""Generates binary DAFSA data (with file header) from a word list"""
|
||||
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray)
|
||||
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray, utf_mode)
|
||||
|
||||
|
||||
def parse_psl2c(infile):
|
||||
|
@ -595,6 +604,8 @@ def usage():
|
|||
print(' --input-format=psl infile is a Public Suffix List file')
|
||||
print(' --output-format=cxx Write DAFSA as C/C++ code')
|
||||
print(' --output-format=binary Write DAFSA binary data')
|
||||
print(' --encoding=ascii 7-bit ASCII mode (default)')
|
||||
print(' --encoding=utf-8 UTF-8 mode')
|
||||
exit(1)
|
||||
|
||||
|
||||
|
@ -605,6 +616,7 @@ def main():
|
|||
|
||||
converter = words_to_cxx
|
||||
parser = parse_psl2c
|
||||
utf_mode = False
|
||||
|
||||
for arg in sys.argv[1:-2]:
|
||||
if arg.startswith('--input-format='):
|
||||
|
@ -622,18 +634,24 @@ def main():
|
|||
converter = words_to_binary
|
||||
elif value == 'cxx':
|
||||
converter = words_to_cxx
|
||||
elif arg.startswith('--encoding='):
|
||||
value = arg[11:].lower()
|
||||
if value == 'ascii':
|
||||
utf_mode = False
|
||||
elif value == 'utf-8':
|
||||
utf_mode = True
|
||||
else:
|
||||
print("Unknown output format '%s'" % value)
|
||||
print("Unknown encoding '%s'" % value)
|
||||
return 1
|
||||
else:
|
||||
usage()
|
||||
|
||||
if sys.argv[-2] == '-':
|
||||
with open(sys.argv[-1], 'w') as outfile:
|
||||
outfile.write(converter(parser(sys.stdin)))
|
||||
outfile.write(converter(parser(sys.stdin), utf_mode))
|
||||
else:
|
||||
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
|
||||
outfile.write(converter(parser(infile)))
|
||||
outfile.write(converter(parser(infile), utf_mode))
|
||||
|
||||
return 0
|
||||
|
||||
|
|
Loading…
Reference in New Issue