Added version info into generated DAFSA.
psl-make-dafsa got a mode switch so that the old version can be generated for testing.
This commit is contained in:
parent
e03953e27a
commit
8c2bcd5a24
|
@ -85,8 +85,13 @@ The generated byte array can be described by the following BNF:
|
||||||
| <prefix> <node>
|
| <prefix> <node>
|
||||||
| <end_label>
|
| <end_label>
|
||||||
|
|
||||||
<dafsa> ::= <source>
|
<graph> ::= <source>
|
||||||
| <dafsa> <node>
|
| <graph> <node>
|
||||||
|
|
||||||
|
<version> ::= <empty> # The DAFSA was generated in ASCII mode.
|
||||||
|
| < byte value 0x01 > # The DAFSA was generated in UTF-8 mode.
|
||||||
|
|
||||||
|
<dafsa> ::= <graph> <version>
|
||||||
|
|
||||||
Decoding:
|
Decoding:
|
||||||
|
|
||||||
|
@ -238,7 +243,7 @@ char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x
|
||||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
|
||||||
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
|
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
|
||||||
|
|
||||||
def to_dafsa(words):
|
def to_dafsa(words, utf_mode):
|
||||||
"""Generates a DAFSA from a word list and returns the source node.
|
"""Generates a DAFSA from a word list and returns the source node.
|
||||||
|
|
||||||
Each word is split into characters so that each character is represented by
|
Each word is split into characters so that each character is represented by
|
||||||
|
@ -262,6 +267,8 @@ def to_dafsa(words):
|
||||||
return word[0], [to_nodes(word[1:], 0)]
|
return word[0], [to_nodes(word[1:], 0)]
|
||||||
elif char_length > 1:
|
elif char_length > 1:
|
||||||
# Leading byte in multibyte sequence.
|
# Leading byte in multibyte sequence.
|
||||||
|
if not utf_mode:
|
||||||
|
raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
|
||||||
if len(word) <= char_length:
|
if len(word) <= char_length:
|
||||||
raise InputError('Unterminated UTF-8 multibyte sequence')
|
raise InputError('Unterminated UTF-8 multibyte sequence')
|
||||||
return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
|
return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
|
||||||
|
@ -451,7 +458,7 @@ def encode_label(label):
|
||||||
return buf
|
return buf
|
||||||
|
|
||||||
|
|
||||||
def encode(dafsa):
|
def encode(dafsa, utf_mode):
|
||||||
"""Encodes a DAFSA to a list of bytes"""
|
"""Encodes a DAFSA to a list of bytes"""
|
||||||
output = []
|
output = []
|
||||||
offsets = {}
|
offsets = {}
|
||||||
|
@ -467,6 +474,8 @@ def encode(dafsa):
|
||||||
|
|
||||||
output.extend(encode_links(dafsa, offsets, len(output)))
|
output.extend(encode_links(dafsa, offsets, len(output)))
|
||||||
output.reverse()
|
output.reverse()
|
||||||
|
if utf_mode:
|
||||||
|
output.append(0x01)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
@ -485,22 +494,22 @@ def to_cxx(data):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def words_to_whatever(words, converter):
|
def words_to_whatever(words, converter, utf_mode):
|
||||||
"""Generates C++ code from a word list"""
|
"""Generates C++ code from a word list"""
|
||||||
dafsa = to_dafsa(words)
|
dafsa = to_dafsa(words, utf_mode)
|
||||||
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
|
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
|
||||||
dafsa = fun(dafsa)
|
dafsa = fun(dafsa)
|
||||||
return converter(encode(dafsa))
|
return converter(encode(dafsa, utf_mode))
|
||||||
|
|
||||||
|
|
||||||
def words_to_cxx(words):
|
def words_to_cxx(words, utf_mode):
|
||||||
"""Generates C++ code from a word list"""
|
"""Generates C++ code from a word list"""
|
||||||
return words_to_whatever(words, to_cxx)
|
return words_to_whatever(words, to_cxx, utf_mode)
|
||||||
|
|
||||||
|
|
||||||
def words_to_binary(words):
|
def words_to_binary(words, utf_mode):
|
||||||
"""Generates C++ code from a word list"""
|
"""Generates C++ code from a word list"""
|
||||||
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray)
|
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray, utf_mode)
|
||||||
|
|
||||||
|
|
||||||
def parse_psl2c(infile):
|
def parse_psl2c(infile):
|
||||||
|
@ -595,6 +604,8 @@ def usage():
|
||||||
print(' --input-format=psl infile is a Public Suffix List file')
|
print(' --input-format=psl infile is a Public Suffix List file')
|
||||||
print(' --output-format=cxx Write DAFSA as C/C++ code')
|
print(' --output-format=cxx Write DAFSA as C/C++ code')
|
||||||
print(' --output-format=binary Write DAFSA binary data')
|
print(' --output-format=binary Write DAFSA binary data')
|
||||||
|
print(' --encoding=ascii 7-bit ASCII mode (default)')
|
||||||
|
print(' --encoding=utf-8 UTF-8 mode')
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
@ -605,6 +616,7 @@ def main():
|
||||||
|
|
||||||
converter = words_to_cxx
|
converter = words_to_cxx
|
||||||
parser = parse_psl2c
|
parser = parse_psl2c
|
||||||
|
utf_mode = False
|
||||||
|
|
||||||
for arg in sys.argv[1:-2]:
|
for arg in sys.argv[1:-2]:
|
||||||
if arg.startswith('--input-format='):
|
if arg.startswith('--input-format='):
|
||||||
|
@ -622,18 +634,24 @@ def main():
|
||||||
converter = words_to_binary
|
converter = words_to_binary
|
||||||
elif value == 'cxx':
|
elif value == 'cxx':
|
||||||
converter = words_to_cxx
|
converter = words_to_cxx
|
||||||
|
elif arg.startswith('--encoding='):
|
||||||
|
value = arg[11:].lower()
|
||||||
|
if value == 'ascii':
|
||||||
|
utf_mode = False
|
||||||
|
elif value == 'utf-8':
|
||||||
|
utf_mode = True
|
||||||
else:
|
else:
|
||||||
print("Unknown output format '%s'" % value)
|
print("Unknown encoding '%s'" % value)
|
||||||
return 1
|
return 1
|
||||||
else:
|
else:
|
||||||
usage()
|
usage()
|
||||||
|
|
||||||
if sys.argv[-2] == '-':
|
if sys.argv[-2] == '-':
|
||||||
with open(sys.argv[-1], 'w') as outfile:
|
with open(sys.argv[-1], 'w') as outfile:
|
||||||
outfile.write(converter(parser(sys.stdin)))
|
outfile.write(converter(parser(sys.stdin), utf_mode))
|
||||||
else:
|
else:
|
||||||
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
|
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
|
||||||
outfile.write(converter(parser(infile)))
|
outfile.write(converter(parser(infile), utf_mode))
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue