From 8c2bcd5a24496cb7e67e3f6d0bd6126b4563097b Mon Sep 17 00:00:00 2001 From: Olle Liljenzin Date: Fri, 4 Nov 2016 19:43:36 +0100 Subject: [PATCH] Added version info into generated DAFSA. psl-make-dafsa got a mode switch so that the old version can be generated for testing. --- src/psl-make-dafsa | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/src/psl-make-dafsa b/src/psl-make-dafsa index bd9a79a..aa9d451 100755 --- a/src/psl-make-dafsa +++ b/src/psl-make-dafsa @@ -85,8 +85,13 @@ The generated byte array can described by the following BNF: | | - ::= - | + ::= + | + + ::= # The DAFSA was generated in ASCII mode. + | < byte value 0x01 > # The DAFSA was generated in UTF-8 mode. + + ::= Decoding: @@ -238,7 +243,7 @@ char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF -def to_dafsa(words): +def to_dafsa(words, utf_mode): """Generates a DAFSA from a word list and returns the source node. Each word is split into characters so that each character is represented by @@ -262,6 +267,8 @@ def to_dafsa(words): return word[0], [to_nodes(word[1:], 0)] elif char_length > 1: # Leading byte in multibyte sequence. + if not utf_mode: + raise InputError('UTF-8 encoded characters are not allowed in ASCII mode') if len(word) <= char_length: raise InputError('Unterminated UTF-8 multibyte sequence') return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])] @@ -451,7 +458,7 @@ def encode_label(label): return buf -def encode(dafsa): +def encode(dafsa, utf_mode): """Encodes a DAFSA to a list of bytes""" output = [] offsets = {} @@ -467,6 +474,8 @@ def encode(dafsa): output.extend(encode_links(dafsa, offsets, len(output))) output.reverse() + if utf_mode: + output.append(0x01) return output @@ -485,22 +494,22 @@ def to_cxx(data): return text -def words_to_whatever(words, converter): +def words_to_whatever(words, converter, utf_mode): """Generates C++ code from a word list""" - dafsa = to_dafsa(words) + dafsa = to_dafsa(words, utf_mode) for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels): dafsa = fun(dafsa) - return converter(encode(dafsa)) + return converter(encode(dafsa, utf_mode)) -def words_to_cxx(words): +def words_to_cxx(words, utf_mode): """Generates C++ code from a word list""" - return words_to_whatever(words, to_cxx) + return words_to_whatever(words, to_cxx, utf_mode) -def words_to_binary(words): +def words_to_binary(words, utf_mode): """Generates C++ code from a word list""" - return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray) + return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray, utf_mode) def parse_psl2c(infile): @@ -595,6 +604,8 @@ def usage(): print(' --input-format=psl infile is a Public Suffix List file') print(' --output-format=cxx Write DAFSA as C/C++ code') print(' --output-format=binary Write DAFSA binary data') + print(' --encoding=ascii 7-bit ASCII mode (default)') + print(' --encoding=utf-8 UTF-8 mode') exit(1) @@ -605,6 +616,7 @@ def main(): converter = words_to_cxx parser = parse_psl2c + utf_mode = False for arg in sys.argv[1:-2]: if arg.startswith('--input-format='): @@ -622,18 +634,24 @@ def main(): converter = words_to_binary elif value == 'cxx': converter = words_to_cxx + elif arg.startswith('--encoding='): + value = arg[11:].lower() + if value == 'ascii': + utf_mode = False + elif value == 'utf-8': + utf_mode = True else: - print("Unknown output format '%s'" % value) + print("Unknown encoding '%s'" % value) return 1 else: usage() if sys.argv[-2] == '-': with open(sys.argv[-1], 'w') as outfile: - outfile.write(converter(parser(sys.stdin))) + outfile.write(converter(parser(sys.stdin), utf_mode)) else: with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile: - outfile.write(converter(parser(infile))) + outfile.write(converter(parser(infile), utf_mode)) return 0