Added version info to the generated DAFSA.

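psl-make-dafsa gained an --encoding switch (ascii, the default, or utf-8). In UTF-8 mode a trailing version byte (0x01) is appended to the generated DAFSA; ASCII mode emits the old, unversioned format so that it can still be generated for testing.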
2016-11-04 19:43:36 +01:00 · 2016-11-04 19:43:36 +01:00 · 8c2bcd5a24
parent e03953e27a
commit 8c2bcd5a24
1 changed files with 32 additions and 14 deletions
--- a/src/psl-make-dafsa
+++ b/src/psl-make-dafsa
@ -85,8 +85,13 @@ The generated byte array can described by the following BNF:
         | <prefix> <node>
         | <end_label>

-<dafsa> ::= <source>
-          | <dafsa> <node>
+<graph> ::= <source>
+          | <graph> <node>
+
+<version> ::= <empty>            # The DAFSA was generated in ASCII mode.
+          | < byte value 0x01 >  # The DAFSA was generated in UTF-8 mode.
+
+<dafsa> ::= <graph> <version>

 Decoding:

@ -238,7 +243,7 @@ char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x00-0x
                      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  # 0xE0-0xEF
                      4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF

-def to_dafsa(words):
+def to_dafsa(words, utf_mode):
  """Generates a DAFSA from a word list and returns the source node.

  Each word is split into characters so that each character is represented by
@ -262,6 +267,8 @@ def to_dafsa(words):
      return word[0], [to_nodes(word[1:], 0)]
    elif char_length > 1:
      # Leading byte in multibyte sequence.
+      if not utf_mode:
+        raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
      if len(word) <= char_length:
        raise InputError('Unterminated UTF-8 multibyte sequence')
      return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
@ -451,7 +458,7 @@ def encode_label(label):
  return buf


-def encode(dafsa):
+def encode(dafsa, utf_mode):
  """Encodes a DAFSA to a list of bytes"""
  output = []
  offsets = {}
@ -467,6 +474,8 @@ def encode(dafsa):

  output.extend(encode_links(dafsa, offsets, len(output)))
  output.reverse()
+  if utf_mode:
+    output.append(0x01)
  return output


@ -485,22 +494,22 @@ def to_cxx(data):
  return text


-def words_to_whatever(words, converter):
+def words_to_whatever(words, converter, utf_mode):
  """Generates C++ code from a word list"""
-  dafsa = to_dafsa(words)
+  dafsa = to_dafsa(words, utf_mode)
  for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
    dafsa = fun(dafsa)
-  return converter(encode(dafsa))
+  return converter(encode(dafsa, utf_mode))


-def words_to_cxx(words):
+def words_to_cxx(words, utf_mode):
  """Generates C++ code from a word list"""
-  return words_to_whatever(words, to_cxx)
+  return words_to_whatever(words, to_cxx, utf_mode)


-def words_to_binary(words):
+def words_to_binary(words, utf_mode):
  """Generates C++ code from a word list"""
-  return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, bytearray)
+  return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, bytearray, utf_mode)


 def parse_psl2c(infile):
@ -595,6 +604,8 @@ def usage():
  print('  --input-format=psl      infile is a Public Suffix List file')
  print('  --output-format=cxx     Write DAFSA as C/C++ code')
  print('  --output-format=binary  Write DAFSA binary data')
+  print('  --encoding=ascii        7-bit ASCII mode (default)')
+  print('  --encoding=utf-8        UTF-8 mode')
  exit(1)


@ -605,6 +616,7 @@ def main():

  converter = words_to_cxx
  parser = parse_psl2c
+  utf_mode = False

  for arg in sys.argv[1:-2]:
    if arg.startswith('--input-format='):
@ -622,18 +634,24 @@ def main():
        converter = words_to_binary
      elif value == 'cxx':
        converter = words_to_cxx
+    elif arg.startswith('--encoding='):
+      value = arg[11:].lower()
+      if value == 'ascii':
+        utf_mode = False
+      elif value == 'utf-8':
+        utf_mode = True
      else:
-        print("Unknown output format '%s'" % value)
+        print("Unknown encoding '%s'" % value)
        return 1
    else:
      usage()

  if sys.argv[-2] == '-':
    with open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(sys.stdin)))
+      outfile.write(converter(parser(sys.stdin), utf_mode))
  else:
    with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(infile)))
+      outfile.write(converter(parser(infile), utf_mode))

  return 0