From 375aef05ae2e0f1c4edef2ef54b8844dd108b2a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Fri, 4 Dec 2015 17:15:55 +0100 Subject: [PATCH] Add tools/make_dafsa.py --- tools/make_dafsa.py | 469 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 469 insertions(+) create mode 100755 tools/make_dafsa.py diff --git a/tools/make_dafsa.py b/tools/make_dafsa.py new file mode 100755 index 0000000..78358ef --- /dev/null +++ b/tools/make_dafsa.py @@ -0,0 +1,469 @@ +#!/usr/bin/env python +# Copyright 2014 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +""" +A Deterministic acyclic finite state automaton (DAFSA) is a compact +representation of an unordered word list (dictionary). + +http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton + +This python program converts a list of strings to a byte array in C++. +This python program fetches strings and return values from a gperf file +and generates a C++ file with a byte array representing graph that can be +used as a memory efficient replacement for the perfect hash table. + +The input strings are assumed to consist of printable 7-bit ASCII characters +and the return values are assumed to be one digit integers. + +In this program a DAFSA is a diamond shaped graph starting at a common +source node and ending at a common sink node. All internal nodes contain +a label and each word is represented by the labels in one path from +the source node to the sink node. + +The following python represention is used for nodes: + + Source node: [ children ] + Internal node: (label, [ children ]) + Sink node: None + +The graph is first compressed by prefixes like a trie. In the next step +suffixes are compressed so that the graph gets diamond shaped. Finally +one to one linked nodes are replaced by nodes with the labels joined. + +The order of the operations is crucial since lookups will be performed +starting from the source with no backtracking. Thus a node must have at +most one child with a label starting by the same character. The output +is also arranged so that all jumps are to increasing addresses, thus forward +in memory. + +The generated output has suffix free decoding so that the sign of leading +bits in a link (a reference to a child node) indicate if it has a size of one, +two or three bytes and if it is the last outgoing link from the actual node. +A node label is terminated by a byte with the leading bit set. + +The generated byte array can described by the following BNF: + + ::= < 8-bit value in range [0x00-0xFF] > + + ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] > + ::= < char + 0x80, byte in range [0xA0-0xFF] > + ::= < value + 0x80, byte in range [0x80-0x8F] > + + ::= < byte in range [0x00-0x3F] > + ::= < byte in range [0x40-0x5F] > + ::= < byte in range [0x60-0x7F] > + + ::= < byte in range [0x80-0xBF] > + ::= < byte in range [0xC0-0xDF] > + ::= < byte in range [0xE0-0xFF] > + + ::= + +