2016-09-17 03:42:54 +02:00
|
|
|
#!/usr/bin/env python
|
2015-12-04 17:15:55 +01:00
|
|
|
# Copyright 2014 The Chromium Authors. All rights reserved.
|
|
|
|
# Use of this source code is governed by a BSD-style license that can be
|
2015-12-09 09:35:04 +01:00
|
|
|
# found in the LICENSE.chromium file.
|
2015-12-04 17:15:55 +01:00
|
|
|
|
|
|
|
"""
|
|
|
|
A Deterministic acyclic finite state automaton (DAFSA) is a compact
|
|
|
|
representation of an unordered word list (dictionary).
|
|
|
|
|
2016-07-05 17:49:14 +02:00
|
|
|
https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
|
2015-12-04 17:15:55 +01:00
|
|
|
|
|
|
|
This python program converts a list of strings to a byte array in C++.
|
|
|
|
This python program fetches strings and return values from a gperf file
|
|
|
|
and generates a C++ file with a byte array representing graph that can be
|
|
|
|
used as a memory efficient replacement for the perfect hash table.
|
|
|
|
|
2016-11-02 20:22:01 +01:00
|
|
|
The input strings must consist of printable 7-bit ASCII characters or UTF-8
|
|
|
|
multibyte sequences. Control characters in the range [0x00-0x1F] are not
|
|
|
|
allowed. The return values must be one digit integers.
|
2015-12-04 17:15:55 +01:00
|
|
|
|
|
|
|
In this program a DAFSA is a diamond shaped graph starting at a common
|
|
|
|
source node and ending at a common sink node. All internal nodes contain
|
|
|
|
a label and each word is represented by the labels in one path from
|
|
|
|
the source node to the sink node.
|
|
|
|
|
|
|
|
The following python representation is used for nodes:
|
|
|
|
|
|
|
|
Source node: [ children ]
|
|
|
|
Internal node: (label, [ children ])
|
|
|
|
Sink node: None
|
|
|
|
|
|
|
|
The graph is first compressed by prefixes like a trie. In the next step
|
|
|
|
suffixes are compressed so that the graph gets diamond shaped. Finally
|
|
|
|
one to one linked nodes are replaced by nodes with the labels joined.
|
|
|
|
|
|
|
|
The order of the operations is crucial since lookups will be performed
|
|
|
|
starting from the source with no backtracking. Thus a node must have at
|
|
|
|
most one child with a label starting by the same character. The output
|
|
|
|
is also arranged so that all jumps are to increasing addresses, thus forward
|
|
|
|
in memory.
|
|
|
|
|
|
|
|
The generated output has suffix free decoding so that the sign of leading
|
|
|
|
bits in a link (a reference to a child node) indicate if it has a size of one,
|
|
|
|
two or three bytes and if it is the last outgoing link from the actual node.
|
|
|
|
A node label is terminated by a byte with the leading bit set.
|
|
|
|
|
|
|
|
The generated byte array can be described by the following BNF:
|
|
|
|
|
|
|
|
<byte> ::= < 8-bit value in range [0x00-0xFF] >
|
|
|
|
|
2016-11-02 20:22:01 +01:00
|
|
|
<char> ::= < byte in range [0x1F-0x7F] >
|
|
|
|
<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
|
2015-12-04 17:15:55 +01:00
|
|
|
<return_value> ::= < value + 0x80, byte in range [0x80-0x8F] >
|
|
|
|
|
|
|
|
<offset1> ::= < byte in range [0x00-0x3F] >
|
|
|
|
<offset2> ::= < byte in range [0x40-0x5F] >
|
|
|
|
<offset3> ::= < byte in range [0x60-0x7F] >
|
|
|
|
|
|
|
|
<end_offset1> ::= < byte in range [0x80-0xBF] >
|
|
|
|
<end_offset2> ::= < byte in range [0xC0-0xDF] >
|
|
|
|
<end_offset3> ::= < byte in range [0xE0-0xFF] >
|
|
|
|
|
|
|
|
<prefix> ::= <char>
|
|
|
|
|
|
|
|
<label> ::= <end_char>
|
|
|
|
| <char> <label>
|
|
|
|
|
|
|
|
<end_label> ::= <return_value>
|
|
|
|
| <char> <end_label>
|
|
|
|
|
|
|
|
<offset> ::= <offset1>
|
|
|
|
| <offset2> <byte>
|
|
|
|
| <offset3> <byte> <byte>
|
|
|
|
|
|
|
|
<end_offset> ::= <end_offset1>
|
|
|
|
| <end_offset2> <byte>
|
|
|
|
| <end_offset3> <byte> <byte>
|
|
|
|
|
|
|
|
<offsets> ::= <end_offset>
|
|
|
|
| <offset> <offsets>
|
|
|
|
|
|
|
|
<source> ::= <offsets>
|
|
|
|
|
|
|
|
<node> ::= <label> <offsets>
|
|
|
|
| <prefix> <node>
|
|
|
|
| <end_label>
|
|
|
|
|
2016-11-04 19:43:36 +01:00
|
|
|
<graph> ::= <source>
|
|
|
|
| <graph> <node>
|
|
|
|
|
|
|
|
<version> ::= <empty> # The DAFSA was generated in ASCII mode.
|
|
|
|
| < byte value 0x01 > # The DAFSA was generated in UTF-8 mode.
|
|
|
|
|
|
|
|
<dafsa> ::= <graph> <version>
|
2015-12-04 17:15:55 +01:00
|
|
|
|
|
|
|
Decoding:
|
|
|
|
|
2016-11-02 20:22:01 +01:00
|
|
|
<char> -> character
|
|
|
|
<end_char> & 0x7F -> character
|
2015-12-04 17:15:55 +01:00
|
|
|
<return_value> & 0x0F -> integer
|
|
|
|
<offset1> & 0x3F -> integer
|
|
|
|
((<offset2> & 0x1F) << 8) + <byte> -> integer
|
|
|
|
((<offset3> & 0x1F) << 16) + (<byte> << 8) + <byte> -> integer
|
|
|
|
|
|
|
|
end_offset1, end_offset2 and end_offset3 are decoded same as offset1,
|
|
|
|
offset2 and offset3 respectively.
|
|
|
|
|
|
|
|
The first offset in a list of offsets is the distance in bytes between the
|
|
|
|
offset itself and the first child node. Subsequent offsets are the distance
|
|
|
|
between previous child node and next child node. Thus each offset links a node
|
|
|
|
to a child node. The distance is always counted between start addresses, i.e.
|
|
|
|
first byte in decoded offset or first byte in child node.
|
|
|
|
|
2016-11-02 20:22:01 +01:00
|
|
|
Transcoding of UTF-8 multibyte sequences:
|
|
|
|
|
|
|
|
The original DAFSA format was limited to 7-bit printable ASCII characters in
|
|
|
|
range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
|
|
|
|
By transcoding of such characters the new format preserves compatibility with
|
|
|
|
old parsers, so that a DAFSA in the extended format can be used by an old
|
|
|
|
parser without false positives, although strings containing transcoded
|
|
|
|
characters will never match. Since the format is extended rather than being
|
|
|
|
changed, a parser supporting the new format will automatically support data
|
|
|
|
generated in the old format.
|
|
|
|
|
|
|
|
Transcoding is performed by insertion of a start byte with the special value
|
|
|
|
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
|
|
|
|
the range of printable ASCII.
|
|
|
|
|
|
|
|
2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
|
|
|
|
|
|
|
|
3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
|
|
|
|
|
|
|
|
4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
|
|
|
|
00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
|
|
|
|
|
2015-12-04 17:15:55 +01:00
|
|
|
Example 1:
|
|
|
|
|
|
|
|
%%
|
|
|
|
aa, 1
|
|
|
|
a, 2
|
|
|
|
%%
|
|
|
|
|
|
|
|
The input is first parsed to a list of words:
|
|
|
|
["aa1", "a2"]
|
|
|
|
|
|
|
|
A fully expanded graph is created from the words:
|
|
|
|
source = [node1, node4]
|
|
|
|
node1 = ("a", [node2])
|
|
|
|
node2 = ("a", [node3])
|
|
|
|
node3 = ("\x01", [sink])
|
|
|
|
node4 = ("a", [node5])
|
|
|
|
node5 = ("\x02", [sink])
|
|
|
|
sink = None
|
|
|
|
|
|
|
|
Compression results in the following graph:
|
|
|
|
source = [node1]
|
|
|
|
node1 = ("a", [node2, node3])
|
|
|
|
node2 = ("\x02", [sink])
|
|
|
|
node3 = ("a\x01", [sink])
|
|
|
|
sink = None
|
|
|
|
|
|
|
|
A C++ representation of the compressed graph is generated:
|
|
|
|
|
|
|
|
const unsigned char dafsa[7] = {
|
|
|
|
0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
|
|
|
|
};
|
|
|
|
|
|
|
|
The bytes in the generated array have the following meaning:
|
|
|
|
|
|
|
|
0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
|
|
|
|
|
|
|
|
1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
|
|
|
|
2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
|
|
|
|
|
|
|
|
3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
|
|
|
|
4: 0x82 <return_value> 0x82 & 0x0F -> return 2
|
|
|
|
|
|
|
|
5: 0x61 <char> label character 0x61 -> match "a"
|
|
|
|
6: 0x81 <return_value> 0x81 & 0x0F -> return 1
|
|
|
|
|
|
|
|
Example 2:
|
|
|
|
|
|
|
|
%%
|
|
|
|
aa, 1
|
|
|
|
bbb, 2
|
|
|
|
baa, 1
|
|
|
|
%%
|
|
|
|
|
|
|
|
The input is first parsed to a list of words:
|
|
|
|
["aa1", "bbb2", "baa1"]
|
|
|
|
|
|
|
|
Compression results in the following graph:
|
|
|
|
source = [node1, node2]
|
|
|
|
node1 = ("b", [node2, node3])
|
|
|
|
node2 = ("aa\x01", [sink])
|
|
|
|
node3 = ("bb\x02", [sink])
|
|
|
|
sink = None
|
|
|
|
|
|
|
|
A C++ representation of the compressed graph is generated:
|
|
|
|
|
|
|
|
const unsigned char dafsa[11] = {
|
|
|
|
0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
|
|
|
|
};
|
|
|
|
|
|
|
|
The bytes in the generated array have the following meaning:
|
|
|
|
|
|
|
|
0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
|
|
|
|
1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
|
|
|
|
|
|
|
|
2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
|
|
|
|
3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
|
|
|
|
4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
|
|
|
|
|
|
|
|
5: 0x61 <char> label character 0x61 -> match "a"
|
|
|
|
6: 0x61 <char> label character 0x61 -> match "a"
|
|
|
|
7: 0x81 <return_value> 0x81 & 0x0F -> return 1
|
|
|
|
|
|
|
|
8: 0x62 <char> label character 0x62 -> match "b"
|
|
|
|
9: 0x62 <char> label character 0x62 -> match "b"
|
|
|
|
10: 0x82 <return_value> 0x82 & 0x0F -> return 2
|
|
|
|
"""
|
|
|
|
|
|
|
|
import sys
|
|
|
|
|
|
|
|
class InputError(Exception):
    """Raised when the input file contains data that cannot be processed."""
|
|
|
|
|
2016-11-02 20:22:01 +01:00
|
|
|
# Length in bytes of the character that starts with a given byte value.
# A length of 0 marks a byte that must not start a character.
char_length_table = (
    (0,) * 0x20 +   # 0x00-0x1F: control characters
    (1,) * 0x60 +   # 0x20-0x7F: single byte characters
    (0,) * 0x40 +   # 0x80-0xBF: continuation bytes
    (2,) * 0x20 +   # 0xC0-0xDF: leading byte of a 2-byte sequence
    (3,) * 0x10 +   # 0xE0-0xEF: leading byte of a 3-byte sequence
    (4,) * 0x08 +   # 0xF0-0xF7: leading byte of a 4-byte sequence
    (0,) * 0x08     # 0xF8-0xFF: not accepted
)
|
2015-12-04 17:15:55 +01:00
|
|
|
|
2016-11-12 21:10:59 +01:00
|
|
|
def to_bytes(n):
    """Returns a bytes object of length one holding the byte value n."""
    single = bytearray([n])
    return bytes(single)
|
|
|
|
|
2016-11-04 19:43:36 +01:00
|
|
|
def to_dafsa(words, utf_mode):
    """Generates a DAFSA from a word list and returns the source node.

    Each word is split into characters so that each character is represented by
    a unique node. The last character of every word is its return value, given
    as one hexadecimal digit.

    Args:
      words: non-empty list of byte strings, each a name followed by one digit.
      utf_mode: if false, UTF-8 multibyte sequences are rejected.

    Returns:
      The source node, i.e. a list holding one chain of nodes per word.

    Raises:
      InputError: if the list or a word is empty, a word holds a byte that is
        neither printable ASCII nor valid UTF-8, a multibyte sequence is found
        in ASCII mode, or the return value is not a hexadecimal digit.
    """
    if not words:
        raise InputError('The domain list must not be empty')

    def to_nodes(word, multibyte_length):
        """Split words into characters"""
        if not word:
            # Without this check ord() below fails with a TypeError.
            raise InputError('Words must not be empty')

        byte = ord(word[:1])
        if multibyte_length:
            # Consume next byte in multibyte sequence.
            if byte & 0xC0 != 0x80:
                raise InputError('Invalid UTF-8 multibyte sequence')
            # Transcode 10nnnnnn to 01nnnnnn.
            return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]

        char_length = char_length_table[byte]
        if char_length == 1:
            # 7-bit printable ASCII.
            if len(word) == 1:
                # The last character is the return value of the word.
                try:
                    value = int(word[:1], 16)
                except ValueError:
                    raise InputError('Return value must be a hexadecimal digit')
                return to_bytes(value & 0x0F), [None]
            return word[:1], [to_nodes(word[1:], 0)]
        elif char_length > 1:
            # Leading byte in multibyte sequence.
            if not utf_mode:
                raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
            if len(word) <= char_length:
                raise InputError('Unterminated UTF-8 multibyte sequence')
            # Insert the start byte 0x1F and clear the high bit of the leading byte.
            return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
        # Unexpected character.
        raise InputError('Domain names must be printable ASCII or UTF-8')

    return [to_nodes(word, 0) for word in words]
|
2015-12-04 17:15:55 +01:00
|
|
|
|
|
|
|
def to_words(node):
    """Returns the words spelled by all paths from an internal node to the sink.

    The sink node yields a list holding one empty word.
    """
    if not node:
        return [b'']
    label = node[0]
    words = []
    for child in node[1]:
        for tail in to_words(child):
            words.append(label + tail)
    return words
|
|
|
|
|
|
|
|
|
|
|
|
def reverse(dafsa):
    """Builds the mirror image of a DAFSA.

    The old sink node becomes the new source node, so the returned list holds
    the reversed counterparts of all nodes that used to link to the sink.
    """
    new_source = []
    mirrored = {}

    def visit(old_node, new_child):
        """Mirrors one old node.

        Every old node gets exactly one mirror node, carrying the reversed
        label. The mirrors of the old parents are collected as its children.
        """
        if not old_node:
            # Reached the old sink: the mirror of the parent is a new root.
            new_source.append(new_child)
            return
        key = id(old_node)
        if key in mirrored:
            # Already mirrored, only one more parent to record.
            mirrored[key][1].append(new_child)
            return
        mirror = (old_node[0][::-1], [new_child])
        mirrored[key] = mirror
        for successor in old_node[1]:
            visit(successor, mirror)

    for root in dafsa:
        visit(root, None)
    return new_source
|
|
|
|
|
|
|
|
|
|
|
|
def join_labels(dafsa):
    """Builds a DAFSA where chains of one to one linked nodes are collapsed.

    A node is merged with its child when it has exactly one child and that
    child has exactly one parent. The merged node carries both labels joined.
    """
    # The sink starts at a count of two so it is never merged into a parent.
    references = {id(None): 2}
    rebuilt = {id(None): None}

    def tally(node):
        """Counts incoming references, descending on the first visit only."""
        key = id(node)
        if key in references:
            references[key] += 1
            return
        references[key] = 1
        for successor in node[1]:
            tally(successor)

    def fuse(node):
        """Returns the new node that replaces an old node."""
        key = id(node)
        if key in rebuilt:
            return rebuilt[key]
        new_children = [fuse(successor) for successor in node[1]]
        if len(new_children) == 1 and references[id(node[1][0])] == 1:
            only = new_children[0]
            replacement = (node[0] + only[0], only[1])
        else:
            replacement = (node[0], new_children)
        rebuilt[key] = replacement
        return replacement

    for root in dafsa:
        tally(root)
    return [fuse(root) for root in dafsa]
|
|
|
|
|
|
|
|
|
|
|
|
def join_suffixes(dafsa):
    """Builds a DAFSA where nodes spelling the same set of words towards the
    sink are replaced by one shared node.
    """
    # The set holding only the empty word identifies the sink node.
    canonical = {frozenset((b'',)): None}

    def unify(node):
        """Returns the matching node. A new node is created if no matching
        node exists. The graph is accessed in dfs order.
        """
        signature = frozenset(to_words(node))
        if signature in canonical:
            return canonical[signature]
        replacement = (node[0], [unify(child) for child in node[1]])
        canonical[signature] = replacement
        return replacement

    return [unify(root) for root in dafsa]
|
|
|
|
|
|
|
|
|
|
|
|
def top_sort(dafsa):
    """Returns the internal nodes of a DAFSA in topological sort order.

    A node is listed only after all nodes linking to it. The sink node is not
    part of the result.
    """
    pending = {}

    def tally(node):
        """Counts incoming references, descending on the first visit only."""
        if not node:
            return
        key = id(node)
        if key in pending:
            pending[key] += 1
            return
        pending[key] = 1
        for successor in node[1]:
            tally(successor)

    for root in dafsa:
        tally(root)

    # The link from the source node is not a dependency between nodes.
    for root in dafsa:
        pending[id(root)] -= 1

    ready = [root for root in dafsa if pending[id(root)] == 0]
    ordered = []

    while ready:
        current = ready.pop()
        assert pending[id(current)] == 0
        ordered.append(current)
        for successor in current[1]:
            if not successor:
                continue
            key = id(successor)
            pending[key] -= 1
            if pending[key] == 0:
                ready.append(successor)
    return ordered
|
|
|
|
|
|
|
|
|
|
|
|
def encode_links(children, offsets, current):
    """Encodes a list of children as one, two or three byte offsets.

    Args:
      children: non-empty list of child nodes, or a list holding only the sink.
      offsets: dict mapping id(node) to the position recorded for that node.
      current: position at which the encoded links will be appended.

    Returns:
      A list of byte values, or an empty list if the only child is the sink.

    Raises:
      ValueError: if children is empty or a distance does not fit in 21 bits.
        These are raised explicitly so the checks are not stripped by -O.
    """
    if not children:
        raise ValueError('A node must have at least one child')
    if not children[0]:
        # This is an <end_label> node and no links follow such nodes
        assert len(children) == 1
        return []
    guess = 3 * len(children)
    children = sorted(children, key=lambda x: -offsets[id(x)])
    while True:
        # The distances depend on the total size of the links, which is only
        # known after encoding. Repeat until the size matches the guess.
        offset = current + guess
        buf = []
        for child in children:
            last = len(buf)
            distance = offset - offsets[id(child)]
            if not 0 < distance < (1 << 21):
                raise ValueError('Link distance %d does not fit in 21 bits' % distance)

            if distance < (1 << 6):
                # A 6-bit offset: "s0xxxxxx"
                buf.append(distance)
            elif distance < (1 << 13):
                # A 13-bit offset: "s10xxxxxxxxxxxxx"
                buf.append(0x40 | (distance >> 8))
                buf.append(distance & 0xFF)
            else:
                # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
                buf.append(0x60 | (distance >> 16))
                buf.append((distance >> 8) & 0xFF)
                buf.append(distance & 0xFF)
            # Distance in first link is relative to following record.
            # Distance in other links are relative to previous link.
            offset -= distance
        if len(buf) == guess:
            break
        guess = len(buf)
    # Set most significant bit to mark end of links in this node.
    buf[last] |= (1 << 7)
    buf.reverse()
    return buf
|
|
|
|
|
|
|
|
|
|
|
|
def encode_prefix(label):
    """Encodes a node label as a list of bytes without a trailing high byte.

    This form is used for a node with exactly one child when the child follows
    immediately after, so that no jump is needed. The label then acts as a
    prefix to the label in the child node. The bytes are returned in reverse
    order.
    """
    assert label
    return list(bytearray(label[::-1]))
|
2015-12-04 17:15:55 +01:00
|
|
|
|
|
|
|
|
|
|
|
def encode_label(label):
    """Encodes a node label as a list of bytes with the high bit set in the
    byte that ends the label.

    The bytes come in reverse order, so the end of the label is at index 0.
    """
    encoded = encode_prefix(label)
    # Set most significant bit to mark end of label in this node.
    encoded[0] = encoded[0] | 0x80
    return encoded
|
|
|
|
|
|
|
|
|
2016-11-04 19:43:36 +01:00
|
|
|
def encode(dafsa, utf_mode):
    """Encodes a DAFSA to a list of bytes.

    Nodes are emitted back to front and the whole list is reversed at the end.
    In UTF-8 mode a version byte 0x01 is appended.
    """
    stream = []
    positions = {}

    for node in reversed(top_sort(dafsa)):
        label = node[0]
        children = node[1]
        only_child = children[0] if len(children) == 1 else None
        if only_child and positions[id(only_child)] == len(stream):
            # The single child was emitted just before, no link is needed.
            stream.extend(encode_prefix(label))
        else:
            stream.extend(encode_links(children, positions, len(stream)))
            stream.extend(encode_label(label))
        positions[id(node)] = len(stream)

    # Links from the source node end up first after the reversal.
    stream.extend(encode_links(dafsa, positions, len(stream)))
    stream.reverse()
    if utf_mode:
        stream.append(0x01)
    return stream
|
|
|
|
|
|
|
|
|
2016-11-12 21:10:59 +01:00
|
|
|
def to_cxx(data, codecs):
    """Renders a list of encoded bytes as the C++ source of a byte array.

    Twelve byte values are written per row. codecs holds the keyword arguments
    passed to bytes() when converting text.
    """
    pieces = [
        b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n',
        b'The byte array encodes effective tld names. See psl-make-dafsa source for',
        b' documentation.',
        b'*/\n\n',
        b'static const unsigned char kDafsa[',
        bytes(str(len(data)), **codecs),
        b'] = {\n',
    ]
    for start in range(0, len(data), 12):
        row = ', '.join('0x%02x' % byte for byte in data[start:start + 12])
        pieces.append(b' ')
        pieces.append(bytes(row, **codecs))
        pieces.append(b',\n')
    pieces.append(b'};\n')
    return b''.join(pieces)
|
|
|
|
|
|
|
|
|
2016-11-12 21:10:59 +01:00
|
|
|
def words_to_whatever(words, converter, utf_mode, codecs):
    """Builds a compressed DAFSA from a word list and renders it.

    The encoded byte list is handed to converter together with codecs, and
    the result of converter is returned.
    """
    graph = to_dafsa(words, utf_mode)
    # Suffixes are merged on the reversed graph first, then on the graph in
    # its original direction, and finally one to one links are collapsed.
    graph = join_suffixes(reverse(graph))
    graph = join_suffixes(reverse(graph))
    graph = join_labels(graph)
    return converter(encode(graph, utf_mode), codecs)
|
2015-12-29 16:53:47 +01:00
|
|
|
|
|
|
|
|
2016-11-12 21:10:59 +01:00
|
|
|
def words_to_cxx(words, utf_mode, codecs):
    """Generates the C++ source of a DAFSA byte array from a word list."""
    render = to_cxx
    return words_to_whatever(words, render, utf_mode, codecs)
|
2015-12-29 16:53:47 +01:00
|
|
|
|
|
|
|
|
2016-11-12 21:10:59 +01:00
|
|
|
def words_to_binary(words, utf_mode, codecs):
    """Generates binary DAFSA data from a word list.

    The encoded bytes are preceded by a fixed header line.
    """
    def as_bytearray(data, _unused_codecs):
        """Packs the list of byte values without any text conversion."""
        return bytearray(data)

    payload = words_to_whatever(words, as_bytearray, utf_mode, codecs)
    return b'.DAFSA@PSL_0 \n' + payload
|
2015-12-04 17:15:55 +01:00
|
|
|
|
|
|
|
|
2016-11-12 21:10:59 +01:00
|
|
|
def parse_psl2c(infile, utf_mode, codecs):
    """Parses a file generated by psl2c into a sorted list of words.

    Each input line has the form "domainname, <digit>". The returned words are
    byte strings holding the domain name directly followed by the digit.
    utf_mode is not used by this parser.

    Raises:
      InputError: if a line does not have the expected form.
    """
    entries = []
    for raw in infile:
        entries.append(bytes(raw.strip(), **codecs))

    accepted = b'0123456789ABCDEF'
    for entry in entries:
        if entry[-3:-1] != b', ':
            raise InputError('Expected "domainname, <digit>", found "%s"' % entry)
        # Technically the DAFSA format could support return values in range
        # [0x00-0x1E], but the values below are the only with a defined meaning.
        if entry[-1] not in accepted:
            raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % entry[-1:])

    # Sort the complete lines first, then drop the ", " separator.
    entries.sort()
    return [entry[:-3] + entry[-1:] for entry in entries]
|
2015-12-04 17:15:55 +01:00
|
|
|
|
|
|
|
|
2016-11-12 21:10:59 +01:00
|
|
|
def parse_psl(infile, utf_mode, codecs):
    """Parses a Public Suffix List file into a sorted list of words.

    Args:
      infile: iterable of text lines.
      utf_mode: if true, each rule is stored both as written and as punycode,
        otherwise only as punycode.
      codecs: keyword arguments passed to bytes() when converting text.

    Returns:
      A sorted list of byte strings, each holding a domain name directly
      followed by one hexadecimal digit with the low four flag bits.
    """
    PSL_FLAG_EXCEPTION = (1<<0)
    PSL_FLAG_WILDCARD = (1<<1)
    PSL_FLAG_ICANN = (1<<2) # entry of ICANN section
    PSL_FLAG_PRIVATE = (1<<3) # entry of PRIVATE section
    PSL_FLAG_PLAIN = (1<<4) #just used for PSL syntax checking

    psl = {}
    section = 0

    for line in infile:
        line = bytes(line.strip(), **codecs)
        if not line:
            continue

        if line.startswith(b'//'):
            # Comments carry the section markers.
            if section == 0:
                if b'===BEGIN ICANN DOMAINS===' in line:
                    section = PSL_FLAG_ICANN
                elif b'===BEGIN PRIVATE DOMAINS===' in line:
                    section = PSL_FLAG_PRIVATE
            elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line:
                section = 0
            elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line:
                section = 0
            continue # skip comments

        if line[:1] == b'!':
            flags = PSL_FLAG_EXCEPTION | section
            line = line[1:]
        elif line[:1] == b'*':
            if line[1:2] != b'.':
                print('Unsupported kind of rule (ignored): %s' % line)
                continue
            flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
            line = line[2:]
        else:
            if not b'.' in line:
                continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
            flags = PSL_FLAG_PLAIN | section

        punycode = line.decode('utf-8').encode('idna')

        if punycode in psl:
            # Found existing entry:
            #   Combination of exception and plain rule is ambiguous
            #     !foo.bar
            #     foo.bar
            #
            #   Allowed:
            #     !foo.bar + *.foo.bar
            #     foo.bar + *.foo.bar
            #
            # The arguments must be passed as one tuple, otherwise the format
            # operator is applied to the name alone and raises a TypeError.
            print('Found %s/%X (now %X)' % (punycode.decode('ascii'), psl[punycode], flags))
            continue

        if utf_mode:
            psl[line] = flags
        psl[punycode] = flags

    return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())]
|
2015-12-30 17:52:48 +01:00
|
|
|
|
|
|
|
|
|
|
|
def usage():
    """Prints the usage and terminates the program with exit status 1."""
    print('usage: %s [options] infile outfile' % sys.argv[0])
    print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
    print(' --input-format=psl infile is a Public Suffix List file')
    print(' --output-format=cxx Write DAFSA as C/C++ code (default)')
    print(' --output-format=binary Write DAFSA binary data')
    print(' --encoding=ascii 7-bit ASCII mode')
    print(' --encoding=utf-8 UTF-8 mode (default)')
    # The builtin exit() is installed by the site module and may be missing,
    # sys.exit() is always available.
    sys.exit(1)
|
|
|
|
|
|
|
|
|
2015-12-04 17:15:55 +01:00
|
|
|
def main():
    """Convert PSL file into C or binary DAFSA file.

    The last two command line arguments are the input and the output file,
    where '-' as input file selects standard input. All arguments before them
    are options, see usage().

    Returns:
      0 on success, 1 if an option has an unknown value.
    """
    if len(sys.argv) < 3:
        usage()

    converter = words_to_cxx
    parser = parse_psl2c
    utf_mode = True

    # Keyword arguments for text conversion. Python 2 takes no encoding.
    codecs = dict()
    if sys.version_info.major > 2:
        codecs['encoding'] = 'utf-8'

    for arg in sys.argv[1:-2]:
        if arg.startswith('--input-format='):
            value = arg[15:].lower()
            if value == 'psl':
                parser = parse_psl
            elif value == 'psl2c':
                parser = parse_psl2c
            else:
                print("Unknown input format '%s'" % value)
                return 1
        elif arg.startswith('--output-format='):
            value = arg[16:].lower()
            if value == 'binary':
                converter = words_to_binary
            elif value == 'cxx':
                converter = words_to_cxx
            else:
                print("Unknown output format '%s'" % value)
                return 1
        elif arg.startswith('--encoding='):
            value = arg[11:].lower()
            if value == 'ascii':
                utf_mode = False
            elif value == 'utf-8':
                utf_mode = True
            else:
                print("Unknown encoding '%s'" % value)
                return 1
        else:
            usage()

    if sys.argv[-2] == '-':
        with open(sys.argv[-1], 'wb') as outfile:
            outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
    else:
        # Read the input with the same encoding the parsers use to convert the
        # lines back to bytes, instead of the default encoding of the locale.
        with open(sys.argv[-2], 'r', **codecs) as infile, open(sys.argv[-1], 'wb') as outfile:
            outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))

    return 0
|
|
|
|
|
|
|
|
|
|
|
|
# Run the converter when executed as a script and use the value returned by
# main() as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|