Updated DAFSA generator and parser to support UTF-8 encoding
This commit is contained in:
parent
e126a67354
commit
e03953e27a
1
AUTHORS
1
AUTHORS
|
@ -16,3 +16,4 @@ Christopher Meng (Fedora building)
|
||||||
Jakub Čajka
|
Jakub Čajka
|
||||||
Giuseppe Scrivano
|
Giuseppe Scrivano
|
||||||
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
|
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
|
||||||
|
Olle Liljenzin (Original DAFSA implementation and UTF-8 patch)
|
||||||
|
|
|
@ -21,6 +21,48 @@
|
||||||
|
|
||||||
#define CHECK_LT(a, b) if ((a) >= b) return 0
|
#define CHECK_LT(a, b) if ((a) >= b) return 0
|
||||||
|
|
||||||
|
static const char multibyte_length_table[16] = {
|
||||||
|
0, 0, 0, 0, /* 0x00-0x3F */
|
||||||
|
0, 0, 0, 0, /* 0x40-0x7F */
|
||||||
|
0, 0, 0, 0, /* 0x80-0xBF */
|
||||||
|
2, 2, 3, 4, /* 0xC0-0xFF */
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get length of multibyte character sequence starting at a given byte.
|
||||||
|
* Returns zero if the byte does not start a multibyte sequence, i.e. is below 0xC0 (ASCII or continuation byte).
|
||||||
|
*/
|
||||||
|
static int GetMultibyteLength(char c) {
|
||||||
|
return multibyte_length_table[((unsigned char)c) >> 4];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advances pos one byte; advances key as well unless a multibyte prefix byte was just matched.
|
||||||
|
*/
|
||||||
|
static void NextPos(const unsigned char** pos,
|
||||||
|
const char** key,
|
||||||
|
const char** multibyte_start)
|
||||||
|
{
|
||||||
|
++*pos;
|
||||||
|
if (*multibyte_start) {
|
||||||
|
/* Advance key to next byte in multibyte sequence. */
|
||||||
|
++*key;
|
||||||
|
/* Reset multibyte_start if last byte in multibyte sequence was consumed. */
|
||||||
|
if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
|
||||||
|
*multibyte_start = 0;
|
||||||
|
} else {
|
||||||
|
if (GetMultibyteLength(**key)) {
|
||||||
|
/* Multibyte prefix was matched in the dafsa, start matching multibyte
|
||||||
|
* content in next round. */
|
||||||
|
*multibyte_start = *key;
|
||||||
|
} else {
|
||||||
|
/* Advance key as a single byte character was matched. */
|
||||||
|
++*key;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Read next offset from pos.
|
* Read next offset from pos.
|
||||||
* Returns true if an offset could be read, false otherwise.
|
* Returns true if an offset could be read, false otherwise.
|
||||||
|
@ -71,6 +113,35 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
|
||||||
return(*offset & 0x80) != 0;
|
return(*offset & 0x80) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check if byte at offset matches first character in key.
|
||||||
|
* This version assumes a range check was already performed by the caller.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static int IsMatchUnchecked(const unsigned char matcher,
|
||||||
|
const char* key,
|
||||||
|
const char* multibyte_start)
|
||||||
|
{
|
||||||
|
if (multibyte_start) {
|
||||||
|
/* Multibyte matching mode. */
|
||||||
|
if (multibyte_start == key) {
|
||||||
|
/* Match leading byte, which will also match the sequence length. */
|
||||||
|
return (matcher ^ 0x80) == (const unsigned char)*key;
|
||||||
|
} else {
|
||||||
|
/* Match following bytes. */
|
||||||
|
return (matcher ^ 0xC0) == (const unsigned char)*key;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* If key points at a leading byte in a multibyte sequence, but we are not yet
|
||||||
|
* in multibyte mode, then the dafsa should contain a special byte to indicate
|
||||||
|
* a mode switch. */
|
||||||
|
if (GetMultibyteLength(*key)) {
|
||||||
|
return matcher == 0x1F;
|
||||||
|
}
|
||||||
|
/* Normal matching of a single byte character. */
|
||||||
|
return matcher == (const unsigned char)*key;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check if byte at offset matches first character in key.
|
* Check if byte at offset matches first character in key.
|
||||||
* This version matches characters not last in label.
|
* This version matches characters not last in label.
|
||||||
|
@ -78,10 +149,11 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
|
||||||
|
|
||||||
static int IsMatch(const unsigned char* offset,
|
static int IsMatch(const unsigned char* offset,
|
||||||
const unsigned char* end,
|
const unsigned char* end,
|
||||||
const char* key)
|
const char* key,
|
||||||
|
const char* multibyte_start)
|
||||||
{
|
{
|
||||||
CHECK_LT(offset, end);
|
CHECK_LT(offset, end);
|
||||||
return *offset == *key;
|
return IsMatchUnchecked(*offset, key, multibyte_start);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -91,10 +163,11 @@ static int IsMatch(const unsigned char* offset,
|
||||||
|
|
||||||
static int IsEndCharMatch(const unsigned char* offset,
|
static int IsEndCharMatch(const unsigned char* offset,
|
||||||
const unsigned char* end,
|
const unsigned char* end,
|
||||||
const char* key)
|
const char* key,
|
||||||
|
const char* multibyte_start)
|
||||||
{
|
{
|
||||||
CHECK_LT(offset, end);
|
CHECK_LT(offset, end);
|
||||||
return *offset == (*key | 0x80);
|
return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -104,10 +177,11 @@ static int IsEndCharMatch(const unsigned char* offset,
|
||||||
|
|
||||||
static int GetReturnValue(const unsigned char* offset,
|
static int GetReturnValue(const unsigned char* offset,
|
||||||
const unsigned char* end,
|
const unsigned char* end,
|
||||||
|
const char* multibyte_start,
|
||||||
int* return_value)
|
int* return_value)
|
||||||
{
|
{
|
||||||
CHECK_LT(offset, end);
|
CHECK_LT(offset, end);
|
||||||
if ((*offset & 0xE0) == 0x80) {
|
if (!multibyte_start && (*offset & 0xE0) == 0x80) {
|
||||||
*return_value = *offset & 0x0F;
|
*return_value = *offset & 0x0F;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
@ -140,6 +214,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
||||||
const unsigned char* end = graph + length;
|
const unsigned char* end = graph + length;
|
||||||
const unsigned char* offset = pos;
|
const unsigned char* offset = pos;
|
||||||
const char* key_end = key + key_length;
|
const char* key_end = key + key_length;
|
||||||
|
const char* multibyte_start = 0;
|
||||||
|
|
||||||
while (GetNextOffset(&pos, end, &offset)) {
|
while (GetNextOffset(&pos, end, &offset)) {
|
||||||
/*char <char>+ end_char offsets
|
/*char <char>+ end_char offsets
|
||||||
|
@ -153,11 +228,10 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
||||||
|
|
||||||
if (key != key_end && !IsEOL(offset, end)) {
|
if (key != key_end && !IsEOL(offset, end)) {
|
||||||
/* Leading <char> is not a match. Don't dive into this child */
|
/* Leading <char> is not a match. Don't dive into this child */
|
||||||
if (!IsMatch(offset, end, key))
|
if (!IsMatch(offset, end, key, multibyte_start))
|
||||||
continue;
|
continue;
|
||||||
did_consume = 1;
|
did_consume = 1;
|
||||||
++offset;
|
NextPos(&offset, &key, &multibyte_start);
|
||||||
++key;
|
|
||||||
/* Possible matches at this point:
|
/* Possible matches at this point:
|
||||||
* <char>+ end_char offsets
|
* <char>+ end_char offsets
|
||||||
* <char>+ return value
|
* <char>+ return value
|
||||||
|
@ -167,10 +241,9 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
||||||
|
|
||||||
/* Remove all remaining <char> nodes possible */
|
/* Remove all remaining <char> nodes possible */
|
||||||
while (!IsEOL(offset, end) && key != key_end) {
|
while (!IsEOL(offset, end) && key != key_end) {
|
||||||
if (!IsMatch(offset, end, key))
|
if (!IsMatch(offset, end, key, multibyte_start))
|
||||||
return -1;
|
return -1;
|
||||||
++key;
|
NextPos(&offset, &key, &multibyte_start);
|
||||||
++offset;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* Possible matches at this point:
|
/* Possible matches at this point:
|
||||||
|
@ -182,7 +255,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
||||||
if (key == key_end) {
|
if (key == key_end) {
|
||||||
int return_value;
|
int return_value;
|
||||||
|
|
||||||
if (GetReturnValue(offset, end, &return_value))
|
if (GetReturnValue(offset, end, multibyte_start, &return_value))
|
||||||
return return_value;
|
return return_value;
|
||||||
/* The DAFSA guarantees that if the first char is a match, all
|
/* The DAFSA guarantees that if the first char is a match, all
|
||||||
* remaining char elements MUST match if the key is truly present.
|
* remaining char elements MUST match if the key is truly present.
|
||||||
|
@ -191,13 +264,13 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
||||||
return -1;
|
return -1;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (!IsEndCharMatch(offset, end, key)) {
|
if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
|
||||||
if (did_consume)
|
if (did_consume)
|
||||||
return -1; /* Unexpected */
|
return -1; /* Unexpected */
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
++key;
|
NextPos(&offset, &key, &multibyte_start);
|
||||||
pos = ++offset; /* Dive into child */
|
pos = offset; /* Dive into child */
|
||||||
}
|
}
|
||||||
|
|
||||||
return -1; /* No match */
|
return -1; /* No match */
|
||||||
|
|
|
@ -14,8 +14,9 @@ This python program fetches strings and return values from a gperf file
|
||||||
and generates a C++ file with a byte array representing graph that can be
|
and generates a C++ file with a byte array representing graph that can be
|
||||||
used as a memory efficient replacement for the perfect hash table.
|
used as a memory efficient replacement for the perfect hash table.
|
||||||
|
|
||||||
The input strings are assumed to consist of printable 7-bit ASCII characters
|
The input strings must consist of printable 7-bit ASCII characters or UTF-8
|
||||||
and the return values are assumed to be one digit integers.
|
multibyte sequences. Control characters in the range [0x00-0x1F] are not
|
||||||
|
allowed. The return values must be one digit integers.
|
||||||
|
|
||||||
In this program a DAFSA is a diamond shaped graph starting at a common
|
In this program a DAFSA is a diamond shaped graph starting at a common
|
||||||
source node and ending at a common sink node. All internal nodes contain
|
source node and ending at a common sink node. All internal nodes contain
|
||||||
|
@ -47,8 +48,8 @@ The generated byte array can described by the following BNF:
|
||||||
|
|
||||||
<byte> ::= < 8-bit value in range [0x00-0xFF] >
|
<byte> ::= < 8-bit value in range [0x00-0xFF] >
|
||||||
|
|
||||||
<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
|
<char> ::= < byte in range [0x1F-0x7F] >
|
||||||
<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
|
<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
|
||||||
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
|
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
|
||||||
|
|
||||||
<offset1> ::= < byte in range [0x00-0x3F] >
|
<offset1> ::= < byte in range [0x00-0x3F] >
|
||||||
|
@ -89,8 +90,8 @@ The generated byte array can described by the following BNF:
|
||||||
|
|
||||||
Decoding:
|
Decoding:
|
||||||
|
|
||||||
<char> -> printable 7-bit ASCII character
|
<char> -> character
|
||||||
<end_char> & 0x7F -> printable 7-bit ASCII character
|
<end_char> & 0x7F -> character
|
||||||
<return value> & 0x0F -> integer
|
<return value> & 0x0F -> integer
|
||||||
<offset1 & 0x3F> -> integer
|
<offset1 & 0x3F> -> integer
|
||||||
((<offset2> & 0x1F>) << 8) + <byte> -> integer
|
((<offset2> & 0x1F>) << 8) + <byte> -> integer
|
||||||
|
@ -105,6 +106,28 @@ between previous child node and next child node. Thus each offset links a node
|
||||||
to a child node. The distance is always counted between start addresses, i.e.
|
to a child node. The distance is always counted between start addresses, i.e.
|
||||||
first byte in decoded offset or first byte in child node.
|
first byte in decoded offset or first byte in child node.
|
||||||
|
|
||||||
|
Transcoding of UTF-8 multibyte sequences:
|
||||||
|
|
||||||
|
The original DAFSA format was limited to 7-bit printable ASCII characters in
|
||||||
|
range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
|
||||||
|
By transcoding of such characters the new format preserves compatibility with
|
||||||
|
old parsers, so that a DAFSA in the extended format can be used by an old
|
||||||
|
parser without false positives, although strings containing transcoded
|
||||||
|
characters will never match. Since the format is extended rather than being
|
||||||
|
changed, a parser supporting the new format will automatically support data
|
||||||
|
generated in the old format.
|
||||||
|
|
||||||
|
Transcoding is performed by insertion of a start byte with the special value
|
||||||
|
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
|
||||||
|
the range of printable ASCII.
|
||||||
|
|
||||||
|
2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
|
||||||
|
|
||||||
|
3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
|
||||||
|
|
||||||
|
4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
|
||||||
|
00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
|
||||||
|
|
||||||
Example 1:
|
Example 1:
|
||||||
|
|
||||||
%%
|
%%
|
||||||
|
@ -197,6 +220,23 @@ import sys
|
||||||
class InputError(Exception):
|
class InputError(Exception):
|
||||||
"""Exception raised for errors in the input file."""
|
"""Exception raised for errors in the input file."""
|
||||||
|
|
||||||
|
# Length of a character starting at a given byte.
|
||||||
|
char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x0F
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x10-0x1F
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x20-0x2F
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x30-x03F
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x40-0x4F
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x50-x05F
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x60-0x6F
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x70-x07F
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x80-0x8F
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x90-0x9F
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xA0-0xAF
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xB0-0xBF
|
||||||
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xC0-0xCF
|
||||||
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xD0-0xDF
|
||||||
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
|
||||||
|
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
|
||||||
|
|
||||||
def to_dafsa(words):
|
def to_dafsa(words):
|
||||||
"""Generates a DAFSA from a word list and returns the source node.
|
"""Generates a DAFSA from a word list and returns the source node.
|
||||||
|
@ -206,14 +246,29 @@ def to_dafsa(words):
|
||||||
"""
|
"""
|
||||||
if not words:
|
if not words:
|
||||||
raise InputError('The domain list must not be empty')
|
raise InputError('The domain list must not be empty')
|
||||||
def to_nodes(word):
|
def to_nodes(word, multibyte_length):
|
||||||
"""Split words into characters"""
|
"""Split words into characters"""
|
||||||
if not 0x1F < ord(word[0]) < 0x80:
|
byte = ord(word[0])
|
||||||
raise InputError('Domain names must be printable 7-bit ASCII')
|
if multibyte_length:
|
||||||
|
# Consume next byte in multibyte sequence.
|
||||||
|
if byte & 0xC0 != 0x80:
|
||||||
|
raise InputError('Invalid UTF-8 multibyte sequence')
|
||||||
|
return chr(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
|
||||||
|
char_length = char_length_table[byte]
|
||||||
|
if char_length == 1:
|
||||||
|
# 7-bit printable ASCII.
|
||||||
if len(word) == 1:
|
if len(word) == 1:
|
||||||
return chr(int(word[0], 16) & 0x0F), [None]
|
return chr(int(word[0], 16) & 0x0F), [None]
|
||||||
return word[0], [to_nodes(word[1:])]
|
return word[0], [to_nodes(word[1:], 0)]
|
||||||
return [to_nodes(word) for word in words]
|
elif char_length > 1:
|
||||||
|
# Leading byte in multibyte sequence.
|
||||||
|
if len(word) <= char_length:
|
||||||
|
raise InputError('Unterminated UTF-8 multibyte sequence')
|
||||||
|
return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
|
||||||
|
# Unexpected character.
|
||||||
|
raise InputError('Domain names must be printable ASCII or UTF-8')
|
||||||
|
|
||||||
|
return [to_nodes(word, 0) for word in words]
|
||||||
|
|
||||||
|
|
||||||
def to_words(node):
|
def to_words(node):
|
||||||
|
|
Loading…
Reference in New Issue