Updated DAFSA generator and parser to support UTF-8 encoding
This commit is contained in:
parent
e126a67354
commit
e03953e27a
1
AUTHORS
1
AUTHORS
|
@ -16,3 +16,4 @@ Christopher Meng (Fedora building)
|
|||
Jakub Čajka
|
||||
Giuseppe Scrivano
|
||||
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
|
||||
Olle Liljenzin (Original DAFSA implementation and UTF-8 patch)
|
||||
|
|
|
@ -21,6 +21,48 @@
|
|||
|
||||
#define CHECK_LT(a, b) if ((a) >= b) return 0
|
||||
|
||||
/*
 * UTF-8 sequence length, indexed by the high nibble of the first byte.
 * Each entry covers 16 consecutive byte values. The last entry covers
 * 0xF0-0xFF and cannot tell real 4-byte leaders (0xF0-0xF7) from the
 * bytes 0xF8-0xFF; GetMultibyteLength() filters those out.
 */
static const char multibyte_length_table[16] = {
	0, 0, 0, 0,  /* 0x00-0x3F */
	0, 0, 0, 0,  /* 0x40-0x7F */
	0, 0, 0, 0,  /* 0x80-0xBF */
	2, 2, 3, 4,  /* 0xC0-0xFF */
};

/**
 * Get length of multibyte character sequence starting at a given byte.
 *
 * Returns 2, 3 or 4 for a leading byte in the range 0xC0-0xF7.
 * Returns zero if the byte does not start a multibyte sequence: single
 * byte characters (0x00-0x7F), continuation bytes (0x80-0xBF) and the
 * bytes 0xF8-0xFF, which never occur as a leading byte in UTF-8.
 */
static int GetMultibyteLength(char c) {
	/* char may be signed, so go through unsigned char before shifting. */
	const unsigned char byte = (unsigned char)c;

	/* 0xF8-0xFF share a table slot with the 4-byte leaders 0xF0-0xF7 but
	 * are not leading bytes, so reject them before the lookup. */
	if (byte >= 0xF8)
		return 0;

	return multibyte_length_table[byte >> 4];
}
|
||||
|
||||
/**
 * Step past one matched byte in the graph and update the key cursor.
 *
 * pos             - graph position; always advanced by one byte.
 * key             - key cursor; advanced by one byte unless the byte just
 *                   matched was the prefix that switches to multibyte mode.
 * multibyte_start - start of the multibyte sequence currently being matched,
 *                   or 0 when matching single byte characters. Set when a
 *                   sequence begins and cleared once its last byte was
 *                   consumed.
 */
static void NextPos(const unsigned char** pos,
                    const char** key,
                    const char** multibyte_start)
{
	const char* sequence_start = *multibyte_start;

	*pos += 1;

	if (!sequence_start) {
		if (GetMultibyteLength(**key)) {
			/* The graph byte just matched was the multibyte prefix. The key
			 * stays where it is; its bytes are matched from the next round. */
			*multibyte_start = *key;
		} else {
			/* A single byte character was matched. */
			*key += 1;
		}
		return;
	}

	/* Inside a multibyte sequence: one byte of the key was consumed. */
	*key += 1;

	/* Leave multibyte mode after the final byte of the sequence. */
	if (*key - sequence_start == GetMultibyteLength(*sequence_start))
		*multibyte_start = 0;
}
|
||||
|
||||
/*
|
||||
* Read next offset from pos.
|
||||
* Returns true if an offset could be read, false otherwise.
|
||||
|
@ -71,6 +113,35 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
|
|||
return(*offset & 0x80) != 0;
|
||||
}
|
||||
|
||||
/*
 * Check if a graph byte matches the byte that key points at.
 * The caller must already have verified that the graph byte is in range.
 *
 * matcher         - graph byte to compare against.
 * key             - current position in the key.
 * multibyte_start - start of the multibyte sequence being matched, or 0
 *                   when not in multibyte mode.
 *
 * Returns non-zero on match, zero otherwise.
 */
static int IsMatchUnchecked(const unsigned char matcher,
                            const char* key,
                            const char* multibyte_start)
{
	const unsigned char key_byte = (const unsigned char)*key;

	if (!multibyte_start) {
		/* Not in multibyte mode. A key byte that starts a multibyte sequence
		 * can only be matched by the special mode switch byte 0x1F in the
		 * dafsa; any other key byte is compared as a single byte character. */
		if (GetMultibyteLength(*key))
			return matcher == 0x1F;
		return matcher == key_byte;
	}

	/* Multibyte matching mode. The leading byte (which also fixes the
	 * sequence length) is compared after flipping 0x80 in the graph byte,
	 * the following bytes after flipping 0xC0. */
	if (multibyte_start == key)
		return (matcher ^ 0x80) == key_byte;

	return (matcher ^ 0xC0) == key_byte;
}
|
||||
|
||||
/*
|
||||
* Check if byte at offset matches first character in key.
|
||||
* This version matches characters not last in label.
|
||||
|
@ -78,10 +149,11 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
|
|||
|
||||
static int IsMatch(const unsigned char* offset,
|
||||
const unsigned char* end,
|
||||
const char* key)
|
||||
const char* key,
|
||||
const char* multibyte_start)
|
||||
{
|
||||
CHECK_LT(offset, end);
|
||||
return *offset == *key;
|
||||
return IsMatchUnchecked(*offset, key, multibyte_start);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -91,10 +163,11 @@ static int IsMatch(const unsigned char* offset,
|
|||
|
||||
static int IsEndCharMatch(const unsigned char* offset,
|
||||
const unsigned char* end,
|
||||
const char* key)
|
||||
const char* key,
|
||||
const char* multibyte_start)
|
||||
{
|
||||
CHECK_LT(offset, end);
|
||||
return *offset == (*key | 0x80);
|
||||
return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -104,10 +177,11 @@ static int IsEndCharMatch(const unsigned char* offset,
|
|||
|
||||
static int GetReturnValue(const unsigned char* offset,
|
||||
const unsigned char* end,
|
||||
const char* multibyte_start,
|
||||
int* return_value)
|
||||
{
|
||||
CHECK_LT(offset, end);
|
||||
if ((*offset & 0xE0) == 0x80) {
|
||||
if (!multibyte_start && (*offset & 0xE0) == 0x80) {
|
||||
*return_value = *offset & 0x0F;
|
||||
return 1;
|
||||
}
|
||||
|
@ -140,6 +214,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
|||
const unsigned char* end = graph + length;
|
||||
const unsigned char* offset = pos;
|
||||
const char* key_end = key + key_length;
|
||||
const char* multibyte_start = 0;
|
||||
|
||||
while (GetNextOffset(&pos, end, &offset)) {
|
||||
/*char <char>+ end_char offsets
|
||||
|
@ -153,11 +228,10 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
|||
|
||||
if (key != key_end && !IsEOL(offset, end)) {
|
||||
/* Leading <char> is not a match. Don't dive into this child */
|
||||
if (!IsMatch(offset, end, key))
|
||||
if (!IsMatch(offset, end, key, multibyte_start))
|
||||
continue;
|
||||
did_consume = 1;
|
||||
++offset;
|
||||
++key;
|
||||
NextPos(&offset, &key, &multibyte_start);
|
||||
/* Possible matches at this point:
|
||||
* <char>+ end_char offsets
|
||||
* <char>+ return value
|
||||
|
@ -167,10 +241,9 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
|||
|
||||
/* Remove all remaining <char> nodes possible */
|
||||
while (!IsEOL(offset, end) && key != key_end) {
|
||||
if (!IsMatch(offset, end, key))
|
||||
if (!IsMatch(offset, end, key, multibyte_start))
|
||||
return -1;
|
||||
++key;
|
||||
++offset;
|
||||
NextPos(&offset, &key, &multibyte_start);
|
||||
}
|
||||
}
|
||||
/* Possible matches at this point:
|
||||
|
@ -182,7 +255,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
|||
if (key == key_end) {
|
||||
int return_value;
|
||||
|
||||
if (GetReturnValue(offset, end, &return_value))
|
||||
if (GetReturnValue(offset, end, multibyte_start, &return_value))
|
||||
return return_value;
|
||||
/* The DAFSA guarantees that if the first char is a match, all
|
||||
* remaining char elements MUST match if the key is truly present.
|
||||
|
@ -191,13 +264,13 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
|||
return -1;
|
||||
continue;
|
||||
}
|
||||
if (!IsEndCharMatch(offset, end, key)) {
|
||||
if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
|
||||
if (did_consume)
|
||||
return -1; /* Unexpected */
|
||||
continue;
|
||||
}
|
||||
++key;
|
||||
pos = ++offset; /* Dive into child */
|
||||
NextPos(&offset, &key, &multibyte_start);
|
||||
pos = offset; /* Dive into child */
|
||||
}
|
||||
|
||||
return -1; /* No match */
|
||||
|
|
|
@ -14,8 +14,9 @@ This python program fetches strings and return values from a gperf file
|
|||
and generates a C++ file with a byte array representing graph that can be
|
||||
used as a memory efficient replacement for the perfect hash table.
|
||||
|
||||
The input strings are assumed to consist of printable 7-bit ASCII characters
|
||||
and the return values are assumed to be one digit integers.
|
||||
The input strings must consist of printable 7-bit ASCII characters or UTF-8
|
||||
multibyte sequences. Control characters in the range [0x00-0x1F] are not
|
||||
allowed. The return values must be one digit integers.
|
||||
|
||||
In this program a DAFSA is a diamond shaped graph starting at a common
|
||||
source node and ending at a common sink node. All internal nodes contain
|
||||
|
@ -47,8 +48,8 @@ The generated byte array can described by the following BNF:
|
|||
|
||||
<byte> ::= < 8-bit value in range [0x00-0xFF] >
|
||||
|
||||
<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
|
||||
<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
|
||||
<char> ::= < byte in range [0x1F-0x7F] >
|
||||
<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
|
||||
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
|
||||
|
||||
<offset1> ::= < byte in range [0x00-0x3F] >
|
||||
|
@ -89,8 +90,8 @@ The generated byte array can described by the following BNF:
|
|||
|
||||
Decoding:
|
||||
|
||||
<char> -> printable 7-bit ASCII character
|
||||
<end_char> & 0x7F -> printable 7-bit ASCII character
|
||||
<char> -> character
|
||||
<end_char> & 0x7F -> character
|
||||
<return value> & 0x0F -> integer
|
||||
<offset1 & 0x3F> -> integer
|
||||
((<offset2> & 0x1F>) << 8) + <byte> -> integer
|
||||
|
@ -105,6 +106,28 @@ between previous child node and next child node. Thus each offset links a node
|
|||
to a child node. The distance is always counted between start addresses, i.e.
|
||||
first byte in decoded offset or first byte in child node.
|
||||
|
||||
Transcoding of UTF-8 multibyte sequences:
|
||||
|
||||
The original DAFSA format was limited to 7-bit printable ASCII characters in
|
||||
range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
|
||||
By transcoding of such characters the new format preserves compatibility with
|
||||
old parsers, so that a DAFSA in the extended format can be used by an old
|
||||
parser without false positives, although strings containing transcoded
|
||||
characters will never match. Since the format is extended rather than being
|
||||
changed, a parser supporting the new format will automatically support data
|
||||
generated in the old format.
|
||||
|
||||
Transcoding is performed by insertion of a start byte with the special value
|
||||
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
|
||||
the range of printable ASCII.
|
||||
|
||||
2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
|
||||
|
||||
3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
|
||||
|
||||
4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
|
||||
00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
|
||||
|
||||
Example 1:
|
||||
|
||||
%%
|
||||
|
@ -197,6 +220,23 @@ import sys
|
|||
class InputError(Exception):
|
||||
"""Exception raised for errors in the input file."""
|
||||
|
||||
# Length in bytes of the character starting at a given byte (the table index).
# A length of 0 marks a byte that is not accepted as the start of a character:
# control characters 0x00-0x1F, UTF-8 continuation bytes 0x80-0xBF and the
# bytes 0xF8-0xFF.
|
||||
char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x0F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x10-0x1F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x20-0x2F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x30-0x3F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x40-0x4F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x50-0x5F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x60-0x6F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x70-0x7F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x80-0x8F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x90-0x9F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xA0-0xAF
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xB0-0xBF
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xC0-0xCF
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xD0-0xDF
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
|
||||
|
||||
def to_dafsa(words):
|
||||
"""Generates a DAFSA from a word list and returns the source node.
|
||||
|
@ -206,14 +246,29 @@ def to_dafsa(words):
|
|||
"""
|
||||
if not words:
|
||||
raise InputError('The domain list must not be empty')
|
||||
def to_nodes(word):
|
||||
def to_nodes(word, multibyte_length):
|
||||
"""Split words into characters"""
|
||||
if not 0x1F < ord(word[0]) < 0x80:
|
||||
raise InputError('Domain names must be printable 7-bit ASCII')
|
||||
byte = ord(word[0])
|
||||
if multibyte_length:
|
||||
# Consume next byte in multibyte sequence.
|
||||
if byte & 0xC0 != 0x80:
|
||||
raise InputError('Invalid UTF-8 multibyte sequence')
|
||||
return chr(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
|
||||
char_length = char_length_table[byte]
|
||||
if char_length == 1:
|
||||
# 7-bit printable ASCII.
|
||||
if len(word) == 1:
|
||||
return chr(int(word[0], 16) & 0x0F), [None]
|
||||
return word[0], [to_nodes(word[1:])]
|
||||
return [to_nodes(word) for word in words]
|
||||
return word[0], [to_nodes(word[1:], 0)]
|
||||
elif char_length > 1:
|
||||
# Leading byte in multibyte sequence.
|
||||
if len(word) <= char_length:
|
||||
raise InputError('Unterminated UTF-8 multibyte sequence')
|
||||
return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
|
||||
# Unexpected character.
|
||||
raise InputError('Domain names must be printable ASCII or UTF-8')
|
||||
|
||||
return [to_nodes(word, 0) for word in words]
|
||||
|
||||
|
||||
def to_words(node):
|
||||
|
|
Loading…
Reference in New Issue