Updated DAFSA generator and parser to support UTF-8 encoding

This commit is contained in:
Olle Liljenzin 2016-11-02 20:22:01 +01:00 committed by Tim Rühsen
parent e126a67354
commit e03953e27a
3 changed files with 157 additions and 28 deletions

View File

@ -16,3 +16,4 @@ Christopher Meng (Fedora building)
Jakub Čajka Jakub Čajka
Giuseppe Scrivano Giuseppe Scrivano
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support) Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
Olle Liljenzin (Original DAFSA implementation and UTF-8 patch)

View File

@ -21,6 +21,48 @@
#define CHECK_LT(a, b) if ((a) >= b) return 0 #define CHECK_LT(a, b) if ((a) >= b) return 0
/*
 * Length of the multibyte sequence introduced by a byte, indexed by the
 * byte's high nibble. Zero marks bytes that do not introduce a sequence.
 */
static const char multibyte_length_table[16] = {
	0, 0, 0, 0,	/* 0x00-0x3F */
	0, 0, 0, 0,	/* 0x40-0x7F */
	0, 0, 0, 0,	/* 0x80-0xBF */
	2, 2, 3, 4,	/* 0xC0-0xFF */
};

/**
 * Get length of multibyte character sequence starting at a given byte.
 *
 * Returns 2, 3 or 4 for a leading byte in 0xC0-0xF7, and zero for every
 * other byte: single byte characters (0x00-0x7F), continuation bytes
 * (0x80-0xBF) and 0xF8-0xFF, which never start a UTF-8 sequence.
 *
 * Only the leading byte is classified here; no further validation is done,
 * so e.g. 0xC0 is still reported as the start of a 2-byte sequence.
 */
static int GetMultibyteLength(char c) {
	const unsigned char byte = (unsigned char)c;

	/* 0xF8-0xFF share a table slot with the 4-byte leads 0xF0-0xF7, so the
	 * nibble lookup alone would report them as 4-byte leads. */
	if (byte >= 0xF8)
		return 0;

	return multibyte_length_table[byte >> 4];
}
/**
 * Steps the graph cursor one byte forward and updates the key cursor and the
 * multibyte state to match.
 *
 * pos             - graph cursor, always advanced by one byte.
 * key             - key cursor, advanced unless the byte just matched was the
 *                   mode switch in front of a multibyte sequence.
 * multibyte_start - null outside a multibyte sequence, otherwise the address
 *                   of the leading byte of the sequence being matched.
 */
static void NextPos(const unsigned char** pos,
		const char** key,
		const char** multibyte_start)
{
	const char* sequence = *multibyte_start;

	*pos += 1;

	if (sequence == 0) {
		if (GetMultibyteLength(**key) != 0) {
			/* The dafsa matched the multibyte prefix; the key stays on the
			 * leading byte, which is matched in the next round. */
			*multibyte_start = *key;
			return;
		}
		/* A single byte character was matched. */
		*key += 1;
		return;
	}

	/* Inside a multibyte sequence: one more byte of it was consumed. */
	*key += 1;

	/* Leave multibyte mode once the whole sequence has been consumed. */
	if (*key - sequence == GetMultibyteLength(*sequence))
		*multibyte_start = 0;
}
/* /*
* Read next offset from pos. * Read next offset from pos.
* Returns true if an offset could be read, false otherwise. * Returns true if an offset could be read, false otherwise.
@ -71,6 +113,35 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
return(*offset & 0x80) != 0; return(*offset & 0x80) != 0;
} }
/*
 * Check if a graph byte matches the first character in key.
 * The caller must already have performed the range check on the graph.
 *
 * matcher         - graph byte to compare against.
 * key             - current position in the key.
 * multibyte_start - null outside a multibyte sequence, otherwise the address
 *                   of the leading byte of the sequence being matched.
 *
 * Returns non-zero on match, zero otherwise.
 */
static int IsMatchUnchecked(const unsigned char matcher,
		const char* key,
		const char* multibyte_start)
{
	const unsigned char key_byte = (unsigned char)*key;
	unsigned char flip;

	if (!multibyte_start) {
		/* A key positioned on a leading byte while not yet in multibyte mode
		 * can only match the special mode switch byte in the dafsa. */
		if (GetMultibyteLength(*key))
			return matcher == 0x1F;

		/* Plain single byte character. */
		return matcher == key_byte;
	}

	/* Multibyte matching mode. The leading byte differs from the key byte in
	 * bit 0x80 (matching it also matches the sequence length); the following
	 * bytes differ in bits 0xC0. */
	flip = (multibyte_start == key) ? 0x80 : 0xC0;

	return (matcher ^ flip) == key_byte;
}
/* /*
* Check if byte at offset matches first character in key. * Check if byte at offset matches first character in key.
* This version matches characters not last in label. * This version matches characters not last in label.
@ -78,10 +149,11 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
static int IsMatch(const unsigned char* offset, static int IsMatch(const unsigned char* offset,
const unsigned char* end, const unsigned char* end,
const char* key) const char* key,
const char* multibyte_start)
{ {
CHECK_LT(offset, end); CHECK_LT(offset, end);
return *offset == *key; return IsMatchUnchecked(*offset, key, multibyte_start);
} }
/* /*
@ -91,10 +163,11 @@ static int IsMatch(const unsigned char* offset,
static int IsEndCharMatch(const unsigned char* offset, static int IsEndCharMatch(const unsigned char* offset,
const unsigned char* end, const unsigned char* end,
const char* key) const char* key,
const char* multibyte_start)
{ {
CHECK_LT(offset, end); CHECK_LT(offset, end);
return *offset == (*key | 0x80); return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
} }
/* /*
@ -104,10 +177,11 @@ static int IsEndCharMatch(const unsigned char* offset,
static int GetReturnValue(const unsigned char* offset, static int GetReturnValue(const unsigned char* offset,
const unsigned char* end, const unsigned char* end,
const char* multibyte_start,
int* return_value) int* return_value)
{ {
CHECK_LT(offset, end); CHECK_LT(offset, end);
if ((*offset & 0xE0) == 0x80) { if (!multibyte_start && (*offset & 0xE0) == 0x80) {
*return_value = *offset & 0x0F; *return_value = *offset & 0x0F;
return 1; return 1;
} }
@ -140,6 +214,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
const unsigned char* end = graph + length; const unsigned char* end = graph + length;
const unsigned char* offset = pos; const unsigned char* offset = pos;
const char* key_end = key + key_length; const char* key_end = key + key_length;
const char* multibyte_start = 0;
while (GetNextOffset(&pos, end, &offset)) { while (GetNextOffset(&pos, end, &offset)) {
/*char <char>+ end_char offsets /*char <char>+ end_char offsets
@ -153,11 +228,10 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
if (key != key_end && !IsEOL(offset, end)) { if (key != key_end && !IsEOL(offset, end)) {
/* Leading <char> is not a match. Don't dive into this child */ /* Leading <char> is not a match. Don't dive into this child */
if (!IsMatch(offset, end, key)) if (!IsMatch(offset, end, key, multibyte_start))
continue; continue;
did_consume = 1; did_consume = 1;
++offset; NextPos(&offset, &key, &multibyte_start);
++key;
/* Possible matches at this point: /* Possible matches at this point:
* <char>+ end_char offsets * <char>+ end_char offsets
* <char>+ return value * <char>+ return value
@ -167,10 +241,9 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
/* Remove all remaining <char> nodes possible */ /* Remove all remaining <char> nodes possible */
while (!IsEOL(offset, end) && key != key_end) { while (!IsEOL(offset, end) && key != key_end) {
if (!IsMatch(offset, end, key)) if (!IsMatch(offset, end, key, multibyte_start))
return -1; return -1;
++key; NextPos(&offset, &key, &multibyte_start);
++offset;
} }
} }
/* Possible matches at this point: /* Possible matches at this point:
@ -182,7 +255,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
if (key == key_end) { if (key == key_end) {
int return_value; int return_value;
if (GetReturnValue(offset, end, &return_value)) if (GetReturnValue(offset, end, multibyte_start, &return_value))
return return_value; return return_value;
/* The DAFSA guarantees that if the first char is a match, all /* The DAFSA guarantees that if the first char is a match, all
* remaining char elements MUST match if the key is truly present. * remaining char elements MUST match if the key is truly present.
@ -191,13 +264,13 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
return -1; return -1;
continue; continue;
} }
if (!IsEndCharMatch(offset, end, key)) { if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
if (did_consume) if (did_consume)
return -1; /* Unexpected */ return -1; /* Unexpected */
continue; continue;
} }
++key; NextPos(&offset, &key, &multibyte_start);
pos = ++offset; /* Dive into child */ pos = offset; /* Dive into child */
} }
return -1; /* No match */ return -1; /* No match */

View File

@ -14,8 +14,9 @@ This python program fetches strings and return values from a gperf file
and generates a C++ file with a byte array representing graph that can be and generates a C++ file with a byte array representing graph that can be
used as a memory efficient replacement for the perfect hash table. used as a memory efficient replacement for the perfect hash table.
The input strings are assumed to consist of printable 7-bit ASCII characters The input strings must consist of printable 7-bit ASCII characters or UTF-8
and the return values are assumed to be one digit integers. multibyte sequences. Control characters in the range [0x00-0x1F] are not
allowed. The return values must be one digit integers.
In this program a DAFSA is a diamond shaped graph starting at a common In this program a DAFSA is a diamond shaped graph starting at a common
source node and ending at a common sink node. All internal nodes contain source node and ending at a common sink node. All internal nodes contain
@ -47,8 +48,8 @@ The generated byte array can described by the following BNF:
<byte> ::= < 8-bit value in range [0x00-0xFF] > <byte> ::= < 8-bit value in range [0x00-0xFF] >
<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] > <char> ::= < byte in range [0x1F-0x7F] >
<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] > <end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] > <return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
<offset1> ::= < byte in range [0x00-0x3F] > <offset1> ::= < byte in range [0x00-0x3F] >
@ -89,8 +90,8 @@ The generated byte array can described by the following BNF:
Decoding: Decoding:
<char> -> printable 7-bit ASCII character <char> -> character
<end_char> & 0x7F -> printable 7-bit ASCII character <end_char> & 0x7F -> character
<return value> & 0x0F -> integer <return value> & 0x0F -> integer
<offset1 & 0x3F> -> integer <offset1 & 0x3F> -> integer
((<offset2> & 0x1F>) << 8) + <byte> -> integer ((<offset2> & 0x1F>) << 8) + <byte> -> integer
@ -105,6 +106,28 @@ between previous child node and next child node. Thus each offset links a node
to a child node. The distance is always counted between start addresses, i.e. to a child node. The distance is always counted between start addresses, i.e.
first byte in decoded offset or first byte in child node. first byte in decoded offset or first byte in child node.
Transcoding of UTF-8 multibyte sequences:
The original DAFSA format was limited to 7-bit printable ASCII characters in
range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
By transcoding of such characters the new format preserves compatibility with
old parsers, so that a DAFSA in the extended format can be used by an old
parser without false positives, although strings containing transcoded
characters will never match. Since the format is extended rather than being
changed, a parser supporting the new format will automatically support data
generated in the old format.
Transcoding is performed by insertion of a start byte with the special value
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
the range of printable ASCII.
2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
Example 1: Example 1:
%% %%
@ -197,6 +220,23 @@ import sys
class InputError(Exception): class InputError(Exception):
"""Exception raised for errors in the input file.""" """Exception raised for errors in the input file."""
# Length of a character starting at a given byte, indexed by byte value.
# A length of 0 marks a byte that cannot start a character.
char_length_table = (
    (0,) * 0x20 +  # 0x00-0x1F: control characters
    (1,) * 0x60 +  # 0x20-0x7F: single byte characters
    (0,) * 0x40 +  # 0x80-0xBF: bytes following a leading byte
    (2,) * 0x20 +  # 0xC0-0xDF: leading byte of a 2-byte sequence
    (3,) * 0x10 +  # 0xE0-0xEF: leading byte of a 3-byte sequence
    (4,) * 0x08 +  # 0xF0-0xF7: leading byte of a 4-byte sequence
    (0,) * 0x08    # 0xF8-0xFF: never a leading byte
)
def to_dafsa(words): def to_dafsa(words):
"""Generates a DAFSA from a word list and returns the source node. """Generates a DAFSA from a word list and returns the source node.
@ -206,14 +246,29 @@ def to_dafsa(words):
""" """
if not words: if not words:
raise InputError('The domain list must not be empty') raise InputError('The domain list must not be empty')
def to_nodes(word): def to_nodes(word, multibyte_length):
"""Split words into characters""" """Split words into characters"""
if not 0x1F < ord(word[0]) < 0x80: byte = ord(word[0])
raise InputError('Domain names must be printable 7-bit ASCII') if multibyte_length:
# Consume next byte in multibyte sequence.
if byte & 0xC0 != 0x80:
raise InputError('Invalid UTF-8 multibyte sequence')
return chr(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
char_length = char_length_table[byte]
if char_length == 1:
# 7-bit printable ASCII.
if len(word) == 1: if len(word) == 1:
return chr(int(word[0], 16) & 0x0F), [None] return chr(int(word[0], 16) & 0x0F), [None]
return word[0], [to_nodes(word[1:])] return word[0], [to_nodes(word[1:], 0)]
return [to_nodes(word) for word in words] elif char_length > 1:
# Leading byte in multibyte sequence.
if len(word) <= char_length:
raise InputError('Unterminated UTF-8 multibyte sequence')
return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
# Unexpected character.
raise InputError('Domain names must be printable ASCII or UTF-8')
return [to_nodes(word, 0) for word in words]
def to_words(node): def to_words(node):