diff --git a/AUTHORS b/AUTHORS index 33dad7b..6f3195c 100644 --- a/AUTHORS +++ b/AUTHORS @@ -16,3 +16,4 @@ Christopher Meng (Fedora building) Jakub Čajka Giuseppe Scrivano Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support) +Olle Liljenzin (Original DAFSA implementation and UTF-8 patch) diff --git a/src/lookup_string_in_fixed_set.c b/src/lookup_string_in_fixed_set.c index ddf63ae..01edc4e 100644 --- a/src/lookup_string_in_fixed_set.c +++ b/src/lookup_string_in_fixed_set.c @@ -21,6 +21,48 @@ #define CHECK_LT(a, b) if ((a) >= b) return 0 +static const char multibyte_length_table[16] = { + 0, 0, 0, 0, /* 0x00-0x3F */ + 0, 0, 0, 0, /* 0x40-0x7F */ + 0, 0, 0, 0, /* 0x80-0xBF */ + 2, 2, 3, 4, /* 0xC0-0xFF */ +}; + + +/** + * Get length of multibyte character sequence starting at a given byte. + * Returns zero if the byte is not a valid leading byte in UTF-8. + */ +static int GetMultibyteLength(char c) { + return multibyte_length_table[((unsigned char)c) >> 4]; +} + +/** + * Moves pointers one byte forward. + */ +static void NextPos(const unsigned char** pos, + const char** key, + const char** multibyte_start) +{ + ++*pos; + if (*multibyte_start) { + /* Advance key to next byte in multibyte sequence. */ + ++*key; + /* Reset multibyte_start if last byte in multibyte sequence was consumed. */ + if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start)) + *multibyte_start = 0; + } else { + if (GetMultibyteLength(**key)) { + /* Multibyte prefix was matched in the dafsa, start matching multibyte + * content in next round. */ + *multibyte_start = *key; + } else { + /* Advance key as a single byte character was matched. */ + ++*key; + } + } +} + /* * Read next offset from pos. * Returns true if an offset could be read, false otherwise. @@ -71,6 +113,35 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end) return(*offset & 0x80) != 0; } +/* + * Check if byte at offset matches first character in key. 
+ * This version assumes a range check was already performed by the caller. + */ + +static int IsMatchUnchecked(const unsigned char matcher, + const char* key, + const char* multibyte_start) +{ + if (multibyte_start) { + /* Multibyte matching mode. */ + if (multibyte_start == key) { + /* Match leading byte, which will also match the sequence length. */ + return (matcher ^ 0x80) == (const unsigned char)*key; + } else { + /* Match following bytes. */ + return (matcher ^ 0xC0) == (const unsigned char)*key; + } + } + /* If key points at a leading byte in a multibyte sequence, but we are not yet + * in multibyte mode, then the dafsa should contain a special byte to indicate + * a mode switch. */ + if (GetMultibyteLength(*key)) { + return matcher == 0x1F; + } + /* Normal matching of a single byte character. */ + return matcher == (const unsigned char)*key; +} + /* * Check if byte at offset matches first character in key. * This version matches characters not last in label. @@ -78,10 +149,11 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end) static int IsMatch(const unsigned char* offset, const unsigned char* end, - const char* key) + const char* key, + const char* multibyte_start) { CHECK_LT(offset, end); - return *offset == *key; + return IsMatchUnchecked(*offset, key, multibyte_start); } /* @@ -91,10 +163,11 @@ static int IsMatch(const unsigned char* offset, static int IsEndCharMatch(const unsigned char* offset, const unsigned char* end, - const char* key) + const char* key, + const char* multibyte_start) { CHECK_LT(offset, end); - return *offset == (*key | 0x80); + return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start); } /* @@ -104,10 +177,11 @@ static int IsEndCharMatch(const unsigned char* offset, static int GetReturnValue(const unsigned char* offset, const unsigned char* end, + const char* multibyte_start, int* return_value) { CHECK_LT(offset, end); - if ((*offset & 0xE0) == 0x80) { + if (!multibyte_start && (*offset & 0xE0) == 0x80) 
{ *return_value = *offset & 0x0F; return 1; } @@ -140,6 +214,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph, const unsigned char* end = graph + length; const unsigned char* offset = pos; const char* key_end = key + key_length; + const char* multibyte_start = 0; while (GetNextOffset(&pos, end, &offset)) { /*char + end_char offsets @@ -153,11 +228,10 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph, if (key != key_end && !IsEOL(offset, end)) { /* Leading is not a match. Don't dive into this child */ - if (!IsMatch(offset, end, key)) + if (!IsMatch(offset, end, key, multibyte_start)) continue; did_consume = 1; - ++offset; - ++key; + NextPos(&offset, &key, &multibyte_start); /* Possible matches at this point: * + end_char offsets * + return value @@ -167,10 +241,9 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph, /* Remove all remaining nodes possible */ while (!IsEOL(offset, end) && key != key_end) { - if (!IsMatch(offset, end, key)) + if (!IsMatch(offset, end, key, multibyte_start)) return -1; - ++key; - ++offset; + NextPos(&offset, &key, &multibyte_start); } } /* Possible matches at this point: @@ -182,7 +255,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph, if (key == key_end) { int return_value; - if (GetReturnValue(offset, end, &return_value)) + if (GetReturnValue(offset, end, multibyte_start, &return_value)) return return_value; /* The DAFSA guarantees that if the first char is a match, all * remaining char elements MUST match if the key is truly present. 
@@ -191,13 +264,13 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph, return -1; continue; } - if (!IsEndCharMatch(offset, end, key)) { + if (!IsEndCharMatch(offset, end, key, multibyte_start)) { if (did_consume) return -1; /* Unexpected */ continue; } - ++key; - pos = ++offset; /* Dive into child */ + NextPos(&offset, &key, &multibyte_start); + pos = offset; /* Dive into child */ } return -1; /* No match */ diff --git a/src/psl-make-dafsa b/src/psl-make-dafsa index 99c3135..bd9a79a 100755 --- a/src/psl-make-dafsa +++ b/src/psl-make-dafsa @@ -14,8 +14,9 @@ This python program fetches strings and return values from a gperf file and generates a C++ file with a byte array representing graph that can be used as a memory efficient replacement for the perfect hash table. -The input strings are assumed to consist of printable 7-bit ASCII characters -and the return values are assumed to be one digit integers. +The input strings must consist of printable 7-bit ASCII characters or UTF-8 +multibyte sequences. Control characters in the range [0x00-0x1F] are not +allowed. The return values must be one digit integers. . In this program a DAFSA is a diamond shaped graph starting at a common source node and ending at a common sink node. 
All internal nodes contain @@ -47,8 +48,8 @@ The generated byte array can described by the following BNF: <byte> ::= < 8-bit value in range [0x00-0xFF] > -<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] > -<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] > +<char> ::= < byte in range [0x1F-0x7F] > +<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] > <return value> ::= < value + 0x80, byte in range [0x80-0x8F] > <offset1> ::= < byte in range [0x00-0x3F] > @@ -89,8 +90,8 @@ The generated byte array can described by the following BNF: Decoding: -<char> -> printable 7-bit ASCII character -<end_char> & 0x7F -> printable 7-bit ASCII character +<char> -> character +<end_char> & 0x7F -> character <return value> & 0x0F -> integer <offset1 & 0x3F> -> integer ((<offset2> & 0x1F>) << 8) + <byte> -> integer @@ -105,6 +106,28 @@ between previous child node and next child node. Thus each offset links a node to a child node. The distance is always counted between start addresses, i.e. first byte in decoded offset or first byte in child node. +Transcoding of UTF-8 multibyte sequences: + +The original DAFSA format was limited to 7-bit printable ASCII characters in +range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences. +By transcoding of such characters the new format preserves compatibility with +old parsers, so that a DAFSA in the extended format can be used by an old +parser without false positives, although strings containing transcoded +characters will never match. Since the format is extended rather than being +changed, a parser supporting the new format will automatically support data +generated in the old format. + +Transcoding is performed by insertion of a start byte with the special value +0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside +the range of printable ASCII. 
+2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn + +3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn + +4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn -> + 00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn + Example 1: %% @@ -197,6 +220,23 @@ import sys class InputError(Exception): """Exception raised for errors in the input file.""" +# Length of a character starting at a given byte. +char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x0F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x10-0x1F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x20-0x2F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x30-0x3F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x40-0x4F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x50-0x5F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x60-0x6F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x70-0x7F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x80-0x8F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x90-0x9F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xA0-0xAF + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xB0-0xBF + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xC0-0xCF + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xD0-0xDF + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF + 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF def to_dafsa(words): """Generates a DAFSA from a word list and returns the source node. 
@@ -206,14 +246,29 @@ def to_dafsa(words): """ if not words: raise InputError('The domain list must not be empty') - def to_nodes(word): + def to_nodes(word, multibyte_length): """Split words into characters""" - if not 0x1F < ord(word[0]) < 0x80: - raise InputError('Domain names must be printable 7-bit ASCII') - if len(word) == 1: - return chr(int(word[0], 16) & 0x0F), [None] - return word[0], [to_nodes(word[1:])] - return [to_nodes(word) for word in words] + byte = ord(word[0]) + if multibyte_length: + # Consume next byte in multibyte sequence. + if byte & 0xC0 != 0x80: + raise InputError('Invalid UTF-8 multibyte sequence') + return chr(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)] + char_length = char_length_table[byte] + if char_length == 1: + # 7-bit printable ASCII. + if len(word) == 1: + return chr(int(word[0], 16) & 0x0F), [None] + return word[0], [to_nodes(word[1:], 0)] + elif char_length > 1: + # Leading byte in multibyte sequence. + if len(word) <= char_length: + raise InputError('Unterminated UTF-8 multibyte sequence') + return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])] + # Unexpected character. + raise InputError('Domain names must be printable ASCII or UTF-8') + + return [to_nodes(word, 0) for word in words] def to_words(node):