This commit is contained in:
Olle Liljenzin 2016-11-06 14:35:34 +00:00 committed by GitHub
commit 2844125fa8
4 changed files with 212 additions and 48 deletions

View File

@ -16,3 +16,4 @@ Christopher Meng (Fedora building)
Jakub Čajka
Giuseppe Scrivano
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
Olle Liljenzin (Original DAFSA implementation and UTF-8 patch)

View File

@ -21,6 +21,48 @@
/*
 * Range check helper: makes the enclosing function return 0 unless a < b.
 * Both arguments are parenthesized so that compound expressions compare as a
 * whole, and the body is wrapped in do/while(0) so the macro behaves as a
 * single statement (no dangling-else surprises). Use with a trailing ';'.
 */
#define CHECK_LT(a, b) do { if ((a) >= (b)) return 0; } while (0)
/*
 * Sequence length indexed by the high nibble of a byte.
 * Each entry covers 16 byte values; a row of four entries covers 0x40 values.
 */
static const char multibyte_length_table[16] = {
	0, 0, 0, 0,	 /* 0x00-0x3F */
	0, 0, 0, 0,	 /* 0x40-0x7F */
	0, 0, 0, 0,	 /* 0x80-0xBF */
	2, 2, 3, 4,	 /* 0xC0-0xFF */
};

/**
 * Get length of multibyte character sequence starting at a given byte.
 * Returns zero if the byte is not a valid leading byte in UTF-8.
 */
static int GetMultibyteLength(char c) {
	const unsigned char byte = (unsigned char)c;

	/* 0xF8-0xFF share a table slot with the 4-byte leaders 0xF0-0xF7 but
	 * never start a UTF-8 sequence, so they must be rejected explicitly. */
	if (byte >= 0xF8)
		return 0;

	return multibyte_length_table[byte >> 4];
}
/**
 * Step the graph position one byte forward and update the key cursor.
 *
 * pos is always incremented. key is incremented unless the byte just matched
 * was the mode-switch for a multibyte character, in which case key stays on
 * the leading byte and *multibyte_start is set to it. While inside a
 * multibyte sequence, key advances byte by byte and *multibyte_start is
 * cleared once the whole sequence has been consumed.
 */
static void NextPos(const unsigned char** pos,
                    const char** key,
                    const char** multibyte_start)
{
	++*pos;

	if (!*multibyte_start) {
		if (GetMultibyteLength(**key)) {
			/* The multibyte prefix was matched in the dafsa; the sequence
			 * content is matched starting with the next round. */
			*multibyte_start = *key;
		} else {
			/* A single byte character was matched. */
			++*key;
		}
		return;
	}

	/* Inside a multibyte sequence: consume one byte of it. */
	++*key;

	/* Leave multibyte mode when the last byte of the sequence was consumed. */
	if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
		*multibyte_start = 0;
}
/*
* Read next offset from pos.
* Returns true if an offset could be read, false otherwise.
@ -71,6 +113,35 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
return(*offset & 0x80) != 0;
}
/*
 * Check if a graph byte matches the byte that key points at.
 * No range check is done here; the caller must have performed it.
 *
 * matcher is the graph byte (with any end-of-label bit already removed by
 * the caller). multibyte_start is non-null while a multibyte sequence of the
 * key is being matched and points at its leading byte.
 */
static int IsMatchUnchecked(const unsigned char matcher,
                            const char* key,
                            const char* multibyte_start)
{
	const unsigned char key_byte = (const unsigned char)*key;

	if (multibyte_start) {
		/* Multibyte matching mode: the leading byte is stored with bit 7
		 * flipped (which also pins the sequence length), following bytes
		 * are stored with bits 7 and 6 flipped. */
		const unsigned char flip = (multibyte_start == key) ? 0x80 : 0xC0;

		return (matcher ^ flip) == key_byte;
	}

	/* The key is at a leading byte but multibyte mode has not been entered
	 * yet: the dafsa must contain the special mode-switch byte here. */
	if (GetMultibyteLength(*key))
		return matcher == 0x1F;

	/* Plain single byte character. */
	return matcher == key_byte;
}
/*
* Check if byte at offset matches first character in key.
* This version matches characters not last in label.
@ -78,10 +149,11 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
static int IsMatch(const unsigned char* offset,
const unsigned char* end,
const char* key)
const char* key,
const char* multibyte_start)
{
CHECK_LT(offset, end);
return *offset == *key;
return IsMatchUnchecked(*offset, key, multibyte_start);
}
/*
@ -91,10 +163,11 @@ static int IsMatch(const unsigned char* offset,
static int IsEndCharMatch(const unsigned char* offset,
const unsigned char* end,
const char* key)
const char* key,
const char* multibyte_start)
{
CHECK_LT(offset, end);
return *offset == (*key | 0x80);
return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
}
/*
@ -104,10 +177,11 @@ static int IsEndCharMatch(const unsigned char* offset,
static int GetReturnValue(const unsigned char* offset,
const unsigned char* end,
const char* multibyte_start,
int* return_value)
{
CHECK_LT(offset, end);
if ((*offset & 0xE0) == 0x80) {
if (!multibyte_start && (*offset & 0xE0) == 0x80) {
*return_value = *offset & 0x0F;
return 1;
}
@ -140,6 +214,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
const unsigned char* end = graph + length;
const unsigned char* offset = pos;
const char* key_end = key + key_length;
const char* multibyte_start = 0;
while (GetNextOffset(&pos, end, &offset)) {
/*char <char>+ end_char offsets
@ -153,11 +228,10 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
if (key != key_end && !IsEOL(offset, end)) {
/* Leading <char> is not a match. Don't dive into this child */
if (!IsMatch(offset, end, key))
if (!IsMatch(offset, end, key, multibyte_start))
continue;
did_consume = 1;
++offset;
++key;
NextPos(&offset, &key, &multibyte_start);
/* Possible matches at this point:
* <char>+ end_char offsets
* <char>+ return value
@ -167,10 +241,9 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
/* Remove all remaining <char> nodes possible */
while (!IsEOL(offset, end) && key != key_end) {
if (!IsMatch(offset, end, key))
if (!IsMatch(offset, end, key, multibyte_start))
return -1;
++key;
++offset;
NextPos(&offset, &key, &multibyte_start);
}
}
/* Possible matches at this point:
@ -182,7 +255,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
if (key == key_end) {
int return_value;
if (GetReturnValue(offset, end, &return_value))
if (GetReturnValue(offset, end, multibyte_start, &return_value))
return return_value;
/* The DAFSA guarantees that if the first char is a match, all
* remaining char elements MUST match if the key is truly present.
@ -191,14 +264,23 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
return -1;
continue;
}
if (!IsEndCharMatch(offset, end, key)) {
if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
if (did_consume)
return -1; /* Unexpected */
continue;
}
++key;
pos = ++offset; /* Dive into child */
NextPos(&offset, &key, &multibyte_start);
pos = offset; /* Dive into child */
}
return -1; /* No match */
}
/* prototype to skip warning with -Wmissing-prototypes */
int _HIDDEN GetUtfMode(const unsigned char*, size_t);

/*
 * Report whether a DAFSA byte array carries the UTF-8 mode marker.
 * Returns 1 if the array is non-empty and its last byte is below 0x80,
 * 0 otherwise (including for an empty array).
 */
int _HIDDEN GetUtfMode(const unsigned char* graph,
                       size_t length)
{
	if (length == 0)
		return 0;

	return graph[length - 1] < 0x80;
}

View File

@ -14,8 +14,9 @@ This python program fetches strings and return values from a gperf file
and generates a C++ file with a byte array representing graph that can be
used as a memory efficient replacement for the perfect hash table.
The input strings are assumed to consist of printable 7-bit ASCII characters
and the return values are assumed to be one digit integers.
The input strings must consist of printable 7-bit ASCII characters or UTF-8
multibyte sequences. Control characters in the range [0x00-0x1F] are not
allowed. The return values must be one digit integers.
In this program a DAFSA is a diamond shaped graph starting at a common
source node and ending at a common sink node. All internal nodes contain
@ -47,8 +48,8 @@ The generated byte array can described by the following BNF:
<byte> ::= < 8-bit value in range [0x00-0xFF] >
<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
<char> ::= < byte in range [0x1F-0x7F] >
<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
<offset1> ::= < byte in range [0x00-0x3F] >
@ -84,13 +85,18 @@ The generated byte array can described by the following BNF:
| <prefix> <node>
| <end_label>
<dafsa> ::= <source>
| <dafsa> <node>
<graph> ::= <source>
| <graph> <node>
<version> ::= <empty> # The DAFSA was generated in ASCII mode.
| < byte value 0x01 > # The DAFSA was generated in UTF-8 mode.
<dafsa> ::= <graph> <version>
Decoding:
<char> -> printable 7-bit ASCII character
<end_char> & 0x7F -> printable 7-bit ASCII character
<char> -> character
<end_char> & 0x7F -> character
<return value> & 0x0F -> integer
<offset1 & 0x3F> -> integer
((<offset2> & 0x1F>) << 8) + <byte> -> integer
@ -105,6 +111,28 @@ between previous child node and next child node. Thus each offset links a node
to a child node. The distance is always counted between start addresses, i.e.
first byte in decoded offset or first byte in child node.
Transcoding of UTF-8 multibyte sequences:
The original DAFSA format was limited to 7-bit printable ASCII characters in
range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
By transcoding such characters the new format preserves compatibility with
old parsers, so that a DAFSA in the extended format can be used by an old
parser without false positives, although strings containing transcoded
characters will never match. Since the format is extended rather than being
changed, a parser supporting the new format will automatically support data
generated in the old format.
Transcoding is performed by insertion of a start byte with the special value
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
the range of printable ASCII.
2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
Example 1:
%%
@ -197,8 +225,25 @@ import sys
class InputError(Exception):
  """Exception raised for errors in the input file."""
# Length in bytes of the character that starts with a given byte value,
# indexed by that byte value (0-255). Printable 7-bit ASCII is 1, UTF-8
# leading bytes are 2-4, and every byte that may not start a character
# (control characters, 0x80-0xBF, 0xF8-0xFF) is 0.
char_length_table = tuple(
    [0] * 0x20 +  # 0x00-0x1F
    [1] * 0x60 +  # 0x20-0x7F
    [0] * 0x40 +  # 0x80-0xBF
    [2] * 0x20 +  # 0xC0-0xDF
    [3] * 0x10 +  # 0xE0-0xEF
    [4] * 0x08 +  # 0xF0-0xF7
    [0] * 0x08    # 0xF8-0xFF
)
def to_dafsa(words):
def to_dafsa(words, utf_mode):
"""Generates a DAFSA from a word list and returns the source node.
Each word is split into characters so that each character is represented by
@ -206,14 +251,31 @@ def to_dafsa(words):
"""
if not words:
raise InputError('The domain list must not be empty')
def to_nodes(word):
def to_nodes(word, multibyte_length):
"""Split words into characters"""
if not 0x1F < ord(word[0]) < 0x80:
raise InputError('Domain names must be printable 7-bit ASCII')
if len(word) == 1:
return chr(int(word[0], 16) & 0x0F), [None]
return word[0], [to_nodes(word[1:])]
return [to_nodes(word) for word in words]
byte = ord(word[0])
if multibyte_length:
# Consume next byte in multibyte sequence.
if byte & 0xC0 != 0x80:
raise InputError('Invalid UTF-8 multibyte sequence')
return chr(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
char_length = char_length_table[byte]
if char_length == 1:
# 7-bit printable ASCII.
if len(word) == 1:
return chr(int(word[0], 16) & 0x0F), [None]
return word[0], [to_nodes(word[1:], 0)]
elif char_length > 1:
# Leading byte in multibyte sequence.
if not utf_mode:
raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
if len(word) <= char_length:
raise InputError('Unterminated UTF-8 multibyte sequence')
return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
# Unexpected character.
raise InputError('Domain names must be printable ASCII or UTF-8')
return [to_nodes(word, 0) for word in words]
def to_words(node):
@ -396,7 +458,7 @@ def encode_label(label):
return buf
def encode(dafsa):
def encode(dafsa, utf_mode):
"""Encodes a DAFSA to a list of bytes"""
output = []
offsets = {}
@ -412,6 +474,8 @@ def encode(dafsa):
output.extend(encode_links(dafsa, offsets, len(output)))
output.reverse()
if utf_mode:
output.append(0x01)
return output
@ -430,22 +494,22 @@ def to_cxx(data):
return text
def words_to_whatever(words, converter):
def words_to_whatever(words, converter, utf_mode):
"""Generates C++ code from a word list"""
dafsa = to_dafsa(words)
dafsa = to_dafsa(words, utf_mode)
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
dafsa = fun(dafsa)
return converter(encode(dafsa))
return converter(encode(dafsa, utf_mode))
def words_to_cxx(words):
def words_to_cxx(words, utf_mode):
"""Generates C++ code from a word list"""
return words_to_whatever(words, to_cxx)
return words_to_whatever(words, to_cxx, utf_mode)
def words_to_binary(words):
def words_to_binary(words, utf_mode):
"""Generates binary DAFSA data from a word list"""
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray)
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray, utf_mode)
def parse_psl2c(infile):
@ -455,10 +519,10 @@ def parse_psl2c(infile):
for line in lines:
if line[-3:-1] != ', ':
raise InputError('Expected "domainname, <digit>", found "%s"' % line)
# Technically the DAFSA format could support return values in range [0-31],
# Technically the DAFSA format could support return values in range [0x00-0x1E],
# but the values below are the only with a defined meaning.
if line[-1] not in '0123456789ABCDEF':
raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' % line[-1])
raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % line[-1])
# with open("gperf.out", 'w') as outfile:
# for line in sorted(lines):
@ -540,6 +604,8 @@ def usage():
print(' --input-format=psl infile is a Public Suffix List file')
print(' --output-format=cxx Write DAFSA as C/C++ code')
print(' --output-format=binary Write DAFSA binary data')
print(' --encoding=ascii 7-bit ASCII mode (default)')
print(' --encoding=utf-8 UTF-8 mode')
exit(1)
@ -550,6 +616,7 @@ def main():
converter = words_to_cxx
parser = parse_psl2c
utf_mode = False
for arg in sys.argv[1:-2]:
if arg.startswith('--input-format='):
@ -567,18 +634,24 @@ def main():
converter = words_to_binary
elif value == 'cxx':
converter = words_to_cxx
elif arg.startswith('--encoding='):
value = arg[11:].lower()
if value == 'ascii':
utf_mode = False
elif value == 'utf-8':
utf_mode = True
else:
print("Unknown output format '%s'" % value)
print("Unknown encoding '%s'" % value)
return 1
else:
usage()
if sys.argv[-2] == '-':
with open(sys.argv[-1], 'w') as outfile:
outfile.write(converter(parser(sys.stdin)))
outfile.write(converter(parser(sys.stdin), utf_mode))
else:
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
outfile.write(converter(parser(infile)))
outfile.write(converter(parser(infile), utf_mode))
return 0

View File

@ -784,6 +784,7 @@ static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_en
/* prototype */
int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
int GetUtfMode(const unsigned char*, size_t);
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
{
@ -791,6 +792,15 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
const char *p;
char *punycode = NULL;
int need_conversion = 0;
size_t dafsa_size = 0;
const unsigned char *dafsa = NULL;
int utf_mode = 0;
if (psl == &_builtin_psl || psl->dafsa) {
dafsa_size = psl == &_builtin_psl ? sizeof(kDafsa) : psl->dafsa_size;
dafsa = psl == &_builtin_psl ? kDafsa : psl->dafsa;
utf_mode = GetUtfMode(dafsa, dafsa_size);
}
/* this function should be called without leading dots, just make sure */
if (*domain == '.')
@ -801,7 +811,7 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
for (p = domain; *p; p++) {
if (*p == '.')
suffix.nlabels++;
else if (*((unsigned char *)p) >= 128)
else if (!utf_mode && *((unsigned char *)p) >= 128)
need_conversion = 1; /* in case domain is non-ascii we need a toASCII conversion */
}
@ -831,9 +841,7 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
suffix.length = p - suffix.label;
}
if (psl == &_builtin_psl || psl->dafsa) {
size_t dafsa_size = psl == &_builtin_psl ? sizeof(kDafsa) : psl->dafsa_size;
const unsigned char *dafsa = psl == &_builtin_psl ? kDafsa : psl->dafsa;
if (dafsa) {
int rc = LookupStringInFixedSet(dafsa, dafsa_size, suffix.label, suffix.length);
if (rc != -1) {
/* check for correct rule type */