Merge 673a9e0132 into e126a67354

2016-11-06 14:35:34 +00:00 · 2016-11-06 14:35:34 +00:00 · 2844125fa8
parent e126a67354 673a9e0132
commit 2844125fa8
4 changed files with 212 additions and 48 deletions
--- a/1
+++ b/1
@ -16,3 +16,4 @@ Christopher Meng (Fedora building)
 Jakub Čajka
 Giuseppe Scrivano
 Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
+Olle Liljenzin (Original DAFSA implementation and UTF-8 patch)
--- a/src/lookup_string_in_fixed_set.c
+++ b/src/lookup_string_in_fixed_set.c
@ -21,6 +21,48 @@

 #define CHECK_LT(a, b) if ((a) >= b) return 0

+static const char multibyte_length_table[16] = {
+	0, 0, 0, 0,	 /* 0x00-0x3F */
+	0, 0, 0, 0,	 /* 0x40-0x7F */
+	0, 0, 0, 0,	 /* 0x80-0xBF */
+	2, 2, 3, 4,	 /* 0xC0-0xFF */
+};
+
+
+/**
+ * Get length of the multibyte character sequence starting at byte c, looked up by the high nibble of c.
+ * Returns zero for 0x00-0xBF (not a leading byte); note that 0xF8-0xFF are not rejected and yield 4.
+ */
+static int GetMultibyteLength(char c) {
+	return multibyte_length_table[((unsigned char)c) >> 4];
+}
+
+/**
+ * Advances *pos by one byte and steps *key through the key, tracking multibyte state in *multibyte_start.
+ */
+static void NextPos(const unsigned char** pos,
+	const char** key,
+	const char** multibyte_start)
+{
+	++*pos;
+	if (*multibyte_start) {
+		/* Advance key to next byte in multibyte sequence. */
+		++*key;
+		/* Reset multibyte_start if last byte in multibyte sequence was consumed. */
+		if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
+			*multibyte_start = 0;
+	} else {
+		if (GetMultibyteLength(**key)) {
+			/* Multibyte prefix was matched in the dafsa; key stays on the leading
+			 * byte so the multibyte content is matched starting next round. */
+			*multibyte_start = *key;
+		} else {
+			/* Advance key as a single byte character was matched. */
+			++*key;
+		}
+	}
+}
+
 /*
 * Read next offset from pos.
 * Returns true if an offset could be read, false otherwise.
@ -71,6 +113,35 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
 	return(*offset & 0x80) != 0;
 }

+/*
+ * Check if the dafsa byte 'matcher' matches the key byte that key points at.
+ * No bounds are checked here; the caller must have range checked the dafsa offset already.
+ */
+
+static int IsMatchUnchecked(const unsigned char matcher,
+	const char* key,
+	const char* multibyte_start)
+{
+	if (multibyte_start) {
+		/* Multibyte matching mode. */
+		if (multibyte_start == key) {
+			/* Match leading byte (compared with bit 7 flipped), which also matches the sequence length. */
+			return (matcher ^ 0x80) == (const unsigned char)*key;
+		} else {
+			/* Match following bytes (compared with the top two bits flipped). */
+			return (matcher ^ 0xC0) == (const unsigned char)*key;
+		}
+	}
+	/* If key points at a leading byte in a multibyte sequence, but we are not yet
+	 * in multibyte mode, then the dafsa should contain the special byte 0x1F to
+	 * indicate a mode switch. */
+	if (GetMultibyteLength(*key)) {
+		return matcher == 0x1F;
+	}
+	/* Normal matching of a single byte character. */
+	return matcher == (const unsigned char)*key;
+}
+
 /*
 * Check if byte at offset matches first character in key.
 * This version matches characters not last in label.
@ -78,10 +149,11 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)

 static int IsMatch(const unsigned char* offset,
 	const unsigned char* end,
-	const char* key)
+	const char* key,
+	const char* multibyte_start)
 {
 	CHECK_LT(offset, end);
-	return *offset == *key;
+	return IsMatchUnchecked(*offset, key, multibyte_start);
 }

 /*
@ -91,10 +163,11 @@ static int IsMatch(const unsigned char* offset,

 static int IsEndCharMatch(const unsigned char* offset,
 	const unsigned char* end,
-	const char* key)
+	const char* key,
+	const char* multibyte_start)
 {
 	CHECK_LT(offset, end);
-	return *offset == (*key | 0x80);
+	return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
 }

 /*
@ -104,10 +177,11 @@ static int IsEndCharMatch(const unsigned char* offset,

 static int GetReturnValue(const unsigned char* offset,
 	const unsigned char* end,
+	const char* multibyte_start,
 	int* return_value)
 {
 	CHECK_LT(offset, end);
-	if ((*offset & 0xE0) == 0x80) {
+	if (!multibyte_start && (*offset & 0xE0) == 0x80) {
 		*return_value = *offset & 0x0F;
 		return 1;
 	}
@ -140,6 +214,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 	const unsigned char* end = graph + length;
 	const unsigned char* offset = pos;
 	const char* key_end = key + key_length;
+	const char* multibyte_start = 0;

 	while (GetNextOffset(&pos, end, &offset)) {
 		/*char <char>+ end_char offsets
@ -153,11 +228,10 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,

 		if (key != key_end && !IsEOL(offset, end)) {
 			/* Leading <char> is not a match. Don't dive into this child */
-			if (!IsMatch(offset, end, key))
+			if (!IsMatch(offset, end, key, multibyte_start))
 				continue;
 			did_consume = 1;
-			++offset;
-			++key;
+			NextPos(&offset, &key, &multibyte_start);
 			/* Possible matches at this point:
 			 * <char>+ end_char offsets
 			 * <char>+ return value
@ -167,10 +241,9 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,

 			/* Remove all remaining <char> nodes possible */
 			while (!IsEOL(offset, end) && key != key_end) {
-				if (!IsMatch(offset, end, key))
+				if (!IsMatch(offset, end, key, multibyte_start))
 					return -1;
-				++key;
-				++offset;
+				NextPos(&offset, &key, &multibyte_start);
 			}
 		}
 		/* Possible matches at this point:
@ -182,7 +255,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 		if (key == key_end) {
 			int return_value;

-			if (GetReturnValue(offset, end, &return_value))
+			if (GetReturnValue(offset, end, multibyte_start, &return_value))
 				return return_value;
 			/* The DAFSA guarantees that if the first char is a match, all
 			 * remaining char elements MUST match if the key is truly present.
@ -191,14 +264,23 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 				return -1;
 			continue;
 		}
-		if (!IsEndCharMatch(offset, end, key)) {
+		if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
 			if (did_consume)
 				return -1; /* Unexpected */
 			continue;
 		}
-		++key;
-		pos = ++offset; /* Dive into child */
+		NextPos(&offset, &key, &multibyte_start);
+		pos = offset; /* Dive into child */
 	}

 	return -1; /* No match */
 }
+
+/* prototype to skip warning with -Wmissing-prototypes */
+int _HIDDEN GetUtfMode(const unsigned char*, size_t);
+/* Returns 1 when the final byte of graph is below 0x80, otherwise 0 (also 0 for an empty graph). */
+int _HIDDEN GetUtfMode(const unsigned char* graph,
+	size_t length)
+{
+	return length > 0 && graph[length - 1] < 0x80;
+}
--- a/src/psl-make-dafsa
+++ b/src/psl-make-dafsa
@ -14,8 +14,9 @@ This python program fetches strings and return values from a gperf file
 and generates a C++ file with a byte array representing graph that can be
 used as a memory efficient replacement for the perfect hash table.

-The input strings are assumed to consist of printable 7-bit ASCII characters
-and the return values are assumed to be one digit integers.
+The input strings must consist of printable 7-bit ASCII characters or UTF-8
+multibyte sequences. Control characters in the range [0x00-0x1F] are not
+allowed. The return values must be one digit integers.

 In this program a DAFSA is a diamond shaped graph starting at a common
 source node and ending at a common sink node. All internal nodes contain
@ -47,8 +48,8 @@ The generated byte array can described by the following BNF:

 <byte> ::= < 8-bit value in range [0x00-0xFF] >

-<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
-<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
+<char> ::= < byte in range [0x1F-0x7F] >
+<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
 <return value> ::= < value + 0x80, byte in range [0x80-0x8F] >

 <offset1> ::= < byte in range [0x00-0x3F] >
@ -84,13 +85,18 @@ The generated byte array can described by the following BNF:
         | <prefix> <node>
         | <end_label>

-<dafsa> ::= <source>
-          | <dafsa> <node>
+<graph> ::= <source>
+          | <graph> <node>
+
+<version> ::= <empty>            # The DAFSA was generated in ASCII mode.
+          | < byte value 0x01 >  # The DAFSA was generated in UTF-8 mode.
+
+<dafsa> ::= <graph> <version>

 Decoding:

-<char> -> printable 7-bit ASCII character
-<end_char> & 0x7F -> printable 7-bit ASCII character
+<char> -> character
+<end_char> & 0x7F -> character
 <return value> & 0x0F -> integer
 <offset1 & 0x3F> -> integer
 ((<offset2> & 0x1F>) << 8) + <byte> -> integer
@ -105,6 +111,28 @@ between previous child node and next child node. Thus each offset links a node
 to a child node. The distance is always counted between start addresses, i.e.
 first byte in decoded offset or first byte in child node.

+Transcoding of UTF-8 multibyte sequences:
+
+The original DAFSA format was limited to 7-bit printable ASCII characters in
+range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
+By transcoding of such characters the new format preserves compatibility with
+old parsers, so that a DAFSA in the extended format can be used by an old
+parser without false positives, although strings containing transcoded
+characters will never match. Since the format is extended rather than being
+changed, a parser supporting the new format will automatically support data
+generated in the old format.
+
+Transcoding is performed by insertion of a start byte with the special value
+0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
+the range of printable ASCII.
+
+2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
+
+3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
+
+4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
+                00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
+
 Example 1:

 %%
@ -197,8 +225,25 @@ import sys
 class InputError(Exception):
  """Exception raised for errors in the input file."""

+# Length in bytes of the character that starts with a given byte value; 0 marks bytes that cannot start one.
+char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x00-0x0F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x10-0x1F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x20-0x2F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x30-0x3F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x40-0x4F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x50-0x5F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x60-0x6F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x70-0x7F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x80-0x8F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x90-0x9F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xA0-0xAF
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xB0-0xBF
+                      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 0xC0-0xCF
+                      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 0xD0-0xDF
+                      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  # 0xE0-0xEF
+                      4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF

-def to_dafsa(words):
+def to_dafsa(words, utf_mode):
  """Generates a DAFSA from a word list and returns the source node.

  Each word is split into characters so that each character is represented by
@ -206,14 +251,31 @@ def to_dafsa(words):
  """
  if not words:
    raise InputError('The domain list must not be empty')
-  def to_nodes(word):
+  def to_nodes(word, multibyte_length):
    """Split words into characters"""
-    if not 0x1F < ord(word[0]) < 0x80:
-      raise InputError('Domain names must be printable 7-bit ASCII')
-    if len(word) == 1:
-      return chr(int(word[0], 16) & 0x0F), [None]
-    return word[0], [to_nodes(word[1:])]
-  return [to_nodes(word) for word in words]
+    byte = ord(word[0])
+    if multibyte_length:
+      # Consume next byte in multibyte sequence.
+      if byte & 0xC0 != 0x80:
+        raise InputError('Invalid UTF-8 multibyte sequence')
+      return chr(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
+    char_length = char_length_table[byte]
+    if char_length == 1:
+      # 7-bit printable ASCII.
+      if len(word) == 1:
+        return chr(int(word[0], 16) & 0x0F), [None]
+      return word[0], [to_nodes(word[1:], 0)]
+    elif char_length > 1:
+      # Leading byte in multibyte sequence.
+      if not utf_mode:
+        raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
+      if len(word) <= char_length:
+        raise InputError('Unterminated UTF-8 multibyte sequence')
+      return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
+    # Unexpected character.
+    raise InputError('Domain names must be printable ASCII or UTF-8')
+
+  return [to_nodes(word, 0) for word in words]


 def to_words(node):
@ -396,7 +458,7 @@ def encode_label(label):
  return buf


-def encode(dafsa):
+def encode(dafsa, utf_mode):
  """Encodes a DAFSA to a list of bytes"""
  output = []
  offsets = {}
@ -412,6 +474,8 @@ def encode(dafsa):

  output.extend(encode_links(dafsa, offsets, len(output)))
  output.reverse()
+  if utf_mode:
+    output.append(0x01)
  return output


@ -430,22 +494,22 @@ def to_cxx(data):
  return text


-def words_to_whatever(words, converter):
+def words_to_whatever(words, converter, utf_mode):
  """Generates C++ code from a word list"""
-  dafsa = to_dafsa(words)
+  dafsa = to_dafsa(words, utf_mode)
  for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
    dafsa = fun(dafsa)
-  return converter(encode(dafsa))
+  return converter(encode(dafsa, utf_mode))


-def words_to_cxx(words):
+def words_to_cxx(words, utf_mode):
  """Generates C++ code from a word list"""
-  return words_to_whatever(words, to_cxx)
+  return words_to_whatever(words, to_cxx, utf_mode)


-def words_to_binary(words):
+def words_to_binary(words, utf_mode):
  """Generates C++ code from a word list"""
-  return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, bytearray)
+  return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, bytearray, utf_mode)


 def parse_psl2c(infile):
@ -455,10 +519,10 @@ def parse_psl2c(infile):
  for line in lines:
    if line[-3:-1] != ', ':
      raise InputError('Expected "domainname, <digit>", found "%s"' % line)
-    # Technically the DAFSA format could support return values in range [0-31],
+    # Technically the DAFSA format could support return values in range [0x00-0x1E],
    # but the values below are the only with a defined meaning.
    if line[-1] not in '0123456789ABCDEF':
-      raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' % line[-1])
+      raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % line[-1])

 #  with open("gperf.out", 'w') as outfile:
 #    for line in sorted(lines):
@ -540,6 +604,8 @@ def usage():
  print('  --input-format=psl      infile is a Public Suffix List file')
  print('  --output-format=cxx     Write DAFSA as C/C++ code')
  print('  --output-format=binary  Write DAFSA binary data')
+  print('  --encoding=ascii        7-bit ASCII mode (default)')
+  print('  --encoding=utf-8        UTF-8 mode')
  exit(1)


@ -550,6 +616,7 @@ def main():

  converter = words_to_cxx
  parser = parse_psl2c
+  utf_mode = False

  for arg in sys.argv[1:-2]:
    if arg.startswith('--input-format='):
@ -567,18 +634,24 @@ def main():
        converter = words_to_binary
      elif value == 'cxx':
        converter = words_to_cxx
+    elif arg.startswith('--encoding='):
+      value = arg[11:].lower()
+      if value == 'ascii':
+        utf_mode = False
+      elif value == 'utf-8':
+        utf_mode = True
      else:
-        print("Unknown output format '%s'" % value)
+        print("Unknown encoding '%s'" % value)
        return 1
    else:
      usage()

  if sys.argv[-2] == '-':
    with open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(sys.stdin)))
+      outfile.write(converter(parser(sys.stdin), utf_mode))
  else:
    with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(infile)))
+      outfile.write(converter(parser(infile), utf_mode))

  return 0

--- a/src/psl.c
+++ b/src/psl.c
@ -784,6 +784,7 @@ static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_en

 /* prototype */
 int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
+int GetUtfMode(const unsigned char*, size_t);

 static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
 {
@ -791,6 +792,15 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
 	const char *p;
 	char *punycode = NULL;
 	int need_conversion = 0;
+	size_t dafsa_size = 0;
+	const unsigned char *dafsa = NULL;
+	int utf_mode = 0;
+
+	if (psl == &_builtin_psl || psl->dafsa) {
+		dafsa_size = psl == &_builtin_psl ? sizeof(kDafsa) : psl->dafsa_size;
+		dafsa = psl == &_builtin_psl ? kDafsa : psl->dafsa;
+		utf_mode = GetUtfMode(dafsa, dafsa_size);
+	}

 	/* this function should be called without leading dots, just make sure */
 	if (*domain == '.')
@ -801,7 +811,7 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
 	for (p = domain; *p; p++) {
 		if (*p == '.')
 			suffix.nlabels++;
-		else if (*((unsigned char *)p) >= 128)
+		else if (!utf_mode && *((unsigned char *)p) >= 128)
 			need_conversion = 1; /* in case domain is non-ascii we need a toASCII conversion */
 	}

@ -831,9 +841,7 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
 		suffix.length = p - suffix.label;
 	}

-	if (psl == &_builtin_psl || psl->dafsa) {
-		size_t dafsa_size = psl == &_builtin_psl ? sizeof(kDafsa) : psl->dafsa_size;
-		const unsigned char *dafsa = psl == &_builtin_psl ? kDafsa : psl->dafsa;
+	if (dafsa) {
 		int rc = LookupStringInFixedSet(dafsa, dafsa_size, suffix.label, suffix.length);
 		if (rc != -1) {
 			/* check for correct rule type */