From e03953e27a7a8ff5b4f5cce1b5de9039e64c293d Mon Sep 17 00:00:00 2001
From: Olle Liljenzin <olle@liljenzin.se>
Date: Wed, 2 Nov 2016 20:22:01 +0100
Subject: [PATCH] Updated DAFSA generator and parser to support UTF-8 encoding

---
 AUTHORS                          |   1 +
 src/lookup_string_in_fixed_set.c | 103 ++++++++++++++++++++++++++-----
 src/psl-make-dafsa               |  81 ++++++++++++++++++++----
 3 files changed, 157 insertions(+), 28 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 33dad7b..6f3195c 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -16,3 +16,4 @@ Christopher Meng (Fedora building)
 Jakub Čajka
 Giuseppe Scrivano
 Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
+Olle Liljenzin (Original DAFSA implementation and UTF-8 patch)
diff --git a/src/lookup_string_in_fixed_set.c b/src/lookup_string_in_fixed_set.c
index ddf63ae..01edc4e 100644
--- a/src/lookup_string_in_fixed_set.c
+++ b/src/lookup_string_in_fixed_set.c
@@ -21,6 +21,48 @@
 
 #define CHECK_LT(a, b) if ((a) >= b) return 0
 
+static const char multibyte_length_table[16] = {
+	0, 0, 0, 0,	 /* 0x00-0x3F */
+	0, 0, 0, 0,	 /* 0x40-0x7F */
+	0, 0, 0, 0,	 /* 0x80-0xBF */
+	2, 2, 3, 4,	 /* 0xC0-0xFF */
+};
+
+
+/**
+ * Get length of multibyte character sequence starting at a given byte.
+ * Returns zero for ASCII and continuation bytes (anything below 0xC0).
+ */
+static int GetMultibyteLength(char c) {
+	return multibyte_length_table[((unsigned char)c) >> 4];
+}
+
+/**
+ * Moves pointers one byte forward.
+ */
+static void NextPos(const unsigned char** pos,
+	const char** key,
+	const char** multibyte_start)
+{
+	++*pos;
+	if (*multibyte_start) {
+		/* Advance key to next byte in multibyte sequence. */
+		++*key;
+		/* Reset multibyte_start if last byte in multibyte sequence was consumed. */
+		if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
+			*multibyte_start = 0;
+	} else {
+		if (GetMultibyteLength(**key)) {
+			/* Multibyte prefix was matched in the dafsa, start matching multibyte
+			 * content in next round. */
+			*multibyte_start = *key;
+		} else {
+			/* Advance key as a single byte character was matched. */
+			++*key;
+		}
+	}
+}
+
 /*
  * Read next offset from pos.
  * Returns true if an offset could be read, false otherwise.
@@ -71,6 +113,35 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
 	return(*offset & 0x80) != 0;
 }
 
+/*
+ * Check if byte at offset matches first character in key.
+ * This version assumes a range check was already performed by the caller.
+ */
+
+static int IsMatchUnchecked(const unsigned char matcher,
+	const char* key,
+	const char* multibyte_start)
+{
+	if (multibyte_start) {
+		/* Multibyte matching mode. */
+		if (multibyte_start == key) {
+			/* Match leading byte, which will also match the sequence length. */
+			return (matcher ^ 0x80) == (const unsigned char)*key;
+		} else {
+			/* Match following bytes. */
+			return (matcher ^ 0xC0) == (const unsigned char)*key;
+		}
+	}
+	/* If key points at a leading byte in a multibyte sequence, but we are not yet
+	 * in multibyte mode, then the dafsa should contain a special byte to indicate
+	 * a mode switch. */
+	if (GetMultibyteLength(*key)) {
+		return matcher == 0x1F;
+	}
+	/* Normal matching of a single byte character. */
+	return matcher == (const unsigned char)*key;
+}
+
 /*
  * Check if byte at offset matches first character in key.
  * This version matches characters not last in label.
@@ -78,10 +149,11 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
 
 static int IsMatch(const unsigned char* offset,
 	const unsigned char* end,
-	const char* key)
+	const char* key,
+	const char* multibyte_start)
 {
 	CHECK_LT(offset, end);
-	return *offset == *key;
+	return IsMatchUnchecked(*offset, key, multibyte_start);
 }
 
 /*
@@ -91,10 +163,11 @@ static int IsMatch(const unsigned char* offset,
 
 static int IsEndCharMatch(const unsigned char* offset,
 	const unsigned char* end,
-	const char* key)
+	const char* key,
+	const char* multibyte_start)
 {
 	CHECK_LT(offset, end);
-	return *offset == (*key | 0x80);
+	return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
 }
 
 /*
@@ -104,10 +177,11 @@ static int IsEndCharMatch(const unsigned char* offset,
 
 static int GetReturnValue(const unsigned char* offset,
 	const unsigned char* end,
+	const char* multibyte_start,
 	int* return_value)
 {
 	CHECK_LT(offset, end);
-	if ((*offset & 0xE0) == 0x80) {
+	if (!multibyte_start && (*offset & 0xE0) == 0x80) {
 		*return_value = *offset & 0x0F;
 		return 1;
 	}
@@ -140,6 +214,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 	const unsigned char* end = graph + length;
 	const unsigned char* offset = pos;
 	const char* key_end = key + key_length;
+	const char* multibyte_start = 0;
 
 	while (GetNextOffset(&pos, end, &offset)) {
 		/*char <char>+ end_char offsets
@@ -153,11 +228,10 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 
 		if (key != key_end && !IsEOL(offset, end)) {
 			/* Leading <char> is not a match. Don't dive into this child */
-			if (!IsMatch(offset, end, key))
+			if (!IsMatch(offset, end, key, multibyte_start))
 				continue;
 			did_consume = 1;
-			++offset;
-			++key;
+			NextPos(&offset, &key, &multibyte_start);
 			/* Possible matches at this point:
 			 * <char>+ end_char offsets
 			 * <char>+ return value
@@ -167,10 +241,9 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 
 			/* Remove all remaining <char> nodes possible */
 			while (!IsEOL(offset, end) && key != key_end) {
-				if (!IsMatch(offset, end, key))
+				if (!IsMatch(offset, end, key, multibyte_start))
 					return -1;
-				++key;
-				++offset;
+				NextPos(&offset, &key, &multibyte_start);
 			}
 		}
 		/* Possible matches at this point:
@@ -182,7 +255,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 		if (key == key_end) {
 			int return_value;
 
-			if (GetReturnValue(offset, end, &return_value))
+			if (GetReturnValue(offset, end, multibyte_start, &return_value))
 				return return_value;
 			/* The DAFSA guarantees that if the first char is a match, all
 			 * remaining char elements MUST match if the key is truly present.
@@ -191,13 +264,13 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 				return -1;
 			continue;
 		}
-		if (!IsEndCharMatch(offset, end, key)) {
+		if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
 			if (did_consume)
 				return -1; /* Unexpected */
 			continue;
 		}
-		++key;
-		pos = ++offset; /* Dive into child */
+		NextPos(&offset, &key, &multibyte_start);
+		pos = offset; /* Dive into child */
 	}
 
 	return -1; /* No match */
diff --git a/src/psl-make-dafsa b/src/psl-make-dafsa
index 99c3135..bd9a79a 100755
--- a/src/psl-make-dafsa
+++ b/src/psl-make-dafsa
@@ -14,8 +14,9 @@ This python program fetches strings and return values from a gperf file
 and generates a C++ file with a byte array representing graph that can be
 used as a memory efficient replacement for the perfect hash table.
 
-The input strings are assumed to consist of printable 7-bit ASCII characters
-and the return values are assumed to be one digit integers.
+The input strings must consist of printable 7-bit ASCII characters or UTF-8
+multibyte sequences. Control characters in the range [0x00-0x1F] are not
+allowed. The return values must be one digit integers.
 
 In this program a DAFSA is a diamond shaped graph starting at a common
 source node and ending at a common sink node. All internal nodes contain
@@ -47,8 +48,8 @@ The generated byte array can described by the following BNF:
 
 <byte> ::= < 8-bit value in range [0x00-0xFF] >
 
-<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
-<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
+<char> ::= < byte in range [0x1F-0x7F] >
+<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
 <return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
 
 <offset1> ::= < byte in range [0x00-0x3F] >
@@ -89,8 +90,8 @@ The generated byte array can described by the following BNF:
 
 Decoding:
 
-<char> -> printable 7-bit ASCII character
-<end_char> & 0x7F -> printable 7-bit ASCII character
+<char> -> character
+<end_char> & 0x7F -> character
 <return value> & 0x0F -> integer
 <offset1 & 0x3F> -> integer
 ((<offset2> & 0x1F>) << 8) + <byte> -> integer
@@ -105,6 +106,28 @@ between previous child node and next child node. Thus each offset links a node
 to a child node. The distance is always counted between start addresses, i.e.
 first byte in decoded offset or first byte in child node.
 
+Transcoding of UTF-8 multibyte sequences:
+
+The original DAFSA format was limited to 7-bit printable ASCII characters in
+range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
+By transcoding such characters the new format preserves compatibility with
+old parsers, so that a DAFSA in the extended format can be used by an old
+parser without false positives, although strings containing transcoded
+characters will never match. Since the format is extended rather than being
+changed, a parser supporting the new format will automatically support data
+generated in the old format.
+
+Transcoding is performed by insertion of a start byte with the special value
+0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
+the range of printable ASCII.
+
+2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
+
+3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
+
+4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
+                00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
+
 Example 1:
 
 %%
@@ -197,6 +220,23 @@ import sys
 class InputError(Exception):
   """Exception raised for errors in the input file."""
 
+# Length of a character starting at a given byte.
+char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x00-0x0F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x10-0x1F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x20-0x2F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x30-0x3F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x40-0x4F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x50-0x5F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x60-0x6F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x70-0x7F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x80-0x8F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x90-0x9F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xA0-0xAF
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xB0-0xBF
+                      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 0xC0-0xCF
+                      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 0xD0-0xDF
+                      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  # 0xE0-0xEF
+                      4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
 
 def to_dafsa(words):
   """Generates a DAFSA from a word list and returns the source node.
@@ -206,14 +246,29 @@ def to_dafsa(words):
   """
   if not words:
     raise InputError('The domain list must not be empty')
-  def to_nodes(word):
+  def to_nodes(word, multibyte_length):
     """Split words into characters"""
-    if not 0x1F < ord(word[0]) < 0x80:
-      raise InputError('Domain names must be printable 7-bit ASCII')
-    if len(word) == 1:
-      return chr(int(word[0], 16) & 0x0F), [None]
-    return word[0], [to_nodes(word[1:])]
-  return [to_nodes(word) for word in words]
+    byte = ord(word[0])
+    if multibyte_length:
+      # Consume next byte in multibyte sequence.
+      if byte & 0xC0 != 0x80:
+        raise InputError('Invalid UTF-8 multibyte sequence')
+      return chr(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
+    char_length = char_length_table[byte]
+    if char_length == 1:
+      # 7-bit printable ASCII.
+      if len(word) == 1:
+        return chr(int(word[0], 16) & 0x0F), [None]
+      return word[0], [to_nodes(word[1:], 0)]
+    elif char_length > 1:
+      # Leading byte in multibyte sequence.
+      if len(word) <= char_length:
+        raise InputError('Unterminated UTF-8 multibyte sequence')
+      return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
+    # Unexpected character.
+    raise InputError('Domain names must be printable ASCII or UTF-8')
+
+  return [to_nodes(word, 0) for word in words]
 
 
 def to_words(node):