From 003dec4203f9711dc74f147a90eddcd94ad5025b Mon Sep 17 00:00:00 2001
From: Jeremy Ehrhardt <jeremy@bat-country.us>
Date: Fri, 16 Sep 2016 18:42:54 -0700
Subject: [PATCH 01/42] Change src/psl-make-dafsa shebang so it'll run on OS X

---
 src/psl-make-dafsa | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/psl-make-dafsa b/src/psl-make-dafsa
index 50c9dac..99c3135 100755
--- a/src/psl-make-dafsa
+++ b/src/psl-make-dafsa
@@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python
 # Copyright 2014 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE.chromium file.

From 126d2dca9c15dc813c93ae30eb26a4ad6762ad05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Tue, 16 Aug 2016 14:06:33 +0200
Subject: [PATCH 02/42] Package and install psl.1 and psl-make-dafsa.1

Fixes #53
Reported-by: https://github.com/yselkowitz
---
 src/Makefile.am   | 2 ++
 tools/Makefile.am | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/Makefile.am b/src/Makefile.am
index 71a1d01..cd319ac 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -39,3 +39,5 @@ suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
 	./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
 
 EXTRA_DIST = psl-make-dafsa LICENSE.chromium
+
+dist_man_MANS = psl-make-dafsa.1
diff --git a/tools/Makefile.am b/tools/Makefile.am
index f758ccd..3e2c9e3 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -12,3 +12,5 @@ LDADD = ../src/libpsl.la
 #if WITH_LIBIDN
 #  LDADD += -lidn
 #endif
+
+dist_man_MANS = psl.1

From 7983f868201bd9af63eafffece80e19d1f23ff27 Mon Sep 17 00:00:00 2001
From: Dagobert Michelsen <dam@opencsw.org>
Date: Sun, 21 Aug 2016 14:14:18 +0200
Subject: [PATCH 03/42] Use proper library path and libs for ICU

---
 configure.ac    | 2 +-
 src/Makefile.am | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index 8fecc46..f44794b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -168,7 +168,7 @@ if test "$enable_runtime" = "libicu" -o "$enable_builtin" = "libicu"; then
       [AC_LANG_PROGRAM(
         [[#include <unicode/ustring.h>]],
         [[u_strToUTF8(NULL, 0, NULL, NULL, 0, NULL);]])],
-      [HAVE_LIBICU=yes; AC_MSG_RESULT([yes])],
+      [HAVE_LIBICU=yes; LIBICU_LIBS="-licuuc"; AC_MSG_RESULT([yes])],
       [AC_MSG_RESULT([no]); AC_MSG_ERROR(You requested libicu but it is not installed.)])
     LIBS=$OLDLIBS
   ])
diff --git a/src/Makefile.am b/src/Makefile.am
index cd319ac..a3426c3 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -11,7 +11,7 @@ libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
 # include ABI version information
 libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
 if WITH_LIBICU
-  libpsl_la_LDFLAGS += -licuuc
+  libpsl_la_LDFLAGS += $(LIBICU_LIBS)
 endif
 if WITH_LIBIDN2
   libpsl_la_LDFLAGS += -lidn2 -lunistring
@@ -24,7 +24,7 @@ noinst_PROGRAMS = psl2c
 psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
 psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/psl-make-dafsa\"
 if BUILTIN_GENERATOR_LIBICU
-  psl2c_LDADD = -licuuc
+  psl2c_LDADD = $(LIBICU_LIBS)
 endif
 if BUILTIN_GENERATOR_LIBIDN2
   psl2c_LDADD = @LTLIBICONV@ -lidn2 -lunistring

From c9de2de301cb3a6d91b4326c206b853fbbca4aed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Wed, 31 Aug 2016 15:16:04 +0200
Subject: [PATCH 04/42] Fix error msg when libunistring is missing

---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index f44794b..2653141 100644
--- a/configure.ac
+++ b/configure.ac
@@ -191,7 +191,7 @@ fi
 if test "x$HAVE_LIBIDN2" = "xyes" -o "x$HAVE_LIBIDN" = "xyes"; then
   # Check for libunistring, we need it for psl_str_to_utf8lower()
   OLDLIBS=$LIBS
-  AC_SEARCH_LIBS(u8_tolower, unistring, HAVE_UNISTRING=yes, AC_MSG_ERROR(You requested libidn2 but libunistring is not installed.))
+  AC_SEARCH_LIBS(u8_tolower, unistring, HAVE_UNISTRING=yes, AC_MSG_ERROR(You requested libidn2|libidn but libunistring is not installed.))
   LIBS=$OLDLIBS
 fi
 

From 01d3f53321e67b89895230b8f8216046583cfa38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Wed, 21 Sep 2016 10:00:22 +0200
Subject: [PATCH 05/42] Add src/suffixes_dafsa.c to .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 3988824..6ee2c8d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,6 +67,7 @@ po/remove-potcdate.sed
 po/stamp-po
 src/psl2c
 src/suffixes.c
+src/suffixes_dafsa.c
 stamp-h1
 test-driver
 tests/*.log

From 1ab7be564138954dc0efeb310f2131c27cb7dc98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Wed, 21 Sep 2016 11:13:02 +0200
Subject: [PATCH 06/42] Check malloc/realloc results in src/psl.c

Fixes #57
Reported-by: https://github.com/daurnimator
---
 include/libpsl.h.in | 10 ++++++----
 src/psl.c           | 38 ++++++++++++++++++++++++++++----------
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/include/libpsl.h.in b/include/libpsl.h.in
index f8cbedc..ad38b95 100644
--- a/include/libpsl.h.in
+++ b/include/libpsl.h.in
@@ -53,10 +53,11 @@ extern "C" {
  * psl_error_t:
  * @PSL_SUCCESS: Successful return.
  * @PSL_ERR_INVALID_ARG: Invalid argument.
- * @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter
+ * @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter.
  * @PSL_ERR_TO_UTF16: Failed to convert to utf-16.
  * @PSL_ERR_TO_LOWER: Failed to convert utf-16 to lowercase.
  * @PSL_ERR_TO_UTF8: Failed to convert utf-16 to utf-8.
+ * @PSL_ERR_NO_MEM: Failed to allocate memory.
  *
  * Return codes for PSL functions.
  * Negative return codes mean failure.
@@ -66,9 +67,10 @@ typedef enum {
 	PSL_SUCCESS = 0,
 	PSL_ERR_INVALID_ARG = -1,
 	PSL_ERR_CONVERTER = -2, /* failed to open libicu utf-16 converter */
-	PSL_ERR_TO_UTF16 = -3, /* failed to convert to utf-16 */
-	PSL_ERR_TO_LOWER = -4, /* failed to convert utf-16 to lowercase */
-	PSL_ERR_TO_UTF8 = -5 /* failed to convert utf-16 to utf-8 */
+	PSL_ERR_TO_UTF16 = -3,  /* failed to convert to utf-16 */
+	PSL_ERR_TO_LOWER = -4,  /* failed to convert utf-16 to lowercase */
+	PSL_ERR_TO_UTF8 = -5,   /* failed to convert utf-16 to utf-8 */
+	PSL_ERR_NO_MEM = -6    /* failed to allocate memory */
 } psl_error_t;
 
 typedef struct _psl_ctx_st psl_ctx_t;
diff --git a/src/psl.c b/src/psl.c
index c3a4ffe..9b952e6 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -263,11 +263,21 @@ static int _vector_add(_psl_vector_t *v, const _psl_entry_t *elem)
 	if (v) {
 		void *elemp;
 
-		elemp = malloc(sizeof(_psl_entry_t));
+		if (!(elemp = malloc(sizeof(_psl_entry_t))))
+			return -1;
+
 		memcpy(elemp, elem, sizeof(_psl_entry_t));
 
-		if (v->max == v->cur)
-			v->entry = realloc(v->entry, (v->max *= 2) * sizeof(_psl_entry_t *));
+		if (v->max == v->cur) {
+			/* commit the doubled capacity only after realloc() succeeded */
+			void *m = realloc(v->entry, (v->max * 2) * sizeof(_psl_entry_t *));
+			if (!m) {
+				free(elemp);
+				return -1;
+			}
+			v->entry = m;
+			v->max *= 2;
+		}
 
 		v->entry[v->cur++] = elemp;
 		return v->cur - 1;
@@ -776,8 +786,8 @@ static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_en
 			/* fprintf(stderr, "toASCII '%s' -> '%s'\n", e->label_buf, lookupname); */
 			_suffix_init(&suffix, lookupname, strlen(lookupname));
 			suffix.flags = e->flags;
-			suffixp = _vector_get(v, _vector_add(v, &suffix));
-			suffixp->label = suffixp->label_buf; /* set label to changed address */
+			if ((suffixp = _vector_get(v, _vector_add(v, &suffix))))
+				suffixp->label = suffixp->label_buf; /* set label to changed address */
 		} /* else ignore */
 
 		free(lookupname);
@@ -1231,9 +1241,10 @@ psl_ctx_t *psl_load_fp(FILE *fp)
 				suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix));
 			}
 
-			suffixp->label = suffixp->label_buf; /* set label to changed address */
-
-			_add_punycode_if_needed(idna, psl->suffixes, suffixp);
+			if (suffixp) {
+				suffixp->label = suffixp->label_buf; /* set label to changed address */
+				_add_punycode_if_needed(idna, psl->suffixes, suffixp);
+			}
 		}
 	} while ((linep = fgets(buf, sizeof(buf), fp)));
 
@@ -1567,6 +1578,7 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
  *   PSL_ERR_TO_UTF16: Failed to convert @str to unicode
  *   PSL_ERR_TO_LOWER: Failed to convert unicode to lowercase
  *   PSL_ERR_TO_UTF8: Failed to convert unicode to UTF-8
+ *   PSL_ERR_NO_MEM: Failed to allocate memory
  *
  * Since: 0.4
  */
@@ -1659,11 +1671,17 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
 				size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
 				char *dst = malloc(dst_len + 1), *dst_tmp = dst;
 
-				if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1) {
+				if (!dst) {
+					ret = PSL_ERR_NO_MEM;
+				}
+				else if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1) {
 					uint8_t *resbuf = malloc(dst_len * 2 + 1);
 					size_t len = dst_len * 2; /* leave space for additional \0 byte */
 
-					if ((dst = (char *)u8_tolower((uint8_t *)dst, dst_len - dst_len_tmp, 0, UNINORM_NFKC, resbuf, &len))) {
+					if (!resbuf) {
+						ret = PSL_ERR_NO_MEM;
+					}
+					else if ((dst = (char *)u8_tolower((uint8_t *)dst, dst_len - dst_len_tmp, 0, UNINORM_NFKC, resbuf, &len))) {
 						/* u8_tolower() does not terminate the result string */
 						if (lower)
 							*lower = strndup((char *)dst, len);

From 10f7b5fe7c3edfd241908b41a4c6dcaf3b0033ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Wed, 21 Sep 2016 11:54:39 +0200
Subject: [PATCH 07/42] Fallback to malloc from alloca for larger memory chunks

Fixes #58
Reported-by: https://github.com/daurnimator
---
 src/psl.c | 54 +++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 9 deletions(-)

diff --git a/src/psl.c b/src/psl.c
index 9b952e6..d77c9c3 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -524,17 +524,26 @@ static enum punycode_status punycode_encode(
 	return punycode_success;
 }
 
-static ssize_t _utf8_to_utf32(const char *in, size_t inlen, punycode_uint *out, size_t outlen)
+static ssize_t _utf8_to_utf32(const char *in, const size_t inlen, punycode_uint *out, size_t outlen)
 {
 	size_t n = 0;
 	unsigned char *s;
+	void *m;
 
 	if (!outlen)
 		return -1;
 
 	outlen--;
 
-	s = alloca(inlen + 1);
+	if (inlen < 1024) {
+		s = alloca(inlen + 1);
+		m = NULL;
+	} else
+		s = m = malloc(inlen + 1);
+
+	if (!s)
+		return -1;
+
 	memcpy(s, in, inlen);
 	s[inlen] = 0;
 
@@ -557,10 +566,14 @@ static ssize_t _utf8_to_utf32(const char *in, size_t inlen, punycode_uint *out,
 				return -1;
 			out[n++] = ((*s & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
 			s += 4;
-		} else
+		} else {
+			free(m);
 			return -1;
+		}
 	}
 
+	free(m);
+
 	return n;
 }
 
@@ -1616,10 +1629,21 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
 	char *utf8_lower;
 	UConverter *uconv;
 
-	/* C89 allocation */
-	utf16_dst   = alloca(sizeof(UChar) * (str_length * 2 + 1));
-	utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
-	utf8_lower  = alloca(str_length * 2 + 1);
+	if (str_length < 256) {
+		/* C89 allocation */
+		utf16_dst   = alloca(sizeof(UChar) * (str_length * 2 + 1));
+		utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
+		utf8_lower  = alloca(str_length * 2 + 1);
+	} else {
+		utf16_dst   = malloc(sizeof(UChar) * (str_length * 2 + 1));
+		utf16_lower = malloc(sizeof(UChar) * (str_length * 2 + 1));
+		utf8_lower  = malloc(str_length * 2 + 1);
+
+		if (!utf16_dst || !utf16_lower || !utf8_lower) {
+			ret = PSL_ERR_NO_MEM;
+			goto out;
+		}
+	}
 
 	uconv = ucnv_open(encoding, &status);
 	if (U_SUCCESS(status)) {
@@ -1631,8 +1655,14 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
 			if (U_SUCCESS(status)) {
 				u_strToUTF8(utf8_lower, str_length * 8 + 1, NULL, utf16_lower, utf16_lower_length, &status);
 				if (U_SUCCESS(status)) {
-					if (lower)
-						*lower = strdup(utf8_lower);
+					if (lower) {
+						if (str_length < 256)
+							*lower = strdup(utf8_lower);
+						else {
+							*lower = utf8_lower;
+							utf8_lower = NULL;
+						}
+					}
 					ret = PSL_SUCCESS;
 				} else {
 					ret = PSL_ERR_TO_UTF8;
@@ -1650,6 +1680,12 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
 		ret = PSL_ERR_CONVERTER;
 		/* fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status); */
 	}
+out:
+	if (str_length >= 256) {
+		free(utf16_dst);
+		free(utf16_lower);
+		free(utf8_lower);
+	}
 	} while (0);
 #elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
 	do {

From 6cfb33e5308d95f787e574f13c2b29d6fe27d42a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Wed, 21 Sep 2016 12:03:00 +0200
Subject: [PATCH 08/42] Amend API docs to be more precise about invalid input.

Fixes #59
Reported-by: https://github.com/daurnimator
---
 src/psl.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/psl.c b/src/psl.c
index d77c9c3..307a975 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -958,7 +958,7 @@ suffix_yes:
  * For cookie domain checking see psl_is_cookie_domain_acceptable().
  *
  * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
- * Other encodings result in unexpected behavior.
+ * Other encodings likely result in incorrect return values.
  *
  * @psl is a context returned by either psl_load_file(), psl_load_fp() or
  * psl_builtin().
@@ -988,7 +988,7 @@ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
  * %PSL_TYPE_PRIVATE, %PSL_TYPE_ICANN and %PSL_TYPE_ANY.
  *
  * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
- * Other encodings result in unexpected behavior.
+ * Other encodings likely result in incorrect return values.
  *
  * @psl is a context returned by either psl_load_file(), psl_load_fp() or
  * psl_builtin().
@@ -1014,7 +1014,7 @@ int psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type)
  * of the [Mozilla Public Suffix List](https://publicsuffix.org).
  *
  * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
- * Other encodings result in unexpected behavior.
+ * Other encodings likely result in incorrect return values.
  *
  * @psl is a context returned by either psl_load_file(), psl_load_fp() or
  * psl_builtin().
@@ -1053,7 +1053,7 @@ const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain)
  * of the [Mozilla Public Suffix List](https://publicsuffix.org).
  *
  * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
- * Other encodings result in unexpected behavior.
+ * Other encodings likely result in incorrect return values.
  *
  * @psl is a context returned by either psl_load_file(), psl_load_fp() or
  * psl_builtin().
@@ -1520,7 +1520,7 @@ static int _isip(const char *hostname)
  * @hostname.
  *
  * For international domain names both, @hostname and @cookie_domain, have to be either in lowercase UTF-8
- * or in ASCII form (punycode). Other encodings or mixing UTF-8 and punycode result in unexpected behavior.
+ * or in ASCII form (punycode). Other encodings or mixing UTF-8 and punycode likely result in incorrect return values.
  *
  * Examples:
  * 1. Cookie domain 'example.com' would be acceptable for hostname 'www.example.com',

From 0264454ea2309209ea5b9440ccce07497fa6bb1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Wed, 21 Sep 2016 12:34:44 +0200
Subject: [PATCH 09/42] Add Coverity badge

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 884084b..ba14b71 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
-[![Build Status](https://travis-ci.org/rockdaboot/libpsl.png?branch=master)](https://travis-ci.org/rockdaboot/libpsl)
+[![Travis-CI Status](https://travis-ci.org/rockdaboot/libpsl.png?branch=master)](https://travis-ci.org/rockdaboot/libpsl)
+[![Coverity Scan](https://img.shields.io/coverity/scan/10227.svg)](https://scan.coverity.com/projects/rockdaboot-libpsl)
 
 libpsl - C library to handle the Public Suffix List
 ===================================================

From b2fcafcfddb80d2d06371a2dc246f89c7552beb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Thu, 22 Sep 2016 10:57:05 +0200
Subject: [PATCH 10/42] Add 'make clean' to contrib/check-hard

---
 contrib/check-hard | 1 +
 1 file changed, 1 insertion(+)

diff --git a/contrib/check-hard b/contrib/check-hard
index a3a3c3a..11c15ea 100755
--- a/contrib/check-hard
+++ b/contrib/check-hard
@@ -51,6 +51,7 @@ for CC in gcc clang; do
       for xLCALL in C tr_TR.utf8; do
         export TESTS_ENVIRONMENT="LC_ALL=$xLCALL VALGRIND_TESTS=$xVALGRIND"
         echo "    *** TESTS_ENVIRONMENT=\"$TESTS_ENVIRONMENT\"" make check -j$CORES
+        make clean > /dev/null
         make check -j$CORES > /dev/null
       done
     done

From 9e1ca81be4b1d658ac5a7a5f312d5ada3a9ba6b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Thu, 22 Sep 2016 11:03:45 +0200
Subject: [PATCH 11/42] Remove memory allocations from _utf8_to_utf32()

Reported-by: https://github.com/daurnimator
---
 src/psl.c | 36 ++++++++++++------------------------
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/src/psl.c b/src/psl.c
index 307a975..5a84227 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -524,56 +524,44 @@ static enum punycode_status punycode_encode(
 	return punycode_success;
 }
 
-static ssize_t _utf8_to_utf32(const char *in, const size_t inlen, punycode_uint *out, size_t outlen)
+static ssize_t _utf8_to_utf32(const char *in, size_t inlen, punycode_uint *out, size_t outlen)
 {
 	size_t n = 0;
-	unsigned char *s;
-	void *m;
+	const unsigned char *s = (void *)in;
+	const unsigned char *e = (void *)(in + inlen);
 
 	if (!outlen)
 		return -1;
 
 	outlen--;
 
-	if (inlen < 1024) {
-		s = alloca(inlen + 1);
-		m = NULL;
-	} else
-		s = m = malloc(inlen + 1);
+	while (n < outlen) {
+		size_t inleft = e - s;
 
-	if (!s)
-		return -1;
-
-	memcpy(s, in, inlen);
-	s[inlen] = 0;
-
-	while (*s && n < outlen) {
-		if ((*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
+		if (inleft >= 1 && (*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
 			out[n++] = *s;
 			s++;
-		} else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
+		} else if (inleft >= 2 && (*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
 			if ((s[1] & 0xC0) != 0x80)
 				return -1;
 			out[n++] = ((*s & 0x1F) << 6) | (s[1] & 0x3F);
 			s += 2;
-		} else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
+		} else if (inleft >= 3 && (*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
 			if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
 				return -1;
 			out[n++] = ((*s & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
 			s += 3;
-		} else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
+		} else if (inleft >= 4 && (*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
 			if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
 				return -1;
 			out[n++] = ((*s & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
 			s += 4;
-		} else {
-			free(m);
+		} else if (!inleft) {
+			break;
+		} else
 			return -1;
-		}
 	}
 
-	free(m);
-
 	return n;
 }
 

From 351b3fb912b63692eb086ac21fd5402e54d70fa2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Thu, 22 Sep 2016 11:33:31 +0200
Subject: [PATCH 12/42] Remove redundant define of countof()

---
 src/psl.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/psl.c b/src/psl.c
index 5a84227..efa631e 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -101,9 +101,6 @@
 
 #include <libpsl.h>
 
-/* number of elements within an array */
-#define countof(a) (sizeof(a)/sizeof(*(a)))
-
 #ifndef HAVE_STRNDUP
 /* I found no strndup on my old SUSE 7.3 test system (gcc 2.95) */
 
@@ -586,7 +583,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
 		/* printf("s=%s inlen=%zd\n", label, labellen); */
 
 		if (_mem_is_ascii(label, labellen)) {
-			if (outlen + labellen + (e != NULL)>= outsize)
+			if (outlen + labellen + (e != NULL) >= outsize)
 				return 1;
 
 			/* printf("outlen=%zd labellen=%zd\n", outlen, labellen); */
@@ -598,7 +595,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
 			if (outlen + labellen + (e != NULL) + 4 >= outsize)
 				return 1;
 
-			if ((inputlen = _utf8_to_utf32(label, labellen, input, sizeof (input) / sizeof (input[0]))) < 0)
+			if ((inputlen = _utf8_to_utf32(label, labellen, input, countof(input))) < 0)
 				return 1;
 
 			memcpy(out + outlen, "xn--", 4);
@@ -702,9 +699,9 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
 		UChar utf16_dst[128], utf16_src[128];
 		int32_t utf16_src_length;
 
-		u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, utf8, -1, &status);
+		u_strFromUTF8(utf16_src, countof(utf16_src), &utf16_src_length, utf8, -1, &status);
 		if (U_SUCCESS(status)) {
-			int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
+			int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, countof(utf16_dst), &info, &status);
 			if (U_SUCCESS(status)) {
 				u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
 				if (U_SUCCESS(status)) {

From e2812e8c4cff7daba154f851945fd1e217a1ffb6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Thu, 22 Sep 2016 15:53:31 +0200
Subject: [PATCH 13/42] Check return value for strdup and strndup

Fixes #60
Reported-by: https://github.com/daurnimator
---
 src/psl.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/psl.c b/src/psl.c
index efa631e..d637394 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -706,8 +706,8 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
 				u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
 				if (U_SUCCESS(status)) {
 					if (ascii)
-						*ascii = strdup(lookupname);
-					ret = 0;
+						if ((*ascii = strdup(lookupname)))
+							ret = 0;
 				} /* else
 					fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
 			} /* else
@@ -734,6 +734,8 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
 		uint8_t *tmp = lower;
 		lower = (uint8_t *)strndup((char *)lower, len);
 		free(tmp);
+		if (!lower)
+			return -1;
 	}
 
 	if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK) {
@@ -762,8 +764,8 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
 
 	if (_domain_to_punycode(utf8, lookupname, sizeof(lookupname)) == 0) {
 		if (ascii)
-			*ascii = strdup(lookupname);
-		ret = 0;
+			if ((*ascii = strdup(lookupname)))
+				ret = 0;
 	}
 #endif
 
@@ -1595,7 +1597,8 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
 		if (lower) {
 			char *p;
 
-			*lower = strdup(str);
+			if (!(*lower = strdup(str)))
+				return PSL_ERR_NO_MEM;
 
 			/* convert ASCII string to lowercase */
 			for (p = *lower; *p; p++)
@@ -1640,15 +1643,16 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
 			if (U_SUCCESS(status)) {
 				u_strToUTF8(utf8_lower, str_length * 8 + 1, NULL, utf16_lower, utf16_lower_length, &status);
 				if (U_SUCCESS(status)) {
+					ret = PSL_SUCCESS;
 					if (lower) {
-						if (str_length < 256)
-							*lower = strdup(utf8_lower);
-						else {
+						if (str_length < 256) {
+							if (!(*lower = strdup(utf8_lower)))
+								ret = PSL_ERR_NO_MEM;
+						} else {
 							*lower = utf8_lower;
 							utf8_lower = NULL;
 						}
 					}
-					ret = PSL_SUCCESS;
 				} else {
 					ret = PSL_ERR_TO_UTF8;
 					/* fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
@@ -1674,6 +1678,7 @@ out:
 	} while (0);
 #elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
 	do {
+		printf("### encoding=%s lower=%p\n", encoding, lower ? *lower : NULL);
 		/* find out local charset encoding */
 		if (!encoding) {
 			encoding = nl_langinfo(CODESET);
@@ -1704,16 +1709,14 @@ out:
 					}
 					else if ((dst = (char *)u8_tolower((uint8_t *)dst, dst_len - dst_len_tmp, 0, UNINORM_NFKC, resbuf, &len))) {
 						/* u8_tolower() does not terminate the result string */
+						ret = PSL_SUCCESS;
 						if (lower)
-							*lower = strndup((char *)dst, len);
+							if (!(*lower = strndup((char *)dst, len)))
+								ret = PSL_ERR_NO_MEM;
 					} else {
 						ret = PSL_ERR_TO_LOWER;
 						/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
 					}
-
-					if (lower)
-						*lower = strndup(dst, dst_len - dst_len_tmp);
-					ret = PSL_SUCCESS;
 				} else {
 					ret = PSL_ERR_TO_UTF8;
 					/* fprintf(stderr, "Failed to convert '%s' string into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
@@ -1737,7 +1740,8 @@ out:
 			if ((dst = u8_tolower((uint8_t *)str, u8_strlen((uint8_t *)str), 0, UNINORM_NFKC, resbuf, &len))) {
 				/* u8_tolower() does not terminate the result string */
 				if (lower)
-					*lower = strndup((char *)dst, len);
+					if (!(*lower = strndup((char *)dst, len)))
+						ret = PSL_ERR_NO_MEM;
 			} else {
 				ret = PSL_ERR_TO_LOWER;
 				/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */

From 1baaacccd5e1349f12c32f57e4a5e7098525415c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Fri, 23 Sep 2016 11:12:52 +0200
Subject: [PATCH 14/42] Fix libidn/libidn2 code path of psl_str_to_utf8lower()

* fixing memory leaks
* proper handling of unterminated results of u8_tolower()
* second call to iconv() ensures flush of internal memory
* check more code paths of psl_str_to_utf8lower() via
  tests/test-registrable-domain.c
---
 src/psl.c                       | 50 ++++++++++++++++++++------------
 tests/test-registrable-domain.c | 51 +++++++++++++++++++++++++++++++--
 2 files changed, 79 insertions(+), 22 deletions(-)

diff --git a/src/psl.c b/src/psl.c
index d637394..68bb014 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -73,6 +73,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#include <time.h>
 #include <errno.h>
 #include <limits.h> /* for UINT_MAX */
 #include <langinfo.h>
@@ -1678,7 +1679,6 @@ out:
 	} while (0);
 #elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
 	do {
-		printf("### encoding=%s lower=%p\n", encoding, lower ? *lower : NULL);
 		/* find out local charset encoding */
 		if (!encoding) {
 			encoding = nl_langinfo(CODESET);
@@ -1700,19 +1700,25 @@ out:
 				if (!dst) {
 					ret = PSL_ERR_NO_MEM;
 				}
-				else if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1) {
-					uint8_t *resbuf = malloc(dst_len * 2 + 1);
-					size_t len = dst_len * 2; /* leave space for additional \0 byte */
+				else if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1
+					&& iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) != (size_t)-1)
+				{
+					uint8_t resbuf[256];
+					size_t len = sizeof(resbuf);
 
-					if (!resbuf) {
-						ret = PSL_ERR_NO_MEM;
-					}
-					else if ((dst = (char *)u8_tolower((uint8_t *)dst, dst_len - dst_len_tmp, 0, UNINORM_NFKC, resbuf, &len))) {
+					if ((tmp = (char *)u8_tolower((uint8_t *)dst, dst_len - dst_len_tmp, 0, UNINORM_NFKC, resbuf, &len))) {
 						/* u8_tolower() does not terminate the result string */
 						ret = PSL_SUCCESS;
-						if (lower)
-							if (!(*lower = strndup((char *)dst, len)))
+						if (lower) {
+							if ((*lower = malloc(len + 1))) {
+								/* tmp is not 0 terminated */
+								memcpy(*lower, tmp, len);
+								(*lower)[len] = 0;
+							} else
 								ret = PSL_ERR_NO_MEM;
+						}
+						if (tmp != (char *)resbuf)
+							free(tmp);
 					} else {
 						ret = PSL_ERR_TO_LOWER;
 						/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
@@ -1728,20 +1734,26 @@ out:
 				ret = PSL_ERR_TO_UTF8;
 				/* fprintf(stderr, "Failed to prepare encoding '%s' into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
 			}
-		} else
+		} else {
+			/* convert to lowercase */
+			uint8_t resbuf[256], *tmp;
+			size_t len = sizeof(resbuf);
+
 			ret = PSL_SUCCESS;
 
-		/* convert to lowercase */
-		if (ret == PSL_SUCCESS) {
-			uint8_t *dst, resbuf[256];
-			size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
-
 			/* we need a conversion to lowercase */
-			if ((dst = u8_tolower((uint8_t *)str, u8_strlen((uint8_t *)str), 0, UNINORM_NFKC, resbuf, &len))) {
+			if ((tmp = u8_tolower((uint8_t *)str, u8_strlen((uint8_t *)str), 0, UNINORM_NFKC, resbuf, &len))) {
 				/* u8_tolower() does not terminate the result string */
-				if (lower)
-					if (!(*lower = strndup((char *)dst, len)))
+				if (lower) {
+					if ((*lower = malloc(len + 1))) {
+						/* tmp is not 0 terminated */
+						memcpy(*lower, tmp, len);
+						(*lower)[len] = 0;
+					} else
 						ret = PSL_ERR_NO_MEM;
+				}
+				if (tmp != resbuf)
+					free(tmp);
 			} else {
 				ret = PSL_ERR_TO_LOWER;
 				/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
diff --git a/tests/test-registrable-domain.c b/tests/test-registrable-domain.c
index 6a227e1..819216b 100644
--- a/tests/test-registrable-domain.c
+++ b/tests/test-registrable-domain.c
@@ -50,14 +50,28 @@ static int
 	ok,
 	failed;
 
-static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result)
+static void testx(const psl_ctx_t *psl, const char *domain, const char *encoding, const char *lang, const char *expected_result)
 {
 	const char *result;
 	char *lower;
+	int rc;
 
-	/* our test data is fixed to UTF-8 (english), so provide it here */
-	if (psl_str_to_utf8lower(domain, "utf-8", "en", &lower) == PSL_SUCCESS)
+	/* just to cover special code paths for valgrind checking */
+	psl_str_to_utf8lower(domain, encoding, lang, NULL);
+
+	if ((rc = psl_str_to_utf8lower(domain, encoding, lang, &lower)) == PSL_SUCCESS)
 		domain = lower;
+	/* non-ASCII domains fail here if no runtime IDN library is configured, so skip it */
+#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
+	else if (domain) {
+		/* with a runtime IDN library the conversion must succeed, so count this as a failure */
+		failed++;
+		printf("psl_str_to_utf8lower(%s)=%d\n", domain ? domain : "NULL", rc);
+
+		free(lower);
+		return;
+	}
+#endif
 
 	result = psl_registrable_domain(psl, domain);
 
@@ -72,13 +86,28 @@ static void test(const psl_ctx_t *psl, const char *domain, const char *expected_
 	free(lower);
 }
 
+static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result)
+{
+	testx(psl, domain, "utf-8", "en", expected_result); /* our test data is fixed to UTF-8 (English) */
+}
+
+static void test_iso(const psl_ctx_t *psl, const char *domain, const char *expected_result)
+{
+	/* only makes sense with a runtime IDN library configured; otherwise this is a no-op */
+#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
+	testx(psl, domain, "iso-8859-15", "de", expected_result); /* input is ISO-8859-15 encoded */
+#endif
+}
+
 static void test_psl(void)
 {
 	FILE *fp;
 	const psl_ctx_t *psl;
 	const char *p;
 	char buf[256], domain[128], expected_regdom[128], semicolon[2];
+	char lbuf[258];
 	int er_is_null, d_is_null;
+	unsigned it;
 
 	psl = psl_builtin();
 
@@ -101,6 +130,22 @@ static void test_psl(void)
 	/* Norwegian with lowercase oe */
 	test(psl, "www.\303\270yer.no", "www.\303\270yer.no");
 
+	/* Norwegian with lowercase oe, encoded as ISO-8859-15 */
+	test_iso(psl, "www.\370yer.no", "www.\303\270yer.no");
+
+	/* Testing special code paths of psl_str_to_utf8lower() */
+	for (it = 254; it <= 257; it++) {
+		memset(lbuf, 'a', it);
+		lbuf[it] = 0;
+
+		lbuf[0] = '\370';
+		test_iso(psl, lbuf, NULL);
+
+		lbuf[0] = '\303';
+		lbuf[1] = '\270';
+		test(psl, lbuf, NULL);
+	}
+
 	/* special check with NULL psl context and TLD */
 	test(psl, "whoever.forgot.his.name", "whoever.forgot.his.name");
 

From 9b2d7b7a9cd2b5e69b2c71919ba10e9fa8244cc0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Fri, 23 Sep 2016 17:13:32 +0200
Subject: [PATCH 15/42] Add gcov test coverage report

---
 .gitignore  |  6 ++++++
 Makefile.am | 10 ++++++++++
 README.md   |  1 +
 3 files changed, 17 insertions(+)

diff --git a/.gitignore b/.gitignore
index 6ee2c8d..043d477 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,7 @@
 *.exe
+*.gcda
+*.gcno
+*.gcov
 *.gz
 *.la
 *.lo
@@ -10,6 +13,7 @@
 *.cache
 *.plist
 *.stamp
+ABOUT-NLS
 aclocal.m4
 ar-lib
 autom4te.cache/
@@ -43,6 +47,8 @@ gtk-doc.m4
 gtk-doc.make
 include/libpsl.h
 install-sh
+lcov/
+libpsl.info
 libpsl.pc
 libtool
 ltmain.sh
diff --git a/Makefile.am b/Makefile.am
index 3904754..f66032d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -19,3 +19,13 @@ dist-hook:
 	mkdir -p $(distdir)/list/tests
 	cp -p $(PSL_FILE) $(distdir)/list
 	cp -p $(PSL_TESTFILE) $(distdir)/list/tests
+
+clean-local:
+	rm -rf */*.gc?? libpsl.info lcov
+
+check-coverage:
+	$(MAKE) clean
+	lcov --no-external --capture --initial --directory . --output-file libpsl.info
+	CFLAGS=$CFLAGS" --coverage" LDFLAGS=$LDFLAGS" --coverage" $(MAKE) check
+	lcov --no-external --capture --directory . --output-file libpsl.info
+	genhtml --prefix . --ignore-errors source libpsl.info --legend --title "libpsl" --output-directory=lcov
diff --git a/README.md b/README.md
index ba14b71..5696c0b 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,6 @@
 [![Travis-CI Status](https://travis-ci.org/rockdaboot/libpsl.png?branch=master)](https://travis-ci.org/rockdaboot/libpsl)
 [![Coverity Scan](https://img.shields.io/coverity/scan/10227.svg)](https://scan.coverity.com/projects/rockdaboot-libpsl)
+[![Coverage Status](https://coveralls.io/repos/github/rockdaboot/libpsl/badge.svg?branch=master)](https://coveralls.io/github/rockdaboot/libpsl?branch=master)
 
 libpsl - C library to handle the Public Suffix List
 ===================================================

From 069c6ff091b50752dece7f188e346dc7782a0b47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sun, 25 Sep 2016 12:48:32 +0200
Subject: [PATCH 16/42] Fix check-coverage target (CFLAGS+LDFLAGS not set
 properly)

Reported-by: https://github.com/darnir
---
 Makefile.am | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Makefile.am b/Makefile.am
index f66032d..f0cc385 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -26,6 +26,7 @@ clean-local:
 check-coverage:
 	$(MAKE) clean
 	lcov --no-external --capture --initial --directory . --output-file libpsl.info
-	CFLAGS=$CFLAGS" --coverage" LDFLAGS=$LDFLAGS" --coverage" $(MAKE) check
+	CFLAGS=$$CFLAGS" --coverage" LDFLAGS=$$LDFLAGS" --coverage" ./configure && $(MAKE) check
 	lcov --no-external --capture --directory . --output-file libpsl.info
+# lcov --remove libpsl.info 'tests/*.c' 'src/lookup_*' 'src/psl2c.c' -o libpsl.info
 	genhtml --prefix . --ignore-errors source libpsl.info --legend --title "libpsl" --output-directory=lcov

From 32543dd5a507a10eb2dae9b1b7092c77965f6486 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sun, 25 Sep 2016 12:49:56 +0200
Subject: [PATCH 17/42] Avoid unneeded memory allocations in
 psl_str_to_utf8lower()

Reported-by: https://github.com/daurnimator
---
 src/psl.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/psl.c b/src/psl.c
index 68bb014..13a4d6b 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -1693,7 +1693,7 @@ out:
 
 			if (cd != (iconv_t)-1) {
 				char *tmp = (char *)str; /* iconv won't change where str points to, but changes tmp itself */
-				size_t tmp_len = strlen(str);
+				size_t tmp_len = strlen(str) + 1;
 				size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
 				char *dst = malloc(dst_len + 1), *dst_tmp = dst;
 
@@ -1710,10 +1710,11 @@ out:
 						/* u8_tolower() does not terminate the result string */
 						ret = PSL_SUCCESS;
 						if (lower) {
-							if ((*lower = malloc(len + 1))) {
-								/* tmp is not 0 terminated */
+							if (tmp != (char *)resbuf) {
+								*lower = tmp;
+								tmp = NULL;
+							} else if ((*lower = malloc(len))) {
 								memcpy(*lower, tmp, len);
-								(*lower)[len] = 0;
 							} else
 								ret = PSL_ERR_NO_MEM;
 						}
@@ -1742,13 +1743,14 @@ out:
 			ret = PSL_SUCCESS;
 
 			/* we need a conversion to lowercase */
-			if ((tmp = u8_tolower((uint8_t *)str, u8_strlen((uint8_t *)str), 0, UNINORM_NFKC, resbuf, &len))) {
+			if ((tmp = u8_tolower((uint8_t *)str, u8_strlen((uint8_t *)str) + 1, 0, UNINORM_NFKC, resbuf, &len))) {
 				/* u8_tolower() does not terminate the result string */
 				if (lower) {
-					if ((*lower = malloc(len + 1))) {
-						/* tmp is not 0 terminated */
+					if (tmp != resbuf) {
+						*lower = tmp;
+						tmp = NULL;
+					} else if ((*lower = malloc(len))) {
 						memcpy(*lower, tmp, len);
-						(*lower)[len] = 0;
 					} else
 						ret = PSL_ERR_NO_MEM;
 				}

From 7eb8592035a034c3adbfa3a9aa2dbf8db792971d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sun, 25 Sep 2016 19:44:33 +0200
Subject: [PATCH 18/42] Let u8_tolower() allocate the result buffer.

Reported-by: https://github.com/daurnimator
---
 src/psl.c | 46 ++++++++++++++++++----------------------------
 1 file changed, 18 insertions(+), 28 deletions(-)

diff --git a/src/psl.c b/src/psl.c
index 13a4d6b..38a3981 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -1703,22 +1703,17 @@ out:
 				else if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1
 					&& iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) != (size_t)-1)
 				{
-					uint8_t resbuf[256];
-					size_t len = sizeof(resbuf);
+					/* start size for u8_tolower internal memory allocation.
+					 * u8_tolower() does not terminate the result string. we have 0 byte included in above tmp_len
+					 * and thus in len. */
+					size_t len = dst_len - dst_len_tmp;
 
-					if ((tmp = (char *)u8_tolower((uint8_t *)dst, dst_len - dst_len_tmp, 0, UNINORM_NFKC, resbuf, &len))) {
-						/* u8_tolower() does not terminate the result string */
+					if ((tmp = (char *)u8_tolower((uint8_t *)dst, len, 0, UNINORM_NFKC, NULL, &len))) {
 						ret = PSL_SUCCESS;
 						if (lower) {
-							if (tmp != (char *)resbuf) {
-								*lower = tmp;
-								tmp = NULL;
-							} else if ((*lower = malloc(len))) {
-								memcpy(*lower, tmp, len);
-							} else
-								ret = PSL_ERR_NO_MEM;
-						}
-						if (tmp != (char *)resbuf)
+							*lower = tmp;
+							tmp = NULL;
+						} else
 							free(tmp);
 					} else {
 						ret = PSL_ERR_TO_LOWER;
@@ -1736,25 +1731,20 @@ out:
 				/* fprintf(stderr, "Failed to prepare encoding '%s' into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
 			}
 		} else {
-			/* convert to lowercase */
-			uint8_t resbuf[256], *tmp;
-			size_t len = sizeof(resbuf);
+			/* we need a conversion to lowercase */
+			uint8_t *tmp;
+
+			/* start size for u8_tolower internal memory allocation.
+			 * u8_tolower() does not terminate the result string, so include terminating 0 byte in len. */
+			size_t len = u8_strlen((uint8_t *)str) + 1;
 
 			ret = PSL_SUCCESS;
 
-			/* we need a conversion to lowercase */
-			if ((tmp = u8_tolower((uint8_t *)str, u8_strlen((uint8_t *)str) + 1, 0, UNINORM_NFKC, resbuf, &len))) {
-				/* u8_tolower() does not terminate the result string */
+			if ((tmp = u8_tolower((uint8_t *)str, len, 0, UNINORM_NFKC, NULL, &len))) {
 				if (lower) {
-					if (tmp != resbuf) {
-						*lower = tmp;
-						tmp = NULL;
-					} else if ((*lower = malloc(len))) {
-						memcpy(*lower, tmp, len);
-					} else
-						ret = PSL_ERR_NO_MEM;
-				}
-				if (tmp != resbuf)
+					*lower = tmp;
+					tmp = NULL;
+				} else
 					free(tmp);
 			} else {
 				ret = PSL_ERR_TO_LOWER;

From 5ebc24f0e07a0d65faf643e416dcaad4e1328a40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Mon, 26 Sep 2016 10:13:43 +0200
Subject: [PATCH 19/42] Code cleanup in libidn2 branch of _psl_idna_toASCII()

Reported-by: https://github.com/daurnimator
---
 src/psl.c | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/src/psl.c b/src/psl.c
index 38a3981..e829dcf 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -718,34 +718,21 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
 	}
 #elif defined(WITH_LIBIDN2)
 	int rc;
-	uint8_t *lower, resbuf[256];
-	size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
+	uint8_t *lower;
+	size_t len = u8_strlen((uint8_t *)utf8) + 1;
 
 	/* we need a conversion to lowercase */
-	lower = u8_tolower((uint8_t *)utf8, u8_strlen((uint8_t *)utf8), 0, UNINORM_NFKC, resbuf, &len);
-	if (!lower) {
+	if (!(lower = u8_tolower((uint8_t *)utf8, len, 0, UNINORM_NFKC, NULL, &len))) {
 		/* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", utf8, errno); */
 		return -1;
 	}
 
-	/* u8_tolower() does not terminate the result string */
-	if (lower == resbuf) {
-		lower[len]=0;
-	} else {
-		uint8_t *tmp = lower;
-		lower = (uint8_t *)strndup((char *)lower, len);
-		free(tmp);
-		if (!lower)
-			return -1;
-	}
-
 	if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK) {
 		ret = 0;
 	} /* else
 		fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */
 
-	if (lower != resbuf)
-		free(lower);
+	free(lower);
 #elif defined(WITH_LIBIDN)
 	int rc;
 
@@ -1738,9 +1725,8 @@ out:
 			 * u8_tolower() does not terminate the result string, so include terminating 0 byte in len. */
 			size_t len = u8_strlen((uint8_t *)str) + 1;
 
-			ret = PSL_SUCCESS;
-
 			if ((tmp = u8_tolower((uint8_t *)str, len, 0, UNINORM_NFKC, NULL, &len))) {
+				ret = PSL_SUCCESS;
 				if (lower) {
 					*lower = tmp;
 					tmp = NULL;

From 598a78b2de6426dba7be877e27bd9b0af0151734 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Mon, 26 Sep 2016 15:15:34 +0200
Subject: [PATCH 20/42] Add better test code coverage

---
 Makefile.am                              | 26 +++++++---
 src/psl.c                                |  2 +-
 tests/Makefile.am                        |  9 ++++
 tests/test-is-cookie-domain-acceptable.c |  6 +++
 tests/test-is-public-all.c               | 11 +++-
 tests/test-is-public.c                   | 65 +++++++++++++++++++++++-
 6 files changed, 110 insertions(+), 9 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index f0cc385..948b50d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -21,12 +21,26 @@ dist-hook:
 	cp -p $(PSL_TESTFILE) $(distdir)/list/tests
 
 clean-local:
-	rm -rf */*.gc?? libpsl.info lcov
+	rm -rf */*.gc?? */*/*.gc?? libpsl.info lcov
 
 check-coverage:
-	$(MAKE) clean
-	lcov --no-external --capture --initial --directory . --output-file libpsl.info
-	CFLAGS=$$CFLAGS" --coverage" LDFLAGS=$$LDFLAGS" --coverage" ./configure && $(MAKE) check
-	lcov --no-external --capture --directory . --output-file libpsl.info
-# lcov --remove libpsl.info 'tests/*.c' 'src/lookup_*' 'src/psl2c.c' -o libpsl.info
+	if test -z "$(XLIB)"; then \
+		CFLAGS=$$CFLAGS" --coverage -O0" LDFLAGS=$$LDFLAGS" --coverage" ./configure --disable-runtime --disable-builtin; \
+	else \
+		CFLAGS=$$CFLAGS" --coverage -O0" LDFLAGS=$$LDFLAGS" --coverage" ./configure --enable-runtime=$(XLIB) --enable-builtin=$(XLIB); \
+	fi
+	$(MAKE) clean && $(MAKE)
+	lcov --no-external --capture --initial --directory src --output-file libpsl.info
+	$(MAKE) check
+	lcov --no-external --capture --directory src --output-file libpsl.info
+	lcov --remove libpsl.info 'src/psl2c.c' -o libpsl.info
 	genhtml --prefix . --ignore-errors source libpsl.info --legend --title "libpsl" --output-directory=lcov
+
+check-coverage-libidn:
+	XLIB=libidn $(MAKE) check-coverage
+
+check-coverage-libidn2:
+	XLIB=libidn2 $(MAKE) check-coverage
+
+check-coverage-libicu:
+	XLIB=libicu $(MAKE) check-coverage
diff --git a/src/psl.c b/src/psl.c
index e829dcf..4add23e 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -1728,7 +1728,7 @@ out:
 			if ((tmp = u8_tolower((uint8_t *)str, len, 0, UNINORM_NFKC, NULL, &len))) {
 				ret = PSL_SUCCESS;
 				if (lower) {
-					*lower = tmp;
+					*lower = (char*)tmp;
 					tmp = NULL;
 				} else
 					free(tmp);
diff --git a/tests/Makefile.am b/tests/Makefile.am
index c71cd99..d3e9667 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -23,3 +23,12 @@ check_PROGRAMS = $(PSL_TESTS)
 
 TESTS_ENVIRONMENT = TESTS_VALGRIND="@VALGRIND_ENVIRONMENT@"
 TESTS = $(PSL_TESTS)
+
+# dafsa.psl must be created before any test is executed
+# check-local target works in parallel to the tests, so the test suite will likely fail
+BUILT_SOURCES = dafsa.psl
+dafsa.psl:
+	$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary "$(PSL_FILE)" psl.dafsa
+
+clean-local:
+	rm -f psl.dafsa
\ No newline at end of file
diff --git a/tests/test-is-cookie-domain-acceptable.c b/tests/test-is-cookie-domain-acceptable.c
index c50342d..6bac1a4 100644
--- a/tests/test-is-cookie-domain-acceptable.c
+++ b/tests/test-is-cookie-domain-acceptable.c
@@ -65,6 +65,7 @@ static void test_psl(void)
 		{ "www.his.name", "his.name", 1 },
 		{ "www.his.name", "name", 0 },
 		{ "www.example.com", "www.example.com", 1 },
+		{ "www.example.com", "wwww.example.com", 0 },
 		{ "www.example.com", "example.com", 1 },
 		{ "www.example.com", "com", 0 }, /* not accepted by normalization (PSL rule 'com') */
 		{ "www.example.com", "example.org", 0 },
@@ -77,6 +78,8 @@ static void test_psl(void)
 		{ "2a00:1450:4013:c01::8b", ":1450:4013:c01::8b", 0 }, /* IPv6 address, partial match */
 		{ "::ffff:192.1.123.2", "::ffff:192.1.123.2", 1 }, /* IPv6 address dotted-quad, full match */
 		{ "::ffff:192.1.123.2", ".1.123.2", 0 }, /* IPv6 address dotted-quad, partial match */
+		{ NULL, ".1.123.2", 0 },
+		{ "hiho", NULL, 0 },
 	};
 	unsigned it;
 	psl_ctx_t *psl;
@@ -98,6 +101,9 @@ static void test_psl(void)
 		}
 	}
 
+	/* do checks to cover more code paths in libpsl */
+	psl_is_cookie_domain_acceptable(NULL, "example.com", "example.com");
+
 	psl_free(psl);
 }
 
diff --git a/tests/test-is-public-all.c b/tests/test-is-public-all.c
index f4b7494..f76cfb8 100644
--- a/tests/test-is-public-all.c
+++ b/tests/test-is-public-all.c
@@ -131,7 +131,7 @@ static void test_psl_entry(const psl_ctx_t *psl, const char *domain, int type)
 static void test_psl(void)
 {
 	FILE *fp;
-	psl_ctx_t *psl;
+	psl_ctx_t *psl, *psl3;
 	const psl_ctx_t *psl2;
 	int type = 0;
 	char buf[256], *linep, *p;
@@ -142,6 +142,11 @@ static void test_psl(void)
 	psl2 = psl_builtin();
 	printf("builtin PSL has %d suffixes and %d exceptions\n", psl_suffix_count(psl2), psl_suffix_exception_count(psl2));
 
+	if (!(psl3 = psl_load_file("psl.dafsa"))) {
+		fprintf(stderr, "Failed to load 'psl.dafsa'\n");
+		failed++;
+	}
+
 	if ((fp = fopen(PSL_FILE, "r"))) {
 #ifdef HAVE_CLOCK_GETTIME
 		clock_gettime(CLOCK_REALTIME, &ts1);
@@ -174,6 +179,9 @@ static void test_psl(void)
 
 			if (psl2)
 				test_psl_entry(psl2, p, type);
+
+			if (psl3)
+				test_psl_entry(psl3, p, type);
 		}
 
 #ifdef HAVE_CLOCK_GETTIME
@@ -187,6 +195,7 @@ static void test_psl(void)
 
 	psl_free(psl);
 	psl_free((psl_ctx_t *)psl2);
+	psl_free(psl3);
 }
 
 int main(int argc, const char * const *argv)
diff --git a/tests/test-is-public.c b/tests/test-is-public.c
index 5d32425..d1567b7 100644
--- a/tests/test-is-public.c
+++ b/tests/test-is-public.c
@@ -84,6 +84,7 @@ static void test_psl(void)
 		{ "adfhoweirh", 1 }, /* unknown TLD */
 	};
 	unsigned it;
+	int result, ver;
 	psl_ctx_t *psl;
 
 	psl = psl_load_file(PSL_FILE);
@@ -92,7 +93,7 @@ static void test_psl(void)
 
 	for (it = 0; it < countof(test_data); it++) {
 		const struct test_data *t = &test_data[it];
-		int result = psl_is_public_suffix(psl, t->domain);
+		result = psl_is_public_suffix(psl, t->domain);
 
 		if (result == t->result) {
 			ok++;
@@ -102,6 +103,68 @@ static void test_psl(void)
 		}
 	}
 
+	/* do some checks to cover more code paths in libpsl */
+	psl_is_public_suffix(NULL, "xxx");
+
+	if ((ver = psl_check_version_number(0)) == 0) {
+		printf("psl_check_version_number(0) is 0\n");
+		failed++;
+	} else {
+		if (((result = psl_check_version_number(ver)) != ver)) {
+			printf("psl_check_version_number(%06X) is %06X\n", ver, result);
+			failed++;
+		}
+
+		if (((result = psl_check_version_number(ver - 1)) != 0)) {
+			printf("psl_check_version_number(%06X) is %06X\n", ver - 1, result);
+			failed++;
+		}
+
+		if (((result = psl_check_version_number(ver + 1)) != ver)) {
+			printf("psl_check_version_number(%06X) is %06X\n", ver, result);
+			failed++;
+		}
+	}
+
+	psl_str_to_utf8lower("www.example.com", "utf-8", "en", NULL);
+	psl_str_to_utf8lower(NULL, "utf-8", "en", NULL);
+
+	{
+		char *lower = NULL;
+
+		psl_str_to_utf8lower("www.example.com", NULL, "de", &lower);
+		free(lower); lower = NULL;
+
+		psl_str_to_utf8lower("\374bel.de", NULL, "de", &lower);
+		free(lower); lower = NULL;
+
+		psl_str_to_utf8lower("\374bel.de", "iso-8859-1", NULL, &lower);
+		free(lower); lower = NULL;
+
+		psl_str_to_utf8lower(NULL, "utf-8", "en", &lower);
+		free(lower); lower = NULL;
+	}
+
+	psl_get_version();
+	psl_builtin_filename();
+	psl_builtin_outdated();
+	psl_builtin_file_time();
+	psl_builtin_sha1sum();
+	psl_suffix_wildcard_count(NULL);
+	psl_suffix_wildcard_count(psl);
+	psl_suffix_wildcard_count(psl_builtin());
+	psl_suffix_count(NULL);
+	psl_suffix_exception_count(NULL);
+	psl_load_file(NULL);
+	psl_load_fp(NULL);
+	psl_registrable_domain(NULL, "");
+	psl_registrable_domain(psl, NULL);
+	psl_registrable_domain(psl, "www.example.com");
+	psl_unregistrable_domain(NULL, "");
+	psl_unregistrable_domain(psl, NULL);
+	psl_is_public_suffix2(NULL, "", PSL_TYPE_ANY);
+	psl_is_public_suffix2(psl, NULL, PSL_TYPE_ANY);
+
 	psl_free(psl);
 }
 

From 8a6220500496f25078eba4c5f124290e3b8ae74a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Tue, 27 Sep 2016 11:48:27 +0200
Subject: [PATCH 21/42] Add Travis script for Coveralls.io

---
 .travis.yml          | 3 +++
 .travis_coveralls.sh | 5 +++++
 Makefile.am          | 6 +++---
 3 files changed, 11 insertions(+), 3 deletions(-)
 create mode 100755 .travis_coveralls.sh

diff --git a/.travis.yml b/.travis.yml
index 5b40aa7..edf2f62 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,6 +10,7 @@ branches:
     only:
         - master
         - develop
+        - coveralls
 
 env:
     - RUNTIME=libicu
@@ -34,6 +35,7 @@ addons:
             - libicu-dev
             - libunistring0
             - libunistring-dev
+            - lcov
 
 script:
   - ./autogen.sh
@@ -44,3 +46,4 @@ script:
   - ./configure --enable-runtime=$RUNTIME --disable-builtin && make clean && make -j4 && make check -j4
   - ./configure --enable-gtk-doc && make -j4 && make check -j4
   - make distcheck
+  - if [[ $CC == "gcc" && $RUNTIME == "libicu" ]]; then ./.travis_coveralls.sh; fi
diff --git a/.travis_coveralls.sh b/.travis_coveralls.sh
new file mode 100755
index 0000000..f2411a1
--- /dev/null
+++ b/.travis_coveralls.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+make check-coverage-libicu
+pip install --user cpp-coveralls
+coveralls --include libwget/ --include src/ -e "src/psl2c.c"
diff --git a/Makefile.am b/Makefile.am
index 948b50d..75f52aa 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -30,11 +30,11 @@ check-coverage:
 		CFLAGS=$$CFLAGS" --coverage -O0" LDFLAGS=$$LDFLAGS" --coverage" ./configure --enable-runtime=$(XLIB) --enable-builtin=$(XLIB); \
 	fi
 	$(MAKE) clean && $(MAKE)
-	lcov --no-external --capture --initial --directory src --output-file libpsl.info
+	lcov --capture --initial --directory src --output-file libpsl.info
 	$(MAKE) check
-	lcov --no-external --capture --directory src --output-file libpsl.info
+	lcov --capture --directory src --output-file libpsl.info
 	lcov --remove libpsl.info 'src/psl2c.c' -o libpsl.info
-	genhtml --prefix . --ignore-errors source libpsl.info --legend --title "libpsl" --output-directory=lcov
+	genhtml --prefix . libpsl.info --legend --title "libpsl" --output-directory=lcov
 
 check-coverage-libidn:
 	XLIB=libidn $(MAKE) check-coverage

From e126a67354c2b63a6e17030d9db5613f84b594c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Fri, 4 Nov 2016 14:28:30 +0100
Subject: [PATCH 22/42] Add Solaris OpenCSW badges

---
 .travis.yml | 1 -
 README.md   | 6 ++++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index edf2f62..d91e76f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,7 +10,6 @@ branches:
     only:
         - master
         - develop
-        - coveralls
 
 env:
     - RUNTIME=libicu
diff --git a/README.md b/README.md
index 5696c0b..7ceb525 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,12 @@
 [![Coverity Scan](https://img.shields.io/coverity/scan/10227.svg)](https://scan.coverity.com/projects/rockdaboot-libpsl)
 [![Coverage Status](https://coveralls.io/repos/github/rockdaboot/libpsl/badge.svg?branch=master)](https://coveralls.io/github/rockdaboot/libpsl?branch=master)
 
+Solaris OpenCSW [![Build Status Solaris amd64](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-amd64)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-amd64)
+[![Build Status Solaris i386](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-i386)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-i386)
+[![Build Status Solaris Sparc](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-sparc)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-sparc)
+[![Build Status Solaris SparcV9](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-sparcv9)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-sparcv9)
+
+
 libpsl - C library to handle the Public Suffix List
 ===================================================
 

From e03953e27a7a8ff5b4f5cce1b5de9039e64c293d Mon Sep 17 00:00:00 2001
From: Olle Liljenzin <olle@liljenzin.se>
Date: Wed, 2 Nov 2016 20:22:01 +0100
Subject: [PATCH 23/42] Updated DAFSA generator and parser to support UTF-8
 encoding

---
 AUTHORS                          |   1 +
 src/lookup_string_in_fixed_set.c | 103 ++++++++++++++++++++++++++-----
 src/psl-make-dafsa               |  81 ++++++++++++++++++++----
 3 files changed, 157 insertions(+), 28 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 33dad7b..6f3195c 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -16,3 +16,4 @@ Christopher Meng (Fedora building)
 Jakub Čajka
 Giuseppe Scrivano
 Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
+Olle Liljenzin (Original DAFSA implementation and UTF-8 patch)
diff --git a/src/lookup_string_in_fixed_set.c b/src/lookup_string_in_fixed_set.c
index ddf63ae..01edc4e 100644
--- a/src/lookup_string_in_fixed_set.c
+++ b/src/lookup_string_in_fixed_set.c
@@ -21,6 +21,48 @@
 
 #define CHECK_LT(a, b) if ((a) >= b) return 0
 
+static const char multibyte_length_table[16] = {
+	0, 0, 0, 0,	 /* 0x00-0x3F */
+	0, 0, 0, 0,	 /* 0x40-0x7F */
+	0, 0, 0, 0,	 /* 0x80-0xBF */
+	2, 2, 3, 4,	 /* 0xC0-0xFF */
+};
+
+
+/**
+ * Get length of multibyte character sequence starting at a given byte.
+ * Returns zero if the byte is not a valid leading byte in UTF-8.
+ */
+static int GetMultibyteLength(char c) {
+	return multibyte_length_table[((unsigned char)c) >> 4];
+}
+
+/**
+ * Moves pointers one byte forward.
+ */
+static void NextPos(const unsigned char** pos,
+	const char** key,
+	const char** multibyte_start)
+{
+	++*pos;
+	if (*multibyte_start) {
+		/* Advance key to next byte in multibyte sequence. */
+		++*key;
+		/* Reset multibyte_start if last byte in multibyte sequence was consumed. */
+		if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
+			*multibyte_start = 0;
+	} else {
+		if (GetMultibyteLength(**key)) {
+			/* Multibyte prefix was matched in the dafsa, start matching multibyte
+			 * content in next round. */
+			*multibyte_start = *key;
+		} else {
+			/* Advance key as a single byte character was matched. */
+			++*key;
+		}
+	}
+}
+
 /*
  * Read next offset from pos.
  * Returns true if an offset could be read, false otherwise.
@@ -71,6 +113,35 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
 	return(*offset & 0x80) != 0;
 }
 
+/*
+ * Check if byte at offset matches first character in key.
+ * This version assumes a range check was already performed by the caller.
+ */
+
+static int IsMatchUnchecked(const unsigned char matcher,
+	const char* key,
+	const char* multibyte_start)
+{
+	if (multibyte_start) {
+		/* Multibyte matching mode. */
+		if (multibyte_start == key) {
+			/* Match leading byte, which will also match the sequence length. */
+			return (matcher ^ 0x80) == (const unsigned char)*key;
+		} else {
+			/* Match following bytes. */
+			return (matcher ^ 0xC0) == (const unsigned char)*key;
+		}
+	}
+	/* If key points at a leading byte in a multibyte sequence, but we are not yet
+	 * in multibyte mode, then the dafsa should contain a special byte to indicate
+	 * a mode switch. */
+	if (GetMultibyteLength(*key)) {
+		return matcher == 0x1F;
+	}
+	/* Normal matching of a single byte character. */
+	return matcher == (const unsigned char)*key;
+}
+
 /*
  * Check if byte at offset matches first character in key.
  * This version matches characters not last in label.
@@ -78,10 +149,11 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
 
 static int IsMatch(const unsigned char* offset,
 	const unsigned char* end,
-	const char* key)
+	const char* key,
+	const char* multibyte_start)
 {
 	CHECK_LT(offset, end);
-	return *offset == *key;
+	return IsMatchUnchecked(*offset, key, multibyte_start);
 }
 
 /*
@@ -91,10 +163,11 @@ static int IsMatch(const unsigned char* offset,
 
 static int IsEndCharMatch(const unsigned char* offset,
 	const unsigned char* end,
-	const char* key)
+	const char* key,
+	const char* multibyte_start)
 {
 	CHECK_LT(offset, end);
-	return *offset == (*key | 0x80);
+	return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
 }
 
 /*
@@ -104,10 +177,11 @@ static int IsEndCharMatch(const unsigned char* offset,
 
 static int GetReturnValue(const unsigned char* offset,
 	const unsigned char* end,
+	const char* multibyte_start,
 	int* return_value)
 {
 	CHECK_LT(offset, end);
-	if ((*offset & 0xE0) == 0x80) {
+	if (!multibyte_start && (*offset & 0xE0) == 0x80) {
 		*return_value = *offset & 0x0F;
 		return 1;
 	}
@@ -140,6 +214,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 	const unsigned char* end = graph + length;
 	const unsigned char* offset = pos;
 	const char* key_end = key + key_length;
+	const char* multibyte_start = 0;
 
 	while (GetNextOffset(&pos, end, &offset)) {
 		/*char <char>+ end_char offsets
@@ -153,11 +228,10 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 
 		if (key != key_end && !IsEOL(offset, end)) {
 			/* Leading <char> is not a match. Don't dive into this child */
-			if (!IsMatch(offset, end, key))
+			if (!IsMatch(offset, end, key, multibyte_start))
 				continue;
 			did_consume = 1;
-			++offset;
-			++key;
+			NextPos(&offset, &key, &multibyte_start);
 			/* Possible matches at this point:
 			 * <char>+ end_char offsets
 			 * <char>+ return value
@@ -167,10 +241,9 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 
 			/* Remove all remaining <char> nodes possible */
 			while (!IsEOL(offset, end) && key != key_end) {
-				if (!IsMatch(offset, end, key))
+				if (!IsMatch(offset, end, key, multibyte_start))
 					return -1;
-				++key;
-				++offset;
+				NextPos(&offset, &key, &multibyte_start);
 			}
 		}
 		/* Possible matches at this point:
@@ -182,7 +255,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 		if (key == key_end) {
 			int return_value;
 
-			if (GetReturnValue(offset, end, &return_value))
+			if (GetReturnValue(offset, end, multibyte_start, &return_value))
 				return return_value;
 			/* The DAFSA guarantees that if the first char is a match, all
 			 * remaining char elements MUST match if the key is truly present.
@@ -191,13 +264,13 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 				return -1;
 			continue;
 		}
-		if (!IsEndCharMatch(offset, end, key)) {
+		if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
 			if (did_consume)
 				return -1; /* Unexpected */
 			continue;
 		}
-		++key;
-		pos = ++offset; /* Dive into child */
+		NextPos(&offset, &key, &multibyte_start);
+		pos = offset; /* Dive into child */
 	}
 
 	return -1; /* No match */
diff --git a/src/psl-make-dafsa b/src/psl-make-dafsa
index 99c3135..bd9a79a 100755
--- a/src/psl-make-dafsa
+++ b/src/psl-make-dafsa
@@ -14,8 +14,9 @@ This python program fetches strings and return values from a gperf file
 and generates a C++ file with a byte array representing graph that can be
 used as a memory efficient replacement for the perfect hash table.
 
-The input strings are assumed to consist of printable 7-bit ASCII characters
-and the return values are assumed to be one digit integers.
+The input strings must consist of printable 7-bit ASCII characters or UTF-8
+multibyte sequences. Control characters in the range [0x00-0x1F] are not
+allowed. The return values must be one digit integers.
 
 In this program a DAFSA is a diamond shaped graph starting at a common
 source node and ending at a common sink node. All internal nodes contain
@@ -47,8 +48,8 @@ The generated byte array can described by the following BNF:
 
 <byte> ::= < 8-bit value in range [0x00-0xFF] >
 
-<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
-<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
+<char> ::= < byte in range [0x1F-0x7F] >
+<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
 <return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
 
 <offset1> ::= < byte in range [0x00-0x3F] >
@@ -89,8 +90,8 @@ The generated byte array can described by the following BNF:
 
 Decoding:
 
-<char> -> printable 7-bit ASCII character
-<end_char> & 0x7F -> printable 7-bit ASCII character
+<char> -> character
+<end_char> & 0x7F -> character
 <return value> & 0x0F -> integer
 <offset1 & 0x3F> -> integer
 ((<offset2> & 0x1F>) << 8) + <byte> -> integer
@@ -105,6 +106,28 @@ between previous child node and next child node. Thus each offset links a node
 to a child node. The distance is always counted between start addresses, i.e.
 first byte in decoded offset or first byte in child node.
 
+Transcoding of UTF-8 multibyte sequences:
+
+The original DAFSA format was limited to 7-bit printable ASCII characters in
+range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
+By transcoding of such characters the new format preserves compatibility with
+old parsers, so that a DAFSA in the extended format can be used by an old
+parser without false positives, although strings containing transcoded
+characters will never match. Since the format is extended rather than being
+changed, a parser supporting the new format will automatically support data
+generated in the old format.
+
+Transcoding is performed by insertion of a start byte with the special value
+0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
+the range of printable ASCII.
+
+2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
+
+3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
+
+4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
+                00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
+
 Example 1:
 
 %%
@@ -197,6 +220,23 @@ import sys
 class InputError(Exception):
   """Exception raised for errors in the input file."""
 
+# Length of a character starting at a given byte.
+char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x00-0x0F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x10-0x1F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x20-0x2F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x30-0x3F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x40-0x4F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x50-0x5F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x60-0x6F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x70-0x7F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x80-0x8F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x90-0x9F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xA0-0xAF
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xB0-0xBF
+                      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 0xC0-0xCF
+                      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 0xD0-0xDF
+                      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  # 0xE0-0xEF
+                      4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
 
 def to_dafsa(words):
   """Generates a DAFSA from a word list and returns the source node.
@@ -206,14 +246,29 @@ def to_dafsa(words):
   """
   if not words:
     raise InputError('The domain list must not be empty')
-  def to_nodes(word):
+  def to_nodes(word, multibyte_length):
     """Split words into characters"""
-    if not 0x1F < ord(word[0]) < 0x80:
-      raise InputError('Domain names must be printable 7-bit ASCII')
-    if len(word) == 1:
-      return chr(int(word[0], 16) & 0x0F), [None]
-    return word[0], [to_nodes(word[1:])]
-  return [to_nodes(word) for word in words]
+    byte = ord(word[0])
+    if multibyte_length:
+      # Consume next byte in multibyte sequence.
+      if byte & 0xC0 != 0x80:
+        raise InputError('Invalid UTF-8 multibyte sequence')
+      return chr(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
+    char_length = char_length_table[byte]
+    if char_length == 1:
+      # 7-bit printable ASCII.
+      if len(word) == 1:
+        return chr(int(word[0], 16) & 0x0F), [None]
+      return word[0], [to_nodes(word[1:], 0)]
+    elif char_length > 1:
+      # Leading byte in multibyte sequence.
+      if len(word) <= char_length:
+        raise InputError('Unterminated UTF-8 multibyte sequence')
+      return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
+    # Unexpected character.
+    raise InputError('Domain names must be printable ASCII or UTF-8')
+
+  return [to_nodes(word, 0) for word in words]
 
 
 def to_words(node):

From 8c2bcd5a24496cb7e67e3f6d0bd6126b4563097b Mon Sep 17 00:00:00 2001
From: Olle Liljenzin <olle@liljenzin.se>
Date: Fri, 4 Nov 2016 19:43:36 +0100
Subject: [PATCH 24/42] Added version info into generated DAFSA.

psl-make-dafsa got a mode switch so that the old version can be
generated for testing.
---
 src/psl-make-dafsa | 46 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/src/psl-make-dafsa b/src/psl-make-dafsa
index bd9a79a..aa9d451 100755
--- a/src/psl-make-dafsa
+++ b/src/psl-make-dafsa
@@ -85,8 +85,13 @@ The generated byte array can described by the following BNF:
          | <prefix> <node>
          | <end_label>
 
-<dafsa> ::= <source>
-          | <dafsa> <node>
+<graph> ::= <source>
+          | <graph> <node>
+
+<version> ::= <empty>            # The DAFSA was generated in ASCII mode.
+          | < byte value 0x01 >  # The DAFSA was generated in UTF-8 mode.
+
+<dafsa> ::= <graph> <version>
 
 Decoding:
 
@@ -238,7 +243,7 @@ char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x00-0x
                       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  # 0xE0-0xEF
                       4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
 
-def to_dafsa(words):
+def to_dafsa(words, utf_mode):
   """Generates a DAFSA from a word list and returns the source node.
 
   Each word is split into characters so that each character is represented by
@@ -262,6 +267,8 @@ def to_dafsa(words):
       return word[0], [to_nodes(word[1:], 0)]
     elif char_length > 1:
       # Leading byte in multibyte sequence.
+      if not utf_mode:
+        raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
       if len(word) <= char_length:
         raise InputError('Unterminated UTF-8 multibyte sequence')
       return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
@@ -451,7 +458,7 @@ def encode_label(label):
   return buf
 
 
-def encode(dafsa):
+def encode(dafsa, utf_mode):
   """Encodes a DAFSA to a list of bytes"""
   output = []
   offsets = {}
@@ -467,6 +474,8 @@ def encode(dafsa):
 
   output.extend(encode_links(dafsa, offsets, len(output)))
   output.reverse()
+  if utf_mode:
+    output.append(0x01)
   return output
 
 
@@ -485,22 +494,22 @@ def to_cxx(data):
   return text
 
 
-def words_to_whatever(words, converter):
+def words_to_whatever(words, converter, utf_mode):
   """Generates C++ code from a word list"""
-  dafsa = to_dafsa(words)
+  dafsa = to_dafsa(words, utf_mode)
   for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
     dafsa = fun(dafsa)
-  return converter(encode(dafsa))
+  return converter(encode(dafsa, utf_mode))
 
 
-def words_to_cxx(words):
+def words_to_cxx(words, utf_mode):
   """Generates C++ code from a word list"""
-  return words_to_whatever(words, to_cxx)
+  return words_to_whatever(words, to_cxx, utf_mode)
 
 
-def words_to_binary(words):
+def words_to_binary(words, utf_mode):
   """Generates C++ code from a word list"""
-  return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, bytearray)
+  return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, bytearray, utf_mode)
 
 
 def parse_psl2c(infile):
@@ -595,6 +604,8 @@ def usage():
   print('  --input-format=psl      infile is a Public Suffix List file')
   print('  --output-format=cxx     Write DAFSA as C/C++ code')
   print('  --output-format=binary  Write DAFSA binary data')
+  print('  --encoding=ascii        7-bit ASCII mode (default)')
+  print('  --encoding=utf-8        UTF-8 mode')
   exit(1)
 
 
@@ -605,6 +616,7 @@ def main():
 
   converter = words_to_cxx
   parser = parse_psl2c
+  utf_mode = False
 
   for arg in sys.argv[1:-2]:
     if arg.startswith('--input-format='):
@@ -622,18 +634,24 @@ def main():
         converter = words_to_binary
       elif value == 'cxx':
         converter = words_to_cxx
+    elif arg.startswith('--encoding='):
+      value = arg[11:].lower()
+      if value == 'ascii':
+        utf_mode = False
+      elif value == 'utf-8':
+        utf_mode = True
       else:
-        print("Unknown output format '%s'" % value)
+        print("Unknown encoding '%s'" % value)
         return 1
     else:
       usage()
 
   if sys.argv[-2] == '-':
     with open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(sys.stdin)))
+      outfile.write(converter(parser(sys.stdin), utf_mode))
   else:
     with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(infile)))
+      outfile.write(converter(parser(infile), utf_mode))
 
   return 0
 

From 86034ac7c961e29d016a5e0ea6e6f3f94d4c7be3 Mon Sep 17 00:00:00 2001
From: Olle Liljenzin <olle@liljenzin.se>
Date: Fri, 4 Nov 2016 20:03:41 +0100
Subject: [PATCH 25/42] Added function to the parser for reading DAFSA encoding
 mode.

---
 src/lookup_string_in_fixed_set.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/lookup_string_in_fixed_set.c b/src/lookup_string_in_fixed_set.c
index 01edc4e..57d455b 100644
--- a/src/lookup_string_in_fixed_set.c
+++ b/src/lookup_string_in_fixed_set.c
@@ -275,3 +275,11 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 
 	return -1; /* No match */
 }
+
+/* prototype to skip warning with -Wmissing-prototypes */
+int _HIDDEN GetUtfMode(const unsigned char *graph, size_t length);
+
+int _HIDDEN GetUtfMode(const unsigned char *graph, size_t length)
+{
+	return length > 0 && graph[length - 1] < 0x80;
+}

From 4b42762cbf6063ff4ab194f194b45c71fea5c7db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sat, 5 Nov 2016 10:34:09 +0100
Subject: [PATCH 26/42] Skip punycode conversion for _psl_is_public_suffix() if
 data contains UTF-8 rules

---
 src/psl.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/psl.c b/src/psl.c
index 4add23e..af14a94 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -174,10 +174,11 @@ struct _psl_ctx_st {
 	size_t
 		dafsa_size;
 	int
-		mode,
 		nsuffixes,
 		nexceptions,
 		nwildcards;
+	unsigned char
+		utf8 : 1; /* 1: data contains UTF-8 + punycode encoded rules */
 };
 
 /* include the PSL data compiled by 'psl2c' */
@@ -782,8 +783,9 @@ static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_en
 	}
 }
 
-/* prototype */
+/* prototypes */
 int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
+int GetUtfMode(const unsigned char *graph, size_t length);
 
 static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
 {
@@ -801,7 +803,7 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
 	for (p = domain; *p; p++) {
 		if (*p == '.')
 			suffix.nlabels++;
-		else if (*((unsigned char *)p) >= 128)
+		else if (!psl->utf8 && *((unsigned char *)p) >= 128)
 			need_conversion = 1; /* in case domain is non-ascii we need a toASCII conversion */
 	}
 
@@ -1150,6 +1152,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
 			psl->dafsa = m;
 
 		psl->dafsa_size = len;
+		psl->utf8 = !!GetUtfMode(psl->dafsa, len);
 
 		return psl;
 	}
@@ -1161,6 +1164,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
 	 *  as of 19.02.2014, the list at https://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions.
 	 */
 	psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array);
+	psl->utf8 = 1; /* we put UTF-8 and punycode rules in the lookup vector */
 
 	do {
 		while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */

From 3ac807d987bc6af1ec9cbc407c1200061e690a99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sat, 5 Nov 2016 10:36:25 +0100
Subject: [PATCH 27/42] Add --encoding to psl-make-dafsa man page

---
 src/psl-make-dafsa.1 | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/psl-make-dafsa.1 b/src/psl-make-dafsa.1
index 9eb1fd7..b8c6e09 100644
--- a/src/psl-make-dafsa.1
+++ b/src/psl-make-dafsa.1
@@ -28,9 +28,14 @@ depends on options passed to it.
 \fBcxx\fR: (default) output is C/C++ code
 .br
 \fBbinary\fR: output is an architecture-independent binary format
+.TP
+\fB\-\-encoding=\fR[\fIutf-8\fR|\fIascii\fR]
+\fButf-8\fR: (default) UTF-8 mode (output contains UTF-8 + punycode)
+.br
+\fBascii\fR: (deprecated) 7-bit ASCII mode (output contains punycode only)
 .SH SEE ALSO
 .IR https://publicsuffix.org/ ", " https://github.com/rockdaboot/libpsl
 .SH COPYRIGHT
-\fBpsl-make-dafsa\fR was originally part of the Chromium project, and
+\fBpsl-make-dafsa\fR was was written by Olle Jiljenzin as part of the Chromium project and
 has been modified by Tim Ruehsen and Daniel Kahn Gillmor.  The code
 and its documentation is governed by a BSD-style license.

From e30e77ef127702e3ffe71c49051f362cdef5bf48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sat, 5 Nov 2016 11:17:11 +0100
Subject: [PATCH 28/42] Create ASCII and UTF-8 binaries for testing

---
 tests/Makefile.am | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/Makefile.am b/tests/Makefile.am
index d3e9667..d15c8a7 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -24,11 +24,13 @@ check_PROGRAMS = $(PSL_TESTS)
 TESTS_ENVIRONMENT = TESTS_VALGRIND="@VALGRIND_ENVIRONMENT@"
 TESTS = $(PSL_TESTS)
 
-# dafsa.psl must be created before any test is executed
+# psl.dafsa and psl_ascii.dafsa must be created before any test is executed
 # check-local target works in parallel to the tests, so the test suite will likely fail
-BUILT_SOURCES = dafsa.psl
-dafsa.psl:
+BUILT_SOURCES = psl.dafsa psl_ascii.dafsa
+psl.dafsa:
 	$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary "$(PSL_FILE)" psl.dafsa
+psl_ascii.dafsa:
+	$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary --encoding=ascii "$(PSL_FILE)" psl_ascii.dafsa
 
 clean-local:
-	rm -f psl.dafsa
\ No newline at end of file
+	rm -f psl.dafsa psl_ascii.dafsa

From 3211a66f00be12d9f0358b99c6c3187236c6e904 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sat, 5 Nov 2016 11:31:11 +0100
Subject: [PATCH 29/42] Put punycode + UTF-8 rules into DAFSA in utf-8 mode

---
 src/psl-make-dafsa | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/psl-make-dafsa b/src/psl-make-dafsa
index aa9d451..0d4408a 100755
--- a/src/psl-make-dafsa
+++ b/src/psl-make-dafsa
@@ -481,7 +481,7 @@ def encode(dafsa, utf_mode):
 
 def to_cxx(data):
   """Generates C++ code from a list of encoded bytes."""
-  text = '/* This file is generated. DO NOT EDIT!\n\n'
+  text = '/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
   text += 'The byte array encodes effective tld names. See psl-make-dafsa source for'
   text += ' documentation.'
   text += '*/\n\n'
@@ -512,7 +512,7 @@ def words_to_binary(words, utf_mode):
   return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, bytearray, utf_mode)
 
 
-def parse_psl2c(infile):
+def parse_psl2c(infile, utf_mode):
   """Parses file generated by psl2c and extract strings and return code"""
   lines = [line.strip() for line in infile]
 
@@ -531,7 +531,7 @@ def parse_psl2c(infile):
   return [line[:-3] + line[-1] for line in sorted(lines)]
 
 
-def parse_psl(infile):
+def parse_psl(infile, utf_mode):
   """Parses PSL file and extract strings and return code"""
   PSL_FLAG_EXCEPTION = (1<<0)
   PSL_FLAG_WILDCARD = (1<<1)
@@ -573,9 +573,9 @@ def parse_psl(infile):
         continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
       flags = PSL_FLAG_PLAIN | section
 
-    line = line.decode('utf-8').encode("idna")
+    punycode = line.decode('utf-8').encode("idna")
 
-    if line in psl:
+    if punycode in psl:
       """Found existing entry:
          Combination of exception and plain rule is ambiguous
            !foo.bar
@@ -585,10 +585,12 @@ def parse_psl(infile):
            !foo.bar + *.foo.bar
             foo.bar + *.foo.bar
       """
-      print('Found %s/%X (now %X)' % line, psl[line], flags)
+      print('Found %s/%X (now %X)' % punycode, psl[punycode], flags)
       continue
 
-    psl[line] = flags
+    if utf_mode:
+      psl[line] = flags
+    psl[punycode] = flags
 
 #  with open("psl.out", 'w') as outfile:
 #    for (domain, flags) in sorted(psl.iteritems()):
@@ -602,10 +604,10 @@ def usage():
   print('usage: %s [options] infile outfile' % sys.argv[0])
   print('  --input-format=psl2c    infile has been generated by libpsl/psl2c utility (default)')
   print('  --input-format=psl      infile is a Public Suffix List file')
-  print('  --output-format=cxx     Write DAFSA as C/C++ code')
+  print('  --output-format=cxx     Write DAFSA as C/C++ code (default)')
   print('  --output-format=binary  Write DAFSA binary data')
-  print('  --encoding=ascii        7-bit ASCII mode (default)')
-  print('  --encoding=utf-8        UTF-8 mode')
+  print('  --encoding=ascii        7-bit ASCII mode')
+  print('  --encoding=utf-8        UTF-8 mode (default)')
   exit(1)
 
 
@@ -616,7 +618,7 @@ def main():
 
   converter = words_to_cxx
   parser = parse_psl2c
-  utf_mode = False
+  utf_mode = True
 
   for arg in sys.argv[1:-2]:
     if arg.startswith('--input-format='):
@@ -634,6 +636,9 @@ def main():
         converter = words_to_binary
       elif value == 'cxx':
         converter = words_to_cxx
+      else:
+        print("Unknown output format '%s'" % value)
+        return 1
     elif arg.startswith('--encoding='):
       value = arg[11:].lower()
       if value == 'ascii':
@@ -648,10 +653,10 @@ def main():
 
   if sys.argv[-2] == '-':
     with open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(sys.stdin), utf_mode))
+      outfile.write(converter(parser(sys.stdin, utf_mode), utf_mode))
   else:
     with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(infile), utf_mode))
+      outfile.write(converter(parser(infile, utf_mode), utf_mode))
 
   return 0
 

From 70661c68076f7aedf8f5814b7c2275ccb685c3c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sat, 5 Nov 2016 17:34:05 +0100
Subject: [PATCH 30/42] Add checking with ASCII DAFSA in test-is-public-all.c

---
 tests/test-is-public-all.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tests/test-is-public-all.c b/tests/test-is-public-all.c
index f76cfb8..aca1458 100644
--- a/tests/test-is-public-all.c
+++ b/tests/test-is-public-all.c
@@ -131,7 +131,7 @@ static void test_psl_entry(const psl_ctx_t *psl, const char *domain, int type)
 static void test_psl(void)
 {
 	FILE *fp;
-	psl_ctx_t *psl, *psl3;
+	psl_ctx_t *psl, *psl3, *psl4;
 	const psl_ctx_t *psl2;
 	int type = 0;
 	char buf[256], *linep, *p;
@@ -147,6 +147,11 @@ static void test_psl(void)
 		failed++;
 	}
 
+	if (!(psl4 = psl_load_file("psl_ascii.dafsa"))) {
+		fprintf(stderr, "Failed to load 'psl_ascii.dafsa'\n");
+		failed++;
+	}
+
 	if ((fp = fopen(PSL_FILE, "r"))) {
 #ifdef HAVE_CLOCK_GETTIME
 		clock_gettime(CLOCK_REALTIME, &ts1);
@@ -182,6 +187,9 @@ static void test_psl(void)
 
 			if (psl3)
 				test_psl_entry(psl3, p, type);
+
+			if (psl4)
+				test_psl_entry(psl4, p, type);
 		}
 
 #ifdef HAVE_CLOCK_GETTIME
@@ -193,9 +201,10 @@ static void test_psl(void)
 		failed++;
 	}
 
-	psl_free(psl);
-	psl_free((psl_ctx_t *)psl2);
+	psl_free(psl4);
 	psl_free(psl3);
+	psl_free((psl_ctx_t *)psl2);
+	psl_free(psl);
 }
 
 int main(int argc, const char * const *argv)

From 2677621b624f673c73c4bac5c3bc4b270da8358d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sat, 5 Nov 2016 17:37:34 +0100
Subject: [PATCH 31/42] Add test DAFSA files to .gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 043d477..0e87313 100644
--- a/.gitignore
+++ b/.gitignore
@@ -78,6 +78,8 @@ stamp-h1
 test-driver
 tests/*.log
 tests/*.trs
+tests/psl.dafsa
+tests/psl_ascii.dafsa
 tests/test-is-cookie-domain-acceptable
 tests/test-is-public
 tests/test-is-public-all

From 44e6bd4eb891d91f15474eb9c7a1fb19ac973e90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sun, 6 Nov 2016 11:28:15 +0100
Subject: [PATCH 32/42] src/psl2c.c: Also include UTF-8 into DAFSA output

---
 src/psl2c.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/psl2c.c b/src/psl2c.c
index f5654da..6ac3474 100644
--- a/src/psl2c.c
+++ b/src/psl2c.c
@@ -153,11 +153,6 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
 	if ((fp = fopen("in.tmp", "w"))) {
 		for (it = 0; it < v->cur; it++) {
 			_psl_entry_t *e = _vector_get(v, it);
-			unsigned char *s = (unsigned char *)e->label_buf;
-
-			/* search for non-ASCII label and skip it */
-			while (*s && *s < 128) s++;
-			if (*s) continue;
 
 			fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
 		}
@@ -191,11 +186,6 @@ static int _print_psl_entries_dafsa_binary(const char *fname, const _psl_vector_
 	if ((fp = fopen("in.tmp", "w"))) {
 		for (it = 0; it < v->cur; it++) {
 			_psl_entry_t *e = _vector_get(v, it);
-			unsigned char *s = (unsigned char *)e->label_buf;
-
-			/* search for non-ASCII label and skip it */
-			while (*s && *s < 128) s++;
-			if (*s) continue;
 
 			fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
 		}

From 2c871b1306b927a08540916d7aa67698d9ccb0e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sun, 6 Nov 2016 11:59:36 +0100
Subject: [PATCH 33/42] Skip conversion in _psl_is_public_suffix() for builtin
 psl context

---
 src/psl.c                  | 14 +++++++++++---
 tests/test-is-public-all.c |  2 +-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/psl.c b/src/psl.c
index af14a94..aaf794a 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -177,7 +177,7 @@ struct _psl_ctx_st {
 		nsuffixes,
 		nexceptions,
 		nwildcards;
-	unsigned char
+	unsigned
 		utf8 : 1; /* 1: data contains UTF-8 + punycode encoded rules */
 };
 
@@ -619,7 +619,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
 }
 #endif
 
-static inline int _isspace_ascii(const char c)
+static int _isspace_ascii(const char c)
 {
 	return c == ' ' || c == '\t' || c == '\r' || c == '\n';
 }
@@ -803,7 +803,7 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
 	for (p = domain; *p; p++) {
 		if (*p == '.')
 			suffix.nlabels++;
-		else if (!psl->utf8 && *((unsigned char *)p) >= 128)
+		else if (*((unsigned char *)p) >= 128)
 			need_conversion = 1; /* in case domain is non-ascii we need a toASCII conversion */
 	}
 
@@ -814,6 +814,14 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
 		return 1;
 	}
 
+	if (psl->utf8 || psl == &_builtin_psl)
+		need_conversion = 0;
+
+#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
+	if (psl == &_builtin_psl)
+		need_conversion = 0;
+#endif
+
 	if (need_conversion) {
 		_psl_idna_t *idna = _psl_idna_open();
 
diff --git a/tests/test-is-public-all.c b/tests/test-is-public-all.c
index aca1458..3e875d1 100644
--- a/tests/test-is-public-all.c
+++ b/tests/test-is-public-all.c
@@ -49,7 +49,7 @@ static int
 	struct timespec ts1, ts2;
 #endif
 
-static inline int _isspace_ascii(const char c)
+static int _isspace_ascii(const char c)
 {
 	return c == ' ' || c == '\t' || c == '\r' || c == '\n';
 }

From 514aa0163cbd3fbf4e90745f7e57c390b3453d05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sun, 6 Nov 2016 12:10:23 +0100
Subject: [PATCH 34/42] Add Daurnimator to AUTHORS

---
 AUTHORS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/AUTHORS b/AUTHORS
index 6f3195c..a32e646 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -16,4 +16,5 @@ Christopher Meng (Fedora building)
 Jakub Čajka
 Giuseppe Scrivano
 Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
+Daurnimator (Code review, discussion, reports)
 Olle Liljenzin (Original DAFSA implementation and UTF-8 patch)

From 3a4dff880521bab2d2a6d39d007404ba3bb90b0b Mon Sep 17 00:00:00 2001
From: Olle Liljenzin <olle@liljenzin.se>
Date: Sun, 6 Nov 2016 15:31:18 +0100
Subject: [PATCH 35/42] Fixed documentation and error message to match the
 actual code.

---
 src/psl-make-dafsa | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/psl-make-dafsa b/src/psl-make-dafsa
index 0d4408a..46f0406 100755
--- a/src/psl-make-dafsa
+++ b/src/psl-make-dafsa
@@ -519,10 +519,10 @@ def parse_psl2c(infile, utf_mode):
   for line in lines:
     if line[-3:-1] != ', ':
       raise InputError('Expected "domainname, <digit>", found "%s"' % line)
-    # Technically the DAFSA format could support return values in range [0-31],
+    # Technically the DAFSA format could support return values in range [0x00-0x1E],
     # but the values below are the only with a defined meaning.
     if line[-1] not in '0123456789ABCDEF':
-      raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' % line[-1])
+      raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % line[-1])
 
 #  with open("gperf.out", 'w') as outfile:
 #    for line in sorted(lines):

From 761d938d2af05a1720541f912d51c738d9474609 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Sun, 6 Nov 2016 22:47:33 +0100
Subject: [PATCH 36/42] Fix name of Olle Liljenzin in src/psl-make-dafsa.1

---
 src/psl-make-dafsa.1 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/psl-make-dafsa.1 b/src/psl-make-dafsa.1
index b8c6e09..66d20c9 100644
--- a/src/psl-make-dafsa.1
+++ b/src/psl-make-dafsa.1
@@ -36,6 +36,6 @@ depends on options passed to it.
 .SH SEE ALSO
 .IR https://publicsuffix.org/ ", " https://github.com/rockdaboot/libpsl
 .SH COPYRIGHT
-\fBpsl-make-dafsa\fR was was written by Olle Jiljenzin as part of the Chromium project and
+\fBpsl-make-dafsa\fR was written by Olle Liljenzin as part of the Chromium project and
 has been modified by Tim Ruehsen and Daniel Kahn Gillmor.  The code
 and its documentation is governed by a BSD-style license.

From f01a81472b6d01321d97bff1b6910e740031d2e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Thu, 10 Nov 2016 11:05:00 +0100
Subject: [PATCH 37/42] Fix list email address in README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7ceb525..87658f8 100644
--- a/README.md
+++ b/README.md
@@ -124,7 +124,7 @@ Mailing List
 
 To join the mailing list send an email to
 
-<libpsl-bugs+subscribe@googlegroups.com>
+libpsl-bugs+subscribe@googlegroups.com
 
 and follow the instructions provided by the answer mail.
 

From 1b36fb00123020a0ecc9d440c88e875c37c834af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Thu, 10 Nov 2016 11:56:19 +0100
Subject: [PATCH 38/42] Remove -Wall from automake options to reduce verbosity

---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 2653141..e78f52d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,7 +1,7 @@
 
 AC_INIT([libpsl], [0.14.0], [tim.ruehsen@gmx.de], [libpsl], [https://github.com/rockdaboot/libpsl])
 AC_PREREQ([2.59])
-AM_INIT_AUTOMAKE([1.10 -Wall no-define foreign])
+AM_INIT_AUTOMAKE([1.10 no-define foreign])
 
 # Generate two configuration headers; one for building the library itself with
 # an autogenerated template, and a second one that will be installed alongside

From 3f276c7d1e1c1a4d394567ac79d011f28f45d65a Mon Sep 17 00:00:00 2001
From: Olle Liljenzin <olle@liljenzin.se>
Date: Sat, 12 Nov 2016 21:10:59 +0100
Subject: [PATCH 39/42] Fix psl-make-dafsa to work with python3

---
 src/psl-make-dafsa | 105 ++++++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 48 deletions(-)

diff --git a/src/psl-make-dafsa b/src/psl-make-dafsa
index 46f0406..b12d443 100755
--- a/src/psl-make-dafsa
+++ b/src/psl-make-dafsa
@@ -243,6 +243,10 @@ char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x00-0x
                       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  # 0xE0-0xEF
                       4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
 
+def to_bytes(n):
+  """Converts an integer value to a bytes object."""
+  return bytes(bytearray((n,)))
+
 def to_dafsa(words, utf_mode):
   """Generates a DAFSA from a word list and returns the source node.
 
@@ -253,35 +257,34 @@ def to_dafsa(words, utf_mode):
     raise InputError('The domain list must not be empty')
   def to_nodes(word, multibyte_length):
     """Split words into characters"""
-    byte = ord(word[0])
+    byte = ord(word[:1])
     if multibyte_length:
       # Consume next byte in multibyte sequence.
       if byte & 0xC0 != 0x80:
         raise InputError('Invalid UTF-8 multibyte sequence')
-      return chr(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
+      return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
     char_length = char_length_table[byte]
     if char_length == 1:
       # 7-bit printable ASCII.
       if len(word) == 1:
-        return chr(int(word[0], 16) & 0x0F), [None]
-      return word[0], [to_nodes(word[1:], 0)]
+        return to_bytes(int(word[:1], 16) & 0x0F), [None]
+      return word[:1], [to_nodes(word[1:], 0)]
     elif char_length > 1:
       # Leading byte in multibyte sequence.
       if not utf_mode:
         raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
       if len(word) <= char_length:
         raise InputError('Unterminated UTF-8 multibyte sequence')
-      return chr(0x1F), [(chr(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
+      return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
     # Unexpected character.
     raise InputError('Domain names must be printable ASCII or UTF-8')
 
   return [to_nodes(word, 0) for word in words]
 
-
 def to_words(node):
   """Generates a word list from all paths starting from an internal node."""
   if not node:
-    return ['']
+    return [b'']
   return [(node[0] + word) for child in node[1] for word in to_words(child)]
 
 
@@ -348,7 +351,7 @@ def join_suffixes(dafsa):
   """Generates a new DAFSA where nodes that represent the same word lists
   towards the sink are merged.
   """
-  nodemap = {frozenset(('',)): None}
+  nodemap = {frozenset((b'',)): None}
 
   def join(node):
     """Returns a macthing node. A new node is created if no matching node
@@ -446,7 +449,7 @@ def encode_prefix(label):
   will then be a prefix to the label in the child node.
   """
   assert label
-  return [ord(c) for c in reversed(label)]
+  return [c for c in bytearray(reversed(label))]
 
 
 def encode_label(label):
@@ -479,59 +482,61 @@ def encode(dafsa, utf_mode):
   return output
 
 
-def to_cxx(data):
+def to_cxx(data, codecs):
   """Generates C++ code from a list of encoded bytes."""
-  text = '/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
-  text += 'The byte array encodes effective tld names. See psl-make-dafsa source for'
-  text += ' documentation.'
-  text += '*/\n\n'
-  text += 'static const unsigned char kDafsa[%s] = {\n' % len(data)
+  text = b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
+  text += b'The byte array encodes effective tld names. See psl-make-dafsa source for'
+  text += b' documentation.'
+  text += b'*/\n\n'
+  text += b'static const unsigned char kDafsa['
+  text += bytes(str(len(data)), **codecs)
+  text += b'] = {\n'
   for i in range(0, len(data), 12):
-    text += '  '
-    text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
-    text += ',\n'
-  text += '};\n'
+    text += b'  '
+    text += bytes(', '.join('0x%02x' % byte for byte in data[i:i + 12]), **codecs)
+    text += b',\n'
+  text += b'};\n'
   return text
 
 
-def words_to_whatever(words, converter, utf_mode):
+def words_to_whatever(words, converter, utf_mode, codecs):
   """Generates C++ code from a word list"""
   dafsa = to_dafsa(words, utf_mode)
   for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
     dafsa = fun(dafsa)
-  return converter(encode(dafsa, utf_mode))
+  return converter(encode(dafsa, utf_mode), codecs)
 
 
-def words_to_cxx(words, utf_mode):
+def words_to_cxx(words, utf_mode, codecs):
   """Generates C++ code from a word list"""
-  return words_to_whatever(words, to_cxx, utf_mode)
+  return words_to_whatever(words, to_cxx, utf_mode, codecs)
 
 
-def words_to_binary(words, utf_mode):
+def words_to_binary(words, utf_mode, codecs):
   """Generates C++ code from a word list"""
-  return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, bytearray, utf_mode)
+  return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs)
 
 
-def parse_psl2c(infile, utf_mode):
+def parse_psl2c(infile, utf_mode, codecs):
   """Parses file generated by psl2c and extract strings and return code"""
-  lines = [line.strip() for line in infile]
+  lines = [bytes(line.strip(), **codecs) for line in infile]
 
   for line in lines:
-    if line[-3:-1] != ', ':
+    if line[-3:-1] != b', ':
       raise InputError('Expected "domainname, <digit>", found "%s"' % line)
     # Technically the DAFSA format could support return values in range [0x00-0x1E],
     # but the values below are the only with a defined meaning.
-    if line[-1] not in '0123456789ABCDEF':
-      raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % line[-1])
+    if line[-1] not in b'0123456789ABCDEF':
+      raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % line[-1:])
 
 #  with open("gperf.out", 'w') as outfile:
 #    for line in sorted(lines):
 #      outfile.write(line[:-3] + line[-1] + "\n")
 
-  return [line[:-3] + line[-1] for line in sorted(lines)]
+  return [line[:-3] + line[-1:] for line in sorted(lines)]
 
 
-def parse_psl(infile, utf_mode):
+def parse_psl(infile, utf_mode, codecs):
   """Parses PSL file and extract strings and return code"""
   PSL_FLAG_EXCEPTION = (1<<0)
   PSL_FLAG_WILDCARD = (1<<1)
@@ -543,37 +548,37 @@ def parse_psl(infile, utf_mode):
   section = 0
 
   for line in infile:
-    line = line.strip()
+    line = bytes(line.strip(), **codecs)
     if not line:
       continue
 
-    if line.startswith("//"):
+    if line.startswith(b'//'):
       if section == 0:
-        if "===BEGIN ICANN DOMAINS===" in line:
+        if b'===BEGIN ICANN DOMAINS===' in line:
           section = PSL_FLAG_ICANN
-        elif section == 0 and "===BEGIN PRIVATE DOMAINS===" in line:
+        elif section == 0 and b'===BEGIN PRIVATE DOMAINS===' in line:
           section = PSL_FLAG_PRIVATE
-      elif section == PSL_FLAG_ICANN and "===END ICANN DOMAINS===" in line:
+      elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line:
         section = 0
-      elif section == PSL_FLAG_PRIVATE and "===END PRIVATE DOMAINS===" in line:
+      elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line:
         section = 0
       continue # skip comments
 
-    if line[0] == '!':
+    if line[:1] == b'!':
       flags = PSL_FLAG_EXCEPTION | section
       line = line[1:]
-    elif line[0] == '*':
-      if line[1] != '.':
+    elif line[:1] == b'*':
+      if line[1:2] != b'.':
         print('Unsupported kind of rule (ignored): %s' % line)
         continue
       flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
       line = line[2:]
     else:
-      if not '.' in line:
+      if not b'.' in line:
         continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
       flags = PSL_FLAG_PLAIN | section
 
-    punycode = line.decode('utf-8').encode("idna")
+    punycode = line.decode('utf-8').encode('idna')
 
     if punycode in psl:
       """Found existing entry:
@@ -596,7 +601,7 @@ def parse_psl(infile, utf_mode):
 #    for (domain, flags) in sorted(psl.iteritems()):
 #      outfile.write(domain + "%X" % (flags & 0x0F) + "\n")
 
-  return [domain + "%X" % (flags & 0x0F) for (domain, flags) in sorted(psl.iteritems())]
+  return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())]
 
 
 def usage():
@@ -620,6 +625,10 @@ def main():
   parser = parse_psl2c
   utf_mode = True
 
+  codecs = dict()
+  if sys.version_info.major > 2:
+    codecs['encoding'] = 'utf-8'
+
   for arg in sys.argv[1:-2]:
     if arg.startswith('--input-format='):
       value = arg[15:].lower()
@@ -652,11 +661,11 @@ def main():
       usage()
 
   if sys.argv[-2] == '-':
-    with open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(sys.stdin, utf_mode), utf_mode))
+    with open(sys.argv[-1], 'wb') as outfile:
+      outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
   else:
-    with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(infile, utf_mode), utf_mode))
+    with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'wb') as outfile:
+      outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))
 
   return 0
 

From 5d32b80077b87f29af56146fe12eda555be98adb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Mon, 14 Nov 2016 12:08:20 +0100
Subject: [PATCH 40/42] Make API docs more detailed

---
 src/psl.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/psl.c b/src/psl.c
index aaf794a..753fad7 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -942,8 +942,9 @@ suffix_yes:
  *
  * For cookie domain checking see psl_is_cookie_domain_acceptable().
  *
- * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
+ * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
  * Other encodings likely result in incorrect return values.
+ * Use helper function psl_str_to_utf8lower() for normalization of @domain.
  *
  * @psl is a context returned by either psl_load_file(), psl_load_fp() or
  * psl_builtin().
@@ -972,8 +973,9 @@ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
  * @type specifies the PSL section where to perform the lookup. Valid values are
  * %PSL_TYPE_PRIVATE, %PSL_TYPE_ICANN and %PSL_TYPE_ANY.
  *
- * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
+ * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
  * Other encodings likely result in incorrect return values.
+ * Use helper function psl_str_to_utf8lower() for normalization of @domain.
  *
  * @psl is a context returned by either psl_load_file(), psl_load_fp() or
  * psl_builtin().
@@ -998,8 +1000,9 @@ int psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type)
  * This function finds the longest public suffix part of @domain by the means
  * of the [Mozilla Public Suffix List](https://publicsuffix.org).
  *
- * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
+ * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
  * Other encodings likely result in incorrect return values.
+ * Use helper function psl_str_to_utf8lower() for normalization of @domain.
  *
  * @psl is a context returned by either psl_load_file(), psl_load_fp() or
  * psl_builtin().
@@ -1037,8 +1040,9 @@ const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain)
  * This function finds the shortest private suffix part of @domain by the means
  * of the [Mozilla Public Suffix List](https://publicsuffix.org).
  *
- * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
+ * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
  * Other encodings likely result in incorrect return values.
+ * Use helper function psl_str_to_utf8lower() for normalization of @domain.
  *
  * @psl is a context returned by either psl_load_file(), psl_load_fp() or
  * psl_builtin().
@@ -1078,7 +1082,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
  * This function loads the public suffixes file named @fname.
  * To free the allocated resources, call psl_free().
  *
- * The suffixes are expected to be lowercase UTF-8 encoded if they are international.
+ * The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
  *
  * Returns: Pointer to a PSL context or %NULL on failure.
  *
@@ -1107,7 +1111,7 @@ psl_ctx_t *psl_load_file(const char *fname)
  * This function loads the public suffixes from a FILE pointer.
  * To free the allocated resources, call psl_free().
  *
- * The suffixes are expected to be lowercase UTF-8 encoded if they are international.
+ * The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
  *
  * Returns: Pointer to a PSL context or %NULL on failure.
  *
@@ -1286,8 +1290,8 @@ void psl_free(psl_ctx_t *psl)
  * The builtin data also contains punycode entries, one for each international domain name.
  *
  * If the generation of built-in data has been disabled during compilation, %NULL will be returned.
- * When using the builtin psl context, you can provide UTF-8 or punycode representations of domains to
- * functions like psl_is_public_suffix().
+ * When using the builtin psl context, you can provide UTF-8 (lowercase + NFKC) or ASCII/ACE (punycode)
+ * representations of domains to functions like psl_is_public_suffix().
  *
  * Returns: Pointer to the built in PSL data or NULL if this data is not available.
  *
@@ -1506,8 +1510,10 @@ static int _isip(const char *hostname)
  * This helper function checks whether @cookie_domain is an acceptable cookie domain value for the request
  * @hostname.
  *
- * For international domain names both, @hostname and @cookie_domain, have to be either in lowercase UTF-8
- * or in ASCII form (punycode). Other encodings or mixing UTF-8 and punycode likely result in incorrect return values.
+ * For international domain names both, @hostname and @cookie_domain, have to be either in UTF-8 (lowercase + NFKC)
+ * or in ASCII/ACE (punycode) format. Other encodings or mixing UTF-8 and punycode likely result in incorrect return values.
+ *
+ * Use helper function psl_str_to_utf8lower() for normalization of @hostname and @cookie_domain.
  *
  * Examples:
  * 1. Cookie domain 'example.com' would be acceptable for hostname 'www.example.com',
@@ -1564,8 +1570,8 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
  * @locale: locale of @str for to lowercase conversion, e.g. 'de' or %NULL
  * @lower: return value containing the converted string
  *
- * This helper function converts a string to lowercase UTF-8 representation.
- * Lowercase UTF-8 is needed as input to the domain checking functions.
+ * This helper function converts a string to UTF-8 lowercase + NFKC representation.
+ * Lowercase + NFKC UTF-8 is needed as input to the domain checking functions.
  *
  * @lower is set to %NULL on error.
  *

From 3909351697b2bdcde255ae017cf27cc1c5d7caac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Mon, 14 Nov 2016 12:36:16 +0100
Subject: [PATCH 41/42] Update PSL submodule

---
 list | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/list b/list
index 1df90f8..41a519a 160000
--- a/list
+++ b/list
@@ -1 +1 @@
-Subproject commit 1df90f84db1a041991a48e46e786705f7161ab4c
+Subproject commit 41a519ad34cf86ff4470b967d9e4755d72b63a6c

From d83bc6d52305534b239ad5ea71ea1fe1874ea237 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20R=C3=BChsen?= <tim.ruehsen@gmx.de>
Date: Mon, 14 Nov 2016 12:50:48 +0100
Subject: [PATCH 42/42] Release v0.15.0

---
 NEWS         | 9 +++++++++
 configure.ac | 4 ++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/NEWS b/NEWS
index adc7151..e9f0661 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,14 @@
 Copyright (C) 2014-2016 Tim Rühsen
 
+14.11.2016 Release V0.15.0
+  * Python3 compatibility for psl-make-dafsa
+  * Support for UTF-8 in DAFSA data
+  * Skip punycode conversion if DAFSA has UTF-8
+  * Better code coverage by test suite
+  * Code cleanup and enhancements
+  * Install man pages for psl-make-dafsa and psl
+  * Enhancements to the documentation
+
 30.07.2016 Release V0.14.0
   * Remove unneeded libraries from tools/psl link step
   * Use https instead of http where possible
diff --git a/configure.ac b/configure.ac
index e78f52d..58904ad 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,5 +1,5 @@
 
-AC_INIT([libpsl], [0.14.0], [tim.ruehsen@gmx.de], [libpsl], [https://github.com/rockdaboot/libpsl])
+AC_INIT([libpsl], [0.15.0], [tim.ruehsen@gmx.de], [libpsl], [https://github.com/rockdaboot/libpsl])
 AC_PREREQ([2.59])
 AM_INIT_AUTOMAKE([1.10 no-define foreign])
 
@@ -88,7 +88,7 @@ PKG_PROG_PKG_CONFIG
 # 4. If any interfaces have been added, removed, or changed since the last update, increment current, and set revision to 0.
 # 5. If any interfaces have been added since the last public release, then increment age.
 # 6. If any existing interfaces have been removed or changed since the last public release, then set age to 0. 
-AC_SUBST([LIBPSL_SO_VERSION], [5:1:0])
+AC_SUBST([LIBPSL_SO_VERSION], [5:2:0])
 AC_SUBST([LIBPSL_VERSION], $VERSION)
 
 # Check for enable/disable builtin PSL data