Release v0.15.0

-----BEGIN PGP SIGNATURE----- iQJHBAABCgAxFiEEHLJ9vJhhSy1YQWRtCDAttqJnBCgFAlgppfATHHRpbS5ydWVo c2VuQGdteC5kZQAKCRAIMC22omcEKGy/D/9iduEEwzSDt22U6MxmqD77hvgB9hQn 8Xn7CsTye408EUlw2ENYg4H/V3xNQN7ZbA4wJi20FmcniFhSUbSv9UD5Vr2FSTZS NJ1EpAbqljswE5x49u3lWRyo8XOEbVdWZS66+E5W9T/0Nl6kLUk4nYkBE6LBQGhp vd6+p74kqpjJGHhrZ4uYV5bkttoeSee/arGzvWTR3kmgERVCm9Qr90ldOx3Sp91s iqwb6RpDVkL3q5sA9bOfrpEDdADJdQYLr1BkkTOb7ZA52uEhdU6nEyfswoJsaBuI aj1hOgspekVqEs7ZUpltnT2GPbFyXtj338SA0738xxZaTm/eYzvNea5Fnpg4fnQb /w7I++IZGmdXljQnk1gtqzIgxCwia34u2/T4XgEpyd/h9A5PUdjo2EKPtBgHRFG7 GnK9IRgLHqdxZFpfiUyp2zIZL8+/PUlD5Ekwi1D3Wgc5PSOO0rMHR1IWzCmpopbU Mo9E511RcIdsn+IStB1gwclT5qk1fo3n5dcQBBXtpPTEJ6CRedLK+WcbLyhh3R0Z ham1D8t3kVDQgfg57mEJOIS5sgcLj5LR3ydya5ELf3pS6FVo4qvBO4Sp3E6wbgpE 9n5D150bKyv+RkTuNTgW8uahhYdR++bXUPWbaZReGVxKy3VB7VikDusRfnVFej9c cJP1HAskz6qTwA== =ksJN -----END PGP SIGNATURE----- Merge tag 'libpsl-0.15.0' into debian Release v0.15.0
2016-11-15 08:38:51 +09:00 · 2016-11-15 08:38:51 +09:00 · 4ef2e7c54b
parent 4ad905d13a d83bc6d523
commit 4ef2e7c54b
23 changed files with 630 additions and 206 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,7 @@
 *.exe
+*.gcda
+*.gcno
+*.gcov
 *.gz
 *.la
 *.lo
@ -10,6 +13,7 @@
 *.cache
 *.plist
 *.stamp
+ABOUT-NLS
 aclocal.m4
 ar-lib
 autom4te.cache/
@ -43,6 +47,8 @@ gtk-doc.m4
 gtk-doc.make
 include/libpsl.h
 install-sh
+lcov/
+libpsl.info
 libpsl.pc
 libtool
 ltmain.sh
@ -67,10 +73,13 @@ po/remove-potcdate.sed
 po/stamp-po
 src/psl2c
 src/suffixes.c
+src/suffixes_dafsa.c
 stamp-h1
 test-driver
 tests/*.log
 tests/*.trs
+tests/psl.dafsa
+tests/psl_ascii.dafsa
 tests/test-is-cookie-domain-acceptable
 tests/test-is-public
 tests/test-is-public-all
--- a/.travis.yml
+++ b/.travis.yml
@ -34,6 +34,7 @@ addons:
            - libicu-dev
            - libunistring0
            - libunistring-dev
+            - lcov

 script:
  - ./autogen.sh
@ -44,3 +45,4 @@ script:
  - ./configure --enable-runtime=$RUNTIME --disable-builtin && make clean && make -j4 && make check -j4
  - ./configure --enable-gtk-doc && make -j4 && make check -j4
  - make distcheck
+  - if [[ $CC == "gcc" && $RUNTIME == "libicu" ]]; then ./.travis_coveralls.sh; fi
--- a/.travis_coveralls.sh
+++ b/.travis_coveralls.sh
@ -0,0 +1,5 @@
+#!/bin/bash
+
+make check-coverage-libicu
+pip install --user cpp-coveralls
+coveralls --include libwget/ --include src/ -e "src/psl2c.c"
--- a/2
+++ b/2
@ -16,3 +16,5 @@ Christopher Meng (Fedora building)
 Jakub Čajka
 Giuseppe Scrivano
 Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
+Daurnimator (Code review, discussion, reports)
+Olle Liljenzin (Original DAFSA implementation and UTF-8 patch)
--- a/Makefile.am
+++ b/Makefile.am
@ -19,3 +19,28 @@ dist-hook:
 	mkdir -p $(distdir)/list/tests
 	cp -p $(PSL_FILE) $(distdir)/list
 	cp -p $(PSL_TESTFILE) $(distdir)/list/tests
+
+clean-local:
+	rm -rf */*.gc?? */*/*.gc?? libpsl.info lcov
+
+check-coverage:
+	if test -z "$(XLIB)"; then \
+		CFLAGS=$$CFLAGS" --coverage -O0" LDFLAGS=$$LDFLAGS" --coverage" ./configure --disable-runtime --disable-builtin; \
+	else \
+		CFLAGS=$$CFLAGS" --coverage -O0" LDFLAGS=$$LDFLAGS" --coverage" ./configure --enable-runtime=$(XLIB) --enable-builtin=$(XLIB); \
+	fi
+	$(MAKE) clean && $(MAKE)
+	lcov --capture --initial --directory src --output-file libpsl.info
+	$(MAKE) check
+	lcov --capture --directory src --output-file libpsl.info
+	lcov --remove libpsl.info 'src/psl2c.c' -o libpsl.info
+	genhtml --prefix . libpsl.info --legend --title "libpsl" --output-directory=lcov
+
+check-coverage-libidn:
+	XLIB=libidn $(MAKE) check-coverage
+
+check-coverage-libidn2:
+	XLIB=libidn2 $(MAKE) check-coverage
+
+check-coverage-libicu:
+	XLIB=libicu $(MAKE) check-coverage
--- a/9
+++ b/9
@ -1,5 +1,14 @@
 Copyright (C) 2014-2016 Tim Rühsen

+14.11.2016 Release V0.15.0
+  * Python3 compatibility for psl-make-dafsa
+  * Support for UTF-8 in DAFSA data
+  * Skip punycode conversion if DAFSA has UTF-8
+  * Better code coverage by test suite
+  * Code cleanup and enhancements
+  * Install man pages for psl-make-dafsa and psl
+  * Enhancements to the documentation
+
 30.07.2016 Release V0.14.0
  * Remove unneeded libraries from tools/psl link step
  * Use https instead of http where possible
--- a/README.md
+++ b/README.md
@ -1,4 +1,12 @@
-[![Build Status](https://travis-ci.org/rockdaboot/libpsl.png?branch=master)](https://travis-ci.org/rockdaboot/libpsl)
+[![Travis-CI Status](https://travis-ci.org/rockdaboot/libpsl.png?branch=master)](https://travis-ci.org/rockdaboot/libpsl)
+[![Coverity Scan](https://img.shields.io/coverity/scan/10227.svg)](https://scan.coverity.com/projects/rockdaboot-libpsl)
+[![Coverage Status](https://coveralls.io/repos/github/rockdaboot/libpsl/badge.svg?branch=master)](https://coveralls.io/github/rockdaboot/libpsl?branch=master)
+
+Solaris OpenCSW [![Build Status Solaris amd64](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-amd64)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-amd64)
+[![Build Status Solaris i386](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-i386)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-i386)
+[![Build Status Solaris Sparc](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-sparc)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-sparc)
+[![Build Status Solaris SparcV9](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-sparcv9)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-sparcv9)
+

 libpsl - C library to handle the Public Suffix List
 ===================================================
@ -116,7 +124,7 @@ Mailing List

 To join the mailing list send an email to

-<libpsl-bugs+subscribe@googlegroups.com>
+libpsl-bugs+subscribe@googlegroups.com

 and follow the instructions provided by the answer mail.

--- a/configure.ac
+++ b/configure.ac
@ -1,7 +1,7 @@

-AC_INIT([libpsl], [0.14.0], [tim.ruehsen@gmx.de], [libpsl], [https://github.com/rockdaboot/libpsl])
+AC_INIT([libpsl], [0.15.0], [tim.ruehsen@gmx.de], [libpsl], [https://github.com/rockdaboot/libpsl])
 AC_PREREQ([2.59])
-AM_INIT_AUTOMAKE([1.10 -Wall no-define foreign])
+AM_INIT_AUTOMAKE([1.10 no-define foreign])

 # Generate two configuration headers; one for building the library itself with
 # an autogenerated template, and a second one that will be installed alongside
@ -88,7 +88,7 @@ PKG_PROG_PKG_CONFIG
 # 4. If any interfaces have been added, removed, or changed since the last update, increment current, and set revision to 0.
 # 5. If any interfaces have been added since the last public release, then increment age.
 # 6. If any existing interfaces have been removed or changed since the last public release, then set age to 0. 
-AC_SUBST([LIBPSL_SO_VERSION], [5:1:0])
+AC_SUBST([LIBPSL_SO_VERSION], [5:2:0])
 AC_SUBST([LIBPSL_VERSION], $VERSION)

 # Check for enable/disable builtin PSL data
@ -168,7 +168,7 @@ if test "$enable_runtime" = "libicu" -o "$enable_builtin" = "libicu"; then
      [AC_LANG_PROGRAM(
        [[#include <unicode/ustring.h>]],
        [[u_strToUTF8(NULL, 0, NULL, NULL, 0, NULL);]])],
-      [HAVE_LIBICU=yes; AC_MSG_RESULT([yes])],
+      [HAVE_LIBICU=yes; LIBICU_LIBS="-licuuc"; AC_MSG_RESULT([yes])],
      [AC_MSG_RESULT([no]); AC_MSG_ERROR(You requested libicu but it is not installed.)])
    LIBS=$OLDLIBS
  ])
@ -191,7 +191,7 @@ fi
 if test "x$HAVE_LIBIDN2" = "xyes" -o "x$HAVE_LIBIDN" = "xyes"; then
  # Check for libunistring, we need it for psl_str_to_utf8lower()
  OLDLIBS=$LIBS
-  AC_SEARCH_LIBS(u8_tolower, unistring, HAVE_UNISTRING=yes, AC_MSG_ERROR(You requested libidn2 but libunistring is not installed.))
+  AC_SEARCH_LIBS(u8_tolower, unistring, HAVE_UNISTRING=yes, AC_MSG_ERROR(You requested libidn2|libidn but libunistring is not installed.))
  LIBS=$OLDLIBS
 fi

--- a/contrib/check-hard
+++ b/contrib/check-hard
@ -51,6 +51,7 @@ for CC in gcc clang; do
      for xLCALL in C tr_TR.utf8; do
        export TESTS_ENVIRONMENT="LC_ALL=$xLCALL VALGRIND_TESTS=$xVALGRIND"
        echo "    *** TESTS_ENVIRONMENT=\"$TESTS_ENVIRONMENT\"" make check -j$CORES
+        make clean > /dev/null
        make check -j$CORES > /dev/null
      done
    done
--- a/include/libpsl.h.in
+++ b/include/libpsl.h.in
@ -53,10 +53,11 @@ extern "C" {
 * psl_error_t:
 * @PSL_SUCCESS: Successful return.
 * @PSL_ERR_INVALID_ARG: Invalid argument.
- * @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter
+ * @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter.
 * @PSL_ERR_TO_UTF16: Failed to convert to utf-16.
 * @PSL_ERR_TO_LOWER: Failed to convert utf-16 to lowercase.
 * @PSL_ERR_TO_UTF8: Failed to convert utf-16 to utf-8.
+ * @PSL_ERR_NO_MEM: Failed to allocate memory.
 *
 * Return codes for PSL functions.
 * Negative return codes mean failure.
@ -66,9 +67,10 @@ typedef enum {
 	PSL_SUCCESS = 0,
 	PSL_ERR_INVALID_ARG = -1,
 	PSL_ERR_CONVERTER = -2, /* failed to open libicu utf-16 converter */
-	PSL_ERR_TO_UTF16 = -3, /* failed to convert to utf-16 */
-	PSL_ERR_TO_LOWER = -4, /* failed to convert utf-16 to lowercase */
-	PSL_ERR_TO_UTF8 = -5 /* failed to convert utf-16 to utf-8 */
+	PSL_ERR_TO_UTF16 = -3,  /* failed to convert to utf-16 */
+	PSL_ERR_TO_LOWER = -4,  /* failed to convert utf-16 to lowercase */
+	PSL_ERR_TO_UTF8 = -5,   /* failed to convert utf-16 to utf-8 */
+	PSL_ERR_NO_MEM = -6    /* failed to allocate memory */
 } psl_error_t;

 typedef struct _psl_ctx_st psl_ctx_t;
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 1df90f84db1a041991a48e46e786705f7161ab4c
+Subproject commit 41a519ad34cf86ff4470b967d9e4755d72b63a6c
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -11,7 +11,7 @@ libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
 # include ABI version information
 libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
 if WITH_LIBICU
-  libpsl_la_LDFLAGS += -licuuc
+  libpsl_la_LDFLAGS += $(LIBICU_LIBS)
 endif
 if WITH_LIBIDN2
  libpsl_la_LDFLAGS += -lidn2 -lunistring
@ -24,7 +24,7 @@ noinst_PROGRAMS = psl2c
 psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
 psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/psl-make-dafsa\"
 if BUILTIN_GENERATOR_LIBICU
-  psl2c_LDADD = -licuuc
+  psl2c_LDADD = $(LIBICU_LIBS)
 endif
 if BUILTIN_GENERATOR_LIBIDN2
  psl2c_LDADD = @LTLIBICONV@ -lidn2 -lunistring
@ -39,3 +39,5 @@ suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
 	./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c

 EXTRA_DIST = psl-make-dafsa LICENSE.chromium
+
+dist_man_MANS = psl-make-dafsa.1
--- a/src/lookup_string_in_fixed_set.c
+++ b/src/lookup_string_in_fixed_set.c
@ -21,6 +21,48 @@

 #define CHECK_LT(a, b) if ((a) >= b) return 0

+static const char multibyte_length_table[16] = {
+	0, 0, 0, 0,	 /* 0x00-0x3F */
+	0, 0, 0, 0,	 /* 0x40-0x7F */
+	0, 0, 0, 0,	 /* 0x80-0xBF */
+	2, 2, 3, 4,	 /* 0xC0-0xFF */
+};
+
+
+/**
+ * Get length of multibyte character sequence starting at a given byte.
+ * Returns zero if the byte is not a valid leading byte in UTF-8.
+ */
+static int GetMultibyteLength(char c) {
+	return multibyte_length_table[((unsigned char)c) >> 4];
+}
+
+/**
+ * Moves pointers one byte forward.
+ */
+static void NextPos(const unsigned char** pos,
+	const char** key,
+	const char** multibyte_start)
+{
+	++*pos;
+	if (*multibyte_start) {
+		/* Advance key to next byte in multibyte sequence. */
+		++*key;
+		/* Reset multibyte_start if last byte in multibyte sequence was consumed. */
+		if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
+			*multibyte_start = 0;
+	} else {
+		if (GetMultibyteLength(**key)) {
+			/* Multibyte prefix was matched in the dafsa, start matching multibyte
+			 * content in next round. */
+			*multibyte_start = *key;
+		} else {
+			/* Advance key as a single byte character was matched. */
+			++*key;
+		}
+	}
+}
+
 /*
 * Read next offset from pos.
 * Returns true if an offset could be read, false otherwise.
@ -71,6 +113,35 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
 	return(*offset & 0x80) != 0;
 }

+/*
+ * Check if byte at offset matches first character in key.
+ * This version assumes a range check was already performed by the caller.
+ */
+
+static int IsMatchUnchecked(const unsigned char matcher,
+	const char* key,
+	const char* multibyte_start)
+{
+	if (multibyte_start) {
+		/* Multibyte matching mode. */
+		if (multibyte_start == key) {
+			/* Match leading byte, which will also match the sequence length. */
+			return (matcher ^ 0x80) == (const unsigned char)*key;
+		} else {
+			/* Match following bytes. */
+			return (matcher ^ 0xC0) == (const unsigned char)*key;
+		}
+	}
+	/* If key points at a leading byte in a multibyte sequence, but we are not yet
+	 * in multibyte mode, then the dafsa should contain a special byte to indicate
+	 * a mode switch. */
+	if (GetMultibyteLength(*key)) {
+		return matcher == 0x1F;
+	}
+	/* Normal matching of a single byte character. */
+	return matcher == (const unsigned char)*key;
+}
+
 /*
 * Check if byte at offset matches first character in key.
 * This version matches characters not last in label.
@ -78,10 +149,11 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)

 static int IsMatch(const unsigned char* offset,
 	const unsigned char* end,
-	const char* key)
+	const char* key,
+	const char* multibyte_start)
 {
 	CHECK_LT(offset, end);
-	return *offset == *key;
+	return IsMatchUnchecked(*offset, key, multibyte_start);
 }

 /*
@ -91,10 +163,11 @@ static int IsMatch(const unsigned char* offset,

 static int IsEndCharMatch(const unsigned char* offset,
 	const unsigned char* end,
-	const char* key)
+	const char* key,
+	const char* multibyte_start)
 {
 	CHECK_LT(offset, end);
-	return *offset == (*key | 0x80);
+	return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
 }

 /*
@ -104,10 +177,11 @@ static int IsEndCharMatch(const unsigned char* offset,

 static int GetReturnValue(const unsigned char* offset,
 	const unsigned char* end,
+	const char* multibyte_start,
 	int* return_value)
 {
 	CHECK_LT(offset, end);
-	if ((*offset & 0xE0) == 0x80) {
+	if (!multibyte_start && (*offset & 0xE0) == 0x80) {
 		*return_value = *offset & 0x0F;
 		return 1;
 	}
@ -140,6 +214,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 	const unsigned char* end = graph + length;
 	const unsigned char* offset = pos;
 	const char* key_end = key + key_length;
+	const char* multibyte_start = 0;

 	while (GetNextOffset(&pos, end, &offset)) {
 		/*char <char>+ end_char offsets
@ -153,11 +228,10 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,

 		if (key != key_end && !IsEOL(offset, end)) {
 			/* Leading <char> is not a match. Don't dive into this child */
-			if (!IsMatch(offset, end, key))
+			if (!IsMatch(offset, end, key, multibyte_start))
 				continue;
 			did_consume = 1;
-			++offset;
-			++key;
+			NextPos(&offset, &key, &multibyte_start);
 			/* Possible matches at this point:
 			 * <char>+ end_char offsets
 			 * <char>+ return value
@ -167,10 +241,9 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,

 			/* Remove all remaining <char> nodes possible */
 			while (!IsEOL(offset, end) && key != key_end) {
-				if (!IsMatch(offset, end, key))
+				if (!IsMatch(offset, end, key, multibyte_start))
 					return -1;
-				++key;
-				++offset;
+				NextPos(&offset, &key, &multibyte_start);
 			}
 		}
 		/* Possible matches at this point:
@ -182,7 +255,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 		if (key == key_end) {
 			int return_value;

-			if (GetReturnValue(offset, end, &return_value))
+			if (GetReturnValue(offset, end, multibyte_start, &return_value))
 				return return_value;
 			/* The DAFSA guarantees that if the first char is a match, all
 			 * remaining char elements MUST match if the key is truly present.
@ -191,14 +264,22 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
 				return -1;
 			continue;
 		}
-		if (!IsEndCharMatch(offset, end, key)) {
+		if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
 			if (did_consume)
 				return -1; /* Unexpected */
 			continue;
 		}
-		++key;
-		pos = ++offset; /* Dive into child */
+		NextPos(&offset, &key, &multibyte_start);
+		pos = offset; /* Dive into child */
 	}

 	return -1; /* No match */
 }
+
+/* prototype to skip warning with -Wmissing-prototypes */
+int _HIDDEN GetUtfMode(const unsigned char *graph, size_t length);
+
+int _HIDDEN GetUtfMode(const unsigned char *graph, size_t length)
+{
+	return length > 0 && graph[length - 1] < 0x80;
+}
--- a/src/psl-make-dafsa
+++ b/src/psl-make-dafsa
@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python
 # Copyright 2014 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE.chromium file.
@ -14,8 +14,9 @@ This python program fetches strings and return values from a gperf file
 and generates a C++ file with a byte array representing graph that can be
 used as a memory efficient replacement for the perfect hash table.

-The input strings are assumed to consist of printable 7-bit ASCII characters
-and the return values are assumed to be one digit integers.
+The input strings must consist of printable 7-bit ASCII characters or UTF-8
+multibyte sequences. Control characters in the range [0x00-0x1F] are not
+allowed. The return values must be one digit integers.

 In this program a DAFSA is a diamond shaped graph starting at a common
 source node and ending at a common sink node. All internal nodes contain
@ -47,8 +48,8 @@ The generated byte array can described by the following BNF:

 <byte> ::= < 8-bit value in range [0x00-0xFF] >

-<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
-<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
+<char> ::= < byte in range [0x1F-0x7F] >
+<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
 <return value> ::= < value + 0x80, byte in range [0x80-0x8F] >

 <offset1> ::= < byte in range [0x00-0x3F] >
@ -84,13 +85,18 @@ The generated byte array can described by the following BNF:
         | <prefix> <node>
         | <end_label>

-<dafsa> ::= <source>
-          | <dafsa> <node>
+<graph> ::= <source>
+          | <graph> <node>
+
+<version> ::= <empty>            # The DAFSA was generated in ASCII mode.
+          | < byte value 0x01 >  # The DAFSA was generated in UTF-8 mode.
+
+<dafsa> ::= <graph> <version>

 Decoding:

-<char> -> printable 7-bit ASCII character
-<end_char> & 0x7F -> printable 7-bit ASCII character
+<char> -> character
+<end_char> & 0x7F -> character
 <return value> & 0x0F -> integer
 <offset1 & 0x3F> -> integer
 ((<offset2> & 0x1F>) << 8) + <byte> -> integer
@ -105,6 +111,28 @@ between previous child node and next child node. Thus each offset links a node
 to a child node. The distance is always counted between start addresses, i.e.
 first byte in decoded offset or first byte in child node.

+Transcoding of UTF-8 multibyte sequences:
+
+The original DAFSA format was limited to 7-bit printable ASCII characters in
+range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
+By transcoding of such characters the new format preserves compatibility with
+old parsers, so that a DAFSA in the extended format can be used by an old
+parser without false positives, although strings containing transcoded
+characters will never match. Since the format is extended rather than being
+changed, a parser supporting the new format will automatically support data
+generated in the old format.
+
+Transcoding is performed by insertion of a start byte with the special value
+0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
+the range of printable ASCII.
+
+2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
+
+3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
+
+4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
+                00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
+
 Example 1:

 %%
@ -197,8 +225,29 @@ import sys
 class InputError(Exception):
  """Exception raised for errors in the input file."""

+# Length of a character starting at a given byte.
+char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x00-0x0F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x10-0x1F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x20-0x2F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x30-0x3F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x40-0x4F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x50-0x5F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x60-0x6F
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x70-0x7F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x80-0x8F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x90-0x9F
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xA0-0xAF
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xB0-0xBF
+                      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 0xC0-0xCF
+                      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 0xD0-0xDF
+                      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  # 0xE0-0xEF
+                      4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF

-def to_dafsa(words):
+def to_bytes(n):
+  """Converts an integer value to a bytes object."""
+  return bytes(bytearray((n,)))
+
+def to_dafsa(words, utf_mode):
  """Generates a DAFSA from a word list and returns the source node.

  Each word is split into characters so that each character is represented by
@ -206,20 +255,36 @@ def to_dafsa(words):
  """
  if not words:
    raise InputError('The domain list must not be empty')
-  def to_nodes(word):
+  def to_nodes(word, multibyte_length):
    """Split words into characters"""
-    if not 0x1F < ord(word[0]) < 0x80:
-      raise InputError('Domain names must be printable 7-bit ASCII')
-    if len(word) == 1:
-      return chr(int(word[0], 16) & 0x0F), [None]
-    return word[0], [to_nodes(word[1:])]
-  return [to_nodes(word) for word in words]
+    byte = ord(word[:1])
+    if multibyte_length:
+      # Consume next byte in multibyte sequence.
+      if byte & 0xC0 != 0x80:
+        raise InputError('Invalid UTF-8 multibyte sequence')
+      return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
+    char_length = char_length_table[byte]
+    if char_length == 1:
+      # 7-bit printable ASCII.
+      if len(word) == 1:
+        return to_bytes(int(word[:1], 16) & 0x0F), [None]
+      return word[:1], [to_nodes(word[1:], 0)]
+    elif char_length > 1:
+      # Leading byte in multibyte sequence.
+      if not utf_mode:
+        raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
+      if len(word) <= char_length:
+        raise InputError('Unterminated UTF-8 multibyte sequence')
+      return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
+    # Unexpected character.
+    raise InputError('Domain names must be printable ASCII or UTF-8')

+  return [to_nodes(word, 0) for word in words]

 def to_words(node):
  """Generates a word list from all paths starting from an internal node."""
  if not node:
-    return ['']
+    return [b'']
  return [(node[0] + word) for child in node[1] for word in to_words(child)]


@ -286,7 +351,7 @@ def join_suffixes(dafsa):
  """Generates a new DAFSA where nodes that represent the same word lists
  towards the sink are merged.
  """
-  nodemap = {frozenset(('',)): None}
+  nodemap = {frozenset((b'',)): None}

  def join(node):
    """Returns a macthing node. A new node is created if no matching node
@ -384,7 +449,7 @@ def encode_prefix(label):
  will then be a prefix to the label in the child node.
  """
  assert label
-  return [ord(c) for c in reversed(label)]
+  return [c for c in bytearray(reversed(label))]


 def encode_label(label):
@ -396,7 +461,7 @@ def encode_label(label):
  return buf


-def encode(dafsa):
+def encode(dafsa, utf_mode):
  """Encodes a DAFSA to a list of bytes"""
  output = []
  offsets = {}
@ -412,62 +477,66 @@ def encode(dafsa):

  output.extend(encode_links(dafsa, offsets, len(output)))
  output.reverse()
+  if utf_mode:
+    output.append(0x01)
  return output


-def to_cxx(data):
+def to_cxx(data, codecs):
  """Generates C++ code from a list of encoded bytes."""
-  text = '/* This file is generated. DO NOT EDIT!\n\n'
-  text += 'The byte array encodes effective tld names. See psl-make-dafsa source for'
-  text += ' documentation.'
-  text += '*/\n\n'
-  text += 'static const unsigned char kDafsa[%s] = {\n' % len(data)
+  text = b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
+  text += b'The byte array encodes effective tld names. See psl-make-dafsa source for'
+  text += b' documentation.'
+  text += b'*/\n\n'
+  text += b'static const unsigned char kDafsa['
+  text += bytes(str(len(data)), **codecs)
+  text += b'] = {\n'
  for i in range(0, len(data), 12):
-    text += '  '
-    text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
-    text += ',\n'
-  text += '};\n'
+    text += b'  '
+    text += bytes(', '.join('0x%02x' % byte for byte in data[i:i + 12]), **codecs)
+    text += b',\n'
+  text += b'};\n'
  return text


-def words_to_whatever(words, converter):
+def words_to_whatever(words, converter, utf_mode, codecs):
  """Generates C++ code from a word list"""
-  dafsa = to_dafsa(words)
+  dafsa = to_dafsa(words, utf_mode)
  for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
    dafsa = fun(dafsa)
-  return converter(encode(dafsa))
+  return converter(encode(dafsa, utf_mode), codecs)


-def words_to_cxx(words):
+def words_to_cxx(words, utf_mode, codecs):
  """Generates C++ code from a word list"""
-  return words_to_whatever(words, to_cxx)
+  return words_to_whatever(words, to_cxx, utf_mode, codecs)


-def words_to_binary(words):
+def words_to_binary(words, utf_mode, codecs):
  """Generates C++ code from a word list"""
-  return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, bytearray)
+  return b'.DAFSA@PSL_0   \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs)


-def parse_psl2c(infile):
+def parse_psl2c(infile, utf_mode, codecs):
  """Parses file generated by psl2c and extract strings and return code"""
-  lines = [line.strip() for line in infile]
+  lines = [bytes(line.strip(), **codecs) for line in infile]

  for line in lines:
-    if line[-3:-1] != ', ':
+    if line[-3:-1] != b', ':
      raise InputError('Expected "domainname, <digit>", found "%s"' % line)
-    # Technically the DAFSA format could support return values in range [0-31],
+    # Technically the DAFSA format could support return values in range [0x00-0x1E],
    # but the values below are the only with a defined meaning.
-    if line[-1] not in '0123456789ABCDEF':
-      raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' % line[-1])
+    if line[-1] not in b'0123456789ABCDEF':
+      raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % line[-1:])

 #  with open("gperf.out", 'w') as outfile:
 #    for line in sorted(lines):
 #      outfile.write(line[:-3] + line[-1] + "\n")

-  return [line[:-3] + line[-1] for line in sorted(lines)]
+  return [line[:-3] + line[-1:] for line in sorted(lines)]


-def parse_psl(infile):
+def parse_psl(infile, utf_mode, codecs):
  """Parses PSL file and extract strings and return code"""
  PSL_FLAG_EXCEPTION = (1<<0)
  PSL_FLAG_WILDCARD = (1<<1)
@ -479,39 +548,39 @@ def parse_psl(infile):
  section = 0

  for line in infile:
-    line = line.strip()
+    line = bytes(line.strip(), **codecs)
    if not line:
      continue

-    if line.startswith("//"):
+    if line.startswith(b'//'):
      if section == 0:
-        if "===BEGIN ICANN DOMAINS===" in line:
+        if b'===BEGIN ICANN DOMAINS===' in line:
          section = PSL_FLAG_ICANN
-        elif section == 0 and "===BEGIN PRIVATE DOMAINS===" in line:
+        elif section == 0 and b'===BEGIN PRIVATE DOMAINS===' in line:
          section = PSL_FLAG_PRIVATE
-      elif section == PSL_FLAG_ICANN and "===END ICANN DOMAINS===" in line:
+      elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line:
        section = 0
-      elif section == PSL_FLAG_PRIVATE and "===END PRIVATE DOMAINS===" in line:
+      elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line:
        section = 0
      continue # skip comments

-    if line[0] == '!':
+    if line[:1] == b'!':
      flags = PSL_FLAG_EXCEPTION | section
      line = line[1:]
-    elif line[0] == '*':
-      if line[1] != '.':
+    elif line[:1] == b'*':
+      if line[1:2] != b'.':
        print('Unsupported kind of rule (ignored): %s' % line)
        continue
      flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
      line = line[2:]
    else:
-      if not '.' in line:
+      if not b'.' in line:
        continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
      flags = PSL_FLAG_PLAIN | section

-    line = line.decode('utf-8').encode("idna")
+    punycode = line.decode('utf-8').encode('idna')

-    if line in psl:
+    if punycode in psl:
      """Found existing entry:
         Combination of exception and plain rule is ambiguous
           !foo.bar
@ -521,16 +590,18 @@ def parse_psl(infile):
           !foo.bar + *.foo.bar
            foo.bar + *.foo.bar
      """
-      print('Found %s/%X (now %X)' % line, psl[line], flags)
+      print('Found %s/%X (now %X)' % punycode, psl[punycode], flags)
      continue

-    psl[line] = flags
+    if utf_mode:
+      psl[line] = flags
+    psl[punycode] = flags

 #  with open("psl.out", 'w') as outfile:
 #    for (domain, flags) in sorted(psl.iteritems()):
 #      outfile.write(domain + "%X" % (flags & 0x0F) + "\n")

-  return [domain + "%X" % (flags & 0x0F) for (domain, flags) in sorted(psl.iteritems())]
+  return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())]


 def usage():
@ -538,8 +609,10 @@ def usage():
  print('usage: %s [options] infile outfile' % sys.argv[0])
  print('  --input-format=psl2c    infile has been generated by libpsl/psl2c utility (default)')
  print('  --input-format=psl      infile is a Public Suffix List file')
-  print('  --output-format=cxx     Write DAFSA as C/C++ code')
+  print('  --output-format=cxx     Write DAFSA as C/C++ code (default)')
  print('  --output-format=binary  Write DAFSA binary data')
+  print('  --encoding=ascii        7-bit ASCII mode')
+  print('  --encoding=utf-8        UTF-8 mode (default)')
  exit(1)


@ -550,6 +623,11 @@ def main():

  converter = words_to_cxx
  parser = parse_psl2c
+  utf_mode = True
+
+  codecs = dict()
+  if sys.version_info.major > 2:
+    codecs['encoding'] = 'utf-8'

  for arg in sys.argv[1:-2]:
    if arg.startswith('--input-format='):
@ -570,15 +648,24 @@ def main():
      else:
        print("Unknown output format '%s'" % value)
        return 1
+    elif arg.startswith('--encoding='):
+      value = arg[11:].lower()
+      if value == 'ascii':
+        utf_mode = False
+      elif value == 'utf-8':
+        utf_mode = True
+      else:
+        print("Unknown encoding '%s'" % value)
+        return 1
    else:
      usage()

  if sys.argv[-2] == '-':
-    with open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(sys.stdin)))
+    with open(sys.argv[-1], 'wb') as outfile:
+      outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
  else:
-    with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
-      outfile.write(converter(parser(infile)))
+    with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'wb') as outfile:
+      outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))

  return 0

--- a/src/psl-make-dafsa.1
+++ b/src/psl-make-dafsa.1
@ -28,9 +28,14 @@ depends on options passed to it.
 \fBcxx\fR: (default) output is C/C++ code
 .br
 \fBbinary\fR: output is an architecture-independent binary format
+.TP
+\fB\-\-encoding=\fR[\fIutf-8\fR|\fIascii\fR]
+\fButf-8\fR: (default) UTF-8 mode (output contains UTF-8 + punycode)
+.br
+\fBascii\fR: (deprecated) 7-bit ASCII mode (output contains punycode only)
 .SH SEE ALSO
 .IR https://publicsuffix.org/ ", " https://github.com/rockdaboot/libpsl
 .SH COPYRIGHT
-\fBpsl-make-dafsa\fR was originally part of the Chromium project, and
+\fBpsl-make-dafsa\fR was written by Olle Liljenzin as part of the Chromium project and
 has been modified by Tim Ruehsen and Daniel Kahn Gillmor.  The code
 and its documentation is governed by a BSD-style license.
--- a/src/psl.c
+++ b/src/psl.c
@ -73,6 +73,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#include <time.h>
 #include <errno.h>
 #include <limits.h> /* for UINT_MAX */
 #include <langinfo.h>
@ -101,9 +102,6 @@

 #include <libpsl.h>

-/* number of elements within an array */
-#define countof(a) (sizeof(a)/sizeof(*(a)))
-
 #ifndef HAVE_STRNDUP
 /* I found no strndup on my old SUSE 7.3 test system (gcc 2.95) */

@ -176,10 +174,11 @@ struct _psl_ctx_st {
 	size_t
 		dafsa_size;
 	int
-		mode,
 		nsuffixes,
 		nexceptions,
 		nwildcards;
+	unsigned
+		utf8 : 1; /* 1: data contains UTF-8 + punycode encoded rules */
 };

 /* include the PSL data compiled by 'psl2c' */
@ -263,11 +262,21 @@ static int _vector_add(_psl_vector_t *v, const _psl_entry_t *elem)
 	if (v) {
 		void *elemp;

-		elemp = malloc(sizeof(_psl_entry_t));
+		if (!(elemp = malloc(sizeof(_psl_entry_t))))
+			return -1;
+
 		memcpy(elemp, elem, sizeof(_psl_entry_t));

-		if (v->max == v->cur)
-			v->entry = realloc(v->entry, (v->max *= 2) * sizeof(_psl_entry_t *));
+		if (v->max == v->cur) {
+			void *m = realloc(v->entry, (v->max *= 2) * sizeof(_psl_entry_t *));
+
+			if (m)
+				v->entry = m;
+			else {
+				free(elemp);
+				return -1;
+			}
+		}

 		v->entry[v->cur++] = elemp;
 		return v->cur - 1;
@ -517,36 +526,37 @@ static enum punycode_status punycode_encode(
 static ssize_t _utf8_to_utf32(const char *in, size_t inlen, punycode_uint *out, size_t outlen)
 {
 	size_t n = 0;
-	unsigned char *s;
+	const unsigned char *s = (void *)in;
+	const unsigned char *e = (void *)(in + inlen);

 	if (!outlen)
 		return -1;

 	outlen--;

-	s = alloca(inlen + 1);
-	memcpy(s, in, inlen);
-	s[inlen] = 0;
+	while (n < outlen) {
+		size_t inleft = e - s;

-	while (*s && n < outlen) {
-		if ((*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
+		if (inleft >= 1 && (*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
 			out[n++] = *s;
 			s++;
-		} else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
+		} else if (inleft >= 2 && (*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
 			if ((s[1] & 0xC0) != 0x80)
 				return -1;
 			out[n++] = ((*s & 0x1F) << 6) | (s[1] & 0x3F);
 			s += 2;
-		} else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
+		} else if (inleft >= 3 && (*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
 			if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
 				return -1;
 			out[n++] = ((*s & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
 			s += 3;
-		} else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
+		} else if (inleft >= 4 && (*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
 			if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
 				return -1;
 			out[n++] = ((*s & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
 			s += 4;
+		} else if (!inleft) {
+			break;
 		} else
 			return -1;
 	}
@ -575,7 +585,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
 		/* printf("s=%s inlen=%zd\n", label, labellen); */

 		if (_mem_is_ascii(label, labellen)) {
-			if (outlen + labellen + (e != NULL)>= outsize)
+			if (outlen + labellen + (e != NULL) >= outsize)
 				return 1;

 			/* printf("outlen=%zd labellen=%zd\n", outlen, labellen); */
@ -587,7 +597,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
 			if (outlen + labellen + (e != NULL) + 4 >= outsize)
 				return 1;

-			if ((inputlen = _utf8_to_utf32(label, labellen, input, sizeof (input) / sizeof (input[0]))) < 0)
+			if ((inputlen = _utf8_to_utf32(label, labellen, input, countof(input))) < 0)
 				return 1;

 			memcpy(out + outlen, "xn--", 4);
@ -609,7 +619,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
 }
 #endif

-static inline int _isspace_ascii(const char c)
+static int _isspace_ascii(const char c)
 {
 	return c == ' ' || c == '\t' || c == '\r' || c == '\n';
 }
@ -691,15 +701,15 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
 		UChar utf16_dst[128], utf16_src[128];
 		int32_t utf16_src_length;

-		u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, utf8, -1, &status);
+		u_strFromUTF8(utf16_src, countof(utf16_src), &utf16_src_length, utf8, -1, &status);
 		if (U_SUCCESS(status)) {
-			int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
+			int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, countof(utf16_dst), &info, &status);
 			if (U_SUCCESS(status)) {
 				u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
 				if (U_SUCCESS(status)) {
 					if (ascii)
-						*ascii = strdup(lookupname);
-					ret = 0;
+						if ((*ascii = strdup(lookupname)))
+							ret = 0;
 				} /* else
 					fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
 			} /* else
@ -709,32 +719,21 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
 	}
 #elif defined(WITH_LIBIDN2)
 	int rc;
-	uint8_t *lower, resbuf[256];
-	size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
+	uint8_t *lower;
+	size_t len = u8_strlen((uint8_t *)utf8) + 1;

 	/* we need a conversion to lowercase */
-	lower = u8_tolower((uint8_t *)utf8, u8_strlen((uint8_t *)utf8), 0, UNINORM_NFKC, resbuf, &len);
-	if (!lower) {
+	if (!(lower = u8_tolower((uint8_t *)utf8, len, 0, UNINORM_NFKC, NULL, &len))) {
 		/* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", utf8, errno); */
 		return -1;
 	}

-	/* u8_tolower() does not terminate the result string */
-	if (lower == resbuf) {
-		lower[len]=0;
-	} else {
-		uint8_t *tmp = lower;
-		lower = (uint8_t *)strndup((char *)lower, len);
-		free(tmp);
-	}
-
 	if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK) {
 		ret = 0;
 	} /* else
 		fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */

-	if (lower != resbuf)
-		free(lower);
+	free(lower);
 #elif defined(WITH_LIBIDN)
 	int rc;

@ -754,8 +753,8 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *

 	if (_domain_to_punycode(utf8, lookupname, sizeof(lookupname)) == 0) {
 		if (ascii)
-			*ascii = strdup(lookupname);
-		ret = 0;
+			if ((*ascii = strdup(lookupname)))
+				ret = 0;
 	}
 #endif

@ -776,16 +775,17 @@ static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_en
 			/* fprintf(stderr, "toASCII '%s' -> '%s'\n", e->label_buf, lookupname); */
 			_suffix_init(&suffix, lookupname, strlen(lookupname));
 			suffix.flags = e->flags;
-			suffixp = _vector_get(v, _vector_add(v, &suffix));
-			suffixp->label = suffixp->label_buf; /* set label to changed address */
+			if ((suffixp = _vector_get(v, _vector_add(v, &suffix))))
+				suffixp->label = suffixp->label_buf; /* set label to changed address */
 		} /* else ignore */

 		free(lookupname);
 	}
 }

-/* prototype */
+/* prototypes */
 int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
+int GetUtfMode(const unsigned char *graph, size_t length);

 static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
 {
@ -814,6 +814,14 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
 		return 1;
 	}

+	if (psl->utf8 || psl == &_builtin_psl)
+		need_conversion = 0;
+
+#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
+	if (psl == &_builtin_psl)
+		need_conversion = 0;
+#endif
+
 	if (need_conversion) {
 		_psl_idna_t *idna = _psl_idna_open();

@ -934,8 +942,9 @@ suffix_yes:
 *
 * For cookie domain checking see psl_is_cookie_domain_acceptable().
 *
- * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
- * Other encodings result in unexpected behavior.
+ * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
+ * Other encodings likely result in incorrect return values.
+ * Use helper function psl_str_to_utf8lower() for normalization of @domain.
 *
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
 * psl_builtin().
@ -964,8 +973,9 @@ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
 * @type specifies the PSL section where to perform the lookup. Valid values are
 * %PSL_TYPE_PRIVATE, %PSL_TYPE_ICANN and %PSL_TYPE_ANY.
 *
- * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
- * Other encodings result in unexpected behavior.
+ * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
+ * Other encodings likely result in incorrect return values.
+ * Use helper function psl_str_to_utf8lower() for normalization of @domain.
 *
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
 * psl_builtin().
@ -990,8 +1000,9 @@ int psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type)
 * This function finds the longest public suffix part of @domain by the means
 * of the [Mozilla Public Suffix List](https://publicsuffix.org).
 *
- * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
- * Other encodings result in unexpected behavior.
+ * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
+ * Other encodings likely result in incorrect return values.
+ * Use helper function psl_str_to_utf8lower() for normalization of @domain.
 *
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
 * psl_builtin().
@ -1029,8 +1040,9 @@ const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain)
 * This function finds the shortest private suffix part of @domain by the means
 * of the [Mozilla Public Suffix List](https://publicsuffix.org).
 *
- * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
- * Other encodings result in unexpected behavior.
+ * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
+ * Other encodings likely result in incorrect return values.
+ * Use helper function psl_str_to_utf8lower() for normalization of @domain.
 *
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
 * psl_builtin().
@ -1070,7 +1082,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
 * This function loads the public suffixes file named @fname.
 * To free the allocated resources, call psl_free().
 *
- * The suffixes are expected to be lowercase UTF-8 encoded if they are international.
+ * The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
 *
 * Returns: Pointer to a PSL context or %NULL on failure.
 *
@ -1099,7 +1111,7 @@ psl_ctx_t *psl_load_file(const char *fname)
 * This function loads the public suffixes from a FILE pointer.
 * To free the allocated resources, call psl_free().
 *
- * The suffixes are expected to be lowercase UTF-8 encoded if they are international.
+ * The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
 *
 * Returns: Pointer to a PSL context or %NULL on failure.
 *
@ -1152,6 +1164,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
 			psl->dafsa = m;

 		psl->dafsa_size = len;
+		psl->utf8 = !!GetUtfMode(psl->dafsa, len);

 		return psl;
 	}
@ -1163,6 +1176,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
 	 *  as of 19.02.2014, the list at https://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions.
 	 */
 	psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array);
+	psl->utf8 = 1; /* we put UTF-8 and punycode rules in the lookup vector */

 	do {
 		while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
@ -1231,9 +1245,10 @@ psl_ctx_t *psl_load_fp(FILE *fp)
 				suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix));
 			}

-			suffixp->label = suffixp->label_buf; /* set label to changed address */
-
-			_add_punycode_if_needed(idna, psl->suffixes, suffixp);
+			if (suffixp) {
+				suffixp->label = suffixp->label_buf; /* set label to changed address */
+				_add_punycode_if_needed(idna, psl->suffixes, suffixp);
+			}
 		}
 	} while ((linep = fgets(buf, sizeof(buf), fp)));

@ -1275,8 +1290,8 @@ void psl_free(psl_ctx_t *psl)
 * The builtin data also contains punycode entries, one for each international domain name.
 *
 * If the generation of built-in data has been disabled during compilation, %NULL will be returned.
- * When using the builtin psl context, you can provide UTF-8 or punycode representations of domains to
- * functions like psl_is_public_suffix().
+ * When using the builtin psl context, you can provide UTF-8 (lowercase + NFKC) or ASCII/ACE (punycode)
+ * representations of domains to functions like psl_is_public_suffix().
 *
 * Returns: Pointer to the built in PSL data or NULL if this data is not available.
 *
@ -1495,8 +1510,10 @@ static int _isip(const char *hostname)
 * This helper function checks whether @cookie_domain is an acceptable cookie domain value for the request
 * @hostname.
 *
- * For international domain names both, @hostname and @cookie_domain, have to be either in lowercase UTF-8
- * or in ASCII form (punycode). Other encodings or mixing UTF-8 and punycode result in unexpected behavior.
+ * For international domain names, both @hostname and @cookie_domain have to be either in UTF-8 (lowercase + NFKC)
+ * or in ASCII/ACE (punycode) format. Other encodings or mixing UTF-8 and punycode likely result in incorrect return values.
+ *
+ * Use helper function psl_str_to_utf8lower() for normalization of @hostname and @cookie_domain.
 *
 * Examples:
 * 1. Cookie domain 'example.com' would be acceptable for hostname 'www.example.com',
@ -1553,8 +1570,8 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
 * @locale: locale of @str for to lowercase conversion, e.g. 'de' or %NULL
 * @lower: return value containing the converted string
 *
- * This helper function converts a string to lowercase UTF-8 representation.
- * Lowercase UTF-8 is needed as input to the domain checking functions.
+ * This helper function converts a string to UTF-8 lowercase + NFKC representation.
+ * Lowercase + NFKC UTF-8 is needed as input to the domain checking functions.
 *
 * @lower is set to %NULL on error.
 *
@ -1567,6 +1584,7 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
 *   PSL_ERR_TO_UTF16: Failed to convert @str to unicode
 *   PSL_ERR_TO_LOWER: Failed to convert unicode to lowercase
 *   PSL_ERR_TO_UTF8: Failed to convert unicode to UTF-8
+ *   PSL_ERR_NO_MEM: Failed to allocate memory
 *
 * Since: 0.4
 */
@ -1585,7 +1603,8 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
 		if (lower) {
 			char *p;

-			*lower = strdup(str);
+			if (!(*lower = strdup(str)))
+				return PSL_ERR_NO_MEM;

 			/* convert ASCII string to lowercase */
 			for (p = *lower; *p; p++)
@ -1604,10 +1623,21 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
 	char *utf8_lower;
 	UConverter *uconv;

-	/* C89 allocation */
-	utf16_dst   = alloca(sizeof(UChar) * (str_length * 2 + 1));
-	utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
-	utf8_lower  = alloca(str_length * 2 + 1);
+	if (str_length < 256) {
+		/* C89 allocation */
+		utf16_dst   = alloca(sizeof(UChar) * (str_length * 2 + 1));
+		utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
+		utf8_lower  = alloca(str_length * 2 + 1);
+	} else {
+		utf16_dst   = malloc(sizeof(UChar) * (str_length * 2 + 1));
+		utf16_lower = malloc(sizeof(UChar) * (str_length * 2 + 1));
+		utf8_lower  = malloc(str_length * 2 + 1);
+
+		if (!utf16_dst || !utf16_lower || !utf8_lower) {
+			ret = PSL_ERR_NO_MEM;
+			goto out;
+		}
+	}

 	uconv = ucnv_open(encoding, &status);
 	if (U_SUCCESS(status)) {
@ -1619,9 +1649,16 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
 			if (U_SUCCESS(status)) {
 				u_strToUTF8(utf8_lower, str_length * 8 + 1, NULL, utf16_lower, utf16_lower_length, &status);
 				if (U_SUCCESS(status)) {
-					if (lower)
-						*lower = strdup(utf8_lower);
 					ret = PSL_SUCCESS;
+					if (lower) {
+						if (str_length < 256) {
+							if (!(*lower = strdup(utf8_lower)))
+								ret = PSL_ERR_NO_MEM;
+						} else {
+							*lower = utf8_lower;
+							utf8_lower = NULL;
+						}
+					}
 				} else {
 					ret = PSL_ERR_TO_UTF8;
 					/* fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
@ -1638,6 +1675,12 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
 		ret = PSL_ERR_CONVERTER;
 		/* fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status); */
 	}
+out:
+	if (str_length >= 256) {
+		free(utf16_dst);
+		free(utf16_lower);
+		free(utf8_lower);
+	}
 	} while (0);
 #elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
 	do {
@ -1655,26 +1698,32 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,

 			if (cd != (iconv_t)-1) {
 				char *tmp = (char *)str; /* iconv won't change where str points to, but changes tmp itself */
-				size_t tmp_len = strlen(str);
+				size_t tmp_len = strlen(str) + 1;
 				size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
 				char *dst = malloc(dst_len + 1), *dst_tmp = dst;

-				if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1) {
-					uint8_t *resbuf = malloc(dst_len * 2 + 1);
-					size_t len = dst_len * 2; /* leave space for additional \0 byte */
+				if (!dst) {
+					ret = PSL_ERR_NO_MEM;
+				}
+				else if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1
+					&& iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) != (size_t)-1)
+				{
+					/* start size for u8_tolower internal memory allocation.
+					 * u8_tolower() does not terminate the result string. we have 0 byte included in above tmp_len
+					 * and thus in len. */
+					size_t len = dst_len - dst_len_tmp;

-					if ((dst = (char *)u8_tolower((uint8_t *)dst, dst_len - dst_len_tmp, 0, UNINORM_NFKC, resbuf, &len))) {
-						/* u8_tolower() does not terminate the result string */
-						if (lower)
-							*lower = strndup((char *)dst, len);
+					if ((tmp = (char *)u8_tolower((uint8_t *)dst, len, 0, UNINORM_NFKC, NULL, &len))) {
+						ret = PSL_SUCCESS;
+						if (lower) {
+							*lower = tmp;
+							tmp = NULL;
+						} else
+							free(tmp);
 					} else {
 						ret = PSL_ERR_TO_LOWER;
 						/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
 					}
-
-					if (lower)
-						*lower = strndup(dst, dst_len - dst_len_tmp);
-					ret = PSL_SUCCESS;
 				} else {
 					ret = PSL_ERR_TO_UTF8;
 					/* fprintf(stderr, "Failed to convert '%s' string into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
@ -1686,19 +1735,21 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
 				ret = PSL_ERR_TO_UTF8;
 				/* fprintf(stderr, "Failed to prepare encoding '%s' into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
 			}
-		} else
-			ret = PSL_SUCCESS;
-
-		/* convert to lowercase */
-		if (ret == PSL_SUCCESS) {
-			uint8_t *dst, resbuf[256];
-			size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
-
+		} else {
 			/* we need a conversion to lowercase */
-			if ((dst = u8_tolower((uint8_t *)str, u8_strlen((uint8_t *)str), 0, UNINORM_NFKC, resbuf, &len))) {
-				/* u8_tolower() does not terminate the result string */
-				if (lower)
-					*lower = strndup((char *)dst, len);
+			uint8_t *tmp;
+
+			/* start size for u8_tolower internal memory allocation.
+			 * u8_tolower() does not terminate the result string, so include terminating 0 byte in len. */
+			size_t len = u8_strlen((uint8_t *)str) + 1;
+
+			if ((tmp = u8_tolower((uint8_t *)str, len, 0, UNINORM_NFKC, NULL, &len))) {
+				ret = PSL_SUCCESS;
+				if (lower) {
+					*lower = (char*)tmp;
+					tmp = NULL;
+				} else
+					free(tmp);
 			} else {
 				ret = PSL_ERR_TO_LOWER;
 				/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
--- a/src/psl2c.c
+++ b/src/psl2c.c
@ -153,11 +153,6 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
 	if ((fp = fopen("in.tmp", "w"))) {
 		for (it = 0; it < v->cur; it++) {
 			_psl_entry_t *e = _vector_get(v, it);
-			unsigned char *s = (unsigned char *)e->label_buf;
-
-			/* search for non-ASCII label and skip it */
-			while (*s && *s < 128) s++;
-			if (*s) continue;

 			fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
 		}
@ -191,11 +186,6 @@ static int _print_psl_entries_dafsa_binary(const char *fname, const _psl_vector_
 	if ((fp = fopen("in.tmp", "w"))) {
 		for (it = 0; it < v->cur; it++) {
 			_psl_entry_t *e = _vector_get(v, it);
-			unsigned char *s = (unsigned char *)e->label_buf;
-
-			/* search for non-ASCII label and skip it */
-			while (*s && *s < 128) s++;
-			if (*s) continue;

 			fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
 		}
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@ -23,3 +23,14 @@ check_PROGRAMS = $(PSL_TESTS)

 TESTS_ENVIRONMENT = TESTS_VALGRIND="@VALGRIND_ENVIRONMENT@"
 TESTS = $(PSL_TESTS)
+
+# psl.dafsa and psl_ascii.dafsa must be created before any test is executed.
+# A check-local target runs in parallel to the tests, so the test suite would likely fail with it.
+BUILT_SOURCES = psl.dafsa psl_ascii.dafsa
+psl.dafsa:
+	$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary "$(PSL_FILE)" psl.dafsa
+psl_ascii.dafsa:
+	$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary --encoding=ascii "$(PSL_FILE)" psl_ascii.dafsa
+
+clean-local:
+	rm -f psl.dafsa psl_ascii.dafsa
--- a/tests/test-is-cookie-domain-acceptable.c
+++ b/tests/test-is-cookie-domain-acceptable.c
@ -65,6 +65,7 @@ static void test_psl(void)
 		{ "www.his.name", "his.name", 1 },
 		{ "www.his.name", "name", 0 },
 		{ "www.example.com", "www.example.com", 1 },
+		{ "www.example.com", "wwww.example.com", 0 },
 		{ "www.example.com", "example.com", 1 },
 		{ "www.example.com", "com", 0 }, /* not accepted by normalization (PSL rule 'com') */
 		{ "www.example.com", "example.org", 0 },
@ -77,6 +78,8 @@ static void test_psl(void)
 		{ "2a00:1450:4013:c01::8b", ":1450:4013:c01::8b", 0 }, /* IPv6 address, partial match */
 		{ "::ffff:192.1.123.2", "::ffff:192.1.123.2", 1 }, /* IPv6 address dotted-quad, full match */
 		{ "::ffff:192.1.123.2", ".1.123.2", 0 }, /* IPv6 address dotted-quad, partial match */
+		{ NULL, ".1.123.2", 0 },
+		{ "hiho", NULL, 0 },
 	};
 	unsigned it;
 	psl_ctx_t *psl;
@ -98,6 +101,9 @@ static void test_psl(void)
 		}
 	}

+	/* do checks to cover more code paths in libpsl */
+	psl_is_cookie_domain_acceptable(NULL, "example.com", "example.com");
+
 	psl_free(psl);
 }

--- a/tests/test-is-public-all.c
+++ b/tests/test-is-public-all.c
@ -49,7 +49,7 @@ static int
 	struct timespec ts1, ts2;
 #endif

-static inline int _isspace_ascii(const char c)
+static int _isspace_ascii(const char c)
 {
 	return c == ' ' || c == '\t' || c == '\r' || c == '\n';
 }
@ -131,7 +131,7 @@ static void test_psl_entry(const psl_ctx_t *psl, const char *domain, int type)
 static void test_psl(void)
 {
 	FILE *fp;
-	psl_ctx_t *psl;
+	psl_ctx_t *psl, *psl3, *psl4;
 	const psl_ctx_t *psl2;
 	int type = 0;
 	char buf[256], *linep, *p;
@ -142,6 +142,16 @@ static void test_psl(void)
 	psl2 = psl_builtin();
 	printf("builtin PSL has %d suffixes and %d exceptions\n", psl_suffix_count(psl2), psl_suffix_exception_count(psl2));

+	if (!(psl3 = psl_load_file("psl.dafsa"))) {
+		fprintf(stderr, "Failed to load 'psl.dafsa'\n");
+		failed++;
+	}
+
+	if (!(psl4 = psl_load_file("psl_ascii.dafsa"))) {
+		fprintf(stderr, "Failed to load 'psl_ascii.dafsa'\n");
+		failed++;
+	}
+
 	if ((fp = fopen(PSL_FILE, "r"))) {
 #ifdef HAVE_CLOCK_GETTIME
 		clock_gettime(CLOCK_REALTIME, &ts1);
@ -174,6 +184,12 @@ static void test_psl(void)

 			if (psl2)
 				test_psl_entry(psl2, p, type);
+
+			if (psl3)
+				test_psl_entry(psl3, p, type);
+
+			if (psl4)
+				test_psl_entry(psl4, p, type);
 		}

 #ifdef HAVE_CLOCK_GETTIME
@ -185,8 +201,10 @@ static void test_psl(void)
 		failed++;
 	}

-	psl_free(psl);
+	psl_free(psl4);
+	psl_free(psl3);
 	psl_free((psl_ctx_t *)psl2);
+	psl_free(psl);
 }

 int main(int argc, const char * const *argv)
--- a/tests/test-is-public.c
+++ b/tests/test-is-public.c
@ -84,6 +84,7 @@ static void test_psl(void)
 		{ "adfhoweirh", 1 }, /* unknown TLD */
 	};
 	unsigned it;
+	int result, ver;
 	psl_ctx_t *psl;

 	psl = psl_load_file(PSL_FILE);
@ -92,7 +93,7 @@ static void test_psl(void)

 	for (it = 0; it < countof(test_data); it++) {
 		const struct test_data *t = &test_data[it];
-		int result = psl_is_public_suffix(psl, t->domain);
+		result = psl_is_public_suffix(psl, t->domain);

 		if (result == t->result) {
 			ok++;
@ -102,6 +103,68 @@ static void test_psl(void)
 		}
 	}

+	/* do some checks to cover more code paths in libpsl */
+	psl_is_public_suffix(NULL, "xxx");
+
+	if ((ver = psl_check_version_number(0)) == 0) {
+		printf("psl_check_version_number(0) is 0\n");
+		failed++;
+	} else {
+		if (((result = psl_check_version_number(ver)) != ver)) {
+			printf("psl_check_version_number(%06X) is %06X\n", ver, result);
+			failed++;
+		}
+
+		if (((result = psl_check_version_number(ver - 1)) != 0)) {
+			printf("psl_check_version_number(%06X) is %06X\n", ver - 1, result);
+			failed++;
+		}
+
+		if (((result = psl_check_version_number(ver + 1)) != ver)) {
+			printf("psl_check_version_number(%06X) is %06X\n", ver, result);
+			failed++;
+		}
+	}
+
+	psl_str_to_utf8lower("www.example.com", "utf-8", "en", NULL);
+	psl_str_to_utf8lower(NULL, "utf-8", "en", NULL);
+
+	{
+		char *lower = NULL;
+
+		psl_str_to_utf8lower("www.example.com", NULL, "de", &lower);
+		free(lower); lower = NULL;
+
+		psl_str_to_utf8lower("\374bel.de", NULL, "de", &lower);
+		free(lower); lower = NULL;
+
+		psl_str_to_utf8lower("\374bel.de", "iso-8859-1", NULL, &lower);
+		free(lower); lower = NULL;
+
+		psl_str_to_utf8lower(NULL, "utf-8", "en", &lower);
+		free(lower); lower = NULL;
+	}
+
+	psl_get_version();
+	psl_builtin_filename();
+	psl_builtin_outdated();
+	psl_builtin_file_time();
+	psl_builtin_sha1sum();
+	psl_suffix_wildcard_count(NULL);
+	psl_suffix_wildcard_count(psl);
+	psl_suffix_wildcard_count(psl_builtin());
+	psl_suffix_count(NULL);
+	psl_suffix_exception_count(NULL);
+	psl_load_file(NULL);
+	psl_load_fp(NULL);
+	psl_registrable_domain(NULL, "");
+	psl_registrable_domain(psl, NULL);
+	psl_registrable_domain(psl, "www.example.com");
+	psl_unregistrable_domain(NULL, "");
+	psl_unregistrable_domain(psl, NULL);
+	psl_is_public_suffix2(NULL, "", PSL_TYPE_ANY);
+	psl_is_public_suffix2(psl, NULL, PSL_TYPE_ANY);
+
 	psl_free(psl);
 }

--- a/tests/test-registrable-domain.c
+++ b/tests/test-registrable-domain.c
@ -50,14 +50,28 @@ static int
 	ok,
 	failed;

-static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result)
+static void testx(const psl_ctx_t *psl, const char *domain, const char *encoding, const char *lang, const char *expected_result)
 {
 	const char *result;
 	char *lower;
+	int rc;

-	/* our test data is fixed to UTF-8 (english), so provide it here */
-	if (psl_str_to_utf8lower(domain, "utf-8", "en", &lower) == PSL_SUCCESS)
+	/* just to cover special code paths for valgrind checking */
+	psl_str_to_utf8lower(domain, encoding, lang, NULL);
+
+	if ((rc = psl_str_to_utf8lower(domain, encoding, lang, &lower)) == PSL_SUCCESS)
 		domain = lower;
+	/* non-ASCII domains fail here if no runtime IDN library is configured, so skip it */
+#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
+	else if (domain) {
+		/* if we do not have runtime support, test failures have to be skipped */
+		failed++;
+		printf("psl_str_to_utf8lower(%s)=%d\n", domain ? domain : "NULL", rc);
+
+		free(lower);
+		return;
+	}
+#endif

 	result = psl_registrable_domain(psl, domain);

@ -72,13 +86,28 @@ static void test(const psl_ctx_t *psl, const char *domain, const char *expected_
 	free(lower);
 }

+static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result)
+{
+	testx(psl, domain, "utf-8", "en", expected_result);
+}
+
+static void test_iso(const psl_ctx_t *psl, const char *domain, const char *expected_result)
+{
+	/* makes only sense with a runtime IDN library configured */
+#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
+	testx(psl, domain, "iso-8859-15", "de", expected_result);
+#endif
+}
+
 static void test_psl(void)
 {
 	FILE *fp;
 	const psl_ctx_t *psl;
 	const char *p;
 	char buf[256], domain[128], expected_regdom[128], semicolon[2];
+	char lbuf[258];
 	int er_is_null, d_is_null;
+	unsigned it;

 	psl = psl_builtin();

@ -101,6 +130,22 @@ static void test_psl(void)
 	/* Norwegian with lowercase oe */
 	test(psl, "www.\303\270yer.no", "www.\303\270yer.no");

+	/* Norwegian with lowercase oe, encoded as ISO-8859-15 */
+	test_iso(psl, "www.\370yer.no", "www.\303\270yer.no");
+
+	/* Testing special code paths of psl_str_to_utf8lower() */
+	for (it = 254; it <= 257; it++) {
+		memset(lbuf, 'a', it);
+		lbuf[it] = 0;
+
+		lbuf[0] = '\370';
+		test_iso(psl, lbuf, NULL);
+
+		lbuf[0] = '\303';
+		lbuf[1] = '\270';
+		test(psl, lbuf, NULL);
+	}
+
 	/* special check with NULL psl context and TLD */
 	test(psl, "whoever.forgot.his.name", "whoever.forgot.his.name");

--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@ -12,3 +12,5 @@ LDADD = ../src/libpsl.la
 #if WITH_LIBIDN
 #  LDADD += -lidn
 #endif
+
+dist_man_MANS = psl.1