diff --git a/AUTHORS b/AUTHORS
index 12e83e6..33dad7b 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -15,3 +15,4 @@ Dagobert Michelsen (Fixed Solaris building)
Christopher Meng (Fedora building)
Jakub Čajka
Giuseppe Scrivano
+Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
diff --git a/README.md b/README.md
index 7bc8fbc..75e1038 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ Browsers and other web clients can use it to
Libpsl...
-- has built-in PSL data for fast access
+- has built-in PSL data for fast access (DAWG/DAFSA reduces size from 180kB to ~32kB)
- allows to load PSL data from files
- checks if a given domain is a "public suffix"
- provides immediate cookie domain verification
@@ -28,6 +28,8 @@ Find more information about the Publix Suffix List [here](http://publicsuffix.or
Download the Public Suffix List [here](https://hg.mozilla.org/mozilla-central/raw-file/tip/netwerk/dns/effective_tld_names.dat).
+The DAFSA code has been taken from the [Chromium Project](https://code.google.com/p/chromium/).
+
API Documentation
-----------------
@@ -74,6 +76,8 @@ License
Libpsl is made available under the terms of the MIT license.
See the LICENSE file that accompanies this distribution for the full text of the license.
+src/make_dafsa.py and src/lookup_string_in_fixed_set.c are licensed under the terms written in
+src/LICENSE.chromium.
Building from git
-----------------
diff --git a/src/LICENSE.chromium b/src/LICENSE.chromium
new file mode 100644
index 0000000..ffe66fe
--- /dev/null
+++ b/src/LICENSE.chromium
@@ -0,0 +1,30 @@
+* The following License is for the source code files
+ make_dafsa.py and lookup_string_in_fixed_set.c.
+
+// Copyright 2015 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/Makefile.am b/src/Makefile.am
index 1111bb3..9e04d53 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,12 +1,12 @@
# suffixes.c must be created before psl.c is compiled
-BUILT_SOURCES = suffixes.c
+BUILT_SOURCES = suffixes_dafsa.c
# suffixes.c is a built source that must be cleaned
-CLEANFILES = suffixes.c
+CLEANFILES = suffixes_dafsa.c
lib_LTLIBRARIES = libpsl.la
-libpsl_la_SOURCES = psl.c
+libpsl_la_SOURCES = psl.c lookup_string_in_fixed_set.c
libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
# include ABI version information
libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
@@ -21,8 +21,8 @@ if WITH_LIBIDN
endif
noinst_PROGRAMS = psl2c
-psl2c_SOURCES = psl2c.c
-psl2c_CPPFLAGS = -I$(top_srcdir)/include
+psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
+psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/make_dafsa.py\"
if BUILTIN_GENERATOR_LIBICU
psl2c_LDADD = -licuuc
endif
@@ -33,8 +33,9 @@ if BUILTIN_GENERATOR_LIBIDN
psl2c_LDADD = @LTLIBICONV@ -lidn -lunistring
endif
-# Build rule for suffix.c
+# Build rule for suffixes_dafsa.c
# PSL_FILE can be set by ./configure --with-psl-file=[PATH]
-suffixes.c: $(PSL_FILE) psl2c$(EXEEXT)
- ./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes.c
- ./psl2c$(EXEEXT) --dafsa "$(PSL_FILE)" suffixes_dafsa.c
\ No newline at end of file
+suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
+ ./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
+
+EXTRA_DIST = make_dafsa.py LICENSE.chromium
diff --git a/src/lookup_string_in_fixed_set.c b/src/lookup_string_in_fixed_set.c
new file mode 100644
index 0000000..bdec2e5
--- /dev/null
+++ b/src/lookup_string_in_fixed_set.c
@@ -0,0 +1,204 @@
+/* Copyright 2015 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE.chromium file.
+ *
+ * Converted to C89 2015 by Tim Rühsen
+ */
+
+#include
+
+#if defined(__GNUC__) && defined(__GNUC_MINOR__)
+# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+#else
+# define _GCC_VERSION_AT_LEAST(major, minor) 0
+#endif
+
+#if _GCC_VERSION_AT_LEAST(4,0)
+# define _HIDDEN __attribute__ ((visibility ("hidden")))
+#else
+# define _HIDDEN
+#endif
+
+#define CHECK_LT(a, b) do { if ((a) >= (b)) return 0; } while (0) /* makes the calling function return 0 unless a < b */
+
+/*
+ * Read next offset from pos.
+ * Returns true if an offset could be read, false otherwise.
+ */
+
+static int GetNextOffset(const unsigned char** pos,
+ const unsigned char* end,
+ const unsigned char** offset)
+{
+ size_t bytes_consumed;
+
+ if (*pos == end)
+ return 0;
+
+ /* When reading an offset the byte array must always contain at least
+ * three more bytes to consume. First the offset to read, then a node
+ * to skip over and finally a destination node. No object can be smaller
+ * than one byte. */
+ CHECK_LT(*pos + 2, end);
+ switch (**pos & 0x60) {
+ case 0x60: /* Read three byte offset */
+ *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
+ bytes_consumed = 3;
+ break;
+ case 0x40: /* Read two byte offset */
+ *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
+ bytes_consumed = 2;
+ break;
+ default:
+ *offset += (*pos)[0] & 0x3F;
+ bytes_consumed = 1;
+ }
+ if ((**pos & 0x80) != 0) {
+ *pos = end;
+ } else {
+ *pos += bytes_consumed;
+ }
+ return 1;
+}
+
+/*
+ * Check if byte at offset is last in label.
+ */
+
+static int IsEOL(const unsigned char* offset, const unsigned char* end)
+{
+ CHECK_LT(offset, end);
+ return(*offset & 0x80) != 0;
+}
+
+/*
+ * Check if byte at offset matches first character in key.
+ * This version matches characters not last in label.
+ */
+
+static int IsMatch(const unsigned char* offset,
+ const unsigned char* end,
+ const char* key)
+{
+ CHECK_LT(offset, end);
+ return *offset == *key;
+}
+
+/*
+ * Check if byte at offset matches first character in key.
+ * This version matches characters last in label.
+ */
+
+static int IsEndCharMatch(const unsigned char* offset,
+ const unsigned char* end,
+ const char* key)
+{
+ CHECK_LT(offset, end);
+ return *offset == (*key | 0x80);
+}
+
+/*
+ * Read return value at offset.
+ * Returns true if a return value could be read, false otherwise.
+ */
+
+static int GetReturnValue(const unsigned char* offset,
+ const unsigned char* end,
+ int* return_value)
+{
+ CHECK_LT(offset, end);
+ if ((*offset & 0xE0) == 0x80) {
+ *return_value = *offset & 0x0F;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Looks up the string |key| with length |key_length| in a fixed set of
+ * strings. The set of strings must be known at compile time. It is converted to
+ * a graph structure named a DAFSA (Deterministic Acyclic Finite State
+ * Automaton) by the script make_dafsa.py during compilation. This permits
+ * efficient (in time and space) lookup. The graph generated by make_dafsa.py
+ * takes the form of a constant byte array which should be supplied via the
+ * |graph| and |length| parameters. The return value is -1 if |key| is not
+ * found; otherwise it is the 4-bit value stored with the matching entry
+ * (psl2c writes the low four bits of the rule flags there).
+ *
+ * Lookup a domain key in a byte array generated by make_dafsa.py.
+ */
+
+/* prototype to skip warning with -Wmissing-prototypes */
+int _HIDDEN LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);
+
+int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
+ size_t length,
+ const char* key,
+ size_t key_length)
+{
+ const unsigned char* pos = graph;
+ const unsigned char* end = graph + length;
+ const unsigned char* offset = pos;
+ const char* key_end = key + key_length;
+
+ while (GetNextOffset(&pos, end, &offset)) {
+ /* char <char>+ end_char offsets
+ * char <char>+ return value
+ * char end_char offsets
+ * char return value
+ * end_char offsets
+ * return_value
+ */
+ int did_consume = 0;
+
+ if (key != key_end && !IsEOL(offset, end)) {
+ /* Leading <char> is not a match. Don't dive into this child */
+ if (!IsMatch(offset, end, key))
+ continue;
+ did_consume = 1;
+ ++offset;
+ ++key;
+ /* Possible matches at this point:
+ * <char>+ end_char offsets
+ * <char>+ return value
+ * end_char offsets
+ * return value
+ */
+
+ /* Remove all remaining <char> nodes possible */
+ while (!IsEOL(offset, end) && key != key_end) {
+ if (!IsMatch(offset, end, key))
+ return -1;
+ ++key;
+ ++offset;
+ }
+ }
+ /* Possible matches at this point:
+ * end_char offsets
+ * return_value
+ * If one or more elements were consumed, a failure
+ * to match is terminal. Otherwise, try the next node.
+ */
+ if (key == key_end) {
+ int return_value;
+
+ if (GetReturnValue(offset, end, &return_value))
+ return return_value;
+ /* The DAFSA guarantees that if the first char is a match, all
+ * remaining char elements MUST match if the key is truly present.
+ */
+ if (did_consume)
+ return -1;
+ continue;
+ }
+ if (!IsEndCharMatch(offset, end, key)) {
+ if (did_consume)
+ return -1; /* Unexpected */
+ continue;
+ }
+ ++key;
+ pos = ++offset; /* Dive into child */
+ }
+
+ return -1; /* No match */
+}
diff --git a/tools/make_dafsa.py b/src/make_dafsa.py
similarity index 98%
rename from tools/make_dafsa.py
rename to src/make_dafsa.py
index 6a04bf1..8268271 100755
--- a/tools/make_dafsa.py
+++ b/src/make_dafsa.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# found in the LICENSE.chromium file.
"""
A Deterministic acyclic finite state automaton (DAFSA) is a compact
@@ -421,7 +421,7 @@ def to_cxx(data):
text += 'The byte array encodes effective tld names. See make_dafsa.py for'
text += ' documentation.'
text += '*/\n\n'
- text += 'const unsigned char kDafsa[%s] = {\n' % len(data)
+ text += 'static const unsigned char kDafsa[%s] = {\n' % len(data)
for i in range(0, len(data), 12):
text += ' '
text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
@@ -450,7 +450,7 @@ def parse_gperf(infile):
raise InputError('Expected "domainname, ", found "%s"' % line)
# Technically the DAFSA format could support return values in range [0-31],
# but the values below are the only with a defined meaning.
- if line[-1] not in '01245':
+ if line[-1] not in '0123456789ABCDEF':
raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' %
line[-1])
return [line[:-3] + line[-1] for line in lines]
diff --git a/src/psl.c b/src/psl.c
index 17717ea..c7c39ef 100644
--- a/src/psl.c
+++ b/src/psl.c
@@ -32,6 +32,18 @@
# include
#endif
+#if defined(__GNUC__) && defined(__GNUC_MINOR__)
+# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+#else
+# define _GCC_VERSION_AT_LEAST(major, minor) 0
+#endif
+
+#if _GCC_VERSION_AT_LEAST(2,95)
+# define _UNUSED __attribute__ ((unused))
+#else
+# define _UNUSED
+#endif
+
/* if this file is included by psl2c, redefine to use requested library for builtin data */
#ifdef _LIBPSL_INCLUDED_BY_PSL2C
# undef WITH_LIBICU
@@ -167,10 +179,10 @@ struct _psl_ctx_st {
/* include the PSL data compiled by 'psl2c' */
#ifndef _LIBPSL_INCLUDED_BY_PSL2C
-# include "suffixes.c"
+# include "suffixes_dafsa.c"
#else
/* if this source file is included by psl2c.c, provide empty builtin data */
- static _psl_entry_t suffixes[1];
+ static const unsigned char kDafsa[1];
static time_t _psl_file_time;
static time_t _psl_compile_time;
static int _psl_nsuffixes;
@@ -313,20 +325,196 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length)
return 0;
}
+
+static inline int _isspace_ascii(const char c)
+{
+ return c == ' ' || c == '\t' || c == '\r' || c == '\n';
+}
+
+static int _str_is_ascii(const char *s)
+{
+ while (*s && *((unsigned char *)s) < 128) s++;
+
+ return !*s;
+}
+
+#if defined(WITH_LIBIDN)
+/*
+ * Work around a libidn <= 1.30 vulnerability.
+ *
+ * The function checks for a valid UTF-8 character sequence before
+ * passing it to idna_to_ascii_8z().
+ *
+ * [1] http://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
+ * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
+ * [3] http://curl.haxx.se/mail/lib-2015-06/0143.html
+ */
+static int _utf8_is_valid(const char *utf8)
+{
+ const unsigned char *s = (const unsigned char *) utf8;
+
+ while (*s) {
+ if ((*s & 0x80) == 0) /* 0xxxxxxx ASCII char */
+ s++;
+ else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
+ if ((s[1] & 0xC0) != 0x80)
+ return 0;
+ s += 2;
+ } else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
+ if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
+ return 0;
+ s += 3;
+ } else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
+ if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
+ return 0;
+ s += 4;
+ } else
+ return 0;
+ }
+
+ return 1;
+}
+#endif
+
+typedef void *_psl_idna_t;
+
+static _psl_idna_t *_psl_idna_open(void)
+{
+#if defined(WITH_LIBICU)
+ UErrorCode status = 0;
+ return (void *)uidna_openUTS46(UIDNA_USE_STD3_RULES, &status);
+#endif
+ return NULL;
+}
+
+static void _psl_idna_close(_psl_idna_t *idna _UNUSED)
+{
+#if defined(WITH_LIBICU)
+ if (idna)
+ uidna_close((UIDNA *)idna);
+#endif
+}
+
+static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char **ascii)
+{
+ int ret = -1;
+
+#if defined(WITH_LIBICU)
+ /* IDNA2008 UTS#46 punycode conversion */
+ if (idna) {
+ char lookupname[128] = "";
+ UErrorCode status = 0;
+ UIDNAInfo info = UIDNA_INFO_INITIALIZER;
+ UChar utf16_dst[128], utf16_src[128];
+ int32_t utf16_src_length;
+
+ u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, utf8, -1, &status);
+ if (U_SUCCESS(status)) {
+ int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
+ if (U_SUCCESS(status)) {
+ u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
+ if (U_SUCCESS(status)) {
+ /* report success only if the caller's copy could be allocated (or none was requested) */
+ if (!ascii || (*ascii = strdup(lookupname)))
+ ret = 0;
+ } /* else
+ fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
+ } /* else
+ fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); */
+ } /* else
+ fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); */
+ }
+#elif defined(WITH_LIBIDN2)
+ int rc;
+ uint8_t *lower, resbuf[256];
+ size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
+
+ /* we need a conversion to lowercase */
+ lower = u8_tolower((uint8_t *)utf8, u8_strlen((uint8_t *)utf8), 0, UNINORM_NFKC, resbuf, &len);
+ if (!lower) {
+ /* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", utf8, errno); */
+ return -1;
+ }
+
+ /* u8_tolower() does not terminate the result string */
+ if (lower == resbuf) {
+ lower[len]=0;
+ } else {
+ uint8_t *tmp = lower;
+ lower = (uint8_t *)strndup((char *)lower, len);
+ free(tmp);
+ }
+
+ if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK) {
+ ret = 0;
+ } /* else
+ fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */
+
+ if (lower != resbuf)
+ free(lower);
+#elif defined(WITH_LIBIDN)
+ int rc;
+
+ if (!_utf8_is_valid(utf8)) {
+ /* fprintf(_(stderr, "Invalid UTF-8 sequence not converted: '%s'\n"), utf8); */
+ return -1;
+ }
+
+ /* idna_to_ascii_8z() automatically converts UTF-8 to lowercase */
+
+ if ((rc = idna_to_ascii_8z(utf8, ascii, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) {
+ ret = 0;
+ } /* else
+ fprintf(_(stderr, "toASCII failed (%d): %s\n"), rc, idna_strerror(rc)); */
+#endif
+
+ return ret;
+}
+
+static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_entry_t *e)
+{
+ char *lookupname;
+
+ if (_str_is_ascii(e->label_buf))
+ return;
+
+ if (_psl_idna_toASCII(idna, e->label_buf, &lookupname) == 0) {
+ if (strcmp(e->label_buf, lookupname)) {
+ _psl_entry_t suffix, *suffixp;
+
+ /* fprintf(stderr, "toASCII '%s' -> '%s'\n", e->label_buf, lookupname); */
+ _suffix_init(&suffix, lookupname, strlen(lookupname));
+ suffix.flags = e->flags;
+ suffixp = _vector_get(v, _vector_add(v, &suffix));
+ suffixp->label = suffixp->label_buf; /* set label to changed address */
+ } /* else ignore */
+
+ free(lookupname);
+ }
+}
+
+/* prototype */
+int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
+
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
{
- _psl_entry_t suffix, *rule;
+ _psl_entry_t suffix;
const char *p;
- int builtin;
+ char *punycode = NULL;
+ int need_conversion = 0;
/* this function should be called without leading dots, just make sure */
- suffix.label = domain + (*domain == '.');
- suffix.length = strlen(suffix.label);
+ if (*domain == '.')
+ domain++;
+
suffix.nlabels = 1;
- for (p = suffix.label; *p; p++)
+ for (p = domain; *p; p++) {
if (*p == '.')
suffix.nlabels++;
+ else if (*((unsigned char *)p) >= 128)
+ need_conversion = 1; /* non-ASCII byte seen: domain needs a toASCII conversion before lookup */
+ }
if (suffix.nlabels == 1) {
/* TLD, this is the prevailing '*' match.
@@ -335,61 +523,111 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
return 1;
}
- /* if domain has enough labels, it is public */
- builtin = (psl == &_builtin_psl);
+ if (need_conversion) {
+ _psl_idna_t *idna = _psl_idna_open();
- if (builtin)
- rule = &suffixes[0];
- else
- rule = _vector_get(psl->suffixes, 0);
+ if (_psl_idna_toASCII(idna, domain, &punycode) == 0) {
+ suffix.label = punycode;
+ suffix.length = strlen(punycode);
+ } else {
+ /* fallback */
+ suffix.label = domain;
+ suffix.length = p - suffix.label;
+ }
- if (!rule || rule->nlabels < suffix.nlabels - 1)
- return 0;
-
- if (rule == &suffixes[0])
- rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare);
- else
- rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix));
-
- if (rule) {
- /* check for correct rule type */
- if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
- return 0;
- else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
- return 0;
-
- /* definitely a match, no matter if the found rule is a wildcard or not */
- if (rule->flags & _PSL_FLAG_EXCEPTION)
- return 0;
- if (rule->flags & _PSL_FLAG_PLAIN)
- return 1;
+ _psl_idna_close(idna);
+ } else {
+ suffix.label = domain;
+ suffix.length = p - suffix.label;
}
- if ((suffix.label = strchr(suffix.label, '.'))) {
- int pos = rule - suffixes;
+ if (psl == &_builtin_psl) {
+ int rc = LookupStringInFixedSet(kDafsa, sizeof(kDafsa), suffix.label, suffix.length);
+ if (rc != -1) {
+ /* check for correct rule type */
+ if (type == PSL_TYPE_ICANN && !(rc & _PSL_FLAG_ICANN))
+ goto suffix_no;
+ else if (type == PSL_TYPE_PRIVATE && !(rc & _PSL_FLAG_PRIVATE))
+ goto suffix_no;
- suffix.label++;
- suffix.length = strlen(suffix.label);
- suffix.nlabels--;
+ if (rc & _PSL_FLAG_EXCEPTION)
+ goto suffix_no;
- if (builtin)
- rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare);
- else
- rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix)));
+ /* wildcard *.foo.bar implicitly make foo.bar a public suffix */
+ /* definitely a match, no matter if the found rule is a wildcard or not */
+ goto suffix_yes;
+ }
+ if ((suffix.label = strchr(suffix.label, '.'))) {
+ suffix.label++;
+ suffix.length = strlen(suffix.label);
+ suffix.nlabels--;
+
+ rc = LookupStringInFixedSet(kDafsa, sizeof(kDafsa), suffix.label, suffix.length);
+ if (rc != -1) {
+ /* check for correct rule type */
+ if (type == PSL_TYPE_ICANN && !(rc & _PSL_FLAG_ICANN))
+ goto suffix_no;
+ else if (type == PSL_TYPE_PRIVATE && !(rc & _PSL_FLAG_PRIVATE))
+ goto suffix_no;
+
+ if (rc & _PSL_FLAG_WILDCARD)
+ goto suffix_yes;
+ }
+ }
+ } else {
+ _psl_entry_t *rule = _vector_get(psl->suffixes, 0);
+
+ if (!rule || rule->nlabels < suffix.nlabels - 1)
+ goto suffix_no; /* leave via the cleanup label so an allocated punycode string is freed */
+
+ rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix));
if (rule) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
- return 0;
+ goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
- return 0;
+ goto suffix_no;
- if ((rule->flags & _PSL_FLAG_WILDCARD))
- return 1;
+ if (rule->flags & _PSL_FLAG_EXCEPTION)
+ goto suffix_no;
+
+ /* wildcard *.foo.bar implicitly make foo.bar a public suffix */
+ /* definitely a match, no matter if the found rule is a wildcard or not */
+ goto suffix_yes;
+ }
+
+ if ((suffix.label = strchr(suffix.label, '.'))) {
+ int pos;
+
+ suffix.label++;
+ suffix.length = strlen(suffix.label);
+ suffix.nlabels--;
+
+ rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix)));
+
+ if (rule) {
+ /* check for correct rule type */
+ if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
+ goto suffix_no;
+ else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
+ goto suffix_no;
+
+ if (rule->flags & _PSL_FLAG_WILDCARD)
+ goto suffix_yes;
+ }
}
}
+suffix_no:
+ if (punycode)
+ free(punycode);
return 0;
+
+suffix_yes:
+ if (punycode)
+ free(punycode);
+ return 1;
}
/**
@@ -531,167 +769,6 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
return regdom;
}
-static inline int _isspace_ascii(const char c)
-{
- return c == ' ' || c == '\t' || c == '\r' || c == '\n';
-}
-
-static int _str_is_ascii(const char *s)
-{
- while (*s && *((unsigned char *)s) < 128) s++;
-
- return !*s;
-}
-
-#if defined(WITH_LIBIDN)
-/*
- * Work around a libidn <= 1.30 vulnerability.
- *
- * The function checks for a valid UTF-8 character sequence before
- * passing it to idna_to_ascii_8z().
- *
- * [1] http://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
- * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
- * [3] http://curl.haxx.se/mail/lib-2015-06/0143.html
- */
-static int _utf8_is_valid(const char *utf8)
-{
- const unsigned char *s = (const unsigned char *) utf8;
-
- while (*s) {
- if ((*s & 0x80) == 0) /* 0xxxxxxx ASCII char */
- s++;
- else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
- if ((s[1] & 0xC0) != 0x80)
- return 0;
- s += 2;
- } else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
- if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
- return 0;
- s += 3;
- } else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
- if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
- return 0;
- s += 4;
- } else
- return 0;
- }
-
- return 1;
-}
-#endif
-
-#if defined(WITH_LIBICU)
-static void _add_punycode_if_needed(UIDNA *idna, _psl_vector_t *v, _psl_entry_t *e)
-{
- if (_str_is_ascii(e->label_buf))
- return;
-
- /* IDNA2008 UTS#46 punycode conversion */
- if (idna) {
- char lookupname[128] = "";
- UErrorCode status = 0;
- UIDNAInfo info = UIDNA_INFO_INITIALIZER;
- UChar utf16_dst[128], utf16_src[128];
- int32_t utf16_src_length;
-
- u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, e->label_buf, -1, &status);
- if (U_SUCCESS(status)) {
- int32_t dst_length = uidna_nameToASCII(idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
- if (U_SUCCESS(status)) {
- u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
- if (U_SUCCESS(status)) {
- if (strcmp(e->label_buf, lookupname)) {
- _psl_entry_t suffix, *suffixp;
-
- /* fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */
- _suffix_init(&suffix, lookupname, strlen(lookupname));
- suffix.flags = e->flags;
- suffixp = _vector_get(v, _vector_add(v, &suffix));
- suffixp->label = suffixp->label_buf; /* set label to changed address */
- } /* else ignore */
- } /* else
- fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
- } /* else
- fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); */
- } /* else
- fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); */
- }
-}
-#elif defined(WITH_LIBIDN2)
-static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e)
-{
- char *lookupname = NULL;
- int rc;
- uint8_t *lower, resbuf[256];
- size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
-
- if (_str_is_ascii(e->label_buf))
- return;
-
- /* we need a conversion to lowercase */
- lower = u8_tolower((uint8_t *)e->label_buf, u8_strlen((uint8_t *)e->label_buf), 0, UNINORM_NFKC, resbuf, &len);
- if (!lower) {
- /* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", e->label_buf, errno); */
- return;
- }
-
- /* u8_tolower() does not terminate the result string */
- if (lower == resbuf) {
- lower[len]=0;
- } else {
- uint8_t *tmp = lower;
- lower = (uint8_t *)strndup((char *)lower, len);
- free(tmp);
- }
-
- if ((rc = idn2_lookup_u8(lower, (uint8_t **)&lookupname, 0)) == IDN2_OK) {
- if (strcmp(e->label_buf, lookupname)) {
- _psl_entry_t suffix, *suffixp;
-
- /* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */
- _suffix_init(&suffix, lookupname, strlen(lookupname));
- suffix.flags = e->flags;
- suffixp = _vector_get(v, _vector_add(v, &suffix));
- suffixp->label = suffixp->label_buf; /* set label to changed address */
- } /* else ignore */
- } /* else
- fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */
-
- if (lower != resbuf)
- free(lower);
-}
-#elif defined(WITH_LIBIDN)
-static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e)
-{
- char *lookupname = NULL;
- int rc;
-
- if (_str_is_ascii(e->label_buf))
- return;
-
- if (!_utf8_is_valid(e->label_buf)) {
- /* fprintf(_(stderr, "Invalid UTF-8 sequence not converted: '%s'\n"), e->label_buf); */
- return;
- }
-
- /* idna_to_ascii_8z() automatically converts UTF-8 to lowercase */
-
- if ((rc = idna_to_ascii_8z(e->label_buf, &lookupname, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) {
- if (strcmp(e->label_buf, lookupname)) {
- _psl_entry_t suffix, *suffixp;
-
- /* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */
- _suffix_init(&suffix, lookupname, strlen(lookupname));
- suffix.flags = e->flags;
- suffixp = _vector_get(v, _vector_add(v, &suffix));
- suffixp->label = suffixp->label_buf; /* set label to changed address */
- } /* else ignore */
- } /* else
- fprintf(_(stderr, "toASCII failed (%d): %s\n"), rc, idna_strerror(rc)); */
-}
-#endif
-
/**
* psl_load_file:
* @fname: Name of PSL file
@@ -740,10 +817,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
_psl_entry_t suffix, *suffixp;
char buf[256], *linep, *p;
int type = 0;
-#ifdef WITH_LIBICU
- UIDNA *idna;
- UErrorCode status = 0;
-#endif
+ _psl_idna_t *idna;
if (!fp)
return NULL;
@@ -751,9 +825,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
if (!(psl = calloc(1, sizeof(psl_ctx_t))))
return NULL;
-#ifdef WITH_LIBICU
- idna = uidna_openUTS46(UIDNA_USE_STD3_RULES, &status);
-#endif
+ idna = _psl_idna_open();
/*
* as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions.
@@ -794,7 +866,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
continue;
}
p++;
- /* wildcard *.foo.bar implicitely make foo.bar a public suffix */
+ /* wildcard *.foo.bar implicitly make foo.bar a public suffix */
suffix.flags = _PSL_FLAG_WILDCARD | _PSL_FLAG_PLAIN | type;
psl->nwildcards++;
psl->nsuffixes++;
@@ -829,20 +901,14 @@ psl_ctx_t *psl_load_fp(FILE *fp)
}
suffixp->label = suffixp->label_buf; /* set label to changed address */
-#ifdef WITH_LIBICU
+
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
-#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
- _add_punycode_if_needed(psl->suffixes, suffixp);
-#endif
}
}
_vector_sort(psl->suffixes);
-#ifdef WITH_LIBICU
- if (idna)
- uidna_close(idna);
-#endif
+ _psl_idna_close(idna);
return psl;
}
@@ -1184,7 +1250,7 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
*
* Since: 0.4
*/
-psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower)
+psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale _UNUSED, char **lower)
{
int ret = PSL_ERR_INVALID_ARG;
diff --git a/src/psl2c.c b/src/psl2c.c
index 33a85f4..7d02679 100644
--- a/src/psl2c.c
+++ b/src/psl2c.c
@@ -128,39 +128,6 @@ static int _check_psl(const psl_ctx_t *psl)
}
#endif
-static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *varname)
-{
- int it;
-
-#ifdef BUILTIN_GENERATOR_LIBICU
- do {
- UVersionInfo version_info;
- char version[U_MAX_VERSION_STRING_LENGTH];
-
- u_getVersion(version_info);
- u_versionToString(version_info, version);
- fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libicu/%s) */\n", version);
- } while (0);
-#elif defined(BUILTIN_GENERATOR_LIBIDN2)
- fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn2/%s) */\n", idn2_check_version(NULL));
-#elif defined(BUILTIN_GENERATOR_LIBIDN)
- fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn/%s) */\n", stringprep_check_version(NULL));
-#else
- fprintf(fpout, "/* automatically generated by psl2c (without punycode support) */\n");
-#endif
-
- fprintf(fpout, "static _psl_entry_t %s[] = {\n", varname);
-
- for (it = 0; it < v->cur; it++) {
- _psl_entry_t *e = _vector_get(v, it);
-
- fprintf(fpout, "\t{ \"%s\", NULL, %hd, %d, %d },\n",
- e->label_buf, e->length, (int) e->nlabels, (int) e->flags);
- }
-
- fprintf(fpout, "};\n");
-}
-
static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
{
FILE *fp;
@@ -192,13 +159,14 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
while (*s && *s < 128) s++;
if (*s) continue;
- fprintf(fp, "%s, %d\n", e->label_buf, (int) e->flags);
+ fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
}
fclose(fp);
}
- system("../tools/make_dafsa.py in.tmp out.tmp");
+ if ((it = system(MAKE_DAFSA " in.tmp out.tmp")))
+ fprintf(stderr, "Failed to execute " MAKE_DAFSA "\n");
if ((fp = fopen("out.tmp", "r"))) {
char buf[256];
@@ -208,6 +176,9 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
fclose(fp);
}
+
+ unlink("in.tmp");
+ unlink("out.tmp");
}
#if 0
@@ -262,15 +233,10 @@ int main(int argc, const char **argv)
#ifdef _GENERATE_BUILTIN_DATA
psl_ctx_t *psl;
#endif
- int ret = 0, argpos = 1, dafsa = 0;
-
- if (argc == 4 && !strcmp(argv[1], "--dafsa")) {
- argpos = 2;
- dafsa = 1;
- }
+ int ret = 0, argpos = 1;
if (argc - argpos != 2) {
- fprintf(stderr, "Usage: psl2c [--dafsa] \n");
+ fprintf(stderr, "Usage: psl2c \n");
fprintf(stderr, " is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
fprintf(stderr, " is the the C filename to be generated from \n");
return 1;
@@ -299,10 +265,7 @@ int main(int argc, const char **argv)
_add_punycode_if_needed(psl->suffixes);
#endif
- if (dafsa)
- _print_psl_entries_dafsa(fpout, psl->suffixes);
- else
- _print_psl_entries(fpout, psl->suffixes, "suffixes");
+ _print_psl_entries_dafsa(fpout, psl->suffixes);
snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]);
if ((pp = popen(cmd, "r"))) {