diff --git a/AUTHORS b/AUTHORS index 12e83e6..33dad7b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -15,3 +15,4 @@ Dagobert Michelsen (Fixed Solaris building) Christopher Meng (Fedora building) Jakub Čajka Giuseppe Scrivano +Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support) diff --git a/README.md b/README.md index 7bc8fbc..75e1038 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Browsers and other web clients can use it to Libpsl... -- has built-in PSL data for fast access +- has built-in PSL data for fast access (DAWG/DAFSA reduces size from 180kB to ~32kB) - allows to load PSL data from files - checks if a given domain is a "public suffix" - provides immediate cookie domain verification @@ -28,6 +28,8 @@ Find more information about the Publix Suffix List [here](http://publicsuffix.or Download the Public Suffix List [here](https://hg.mozilla.org/mozilla-central/raw-file/tip/netwerk/dns/effective_tld_names.dat). +The DAFSA code has been taken from the [Chromium Project](https://code.google.com/p/chromium/). + API Documentation ----------------- @@ -74,6 +76,8 @@ License Libpsl is made available under the terms of the MIT license.
See the LICENSE file that accompanies this distribution for the full text of the license. +src/make_dafsa.py and src/lookup_string_in_fixed_set.c are licensed under the terms written in +src/LICENSE.chromium. Building from git ----------------- diff --git a/src/LICENSE.chromium b/src/LICENSE.chromium new file mode 100644 index 0000000..ffe66fe --- /dev/null +++ b/src/LICENSE.chromium @@ -0,0 +1,30 @@ +* The following License is for the source code files + make_dafsa.py and lookup_string_in_fixed_set.c. + +// Copyright 2015 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/Makefile.am b/src/Makefile.am index 1111bb3..9e04d53 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,12 +1,12 @@ # suffixes.c must be created before psl.c is compiled -BUILT_SOURCES = suffixes.c +BUILT_SOURCES = suffixes_dafsa.c # suffixes.c is a built source that must be cleaned -CLEANFILES = suffixes.c +CLEANFILES = suffixes_dafsa.c lib_LTLIBRARIES = libpsl.la -libpsl_la_SOURCES = psl.c +libpsl_la_SOURCES = psl.c lookup_string_in_fixed_set.c libpsl_la_CPPFLAGS = -I$(top_srcdir)/include # include ABI version information libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION) @@ -21,8 +21,8 @@ if WITH_LIBIDN endif noinst_PROGRAMS = psl2c -psl2c_SOURCES = psl2c.c -psl2c_CPPFLAGS = -I$(top_srcdir)/include +psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c +psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/make_dafsa.py\" if BUILTIN_GENERATOR_LIBICU psl2c_LDADD = -licuuc endif @@ -33,8 +33,9 @@ if BUILTIN_GENERATOR_LIBIDN psl2c_LDADD = @LTLIBICONV@ -lidn -lunistring endif -# Build rule for suffix.c +# Build rule for suffixes_dafsa.c # PSL_FILE can be set by ./configure --with-psl-file=[PATH] -suffixes.c: $(PSL_FILE) psl2c$(EXEEXT) - ./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes.c - ./psl2c$(EXEEXT) --dafsa "$(PSL_FILE)" suffixes_dafsa.c \ No newline at end of file +suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT) + ./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c + +EXTRA_DIST = make_dafsa.py 
LICENSE.chromium diff --git a/src/lookup_string_in_fixed_set.c b/src/lookup_string_in_fixed_set.c new file mode 100644 index 0000000..bdec2e5 --- /dev/null +++ b/src/lookup_string_in_fixed_set.c @@ -0,0 +1,204 @@ +/* Copyright 2015 The Chromium Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE.chromium file. + * + * Converted to C89 2015 by Tim Rühsen + */ + +#include <stddef.h> + +#if defined(__GNUC__) && defined(__GNUC_MINOR__) +# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) +#else +# define _GCC_VERSION_AT_LEAST(major, minor) 0 +#endif + +#if _GCC_VERSION_AT_LEAST(4,0) +# define _HIDDEN __attribute__ ((visibility ("hidden"))) +#else +# define _HIDDEN +#endif + +#define CHECK_LT(a, b) do { if ((a) >= (b)) return 0; } while (0) + +/* + * Read next offset from pos. + * Returns true if an offset could be read, false otherwise. + */ + +static int GetNextOffset(const unsigned char** pos, + const unsigned char* end, + const unsigned char** offset) +{ + size_t bytes_consumed; + + if (*pos == end) + return 0; + + /* When reading an offset the byte array must always contain at least + * three more bytes to consume. First the offset to read, then a node + * to skip over and finally a destination node. No object can be smaller + * than one byte. */ + CHECK_LT(*pos + 2, end); + switch (**pos & 0x60) { + case 0x60: /* Read three byte offset */ + *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2]; + bytes_consumed = 3; + break; + case 0x40: /* Read two byte offset */ + *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1]; + bytes_consumed = 2; + break; + default: + *offset += (*pos)[0] & 0x3F; + bytes_consumed = 1; + } + if ((**pos & 0x80) != 0) { + *pos = end; + } else { + *pos += bytes_consumed; + } + return 1; +} + +/* + * Check if byte at offset is last in label. 
+ */ + +static int IsEOL(const unsigned char* offset, const unsigned char* end) +{ + CHECK_LT(offset, end); + return(*offset & 0x80) != 0; +} + +/* + * Check if byte at offset matches first character in key. + * This version matches characters not last in label. + */ + +static int IsMatch(const unsigned char* offset, + const unsigned char* end, + const char* key) +{ + CHECK_LT(offset, end); + return *offset == *key; +} + +/* + * Check if byte at offset matches first character in key. + * This version matches characters last in label. + */ + +static int IsEndCharMatch(const unsigned char* offset, + const unsigned char* end, + const char* key) +{ + CHECK_LT(offset, end); + return *offset == (*key | 0x80); +} + +/* + * Read return value at offset. + * Returns true if a return value could be read, false otherwise. + */ + +static int GetReturnValue(const unsigned char* offset, + const unsigned char* end, + int* return_value) +{ + CHECK_LT(offset, end); + if ((*offset & 0xE0) == 0x80) { + *return_value = *offset & 0x0F; + return 1; + } + return 0; +} + +/* + * Looks up the string |key| with length |key_length| in a fixed set of + * strings. The set of strings must be known at compile time. It is converted to + * a graph structure named a DAFSA (Deterministic Acyclic Finite State + * Automaton) by the script make_dafsa.py during compilation. This permits + * efficient (in time and space) lookup. The graph generated by make_dafsa.py + * takes the form of a constant byte array which should be supplied via the + * |graph| and |length| parameters. The return value is kDafsaNotFound, + * kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule, + * kDafsaWildcardRule and kDafsaPrivateRule ORed together. + * + * Lookup a domain key in a byte array generated by make_dafsa.py. 
+ */ + +/* prototype to skip warning with -Wmissing-prototypes */ +int _HIDDEN LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t); + +int _HIDDEN LookupStringInFixedSet(const unsigned char* graph, + size_t length, + const char* key, + size_t key_length) +{ + const unsigned char* pos = graph; + const unsigned char* end = graph + length; + const unsigned char* offset = pos; + const char* key_end = key + key_length; + + while (GetNextOffset(&pos, end, &offset)) { + /*char + end_char offsets + * char + return value + * char end_char offsets + * char return value + * end_char offsets + * return_value + */ + int did_consume = 0; + + if (key != key_end && !IsEOL(offset, end)) { + /* Leading is not a match. Don't dive into this child */ + if (!IsMatch(offset, end, key)) + continue; + did_consume = 1; + ++offset; + ++key; + /* Possible matches at this point: + * + end_char offsets + * + return value + * end_char offsets + * return value + */ + + /* Remove all remaining nodes possible */ + while (!IsEOL(offset, end) && key != key_end) { + if (!IsMatch(offset, end, key)) + return -1; + ++key; + ++offset; + } + } + /* Possible matches at this point: + * end_char offsets + * return_value + * If one or more elements were consumed, a failure + * to match is terminal. Otherwise, try the next node. + */ + if (key == key_end) { + int return_value; + + if (GetReturnValue(offset, end, &return_value)) + return return_value; + /* The DAFSA guarantees that if the first char is a match, all + * remaining char elements MUST match if the key is truly present. 
+ */ + if (did_consume) + return -1; + continue; + } + if (!IsEndCharMatch(offset, end, key)) { + if (did_consume) + return -1; /* Unexpected */ + continue; + } + ++key; + pos = ++offset; /* Dive into child */ + } + + return -1; /* No match */ +} diff --git a/tools/make_dafsa.py b/src/make_dafsa.py similarity index 98% rename from tools/make_dafsa.py rename to src/make_dafsa.py index 6a04bf1..8268271 100755 --- a/tools/make_dafsa.py +++ b/src/make_dafsa.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # Copyright 2014 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# found in the LICENSE.chromium file. """ A Deterministic acyclic finite state automaton (DAFSA) is a compact @@ -421,7 +421,7 @@ def to_cxx(data): text += 'The byte array encodes effective tld names. See make_dafsa.py for' text += ' documentation.' text += '*/\n\n' - text += 'const unsigned char kDafsa[%s] = {\n' % len(data) + text += 'static const unsigned char kDafsa[%s] = {\n' % len(data) for i in range(0, len(data), 12): text += ' ' text += ', '.join('0x%02x' % byte for byte in data[i:i + 12]) @@ -450,7 +450,7 @@ def parse_gperf(infile): raise InputError('Expected "domainname, ", found "%s"' % line) # Technically the DAFSA format could support return values in range [0-31], # but the values below are the only with a defined meaning. 
- if line[-1] not in '01245': + if line[-1] not in '0123456789ABCDEF': raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' % line[-1]) return [line[:-3] + line[-1] for line in lines] diff --git a/src/psl.c b/src/psl.c index 17717ea..c7c39ef 100644 --- a/src/psl.c +++ b/src/psl.c @@ -32,6 +32,18 @@ # include #endif +#if defined(__GNUC__) && defined(__GNUC_MINOR__) +# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) +#else +# define _GCC_VERSION_AT_LEAST(major, minor) 0 +#endif + +#if _GCC_VERSION_AT_LEAST(2,95) +# define _UNUSED __attribute__ ((unused)) +#else +# define _UNUSED +#endif + /* if this file is included by psl2c, redefine to use requested library for builtin data */ #ifdef _LIBPSL_INCLUDED_BY_PSL2C # undef WITH_LIBICU @@ -167,10 +179,10 @@ struct _psl_ctx_st { /* include the PSL data compiled by 'psl2c' */ #ifndef _LIBPSL_INCLUDED_BY_PSL2C -# include "suffixes.c" +# include "suffixes_dafsa.c" #else /* if this source file is included by psl2c.c, provide empty builtin data */ - static _psl_entry_t suffixes[1]; + static const unsigned char kDafsa[1]; static time_t _psl_file_time; static time_t _psl_compile_time; static int _psl_nsuffixes; @@ -313,20 +325,196 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length) return 0; } + +static inline int _isspace_ascii(const char c) +{ + return c == ' ' || c == '\t' || c == '\r' || c == '\n'; +} + +static int _str_is_ascii(const char *s) +{ + while (*s && *((unsigned char *)s) < 128) s++; + + return !*s; +} + +#if defined(WITH_LIBIDN) +/* + * Work around a libidn <= 1.30 vulnerability. + * + * The function checks for a valid UTF-8 character sequence before + * passing it to idna_to_ascii_8z(). 
+ * + * [1] http://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html + * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html + * [3] http://curl.haxx.se/mail/lib-2015-06/0143.html + */ +static int _utf8_is_valid(const char *utf8) +{ + const unsigned char *s = (const unsigned char *) utf8; + + while (*s) { + if ((*s & 0x80) == 0) /* 0xxxxxxx ASCII char */ + s++; + else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ { + if ((s[1] & 0xC0) != 0x80) + return 0; + s += 2; + } else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ { + if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80) + return 0; + s += 3; + } else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ { + if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80) + return 0; + s += 4; + } else + return 0; + } + + return 1; +} +#endif + +typedef void *_psl_idna_t; + +static _psl_idna_t *_psl_idna_open(void) +{ +#if defined(WITH_LIBICU) + UErrorCode status = 0; + return (void *)uidna_openUTS46(UIDNA_USE_STD3_RULES, &status); +#endif + return NULL; +} + +static void _psl_idna_close(_psl_idna_t *idna _UNUSED) +{ +#if defined(WITH_LIBICU) + if (idna) + uidna_close((UIDNA *)idna); +#endif +} + +static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char **ascii) +{ + int ret = -1; + +#if defined(WITH_LIBICU) + /* IDNA2008 UTS#46 punycode conversion */ + if (idna) { + char lookupname[128] = ""; + UErrorCode status = 0; + UIDNAInfo info = UIDNA_INFO_INITIALIZER; + UChar utf16_dst[128], utf16_src[128]; + int32_t utf16_src_length; + + u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, utf8, -1, &status); + if (U_SUCCESS(status)) { + int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status); + if (U_SUCCESS(status)) { + u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status); + 
if (U_SUCCESS(status)) { + if (ascii) + *ascii = strdup(lookupname); + ret = 0; + } /* else + fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */ + } /* else + fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); */ + } /* else + fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); */ + } +#elif defined(WITH_LIBIDN2) + int rc; + uint8_t *lower, resbuf[256]; + size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */ + + /* we need a conversion to lowercase */ + lower = u8_tolower((uint8_t *)utf8, u8_strlen((uint8_t *)utf8), 0, UNINORM_NFKC, resbuf, &len); + if (!lower) { + /* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", utf8, errno); */ + return -1; + } + + /* u8_tolower() does not terminate the result string */ + if (lower == resbuf) { + lower[len]=0; + } else { + uint8_t *tmp = lower; + lower = (uint8_t *)strndup((char *)lower, len); + free(tmp); + } + + if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK) { + ret = 0; + } /* else + fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */ + + if (lower != resbuf) + free(lower); +#elif defined(WITH_LIBIDN) + int rc; + + if (!_utf8_is_valid(utf8)) { + /* fprintf(_(stderr, "Invalid UTF-8 sequence not converted: '%s'\n"), utf8); */ + return -1; + } + + /* idna_to_ascii_8z() automatically converts UTF-8 to lowercase */ + + if ((rc = idna_to_ascii_8z(utf8, ascii, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) { + ret = 0; + } /* else + fprintf(_(stderr, "toASCII failed (%d): %s\n"), rc, idna_strerror(rc)); */ +#endif + + return ret; +} + +static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_entry_t *e) +{ + char *lookupname; + + if (_str_is_ascii(e->label_buf)) + return; + + if (_psl_idna_toASCII(idna, e->label_buf, &lookupname) == 0) { + if (strcmp(e->label_buf, lookupname)) { + _psl_entry_t suffix, *suffixp; + + /* fprintf(stderr, "toASCII '%s' -> '%s'\n", e->label_buf, 
lookupname); */ + _suffix_init(&suffix, lookupname, strlen(lookupname)); + suffix.flags = e->flags; + suffixp = _vector_get(v, _vector_add(v, &suffix)); + suffixp->label = suffixp->label_buf; /* set label to changed address */ + } /* else ignore */ + + free(lookupname); + } +} + +/* prototype */ +int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length); + static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type) { - _psl_entry_t suffix, *rule; + _psl_entry_t suffix; const char *p; - int builtin; + char *punycode = NULL; + int need_conversion = 0; /* this function should be called without leading dots, just make sure */ - suffix.label = domain + (*domain == '.'); - suffix.length = strlen(suffix.label); + if (*domain == '.') + domain++; + suffix.nlabels = 1; - for (p = suffix.label; *p; p++) + for (p = domain; *p; p++) { if (*p == '.') suffix.nlabels++; + else if (*((const unsigned char *)p) >= 128) + need_conversion = 1; /* in case domain is non-ascii we need a toASCII conversion */ + } if (suffix.nlabels == 1) { /* TLD, this is the prevailing '*' match. 
@@ -335,61 +523,111 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t return 1; } - /* if domain has enough labels, it is public */ - builtin = (psl == &_builtin_psl); + if (need_conversion) { + _psl_idna_t *idna = _psl_idna_open(); - if (builtin) - rule = &suffixes[0]; - else - rule = _vector_get(psl->suffixes, 0); + if (_psl_idna_toASCII(idna, domain, &punycode) == 0) { + suffix.label = punycode; + suffix.length = strlen(punycode); + } else { + /* fallback */ + suffix.label = domain; + suffix.length = p - suffix.label; + } - if (!rule || rule->nlabels < suffix.nlabels - 1) - return 0; - - if (rule == &suffixes[0]) - rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare); - else - rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix)); - - if (rule) { - /* check for correct rule type */ - if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN)) - return 0; - else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE)) - return 0; - - /* definitely a match, no matter if the found rule is a wildcard or not */ - if (rule->flags & _PSL_FLAG_EXCEPTION) - return 0; - if (rule->flags & _PSL_FLAG_PLAIN) - return 1; + _psl_idna_close(idna); + } else { + suffix.label = domain; + suffix.length = p - suffix.label; } - if ((suffix.label = strchr(suffix.label, '.'))) { - int pos = rule - suffixes; + if (psl == &_builtin_psl) { + int rc = LookupStringInFixedSet(kDafsa, sizeof(kDafsa), suffix.label, suffix.length); + if (rc != -1) { + /* check for correct rule type */ + if (type == PSL_TYPE_ICANN && !(rc & _PSL_FLAG_ICANN)) + goto suffix_no; + else if (type == PSL_TYPE_PRIVATE && !(rc & _PSL_FLAG_PRIVATE)) + goto suffix_no; - suffix.label++; - suffix.length = strlen(suffix.label); - suffix.nlabels--; + if (rc & _PSL_FLAG_EXCEPTION) + goto suffix_no; - if (builtin) - rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), 
(int(*)(const void *, const void *))_suffix_compare); - else - rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix))); + /* wildcard *.foo.bar implicitly make foo.bar a public suffix */ + /* definitely a match, no matter if the found rule is a wildcard or not */ + goto suffix_yes; + } + if ((suffix.label = strchr(suffix.label, '.'))) { + suffix.label++; + suffix.length = strlen(suffix.label); + suffix.nlabels--; + + rc = LookupStringInFixedSet(kDafsa, sizeof(kDafsa), suffix.label, suffix.length); + if (rc != -1) { + /* check for correct rule type */ + if (type == PSL_TYPE_ICANN && !(rc & _PSL_FLAG_ICANN)) + goto suffix_no; + else if (type == PSL_TYPE_PRIVATE && !(rc & _PSL_FLAG_PRIVATE)) + goto suffix_no; + + if (rc & _PSL_FLAG_WILDCARD) + goto suffix_yes; + } + } + } else { + _psl_entry_t *rule = _vector_get(psl->suffixes, 0); + + if (!rule || rule->nlabels < suffix.nlabels - 1) + goto suffix_no; + + rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix)); if (rule) { /* check for correct rule type */ if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN)) - return 0; + goto suffix_no; else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE)) - return 0; + goto suffix_no; - if ((rule->flags & _PSL_FLAG_WILDCARD)) - return 1; + if (rule->flags & _PSL_FLAG_EXCEPTION) + goto suffix_no; + + /* wildcard *.foo.bar implicitly make foo.bar a public suffix */ + /* definitely a match, no matter if the found rule is a wildcard or not */ + goto suffix_yes; + } + + if ((suffix.label = strchr(suffix.label, '.'))) { + int pos; + + suffix.label++; + suffix.length = strlen(suffix.label); + suffix.nlabels--; + + rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix))); + + if (rule) { + /* check for correct rule type */ + if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN)) + goto suffix_no; + else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE)) + goto suffix_no; + + if 
(rule->flags & _PSL_FLAG_WILDCARD) + goto suffix_yes; + } } } +suffix_no: + if (punycode) + free(punycode); return 0; + +suffix_yes: + if (punycode) + free(punycode); + return 1; } /** @@ -531,167 +769,6 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain) return regdom; } -static inline int _isspace_ascii(const char c) -{ - return c == ' ' || c == '\t' || c == '\r' || c == '\n'; -} - -static int _str_is_ascii(const char *s) -{ - while (*s && *((unsigned char *)s) < 128) s++; - - return !*s; -} - -#if defined(WITH_LIBIDN) -/* - * Work around a libidn <= 1.30 vulnerability. - * - * The function checks for a valid UTF-8 character sequence before - * passing it to idna_to_ascii_8z(). - * - * [1] http://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html - * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html - * [3] http://curl.haxx.se/mail/lib-2015-06/0143.html - */ -static int _utf8_is_valid(const char *utf8) -{ - const unsigned char *s = (const unsigned char *) utf8; - - while (*s) { - if ((*s & 0x80) == 0) /* 0xxxxxxx ASCII char */ - s++; - else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ { - if ((s[1] & 0xC0) != 0x80) - return 0; - s += 2; - } else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ { - if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80) - return 0; - s += 3; - } else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ { - if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80) - return 0; - s += 4; - } else - return 0; - } - - return 1; -} -#endif - -#if defined(WITH_LIBICU) -static void _add_punycode_if_needed(UIDNA *idna, _psl_vector_t *v, _psl_entry_t *e) -{ - if (_str_is_ascii(e->label_buf)) - return; - - /* IDNA2008 UTS#46 punycode conversion */ - if (idna) { - char lookupname[128] = ""; - UErrorCode status = 0; - UIDNAInfo info = UIDNA_INFO_INITIALIZER; - UChar utf16_dst[128], utf16_src[128]; - int32_t utf16_src_length; - - 
u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, e->label_buf, -1, &status); - if (U_SUCCESS(status)) { - int32_t dst_length = uidna_nameToASCII(idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status); - if (U_SUCCESS(status)) { - u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status); - if (U_SUCCESS(status)) { - if (strcmp(e->label_buf, lookupname)) { - _psl_entry_t suffix, *suffixp; - - /* fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */ - _suffix_init(&suffix, lookupname, strlen(lookupname)); - suffix.flags = e->flags; - suffixp = _vector_get(v, _vector_add(v, &suffix)); - suffixp->label = suffixp->label_buf; /* set label to changed address */ - } /* else ignore */ - } /* else - fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */ - } /* else - fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); */ - } /* else - fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); */ - } -} -#elif defined(WITH_LIBIDN2) -static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e) -{ - char *lookupname = NULL; - int rc; - uint8_t *lower, resbuf[256]; - size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */ - - if (_str_is_ascii(e->label_buf)) - return; - - /* we need a conversion to lowercase */ - lower = u8_tolower((uint8_t *)e->label_buf, u8_strlen((uint8_t *)e->label_buf), 0, UNINORM_NFKC, resbuf, &len); - if (!lower) { - /* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", e->label_buf, errno); */ - return; - } - - /* u8_tolower() does not terminate the result string */ - if (lower == resbuf) { - lower[len]=0; - } else { - uint8_t *tmp = lower; - lower = (uint8_t *)strndup((char *)lower, len); - free(tmp); - } - - if ((rc = idn2_lookup_u8(lower, (uint8_t **)&lookupname, 0)) == IDN2_OK) { - if (strcmp(e->label_buf, lookupname)) { - _psl_entry_t suffix, 
*suffixp; - - /* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */ - _suffix_init(&suffix, lookupname, strlen(lookupname)); - suffix.flags = e->flags; - suffixp = _vector_get(v, _vector_add(v, &suffix)); - suffixp->label = suffixp->label_buf; /* set label to changed address */ - } /* else ignore */ - } /* else - fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */ - - if (lower != resbuf) - free(lower); -} -#elif defined(WITH_LIBIDN) -static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e) -{ - char *lookupname = NULL; - int rc; - - if (_str_is_ascii(e->label_buf)) - return; - - if (!_utf8_is_valid(e->label_buf)) { - /* fprintf(_(stderr, "Invalid UTF-8 sequence not converted: '%s'\n"), e->label_buf); */ - return; - } - - /* idna_to_ascii_8z() automatically converts UTF-8 to lowercase */ - - if ((rc = idna_to_ascii_8z(e->label_buf, &lookupname, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) { - if (strcmp(e->label_buf, lookupname)) { - _psl_entry_t suffix, *suffixp; - - /* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */ - _suffix_init(&suffix, lookupname, strlen(lookupname)); - suffix.flags = e->flags; - suffixp = _vector_get(v, _vector_add(v, &suffix)); - suffixp->label = suffixp->label_buf; /* set label to changed address */ - } /* else ignore */ - } /* else - fprintf(_(stderr, "toASCII failed (%d): %s\n"), rc, idna_strerror(rc)); */ -} -#endif - /** * psl_load_file: * @fname: Name of PSL file @@ -740,10 +817,7 @@ psl_ctx_t *psl_load_fp(FILE *fp) _psl_entry_t suffix, *suffixp; char buf[256], *linep, *p; int type = 0; -#ifdef WITH_LIBICU - UIDNA *idna; - UErrorCode status = 0; -#endif + _psl_idna_t *idna; if (!fp) return NULL; @@ -751,9 +825,7 @@ psl_ctx_t *psl_load_fp(FILE *fp) if (!(psl = calloc(1, sizeof(psl_ctx_t)))) return NULL; -#ifdef WITH_LIBICU - idna = uidna_openUTS46(UIDNA_USE_STD3_RULES, &status); -#endif + idna = _psl_idna_open(); /* * as of 02.11.2012, the list at 
http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions. @@ -794,7 +866,7 @@ psl_ctx_t *psl_load_fp(FILE *fp) continue; } p++; - /* wildcard *.foo.bar implicitely make foo.bar a public suffix */ + /* wildcard *.foo.bar implicitly make foo.bar a public suffix */ suffix.flags = _PSL_FLAG_WILDCARD | _PSL_FLAG_PLAIN | type; psl->nwildcards++; psl->nsuffixes++; @@ -829,20 +901,14 @@ psl_ctx_t *psl_load_fp(FILE *fp) } suffixp->label = suffixp->label_buf; /* set label to changed address */ -#ifdef WITH_LIBICU + _add_punycode_if_needed(idna, psl->suffixes, suffixp); -#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN) - _add_punycode_if_needed(psl->suffixes, suffixp); -#endif } } _vector_sort(psl->suffixes); -#ifdef WITH_LIBICU - if (idna) - uidna_close(idna); -#endif + _psl_idna_close(idna); return psl; } @@ -1184,7 +1250,7 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, * * Since: 0.4 */ -psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower) +psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale _UNUSED, char **lower) { int ret = PSL_ERR_INVALID_ARG; diff --git a/src/psl2c.c b/src/psl2c.c index 33a85f4..7d02679 100644 --- a/src/psl2c.c +++ b/src/psl2c.c @@ -128,39 +128,6 @@ static int _check_psl(const psl_ctx_t *psl) } #endif -static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *varname) -{ - int it; - -#ifdef BUILTIN_GENERATOR_LIBICU - do { - UVersionInfo version_info; - char version[U_MAX_VERSION_STRING_LENGTH]; - - u_getVersion(version_info); - u_versionToString(version_info, version); - fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libicu/%s) */\n", version); - } while (0); -#elif defined(BUILTIN_GENERATOR_LIBIDN2) - fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn2/%s) */\n", idn2_check_version(NULL)); -#elif 
defined(BUILTIN_GENERATOR_LIBIDN) - fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn/%s) */\n", stringprep_check_version(NULL)); -#else - fprintf(fpout, "/* automatically generated by psl2c (without punycode support) */\n"); -#endif - - fprintf(fpout, "static _psl_entry_t %s[] = {\n", varname); - - for (it = 0; it < v->cur; it++) { - _psl_entry_t *e = _vector_get(v, it); - - fprintf(fpout, "\t{ \"%s\", NULL, %hd, %d, %d },\n", - e->label_buf, e->length, (int) e->nlabels, (int) e->flags); - } - - fprintf(fpout, "};\n"); -} - static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v) { FILE *fp; @@ -192,13 +159,14 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v) while (*s && *s < 128) s++; if (*s) continue; - fprintf(fp, "%s, %d\n", e->label_buf, (int) e->flags); + fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F)); } fclose(fp); } - system("../tools/make_dafsa.py in.tmp out.tmp"); + if ((it = system(MAKE_DAFSA " in.tmp out.tmp"))) + fprintf(stderr, "Failed to execute " MAKE_DAFSA "\n"); if ((fp = fopen("out.tmp", "r"))) { char buf[256]; @@ -208,6 +176,9 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v) fclose(fp); } + + unlink("in.tmp"); + unlink("out.tmp"); } #if 0 @@ -262,15 +233,10 @@ int main(int argc, const char **argv) #ifdef _GENERATE_BUILTIN_DATA psl_ctx_t *psl; #endif - int ret = 0, argpos = 1, dafsa = 0; - - if (argc == 4 && !strcmp(argv[1], "--dafsa")) { - argpos = 2; - dafsa = 1; - } + int ret = 0, argpos = 1; if (argc - argpos != 2) { - fprintf(stderr, "Usage: psl2c [--dafsa] <infile> <outfile>\n"); + fprintf(stderr, "Usage: psl2c <infile> <outfile>\n"); fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n"); fprintf(stderr, " <outfile> is the the C filename to be generated from <infile>\n"); return 1; @@ -299,10 +265,7 @@ int main(int argc, const char **argv) _add_punycode_if_needed(psl->suffixes); #endif - if (dafsa) - _print_psl_entries_dafsa(fpout, psl->suffixes); 
- else - _print_psl_entries(fpout, psl->suffixes, "suffixes"); + _print_psl_entries_dafsa(fpout, psl->suffixes); snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]); if ((pp = popen(cmd, "r"))) {