Use DAWG/DAFSA format for builtin data

This data representation reduces the size of the PSL data
drastically and still allows fast lookups.
This commit is contained in:
Tim Rühsen 2015-12-09 09:35:04 +01:00
parent 36139b601d
commit 0ca3741df6
8 changed files with 552 additions and 283 deletions

View File

@ -15,3 +15,4 @@ Dagobert Michelsen (Fixed Solaris building)
Christopher Meng (Fedora building) Christopher Meng (Fedora building)
Jakub Čajka Jakub Čajka
Giuseppe Scrivano Giuseppe Scrivano
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)

View File

@ -14,7 +14,7 @@ Browsers and other web clients can use it to
Libpsl... Libpsl...
- has built-in PSL data for fast access - has built-in PSL data for fast access (DAWG/DAFSA reduces size from 180kB to ~32kB)
- allows to load PSL data from files - allows to load PSL data from files
- checks if a given domain is a "public suffix" - checks if a given domain is a "public suffix"
- provides immediate cookie domain verification - provides immediate cookie domain verification
@ -28,6 +28,8 @@ Find more information about the Publix Suffix List [here](http://publicsuffix.or
Download the Public Suffix List [here](https://hg.mozilla.org/mozilla-central/raw-file/tip/netwerk/dns/effective_tld_names.dat). Download the Public Suffix List [here](https://hg.mozilla.org/mozilla-central/raw-file/tip/netwerk/dns/effective_tld_names.dat).
The DAFSA code has been taken from the [Chromium Project](https://code.google.com/p/chromium/).
API Documentation API Documentation
----------------- -----------------
@ -74,6 +76,8 @@ License
Libpsl is made available under the terms of the MIT license.<br> Libpsl is made available under the terms of the MIT license.<br>
See the LICENSE file that accompanies this distribution for the full text of the license. See the LICENSE file that accompanies this distribution for the full text of the license.
src/make_dafsa.py and src/lookup_string_in_fixed_set.c are licensed under the terms written in
src/LICENSE.chromium.
Building from git Building from git
----------------- -----------------

30
src/LICENSE.chromium Normal file
View File

@ -0,0 +1,30 @@
* The following License is for the source code files
make_dafsa.py and lookup_string_in_fixed_set.c.
// Copyright 2015 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,12 +1,12 @@
# suffixes.c must be created before psl.c is compiled # suffixes.c must be created before psl.c is compiled
BUILT_SOURCES = suffixes.c BUILT_SOURCES = suffixes_dafsa.c
# suffixes.c is a built source that must be cleaned # suffixes.c is a built source that must be cleaned
CLEANFILES = suffixes.c CLEANFILES = suffixes_dafsa.c
lib_LTLIBRARIES = libpsl.la lib_LTLIBRARIES = libpsl.la
libpsl_la_SOURCES = psl.c libpsl_la_SOURCES = psl.c lookup_string_in_fixed_set.c
libpsl_la_CPPFLAGS = -I$(top_srcdir)/include libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
# include ABI version information # include ABI version information
libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION) libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
@ -21,8 +21,8 @@ if WITH_LIBIDN
endif endif
noinst_PROGRAMS = psl2c noinst_PROGRAMS = psl2c
psl2c_SOURCES = psl2c.c psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
psl2c_CPPFLAGS = -I$(top_srcdir)/include psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/make_dafsa.py\"
if BUILTIN_GENERATOR_LIBICU if BUILTIN_GENERATOR_LIBICU
psl2c_LDADD = -licuuc psl2c_LDADD = -licuuc
endif endif
@ -33,8 +33,9 @@ if BUILTIN_GENERATOR_LIBIDN
psl2c_LDADD = @LTLIBICONV@ -lidn -lunistring psl2c_LDADD = @LTLIBICONV@ -lidn -lunistring
endif endif
# Build rule for suffix.c # Build rule for suffix_dafsa.c
# PSL_FILE can be set by ./configure --with-psl-file=[PATH] # PSL_FILE can be set by ./configure --with-psl-file=[PATH]
suffixes.c: $(PSL_FILE) psl2c$(EXEEXT) suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes.c ./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
./psl2c$(EXEEXT) --dafsa "$(PSL_FILE)" suffixes_dafsa.c
EXTRA_DIST = make_dafsa.py LICENSE.chromium

View File

@ -0,0 +1,204 @@
/* Copyright 2015 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE.chromium file.
*
* Converted to C89 2015 by Tim Rühsen
*/
#include <stddef.h>
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
#else
# define _GCC_VERSION_AT_LEAST(major, minor) 0
#endif
#if _GCC_VERSION_AT_LEAST(4,0)
# define _HIDDEN __attribute__ ((visibility ("hidden")))
#else
# define _HIDDEN
#endif
/*
 * Leave the calling function with 0 ("false") unless a < b.
 * The do/while(0) wrapper makes the expansion a single statement, so the
 * macro is safe inside an unbraced if/else, and both arguments are
 * parenthesized so that any expression can be passed.
 */
#define CHECK_LT(a, b) do { if ((a) >= (b)) return 0; } while (0)
/*
 * Read next offset from pos.
 *
 * pos    : in/out, current position in the list of offsets. Advanced past
 *          the offset that was read, or set to end if that offset carried
 *          the "last offset" marker (bit 7).
 * end    : one past the last byte of the graph.
 * offset : in/out, the decoded (relative) offset is added to *offset.
 *
 * Returns true (1) if an offset could be read, false (0) otherwise.
 */
static int GetNextOffset(const unsigned char** pos,
                         const unsigned char* end,
                         const unsigned char** offset)
{
	const unsigned char* p = *pos;
	size_t bytes_consumed;

	if (p == end)
		return 0;

	/* When reading an offset the byte array must always contain at least
	 * three more bytes to consume. First the offset to read, then a node
	 * to skip over and finally a destination node. No object can be smaller
	 * than one byte.
	 * The remaining length is compared instead of forming "*pos + 2":
	 * that pointer would lie more than one past the end of the array
	 * (undefined behavior) when only a single byte is left. */
	if (end - p < 3)
		return 0;

	switch (p[0] & 0x60) {
	case 0x60: /* Read three byte offset */
		*offset += ((p[0] & 0x1F) << 16) | (p[1] << 8) | p[2];
		bytes_consumed = 3;
		break;
	case 0x40: /* Read two byte offset */
		*offset += ((p[0] & 0x1F) << 8) | p[1];
		bytes_consumed = 2;
		break;
	default: /* Read one byte offset */
		*offset += p[0] & 0x3F;
		bytes_consumed = 1;
		break;
	}

	if ((p[0] & 0x80) != 0) {
		/* this was the last offset of the list */
		*pos = end;
	} else {
		*pos = p + bytes_consumed;
	}

	return 1;
}
/*
 * Tell whether the node byte at offset carries the end-of-label marker
 * (bit 7 set). Also yields 0 when offset does not lie below end.
 */
static int IsEOL(const unsigned char* offset, const unsigned char* end)
{
	CHECK_LT(offset, end);

	if (offset[0] & 0x80)
		return 1;

	return 0;
}
/*
 * Compare the node byte at offset with the first character of key.
 * Meant for characters that are not the last one of a label.
 * Yields 1 on equality, 0 otherwise or when offset does not lie below end.
 */
static int IsMatch(const unsigned char* offset,
                   const unsigned char* end,
                   const char* key)
{
	CHECK_LT(offset, end);

	return (key[0] != offset[0]) ? 0 : 1;
}
/*
 * Check if byte at offset matches first character in key.
 * This version matches characters last in label, i.e. node bytes that are
 * stored with bit 7 set.
 *
 * Returns 1 on a match, 0 otherwise (also when offset is not below end).
 */
static int IsEndCharMatch(const unsigned char* offset,
                          const unsigned char* end,
                          const char* key)
{
	unsigned char c;

	if (offset >= end)
		return 0;

	c = (unsigned char) *key;

	/* Only 7-bit characters can be encoded as an end-char node. Reject
	 * other key bytes explicitly: with "*key | 0x80" the outcome depended
	 * on whether plain char is signed, and where it is unsigned a key byte
	 * like 0xE1 was taken for the end-char node of 'a'. */
	if (c & 0x80)
		return 0;

	return *offset == (c | 0x80);
}
/*
 * Fetch the return value stored in the node at offset.
 * A return-value node has the bit pattern 100x xxxx; its low four bits are
 * handed back through return_value.
 * Yields 1 if such a node was found, 0 otherwise (return_value is then left
 * untouched).
 */
static int GetReturnValue(const unsigned char* offset,
                          const unsigned char* end,
                          int* return_value)
{
	CHECK_LT(offset, end);

	if ((offset[0] & 0xE0) != 0x80)
		return 0; /* not a return-value node */

	*return_value = offset[0] & 0x0F;
	return 1;
}
/*
* Looks up the string |key| with length |key_length| in a fixed set of
* strings. The set of strings must be known at compile time. It is converted to
* a graph structure named a DAFSA (Deterministic Acyclic Finite State
* Automaton) by the script make_dafsa.py during compilation. This permits
* efficient (in time and space) lookup. The graph generated by make_dafsa.py
* takes the form of a constant byte array which should be supplied via the
* |graph| and |length| parameters.
*
* Returns -1 if |key| was not found. Otherwise returns the value stored in
* the matching return-value node of the graph (the low four bits of that
* node, see GetReturnValue()). |key| does not need to be NUL-terminated,
* only |key_length| bytes are examined.
*
* Lookup a domain key in a byte array generated by make_dafsa.py.
*/
/* prototype to skip warning with -Wmissing-prototypes */
int _HIDDEN LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);
int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
size_t length,
const char* key,
size_t key_length)
{
/* pos walks the current list of offsets, offset points at the node under inspection */
const unsigned char* pos = graph;
const unsigned char* end = graph + length;
const unsigned char* offset = pos;
const char* key_end = key + key_length;
/* each iteration examines one child node of the current position */
while (GetNextOffset(&pos, end, &offset)) {
/* Possible node layouts at this point:
* char <char>+ end_char offsets
* char <char>+ return value
* char end_char offsets
* char return value
* end_char offsets
* return_value
*/
/* set once a non-final <char> of this child matched the key */
int did_consume = 0;
if (key != key_end && !IsEOL(offset, end)) {
/* Leading <char> is not a match. Don't dive into this child */
if (!IsMatch(offset, end, key))
continue;
did_consume = 1;
++offset;
++key;
/* Possible matches at this point:
* <char>+ end_char offsets
* <char>+ return value
* end_char offsets
* return value
*/
/* Remove all remaining <char> nodes possible */
while (!IsEOL(offset, end) && key != key_end) {
if (!IsMatch(offset, end, key))
return -1;
++key;
++offset;
}
}
/* Possible matches at this point:
* end_char offsets
* return_value
* If one or more <char> elements were consumed, a failure
* to match is terminal. Otherwise, try the next node.
*/
if (key == key_end) {
/* the whole key is consumed, only a return-value node can complete the match */
int return_value;
if (GetReturnValue(offset, end, &return_value))
return return_value;
/* The DAFSA guarantees that if the first char is a match, all
* remaining char elements MUST match if the key is truly present.
*/
if (did_consume)
return -1;
continue;
}
if (!IsEndCharMatch(offset, end, key)) {
if (did_consume)
return -1; /* Unexpected */
continue;
}
++key;
/* the offsets of the children follow directly after the end_char */
pos = ++offset; /* Dive into child */
}
return -1; /* No match */
}

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved. # Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file. # found in the LICENSE.chromium file.
""" """
A Deterministic acyclic finite state automaton (DAFSA) is a compact A Deterministic acyclic finite state automaton (DAFSA) is a compact
@ -421,7 +421,7 @@ def to_cxx(data):
text += 'The byte array encodes effective tld names. See make_dafsa.py for' text += 'The byte array encodes effective tld names. See make_dafsa.py for'
text += ' documentation.' text += ' documentation.'
text += '*/\n\n' text += '*/\n\n'
text += 'const unsigned char kDafsa[%s] = {\n' % len(data) text += 'static const unsigned char kDafsa[%s] = {\n' % len(data)
for i in range(0, len(data), 12): for i in range(0, len(data), 12):
text += ' ' text += ' '
text += ', '.join('0x%02x' % byte for byte in data[i:i + 12]) text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
@ -450,7 +450,7 @@ def parse_gperf(infile):
raise InputError('Expected "domainname, <digit>", found "%s"' % line) raise InputError('Expected "domainname, <digit>", found "%s"' % line)
# Technically the DAFSA format could support return values in range [0-31], # Technically the DAFSA format could support return values in range [0-31],
# but the values below are the only with a defined meaning. # but the values below are the only with a defined meaning.
if line[-1] not in '01245': if line[-1] not in '0123456789ABCDEF':
raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' % raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' %
line[-1]) line[-1])
return [line[:-3] + line[-1] for line in lines] return [line[:-3] + line[-1] for line in lines]

514
src/psl.c
View File

@ -32,6 +32,18 @@
# include <config.h> # include <config.h>
#endif #endif
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
#else
# define _GCC_VERSION_AT_LEAST(major, minor) 0
#endif
#if _GCC_VERSION_AT_LEAST(2,95)
# define _UNUSED __attribute__ ((unused))
#else
# define _UNUSED
#endif
/* if this file is included by psl2c, redefine to use requested library for builtin data */ /* if this file is included by psl2c, redefine to use requested library for builtin data */
#ifdef _LIBPSL_INCLUDED_BY_PSL2C #ifdef _LIBPSL_INCLUDED_BY_PSL2C
# undef WITH_LIBICU # undef WITH_LIBICU
@ -167,10 +179,10 @@ struct _psl_ctx_st {
/* include the PSL data compiled by 'psl2c' */ /* include the PSL data compiled by 'psl2c' */
#ifndef _LIBPSL_INCLUDED_BY_PSL2C #ifndef _LIBPSL_INCLUDED_BY_PSL2C
# include "suffixes.c" # include "suffixes_dafsa.c"
#else #else
/* if this source file is included by psl2c.c, provide empty builtin data */ /* if this source file is included by psl2c.c, provide empty builtin data */
static _psl_entry_t suffixes[1]; static const unsigned char kDafsa[1];
static time_t _psl_file_time; static time_t _psl_file_time;
static time_t _psl_compile_time; static time_t _psl_compile_time;
static int _psl_nsuffixes; static int _psl_nsuffixes;
@ -313,20 +325,196 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length)
return 0; return 0;
} }
/* Return 1 if c is an ASCII blank, tab, carriage return or newline, else 0. */
static inline int _isspace_ascii(const char c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
/* Return 1 if the NUL-terminated string s holds only bytes below 128, else 0. */
static int _str_is_ascii(const char *s)
{
	const unsigned char *p;

	for (p = (const unsigned char *) s; *p != 0; p++) {
		if (*p >= 128)
			return 0;
	}

	return 1;
}
#if defined(WITH_LIBIDN)
/*
 * Work around a libidn <= 1.30 vulnerability.
 *
 * Checks that utf8 is made up of well-formed UTF-8 sequences (lead byte
 * followed by the right number of 10xxxxxx continuation bytes) before the
 * string is handed to idna_to_ascii_8z().
 * Returns 1 if the structure is valid, 0 otherwise.
 *
 * [1] http://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
 * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
 * [3] http://curl.haxx.se/mail/lib-2015-06/0143.html
 */
static int _utf8_is_valid(const char *utf8)
{
	const unsigned char *s = (const unsigned char *) utf8;

	while (*s) {
		int ncont, i;

		/* derive the number of continuation bytes from the lead byte */
		if (*s < 0x80)
			ncont = 0; /* 0xxxxxxx ASCII char */
		else if ((*s & 0xE0) == 0xC0)
			ncont = 1; /* 110xxxxx 10xxxxxx */
		else if ((*s & 0xF0) == 0xE0)
			ncont = 2; /* 1110xxxx 10xxxxxx 10xxxxxx */
		else if ((*s & 0xF8) == 0xF0)
			ncont = 3; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		else
			return 0;

		/* stops at the first bad byte, so a premature NUL is never passed */
		for (i = 1; i <= ncont; i++) {
			if ((s[i] & 0xC0) != 0x80)
				return 0;
		}

		s += ncont + 1;
	}

	return 1;
}
#endif
/* Opaque handle type for the IDNA backend. */
typedef void *_psl_idna_t;

/*
 * Open an IDNA conversion handle.
 * Built with libicu this returns the result of uidna_openUTS46() (the ICU
 * status code is not inspected here); in all other builds it returns NULL.
 */
static _psl_idna_t *_psl_idna_open(void)
{
#ifdef WITH_LIBICU
	UErrorCode icu_status = 0;

	return (void *) uidna_openUTS46(UIDNA_USE_STD3_RULES, &icu_status);
#else
	return NULL;
#endif
}
/*
 * Release a handle obtained from _psl_idna_open().
 * Only the libicu build has anything to release; a NULL handle is ignored.
 */
static void _psl_idna_close(_psl_idna_t *idna _UNUSED)
{
#ifdef WITH_LIBICU
	if (!idna)
		return;

	uidna_close((UIDNA *) idna);
#endif
}
/*
 * Convert the UTF-8 string utf8 into its ASCII (punycode) form using the
 * IDNA library selected at build time (libicu, libidn2 or libidn).
 *
 * idna  : handle from _psl_idna_open(), only used by the libicu backend
 * utf8  : NUL-terminated input string
 * ascii : on success receives a newly allocated string that the caller has
 *         to free(). NOTE(review): only the libicu branch tolerates a NULL
 *         ascii; the other backends hand it straight to the library.
 *
 * Returns 0 on success, -1 on failure. Without any IDNA library the
 * function always returns -1.
 */
static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char **ascii)
{
	int ret = -1;

#if defined(WITH_LIBICU)
	/* IDNA2008 UTS#46 punycode conversion */
	if (idna) {
		char lookupname[128] = "";
		UErrorCode status = 0;
		UIDNAInfo info = UIDNA_INFO_INITIALIZER;
		UChar utf16_dst[128], utf16_src[128];
		int32_t utf16_src_length;

		u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, utf8, -1, &status);
		if (U_SUCCESS(status)) {
			/* NOTE(review): IDNA processing errors reported via info.errors are not examined here */
			int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
			if (U_SUCCESS(status)) {
				/* Keep the last (zeroed) byte of lookupname in reserve: a result that
				 * fills the whole buffer is reported as a mere warning (still a
				 * success) but is not NUL-terminated, and strdup() would read past it. */
				u_strToUTF8(lookupname, sizeof(lookupname) - 1, NULL, utf16_dst, dst_length, &status);
				if (U_SUCCESS(status)) {
					if (!ascii)
						ret = 0;
					else if ((*ascii = strdup(lookupname)))
						ret = 0; /* report success only if the copy could be allocated */
				} /* else
					fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
			} /* else
				fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); */
		} /* else
			fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); */
	}
#elif defined(WITH_LIBIDN2)
	int rc;
	uint8_t *lower, resbuf[256];
	size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */

	/* we need a conversion to lowercase */
	lower = u8_tolower((uint8_t *)utf8, u8_strlen((uint8_t *)utf8), 0, UNINORM_NFKC, resbuf, &len);
	if (!lower) {
		/* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", utf8, errno); */
		return -1;
	}

	/* u8_tolower() does not terminate the result string */
	if (lower == resbuf) {
		lower[len]=0;
	} else {
		uint8_t *tmp = lower;
		lower = (uint8_t *)strndup((char *)lower, len);
		free(tmp);
		if (!lower)
			return -1; /* out of memory, nothing to convert */
	}

	if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK) {
		ret = 0;
	} /* else
		fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */

	if (lower != resbuf)
		free(lower);
#elif defined(WITH_LIBIDN)
	int rc;

	if (!_utf8_is_valid(utf8)) {
		/* fprintf(_(stderr, "Invalid UTF-8 sequence not converted: '%s'\n"), utf8); */
		return -1;
	}

	/* idna_to_ascii_8z() automatically converts UTF-8 to lowercase */
	if ((rc = idna_to_ascii_8z(utf8, ascii, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) {
		ret = 0;
	} /* else
		fprintf(_(stderr, "toASCII failed (%d): %s\n"), rc, idna_strerror(rc)); */
#endif

	return ret;
}
/*
 * If the label of entry e contains non-ASCII bytes, convert it with
 * _psl_idna_toASCII() and, when the converted form differs from the
 * original label, append an extra entry with that form and the same flags
 * to vector v. ASCII-only labels and failed conversions add nothing.
 */
static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_entry_t *e)
{
	char *puny;

	if (_str_is_ascii(e->label_buf))
		return;

	if (_psl_idna_toASCII(idna, e->label_buf, &puny) != 0)
		return;

	if (strcmp(e->label_buf, puny) != 0) {
		_psl_entry_t converted, *stored;

		/* fprintf(stderr, "toASCII '%s' -> '%s'\n", e->label_buf, puny); */
		_suffix_init(&converted, puny, strlen(puny));
		converted.flags = e->flags;
		stored = _vector_get(v, _vector_add(v, &converted));
		stored->label = stored->label_buf; /* set label to changed address */
	} /* else ignore */

	free(puny);
}
/* prototype */
int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type) static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
{ {
_psl_entry_t suffix, *rule; _psl_entry_t suffix;
const char *p; const char *p;
int builtin; char *punycode = NULL;
int need_conversion = 0;
/* this function should be called without leading dots, just make sure */ /* this function should be called without leading dots, just make sure */
suffix.label = domain + (*domain == '.'); if (*domain == '.')
suffix.length = strlen(suffix.label); domain++;
suffix.nlabels = 1; suffix.nlabels = 1;
for (p = suffix.label; *p; p++) for (p = domain; *p; p++) {
if (*p == '.') if (*p == '.')
suffix.nlabels++; suffix.nlabels++;
else if (*((const unsigned char *)p) >= 128)
	need_conversion = 1; /* a byte >= 128 means the domain is non-ASCII, so a toASCII conversion is needed */
}
if (suffix.nlabels == 1) { if (suffix.nlabels == 1) {
/* TLD, this is the prevailing '*' match. /* TLD, this is the prevailing '*' match.
@ -335,61 +523,111 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
return 1; return 1;
} }
/* if domain has enough labels, it is public */ if (need_conversion) {
builtin = (psl == &_builtin_psl); _psl_idna_t *idna = _psl_idna_open();
if (builtin) if (_psl_idna_toASCII(idna, domain, &punycode) == 0) {
rule = &suffixes[0]; suffix.label = punycode;
else suffix.length = strlen(punycode);
rule = _vector_get(psl->suffixes, 0); } else {
/* fallback */
suffix.label = domain;
suffix.length = p - suffix.label;
}
if (!rule || rule->nlabels < suffix.nlabels - 1) _psl_idna_close(idna);
return 0; } else {
suffix.label = domain;
if (rule == &suffixes[0]) suffix.length = p - suffix.label;
rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare);
else
rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix));
if (rule) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
return 0;
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
return 0;
/* definitely a match, no matter if the found rule is a wildcard or not */
if (rule->flags & _PSL_FLAG_EXCEPTION)
return 0;
if (rule->flags & _PSL_FLAG_PLAIN)
return 1;
} }
if ((suffix.label = strchr(suffix.label, '.'))) { if (psl == &_builtin_psl) {
int pos = rule - suffixes; int rc = LookupStringInFixedSet(kDafsa, sizeof(kDafsa), suffix.label, suffix.length);
if (rc != -1) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rc & _PSL_FLAG_ICANN))
goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rc & _PSL_FLAG_PRIVATE))
goto suffix_no;
suffix.label++; if (rc & _PSL_FLAG_EXCEPTION)
suffix.length = strlen(suffix.label); goto suffix_no;
suffix.nlabels--;
if (builtin) /* wildcard *.foo.bar implicitly make foo.bar a public suffix */
rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare); /* definitely a match, no matter if the found rule is a wildcard or not */
else goto suffix_yes;
rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix))); }
if ((suffix.label = strchr(suffix.label, '.'))) {
suffix.label++;
suffix.length = strlen(suffix.label);
suffix.nlabels--;
rc = LookupStringInFixedSet(kDafsa, sizeof(kDafsa), suffix.label, suffix.length);
if (rc != -1) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rc & _PSL_FLAG_ICANN))
goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rc & _PSL_FLAG_PRIVATE))
goto suffix_no;
if (rc & _PSL_FLAG_WILDCARD)
goto suffix_yes;
}
}
} else {
_psl_entry_t *rule = _vector_get(psl->suffixes, 0);
if (!rule || rule->nlabels < suffix.nlabels - 1)
return 0;
rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix));
if (rule) { if (rule) {
/* check for correct rule type */ /* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN)) if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
return 0; goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE)) else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
return 0; goto suffix_no;
if ((rule->flags & _PSL_FLAG_WILDCARD)) if (rule->flags & _PSL_FLAG_EXCEPTION)
return 1; goto suffix_no;
/* wildcard *.foo.bar implicitly make foo.bar a public suffix */
/* definitely a match, no matter if the found rule is a wildcard or not */
goto suffix_yes;
}
if ((suffix.label = strchr(suffix.label, '.'))) {
int pos;
suffix.label++;
suffix.length = strlen(suffix.label);
suffix.nlabels--;
rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix)));
if (rule) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
goto suffix_no;
if (rule->flags & _PSL_FLAG_WILDCARD)
goto suffix_yes;
}
} }
} }
suffix_no:
if (punycode)
free(punycode);
return 0; return 0;
suffix_yes:
if (punycode)
free(punycode);
return 1;
} }
/** /**
@ -531,167 +769,6 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
return regdom; return regdom;
} }
static inline int _isspace_ascii(const char c)
{
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
static int _str_is_ascii(const char *s)
{
while (*s && *((unsigned char *)s) < 128) s++;
return !*s;
}
#if defined(WITH_LIBIDN)
/*
* Work around a libidn <= 1.30 vulnerability.
*
* The function checks for a valid UTF-8 character sequence before
* passing it to idna_to_ascii_8z().
*
* [1] http://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
* [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
* [3] http://curl.haxx.se/mail/lib-2015-06/0143.html
*/
static int _utf8_is_valid(const char *utf8)
{
const unsigned char *s = (const unsigned char *) utf8;
while (*s) {
if ((*s & 0x80) == 0) /* 0xxxxxxx ASCII char */
s++;
else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
if ((s[1] & 0xC0) != 0x80)
return 0;
s += 2;
} else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
return 0;
s += 3;
} else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
return 0;
s += 4;
} else
return 0;
}
return 1;
}
#endif
#if defined(WITH_LIBICU)
static void _add_punycode_if_needed(UIDNA *idna, _psl_vector_t *v, _psl_entry_t *e)
{
if (_str_is_ascii(e->label_buf))
return;
/* IDNA2008 UTS#46 punycode conversion */
if (idna) {
char lookupname[128] = "";
UErrorCode status = 0;
UIDNAInfo info = UIDNA_INFO_INITIALIZER;
UChar utf16_dst[128], utf16_src[128];
int32_t utf16_src_length;
u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, e->label_buf, -1, &status);
if (U_SUCCESS(status)) {
int32_t dst_length = uidna_nameToASCII(idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
if (U_SUCCESS(status)) {
u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
if (U_SUCCESS(status)) {
if (strcmp(e->label_buf, lookupname)) {
_psl_entry_t suffix, *suffixp;
/* fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */
_suffix_init(&suffix, lookupname, strlen(lookupname));
suffix.flags = e->flags;
suffixp = _vector_get(v, _vector_add(v, &suffix));
suffixp->label = suffixp->label_buf; /* set label to changed address */
} /* else ignore */
} /* else
fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
} /* else
fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); */
} /* else
fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); */
}
}
#elif defined(WITH_LIBIDN2)
static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e)
{
char *lookupname = NULL;
int rc;
uint8_t *lower, resbuf[256];
size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
if (_str_is_ascii(e->label_buf))
return;
/* we need a conversion to lowercase */
lower = u8_tolower((uint8_t *)e->label_buf, u8_strlen((uint8_t *)e->label_buf), 0, UNINORM_NFKC, resbuf, &len);
if (!lower) {
/* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", e->label_buf, errno); */
return;
}
/* u8_tolower() does not terminate the result string */
if (lower == resbuf) {
lower[len]=0;
} else {
uint8_t *tmp = lower;
lower = (uint8_t *)strndup((char *)lower, len);
free(tmp);
}
if ((rc = idn2_lookup_u8(lower, (uint8_t **)&lookupname, 0)) == IDN2_OK) {
if (strcmp(e->label_buf, lookupname)) {
_psl_entry_t suffix, *suffixp;
/* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */
_suffix_init(&suffix, lookupname, strlen(lookupname));
suffix.flags = e->flags;
suffixp = _vector_get(v, _vector_add(v, &suffix));
suffixp->label = suffixp->label_buf; /* set label to changed address */
} /* else ignore */
} /* else
fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */
if (lower != resbuf)
free(lower);
}
#elif defined(WITH_LIBIDN)
static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e)
{
char *lookupname = NULL;
int rc;
if (_str_is_ascii(e->label_buf))
return;
if (!_utf8_is_valid(e->label_buf)) {
/* fprintf(_(stderr, "Invalid UTF-8 sequence not converted: '%s'\n"), e->label_buf); */
return;
}
/* idna_to_ascii_8z() automatically converts UTF-8 to lowercase */
if ((rc = idna_to_ascii_8z(e->label_buf, &lookupname, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) {
if (strcmp(e->label_buf, lookupname)) {
_psl_entry_t suffix, *suffixp;
/* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */
_suffix_init(&suffix, lookupname, strlen(lookupname));
suffix.flags = e->flags;
suffixp = _vector_get(v, _vector_add(v, &suffix));
suffixp->label = suffixp->label_buf; /* set label to changed address */
} /* else ignore */
} /* else
fprintf(_(stderr, "toASCII failed (%d): %s\n"), rc, idna_strerror(rc)); */
}
#endif
/** /**
* psl_load_file: * psl_load_file:
* @fname: Name of PSL file * @fname: Name of PSL file
@ -740,10 +817,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
_psl_entry_t suffix, *suffixp; _psl_entry_t suffix, *suffixp;
char buf[256], *linep, *p; char buf[256], *linep, *p;
int type = 0; int type = 0;
#ifdef WITH_LIBICU _psl_idna_t *idna;
UIDNA *idna;
UErrorCode status = 0;
#endif
if (!fp) if (!fp)
return NULL; return NULL;
@ -751,9 +825,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
if (!(psl = calloc(1, sizeof(psl_ctx_t)))) if (!(psl = calloc(1, sizeof(psl_ctx_t))))
return NULL; return NULL;
#ifdef WITH_LIBICU idna = _psl_idna_open();
idna = uidna_openUTS46(UIDNA_USE_STD3_RULES, &status);
#endif
/* /*
* as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions. * as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions.
@ -794,7 +866,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
continue; continue;
} }
p++; p++;
/* wildcard *.foo.bar implicitely make foo.bar a public suffix */ /* wildcard *.foo.bar implicitly make foo.bar a public suffix */
suffix.flags = _PSL_FLAG_WILDCARD | _PSL_FLAG_PLAIN | type; suffix.flags = _PSL_FLAG_WILDCARD | _PSL_FLAG_PLAIN | type;
psl->nwildcards++; psl->nwildcards++;
psl->nsuffixes++; psl->nsuffixes++;
@ -829,20 +901,14 @@ psl_ctx_t *psl_load_fp(FILE *fp)
} }
suffixp->label = suffixp->label_buf; /* set label to changed address */ suffixp->label = suffixp->label_buf; /* set label to changed address */
#ifdef WITH_LIBICU
_add_punycode_if_needed(idna, psl->suffixes, suffixp); _add_punycode_if_needed(idna, psl->suffixes, suffixp);
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
_add_punycode_if_needed(psl->suffixes, suffixp);
#endif
} }
} }
_vector_sort(psl->suffixes); _vector_sort(psl->suffixes);
#ifdef WITH_LIBICU _psl_idna_close(idna);
if (idna)
uidna_close(idna);
#endif
return psl; return psl;
} }
@ -1184,7 +1250,7 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
* *
* Since: 0.4 * Since: 0.4
*/ */
psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower) psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale _UNUSED, char **lower)
{ {
int ret = PSL_ERR_INVALID_ARG; int ret = PSL_ERR_INVALID_ARG;

View File

@ -128,39 +128,6 @@ static int _check_psl(const psl_ctx_t *psl)
} }
#endif #endif
static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *varname)
{
int it;
#ifdef BUILTIN_GENERATOR_LIBICU
do {
UVersionInfo version_info;
char version[U_MAX_VERSION_STRING_LENGTH];
u_getVersion(version_info);
u_versionToString(version_info, version);
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libicu/%s) */\n", version);
} while (0);
#elif defined(BUILTIN_GENERATOR_LIBIDN2)
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn2/%s) */\n", idn2_check_version(NULL));
#elif defined(BUILTIN_GENERATOR_LIBIDN)
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn/%s) */\n", stringprep_check_version(NULL));
#else
fprintf(fpout, "/* automatically generated by psl2c (without punycode support) */\n");
#endif
fprintf(fpout, "static _psl_entry_t %s[] = {\n", varname);
for (it = 0; it < v->cur; it++) {
_psl_entry_t *e = _vector_get(v, it);
fprintf(fpout, "\t{ \"%s\", NULL, %hd, %d, %d },\n",
e->label_buf, e->length, (int) e->nlabels, (int) e->flags);
}
fprintf(fpout, "};\n");
}
static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v) static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
{ {
FILE *fp; FILE *fp;
@ -192,13 +159,14 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
while (*s && *s < 128) s++; while (*s && *s < 128) s++;
if (*s) continue; if (*s) continue;
fprintf(fp, "%s, %d\n", e->label_buf, (int) e->flags); fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
} }
fclose(fp); fclose(fp);
} }
system("../tools/make_dafsa.py in.tmp out.tmp"); if ((it = system(MAKE_DAFSA " in.tmp out.tmp")))
fprintf(stderr, "Failed to execute " MAKE_DAFSA "\n");
if ((fp = fopen("out.tmp", "r"))) { if ((fp = fopen("out.tmp", "r"))) {
char buf[256]; char buf[256];
@ -208,6 +176,9 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
fclose(fp); fclose(fp);
} }
unlink("in.tmp");
unlink("out.tmp");
} }
#if 0 #if 0
@ -262,15 +233,10 @@ int main(int argc, const char **argv)
#ifdef _GENERATE_BUILTIN_DATA #ifdef _GENERATE_BUILTIN_DATA
psl_ctx_t *psl; psl_ctx_t *psl;
#endif #endif
int ret = 0, argpos = 1, dafsa = 0; int ret = 0, argpos = 1;
if (argc == 4 && !strcmp(argv[1], "--dafsa")) {
argpos = 2;
dafsa = 1;
}
if (argc - argpos != 2) { if (argc - argpos != 2) {
fprintf(stderr, "Usage: psl2c [--dafsa] <infile> <outfile>\n"); fprintf(stderr, "Usage: psl2c <infile> <outfile>\n");
fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n"); fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
fprintf(stderr, " <outfile> is the the C filename to be generated from <infile>\n"); fprintf(stderr, " <outfile> is the the C filename to be generated from <infile>\n");
return 1; return 1;
@ -299,10 +265,7 @@ int main(int argc, const char **argv)
_add_punycode_if_needed(psl->suffixes); _add_punycode_if_needed(psl->suffixes);
#endif #endif
if (dafsa) _print_psl_entries_dafsa(fpout, psl->suffixes);
_print_psl_entries_dafsa(fpout, psl->suffixes);
else
_print_psl_entries(fpout, psl->suffixes, "suffixes");
snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]); snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]);
if ((pp = popen(cmd, "r"))) { if ((pp = popen(cmd, "r"))) {