Use DAWG/DAFSA format for builtin data

This data representation reduces the size of the PSL data
drastically and still allows fast lookups.
This commit is contained in:
Tim Rühsen 2015-12-09 09:35:04 +01:00
parent 36139b601d
commit 0ca3741df6
8 changed files with 552 additions and 283 deletions

View File

@ -15,3 +15,4 @@ Dagobert Michelsen (Fixed Solaris building)
Christopher Meng (Fedora building)
Jakub Čajka
Giuseppe Scrivano
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)

View File

@ -14,7 +14,7 @@ Browsers and other web clients can use it to
Libpsl...
- has built-in PSL data for fast access
- has built-in PSL data for fast access (DAWG/DAFSA reduces size from 180kB to ~32kB)
- allows loading PSL data from files
- checks if a given domain is a "public suffix"
- provides immediate cookie domain verification
@ -28,6 +28,8 @@ Find more information about the Public Suffix List [here](http://publicsuffix.or
Download the Public Suffix List [here](https://hg.mozilla.org/mozilla-central/raw-file/tip/netwerk/dns/effective_tld_names.dat).
The DAFSA code has been taken from [Chromium Project](https://code.google.com/p/chromium/).
API Documentation
-----------------
@ -74,6 +76,8 @@ License
Libpsl is made available under the terms of the MIT license.<br>
See the LICENSE file that accompanies this distribution for the full text of the license.
src/make_dafsa.py and src/lookup_string_in_fixed_set.c are licensed under the terms stated in
src/LICENSE.chromium.
Building from git
-----------------

30
src/LICENSE.chromium Normal file
View File

@ -0,0 +1,30 @@
* The following License is for the source code files
make_dafsa.py and lookup_string_in_fixed_set.c.
// Copyright 2015 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,12 +1,12 @@
# suffixes_dafsa.c must be created before psl.c is compiled
BUILT_SOURCES = suffixes.c
BUILT_SOURCES = suffixes_dafsa.c
# suffixes_dafsa.c is a built source that must be cleaned
CLEANFILES = suffixes.c
CLEANFILES = suffixes_dafsa.c
lib_LTLIBRARIES = libpsl.la
libpsl_la_SOURCES = psl.c
libpsl_la_SOURCES = psl.c lookup_string_in_fixed_set.c
libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
# include ABI version information
libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
@ -21,8 +21,8 @@ if WITH_LIBIDN
endif
noinst_PROGRAMS = psl2c
psl2c_SOURCES = psl2c.c
psl2c_CPPFLAGS = -I$(top_srcdir)/include
psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/make_dafsa.py\"
if BUILTIN_GENERATOR_LIBICU
psl2c_LDADD = -licuuc
endif
@ -33,8 +33,9 @@ if BUILTIN_GENERATOR_LIBIDN
psl2c_LDADD = @LTLIBICONV@ -lidn -lunistring
endif
# Build rule for suffix.c
# Build rule for suffixes_dafsa.c
# PSL_FILE can be set by ./configure --with-psl-file=[PATH]
suffixes.c: $(PSL_FILE) psl2c$(EXEEXT)
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes.c
./psl2c$(EXEEXT) --dafsa "$(PSL_FILE)" suffixes_dafsa.c
suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
EXTRA_DIST = make_dafsa.py LICENSE.chromium

View File

@ -0,0 +1,204 @@
/* Copyright 2015 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE.chromium file.
*
* Converted to C89 2015 by Tim Rühsen
*/
#include <stddef.h>
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
#else
# define _GCC_VERSION_AT_LEAST(major, minor) 0
#endif
#if _GCC_VERSION_AT_LEAST(4,0)
# define _HIDDEN __attribute__ ((visibility ("hidden")))
#else
# define _HIDDEN
#endif
/* Make the calling function return 0 ("false") unless a < b.
 * Both operands are parenthesized, and the statement is wrapped in
 * do { } while (0) so the macro behaves like a single statement and cannot
 * capture an 'else' that follows it. */
#define CHECK_LT(a, b) do { if ((a) >= (b)) return 0; } while (0)
/*
 * Decode the next child offset stored at *pos and add it to *offset.
 *
 * The two bits selected by mask 0x60 in the first byte choose the encoding:
 *   0x60 -> three bytes (5 + 8 + 8 value bits)
 *   0x40 -> two bytes   (5 + 8 value bits)
 *   else -> one byte    (6 value bits)
 * If bit 0x80 of the first byte is set this was the last offset of the list
 * and *pos is set to end, so the next call returns 0. Otherwise *pos is
 * advanced past the bytes just decoded.
 *
 * Returns 1 if an offset was read, 0 otherwise (list exhausted or fewer than
 * three bytes left before end).
 */
static int GetNextOffset(const unsigned char** pos,
                         const unsigned char* end,
                         const unsigned char** offset)
{
	const unsigned char* p = *pos;
	size_t bytes_consumed;

	if (p == end)
		return 0;

	/* When reading an offset the byte array must always contain at least
	 * three more bytes to consume. First the offset to read, then a node
	 * to skip over and finally a destination node. No object can be smaller
	 * than one byte.
	 * The remaining length is compared instead of computing p + 2, because
	 * forming a pointer beyond one-past-the-end is undefined behavior. */
	if (p > end || end - p < 3)
		return 0;

	switch (p[0] & 0x60) {
	case 0x60: /* Read three byte offset */
		*offset += ((p[0] & 0x1F) << 16) | (p[1] << 8) | p[2];
		bytes_consumed = 3;
		break;
	case 0x40: /* Read two byte offset */
		*offset += ((p[0] & 0x1F) << 8) | p[1];
		bytes_consumed = 2;
		break;
	default: /* Read one byte offset */
		*offset += p[0] & 0x3F;
		bytes_consumed = 1;
		break;
	}

	if ((p[0] & 0x80) != 0) {
		/* last offset in this list: mark the list as exhausted */
		*pos = end;
	} else {
		*pos = p + bytes_consumed;
	}

	return 1;
}
/*
 * Tell whether the byte at offset carries the "last byte of a label" marker,
 * i.e. has its top bit (0x80) set.
 * Returns 0 when offset is not below end.
 */
static int IsEOL(const unsigned char* offset, const unsigned char* end)
{
	CHECK_LT(offset, end);

	/* an unsigned byte has bit 7 set exactly when its value is >= 0x80 */
	return *offset >= 0x80;
}
/*
 * Compare the byte at offset with the first character of key.
 * This variant is for characters that are not the last one of a label,
 * so the graph byte is compared as-is.
 * Returns 0 when offset is not below end.
 */
static int IsMatch(const unsigned char* offset,
                   const unsigned char* end,
                   const char* key)
{
	int wanted;

	CHECK_LT(offset, end);

	wanted = *key;
	return wanted == *offset;
}
/*
 * Compare the byte at offset with the first character of key.
 * This variant is for the last character of a label, which is stored in the
 * graph with its top bit (0x80) set.
 *
 * Returns 1 on a match, 0 otherwise (including when offset is not below end).
 *
 * A key byte outside the ASCII range never matches. The explicit test makes
 * the result independent of whether plain 'char' is signed or unsigned: with
 * an unsigned 'char', (*key | 0x80) would let the byte 0xE1 match the stored
 * end character for 'a'.
 */
static int IsEndCharMatch(const unsigned char* offset,
                          const unsigned char* end,
                          const char* key)
{
	unsigned char c;

	if (offset >= end)
		return 0;

	c = (unsigned char) *key;
	if (c & 0x80)
		return 0; /* non-ASCII input can't be an encoded end character */

	return *offset == (c | 0x80);
}
/*
 * Read the return value stored at offset.
 * A return-value byte has its top three bits equal to 100 (mask 0xE0 yields
 * 0x80); its low four bits are stored into *return_value.
 * Returns 1 if a return value was read, 0 otherwise (offset not below end,
 * or the byte is not a return-value byte). *return_value is left untouched
 * when 0 is returned.
 */
static int GetReturnValue(const unsigned char* offset,
                          const unsigned char* end,
                          int* return_value)
{
	CHECK_LT(offset, end);

	if ((*offset & 0xE0) != 0x80)
		return 0;

	*return_value = *offset & 0x0F;
	return 1;
}
/*
 * Looks up the string |key| with length |key_length| in a fixed set of
 * strings. The set of strings must be known at compile time. It is converted to
 * a graph structure named a DAFSA (Deterministic Acyclic Finite State
 * Automaton) by the script make_dafsa.py during compilation. This permits
 * efficient (in time and space) lookup. The graph generated by make_dafsa.py
 * takes the form of a constant byte array which should be supplied via the
 * |graph| and |length| parameters.
 *
 * |key| does not need to be NUL-terminated; only |key_length| bytes are used.
 *
 * Returns -1 if |key| is not in the set. Otherwise returns the value stored
 * with the key, i.e. the low four bits of the return-value byte as decoded by
 * GetReturnValue() (range 0..15).
 */
/* prototype to skip warning with -Wmissing-prototypes */
int _HIDDEN LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);
int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
size_t length,
const char* key,
size_t key_length)
{
/* pos walks the current list of child offsets; offset is the node under
 * inspection. GetNextOffset() adds each decoded value onto offset, so
 * offset accumulates across the entries of one list. */
const unsigned char* pos = graph;
const unsigned char* end = graph + length;
const unsigned char* offset = pos;
const char* key_end = key + key_length;
while (GetNextOffset(&pos, end, &offset)) {
/*char <char>+ end_char offsets
* char <char>+ return value
* char end_char offsets
* char return value
* end_char offsets
* return_value
*/
/* did_consume records whether at least one key character was matched
 * inside this node; after that, any mismatch means the key is absent. */
int did_consume = 0;
if (key != key_end && !IsEOL(offset, end)) {
/* Leading <char> is not a match. Don't dive into this child */
if (!IsMatch(offset, end, key))
continue;
did_consume = 1;
++offset;
++key;
/* Possible matches at this point:
* <char>+ end_char offsets
* <char>+ return value
* end_char offsets
* return value
*/
/* Remove all remaining <char> nodes possible */
while (!IsEOL(offset, end) && key != key_end) {
if (!IsMatch(offset, end, key))
return -1;
++key;
++offset;
}
}
/* Possible matches at this point:
* end_char offsets
* return_value
* If one or more <char> elements were consumed, a failure
* to match is terminal. Otherwise, try the next node.
*/
if (key == key_end) {
/* The whole key has been consumed: it is in the set only if the
 * current byte is a return-value byte. */
int return_value;
if (GetReturnValue(offset, end, &return_value))
return return_value;
/* The DAFSA guarantees that if the first char is a match, all
* remaining char elements MUST match if the key is truly present.
*/
if (did_consume)
return -1;
continue;
}
if (!IsEndCharMatch(offset, end, key)) {
if (did_consume)
return -1; /* Unexpected */
continue;
}
/* The end character of this label matched: the bytes after it hold the
 * offset list of the children, so continue the search from there. */
++key;
pos = ++offset; /* Dive into child */
}
return -1; /* No match */
}

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# found in the LICENSE.chromium file.
"""
A Deterministic acyclic finite state automaton (DAFSA) is a compact
@ -421,7 +421,7 @@ def to_cxx(data):
text += 'The byte array encodes effective tld names. See make_dafsa.py for'
text += ' documentation.'
text += '*/\n\n'
text += 'const unsigned char kDafsa[%s] = {\n' % len(data)
text += 'static const unsigned char kDafsa[%s] = {\n' % len(data)
for i in range(0, len(data), 12):
text += ' '
text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
@ -450,7 +450,7 @@ def parse_gperf(infile):
raise InputError('Expected "domainname, <digit>", found "%s"' % line)
# Technically the DAFSA format could support return values in range [0-31],
# but the values below are the only with a defined meaning.
if line[-1] not in '01245':
if line[-1] not in '0123456789ABCDEF':
raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' %
line[-1])
return [line[:-3] + line[-1] for line in lines]

514
src/psl.c
View File

@ -32,6 +32,18 @@
# include <config.h>
#endif
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
#else
# define _GCC_VERSION_AT_LEAST(major, minor) 0
#endif
#if _GCC_VERSION_AT_LEAST(2,95)
# define _UNUSED __attribute__ ((unused))
#else
# define _UNUSED
#endif
/* if this file is included by psl2c, redefine to use requested library for builtin data */
#ifdef _LIBPSL_INCLUDED_BY_PSL2C
# undef WITH_LIBICU
@ -167,10 +179,10 @@ struct _psl_ctx_st {
/* include the PSL data compiled by 'psl2c' */
#ifndef _LIBPSL_INCLUDED_BY_PSL2C
# include "suffixes.c"
# include "suffixes_dafsa.c"
#else
/* if this source file is included by psl2c.c, provide empty builtin data */
static _psl_entry_t suffixes[1];
static const unsigned char kDafsa[1];
static time_t _psl_file_time;
static time_t _psl_compile_time;
static int _psl_nsuffixes;
@ -313,20 +325,196 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length)
return 0;
}
/* Locale-independent whitespace test: returns 1 for space, tab, carriage
 * return and line feed, 0 for every other character. */
static inline int _isspace_ascii(const char c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
/* Returns 1 if every byte of the NUL-terminated string s is below 128
 * (an empty string counts as ASCII), 0 as soon as a byte >= 128 is found. */
static int _str_is_ascii(const char *s)
{
	const unsigned char *p;

	for (p = (const unsigned char *) s; *p != 0; p++) {
		if (*p >= 128)
			return 0;
	}

	return 1;
}
#if defined(WITH_LIBIDN)
/*
 * Work around a libidn <= 1.30 vulnerability.
 *
 * Checks that utf8 is a structurally well-formed UTF-8 byte sequence before
 * it is handed to idna_to_ascii_8z(): every lead byte must be followed by
 * the number of continuation bytes (10xxxxxx) that it announces.
 * Because the terminating NUL is not a continuation byte, the check never
 * reads beyond the end of the string.
 *
 * Returns 1 if the sequence is well-formed, 0 otherwise.
 *
 * [1] http://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
 * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
 * [3] http://curl.haxx.se/mail/lib-2015-06/0143.html
 */
static int _utf8_is_valid(const char *utf8)
{
	const unsigned char *s = (const unsigned char *) utf8;

	while (*s) {
		int ntrail, i;

		if ((*s & 0x80) == 0)         /* 0xxxxxxx ASCII char */
			ntrail = 0;
		else if ((*s & 0xE0) == 0xC0) /* 110xxxxx + 1 continuation byte */
			ntrail = 1;
		else if ((*s & 0xF0) == 0xE0) /* 1110xxxx + 2 continuation bytes */
			ntrail = 2;
		else if ((*s & 0xF8) == 0xF0) /* 11110xxx + 3 continuation bytes */
			ntrail = 3;
		else
			return 0;                 /* stray continuation or invalid lead byte */

		/* check in order, so a NUL stops the scan before any later byte is read */
		for (i = 1; i <= ntrail; i++) {
			if ((s[i] & 0xC0) != 0x80)
				return 0;
		}

		s += ntrail + 1;
	}

	return 1;
}
#endif
/* Opaque IDNA conversion handle, so callers need not know which IDNA
 * library (if any) the build uses. */
typedef void *_psl_idna_t;

/*
 * Create an IDNA conversion handle.
 * With libicu this is the result of uidna_openUTS46(UIDNA_USE_STD3_RULES);
 * in every other configuration no handle is needed and NULL is returned.
 * Release the handle with _psl_idna_close().
 */
static _psl_idna_t *_psl_idna_open(void)
{
	_psl_idna_t *handle = NULL;
#ifdef WITH_LIBICU
	UErrorCode status = 0;

	handle = (void *) uidna_openUTS46(UIDNA_USE_STD3_RULES, &status);
#endif

	return handle;
}
/*
 * Release a handle obtained from _psl_idna_open().
 * With libicu a non-NULL handle is passed to uidna_close(); in every other
 * configuration (and for a NULL handle) this does nothing.
 */
static void _psl_idna_close(_psl_idna_t *idna _UNUSED)
{
#ifdef WITH_LIBICU
	if (idna != NULL) {
		uidna_close((UIDNA *) idna);
	}
#endif
}
/*
 * Convert the UTF-8 string utf8 into its ASCII (punycode) form using the
 * IDNA library selected at build time (libicu, libidn2 or libidn).
 *
 * idna:  handle from _psl_idna_open(); only used by the libicu variant,
 *        which does nothing (and fails) when it is NULL.
 * utf8:  NUL-terminated input string.
 * ascii: receives a newly allocated string on success; the caller releases
 *        it with free(). The libicu variant accepts NULL here to request a
 *        conversion check only.
 *
 * Returns 0 on success, -1 on failure, including allocation failure and
 * builds without any IDNA library. When the libicu variant returns 0 and
 * ascii is not NULL, *ascii is a valid string.
 */
static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char **ascii)
{
	int ret = -1;

#if defined(WITH_LIBICU)
	/* IDNA2008 UTS#46 punycode conversion */
	if (idna) {
		char lookupname[128] = "";
		UErrorCode status = 0;
		UIDNAInfo info = UIDNA_INFO_INITIALIZER;
		UChar utf16_dst[128], utf16_src[128];
		int32_t utf16_src_length;

		u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, utf8, -1, &status);
		if (U_SUCCESS(status)) {
			int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
			if (U_SUCCESS(status)) {
				/* Offer one byte less than the buffer size: a result that
				 * fills the offered capacity exactly is reported as
				 * U_STRING_NOT_TERMINATED_WARNING, which U_SUCCESS() accepts.
				 * The zero-initialized last byte then still terminates the
				 * string. */
				u_strToUTF8(lookupname, sizeof(lookupname) - 1, NULL, utf16_dst, dst_length, &status);
				if (U_SUCCESS(status)) {
					if (!ascii)
						ret = 0; /* caller only wanted to know whether conversion works */
					else if ((*ascii = strdup(lookupname)))
						ret = 0; /* report success only if the copy could be allocated */
				} /* else: failed to convert UTF-16 to UTF-8 */
			} /* else: failed to convert to ASCII */
		} /* else: failed to convert UTF-8 to UTF-16 */
	}
#elif defined(WITH_LIBIDN2)
	int rc;
	uint8_t *lower, resbuf[256];
	size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */

	/* we need a conversion to lowercase */
	lower = u8_tolower((uint8_t *)utf8, u8_strlen((uint8_t *)utf8), 0, UNINORM_NFKC, resbuf, &len);
	if (!lower) {
		/* u8_tolower() failed */
		return -1;
	}

	/* u8_tolower() does not terminate the result string */
	if (lower == resbuf) {
		lower[len]=0;
	} else {
		/* result did not fit into resbuf: make a terminated heap copy */
		uint8_t *tmp = lower;
		lower = (uint8_t *)strndup((char *)lower, len);
		free(tmp);
		if (!lower)
			return -1; /* out of memory */
	}

	if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK) {
		ret = 0;
	} /* else: toASCII failed, see idn2_strerror(rc) */

	if (lower != resbuf)
		free(lower);
#elif defined(WITH_LIBIDN)
	int rc;

	if (!_utf8_is_valid(utf8)) {
		/* invalid UTF-8 sequence, not converted */
		return -1;
	}

	/* idna_to_ascii_8z() automatically converts UTF-8 to lowercase */
	if ((rc = idna_to_ascii_8z(utf8, ascii, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) {
		ret = 0;
	} /* else: toASCII failed, see idna_strerror(rc) */
#endif

	return ret;
}
/*
 * If the label of entry e contains non-ASCII bytes, convert it to punycode
 * and append an additional entry with the converted label and the same
 * flags to vector v. Nothing is added when the label is plain ASCII, when
 * the conversion fails, or when the converted label equals the original.
 *
 * idna: handle from _psl_idna_open(), passed on to _psl_idna_toASCII().
 */
static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_entry_t *e)
{
	/* start from NULL so that free() below is safe on every path */
	char *lookupname = NULL;

	if (_str_is_ascii(e->label_buf))
		return;

	/* also require a non-NULL result: success must not lead to strcmp(NULL) */
	if (_psl_idna_toASCII(idna, e->label_buf, &lookupname) == 0 && lookupname) {
		if (strcmp(e->label_buf, lookupname)) {
			_psl_entry_t suffix, *suffixp;

			_suffix_init(&suffix, lookupname, strlen(lookupname));
			suffix.flags = e->flags;
			suffixp = _vector_get(v, _vector_add(v, &suffix));
			/* NOTE(review): presumably _vector_get() returns NULL when
			 * _vector_add() failed - confirm against the vector helpers */
			if (suffixp)
				suffixp->label = suffixp->label_buf; /* set label to changed address */
		} /* else ignore */
	}

	free(lookupname);
}
/* prototype */
int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
{
_psl_entry_t suffix, *rule;
_psl_entry_t suffix;
const char *p;
int builtin;
char *punycode = NULL;
int need_conversion = 0;
/* this function should be called without leading dots, just make sure */
suffix.label = domain + (*domain == '.');
suffix.length = strlen(suffix.label);
if (*domain == '.')
domain++;
suffix.nlabels = 1;
for (p = suffix.label; *p; p++)
for (p = domain; *p; p++) {
if (*p == '.')
suffix.nlabels++;
else if (*((unsigned char *)p) < 128)
need_conversion = 1; /* in case domain is non-ascii we need a toASCII conversion */
}
if (suffix.nlabels == 1) {
/* TLD, this is the prevailing '*' match.
@ -335,61 +523,111 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
return 1;
}
/* if domain has enough labels, it is public */
builtin = (psl == &_builtin_psl);
if (need_conversion) {
_psl_idna_t *idna = _psl_idna_open();
if (builtin)
rule = &suffixes[0];
else
rule = _vector_get(psl->suffixes, 0);
if (_psl_idna_toASCII(idna, domain, &punycode) == 0) {
suffix.label = punycode;
suffix.length = strlen(punycode);
} else {
/* fallback */
suffix.label = domain;
suffix.length = p - suffix.label;
}
if (!rule || rule->nlabels < suffix.nlabels - 1)
return 0;
if (rule == &suffixes[0])
rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare);
else
rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix));
if (rule) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
return 0;
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
return 0;
/* definitely a match, no matter if the found rule is a wildcard or not */
if (rule->flags & _PSL_FLAG_EXCEPTION)
return 0;
if (rule->flags & _PSL_FLAG_PLAIN)
return 1;
_psl_idna_close(idna);
} else {
suffix.label = domain;
suffix.length = p - suffix.label;
}
if ((suffix.label = strchr(suffix.label, '.'))) {
int pos = rule - suffixes;
if (psl == &_builtin_psl) {
int rc = LookupStringInFixedSet(kDafsa, sizeof(kDafsa), suffix.label, suffix.length);
if (rc != -1) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rc & _PSL_FLAG_ICANN))
goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rc & _PSL_FLAG_PRIVATE))
goto suffix_no;
suffix.label++;
suffix.length = strlen(suffix.label);
suffix.nlabels--;
if (rc & _PSL_FLAG_EXCEPTION)
goto suffix_no;
if (builtin)
rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare);
else
rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix)));
/* wildcard *.foo.bar implicitly make foo.bar a public suffix */
/* definitely a match, no matter if the found rule is a wildcard or not */
goto suffix_yes;
}
if ((suffix.label = strchr(suffix.label, '.'))) {
suffix.label++;
suffix.length = strlen(suffix.label);
suffix.nlabels--;
rc = LookupStringInFixedSet(kDafsa, sizeof(kDafsa), suffix.label, suffix.length);
if (rc != -1) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rc & _PSL_FLAG_ICANN))
goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rc & _PSL_FLAG_PRIVATE))
goto suffix_no;
if (rc & _PSL_FLAG_WILDCARD)
goto suffix_yes;
}
}
} else {
_psl_entry_t *rule = _vector_get(psl->suffixes, 0);
if (!rule || rule->nlabels < suffix.nlabels - 1)
return 0;
rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix));
if (rule) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
return 0;
goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
return 0;
goto suffix_no;
if ((rule->flags & _PSL_FLAG_WILDCARD))
return 1;
if (rule->flags & _PSL_FLAG_EXCEPTION)
goto suffix_no;
/* wildcard *.foo.bar implicitly make foo.bar a public suffix */
/* definitely a match, no matter if the found rule is a wildcard or not */
goto suffix_yes;
}
if ((suffix.label = strchr(suffix.label, '.'))) {
int pos;
suffix.label++;
suffix.length = strlen(suffix.label);
suffix.nlabels--;
rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix)));
if (rule) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
goto suffix_no;
if (rule->flags & _PSL_FLAG_WILDCARD)
goto suffix_yes;
}
}
}
suffix_no:
if (punycode)
free(punycode);
return 0;
suffix_yes:
if (punycode)
free(punycode);
return 1;
}
/**
@ -531,167 +769,6 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
return regdom;
}
static inline int _isspace_ascii(const char c)
{
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
static int _str_is_ascii(const char *s)
{
while (*s && *((unsigned char *)s) < 128) s++;
return !*s;
}
#if defined(WITH_LIBIDN)
/*
* Work around a libidn <= 1.30 vulnerability.
*
* The function checks for a valid UTF-8 character sequence before
* passing it to idna_to_ascii_8z().
*
* [1] http://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
* [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
* [3] http://curl.haxx.se/mail/lib-2015-06/0143.html
*/
static int _utf8_is_valid(const char *utf8)
{
const unsigned char *s = (const unsigned char *) utf8;
while (*s) {
if ((*s & 0x80) == 0) /* 0xxxxxxx ASCII char */
s++;
else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
if ((s[1] & 0xC0) != 0x80)
return 0;
s += 2;
} else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
return 0;
s += 3;
} else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
return 0;
s += 4;
} else
return 0;
}
return 1;
}
#endif
#if defined(WITH_LIBICU)
static void _add_punycode_if_needed(UIDNA *idna, _psl_vector_t *v, _psl_entry_t *e)
{
if (_str_is_ascii(e->label_buf))
return;
/* IDNA2008 UTS#46 punycode conversion */
if (idna) {
char lookupname[128] = "";
UErrorCode status = 0;
UIDNAInfo info = UIDNA_INFO_INITIALIZER;
UChar utf16_dst[128], utf16_src[128];
int32_t utf16_src_length;
u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, e->label_buf, -1, &status);
if (U_SUCCESS(status)) {
int32_t dst_length = uidna_nameToASCII(idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
if (U_SUCCESS(status)) {
u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
if (U_SUCCESS(status)) {
if (strcmp(e->label_buf, lookupname)) {
_psl_entry_t suffix, *suffixp;
/* fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */
_suffix_init(&suffix, lookupname, strlen(lookupname));
suffix.flags = e->flags;
suffixp = _vector_get(v, _vector_add(v, &suffix));
suffixp->label = suffixp->label_buf; /* set label to changed address */
} /* else ignore */
} /* else
fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
} /* else
fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); */
} /* else
fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); */
}
}
#elif defined(WITH_LIBIDN2)
static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e)
{
char *lookupname = NULL;
int rc;
uint8_t *lower, resbuf[256];
size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
if (_str_is_ascii(e->label_buf))
return;
/* we need a conversion to lowercase */
lower = u8_tolower((uint8_t *)e->label_buf, u8_strlen((uint8_t *)e->label_buf), 0, UNINORM_NFKC, resbuf, &len);
if (!lower) {
/* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", e->label_buf, errno); */
return;
}
/* u8_tolower() does not terminate the result string */
if (lower == resbuf) {
lower[len]=0;
} else {
uint8_t *tmp = lower;
lower = (uint8_t *)strndup((char *)lower, len);
free(tmp);
}
if ((rc = idn2_lookup_u8(lower, (uint8_t **)&lookupname, 0)) == IDN2_OK) {
if (strcmp(e->label_buf, lookupname)) {
_psl_entry_t suffix, *suffixp;
/* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */
_suffix_init(&suffix, lookupname, strlen(lookupname));
suffix.flags = e->flags;
suffixp = _vector_get(v, _vector_add(v, &suffix));
suffixp->label = suffixp->label_buf; /* set label to changed address */
} /* else ignore */
} /* else
fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */
if (lower != resbuf)
free(lower);
}
#elif defined(WITH_LIBIDN)
static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e)
{
char *lookupname = NULL;
int rc;
if (_str_is_ascii(e->label_buf))
return;
if (!_utf8_is_valid(e->label_buf)) {
/* fprintf(_(stderr, "Invalid UTF-8 sequence not converted: '%s'\n"), e->label_buf); */
return;
}
/* idna_to_ascii_8z() automatically converts UTF-8 to lowercase */
if ((rc = idna_to_ascii_8z(e->label_buf, &lookupname, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) {
if (strcmp(e->label_buf, lookupname)) {
_psl_entry_t suffix, *suffixp;
/* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */
_suffix_init(&suffix, lookupname, strlen(lookupname));
suffix.flags = e->flags;
suffixp = _vector_get(v, _vector_add(v, &suffix));
suffixp->label = suffixp->label_buf; /* set label to changed address */
} /* else ignore */
} /* else
fprintf(_(stderr, "toASCII failed (%d): %s\n"), rc, idna_strerror(rc)); */
}
#endif
/**
* psl_load_file:
* @fname: Name of PSL file
@ -740,10 +817,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
_psl_entry_t suffix, *suffixp;
char buf[256], *linep, *p;
int type = 0;
#ifdef WITH_LIBICU
UIDNA *idna;
UErrorCode status = 0;
#endif
_psl_idna_t *idna;
if (!fp)
return NULL;
@ -751,9 +825,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
if (!(psl = calloc(1, sizeof(psl_ctx_t))))
return NULL;
#ifdef WITH_LIBICU
idna = uidna_openUTS46(UIDNA_USE_STD3_RULES, &status);
#endif
idna = _psl_idna_open();
/*
* as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions.
@ -794,7 +866,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
continue;
}
p++;
/* wildcard *.foo.bar implicitely make foo.bar a public suffix */
/* wildcard *.foo.bar implicitly make foo.bar a public suffix */
suffix.flags = _PSL_FLAG_WILDCARD | _PSL_FLAG_PLAIN | type;
psl->nwildcards++;
psl->nsuffixes++;
@ -829,20 +901,14 @@ psl_ctx_t *psl_load_fp(FILE *fp)
}
suffixp->label = suffixp->label_buf; /* set label to changed address */
#ifdef WITH_LIBICU
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
_add_punycode_if_needed(psl->suffixes, suffixp);
#endif
}
}
_vector_sort(psl->suffixes);
#ifdef WITH_LIBICU
if (idna)
uidna_close(idna);
#endif
_psl_idna_close(idna);
return psl;
}
@ -1184,7 +1250,7 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
*
* Since: 0.4
*/
psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower)
psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale _UNUSED, char **lower)
{
int ret = PSL_ERR_INVALID_ARG;

View File

@ -128,39 +128,6 @@ static int _check_psl(const psl_ctx_t *psl)
}
#endif
static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *varname)
{
int it;
#ifdef BUILTIN_GENERATOR_LIBICU
do {
UVersionInfo version_info;
char version[U_MAX_VERSION_STRING_LENGTH];
u_getVersion(version_info);
u_versionToString(version_info, version);
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libicu/%s) */\n", version);
} while (0);
#elif defined(BUILTIN_GENERATOR_LIBIDN2)
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn2/%s) */\n", idn2_check_version(NULL));
#elif defined(BUILTIN_GENERATOR_LIBIDN)
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn/%s) */\n", stringprep_check_version(NULL));
#else
fprintf(fpout, "/* automatically generated by psl2c (without punycode support) */\n");
#endif
fprintf(fpout, "static _psl_entry_t %s[] = {\n", varname);
for (it = 0; it < v->cur; it++) {
_psl_entry_t *e = _vector_get(v, it);
fprintf(fpout, "\t{ \"%s\", NULL, %hd, %d, %d },\n",
e->label_buf, e->length, (int) e->nlabels, (int) e->flags);
}
fprintf(fpout, "};\n");
}
static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
{
FILE *fp;
@ -192,13 +159,14 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
while (*s && *s < 128) s++;
if (*s) continue;
fprintf(fp, "%s, %d\n", e->label_buf, (int) e->flags);
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
}
fclose(fp);
}
system("../tools/make_dafsa.py in.tmp out.tmp");
if ((it = system(MAKE_DAFSA " in.tmp out.tmp")))
fprintf(stderr, "Failed to execute " MAKE_DAFSA "\n");
if ((fp = fopen("out.tmp", "r"))) {
char buf[256];
@ -208,6 +176,9 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
fclose(fp);
}
unlink("in.tmp");
unlink("out.tmp");
}
#if 0
@ -262,15 +233,10 @@ int main(int argc, const char **argv)
#ifdef _GENERATE_BUILTIN_DATA
psl_ctx_t *psl;
#endif
int ret = 0, argpos = 1, dafsa = 0;
if (argc == 4 && !strcmp(argv[1], "--dafsa")) {
argpos = 2;
dafsa = 1;
}
int ret = 0, argpos = 1;
if (argc - argpos != 2) {
fprintf(stderr, "Usage: psl2c [--dafsa] <infile> <outfile>\n");
fprintf(stderr, "Usage: psl2c <infile> <outfile>\n");
fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
fprintf(stderr, " <outfile> is the the C filename to be generated from <infile>\n");
return 1;
@ -299,10 +265,7 @@ int main(int argc, const char **argv)
_add_punycode_if_needed(psl->suffixes);
#endif
if (dafsa)
_print_psl_entries_dafsa(fpout, psl->suffixes);
else
_print_psl_entries(fpout, psl->suffixes, "suffixes");
_print_psl_entries_dafsa(fpout, psl->suffixes);
snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]);
if ((pp = popen(cmd, "r"))) {