Use DAWG/DAFSA format for builtin data
This data representation reduces the size of the PSL data drastically and still allows fast lookups.
This commit is contained in:
parent
36139b601d
commit
0ca3741df6
1
AUTHORS
1
AUTHORS
|
@ -15,3 +15,4 @@ Dagobert Michelsen (Fixed Solaris building)
|
||||||
Christopher Meng (Fedora building)
|
Christopher Meng (Fedora building)
|
||||||
Jakub Čajka
|
Jakub Čajka
|
||||||
Giuseppe Scrivano
|
Giuseppe Scrivano
|
||||||
|
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
|
||||||
|
|
|
@ -14,7 +14,7 @@ Browsers and other web clients can use it to
|
||||||
|
|
||||||
Libpsl...
|
Libpsl...
|
||||||
|
|
||||||
- has built-in PSL data for fast access
|
- has built-in PSL data for fast access (DAWG/DAFSA reduces size from 180kB to ~32kB)
|
||||||
- allows to load PSL data from files
|
- allows to load PSL data from files
|
||||||
- checks if a given domain is a "public suffix"
|
- checks if a given domain is a "public suffix"
|
||||||
- provides immediate cookie domain verification
|
- provides immediate cookie domain verification
|
||||||
|
@ -28,6 +28,8 @@ Find more information about the Publix Suffix List [here](http://publicsuffix.or
|
||||||
|
|
||||||
Download the Public Suffix List [here](https://hg.mozilla.org/mozilla-central/raw-file/tip/netwerk/dns/effective_tld_names.dat).
|
Download the Public Suffix List [here](https://hg.mozilla.org/mozilla-central/raw-file/tip/netwerk/dns/effective_tld_names.dat).
|
||||||
|
|
||||||
|
The DAFSA code has been taken from [Chromium Project](https://code.google.com/p/chromium/).
|
||||||
|
|
||||||
|
|
||||||
API Documentation
|
API Documentation
|
||||||
-----------------
|
-----------------
|
||||||
|
@ -74,6 +76,8 @@ License
|
||||||
Libpsl is made available under the terms of the MIT license.<br>
|
Libpsl is made available under the terms of the MIT license.<br>
|
||||||
See the LICENSE file that accompanies this distribution for the full text of the license.
|
See the LICENSE file that accompanies this distribution for the full text of the license.
|
||||||
|
|
||||||
|
src/make_dafsa.py and src/lookup_string_in_fixed_set.c are licensed under the terms written in
|
||||||
|
src/LICENSE.chromium.
|
||||||
|
|
||||||
Building from git
|
Building from git
|
||||||
-----------------
|
-----------------
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
* The following License is for the source code files
|
||||||
|
make_dafsa.py and lookup_string_in_fixed_set.c.
|
||||||
|
|
||||||
|
// Copyright 2015 The Chromium Authors. All rights reserved.
|
||||||
|
//
|
||||||
|
// Redistribution and use in source and binary forms, with or without
|
||||||
|
// modification, are permitted provided that the following conditions are
|
||||||
|
// met:
|
||||||
|
//
|
||||||
|
// * Redistributions of source code must retain the above copyright
|
||||||
|
// notice, this list of conditions and the following disclaimer.
|
||||||
|
// * Redistributions in binary form must reproduce the above
|
||||||
|
// copyright notice, this list of conditions and the following disclaimer
|
||||||
|
// in the documentation and/or other materials provided with the
|
||||||
|
// distribution.
|
||||||
|
// * Neither the name of Google Inc. nor the names of its
|
||||||
|
// contributors may be used to endorse or promote products derived from
|
||||||
|
// this software without specific prior written permission.
|
||||||
|
//
|
||||||
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -1,12 +1,12 @@
|
||||||
# suffixes.c must be created before psl.c is compiled
|
# suffixes_dafsa.c must be created before psl.c is compiled
|
||||||
BUILT_SOURCES = suffixes.c
|
BUILT_SOURCES = suffixes_dafsa.c
|
||||||
|
|
||||||
# suffixes.c is a built source that must be cleaned
|
# suffixes_dafsa.c is a built source that must be cleaned
|
||||||
CLEANFILES = suffixes.c
|
CLEANFILES = suffixes_dafsa.c
|
||||||
|
|
||||||
lib_LTLIBRARIES = libpsl.la
|
lib_LTLIBRARIES = libpsl.la
|
||||||
|
|
||||||
libpsl_la_SOURCES = psl.c
|
libpsl_la_SOURCES = psl.c lookup_string_in_fixed_set.c
|
||||||
libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
|
libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
|
||||||
# include ABI version information
|
# include ABI version information
|
||||||
libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
|
libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
|
||||||
|
@ -21,8 +21,8 @@ if WITH_LIBIDN
|
||||||
endif
|
endif
|
||||||
|
|
||||||
noinst_PROGRAMS = psl2c
|
noinst_PROGRAMS = psl2c
|
||||||
psl2c_SOURCES = psl2c.c
|
psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
|
||||||
psl2c_CPPFLAGS = -I$(top_srcdir)/include
|
psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/make_dafsa.py\"
|
||||||
if BUILTIN_GENERATOR_LIBICU
|
if BUILTIN_GENERATOR_LIBICU
|
||||||
psl2c_LDADD = -licuuc
|
psl2c_LDADD = -licuuc
|
||||||
endif
|
endif
|
||||||
|
@ -33,8 +33,9 @@ if BUILTIN_GENERATOR_LIBIDN
|
||||||
psl2c_LDADD = @LTLIBICONV@ -lidn -lunistring
|
psl2c_LDADD = @LTLIBICONV@ -lidn -lunistring
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# Build rule for suffix.c
|
# Build rule for suffixes_dafsa.c
|
||||||
# PSL_FILE can be set by ./configure --with-psl-file=[PATH]
|
# PSL_FILE can be set by ./configure --with-psl-file=[PATH]
|
||||||
suffixes.c: $(PSL_FILE) psl2c$(EXEEXT)
|
suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
|
||||||
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes.c
|
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
|
||||||
./psl2c$(EXEEXT) --dafsa "$(PSL_FILE)" suffixes_dafsa.c
|
|
||||||
|
EXTRA_DIST = make_dafsa.py LICENSE.chromium
|
||||||
|
|
|
@ -0,0 +1,204 @@
|
||||||
|
/* Copyright 2015 The Chromium Authors. All rights reserved.
|
||||||
|
* Use of this source code is governed by a BSD-style license that can be
|
||||||
|
* found in the LICENSE.chromium file.
|
||||||
|
*
|
||||||
|
* Converted to C89 2015 by Tim Rühsen
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
|
||||||
|
# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
|
||||||
|
#else
|
||||||
|
# define _GCC_VERSION_AT_LEAST(major, minor) 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if _GCC_VERSION_AT_LEAST(4,0)
|
||||||
|
# define _HIDDEN __attribute__ ((visibility ("hidden")))
|
||||||
|
#else
|
||||||
|
# define _HIDDEN
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Leave the enclosing function with 0 ("false") unless a < b.
 * Both arguments are parenthesized and the body is wrapped in do/while(0)
 * so that the macro expands to exactly one statement and cannot capture a
 * following `else`. Must be used with a trailing semicolon. */
#define CHECK_LT(a, b) do { if ((a) >= (b)) return 0; } while (0)
|
||||||
|
|
||||||
|
/*
 * Read the next child offset from *pos and add it to *offset.
 *
 * Bits 0x60 of the first byte select the encoding: 0x60 means a three byte
 * offset, 0x40 a two byte offset, anything else a one byte offset.
 * If bit 0x80 of the first byte is set, *pos is moved to end (no further
 * offsets follow); otherwise *pos is advanced past the bytes just read.
 *
 * Returns 1 if an offset could be read, 0 otherwise.
 */
static int GetNextOffset(const unsigned char** pos,
	const unsigned char* end,
	const unsigned char** offset)
{
	const unsigned char* cur = *pos;
	size_t bytes_consumed;

	if (cur == end)
		return 0;

	/* When reading an offset the byte array must always contain at least
	 * three more bytes to consume. First the offset to read, then a node
	 * to skip over and finally a destination node. No object can be smaller
	 * than one byte.
	 * The check is done on the pointer difference: forming 'cur + 2' would
	 * already be undefined when fewer than two bytes remain. */
	if (end - cur < 3)
		return 0;

	switch (cur[0] & 0x60) {
	case 0x60: /* Read three byte offset */
		*offset += ((cur[0] & 0x1F) << 16) | (cur[1] << 8) | cur[2];
		bytes_consumed = 3;
		break;
	case 0x40: /* Read two byte offset */
		*offset += ((cur[0] & 0x1F) << 8) | cur[1];
		bytes_consumed = 2;
		break;
	default: /* Read one byte offset */
		*offset += cur[0] & 0x3F;
		bytes_consumed = 1;
		break;
	}

	if ((cur[0] & 0x80) != 0)
		*pos = end; /* marker bit set: this was the last offset */
	else
		*pos = cur + bytes_consumed;

	return 1;
}
|
||||||
|
|
||||||
|
/*
 * Tell whether the byte at offset carries the end-of-label marker
 * (bit 0x80). Returns 0 as well when offset does not lie before end.
 */
static int IsEOL(const unsigned char* offset, const unsigned char* end)
{
	if (offset >= end)
		return 0;

	return (*offset & 0x80) ? 1 : 0;
}
|
||||||
|
|
||||||
|
/*
 * Check if byte at offset matches first character in key.
 * This version matches characters not last in label.
 *
 * Returns 1 on a match, 0 on a mismatch or when offset does not lie
 * before end.
 */
static int IsMatch(const unsigned char* offset,
	const unsigned char* end,
	const char* key)
{
	unsigned char c;

	if (offset >= end)
		return 0;

	/* Compare as unsigned bytes so the result does not depend on whether
	 * plain 'char' is signed. A key byte with bit 0x80 set can never be a
	 * "not last in label" character, because that bit is what IsEOL()
	 * tests for in the graph. */
	c = (unsigned char) *key;
	if (c & 0x80)
		return 0;

	return *offset == c;
}
|
||||||
|
|
||||||
|
/*
 * Check if byte at offset matches first character in key.
 * This version matches characters last in label, i.e. graph bytes that
 * have bit 0x80 set in addition to the character value.
 *
 * Returns 1 on a match, 0 on a mismatch or when offset does not lie
 * before end.
 */
static int IsEndCharMatch(const unsigned char* offset,
	const unsigned char* end,
	const char* key)
{
	unsigned char c;

	if (offset >= end)
		return 0;

	/* Work on the unsigned byte value. Without this, on platforms where
	 * plain 'char' is unsigned a key byte such as 0xE1 would compare equal
	 * to the end-of-label encoding of 'a' (0x61 | 0x80). Key bytes that
	 * already have bit 0x80 set are never a match. */
	c = (unsigned char) *key;
	if (c & 0x80)
		return 0;

	return *offset == (c | 0x80);
}
|
||||||
|
|
||||||
|
/*
 * Read return value at offset.
 *
 * A return value node has the bit pattern 100x xxxx; the low four bits are
 * stored in *return_value.
 * Returns 1 if a return value could be read, 0 otherwise (in which case
 * *return_value is left untouched).
 */
static int GetReturnValue(const unsigned char* offset,
	const unsigned char* end,
	int* return_value)
{
	if (offset >= end || (*offset & 0xE0) != 0x80)
		return 0;

	*return_value = *offset & 0x0F;
	return 1;
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Looks up the string |key| with length |key_length| in a fixed set of
|
||||||
|
* strings. The set of strings must be known at compile time. It is converted to
|
||||||
|
* a graph structure named a DAFSA (Deterministic Acyclic Finite State
|
||||||
|
* Automaton) by the script make_dafsa.py during compilation. This permits
|
||||||
|
* efficient (in time and space) lookup. The graph generated by make_dafsa.py
|
||||||
|
* takes the form of a constant byte array which should be supplied via the
|
||||||
|
* |graph| and |length| parameters. The return value is kDafsaNotFound,
|
||||||
|
* kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule,
|
||||||
|
* kDafsaWildcardRule and kDafsaPrivateRule ORed together.
|
||||||
|
*
|
||||||
|
* Lookup a domain key in a byte array generated by make_dafsa.py.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* prototype to skip warning with -Wmissing-prototypes */
|
||||||
|
int _HIDDEN LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);
|
||||||
|
|
||||||
|
int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
||||||
|
size_t length,
|
||||||
|
const char* key,
|
||||||
|
size_t key_length)
|
||||||
|
{
|
||||||
|
const unsigned char* pos = graph;
|
||||||
|
const unsigned char* end = graph + length;
|
||||||
|
const unsigned char* offset = pos;
|
||||||
|
const char* key_end = key + key_length;
|
||||||
|
|
||||||
|
while (GetNextOffset(&pos, end, &offset)) {
|
||||||
|
/*char <char>+ end_char offsets
|
||||||
|
* char <char>+ return value
|
||||||
|
* char end_char offsets
|
||||||
|
* char return value
|
||||||
|
* end_char offsets
|
||||||
|
* return_value
|
||||||
|
*/
|
||||||
|
int did_consume = 0;
|
||||||
|
|
||||||
|
if (key != key_end && !IsEOL(offset, end)) {
|
||||||
|
/* Leading <char> is not a match. Don't dive into this child */
|
||||||
|
if (!IsMatch(offset, end, key))
|
||||||
|
continue;
|
||||||
|
did_consume = 1;
|
||||||
|
++offset;
|
||||||
|
++key;
|
||||||
|
/* Possible matches at this point:
|
||||||
|
* <char>+ end_char offsets
|
||||||
|
* <char>+ return value
|
||||||
|
* end_char offsets
|
||||||
|
* return value
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Remove all remaining <char> nodes possible */
|
||||||
|
while (!IsEOL(offset, end) && key != key_end) {
|
||||||
|
if (!IsMatch(offset, end, key))
|
||||||
|
return -1;
|
||||||
|
++key;
|
||||||
|
++offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Possible matches at this point:
|
||||||
|
* end_char offsets
|
||||||
|
* return_value
|
||||||
|
* If one or more <char> elements were consumed, a failure
|
||||||
|
* to match is terminal. Otherwise, try the next node.
|
||||||
|
*/
|
||||||
|
if (key == key_end) {
|
||||||
|
int return_value;
|
||||||
|
|
||||||
|
if (GetReturnValue(offset, end, &return_value))
|
||||||
|
return return_value;
|
||||||
|
/* The DAFSA guarantees that if the first char is a match, all
|
||||||
|
* remaining char elements MUST match if the key is truly present.
|
||||||
|
*/
|
||||||
|
if (did_consume)
|
||||||
|
return -1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!IsEndCharMatch(offset, end, key)) {
|
||||||
|
if (did_consume)
|
||||||
|
return -1; /* Unexpected */
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
++key;
|
||||||
|
pos = ++offset; /* Dive into child */
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1; /* No match */
|
||||||
|
}
|
|
@ -1,7 +1,7 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# Copyright 2014 The Chromium Authors. All rights reserved.
|
# Copyright 2014 The Chromium Authors. All rights reserved.
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
# found in the LICENSE file.
|
# found in the LICENSE.chromium file.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
A Deterministic acyclic finite state automaton (DAFSA) is a compact
|
A Deterministic acyclic finite state automaton (DAFSA) is a compact
|
||||||
|
@ -421,7 +421,7 @@ def to_cxx(data):
|
||||||
text += 'The byte array encodes effective tld names. See make_dafsa.py for'
|
text += 'The byte array encodes effective tld names. See make_dafsa.py for'
|
||||||
text += ' documentation.'
|
text += ' documentation.'
|
||||||
text += '*/\n\n'
|
text += '*/\n\n'
|
||||||
text += 'const unsigned char kDafsa[%s] = {\n' % len(data)
|
text += 'static const unsigned char kDafsa[%s] = {\n' % len(data)
|
||||||
for i in range(0, len(data), 12):
|
for i in range(0, len(data), 12):
|
||||||
text += ' '
|
text += ' '
|
||||||
text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
|
text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
|
||||||
|
@ -450,7 +450,7 @@ def parse_gperf(infile):
|
||||||
raise InputError('Expected "domainname, <digit>", found "%s"' % line)
|
raise InputError('Expected "domainname, <digit>", found "%s"' % line)
|
||||||
# Technically the DAFSA format could support return values in range [0-31],
|
# Technically the DAFSA format could support return values in range [0-31],
|
||||||
# but the values below are the only with a defined meaning.
|
# but the values below are the only with a defined meaning.
|
||||||
if line[-1] not in '01245':
|
if line[-1] not in '0123456789ABCDEF':
|
||||||
raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' %
|
raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' %
|
||||||
line[-1])
|
line[-1])
|
||||||
return [line[:-3] + line[-1] for line in lines]
|
return [line[:-3] + line[-1] for line in lines]
|
486
src/psl.c
486
src/psl.c
|
@ -32,6 +32,18 @@
|
||||||
# include <config.h>
|
# include <config.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
|
||||||
|
# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
|
||||||
|
#else
|
||||||
|
# define _GCC_VERSION_AT_LEAST(major, minor) 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if _GCC_VERSION_AT_LEAST(2,95)
|
||||||
|
# define _UNUSED __attribute__ ((unused))
|
||||||
|
#else
|
||||||
|
# define _UNUSED
|
||||||
|
#endif
|
||||||
|
|
||||||
/* if this file is included by psl2c, redefine to use requested library for builtin data */
|
/* if this file is included by psl2c, redefine to use requested library for builtin data */
|
||||||
#ifdef _LIBPSL_INCLUDED_BY_PSL2C
|
#ifdef _LIBPSL_INCLUDED_BY_PSL2C
|
||||||
# undef WITH_LIBICU
|
# undef WITH_LIBICU
|
||||||
|
@ -167,10 +179,10 @@ struct _psl_ctx_st {
|
||||||
|
|
||||||
/* include the PSL data compiled by 'psl2c' */
|
/* include the PSL data compiled by 'psl2c' */
|
||||||
#ifndef _LIBPSL_INCLUDED_BY_PSL2C
|
#ifndef _LIBPSL_INCLUDED_BY_PSL2C
|
||||||
# include "suffixes.c"
|
# include "suffixes_dafsa.c"
|
||||||
#else
|
#else
|
||||||
/* if this source file is included by psl2c.c, provide empty builtin data */
|
/* if this source file is included by psl2c.c, provide empty builtin data */
|
||||||
static _psl_entry_t suffixes[1];
|
static const unsigned char kDafsa[1];
|
||||||
static time_t _psl_file_time;
|
static time_t _psl_file_time;
|
||||||
static time_t _psl_compile_time;
|
static time_t _psl_compile_time;
|
||||||
static int _psl_nsuffixes;
|
static int _psl_nsuffixes;
|
||||||
|
@ -313,20 +325,196 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Return 1 if c is an ASCII space, tab, carriage return or line feed,
 * 0 for every other character. */
static inline int _isspace_ascii(const char c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
|
||||||
|
|
||||||
|
/* Return 1 if the NUL-terminated string s consists only of bytes below 128
 * (an empty string qualifies), 0 as soon as a byte >= 128 is found. */
static int _str_is_ascii(const char *s)
{
	const unsigned char *p;

	for (p = (const unsigned char *) s; *p != 0; p++) {
		if (*p >= 128)
			return 0;
	}

	return 1;
}
|
||||||
|
|
||||||
|
#if defined(WITH_LIBIDN)
|
||||||
|
/*
 * Work around a libidn <= 1.30 vulnerability.
 *
 * Checks that the NUL-terminated string is a structurally well-formed UTF-8
 * byte sequence (each lead byte followed by the right number of 10xxxxxx
 * continuation bytes) before it is passed to idna_to_ascii_8z().
 * Returns 1 if the sequence is well-formed, 0 otherwise.
 *
 * [1] http://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
 * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
 * [3] http://curl.haxx.se/mail/lib-2015-06/0143.html
 */
static int _utf8_is_valid(const char *utf8)
{
	const unsigned char *s = (const unsigned char *) utf8;

	while (*s) {
		int trail, i;

		/* number of continuation bytes announced by the lead byte */
		if ((*s & 0x80) == 0) /* 0xxxxxxx ASCII char */
			trail = 0;
		else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */
			trail = 1;
		else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */
			trail = 2;
		else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			trail = 3;
		else
			return 0;

		/* Checked in order and stopping at the first failure, so the
		 * terminating NUL is never stepped over. */
		for (i = 1; i <= trail; i++) {
			if ((s[i] & 0xC0) != 0x80)
				return 0;
		}

		s += trail + 1;
	}

	return 1;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Handle type passed to the _psl_idna_*() helpers. */
typedef void *_psl_idna_t;

/*
 * Create the IDNA conversion handle.
 * With WITH_LIBICU this is the result of uidna_openUTS46() called with
 * UIDNA_USE_STD3_RULES; in every other configuration it is NULL.
 */
static _psl_idna_t *_psl_idna_open(void)
{
	_psl_idna_t *handle = NULL;
#if defined(WITH_LIBICU)
	UErrorCode status = 0;

	handle = (void *)uidna_openUTS46(UIDNA_USE_STD3_RULES, &status);
#endif

	return handle;
}
|
||||||
|
|
||||||
|
/*
 * Release a handle obtained from _psl_idna_open().
 * With WITH_LIBICU a non-NULL handle is passed to uidna_close(); in every
 * other configuration the function does nothing.
 */
static void _psl_idna_close(_psl_idna_t *idna _UNUSED)
{
#if defined(WITH_LIBICU)
	if (!idna)
		return;

	uidna_close((UIDNA *)idna);
#endif
}
|
||||||
|
|
||||||
|
/*
 * Convert the UTF-8 domain name 'utf8' into its ASCII (punycode) form using
 * whichever IDNA library was selected at build time.
 *
 * idna:  handle from _psl_idna_open(); only used with WITH_LIBICU
 * utf8:  NUL-terminated input string
 * ascii: receives a newly allocated string that the caller has to free().
 *        With WITH_LIBICU it may be NULL to only test convertibility.
 *
 * Returns 0 on success, -1 on failure (conversion error, out of memory, or
 * no IDNA library compiled in).
 */
static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char **ascii)
{
	int ret = -1;

#if defined(WITH_LIBICU)
	/* IDNA2008 UTS#46 punycode conversion */
	if (idna) {
		char lookupname[128] = "";
		UErrorCode status = 0;
		UIDNAInfo info = UIDNA_INFO_INITIALIZER;
		UChar utf16_dst[128], utf16_src[128];
		int32_t utf16_src_length;

		u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, utf8, -1, &status);
		if (U_SUCCESS(status)) {
			int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
			if (U_SUCCESS(status)) {
				/* Offer one byte less than the buffer holds: a result that
				 * fills the buffer exactly is reported as a warning (still
				 * U_SUCCESS) and is not terminated. The zero-initialized
				 * last byte then always acts as terminator. */
				u_strToUTF8(lookupname, sizeof(lookupname) - 1, NULL, utf16_dst, dst_length, &status);
				if (U_SUCCESS(status)) {
					if (!ascii)
						ret = 0;
					else if ((*ascii = strdup(lookupname)))
						ret = 0; /* only report success if the copy exists */
				}
			}
		}
	}
#elif defined(WITH_LIBIDN2)
	uint8_t *lower, resbuf[256];
	size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */

	/* we need a conversion to lowercase */
	lower = u8_tolower((uint8_t *)utf8, u8_strlen((uint8_t *)utf8), 0, UNINORM_NFKC, resbuf, &len);
	if (!lower)
		return -1;

	/* u8_tolower() does not terminate the result string */
	if (lower == resbuf) {
		lower[len] = 0;
	} else {
		uint8_t *tmp = lower;

		lower = (uint8_t *)strndup((char *)lower, len);
		free(tmp);
		if (!lower)
			return -1; /* out of memory, nothing to look up */
	}

	if (idn2_lookup_u8(lower, (uint8_t **)ascii, 0) == IDN2_OK)
		ret = 0;

	if (lower != resbuf)
		free(lower);
#elif defined(WITH_LIBIDN)
	/* reject malformed input, see _utf8_is_valid() */
	if (!_utf8_is_valid(utf8))
		return -1;

	/* idna_to_ascii_8z() automatically converts UTF-8 to lowercase */
	if (idna_to_ascii_8z(utf8, ascii, IDNA_USE_STD3_ASCII_RULES) == IDNA_SUCCESS)
		ret = 0;
#endif

	return ret;
}
|
||||||
|
|
||||||
|
/*
 * If the label of entry 'e' contains non-ASCII bytes, convert it with
 * _psl_idna_toASCII() and, when the converted form differs, append an
 * additional entry holding that form (with the same flags) to vector 'v'.
 * Conversion or allocation failures leave 'v' unchanged.
 */
static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_entry_t *e)
{
	char *lookupname = NULL;

	if (_str_is_ascii(e->label_buf))
		return;

	if (_psl_idna_toASCII(idna, e->label_buf, &lookupname) != 0 || !lookupname)
		return;

	if (strcmp(e->label_buf, lookupname)) {
		_psl_entry_t suffix, *suffixp;

		/* Only add the entry if it could be initialized.
		 * NOTE(review): assumes _suffix_init() returns non-zero on
		 * failure; only its 'return 0' path is visible here. */
		if (_suffix_init(&suffix, lookupname, strlen(lookupname)) == 0) {
			suffix.flags = e->flags;

			/* _vector_get() may yield NULL (other callers check for it) */
			if ((suffixp = _vector_get(v, _vector_add(v, &suffix))))
				suffixp->label = suffixp->label_buf; /* set label to changed address */
		}
	} /* else ignore */

	free(lookupname);
}
|
||||||
|
|
||||||
|
/* prototype */
|
||||||
|
int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
|
||||||
|
|
||||||
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
|
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
|
||||||
{
|
{
|
||||||
_psl_entry_t suffix, *rule;
|
_psl_entry_t suffix;
|
||||||
const char *p;
|
const char *p;
|
||||||
int builtin;
|
char *punycode = NULL;
|
||||||
|
int need_conversion = 0;
|
||||||
|
|
||||||
/* this function should be called without leading dots, just make sure */
|
/* this function should be called without leading dots, just make sure */
|
||||||
suffix.label = domain + (*domain == '.');
|
if (*domain == '.')
|
||||||
suffix.length = strlen(suffix.label);
|
domain++;
|
||||||
|
|
||||||
suffix.nlabels = 1;
|
suffix.nlabels = 1;
|
||||||
|
|
||||||
for (p = suffix.label; *p; p++)
|
for (p = domain; *p; p++) {
|
||||||
if (*p == '.')
|
if (*p == '.')
|
||||||
suffix.nlabels++;
|
suffix.nlabels++;
|
||||||
|
else if (*((unsigned char *)p) >= 128) /* byte outside 7-bit ASCII */
|
||||||
|
need_conversion = 1; /* in case domain is non-ascii we need a toASCII conversion */
|
||||||
|
}
|
||||||
|
|
||||||
if (suffix.nlabels == 1) {
|
if (suffix.nlabels == 1) {
|
||||||
/* TLD, this is the prevailing '*' match.
|
/* TLD, this is the prevailing '*' match.
|
||||||
|
@ -335,61 +523,111 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if domain has enough labels, it is public */
|
if (need_conversion) {
|
||||||
builtin = (psl == &_builtin_psl);
|
_psl_idna_t *idna = _psl_idna_open();
|
||||||
|
|
||||||
if (builtin)
|
if (_psl_idna_toASCII(idna, domain, &punycode) == 0) {
|
||||||
rule = &suffixes[0];
|
suffix.label = punycode;
|
||||||
else
|
suffix.length = strlen(punycode);
|
||||||
rule = _vector_get(psl->suffixes, 0);
|
} else {
|
||||||
|
/* fallback */
|
||||||
|
suffix.label = domain;
|
||||||
|
suffix.length = p - suffix.label;
|
||||||
|
}
|
||||||
|
|
||||||
|
_psl_idna_close(idna);
|
||||||
|
} else {
|
||||||
|
suffix.label = domain;
|
||||||
|
suffix.length = p - suffix.label;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (psl == &_builtin_psl) {
|
||||||
|
int rc = LookupStringInFixedSet(kDafsa, sizeof(kDafsa), suffix.label, suffix.length);
|
||||||
|
if (rc != -1) {
|
||||||
|
/* check for correct rule type */
|
||||||
|
if (type == PSL_TYPE_ICANN && !(rc & _PSL_FLAG_ICANN))
|
||||||
|
goto suffix_no;
|
||||||
|
else if (type == PSL_TYPE_PRIVATE && !(rc & _PSL_FLAG_PRIVATE))
|
||||||
|
goto suffix_no;
|
||||||
|
|
||||||
|
if (rc & _PSL_FLAG_EXCEPTION)
|
||||||
|
goto suffix_no;
|
||||||
|
|
||||||
|
/* wildcard *.foo.bar implicitly make foo.bar a public suffix */
|
||||||
|
/* definitely a match, no matter if the found rule is a wildcard or not */
|
||||||
|
goto suffix_yes;
|
||||||
|
}
|
||||||
|
if ((suffix.label = strchr(suffix.label, '.'))) {
|
||||||
|
suffix.label++;
|
||||||
|
suffix.length = strlen(suffix.label);
|
||||||
|
suffix.nlabels--;
|
||||||
|
|
||||||
|
rc = LookupStringInFixedSet(kDafsa, sizeof(kDafsa), suffix.label, suffix.length);
|
||||||
|
if (rc != -1) {
|
||||||
|
/* check for correct rule type */
|
||||||
|
if (type == PSL_TYPE_ICANN && !(rc & _PSL_FLAG_ICANN))
|
||||||
|
goto suffix_no;
|
||||||
|
else if (type == PSL_TYPE_PRIVATE && !(rc & _PSL_FLAG_PRIVATE))
|
||||||
|
goto suffix_no;
|
||||||
|
|
||||||
|
if (rc & _PSL_FLAG_WILDCARD)
|
||||||
|
goto suffix_yes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
_psl_entry_t *rule = _vector_get(psl->suffixes, 0);
|
||||||
|
|
||||||
if (!rule || rule->nlabels < suffix.nlabels - 1)
|
if (!rule || rule->nlabels < suffix.nlabels - 1)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if (rule == &suffixes[0])
|
|
||||||
rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare);
|
|
||||||
else
|
|
||||||
rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix));
|
rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix));
|
||||||
|
|
||||||
if (rule) {
|
if (rule) {
|
||||||
/* check for correct rule type */
|
/* check for correct rule type */
|
||||||
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
|
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
|
||||||
return 0;
|
goto suffix_no;
|
||||||
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
|
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
|
||||||
return 0;
|
goto suffix_no;
|
||||||
|
|
||||||
/* definitely a match, no matter if the found rule is a wildcard or not */
|
|
||||||
if (rule->flags & _PSL_FLAG_EXCEPTION)
|
if (rule->flags & _PSL_FLAG_EXCEPTION)
|
||||||
return 0;
|
goto suffix_no;
|
||||||
if (rule->flags & _PSL_FLAG_PLAIN)
|
|
||||||
return 1;
|
/* wildcard *.foo.bar implicitly make foo.bar a public suffix */
|
||||||
|
/* definitely a match, no matter if the found rule is a wildcard or not */
|
||||||
|
goto suffix_yes;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((suffix.label = strchr(suffix.label, '.'))) {
|
if ((suffix.label = strchr(suffix.label, '.'))) {
|
||||||
int pos = rule - suffixes;
|
int pos;
|
||||||
|
|
||||||
suffix.label++;
|
suffix.label++;
|
||||||
suffix.length = strlen(suffix.label);
|
suffix.length = strlen(suffix.label);
|
||||||
suffix.nlabels--;
|
suffix.nlabels--;
|
||||||
|
|
||||||
if (builtin)
|
|
||||||
rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare);
|
|
||||||
else
|
|
||||||
rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix)));
|
rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix)));
|
||||||
|
|
||||||
if (rule) {
|
if (rule) {
|
||||||
/* check for correct rule type */
|
/* check for correct rule type */
|
||||||
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
|
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
|
||||||
return 0;
|
goto suffix_no;
|
||||||
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
|
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
|
||||||
|
goto suffix_no;
|
||||||
|
|
||||||
|
if (rule->flags & _PSL_FLAG_WILDCARD)
|
||||||
|
goto suffix_yes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
suffix_no:
|
||||||
|
if (punycode)
|
||||||
|
free(punycode);
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if ((rule->flags & _PSL_FLAG_WILDCARD))
|
suffix_yes:
|
||||||
|
if (punycode)
|
||||||
|
free(punycode);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -531,167 +769,6 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
|
||||||
return regdom;
|
return regdom;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Return 1 if c is an ASCII space, tab, carriage return or line feed,
 * 0 for every other character. */
static inline int _isspace_ascii(const char c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
|
|
||||||
|
|
||||||
/* Return 1 if the NUL-terminated string s consists only of bytes below 128
 * (an empty string qualifies), 0 as soon as a byte >= 128 is found. */
static int _str_is_ascii(const char *s)
{
	const unsigned char *p;

	for (p = (const unsigned char *) s; *p != 0; p++) {
		if (*p >= 128)
			return 0;
	}

	return 1;
}
|
|
||||||
|
|
||||||
#if defined(WITH_LIBIDN)
|
|
||||||
/*
 * Work around a libidn <= 1.30 vulnerability.
 *
 * Checks that the NUL-terminated string is a structurally well-formed UTF-8
 * byte sequence (each lead byte followed by the right number of 10xxxxxx
 * continuation bytes) before it is passed to idna_to_ascii_8z().
 * Returns 1 if the sequence is well-formed, 0 otherwise.
 *
 * [1] http://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
 * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
 * [3] http://curl.haxx.se/mail/lib-2015-06/0143.html
 */
static int _utf8_is_valid(const char *utf8)
{
	const unsigned char *s = (const unsigned char *) utf8;

	while (*s) {
		int trail, i;

		/* number of continuation bytes announced by the lead byte */
		if ((*s & 0x80) == 0) /* 0xxxxxxx ASCII char */
			trail = 0;
		else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */
			trail = 1;
		else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */
			trail = 2;
		else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			trail = 3;
		else
			return 0;

		/* Checked in order and stopping at the first failure, so the
		 * terminating NUL is never stepped over. */
		for (i = 1; i <= trail; i++) {
			if ((s[i] & 0xC0) != 0x80)
				return 0;
		}

		s += trail + 1;
	}

	return 1;
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(WITH_LIBICU)
|
|
||||||
static void _add_punycode_if_needed(UIDNA *idna, _psl_vector_t *v, _psl_entry_t *e)
|
|
||||||
{
|
|
||||||
if (_str_is_ascii(e->label_buf))
|
|
||||||
return;
|
|
||||||
|
|
||||||
/* IDNA2008 UTS#46 punycode conversion */
|
|
||||||
if (idna) {
|
|
||||||
char lookupname[128] = "";
|
|
||||||
UErrorCode status = 0;
|
|
||||||
UIDNAInfo info = UIDNA_INFO_INITIALIZER;
|
|
||||||
UChar utf16_dst[128], utf16_src[128];
|
|
||||||
int32_t utf16_src_length;
|
|
||||||
|
|
||||||
u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, e->label_buf, -1, &status);
|
|
||||||
if (U_SUCCESS(status)) {
|
|
||||||
int32_t dst_length = uidna_nameToASCII(idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
|
|
||||||
if (U_SUCCESS(status)) {
|
|
||||||
u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
|
|
||||||
if (U_SUCCESS(status)) {
|
|
||||||
if (strcmp(e->label_buf, lookupname)) {
|
|
||||||
_psl_entry_t suffix, *suffixp;
|
|
||||||
|
|
||||||
/* fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */
|
|
||||||
_suffix_init(&suffix, lookupname, strlen(lookupname));
|
|
||||||
suffix.flags = e->flags;
|
|
||||||
suffixp = _vector_get(v, _vector_add(v, &suffix));
|
|
||||||
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
|
||||||
} /* else ignore */
|
|
||||||
} /* else
|
|
||||||
fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
|
|
||||||
} /* else
|
|
||||||
fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); */
|
|
||||||
} /* else
|
|
||||||
fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); */
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#elif defined(WITH_LIBIDN2)
|
|
||||||
static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e)
|
|
||||||
{
|
|
||||||
char *lookupname = NULL;
|
|
||||||
int rc;
|
|
||||||
uint8_t *lower, resbuf[256];
|
|
||||||
size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
|
|
||||||
|
|
||||||
if (_str_is_ascii(e->label_buf))
|
|
||||||
return;
|
|
||||||
|
|
||||||
/* we need a conversion to lowercase */
|
|
||||||
lower = u8_tolower((uint8_t *)e->label_buf, u8_strlen((uint8_t *)e->label_buf), 0, UNINORM_NFKC, resbuf, &len);
|
|
||||||
if (!lower) {
|
|
||||||
/* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", e->label_buf, errno); */
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* u8_tolower() does not terminate the result string */
|
|
||||||
if (lower == resbuf) {
|
|
||||||
lower[len]=0;
|
|
||||||
} else {
|
|
||||||
uint8_t *tmp = lower;
|
|
||||||
lower = (uint8_t *)strndup((char *)lower, len);
|
|
||||||
free(tmp);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((rc = idn2_lookup_u8(lower, (uint8_t **)&lookupname, 0)) == IDN2_OK) {
|
|
||||||
if (strcmp(e->label_buf, lookupname)) {
|
|
||||||
_psl_entry_t suffix, *suffixp;
|
|
||||||
|
|
||||||
/* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */
|
|
||||||
_suffix_init(&suffix, lookupname, strlen(lookupname));
|
|
||||||
suffix.flags = e->flags;
|
|
||||||
suffixp = _vector_get(v, _vector_add(v, &suffix));
|
|
||||||
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
|
||||||
} /* else ignore */
|
|
||||||
} /* else
|
|
||||||
fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */
|
|
||||||
|
|
||||||
if (lower != resbuf)
|
|
||||||
free(lower);
|
|
||||||
}
|
|
||||||
#elif defined(WITH_LIBIDN)
|
|
||||||
static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e)
|
|
||||||
{
|
|
||||||
char *lookupname = NULL;
|
|
||||||
int rc;
|
|
||||||
|
|
||||||
if (_str_is_ascii(e->label_buf))
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (!_utf8_is_valid(e->label_buf)) {
|
|
||||||
/* fprintf(_(stderr, "Invalid UTF-8 sequence not converted: '%s'\n"), e->label_buf); */
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* idna_to_ascii_8z() automatically converts UTF-8 to lowercase */
|
|
||||||
|
|
||||||
if ((rc = idna_to_ascii_8z(e->label_buf, &lookupname, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) {
|
|
||||||
if (strcmp(e->label_buf, lookupname)) {
|
|
||||||
_psl_entry_t suffix, *suffixp;
|
|
||||||
|
|
||||||
/* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */
|
|
||||||
_suffix_init(&suffix, lookupname, strlen(lookupname));
|
|
||||||
suffix.flags = e->flags;
|
|
||||||
suffixp = _vector_get(v, _vector_add(v, &suffix));
|
|
||||||
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
|
||||||
} /* else ignore */
|
|
||||||
} /* else
|
|
||||||
fprintf(_(stderr, "toASCII failed (%d): %s\n"), rc, idna_strerror(rc)); */
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* psl_load_file:
|
* psl_load_file:
|
||||||
* @fname: Name of PSL file
|
* @fname: Name of PSL file
|
||||||
|
@ -740,10 +817,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
||||||
_psl_entry_t suffix, *suffixp;
|
_psl_entry_t suffix, *suffixp;
|
||||||
char buf[256], *linep, *p;
|
char buf[256], *linep, *p;
|
||||||
int type = 0;
|
int type = 0;
|
||||||
#ifdef WITH_LIBICU
|
_psl_idna_t *idna;
|
||||||
UIDNA *idna;
|
|
||||||
UErrorCode status = 0;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (!fp)
|
if (!fp)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -751,9 +825,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
||||||
if (!(psl = calloc(1, sizeof(psl_ctx_t))))
|
if (!(psl = calloc(1, sizeof(psl_ctx_t))))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
#ifdef WITH_LIBICU
|
idna = _psl_idna_open();
|
||||||
idna = uidna_openUTS46(UIDNA_USE_STD3_RULES, &status);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions.
|
* as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions.
|
||||||
|
@ -794,7 +866,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
p++;
|
p++;
|
||||||
/* wildcard *.foo.bar implicitely make foo.bar a public suffix */
|
/* wildcard *.foo.bar implicitly make foo.bar a public suffix */
|
||||||
suffix.flags = _PSL_FLAG_WILDCARD | _PSL_FLAG_PLAIN | type;
|
suffix.flags = _PSL_FLAG_WILDCARD | _PSL_FLAG_PLAIN | type;
|
||||||
psl->nwildcards++;
|
psl->nwildcards++;
|
||||||
psl->nsuffixes++;
|
psl->nsuffixes++;
|
||||||
|
@ -829,20 +901,14 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
||||||
}
|
}
|
||||||
|
|
||||||
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
||||||
#ifdef WITH_LIBICU
|
|
||||||
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
|
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
|
||||||
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
|
|
||||||
_add_punycode_if_needed(psl->suffixes, suffixp);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_vector_sort(psl->suffixes);
|
_vector_sort(psl->suffixes);
|
||||||
|
|
||||||
#ifdef WITH_LIBICU
|
_psl_idna_close(idna);
|
||||||
if (idna)
|
|
||||||
uidna_close(idna);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
return psl;
|
return psl;
|
||||||
}
|
}
|
||||||
|
@ -1184,7 +1250,7 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
|
||||||
*
|
*
|
||||||
* Since: 0.4
|
* Since: 0.4
|
||||||
*/
|
*/
|
||||||
psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower)
|
psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale _UNUSED, char **lower)
|
||||||
{
|
{
|
||||||
int ret = PSL_ERR_INVALID_ARG;
|
int ret = PSL_ERR_INVALID_ARG;
|
||||||
|
|
||||||
|
|
53
src/psl2c.c
53
src/psl2c.c
|
@ -128,39 +128,6 @@ static int _check_psl(const psl_ctx_t *psl)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *varname)
|
|
||||||
{
|
|
||||||
int it;
|
|
||||||
|
|
||||||
#ifdef BUILTIN_GENERATOR_LIBICU
|
|
||||||
do {
|
|
||||||
UVersionInfo version_info;
|
|
||||||
char version[U_MAX_VERSION_STRING_LENGTH];
|
|
||||||
|
|
||||||
u_getVersion(version_info);
|
|
||||||
u_versionToString(version_info, version);
|
|
||||||
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libicu/%s) */\n", version);
|
|
||||||
} while (0);
|
|
||||||
#elif defined(BUILTIN_GENERATOR_LIBIDN2)
|
|
||||||
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn2/%s) */\n", idn2_check_version(NULL));
|
|
||||||
#elif defined(BUILTIN_GENERATOR_LIBIDN)
|
|
||||||
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn/%s) */\n", stringprep_check_version(NULL));
|
|
||||||
#else
|
|
||||||
fprintf(fpout, "/* automatically generated by psl2c (without punycode support) */\n");
|
|
||||||
#endif
|
|
||||||
|
|
||||||
fprintf(fpout, "static _psl_entry_t %s[] = {\n", varname);
|
|
||||||
|
|
||||||
for (it = 0; it < v->cur; it++) {
|
|
||||||
_psl_entry_t *e = _vector_get(v, it);
|
|
||||||
|
|
||||||
fprintf(fpout, "\t{ \"%s\", NULL, %hd, %d, %d },\n",
|
|
||||||
e->label_buf, e->length, (int) e->nlabels, (int) e->flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(fpout, "};\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
|
static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
|
||||||
{
|
{
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
|
@ -192,13 +159,14 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
|
||||||
while (*s && *s < 128) s++;
|
while (*s && *s < 128) s++;
|
||||||
if (*s) continue;
|
if (*s) continue;
|
||||||
|
|
||||||
fprintf(fp, "%s, %d\n", e->label_buf, (int) e->flags);
|
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
|
||||||
}
|
}
|
||||||
|
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
}
|
}
|
||||||
|
|
||||||
system("../tools/make_dafsa.py in.tmp out.tmp");
|
if ((it = system(MAKE_DAFSA " in.tmp out.tmp")))
|
||||||
|
fprintf(stderr, "Failed to execute " MAKE_DAFSA "\n");
|
||||||
|
|
||||||
if ((fp = fopen("out.tmp", "r"))) {
|
if ((fp = fopen("out.tmp", "r"))) {
|
||||||
char buf[256];
|
char buf[256];
|
||||||
|
@ -208,6 +176,9 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
|
||||||
|
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unlink("in.tmp");
|
||||||
|
unlink("out.tmp");
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
@ -262,15 +233,10 @@ int main(int argc, const char **argv)
|
||||||
#ifdef _GENERATE_BUILTIN_DATA
|
#ifdef _GENERATE_BUILTIN_DATA
|
||||||
psl_ctx_t *psl;
|
psl_ctx_t *psl;
|
||||||
#endif
|
#endif
|
||||||
int ret = 0, argpos = 1, dafsa = 0;
|
int ret = 0, argpos = 1;
|
||||||
|
|
||||||
if (argc == 4 && !strcmp(argv[1], "--dafsa")) {
|
|
||||||
argpos = 2;
|
|
||||||
dafsa = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (argc - argpos != 2) {
|
if (argc - argpos != 2) {
|
||||||
fprintf(stderr, "Usage: psl2c [--dafsa] <infile> <outfile>\n");
|
fprintf(stderr, "Usage: psl2c <infile> <outfile>\n");
|
||||||
fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
|
fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
|
||||||
fprintf(stderr, " <outfile> is the the C filename to be generated from <infile>\n");
|
fprintf(stderr, " <outfile> is the the C filename to be generated from <infile>\n");
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -299,10 +265,7 @@ int main(int argc, const char **argv)
|
||||||
_add_punycode_if_needed(psl->suffixes);
|
_add_punycode_if_needed(psl->suffixes);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (dafsa)
|
|
||||||
_print_psl_entries_dafsa(fpout, psl->suffixes);
|
_print_psl_entries_dafsa(fpout, psl->suffixes);
|
||||||
else
|
|
||||||
_print_psl_entries(fpout, psl->suffixes, "suffixes");
|
|
||||||
|
|
||||||
snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]);
|
snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]);
|
||||||
if ((pp = popen(cmd, "r"))) {
|
if ((pp = popen(cmd, "r"))) {
|
||||||
|
|
Loading…
Reference in New Issue