Release v0.12.0

This commit is contained in:
Tim Rühsen 2016-01-02 14:30:36 +01:00
commit c69a18ff9b
21 changed files with 1895 additions and 441 deletions

View File

@ -1,29 +1,41 @@
sudo: false
language: c
compiler:
- gcc
- clang
# Change this to your needs
env:
- RUNTIME=libicu
- RUNTIME=libidn2
- RUNTIME=libidn
- RUNTIME=no
addons:
apt:
packages:
- automake
- autoconf
- autopoint
- libtool
- gtk-doc-tools
- gettext
- libidn11
- libidn11-dev
- libidn2-0
- libidn2-0-dev
- libicu48
- libicu-dev
- libunistring0
- libunistring-dev
script:
- ./autogen.sh
- ./configure && make -j4 && make check -j4
- ./configure --enable-runtime=libicu --enable-builtin=libicu && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=libicu --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=libicu --enable-builtin=libidn && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=libicu --disable-builtin && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=libidn2 --enable-builtin=libicu && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=libidn2 --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=libidn2 --enable-builtin=libidn && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=libidn2 --disable-builtin && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=libidn --enable-builtin=libicu && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=libidn --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=libidn --enable-builtin=libidn && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=libidn --disable-builtin && make clean && make -j4 && make check -j4
- ./configure --disable-runtime --enable-builtin=libicu && make clean && make -j4 && make check -j4
- ./configure --disable-runtime --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
- ./configure --disable-runtime --enable-builtin=libidn && make clean && make -j4 && make check -j4
- ./configure --disable-runtime --disable-builtin && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=$RUNTIME --enable-builtin=libicu && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=$RUNTIME --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=$RUNTIME --enable-builtin=libidn && make clean && make -j4 && make check -j4
- ./configure --enable-runtime=$RUNTIME --disable-builtin && make clean && make -j4 && make check -j4
- ./configure --enable-gtk-doc && make -j4 && make check -j4
- make distcheck
before_install:
- sudo apt-get -qq update
- sudo apt-get -q install autoconf automake autopoint libtool gtk-doc-tools gettext libidn11 libidn11-dev libidn2-0 libidn2-0-dev libicu48 libicu-dev libunistring0 libunistring-dev

View File

@ -15,3 +15,4 @@ Dagobert Michelsen (Fixed Solaris building)
Christopher Meng (Fedora building)
Jakub Čajka
Giuseppe Scrivano
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)

View File

@ -14,4 +14,8 @@ ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS}
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = libpsl.pc
EXTRA_DIST = config.rpath LICENSE $(PSL_FILE) list/tests/test_psl.txt
EXTRA_DIST = config.rpath LICENSE
dist-hook:
mkdir -p $(distdir)/list/tests
cp -p $(PSL_FILE) $(distdir)/list
cp -p $(PSL_TESTFILE) $(distdir)/list/tests

17
NEWS
View File

@ -1,10 +1,23 @@
Copyright (C) 2014-2015 Tim Rühsen
Copyright (C) 2014-2016 Tim Rühsen
02.01.2016 Release V0.12.0
* Load DAFSA binaries with psl_load_file() (format is auto-detected)
* Add more tests
* Remove psl_builtin_compile_time()
* Compile PSL into DAFSA using make_dafsa.py
* Avoid libicu dependency with --enable-runtime=no
* Test on new Travis-CI build farm
* Use DAFSA format for builtin PSL data
* Add function psl_is_public_suffix2()
* Fix psl_builtin_outdated()
* Fix several bugs
* Cleanup code
23.09.2015 Release V0.11.0
* Add new function psl_check_version_number()
* Add version defines to include file
19.09.2025 Release V0.10.0
19.09.2015 Release V0.10.0
* Code simplified
* Less data entries, faster lookups
* Add new function psl_suffix_wildcard_count()

View File

@ -14,7 +14,7 @@ Browsers and other web clients can use it to
Libpsl...
- has built-in PSL data for fast access
- has built-in PSL data for fast access (DAWG/DAFSA reduces size from 180kB to ~32kB)
- allows loading PSL data from files
- checks if a given domain is a "public suffix"
- provides immediate cookie domain verification
@ -28,6 +28,8 @@ Find more information about the Public Suffix List [here](http://publicsuffix.or
Download the Public Suffix List [here](https://hg.mozilla.org/mozilla-central/raw-file/tip/netwerk/dns/effective_tld_names.dat).
The DAFSA code has been taken from [Chromium Project](https://code.google.com/p/chromium/).
API Documentation
-----------------
@ -74,6 +76,8 @@ License
Libpsl is made available under the terms of the MIT license.<br>
See the LICENSE file that accompanies this distribution for the full text of the license.
src/make_dafsa.py and src/lookup_string_in_fixed_set.c are licensed under the terms given in
src/LICENSE.chromium.
Building from git
-----------------

View File

@ -1,21 +1,21 @@
#!/bin/sh -e
#!/bin/sh
AUTORECONF=$(which autoreconf 2>/dev/null || true)
AUTORECONF=$(which autoreconf 2>/dev/null)
if test $? -ne 0; then
echo "No 'autoreconf' found. You must install the autoconf package."
exit 1
fi
GIT=$(which git 2>/dev/null || true)
GIT=$(which git 2>/dev/null)
if test $? -ne 0; then
echo "No 'git' found. You must install the git package."
exit 1
fi
# create m4 before gtkdocize
mkdir m4 2>/dev/null || true
# create m4 before gtkdocize
mkdir -p m4 2>/dev/null
GTKDOCIZE=$(which gtkdocize 2>/dev/null || true)
GTKDOCIZE=$(which gtkdocize 2>/dev/null)
if test $? -ne 0; then
echo "No gtk-doc support found. You can't build the docs."
# rm because gtk-doc.make might be a link to a protected file
@ -24,7 +24,7 @@ if test $? -ne 0; then
echo "CLEANFILES =" >>gtk-doc.make
GTKDOCIZE=""
else
$GTKDOCIZE || exit $?
$GTKDOCIZE
fi
$GIT submodule init

View File

@ -1,5 +1,5 @@
AC_INIT([libpsl], [0.11.0], [tim.ruehsen@gmx.de], [libpsl], [http://github.com/rockdaboot/libpsl])
AC_INIT([libpsl], [0.12.0], [tim.ruehsen@gmx.de], [libpsl], [http://github.com/rockdaboot/libpsl])
AC_PREREQ([2.59])
AM_INIT_AUTOMAKE([1.10 -Wall no-define foreign])
@ -20,9 +20,9 @@ AC_C_INLINE
#
# Generate version defines for include file
#
AC_SUBST([LIBPSL_VERSION_MAJOR], [`echo -n $VERSION|cut -d'.' -f1`])
AC_SUBST([LIBPSL_VERSION_MINOR], [`echo -n $VERSION|cut -d'.' -f2`])
AC_SUBST([LIBPSL_VERSION_PATCH], [`echo -n $VERSION|cut -d'.' -f3`])
AC_SUBST([LIBPSL_VERSION_MAJOR], [`echo $VERSION|cut -d'.' -f1`])
AC_SUBST([LIBPSL_VERSION_MINOR], [`echo $VERSION|cut -d'.' -f2`])
AC_SUBST([LIBPSL_VERSION_PATCH], [`echo $VERSION|cut -d'.' -f3`])
AC_SUBST([LIBPSL_VERSION_NUMBER], [`printf '0x%02x%02x%02x' $LIBPSL_VERSION_MAJOR $LIBPSL_VERSION_MINOR $LIBPSL_VERSION_PATCH`])
AC_CONFIG_FILES([include/libpsl.h])
@ -85,7 +85,7 @@ PKG_PROG_PKG_CONFIG
# 4. If any interfaces have been added, removed, or changed since the last update, increment current, and set revision to 0.
# 5. If any interfaces have been added since the last public release, then increment age.
# 6. If any existing interfaces have been removed or changed since the last public release, then set age to 0.
AC_SUBST([LIBPSL_SO_VERSION], [4:0:4])
AC_SUBST([LIBPSL_SO_VERSION], [5:0:0])
AC_SUBST([LIBPSL_VERSION], $VERSION)
# Check for enable/disable builtin PSL data
@ -154,8 +154,10 @@ if test "$enable_runtime" = "libicu" -o "$enable_builtin" = "libicu"; then
# using AC_SEARCH_LIBS also don't work since functions have the library version appended
PKG_CHECK_MODULES([LIBICU], [icu-uc], [
HAVE_LIBICU=yes
LIBS="$LIBICU_LIBS $LIBS"
CFLAGS="$LIBICU_CFLAGS $CFLAGS"
if test "$enable_runtime" = "libicu"; then
LIBS="$LIBICU_LIBS $LIBS"
CFLAGS="$LIBICU_CFLAGS $CFLAGS"
fi
], [
OLDLIBS=$LIBS
LIBS="-licuuc $LIBS"
@ -216,6 +218,9 @@ elif test -n "$NEEDS_NSL" ; then
LIBS="$LIBS -lnsl"
fi
# Check for clock_gettime() used for performance measurement
AC_SEARCH_LIBS(clock_gettime, rt)
# Check for valgrind
ac_enable_valgrind=no
AC_ARG_ENABLE(valgrind-tests,
@ -252,7 +257,7 @@ AC_SUBST(PSL_TESTFILE)
# check for alloca / alloca.h
AC_FUNC_ALLOCA
AC_CHECK_FUNCS([strndup])
AC_CHECK_FUNCS([strndup clock_gettime])
# Override the template file name of the generated .pc file, so that there
# is no need to rename the template file when the API version changes.

View File

@ -14,7 +14,7 @@ make distclean > /dev/null || true
# We define _GNU_SOURCE to avoid warnings with missing prototypes.
# C89 does not know snprintf, strdup, strndup, popen, pclose
CFLAGS="-std=c89 -pedantic -O2 -g -Wall -Wextra -Wstrict-prototypes -Wold-style-definition -Wwrite-strings -Wshadow -Wformat -Wformat-security -Wunreachable-code -Wstrict-prototypes -Wmissing-prototypes -Wold-style-definition -D_GNU_SOURCE"
CFLAGS="-std=gnu89 -pedantic -O2 -g -Wall -Wextra -Wstrict-prototypes -Wold-style-definition -Wwrite-strings -Wshadow -Wformat -Wformat-security -Wunreachable-code -Wstrict-prototypes -Wmissing-prototypes -Wold-style-definition"
CACHEFILE=$PWD/config_check.cache
@ -40,7 +40,8 @@ for CC in gcc clang; do
for options in \
"--enable-runtime=libicu --enable-builtin=libicu" \
"--enable-runtime=libidn2 --enable-builtin=libidn2" \
"--enable-runtime=libidn --enable-builtin=libidn"; do
"--enable-runtime=libidn --enable-builtin=libidn" \
"--disable-runtime --enable-builtin=libicu"; do
export DISTCHECK_CONFIGURE_FLAGS="-C --cache-file=$CACHEFILE $options"
echo
echo " *** ./configure $DISTCHECK_CONFIGURE_FLAGS"

View File

@ -6,6 +6,9 @@ PSL_VERSION_MAJOR
PSL_VERSION_MINOR
PSL_VERSION_NUMBER
PSL_VERSION_PATCH
PSL_TYPE_ICANN
PSL_TYPE_PRIVATE
PSL_TYPE_ANY
psl_error_t
psl_ctx_t
psl_load_file
@ -13,12 +16,12 @@ psl_load_fp
psl_builtin
psl_free
psl_is_public_suffix
psl_is_public_suffix2
psl_unregistrable_domain
psl_registrable_domain
psl_suffix_count
psl_suffix_exception_count
psl_suffix_wildcard_count
psl_builtin_compile_time
psl_builtin_file_time
psl_builtin_sha1sum
psl_builtin_filename

View File

@ -1,5 +1,5 @@
/*
* Copyright(c) 2014-2015 Tim Ruehsen
* Copyright(c) 2014-2016 Tim Ruehsen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -44,6 +44,11 @@
extern "C" {
#endif
/* types for psl_is_public_suffix2() */
#define PSL_TYPE_ICANN (1<<0)
#define PSL_TYPE_PRIVATE (1<<1)
#define PSL_TYPE_ANY (PSL_TYPE_ICANN | PSL_TYPE_PRIVATE)
/**
* psl_error_t:
* @PSL_SUCCESS: Successful return.
@ -71,57 +76,75 @@ typedef struct _psl_ctx_st psl_ctx_t;
/* frees PSL context */
void
psl_free(psl_ctx_t *psl);
/* loads PSL data from file */
psl_ctx_t *
psl_load_file(const char *fname);
/* loads PSL data from FILE pointer */
psl_ctx_t *
psl_load_fp(FILE *fp);
/* retrieves builtin PSL data */
const psl_ctx_t *
psl_builtin(void);
/* checks whether domain is a public suffix or not */
int
psl_is_public_suffix(const psl_ctx_t *psl, const char *domain);
/* checks whether domain is a public suffix regarding the type or not */
int
psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type);
/* checks whether cookie_domain is acceptable for domain or not */
int
psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, const char *cookie_domain);
/* returns the longest not registrable domain within 'domain' or NULL if none found */
const char *
psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain);
/* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */
const char *
psl_registrable_domain(const psl_ctx_t *psl, const char *domain);
/* convert a string into lowercase UTF-8 */
psl_error_t
psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower);
/* does not include exceptions */
int
psl_suffix_count(const psl_ctx_t *psl);
/* just counts exceptions */
int
psl_suffix_exception_count(const psl_ctx_t *psl);
/* just counts wildcards */
int
psl_suffix_wildcard_count(const psl_ctx_t *psl);
/* returns compilation time */
time_t
psl_builtin_compile_time(void);
/* returns mtime of PSL source file */
time_t
psl_builtin_file_time(void);
/* returns SHA1 checksum (hex-encoded, lowercase) of PSL source file */
const char *
psl_builtin_sha1sum(void);
/* returns file name of PSL source file */
const char *
psl_builtin_filename(void);
/* returns library version string */
const char *
psl_get_version(void);
/* checks library version number */
int
psl_check_version_number(int version);
/* returns whether the built-in data is outdated or not */
int
psl_builtin_outdated(void);

2
list

@ -1 +1 @@
Subproject commit 2930bb4a5256279e0f7ba44cf9d174fc93ecb732
Subproject commit 1f3ad51171235aafe423435606e869f0161582e4

30
src/LICENSE.chromium Normal file
View File

@ -0,0 +1,30 @@
* The following License is for the source code files
make_dafsa.py and lookup_string_in_fixed_set.c.
// Copyright 2015 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,12 +1,12 @@
# suffixes_dafsa.c must be created before psl.c is compiled
BUILT_SOURCES = suffixes.c
BUILT_SOURCES = suffixes_dafsa.c
# suffixes_dafsa.c is a built source that must be cleaned
CLEANFILES = suffixes.c
CLEANFILES = suffixes_dafsa.c
lib_LTLIBRARIES = libpsl.la
libpsl_la_SOURCES = psl.c
libpsl_la_SOURCES = psl.c lookup_string_in_fixed_set.c
libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
# include ABI version information
libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
@ -21,8 +21,8 @@ if WITH_LIBIDN
endif
noinst_PROGRAMS = psl2c
psl2c_SOURCES = psl2c.c
psl2c_CPPFLAGS = -I$(top_srcdir)/include
psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/make_dafsa.py\"
if BUILTIN_GENERATOR_LIBICU
psl2c_LDADD = -licuuc
endif
@ -33,7 +33,9 @@ if BUILTIN_GENERATOR_LIBIDN
psl2c_LDADD = @LTLIBICONV@ -lidn -lunistring
endif
# Build rule for suffix.c
# Build rule for suffixes_dafsa.c
# PSL_FILE can be set by ./configure --with-psl-file=[PATH]
suffixes.c: $(PSL_FILE) psl2c$(EXEEXT)
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes.c
suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
EXTRA_DIST = make_dafsa.py LICENSE.chromium

View File

@ -0,0 +1,204 @@
/* Copyright 2015 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE.chromium file.
*
* Converted to C89 2015 by Tim Rühsen
*/
#include <stddef.h>
/* _GCC_VERSION_AT_LEAST(major, minor) is non-zero when the compiler defines
 * __GNUC__/__GNUC_MINOR__ and reports at least version major.minor;
 * it is 0 for every other compiler. */
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
#else
# define _GCC_VERSION_AT_LEAST(major, minor) 0
#endif
/* _HIDDEN applies the "hidden" symbol visibility attribute when GCC >= 4.0
 * is detected and expands to nothing otherwise. */
#if _GCC_VERSION_AT_LEAST(4,0)
# define _HIDDEN __attribute__ ((visibility ("hidden")))
#else
# define _HIDDEN
#endif
/* Make the calling function return 0 unless a < b.
 * Wrapped in do { } while (0) so the macro behaves like a single statement
 * (no dangling-else surprises), and both arguments are parenthesized so
 * that expressions can be passed safely. */
#define CHECK_LT(a, b) do { if ((a) >= (b)) return 0; } while (0)
/*
 * Read next offset from pos.
 * Returns true if an offset could be read, false otherwise.
 *
 * On success *offset is advanced by the decoded distance. *pos is moved
 * past the bytes that were consumed, or set to end when the link that was
 * just read has its high bit (0x80) set, i.e. it was the last link.
 */
static int GetNextOffset(const unsigned char** pos,
	const unsigned char* end,
	const unsigned char** offset)
{
	const unsigned char* p = *pos;
	size_t bytes_consumed;

	if (p == end)
		return 0;

	/* When reading an offset the byte array must always contain at least
	 * three more bytes to consume. First the offset to read, then a node
	 * to skip over and finally a destination node. No object can be smaller
	 * than one byte.
	 * The remaining length is compared instead of forming p + 2, because
	 * a pointer beyond one-past-the-end of the array is undefined. */
	if (end - p < 3)
		return 0;

	switch (p[0] & 0x60) {
	case 0x60: /* Read three byte offset */
		*offset += ((p[0] & 0x1F) << 16) | (p[1] << 8) | p[2];
		bytes_consumed = 3;
		break;
	case 0x40: /* Read two byte offset */
		*offset += ((p[0] & 0x1F) << 8) | p[1];
		bytes_consumed = 2;
		break;
	default: /* Read one byte offset */
		*offset += p[0] & 0x3F;
		bytes_consumed = 1;
		break;
	}

	if ((p[0] & 0x80) != 0)
		*pos = end; /* last link of this node */
	else
		*pos = p + bytes_consumed;

	return 1;
}
/*
 * Check if byte at offset is last in label.
 *
 * Returns 1 when offset lies before end and the byte has its high bit
 * (0x80) set, 0 otherwise (including when offset is not before end).
 */
static int IsEOL(const unsigned char* offset, const unsigned char* end)
{
	/* explicit bounds guard instead of a macro with a hidden return */
	if (offset >= end)
		return 0;

	return (*offset & 0x80) != 0;
}
/*
 * Check if byte at offset matches first character in key.
 * This version matches characters not last in label.
 *
 * Returns 1 on a match and 0 otherwise. A read at or beyond end never
 * matches. The key byte is compared as an unsigned value and only 7-bit
 * key bytes can match, so the result does not depend on whether plain
 * char is signed on the target platform.
 */
static int IsMatch(const unsigned char* offset,
	const unsigned char* end,
	const char* key)
{
	unsigned char c;

	if (offset >= end)
		return 0;

	c = (unsigned char) *key;
	return c < 0x80 && *offset == c;
}
/*
 * Check if byte at offset matches first character in key.
 * This version matches characters last in label.
 *
 * The last character of a label is stored with its high bit set, so the
 * graph byte is compared against (key byte | 0x80).
 * Returns 1 on a match and 0 otherwise. A read at or beyond end never
 * matches. Only 7-bit key bytes can match: where plain char is unsigned,
 * a key byte >= 0x80 would otherwise compare equal to an end-of-label
 * byte, while it never does where char is signed.
 */
static int IsEndCharMatch(const unsigned char* offset,
	const unsigned char* end,
	const char* key)
{
	unsigned char c;

	if (offset >= end)
		return 0;

	c = (unsigned char) *key;
	return c < 0x80 && *offset == (c | 0x80);
}
/*
 * Read return value at offset.
 * Returns true if a return value could be read, false otherwise.
 *
 * A return value byte has the bit pattern 100xxxxx; its low four bits are
 * stored in *return_value. *return_value is left untouched on failure.
 */
static int GetReturnValue(const unsigned char* offset,
	const unsigned char* end,
	int* return_value)
{
	/* explicit bounds guard instead of a macro with a hidden return */
	if (offset >= end)
		return 0;

	if ((*offset & 0xE0) != 0x80)
		return 0;

	*return_value = *offset & 0x0F;
	return 1;
}
/*
 * Look up the string |key| of |key_length| bytes in a DAFSA (Deterministic
 * Acyclic Finite State Automaton) byte array as produced by make_dafsa.py.
 * The array is passed in |graph| and is |length| bytes long.
 *
 * The graph is walked from its source node: each link of the current node
 * is decoded in turn and the child label is compared against the not yet
 * consumed part of the key. Lookups never backtrack, so once at least one
 * character of a child has been consumed, any later mismatch ends the
 * search.
 *
 * Returns the value stored at the matching node (the low four bits of the
 * return value byte), or -1 if the key is not contained in the graph.
 */
/* prototype to skip warning with -Wmissing-prototypes */
int _HIDDEN LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);

int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
	size_t length,
	const char* key,
	size_t key_length)
{
	const unsigned char* const graph_end = graph + length;
	const char* const key_end = key + key_length;
	const unsigned char* links = graph; /* next link to decode */
	const unsigned char* node = graph;  /* child the last link points to */

	while (GetNextOffset(&links, graph_end, &node)) {
		/* A child node has one of these layouts:
		 *   char <char>+ end_char offsets
		 *   char <char>+ return value
		 *   char end_char offsets
		 *   char return value
		 *   end_char offsets
		 *   return_value
		 */
		int consumed = 0;
		int value;

		if (key != key_end && !IsEOL(node, graph_end)) {
			/* Leading <char> differs: this is the wrong child, try the next link */
			if (!IsMatch(node, graph_end, key))
				continue;

			consumed = 1;

			/* Eat all remaining <char> bytes of the label; what is left
			 * afterwards is either "end_char offsets" or "return value". */
			for (++node, ++key; !IsEOL(node, graph_end) && key != key_end; ++node, ++key) {
				if (!IsMatch(node, graph_end, key))
					return -1;
			}
		}

		if (key == key_end) {
			/* Key exhausted: only a return value at this position is a hit */
			if (GetReturnValue(node, graph_end, &value))
				return value;

			/* If the first char matched, all remaining chars must match
			 * for the key to be present, so this failure is final. */
			if (consumed)
				return -1;
			continue;
		}

		if (IsEndCharMatch(node, graph_end, key)) {
			++key;
			links = ++node; /* Dive into child */
		} else if (consumed) {
			return -1; /* Unexpected */
		}
		/* otherwise nothing was consumed: try the next link */
	}

	return -1; /* No match */
}

588
src/make_dafsa.py Executable file
View File

@ -0,0 +1,588 @@
#!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE.chromium file.
"""
A Deterministic acyclic finite state automaton (DAFSA) is a compact
representation of an unordered word list (dictionary).
http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
This python program converts a list of strings to a byte array in C++.
This python program fetches strings and return values from a gperf file
and generates a C++ file with a byte array representing graph that can be
used as a memory efficient replacement for the perfect hash table.
The input strings are assumed to consist of printable 7-bit ASCII characters
and the return values are assumed to be one digit integers.
In this program a DAFSA is a diamond shaped graph starting at a common
source node and ending at a common sink node. All internal nodes contain
a label and each word is represented by the labels in one path from
the source node to the sink node.
The following python representation is used for nodes:
Source node: [ children ]
Internal node: (label, [ children ])
Sink node: None
The graph is first compressed by prefixes like a trie. In the next step
suffixes are compressed so that the graph gets diamond shaped. Finally
one to one linked nodes are replaced by nodes with the labels joined.
The order of the operations is crucial since lookups will be performed
starting from the source with no backtracking. Thus a node must have at
most one child with a label starting by the same character. The output
is also arranged so that all jumps are to increasing addresses, thus forward
in memory.
The generated output has suffix free decoding so that the sign of leading
bits in a link (a reference to a child node) indicate if it has a size of one,
two or three bytes and if it is the last outgoing link from the actual node.
A node label is terminated by a byte with the leading bit set.
The generated byte array can be described by the following BNF:
<byte> ::= < 8-bit value in range [0x00-0xFF] >
<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
<offset1> ::= < byte in range [0x00-0x3F] >
<offset2> ::= < byte in range [0x40-0x5F] >
<offset3> ::= < byte in range [0x60-0x7F] >
<end_offset1> ::= < byte in range [0x80-0xBF] >
<end_offset2> ::= < byte in range [0xC0-0xDF] >
<end_offset3> ::= < byte in range [0xE0-0xFF] >
<prefix> ::= <char>
<label> ::= <end_char>
| <char> <label>
<end_label> ::= <return_value>
| <char> <end_label>
<offset> ::= <offset1>
| <offset2> <byte>
| <offset3> <byte> <byte>
<end_offset> ::= <end_offset1>
| <end_offset2> <byte>
| <end_offset3> <byte> <byte>
<offsets> ::= <end_offset>
| <offset> <offsets>
<source> ::= <offsets>
<node> ::= <label> <offsets>
| <prefix> <node>
| <end_label>
<dafsa> ::= <source>
| <dafsa> <node>
Decoding:
<char> -> printable 7-bit ASCII character
<end_char> & 0x7F -> printable 7-bit ASCII character
<return value> & 0x0F -> integer
<offset1> & 0x3F -> integer
((<offset2> & 0x1F) << 8) + <byte> -> integer
((<offset3> & 0x1F) << 16) + (<byte> << 8) + <byte> -> integer
end_offset1, end_offset2 and end_offset3 are decoded the same way as offset1,
offset2 and offset3 respectively.
The first offset in a list of offsets is the distance in bytes between the
offset itself and the first child node. Subsequent offsets are the distance
between previous child node and next child node. Thus each offset links a node
to a child node. The distance is always counted between start addresses, i.e.
first byte in decoded offset or first byte in child node.
Example 1:
%%
aa, 1
a, 2
%%
The input is first parsed to a list of words:
["aa1", "a2"]
A fully expanded graph is created from the words:
source = [node1, node4]
node1 = ("a", [node2])
node2 = ("a", [node3])
node3 = ("\x01", [sink])
node4 = ("a", [node5])
node5 = ("\x02", [sink])
sink = None
Compression results in the following graph:
source = [node1]
node1 = ("a", [node2, node3])
node2 = ("\x02", [sink])
node3 = ("a\x01", [sink])
sink = None
A C++ representation of the compressed graph is generated:
const unsigned char dafsa[7] = {
0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
};
The bytes in the generated array have the following meaning:
0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
4: 0x82 <return_value> 0x82 & 0x0F -> return 2
5: 0x61 <char> label character 0x61 -> match "a"
6: 0x81 <return_value> 0x81 & 0x0F -> return 1
Example 2:
%%
aa, 1
bbb, 2
baa, 1
%%
The input is first parsed to a list of words:
["aa1", "bbb2", "baa1"]
Compression results in the following graph:
source = [node1, node2]
node1 = ("b", [node2, node3])
node2 = ("aa\x01", [sink])
node3 = ("bb\x02", [sink])
sink = None
A C++ representation of the compressed graph is generated:
const unsigned char dafsa[11] = {
0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
};
The bytes in the generated array have the following meaning:
0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
5: 0x61 <char> label character 0x61 -> match "a"
6: 0x61 <char> label character 0x61 -> match "a"
7: 0x81 <return_value> 0x81 & 0x0F -> return 1
8: 0x62 <char> label character 0x62 -> match "b"
9: 0x62 <char> label character 0x62 -> match "b"
10: 0x82 <return_value> 0x82 & 0x0F -> return 2
"""
import sys
class InputError(Exception):
    """Signals that the input file or word list is malformed."""
    pass
def to_dafsa(words):
    """Builds a fully expanded DAFSA from a word list; returns the source node.

    Every word becomes its own chain of single-character nodes. The last
    character of a word is its return value, given as one hexadecimal digit;
    it is stored as chr(value & 0x0F) in a node whose only child is the sink
    (None).

    Raises InputError if the word list is empty or a character is not
    printable 7-bit ASCII.
    """
    if not words:
        raise InputError('The domain list must not be empty')

    def chain(word):
        """Turns one word into a linked chain of one-character nodes."""
        for char in word:
            if not 0x1F < ord(char) < 0x80:
                raise InputError('Domain names must be printable 7-bit ASCII')
        # The chain is built back to front, starting at the return value.
        node = (chr(int(word[-1], 16) & 0x0F), [None])
        for char in reversed(word[:-1]):
            node = (char, [node])
        return node

    return [chain(word) for word in words]
def to_words(node):
    """Lists the words spelled by all paths from an internal node to the sink.

    The sink (None) contributes the single empty word.
    """
    if not node:
        return ['']
    words = []
    for child in node[1]:
        for tail in to_words(child):
            words.append(node[0] + tail)
    return words
def reverse(dafsa):
    """Returns a reversed copy of the DAFSA: the old sink becomes the source.

    Every old node gets exactly one new node carrying the reversed label,
    with the (new) parents of the old node as its children.
    """
    new_sources = []
    mirrored = {}

    def visit(node, parent):
        """Creates or extends the mirror node of |node|."""
        if not node:
            # Reached the old sink: the mirrored parent is a new source child.
            new_sources.append(parent)
            return
        key = id(node)
        if key in mirrored:
            mirrored[key][1].append(parent)
            return
        twin = (node[0][::-1], [parent])
        mirrored[key] = twin
        for child in node[1]:
            visit(child, twin)

    for source in dafsa:
        visit(source, None)
    return new_sources
def join_labels(dafsa):
    """Returns a new DAFSA in which one-to-one linked nodes are merged.

    A node is merged with its child when the child is its only child and the
    node is the only parent of that child. The sink starts with a reference
    count of 2, so it is never merged into a parent.
    """
    refs = {id(None): 2}
    merged = {id(None): None}

    def count_parents(node):
        """Counts incoming references of every reachable node."""
        key = id(node)
        if key in refs:
            refs[key] += 1
            return
        refs[key] = 1
        for child in node[1]:
            count_parents(child)

    def join(node):
        """Returns the (possibly merged) replacement for |node|."""
        key = id(node)
        if key in merged:
            return merged[key]
        children = [join(child) for child in node[1]]
        if len(children) == 1 and refs[id(node[1][0])] == 1:
            only = children[0]
            merged[key] = (node[0] + only[0], only[1])
        else:
            merged[key] = (node[0], children)
        return merged[key]

    for source in dafsa:
        count_parents(source)
    return [join(source) for source in dafsa]
def join_suffixes(dafsa):
    """Returns a new DAFSA in which nodes spelling the same set of words
    towards the sink are replaced by one shared node.
    """
    canonical = {frozenset(('',)): None}

    def join(node):
        """Returns the shared node for |node|, creating it on first use.

        The graph is visited in depth-first order.
        """
        key = frozenset(to_words(node))
        if key in canonical:
            return canonical[key]
        canonical[key] = (node[0], [join(child) for child in node[1]])
        return canonical[key]

    return [join(source) for source in dafsa]
def top_sort(dafsa):
    """Returns the nodes of the DAFSA in topological order (parents first)."""
    pending = {}

    def count_incoming(node):
        """Counts incoming references of every reachable internal node."""
        if not node:
            return
        key = id(node)
        if key in pending:
            pending[key] += 1
            return
        pending[key] = 1
        for child in node[1]:
            count_incoming(child)

    for source in dafsa:
        count_incoming(source)

    # The reference held by the source list itself does not count.
    for source in dafsa:
        pending[id(source)] -= 1

    ready = [source for source in dafsa if pending[id(source)] == 0]
    ordered = []

    while ready:
        node = ready.pop()
        assert pending[id(node)] == 0
        ordered.append(node)
        for child in node[1]:
            if not child:
                continue
            pending[id(child)] -= 1
            if pending[id(child)] == 0:
                ready.append(child)
    return ordered
def encode_links(children, offsets, current):
    """Encodes the links to |children| as one, two or three byte offsets.

    |offsets| maps id(child) to the position of the already encoded child and
    |current| is the current length of the output. The bytes are returned in
    reversed order, matching the reversed output buffer they are appended to.
    """
    if not children[0]:
        # This is an <end_label> node and no links follow such nodes
        assert len(children) == 1
        return []
    guess = 3 * len(children)
    assert children
    ordered = sorted(children, key=lambda node: -offsets[id(node)])

    def link_bytes(distance):
        """Returns the byte sequence for a single distance."""
        if distance < (1 << 6):
            # A 6-bit offset: "s0xxxxxx"
            return [distance]
        if distance < (1 << 13):
            # A 13-bit offset: "s10xxxxxxxxxxxxx"
            return [0x40 | (distance >> 8), distance & 0xFF]
        # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
        return [0x60 | (distance >> 16), (distance >> 8) & 0xFF,
                distance & 0xFF]

    # The distances depend on the total size of the encoded links, which is
    # not known in advance: encode with a guessed size and repeat with the
    # size actually produced until both agree.
    while True:
        cursor = current + guess
        buf = []
        for child in ordered:
            last = len(buf)
            distance = cursor - offsets[id(child)]
            assert distance > 0 and distance < (1 << 21)
            buf.extend(link_bytes(distance))
            # Distance in first link is relative to following record.
            # Distance in other links are relative to previous link.
            cursor -= distance
        if len(buf) == guess:
            break
        guess = len(buf)

    # Set most significant bit to mark end of links in this node.
    buf[last] |= (1 << 7)
    buf.reverse()
    return buf
def encode_prefix(label):
    """Encodes a node label as a list of bytes without a trailing high byte.

    This form is used for a node with exactly one child that follows
    immediately after it, so that no jump is needed. The label then acts as
    a prefix of the label in the child node. The bytes are returned in
    reversed order.
    """
    assert label
    return list(map(ord, reversed(label)))
def encode_label(label):
  """Returns the reversed character codes of a label with an end marker.

  The first byte of the returned list has its most significant bit set,
  which marks the end of the label in the encoded node.
  """
  encoded = encode_prefix(label)
  encoded[0] = encoded[0] | (1 << 7)
  return encoded
def encode(dafsa):
  """Serializes a DAFSA into a flat list of byte values.

  Nodes are emitted in reverse topological order and the finished list is
  reversed once at the end. The links to the source nodes are written last.
  """
  output = []
  offsets = {}
  for node in reversed(top_sort(dafsa)):
    label = node[0]
    children = node[1]
    # A node whose only child was emitted directly before it needs no link:
    # its label is written as a plain prefix of the child's label.
    follows_only_child = (len(children) == 1 and children[0] and
                          offsets[id(children[0])] == len(output))
    if follows_only_child:
      output.extend(encode_prefix(label))
    else:
      output.extend(encode_links(children, offsets, len(output)))
      output.extend(encode_label(label))
    offsets[id(node)] = len(output)
  output.extend(encode_links(dafsa, offsets, len(output)))
  output.reverse()
  return output
def to_cxx(data):
  """Renders encoded bytes as the source text of a C/C++ array 'kDafsa'.

  The array initializer holds up to twelve hex values per row.
  """
  row_width = 12
  pieces = [
      '/* This file is generated. DO NOT EDIT!\n\n',
      'The byte array encodes effective tld names. See make_dafsa.py for',
      ' documentation.',
      '*/\n\n',
      'static const unsigned char kDafsa[%s] = {\n' % len(data),
  ]
  for start in range(0, len(data), row_width):
    row = data[start:start + row_width]
    pieces.append(' ' + ', '.join(['0x%02x' % byte for byte in row]) + ',\n')
  pieces.append('};\n')
  return ''.join(pieces)
def words_to_whatever(words, converter):
  """Builds a DAFSA from a word list and passes its bytes to 'converter'.

  The graph is reduced by the fixed sequence of passes below before it is
  encoded; the return value is whatever 'converter' returns.
  """
  graph = to_dafsa(words)
  passes = (reverse, join_suffixes, reverse, join_suffixes, join_labels)
  for transform in passes:
    graph = transform(graph)
  encoded = encode(graph)
  return converter(encoded)
def words_to_cxx(words):
  """Returns C/C++ source text holding the DAFSA built from a word list."""
  source_text = words_to_whatever(words, to_cxx)
  return source_text
def words_to_binary(words):
  """Returns the DAFSA built from a word list as a bytearray."""
  blob = words_to_whatever(words, bytearray)
  return blob
def parse_psl2c(infile):
  """Parses lines written by psl2c and returns the words for the DAFSA.

  Every input line must look like "<domainname>, <X>" where <X> is a single
  upper-case hex digit. The separator is dropped, so "co.uk, 4" becomes
  "co.uk4".

  Args:
    infile: iterable of text lines.

  Returns:
    A list with one "<domainname><X>" string per input line.

  Raises:
    InputError: a line does not end in ", <X>" or <X> is not a hex digit.
  """
  words = []
  for line in infile:
    line = line.strip()
    if line[-3:-1] != ', ':
      raise InputError('Expected "domainname, <digit>", found "%s"' % line)
    # The value is a 4-bit flag mask, so any single hex digit is accepted.
    if line[-1] not in '0123456789ABCDEF':
      raise InputError('Expected value to be a hex digit [0-9A-F], found "%s"' %
                       line[-1])
    words.append(line[:-3] + line[-1])
  return words
def parse_psl(infile):
  """Parses a Public Suffix List file and returns the words for the DAFSA.

  Each returned word is the IDNA (punycode) form of a rule's domain followed
  by one hex digit holding the low four flag bits: exception (1),
  wildcard (2), ICANN section (4), PRIVATE section (8).

  Blank lines, comments and plain rules without a dot are skipped. A rule
  whose domain was already seen is reported on stdout and ignored.

  Args:
    infile: iterable of lines (UTF-8 byte strings on Python 2, text on
      Python 3).

  Returns:
    A list of "<domain><X>" strings in dictionary iteration order.
  """
  PSL_FLAG_EXCEPTION = (1 << 0)
  PSL_FLAG_WILDCARD = (1 << 1)
  PSL_FLAG_ICANN = (1 << 2)    # entry of ICANN section
  PSL_FLAG_PRIVATE = (1 << 3)  # entry of PRIVATE section
  PSL_FLAG_PLAIN = (1 << 4)    # just used for PSL syntax checking

  psl = {}
  section = 0

  for line in infile:
    line = line.strip()
    if not line:
      continue

    if line.startswith("//"):
      # Section markers live in comments; track which section we are in.
      if section == 0:
        if "===BEGIN ICANN DOMAINS===" in line:
          section = PSL_FLAG_ICANN
        elif "===BEGIN PRIVATE DOMAINS===" in line:
          section = PSL_FLAG_PRIVATE
      elif section == PSL_FLAG_ICANN and "===END ICANN DOMAINS===" in line:
        section = 0
      elif section == PSL_FLAG_PRIVATE and "===END PRIVATE DOMAINS===" in line:
        section = 0
      continue  # skip comments

    if line[0] == '!':
      flags = PSL_FLAG_EXCEPTION | section
      line = line[1:]
    elif line[0] == '*':
      # Slice instead of index so that a bare '*' is reported, not a crash.
      if line[1:2] != '.':
        print('Unsupported kind of rule (ignored): %s' % line)
        continue
      flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
      line = line[2:]
    else:
      if '.' not in line:
        # No explicit plain TLD rule is needed, the implicit '*' rule
        # already covers it.
        continue
      flags = PSL_FLAG_PLAIN | section

    # Convert to punycode and keep the native string type of the running
    # interpreter, so the result can be concatenated with the flag digit.
    if isinstance(line, bytes):
      line = line.decode('utf-8')
    line = line.encode('idna')
    if not isinstance(line, str):
      line = line.decode('ascii')

    if line in psl:
      # Found existing entry. The combination of exception and plain rule
      # is ambiguous:
      #     !foo.bar
      #     foo.bar
      # Allowed:
      #     !foo.bar + *.foo.bar
      #     foo.bar + *.foo.bar
      print('Found %s/%X (now %X)' % (line, psl[line], flags))
      continue

    psl[line] = flags

  return [domain + "%X" % (flags & 0x0F) for (domain, flags) in psl.items()]
def usage():
  """Prints the command line help to stdout and exits with status 1."""
  print('usage: %s [options] infile outfile' % sys.argv[0])
  print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
  print(' --input-format=psl infile is a Public Suffix List file')
  print(' --output-format=cxx Write DAFSA as C/C++ code')
  print(' --output-format=binary Write DAFSA binary data')
  # sys.exit() instead of the site-provided exit() helper, which is meant
  # for interactive use only.
  sys.exit(1)
def main():
  """Converts a PSL (or psl2c output) file into a C or binary DAFSA file.

  The last two command line arguments are the input and output file names;
  an input name of '-' reads from stdin. Everything before them is parsed
  as an option.

  Returns:
    0 on success, 1 on an unknown input or output format.
  """
  if len(sys.argv) < 3:
    usage()

  converter = words_to_cxx
  parser = parse_psl2c

  for arg in sys.argv[1:-2]:
    if arg.startswith('--input-format='):
      value = arg[15:].lower()
      if value == 'psl':
        parser = parse_psl
      elif value == 'psl2c':
        parser = parse_psl2c
      else:
        print("Unknown input format '%s'" % value)
        return 1
    elif arg.startswith('--output-format='):
      value = arg[16:].lower()
      if value == 'binary':
        converter = words_to_binary
      elif value == 'cxx':
        converter = words_to_cxx
      else:
        print("Unknown output format '%s'" % value)
        return 1
    else:
      usage()

  # The binary converter returns a bytearray, which must not go through a
  # text-mode file (newline translation, and a TypeError on Python 3).
  out_mode = 'wb' if converter is words_to_binary else 'w'

  if sys.argv[-2] == '-':
    with open(sys.argv[-1], out_mode) as outfile:
      outfile.write(converter(parser(sys.stdin)))
  else:
    with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], out_mode) as outfile:
      outfile.write(converter(parser(infile)))

  return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == '__main__':
sys.exit(main())

953
src/psl.c

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*
* Copyright(c) 2014-2015 Tim Ruehsen
* Copyright(c) 2014-2016 Tim Ruehsen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -45,8 +45,6 @@
# define _GENERATE_BUILTIN_DATA
#endif
#ifdef _GENERATE_BUILTIN_DATA
#include <libpsl.h>
/* here we include the library source code to have access to internal functions and data structures */
@ -54,6 +52,8 @@
# include "psl.c"
#undef _LIBPSL_INCLUDED_BY_PSL2C
#ifdef _GENERATE_BUILTIN_DATA
#if 0
static int _check_psl(const psl_ctx_t *psl)
{
@ -128,8 +128,9 @@ static int _check_psl(const psl_ctx_t *psl)
}
#endif
static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *varname)
static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
{
FILE *fp;
int it;
#ifdef BUILTIN_GENERATOR_LIBICU
@ -142,143 +143,170 @@ static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libicu/%s) */\n", version);
} while (0);
#elif defined(BUILTIN_GENERATOR_LIBIDN2)
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn2/%s) */\n", idn2_check_version(NULL));
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn2/%s) */\n", idn2_check_version(NULL));
#elif defined(BUILTIN_GENERATOR_LIBIDN)
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn/%s) */\n", stringprep_check_version(NULL));
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn/%s) */\n", stringprep_check_version(NULL));
#else
fprintf(fpout, "/* automatically generated by psl2c (without punycode support) */\n");
fprintf(fpout, "/* automatically generated by psl2c (punycode generated internally) */\n");
#endif
fprintf(fpout, "static _psl_entry_t %s[] = {\n", varname);
if ((fp = fopen("in.tmp", "w"))) {
for (it = 0; it < v->cur; it++) {
_psl_entry_t *e = _vector_get(v, it);
unsigned char *s = (unsigned char *)e->label_buf;
for (it = 0; it < v->cur; it++) {
_psl_entry_t *e = _vector_get(v, it);
/* search for non-ASCII label and skip it */
while (*s && *s < 128) s++;
if (*s) continue;
fprintf(fpout, "\t{ \"%s\", NULL, %hd, %d, %d },\n",
e->label_buf, e->length, (int) e->nlabels, (int) e->flags);
}
fprintf(fpout, "};\n");
}
#if 0
#if !defined(WITH_LIBICU) && !defined(WITH_IDN2)
static int _str_needs_encoding(const char *s)
{
while (*s && *((unsigned char *)s) < 128) s++;
return !!*s;
}
static void _add_punycode_if_needed(_psl_vector_t *v)
{
int it, n;
/* do not use 'it < v->cur' since v->cur is changed by _vector_add() ! */
for (it = 0, n = v->cur; it < n; it++) {
_psl_entry_t *e = _vector_get(v, it);
if (_str_needs_encoding(e->label_buf)) {
_psl_entry_t suffix, *suffixp;
char lookupname[64] = "";
/* this is much slower than the libidn2 API but should have no license issues */
FILE *pp;
char cmd[16 + sizeof(e->label_buf)];
snprintf(cmd, sizeof(cmd), "idn2 '%s'", e->label_buf);
if ((pp = popen(cmd, "r"))) {
if (fscanf(pp, "%63s", lookupname) >= 1 && strcmp(e->label_buf, lookupname)) {
/* fprintf(stderr, "idn2 '%s' -> '%s'\n", e->label_buf, lookupname); */
_suffix_init(&suffix, lookupname, strlen(lookupname));
suffix.wildcard = e->wildcard;
suffixp = _vector_get(v, _vector_add(v, &suffix));
suffixp->label = suffixp->label_buf; /* set label to changed address */
}
pclose(pp);
} else
fprintf(stderr, "Failed to call popen(%s, \"r\")\n", cmd);
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
}
fclose(fp);
}
_vector_sort(v);
}
#endif /* !defined(WITH_LIBICU) && !defined(WITH_IDN2) */
#endif
if ((it = system(MAKE_DAFSA " in.tmp out.tmp")))
fprintf(stderr, "Failed to execute " MAKE_DAFSA "\n");
if ((fp = fopen("out.tmp", "r"))) {
char buf[256];
while (fgets(buf, sizeof(buf), fp))
fputs(buf, fpout);
fclose(fp);
}
unlink("in.tmp");
unlink("out.tmp");
}
#endif /* _GENERATE_BUILTIN_DATA */
/*
 * Writes the ASCII-only suffixes of 'v' as "<label>, <flags>" lines into the
 * temporary file 'in.tmp' (current directory), then runs MAKE_DAFSA on it to
 * generate the binary DAFSA file 'fname'. 'in.tmp' is removed afterwards.
 *
 * Returns 0 on success, 2 if the generator command could not be built or
 * returned non-zero, 3 if 'in.tmp' could not be written.
 *
 * NOTE(review): 'fname' is passed to the shell unquoted, as before; names
 * containing whitespace or shell metacharacters are not supported.
 */
static int _print_psl_entries_dafsa_binary(const char *fname, const _psl_vector_t *v)
{
	FILE *fp;
	int ret = 0, it, rc, len;
	char cmd[256];

	if (!(fp = fopen("in.tmp", "w"))) {
		fprintf(stderr, "Failed to write open 'in.tmp'\n");
		return 3;
	}

	for (it = 0; it < v->cur; it++) {
		_psl_entry_t *e = _vector_get(v, it);
		unsigned char *s = (unsigned char *)e->label_buf;

		/* search for non-ASCII label and skip it */
		while (*s && *s < 128) s++;
		if (*s) continue;

		fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
	}

	/* fclose() flushes, so a failure here means the input file is incomplete */
	if (fclose(fp) != 0) {
		fprintf(stderr, "Failed to write 'in.tmp'\n");
		unlink("in.tmp");
		return 3;
	}

	/* the generator script only knows --output-format=..., not --binary */
	len = snprintf(cmd, sizeof(cmd), MAKE_DAFSA " --output-format=binary in.tmp %s", fname);
	if (len < 0 || (size_t) len >= sizeof(cmd)) {
		/* never execute a truncated command line */
		fprintf(stderr, "Command line too long for output file '%s'\n", fname);
		ret = 2;
	} else if ((rc = system(cmd))) {
		fprintf(stderr, "Failed to execute '%s' (%d)\n", cmd, rc);
		ret = 2;
	}

	unlink("in.tmp");
	return ret;
}
/* Prints the psl2c command line synopsis to stderr and exits with status 1. */
static void usage(void)
{
	fprintf(stderr, "Usage: psl2c [--binary] <infile> <outfile>\n");
	fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
	fprintf(stderr, " <outfile> is the filename to be generated from <infile>\n");
	fprintf(stderr, " --binary Generate binary DAFSA output (default: C code for psl.c)\n");
	exit(1);
}
int main(int argc, const char **argv)
{
FILE *fpout;
#ifdef _GENERATE_BUILTIN_DATA
psl_ctx_t *psl;
#endif
int ret = 0;
int ret = 0, argpos = 1, binary = 0;
if (argc != 3) {
fprintf(stderr, "Usage: psl2c <infile> <outfile>\n");
fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
fprintf(stderr, " <outfile> is the the C filename to be generated from <infile>\n");
return 1;
if (argc < 3)
usage();
if (strcmp(argv[argpos], "--binary") == 0) {
argpos++;
binary = 1;
}
if (argc - argpos != 2)
usage();
if (binary) {
if (!(psl = psl_load_file(argv[argpos])))
return 2;
ret = _print_psl_entries_dafsa_binary(argv[argpos + 1], psl->suffixes);
psl_free(psl);
return ret;
}
#ifdef _GENERATE_BUILTIN_DATA
if (!(psl = psl_load_file(argv[1])))
if (!(psl = psl_load_file(argv[argpos])))
return 2;
/* look for ambigious or double entries */
/* look for ambiguous or double entries */
/* if (_check_psl(psl)) {
psl_free(psl);
return 5;
}
*/
if ((fpout = fopen(argv[2], "w"))) {
if ((fpout = fopen(argv[argpos + 1], "w"))) {
FILE *pp;
struct stat st;
size_t cmdsize = 16 + strlen(argv[1]);
size_t cmdsize = 16 + strlen(argv[argpos]);
char *cmd = alloca(cmdsize), checksum[64] = "";
const char *source_date_epoch = NULL;
char *abs_srcfile;
#if 0
/* include library code did not generate punycode, so let's do it for the builtin data */
_add_punycode_if_needed(psl->suffixes);
#endif
_print_psl_entries_dafsa(fpout, psl->suffixes);
_print_psl_entries(fpout, psl->suffixes, "suffixes");
snprintf(cmd, cmdsize, "sha1sum %s", argv[1]);
snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]);
if ((pp = popen(cmd, "r"))) {
if (fscanf(pp, "%63[0-9a-zA-Z]", checksum) < 1)
*checksum = 0;
pclose(pp);
}
if (stat(argv[1], &st) != 0)
if (stat(argv[argpos], &st) != 0)
st.st_mtime = 0;
fprintf(fpout, "static time_t _psl_file_time = %lu;\n", st.st_mtime);
if ((source_date_epoch = getenv("SOURCE_DATE_EPOCH")))
fprintf(fpout, "static time_t _psl_compile_time = %lu;\n", atol(source_date_epoch));
else
fprintf(fpout, "static time_t _psl_compile_time = %lu;\n", time(NULL));
fprintf(fpout, "static int _psl_nsuffixes = %d;\n", psl->nsuffixes);
fprintf(fpout, "static int _psl_nexceptions = %d;\n", psl->nexceptions);
fprintf(fpout, "static int _psl_nwildcards = %d;\n", psl->nwildcards);
fprintf(fpout, "static const char _psl_sha1_checksum[] = \"%s\";\n", checksum);
fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", argv[1]);
/* We need an absolute path here, else psl_builtin_outdated() won't work reliably */
/* Caveat: symbolic links are resolved by realpath() */
if ((abs_srcfile = realpath(argv[argpos], NULL))) {
fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", abs_srcfile);
free(abs_srcfile);
} else
fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", argv[argpos]);
if (fclose(fpout) != 0)
ret = 4;
} else {
fprintf(stderr, "Failed to write open '%s'\n", argv[2]);
fprintf(stderr, "Failed to write open '%s'\n", argv[argpos + 1]);
ret = 3;
}
psl_free(psl);
#else
if ((fpout = fopen(argv[2], "w"))) {
fprintf(fpout, "static _psl_entry_t suffixes[1];\n");
if ((fpout = fopen(argv[argpos + 1], "w"))) {
fprintf(fpout, "static const unsigned char kDafsa[1];\n");
fprintf(fpout, "static time_t _psl_file_time;\n");
fprintf(fpout, "static time_t _psl_compile_time;\n");
fprintf(fpout, "static int _psl_nsuffixes = 0;\n");
fprintf(fpout, "static int _psl_nexceptions = 0;\n");
fprintf(fpout, "static int _psl_nwildcards = 0;\n");
@ -288,7 +316,7 @@ int main(int argc, const char **argv)
if (fclose(fpout) != 0)
ret = 4;
} else {
fprintf(stderr, "Failed to write open '%s'\n", argv[2]);
fprintf(stderr, "Failed to write open '%s'\n", argv[argpos + 1]);
ret = 3;
}
#endif /* GENERATE_BUILTIN_DATA */

View File

@ -1,4 +1,4 @@
DEFS = @DEFS@ -DDATADIR=\"$(top_srcdir)/data\" -DSRCDIR=\"$(srcdir)\" -DPSL_FILE=\"$(PSL_FILE)\" -DPSL_TESTFILE=\"$(PSL_TESTFILE)\"
DEFS = @DEFS@ -DSRCDIR=\"$(srcdir)\" -DPSL_FILE=\"$(PSL_FILE)\" -DPSL_TESTFILE=\"$(PSL_TESTFILE)\"
AM_CPPFLAGS = -I$(top_srcdir)/include
LDADD = ../src/libpsl.la

View File

@ -1,5 +1,5 @@
/*
* Copyright(c) 2014-2015 Tim Ruehsen
* Copyright(c) 2014-2016 Tim Ruehsen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -45,68 +45,140 @@
static int
ok,
failed;
#ifdef HAVE_CLOCK_GETTIME
struct timespec ts1, ts2;
#endif
/* Returns 1 if 'c' is a blank, tab, carriage return or line feed, else 0. */
static inline int _isspace_ascii(const char c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
/*
 * Checks one rule of the PSL file against the PSL context 'psl' and counts
 * the outcome of every single check in the file-level 'ok' / 'failed'
 * counters; failed checks are reported on stdout.
 *
 * 'domain' is the rule as written in the PSL file, including a leading '!'
 * (exception) or '*' (wildcard).
 * 'type' is the section the rule was found in (PSL_TYPE_ICANN,
 * PSL_TYPE_PRIVATE or 0) and is only used for plain rules.
 */
static void test_psl_entry(const psl_ctx_t *psl, const char *domain, int type)
{
	int result;

	if (*domain == '!') { /* an exception to a wildcard, e.g. !www.ck (wildcard is *.ck) */
		if ((result = psl_is_public_suffix(psl, domain + 1))) {
			failed++;
			printf("psl_is_public_suffix(%s)=%d (expected 0)\n", domain, result);
		} else ok++;

		/* the parent of the exception has to be a public suffix */
		if ((domain = strchr(domain, '.'))) {
			if (!(result = psl_is_public_suffix(psl, domain + 1))) {
				failed++;
				printf("psl_is_public_suffix(%s)=%d (expected 1)\n", domain + 1, result);
			} else ok++;
		}
	} else if (*domain == '*') { /* a wildcard, e.g. *.ck or *.platform.sh */
		char *xdomain;
		size_t len;

		if (!(result = psl_is_public_suffix(psl, domain + 1))) {
			failed++;
			printf("psl_is_public_suffix(%s)=%d (expected 1)\n", domain + 1, result);
		} else ok++;

		/* replace the '*' by a concrete label in a private copy, e.g. x.ck */
		len = strlen(domain);
		xdomain = alloca(len + 1);
		memcpy(xdomain, domain, len + 1);
		*xdomain = 'x';

		/* test the copy: the unchanged rule still starts with '*' */
		if (!(result = psl_is_public_suffix(psl, xdomain))) {
			failed++;
			printf("psl_is_public_suffix(%s)=%d (expected 1)\n", xdomain, result);
		} else ok++;
	} else {
		if (!(result = psl_is_public_suffix(psl, domain))) {
			failed++;
			printf("psl_is_public_suffix(%s)=%d (expected 1)\n", domain, result);
		} else ok++;

		if (!(strchr(domain, '.'))) {
			/* TLDs are always expected to be Public Suffixes */
			if (!(result = psl_is_public_suffix2(psl, domain, PSL_TYPE_PRIVATE))) {
				failed++;
				printf("psl_is_public_suffix2(%s, PSL_TYPE_PRIVATE)=%d (expected 1)\n", domain, result);
			} else ok++;

			if (!(result = psl_is_public_suffix2(psl, domain, PSL_TYPE_ICANN))) {
				failed++;
				printf("psl_is_public_suffix2(%s, PSL_TYPE_ICANN)=%d (expected 1)\n", domain, result);
			} else ok++;
		} else if (type == PSL_TYPE_PRIVATE) {
			/* a rule of the PRIVATE section must not match as ICANN */
			if (!(result = psl_is_public_suffix2(psl, domain, PSL_TYPE_PRIVATE))) {
				failed++;
				printf("psl_is_public_suffix2(%s, PSL_TYPE_PRIVATE)=%d (expected 1)\n", domain, result);
			} else ok++;

			if ((result = psl_is_public_suffix2(psl, domain, PSL_TYPE_ICANN))) {
				failed++;
				printf("psl_is_public_suffix2(%s, PSL_TYPE_ICANN)=%d (expected 0)\n", domain, result);
			} else ok++;
		} else if (type == PSL_TYPE_ICANN) {
			/* a rule of the ICANN section must not match as PRIVATE */
			if (!(result = psl_is_public_suffix2(psl, domain, PSL_TYPE_ICANN))) {
				failed++;
				printf("psl_is_public_suffix2(%s, PSL_TYPE_ICANN)=%d (expected 1)\n", domain, result);
			} else ok++;

			if ((result = psl_is_public_suffix2(psl, domain, PSL_TYPE_PRIVATE))) {
				failed++;
				printf("psl_is_public_suffix2(%s, PSL_TYPE_PRIVATE)=%d (expected 0)\n", domain, result);
			} else ok++;
		}
	}
}
static void test_psl(void)
{
FILE *fp;
psl_ctx_t *psl;
int result;
const psl_ctx_t *psl2;
int type = 0;
char buf[256], *linep, *p;
psl = psl_load_file(PSL_FILE); /* PSL_FILE can be set by ./configure --with-psl-file=[PATH] */
printf("loaded %d suffixes and %d exceptions\n", psl_suffix_count(psl), psl_suffix_exception_count(psl));
psl2 = psl_builtin();
printf("builtin PSL has %d suffixes and %d exceptions\n", psl_suffix_count(psl2), psl_suffix_exception_count(psl2));
if ((fp = fopen(PSL_FILE, "r"))) {
#ifdef HAVE_CLOCK_GETTIME
clock_gettime(CLOCK_REALTIME, &ts1);
#endif
while ((linep = fgets(buf, sizeof(buf), fp))) {
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
if (!*linep) continue; /* skip empty lines */
if (*linep == '/' && linep[1] == '/')
if (*linep == '/' && linep[1] == '/') {
if (!type) {
if (strstr(linep + 2, "===BEGIN ICANN DOMAINS==="))
type = PSL_TYPE_ICANN;
else if (!type && strstr(linep + 2, "===BEGIN PRIVATE DOMAINS==="))
type = PSL_TYPE_PRIVATE;
}
else if (type == PSL_TYPE_ICANN && strstr(linep + 2, "===END ICANN DOMAINS==="))
type = 0;
else if (type == PSL_TYPE_PRIVATE && strstr(linep + 2, "===END PRIVATE DOMAINS==="))
type = 0;
continue; /* skip comments */
}
/* parse suffix rule */
for (p = linep; *linep && !_isspace_ascii(*linep);) linep++;
*linep = 0;
if (*p == '!') { /* an exception to a wildcard, e.g. !www.ck (wildcard is *.ck) */
if ((result = psl_is_public_suffix(psl, p + 1))) {
failed++;
printf("psl_is_public_suffix(%s)=%d (expected 0)\n", p, result);
} else ok++;
test_psl_entry(psl, p, type);
if ((p = strchr(p, '.'))) {
if (!(result = psl_is_public_suffix(psl, p + 1))) {
failed++;
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", p + 1, result);
} else ok++;
}
}
else if (*p == '*') { /* a wildcard, e.g. *.ck */
if (!(result = psl_is_public_suffix(psl, p + 1))) {
failed++;
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", p + 1, result);
} else ok++;
*p = 'x';
if (!(result = psl_is_public_suffix(psl, p))) {
failed++;
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", p, result);
} else ok++;
}
else {
if (!(result = psl_is_public_suffix(psl, p))) {
failed++;
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", p, result);
} else ok++;
}
if (psl2)
test_psl_entry(psl2, p, type);
}
#ifdef HAVE_CLOCK_GETTIME
clock_gettime(CLOCK_REALTIME, &ts2);
#endif
fclose(fp);
} else {
printf("Failed to open %s\n", PSL_FILE);
@ -114,10 +186,15 @@ static void test_psl(void)
}
psl_free(psl);
psl_free((psl_ctx_t *)psl2);
}
int main(int argc, const char * const *argv)
{
#ifdef HAVE_CLOCK_GETTIME
long ns;
#endif
/* if VALGRIND testing is enabled, we have to call ourselves with valgrind checking */
if (argc == 1) {
const char *valgrind = getenv("TESTS_VALGRIND");
@ -138,6 +215,21 @@ int main(int argc, const char * const *argv)
return 1;
}
printf("Summary: All %d tests passed\n", ok + failed);
#ifdef HAVE_CLOCK_GETTIME
if (ts1.tv_sec == ts2.tv_sec)
ns = ts2.tv_nsec - ts1.tv_nsec;
else if (ts1.tv_sec == ts2.tv_sec - 1)
ns = 1000000000L - (ts2.tv_nsec - ts1.tv_nsec);
else
ns = 0; /* let's assume something is wrong and skip outputting measured time */
if (ns)
printf("Summary: All %d tests passed in %ld.%06ld ms\n", ok, ns / 1000000, ns % 1000000000);
else
printf("Summary: All %d tests passed\n", ok);
#else
printf("Summary: All %d tests passed\n", ok);
#endif
return 0;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright(c) 2014-2015 Tim Ruehsen
* Copyright(c) 2014-2016 Tim Ruehsen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -80,6 +80,10 @@ static void test_psl(void)
{ ".forgot.his.name", 1 },
{ "whoever.his.name", 0 },
{ "whoever.forgot.his.name", 0 },
{ "whatever.platform.sh", 1 },
{ ".platform.sh", 1 },
{ "whatever.yokohama.jp", 1 },
{ ".yokohama.jp", 1 },
{ ".", 1 }, /* special case */
{ "", 1 }, /* special case */
{ NULL, 1 }, /* special case */
@ -104,9 +108,6 @@ static void test_psl(void)
}
}
printf("psl_builtin_compile_time()=%ld\n", psl_builtin_compile_time());
psl_builtin_compile_time() == 0 ? failed++ : ok++;
printf("psl_builtin_file_time()=%ld\n", psl_builtin_file_time());
psl_builtin_file_time() == 0 ? failed++ : ok++;

View File

@ -1,5 +1,5 @@
/*
* Copyright(c) 2014-2015 Tim Ruehsen
* Copyright(c) 2014-2016 Tim Ruehsen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -117,7 +117,7 @@ int main(int argc, const char *const *argv)
usage(0, stdout);
}
else if (!strcmp(*arg, "--version")) {
printf("psl %s\n", PACKAGE_VERSION);
printf("psl %s (0x%06x)\n", PACKAGE_VERSION, psl_check_version_number(0));
printf("libpsl %s\n", psl_get_version());
printf("\n");
printf("Copyright (C) 2014-2015 Tim Ruehsen\n");
@ -211,9 +211,9 @@ int main(int argc, const char *const *argv)
printf("builtin exceptions: %d\n", psl_suffix_exception_count(psl));
printf("builtin wildcards: %d\n", psl_suffix_wildcard_count(psl));
printf("builtin filename: %s\n", psl_builtin_filename());
printf("builtin compile time: %ld (%s)\n", psl_builtin_compile_time(), time2str(psl_builtin_compile_time()));
printf("builtin file time: %ld (%s)\n", psl_builtin_file_time(), time2str(psl_builtin_file_time()));
printf("builtin SHA1 file hash: %s\n", psl_builtin_sha1sum());
printf("builtin outdated: %d\n", psl_builtin_outdated());
} else
printf("No builtin PSL data available\n");
}