Release V0.13.0
-----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iQIcBAABCgAGBQJW1s5mAAoJEAgwLbaiZwQol4YP/ReL/tHzUAOR2+fblzkmbZcG QkwpTynhgZrUI8sI0DASYR8ne0CBFymco2K3LHwDnjhgYFs5OtpHBwgLYa+ZGxCW HlVx79ZgHLyYX9LiwVgk/gCvJIuIPO2IS4qUjhVfBfKA6OJ2EHnGLl+62W8tGROm 0mxChLMZHQW7MmTx7ukQZwhqVxqcXnF1dQTFm3fymVtMq5wxbPq0i+y72xK/0nCZ 19xMPa2dH1yLRyS6OphwK9hyIHykCIBZf7iFvUz45nl8ONqi1DL3jOTcdZN0gZgk FFeSsDlHQ1EAxS2qv+Caa8xLUVcxRHz2JKZhYZIBoUdIzatJVtQmZL1LqioIu38Y cxcB6WMZjZBVZwhbHC8Q0jv/bKHbRMFTFDNK1c4ThNZAY69B3Nd7Im8UXGis0qms g3LuqjbM8J5fk+k9LFDVZgru3+lSwsBVP6n44esTck0aO6mqlCKpkOsR0djIW91P 1ueP0QquMWWySLulQVFOywrFSmLtQHiSMbCIZOJ9eFTe9s0klQe06zsFe16Xu0U7 VsR5wRVxtE7+OHg+iOeajgkOmp5dWoOpOK3lhiC/79CeQRudR881/sIR1+hEXkKa sFmpj3BEqrvac8059p4Xi0Hgc4qFDzDmXNhGWVZel0VRQWaIw/ssxqtPEyOfvLXn iWLMH9eQ9A1u2zMjM6oQ =xI69 -----END PGP SIGNATURE----- Merge tag 'libpsl-0.13.0' into debian Release V0.13.0
This commit is contained in:
commit
ecc0d75423
|
@ -1,8 +1,9 @@
|
|||
*.gz
|
||||
*.o
|
||||
*.lo
|
||||
*.la
|
||||
*.exe
|
||||
*.gz
|
||||
*.la
|
||||
*.lo
|
||||
*.log
|
||||
*.o
|
||||
*~
|
||||
*/.deps
|
||||
*/.libs
|
||||
|
|
57
.travis.yml
57
.travis.yml
|
@ -1,29 +1,46 @@
|
|||
sudo: false
|
||||
|
||||
language: c
|
||||
|
||||
compiler:
|
||||
- gcc
|
||||
- clang
|
||||
# Change this to your needs
|
||||
|
||||
branches:
|
||||
only:
|
||||
- master
|
||||
- develop
|
||||
|
||||
env:
|
||||
- RUNTIME=libicu
|
||||
- RUNTIME=libidn2
|
||||
- RUNTIME=libidn
|
||||
- RUNTIME=no
|
||||
|
||||
addons:
|
||||
apt:
|
||||
packages:
|
||||
- automake
|
||||
- autoconf
|
||||
- autopoint
|
||||
- libtool
|
||||
- gtk-doc-tools
|
||||
- gettext
|
||||
- libidn11
|
||||
- libidn11-dev
|
||||
- libidn2-0
|
||||
- libidn2-0-dev
|
||||
- libicu48
|
||||
- libicu-dev
|
||||
- libunistring0
|
||||
- libunistring-dev
|
||||
|
||||
script:
|
||||
- ./autogen.sh
|
||||
- ./configure && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=libicu --enable-builtin=libicu && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=libicu --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=libicu --enable-builtin=libidn && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=libicu --disable-builtin && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=libidn2 --enable-builtin=libicu && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=libidn2 --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=libidn2 --enable-builtin=libidn && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=libidn2 --disable-builtin && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=libidn --enable-builtin=libicu && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=libidn --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=libidn --enable-builtin=libidn && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=libidn --disable-builtin && make clean && make -j4 && make check -j4
|
||||
- ./configure --disable-runtime --enable-builtin=libicu && make clean && make -j4 && make check -j4
|
||||
- ./configure --disable-runtime --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
|
||||
- ./configure --disable-runtime --enable-builtin=libidn && make clean && make -j4 && make check -j4
|
||||
- ./configure --disable-runtime --disable-builtin && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=$RUNTIME --enable-builtin=libicu && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=$RUNTIME --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=$RUNTIME --enable-builtin=libidn && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-runtime=$RUNTIME --disable-builtin && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-gtk-doc && make -j4 && make check -j4
|
||||
- make distcheck
|
||||
before_install:
|
||||
- sudo apt-get -qq update
|
||||
- sudo apt-get -q install autoconf automake autopoint libtool gtk-doc-tools gettext libidn11 libidn11-dev libidn2-0 libidn2-0-dev libicu48 libicu-dev libunistring0 libunistring-dev
|
||||
|
|
1
AUTHORS
1
AUTHORS
|
@ -15,3 +15,4 @@ Dagobert Michelsen (Fixed Solaris building)
|
|||
Christopher Meng (Fedora building)
|
||||
Jakub Čajka
|
||||
Giuseppe Scrivano
|
||||
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
|
||||
|
|
|
@ -14,4 +14,8 @@ ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS}
|
|||
pkgconfigdir = $(libdir)/pkgconfig
|
||||
pkgconfig_DATA = libpsl.pc
|
||||
|
||||
EXTRA_DIST = config.rpath LICENSE $(PSL_FILE) list/tests/test_psl.txt
|
||||
EXTRA_DIST = config.rpath LICENSE
|
||||
dist-hook:
|
||||
mkdir -p $(distdir)/list/tests
|
||||
cp -p $(PSL_FILE) $(distdir)/list
|
||||
cp -p $(PSL_TESTFILE) $(distdir)/list/tests
|
||||
|
|
23
NEWS
23
NEWS
|
@ -1,10 +1,29 @@
|
|||
Copyright (C) 2014-2015 Tim Rühsen
|
||||
Copyright (C) 2014-2016 Tim Rühsen
|
||||
|
||||
02.03.2016 Release V0.13.0
|
||||
* Use tests.txt as PSL test file by default
|
||||
* Slightly shorter DAFSA array when sorting input
|
||||
* Check for python 2.7+ in configure.ac
|
||||
* Fix python3 incompatibilities in make_dafsa.py
|
||||
|
||||
02.01.2016 Release V0.12.0
|
||||
* Load DAFSA binaries via psl_load_file() via auto-detection
|
||||
* Add more tests
|
||||
* Remove psl_builtin_compile_time()
|
||||
* Compile PSL into DAFSA using make_dafsa.py
|
||||
* Avoid libicu dependency with --enable-runtime=no
|
||||
* Test on new Travis-CI build farm
|
||||
* Use DAFSA format for builtin PSL data
|
||||
* Add function psl_is_public_suffix2()
|
||||
* Fix psl_builtin_outdated()
|
||||
* Fix several bugs
|
||||
* Cleanup code
|
||||
|
||||
23.09.2015 Release V0.11.0
|
||||
* Add new function psl_check_version_number()
|
||||
* Add version defines to include file
|
||||
|
||||
19.09.2025 Release V0.10.0
|
||||
19.09.2015 Release V0.10.0
|
||||
* Code simplified
|
||||
* Less data entries, faster lookups
|
||||
* Add new function psl_suffix_wildcard_count()
|
||||
|
|
|
@ -14,7 +14,7 @@ Browsers and other web clients can use it to
|
|||
|
||||
Libpsl...
|
||||
|
||||
- has built-in PSL data for fast access
|
||||
- has built-in PSL data for fast access (the DAWG/DAFSA format reduces its size from 180kB to ~32kB)
|
||||
- allows loading PSL data from files
|
||||
- checks if a given domain is a "public suffix"
|
||||
- provides immediate cookie domain verification
|
||||
|
@ -28,6 +28,8 @@ Find more information about the Publix Suffix List [here](http://publicsuffix.or
|
|||
|
||||
Download the Public Suffix List [here](https://hg.mozilla.org/mozilla-central/raw-file/tip/netwerk/dns/effective_tld_names.dat).
|
||||
|
||||
The DAFSA code has been taken from [Chromium Project](https://code.google.com/p/chromium/).
|
||||
|
||||
|
||||
API Documentation
|
||||
-----------------
|
||||
|
@ -74,10 +76,14 @@ License
|
|||
Libpsl is made available under the terms of the MIT license.<br>
|
||||
See the LICENSE file that accompanies this distribution for the full text of the license.
|
||||
|
||||
src/make_dafsa.py and src/lookup_string_in_fixed_set.c are licensed under the terms written in
|
||||
src/LICENSE.chromium.
|
||||
|
||||
Building from git
|
||||
-----------------
|
||||
|
||||
You should have python2.7+ installed.
|
||||
|
||||
Download project and prepare sources with
|
||||
|
||||
git clone http://github.com/rockdaboot/libpsl
|
||||
|
|
16
autogen.sh
16
autogen.sh
|
@ -1,21 +1,21 @@
|
|||
# !/bin/sh -e
|
||||
#!/bin/sh
|
||||
|
||||
AUTORECONF=`which autoreconf 2>/dev/null`
|
||||
AUTORECONF=$(which autoreconf 2>/dev/null)
|
||||
if test $? -ne 0; then
|
||||
echo "No 'autoreconf' found. You must install the autoconf package."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
GIT=`which git 2>/dev/null`
|
||||
GIT=$(which git 2>/dev/null)
|
||||
if test $? -ne 0; then
|
||||
echo "No 'git' found. You must install the git package."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# create m4 before gtkdocize
|
||||
mkdir m4 2>/dev/null
|
||||
# create m4 before gtkdocize
|
||||
mkdir -p m4 2>/dev/null
|
||||
|
||||
GTKDOCIZE=`which gtkdocize 2>/dev/null`
|
||||
GTKDOCIZE=$(which gtkdocize 2>/dev/null)
|
||||
if test $? -ne 0; then
|
||||
echo "No gtk-doc support found. You can't build the docs."
|
||||
# rm because gtk-doc.make might be a link to a protected file
|
||||
|
@ -24,12 +24,12 @@ if test $? -ne 0; then
|
|||
echo "CLEANFILES =" >>gtk-doc.make
|
||||
GTKDOCIZE=""
|
||||
else
|
||||
$GTKDOCIZE || exit $?
|
||||
$GTKDOCIZE
|
||||
fi
|
||||
|
||||
$GIT submodule init
|
||||
$GIT submodule update
|
||||
$AUTORECONF --install --force --symlink || exit $?
|
||||
$AUTORECONF --install --force --symlink
|
||||
|
||||
echo
|
||||
echo "----------------------------------------------------------------"
|
||||
|
|
26
configure.ac
26
configure.ac
|
@ -1,5 +1,5 @@
|
|||
|
||||
AC_INIT([libpsl], [0.11.0], [tim.ruehsen@gmx.de], [libpsl], [http://github.com/rockdaboot/libpsl])
|
||||
AC_INIT([libpsl], [0.13.0], [tim.ruehsen@gmx.de], [libpsl], [http://github.com/rockdaboot/libpsl])
|
||||
AC_PREREQ([2.59])
|
||||
AM_INIT_AUTOMAKE([1.10 -Wall no-define foreign])
|
||||
|
||||
|
@ -20,9 +20,9 @@ AC_C_INLINE
|
|||
#
|
||||
# Generate version defines for include file
|
||||
#
|
||||
AC_SUBST([LIBPSL_VERSION_MAJOR], [`echo -n $VERSION|cut -d'.' -f1`])
|
||||
AC_SUBST([LIBPSL_VERSION_MINOR], [`echo -n $VERSION|cut -d'.' -f2`])
|
||||
AC_SUBST([LIBPSL_VERSION_PATCH], [`echo -n $VERSION|cut -d'.' -f3`])
|
||||
AC_SUBST([LIBPSL_VERSION_MAJOR], [`echo $VERSION|cut -d'.' -f1`])
|
||||
AC_SUBST([LIBPSL_VERSION_MINOR], [`echo $VERSION|cut -d'.' -f2`])
|
||||
AC_SUBST([LIBPSL_VERSION_PATCH], [`echo $VERSION|cut -d'.' -f3`])
|
||||
AC_SUBST([LIBPSL_VERSION_NUMBER], [`printf '0x%02x%02x%02x' $LIBPSL_VERSION_MAJOR $LIBPSL_VERSION_MINOR $LIBPSL_VERSION_PATCH`])
|
||||
AC_CONFIG_FILES([include/libpsl.h])
|
||||
|
||||
|
@ -72,6 +72,9 @@ AS_IF([ test "$enable_man" != no ], [
|
|||
AC_MSG_RESULT([no])
|
||||
])
|
||||
|
||||
# src/make_dafsa.py needs python 2.7+
|
||||
AM_PATH_PYTHON([2.7])
|
||||
|
||||
PKG_PROG_PKG_CONFIG
|
||||
|
||||
# Define these substitutions here to keep all version information in one place.
|
||||
|
@ -85,7 +88,7 @@ PKG_PROG_PKG_CONFIG
|
|||
# 4. If any interfaces have been added, removed, or changed since the last update, increment current, and set revision to 0.
|
||||
# 5. If any interfaces have been added since the last public release, then increment age.
|
||||
# 6. If any existing interfaces have been removed or changed since the last public release, then set age to 0.
|
||||
AC_SUBST([LIBPSL_SO_VERSION], [4:0:4])
|
||||
AC_SUBST([LIBPSL_SO_VERSION], [5:0:0])
|
||||
AC_SUBST([LIBPSL_VERSION], $VERSION)
|
||||
|
||||
# Check for enable/disable builtin PSL data
|
||||
|
@ -154,8 +157,10 @@ if test "$enable_runtime" = "libicu" -o "$enable_builtin" = "libicu"; then
|
|||
# using AC_SEARCH_LIBS also don't work since functions have the library version appended
|
||||
PKG_CHECK_MODULES([LIBICU], [icu-uc], [
|
||||
HAVE_LIBICU=yes
|
||||
LIBS="$LIBICU_LIBS $LIBS"
|
||||
CFLAGS="$LIBICU_CFLAGS $CFLAGS"
|
||||
if test "$enable_runtime" = "libicu"; then
|
||||
LIBS="$LIBICU_LIBS $LIBS"
|
||||
CFLAGS="$LIBICU_CFLAGS $CFLAGS"
|
||||
fi
|
||||
], [
|
||||
OLDLIBS=$LIBS
|
||||
LIBS="-licuuc $LIBS"
|
||||
|
@ -216,6 +221,9 @@ elif test -n "$NEEDS_NSL" ; then
|
|||
LIBS="$LIBS -lnsl"
|
||||
fi
|
||||
|
||||
# Check for clock_gettime() used for performance measurement
|
||||
AC_SEARCH_LIBS(clock_gettime, rt)
|
||||
|
||||
# Check for valgrind
|
||||
ac_enable_valgrind=no
|
||||
AC_ARG_ENABLE(valgrind-tests,
|
||||
|
@ -247,12 +255,12 @@ AC_SUBST(PSL_FILE)
|
|||
AC_ARG_WITH(psl-testfile,
|
||||
AC_HELP_STRING([--with-psl-testfile=[PATH]], [path to PSL test file]),
|
||||
PSL_TESTFILE=$withval,
|
||||
PSL_TESTFILE="\$(top_srcdir)/list/tests/test_psl.txt")
|
||||
PSL_TESTFILE="\$(top_srcdir)/list/tests/tests.txt")
|
||||
AC_SUBST(PSL_TESTFILE)
|
||||
|
||||
# check for alloca / alloca.h
|
||||
AC_FUNC_ALLOCA
|
||||
AC_CHECK_FUNCS([strndup])
|
||||
AC_CHECK_FUNCS([strndup clock_gettime])
|
||||
|
||||
# Override the template file name of the generated .pc file, so that there
|
||||
# is no need to rename the template file when the API version changes.
|
||||
|
|
|
@ -14,7 +14,7 @@ make distclean > /dev/null || true
|
|||
|
||||
# We define _GNU_SOURCE to avoid warnings with missing prototypes.
|
||||
# C89 does not know snprintf, strdup, strndup, popen, pclose
|
||||
CFLAGS="-std=c89 -pedantic -O2 -g -Wall -Wextra -Wstrict-prototypes -Wold-style-definition -Wwrite-strings -Wshadow -Wformat -Wformat-security -Wunreachable-code -Wstrict-prototypes -Wmissing-prototypes -Wold-style-definition -D_GNU_SOURCE"
|
||||
CFLAGS="-std=gnu89 -pedantic -O2 -g -Wall -Wextra -Wstrict-prototypes -Wold-style-definition -Wwrite-strings -Wshadow -Wformat -Wformat-security -Wunreachable-code -Wstrict-prototypes -Wmissing-prototypes -Wold-style-definition"
|
||||
|
||||
CACHEFILE=$PWD/config_check.cache
|
||||
|
||||
|
@ -40,7 +40,8 @@ for CC in gcc clang; do
|
|||
for options in \
|
||||
"--enable-runtime=libicu --enable-builtin=libicu" \
|
||||
"--enable-runtime=libidn2 --enable-builtin=libidn2" \
|
||||
"--enable-runtime=libidn --enable-builtin=libidn"; do
|
||||
"--enable-runtime=libidn --enable-builtin=libidn" \
|
||||
"--disable-runtime --enable-builtin=libicu"; do
|
||||
export DISTCHECK_CONFIGURE_FLAGS="-C --cache-file=$CACHEFILE $options"
|
||||
echo
|
||||
echo " *** ./configure $DISTCHECK_CONFIGURE_FLAGS"
|
||||
|
|
|
@ -6,6 +6,9 @@ PSL_VERSION_MAJOR
|
|||
PSL_VERSION_MINOR
|
||||
PSL_VERSION_NUMBER
|
||||
PSL_VERSION_PATCH
|
||||
PSL_TYPE_ICANN
|
||||
PSL_TYPE_PRIVATE
|
||||
PSL_TYPE_ANY
|
||||
psl_error_t
|
||||
psl_ctx_t
|
||||
psl_load_file
|
||||
|
@ -13,12 +16,12 @@ psl_load_fp
|
|||
psl_builtin
|
||||
psl_free
|
||||
psl_is_public_suffix
|
||||
psl_is_public_suffix2
|
||||
psl_unregistrable_domain
|
||||
psl_registrable_domain
|
||||
psl_suffix_count
|
||||
psl_suffix_exception_count
|
||||
psl_suffix_wildcard_count
|
||||
psl_builtin_compile_time
|
||||
psl_builtin_file_time
|
||||
psl_builtin_sha1sum
|
||||
psl_builtin_filename
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright(c) 2014-2015 Tim Ruehsen
|
||||
* Copyright(c) 2014-2016 Tim Ruehsen
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -44,6 +44,11 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* types for psl_is_public_suffix2() */
|
||||
#define PSL_TYPE_ICANN (1<<0)
|
||||
#define PSL_TYPE_PRIVATE (1<<1)
|
||||
#define PSL_TYPE_ANY (PSL_TYPE_ICANN | PSL_TYPE_PRIVATE)
|
||||
|
||||
/**
|
||||
* psl_error_t:
|
||||
* @PSL_SUCCESS: Successful return.
|
||||
|
@ -71,57 +76,75 @@ typedef struct _psl_ctx_st psl_ctx_t;
|
|||
/* frees PSL context */
|
||||
void
|
||||
psl_free(psl_ctx_t *psl);
|
||||
|
||||
/* loads PSL data from file */
|
||||
psl_ctx_t *
|
||||
psl_load_file(const char *fname);
|
||||
|
||||
/* loads PSL data from FILE pointer */
|
||||
psl_ctx_t *
|
||||
psl_load_fp(FILE *fp);
|
||||
|
||||
/* retrieves builtin PSL data */
|
||||
const psl_ctx_t *
|
||||
psl_builtin(void);
|
||||
|
||||
/* checks whether domain is a public suffix or not */
|
||||
int
|
||||
psl_is_public_suffix(const psl_ctx_t *psl, const char *domain);
|
||||
|
||||
/* checks whether domain is a public suffix regarding the type or not */
|
||||
int
|
||||
psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type);
|
||||
|
||||
/* checks whether cookie_domain is acceptable for domain or not */
|
||||
int
|
||||
psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, const char *cookie_domain);
|
||||
|
||||
/* returns the longest not registrable domain within 'domain' or NULL if none found */
|
||||
const char *
|
||||
psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain);
|
||||
|
||||
/* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */
|
||||
const char *
|
||||
psl_registrable_domain(const psl_ctx_t *psl, const char *domain);
|
||||
|
||||
/* convert a string into lowercase UTF-8 */
|
||||
psl_error_t
|
||||
psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower);
|
||||
|
||||
/* does not include exceptions */
|
||||
int
|
||||
psl_suffix_count(const psl_ctx_t *psl);
|
||||
|
||||
/* just counts exceptions */
|
||||
int
|
||||
psl_suffix_exception_count(const psl_ctx_t *psl);
|
||||
|
||||
/* just counts wildcards */
|
||||
int
|
||||
psl_suffix_wildcard_count(const psl_ctx_t *psl);
|
||||
/* returns compilation time */
|
||||
time_t
|
||||
psl_builtin_compile_time(void);
|
||||
|
||||
/* returns mtime of PSL source file */
|
||||
time_t
|
||||
psl_builtin_file_time(void);
|
||||
|
||||
/* returns SHA1 checksum (hex-encoded, lowercase) of PSL source file */
|
||||
const char *
|
||||
psl_builtin_sha1sum(void);
|
||||
|
||||
/* returns file name of PSL source file */
|
||||
const char *
|
||||
psl_builtin_filename(void);
|
||||
|
||||
/* returns library version string */
|
||||
const char *
|
||||
psl_get_version(void);
|
||||
|
||||
/* checks library version number */
|
||||
int
|
||||
psl_check_version_number(int version);
|
||||
|
||||
/* returns whether the built-in data is outdated or not */
|
||||
int
|
||||
psl_builtin_outdated(void);
|
||||
|
|
2
list
2
list
|
@ -1 +1 @@
|
|||
Subproject commit 2930bb4a5256279e0f7ba44cf9d174fc93ecb732
|
||||
Subproject commit e2f2f4bfe2ae57651afb7268bb9a0b53da5eb8cf
|
|
@ -0,0 +1,30 @@
|
|||
* The following License is for the source code files
|
||||
make_dafsa.py and lookup_string_in_fixed_set.c.
|
||||
|
||||
// Copyright 2015 The Chromium Authors. All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above
|
||||
// copyright notice, this list of conditions and the following disclaimer
|
||||
// in the documentation and/or other materials provided with the
|
||||
// distribution.
|
||||
// * Neither the name of Google Inc. nor the names of its
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -1,12 +1,12 @@
|
|||
# suffixes.c must be created before psl.c is compiled
|
||||
BUILT_SOURCES = suffixes.c
|
||||
BUILT_SOURCES = suffixes_dafsa.c
|
||||
|
||||
# suffixes.c is a built source that must be cleaned
|
||||
CLEANFILES = suffixes.c
|
||||
CLEANFILES = suffixes_dafsa.c
|
||||
|
||||
lib_LTLIBRARIES = libpsl.la
|
||||
|
||||
libpsl_la_SOURCES = psl.c
|
||||
libpsl_la_SOURCES = psl.c lookup_string_in_fixed_set.c
|
||||
libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
|
||||
# include ABI version information
|
||||
libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
|
||||
|
@ -21,8 +21,8 @@ if WITH_LIBIDN
|
|||
endif
|
||||
|
||||
noinst_PROGRAMS = psl2c
|
||||
psl2c_SOURCES = psl2c.c
|
||||
psl2c_CPPFLAGS = -I$(top_srcdir)/include
|
||||
psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
|
||||
psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/make_dafsa.py\"
|
||||
if BUILTIN_GENERATOR_LIBICU
|
||||
psl2c_LDADD = -licuuc
|
||||
endif
|
||||
|
@ -33,7 +33,9 @@ if BUILTIN_GENERATOR_LIBIDN
|
|||
psl2c_LDADD = @LTLIBICONV@ -lidn -lunistring
|
||||
endif
|
||||
|
||||
# Build rule for suffix.c
|
||||
# Build rule for suffixes_dafsa.c
|
||||
# PSL_FILE can be set by ./configure --with-psl-file=[PATH]
|
||||
suffixes.c: $(PSL_FILE) psl2c$(EXEEXT)
|
||||
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes.c
|
||||
suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
|
||||
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
|
||||
|
||||
EXTRA_DIST = make_dafsa.py LICENSE.chromium
|
||||
|
|
|
@ -0,0 +1,204 @@
|
|||
/* Copyright 2015 The Chromium Authors. All rights reserved.
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE.chromium file.
|
||||
*
|
||||
* Converted to C89 2015 by Tim Rühsen
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
/* _GCC_VERSION_AT_LEAST(major, minor) evaluates to non-zero when the compiler
 * defines __GNUC__/__GNUC_MINOR__ and reports at least version major.minor;
 * it is the constant 0 for compilers that do not define those macros. */
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
|
||||
# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
|
||||
#else
|
||||
# define _GCC_VERSION_AT_LEAST(major, minor) 0
|
||||
#endif
|
||||
|
||||
/* _HIDDEN marks a symbol with hidden visibility when the compiler reports
 * GCC 4.0 or newer; otherwise it expands to nothing. */
#if _GCC_VERSION_AT_LEAST(4,0)
|
||||
# define _HIDDEN __attribute__ ((visibility ("hidden")))
|
||||
#else
|
||||
# define _HIDDEN
|
||||
#endif
|
||||
|
||||
/* Return 0 (false) from the enclosing function unless a < b.
 * Wrapped in do { } while (0) so that it expands to exactly one statement and
 * cannot capture a following 'else'; both arguments are parenthesized so that
 * expressions with low-precedence operators compare as written. */
#define CHECK_LT(a, b) do { if ((a) >= (b)) return 0; } while (0)
|
||||
|
||||
/*
 * Read the next entry of the offset list at *pos.
 *
 * Bits 0x60 of the lead byte select a one-, two- or three-byte encoding.
 * The decoded distance is added to *offset. If the lead byte has bit 0x80
 * set it is the last entry of the list and *pos is set to end; otherwise
 * *pos is advanced past the bytes just decoded.
 *
 * Returns 1 if an offset could be read, 0 otherwise: the list is exhausted
 * (*pos == end), fewer than three bytes remain, or the decoded distance
 * would move *offset beyond end. On a 0 return *pos and *offset are left
 * untouched.
 */
static int GetNextOffset(const unsigned char** pos,
	const unsigned char* end,
	const unsigned char** offset)
{
	const unsigned char* p = *pos;
	ptrdiff_t room;
	size_t distance;
	size_t bytes_consumed;

	if (p == end)
		return 0;

	/* When reading an offset the byte array must always contain at least
	 * three more bytes to consume. First the offset to read, then a node
	 * to skip over and finally a destination node. No object can be smaller
	 * than one byte.
	 * Compare lengths instead of computing p + 2, which would be an invalid
	 * pointer when fewer than two bytes remain. */
	if (end - p < 3)
		return 0;

	switch (p[0] & 0x60) {
	case 0x60: /* Read three byte offset */
		distance = ((size_t) (p[0] & 0x1F) << 16) | ((size_t) p[1] << 8) | p[2];
		bytes_consumed = 3;
		break;
	case 0x40: /* Read two byte offset */
		distance = ((size_t) (p[0] & 0x1F) << 8) | p[1];
		bytes_consumed = 2;
		break;
	default: /* Read one byte offset */
		distance = p[0] & 0x3F;
		bytes_consumed = 1;
		break;
	}

	/* Never let *offset leave [graph, end]: a distance that overshoots can
	 * only come from truncated or corrupt data and cannot lead to a match. */
	room = end - *offset;
	if (room < 0 || distance > (size_t) room)
		return 0;
	*offset += distance;

	if ((p[0] & 0x80) != 0)
		*pos = end; /* last entry of this offset list */
	else
		*pos = p + bytes_consumed;

	return 1;
}
|
||||
|
||||
/*
 * Tell whether the byte at offset terminates a label, i.e. has bit 0x80 set.
 * Yields 0 when offset is not below end.
 */
static int IsEOL(const unsigned char* offset, const unsigned char* end)
{
	int high_bit;

	CHECK_LT(offset, end);
	high_bit = *offset & 0x80;
	return high_bit != 0;
}
|
||||
|
||||
/*
 * Compare the byte at offset with the first character of key.
 * Used for label characters that are not the last one of their label.
 * Yields 1 on equality, 0 on mismatch or when offset is not below end.
 */
static int IsMatch(const unsigned char* offset,
	const unsigned char* end,
	const char* key)
{
	CHECK_LT(offset, end);
	if (*offset != *key)
		return 0;
	return 1;
}
|
||||
|
||||
/*
 * Check if byte at offset matches first character in key.
 * This version matches characters last in label, which are stored with
 * bit 0x80 added to the 7-bit character.
 *
 * Returns 1 on a match, 0 on a mismatch or when offset is not below end.
 * A key byte that already has bit 0x80 set never matches; the comparison is
 * done on unsigned values so the result does not depend on whether plain
 * 'char' is signed on the target platform.
 */
static int IsEndCharMatch(const unsigned char* offset,
	const unsigned char* end,
	const char* key)
{
	unsigned char c;

	if (offset >= end)
		return 0;

	c = (unsigned char) *key;
	if ((c & 0x80) != 0)
		return 0; /* not a 7-bit character, cannot be encoded as end_char */

	return *offset == (c | 0x80);
}
|
||||
|
||||
/*
 * Decode the return value stored at offset.
 * A return-value byte has its top three bits equal to 100 (mask 0xE0 gives
 * 0x80); its low four bits are written to *return_value.
 * Yields 1 if a return value was read, 0 otherwise (including when offset is
 * not below end); *return_value is only written on success.
 */
static int GetReturnValue(const unsigned char* offset,
	const unsigned char* end,
	int* return_value)
{
	CHECK_LT(offset, end);
	if ((*offset & 0xE0) != 0x80)
		return 0;

	*return_value = *offset & 0x0F;
	return 1;
}
|
||||
|
||||
/*
|
||||
* Looks up the string |key| with length |key_length| in a fixed set of
|
||||
* strings. The set of strings must be known at compile time. It is converted to
|
||||
* a graph structure named a DAFSA (Deterministic Acyclic Finite State
|
||||
* Automaton) by the script make_dafsa.py during compilation. This permits
|
||||
* efficient (in time and space) lookup. The graph generated by make_dafsa.py
|
||||
* takes the form of a constant byte array which should be supplied via the
|
||||
* |graph| and |length| parameters. The return value is kDafsaNotFound,
|
||||
* kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule,
|
||||
* kDafsaWildcardRule and kDafsaPrivateRule ORed together.
|
||||
*
|
||||
* Lookup a domain key in a byte array generated by make_dafsa.py.
|
||||
*/
|
||||
|
||||
/* prototype to skip warning with -Wmissing-prototypes */
|
||||
int _HIDDEN LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);
|
||||
|
||||
int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
||||
size_t length,
|
||||
const char* key,
|
||||
size_t key_length)
|
||||
{
|
||||
const unsigned char* pos = graph;
|
||||
const unsigned char* end = graph + length;
|
||||
const unsigned char* offset = pos;
|
||||
const char* key_end = key + key_length;
|
||||
|
||||
while (GetNextOffset(&pos, end, &offset)) {
|
||||
/*char <char>+ end_char offsets
|
||||
* char <char>+ return value
|
||||
* char end_char offsets
|
||||
* char return value
|
||||
* end_char offsets
|
||||
* return_value
|
||||
*/
|
||||
int did_consume = 0;
|
||||
|
||||
if (key != key_end && !IsEOL(offset, end)) {
|
||||
/* Leading <char> is not a match. Don't dive into this child */
|
||||
if (!IsMatch(offset, end, key))
|
||||
continue;
|
||||
did_consume = 1;
|
||||
++offset;
|
||||
++key;
|
||||
/* Possible matches at this point:
|
||||
* <char>+ end_char offsets
|
||||
* <char>+ return value
|
||||
* end_char offsets
|
||||
* return value
|
||||
*/
|
||||
|
||||
/* Remove all remaining <char> nodes possible */
|
||||
while (!IsEOL(offset, end) && key != key_end) {
|
||||
if (!IsMatch(offset, end, key))
|
||||
return -1;
|
||||
++key;
|
||||
++offset;
|
||||
}
|
||||
}
|
||||
/* Possible matches at this point:
|
||||
* end_char offsets
|
||||
* return_value
|
||||
* If one or more <char> elements were consumed, a failure
|
||||
* to match is terminal. Otherwise, try the next node.
|
||||
*/
|
||||
if (key == key_end) {
|
||||
int return_value;
|
||||
|
||||
if (GetReturnValue(offset, end, &return_value))
|
||||
return return_value;
|
||||
/* The DAFSA guarantees that if the first char is a match, all
|
||||
* remaining char elements MUST match if the key is truly present.
|
||||
*/
|
||||
if (did_consume)
|
||||
return -1;
|
||||
continue;
|
||||
}
|
||||
if (!IsEndCharMatch(offset, end, key)) {
|
||||
if (did_consume)
|
||||
return -1; /* Unexpected */
|
||||
continue;
|
||||
}
|
||||
++key;
|
||||
pos = ++offset; /* Dive into child */
|
||||
}
|
||||
|
||||
return -1; /* No match */
|
||||
}
|
|
@ -0,0 +1,587 @@
|
|||
#!/usr/bin/env python
|
||||
# Copyright 2014 The Chromium Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE.chromium file.
|
||||
|
||||
"""
|
||||
A Deterministic acyclic finite state automaton (DAFSA) is a compact
|
||||
representation of an unordered word list (dictionary).
|
||||
|
||||
http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
|
||||
|
||||
This python program converts a list of strings to a byte array in C++.
|
||||
This python program fetches strings and return values from a gperf file
|
||||
and generates a C++ file with a byte array representing graph that can be
|
||||
used as a memory efficient replacement for the perfect hash table.
|
||||
|
||||
The input strings are assumed to consist of printable 7-bit ASCII characters
|
||||
and the return values are assumed to be one digit integers.
|
||||
|
||||
In this program a DAFSA is a diamond shaped graph starting at a common
|
||||
source node and ending at a common sink node. All internal nodes contain
|
||||
a label and each word is represented by the labels in one path from
|
||||
the source node to the sink node.
|
||||
|
||||
The following python representation is used for nodes:
|
||||
|
||||
Source node: [ children ]
|
||||
Internal node: (label, [ children ])
|
||||
Sink node: None
|
||||
|
||||
The graph is first compressed by prefixes like a trie. In the next step
|
||||
suffixes are compressed so that the graph gets diamond shaped. Finally
|
||||
one to one linked nodes are replaced by nodes with the labels joined.
|
||||
|
||||
The order of the operations is crucial since lookups will be performed
|
||||
starting from the source with no backtracking. Thus a node must have at
|
||||
most one child with a label starting by the same character. The output
|
||||
is also arranged so that all jumps are to increasing addresses, thus forward
|
||||
in memory.
|
||||
|
||||
The generated output has suffix free decoding so that the sign of leading
|
||||
bits in a link (a reference to a child node) indicate if it has a size of one,
|
||||
two or three bytes and if it is the last outgoing link from the actual node.
|
||||
A node label is terminated by a byte with the leading bit set.
|
||||
|
||||
The generated byte array can be described by the following BNF:
|
||||
|
||||
<byte> ::= < 8-bit value in range [0x00-0xFF] >
|
||||
|
||||
<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
|
||||
<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
|
||||
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
|
||||
|
||||
<offset1> ::= < byte in range [0x00-0x3F] >
|
||||
<offset2> ::= < byte in range [0x40-0x5F] >
|
||||
<offset3> ::= < byte in range [0x60-0x7F] >
|
||||
|
||||
<end_offset1> ::= < byte in range [0x80-0xBF] >
|
||||
<end_offset2> ::= < byte in range [0xC0-0xDF] >
|
||||
<end_offset3> ::= < byte in range [0xE0-0xFF] >
|
||||
|
||||
<prefix> ::= <char>
|
||||
|
||||
<label> ::= <end_char>
|
||||
| <char> <label>
|
||||
|
||||
<end_label> ::= <return_value>
|
||||
| <char> <end_label>
|
||||
|
||||
<offset> ::= <offset1>
|
||||
| <offset2> <byte>
|
||||
| <offset3> <byte> <byte>
|
||||
|
||||
<end_offset> ::= <end_offset1>
|
||||
| <end_offset2> <byte>
|
||||
| <end_offset3> <byte> <byte>
|
||||
|
||||
<offsets> ::= <end_offset>
|
||||
| <offset> <offsets>
|
||||
|
||||
<source> ::= <offsets>
|
||||
|
||||
<node> ::= <label> <offsets>
|
||||
| <prefix> <node>
|
||||
| <end_label>
|
||||
|
||||
<dafsa> ::= <source>
|
||||
| <dafsa> <node>
|
||||
|
||||
Decoding:
|
||||
|
||||
<char> -> printable 7-bit ASCII character
|
||||
<end_char> & 0x7F -> printable 7-bit ASCII character
|
||||
<return value> & 0x0F -> integer
|
||||
<offset1> & 0x3F -> integer
|
||||
((<offset2> & 0x1F) << 8) + <byte> -> integer
|
||||
((<offset3> & 0x1F) << 16) + (<byte> << 8) + <byte> -> integer
|
||||
|
||||
end_offset1, end_offset2 and end_offset3 are decoded the same as offset1,
|
||||
offset2 and offset3 respectively.
|
||||
|
||||
The first offset in a list of offsets is the distance in bytes between the
|
||||
offset itself and the first child node. Subsequent offsets are the distance
|
||||
between previous child node and next child node. Thus each offset links a node
|
||||
to a child node. The distance is always counted between start addresses, i.e.
|
||||
first byte in decoded offset or first byte in child node.
|
||||
|
||||
Example 1:
|
||||
|
||||
%%
|
||||
aa, 1
|
||||
a, 2
|
||||
%%
|
||||
|
||||
The input is first parsed to a list of words:
|
||||
["aa1", "a2"]
|
||||
|
||||
A fully expanded graph is created from the words:
|
||||
source = [node1, node4]
|
||||
node1 = ("a", [node2])
|
||||
node2 = ("a", [node3])
|
||||
node3 = ("\x01", [sink])
|
||||
node4 = ("a", [node5])
|
||||
node5 = ("\x02", [sink])
|
||||
sink = None
|
||||
|
||||
Compression results in the following graph:
|
||||
source = [node1]
|
||||
node1 = ("a", [node2, node3])
|
||||
node2 = ("\x02", [sink])
|
||||
node3 = ("a\x01", [sink])
|
||||
sink = None
|
||||
|
||||
A C++ representation of the compressed graph is generated:
|
||||
|
||||
const unsigned char dafsa[7] = {
|
||||
0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
|
||||
};
|
||||
|
||||
The bytes in the generated array have the following meaning:
|
||||
|
||||
0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
|
||||
|
||||
1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
|
||||
2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
|
||||
|
||||
3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
|
||||
4: 0x82 <return_value> 0x82 & 0x0F -> return 2
|
||||
|
||||
5: 0x61 <char> label character 0x61 -> match "a"
|
||||
6: 0x81 <return_value> 0x81 & 0x0F -> return 1
|
||||
|
||||
Example 2:
|
||||
|
||||
%%
|
||||
aa, 1
|
||||
bbb, 2
|
||||
baa, 1
|
||||
%%
|
||||
|
||||
The input is first parsed to a list of words:
|
||||
["aa1", "bbb2", "baa1"]
|
||||
|
||||
Compression results in the following graph:
|
||||
source = [node1, node2]
|
||||
node1 = ("b", [node2, node3])
|
||||
node2 = ("aa\x01", [sink])
|
||||
node3 = ("bb\x02", [sink])
|
||||
sink = None
|
||||
|
||||
A C++ representation of the compressed graph is generated:
|
||||
|
||||
const unsigned char dafsa[11] = {
|
||||
0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
|
||||
};
|
||||
|
||||
The bytes in the generated array have the following meaning:
|
||||
|
||||
0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
|
||||
1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
|
||||
|
||||
2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
|
||||
3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
|
||||
4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
|
||||
|
||||
5: 0x61 <char> label character 0x61 -> match "a"
|
||||
6: 0x61 <char> label character 0x61 -> match "a"
|
||||
7: 0x81 <return_value> 0x81 & 0x0F -> return 1
|
||||
|
||||
8: 0x62 <char> label character 0x62 -> match "b"
|
||||
9: 0x62 <char> label character 0x62 -> match "b"
|
||||
10: 0x82 <return_value> 0x82 & 0x0F -> return 2
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
class InputError(Exception):
|
||||
"""Exception raised for errors in the input file."""
|
||||
|
||||
|
||||
def to_dafsa(words):
|
||||
"""Generates a DAFSA from a word list and returns the source node.
|
||||
|
||||
Each word is split into characters so that each character is represented by
|
||||
a unique node. It is assumed the word list is not empty.
|
||||
"""
|
||||
if not words:
|
||||
raise InputError('The domain list must not be empty')
|
||||
def to_nodes(word):
|
||||
"""Split words into characters"""
|
||||
if not 0x1F < ord(word[0]) < 0x80:
|
||||
raise InputError('Domain names must be printable 7-bit ASCII')
|
||||
if len(word) == 1:
|
||||
return chr(int(word[0], 16) & 0x0F), [None]
|
||||
return word[0], [to_nodes(word[1:])]
|
||||
return [to_nodes(word) for word in words]
|
||||
|
||||
|
||||
def to_words(node):
|
||||
"""Generates a word list from all paths starting from an internal node."""
|
||||
if not node:
|
||||
return ['']
|
||||
return [(node[0] + word) for child in node[1] for word in to_words(child)]
|
||||
|
||||
|
||||
def reverse(dafsa):
|
||||
"""Generates a new DAFSA that is reversed, so that the old sink node becomes
|
||||
the new source node.
|
||||
"""
|
||||
sink = []
|
||||
nodemap = {}
|
||||
|
||||
def dfs(node, parent):
|
||||
"""Creates reverse nodes.
|
||||
|
||||
A new reverse node will be created for each old node. The new node will
|
||||
get a reversed label and the parents of the old node as children.
|
||||
"""
|
||||
if not node:
|
||||
sink.append(parent)
|
||||
elif id(node) not in nodemap:
|
||||
nodemap[id(node)] = (node[0][::-1], [parent])
|
||||
for child in node[1]:
|
||||
dfs(child, nodemap[id(node)])
|
||||
else:
|
||||
nodemap[id(node)][1].append(parent)
|
||||
|
||||
for node in dafsa:
|
||||
dfs(node, None)
|
||||
return sink
|
||||
|
||||
|
||||
def join_labels(dafsa):
|
||||
"""Generates a new DAFSA where internal nodes are merged if there is a one to
|
||||
one connection.
|
||||
"""
|
||||
parentcount = {id(None): 2}
|
||||
nodemap = {id(None): None}
|
||||
|
||||
def count_parents(node):
|
||||
"""Count incoming references"""
|
||||
if id(node) in parentcount:
|
||||
parentcount[id(node)] += 1
|
||||
else:
|
||||
parentcount[id(node)] = 1
|
||||
for child in node[1]:
|
||||
count_parents(child)
|
||||
|
||||
def join(node):
|
||||
"""Create new nodes"""
|
||||
if id(node) not in nodemap:
|
||||
children = [join(child) for child in node[1]]
|
||||
if len(children) == 1 and parentcount[id(node[1][0])] == 1:
|
||||
child = children[0]
|
||||
nodemap[id(node)] = (node[0] + child[0], child[1])
|
||||
else:
|
||||
nodemap[id(node)] = (node[0], children)
|
||||
return nodemap[id(node)]
|
||||
|
||||
for node in dafsa:
|
||||
count_parents(node)
|
||||
return [join(node) for node in dafsa]
|
||||
|
||||
|
||||
def join_suffixes(dafsa):
|
||||
"""Generates a new DAFSA where nodes that represent the same word lists
|
||||
towards the sink are merged.
|
||||
"""
|
||||
nodemap = {frozenset(('',)): None}
|
||||
|
||||
def join(node):
|
||||
"""Returns a matching node. A new node is created if no matching node
|
||||
exists. The graph is accessed in dfs order.
|
||||
"""
|
||||
suffixes = frozenset(to_words(node))
|
||||
if suffixes not in nodemap:
|
||||
nodemap[suffixes] = (node[0], [join(child) for child in node[1]])
|
||||
return nodemap[suffixes]
|
||||
|
||||
return [join(node) for node in dafsa]
|
||||
|
||||
|
||||
def top_sort(dafsa):
|
||||
"""Generates list of nodes in topological sort order."""
|
||||
incoming = {}
|
||||
|
||||
def count_incoming(node):
|
||||
"""Counts incoming references."""
|
||||
if node:
|
||||
if id(node) not in incoming:
|
||||
incoming[id(node)] = 1
|
||||
for child in node[1]:
|
||||
count_incoming(child)
|
||||
else:
|
||||
incoming[id(node)] += 1
|
||||
|
||||
for node in dafsa:
|
||||
count_incoming(node)
|
||||
|
||||
for node in dafsa:
|
||||
incoming[id(node)] -= 1
|
||||
|
||||
waiting = [node for node in dafsa if incoming[id(node)] == 0]
|
||||
nodes = []
|
||||
|
||||
while waiting:
|
||||
node = waiting.pop()
|
||||
assert incoming[id(node)] == 0
|
||||
nodes.append(node)
|
||||
for child in node[1]:
|
||||
if child:
|
||||
incoming[id(child)] -= 1
|
||||
if incoming[id(child)] == 0:
|
||||
waiting.append(child)
|
||||
return nodes
|
||||
|
||||
|
||||
def encode_links(children, offsets, current):
|
||||
"""Encodes a list of children as one, two or three byte offsets."""
|
||||
if not children[0]:
|
||||
# This is an <end_label> node and no links follow such nodes
|
||||
assert len(children) == 1
|
||||
return []
|
||||
guess = 3 * len(children)
|
||||
assert children
|
||||
children = sorted(children, key=lambda x: -offsets[id(x)])
|
||||
while True:
|
||||
offset = current + guess
|
||||
buf = []
|
||||
for child in children:
|
||||
last = len(buf)
|
||||
distance = offset - offsets[id(child)]
|
||||
assert distance > 0 and distance < (1 << 21)
|
||||
|
||||
if distance < (1 << 6):
|
||||
# A 6-bit offset: "s0xxxxxx"
|
||||
buf.append(distance)
|
||||
elif distance < (1 << 13):
|
||||
# A 13-bit offset: "s10xxxxxxxxxxxxx"
|
||||
buf.append(0x40 | (distance >> 8))
|
||||
buf.append(distance & 0xFF)
|
||||
else:
|
||||
# A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
|
||||
buf.append(0x60 | (distance >> 16))
|
||||
buf.append((distance >> 8) & 0xFF)
|
||||
buf.append(distance & 0xFF)
|
||||
# Distance in first link is relative to following record.
|
||||
# Distance in other links are relative to previous link.
|
||||
offset -= distance
|
||||
if len(buf) == guess:
|
||||
break
|
||||
guess = len(buf)
|
||||
# Set most significant bit to mark end of links in this node.
|
||||
buf[last] |= (1 << 7)
|
||||
buf.reverse()
|
||||
return buf
|
||||
|
||||
|
||||
def encode_prefix(label):
|
||||
"""Encodes a node label as a list of bytes without a trailing high byte.
|
||||
|
||||
This method encodes a node if there is exactly one child and the
|
||||
child follows immediately after so that no jump is needed. This label
|
||||
will then be a prefix to the label in the child node.
|
||||
"""
|
||||
assert label
|
||||
return [ord(c) for c in reversed(label)]
|
||||
|
||||
|
||||
def encode_label(label):
|
||||
"""Encodes a node label as a list of bytes with a trailing high byte >0x80.
|
||||
"""
|
||||
buf = encode_prefix(label)
|
||||
# Set most significant bit to mark end of label in this node.
|
||||
buf[0] |= (1 << 7)
|
||||
return buf
|
||||
|
||||
|
||||
def encode(dafsa):
|
||||
"""Encodes a DAFSA to a list of bytes"""
|
||||
output = []
|
||||
offsets = {}
|
||||
|
||||
for node in reversed(top_sort(dafsa)):
|
||||
if (len(node[1]) == 1 and node[1][0] and
|
||||
(offsets[id(node[1][0])] == len(output))):
|
||||
output.extend(encode_prefix(node[0]))
|
||||
else:
|
||||
output.extend(encode_links(node[1], offsets, len(output)))
|
||||
output.extend(encode_label(node[0]))
|
||||
offsets[id(node)] = len(output)
|
||||
|
||||
output.extend(encode_links(dafsa, offsets, len(output)))
|
||||
output.reverse()
|
||||
return output
|
||||
|
||||
|
||||
def to_cxx(data):
|
||||
"""Generates C++ code from a list of encoded bytes."""
|
||||
text = '/* This file is generated. DO NOT EDIT!\n\n'
|
||||
text += 'The byte array encodes effective tld names. See make_dafsa.py for'
|
||||
text += ' documentation.'
|
||||
text += '*/\n\n'
|
||||
text += 'static const unsigned char kDafsa[%s] = {\n' % len(data)
|
||||
for i in range(0, len(data), 12):
|
||||
text += ' '
|
||||
text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
|
||||
text += ',\n'
|
||||
text += '};\n'
|
||||
return text
|
||||
|
||||
|
||||
def words_to_whatever(words, converter):
|
||||
"""Generates a DAFSA from a word list and returns it run through converter"""
|
||||
dafsa = to_dafsa(words)
|
||||
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
|
||||
dafsa = fun(dafsa)
|
||||
return converter(encode(dafsa))
|
||||
|
||||
|
||||
def words_to_cxx(words):
|
||||
"""Generates C++ code from a word list"""
|
||||
return words_to_whatever(words, to_cxx)
|
||||
|
||||
|
||||
def words_to_binary(words):
|
||||
"""Generates binary DAFSA data (a bytearray) from a word list"""
|
||||
return words_to_whatever(words, bytearray)
|
||||
|
||||
|
||||
def parse_psl2c(infile):
|
||||
"""Parses a file generated by psl2c and extracts the strings and return codes"""
|
||||
lines = [line.strip() for line in infile]
|
||||
|
||||
for line in lines:
|
||||
if line[-3:-1] != ', ':
|
||||
raise InputError('Expected "domainname, <digit>", found "%s"' % line)
|
||||
# Technically the DAFSA format could support return values in range [0-31],
|
||||
# but the values below are the only with a defined meaning.
|
||||
if line[-1] not in '0123456789ABCDEF':
|
||||
raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' % line[-1])
|
||||
|
||||
# with open("gperf.out", 'w') as outfile:
|
||||
# for line in sorted(lines):
|
||||
# outfile.write(line[:-3] + line[-1] + "\n")
|
||||
|
||||
return [line[:-3] + line[-1] for line in sorted(lines)]
|
||||
|
||||
|
||||
def parse_psl(infile):
|
||||
"""Parses a PSL file and extracts the strings and return codes"""
|
||||
PSL_FLAG_EXCEPTION = (1<<0)
|
||||
PSL_FLAG_WILDCARD = (1<<1)
|
||||
PSL_FLAG_ICANN = (1<<2) # entry of ICANN section
|
||||
PSL_FLAG_PRIVATE = (1<<3) # entry of PRIVATE section
|
||||
PSL_FLAG_PLAIN = (1<<4) #just used for PSL syntax checking
|
||||
|
||||
psl = {}
|
||||
section = 0
|
||||
|
||||
for line in infile:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
if line.startswith("//"):
|
||||
if section == 0:
|
||||
if "===BEGIN ICANN DOMAINS===" in line:
|
||||
section = PSL_FLAG_ICANN
|
||||
elif section == 0 and "===BEGIN PRIVATE DOMAINS===" in line:
|
||||
section = PSL_FLAG_PRIVATE
|
||||
elif section == PSL_FLAG_ICANN and "===END ICANN DOMAINS===" in line:
|
||||
section = 0
|
||||
elif section == PSL_FLAG_PRIVATE and "===END PRIVATE DOMAINS===" in line:
|
||||
section = 0
|
||||
continue # skip comments
|
||||
|
||||
if line[0] == '!':
|
||||
flags = PSL_FLAG_EXCEPTION | section
|
||||
line = line[1:]
|
||||
elif line[0] == '*':
|
||||
if line[1] != '.':
|
||||
print('Unsupported kind of rule (ignored): %s' % line)
|
||||
continue
|
||||
flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
|
||||
line = line[2:]
|
||||
else:
|
||||
if not '.' in line:
|
||||
continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
|
||||
flags = PSL_FLAG_PLAIN | section
|
||||
|
||||
line = line.decode('utf-8').encode("idna")
|
||||
|
||||
if line in psl:
|
||||
"""Found existing entry:
|
||||
Combination of exception and plain rule is ambiguous
|
||||
!foo.bar
|
||||
foo.bar
|
||||
|
||||
Allowed:
|
||||
!foo.bar + *.foo.bar
|
||||
foo.bar + *.foo.bar
|
||||
"""
|
||||
print('Found %s/%X (now %X)' % line, psl[line], flags)
|
||||
continue
|
||||
|
||||
psl[line] = flags
|
||||
|
||||
# with open("psl.out", 'w') as outfile:
|
||||
# for (domain, flags) in sorted(psl.iteritems()):
|
||||
# outfile.write(domain + "%X" % (flags & 0x0F) + "\n")
|
||||
|
||||
return [domain + "%X" % (flags & 0x0F) for (domain, flags) in sorted(psl.iteritems())]
|
||||
|
||||
|
||||
def usage():
|
||||
"""Prints the usage"""
|
||||
print('usage: %s [options] infile outfile' % sys.argv[0])
|
||||
print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
|
||||
print(' --input-format=psl infile is a Public Suffix List file')
|
||||
print(' --output-format=cxx Write DAFSA as C/C++ code')
|
||||
print(' --output-format=binary Write DAFSA binary data')
|
||||
exit(1)
|
||||
|
||||
|
||||
def main():
|
||||
"""Convert PSL file into C or binary DAFSA file"""
|
||||
if len(sys.argv) < 3:
|
||||
usage()
|
||||
|
||||
converter = words_to_cxx
|
||||
parser = parse_psl2c
|
||||
|
||||
for arg in sys.argv[1:-2]:
|
||||
if arg.startswith('--input-format='):
|
||||
value = arg[15:].lower()
|
||||
if value == 'psl':
|
||||
parser = parse_psl
|
||||
elif value == 'psl2c':
|
||||
parser = parse_psl2c
|
||||
else:
|
||||
print("Unknown input format '%s'" % value)
|
||||
return 1
|
||||
elif arg.startswith('--output-format='):
|
||||
value = arg[16:].lower()
|
||||
if value == 'binary':
|
||||
converter = words_to_binary
|
||||
elif value == 'cxx':
|
||||
converter = words_to_cxx
|
||||
else:
|
||||
print("Unknown output format '%s'" % value)
|
||||
return 1
|
||||
else:
|
||||
usage()
|
||||
|
||||
if sys.argv[-2] == '-':
|
||||
with open(sys.argv[-1], 'w') as outfile:
|
||||
outfile.write(converter(parser(sys.stdin)))
|
||||
else:
|
||||
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
|
||||
outfile.write(converter(parser(infile)))
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
209
src/psl2c.c
209
src/psl2c.c
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright(c) 2014-2015 Tim Ruehsen
|
||||
* Copyright(c) 2014-2016 Tim Ruehsen
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -45,8 +45,6 @@
|
|||
# define _GENERATE_BUILTIN_DATA
|
||||
#endif
|
||||
|
||||
#ifdef _GENERATE_BUILTIN_DATA
|
||||
|
||||
#include <libpsl.h>
|
||||
|
||||
/* here we include the library source code to have access to internal functions and data structures */
|
||||
|
@ -54,6 +52,8 @@
|
|||
# include "psl.c"
|
||||
#undef _LIBPSL_INCLUDED_BY_PSL2C
|
||||
|
||||
#ifdef _GENERATE_BUILTIN_DATA
|
||||
|
||||
#if 0
|
||||
static int _check_psl(const psl_ctx_t *psl)
|
||||
{
|
||||
|
@ -128,8 +128,9 @@ static int _check_psl(const psl_ctx_t *psl)
|
|||
}
|
||||
#endif
|
||||
|
||||
static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *varname)
|
||||
static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
|
||||
{
|
||||
FILE *fp;
|
||||
int it;
|
||||
|
||||
#ifdef BUILTIN_GENERATOR_LIBICU
|
||||
|
@ -142,143 +143,175 @@ static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *
|
|||
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libicu/%s) */\n", version);
|
||||
} while (0);
|
||||
#elif defined(BUILTIN_GENERATOR_LIBIDN2)
|
||||
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn2/%s) */\n", idn2_check_version(NULL));
|
||||
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn2/%s) */\n", idn2_check_version(NULL));
|
||||
#elif defined(BUILTIN_GENERATOR_LIBIDN)
|
||||
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn/%s) */\n", stringprep_check_version(NULL));
|
||||
fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn/%s) */\n", stringprep_check_version(NULL));
|
||||
#else
|
||||
fprintf(fpout, "/* automatically generated by psl2c (without punycode support) */\n");
|
||||
fprintf(fpout, "/* automatically generated by psl2c (punycode generated internally) */\n");
|
||||
#endif
|
||||
|
||||
fprintf(fpout, "static _psl_entry_t %s[] = {\n", varname);
|
||||
if ((fp = fopen("in.tmp", "w"))) {
|
||||
for (it = 0; it < v->cur; it++) {
|
||||
_psl_entry_t *e = _vector_get(v, it);
|
||||
unsigned char *s = (unsigned char *)e->label_buf;
|
||||
|
||||
for (it = 0; it < v->cur; it++) {
|
||||
_psl_entry_t *e = _vector_get(v, it);
|
||||
/* search for non-ASCII label and skip it */
|
||||
while (*s && *s < 128) s++;
|
||||
if (*s) continue;
|
||||
|
||||
fprintf(fpout, "\t{ \"%s\", NULL, %hd, %d, %d },\n",
|
||||
e->label_buf, e->length, (int) e->nlabels, (int) e->flags);
|
||||
}
|
||||
|
||||
fprintf(fpout, "};\n");
|
||||
}
|
||||
|
||||
#if 0
|
||||
#if !defined(WITH_LIBICU) && !defined(WITH_IDN2)
|
||||
static int _str_needs_encoding(const char *s)
|
||||
{
|
||||
while (*s && *((unsigned char *)s) < 128) s++;
|
||||
|
||||
return !!*s;
|
||||
}
|
||||
|
||||
static void _add_punycode_if_needed(_psl_vector_t *v)
|
||||
{
|
||||
int it, n;
|
||||
|
||||
/* do not use 'it < v->cur' since v->cur is changed by _vector_add() ! */
|
||||
for (it = 0, n = v->cur; it < n; it++) {
|
||||
_psl_entry_t *e = _vector_get(v, it);
|
||||
|
||||
if (_str_needs_encoding(e->label_buf)) {
|
||||
_psl_entry_t suffix, *suffixp;
|
||||
char lookupname[64] = "";
|
||||
|
||||
/* this is much slower than the libidn2 API but should have no license issues */
|
||||
FILE *pp;
|
||||
char cmd[16 + sizeof(e->label_buf)];
|
||||
snprintf(cmd, sizeof(cmd), "idn2 '%s'", e->label_buf);
|
||||
if ((pp = popen(cmd, "r"))) {
|
||||
if (fscanf(pp, "%63s", lookupname) >= 1 && strcmp(e->label_buf, lookupname)) {
|
||||
/* fprintf(stderr, "idn2 '%s' -> '%s'\n", e->label_buf, lookupname); */
|
||||
_suffix_init(&suffix, lookupname, strlen(lookupname));
|
||||
suffix.wildcard = e->wildcard;
|
||||
suffixp = _vector_get(v, _vector_add(v, &suffix));
|
||||
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
||||
}
|
||||
pclose(pp);
|
||||
} else
|
||||
fprintf(stderr, "Failed to call popen(%s, \"r\")\n", cmd);
|
||||
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
_vector_sort(v);
|
||||
}
|
||||
#endif /* !defined(WITH_LIBICU) && !defined(WITH_IDN2) */
|
||||
#endif
|
||||
if ((it = system(MAKE_DAFSA " in.tmp out.tmp")))
|
||||
fprintf(stderr, "Failed to execute " MAKE_DAFSA "\n");
|
||||
|
||||
if ((fp = fopen("out.tmp", "r"))) {
|
||||
char buf[256];
|
||||
|
||||
while (fgets(buf, sizeof(buf), fp))
|
||||
fputs(buf, fpout);
|
||||
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
unlink("in.tmp");
|
||||
unlink("out.tmp");
|
||||
}
|
||||
#endif /* _GENERATE_BUILTIN_DATA */
|
||||
|
||||
static int _print_psl_entries_dafsa_binary(const char *fname, const _psl_vector_t *v)
|
||||
{
|
||||
FILE *fp;
|
||||
int ret = 0, it, rc;
|
||||
char cmd[256];
|
||||
|
||||
if ((fp = fopen("in.tmp", "w"))) {
|
||||
for (it = 0; it < v->cur; it++) {
|
||||
_psl_entry_t *e = _vector_get(v, it);
|
||||
unsigned char *s = (unsigned char *)e->label_buf;
|
||||
|
||||
/* search for non-ASCII label and skip it */
|
||||
while (*s && *s < 128) s++;
|
||||
if (*s) continue;
|
||||
|
||||
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
} else {
|
||||
fprintf(stderr, "Failed to write open 'in.tmp'\n");
|
||||
return 3;
|
||||
}
|
||||
|
||||
snprintf(cmd, sizeof(cmd), MAKE_DAFSA " --binary in.tmp %s", fname);
|
||||
if ((rc = system(cmd))) {
|
||||
fprintf(stderr, "Failed to execute '%s' (%d)\n", cmd, rc);
|
||||
ret = 2;
|
||||
}
|
||||
|
||||
unlink("in.tmp");
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void usage(void)
|
||||
{
|
||||
fprintf(stderr, "Usage: psl2c [--binary] <infile> <outfile>\n");
|
||||
fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
|
||||
fprintf(stderr, " <outfile> is the the filename to be generated from <infile>\n");
|
||||
fprintf(stderr, " --binary Generate binary DAFSA output (default: C code for psl.c)\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int main(int argc, const char **argv)
|
||||
{
|
||||
FILE *fpout;
|
||||
#ifdef _GENERATE_BUILTIN_DATA
|
||||
psl_ctx_t *psl;
|
||||
#endif
|
||||
int ret = 0;
|
||||
int ret = 0, argpos = 1, binary = 0;
|
||||
|
||||
if (argc != 3) {
|
||||
fprintf(stderr, "Usage: psl2c <infile> <outfile>\n");
|
||||
fprintf(stderr, " <infile> is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
|
||||
fprintf(stderr, " <outfile> is the the C filename to be generated from <infile>\n");
|
||||
return 1;
|
||||
if (argc < 3)
|
||||
usage();
|
||||
|
||||
if (strcmp(argv[argpos], "--binary") == 0) {
|
||||
argpos++;
|
||||
binary = 1;
|
||||
}
|
||||
|
||||
if (argc - argpos != 2)
|
||||
usage();
|
||||
|
||||
if (binary) {
|
||||
if (!(psl = psl_load_file(argv[argpos])))
|
||||
return 2;
|
||||
|
||||
ret = _print_psl_entries_dafsa_binary(argv[argpos + 1], psl->suffixes);
|
||||
|
||||
psl_free(psl);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef _GENERATE_BUILTIN_DATA
|
||||
if (!(psl = psl_load_file(argv[1])))
|
||||
if (!(psl = psl_load_file(argv[argpos])))
|
||||
return 2;
|
||||
|
||||
/* look for ambigious or double entries */
|
||||
if (!psl->suffixes || !psl->nsuffixes) {
|
||||
fprintf(stderr, "Failed to load PSL. Please check content of '%s'.\n", argv[argpos]);
|
||||
return 5;
|
||||
}
|
||||
|
||||
/* look for ambiguous or double entries */
|
||||
/* if (_check_psl(psl)) {
|
||||
psl_free(psl);
|
||||
return 5;
|
||||
}
|
||||
*/
|
||||
if ((fpout = fopen(argv[2], "w"))) {
|
||||
if ((fpout = fopen(argv[argpos + 1], "w"))) {
|
||||
FILE *pp;
|
||||
struct stat st;
|
||||
size_t cmdsize = 16 + strlen(argv[1]);
|
||||
size_t cmdsize = 16 + strlen(argv[argpos]);
|
||||
char *cmd = alloca(cmdsize), checksum[64] = "";
|
||||
const char *source_date_epoch = NULL;
|
||||
char *abs_srcfile;
|
||||
|
||||
#if 0
|
||||
/* include library code did not generate punycode, so let's do it for the builtin data */
|
||||
_add_punycode_if_needed(psl->suffixes);
|
||||
#endif
|
||||
_print_psl_entries_dafsa(fpout, psl->suffixes);
|
||||
|
||||
_print_psl_entries(fpout, psl->suffixes, "suffixes");
|
||||
|
||||
snprintf(cmd, cmdsize, "sha1sum %s", argv[1]);
|
||||
snprintf(cmd, cmdsize, "sha1sum %s", argv[argpos]);
|
||||
if ((pp = popen(cmd, "r"))) {
|
||||
if (fscanf(pp, "%63[0-9a-zA-Z]", checksum) < 1)
|
||||
*checksum = 0;
|
||||
pclose(pp);
|
||||
}
|
||||
|
||||
if (stat(argv[1], &st) != 0)
|
||||
if (stat(argv[argpos], &st) != 0)
|
||||
st.st_mtime = 0;
|
||||
fprintf(fpout, "static time_t _psl_file_time = %lu;\n", st.st_mtime);
|
||||
if ((source_date_epoch = getenv("SOURCE_DATE_EPOCH")))
|
||||
fprintf(fpout, "static time_t _psl_compile_time = %lu;\n", atol(source_date_epoch));
|
||||
else
|
||||
fprintf(fpout, "static time_t _psl_compile_time = %lu;\n", time(NULL));
|
||||
|
||||
fprintf(fpout, "static int _psl_nsuffixes = %d;\n", psl->nsuffixes);
|
||||
fprintf(fpout, "static int _psl_nexceptions = %d;\n", psl->nexceptions);
|
||||
fprintf(fpout, "static int _psl_nwildcards = %d;\n", psl->nwildcards);
|
||||
fprintf(fpout, "static const char _psl_sha1_checksum[] = \"%s\";\n", checksum);
|
||||
fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", argv[1]);
|
||||
|
||||
/* We need an absolute path here, else psl_builtin_outdated() won't work reliably */
|
||||
/* Caveat: symbolic links are resolved by realpath() */
|
||||
if ((abs_srcfile = realpath(argv[argpos], NULL))) {
|
||||
fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", abs_srcfile);
|
||||
free(abs_srcfile);
|
||||
} else
|
||||
fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", argv[argpos]);
|
||||
|
||||
if (fclose(fpout) != 0)
|
||||
ret = 4;
|
||||
} else {
|
||||
fprintf(stderr, "Failed to write open '%s'\n", argv[2]);
|
||||
fprintf(stderr, "Failed to write open '%s'\n", argv[argpos + 1]);
|
||||
ret = 3;
|
||||
}
|
||||
|
||||
psl_free(psl);
|
||||
#else
|
||||
if ((fpout = fopen(argv[2], "w"))) {
|
||||
fprintf(fpout, "static _psl_entry_t suffixes[1];\n");
|
||||
if ((fpout = fopen(argv[argpos + 1], "w"))) {
|
||||
fprintf(fpout, "static const unsigned char kDafsa[1];\n");
|
||||
fprintf(fpout, "static time_t _psl_file_time;\n");
|
||||
fprintf(fpout, "static time_t _psl_compile_time;\n");
|
||||
fprintf(fpout, "static int _psl_nsuffixes = 0;\n");
|
||||
fprintf(fpout, "static int _psl_nexceptions = 0;\n");
|
||||
fprintf(fpout, "static int _psl_nwildcards = 0;\n");
|
||||
|
@ -288,7 +321,7 @@ int main(int argc, const char **argv)
|
|||
if (fclose(fpout) != 0)
|
||||
ret = 4;
|
||||
} else {
|
||||
fprintf(stderr, "Failed to write open '%s'\n", argv[2]);
|
||||
fprintf(stderr, "Failed to write open '%s'\n", argv[argpos + 1]);
|
||||
ret = 3;
|
||||
}
|
||||
#endif /* GENERATE_BUILTIN_DATA */
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
DEFS = @DEFS@ -DDATADIR=\"$(top_srcdir)/data\" -DSRCDIR=\"$(srcdir)\" -DPSL_FILE=\"$(PSL_FILE)\" -DPSL_TESTFILE=\"$(PSL_TESTFILE)\"
|
||||
DEFS = @DEFS@ -DSRCDIR=\"$(srcdir)\" -DPSL_FILE=\"$(PSL_FILE)\" -DPSL_TESTFILE=\"$(PSL_TESTFILE)\"
|
||||
AM_CPPFLAGS = -I$(top_srcdir)/include
|
||||
LDADD = ../src/libpsl.la
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright(c) 2014-2015 Tim Ruehsen
|
||||
* Copyright(c) 2014-2016 Tim Ruehsen
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -45,68 +45,140 @@
|
|||
static int
|
||||
ok,
|
||||
failed;
|
||||
#ifdef HAVE_CLOCK_GETTIME
|
||||
struct timespec ts1, ts2;
|
||||
#endif
|
||||
|
||||
static inline int _isspace_ascii(const char c)
|
||||
{
|
||||
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
|
||||
}
|
||||
|
||||
static void test_psl_entry(const psl_ctx_t *psl, const char *domain, int type)
|
||||
{
|
||||
int result;
|
||||
|
||||
if (*domain == '!') { /* an exception to a wildcard, e.g. !www.ck (wildcard is *.ck) */
|
||||
if ((result = psl_is_public_suffix(psl, domain + 1))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix(%s)=%d (expected 0)\n", domain, result);
|
||||
} else ok++;
|
||||
|
||||
if ((domain = strchr(domain, '.'))) {
|
||||
if (!(result = psl_is_public_suffix(psl, domain + 1))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", domain + 1, result);
|
||||
} else ok++;
|
||||
}
|
||||
} else if (*domain == '*') { /* a wildcard, e.g. *.ck or *.platform.sh */
|
||||
char *xdomain;
|
||||
size_t len;
|
||||
|
||||
if (!(result = psl_is_public_suffix(psl, domain + 1))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", domain + 1, result);
|
||||
} else ok++;
|
||||
|
||||
len = strlen(domain);
|
||||
xdomain = alloca(len + 1);
|
||||
memcpy(xdomain, domain, len + 1);
|
||||
*xdomain = 'x';
|
||||
if (!(result = psl_is_public_suffix(psl, domain))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", domain, result);
|
||||
} else ok++;
|
||||
} else {
|
||||
if (!(result = psl_is_public_suffix(psl, domain))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", domain, result);
|
||||
} else ok++;
|
||||
|
||||
if (!(strchr(domain, '.'))) {
|
||||
/* TLDs are always expected to be Public Suffixes */
|
||||
if (!(result = psl_is_public_suffix2(psl, domain, PSL_TYPE_PRIVATE))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix2(%s, PSL_TYPE_PRIVATE)=%d (expected 1)\n", domain, result);
|
||||
} else ok++;
|
||||
|
||||
if (!(result = psl_is_public_suffix2(psl, domain, PSL_TYPE_ICANN))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix2(%s, PSL_TYPE_ICANN)=%d (expected 0)\n", domain, result);
|
||||
} else ok++;
|
||||
} else if (type == PSL_TYPE_PRIVATE) {
|
||||
if (!(result = psl_is_public_suffix2(psl, domain, PSL_TYPE_PRIVATE))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix2(%s, PSL_TYPE_PRIVATE)=%d (expected 1)\n", domain, result);
|
||||
} else ok++;
|
||||
|
||||
if ((result = psl_is_public_suffix2(psl, domain, PSL_TYPE_ICANN))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix2(%s, PSL_TYPE_ICANN)=%d (expected 0)\n", domain, result);
|
||||
} else ok++;
|
||||
} else if (type == PSL_TYPE_ICANN) {
|
||||
if (!(result = psl_is_public_suffix2(psl, domain, PSL_TYPE_ICANN))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix2(%s, PSL_TYPE_ICANN)=%d (expected 1)\n", domain, result);
|
||||
} else ok++;
|
||||
|
||||
if ((result = psl_is_public_suffix2(psl, domain, PSL_TYPE_PRIVATE))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix2(%s, PSL_TYPE_PRIVATE)=%d (expected 0)\n", domain, result);
|
||||
} else ok++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void test_psl(void)
|
||||
{
|
||||
FILE *fp;
|
||||
psl_ctx_t *psl;
|
||||
int result;
|
||||
const psl_ctx_t *psl2;
|
||||
int type = 0;
|
||||
char buf[256], *linep, *p;
|
||||
|
||||
psl = psl_load_file(PSL_FILE); /* PSL_FILE can be set by ./configure --with-psl-file=[PATH] */
|
||||
|
||||
printf("loaded %d suffixes and %d exceptions\n", psl_suffix_count(psl), psl_suffix_exception_count(psl));
|
||||
|
||||
psl2 = psl_builtin();
|
||||
printf("builtin PSL has %d suffixes and %d exceptions\n", psl_suffix_count(psl2), psl_suffix_exception_count(psl2));
|
||||
|
||||
if ((fp = fopen(PSL_FILE, "r"))) {
|
||||
#ifdef HAVE_CLOCK_GETTIME
|
||||
clock_gettime(CLOCK_REALTIME, &ts1);
|
||||
#endif
|
||||
|
||||
while ((linep = fgets(buf, sizeof(buf), fp))) {
|
||||
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
|
||||
if (!*linep) continue; /* skip empty lines */
|
||||
|
||||
if (*linep == '/' && linep[1] == '/')
|
||||
if (*linep == '/' && linep[1] == '/') {
|
||||
if (!type) {
|
||||
if (strstr(linep + 2, "===BEGIN ICANN DOMAINS==="))
|
||||
type = PSL_TYPE_ICANN;
|
||||
else if (!type && strstr(linep + 2, "===BEGIN PRIVATE DOMAINS==="))
|
||||
type = PSL_TYPE_PRIVATE;
|
||||
}
|
||||
else if (type == PSL_TYPE_ICANN && strstr(linep + 2, "===END ICANN DOMAINS==="))
|
||||
type = 0;
|
||||
else if (type == PSL_TYPE_PRIVATE && strstr(linep + 2, "===END PRIVATE DOMAINS==="))
|
||||
type = 0;
|
||||
|
||||
continue; /* skip comments */
|
||||
}
|
||||
|
||||
/* parse suffix rule */
|
||||
for (p = linep; *linep && !_isspace_ascii(*linep);) linep++;
|
||||
*linep = 0;
|
||||
|
||||
if (*p == '!') { /* an exception to a wildcard, e.g. !www.ck (wildcard is *.ck) */
|
||||
if ((result = psl_is_public_suffix(psl, p + 1))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix(%s)=%d (expected 0)\n", p, result);
|
||||
} else ok++;
|
||||
test_psl_entry(psl, p, type);
|
||||
|
||||
if ((p = strchr(p, '.'))) {
|
||||
if (!(result = psl_is_public_suffix(psl, p + 1))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", p + 1, result);
|
||||
} else ok++;
|
||||
}
|
||||
}
|
||||
else if (*p == '*') { /* a wildcard, e.g. *.ck */
|
||||
if (!(result = psl_is_public_suffix(psl, p + 1))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", p + 1, result);
|
||||
} else ok++;
|
||||
|
||||
*p = 'x';
|
||||
if (!(result = psl_is_public_suffix(psl, p))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", p, result);
|
||||
} else ok++;
|
||||
}
|
||||
else {
|
||||
if (!(result = psl_is_public_suffix(psl, p))) {
|
||||
failed++;
|
||||
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", p, result);
|
||||
} else ok++;
|
||||
}
|
||||
if (psl2)
|
||||
test_psl_entry(psl2, p, type);
|
||||
}
|
||||
|
||||
#ifdef HAVE_CLOCK_GETTIME
|
||||
clock_gettime(CLOCK_REALTIME, &ts2);
|
||||
#endif
|
||||
fclose(fp);
|
||||
} else {
|
||||
printf("Failed to open %s\n", PSL_FILE);
|
||||
|
@ -114,10 +186,15 @@ static void test_psl(void)
|
|||
}
|
||||
|
||||
psl_free(psl);
|
||||
psl_free((psl_ctx_t *)psl2);
|
||||
}
|
||||
|
||||
int main(int argc, const char * const *argv)
|
||||
{
|
||||
#ifdef HAVE_CLOCK_GETTIME
|
||||
long ns;
|
||||
#endif
|
||||
|
||||
/* if VALGRIND testing is enabled, we have to call ourselves with valgrind checking */
|
||||
if (argc == 1) {
|
||||
const char *valgrind = getenv("TESTS_VALGRIND");
|
||||
|
@ -138,6 +215,21 @@ int main(int argc, const char * const *argv)
|
|||
return 1;
|
||||
}
|
||||
|
||||
printf("Summary: All %d tests passed\n", ok + failed);
|
||||
#ifdef HAVE_CLOCK_GETTIME
|
||||
if (ts1.tv_sec == ts2.tv_sec)
|
||||
ns = ts2.tv_nsec - ts1.tv_nsec;
|
||||
else if (ts1.tv_sec == ts2.tv_sec - 1)
|
||||
ns = 1000000000L - (ts2.tv_nsec - ts1.tv_nsec);
|
||||
else
|
||||
ns = 0; /* let's assume something is wrong and skip outputting measured time */
|
||||
|
||||
if (ns)
|
||||
printf("Summary: All %d tests passed in %ld.%06ld ms\n", ok, ns / 1000000, ns % 1000000000);
|
||||
else
|
||||
printf("Summary: All %d tests passed\n", ok);
|
||||
#else
|
||||
printf("Summary: All %d tests passed\n", ok);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright(c) 2014-2015 Tim Ruehsen
|
||||
* Copyright(c) 2014-2016 Tim Ruehsen
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -80,6 +80,10 @@ static void test_psl(void)
|
|||
{ ".forgot.his.name", 1 },
|
||||
{ "whoever.his.name", 0 },
|
||||
{ "whoever.forgot.his.name", 0 },
|
||||
{ "whatever.platform.sh", 1 },
|
||||
{ ".platform.sh", 1 },
|
||||
{ "whatever.yokohama.jp", 1 },
|
||||
{ ".yokohama.jp", 1 },
|
||||
{ ".", 1 }, /* special case */
|
||||
{ "", 1 }, /* special case */
|
||||
{ NULL, 1 }, /* special case */
|
||||
|
@ -104,9 +108,6 @@ static void test_psl(void)
|
|||
}
|
||||
}
|
||||
|
||||
printf("psl_builtin_compile_time()=%ld\n", psl_builtin_compile_time());
|
||||
psl_builtin_compile_time() == 0 ? failed++ : ok++;
|
||||
|
||||
printf("psl_builtin_file_time()=%ld\n", psl_builtin_file_time());
|
||||
psl_builtin_file_time() == 0 ? failed++ : ok++;
|
||||
|
||||
|
|
|
@ -130,6 +130,11 @@ static void test_psl(void)
|
|||
} else if (sscanf(p, "checkPublicSuffix ( null , null ) %1[;]", semicolon) == 1) {
|
||||
d_is_null = 1;
|
||||
er_is_null = 1;
|
||||
} else if (sscanf(p, "%127s %127s", domain, expected_regdom) == 2) {
|
||||
if (!strcmp(domain, "null"))
|
||||
d_is_null = 1;
|
||||
if (!strcmp(expected_regdom, "null"))
|
||||
er_is_null = 1;
|
||||
} else {
|
||||
failed++;
|
||||
printf("Malformed line from '" PSL_TESTFILE "': %s", buf);
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright(c) 2014-2015 Tim Ruehsen
|
||||
* Copyright(c) 2014-2016 Tim Ruehsen
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
|
@ -117,7 +117,7 @@ int main(int argc, const char *const *argv)
|
|||
usage(0, stdout);
|
||||
}
|
||||
else if (!strcmp(*arg, "--version")) {
|
||||
printf("psl %s\n", PACKAGE_VERSION);
|
||||
printf("psl %s (0x%06x)\n", PACKAGE_VERSION, psl_check_version_number(0));
|
||||
printf("libpsl %s\n", psl_get_version());
|
||||
printf("\n");
|
||||
printf("Copyright (C) 2014-2015 Tim Ruehsen\n");
|
||||
|
@ -211,9 +211,9 @@ int main(int argc, const char *const *argv)
|
|||
printf("builtin exceptions: %d\n", psl_suffix_exception_count(psl));
|
||||
printf("builtin wildcards: %d\n", psl_suffix_wildcard_count(psl));
|
||||
printf("builtin filename: %s\n", psl_builtin_filename());
|
||||
printf("builtin compile time: %ld (%s)\n", psl_builtin_compile_time(), time2str(psl_builtin_compile_time()));
|
||||
printf("builtin file time: %ld (%s)\n", psl_builtin_file_time(), time2str(psl_builtin_file_time()));
|
||||
printf("builtin SHA1 file hash: %s\n", psl_builtin_sha1sum());
|
||||
printf("builtin outdated: %d\n", psl_builtin_outdated());
|
||||
} else
|
||||
printf("No builtin PSL data available\n");
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue