diff --git a/.travis.yml b/.travis.yml index d72425b..3a0823c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,29 +1,41 @@ +sudo: false + language: c + compiler: - gcc - clang -# Change this to your needs + +env: + - RUNTIME=libicu + - RUNTIME=libidn2 + - RUNTIME=libidn + - RUNTIME=no + +addons: + apt: + packages: + - automake + - autoconf + - autopoint + - libtool + - gtk-doc-tools + - gettext + - libidn11 + - libidn11-dev + - libidn2-0 + - libidn2-0-dev + - libicu48 + - libicu-dev + - libunistring0 + - libunistring-dev + script: - ./autogen.sh - ./configure && make -j4 && make check -j4 - - ./configure --enable-runtime=libicu --enable-builtin=libicu && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libicu --enable-builtin=libidn2 && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libicu --enable-builtin=libidn && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libicu --disable-builtin && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn2 --enable-builtin=libicu && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn2 --enable-builtin=libidn2 && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn2 --enable-builtin=libidn && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn2 --disable-builtin && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn --enable-builtin=libicu && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn --enable-builtin=libidn2 && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn --enable-builtin=libidn && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn --disable-builtin && make clean && make -j4 && make check -j4 - - ./configure --disable-runtime --enable-builtin=libicu && make clean && make -j4 && make check -j4 - - ./configure 
--disable-runtime --enable-builtin=libidn2 && make clean && make -j4 && make check -j4 - - ./configure --disable-runtime --enable-builtin=libidn && make clean && make -j4 && make check -j4 - - ./configure --disable-runtime --disable-builtin && make clean && make -j4 && make check -j4 + - ./configure --enable-runtime=$RUNTIME --enable-builtin=libicu && make clean && make -j4 && make check -j4 + - ./configure --enable-runtime=$RUNTIME --enable-builtin=libidn2 && make clean && make -j4 && make check -j4 + - ./configure --enable-runtime=$RUNTIME --enable-builtin=libidn && make clean && make -j4 && make check -j4 + - ./configure --enable-runtime=$RUNTIME --disable-builtin && make clean && make -j4 && make check -j4 - ./configure --enable-gtk-doc && make -j4 && make check -j4 - make distcheck -before_install: - - sudo apt-get -qq update - - sudo apt-get -q install autoconf automake autopoint libtool gtk-doc-tools gettext libidn11 libidn11-dev libidn2-0 libidn2-0-dev libicu48 libicu-dev libunistring0 libunistring-dev diff --git a/AUTHORS b/AUTHORS index 12e83e6..33dad7b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -15,3 +15,4 @@ Dagobert Michelsen (Fixed Solaris building) Christopher Meng (Fedora building) Jakub Čajka Giuseppe Scrivano +Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support) diff --git a/Makefile.am b/Makefile.am index d488ce4..3904754 100644 --- a/Makefile.am +++ b/Makefile.am @@ -14,4 +14,8 @@ ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS} pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = libpsl.pc -EXTRA_DIST = config.rpath LICENSE $(PSL_FILE) list/tests/test_psl.txt +EXTRA_DIST = config.rpath LICENSE +dist-hook: + mkdir -p $(distdir)/list/tests + cp -p $(PSL_FILE) $(distdir)/list + cp -p $(PSL_TESTFILE) $(distdir)/list/tests diff --git a/NEWS b/NEWS index b17e17e..57c9b95 100644 --- a/NEWS +++ b/NEWS @@ -1,10 +1,23 @@ -Copyright (C) 2014-2015 Tim Rühsen +Copyright (C) 2014-2016 Tim Rühsen + +02.01.2016 Release V0.12.0 + * Load DAFSA binaries 
via psl_load_file() via auto-detection + * Add more tests + * Remove psl_builtin_compile_time() + * Compile PSL into DAFSA using make_dafsa.py + * Avoid libicu dependency with --enable-runtime=no + * Test on new Travis-CI build farm + * Use DAFSA format for builtin PSL data + * Add function psl_is_public_suffix2() + * Fix psl_builtin_outdated() + * Fix several bugs + * Cleanup code 23.09.2015 Release V0.11.0 * Add new function psl_check_version_number() * Add version defines to include file -19.09.2025 Release V0.10.0 +19.09.2015 Release V0.10.0 * Code simplified * Less data entries, faster lookups * Add new function psl_suffix_wildcard_count() diff --git a/README.md b/README.md index 7bc8fbc..75e1038 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Browsers and other web clients can use it to Libpsl... -- has built-in PSL data for fast access +- has built-in PSL data for fast access (DAWG/DAFSA reduces size from 180kB to ~32kB) - allows to load PSL data from files - checks if a given domain is a "public suffix" - provides immediate cookie domain verification @@ -28,6 +28,8 @@ Find more information about the Publix Suffix List [here](http://publicsuffix.or Download the Public Suffix List [here](https://hg.mozilla.org/mozilla-central/raw-file/tip/netwerk/dns/effective_tld_names.dat). +The DAFSA code has been taken from [Chromium Project](https://code.google.com/p/chromium/). + API Documentation ----------------- @@ -74,6 +76,8 @@ License Libpsl is made available under the terms of the MIT license.
See the LICENSE file that accompanies this distribution for the full text of the license. +src/make_dafsa.py and src/lookup_string_in_fixed_set.c are licensed under the terms written in +src/LICENSE.chromium. Building from git ----------------- diff --git a/autogen.sh b/autogen.sh index e714cf8..aee0a7c 100755 --- a/autogen.sh +++ b/autogen.sh @@ -1,21 +1,21 @@ -#!/bin/sh -e +#!/bin/sh -AUTORECONF=$(which autoreconf 2>/dev/null || true) +AUTORECONF=$(which autoreconf 2>/dev/null) if test $? -ne 0; then echo "No 'autoreconf' found. You must install the autoconf package." exit 1 fi -GIT=$(which git 2>/dev/null || true) +GIT=$(which git 2>/dev/null) if test $? -ne 0; then echo "No 'git' found. You must install the git package." exit 1 fi -# create m4 before gtkdocize -mkdir m4 2>/dev/null || true +# create m4 before gtkdocize +mkdir -p m4 2>/dev/null -GTKDOCIZE=$(which gtkdocize 2>/dev/null || true) +GTKDOCIZE=$(which gtkdocize 2>/dev/null) if test $? -ne 0; then echo "No gtk-doc support found. You can't build the docs." # rm because gtk-doc.make might be a link to a protected file @@ -24,7 +24,7 @@ if test $? -ne 0; then echo "CLEANFILES =" >>gtk-doc.make GTKDOCIZE="" else - $GTKDOCIZE || exit $? + $GTKDOCIZE fi $GIT submodule init diff --git a/configure.ac b/configure.ac index c3cbd2b..e7a1273 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ -AC_INIT([libpsl], [0.11.0], [tim.ruehsen@gmx.de], [libpsl], [http://github.com/rockdaboot/libpsl]) +AC_INIT([libpsl], [0.12.0], [tim.ruehsen@gmx.de], [libpsl], [http://github.com/rockdaboot/libpsl]) AC_PREREQ([2.59]) AM_INIT_AUTOMAKE([1.10 -Wall no-define foreign]) @@ -20,9 +20,9 @@ AC_C_INLINE # # Generate version defines for include file # -AC_SUBST([LIBPSL_VERSION_MAJOR], [`echo -n $VERSION|cut -d'.' -f1`]) -AC_SUBST([LIBPSL_VERSION_MINOR], [`echo -n $VERSION|cut -d'.' -f2`]) -AC_SUBST([LIBPSL_VERSION_PATCH], [`echo -n $VERSION|cut -d'.' -f3`]) +AC_SUBST([LIBPSL_VERSION_MAJOR], [`echo $VERSION|cut -d'.'
-f1`]) +AC_SUBST([LIBPSL_VERSION_MINOR], [`echo $VERSION|cut -d'.' -f2`]) +AC_SUBST([LIBPSL_VERSION_PATCH], [`echo $VERSION|cut -d'.' -f3`]) AC_SUBST([LIBPSL_VERSION_NUMBER], [`printf '0x%02x%02x%02x' $LIBPSL_VERSION_MAJOR $LIBPSL_VERSION_MINOR $LIBPSL_VERSION_PATCH`]) AC_CONFIG_FILES([include/libpsl.h]) @@ -85,7 +85,7 @@ PKG_PROG_PKG_CONFIG # 4. If any interfaces have been added, removed, or changed since the last update, increment current, and set revision to 0. # 5. If any interfaces have been added since the last public release, then increment age. # 6. If any existing interfaces have been removed or changed since the last public release, then set age to 0. -AC_SUBST([LIBPSL_SO_VERSION], [4:0:4]) +AC_SUBST([LIBPSL_SO_VERSION], [5:0:0]) AC_SUBST([LIBPSL_VERSION], $VERSION) # Check for enable/disable builtin PSL data @@ -154,8 +154,10 @@ if test "$enable_runtime" = "libicu" -o "$enable_builtin" = "libicu"; then # using AC_SEARCH_LIBS also don't work since functions have the library version appended PKG_CHECK_MODULES([LIBICU], [icu-uc], [ HAVE_LIBICU=yes - LIBS="$LIBICU_LIBS $LIBS" - CFLAGS="$LIBICU_CFLAGS $CFLAGS" + if test "$enable_runtime" = "libicu"; then + LIBS="$LIBICU_LIBS $LIBS" + CFLAGS="$LIBICU_CFLAGS $CFLAGS" + fi ], [ OLDLIBS=$LIBS LIBS="-licuuc $LIBS" @@ -216,6 +218,9 @@ elif test -n "$NEEDS_NSL" ; then LIBS="$LIBS -lnsl" fi +# Check for clock_gettime() used for performance measurement +AC_SEARCH_LIBS(clock_gettime, rt) + # Check for valgrind ac_enable_valgrind=no AC_ARG_ENABLE(valgrind-tests, @@ -252,7 +257,7 @@ AC_SUBST(PSL_TESTFILE) # check for alloca / alloca.h AC_FUNC_ALLOCA -AC_CHECK_FUNCS([strndup]) +AC_CHECK_FUNCS([strndup clock_gettime]) # Override the template file name of the generated .pc file, so that there # is no need to rename the template file when the API version changes. 
diff --git a/contrib/check-hard b/contrib/check-hard index a307ff3..a3a3c3a 100755 --- a/contrib/check-hard +++ b/contrib/check-hard @@ -14,7 +14,7 @@ make distclean > /dev/null || true # We define _GNU_SOURCE to avoid warnings with missing prototypes. # C89 does not know snprintf, strdup, strndup, popen, pclose -CFLAGS="-std=c89 -pedantic -O2 -g -Wall -Wextra -Wstrict-prototypes -Wold-style-definition -Wwrite-strings -Wshadow -Wformat -Wformat-security -Wunreachable-code -Wstrict-prototypes -Wmissing-prototypes -Wold-style-definition -D_GNU_SOURCE" +CFLAGS="-std=gnu89 -pedantic -O2 -g -Wall -Wextra -Wstrict-prototypes -Wold-style-definition -Wwrite-strings -Wshadow -Wformat -Wformat-security -Wunreachable-code -Wstrict-prototypes -Wmissing-prototypes -Wold-style-definition" CACHEFILE=$PWD/config_check.cache @@ -40,7 +40,8 @@ for CC in gcc clang; do for options in \ "--enable-runtime=libicu --enable-builtin=libicu" \ "--enable-runtime=libidn2 --enable-builtin=libidn2" \ - "--enable-runtime=libidn --enable-builtin=libidn"; do + "--enable-runtime=libidn --enable-builtin=libidn" \ + "--disable-runtime --enable-builtin=libicu"; do export DISTCHECK_CONFIGURE_FLAGS="-C --cache-file=$CACHEFILE $options" echo echo " *** ./configure $DISTCHECK_CONFIGURE_FLAGS" diff --git a/docs/libpsl/libpsl-sections.txt b/docs/libpsl/libpsl-sections.txt index a27a9da..d72ab64 100644 --- a/docs/libpsl/libpsl-sections.txt +++ b/docs/libpsl/libpsl-sections.txt @@ -6,6 +6,9 @@ PSL_VERSION_MAJOR PSL_VERSION_MINOR PSL_VERSION_NUMBER PSL_VERSION_PATCH +PSL_TYPE_ICANN +PSL_TYPE_PRIVATE +PSL_TYPE_ANY psl_error_t psl_ctx_t psl_load_file @@ -13,12 +16,12 @@ psl_load_fp psl_builtin psl_free psl_is_public_suffix +psl_is_public_suffix2 psl_unregistrable_domain psl_registrable_domain psl_suffix_count psl_suffix_exception_count psl_suffix_wildcard_count -psl_builtin_compile_time psl_builtin_file_time psl_builtin_sha1sum psl_builtin_filename diff --git a/include/libpsl.h.in b/include/libpsl.h.in index 
4f86a50..467021f 100644 --- a/include/libpsl.h.in +++ b/include/libpsl.h.in @@ -1,5 +1,5 @@ /* - * Copyright(c) 2014-2015 Tim Ruehsen + * Copyright(c) 2014-2016 Tim Ruehsen * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -44,6 +44,11 @@ extern "C" { #endif +/* types for psl_is_public_suffix2() */ +#define PSL_TYPE_ICANN (1<<0) +#define PSL_TYPE_PRIVATE (1<<1) +#define PSL_TYPE_ANY (PSL_TYPE_ICANN | PSL_TYPE_PRIVATE) + /** * psl_error_t: * @PSL_SUCCESS: Successful return. @@ -71,57 +76,75 @@ typedef struct _psl_ctx_st psl_ctx_t; /* frees PSL context */ void psl_free(psl_ctx_t *psl); + /* loads PSL data from file */ psl_ctx_t * psl_load_file(const char *fname); + /* loads PSL data from FILE pointer */ psl_ctx_t * psl_load_fp(FILE *fp); + /* retrieves builtin PSL data */ const psl_ctx_t * psl_builtin(void); + /* checks whether domain is a public suffix or not */ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain); + +/* checks whether domain is a public suffix regarding the type or not */ +int + psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type); + /* checks whether cookie_domain is acceptable for domain or not */ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, const char *cookie_domain); + /* returns the longest not registrable domain within 'domain' or NULL if none found */ const char * psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain); + /* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */ const char * psl_registrable_domain(const psl_ctx_t *psl, const char *domain); + /* convert a string into lowercase UTF-8 */ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower); + /* does not include exceptions */ int psl_suffix_count(const psl_ctx_t *psl); + /* just counts exceptions
*/ int psl_suffix_exception_count(const psl_ctx_t *psl); + /* just counts wildcards */ int psl_suffix_wildcard_count(const psl_ctx_t *psl); -/* returns compilation time */ -time_t - psl_builtin_compile_time(void); + /* returns mtime of PSL source file */ time_t psl_builtin_file_time(void); + /* returns SHA1 checksum (hex-encoded, lowercase) of PSL source file */ const char * psl_builtin_sha1sum(void); + /* returns file name of PSL source file */ const char * psl_builtin_filename(void); + /* returns library version string */ const char * psl_get_version(void); + /* checks library version number */ int psl_check_version_number(int version); + /* returns wether the built-in data is outdated or not */ int psl_builtin_outdated(void); diff --git a/list b/list index 2930bb4..1f3ad51 160000 --- a/list +++ b/list @@ -1 +1 @@ -Subproject commit 2930bb4a5256279e0f7ba44cf9d174fc93ecb732 +Subproject commit 1f3ad51171235aafe423435606e869f0161582e4 diff --git a/src/LICENSE.chromium b/src/LICENSE.chromium new file mode 100644 index 0000000..ffe66fe --- /dev/null +++ b/src/LICENSE.chromium @@ -0,0 +1,30 @@ +* The following License is for the source code files + make_dafsa.py and lookup_string_in_fixed_set.c. + +// Copyright 2015 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/Makefile.am b/src/Makefile.am index 62cb87d..9e04d53 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,12 +1,12 @@ # suffixes.c must be created before psl.c is compiled -BUILT_SOURCES = suffixes.c +BUILT_SOURCES = suffixes_dafsa.c # suffixes.c is a built source that must be cleaned -CLEANFILES = suffixes.c +CLEANFILES = suffixes_dafsa.c lib_LTLIBRARIES = libpsl.la -libpsl_la_SOURCES = psl.c +libpsl_la_SOURCES = psl.c lookup_string_in_fixed_set.c libpsl_la_CPPFLAGS = -I$(top_srcdir)/include # include ABI version information libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION) @@ -21,8 +21,8 @@ if WITH_LIBIDN endif noinst_PROGRAMS = psl2c -psl2c_SOURCES = psl2c.c -psl2c_CPPFLAGS = -I$(top_srcdir)/include +psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c +psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/make_dafsa.py\" if BUILTIN_GENERATOR_LIBICU psl2c_LDADD = -licuuc endif @@ -33,7 +33,9 @@ if BUILTIN_GENERATOR_LIBIDN psl2c_LDADD = @LTLIBICONV@ -lidn -lunistring endif -# Build rule for suffix.c +# Build rule for suffix_dafsa.c # PSL_FILE can be set by ./configure --with-psl-file=[PATH] -suffixes.c: $(PSL_FILE) psl2c$(EXEEXT) 
- ./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes.c +suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT) + ./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c + +EXTRA_DIST = make_dafsa.py LICENSE.chromium diff --git a/src/lookup_string_in_fixed_set.c b/src/lookup_string_in_fixed_set.c new file mode 100644 index 0000000..81a4e4d --- /dev/null +++ b/src/lookup_string_in_fixed_set.c @@ -0,0 +1,204 @@ +/* Copyright 2015 The Chromium Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE.chromium file. + * + * Converted to C89 2015 by Tim Rühsen + */ + +#include <stddef.h> + +#if defined(__GNUC__) && defined(__GNUC_MINOR__) +# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) +#else +# define _GCC_VERSION_AT_LEAST(major, minor) 0 +#endif + +#if _GCC_VERSION_AT_LEAST(4,0) +# define _HIDDEN __attribute__ ((visibility ("hidden"))) +#else +# define _HIDDEN +#endif + +#define CHECK_LT(a, b) if ((a) >= b) return 0 + +/* + * Read next offset from pos. + * Returns true if an offset could be read, false otherwise. + */ + +static int GetNextOffset(const unsigned char** pos, + const unsigned char* end, + const unsigned char** offset) +{ + size_t bytes_consumed; + + if (*pos == end) + return 0; + + /* When reading an offset the byte array must always contain at least + * three more bytes to consume. First the offset to read, then a node + * to skip over and finally a destination node. No object can be smaller + * than one byte.
*/ + CHECK_LT(*pos + 2, end); + switch (**pos & 0x60) { + case 0x60: /* Read three byte offset */ + *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2]; + bytes_consumed = 3; + break; + case 0x40: /* Read two byte offset */ + *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1]; + bytes_consumed = 2; + break; + default: + *offset += (*pos)[0] & 0x3F; + bytes_consumed = 1; + } + if ((**pos & 0x80) != 0) { + *pos = end; + } else { + *pos += bytes_consumed; + } + return 1; +} + +/* + * Check if byte at offset is last in label. + */ + +static int IsEOL(const unsigned char* offset, const unsigned char* end) +{ + CHECK_LT(offset, end); + return(*offset & 0x80) != 0; +} + +/* + * Check if byte at offset matches first character in key. + * This version matches characters not last in label. + */ + +static int IsMatch(const unsigned char* offset, + const unsigned char* end, + const char* key) +{ + CHECK_LT(offset, end); + return *offset == *key; +} + +/* + * Check if byte at offset matches first character in key. + * This version matches characters last in label. + */ + +static int IsEndCharMatch(const unsigned char* offset, + const unsigned char* end, + const char* key) +{ + CHECK_LT(offset, end); + return *offset == (*key | 0x80); +} + +/* + * Read return value at offset. + * Returns true if a return value could be read, false otherwise. + */ + +static int GetReturnValue(const unsigned char* offset, + const unsigned char* end, + int* return_value) +{ + CHECK_LT(offset, end); + if ((*offset & 0xE0) == 0x80) { + *return_value = *offset & 0x0F; + return 1; + } + return 0; +} + +/* + * Looks up the string |key| with length |key_length| in a fixed set of + * strings. The set of strings must be known at compile time. It is converted to + * a graph structure named a DAFSA (Deterministic Acyclic Finite State + * Automaton) by the script make_dafsa.py during compilation. This permits + * efficient (in time and space) lookup. 
The graph generated by make_dafsa.py + * takes the form of a constant byte array which should be supplied via the + * |graph| and |length| parameters. The return value is kDafsaNotFound, + * kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule, + * kDafsaWildcardRule and kDafsaPrivateRule ORed together. + * + * Lookup a domain key in a byte array generated by make_dafsa.py. + */ + +/* prototype to skip warning with -Wmissing-prototypes */ +int _HIDDEN LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t); + +int _HIDDEN LookupStringInFixedSet(const unsigned char* graph, + size_t length, + const char* key, + size_t key_length) +{ + const unsigned char* pos = graph; + const unsigned char* end = graph + length; + const unsigned char* offset = pos; + const char* key_end = key + key_length; + + while (GetNextOffset(&pos, end, &offset)) { + /*char + end_char offsets + * char + return value + * char end_char offsets + * char return value + * end_char offsets + * return_value + */ + int did_consume = 0; + + if (key != key_end && !IsEOL(offset, end)) { + /* Leading is not a match. Don't dive into this child */ + if (!IsMatch(offset, end, key)) + continue; + did_consume = 1; + ++offset; + ++key; + /* Possible matches at this point: + * + end_char offsets + * + return value + * end_char offsets + * return value + */ + + /* Remove all remaining nodes possible */ + while (!IsEOL(offset, end) && key != key_end) { + if (!IsMatch(offset, end, key)) + return -1; + ++key; + ++offset; + } + } + /* Possible matches at this point: + * end_char offsets + * return_value + * If one or more elements were consumed, a failure + * to match is terminal. Otherwise, try the next node. + */ + if (key == key_end) { + int return_value; + + if (GetReturnValue(offset, end, &return_value)) + return return_value; + /* The DAFSA guarantees that if the first char is a match, all + * remaining char elements MUST match if the key is truly present. 
+ */ + if (did_consume) + return -1; + continue; + } + if (!IsEndCharMatch(offset, end, key)) { + if (did_consume) + return -1; /* Unexpected */ + continue; + } + ++key; + pos = ++offset; /* Dive into child */ + } + + return -1; /* No match */ +} diff --git a/src/make_dafsa.py b/src/make_dafsa.py new file mode 100755 index 0000000..bb308f6 --- /dev/null +++ b/src/make_dafsa.py @@ -0,0 +1,588 @@ +#!/usr/bin/env python +# Copyright 2014 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE.chromium file. + +""" +A Deterministic acyclic finite state automaton (DAFSA) is a compact +representation of an unordered word list (dictionary). + +http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton + +This python program converts a list of strings to a byte array in C++. +This python program fetches strings and return values from a gperf file +and generates a C++ file with a byte array representing a graph that can be +used as a memory efficient replacement for the perfect hash table. + +The input strings are assumed to consist of printable 7-bit ASCII characters +and the return values are assumed to be one digit integers. + +In this program a DAFSA is a diamond shaped graph starting at a common +source node and ending at a common sink node. All internal nodes contain +a label and each word is represented by the labels in one path from +the source node to the sink node. + +The following python representation is used for nodes: + + Source node: [ children ] + Internal node: (label, [ children ]) + Sink node: None + +The graph is first compressed by prefixes like a trie. In the next step +suffixes are compressed so that the graph gets diamond shaped. Finally +one to one linked nodes are replaced by nodes with the labels joined. + +The order of the operations is crucial since lookups will be performed +starting from the source with no backtracking.
Thus a node must have at +most one child with a label starting by the same character. The output +is also arranged so that all jumps are to increasing addresses, thus forward +in memory. + +The generated output has suffix free decoding so that the sign of leading +bits in a link (a reference to a child node) indicate if it has a size of one, +two or three bytes and if it is the last outgoing link from the actual node. +A node label is terminated by a byte with the leading bit set. + +The generated byte array can described by the following BNF: + + ::= < 8-bit value in range [0x00-0xFF] > + + ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] > + ::= < char + 0x80, byte in range [0xA0-0xFF] > + ::= < value + 0x80, byte in range [0x80-0x8F] > + + ::= < byte in range [0x00-0x3F] > + ::= < byte in range [0x40-0x5F] > + ::= < byte in range [0x60-0x7F] > + + ::= < byte in range [0x80-0xBF] > + ::= < byte in range [0xC0-0xDF] > + ::= < byte in range [0xE0-0xFF] > + + ::= + +