diff --git a/.travis.yml b/.travis.yml
index d72425b..3a0823c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,29 +1,41 @@
+sudo: false
+
language: c
+
compiler:
- gcc
- clang
-# Change this to your needs
+
+env:
+ - RUNTIME=libicu
+ - RUNTIME=libidn2
+ - RUNTIME=libidn
+ - RUNTIME=no
+
+addons:
+ apt:
+ packages:
+ - automake
+ - autoconf
+ - autopoint
+ - libtool
+ - gtk-doc-tools
+ - gettext
+ - libidn11
+ - libidn11-dev
+ - libidn2-0
+ - libidn2-0-dev
+ - libicu48
+ - libicu-dev
+ - libunistring0
+ - libunistring-dev
+
script:
- ./autogen.sh
- ./configure && make -j4 && make check -j4
- - ./configure --enable-runtime=libicu --enable-builtin=libicu && make clean && make -j4 && make check -j4
- - ./configure --enable-runtime=libicu --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
- - ./configure --enable-runtime=libicu --enable-builtin=libidn && make clean && make -j4 && make check -j4
- - ./configure --enable-runtime=libicu --disable-builtin && make clean && make -j4 && make check -j4
- - ./configure --enable-runtime=libidn2 --enable-builtin=libicu && make clean && make -j4 && make check -j4
- - ./configure --enable-runtime=libidn2 --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
- - ./configure --enable-runtime=libidn2 --enable-builtin=libidn && make clean && make -j4 && make check -j4
- - ./configure --enable-runtime=libidn2 --disable-builtin && make clean && make -j4 && make check -j4
- - ./configure --enable-runtime=libidn --enable-builtin=libicu && make clean && make -j4 && make check -j4
- - ./configure --enable-runtime=libidn --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
- - ./configure --enable-runtime=libidn --enable-builtin=libidn && make clean && make -j4 && make check -j4
- - ./configure --enable-runtime=libidn --disable-builtin && make clean && make -j4 && make check -j4
- - ./configure --disable-runtime --enable-builtin=libicu && make clean && make -j4 && make check -j4
- - ./configure --disable-runtime --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
- - ./configure --disable-runtime --enable-builtin=libidn && make clean && make -j4 && make check -j4
- - ./configure --disable-runtime --disable-builtin && make clean && make -j4 && make check -j4
+ - ./configure --enable-runtime=$RUNTIME --enable-builtin=libicu && make clean && make -j4 && make check -j4
+ - ./configure --enable-runtime=$RUNTIME --enable-builtin=libidn2 && make clean && make -j4 && make check -j4
+ - ./configure --enable-runtime=$RUNTIME --enable-builtin=libidn && make clean && make -j4 && make check -j4
+ - ./configure --enable-runtime=$RUNTIME --disable-builtin && make clean && make -j4 && make check -j4
- ./configure --enable-gtk-doc && make -j4 && make check -j4
- make distcheck
-before_install:
- - sudo apt-get -qq update
- - sudo apt-get -q install autoconf automake autopoint libtool gtk-doc-tools gettext libidn11 libidn11-dev libidn2-0 libidn2-0-dev libicu48 libicu-dev libunistring0 libunistring-dev
diff --git a/AUTHORS b/AUTHORS
index 12e83e6..33dad7b 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -15,3 +15,4 @@ Dagobert Michelsen (Fixed Solaris building)
Christopher Meng (Fedora building)
Jakub Čajka
Giuseppe Scrivano
+Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
diff --git a/Makefile.am b/Makefile.am
index d488ce4..3904754 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -14,4 +14,9 @@ ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS}
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = libpsl.pc
-EXTRA_DIST = config.rpath LICENSE $(PSL_FILE) list/tests/test_psl.txt
+EXTRA_DIST = config.rpath LICENSE
+# PSL_FILE and PSL_TESTFILE are configure-time paths (see --with-psl-file), so copy them into the dist tree explicitly
+dist-hook:
+	mkdir -p "$(distdir)/list/tests"
+	cp -p "$(PSL_FILE)" "$(distdir)/list"
+	cp -p "$(PSL_TESTFILE)" "$(distdir)/list/tests"
diff --git a/NEWS b/NEWS
index b17e17e..57c9b95 100644
--- a/NEWS
+++ b/NEWS
@@ -1,10 +1,23 @@
-Copyright (C) 2014-2015 Tim Rühsen
+Copyright (C) 2014-2016 Tim Rühsen
+
+02.01.2016 Release V0.12.0
+ * Load DAFSA binaries with psl_load_file() via auto-detection
+ * Add more tests
+ * Remove psl_builtin_compile_time()
+ * Compile PSL into DAFSA using make_dafsa.py
+ * Avoid libicu dependency with --enable-runtime=no
+ * Test on new Travis-CI build farm
+ * Use DAFSA format for builtin PSL data
+ * Add function psl_is_public_suffix2()
+ * Fix psl_builtin_outdated()
+ * Fix several bugs
+ * Cleanup code
23.09.2015 Release V0.11.0
* Add new function psl_check_version_number()
* Add version defines to include file
-19.09.2025 Release V0.10.0
+19.09.2015 Release V0.10.0
* Code simplified
* Less data entries, faster lookups
* Add new function psl_suffix_wildcard_count()
diff --git a/README.md b/README.md
index 7bc8fbc..75e1038 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ Browsers and other web clients can use it to
Libpsl...
-- has built-in PSL data for fast access
+- has built-in PSL data for fast access (DAWG/DAFSA reduces size from 180kB to ~32kB)
- allows to load PSL data from files
- checks if a given domain is a "public suffix"
- provides immediate cookie domain verification
@@ -28,6 +28,8 @@ Find more information about the Publix Suffix List [here](http://publicsuffix.or
Download the Public Suffix List [here](https://hg.mozilla.org/mozilla-central/raw-file/tip/netwerk/dns/effective_tld_names.dat).
+The DAFSA code has been taken from the [Chromium Project](https://code.google.com/p/chromium/).
+
API Documentation
-----------------
@@ -74,6 +76,8 @@ License
Libpsl is made available under the terms of the MIT license.
See the LICENSE file that accompanies this distribution for the full text of the license.
+src/make_dafsa.py and src/lookup_string_in_fixed_set.c are licensed under the terms written in
+src/LICENSE.chromium.
Building from git
-----------------
diff --git a/autogen.sh b/autogen.sh
index e714cf8..aee0a7c 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -1,21 +1,21 @@
-#!/bin/sh -e
+#!/bin/sh
-AUTORECONF=$(which autoreconf 2>/dev/null || true)
+AUTORECONF=$(which autoreconf 2>/dev/null)
if test $? -ne 0; then
echo "No 'autoreconf' found. You must install the autoconf package."
exit 1
fi
-GIT=$(which git 2>/dev/null || true)
+GIT=$(which git 2>/dev/null)
if test $? -ne 0; then
echo "No 'git' found. You must install the git package."
exit 1
fi
-# create m4 before gtkdocize
-mkdir m4 2>/dev/null || true
+# create m4 before gtkdocize
+mkdir -p m4 2>/dev/null
-GTKDOCIZE=$(which gtkdocize 2>/dev/null || true)
+GTKDOCIZE=$(which gtkdocize 2>/dev/null)
if test $? -ne 0; then
echo "No gtk-doc support found. You can't build the docs."
# rm because gtk-doc.make might be a link to a protected file
@@ -24,7 +24,7 @@ if test $? -ne 0; then
echo "CLEANFILES =" >>gtk-doc.make
GTKDOCIZE=""
else
- $GTKDOCIZE || exit $?
+ $GTKDOCIZE || exit $?  # the shebang no longer carries -e, so stop explicitly if gtkdocize fails
fi
$GIT submodule init
diff --git a/configure.ac b/configure.ac
index c3cbd2b..e7a1273 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,5 +1,5 @@
-AC_INIT([libpsl], [0.11.0], [tim.ruehsen@gmx.de], [libpsl], [http://github.com/rockdaboot/libpsl])
+AC_INIT([libpsl], [0.12.0], [tim.ruehsen@gmx.de], [libpsl], [http://github.com/rockdaboot/libpsl])
AC_PREREQ([2.59])
AM_INIT_AUTOMAKE([1.10 -Wall no-define foreign])
@@ -20,9 +20,9 @@ AC_C_INLINE
#
# Generate version defines for include file
#
-AC_SUBST([LIBPSL_VERSION_MAJOR], [`echo -n $VERSION|cut -d'.' -f1`])
-AC_SUBST([LIBPSL_VERSION_MINOR], [`echo -n $VERSION|cut -d'.' -f2`])
-AC_SUBST([LIBPSL_VERSION_PATCH], [`echo -n $VERSION|cut -d'.' -f3`])
+AC_SUBST([LIBPSL_VERSION_MAJOR], [`echo $VERSION|cut -d'.' -f1`])
+AC_SUBST([LIBPSL_VERSION_MINOR], [`echo $VERSION|cut -d'.' -f2`])
+AC_SUBST([LIBPSL_VERSION_PATCH], [`echo $VERSION|cut -d'.' -f3`])
AC_SUBST([LIBPSL_VERSION_NUMBER], [`printf '0x%02x%02x%02x' $LIBPSL_VERSION_MAJOR $LIBPSL_VERSION_MINOR $LIBPSL_VERSION_PATCH`])
AC_CONFIG_FILES([include/libpsl.h])
@@ -85,7 +85,7 @@ PKG_PROG_PKG_CONFIG
# 4. If any interfaces have been added, removed, or changed since the last update, increment current, and set revision to 0.
# 5. If any interfaces have been added since the last public release, then increment age.
# 6. If any existing interfaces have been removed or changed since the last public release, then set age to 0.
-AC_SUBST([LIBPSL_SO_VERSION], [4:0:4])
+AC_SUBST([LIBPSL_SO_VERSION], [5:0:0])
AC_SUBST([LIBPSL_VERSION], $VERSION)
# Check for enable/disable builtin PSL data
@@ -154,8 +154,10 @@ if test "$enable_runtime" = "libicu" -o "$enable_builtin" = "libicu"; then
# using AC_SEARCH_LIBS also don't work since functions have the library version appended
PKG_CHECK_MODULES([LIBICU], [icu-uc], [
HAVE_LIBICU=yes
- LIBS="$LIBICU_LIBS $LIBS"
- CFLAGS="$LIBICU_CFLAGS $CFLAGS"
+ if test "$enable_runtime" = "libicu"; then
+ LIBS="$LIBICU_LIBS $LIBS"
+ CFLAGS="$LIBICU_CFLAGS $CFLAGS"
+ fi
], [
OLDLIBS=$LIBS
LIBS="-licuuc $LIBS"
@@ -216,6 +218,9 @@ elif test -n "$NEEDS_NSL" ; then
LIBS="$LIBS -lnsl"
fi
+# Check for clock_gettime() used for performance measurement
+AC_SEARCH_LIBS(clock_gettime, rt)
+
# Check for valgrind
ac_enable_valgrind=no
AC_ARG_ENABLE(valgrind-tests,
@@ -252,7 +257,7 @@ AC_SUBST(PSL_TESTFILE)
# check for alloca / alloca.h
AC_FUNC_ALLOCA
-AC_CHECK_FUNCS([strndup])
+AC_CHECK_FUNCS([strndup clock_gettime])
# Override the template file name of the generated .pc file, so that there
# is no need to rename the template file when the API version changes.
diff --git a/contrib/check-hard b/contrib/check-hard
index a307ff3..a3a3c3a 100755
--- a/contrib/check-hard
+++ b/contrib/check-hard
@@ -14,7 +14,7 @@ make distclean > /dev/null || true
# We define _GNU_SOURCE to avoid warnings with missing prototypes.
# C89 does not know snprintf, strdup, strndup, popen, pclose
-CFLAGS="-std=c89 -pedantic -O2 -g -Wall -Wextra -Wstrict-prototypes -Wold-style-definition -Wwrite-strings -Wshadow -Wformat -Wformat-security -Wunreachable-code -Wstrict-prototypes -Wmissing-prototypes -Wold-style-definition -D_GNU_SOURCE"
+CFLAGS="-std=gnu89 -pedantic -O2 -g -Wall -Wextra -Wstrict-prototypes -Wold-style-definition -Wwrite-strings -Wshadow -Wformat -Wformat-security -Wunreachable-code -Wstrict-prototypes -Wmissing-prototypes -Wold-style-definition"
CACHEFILE=$PWD/config_check.cache
@@ -40,7 +40,8 @@ for CC in gcc clang; do
for options in \
"--enable-runtime=libicu --enable-builtin=libicu" \
"--enable-runtime=libidn2 --enable-builtin=libidn2" \
- "--enable-runtime=libidn --enable-builtin=libidn"; do
+ "--enable-runtime=libidn --enable-builtin=libidn" \
+ "--disable-runtime --enable-builtin=libicu"; do
export DISTCHECK_CONFIGURE_FLAGS="-C --cache-file=$CACHEFILE $options"
echo
echo " *** ./configure $DISTCHECK_CONFIGURE_FLAGS"
diff --git a/docs/libpsl/libpsl-sections.txt b/docs/libpsl/libpsl-sections.txt
index a27a9da..d72ab64 100644
--- a/docs/libpsl/libpsl-sections.txt
+++ b/docs/libpsl/libpsl-sections.txt
@@ -6,6 +6,9 @@ PSL_VERSION_MAJOR
PSL_VERSION_MINOR
PSL_VERSION_NUMBER
PSL_VERSION_PATCH
+PSL_TYPE_ICANN
+PSL_TYPE_PRIVATE
+PSL_TYPE_ANY
psl_error_t
psl_ctx_t
psl_load_file
@@ -13,12 +16,12 @@ psl_load_fp
psl_builtin
psl_free
psl_is_public_suffix
+psl_is_public_suffix2
psl_unregistrable_domain
psl_registrable_domain
psl_suffix_count
psl_suffix_exception_count
psl_suffix_wildcard_count
-psl_builtin_compile_time
psl_builtin_file_time
psl_builtin_sha1sum
psl_builtin_filename
diff --git a/include/libpsl.h.in b/include/libpsl.h.in
index 4f86a50..467021f 100644
--- a/include/libpsl.h.in
+++ b/include/libpsl.h.in
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2014-2015 Tim Ruehsen
+ * Copyright(c) 2014-2016 Tim Ruehsen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -44,6 +44,11 @@
extern "C" {
#endif
+/* types for psl_is_public_suffix2() */
+#define PSL_TYPE_ICANN (1<<0)
+#define PSL_TYPE_PRIVATE (1<<1)
+#define PSL_TYPE_ANY (PSL_TYPE_ICANN | PSL_TYPE_PRIVATE)
+
/**
* psl_error_t:
* @PSL_SUCCESS: Successful return.
@@ -71,57 +76,75 @@ typedef struct _psl_ctx_st psl_ctx_t;
/* frees PSL context */
void
psl_free(psl_ctx_t *psl);
+
/* loads PSL data from file */
psl_ctx_t *
psl_load_file(const char *fname);
+
/* loads PSL data from FILE pointer */
psl_ctx_t *
psl_load_fp(FILE *fp);
+
/* retrieves builtin PSL data */
const psl_ctx_t *
psl_builtin(void);
+
/* checks whether domain is a public suffix or not */
int
psl_is_public_suffix(const psl_ctx_t *psl, const char *domain);
+
+/* checks whether domain is a public suffix of the given type or not */
+int
+ psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type);
+
/* checks whether cookie_domain is acceptable for domain or not */
int
psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, const char *cookie_domain);
+
/* returns the longest not registrable domain within 'domain' or NULL if none found */
const char *
psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain);
+
/* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */
const char *
psl_registrable_domain(const psl_ctx_t *psl, const char *domain);
+
/* convert a string into lowercase UTF-8 */
psl_error_t
psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower);
+
/* does not include exceptions */
int
psl_suffix_count(const psl_ctx_t *psl);
+
/* just counts exceptions */
int
psl_suffix_exception_count(const psl_ctx_t *psl);
+
/* just counts wildcards */
int
psl_suffix_wildcard_count(const psl_ctx_t *psl);
-/* returns compilation time */
-time_t
- psl_builtin_compile_time(void);
+
/* returns mtime of PSL source file */
time_t
psl_builtin_file_time(void);
+
/* returns SHA1 checksum (hex-encoded, lowercase) of PSL source file */
const char *
psl_builtin_sha1sum(void);
+
/* returns file name of PSL source file */
const char *
psl_builtin_filename(void);
+
/* returns library version string */
const char *
psl_get_version(void);
+
/* checks library version number */
int
psl_check_version_number(int version);
+
/* returns wether the built-in data is outdated or not */
int
psl_builtin_outdated(void);
diff --git a/list b/list
index 2930bb4..1f3ad51 160000
--- a/list
+++ b/list
@@ -1 +1 @@
-Subproject commit 2930bb4a5256279e0f7ba44cf9d174fc93ecb732
+Subproject commit 1f3ad51171235aafe423435606e869f0161582e4
diff --git a/src/LICENSE.chromium b/src/LICENSE.chromium
new file mode 100644
index 0000000..ffe66fe
--- /dev/null
+++ b/src/LICENSE.chromium
@@ -0,0 +1,30 @@
+* The following License is for the source code files
+ make_dafsa.py and lookup_string_in_fixed_set.c.
+
+// Copyright 2015 The Chromium Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/Makefile.am b/src/Makefile.am
index 62cb87d..9e04d53 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,12 +1,12 @@
# suffixes.c must be created before psl.c is compiled
-BUILT_SOURCES = suffixes.c
+BUILT_SOURCES = suffixes_dafsa.c
# suffixes.c is a built source that must be cleaned
-CLEANFILES = suffixes.c
+CLEANFILES = suffixes_dafsa.c
lib_LTLIBRARIES = libpsl.la
-libpsl_la_SOURCES = psl.c
+libpsl_la_SOURCES = psl.c lookup_string_in_fixed_set.c
libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
# include ABI version information
libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
@@ -21,8 +21,8 @@ if WITH_LIBIDN
endif
noinst_PROGRAMS = psl2c
-psl2c_SOURCES = psl2c.c
-psl2c_CPPFLAGS = -I$(top_srcdir)/include
+psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
+psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/make_dafsa.py\"
if BUILTIN_GENERATOR_LIBICU
psl2c_LDADD = -licuuc
endif
@@ -33,7 +33,9 @@ if BUILTIN_GENERATOR_LIBIDN
psl2c_LDADD = @LTLIBICONV@ -lidn -lunistring
endif
-# Build rule for suffix.c
+# Build rule for suffixes_dafsa.c
# PSL_FILE can be set by ./configure --with-psl-file=[PATH]
-suffixes.c: $(PSL_FILE) psl2c$(EXEEXT)
- ./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes.c
+suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
+ ./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
+
+EXTRA_DIST = make_dafsa.py LICENSE.chromium
diff --git a/src/lookup_string_in_fixed_set.c b/src/lookup_string_in_fixed_set.c
new file mode 100644
index 0000000..81a4e4d
--- /dev/null
+++ b/src/lookup_string_in_fixed_set.c
@@ -0,0 +1,204 @@
+/* Copyright 2015 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE.chromium file.
+ *
+ * Converted to C89 2015 by Tim Rühsen
+ */
+
+#include
+
+#if defined(__GNUC__) && defined(__GNUC_MINOR__)
+# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+#else
+# define _GCC_VERSION_AT_LEAST(major, minor) 0
+#endif
+
+#if _GCC_VERSION_AT_LEAST(4,0)
+# define _HIDDEN __attribute__ ((visibility ("hidden")))
+#else
+# define _HIDDEN
+#endif
+
+#define CHECK_LT(a, b) if ((a) >= b) return 0
+
+/* Decode the 1-3 byte link at *pos and add its value to *offset; then advance
+ * *pos past it, or set *pos to end when bit 0x80 marks the last link.
+ * Returns 1 on success, 0 if *pos == end or fewer than 3 bytes remain.
+ */
+
+static int GetNextOffset(const unsigned char** pos,
+ const unsigned char* end,
+ const unsigned char** offset)
+{
+ size_t bytes_consumed;
+
+ if (*pos == end)
+ return 0;
+
+ /* When reading an offset the byte array must always contain at least
+ * three more bytes to consume. First the offset to read, then a node
+ * to skip over and finally a destination node. No object can be smaller
+ * than one byte. */
+ CHECK_LT(*pos + 2, end);
+ switch (**pos & 0x60) {
+ case 0x60: /* Read three byte offset */
+ *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
+ bytes_consumed = 3;
+ break;
+ case 0x40: /* Read two byte offset */
+ *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
+ bytes_consumed = 2;
+ break;
+ default: /* Read one byte offset */
+ *offset += (*pos)[0] & 0x3F;
+ bytes_consumed = 1;
+ }
+ if ((**pos & 0x80) != 0) { /* high bit set: this was the last link */
+ *pos = end; /* makes the next call return 0 */
+ } else {
+ *pos += bytes_consumed;
+ }
+ return 1;
+}
+
+/*
+ * Check if byte at offset is last in label (bit 0x80 set); 0 if offset >= end.
+ */
+
+static int IsEOL(const unsigned char* offset, const unsigned char* end)
+{
+ CHECK_LT(offset, end);
+ return(*offset & 0x80) != 0;
+}
+
+/*
+ * Check if byte at offset matches first character in key.
+ * This version matches characters not last in label. Returns 0 if offset >= end.
+ */
+
+static int IsMatch(const unsigned char* offset,
+ const unsigned char* end,
+ const char* key)
+{
+ CHECK_LT(offset, end);
+ return *offset == *key;
+}
+
+/*
+ * Check if byte at offset matches first character in key with bit 0x80 set.
+ * This version matches characters last in label. Returns 0 if offset >= end.
+ */
+
+static int IsEndCharMatch(const unsigned char* offset,
+ const unsigned char* end,
+ const char* key)
+{
+ CHECK_LT(offset, end);
+ return *offset == (*key | 0x80);
+}
+
+/*
+ * Read return value at offset: a byte in 0x80-0x9F whose low 4 bits go to
+ * *return_value. Returns 1 if read, 0 otherwise (also when offset >= end).
+ */
+
+static int GetReturnValue(const unsigned char* offset,
+ const unsigned char* end,
+ int* return_value)
+{
+ CHECK_LT(offset, end);
+ if ((*offset & 0xE0) == 0x80) {
+ *return_value = *offset & 0x0F;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Looks up the string |key| with length |key_length| in a fixed set of
+ * strings. The set of strings must be known at compile time. It is converted to
+ * a graph structure named a DAFSA (Deterministic Acyclic Finite State
+ * Automaton) by the script make_dafsa.py during compilation. This permits
+ * efficient (in time and space) lookup. The graph generated by make_dafsa.py
+ * takes the form of a constant byte array which should be supplied via the
+ * |graph| and |length| parameters. The return value is kDafsaNotFound,
+ * kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule,
+ * kDafsaWildcardRule and kDafsaPrivateRule ORed together.
+ * (In this file: -1 means "not found"; otherwise the graph's 4-bit value is returned.)
+ * Lookup a domain key in a byte array generated by make_dafsa.py.
+ */
+
+/* prototype to skip warning with -Wmissing-prototypes */
+int _HIDDEN LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);
+
+int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
+ size_t length,
+ const char* key,
+ size_t key_length)
+{
+ const unsigned char* pos = graph;
+ const unsigned char* end = graph + length;
+ const unsigned char* offset = pos;
+ const char* key_end = key + key_length;
+
+ while (GetNextOffset(&pos, end, &offset)) {
+ /*char <char>+ end_char offsets
+ * char <char>+ return value
+ * char end_char offsets
+ * char return value
+ * end_char offsets
+ * return_value
+ */
+ int did_consume = 0;
+
+ if (key != key_end && !IsEOL(offset, end)) {
+ /* Leading <char> is not a match. Don't dive into this child */
+ if (!IsMatch(offset, end, key))
+ continue;
+ did_consume = 1;
+ ++offset;
+ ++key;
+ /* Possible matches at this point:
+ * <char>+ end_char offsets
+ * <char>+ return value
+ * end_char offsets
+ * return value
+ */
+
+ /* Remove all remaining <char> nodes possible */
+ while (!IsEOL(offset, end) && key != key_end) {
+ if (!IsMatch(offset, end, key))
+ return -1;
+ ++key;
+ ++offset;
+ }
+ }
+ /* Possible matches at this point:
+ * end_char offsets
+ * return_value
+ * If one or more elements were consumed, a failure
+ * to match is terminal. Otherwise, try the next node.
+ */
+ if (key == key_end) {
+ int return_value;
+
+ if (GetReturnValue(offset, end, &return_value))
+ return return_value;
+ /* The DAFSA guarantees that if the first char is a match, all
+ * remaining char elements MUST match if the key is truly present.
+ */
+ if (did_consume)
+ return -1;
+ continue;
+ }
+ if (!IsEndCharMatch(offset, end, key)) {
+ if (did_consume)
+ return -1; /* Unexpected */
+ continue;
+ }
+ ++key;
+ pos = ++offset; /* Dive into child */
+ }
+
+ return -1; /* No match */
+}
diff --git a/src/make_dafsa.py b/src/make_dafsa.py
new file mode 100755
index 0000000..bb308f6
--- /dev/null
+++ b/src/make_dafsa.py
@@ -0,0 +1,588 @@
+#!/usr/bin/env python
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE.chromium file.
+
+"""
+A Deterministic acyclic finite state automaton (DAFSA) is a compact
+representation of an unordered word list (dictionary).
+
+http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
+
+This python program converts a list of strings to a byte array in C++.
+This python program fetches strings and return values from a gperf file
+and generates a C++ file with a byte array representing graph that can be
+used as a memory efficient replacement for the perfect hash table.
+
+The input strings are assumed to consist of printable 7-bit ASCII characters
+and the return values are assumed to be one digit integers.
+
+In this program a DAFSA is a diamond shaped graph starting at a common
+source node and ending at a common sink node. All internal nodes contain
+a label and each word is represented by the labels in one path from
+the source node to the sink node.
+
+The following python representation is used for nodes:
+
+ Source node: [ children ]
+ Internal node: (label, [ children ])
+ Sink node: None
+
+The graph is first compressed by prefixes like a trie. In the next step
+suffixes are compressed so that the graph gets diamond shaped. Finally
+one to one linked nodes are replaced by nodes with the labels joined.
+
+The order of the operations is crucial since lookups will be performed
+starting from the source with no backtracking. Thus a node must have at
+most one child with a label starting by the same character. The output
+is also arranged so that all jumps are to increasing addresses, thus forward
+in memory.
+
+The generated output has suffix free decoding so that the sign of leading
+bits in a link (a reference to a child node) indicate if it has a size of one,
+two or three bytes and if it is the last outgoing link from the actual node.
+A node label is terminated by a byte with the leading bit set.
+
+The generated byte array can be described by the following BNF:
+
+<byte> ::= < 8-bit value in range [0x00-0xFF] >
+
+<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
+<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
+<return_value> ::= < value + 0x80, byte in range [0x80-0x8F] >
+
+<offset1> ::= < byte in range [0x00-0x3F] >
+<offset2> ::= < byte in range [0x40-0x5F] >
+<offset3> ::= < byte in range [0x60-0x7F] >
+
+<end_offset1> ::= < byte in range [0x80-0xBF] >
+<end_offset2> ::= < byte in range [0xC0-0xDF] >
+<end_offset3> ::= < byte in range [0xE0-0xFF] >
+
+ ::=
+
+