From 643e523f09277454c5f89b51f9c4fc11dcf8dbf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Sun, 27 Sep 2015 19:14:13 +0200 Subject: [PATCH 01/38] Fix psl_builtin_outdated() --- src/psl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/psl.c b/src/psl.c index e9bcae2..486da89 100644 --- a/src/psl.c +++ b/src/psl.c @@ -972,10 +972,10 @@ int psl_builtin_outdated(void) { struct stat st; - if (stat(_psl_filename, &st) == 0 && st.st_mtime > _psl_file_time) - return 0; + if (stat(_psl_filename, &st) == 0 && st.st_mtime < _psl_file_time) + return 1; - return 1; + return 0; } /** From ac40a6bfc8e577f36d605d392cc46d84448776ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Sun, 27 Sep 2015 19:14:58 +0200 Subject: [PATCH 02/38] Extend tools/psl --print-info and --version --- tools/psl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/psl.c b/tools/psl.c index 6790efe..fb3972c 100644 --- a/tools/psl.c +++ b/tools/psl.c @@ -117,7 +117,7 @@ int main(int argc, const char *const *argv) usage(0, stdout); } else if (!strcmp(*arg, "--version")) { - printf("psl %s\n", PACKAGE_VERSION); + printf("psl %s (0x%06x)\n", PACKAGE_VERSION, psl_check_version_number(0)); printf("libpsl %s\n", psl_get_version()); printf("\n"); printf("Copyright (C) 2014-2015 Tim Ruehsen\n"); @@ -214,6 +214,7 @@ int main(int argc, const char *const *argv) printf("builtin compile time: %ld (%s)\n", psl_builtin_compile_time(), time2str(psl_builtin_compile_time())); printf("builtin file time: %ld (%s)\n", psl_builtin_file_time(), time2str(psl_builtin_file_time())); printf("builtin SHA1 file hash: %s\n", psl_builtin_sha1sum()); + printf("builtin outdated: %d\n", psl_builtin_outdated()); } else printf("No builtin PSL data available\n"); } From db9a3613d684181738aad4962c47bb7845713157 Mon Sep 17 00:00:00 2001 From: Christopher Meng Date: Wed, 7 Oct 2015 19:08:26 +0800 Subject: [PATCH 03/38] Correct the present year --- NEWS | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS b/NEWS index b17e17e..97b270d 100644 --- a/NEWS +++ b/NEWS @@ -4,7 +4,7 @@ Copyright (C) 2014-2015 Tim Rühsen * Add new function psl_check_version_number() * Add version defines to include file -19.09.2025 Release V0.10.0 +19.09.2015 Release V0.10.0 * Code simplified * Less data entries, faster lookups * Add new function psl_suffix_wildcard_count() From dbefdb67678d0774c352aebe4eb37e526c280a3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Thu, 19 Nov 2015 10:06:04 +0100 Subject: [PATCH 04/38] Remove include of bits/stat.h --- src/psl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/psl.c b/src/psl.c index 486da89..ed14a19 100644 --- a/src/psl.c +++ b/src/psl.c @@ -87,7 +87,6 @@ #endif #include <libpsl.h> -#include <bits/stat.h> /* number of elements within an array */ #define countof(a) (sizeof(a)/sizeof(*(a))) From 71a3f764140a3a37b924d2ca90c08ec99a1190f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Thu, 19 Nov 2015 10:06:30 +0100 Subject: [PATCH 05/38] Update publicsuffix/list submodule --- list | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/list b/list index 2930bb4..e801df4 160000 --- a/list +++ b/list @@ -1 +1 @@ -Subproject commit 2930bb4a5256279e0f7ba44cf9d174fc93ecb732 +Subproject commit e801df4a56ac8c7519d349ad5125433206930d6e From f3b479fd45d32ac1afb6b5e5cc015c0713751b75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Thu, 19 Nov 2015 11:14:31 +0100 Subject: [PATCH 06/38] Remove -DDATADIR from DEFS in tests/Makefile.am --- tests/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Makefile.am b/tests/Makefile.am index 2bc8f82..c71cd99 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,4 +1,4 @@ -DEFS = @DEFS@ -DDATADIR=\"$(top_srcdir)/data\" -DSRCDIR=\"$(srcdir)\" -DPSL_FILE=\"$(PSL_FILE)\" -DPSL_TESTFILE=\"$(PSL_TESTFILE)\" +DEFS = @DEFS@ -DSRCDIR=\"$(srcdir)\" 
-DPSL_FILE=\"$(PSL_FILE)\" -DPSL_TESTFILE=\"$(PSL_TESTFILE)\" AM_CPPFLAGS = -I$(top_srcdir)/include LDADD = ../src/libpsl.la From 583f97f2bf7f69ce0428ecc86e4101153ff3b998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Thu, 19 Nov 2015 11:16:16 +0100 Subject: [PATCH 07/38] Copy custom PSL file and test file into distribution tarball --- Makefile.am | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index d488ce4..3904754 100644 --- a/Makefile.am +++ b/Makefile.am @@ -14,4 +14,8 @@ ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS} pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = libpsl.pc -EXTRA_DIST = config.rpath LICENSE $(PSL_FILE) list/tests/test_psl.txt +EXTRA_DIST = config.rpath LICENSE +dist-hook: + mkdir -p $(distdir)/list/tests + cp -p $(PSL_FILE) $(distdir)/list + cp -p $(PSL_TESTFILE) $(distdir)/list/tests From b53273d4063a748dc017e9b1838745cbe02a848a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Thu, 19 Nov 2015 11:18:17 +0100 Subject: [PATCH 08/38] Use absolute PSL path to make psl_builtin_outdated() work reliable --- src/psl2c.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/psl2c.c b/src/psl2c.c index e050194..a050193 100644 --- a/src/psl2c.c +++ b/src/psl2c.c @@ -237,6 +237,7 @@ int main(int argc, const char **argv) struct stat st; size_t cmdsize = 16 + strlen(argv[1]); char *cmd = alloca(cmdsize), checksum[64] = ""; + char *abs_srcfile; const char *source_date_epoch = NULL; #if 0 @@ -256,6 +257,7 @@ int main(int argc, const char **argv) if (stat(argv[1], &st) != 0) st.st_mtime = 0; fprintf(fpout, "static time_t _psl_file_time = %lu;\n", st.st_mtime); + if ((source_date_epoch = getenv("SOURCE_DATE_EPOCH"))) fprintf(fpout, "static time_t _psl_compile_time = %lu;\n", atol(source_date_epoch)); else @@ -264,7 +266,14 @@ int main(int argc, const char **argv) fprintf(fpout, "static int _psl_nexceptions = %d;\n", psl->nexceptions); 
fprintf(fpout, "static int _psl_nwildcards = %d;\n", psl->nwildcards); fprintf(fpout, "static const char _psl_sha1_checksum[] = \"%s\";\n", checksum); - fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", argv[1]); + + /* We need an absolute path here, else psl_builtin_outdated() won't work reliable */ + /* Caveat: symbolic links are resolved by realpath() */ + if ((abs_srcfile = realpath(argv[1], NULL))) { + fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", abs_srcfile); + free(abs_srcfile); + } else + fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", argv[1]); if (fclose(fpout) != 0) ret = 4; From d14ada235c4d29b9c54721e78e14bde1a6f79906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Thu, 19 Nov 2015 12:11:27 +0100 Subject: [PATCH 09/38] Use echo without -n in configure.ac (compatibility) --- configure.ac | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index c3cbd2b..d8572f6 100644 --- a/configure.ac +++ b/configure.ac @@ -20,9 +20,9 @@ AC_C_INLINE # # Generate version defines for include file # -AC_SUBST([LIBPSL_VERSION_MAJOR], [`echo -n $VERSION|cut -d'.' -f1`]) -AC_SUBST([LIBPSL_VERSION_MINOR], [`echo -n $VERSION|cut -d'.' -f2`]) -AC_SUBST([LIBPSL_VERSION_PATCH], [`echo -n $VERSION|cut -d'.' -f3`]) +AC_SUBST([LIBPSL_VERSION_MAJOR], [`echo $VERSION|cut -d'.' -f1`]) +AC_SUBST([LIBPSL_VERSION_MINOR], [`echo $VERSION|cut -d'.' -f2`]) +AC_SUBST([LIBPSL_VERSION_PATCH], [`echo $VERSION|cut -d'.' 
-f3`]) AC_SUBST([LIBPSL_VERSION_NUMBER], [`printf '0x%02x%02x%02x' $LIBPSL_VERSION_MAJOR $LIBPSL_VERSION_MINOR $LIBPSL_VERSION_PATCH`]) AC_CONFIG_FILES([include/libpsl.h]) From 519b8c9d17cd701529d6b7b6ed0bcdaaba792c51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Thu, 3 Dec 2015 10:08:04 +0100 Subject: [PATCH 10/38] Add time measurement for test-is-public-all.c --- configure.ac | 5 ++++- tests/test-is-public-all.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index d8572f6..1a8698b 100644 --- a/configure.ac +++ b/configure.ac @@ -216,6 +216,9 @@ elif test -n "$NEEDS_NSL" ; then LIBS="$LIBS -lnsl" fi +# Check for clock_gettime() used for performance measurement +AC_SEARCH_LIBS(clock_gettime, rt) + # Check for valgrind ac_enable_valgrind=no AC_ARG_ENABLE(valgrind-tests, @@ -252,7 +255,7 @@ AC_SUBST(PSL_TESTFILE) # check for alloca / alloca.h AC_FUNC_ALLOCA -AC_CHECK_FUNCS([strndup]) +AC_CHECK_FUNCS([strndup clock_gettime]) # Override the template file name of the generated .pc file, so that there # is no need to rename the template file when the API version changes. 
diff --git a/tests/test-is-public-all.c b/tests/test-is-public-all.c index 705e79f..ea3bb0e 100644 --- a/tests/test-is-public-all.c +++ b/tests/test-is-public-all.c @@ -45,6 +45,9 @@ static int ok, failed; +#ifdef HAVE_CLOCK_GETTIME + struct timespec ts1, ts2; +#endif static inline int _isspace_ascii(const char c) { @@ -63,6 +66,10 @@ static void test_psl(void) printf("loaded %d suffixes and %d exceptions\n", psl_suffix_count(psl), psl_suffix_exception_count(psl)); if ((fp = fopen(PSL_FILE, "r"))) { +#ifdef HAVE_CLOCK_GETTIME + clock_gettime(CLOCK_REALTIME, &ts1); +#endif + while ((linep = fgets(buf, sizeof(buf), fp))) { while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */ if (!*linep) continue; /* skip empty lines */ @@ -107,6 +114,9 @@ static void test_psl(void) } } +#ifdef HAVE_CLOCK_GETTIME + clock_gettime(CLOCK_REALTIME, &ts2); +#endif fclose(fp); } else { printf("Failed to open %s\n", PSL_FILE); @@ -118,6 +128,10 @@ static void test_psl(void) int main(int argc, const char * const *argv) { +#ifdef HAVE_CLOCK_GETTIME + long ns; +#endif + /* if VALGRIND testing is enabled, we have to call ourselves with valgrind checking */ if (argc == 1) { const char *valgrind = getenv("TESTS_VALGRIND"); @@ -138,6 +152,21 @@ int main(int argc, const char * const *argv) return 1; } - printf("Summary: All %d tests passed\n", ok + failed); +#ifdef HAVE_CLOCK_GETTIME + if (ts1.tv_sec == ts2.tv_sec) + ns = ts2.tv_nsec - ts1.tv_nsec; + else if (ts1.tv_sec == ts2.tv_sec - 1) + ns = 1000000000L - (ts2.tv_nsec - ts1.tv_nsec); + else + ns = 0; /* let's assume something is wrong and skip outputting measured time */ + + if (ns) + printf("Summary: All %d tests passed in %ld.%06ld ms\n", ok, ns / 1000000, ns % 1000000000); + else + printf("Summary: All %d tests passed\n", ok); +#else + printf("Summary: All %d tests passed\n", ok); +#endif + return 0; } From aa0593460ca73a78a83583ddccdf877c6433d11c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Fri, 4 
Dec 2015 17:15:03 +0100 Subject: [PATCH 11/38] Remove .travis.yml from branch --- .travis.yml | 29 ----------------------------- src/psl2c.c | 48 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 34 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d72425b..0000000 --- a/.travis.yml +++ /dev/null @@ -1,29 +0,0 @@ -language: c -compiler: - - gcc - - clang -# Change this to your needs -script: - - ./autogen.sh - - ./configure && make -j4 && make check -j4 - - ./configure --enable-runtime=libicu --enable-builtin=libicu && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libicu --enable-builtin=libidn2 && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libicu --enable-builtin=libidn && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libicu --disable-builtin && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn2 --enable-builtin=libicu && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn2 --enable-builtin=libidn2 && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn2 --enable-builtin=libidn && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn2 --disable-builtin && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn --enable-builtin=libicu && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn --enable-builtin=libidn2 && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn --enable-builtin=libidn && make clean && make -j4 && make check -j4 - - ./configure --enable-runtime=libidn --disable-builtin && make clean && make -j4 && make check -j4 - - ./configure --disable-runtime --enable-builtin=libicu && make clean && make -j4 && make check -j4 - - ./configure --disable-runtime 
--enable-builtin=libidn2 && make clean && make -j4 && make check -j4 - - ./configure --disable-runtime --enable-builtin=libidn && make clean && make -j4 && make check -j4 - - ./configure --disable-runtime --disable-builtin && make clean && make -j4 && make check -j4 - - ./configure --enable-gtk-doc && make -j4 && make check -j4 - - make distcheck -before_install: - - sudo apt-get -qq update - - sudo apt-get -q install autoconf automake autopoint libtool gtk-doc-tools gettext libidn11 libidn11-dev libidn2-0 libidn2-0-dev libicu48 libicu-dev libunistring0 libunistring-dev diff --git a/src/psl2c.c b/src/psl2c.c index a050193..285fee0 100644 --- a/src/psl2c.c +++ b/src/psl2c.c @@ -161,6 +161,36 @@ static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char * fprintf(fpout, "};\n"); } +static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v, const char *varname) +{ + int it; + +#ifdef BUILTIN_GENERATOR_LIBICU + do { + UVersionInfo version_info; + char version[U_MAX_VERSION_STRING_LENGTH]; + + u_getVersion(version_info); + u_versionToString(version_info, version); + fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libicu/%s) */\n", version); + } while (0); +#elif defined(BUILTIN_GENERATOR_LIBIDN2) + fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn2/%s) */\n", idn2_check_version(NULL)); +#elif defined(BUILTIN_GENERATOR_LIBIDN) + fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn/%s) */\n", stringprep_check_version(NULL)); +#else + fprintf(fpout, "/* automatically generated by psl2c (without punycode support) */\n"); +#endif + + for (it = 0; it < v->cur; it++) { + _psl_entry_t *e = _vector_get(v, it); + + + fprintf(fpout, "\t{ \"%s\", NULL, %hd, %d, %d },\n", + e->label_buf, e->length, (int) e->nlabels, (int) e->flags); + } +} + #if 0 #if !defined(WITH_LIBICU) && !defined(WITH_IDN2) static int _str_needs_encoding(const char *s) @@ -213,17 
+243,22 @@ int main(int argc, const char **argv) #ifdef _GENERATE_BUILTIN_DATA psl_ctx_t *psl; #endif - int ret = 0; + int ret = 0, argpos = 1, dafsa = 0; - if (argc != 3) { - fprintf(stderr, "Usage: psl2c \n"); + if (argc == 4 && !strcmp(argv[1], "--dafsa")) { + argpos = 2; + dafsa = 1; + } + + if (argc - argpos != 2) { + fprintf(stderr, "Usage: psl2c [--dafsa] \n"); fprintf(stderr, " is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n"); fprintf(stderr, " is the the C filename to be generated from \n"); return 1; } #ifdef _GENERATE_BUILTIN_DATA - if (!(psl = psl_load_file(argv[1]))) + if (!(psl = psl_load_file(argv[argpos]))) return 2; /* look for ambigious or double entries */ @@ -245,7 +280,10 @@ int main(int argc, const char **argv) _add_punycode_if_needed(psl->suffixes); #endif - _print_psl_entries(fpout, psl->suffixes, "suffixes"); + if (dafsa) + _print_psl_entries(fpout, psl->suffixes, "suffixes"); + else + _print_psl_entries_dafsa(fpout, psl->suffixes, "suffixes_dafsa"); snprintf(cmd, cmdsize, "sha1sum %s", argv[1]); if ((pp = popen(cmd, "r"))) { From 375aef05ae2e0f1c4edef2ef54b8844dd108b2a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20R=C3=BChsen?= Date: Fri, 4 Dec 2015 17:15:55 +0100 Subject: [PATCH 12/38] Add tools/make_dafsa.py --- tools/make_dafsa.py | 469 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 469 insertions(+) create mode 100755 tools/make_dafsa.py diff --git a/tools/make_dafsa.py b/tools/make_dafsa.py new file mode 100755 index 0000000..78358ef --- /dev/null +++ b/tools/make_dafsa.py @@ -0,0 +1,469 @@ +#!/usr/bin/env python +# Copyright 2014 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +""" +A Deterministic acyclic finite state automaton (DAFSA) is a compact +representation of an unordered word list (dictionary). 
+ +http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton + +This python program converts a list of strings to a byte array in C++. +This python program fetches strings and return values from a gperf file +and generates a C++ file with a byte array representing a graph that can be +used as a memory efficient replacement for the perfect hash table. + +The input strings are assumed to consist of printable 7-bit ASCII characters +and the return values are assumed to be one digit integers. + +In this program a DAFSA is a diamond shaped graph starting at a common +source node and ending at a common sink node. All internal nodes contain +a label and each word is represented by the labels in one path from +the source node to the sink node. + +The following python representation is used for nodes: + + Source node: [ children ] + Internal node: (label, [ children ]) + Sink node: None + +The graph is first compressed by prefixes like a trie. In the next step +suffixes are compressed so that the graph gets diamond shaped. Finally +one to one linked nodes are replaced by nodes with the labels joined. + +The order of the operations is crucial since lookups will be performed +starting from the source with no backtracking. Thus a node must have at +most one child with a label starting by the same character. The output +is also arranged so that all jumps are to increasing addresses, thus forward +in memory. + +The generated output has suffix free decoding so that the sign of leading +bits in a link (a reference to a child node) indicate if it has a size of one, +two or three bytes and if it is the last outgoing link from the actual node. +A node label is terminated by a byte with the leading bit set. 
+ +The generated byte array can be described by the following BNF: + +<byte> ::= < 8-bit value in range [0x00-0xFF] > + +<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] > +<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] > +<return_value> ::= < value + 0x80, byte in range [0x80-0x8F] > + +<offset1> ::= < byte in range [0x00-0x3F] > +<offset2> ::= < byte in range [0x40-0x5F] > +<offset3> ::= < byte in range [0x60-0x7F] > + +<end_offset1> ::= < byte in range [0x80-0xBF] > +<end_offset2> ::= < byte in range [0xC0-0xDF] > +<end_offset3> ::= < byte in range [0xE0-0xFF] > + +<prefix> ::= <char> + +