diff --git a/README.md b/README.md index 45a82c8..a64d6f9 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,20 @@ library API via command line. prints the usage. +Convert PSL into DAFSA +---------------------- + +The [DAFSA](https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton) format is a compressed +representation of strings. Here we use it to reduce the whole PSL to about 32k in size. + +Generate `psl.dafsa` from `list/public_suffix_list.dat` + + $ src/make_dafsa.py --output-format=binary --input-format=psl list/public_suffix_list.dat psl.dafsa + +Test the result (example) + + $ tools/psl --load-psl-file psl.dafsa aeroclub.aero + License ------- diff --git a/src/make_dafsa.py b/src/make_dafsa.py index 55dc181..e22aae6 100755 --- a/src/make_dafsa.py +++ b/src/make_dafsa.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 # Copyright 2014 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE.chromium file. @@ -445,7 +445,7 @@ def words_to_cxx(words): def words_to_binary(words): """Generates C++ code from a word list""" - return words_to_whatever(words, bytearray) + return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray) def parse_psl2c(infile): diff --git a/src/psl.c b/src/psl.c index d704fff..c3a4ffe 100644 --- a/src/psl.c +++ b/src/psl.c @@ -1110,8 +1110,7 @@ psl_ctx_t *psl_load_fp(FILE *fp) psl_ctx_t *psl; _psl_entry_t suffix, *suffixp; char buf[256], *linep, *p; - size_t n; - int type = 0; + int type = 0, is_dafsa; _psl_idna_t *idna; if (!fp) @@ -1121,14 +1120,18 @@ psl_ctx_t *psl_load_fp(FILE *fp) return NULL; /* read first line to allow ASCII / DAFSA detection */ - if ((n = fread(buf, 1, sizeof(buf) - 1, fp)) < 1) + if (!(linep = fgets(buf, sizeof(buf) - 1, fp))) goto fail; - buf[n] = 0; + is_dafsa = strlen(buf) == 16 && !strncmp(buf, ".DAFSA@PSL_", 11); - if (!strstr(buf, "This Source Code Form is subject to")) { + if (is_dafsa) { void *m; - size_t size = 65536, len = n; + size_t size = 65536, n, len = 0; + int version = atoi(buf + 11); + + if (version != 0) + goto fail; if (!(psl->dafsa = malloc(size))) goto fail; @@ -1148,11 +1151,11 @@ psl_ctx_t *psl_load_fp(FILE *fp) if ((m = realloc(psl->dafsa, len))) psl->dafsa = m; + psl->dafsa_size = len; + return psl; } - rewind(fp); - idna = _psl_idna_open(); /* @@ -1161,7 +1164,7 @@ psl_ctx_t *psl_load_fp(FILE *fp) */ psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array); - while ((linep = fgets(buf, sizeof(buf), fp))) { + do { while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */ if (!*linep) continue; /* skip empty lines */ @@ -1232,7 +1235,7 @@ psl_ctx_t *psl_load_fp(FILE *fp) _add_punycode_if_needed(idna, psl->suffixes, suffixp); } - } + } while ((linep = fgets(buf, sizeof(buf), fp))); _vector_sort(psl->suffixes);