Add magic header to DAFSA binary files

This commit is contained in:
Tim Rühsen 2016-07-13 11:14:18 +02:00
parent 852931571f
commit 8dba092c73
3 changed files with 29 additions and 12 deletions

View File

@ -70,6 +70,20 @@ library API via command line.
prints the usage. prints the usage.
Convert PSL into DAFSA
----------------------
The [DAFSA](https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton) format is a compressed
representation of strings. Here we use it to reduce the whole PSL to about 32 KB in size.
Generate `psl.dafsa` from `list/public_suffix_list.dat`
$ src/make_dafsa.py --output-format=binary --input-format=psl list/public_suffix_list.dat psl.dafsa
Test the result (example)
$ tools/psl --load-psl-file psl.dafsa aeroclub.aero
License License
------- -------

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python2
# Copyright 2014 The Chromium Authors. All rights reserved. # Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE.chromium file. # found in the LICENSE.chromium file.
@ -445,7 +445,7 @@ def words_to_cxx(words):
def words_to_binary(words): def words_to_binary(words):
"""Generates C++ code from a word list""" """Generates C++ code from a word list"""
return words_to_whatever(words, bytearray) return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray)
def parse_psl2c(infile): def parse_psl2c(infile):

View File

@ -1110,8 +1110,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
psl_ctx_t *psl; psl_ctx_t *psl;
_psl_entry_t suffix, *suffixp; _psl_entry_t suffix, *suffixp;
char buf[256], *linep, *p; char buf[256], *linep, *p;
size_t n; int type = 0, is_dafsa;
int type = 0;
_psl_idna_t *idna; _psl_idna_t *idna;
if (!fp) if (!fp)
@ -1121,14 +1120,18 @@ psl_ctx_t *psl_load_fp(FILE *fp)
return NULL; return NULL;
/* read first line to allow ASCII / DAFSA detection */ /* read first line to allow ASCII / DAFSA detection */
if ((n = fread(buf, 1, sizeof(buf) - 1, fp)) < 1) if (!(linep = fgets(buf, sizeof(buf) - 1, fp)))
goto fail; goto fail;
buf[n] = 0; is_dafsa = strlen(buf) == 16 && !strncmp(buf, ".DAFSA@PSL_", 11);
if (!strstr(buf, "This Source Code Form is subject to")) { if (is_dafsa) {
void *m; void *m;
size_t size = 65536, len = n; size_t size = 65536, n, len = 0;
int version = atoi(buf + 11);
if (version != 0)
goto fail;
if (!(psl->dafsa = malloc(size))) if (!(psl->dafsa = malloc(size)))
goto fail; goto fail;
@ -1148,11 +1151,11 @@ psl_ctx_t *psl_load_fp(FILE *fp)
if ((m = realloc(psl->dafsa, len))) if ((m = realloc(psl->dafsa, len)))
psl->dafsa = m; psl->dafsa = m;
psl->dafsa_size = len;
return psl; return psl;
} }
rewind(fp);
idna = _psl_idna_open(); idna = _psl_idna_open();
/* /*
@ -1161,7 +1164,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
*/ */
psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array); psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array);
while ((linep = fgets(buf, sizeof(buf), fp))) { do {
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */ while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
if (!*linep) continue; /* skip empty lines */ if (!*linep) continue; /* skip empty lines */
@ -1232,7 +1235,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
_add_punycode_if_needed(idna, psl->suffixes, suffixp); _add_punycode_if_needed(idna, psl->suffixes, suffixp);
} }
} } while ((linep = fgets(buf, sizeof(buf), fp)));
_vector_sort(psl->suffixes); _vector_sort(psl->suffixes);