Add magic header to DAFSA binary files
This commit is contained in:
parent
852931571f
commit
8dba092c73
14
README.md
14
README.md
|
@ -70,6 +70,20 @@ library API via command line.
|
||||||
|
|
||||||
prints the usage.
|
prints the usage.
|
||||||
|
|
||||||
|
Convert PSL into DAFSA
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
The [DAFSA](https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton) format is a compressed
|
||||||
|
representation of a set of strings. Here we use it to reduce the whole PSL to about 32 KB in size.
|
||||||
|
|
||||||
|
Generate `psl.dafsa` from `list/public_suffix_list.dat`:
|
||||||
|
|
||||||
|
$ src/make_dafsa.py --output-format=binary --input-format=psl list/public_suffix_list.dat psl.dafsa
|
||||||
|
|
||||||
|
Test the result (example):
|
||||||
|
|
||||||
|
$ tools/psl --load-psl-file psl.dafsa aeroclub.aero
|
||||||
|
|
||||||
License
|
License
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python2
|
||||||
# Copyright 2014 The Chromium Authors. All rights reserved.
|
# Copyright 2014 The Chromium Authors. All rights reserved.
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
# found in the LICENSE.chromium file.
|
# found in the LICENSE.chromium file.
|
||||||
|
@ -445,7 +445,7 @@ def words_to_cxx(words):
|
||||||
|
|
||||||
def words_to_binary(words):
|
def words_to_binary(words):
|
||||||
"""Generates C++ code from a word list"""
|
"""Generates C++ code from a word list"""
|
||||||
return words_to_whatever(words, bytearray)
|
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray)
|
||||||
|
|
||||||
|
|
||||||
def parse_psl2c(infile):
|
def parse_psl2c(infile):
|
||||||
|
|
23
src/psl.c
23
src/psl.c
|
@ -1110,8 +1110,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
||||||
psl_ctx_t *psl;
|
psl_ctx_t *psl;
|
||||||
_psl_entry_t suffix, *suffixp;
|
_psl_entry_t suffix, *suffixp;
|
||||||
char buf[256], *linep, *p;
|
char buf[256], *linep, *p;
|
||||||
size_t n;
|
int type = 0, is_dafsa;
|
||||||
int type = 0;
|
|
||||||
_psl_idna_t *idna;
|
_psl_idna_t *idna;
|
||||||
|
|
||||||
if (!fp)
|
if (!fp)
|
||||||
|
@ -1121,14 +1120,18 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
/* read first line to allow ASCII / DAFSA detection */
|
/* read first line to allow ASCII / DAFSA detection */
|
||||||
if ((n = fread(buf, 1, sizeof(buf) - 1, fp)) < 1)
|
if (!(linep = fgets(buf, sizeof(buf) - 1, fp)))
|
||||||
goto fail;
|
goto fail;
|
||||||
|
|
||||||
buf[n] = 0;
|
is_dafsa = strlen(buf) == 16 && !strncmp(buf, ".DAFSA@PSL_", 11);
|
||||||
|
|
||||||
if (!strstr(buf, "This Source Code Form is subject to")) {
|
if (is_dafsa) {
|
||||||
void *m;
|
void *m;
|
||||||
size_t size = 65536, len = n;
|
size_t size = 65536, n, len = 0;
|
||||||
|
int version = atoi(buf + 11);
|
||||||
|
|
||||||
|
if (version != 0)
|
||||||
|
goto fail;
|
||||||
|
|
||||||
if (!(psl->dafsa = malloc(size)))
|
if (!(psl->dafsa = malloc(size)))
|
||||||
goto fail;
|
goto fail;
|
||||||
|
@ -1148,11 +1151,11 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
||||||
if ((m = realloc(psl->dafsa, len)))
|
if ((m = realloc(psl->dafsa, len)))
|
||||||
psl->dafsa = m;
|
psl->dafsa = m;
|
||||||
|
|
||||||
|
psl->dafsa_size = len;
|
||||||
|
|
||||||
return psl;
|
return psl;
|
||||||
}
|
}
|
||||||
|
|
||||||
rewind(fp);
|
|
||||||
|
|
||||||
idna = _psl_idna_open();
|
idna = _psl_idna_open();
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1161,7 +1164,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
||||||
*/
|
*/
|
||||||
psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array);
|
psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array);
|
||||||
|
|
||||||
while ((linep = fgets(buf, sizeof(buf), fp))) {
|
do {
|
||||||
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
|
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
|
||||||
if (!*linep) continue; /* skip empty lines */
|
if (!*linep) continue; /* skip empty lines */
|
||||||
|
|
||||||
|
@ -1232,7 +1235,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
||||||
|
|
||||||
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
|
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
|
||||||
}
|
}
|
||||||
}
|
} while ((linep = fgets(buf, sizeof(buf), fp)));
|
||||||
|
|
||||||
_vector_sort(psl->suffixes);
|
_vector_sort(psl->suffixes);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue