Add magic header to DAFSA binary files
This commit is contained in:
parent
852931571f
commit
8dba092c73
14
README.md
14
README.md
|
@ -70,6 +70,20 @@ library API via command line.
|
|||
|
||||
prints the usage.
|
||||
|
||||
Convert PSL into DAFSA
|
||||
----------------------
|
||||
|
||||
The [DAFSA](https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton) format is a compressed
|
||||
representation of a set of strings. Here we use it to reduce the whole PSL to about 32 KB in size.
|
||||
|
||||
Generate `psl.dafsa` from `list/public_suffix_list.dat`:
|
||||
|
||||
$ src/make_dafsa.py --output-format=binary --input-format=psl list/public_suffix_list.dat psl.dafsa
|
||||
|
||||
Test the result (example):
|
||||
|
||||
$ tools/psl --load-psl-file psl.dafsa aeroclub.aero
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python
|
||||
#!/usr/bin/env python2
|
||||
# Copyright 2014 The Chromium Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE.chromium file.
|
||||
|
@ -445,7 +445,7 @@ def words_to_cxx(words):
|
|||
|
||||
def words_to_binary(words):
|
||||
"""Generates binary DAFSA data from a word list"""
|
||||
return words_to_whatever(words, bytearray)
|
||||
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray)
|
||||
|
||||
|
||||
def parse_psl2c(infile):
|
||||
|
|
23
src/psl.c
23
src/psl.c
|
@ -1110,8 +1110,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
|||
psl_ctx_t *psl;
|
||||
_psl_entry_t suffix, *suffixp;
|
||||
char buf[256], *linep, *p;
|
||||
size_t n;
|
||||
int type = 0;
|
||||
int type = 0, is_dafsa;
|
||||
_psl_idna_t *idna;
|
||||
|
||||
if (!fp)
|
||||
|
@ -1121,14 +1120,18 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
|||
return NULL;
|
||||
|
||||
/* read first line to allow ASCII / DAFSA detection */
|
||||
if ((n = fread(buf, 1, sizeof(buf) - 1, fp)) < 1)
|
||||
if (!(linep = fgets(buf, sizeof(buf) - 1, fp)))
|
||||
goto fail;
|
||||
|
||||
buf[n] = 0;
|
||||
is_dafsa = strlen(buf) == 16 && !strncmp(buf, ".DAFSA@PSL_", 11);
|
||||
|
||||
if (!strstr(buf, "This Source Code Form is subject to")) {
|
||||
if (is_dafsa) {
|
||||
void *m;
|
||||
size_t size = 65536, len = n;
|
||||
size_t size = 65536, n, len = 0;
|
||||
int version = atoi(buf + 11);
|
||||
|
||||
if (version != 0)
|
||||
goto fail;
|
||||
|
||||
if (!(psl->dafsa = malloc(size)))
|
||||
goto fail;
|
||||
|
@ -1148,11 +1151,11 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
|||
if ((m = realloc(psl->dafsa, len)))
|
||||
psl->dafsa = m;
|
||||
|
||||
psl->dafsa_size = len;
|
||||
|
||||
return psl;
|
||||
}
|
||||
|
||||
rewind(fp);
|
||||
|
||||
idna = _psl_idna_open();
|
||||
|
||||
/*
|
||||
|
@ -1161,7 +1164,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
|||
*/
|
||||
psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array);
|
||||
|
||||
while ((linep = fgets(buf, sizeof(buf), fp))) {
|
||||
do {
|
||||
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
|
||||
if (!*linep) continue; /* skip empty lines */
|
||||
|
||||
|
@ -1232,7 +1235,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
|||
|
||||
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
|
||||
}
|
||||
}
|
||||
} while ((linep = fgets(buf, sizeof(buf), fp)));
|
||||
|
||||
_vector_sort(psl->suffixes);
|
||||
|
||||
|
|
Loading…
Reference in New Issue