Replace psl2c by psl-make-dafsa

Removed --input-format from psl-make-dafsa.
Added --output-format=cxx+ to psl-make-dafsa.
Removed psl2c.
This commit is contained in:
Tim Rühsen 2016-12-06 15:14:35 +01:00
parent 6490b8214b
commit deabd4a546
8 changed files with 56 additions and 397 deletions

View File

@ -33,7 +33,6 @@ check-coverage:
lcov --capture --initial --directory src --output-file libpsl.info
$(MAKE) check
lcov --capture --directory src --output-file libpsl.info
lcov --remove libpsl.info 'src/psl2c.c' -o libpsl.info
genhtml --prefix . libpsl.info --legend --title "libpsl" --output-directory=lcov
check-coverage-libidn:

View File

@ -30,7 +30,7 @@ Libpsl...
- finds the shortest private part of a given domain
- works with international domains (UTF-8 and IDNA2008 Punycode)
- is thread-safe
- handles IDNA2008 UTS#46 (libicu is used by psl2c if installed)
- handles IDNA2008 UTS#46 (if libicu is available)
Find more information about the Public Suffix List [here](https://publicsuffix.org/).
@ -86,7 +86,7 @@ representation of strings. Here we use it to reduce the whole PSL to about 32k i
Generate `psl.dafsa` from `list/public_suffix_list.dat`
$ src/psl-make-dafsa --output-format=binary --input-format=psl list/public_suffix_list.dat psl.dafsa
$ src/psl-make-dafsa --output-format=binary list/public_suffix_list.dat psl.dafsa
Test the result (example)

View File

@ -20,23 +20,10 @@ if WITH_LIBIDN
libpsl_la_LDFLAGS += -lidn -lunistring
endif
noinst_PROGRAMS = psl2c
psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/psl-make-dafsa\"
if BUILTIN_GENERATOR_LIBICU
psl2c_LDADD = $(LIBICU_LIBS)
endif
if BUILTIN_GENERATOR_LIBIDN2
psl2c_LDADD = @LTLIBICONV@ -lidn2 -lunistring
endif
if BUILTIN_GENERATOR_LIBIDN
psl2c_LDADD = @LTLIBICONV@ -lidn -lunistring
endif
# Build rule for suffixes_dafsa.c
# PSL_FILE can be set by ./configure --with-psl-file=[PATH]
suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
suffixes_dafsa.c: $(PSL_FILE) $(srcdir)/psl-make-dafsa
$(srcdir)/psl-make-dafsa --output-format=cxx+ "$(PSL_FILE)" suffixes_dafsa.c
EXTRA_DIST = psl-make-dafsa LICENSE.chromium

View File

@ -221,6 +221,9 @@ The bytes in the generated array has the following meaning:
"""
import sys
import os.path
import time
import hashlib
class InputError(Exception):
"""Exception raised for errors in the input file."""
@ -498,6 +501,26 @@ def to_cxx(data, codecs):
text += b'};\n'
return text
def sha1_file(name):
  """Returns the hexadecimal SHA-1 digest of the file called `name`.

  The file is opened in binary mode and consumed in 64 KiB pieces, so the
  whole content never has to be held in memory at once.
  """
  digest = hashlib.sha1()
  with open(name, 'rb') as stream:
    # read() returns an empty bytes object at end of file, which ends the loop
    for chunk in iter(lambda: stream.read(65536), b''):
      digest.update(chunk)
  return digest.hexdigest()
def to_cxx_plus(data, codecs):
  """Generates C++ code from a word list plus some variable assignments as needed by libpsl

  The output of to_cxx() is extended by static C definitions describing the
  PSL input file: modification time, number of suffixes / exceptions /
  wildcards, SHA-1 checksum and file name.

  Reads the module globals psl_input_file, psl_nsuffixes, psl_nexceptions and
  psl_nwildcards, which have to be set before this function is called.

  Args:
    data: DAFSA data, handed to to_cxx() unchanged.
    codecs: keyword arguments used when converting text to bytes.

  Returns:
    The generated C/C++ source as bytes.
  """
  # Store an absolute path (symbolic links resolved), as psl2c did with
  # realpath(): a relative name would later be looked up relative to the
  # current working directory of the process using the library.
  abs_input_file = os.path.realpath(psl_input_file)

  # The name ends up inside a C string literal, so backslashes (e.g. Windows
  # paths) and double quotes have to be escaped.
  c_filename = abs_input_file.replace('\\', '\\\\').replace('"', '\\"')

  text = to_cxx(data, codecs)
  # st_mtime is a float, the C code expects an integral time_t value
  text += b'static time_t _psl_file_time = %d;\n' % int(os.stat(psl_input_file).st_mtime)
  text += b'static int _psl_nsuffixes = %d;\n' % psl_nsuffixes
  text += b'static int _psl_nexceptions = %d;\n' % psl_nexceptions
  text += b'static int _psl_nwildcards = %d;\n' % psl_nwildcards
  text += b'static const char _psl_sha1_checksum[] = "%s";\n' % bytes(sha1_file(psl_input_file), **codecs)
  text += b'static const char _psl_filename[] = "%s";\n' % bytes(c_filename, **codecs)
  return text
def words_to_whatever(words, converter, utf_mode, codecs):
"""Generates C++ code from a word list"""
@ -511,31 +534,15 @@ def words_to_cxx(words, utf_mode, codecs):
"""Generates C++ code from a word list"""
return words_to_whatever(words, to_cxx, utf_mode, codecs)
def words_to_cxx_plus(words, utf_mode, codecs):
  """Turns a word list into C++ code followed by the extra variable assignments libpsl needs"""
  # Same pipeline as the other words_to_*() helpers, only the final converter differs.
  return words_to_whatever(words, converter=to_cxx_plus, utf_mode=utf_mode, codecs=codecs)
def words_to_binary(words, utf_mode, codecs):
  """Generates binary DAFSA data from a word list

  The result is a fixed signature line followed by the raw DAFSA bytes.
  """
  def as_bytearray(dafsa, _codecs):
    # the binary form needs no text encoding, so the codecs argument is unused
    return bytearray(dafsa)

  signature = b'.DAFSA@PSL_0 \n'
  return signature + words_to_whatever(words, as_bytearray, utf_mode, codecs)
def parse_psl2c(infile, utf_mode, codecs):
  """Reads 'domainname, <digit>' lines as written by psl2c.

  Every line is stripped and converted to bytes, then validated. The result is
  the sorted list of entries with the ', ' separator removed, i.e. each item is
  the domain name directly followed by its one-character return value.

  Raises:
    InputError: if a line does not end in ', ' plus one character out of
      [0-9A-F].
  """
  allowed_values = b'0123456789ABCDEF'

  entries = []
  for raw in infile:
    entries.append(bytes(raw.strip(), **codecs))

  for entry in entries:
    if entry[-3:-1] != b', ':
      raise InputError('Expected "domainname, <digit>", found "%s"' % entry)
    # Technically the DAFSA format could support return values in range [0x00-0x1E],
    # but the values below are the only with a defined meaning.
    value = entry[-1]
    if value not in allowed_values:
      raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % entry[-1:])

  words = []
  for entry in sorted(entries):
    words.append(entry[:-3] + entry[-1:])
  return words
def parse_psl(infile, utf_mode, codecs):
"""Parses PSL file and extract strings and return code"""
PSL_FLAG_EXCEPTION = (1<<0)
@ -544,6 +551,8 @@ def parse_psl(infile, utf_mode, codecs):
PSL_FLAG_PRIVATE = (1<<3) # entry of PRIVATE section
PSL_FLAG_PLAIN = (1<<4) #just used for PSL syntax checking
global psl_nsuffixes, psl_nexceptions, psl_nwildcards
psl = {}
section = 0
@ -565,17 +574,21 @@ def parse_psl(infile, utf_mode, codecs):
continue # skip comments
if line[:1] == b'!':
psl_nexceptions += 1
flags = PSL_FLAG_EXCEPTION | section
line = line[1:]
elif line[:1] == b'*':
if line[1:2] != b'.':
print('Unsupported kind of rule (ignored): %s' % line)
continue
psl_nwildcards += 1
psl_nsuffixes += 1
flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
line = line[2:]
else:
if not b'.' in line:
continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
psl_nsuffixes += 1
flags = PSL_FLAG_PLAIN | section
punycode = line.decode('utf-8').encode('idna')
@ -607,9 +620,8 @@ def parse_psl(infile, utf_mode, codecs):
def usage():
"""Prints the usage"""
print('usage: %s [options] infile outfile' % sys.argv[0])
print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
print(' --input-format=psl infile is a Public Suffix List file')
print(' --output-format=cxx Write DAFSA as C/C++ code (default)')
print(' --output-format=cxx+ Write DAFSA as C/C++ code plus statistical assignments')
print(' --output-format=binary Write DAFSA binary data')
print(' --encoding=ascii 7-bit ASCII mode')
print(' --encoding=utf-8 UTF-8 mode (default)')
@ -622,7 +634,7 @@ def main():
usage()
converter = words_to_cxx
parser = parse_psl2c
parser = parse_psl
utf_mode = True
codecs = dict()
@ -630,12 +642,11 @@ def main():
codecs['encoding'] = 'utf-8'
for arg in sys.argv[1:-2]:
# Check --input-format for backward compatibility
if arg.startswith('--input-format='):
value = arg[15:].lower()
if value == 'psl':
parser = parse_psl
elif value == 'psl2c':
parser = parse_psl2c
else:
print("Unknown input format '%s'" % value)
return 1
@ -645,6 +656,8 @@ def main():
converter = words_to_binary
elif value == 'cxx':
converter = words_to_cxx
elif value == 'cxx+':
converter = words_to_cxx_plus
else:
print("Unknown output format '%s'" % value)
return 1
@ -664,6 +677,14 @@ def main():
with open(sys.argv[-1], 'wb') as outfile:
outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
else:
"""Some statistical data for --output-format=cxx+"""
global psl_input_file, psl_nsuffixes, psl_nexceptions, psl_nwildcards
psl_input_file = sys.argv[-2]
psl_nsuffixes = 0
psl_nexceptions = 0
psl_nwildcards = 0
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'wb') as outfile:
outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))

View File

@ -19,14 +19,11 @@ The format of the data read and written by \fBpsl-make-dafsa\fR
depends on options passed to it.
.br
.TP
\fB\-\-input\-format=\fR[\fIpsl2c\fR|\fIpsl\fR]
\fBpsl2c\fR: (default) input is C code generated by libpsl/psl2c
.br
\fBpsl\fR: input is standard textual Public Suffix List file
.TP
\fB\-\-output\-format=\fR[\fIcxx\fR|\fIbinary\fR]
\fB\-\-output\-format=\fR[\fIcxx\fR|\fIcxx+\fR|\fIbinary\fR]
\fBcxx\fR: (default) output is C/C++ code
.br
\fBcxx+\fR: output is C/C++ code plus statistical assignments (used by libpsl build process)
.br
\fBbinary\fR: output is an architecture-independent binary format
.TP
\fB\-\-encoding=\fR[\fIutf-8\fR|\fIascii\fR]

View File

@ -44,20 +44,6 @@
# define _UNUSED
#endif
/* if this file is included by psl2c, redefine to use requested library for builtin data */
#ifdef _LIBPSL_INCLUDED_BY_PSL2C
# undef WITH_LIBICU
# undef WITH_LIBIDN2
# undef WITH_LIBIDN
# ifdef BUILTIN_GENERATOR_LIBICU
# define WITH_LIBICU
# elif defined(BUILTIN_GENERATOR_LIBIDN2)
# define WITH_LIBIDN2
# elif defined(BUILTIN_GENERATOR_LIBIDN)
# define WITH_LIBIDN
# endif
#endif
#if ENABLE_NLS != 0
# include <libintl.h>
# define _(STRING) gettext(STRING)
@ -183,19 +169,8 @@ struct _psl_ctx_st {
utf8 : 1; /* 1: data contains UTF-8 + punycode encoded rules */
};
/* include the PSL data compiled by 'psl2c' */
#ifndef _LIBPSL_INCLUDED_BY_PSL2C
# include "suffixes_dafsa.c"
#else
/* if this source file is included by psl2c.c, provide empty builtin data */
static const unsigned char kDafsa[1];
static time_t _psl_file_time;
static int _psl_nsuffixes;
static int _psl_nexceptions;
static int _psl_nwildcards;
static const char _psl_sha1_checksum[] = "";
static const char _psl_filename[] = "";
#endif
/* include the PSL data generated by psl-make-dafsa */
#include "suffixes_dafsa.c"
/* references to these PSLs will result in lookups to built-in data */
static const psl_ctx_t

View File

@ -1,320 +0,0 @@
/*
* Copyright(c) 2014-2016 Tim Ruehsen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
* This file is part of libpsl.
*
* Precompile Public Suffix List into a C source file
*
* Changelog
* 22.03.2014 Tim Ruehsen created
*
*/
#if HAVE_CONFIG_H
# include <config.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/stat.h>
#ifdef HAVE_ALLOCA_H
# include <alloca.h>
#endif
#if defined(BUILTIN_GENERATOR_LIBICU) || defined(BUILTIN_GENERATOR_LIBIDN2) || defined(BUILTIN_GENERATOR_LIBIDN)
# define _GENERATE_BUILTIN_DATA
#endif
#include <libpsl.h>
/* here we include the library source code to have access to internal functions and data structures */
#define _LIBPSL_INCLUDED_BY_PSL2C
# include "psl.c"
#undef _LIBPSL_INCLUDED_BY_PSL2C
#ifdef _GENERATE_BUILTIN_DATA
#if 0
/*
 * Consistency check of a loaded PSL (currently compiled out via '#if 0').
 *
 * Three kinds of problems are looked for, each one is reported on stderr:
 *  - a non-wildcard suffix that is also listed as an exception
 *  - an exception that is also listed as a non-wildcard suffix
 *  - a non-wildcard suffix whose parent domain is listed as a wildcard
 *
 * Returns 0 if nothing was found, 1 if at least one problem was reported.
 */
static int _check_psl(const psl_ctx_t *psl)
{
int it, pos, err = 0;
/* check if plain suffix also appears in exceptions */
for (it = 0; it < psl->suffixes->cur; it++) {
_psl_entry_t *e = _vector_get(psl->suffixes, it);
if (!e->wildcard && _vector_find(psl->suffix_exceptions, e) >= 0) {
fprintf(stderr, "Found entry '%s' also in exceptions\n", e->label);
err = 1;
}
}
/* check if exception also appears in suffix list as plain entry */
for (it = 0; it < psl->suffix_exceptions->cur; it++) {
_psl_entry_t *e2, *e = _vector_get(psl->suffix_exceptions, it);
/* 'pos' keeps the index of the match so that its neighbours can be inspected below */
if ((e2 = _vector_get(psl->suffixes, pos = _vector_find(psl->suffixes, e)))) {
if (!e2->wildcard) {
fprintf(stderr, "Found exception '!%s' also as suffix\n", e->label);
err = 1;
}
/* Two same domains in a row are allowed: wildcard and non-wildcard.
* Binary search find either of them, so also check previous and next entry. */
else if (pos > 0 && _suffix_compare(e, e2 = _vector_get(psl->suffixes, pos - 1)) == 0 && !e2->wildcard) {
fprintf(stderr, "Found exception '!%s' also as suffix\n", e->label);
err = 1;
}
else if (pos < psl->suffixes->cur - 1 && _suffix_compare(e, e2 = _vector_get(psl->suffixes, pos + 1)) == 0 && !e2->wildcard) {
fprintf(stderr, "Found exception '!%s' also as suffix\n", e->label);
err = 1;
}
}
}
/* check if non-wildcard entry is already covered by wildcard entry */
for (it = 0; it < psl->suffixes->cur; it++) {
const char *p;
_psl_entry_t *e = _vector_get(psl->suffixes, it);
if (e->nlabels > 1 && !e->wildcard && (p = strchr(e->label, '.'))) {
_psl_entry_t *e2, *e3, suffix;
/* build a temporary search key for the parent domain (everything after the first dot) */
suffix.label = p + 1;
suffix.length = strlen(p + 1);
suffix.nlabels = e->nlabels - 1;
e2 = _vector_get(psl->suffixes, pos = _vector_find(psl->suffixes, &suffix));
if (e2) {
if (e2->wildcard) {
fprintf(stderr, "Found superfluous '%s' already covered by '*.%s'\n", e->label, e2->label);
err = 1;
}
/* Two same domains in a row are allowed: wildcard and non-wildcard.
* Binary search find either of them, so also check previous and next entry. */
else if (pos > 0 && _suffix_compare(e2, e3 = _vector_get(psl->suffixes, pos - 1)) == 0 && e3->wildcard) {
fprintf(stderr, "Found superfluous '%s' already covered by '*.%s'\n", e->label, e2->label);
err = 1;
}
else if (pos < psl->suffixes->cur - 1 && _suffix_compare(e2, e3 = _vector_get(psl->suffixes, pos + 1)) == 0 && e3->wildcard) {
fprintf(stderr, "Found superfluous '%s' already covered by '*.%s'\n", e->label, e2->label);
err = 1;
}
}
}
}
return err;
}
#endif
/*
 * Write the DAFSA C code for the entries of vector 'v' to 'fpout'.
 *
 * A header comment naming the library used for punycode generation is
 * written first. Then all entries are dumped as "<label>, <flags>" lines into
 * the temporary file 'in.tmp', MAKE_DAFSA is run to convert it into
 * 'out.tmp', and the content of 'out.tmp' is appended to 'fpout'.
 * Both temporary files are removed afterwards.
 *
 * Errors are reported on stderr. If 'in.tmp' cannot be written or MAKE_DAFSA
 * fails, no generator output is copied to 'fpout'.
 */
static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
{
	FILE *fp;
	int it;

#ifdef BUILTIN_GENERATOR_LIBICU
	do {
		UVersionInfo version_info;
		char version[U_MAX_VERSION_STRING_LENGTH];

		u_getVersion(version_info);
		u_versionToString(version_info, version);
		fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libicu/%s) */\n", version);
	} while (0);
#elif defined(BUILTIN_GENERATOR_LIBIDN2)
	fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn2/%s) */\n", idn2_check_version(NULL));
#elif defined(BUILTIN_GENERATOR_LIBIDN)
	fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libidn/%s) */\n", stringprep_check_version(NULL));
#else
	fprintf(fpout, "/* automatically generated by psl2c (punycode generated internally) */\n");
#endif

	/* without the input file there is nothing the generator could work on */
	if (!(fp = fopen("in.tmp", "w"))) {
		fprintf(stderr, "Failed to write open 'in.tmp'\n");
		return;
	}

	for (it = 0; it < v->cur; it++) {
		_psl_entry_t *e = _vector_get(v, it);

		/* only the lower 4 bits of the flags are written, as one hex digit */
		fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
	}
	fclose(fp);

	if (system(MAKE_DAFSA " in.tmp out.tmp")) {
		/* do not copy a possibly stale or incomplete 'out.tmp' */
		fprintf(stderr, "Failed to execute " MAKE_DAFSA "\n");
	} else if ((fp = fopen("out.tmp", "r"))) {
		char buf[256];

		while (fgets(buf, sizeof(buf), fp))
			fputs(buf, fpout);

		fclose(fp);
	}

	unlink("in.tmp");
	unlink("out.tmp");
}
#endif /* _GENERATE_BUILTIN_DATA */
/*
 * Write the entries of vector 'v' as binary DAFSA data into file 'fname'.
 *
 * All entries are dumped as "<label>, <flags>" lines into the temporary file
 * 'in.tmp', which is then converted by MAKE_DAFSA --output-format=binary.
 * 'in.tmp' is removed afterwards.
 *
 * Returns 0 on success,
 *         2 if the command could not be built or executed,
 *         3 if 'in.tmp' could not be opened for writing.
 */
static int _print_psl_entries_dafsa_binary(const char *fname, const _psl_vector_t *v)
{
	FILE *fp;
	int ret = 0, it, rc, len;
	char cmd[256];

	if (!(fp = fopen("in.tmp", "w"))) {
		fprintf(stderr, "Failed to write open 'in.tmp'\n");
		return 3;
	}

	for (it = 0; it < v->cur; it++) {
		_psl_entry_t *e = _vector_get(v, it);

		/* only the lower 4 bits of the flags are written, as one hex digit */
		fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
	}
	fclose(fp);

	/* quote the output name so that file names containing spaces survive the shell */
	len = snprintf(cmd, sizeof(cmd), MAKE_DAFSA " --output-format=binary in.tmp \"%s\"", fname);

	if (len < 0 || (size_t) len >= sizeof(cmd)) {
		/* a truncated command line would silently write to a wrong file name */
		fprintf(stderr, "Failed to build command line, filename '%s' is too long\n", fname);
		ret = 2;
	} else if ((rc = system(cmd))) {
		fprintf(stderr, "Failed to execute '%s' (%d)\n", cmd, rc);
		ret = 2;
	}

	unlink("in.tmp");

	return ret;
}
/* Print the command line synopsis to stderr and terminate with exit status 1. */
static void usage(void)
{
	fprintf(stderr, "Usage: psl2c [--binary] <infile> <outfile>\n");
	fprintf(stderr, "  <infile>  is the 'public_suffix_list.dat', lowercase UTF-8 encoded\n");
	fprintf(stderr, "  <outfile> is the filename to be generated from <infile>\n");
	fprintf(stderr, "  --binary  Generate binary DAFSA output (default: C code for psl.c)\n");
	exit(1);
}
/*
 * psl2c [--binary] <infile> <outfile>
 *
 * With --binary, <infile> is loaded and written to <outfile> as binary DAFSA.
 * Otherwise <outfile> becomes a C source file: with _GENERATE_BUILTIN_DATA it
 * contains the DAFSA array plus file time, counters, SHA-1 checksum and file
 * name of <infile>; without it, only empty placeholder definitions.
 *
 * Exit status: 0 success, 1 usage error (via usage()), 2 <infile> could not
 * be loaded (or, with --binary, the conversion failed), 3 <outfile> could not
 * be opened (or, with --binary, the temporary file), 4 closing <outfile>
 * failed, 5 <infile> contains no suffixes.
 */
int main(int argc, const char **argv)
{
	FILE *fpout;
	psl_ctx_t *psl;
	int ret = 0, argpos = 1, binary = 0;

	if (argc < 3)
		usage();

	if (strcmp(argv[argpos], "--binary") == 0) {
		argpos++;
		binary = 1;
	}

	/* exactly <infile> and <outfile> have to be left */
	if (argc - argpos != 2)
		usage();

	if (binary) {
		if (!(psl = psl_load_file(argv[argpos])))
			return 2;

		ret = _print_psl_entries_dafsa_binary(argv[argpos + 1], psl->suffixes);

		psl_free(psl);
		return ret;
	}

#ifdef _GENERATE_BUILTIN_DATA
	if (!(psl = psl_load_file(argv[argpos])))
		return 2;

	if (!psl->suffixes || !psl->nsuffixes) {
		fprintf(stderr, "Failed to load PSL. Please check content of '%s'.\n", argv[argpos]);
		psl_free(psl); /* do not leak the context on this error path */
		return 5;
	}

	/* The check for ambiguous or double entries (_check_psl()) is currently disabled. */

	if ((fpout = fopen(argv[argpos + 1], "w"))) {
		FILE *pp;
		struct stat st;
		size_t cmdsize = 16 + strlen(argv[argpos]); /* room for 'sha1sum "<infile>"' plus NUL */
		char *cmd = alloca(cmdsize), checksum[64] = "";
		char *abs_srcfile;

		_print_psl_entries_dafsa(fpout, psl->suffixes);

		/* quote the file name so that names containing spaces survive the shell */
		snprintf(cmd, cmdsize, "sha1sum \"%s\"", argv[argpos]);
		if ((pp = popen(cmd, "r"))) {
			/* the checksum is the leading alphanumeric run of the output */
			if (fscanf(pp, "%63[0-9a-zA-Z]", checksum) < 1)
				*checksum = 0;
			pclose(pp);
		}

		if (stat(argv[argpos], &st) != 0)
			st.st_mtime = 0;

		/* time_t is not necessarily 'unsigned long', so cast to match the format */
		fprintf(fpout, "static time_t _psl_file_time = %lu;\n", (unsigned long) st.st_mtime);
		fprintf(fpout, "static int _psl_nsuffixes = %d;\n", psl->nsuffixes);
		fprintf(fpout, "static int _psl_nexceptions = %d;\n", psl->nexceptions);
		fprintf(fpout, "static int _psl_nwildcards = %d;\n", psl->nwildcards);
		fprintf(fpout, "static const char _psl_sha1_checksum[] = \"%s\";\n", checksum);

		/* We need an absolute path here, else psl_builtin_outdated() won't work reliable */
		/* Caveat: symbolic links are resolved by realpath() */
		if ((abs_srcfile = realpath(argv[argpos], NULL))) {
			fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", abs_srcfile);
			free(abs_srcfile);
		} else
			fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", argv[argpos]);

		if (fclose(fpout) != 0)
			ret = 4;
	} else {
		fprintf(stderr, "Failed to write open '%s'\n", argv[argpos + 1]);
		ret = 3;
	}

	psl_free(psl);
#else
	/* no generator library available: provide empty builtin data */
	if ((fpout = fopen(argv[argpos + 1], "w"))) {
		fprintf(fpout, "static const unsigned char kDafsa[1];\n");
		fprintf(fpout, "static time_t _psl_file_time;\n");
		fprintf(fpout, "static int _psl_nsuffixes = 0;\n");
		fprintf(fpout, "static int _psl_nexceptions = 0;\n");
		fprintf(fpout, "static int _psl_nwildcards = 0;\n");
		fprintf(fpout, "static const char _psl_sha1_checksum[] = \"\";\n");
		fprintf(fpout, "static const char _psl_filename[] = \"\";\n");

		if (fclose(fpout) != 0)
			ret = 4;
	} else {
		fprintf(stderr, "Failed to write open '%s'\n", argv[argpos + 1]);
		ret = 3;
	}
#endif /* GENERATE_BUILTIN_DATA */

	return ret;
}

View File

@ -28,9 +28,9 @@ TESTS = $(PSL_TESTS)
# check-local target works in parallel to the tests, so the test suite will likely fail
BUILT_SOURCES = psl.dafsa psl_ascii.dafsa
psl.dafsa:
$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary "$(PSL_FILE)" psl.dafsa
$(top_srcdir)/src/psl-make-dafsa --output-format=binary "$(PSL_FILE)" psl.dafsa
psl_ascii.dafsa:
$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary --encoding=ascii "$(PSL_FILE)" psl_ascii.dafsa
$(top_srcdir)/src/psl-make-dafsa --output-format=binary --encoding=ascii "$(PSL_FILE)" psl_ascii.dafsa
clean-local:
rm -f psl.dafsa psl_ascii.dafsa