Code cleanup, faster lookups

This commit is contained in:
Tim Rühsen 2015-09-19 10:50:00 +02:00
parent fb942952ec
commit e443d21b61
3 changed files with 86 additions and 133 deletions

View File

@ -14,4 +14,4 @@ ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS}
pkgconfigdir = $(libdir)/pkgconfig pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = libpsl.pc pkgconfig_DATA = libpsl.pc
EXTRA_DIST = config.rpath LICENSE list/public_suffix_list.dat list/tests/test_psl.txt EXTRA_DIST = config.rpath LICENSE $(PSL_FILE) list/tests/test_psl.txt

171
src/psl.c
View File

@ -123,6 +123,10 @@ static char *strndup(const char *s, size_t n)
#define countof(a) (sizeof(a)/sizeof(*(a))) #define countof(a) (sizeof(a)/sizeof(*(a)))
#define _PSL_FLAG_PLAIN (1<<0)
#define _PSL_FLAG_EXCEPTION (1<<1)
#define _PSL_FLAG_WILDCARD (1<<2)
typedef struct { typedef struct {
char char
label_buf[48]; label_buf[48];
@ -132,7 +136,7 @@ typedef struct {
length; length;
unsigned char unsigned char
nlabels, /* number of labels */ nlabels, /* number of labels */
wildcard; /* this is a wildcard rule (e.g. *.sapporo.jp) */ flags;
} _psl_entry_t; } _psl_entry_t;
/* stripped down version of libmget vector routines */ /* stripped down version of libmget vector routines */
@ -148,8 +152,10 @@ typedef struct {
struct _psl_ctx_st { struct _psl_ctx_st {
_psl_vector_t _psl_vector_t
*suffixes, *suffixes;
*suffix_exceptions; int
nsuffixes,
nexceptions;
}; };
/* include the PSL data compiled by 'psl2c' */ /* include the PSL data compiled by 'psl2c' */
@ -158,9 +164,10 @@ struct _psl_ctx_st {
#else #else
/* if this source file is included by psl2c.c, provide empty builtin data */ /* if this source file is included by psl2c.c, provide empty builtin data */
static _psl_entry_t suffixes[1]; static _psl_entry_t suffixes[1];
static _psl_entry_t suffix_exceptions[1];
static time_t _psl_file_time; static time_t _psl_file_time;
static time_t _psl_compile_time; static time_t _psl_compile_time;
static int _psl_nsuffixes;
static int _psl_nexceptions;
static const char _psl_sha1_checksum[] = ""; static const char _psl_sha1_checksum[] = "";
static const char _psl_filename[] = ""; static const char _psl_filename[] = "";
#endif #endif
@ -251,11 +258,6 @@ static void _vector_sort(_psl_vector_t *v)
qsort(v->entry, v->cur, sizeof(_psl_vector_t **), (int(*)(const void *, const void *))v->cmp); qsort(v->entry, v->cur, sizeof(_psl_vector_t **), (int(*)(const void *, const void *))v->cmp);
} }
static int _vector_size(_psl_vector_t *v)
{
return v ? v->cur : 0;
}
/* by this kind of sorting, we can easily see if a domain matches or not */ /* by this kind of sorting, we can easily see if a domain matches or not */
static int _suffix_compare(const _psl_entry_t *s1, const _psl_entry_t *s2) static int _suffix_compare(const _psl_entry_t *s1, const _psl_entry_t *s2)
{ {
@ -289,19 +291,7 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length)
return -1; return -1;
} }
if (*rule == '*') { suffix->length = (unsigned char)length;
if (*++rule != '.') {
suffix->nlabels = 0;
/* fprintf(stderr, _("Unsupported kind of rule (ignored): %s\n"), rule); */
return -2;
}
rule++;
suffix->wildcard = 1;
suffix->length = (unsigned char)length - 2;
} else {
suffix->wildcard = 0;
suffix->length = (unsigned char)length;
}
suffix->nlabels = 1; suffix->nlabels = 1;
@ -318,19 +308,24 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length)
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
{ {
_psl_entry_t suffix, *rule; _psl_entry_t suffix, *rule;
const char *p, *label_bak; const char *p;
unsigned short length_bak;
/* this function should be called without leading dots, just make sure */ /* this function should be called without leading dots, just make sure */
suffix.label = domain + (*domain == '.'); suffix.label = domain + (*domain == '.');
suffix.length = strlen(suffix.label); suffix.length = strlen(suffix.label);
suffix.wildcard = 0;
suffix.nlabels = 1; suffix.nlabels = 1;
for (p = suffix.label; *p; p++) for (p = suffix.label; *p; p++)
if (*p == '.') if (*p == '.')
suffix.nlabels++; suffix.nlabels++;
if (suffix.nlabels == 1) {
/* TLD, this is the prevailing '*' match.
* We don't currently support exception TLDs (TLDs that are not a public suffix)
*/
return 1;
}
/* if domain has enough labels, it is public */ /* if domain has enough labels, it is public */
if (psl == &_builtin_psl) if (psl == &_builtin_psl)
rule = &suffixes[0]; rule = &suffixes[0];
@ -347,15 +342,12 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
if (rule) { if (rule) {
/* definitely a match, no matter if the found rule is a wildcard or not */ /* definitely a match, no matter if the found rule is a wildcard or not */
return 1; if (rule->flags & _PSL_FLAG_EXCEPTION)
} else if (suffix.nlabels == 1) { return 0;
/* unknown TLD, this is the prevailing '*' match */ if (rule->flags & _PSL_FLAG_PLAIN)
return 1; return 1;
} }
label_bak = suffix.label;
length_bak = suffix.length;
if ((suffix.label = strchr(suffix.label, '.'))) { if ((suffix.label = strchr(suffix.label, '.'))) {
int pos = rule - suffixes; int pos = rule - suffixes;
@ -369,43 +361,8 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix))); rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix)));
if (rule) { if (rule) {
if (!rule->wildcard) { if ((rule->flags & _PSL_FLAG_WILDCARD))
/* Due to binary search ambiguity we need the following check of neighbour entries.
* TODO: The data structures needs a revision: wildcard and non-wildcard entries must be separated. */
if (psl == &_builtin_psl) {
pos = rule - suffixes;
if (pos > 0 && _suffix_compare(rule, &suffixes[pos - 1]) == 0 && suffixes[pos -1].wildcard)
rule = &suffixes[pos - 1];
else if (pos < (int) (countof(suffixes) - 1) && _suffix_compare(rule, &suffixes[pos + 1]) == 0 && suffixes[pos + 1].wildcard)
rule = &suffixes[pos + 1];
} else {
_psl_entry_t *e;
if (pos > 0 && _suffix_compare(rule, e = _vector_get(psl->suffixes, pos - 1)) == 0 && e->wildcard) {
rule = e;
}
else if (pos < psl->suffixes->cur - 1 && _suffix_compare(rule, e = _vector_get(psl->suffixes, pos + 1)) == 0 && e->wildcard) {
rule = e;
}
}
}
if (rule->wildcard) {
/* now that we matched a wildcard, we have to check for an exception */
suffix.label = label_bak;
suffix.length = length_bak;
suffix.nlabels++;
if (psl == &_builtin_psl) {
if (bsearch(&suffix, suffix_exceptions, countof(suffix_exceptions), sizeof(suffix_exceptions[0]), (int(*)(const void *, const void *))_suffix_compare))
return 0; /* found an exception, so 'domain' is not a public suffix */
} else {
if (_vector_get(psl->suffix_exceptions, _vector_find(psl->suffix_exceptions, &suffix)) != 0)
return 0; /* found an exception, so 'domain' is not a public suffix */
}
return 1; return 1;
}
} }
} }
@ -596,7 +553,7 @@ static void _add_punycode_if_needed(UIDNA *idna, _psl_vector_t *v, _psl_entry_t
/* fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */ /* fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */
_suffix_init(&suffix, lookupname, strlen(lookupname)); _suffix_init(&suffix, lookupname, strlen(lookupname));
suffix.wildcard = e->wildcard; suffix.flags = e->flags;
suffixp = _vector_get(v, _vector_add(v, &suffix)); suffixp = _vector_get(v, _vector_add(v, &suffix));
suffixp->label = suffixp->label_buf; /* set label to changed address */ suffixp->label = suffixp->label_buf; /* set label to changed address */
} /* else ignore */ } /* else ignore */
@ -641,7 +598,7 @@ static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e)
/* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */ /* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */
_suffix_init(&suffix, lookupname, strlen(lookupname)); _suffix_init(&suffix, lookupname, strlen(lookupname));
suffix.wildcard = e->wildcard; suffix.flags = e->flags;
suffixp = _vector_get(v, _vector_add(v, &suffix)); suffixp = _vector_get(v, _vector_add(v, &suffix));
suffixp->label = suffixp->label_buf; /* set label to changed address */ suffixp->label = suffixp->label_buf; /* set label to changed address */
} /* else ignore */ } /* else ignore */
@ -673,7 +630,7 @@ static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e)
/* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */ /* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */
_suffix_init(&suffix, lookupname, strlen(lookupname)); _suffix_init(&suffix, lookupname, strlen(lookupname));
suffix.wildcard = e->wildcard; suffix.flags = e->flags;
suffixp = _vector_get(v, _vector_add(v, &suffix)); suffixp = _vector_get(v, _vector_add(v, &suffix));
suffixp->label = suffixp->label_buf; /* set label to changed address */ suffixp->label = suffixp->label_buf; /* set label to changed address */
} /* else ignore */ } /* else ignore */
@ -749,7 +706,6 @@ psl_ctx_t *psl_load_fp(FILE *fp)
* as of 19.02.2014, the list at http://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions. * as of 19.02.2014, the list at http://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions.
*/ */
psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array); psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array);
psl->suffix_exceptions = _vector_alloc(64, _suffix_compare_array);
while ((linep = fgets(buf, sizeof(buf), fp))) { while ((linep = fgets(buf, sizeof(buf), fp))) {
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */ while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
@ -763,31 +719,57 @@ psl_ctx_t *psl_load_fp(FILE *fp)
*linep = 0; *linep = 0;
if (*p == '!') { if (*p == '!') {
/* add to exceptions */ p++;
if (_suffix_init(&suffix, p + 1, linep - p - 1) == 0) { suffix.flags = _PSL_FLAG_EXCEPTION;
suffixp = _vector_get(psl->suffix_exceptions, _vector_add(psl->suffix_exceptions, &suffix)); psl->nexceptions++;
suffixp->label = suffixp->label_buf; /* set label to changed address */ } else if (*p == '*') {
#ifdef WITH_LIBICU if (*++p != '.') {
_add_punycode_if_needed(idna, psl->suffix_exceptions, suffixp); /* fprintf(stderr, _("Unsupported kind of rule (ignored): %s\n"), p - 1); */
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN) continue;
_add_punycode_if_needed(psl->suffix_exceptions, suffixp);
#endif
} }
p++;
/* wildcard *.foo.bar implicitly makes foo.bar a public suffix */
suffix.flags = _PSL_FLAG_WILDCARD | _PSL_FLAG_PLAIN;
psl->nsuffixes++;
} else { } else {
/* add to suffixes */ if (!strchr(p, '.'))
if (_suffix_init(&suffix, p, linep - p) == 0) { continue; /* we do not need an explicit plain TLD rule, already covered by implicit '*' rule */
suffix.flags = _PSL_FLAG_PLAIN;
psl->nsuffixes++;
}
if (_suffix_init(&suffix, p, linep - p) == 0) {
int index;
if ((index = _vector_find(psl->suffixes, &suffix)) >= 0) {
/* Found existing entry:
* Combination of exception and plain rule is ambiguous
* !foo.bar
* foo.bar
*
* Allowed:
* !foo.bar + *.foo.bar
* foo.bar + *.foo.bar
*
* We do not check here, let's do it later.
*/
suffixp = _vector_get(psl->suffixes, index);
suffixp->flags |= suffix.flags;
} else {
/* New entry */
suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix)); suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix));
suffixp->label = suffixp->label_buf; /* set label to changed address */
#ifdef WITH_LIBICU
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
_add_punycode_if_needed(psl->suffixes, suffixp);
#endif
} }
suffixp->label = suffixp->label_buf; /* set label to changed address */
#ifdef WITH_LIBICU
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
_add_punycode_if_needed(psl->suffixes, suffixp);
#endif
} }
} }
_vector_sort(psl->suffix_exceptions);
_vector_sort(psl->suffixes); _vector_sort(psl->suffixes);
#ifdef WITH_LIBICU #ifdef WITH_LIBICU
@ -811,7 +793,6 @@ void psl_free(psl_ctx_t *psl)
{ {
if (psl && psl != &_builtin_psl) { if (psl && psl != &_builtin_psl) {
_vector_free(&psl->suffixes); _vector_free(&psl->suffixes);
_vector_free(&psl->suffix_exceptions);
free(psl); free(psl);
} }
} }
@ -855,9 +836,9 @@ const psl_ctx_t *psl_builtin(void)
int psl_suffix_count(const psl_ctx_t *psl) int psl_suffix_count(const psl_ctx_t *psl)
{ {
if (psl == &_builtin_psl) if (psl == &_builtin_psl)
return countof(suffixes); return _psl_nsuffixes;
else if (psl) else if (psl)
return _vector_size(psl->suffixes); return psl->nsuffixes;
else else
return 0; return 0;
} }
@ -875,9 +856,9 @@ int psl_suffix_count(const psl_ctx_t *psl)
int psl_suffix_exception_count(const psl_ctx_t *psl) int psl_suffix_exception_count(const psl_ctx_t *psl)
{ {
if (psl == &_builtin_psl) if (psl == &_builtin_psl)
return countof(suffix_exceptions); return _psl_nexceptions;
else if (psl) else if (psl)
return _vector_size(psl->suffix_exceptions); return psl->nexceptions;
else else
return 0; return 0;
} }

View File

@ -54,41 +54,11 @@
# include "psl.c" # include "psl.c"
#undef _LIBPSL_INCLUDED_BY_PSL2C #undef _LIBPSL_INCLUDED_BY_PSL2C
static int _check_psl_entries(const _psl_vector_t *v) #if 0
{
int it, doublet = 0, err = 0;
for (it = 0; it < v->cur - 1; it++) {
_psl_entry_t *cur = _vector_get(v, it);
_psl_entry_t *next = _vector_get(v, it + 1);
if (_suffix_compare(cur, next) == 0) {
/* we allow '*.foo' and 'foo' */
if (cur->wildcard == next->wildcard) {
fprintf(stderr, "Double entry '%s' detected\n", cur->label);
err = 1;
}
else if (++doublet > 1) {
fprintf(stderr, "Double entry '%s' detected\n", cur->label);
err = 1;
}
} else
doublet = 0;
}
return err;
}
static int _check_psl(const psl_ctx_t *psl) static int _check_psl(const psl_ctx_t *psl)
{ {
int it, pos, err = 0; int it, pos, err = 0;
if (_check_psl_entries(psl->suffixes))
err = 1;
if (_check_psl_entries(psl->suffix_exceptions))
err = 1;
/* check if plain suffix also appears in exceptions */ /* check if plain suffix also appears in exceptions */
for (it = 0; it < psl->suffixes->cur; it++) { for (it = 0; it < psl->suffixes->cur; it++) {
_psl_entry_t *e = _vector_get(psl->suffixes, it); _psl_entry_t *e = _vector_get(psl->suffixes, it);
@ -156,6 +126,7 @@ static int _check_psl(const psl_ctx_t *psl)
return err; return err;
} }
#endif
static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *varname) static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *varname)
{ {
@ -184,7 +155,7 @@ static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *
_psl_entry_t *e = _vector_get(v, it); _psl_entry_t *e = _vector_get(v, it);
fprintf(fpout, "\t{ \"%s\", NULL, %hd, %d, %d },\n", fprintf(fpout, "\t{ \"%s\", NULL, %hd, %d, %d },\n",
e->label_buf, e->length, (int) e->nlabels, (int) e->wildcard); e->label_buf, e->length, (int) e->nlabels, (int) e->flags);
} }
fprintf(fpout, "};\n"); fprintf(fpout, "};\n");
@ -256,11 +227,11 @@ int main(int argc, const char **argv)
return 2; return 2;
/* look for ambiguous or double entries */ /* look for ambiguous or double entries */
if (_check_psl(psl)) { /* if (_check_psl(psl)) {
psl_free(psl); psl_free(psl);
return 5; return 5;
} }
*/
if ((fpout = fopen(argv[2], "w"))) { if ((fpout = fopen(argv[2], "w"))) {
FILE *pp; FILE *pp;
struct stat st; struct stat st;
@ -271,11 +242,9 @@ int main(int argc, const char **argv)
#if 0 #if 0
/* include library code did not generate punycode, so let's do it for the builtin data */ /* include library code did not generate punycode, so let's do it for the builtin data */
_add_punycode_if_needed(psl->suffixes); _add_punycode_if_needed(psl->suffixes);
_add_punycode_if_needed(psl->suffix_exceptions);
#endif #endif
_print_psl_entries(fpout, psl->suffixes, "suffixes"); _print_psl_entries(fpout, psl->suffixes, "suffixes");
_print_psl_entries(fpout, psl->suffix_exceptions, "suffix_exceptions");
snprintf(cmd, cmdsize, "sha1sum %s", argv[1]); snprintf(cmd, cmdsize, "sha1sum %s", argv[1]);
if ((pp = popen(cmd, "r"))) { if ((pp = popen(cmd, "r"))) {
@ -291,6 +260,8 @@ int main(int argc, const char **argv)
fprintf(fpout, "static time_t _psl_compile_time = %lu;\n", atol(source_date_epoch)); fprintf(fpout, "static time_t _psl_compile_time = %lu;\n", atol(source_date_epoch));
else else
fprintf(fpout, "static time_t _psl_compile_time = %lu;\n", time(NULL)); fprintf(fpout, "static time_t _psl_compile_time = %lu;\n", time(NULL));
fprintf(fpout, "static int _psl_nsuffixes = %d;\n", psl->nsuffixes);
fprintf(fpout, "static int _psl_nexceptions = %d;\n", psl->nexceptions);
fprintf(fpout, "static const char _psl_sha1_checksum[] = \"%s\";\n", checksum); fprintf(fpout, "static const char _psl_sha1_checksum[] = \"%s\";\n", checksum);
fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", argv[1]); fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", argv[1]);
@ -305,9 +276,10 @@ int main(int argc, const char **argv)
#else #else
if ((fpout = fopen(argv[2], "w"))) { if ((fpout = fopen(argv[2], "w"))) {
fprintf(fpout, "static _psl_entry_t suffixes[1];\n"); fprintf(fpout, "static _psl_entry_t suffixes[1];\n");
fprintf(fpout, "static _psl_entry_t suffix_exceptions[1];\n");
fprintf(fpout, "static time_t _psl_file_time;\n"); fprintf(fpout, "static time_t _psl_file_time;\n");
fprintf(fpout, "static time_t _psl_compile_time;\n"); fprintf(fpout, "static time_t _psl_compile_time;\n");
fprintf(fpout, "static int _psl_nsuffixes = 0;\n");
fprintf(fpout, "static int _psl_nexceptions = 0;\n");
fprintf(fpout, "static const char _psl_sha1_checksum[] = \"\";\n"); fprintf(fpout, "static const char _psl_sha1_checksum[] = \"\";\n");
fprintf(fpout, "static const char _psl_filename[] = \"\";\n"); fprintf(fpout, "static const char _psl_filename[] = \"\";\n");