diff --git a/Makefile.am b/Makefile.am index d27fa22..d488ce4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -14,4 +14,4 @@ ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS} pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = libpsl.pc -EXTRA_DIST = config.rpath LICENSE list/public_suffix_list.dat list/tests/test_psl.txt +EXTRA_DIST = config.rpath LICENSE $(PSL_FILE) list/tests/test_psl.txt diff --git a/src/psl.c b/src/psl.c index 224eb88..40003fa 100644 --- a/src/psl.c +++ b/src/psl.c @@ -123,6 +123,10 @@ static char *strndup(const char *s, size_t n) #define countof(a) (sizeof(a)/sizeof(*(a))) +#define _PSL_FLAG_PLAIN (1<<0) +#define _PSL_FLAG_EXCEPTION (1<<1) +#define _PSL_FLAG_WILDCARD (1<<2) + typedef struct { char label_buf[48]; @@ -132,7 +136,7 @@ typedef struct { length; unsigned char nlabels, /* number of labels */ - wildcard; /* this is a wildcard rule (e.g. *.sapporo.jp) */ + flags; } _psl_entry_t; /* stripped down version libmget vector routines */ @@ -148,8 +152,10 @@ typedef struct { struct _psl_ctx_st { _psl_vector_t - *suffixes, - *suffix_exceptions; + *suffixes; + int + nsuffixes, + nexceptions; }; /* include the PSL data compiled by 'psl2c' */ @@ -158,9 +164,10 @@ struct _psl_ctx_st { #else /* if this source file is included by psl2c.c, provide empty builtin data */ static _psl_entry_t suffixes[1]; - static _psl_entry_t suffix_exceptions[1]; static time_t _psl_file_time; static time_t _psl_compile_time; + static int _psl_nsuffixes; + static int _psl_nexceptions; static const char _psl_sha1_checksum[] = ""; static const char _psl_filename[] = ""; #endif @@ -251,11 +258,6 @@ static void _vector_sort(_psl_vector_t *v) qsort(v->entry, v->cur, sizeof(_psl_vector_t **), (int(*)(const void *, const void *))v->cmp); } -static int _vector_size(_psl_vector_t *v) -{ - return v ? v->cur : 0; -} - /* by this kind of sorting, we can easily see if a domain matches or not */ static int _suffix_compare(const _psl_entry_t *s1, const _psl_entry_t *s2) { @@ -289,19 +291,7 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length) return -1; } - if (*rule == '*') { - if (*++rule != '.') { - suffix->nlabels = 0; - /* fprintf(stderr, _("Unsupported kind of rule (ignored): %s\n"), rule); */ - return -2; - } - rule++; - suffix->wildcard = 1; - suffix->length = (unsigned char)length - 2; - } else { - suffix->wildcard = 0; - suffix->length = (unsigned char)length; - } + suffix->length = (unsigned char)length; suffix->nlabels = 1; @@ -318,19 +308,24 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length) static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) { _psl_entry_t suffix, *rule; - const char *p, *label_bak; - unsigned short length_bak; + const char *p; /* this function should be called without leading dots, just make sure */ suffix.label = domain + (*domain == '.'); suffix.length = strlen(suffix.label); - suffix.wildcard = 0; suffix.nlabels = 1; for (p = suffix.label; *p; p++) if (*p == '.') suffix.nlabels++; + if (suffix.nlabels == 1) { + /* TLD, this is the prevailing '*' match. + * We don't currently support exception TLDs (TLDs that are not a public suffix) + */ + return 1; + } + /* if domain has enough labels, it is public */ if (psl == &_builtin_psl) rule = &suffixes[0]; @@ -347,15 +342,12 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) if (rule) { /* definitely a match, no matter if the found rule is a wildcard or not */ - return 1; - } else if (suffix.nlabels == 1) { - /* unknown TLD, this is the prevailing '*' match */ - return 1; + if (rule->flags & _PSL_FLAG_EXCEPTION) + return 0; + if (rule->flags & _PSL_FLAG_PLAIN) + return 1; } - label_bak = suffix.label; - length_bak = suffix.length; - if ((suffix.label = strchr(suffix.label, '.'))) { int pos = rule - suffixes; @@ -369,43 +361,8 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix))); if (rule) { - if (!rule->wildcard) { - /* Due to binary search ambiguity we need the following check of neighbour entries. - * TODO: The data structures needs a revision: wildcard and non-wildcard entries must be separated. */ - if (psl == &_builtin_psl) { - pos = rule - suffixes; - - if (pos > 0 && _suffix_compare(rule, &suffixes[pos - 1]) == 0 && suffixes[pos -1].wildcard) - rule = &suffixes[pos - 1]; - else if (pos < (int) (countof(suffixes) - 1) && _suffix_compare(rule, &suffixes[pos + 1]) == 0 && suffixes[pos + 1].wildcard) - rule = &suffixes[pos + 1]; - } else { - _psl_entry_t *e; - - if (pos > 0 && _suffix_compare(rule, e = _vector_get(psl->suffixes, pos - 1)) == 0 && e->wildcard) { - rule = e; - } - else if (pos < psl->suffixes->cur - 1 && _suffix_compare(rule, e = _vector_get(psl->suffixes, pos + 1)) == 0 && e->wildcard) { - rule = e; - } - } - } - if (rule->wildcard) { - /* now that we matched a wildcard, we have to check for an exception */ - suffix.label = label_bak; - suffix.length = length_bak; - suffix.nlabels++; - - if (psl == &_builtin_psl) { - if (bsearch(&suffix, suffix_exceptions, countof(suffix_exceptions), sizeof(suffix_exceptions[0]), (int(*)(const void *, const void *))_suffix_compare)) - return 0; /* found an exception, so 'domain' is not a public suffix */ - } else { - if (_vector_get(psl->suffix_exceptions, _vector_find(psl->suffix_exceptions, &suffix)) != 0) - return 0; /* found an exception, so 'domain' is not a public suffix */ - } - + if ((rule->flags & _PSL_FLAG_WILDCARD)) return 1; - } } } @@ -596,7 +553,7 @@ static void _add_punycode_if_needed(UIDNA *idna, _psl_vector_t *v, _psl_entry_t /* fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */ _suffix_init(&suffix, lookupname, strlen(lookupname)); - suffix.wildcard = e->wildcard; + suffix.flags = e->flags; suffixp = _vector_get(v, _vector_add(v, &suffix)); suffixp->label = suffixp->label_buf; /* set label to changed address */ } /* else ignore */ @@ -641,7 +598,7 @@ static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e) /* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */ _suffix_init(&suffix, lookupname, strlen(lookupname)); - suffix.wildcard = e->wildcard; + suffix.flags = e->flags; suffixp = _vector_get(v, _vector_add(v, &suffix)); suffixp->label = suffixp->label_buf; /* set label to changed address */ } /* else ignore */ @@ -673,7 +630,7 @@ static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e) /* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */ _suffix_init(&suffix, lookupname, strlen(lookupname)); - suffix.wildcard = e->wildcard; + suffix.flags = e->flags; suffixp = _vector_get(v, _vector_add(v, &suffix)); suffixp->label = suffixp->label_buf; /* set label to changed address */ } /* else ignore */ @@ -749,7 +706,6 @@ psl_ctx_t *psl_load_fp(FILE *fp) * as of 19.02.2014, the list at http://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions. */ psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array); - psl->suffix_exceptions = _vector_alloc(64, _suffix_compare_array); while ((linep = fgets(buf, sizeof(buf), fp))) { while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */ @@ -763,31 +719,57 @@ psl_ctx_t *psl_load_fp(FILE *fp) *linep = 0; if (*p == '!') { - /* add to exceptions */ - if (_suffix_init(&suffix, p + 1, linep - p - 1) == 0) { - suffixp = _vector_get(psl->suffix_exceptions, _vector_add(psl->suffix_exceptions, &suffix)); - suffixp->label = suffixp->label_buf; /* set label to changed address */ -#ifdef WITH_LIBICU - _add_punycode_if_needed(idna, psl->suffix_exceptions, suffixp); -#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN) - _add_punycode_if_needed(psl->suffix_exceptions, suffixp); -#endif + p++; + suffix.flags = _PSL_FLAG_EXCEPTION; + psl->nexceptions++; + } else if (*p == '*') { + if (*++p != '.') { + /* fprintf(stderr, _("Unsupported kind of rule (ignored): %s\n"), p - 1); */ + continue; } + p++; + /* wildcard *.foo.bar implicitely make foo.bar a public suffix */ + suffix.flags = _PSL_FLAG_WILDCARD | _PSL_FLAG_PLAIN; + psl->nsuffixes++; } else { - /* add to suffixes */ - if (_suffix_init(&suffix, p, linep - p) == 0) { + if (!strchr(p, '.')) + continue; /* we do not need an explicit plain TLD rule, already covered by implicit '*' rule */ + suffix.flags = _PSL_FLAG_PLAIN; + psl->nsuffixes++; + } + + if (_suffix_init(&suffix, p, linep - p) == 0) { + int index; + + if ((index = _vector_find(psl->suffixes, &suffix)) >= 0) { + /* Found existing entry: + * Combination of exception and plain rule is ambigous + * !foo.bar + * foo.bar + * + * Allowed: + * !foo.bar + *.foo.bar + * foo.bar + *.foo.bar + * + * We do not check here, let's do it later. + */ + + suffixp = _vector_get(psl->suffixes, index); + suffixp->flags |= suffix.flags; + } else { + /* New entry */ suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix)); - suffixp->label = suffixp->label_buf; /* set label to changed address */ -#ifdef WITH_LIBICU - _add_punycode_if_needed(idna, psl->suffixes, suffixp); -#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN) - _add_punycode_if_needed(psl->suffixes, suffixp); -#endif } + + suffixp->label = suffixp->label_buf; /* set label to changed address */ +#ifdef WITH_LIBICU + _add_punycode_if_needed(idna, psl->suffixes, suffixp); +#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN) + _add_punycode_if_needed(psl->suffixes, suffixp); +#endif } } - _vector_sort(psl->suffix_exceptions); _vector_sort(psl->suffixes); #ifdef WITH_LIBICU @@ -811,7 +793,6 @@ void psl_free(psl_ctx_t *psl) { if (psl && psl != &_builtin_psl) { _vector_free(&psl->suffixes); - _vector_free(&psl->suffix_exceptions); free(psl); } } @@ -855,9 +836,9 @@ const psl_ctx_t *psl_builtin(void) int psl_suffix_count(const psl_ctx_t *psl) { if (psl == &_builtin_psl) - return countof(suffixes); + return _psl_nsuffixes; else if (psl) - return _vector_size(psl->suffixes); + return psl->nsuffixes; else return 0; } @@ -875,9 +856,9 @@ int psl_suffix_count(const psl_ctx_t *psl) int psl_suffix_exception_count(const psl_ctx_t *psl) { if (psl == &_builtin_psl) - return countof(suffix_exceptions); + return _psl_nexceptions; else if (psl) - return _vector_size(psl->suffix_exceptions); + return psl->nexceptions; else return 0; } diff --git a/src/psl2c.c b/src/psl2c.c index 14bc4b9..6daa722 100644 --- a/src/psl2c.c +++ b/src/psl2c.c @@ -54,41 +54,11 @@ # include "psl.c" #undef _LIBPSL_INCLUDED_BY_PSL2C -static int _check_psl_entries(const _psl_vector_t *v) -{ - int it, doublet = 0, err = 0; - - for (it = 0; it < v->cur - 1; it++) { - _psl_entry_t *cur = _vector_get(v, it); - _psl_entry_t *next = _vector_get(v, it + 1); - - if (_suffix_compare(cur, next) == 0) { - /* we allow '*.foo' and 'foo' */ - if (cur->wildcard == next->wildcard) { - fprintf(stderr, "Double entry '%s' detected\n", cur->label); - err = 1; - } - else if (++doublet > 1) { - fprintf(stderr, "Double entry '%s' detected\n", cur->label); - err = 1; - } - } else - doublet = 0; - } - - return err; -} - +#if 0 static int _check_psl(const psl_ctx_t *psl) { int it, pos, err = 0; - if (_check_psl_entries(psl->suffixes)) - err = 1; - - if (_check_psl_entries(psl->suffix_exceptions)) - err = 1; - /* check if plain suffix also appears in exceptions */ for (it = 0; it < psl->suffixes->cur; it++) { _psl_entry_t *e = _vector_get(psl->suffixes, it); @@ -156,6 +126,7 @@ static int _check_psl(const psl_ctx_t *psl) return err; } +#endif static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *varname) { @@ -184,7 +155,7 @@ static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char * _psl_entry_t *e = _vector_get(v, it); fprintf(fpout, "\t{ \"%s\", NULL, %hd, %d, %d },\n", - e->label_buf, e->length, (int) e->nlabels, (int) e->wildcard); + e->label_buf, e->length, (int) e->nlabels, (int) e->flags); } fprintf(fpout, "};\n"); @@ -256,11 +227,11 @@ int main(int argc, const char **argv) return 2; /* look for ambigious or double entries */ - if (_check_psl(psl)) { +/* if (_check_psl(psl)) { psl_free(psl); return 5; } - +*/ if ((fpout = fopen(argv[2], "w"))) { FILE *pp; struct stat st; @@ -271,11 +242,9 @@ int main(int argc, const char **argv) #if 0 /* include library code did not generate punycode, so let's do it for the builtin data */ _add_punycode_if_needed(psl->suffixes); - _add_punycode_if_needed(psl->suffix_exceptions); #endif _print_psl_entries(fpout, psl->suffixes, "suffixes"); - _print_psl_entries(fpout, psl->suffix_exceptions, "suffix_exceptions"); snprintf(cmd, cmdsize, "sha1sum %s", argv[1]); if ((pp = popen(cmd, "r"))) { @@ -291,6 +260,8 @@ int main(int argc, const char **argv) fprintf(fpout, "static time_t _psl_compile_time = %lu;\n", atol(source_date_epoch)); else fprintf(fpout, "static time_t _psl_compile_time = %lu;\n", time(NULL)); + fprintf(fpout, "static int _psl_nsuffixes = %d;\n", psl->nsuffixes); + fprintf(fpout, "static int _psl_nexceptions = %d;\n", psl->nexceptions); fprintf(fpout, "static const char _psl_sha1_checksum[] = \"%s\";\n", checksum); fprintf(fpout, "static const char _psl_filename[] = \"%s\";\n", argv[1]); @@ -305,9 +276,10 @@ int main(int argc, const char **argv) #else if ((fpout = fopen(argv[2], "w"))) { fprintf(fpout, "static _psl_entry_t suffixes[1];\n"); - fprintf(fpout, "static _psl_entry_t suffix_exceptions[1];\n"); fprintf(fpout, "static time_t _psl_file_time;\n"); fprintf(fpout, "static time_t _psl_compile_time;\n"); + fprintf(fpout, "static int _psl_nsuffixes = 0;\n"); + fprintf(fpout, "static int _psl_nexceptions = 0;\n"); fprintf(fpout, "static const char _psl_sha1_checksum[] = \"\";\n"); fprintf(fpout, "static const char _psl_filename[] = \"\";\n");