New function psl_is_public_suffix2()

The current PSL has two sections, ICANN and PRIVATE.
This new function allows limiting the check to one or both
of these sections.
This commit is contained in:
Tim Rühsen 2015-12-06 21:55:56 +01:00
parent 519b8c9d17
commit 9d2e93f0b8
5 changed files with 153 additions and 19 deletions

View File

@ -13,6 +13,7 @@ psl_load_fp
psl_builtin
psl_free
psl_is_public_suffix
psl_is_public_suffix2
psl_unregistrable_domain
psl_registrable_domain
psl_suffix_count

View File

@ -44,6 +44,11 @@
extern "C" {
#endif
/* types for psl_is_public_suffix2() */
#define PSL_TYPE_ICANN (1<<0)
#define PSL_TYPE_PRIVATE (1<<1)
#define PSL_TYPE_ANY (PSL_TYPE_ICANN | PSL_TYPE_PRIVATE)
/**
* psl_error_t:
* @PSL_SUCCESS: Successful return.
@ -71,57 +76,79 @@ typedef struct _psl_ctx_st psl_ctx_t;
/* frees PSL context */
void
psl_free(psl_ctx_t *psl);
/* loads PSL data from file */
psl_ctx_t *
psl_load_file(const char *fname);
/* loads PSL data from FILE pointer */
psl_ctx_t *
psl_load_fp(FILE *fp);
/* retrieves builtin PSL data */
const psl_ctx_t *
psl_builtin(void);
/* checks whether domain is a public suffix or not */
int
psl_is_public_suffix(const psl_ctx_t *psl, const char *domain);
/* checks whether domain is a public suffix of the given type (PSL section) or not */
int
psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type);
/* checks whether cookie_domain is acceptable for domain or not */
int
psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, const char *cookie_domain);
/* returns the longest not registrable domain within 'domain' or NULL if none found */
const char *
psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain);
/* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */
const char *
psl_registrable_domain(const psl_ctx_t *psl, const char *domain);
/* convert a string into lowercase UTF-8 */
psl_error_t
psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower);
/* does not include exceptions */
int
psl_suffix_count(const psl_ctx_t *psl);
/* just counts exceptions */
int
psl_suffix_exception_count(const psl_ctx_t *psl);
/* just counts wildcards */
int
psl_suffix_wildcard_count(const psl_ctx_t *psl);
/* returns compilation time */
time_t
psl_builtin_compile_time(void);
/* returns mtime of PSL source file */
time_t
psl_builtin_file_time(void);
/* returns SHA1 checksum (hex-encoded, lowercase) of PSL source file */
const char *
psl_builtin_sha1sum(void);
/* returns file name of PSL source file */
const char *
psl_builtin_filename(void);
/* returns library version string */
const char *
psl_get_version(void);
/* checks library version number */
int
psl_check_version_number(int version);
/* returns whether the built-in data is outdated or not */
int
psl_builtin_outdated(void);

2
list

@ -1 +1 @@
Subproject commit e801df4a56ac8c7519d349ad5125433206930d6e
Subproject commit 1f3ad51171235aafe423435606e869f0161582e4

View File

@ -126,9 +126,11 @@ static char *strndup(const char *s, size_t n)
#define countof(a) (sizeof(a)/sizeof(*(a)))
#define _PSL_FLAG_PLAIN (1<<0)
#define _PSL_FLAG_EXCEPTION (1<<1)
#define _PSL_FLAG_WILDCARD (1<<2)
#define _PSL_FLAG_EXCEPTION (1<<0)
#define _PSL_FLAG_WILDCARD (1<<1)
#define _PSL_FLAG_ICANN (1<<2) /* entry of ICANN section */
#define _PSL_FLAG_PRIVATE (1<<3) /* entry of PRIVATE section */
#define _PSL_FLAG_PLAIN (1<<4) /* just used for PSL syntax checking */
typedef struct {
char
@ -157,6 +159,7 @@ struct _psl_ctx_st {
_psl_vector_t
*suffixes;
int
mode,
nsuffixes,
nexceptions,
nwildcards;
@ -177,7 +180,7 @@ struct _psl_ctx_st {
static const char _psl_filename[] = "";
#endif
/* references to this PSL will result in lookups to built-in data */
/* references to these PSLs will result in lookups to built-in data */
static const psl_ctx_t
_builtin_psl;
@ -310,10 +313,11 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length)
return 0;
}
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
{
_psl_entry_t suffix, *rule;
const char *p;
int builtin;
/* this function should be called without leading dots, just make sure */
suffix.label = domain + (*domain == '.');
@ -332,7 +336,9 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
}
/* if domain has enough labels, it is public */
if (psl == &_builtin_psl)
builtin = (psl == &_builtin_psl);
if (builtin)
rule = &suffixes[0];
else
rule = _vector_get(psl->suffixes, 0);
@ -340,12 +346,18 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
if (!rule || rule->nlabels < suffix.nlabels - 1)
return 0;
if (psl == &_builtin_psl)
if (rule == &suffixes[0])
rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare);
else
rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix));
if (rule) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
return 0;
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
return 0;
/* definitely a match, no matter if the found rule is a wildcard or not */
if (rule->flags & _PSL_FLAG_EXCEPTION)
return 0;
@ -360,12 +372,18 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
suffix.length = strlen(suffix.label);
suffix.nlabels--;
if (psl == &_builtin_psl)
if (builtin)
rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare);
else
rule = _vector_get(psl->suffixes, (pos = _vector_find(psl->suffixes, &suffix)));
if (rule) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rule->flags & _PSL_FLAG_ICANN))
return 0;
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & _PSL_FLAG_PRIVATE))
return 0;
if ((rule->flags & _PSL_FLAG_WILDCARD))
return 1;
}
@ -399,7 +417,37 @@ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
if (!psl || !domain)
return 1;
return _psl_is_public_suffix(psl, domain);
return _psl_is_public_suffix(psl, domain, PSL_TYPE_ANY);
}
/**
 * psl_is_public_suffix2:
 * @psl: PSL context
 * @domain: Domain string
 * @type: Domain type
 *
 * Tests whether @domain is a public suffix according to the
 * [Mozilla Public Suffix List](http://publicsuffix.org), taking the
 * requested PSL section into account.
 *
 * The section in which the lookup is performed is selected by @type.
 * Valid values are %PSL_TYPE_PRIVATE, %PSL_TYPE_ICANN and %PSL_TYPE_ANY.
 *
 * International @domain names must be given either as lowercase UTF-8 or
 * in their ASCII form (punycode). Any other encoding results in
 * unexpected behavior.
 *
 * @psl is a context obtained from psl_load_file(), psl_load_fp() or
 * psl_builtin().
 *
 * Returns: 1 if domain is a public suffix, 0 if not.
 * If @psl or @domain is %NULL, 1 is returned.
 *
 * Since: 0.1
 */
int psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type)
{
	/* only perform the lookup when both a context and a domain are given */
	if (psl && domain)
		return _psl_is_public_suffix(psl, domain, type);

	/* missing context or domain: report it as a public suffix */
	return 1;
}
/**
@ -431,7 +479,7 @@ const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain)
* 'forgot.his.name' and 'name' are in the PSL while 'his.name' is not.
*/
while (!_psl_is_public_suffix(psl, domain)) {
while (!_psl_is_public_suffix(psl, domain, 0)) {
if ((domain = strchr(domain, '.')))
domain++;
else
@ -472,7 +520,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
* 'forgot.his.name' and 'name' are in the PSL while 'his.name' is not.
*/
while (!_psl_is_public_suffix(psl, domain)) {
while (!_psl_is_public_suffix(psl, domain, 0)) {
if ((p = strchr(domain, '.'))) {
regdom = domain;
domain = p + 1;
@ -691,6 +739,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
psl_ctx_t *psl;
_psl_entry_t suffix, *suffixp;
char buf[256], *linep, *p;
int type = 0;
#ifdef WITH_LIBICU
UIDNA *idna;
UErrorCode status = 0;
@ -716,8 +765,20 @@ psl_ctx_t *psl_load_fp(FILE *fp)
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
if (!*linep) continue; /* skip empty lines */
if (*linep == '/' && linep[1] == '/')
if (*linep == '/' && linep[1] == '/') {
if (!type) {
if (strstr(linep + 2, "===BEGIN ICANN DOMAINS==="))
type = _PSL_FLAG_ICANN;
else if (!type && strstr(linep + 2, "===BEGIN PRIVATE DOMAINS==="))
type = _PSL_FLAG_PRIVATE;
}
else if (type == _PSL_FLAG_ICANN && strstr(linep + 2, "===END ICANN DOMAINS==="))
type = 0;
else if (type == _PSL_FLAG_PRIVATE && strstr(linep + 2, "===END PRIVATE DOMAINS==="))
type = 0;
continue; /* skip comments */
}
/* parse suffix rule */
for (p = linep; *linep && !_isspace_ascii(*linep);) linep++;
@ -725,7 +786,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
if (*p == '!') {
p++;
suffix.flags = _PSL_FLAG_EXCEPTION;
suffix.flags = _PSL_FLAG_EXCEPTION | type;
psl->nexceptions++;
} else if (*p == '*') {
if (*++p != '.') {
@ -734,13 +795,13 @@ psl_ctx_t *psl_load_fp(FILE *fp)
}
p++;
/* wildcard *.foo.bar implicitly makes foo.bar a public suffix */
suffix.flags = _PSL_FLAG_WILDCARD | _PSL_FLAG_PLAIN;
suffix.flags = _PSL_FLAG_WILDCARD | _PSL_FLAG_PLAIN | type;
psl->nwildcards++;
psl->nsuffixes++;
} else {
if (!strchr(p, '.'))
continue; /* we do not need an explicit plain TLD rule, already covered by implicit '*' rule */
suffix.flags = _PSL_FLAG_PLAIN;
suffix.flags = _PSL_FLAG_PLAIN | type;
psl->nsuffixes++;
}
@ -812,7 +873,7 @@ void psl_free(psl_ctx_t *psl)
* The builtin data also contains punycode entries, one for each international domain name.
*
* If the generation of built-in data has been disabled during compilation, %NULL will be returned.
* So if using the builtin psl context, you can provide UTF-8 or punycode representations of domains to
* When using the builtin psl context, you can provide UTF-8 or punycode representations of domains to
* functions like psl_is_public_suffix().
*
* Returns: Pointer to the built in PSL data or NULL if this data is not available.

View File

@ -58,7 +58,7 @@ static void test_psl(void)
{
FILE *fp;
psl_ctx_t *psl;
int result;
int result, type = 0;
char buf[256], *linep, *p;
psl = psl_load_file(PSL_FILE); /* PSL_FILE can be set by ./configure --with-psl-file=[PATH] */
@ -74,8 +74,20 @@ static void test_psl(void)
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
if (!*linep) continue; /* skip empty lines */
if (*linep == '/' && linep[1] == '/')
if (*linep == '/' && linep[1] == '/') {
if (!type) {
if (strstr(linep + 2, "===BEGIN ICANN DOMAINS==="))
type = PSL_TYPE_ICANN;
else if (!type && strstr(linep + 2, "===BEGIN PRIVATE DOMAINS==="))
type = PSL_TYPE_PRIVATE;
}
else if (type == PSL_TYPE_ICANN && strstr(linep + 2, "===END ICANN DOMAINS==="))
type = 0;
else if (type == PSL_TYPE_PRIVATE && strstr(linep + 2, "===END PRIVATE DOMAINS==="))
type = 0;
continue; /* skip comments */
}
/* parse suffix rule */
for (p = linep; *linep && !_isspace_ascii(*linep);) linep++;
@ -111,6 +123,39 @@ static void test_psl(void)
failed++;
printf("psl_is_public_suffix(%s)=%d (expected 1)\n", p, result);
} else ok++;
if (!(strchr(p, '.'))) {
/* TLDs are always expected to be Public Suffixes */
if (!(result = psl_is_public_suffix2(psl, p, PSL_TYPE_PRIVATE))) {
failed++;
printf("psl_is_public_suffix2(%s, PSL_TYPE_PRIVATE)=%d (expected 1)\n", p, result);
} else ok++;
if (!(result = psl_is_public_suffix2(psl, p, PSL_TYPE_ICANN))) {
failed++;
printf("psl_is_public_suffix2(%s, PSL_TYPE_ICANN)=%d (expected 0)\n", p, result);
} else ok++;
} else if (type == PSL_TYPE_PRIVATE) {
if (!(result = psl_is_public_suffix2(psl, p, PSL_TYPE_PRIVATE))) {
failed++;
printf("psl_is_public_suffix2(%s, PSL_TYPE_PRIVATE)=%d (expected 1)\n", p, result);
} else ok++;
if ((result = psl_is_public_suffix2(psl, p, PSL_TYPE_ICANN))) {
failed++;
printf("psl_is_public_suffix2(%s, PSL_TYPE_ICANN)=%d (expected 0)\n", p, result);
} else ok++;
} else if (type == PSL_TYPE_ICANN) {
if (!(result = psl_is_public_suffix2(psl, p, PSL_TYPE_ICANN))) {
failed++;
printf("psl_is_public_suffix2(%s, PSL_TYPE_ICANN)=%d (expected 1)\n", p, result);
} else ok++;
if ((result = psl_is_public_suffix2(psl, p, PSL_TYPE_PRIVATE))) {
failed++;
printf("psl_is_public_suffix2(%s, PSL_TYPE_PRIVATE)=%d (expected 0)\n", p, result);
} else ok++;
}
}
}