added psl_registrable_domain(), renamed psl_registered_domain -> psl_unregistrable_domain

This commit is contained in:
Tim Ruehsen 2014-03-26 22:27:31 +01:00
parent 96574a795c
commit fd0ff2023b
6 changed files with 249 additions and 5 deletions

98
data/test_psl.txt Normal file
View File

@ -0,0 +1,98 @@
// Any copyright is dedicated to the Public Domain.
// http://creativecommons.org/publicdomain/zero/1.0/
// null input.
checkPublicSuffix(null, null);
// Mixed case.
checkPublicSuffix('COM', null);
checkPublicSuffix('example.COM', 'example.com');
checkPublicSuffix('WwW.example.COM', 'example.com');
// Leading dot.
checkPublicSuffix('.com', null);
checkPublicSuffix('.example', null);
checkPublicSuffix('.example.com', null);
checkPublicSuffix('.example.example', null);
// Unlisted TLD.
checkPublicSuffix('example', null);
checkPublicSuffix('example.example', 'example.example');
checkPublicSuffix('b.example.example', 'example.example');
checkPublicSuffix('a.b.example.example', 'example.example');
// Listed, but non-Internet, TLD.
//checkPublicSuffix('local', null);
//checkPublicSuffix('example.local', null);
//checkPublicSuffix('b.example.local', null);
//checkPublicSuffix('a.b.example.local', null);
// TLD with only 1 rule.
checkPublicSuffix('biz', null);
checkPublicSuffix('domain.biz', 'domain.biz');
checkPublicSuffix('b.domain.biz', 'domain.biz');
checkPublicSuffix('a.b.domain.biz', 'domain.biz');
// TLD with some 2-level rules.
checkPublicSuffix('com', null);
checkPublicSuffix('example.com', 'example.com');
checkPublicSuffix('b.example.com', 'example.com');
checkPublicSuffix('a.b.example.com', 'example.com');
checkPublicSuffix('uk.com', null);
checkPublicSuffix('example.uk.com', 'example.uk.com');
checkPublicSuffix('b.example.uk.com', 'example.uk.com');
checkPublicSuffix('a.b.example.uk.com', 'example.uk.com');
checkPublicSuffix('test.ac', 'test.ac');
// TLD with only 1 (wildcard) rule.
checkPublicSuffix('cy', null);
checkPublicSuffix('c.cy', null);
checkPublicSuffix('b.c.cy', 'b.c.cy');
checkPublicSuffix('a.b.c.cy', 'b.c.cy');
// More complex TLD.
checkPublicSuffix('jp', null);
checkPublicSuffix('test.jp', 'test.jp');
checkPublicSuffix('www.test.jp', 'test.jp');
checkPublicSuffix('ac.jp', null);
checkPublicSuffix('test.ac.jp', 'test.ac.jp');
checkPublicSuffix('www.test.ac.jp', 'test.ac.jp');
checkPublicSuffix('kyoto.jp', null);
checkPublicSuffix('test.kyoto.jp', 'test.kyoto.jp');
checkPublicSuffix('ide.kyoto.jp', null);
checkPublicSuffix('b.ide.kyoto.jp', 'b.ide.kyoto.jp');
checkPublicSuffix('a.b.ide.kyoto.jp', 'b.ide.kyoto.jp');
checkPublicSuffix('c.kobe.jp', null);
checkPublicSuffix('b.c.kobe.jp', 'b.c.kobe.jp');
checkPublicSuffix('a.b.c.kobe.jp', 'b.c.kobe.jp');
checkPublicSuffix('city.kobe.jp', 'city.kobe.jp');
checkPublicSuffix('www.city.kobe.jp', 'city.kobe.jp');
// TLD with a wildcard rule and exceptions.
checkPublicSuffix('ck', null);
checkPublicSuffix('test.ck', null);
checkPublicSuffix('b.test.ck', 'b.test.ck');
checkPublicSuffix('a.b.test.ck', 'b.test.ck');
checkPublicSuffix('www.ck', 'www.ck');
checkPublicSuffix('www.www.ck', 'www.ck');
// US K12.
checkPublicSuffix('us', null);
checkPublicSuffix('test.us', 'test.us');
checkPublicSuffix('www.test.us', 'test.us');
checkPublicSuffix('ak.us', null);
checkPublicSuffix('test.ak.us', 'test.ak.us');
checkPublicSuffix('www.test.ak.us', 'test.ak.us');
checkPublicSuffix('k12.ak.us', null);
checkPublicSuffix('test.k12.ak.us', 'test.k12.ak.us');
checkPublicSuffix('www.test.k12.ak.us', 'test.k12.ak.us');
// IDN labels.
checkPublicSuffix('食狮.com.cn', '食狮.com.cn');
checkPublicSuffix('食狮.公司.cn', '食狮.公司.cn');
checkPublicSuffix('www.食狮.公司.cn', '食狮.公司.cn');
checkPublicSuffix('shishi.公司.cn', 'shishi.公司.cn');
checkPublicSuffix('公司.cn', null);
checkPublicSuffix('食狮.中国', '食狮.中国');
checkPublicSuffix('www.食狮.中国', '食狮.中国');
checkPublicSuffix('shishi.中国', 'shishi.中国');
checkPublicSuffix('中国', null);
// Same as above, but punycoded.
checkPublicSuffix('xn--85x722f.com.cn', 'xn--85x722f.com.cn');
checkPublicSuffix('xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn');
checkPublicSuffix('www.xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn');
checkPublicSuffix('shishi.xn--55qx5d.cn', 'shishi.xn--55qx5d.cn');
checkPublicSuffix('xn--55qx5d.cn', null);
checkPublicSuffix('xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s');
checkPublicSuffix('www.xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s');
checkPublicSuffix('shishi.xn--fiqs8s', 'shishi.xn--fiqs8s');
checkPublicSuffix('xn--fiqs8s', null);

View File

@ -64,9 +64,12 @@ const psl_ctx_t *
psl_builtin(void); psl_builtin(void);
int int
psl_is_public(const psl_ctx_t *psl, const char *domain); psl_is_public(const psl_ctx_t *psl, const char *domain);
// return pointer to longest registered domain within 'domain' or NULL if none found // returns the longest unregistrable domain within 'domain' or NULL if none found
const char * const char *
psl_registered_domain(const psl_ctx_t *psl, const char *domain); psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain);
// returns the shortest possible registrable domain part or NULL if domain is not registrable at all
const char *
psl_registrable_domain(const psl_ctx_t *psl, const char *domain);
// does not include exceptions // does not include exceptions
int int
psl_suffix_count(const psl_ctx_t *psl); psl_suffix_count(const psl_ctx_t *psl);

View File

@ -301,10 +301,13 @@ int psl_is_public(const psl_ctx_t *psl, const char *domain)
// return NULL, if string domain does not contain a registered domain // return NULL, if string domain does not contain a registered domain
// else return a pointer to the longest registered domain within 'domain' // else return a pointer to the longest registered domain within 'domain'
const char *psl_registered_domain(const psl_ctx_t *psl, const char *domain) const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain)
{ {
const char *p, *ret_domain; const char *p, *ret_domain;
if (!psl || !domain)
return NULL;
// We check from right to left, e.g. in www.xxx.org we check org, xxx.org, www.xxx.org in this order // We check from right to left, e.g. in www.xxx.org we check org, xxx.org, www.xxx.org in this order
// for being a registered domain. // for being a registered domain.
@ -325,6 +328,30 @@ const char *psl_registered_domain(const psl_ctx_t *psl, const char *domain)
} }
} }
// Returns a pointer into 'domain' at the shortest right-hand part that
// psl_is_public() accepts, or NULL if no part is accepted.
// NULL is also returned for a NULL 'psl', a NULL 'domain', or a 'domain'
// that starts with a dot.
const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
{
	const char *cursor;

	if (psl == NULL || domain == NULL || domain[0] == '.')
		return NULL;

	// Candidates are tried from right to left, e.g. for www.xxx.org the order
	// is org, xxx.org, www.xxx.org. Every candidate except the full string
	// starts at its leading dot.
	// NOTE(review): this relies on psl_is_public() tolerating a leading dot - confirm.
	cursor = strrchr(domain, '.');
	if (cursor == NULL)
		cursor = domain;

	for (;;) {
		if (psl_is_public(psl, cursor)) {
			// strip the leading dot the candidate may still carry
			return cursor[0] == '.' ? cursor + 1 : cursor;
		}

		if (cursor <= domain)
			return NULL; // the whole string was tried without success

		// move left to the previous dot, or to the start of the string
		do {
			--cursor;
		} while (cursor > domain && *cursor != '.');
	}
}
psl_ctx_t *psl_load_file(const char *fname) psl_ctx_t *psl_load_file(const char *fname)
{ {
FILE *fp; FILE *fp;

View File

@ -2,7 +2,7 @@ DEFS = @DEFS@ -DDATADIR=\"$(top_srcdir)/data\" -DSRCDIR=\"$(srcdir)\"
AM_CPPFLAGS = -I$(top_srcdir)/include AM_CPPFLAGS = -I$(top_srcdir)/include
LDADD = ../src/libpsl-@LIBPSL_API_VERSION@.la LDADD = ../src/libpsl-@LIBPSL_API_VERSION@.la
PSL_TESTS = test-is-public test-is-public-builtin test-is-public-all PSL_TESTS = test-is-public test-is-public-builtin test-is-public-all test-registrable-domain
check_PROGRAMS = $(PSL_TESTS) check_PROGRAMS = $(PSL_TESTS)

View File

@ -24,7 +24,7 @@
* Test psl_is_public() for all entries in effective_tld_names.dat * Test psl_is_public() for all entries in effective_tld_names.dat
* *
* Changelog * Changelog
* 19.03.2014 Tim Ruehsen created from libmget/cookie.c * 19.03.2014 Tim Ruehsen created
* *
*/ */

View File

@ -0,0 +1,116 @@
/*
* Copyright(c) 2014 Tim Ruehsen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
* This file is part of the test suite of libpsl.
*
* Test psl_registrable_domain() for all entries in test_psl.txt
*
* Changelog
* 26.03.2014 Tim Ruehsen created
*
*/
#if HAVE_CONFIG_H
# include <config.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <libpsl.h>
// number of elements of a true array (not valid for pointers)
#define countof(a) (sizeof(a)/sizeof(*(a)))
// path of the test vector file; DATADIR has to expand to a string literal
// because it is concatenated with the literal that follows
#define TESTDATA DATADIR"/test_psl.txt"
// running totals of passed and failed checks; incremented by test_psl()
// and reported by main()
static int
ok,
failed;
// Feeds every checkPublicSuffix('<domain>', '<expected>') or
// checkPublicSuffix('<domain>', null) line of TESTDATA to
// psl_registrable_domain(), using the builtin PSL data, and compares the
// result with the expected value. Lines that match neither form are skipped.
// Each comparison increments either 'ok' or 'failed'; failing to open
// TESTDATA counts as one failure.
static void test_psl(void)
{
	FILE *fp;
	const psl_ctx_t *psl;
	const char *result;
	char buf[256], domain[128], expected_regdom[128], *p;

	psl = psl_builtin();

	printf("have %d suffixes and %d exceptions\n", psl_suffix_count(psl), psl_suffix_exception_count(psl));

	if (!(fp = fopen(TESTDATA, "r"))) {
		printf("Failed to open %s\n", TESTDATA);
		failed++;
		return;
	}

	while (fgets(buf, sizeof(buf), fp)) {
		// first try the quoted form of the expected value, then the bare word null
		if (sscanf(buf, " checkPublicSuffix('%127[^']' , '%127[^']", domain, expected_regdom) != 2) {
			if (sscanf(buf, " checkPublicSuffix('%127[^']' , %127[nul]", domain, expected_regdom) != 2)
				continue;
		}

		// we have to lowercase the domain - the PSL API just takes lowercase
		// The test data contains UTF-8 labels, so a plain char may be negative.
		// <ctype.h> functions need a value representable as unsigned char,
		// hence the casts.
		for (p = domain; *p; p++) {
			if (isupper((unsigned char) *p))
				*p = (char) tolower((unsigned char) *p);
		}

		result = psl_registrable_domain(psl, domain);

		if (result == NULL) {
			// a NULL result is correct only if the test vector expects null
			if (strcmp(expected_regdom, "null")) {
				failed++;
				printf("psl_registrable_domain(%s)=NULL (expected %s)\n", domain, expected_regdom);
			} else
				ok++;
		} else if (strcmp(expected_regdom, result)) {
			failed++;
			printf("psl_registrable_domain(%s)=%s (expected %s)\n", domain, result, expected_regdom);
		} else
			ok++;
	}

	fclose(fp);
}
// Test driver. Returns 0 if every check passed and 1 otherwise.
// When started without arguments and TESTS_VALGRIND is set to a non-empty
// value, the program re-runs itself via system() with that value as command
// prefix and returns 1 if the command status is non-zero.
int main(int argc, const char * const *argv)
{
	const char *valgrind;

	// if VALGRIND testing is enabled, we have to call ourselves with valgrind checking
	if (argc == 1 && (valgrind = getenv("TESTS_VALGRIND")) != NULL && *valgrind != '\0') {
		char cmd[strlen(valgrind)+strlen(argv[0])+32];

		// TESTS_VALGRIND is set to empty for the command line that is run
		snprintf(cmd, sizeof(cmd), "TESTS_VALGRIND="" %s %s", valgrind, argv[0]);
		return system(cmd) != 0;
	}

	test_psl();

	if (!failed) {
		printf("Summary: All %d tests passed\n", ok + failed);
		return 0;
	}

	printf("Summary: %d out of %d tests failed\n", failed, ok + failed);
	return 1;
}