diff --git a/data/test_psl.txt b/data/test_psl.txt new file mode 100644 index 0000000..35c8ccf --- /dev/null +++ b/data/test_psl.txt @@ -0,0 +1,98 @@ +// Any copyright is dedicated to the Public Domain. +// http://creativecommons.org/publicdomain/zero/1.0/ + +// null input. +checkPublicSuffix(null, null); +// Mixed case. +checkPublicSuffix('COM', null); +checkPublicSuffix('example.COM', 'example.com'); +checkPublicSuffix('WwW.example.COM', 'example.com'); +// Leading dot. +checkPublicSuffix('.com', null); +checkPublicSuffix('.example', null); +checkPublicSuffix('.example.com', null); +checkPublicSuffix('.example.example', null); +// Unlisted TLD. +checkPublicSuffix('example', null); +checkPublicSuffix('example.example', 'example.example'); +checkPublicSuffix('b.example.example', 'example.example'); +checkPublicSuffix('a.b.example.example', 'example.example'); +// Listed, but non-Internet, TLD. +//checkPublicSuffix('local', null); +//checkPublicSuffix('example.local', null); +//checkPublicSuffix('b.example.local', null); +//checkPublicSuffix('a.b.example.local', null); +// TLD with only 1 rule. +checkPublicSuffix('biz', null); +checkPublicSuffix('domain.biz', 'domain.biz'); +checkPublicSuffix('b.domain.biz', 'domain.biz'); +checkPublicSuffix('a.b.domain.biz', 'domain.biz'); +// TLD with some 2-level rules. +checkPublicSuffix('com', null); +checkPublicSuffix('example.com', 'example.com'); +checkPublicSuffix('b.example.com', 'example.com'); +checkPublicSuffix('a.b.example.com', 'example.com'); +checkPublicSuffix('uk.com', null); +checkPublicSuffix('example.uk.com', 'example.uk.com'); +checkPublicSuffix('b.example.uk.com', 'example.uk.com'); +checkPublicSuffix('a.b.example.uk.com', 'example.uk.com'); +checkPublicSuffix('test.ac', 'test.ac'); +// TLD with only 1 (wildcard) rule. +checkPublicSuffix('cy', null); +checkPublicSuffix('c.cy', null); +checkPublicSuffix('b.c.cy', 'b.c.cy'); +checkPublicSuffix('a.b.c.cy', 'b.c.cy'); +// More complex TLD. 
+checkPublicSuffix('jp', null); +checkPublicSuffix('test.jp', 'test.jp'); +checkPublicSuffix('www.test.jp', 'test.jp'); +checkPublicSuffix('ac.jp', null); +checkPublicSuffix('test.ac.jp', 'test.ac.jp'); +checkPublicSuffix('www.test.ac.jp', 'test.ac.jp'); +checkPublicSuffix('kyoto.jp', null); +checkPublicSuffix('test.kyoto.jp', 'test.kyoto.jp'); +checkPublicSuffix('ide.kyoto.jp', null); +checkPublicSuffix('b.ide.kyoto.jp', 'b.ide.kyoto.jp'); +checkPublicSuffix('a.b.ide.kyoto.jp', 'b.ide.kyoto.jp'); +checkPublicSuffix('c.kobe.jp', null); +checkPublicSuffix('b.c.kobe.jp', 'b.c.kobe.jp'); +checkPublicSuffix('a.b.c.kobe.jp', 'b.c.kobe.jp'); +checkPublicSuffix('city.kobe.jp', 'city.kobe.jp'); +checkPublicSuffix('www.city.kobe.jp', 'city.kobe.jp'); +// TLD with a wildcard rule and exceptions. +checkPublicSuffix('ck', null); +checkPublicSuffix('test.ck', null); +checkPublicSuffix('b.test.ck', 'b.test.ck'); +checkPublicSuffix('a.b.test.ck', 'b.test.ck'); +checkPublicSuffix('www.ck', 'www.ck'); +checkPublicSuffix('www.www.ck', 'www.ck'); +// US K12. +checkPublicSuffix('us', null); +checkPublicSuffix('test.us', 'test.us'); +checkPublicSuffix('www.test.us', 'test.us'); +checkPublicSuffix('ak.us', null); +checkPublicSuffix('test.ak.us', 'test.ak.us'); +checkPublicSuffix('www.test.ak.us', 'test.ak.us'); +checkPublicSuffix('k12.ak.us', null); +checkPublicSuffix('test.k12.ak.us', 'test.k12.ak.us'); +checkPublicSuffix('www.test.k12.ak.us', 'test.k12.ak.us'); +// IDN labels. +checkPublicSuffix('食狮.com.cn', '食狮.com.cn'); +checkPublicSuffix('食狮.公司.cn', '食狮.公司.cn'); +checkPublicSuffix('www.食狮.公司.cn', '食狮.公司.cn'); +checkPublicSuffix('shishi.公司.cn', 'shishi.公司.cn'); +checkPublicSuffix('公司.cn', null); +checkPublicSuffix('食狮.中国', '食狮.中国'); +checkPublicSuffix('www.食狮.中国', '食狮.中国'); +checkPublicSuffix('shishi.中国', 'shishi.中国'); +checkPublicSuffix('中国', null); +// Same as above, but punycoded. 
+checkPublicSuffix('xn--85x722f.com.cn', 'xn--85x722f.com.cn'); +checkPublicSuffix('xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn'); +checkPublicSuffix('www.xn--85x722f.xn--55qx5d.cn', 'xn--85x722f.xn--55qx5d.cn'); +checkPublicSuffix('shishi.xn--55qx5d.cn', 'shishi.xn--55qx5d.cn'); +checkPublicSuffix('xn--55qx5d.cn', null); +checkPublicSuffix('xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s'); +checkPublicSuffix('www.xn--85x722f.xn--fiqs8s', 'xn--85x722f.xn--fiqs8s'); +checkPublicSuffix('shishi.xn--fiqs8s', 'shishi.xn--fiqs8s'); +checkPublicSuffix('xn--fiqs8s', null); diff --git a/include/libpsl.h b/include/libpsl.h index b500fcc..eeb4de6 100644 --- a/include/libpsl.h +++ b/include/libpsl.h @@ -64,9 +64,12 @@ const psl_ctx_t * psl_builtin(void); int psl_is_public(const psl_ctx_t *psl, const char *domain); -// return pointer to longest registered domain within 'domain' or NULL if none found +// returns the longest unregistrable domain within 'domain' or NULL if none found const char * - psl_registered_domain(const psl_ctx_t *psl, const char *domain); + psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain); +// returns the shortest possible registrable domain part or NULL if domain is not registrable at all +const char * + psl_registrable_domain(const psl_ctx_t *psl, const char *domain); // does not include exceptions int psl_suffix_count(const psl_ctx_t *psl); diff --git a/src/psl.c b/src/psl.c index 7ddee6a..34450d4 100644 --- a/src/psl.c +++ b/src/psl.c @@ -301,10 +301,13 @@ int psl_is_public(const psl_ctx_t *psl, const char *domain) // return NULL, if string domain does not contain a registered domain // else return a pointer to the longest registered domain within 'domain' -const char *psl_registered_domain(const psl_ctx_t *psl, const char *domain) +const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain) { const char *p, *ret_domain; + if (!psl || !domain) + return NULL; + // We check from right to left, e.g. 
in www.xxx.org we check org, xxx.org, www.xxx.org in this order // for being a registered domain. @@ -325,6 +328,30 @@ const char *psl_registered_domain(const psl_ctx_t *psl, const char *domain) } } +// returns the shortest possible registrable domain part of 'domain'; returns NULL if psl or domain is NULL, if domain starts with '.', or if psl_is_public() rejects every candidate +const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain) +{ + const char *p; + int ispublic; + + if (!psl || !domain || *domain == '.') + return NULL; + + // We check from right to left, e.g. in www.xxx.org we check org, xxx.org, www.xxx.org in this order + // for being a registrable domain; the first candidate psl_is_public() accepts is returned. + + if (!(p = strrchr(domain, '.'))) + p = domain; + + while (!(ispublic = psl_is_public(psl, p)) && p > domain) { + // go left to the next dot, or to the start of domain if there is none + while (p > domain && *--p != '.') + ; + } + + return ispublic ? (*p == '.' ? p + 1 : p) : NULL; // skip the dot p may be parked on +} + psl_ctx_t *psl_load_file(const char *fname) { FILE *fp; diff --git a/tests/Makefile.am b/tests/Makefile.am index bf2514c..d5b9e06 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -2,7 +2,7 @@ DEFS = @DEFS@ -DDATADIR=\"$(top_srcdir)/data\" -DSRCDIR=\"$(srcdir)\" AM_CPPFLAGS = -I$(top_srcdir)/include LDADD = ../src/libpsl-@LIBPSL_API_VERSION@.la -PSL_TESTS = test-is-public test-is-public-builtin test-is-public-all +PSL_TESTS = test-is-public test-is-public-builtin test-is-public-all test-registrable-domain check_PROGRAMS = $(PSL_TESTS) diff --git a/tests/test-is-public-all.c b/tests/test-is-public-all.c index e207625..b8b6f53 100644 --- a/tests/test-is-public-all.c +++ b/tests/test-is-public-all.c @@ -24,7 +24,7 @@ * Test psl_is_public() for all entries in effective_tld_names.dat * * Changelog - * 19.03.2014 Tim Ruehsen created from libmget/cookie.c + * 19.03.2014 Tim Ruehsen created * */ diff --git a/tests/test-registrable-domain.c b/tests/test-registrable-domain.c new file mode 100644 index 0000000..e6ed954 --- /dev/null +++ b/tests/test-registrable-domain.c @@ -0,0 +1,116 @@ +/* + * Copyright(c) 2014 Tim Ruehsen +
* + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * This file is part of the test suite of libpsl. 
+ * + * Test psl_registrable_domain() for all entries in test_psl.txt + * + * Changelog + * 26.03.2014 Tim Ruehsen created + * + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +#include <libpsl.h> + +#define countof(a) (sizeof(a)/sizeof(*(a))) +#define TESTDATA DATADIR"/test_psl.txt" +static int + ok, + failed; + +static void test_psl(void) // checks psl_registrable_domain() against every checkPublicSuffix() line in TESTDATA, counting results in ok/failed +{ + FILE *fp; + const psl_ctx_t *psl; + const char *result; + char buf[256], domain[128], expected_regdom[128], *p; + + psl = psl_builtin(); + + printf("have %d suffixes and %d exceptions\n", psl_suffix_count(psl), psl_suffix_exception_count(psl)); + + if ((fp = fopen(TESTDATA, "r"))) { + while ((fgets(buf, sizeof(buf), fp))) { + if (sscanf(buf, " checkPublicSuffix('%127[^']' , '%127[^']", domain, expected_regdom) != 2) { // no quoted expectation: try a bare null + if (sscanf(buf, " checkPublicSuffix('%127[^']' , %127[nul]", domain, expected_regdom) != 2) + continue; // not a usable test line (comment, blank, or unquoted domain) + } + + // we have to lowercase the domain - the PSL API just takes lowercase (unsigned char casts keep the non-ASCII IDN bytes valid for <ctype.h>) + for (p = domain; *p; p++) + if (isupper((unsigned char)*p)) + *p = tolower((unsigned char)*p); + + result = psl_registrable_domain(psl, domain); + + if (result == NULL) { + if (strcmp(expected_regdom, "null")) { + failed++; + printf("psl_registrable_domain(%s)=NULL (expected %s)\n", domain, expected_regdom); + } else ok++; + } else { + if (strcmp(expected_regdom, result)) { + failed++; + printf("psl_registrable_domain(%s)=%s (expected %s)\n", domain, result, expected_regdom); + } else ok++; + } + } + + fclose(fp); + } else { + printf("Failed to open %s\n", TESTDATA); + failed++; + } +} + +int main(int argc, const char * const *argv) // exit status 0 if every test passed, 1 otherwise +{ + // if VALGRIND testing is enabled, we have to call ourselves with valgrind checking + if (argc == 1) { + const char *valgrind = getenv("TESTS_VALGRIND"); + + if (valgrind && *valgrind) { + char cmd[strlen(valgrind)+strlen(argv[0])+32]; + + snprintf(cmd, sizeof(cmd), "TESTS_VALGRIND="" %s %s", valgrind, argv[0]); + return system(cmd) != 0; + } + } + + test_psl(); + + if (failed) { +
printf("Summary: %d out of %d tests failed\n", failed, ok + failed); + return 1; + } + + printf("Summary: All %d tests passed\n", ok + failed); + return 0; +}