diff --git a/docs/libpsl/libpsl-sections.txt b/docs/libpsl/libpsl-sections.txt index b2b1617..5e758e7 100644 --- a/docs/libpsl/libpsl-sections.txt +++ b/docs/libpsl/libpsl-sections.txt @@ -17,4 +17,5 @@ psl_builtin_sha1sum psl_builtin_filename psl_is_cookie_domain_acceptable psl_get_version +psl_str_to_utf8lower diff --git a/include/libpsl.h b/include/libpsl.h index 265bdf6..dff8974 100644 --- a/include/libpsl.h +++ b/include/libpsl.h @@ -65,6 +65,9 @@ const char * /* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */ const char * psl_registrable_domain(const psl_ctx_t *psl, const char *domain); +/* convert a string into lowercase UTF-8 */ +int + psl_str_to_utf8lower(const char *s, const char *encoding, const char *locale, char **lower); /* does not include exceptions */ int psl_suffix_count(const psl_ctx_t *psl); diff --git a/src/psl.c b/src/psl.c index 881052e..133b66f 100644 --- a/src/psl.c +++ b/src/psl.c @@ -49,9 +49,20 @@ #include #include #include +#include + +#ifdef WITH_LIBICU +# include +# include +# include +# include +#endif #include +/* number of elements within an array */ +#define countof(a) (sizeof(a)/sizeof(*(a))) + /** * SECTION:libpsl * @short_description: Public Suffix List library functions @@ -239,39 +250,19 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length) for (dst = suffix->label_buf, src = rule; *src;) { if (*src == '.') suffix->nlabels++; - *dst++ = tolower(*src++); + *dst++ = *src++; } *dst = 0; return 0; } -/** - * psl_is_public_suffix: - * @psl: PSL context - * @domain: Domain string - * - * This function checks if @domain is a public suffix by the means of the - * [Mozilla Public Suffix List](http://publicsuffix.org). - * - * For cookie domain checking see psl_is_cookie_domain_acceptable(). - * - * @psl is a context returned by either psl_load_file(), psl_load_fp() or - * psl_builtin(). - * - * Returns: 1 if domain is a public suffix, 0 if not. 
- * - * Since: 0.1 - */ -int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) +static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) { _psl_entry_t suffix, *rule; const char *p, *label_bak; unsigned short length_bak; - if (!psl || !domain) - return 1; - /* this function should be called without leading dots, just make sure */ suffix.label = domain + (*domain == '.'); suffix.length = strlen(suffix.label); @@ -340,6 +331,31 @@ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) return 0; } +/** + * psl_is_public_suffix: + * @psl: PSL context + * @domain: Domain string + * + * This function checks if @domain is a public suffix by the means of the + * [Mozilla Public Suffix List](http://publicsuffix.org). + * + * For cookie domain checking see psl_is_cookie_domain_acceptable(). + * + * @psl is a context returned by either psl_load_file(), psl_load_fp() or + * psl_builtin(). + * + * Returns: 1 if domain is a public suffix, 0 if not. + * + * Since: 0.1 + */ +int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) +{ + if (!psl || !domain) + return 1; + + return _psl_is_public_suffix(psl, domain); +} + /** * psl_unregistrable_domain: * @psl: PSL context @@ -366,7 +382,7 @@ const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain) * 'forgot.his.name' and 'name' are in the PSL while 'his.name' is not. */ - while (!psl_is_public_suffix(psl, domain)) { + while (!_psl_is_public_suffix(psl, domain)) { if ((domain = strchr(domain, '.'))) domain++; else @@ -404,7 +420,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain) * 'forgot.his.name' and 'name' are in the PSL while 'his.name' is not. 
*/ - while (!psl_is_public_suffix(psl, domain)) { + while (!_psl_is_public_suffix(psl, domain)) { if ((p = strchr(domain, '.'))) { regdom = domain; domain = p + 1; @@ -415,6 +431,51 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain) return regdom; } +static int _str_is_ascii(const char *s) +{ + while (*s > 0) s++; + + return !*s; +} + +static void _add_punycode_if_needed(UIDNA *idna, _psl_vector_t *v, _psl_entry_t *e) +{ + if (_str_is_ascii(e->label_buf)) + return; + +#ifdef WITH_LIBICU + /* IDNA2008 UTS#46 punycode conversion */ + if (idna) { + _psl_entry_t suffix, *suffixp; + char lookupname[128] = ""; + UErrorCode status = 0; + UIDNAInfo info = UIDNA_INFO_INITIALIZER; + UChar utf16_dst[128], utf16_src[128]; + int32_t utf16_src_length; + + u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, e->label_buf, -1, &status); + if (U_SUCCESS(status)) { + int32_t dst_length = uidna_nameToASCII(idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status); + if (U_SUCCESS(status)) { + u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status); + if (U_SUCCESS(status)) { + if (strcmp(e->label_buf, lookupname)) { + /* fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */ + _suffix_init(&suffix, lookupname, strlen(lookupname)); + suffix.wildcard = e->wildcard; + suffixp = _vector_get(v, _vector_add(v, &suffix)); + suffixp->label = suffixp->label_buf; /* set label to changed address */ + } /* else ignore */ + } /* else + fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */ + } /* else + fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); */ + } /* else + fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); */ + } +#endif +} + /** * psl_load_file: * @fname: Name of PSL file @@ -422,13 +483,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain) * 
This function loads the public suffixes file named @fname. * To free the allocated resources, call psl_free(). * - * If you want to use punycode representations for functions like psl_is_public_suffix(), - * these have to exist as entries within @fname. This is a design decision to not pull in - * dependencies for UTF-8 case-handling and IDNA libraries. - * - * On the contrary, the builtin data already contains punycode entries. - * - * Have a look into psl2c.c for example code on how to convert UTF-8 to lowercase and to punycode. + * The suffixes are expected to be lowercase UTF-8 encoded. * * Returns: Pointer to a PSL context or %NULL on failure. * @@ -457,7 +512,7 @@ psl_ctx_t *psl_load_file(const char *fname) * This function loads the public suffixes from a FILE pointer. * To free the allocated resources, call psl_free(). * - * Have a look at psl_load_fp() for punycode considerations. + * The suffixes are expected to be lowercase UTF-8 encoded. * * Returns: Pointer to a PSL context or %NULL on failure. * @@ -467,8 +522,11 @@ psl_ctx_t *psl_load_fp(FILE *fp) { psl_ctx_t *psl; _psl_entry_t suffix, *suffixp; - int nsuffixes = 0; char buf[256], *linep, *p; +#ifdef WITH_LIBICU + UIDNA *idna; + UErrorCode status = 0; +#endif if (!fp) return NULL; @@ -476,6 +534,10 @@ psl_ctx_t *psl_load_fp(FILE *fp) if (!(psl = calloc(1, sizeof(psl_ctx_t)))) return NULL; +#ifdef WITH_LIBICU + idna = uidna_openUTS46(UIDNA_USE_STD3_RULES, &status); +#endif + /* * as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions. * as of 19.02.2014, the list at http://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions. 
@@ -496,26 +558,29 @@ psl_ctx_t *psl_load_fp(FILE *fp) if (*p == '!') { /* add to exceptions */ - if (_suffix_init(&suffix, p + 1, linep - p - 1) == 0) + if (_suffix_init(&suffix, p + 1, linep - p - 1) == 0) { suffixp = _vector_get(psl->suffix_exceptions, _vector_add(psl->suffix_exceptions, &suffix)); - else - suffixp = NULL; + suffixp->label = suffixp->label_buf; /* set label to changed address */ + _add_punycode_if_needed(idna, psl->suffix_exceptions, suffixp); + } } else { - if (_suffix_init(&suffix, p, linep - p) == 0) + /* add to suffixes */ + if (_suffix_init(&suffix, p, linep - p) == 0) { suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix)); - else - suffixp = NULL; + suffixp->label = suffixp->label_buf; /* set label to changed address */ + _add_punycode_if_needed(idna, psl->suffixes, suffixp); + } } - - if (suffixp) - suffixp->label = suffixp->label_buf; /* set label to changed address */ - - nsuffixes++;; } _vector_sort(psl->suffix_exceptions); _vector_sort(psl->suffixes); +#ifdef WITH_LIBICU + if (idna) + uidna_close(idna); +#endif + return psl; } @@ -685,7 +750,11 @@ const char *psl_builtin_filename(void) **/ const char *psl_get_version (void) { +#ifdef WITH_LIBICU + return PACKAGE_VERSION " +libicu/" U_ICU_VERSION; +#else return PACKAGE_VERSION; +#endif } /** @@ -741,3 +810,81 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, return 0; } + +/** + * psl_str_to_utf8lower: + * @str: string to convert + * @encoding: charset encoding of @str, e.g. 'iso-8859-1' or %NULL + * @locale: locale of @str for to lowercase conversion, e.g. 'de' or %NULL + * @lower: return value containing the converted string + * + * This helper function converts a string to lowercase UTF-8 representation. + * Lowercase UTF-8 is needed as input to the domain checking functions. + * + * The return value 'lower' must be freed after usage. + * + * Returns: 0 on success, negative value on error. 
+ * -2 failed to open converter with name @encoding + * -3 failed to convert @str to unicode + * -4 failed to convert unicode to lowercase + * -5 failed to convert unicode to UTF-8 + * + * Since: 0.4 + */ +int psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower) +{ + int ret = -1; + + if (lower) + *lower = NULL; + + if (!str) + return 0; + +#ifdef WITH_LIBICU + size_t str_length = strlen(str); + UErrorCode status = 0; + UChar *utf16_dst, *utf16_lower; + int32_t utf16_dst_length; + char *utf8_lower; + UConverter *uconv; + + /* C89 allocation */ + utf16_dst = alloca(sizeof(UChar) * (str_length * 2 + 1)); + utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1)); + utf8_lower = alloca(str_length * 8 + 1); /* must be as large as the capacity passed to u_strToUTF8() below, else the conversion can write past the buffer */ + + uconv = ucnv_open(encoding, &status); + if (U_SUCCESS(status)) { + utf16_dst_length = ucnv_toUChars(uconv, utf16_dst, str_length * 2 + 1, str, str_length, &status); + ucnv_close(uconv); + + if (U_SUCCESS(status)) { + int32_t utf16_lower_length = u_strToLower(utf16_lower, str_length * 2 + 1, utf16_dst, utf16_dst_length, locale, &status); + if (U_SUCCESS(status)) { + u_strToUTF8(utf8_lower, str_length * 8 + 1, NULL, utf16_lower, utf16_lower_length, &status); + if (U_SUCCESS(status)) { + if (lower) + *lower = strdup(utf8_lower); + ret = 0; + } else { + ret = -5; + fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); + } + } else { + ret = -4; + fprintf(stderr, "Failed to convert UTF-16 to lowercase (status %d)\n", status); + } + } else { + ret = -3; + fprintf(stderr, "Failed to convert string to UTF-16 (status %d)\n", status); + } + } else { + ret = -2; + fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status); + } + +#endif + + return ret; +} diff --git a/tests/test-is-public.c b/tests/test-is-public.c index 1f3b8b3..fdc60b9 100644 --- a/tests/test-is-public.c +++ b/tests/test-is-public.c @@ -65,7 +65,7 @@ static void test_psl(void) { "abc.www.ck", 0 }, { "xxx.ck", 1 }, 
{ "www.xxx.ck", 0 }, - { "\345\225\206\346\240\207", 1 }, /* xn--czr694b oder 商标 */ + { "\345\225\206\346\240\207", 1 }, /* xn--czr694b or 商标 */ { "www.\345\225\206\346\240\207", 0 }, /* some special test follow ('name' and 'forgot.his.name' are public, but e.g. his.name is not) */ { "name", 1 }, diff --git a/tests/test-registrable-domain.c b/tests/test-registrable-domain.c index 715498c..8c492fb 100644 --- a/tests/test-registrable-domain.c +++ b/tests/test-registrable-domain.c @@ -52,53 +52,10 @@ static int static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result) { const char *result; - char lookupname[128]; + char *lower; - /* check if there might be some utf-8 characters */ - if (domain) { - int utf8; - const char *p; - - for (p = domain, utf8 = 0; *p && !utf8; p++) - if (*p < 0) - utf8 = 1; - - /* if we found utf-8, make sure to convert domain correctly to lowercase */ - /* does it work, if we are not in a utf-8 env ? */ - if (utf8) { -#ifdef WITH_LIBICU - UErrorCode status = 0; - UChar utf16_dst[64], utf16_src[64]; - int32_t utf16_src_length; - - /* UTF-8 to lowercase conversion */ - u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, domain, (int32_t) strlen(domain), &status); - if (U_SUCCESS(status)) { - int32_t dst_length = u_strToLower(utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), utf16_src, -1, "en", &status); - if (U_SUCCESS(status)) { - u_strToUTF8(lookupname, (int32_t) sizeof(lookupname), NULL, utf16_dst, dst_length, &status); - if (U_SUCCESS(status)) { - domain = lookupname; - } else - fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); - } else - fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); - } else - fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); -#else - FILE *pp; - size_t cmdsize = 48 + strlen(domain); - char *cmd = alloca(cmdsize); - - snprintf(cmd, cmdsize, "echo -n '%s' | sed -e 's/./\\L\\0/g'", 
domain); - if ((pp = popen(cmd, "r"))) { - if (fscanf(pp, "%127s", lookupname) >= 1) - domain = lookupname; - pclose(pp); - } -#endif - } - } + if (psl_str_to_utf8lower(domain, NULL, NULL, &lower) == 0) + domain = lower; result = psl_registrable_domain(psl, domain); @@ -115,7 +72,7 @@ static void test_psl(void) { FILE *fp; const psl_ctx_t *psl; - char buf[256], domain[128], expected_regdom[128], *p; + char buf[256], domain[128], expected_regdom[128]; psl = psl_builtin(); @@ -152,11 +109,6 @@ static void test_psl(void) continue; } - /* we have to lowercase the domain - the PSL API just takes lowercase */ - for (p = domain; *p; p++) - if (*p > 0 && isupper(*p)) - *p = tolower(*p); - if (!strcmp(expected_regdom, "null")) test(psl, domain, NULL); else