added psl_str_to_utf8lower()

This commit is contained in:
Tim Ruehsen 2014-06-17 17:14:02 +02:00
parent 9fb63f7a49
commit 57394eb1f8
5 changed files with 200 additions and 97 deletions

View File

@ -17,4 +17,5 @@ psl_builtin_sha1sum
psl_builtin_filename
psl_is_cookie_domain_acceptable
psl_get_version
psl_str_to_utf8lower
</SECTION>

View File

@ -65,6 +65,9 @@ const char *
/* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */
const char *
psl_registrable_domain(const psl_ctx_t *psl, const char *domain);
/* convert a string into lowercase UTF-8 */
int
psl_str_to_utf8lower(const char *s, const char *encoding, const char *locale, char **lower);
/* does not include exceptions */
int
psl_suffix_count(const psl_ctx_t *psl);

235
src/psl.c
View File

@ -49,9 +49,20 @@
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <alloca.h>
#ifdef WITH_LIBICU
# include <unicode/uversion.h>
# include <unicode/ustring.h>
# include <unicode/uidna.h>
# include <unicode/ucnv.h>
#endif
#include <libpsl.h>
/* number of elements within an array */
#define countof(a) (sizeof(a)/sizeof(*(a)))
/**
* SECTION:libpsl
* @short_description: Public Suffix List library functions
@ -239,39 +250,19 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length)
for (dst = suffix->label_buf, src = rule; *src;) {
if (*src == '.')
suffix->nlabels++;
*dst++ = tolower(*src++);
*dst++ = *src++;
}
*dst = 0;
return 0;
}
/**
* psl_is_public_suffix:
* @psl: PSL context
* @domain: Domain string
*
* This function checks if @domain is a public suffix by the means of the
* [Mozilla Public Suffix List](http://publicsuffix.org).
*
* For cookie domain checking see psl_is_cookie_domain_acceptable().
*
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
* psl_builtin().
*
* Returns: 1 if domain is a public suffix, 0 if not.
*
* Since: 0.1
*/
int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
{
_psl_entry_t suffix, *rule;
const char *p, *label_bak;
unsigned short length_bak;
if (!psl || !domain)
return 1;
/* this function should be called without leading dots, just make sure */
suffix.label = domain + (*domain == '.');
suffix.length = strlen(suffix.label);
@ -340,6 +331,31 @@ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
return 0;
}
/**
 * psl_is_public_suffix:
 * @psl: PSL context
 * @domain: Domain string
 *
 * Checks whether @domain is a public suffix according to the
 * [Mozilla Public Suffix List](http://publicsuffix.org).
 *
 * Use psl_is_cookie_domain_acceptable() for cookie domain checking.
 *
 * @psl has to be a context obtained from psl_load_file(), psl_load_fp() or
 * psl_builtin().
 *
 * Returns: 1 if domain is a public suffix, 0 if not.
 * If @psl or @domain is %NULL, 1 is returned.
 *
 * Since: 0.1
 */
int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
{
	/* argument validation happens here, the internal worker does the lookup */
	return (psl && domain) ? _psl_is_public_suffix(psl, domain) : 1;
}
/**
* psl_unregistrable_domain:
* @psl: PSL context
@ -366,7 +382,7 @@ const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain)
* 'forgot.his.name' and 'name' are in the PSL while 'his.name' is not.
*/
while (!psl_is_public_suffix(psl, domain)) {
while (!_psl_is_public_suffix(psl, domain)) {
if ((domain = strchr(domain, '.')))
domain++;
else
@ -404,7 +420,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
* 'forgot.his.name' and 'name' are in the PSL while 'his.name' is not.
*/
while (!psl_is_public_suffix(psl, domain)) {
while (!_psl_is_public_suffix(psl, domain)) {
if ((p = strchr(domain, '.'))) {
regdom = domain;
domain = p + 1;
@ -415,6 +431,51 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
return regdom;
}
/*
 * Returns 1 if the NUL-terminated string s consists solely of 7-bit ASCII
 * bytes (the empty string included), 0 otherwise.
 *
 * The bytes are inspected as unsigned char, so the result does not depend on
 * whether plain char is signed or unsigned on the target platform.
 */
static int _str_is_ascii(const char *s)
{
	const unsigned char *p = (const unsigned char *) s;

	while (*p && *p < 0x80)
		p++;

	/* we stopped either at the terminating 0 (all ASCII) or at a byte >= 0x80 */
	return !*p;
}
/*
 * If the label of entry e contains non-ASCII bytes, convert it to its
 * punycode (ASCII) representation and append that as an additional entry
 * to vector v. The wildcard flag of e is copied to the new entry.
 *
 * idna: UTS#46 IDNA handle, may be NULL (then nothing is added)
 * v:    vector that receives the additional punycode entry
 * e:    entry whose label_buf is inspected
 *
 * Any conversion failure is silently ignored, the punycode entry is then
 * simply not added. Without WITH_LIBICU this function adds nothing.
 *
 * NOTE(review): UIDNA is declared by <unicode/uidna.h>, which this file only
 * includes when WITH_LIBICU is defined - confirm that builds without libicu
 * have a declaration of UIDNA in scope for this signature.
 */
static void _add_punycode_if_needed(UIDNA *idna, _psl_vector_t *v, _psl_entry_t *e)
{
	if (_str_is_ascii(e->label_buf))
		return;

#ifdef WITH_LIBICU
	/* IDNA2008 UTS#46 punycode conversion */
	if (idna) {
		_psl_entry_t suffix, *suffixp;
		char lookupname[128] = "";
		UErrorCode status = U_ZERO_ERROR;
		UIDNAInfo info = UIDNA_INFO_INITIALIZER;
		UChar utf16_dst[128], utf16_src[128];
		int32_t utf16_src_length, dst_length;

		u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, e->label_buf, -1, &status);
		if (U_FAILURE(status))
			return; /* failed to convert UTF-8 to UTF-16 */

		dst_length = uidna_nameToASCII(idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
		if (U_FAILURE(status))
			return; /* failed to convert to ASCII */

		/*
		 * Leave the last byte of the zero-initialized buffer untouched.
		 * ICU reports a completely filled buffer just as a warning (which
		 * counts as success), the result would then lack the terminating 0.
		 */
		u_strToUTF8(lookupname, sizeof(lookupname) - 1, NULL, utf16_dst, dst_length, &status);
		if (U_FAILURE(status))
			return; /* failed to convert UTF-16 to UTF-8 */

		if (!strcmp(e->label_buf, lookupname))
			return; /* conversion did not change anything, ignore */

		/* don't add a half-initialized entry if the rule is rejected */
		if (_suffix_init(&suffix, lookupname, strlen(lookupname)) != 0)
			return;

		suffix.wildcard = e->wildcard;
		suffixp = _vector_get(v, _vector_add(v, &suffix));
		if (suffixp)
			suffixp->label = suffixp->label_buf; /* set label to changed address */
	}
#endif
}
/**
* psl_load_file:
* @fname: Name of PSL file
@ -422,13 +483,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
* This function loads the public suffixes file named @fname.
* To free the allocated resources, call psl_free().
*
* If you want to use punycode representations for functions like psl_is_public_suffix(),
* these have to exist as entries within @fname. This is a design decision to not pull in
* dependencies for UTF-8 case-handling and IDNA libraries.
*
* On the contrary, the builtin data already contains punycode entries.
*
* Have a look into psl2c.c for example code on how to convert UTF-8 to lowercase and to punycode.
* The suffixes are expected to be lowercase UTF-8 encoded.
*
* Returns: Pointer to a PSL context or %NULL on failure.
*
@ -457,7 +512,7 @@ psl_ctx_t *psl_load_file(const char *fname)
* This function loads the public suffixes from a FILE pointer.
* To free the allocated resources, call psl_free().
*
* Have a look at psl_load_fp() for punycode considerations.
* The suffixes are expected to be lowercase UTF-8 encoded.
*
* Returns: Pointer to a PSL context or %NULL on failure.
*
@ -467,8 +522,11 @@ psl_ctx_t *psl_load_fp(FILE *fp)
{
psl_ctx_t *psl;
_psl_entry_t suffix, *suffixp;
int nsuffixes = 0;
char buf[256], *linep, *p;
#ifdef WITH_LIBICU
UIDNA *idna;
UErrorCode status = 0;
#endif
if (!fp)
return NULL;
@ -476,6 +534,10 @@ psl_ctx_t *psl_load_fp(FILE *fp)
if (!(psl = calloc(1, sizeof(psl_ctx_t))))
return NULL;
#ifdef WITH_LIBICU
idna = uidna_openUTS46(UIDNA_USE_STD3_RULES, &status);
#endif
/*
* as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions.
* as of 19.02.2014, the list at http://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions.
@ -496,26 +558,29 @@ psl_ctx_t *psl_load_fp(FILE *fp)
if (*p == '!') {
/* add to exceptions */
if (_suffix_init(&suffix, p + 1, linep - p - 1) == 0)
if (_suffix_init(&suffix, p + 1, linep - p - 1) == 0) {
suffixp = _vector_get(psl->suffix_exceptions, _vector_add(psl->suffix_exceptions, &suffix));
else
suffixp = NULL;
suffixp->label = suffixp->label_buf; /* set label to changed address */
_add_punycode_if_needed(idna, psl->suffix_exceptions, suffixp);
}
} else {
if (_suffix_init(&suffix, p, linep - p) == 0)
/* add to suffixes */
if (_suffix_init(&suffix, p, linep - p) == 0) {
suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix));
else
suffixp = NULL;
suffixp->label = suffixp->label_buf; /* set label to changed address */
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
}
}
if (suffixp)
suffixp->label = suffixp->label_buf; /* set label to changed address */
nsuffixes++;;
}
_vector_sort(psl->suffix_exceptions);
_vector_sort(psl->suffixes);
#ifdef WITH_LIBICU
if (idna)
uidna_close(idna);
#endif
return psl;
}
@ -685,7 +750,11 @@ const char *psl_builtin_filename(void)
**/
const char *psl_get_version (void)
{
	/* builds with WITH_LIBICU append the version of the libicu they were compiled against */
	static const char version[] = PACKAGE_VERSION
#ifdef WITH_LIBICU
		" +libicu/" U_ICU_VERSION
#endif
		;

	return version;
}
/**
@ -741,3 +810,81 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
return 0;
}
/**
 * psl_str_to_utf8lower:
 * @str: string to convert
 * @encoding: charset encoding of @str, e.g. 'iso-8859-1' or %NULL
 * @locale: locale of @str for the lowercase conversion, e.g. 'de' or %NULL
 * @lower: return value containing the converted string
 *
 * This helper function converts a string to lowercase UTF-8 representation.
 * Lowercase UTF-8 is needed as input to the domain checking functions.
 *
 * If @lower is not %NULL, it is set to %NULL on entry and to a newly
 * allocated string on success. That string must be released with free()
 * after usage. If @str is %NULL, 0 is returned and @lower stays %NULL.
 *
 * When built without libicu, only strings that consist purely of 7-bit ASCII
 * bytes can be converted; @encoding and @locale are ignored in that case.
 *
 * Returns: 0 on success, negative value on error.
 *   -1 out of memory, or non-ASCII input in a build without libicu
 *   -2 failed to open converter with name @encoding
 *   -3 failed to convert @str to unicode
 *   -4 failed to convert unicode to lowercase
 *   -5 failed to convert unicode to UTF-8
 *
 * Since: 0.4
 */
int psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower)
{
	int ret = -1;

	if (lower)
		*lower = NULL;

	if (!str)
		return 0;

#ifdef WITH_LIBICU
	{
		/* ICU takes int32_t sizes, this limit keeps all computed capacities in range */
		const size_t max_length = (size_t) 0x7fffffff / 8;
		size_t str_length = strlen(str);
		UErrorCode status = U_ZERO_ERROR;
		UChar *utf16_dst = NULL, *utf16_lower = NULL;
		char *utf8_lower = NULL;
		int32_t utf16_capacity, utf16_dst_length, utf16_lower_length, utf8_capacity;
		UConverter *uconv;

		if (str_length > max_length) {
			fprintf(stderr, "Failed to convert string to UTF-16 (input too long)\n");
			return -3;
		}

		/* heap instead of alloca(): the input length is controlled by the caller */
		utf16_capacity = (int32_t) (str_length * 2 + 1);
		utf16_dst = malloc((size_t) utf16_capacity * sizeof(UChar));
		utf16_lower = malloc((size_t) utf16_capacity * sizeof(UChar));
		if (!utf16_dst || !utf16_lower)
			goto out; /* ret is still -1 */

		uconv = ucnv_open(encoding, &status);
		if (U_FAILURE(status)) {
			ret = -2;
			fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding ? encoding : "(default)", status);
			goto out;
		}

		utf16_dst_length = ucnv_toUChars(uconv, utf16_dst, utf16_capacity, str, (int32_t) str_length, &status);
		ucnv_close(uconv);
		if (U_FAILURE(status)) {
			ret = -3;
			fprintf(stderr, "Failed to convert string to UTF-16 (status %d)\n", status);
			goto out;
		}

		utf16_lower_length = u_strToLower(utf16_lower, utf16_capacity, utf16_dst, utf16_dst_length, locale, &status);
		if (U_FAILURE(status)) {
			ret = -4;
			fprintf(stderr, "Failed to convert UTF-16 to lowercase (status %d)\n", status);
			goto out;
		}

		/* one UTF-16 code unit becomes at most 3 UTF-8 bytes, plus the terminating 0 */
		utf8_capacity = utf16_lower_length * 3 + 1;
		if (!(utf8_lower = malloc((size_t) utf8_capacity)))
			goto out; /* ret is still -1 */

		u_strToUTF8(utf8_lower, utf8_capacity, NULL, utf16_lower, utf16_lower_length, &status);
		if (U_FAILURE(status)) {
			ret = -5;
			fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status);
			goto out;
		}

		if (lower) {
			/* hand the buffer over to the caller */
			*lower = utf8_lower;
			utf8_lower = NULL;
		}
		ret = 0;

out:
		free(utf8_lower);
		free(utf16_lower);
		free(utf16_dst);
	}
#else
	{
		size_t length = strlen(str), i;
		char *copy;

		(void) encoding;
		(void) locale;

		/* pure ASCII is valid UTF-8, everything else needs a real converter */
		for (i = 0; i < length; i++) {
			if ((unsigned char) str[i] >= 0x80)
				return ret;
		}

		if (lower) {
			if (!(copy = malloc(length + 1)))
				return ret;

			/* locale independent ASCII lowercasing */
			for (i = 0; i < length; i++) {
				char c = str[i];

				copy[i] = (c >= 'A' && c <= 'Z') ? (char) (c - 'A' + 'a') : c;
			}
			copy[length] = 0;
			*lower = copy;
		}
		ret = 0;
	}
#endif

	return ret;
}

View File

@ -65,7 +65,7 @@ static void test_psl(void)
{ "abc.www.ck", 0 },
{ "xxx.ck", 1 },
{ "www.xxx.ck", 0 },
{ "\345\225\206\346\240\207", 1 }, /* xn--czr694b oder 商标 */
{ "\345\225\206\346\240\207", 1 }, /* xn--czr694b or 商标 */
{ "www.\345\225\206\346\240\207", 0 },
/* some special tests follow ('name' and 'forgot.his.name' are public, but e.g. his.name is not) */
{ "name", 1 },

View File

@ -52,53 +52,10 @@ static int
static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result)
{
const char *result;
char lookupname[128];
char *lower;
/* check if there might be some utf-8 characters */
if (domain) {
int utf8;
const char *p;
for (p = domain, utf8 = 0; *p && !utf8; p++)
if (*p < 0)
utf8 = 1;
/* if we found utf-8, make sure to convert domain correctly to lowercase */
/* does it work, if we are not in a utf-8 env ? */
if (utf8) {
#ifdef WITH_LIBICU
UErrorCode status = 0;
UChar utf16_dst[64], utf16_src[64];
int32_t utf16_src_length;
/* UTF-8 to lowercase conversion */
u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, domain, (int32_t) strlen(domain), &status);
if (U_SUCCESS(status)) {
int32_t dst_length = u_strToLower(utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), utf16_src, -1, "en", &status);
if (U_SUCCESS(status)) {
u_strToUTF8(lookupname, (int32_t) sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
if (U_SUCCESS(status)) {
domain = lookupname;
} else
fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status);
} else
fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status);
} else
fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status);
#else
FILE *pp;
size_t cmdsize = 48 + strlen(domain);
char *cmd = alloca(cmdsize);
snprintf(cmd, cmdsize, "echo -n '%s' | sed -e 's/./\\L\\0/g'", domain);
if ((pp = popen(cmd, "r"))) {
if (fscanf(pp, "%127s", lookupname) >= 1)
domain = lookupname;
pclose(pp);
}
#endif
}
}
if (psl_str_to_utf8lower(domain, NULL, NULL, &lower) == 0)
domain = lower;
result = psl_registrable_domain(psl, domain);
@ -115,7 +72,7 @@ static void test_psl(void)
{
FILE *fp;
const psl_ctx_t *psl;
char buf[256], domain[128], expected_regdom[128], *p;
char buf[256], domain[128], expected_regdom[128];
psl = psl_builtin();
@ -152,11 +109,6 @@ static void test_psl(void)
continue;
}
/* we have to lowercase the domain - the PSL API just takes lowercase */
for (p = domain; *p; p++)
if (*p > 0 && isupper(*p))
*p = tolower(*p);
if (!strcmp(expected_regdom, "null"))
test(psl, domain, NULL);
else