added psl_str_to_utf8lower()

2014-06-17 17:14:02 +02:00 · 2014-06-17 17:14:02 +02:00 · 57394eb1f8
parent 9fb63f7a49
commit 57394eb1f8
5 changed files with 200 additions and 97 deletions
--- a/docs/libpsl/libpsl-sections.txt
+++ b/docs/libpsl/libpsl-sections.txt
@ -17,4 +17,5 @@ psl_builtin_sha1sum
 psl_builtin_filename
 psl_is_cookie_domain_acceptable
 psl_get_version
 psl_str_to_utf8lower
 </SECTION>
--- a/include/libpsl.h
+++ b/include/libpsl.h
@ -65,6 +65,9 @@ const char *
 /* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */
 const char *
 	psl_registrable_domain(const psl_ctx_t *psl, const char *domain);
 /* convert a string into lowercase UTF-8 */
 int
 	psl_str_to_utf8lower(const char *s, const char *encoding, const char *locale, char **lower);
 /* does not include exceptions */
 int
 	psl_suffix_count(const psl_ctx_t *psl);
--- a/src/psl.c
+++ b/src/psl.c
@ -49,9 +49,20 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #include <alloca.h>
 #ifdef WITH_LIBICU
 #	include <unicode/uversion.h>
 #	include <unicode/ustring.h>
 #	include <unicode/uidna.h>
 #	include <unicode/ucnv.h>
 #endif
 #include <libpsl.h>
 /* number of elements within an array */
 #define countof(a) (sizeof(a)/sizeof(*(a)))
 /**
 * SECTION:libpsl
 * @short_description: Public Suffix List library functions
@ -239,39 +250,19 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length)
 	for (dst = suffix->label_buf, src = rule; *src;) {
 		if (*src == '.')
 			suffix->nlabels++;
-		*dst++ = tolower(*src++);
+		*dst++ = *src++;
 	}
 	*dst = 0;
 	return 0;
 }
-/**
+static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
 * psl_is_public_suffix:
 * @psl: PSL context
 * @domain: Domain string
 *
 * This function checks if @domain is a public suffix by the means of the
 * [Mozilla Public Suffix List](http://publicsuffix.org).
 *
 * For cookie domain checking see psl_is_cookie_domain_acceptable().
 *
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
 * psl_builtin().
 *
 * Returns: 1 if domain is a public suffix, 0 if not.
 *
 * Since: 0.1
 */
 int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
 {
 	_psl_entry_t suffix, *rule;
 	const char *p, *label_bak;
 	unsigned short length_bak;
 	if (!psl || !domain)
 		return 1;
 	/* this function should be called without leading dots, just make sure */
 	suffix.label = domain + (*domain == '.');
 	suffix.length = strlen(suffix.label);
@ -340,6 +331,31 @@ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
 	return 0;
 }
 /**
 * psl_is_public_suffix:
 * @psl: PSL context
 * @domain: Domain string
 *
 * This function checks if @domain is a public suffix by the means of the
 * [Mozilla Public Suffix List](http://publicsuffix.org).
 *
 * For cookie domain checking see psl_is_cookie_domain_acceptable().
 *
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
 * psl_builtin().
 *
 * Returns: 1 if domain is a public suffix, 0 if not.
 *
 * Since: 0.1
 */
 int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
 {
 	if (!psl || !domain)
 		return 1;
 	return _psl_is_public_suffix(psl, domain);
 }
 /**
 * psl_unregistrable_domain:
 * @psl: PSL context
@ -366,7 +382,7 @@ const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain)
 	 *   'forgot.his.name' and 'name' are in the PSL while 'his.name' is not.
 	 */
-	while (!psl_is_public_suffix(psl, domain)) {
+	while (!_psl_is_public_suffix(psl, domain)) {
 		if ((domain = strchr(domain, '.')))
 			domain++;
 		else
@ -404,7 +420,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
 	 *   'forgot.his.name' and 'name' are in the PSL while 'his.name' is not.
 	 */
-	while (!psl_is_public_suffix(psl, domain)) {
+	while (!_psl_is_public_suffix(psl, domain)) {
 		if ((p = strchr(domain, '.'))) {
 			regdom = domain;
 			domain = p + 1;
@ -415,6 +431,51 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
 	return regdom;
 }
 static int _str_is_ascii(const char *s)
 {
 	while (*s > 0) s++;
 	return !*s;
 }
 static void _add_punycode_if_needed(UIDNA *idna, _psl_vector_t *v, _psl_entry_t *e)
 {
 	if (_str_is_ascii(e->label_buf))
 		return;
 #ifdef WITH_LIBICU
 	/* IDNA2008 UTS#46 punycode conversion */
 	if (idna) {
 		_psl_entry_t suffix, *suffixp;
 		char lookupname[128] = "";
 		UErrorCode status = 0;
 		UIDNAInfo info = UIDNA_INFO_INITIALIZER;
 		UChar utf16_dst[128], utf16_src[128];
 		int32_t utf16_src_length;
 		u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, e->label_buf, -1, &status);
 		if (U_SUCCESS(status)) {
 			int32_t dst_length = uidna_nameToASCII(idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
 			if (U_SUCCESS(status)) {
 				u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
 				if (U_SUCCESS(status)) {
 					if (strcmp(e->label_buf, lookupname)) {
 						/* fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */
 						_suffix_init(&suffix, lookupname, strlen(lookupname));
 						suffix.wildcard = e->wildcard;
 						suffixp = _vector_get(v, _vector_add(v, &suffix));
 						suffixp->label = suffixp->label_buf; /* set label to changed address */
 					} /* else ignore */
 				} /* else
 					fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
 			} /* else
 				fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); */
 		} /* else
 			fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); */
 	}
 #endif
 }
 /**
 * psl_load_file:
 * @fname: Name of PSL file
@ -422,13 +483,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
 * This function loads the public suffixes file named @fname.
 * To free the allocated resources, call psl_free().
 *
- * If you want to use punycode representations for functions like psl_is_public_suffix(),
+ * The suffixes are expected to be lowercase UTF-8 encoded.
 * these have to exist as entries within @fname. This is a design decision to not pull in
 * dependencies for UTF-8 case-handling and IDNA libraries.
 *
 * On the contrary, the builtin data already contains punycode entries.
 *
 * Have a look into psl2c.c for example code on how to convert UTF-8 to lowercase and to punycode.
 *
 * Returns: Pointer to a PSL context or %NULL on failure.
 *
@ -457,7 +512,7 @@ psl_ctx_t *psl_load_file(const char *fname)
 * This function loads the public suffixes from a FILE pointer.
 * To free the allocated resources, call psl_free().
 *
- * Have a look at psl_load_fp() for punycode considerations.
+ * The suffixes are expected to be lowercase UTF-8 encoded.
 *
 * Returns: Pointer to a PSL context or %NULL on failure.
 *
@ -467,8 +522,11 @@ psl_ctx_t *psl_load_fp(FILE *fp)
 {
 	psl_ctx_t *psl;
 	_psl_entry_t suffix, *suffixp;
 	int nsuffixes = 0;
 	char buf[256], *linep, *p;
 #ifdef WITH_LIBICU
 	UIDNA *idna;
 	UErrorCode status = 0;
 #endif
 	if (!fp)
 		return NULL;
@ -476,6 +534,10 @@ psl_ctx_t *psl_load_fp(FILE *fp)
 	if (!(psl = calloc(1, sizeof(psl_ctx_t))))
 		return NULL;
 #ifdef WITH_LIBICU
 	idna = uidna_openUTS46(UIDNA_USE_STD3_RULES, &status);
 #endif
 	/*
 	 *  as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions.
 	 *  as of 19.02.2014, the list at http://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions.
@ -496,26 +558,29 @@ psl_ctx_t *psl_load_fp(FILE *fp)
 		if (*p == '!') {
 			/* add to exceptions */
-			if (_suffix_init(&suffix, p + 1, linep - p - 1) == 0)
+			if (_suffix_init(&suffix, p + 1, linep - p - 1) == 0) {
 				suffixp = _vector_get(psl->suffix_exceptions, _vector_add(psl->suffix_exceptions, &suffix));
 			else
 				suffixp = NULL;
 		} else {
 			if (_suffix_init(&suffix, p, linep - p) == 0)
 				suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix));
 			else
 				suffixp = NULL;
 		}
 		if (suffixp)
 				suffixp->label = suffixp->label_buf; /* set label to changed address */
-
+				_add_punycode_if_needed(idna, psl->suffix_exceptions, suffixp);
-		nsuffixes++;;
+			}
 		} else {
 			/* add to suffixes */
 			if (_suffix_init(&suffix, p, linep - p) == 0) {
 				suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix));
 				suffixp->label = suffixp->label_buf; /* set label to changed address */
 				_add_punycode_if_needed(idna, psl->suffixes, suffixp);
 			}
 		}
 	}
 	_vector_sort(psl->suffix_exceptions);
 	_vector_sort(psl->suffixes);
 #ifdef WITH_LIBICU
 	if (idna)
 		uidna_close(idna);
 #endif
 	return psl;
 }
@ -685,7 +750,11 @@ const char *psl_builtin_filename(void)
 **/
 const char *psl_get_version (void)
 {
 #ifdef WITH_LIBICU
 	return PACKAGE_VERSION " +libicu/" U_ICU_VERSION;
 #else
 	return PACKAGE_VERSION;
 #endif
 }
 /**
@ -741,3 +810,81 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
 	return 0;
 }
 /**
 * psl_str_to_utf8lower:
 * @str: string to convert
 * @encoding: charset encoding of @str, e.g. 'iso-8859-1' or %NULL
 * @locale: locale of @str for to lowercase conversion, e.g. 'de' or %NULL
 * @lower: return value containing the converted string
 *
 * This helper function converts a string to lowercase UTF-8 representation.
 * Lowercase UTF-8 is needed as input to the domain checking functions.
 *
 * The return value 'lower' must be freed after usage.
 *
 * Returns: 0 on success, negative value on error.
 *   -2 failed to open converter with name @encoding
 *   -3 failed to convert @str to unicode
 *   -4 failed to convert unicode to lowercase
 *   -5 failed to convert unicode to UTF-8
 *
 * Since: 0.4
 */
 int psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower)
 {
 	int ret = -1;
 	if (lower)
 		*lower = NULL;
 	if (!str)
 		return 0;
 #ifdef WITH_LIBICU
 	size_t str_length = strlen(str);
 	UErrorCode status = 0;
 	UChar *utf16_dst, *utf16_lower;
 	int32_t utf16_dst_length;
 	char *utf8_lower;
 	UConverter *uconv;
 	/* C89 allocation */
 	utf16_dst   = alloca(sizeof(UChar) * (str_length * 2 + 1));
 	utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
 	utf8_lower  = alloca(str_length * 2 + 1);
 	uconv = ucnv_open(encoding, &status);
 	if (U_SUCCESS(status)) {
 		utf16_dst_length = ucnv_toUChars(uconv, utf16_dst, str_length * 2 + 1, str, str_length, &status);
 		ucnv_close(uconv);
 		if (U_SUCCESS(status)) {
 			int32_t utf16_lower_length = u_strToLower(utf16_lower, str_length * 2 + 1, utf16_dst, utf16_dst_length, locale, &status);
 			if (U_SUCCESS(status)) {
 				u_strToUTF8(utf8_lower, str_length * 8 + 1, NULL, utf16_lower, utf16_lower_length, &status);
 				if (U_SUCCESS(status)) {
 					if (lower)
 						*lower = strdup(utf8_lower);
 					ret = 0;
 				} else {
 					ret = -5;
 					fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status);
 				}
 			} else {
 				ret = -4;
 				fprintf(stderr, "Failed to convert UTF-16 to lowercase (status %d)\n", status);
 			}
 		} else {
 			ret = -3;
 			fprintf(stderr, "Failed to convert string to UTF-16 (status %d)\n", status);
 		}
 	} else {
 		ret = -2;
 		fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status);
 	}
 #endif
 	return ret;
 }
--- a/tests/test-is-public.c
+++ b/tests/test-is-public.c
@ -65,7 +65,7 @@ static void test_psl(void)
 		{ "abc.www.ck", 0 },
 		{ "xxx.ck", 1 },
 		{ "www.xxx.ck", 0 },
-		{ "\345\225\206\346\240\207", 1 }, /* xn--czr694b oder 商标 */
+		{ "\345\225\206\346\240\207", 1 }, /* xn--czr694b or 商标 */
 		{ "www.\345\225\206\346\240\207", 0 },
 		/* some special test follow ('name' and 'forgot.his.name' are public, but e.g. his.name is not) */
 		{ "name", 1 },
--- a/tests/test-registrable-domain.c
+++ b/tests/test-registrable-domain.c
@ -52,53 +52,10 @@ static int
 static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result)
 {
 	const char *result;
-	char lookupname[128];
+	char *lower;
-	/* check if there might be some utf-8 characters */
+	if (psl_str_to_utf8lower(domain, NULL, NULL, &lower) == 0)
-	if (domain) {
+		domain = lower;
 		int utf8;
 		const char *p;
 		for (p = domain, utf8 = 0; *p && !utf8; p++)
 			if (*p < 0)
 				utf8 = 1;
 		/* if we found utf-8, make sure to convert domain correctly to lowercase */
 		/* does it work, if we are not in a utf-8 env ? */
 		if (utf8) {
 #ifdef WITH_LIBICU
 			UErrorCode status = 0;
 			UChar utf16_dst[64], utf16_src[64];
 			int32_t utf16_src_length;
 			/* UTF-8 to lowercase conversion */
 			u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, domain, (int32_t) strlen(domain), &status);
 			if (U_SUCCESS(status)) {
 				int32_t dst_length = u_strToLower(utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), utf16_src, -1, "en", &status);
 				if (U_SUCCESS(status)) {
 					u_strToUTF8(lookupname, (int32_t) sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
 					if (U_SUCCESS(status)) {
 						domain = lookupname;
 					} else
 						fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status);
 				} else
 					fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status);
 			} else
 				fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status);
 #else
 			FILE *pp;
 			size_t cmdsize = 48 + strlen(domain);
 			char *cmd = alloca(cmdsize);
 			snprintf(cmd, cmdsize, "echo -n '%s' | sed -e 's/./\\L\\0/g'", domain);
 			if ((pp = popen(cmd, "r"))) {
 				if (fscanf(pp, "%127s", lookupname) >= 1)
 					domain = lookupname;
 				pclose(pp);
 			}
 #endif
 		}
 	}
 	result = psl_registrable_domain(psl, domain);
@ -115,7 +72,7 @@ static void test_psl(void)
 {
 	FILE *fp;
 	const psl_ctx_t *psl;
-	char buf[256], domain[128], expected_regdom[128], *p;
+	char buf[256], domain[128], expected_regdom[128];
 	psl = psl_builtin();
@ -152,11 +109,6 @@ static void test_psl(void)
 					continue;
 			}
 			/* we have to lowercase the domain - the PSL API just takes lowercase */
 			for (p = domain; *p; p++)
 				if (*p > 0 && isupper(*p))
 					*p = tolower(*p);
 			if (!strcmp(expected_regdom, "null"))
 				test(psl, domain, NULL);
 			else