libpsl/src/psl.c

1956 lines
52 KiB
C
Raw Normal View History

2014-03-20 22:43:04 +01:00
/*
2018-02-22 10:03:37 +01:00
* Copyright(c) 2014-2018 Tim Ruehsen
2014-03-20 22:43:04 +01:00
*
2014-03-24 17:29:56 +01:00
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
2014-03-24 17:29:56 +01:00
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
2014-03-24 17:29:56 +01:00
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
2014-03-24 17:29:56 +01:00
* This file is part of libpsl.
2014-03-20 22:43:04 +01:00
*
2014-03-24 17:29:56 +01:00
* Public Suffix List routines
2014-03-20 22:43:04 +01:00
*
* Changelog
* 19.03.2014 Tim Ruehsen created from libmget/cookie.c
*
*/
#if HAVE_CONFIG_H
# include <config.h>
#endif
/*
 * GCC_VERSION_AT_LEAST(major, minor) evaluates to non-zero when compiling
 * with a GNU-compatible compiler of at least the given version, else to 0.
 */
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
# define GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
#else
# define GCC_VERSION_AT_LEAST(major, minor) 0
#endif

/* PSL_UNUSED marks a possibly unused parameter/variable; expands to nothing on other compilers */
#if GCC_VERSION_AT_LEAST(2,95)
# define PSL_UNUSED __attribute__ ((unused))
#else
# define PSL_UNUSED
#endif
#include <sys/types.h>
#include <sys/stat.h>
#ifdef _WIN32
# include <winsock2.h>
# include <ws2tcpip.h>
#else
# include <sys/socket.h>
# include <netinet/in.h>
# include <unistd.h>
#endif
2019-04-04 10:11:44 +02:00
#if defined(_MSC_VER) && ! defined(ssize_t)
# include <basetsd.h>
typedef SSIZE_T ssize_t;
#endif
2014-03-20 22:43:04 +01:00
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_STRINGS_H
# include <strings.h>
#endif
2014-03-20 22:43:04 +01:00
#include <ctype.h>
#include <time.h>
#include <errno.h>
#include <limits.h> /* for UINT_MAX */
2019-06-03 21:52:48 +02:00
#ifdef HAVE_NL_LANGINFO
# include <langinfo.h>
2019-06-03 21:52:48 +02:00
#endif
#ifndef _WIN32
# include <arpa/inet.h>
#endif
2014-10-28 15:41:35 +01:00
#ifdef HAVE_ALLOCA_H
# include <alloca.h>
#endif
2014-06-17 17:14:02 +02:00
#ifdef WITH_LIBICU
# include <unicode/uversion.h>
# include <unicode/ustring.h>
# include <unicode/uidna.h>
# include <unicode/ucnv.h>
#elif defined(WITH_LIBIDN2)
# include <iconv.h>
# include <idn2.h>
# include <unicase.h>
# include <unistr.h>
#elif defined(WITH_LIBIDN)
# include <iconv.h>
# include <stringprep.h>
# include <idna.h>
# include <unicase.h>
# include <unistr.h>
2014-06-17 17:14:02 +02:00
#endif
2014-03-20 22:43:04 +01:00
2018-04-21 11:30:22 +02:00
#ifndef WINICONV_CONST
# define WINICONV_CONST
2015-01-23 15:05:02 +01:00
#endif
2018-04-21 11:30:22 +02:00
#include <libpsl.h>
/**
* SECTION:libpsl
* @short_description: Public Suffix List library functions
* @title: libpsl
2015-01-21 15:38:18 +01:00
* @stability: Stable
* @include: libpsl.h
*
2016-07-05 17:49:14 +02:00
* [Public Suffix List](https://publicsuffix.org/) library functions.
*
*/
2014-03-20 22:43:04 +01:00
/* number of elements of a real array (do not use on pointers) */
#define countof(a) (sizeof(a) / sizeof((a)[0]))

/* flag bits attached to a PSL rule */
#define PRIV_PSL_FLAG_EXCEPTION (1 << 0)
#define PRIV_PSL_FLAG_WILDCARD  (1 << 1)
#define PRIV_PSL_FLAG_ICANN     (1 << 2) /* entry of ICANN section */
#define PRIV_PSL_FLAG_PRIVATE   (1 << 3) /* entry of PRIVATE section */
#define PRIV_PSL_FLAG_PLAIN     (1 << 4) /* just used for PSL syntax checking */
2015-09-19 10:50:00 +02:00
2014-03-20 22:43:04 +01:00
/*
 * One PSL rule (or a lookup key built from a domain name).
 */
typedef struct {
	char
		label_buf[48]; /* inline storage for the rule text (NUL-terminated) */
	const char *
		label;         /* text used for comparisons; points to label_buf or to external memory */
	unsigned short
		length;        /* length of the text in bytes */
	unsigned char
		nlabels,       /* number of labels */
		flags;         /* combination of PRIV_PSL_FLAG_* bits */
} psl_entry_t;
2014-03-20 22:43:04 +01:00
2014-05-12 12:20:59 +02:00
/* stripped down version libmget vector routines */
2014-03-20 22:43:04 +01:00
typedef struct {
int
(*cmp)(const psl_entry_t **, const psl_entry_t **); /* comparison function */
psl_entry_t
2014-05-12 12:20:59 +02:00
**entry; /* pointer to array of pointers to elements */
2014-03-20 22:43:04 +01:00
int
2014-05-12 12:20:59 +02:00
max, /* allocated elements */
cur; /* number of elements in use */
} psl_vector_t;
2014-03-20 22:43:04 +01:00
struct psl_ctx_st {
psl_vector_t
2015-09-19 10:50:00 +02:00
*suffixes;
unsigned char
*dafsa;
size_t
dafsa_size;
2015-09-19 10:50:00 +02:00
int
nsuffixes,
nexceptions,
nwildcards;
unsigned
utf8 : 1; /* 1: data contains UTF-8 + punycode encoded rules */
2014-03-20 22:43:04 +01:00
};
/* include the PSL data generated by psl-make-dafsa */
#if defined(BUILTIN_GENERATOR_LIBICU) || defined(BUILTIN_GENERATOR_LIBIDN2) || defined(BUILTIN_GENERATOR_LIBIDN)
# include "suffixes_dafsa.h"
#else
/* no built-in data configured: empty placeholders keep the rest of the file compiling */
static const unsigned char kDafsa[] = "";
static time_t _psl_file_time = 0;
static int _psl_nsuffixes = 0;
static int _psl_nexceptions = 0;
static int _psl_nwildcards = 0;
static const char _psl_sha1_checksum[] = "";
static const char _psl_filename[] = "";
#endif
2014-03-24 17:29:56 +01:00
/*
 * Sentinel context: its address identifies "use the built-in data".
 * Lookups compare the context pointer against &builtin_psl.
 */
static const psl_ctx_t builtin_psl;
2014-03-24 17:29:56 +01:00
/* value of the build-time macro PSL_DISTFILE, or the empty string if it is not defined */
static const char _psl_dist_filename[] =
#ifdef PSL_DISTFILE
	PSL_DISTFILE;
#else
	"";
#endif
static psl_vector_t *vector_alloc(int max, int (*cmp)(const psl_entry_t **, const psl_entry_t **))
2014-03-20 22:43:04 +01:00
{
psl_vector_t *v;
if (!(v = calloc(1, sizeof(psl_vector_t))))
2014-03-20 22:43:04 +01:00
return NULL;
if (!(v->entry = malloc(max * sizeof(psl_entry_t *)))) {
2014-03-20 22:43:04 +01:00
free(v);
return NULL;
}
v->max = max;
v->cmp = cmp;
return v;
}
/*
 * Free all entries, the pointer array and the vector itself.
 * *v is reset to NULL so the caller is not left with a dangling pointer.
 * Passing NULL or a pointer to NULL is a no-op.
 */
static void vector_free(psl_vector_t **v)
{
	psl_vector_t *vec;

	if (!v || !(vec = *v))
		return;

	if (vec->entry) {
		int it;

		for (it = 0; it < vec->cur; it++)
			free(vec->entry[it]);

		free(vec->entry);
	}

	free(vec);
	*v = NULL;
}
static psl_entry_t *vector_get(const psl_vector_t *v, int pos)
2014-03-20 22:43:04 +01:00
{
if (pos < 0 || !v || pos >= v->cur) return NULL;
return v->entry[pos];
}
2014-05-12 12:20:59 +02:00
/* the entries must be sorted by */
static int vector_find(const psl_vector_t *v, const psl_entry_t *elem)
2014-03-20 22:43:04 +01:00
{
if (v) {
int l, r, m;
int res;
2014-05-12 12:20:59 +02:00
/* binary search for element (exact match) */
2014-03-20 22:43:04 +01:00
for (l = 0, r = v->cur - 1; l <= r;) {
m = (l + r) / 2;
if ((res = v->cmp(&elem, (const psl_entry_t **)&(v->entry[m]))) > 0) l = m + 1;
2014-03-20 22:43:04 +01:00
else if (res < 0) r = m - 1;
else return m;
}
}
2014-05-12 12:20:59 +02:00
return -1; /* not found */
2014-03-20 22:43:04 +01:00
}
/*
 * Append a heap-allocated copy of @elem to the vector, growing the
 * pointer array by doubling when it is full.
 *
 * Returns the index of the new entry, or -1 on failure (@v is NULL,
 * out of memory, or the capacity cannot be doubled). On failure the
 * vector is left unchanged.
 */
static int vector_add(psl_vector_t *v, const psl_entry_t *elem)
{
	psl_entry_t *copy;

	if (!v)
		return -1;

	if (!(copy = malloc(sizeof(*copy))))
		return -1;

	memcpy(copy, elem, sizeof(*copy));

	if (v->max == v->cur) {
		int new_max;
		void *m;

		if (v->max > INT_MAX / 2) {
			free(copy);
			return -1;
		}

		new_max = v->max > 0 ? v->max * 2 : 1;

		/* only commit the new capacity once realloc() has succeeded */
		if (!(m = realloc(v->entry, (size_t) new_max * sizeof(*v->entry)))) {
			free(copy);
			return -1;
		}

		v->entry = m;
		v->max = new_max;
	}

	v->entry[v->cur++] = copy;
	return v->cur - 1;
}
/*
 * Sort the entry pointers with the vector's comparison function.
 * Does nothing if @v is NULL or has no comparison function.
 */
static void vector_sort(psl_vector_t *v)
{
	if (!v || !v->cmp)
		return;

	/* element size is that of one entry pointer, taken from the array itself */
	qsort(v->entry, v->cur, sizeof(*v->entry), (int(*)(const void *, const void *))v->cmp);
}
2014-05-12 12:20:59 +02:00
/* by this kind of sorting, we can easily see if a domain matches or not */
static int suffix_compare(const psl_entry_t *s1, const psl_entry_t *s2)
2014-03-20 22:43:04 +01:00
{
int n;
if ((n = s2->nlabels - s1->nlabels))
2015-01-21 12:21:32 +01:00
return n; /* most labels first */
2014-03-20 22:43:04 +01:00
2014-03-22 20:35:56 +01:00
if ((n = s1->length - s2->length))
2015-01-21 12:21:32 +01:00
return n; /* shorter rules first */
2014-03-20 22:43:04 +01:00
return strcmp(s1->label ? s1->label : s1->label_buf, s2->label ? s2->label : s2->label_buf);
2014-03-20 22:43:04 +01:00
}
/* qsort()/bsearch-style adapter: compares two elements of an array of entry pointers */
static int suffix_compare_array(const psl_entry_t **s1, const psl_entry_t **s2)
{
	const psl_entry_t *lhs = *s1;
	const psl_entry_t *rhs = *s2;

	return suffix_compare(lhs, rhs);
}
/*
 * Initialize @suffix from the rule text @rule.
 *
 * At most @length bytes of @rule are copied into suffix->label_buf
 * (stopping early at a NUL byte), the copy is NUL-terminated and the
 * number of labels is counted. suffix->flags is not touched.
 *
 * Returns 0 on success, -1 if @length does not fit into label_buf
 * (suffix->nlabels is set to 0 in that case).
 */
static int suffix_init(psl_entry_t *suffix, const char *rule, size_t length)
{
	size_t i;

	suffix->label = suffix->label_buf;

	if (length >= sizeof(suffix->label_buf) - 1) {
		suffix->nlabels = 0;
		return -1;
	}

	suffix->length = (unsigned short) length;
	suffix->nlabels = 1;

	/* bound the copy by @length so an over-long @rule cannot overflow label_buf */
	for (i = 0; i < length && rule[i]; i++) {
		if (rule[i] == '.')
			suffix->nlabels++;
		suffix->label_buf[i] = rule[i];
	}
	suffix->label_buf[i] = 0;

	return 0;
}
#if !defined(WITH_LIBIDN) && !defined(WITH_LIBIDN2) && !defined(WITH_LIBICU)
/*
* When configured without runtime IDNA support (./configure --disable-runtime), we need a pure ASCII
* representation of non-ASCII characters in labels as found in UTF-8 domain names.
* This is because the current DAFSA format used may only hold character values [21..127].
*
Code copied from http://www.nicemice.net/idn/punycode-spec.gz on
2011-01-04 with SHA-1 a966a8017f6be579d74a50a226accc7607c40133
labeled punycode-spec 1.0.3 (2006-Mar-24-Thu). It is modified for
libpsl by Tim Rühsen. License on the original code:
punycode-spec 1.0.3 (2006-Mar-23-Thu)
http://www.nicemice.net/idn/
Adam M. Costello
http://www.nicemice.net/amc/
B. Disclaimer and license
Regarding this entire document or any portion of it (including
the pseudocode and C code), the author makes no guarantees and
is not responsible for any damage resulting from its use. The
author grants irrevocable permission to anyone to use, modify,
and distribute it in any way that does not diminish the rights
of anyone else to use, modify, and distribute it, provided that
redistributed derivative works do not contain misleading author or
version information. Derivative works need not be licensed under
similar terms.
C. Punycode sample implementation
punycode-sample.c 2.0.0 (2004-Mar-21-Sun)
http://www.nicemice.net/idn/
Adam M. Costello
http://www.nicemice.net/amc/
This is ANSI C code (C89) implementing Punycode 1.0.x.
*/
/* result codes of the Punycode encoder */
enum punycode_status {
	/* conversion succeeded */
	punycode_success = 0,
	/* Input is invalid. */
	punycode_bad_input = 1,
	/* Output would exceed the space provided. */
	punycode_big_output = 2,
	/* Wider integers needed to process input. */
	punycode_overflow = 3
};
/*
 * punycode_uint: unsigned integer type used for code points and encoder state.
 * A build may force the type by defining PUNYCODE_UINT; otherwise unsigned int
 * is used if UINT_MAX is at least (1 << 26) - 1, else unsigned long.
 */
#ifdef PUNYCODE_UINT
typedef PUNYCODE_UINT punycode_uint;
#elif UINT_MAX >= (1 << 26) - 1
typedef unsigned int punycode_uint;
#else
typedef unsigned long punycode_uint;
#endif
/*** Bootstring parameters for Punycode ***/
enum {
	base = 36,
	tmin = 1,
	tmax = 26,
	skew = 38,
	damp = 700,
	initial_bias = 72,
	initial_n = 0x80,  /* first non-basic code point */
	delimiter = 0x2D   /* ASCII '-' */
};
/*
 * Map a Punycode digit value to its ASCII character:
 * 0..25 map to 'a'..'z', 26..35 map to '0'..'9'.
 */
static char encode_digit(punycode_uint d)
{
	/* 97 is ASCII 'a'; 26 + 22 == 48 is ASCII '0' */
	const punycode_uint offset = (d < 26) ? 97 : 22;

	return (char) (d + offset);
}
#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
static const punycode_uint maxint = -1;
/*
 * Bias adaptation function of the Punycode encoder.
 *
 * @delta:     delta that was just encoded
 * @numpoints: number of code points handled so far (must not be 0)
 * @firsttime: non-zero for the very first adaptation
 *
 * Returns the new bias.
 */
static punycode_uint adapt(punycode_uint delta, punycode_uint numpoints, int firsttime)
{
	punycode_uint k = 0;

	if (firsttime)
		delta /= damp;
	else
		delta >>= 1; /* same as delta / 2 */

	delta += delta / numpoints;

	while (delta > ((base - tmin) * tmax) / 2) {
		delta /= base - tmin;
		k += base;
	}

	return k + (base - tmin + 1) * delta / (delta + skew);
}
/*
 * Encode a sequence of code points as Punycode.
 *
 * input_length_orig: number of code points in input[]
 * input:             the code points to encode
 * output_length:     in: capacity of output[]; out (on success): number of
 *                    characters written
 * output:            receives the encoded characters; no terminating NUL is
 *                    written by this function
 *
 * Returns punycode_success, punycode_big_output if output[] is too small,
 * or punycode_overflow if the input length or an intermediate delta does not
 * fit into punycode_uint. On failure *output_length is left untouched.
 */
static enum punycode_status punycode_encode(
size_t input_length_orig,
const punycode_uint input[],
size_t *output_length,
char output[])
{
punycode_uint input_length, n, delta, h, b, bias, j, m, q, k, t;
size_t out, max_out;
/* The Punycode spec assumes that the input length is the same type */
/* of integer as a code point, so we need to convert the size_t to */
/* a punycode_uint, which could overflow. */
if (input_length_orig > maxint)
return punycode_overflow;
input_length = (punycode_uint) input_length_orig;
/* Initialize the state: */
n = initial_n;
delta = 0;
out = 0;
max_out = *output_length;
bias = initial_bias;
/* Handle the basic code points: */
for (j = 0; j < input_length; ++j) {
if (input[j] < 0x80) {
/* keep one extra slot free so the delimiter below always fits */
if (max_out - out < 2)
return punycode_big_output;
output[out++] = (char) input[j];
}
/* else if (input[j] < n) return punycode_bad_input; */
/* (not needed for Punycode with unsigned code points) */
}
h = b = (punycode_uint) out;
/* cannot overflow because out <= input_length <= maxint */
/* h is the number of code points that have been handled, b is the */
/* number of basic code points, and out is the number of ASCII code */
/* points that have been output. */
if (b > 0)
output[out++] = delimiter;
/* Main encoding loop: */
while (h < input_length) {
/* All non-basic code points < n have been */
/* handled already. Find the next larger one: */
for (m = maxint, j = 0; j < input_length; ++j) {
/* if (basic(input[j])) continue; */
/* (not needed for Punycode) */
if (input[j] >= n && input[j] < m)
m = input[j];
}
/* Increase delta enough to advance the decoder's */
/* <n,i> state to <m,0>, but guard against overflow: */
if (m - n > (maxint - delta) / (h + 1))
return punycode_overflow;
delta += (m - n) * (h + 1);
n = m;
for (j = 0; j < input_length; ++j) {
/* Punycode does not need to check whether input[j] is basic: */
if (input[j] < n /* || basic(input[j]) */) {
if (++delta == 0)
return punycode_overflow;
}
if (input[j] == n) {
/* Represent delta as a generalized variable-length integer: */
for (q = delta, k = base;; k += base) {
if (out >= max_out)
return punycode_big_output;
t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
k >= bias + tmax ? tmax : k - bias;
if (q < t)
break;
output[out++] = encode_digit(t + (q - t) % (base - t));
q = (q - t) / (base - t);
}
output[out++] = encode_digit(q);
bias = adapt(delta, h + 1, h == b);
delta = 0;
++h;
}
}
++delta, ++n;
}
*output_length = out;
return punycode_success;
}
/*
 * Decode UTF-8 into an array of code points.
 *
 * @in:     UTF-8 input (need not be NUL-terminated)
 * @inlen:  number of bytes in @in
 * @out:    receives the decoded code points
 * @outlen: capacity of @out in elements; one element is held back, so at
 *          most @outlen - 1 code points are stored and any remaining input
 *          is ignored
 *
 * Returns the number of code points stored, or -1 if @outlen is 0 or the
 * input is not a well-formed 1..4 byte sequence (bad lead byte, bad
 * continuation byte or truncated sequence).
 */
static ssize_t utf8_to_utf32(const char *in, size_t inlen, punycode_uint *out, size_t outlen)
{
	size_t n = 0;
	const unsigned char *s = (const unsigned char *) in;
	const unsigned char *e = s + inlen;

	if (!outlen)
		return -1;

	outlen--;

	while (n < outlen) {
		size_t inleft = (size_t) (e - s);

		if (!inleft)
			break;

		if ((*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
			out[n++] = *s;
			s++;
		} else if (inleft >= 2 && (*s & 0xE0) == 0xC0) { /* 110xxxxx 10xxxxxx */
			if ((s[1] & 0xC0) != 0x80)
				return -1;
			out[n++] = ((punycode_uint) (*s & 0x1F) << 6) | (s[1] & 0x3F);
			s += 2;
		} else if (inleft >= 3 && (*s & 0xF0) == 0xE0) { /* 1110xxxx 10xxxxxx 10xxxxxx */
			if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
				return -1;
			out[n++] = ((punycode_uint) (*s & 0x0F) << 12) | ((punycode_uint) (s[1] & 0x3F) << 6) | (s[2] & 0x3F);
			s += 3;
		} else if (inleft >= 4 && (*s & 0xF8) == 0xF0) { /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
				return -1;
			/* each of the three continuation bytes contributes its own 6 bits */
			out[n++] = ((punycode_uint) (*s & 0x07) << 18) | ((punycode_uint) (s[1] & 0x3F) << 12)
				| ((punycode_uint) (s[2] & 0x3F) << 6) | (s[3] & 0x3F);
			s += 4;
		} else
			return -1;
	}

	return (ssize_t) n;
}
/*
 * Check the first @n bytes at @s.
 * Returns 1 if all of them are below 128 (also for n == 0), else 0.
 */
static int mem_is_ascii(const char *s, size_t n)
{
	const unsigned char *bytes = (const unsigned char *) s;
	size_t i;

	for (i = 0; i < n; i++) {
		if (bytes[i] > 127)
			return 0;
	}

	return 1;
}
/*
 * Convert a UTF-8 domain name label by label into its ASCII form.
 * Pure ASCII labels are copied as they are, all other labels are written
 * as "xn--" followed by their Punycode encoding.
 *
 * @domain:  NUL-terminated UTF-8 domain name
 * @out:     receives the NUL-terminated result
 * @outsize: size of @out in bytes
 *
 * Returns 0 on success, 1 on failure (result does not fit, invalid UTF-8
 * or encoder error). On failure the content of @out is unspecified.
 */
static int domain_to_punycode(const char *domain, char *out, size_t outsize)
{
	size_t outlen = 0, labellen;
	punycode_uint input[256];
	const char *label, *e;

	for (e = label = domain; e; label = e + 1) {
		e = strchr(label, '.');
		labellen = e ? (size_t) (e - label) : strlen(label);

		if (mem_is_ascii(label, labellen)) {
			if (outlen + labellen + (e != NULL) >= outsize)
				return 1;

			memcpy(out + outlen, label, labellen);
			outlen += labellen;
		} else {
			ssize_t inputlen;

			if (outlen + labellen + (e != NULL) + 4 >= outsize)
				return 1;

			if ((inputlen = utf8_to_utf32(label, labellen, input, countof(input))) < 0)
				return 1;

			memcpy(out + outlen, "xn--", 4);
			outlen += 4;

			/*
			 * The encoded label may be longer than its UTF-8 form, so do not
			 * hand all remaining space to the encoder: keep room for the
			 * separating dot (if any) and the trailing NUL written below.
			 * The size check above guarantees this cannot underflow.
			 */
			labellen = outsize - outlen - (e != NULL) - 1;
			if (punycode_encode((size_t) inputlen, input, &labellen, out + outlen))
				return 1;
			outlen += labellen;
		}

		if (e)
			out[outlen++] = '.';
		out[outlen] = 0;
	}

	return 0;
}
#endif
/* locale independent test for space, tab, carriage return and newline; returns 1 or 0 */
static int isspace_ascii(const char c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
/* Returns 1 if the NUL-terminated string contains only bytes below 128, else 0. */
static int str_is_ascii(const char *s)
{
	const unsigned char *p;

	for (p = (const unsigned char *) s; *p; p++) {
		if (*p >= 128)
			return 0;
	}

	return 1;
}
#if defined(WITH_LIBIDN)
/*
 * Work around a libidn <= 1.30 vulnerability.
 *
 * Checks that @utf8 consists of well-formed 1..4 byte sequences (lead byte
 * pattern plus the right number of 10xxxxxx continuation bytes) before the
 * string is passed to idna_to_ascii_8z().
 *
 * [1] https://lists.gnu.org/archive/html/help-libidn/2015-05/msg00002.html
 * [2] https://lists.gnu.org/archive/html/bug-wget/2015-06/msg00002.html
 * [3] https://curl.haxx.se/mail/lib-2015-06/0143.html
 *
 * Returns 1 if the sequence is accepted, 0 otherwise.
 */
static int utf8_is_valid(const char *utf8)
{
	const unsigned char *s = (const unsigned char *) utf8;

	while (*s) {
		int trailing, i;

		if ((*s & 0x80) == 0)         /* 0xxxxxxx ASCII char */
			trailing = 0;
		else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */
			trailing = 1;
		else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */
			trailing = 2;
		else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			trailing = 3;
		else
			return 0;

		/* a NUL byte fails the check, so we never read past the terminator */
		for (i = 1; i <= trailing; i++) {
			if ((s[i] & 0xC0) != 0x80)
				return 0;
		}

		s += trailing + 1;
	}

	return 1;
}
#endif
/* opaque handle type of the IDNA backend */
typedef void *psl_idna_t;

/*
 * Open an IDNA conversion handle.
 * With libicu this is a UTS#46 instance; for all other backends NULL is returned.
 */
static psl_idna_t *psl_idna_open(void)
{
#if defined(WITH_LIBICU)
	UErrorCode status = 0;

	return (void *)uidna_openUTS46(UIDNA_USE_STD3_RULES | UIDNA_NONTRANSITIONAL_TO_ASCII, &status);
#else
	return NULL;
#endif
}
2018-10-29 14:56:27 +01:00
/* Close a handle returned by psl_idna_open(); NULL is accepted. Only libicu has something to release. */
static void psl_idna_close(psl_idna_t *idna PSL_UNUSED)
{
#if defined(WITH_LIBICU)
	if (idna != NULL)
		uidna_close((UIDNA *)idna);
#endif
}
2018-10-29 14:56:27 +01:00
/*
 * Convert the UTF-8 domain name @utf8 into its ASCII (ACE/punycode) form
 * with whatever IDNA backend was selected at build time.
 *
 * @idna:  handle from psl_idna_open(); only used by the libicu backend
 * @utf8:  NUL-terminated UTF-8 domain name
 * @ascii: on success receives a heap-allocated string; callers in this
 *         file release it with free()
 *
 * Returns 0 on success, -1 on failure.
 */
static int psl_idna_toASCII(psl_idna_t *idna PSL_UNUSED, const char *utf8, char **ascii)
{
	int ret = -1;

#if defined(WITH_LIBICU)
	/* IDNA2008 UTS#46 punycode conversion */
	if (idna) {
		char lookupname_buf[128] = "", *lookupname = lookupname_buf;
		UErrorCode status = 0;
		UIDNAInfo info = UIDNA_INFO_INITIALIZER;
		UChar utf16_dst[128], utf16_src_buf[128];
		UChar *utf16_src = utf16_src_buf;
		int32_t utf16_src_length, bytes_written;
		int32_t utf16_dst_length;

		u_strFromUTF8(utf16_src, countof(utf16_src_buf), &utf16_src_length, utf8, -1, &status);
		if (!U_SUCCESS(status)) goto cleanup; /* UTF-8 to UTF-16 conversion failed */

		if (utf16_src_length >= (int) countof(utf16_src_buf)) {
			utf16_src = malloc((utf16_src_length + 1) * sizeof(UChar));
			if (!utf16_src) goto cleanup;

			u_strFromUTF8(utf16_src, utf16_src_length, NULL, utf8, -1, &status);
			if (!U_SUCCESS(status)) goto cleanup; /* UTF-8 to UTF-16 conversion failed */

			utf16_src[utf16_src_length] = 0; /* u_strFromUTF8() doesn't 0-terminate if dest is filled up */
		}

		utf16_dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, countof(utf16_dst), &info, &status);
		if (!U_SUCCESS(status)) goto cleanup; /* to ASCII conversion failed */

		u_strToUTF8(lookupname, sizeof(lookupname_buf), &bytes_written, utf16_dst, utf16_dst_length, &status);
		if (!U_SUCCESS(status)) goto cleanup; /* UTF-16 to UTF-8 conversion failed */

		if (bytes_written >= (int) sizeof(lookupname_buf)) {
			lookupname = malloc(bytes_written + 1);
			if (!lookupname) goto cleanup;

			u_strToUTF8(lookupname, bytes_written, NULL, utf16_dst, utf16_dst_length, &status);
			if (!U_SUCCESS(status)) goto cleanup; /* UTF-16 to UTF-8 conversion failed */

			lookupname[bytes_written] = 0; /* u_strToUTF8() doesn't 0-terminate if dest is filled up */
		} else {
			/* result is in the stack buffer, hand out a heap copy */
			if (!(lookupname = strdup(lookupname)))
				goto cleanup;
		}

		if (ascii) {
			*ascii = lookupname; /* ownership moves to the caller */
			lookupname = NULL;
		}

		ret = 0;

cleanup:
		if (lookupname != lookupname_buf)
			free(lookupname);
		if (utf16_src != utf16_src_buf)
			free(utf16_src);
	}
#elif defined(WITH_LIBIDN2)
#if IDN2_VERSION_NUMBER >= 0x00140000
	int rc;

	/* The TR46 flag (here IDN2_NONTRANSITIONAL) makes libidn2 convert to
	 * lowercase and do the NFC conversion itself, so IDN2_NFC_INPUT would
	 * not be needed. But just for the unlikely case that the linked
	 * library is not matching the headers when building and it doesn't support TR46,
	 * we provide IDN2_NFC_INPUT. */
	if ((rc = idn2_lookup_u8((uint8_t *)utf8, (uint8_t **)ascii, IDN2_NFC_INPUT | IDN2_NONTRANSITIONAL)) == IDN2_OK)
		ret = 0;
#else
	int rc;
	uint8_t *lower;
	size_t len = u8_strlen((uint8_t *)utf8) + 1;

	/* we need a conversion to lowercase */
	if (!(lower = u8_tolower((uint8_t *)utf8, len, 0, UNINORM_NFKC, NULL, &len)))
		return -1;

	if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK)
		ret = 0;

	free(lower);
#endif
#elif defined(WITH_LIBIDN)
	int rc;

	/* reject invalid UTF-8 before it reaches libidn */
	if (!utf8_is_valid(utf8))
		return -1;

	/* idna_to_ascii_8z() automatically converts UTF-8 to lowercase */
	if ((rc = idna_to_ascii_8z(utf8, ascii, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS)
		ret = 0;
#else
	char lookupname[128];

	/* no IDNA library: use the built-in punycode encoder */
	if (domain_to_punycode(utf8, lookupname, sizeof(lookupname)) == 0) {
		/* like the libicu branch: a NULL @ascii just checks convertibility */
		if (!ascii)
			ret = 0;
		else if ((*ascii = strdup(lookupname)))
			ret = 0;
	}
#endif

	return ret;
}
/*
 * If the text of entry @e is not pure ASCII, convert it to its ASCII form
 * and append that as an additional entry (with the same flags) to @v.
 * Nothing is added if the conversion fails, yields the same text, or the
 * converted text is too long for an entry.
 */
static void add_punycode_if_needed(psl_idna_t *idna, psl_vector_t *v, psl_entry_t *e)
{
	char *ace;
	psl_entry_t converted, *stored;

	if (str_is_ascii(e->label_buf))
		return;

	if (psl_idna_toASCII(idna, e->label_buf, &ace) != 0)
		return;

	if (strcmp(e->label_buf, ace) != 0 && suffix_init(&converted, ace, strlen(ace)) == 0) {
		converted.flags = e->flags;

		stored = vector_get(v, vector_add(v, &converted));
		if (stored)
			stored->label = stored->label_buf; /* the copy lives at a new address */
	}

	free(ace);
}
/* prototypes */
/* Defined outside this file section. is_public_suffix() treats a return value of -1
 * as "key not found" and any other value as a set of PRIV_PSL_FLAG_* bits. */
int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
/* NOTE(review): not called in this file section; presumably reports whether the DAFSA
 * data was built in UTF-8 mode - confirm against the implementation. */
int GetUtfMode(const unsigned char *graph, size_t length);
static int is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
2014-03-20 22:43:04 +01:00
{
psl_entry_t suffix;
2015-09-19 10:50:00 +02:00
const char *p;
char *punycode = NULL;
int need_conversion = 0;
2014-03-20 22:43:04 +01:00
2014-05-12 12:20:59 +02:00
/* this function should be called without leading dots, just make sure */
if (*domain == '.')
domain++;
2014-03-20 22:43:04 +01:00
suffix.nlabels = 1;
for (p = domain; *p; p++) {
2014-03-20 22:43:04 +01:00
if (*p == '.')
suffix.nlabels++;
else if (*((unsigned char *)p) >= 128)
need_conversion = 1; /* in case domain is non-ascii we need a toASCII conversion */
}
2014-03-20 22:43:04 +01:00
2015-09-19 10:50:00 +02:00
if (suffix.nlabels == 1) {
/* TLD, this is the prevailing '*' match. If type excludes the '*' rule, continue.
2015-09-19 10:50:00 +02:00
*/
if (!(type & PSL_TYPE_NO_STAR_RULE))
return 1;
2015-09-19 10:50:00 +02:00
}
type &= ~PSL_TYPE_NO_STAR_RULE;
if (psl->utf8 || psl == &builtin_psl)
need_conversion = 0;
if (need_conversion) {
psl_idna_t *idna = psl_idna_open();
2014-03-24 17:29:56 +01:00
if (psl_idna_toASCII(idna, domain, &punycode) == 0) {
suffix.label = punycode;
suffix.length = strlen(punycode);
} else {
/* fallback */
suffix.label = domain;
suffix.length = p - suffix.label;
}
2014-03-20 22:43:04 +01:00
psl_idna_close(idna);
} else {
suffix.label = domain;
suffix.length = p - suffix.label;
}
2014-03-24 17:29:56 +01:00
if (psl == &builtin_psl || psl->dafsa) {
size_t dafsa_size = psl == &builtin_psl ? sizeof(kDafsa) : psl->dafsa_size;
const unsigned char *dafsa = psl == &builtin_psl ? kDafsa : psl->dafsa;
int rc = LookupStringInFixedSet(dafsa, dafsa_size, suffix.label, suffix.length);
if (rc != -1) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rc & PRIV_PSL_FLAG_ICANN))
goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rc & PRIV_PSL_FLAG_PRIVATE))
goto suffix_no;
if (rc & PRIV_PSL_FLAG_EXCEPTION)
goto suffix_no;
2014-03-20 22:43:04 +01:00
/* wildcard *.foo.bar implicitly make foo.bar a public suffix */
/* definitely a match, no matter if the found rule is a wildcard or not */
goto suffix_yes;
}
if ((suffix.label = strchr(suffix.label, '.'))) {
suffix.label++;
suffix.length = strlen(suffix.label);
suffix.nlabels--;
rc = LookupStringInFixedSet(dafsa, dafsa_size, suffix.label, suffix.length);
if (rc != -1) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rc & PRIV_PSL_FLAG_ICANN))
goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rc & PRIV_PSL_FLAG_PRIVATE))
goto suffix_no;
if (rc & PRIV_PSL_FLAG_WILDCARD)
goto suffix_yes;
}
}
} else {
psl_entry_t *rule = vector_get(psl->suffixes, 0);
if (!rule || rule->nlabels < suffix.nlabels - 1)
goto suffix_no;
2014-03-20 22:43:04 +01:00
rule = vector_get(psl->suffixes, vector_find(psl->suffixes, &suffix));
2014-03-24 17:29:56 +01:00
2014-03-20 22:43:04 +01:00
if (rule) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rule->flags & PRIV_PSL_FLAG_ICANN))
goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & PRIV_PSL_FLAG_PRIVATE))
goto suffix_no;
if (rule->flags & PRIV_PSL_FLAG_EXCEPTION)
goto suffix_no;
/* wildcard *.foo.bar implicitly make foo.bar a public suffix */
/* definitely a match, no matter if the found rule is a wildcard or not */
goto suffix_yes;
}
if ((suffix.label = strchr(suffix.label, '.'))) {
int pos;
suffix.label++;
suffix.length = strlen(suffix.label);
suffix.nlabels--;
rule = vector_get(psl->suffixes, (pos = vector_find(psl->suffixes, &suffix)));
if (rule) {
/* check for correct rule type */
if (type == PSL_TYPE_ICANN && !(rule->flags & PRIV_PSL_FLAG_ICANN))
goto suffix_no;
else if (type == PSL_TYPE_PRIVATE && !(rule->flags & PRIV_PSL_FLAG_PRIVATE))
goto suffix_no;
if (rule->flags & PRIV_PSL_FLAG_WILDCARD)
goto suffix_yes;
}
2014-03-20 22:43:04 +01:00
}
}
suffix_no:
if (punycode)
free(punycode);
return 0;
suffix_yes:
if (punycode)
free(punycode);
return 1;
2014-03-20 22:43:04 +01:00
}
2014-06-17 17:14:02 +02:00
/**
 * psl_is_public_suffix:
 * @psl: PSL context
 * @domain: Domain string
 *
 * This function checks if @domain is a public suffix by the means of the
 * [Mozilla Public Suffix List](https://publicsuffix.org).
 *
 * For cookie domain checking see psl_is_cookie_domain_acceptable().
 *
 * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
 * Other encodings likely result in incorrect return values.
 * Use helper function psl_str_to_utf8lower() for normalization of @domain.
 *
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
 * psl_builtin().
 *
 * Returns: 1 if domain is a public suffix, 0 if not.
 * If @psl or @domain is %NULL, 1 is returned.
 *
 * Since: 0.1
 */
int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
{
	return (psl && domain) ? is_public_suffix(psl, domain, PSL_TYPE_ANY) : 1;
}
/**
 * psl_is_public_suffix2:
 * @psl: PSL context
 * @domain: Domain string
 * @type: Domain type
 *
 * This function checks if @domain is a public suffix by the means of the
 * [Mozilla Public Suffix List](https://publicsuffix.org).
 *
 * @type specifies the PSL section where to perform the lookup. Valid values are
 * %PSL_TYPE_PRIVATE, %PSL_TYPE_ICANN, %PSL_TYPE_NO_STAR_RULE, and %PSL_TYPE_ANY.
 *
 * %PSL_TYPE_NO_STAR_RULE switches off the 'prevailing star rule' (see
 * [List](https://publicsuffix.org/list) under 'Algorithm' 2.).
 * Applying the flag means that TLDs not explicitly listed in the PSL are *not* treated as public suffixes.
 *
 * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
 * Other encodings likely result in incorrect return values.
 * Use helper function psl_str_to_utf8lower() for normalization of @domain.
 *
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
 * psl_builtin().
 *
 * Returns: 1 if domain is a public suffix, 0 if not.
 * If @psl or @domain is %NULL, 1 is returned.
 *
 * Since: 0.1
 */
int psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type)
{
	if (!psl || !domain)
		return 1;

	return is_public_suffix(psl, domain, type);
}
/**
 * psl_unregistrable_domain:
 * @psl: PSL context
 * @domain: Domain string
 *
 * This function finds the longest public suffix part of @domain by the means
 * of the [Mozilla Public Suffix List](https://publicsuffix.org).
 *
 * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
 * Other encodings likely result in incorrect return values.
 * Use helper function psl_str_to_utf8lower() for normalization of @domain.
 *
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
 * psl_builtin().
 *
 * Returns: Pointer to longest public suffix part of @domain or %NULL if @domain
 * does not contain a public suffix (or if @psl or @domain is %NULL).
 * The returned pointer points into @domain.
 *
 * Since: 0.1
 */
const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain)
{
	int nlabels = 0;
	size_t pos;

	if (!psl || !domain)
		return NULL;

	/*
	 * In the main loop we introduce a O(N^2) behavior to avoid code duplication.
	 * To avoid nasty CPU hogging, only the part to the right of the ninth dot
	 * (counted from the end) is examined.
	 * Scanning by index keeps the loop well-defined for an empty string.
	 */
	for (pos = strlen(domain); pos > 0; pos--) {
		if (domain[pos - 1] == '.' && ++nlabels > 8) {
			domain += pos;
			break;
		}
	}

	/*
	 * We check from left to right to catch special PSL entries like 'forgot.his.name':
	 *   'forgot.his.name' and 'name' are in the PSL while 'his.name' is not.
	 */
	while (!is_public_suffix(psl, domain, 0)) {
		if ((domain = strchr(domain, '.')))
			domain++;
		else
			break; /* prevent endless loop if is_public_suffix() is broken. */
	}

	return domain;
}
/**
 * psl_registrable_domain:
 * @psl: PSL context
 * @domain: Domain string
 *
 * This function finds the shortest private suffix part of @domain by the means
 * of the [Mozilla Public Suffix List](https://publicsuffix.org).
 *
 * International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
 * Other encodings likely result in incorrect return values.
 * Use helper function psl_str_to_utf8lower() for normalization of @domain.
 *
 * @psl is a context returned by either psl_load_file(), psl_load_fp() or
 * psl_builtin().
 *
 * Returns: Pointer to shortest private suffix part of @domain or %NULL if @domain
 * does not contain a private suffix (or if @psl or @domain is %NULL, or if
 * @domain starts with a dot). The returned pointer points into @domain.
 *
 * Since: 0.1
 */
const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
{
	const char *p, *regdom = NULL;
	int nlabels = 0;
	size_t pos;

	if (!psl || !domain || *domain == '.')
		return NULL;

	/*
	 * In the main loop we introduce a O(N^2) behavior to avoid code duplication.
	 * To avoid nasty CPU hogging, only the part to the right of the ninth dot
	 * (counted from the end) is examined.
	 * Scanning by index keeps the loop well-defined for an empty string.
	 */
	for (pos = strlen(domain); pos > 0; pos--) {
		if (domain[pos - 1] == '.' && ++nlabels > 8) {
			domain += pos;
			break;
		}
	}

	/*
	 * We check from left to right to catch special PSL entries like 'forgot.his.name':
	 *   'forgot.his.name' and 'name' are in the PSL while 'his.name' is not.
	 */
	while (!is_public_suffix(psl, domain, 0)) {
		if ((p = strchr(domain, '.'))) {
			regdom = domain; /* last candidate that is not a public suffix itself */
			domain = p + 1;
		} else
			break; /* prevent endless loop if is_public_suffix() is broken. */
	}

	return regdom;
}
/**
* psl_load_file:
* @fname: Name of PSL file
*
* This function loads the public suffixes file named @fname.
* To free the allocated resources, call psl_free().
*
2016-11-29 14:49:35 +01:00
* The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
*
* Returns: Pointer to a PSL context or %NULL on failure.
*
* Since: 0.1
*/
2014-03-20 22:43:04 +01:00
psl_ctx_t *psl_load_file(const char *fname)
2014-03-22 14:28:55 +01:00
{
FILE *fp;
psl_ctx_t *psl = NULL;
if (!fname)
return NULL;
2018-10-05 20:47:26 +02:00
if ((fp = fopen(fname, "rb"))) {
2014-03-22 14:28:55 +01:00
psl = psl_load_fp(fp);
fclose(fp);
}
return psl;
}
/**
* psl_load_fp:
* @fp: %FILE pointer
*
* This function loads the public suffixes from a %FILE pointer.
* To free the allocated resources, call psl_free().
*
2016-11-29 14:49:35 +01:00
* The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
*
* Returns: Pointer to a PSL context or %NULL on failure.
*
* Since: 0.1
*/
2014-03-22 14:28:55 +01:00
psl_ctx_t *psl_load_fp(FILE *fp)
2014-03-20 22:43:04 +01:00
{
psl_ctx_t *psl;
psl_entry_t suffix, *suffixp;
char buf[256], *linep, *p;
2016-07-13 11:14:18 +02:00
int type = 0, is_dafsa;
psl_idna_t *idna;
2014-03-20 22:43:04 +01:00
2014-03-22 14:28:55 +01:00
if (!fp)
2014-03-20 22:43:04 +01:00
return NULL;
2014-03-22 14:28:55 +01:00
if (!(psl = calloc(1, sizeof(psl_ctx_t))))
return NULL;
2014-03-20 22:43:04 +01:00
/* read first line to allow ASCII / DAFSA detection */
2016-07-13 11:14:18 +02:00
if (!(linep = fgets(buf, sizeof(buf) - 1, fp)))
goto fail;
2016-07-13 11:14:18 +02:00
is_dafsa = strlen(buf) == 16 && !strncmp(buf, ".DAFSA@PSL_", 11);
2016-07-13 11:14:18 +02:00
if (is_dafsa) {
void *m;
2016-07-13 11:14:18 +02:00
size_t size = 65536, n, len = 0;
int version = atoi(buf + 11);
if (version != 0)
goto fail;
if (!(psl->dafsa = malloc(size)))
goto fail;
memcpy(psl->dafsa, buf, len);
while ((n = fread(psl->dafsa + len, 1, size - len, fp)) > 0) {
len += n;
if (len >= size) {
if (!(m = realloc(psl->dafsa, size *= 2)))
goto fail;
psl->dafsa = m;
}
}
/* release unused memory */
if ((m = realloc(psl->dafsa, len)))
psl->dafsa = m;
else if (!len)
2017-07-20 11:35:31 +02:00
psl->dafsa = NULL; /* realloc() just free'd psl->dafsa */
2016-07-13 11:14:18 +02:00
psl->dafsa_size = len;
psl->utf8 = !!GetUtfMode(psl->dafsa, len);
2016-07-13 11:14:18 +02:00
return psl;
}
idna = psl_idna_open();
2014-06-17 17:14:02 +02:00
2014-05-12 12:20:59 +02:00
/*
2016-07-05 17:49:14 +02:00
* as of 02.11.2012, the list at https://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions.
* as of 19.02.2014, the list at https://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions.
2018-10-07 21:15:07 +02:00
* as of 07.10.2018, the list at https://publicsuffix.org/list/ contains ~8600 rules and 8 exceptions.
2014-05-12 12:20:59 +02:00
*/
psl->suffixes = vector_alloc(8*1024, suffix_compare_array);
psl->utf8 = 1; /* we put UTF-8 and punycode rules in the lookup vector */
2014-03-22 14:28:55 +01:00
2016-07-13 11:14:18 +02:00
do {
while (isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
2014-05-12 12:20:59 +02:00
if (!*linep) continue; /* skip empty lines */
2014-03-22 14:28:55 +01:00
if (*linep == '/' && linep[1] == '/') {
if (!type) {
if (strstr(linep + 2, "===BEGIN ICANN DOMAINS==="))
type = PRIV_PSL_FLAG_ICANN;
else if (!type && strstr(linep + 2, "===BEGIN PRIVATE DOMAINS==="))
type = PRIV_PSL_FLAG_PRIVATE;
}
else if (type == PRIV_PSL_FLAG_ICANN && strstr(linep + 2, "===END ICANN DOMAINS==="))
type = 0;
else if (type == PRIV_PSL_FLAG_PRIVATE && strstr(linep + 2, "===END PRIVATE DOMAINS==="))
type = 0;
2014-05-12 12:20:59 +02:00
continue; /* skip comments */
}
2014-03-22 14:28:55 +01:00
2014-05-12 12:20:59 +02:00
/* parse suffix rule */
for (p = linep; *linep && !isspace_ascii(*linep);) linep++;
2014-03-22 14:28:55 +01:00
*linep = 0;
if (*p == '!') {
2015-09-19 10:50:00 +02:00
p++;
suffix.flags = PRIV_PSL_FLAG_EXCEPTION | type;
2015-09-19 10:50:00 +02:00
psl->nexceptions++;
} else if (*p == '*') {
if (*++p != '.') {
2018-10-13 22:37:00 +02:00
/* fprintf(stderr, "Unsupported kind of rule (ignored): %s\n", p - 1); */
2015-09-19 10:50:00 +02:00
continue;
2014-06-17 17:14:02 +02:00
}
2015-09-19 10:50:00 +02:00
p++;
/* wildcard *.foo.bar implicitly make foo.bar a public suffix */
suffix.flags = PRIV_PSL_FLAG_WILDCARD | PRIV_PSL_FLAG_PLAIN | type;
psl->nwildcards++;
2015-09-19 10:50:00 +02:00
psl->nsuffixes++;
2014-03-22 14:28:55 +01:00
} else {
suffix.flags = PRIV_PSL_FLAG_PLAIN | type;
2015-09-19 10:50:00 +02:00
psl->nsuffixes++;
}
if (suffix_init(&suffix, p, linep - p) == 0) {
2015-09-19 10:50:00 +02:00
int index;
if ((index = vector_find(psl->suffixes, &suffix)) >= 0) {
2015-09-19 10:50:00 +02:00
/* Found existing entry:
* Combination of exception and plain rule is ambiguous
2015-09-19 10:50:00 +02:00
* !foo.bar
* foo.bar
*
* Allowed:
* !foo.bar + *.foo.bar
* foo.bar + *.foo.bar
*
* We do not check here, let's do it later.
*/
suffixp = vector_get(psl->suffixes, index);
2015-09-19 10:50:00 +02:00
suffixp->flags |= suffix.flags;
} else {
/* New entry */
suffixp = vector_get(psl->suffixes, vector_add(psl->suffixes, &suffix));
2015-09-19 10:50:00 +02:00
}
if (suffixp) {
suffixp->label = suffixp->label_buf; /* set label to changed address */
add_punycode_if_needed(idna, psl->suffixes, suffixp);
}
2014-03-20 22:43:04 +01:00
}
2016-07-13 11:14:18 +02:00
} while ((linep = fgets(buf, sizeof(buf), fp)));
2014-03-20 22:43:04 +01:00
vector_sort(psl->suffixes);
2014-03-22 14:28:55 +01:00
psl_idna_close(idna);
2014-06-17 17:14:02 +02:00
2014-03-20 22:43:04 +01:00
return psl;
fail:
psl_free(psl);
return NULL;
2014-03-20 22:43:04 +01:00
}
/**
 * psl_free:
 * @psl: PSL context pointer
 *
 * This function frees the PSL context that has been retrieved via
 * psl_load_fp() or psl_load_file().
 *
 * Since: 0.1
 */
void psl_free(psl_ctx_t *psl)
{
	/* nothing to release for %NULL; the built-in context is left untouched */
	if (!psl || psl == &builtin_psl)
		return;

	vector_free(&psl->suffixes);
	free(psl->dafsa);
	free(psl);
}
/**
 * psl_builtin:
 *
 * This function returns the PSL context that has been generated and built in at compile-time.
 * You don't have to free the returned context explicitly.
 *
 * The builtin data also contains punycode entries, one for each international domain name.
 *
 * If the generation of built-in data has been disabled during compilation, %NULL will be returned.
 * When using the builtin psl context, you can provide UTF-8 (lowercase + NFKC) or ASCII/ACE (punycode)
 * representations of domains to functions like psl_is_public_suffix().
 *
 * Returns: Pointer to the built in PSL data or %NULL if this data is not available.
 *
 * Since: 0.1
 */
const psl_ctx_t *psl_builtin(void)
{
	const psl_ctx_t *ctx = NULL;

	/* the built-in context is only handed out if one of the generators was enabled */
#if defined(BUILTIN_GENERATOR_LIBICU) || defined(BUILTIN_GENERATOR_LIBIDN2) || defined(BUILTIN_GENERATOR_LIBIDN)
	ctx = &builtin_psl;
#endif

	return ctx;
}
/**
* psl_suffix_count:
* @psl: PSL context pointer
*
* This function returns number of public suffixes maintained by @psl.
* The number of exceptions within the Public Suffix List are not included.
*
* If the information is not available, the return value is -1 (since 0.19).
* This is the case with DAFSA blobs or if @psl is %NULL.
*
* Returns: Number of public suffixes entries in PSL context or -1 if this information is not available.
*
* Since: 0.1
*/
int psl_suffix_count(const psl_ctx_t *psl)
{
if (psl == &builtin_psl)
2015-09-19 10:50:00 +02:00
return _psl_nsuffixes;
else if (psl)
return psl->dafsa ? -1 : psl->nsuffixes;
2014-03-29 18:58:24 +01:00
else
return -1;
}
2014-03-22 20:35:56 +01:00
/**
* psl_suffix_exception_count:
* @psl: PSL context pointer
*
* This function returns number of public suffix exceptions maintained by @psl.
*
* If the information is not available, the return value is -1 (since 0.19).
* This is the case with DAFSA blobs or if @psl is %NULL.
*
* Returns: Number of public suffix exceptions in PSL context or -1 if this information is not available.
*
* Since: 0.1
*/
int psl_suffix_exception_count(const psl_ctx_t *psl)
{
if (psl == &builtin_psl)
2015-09-19 10:50:00 +02:00
return _psl_nexceptions;
else if (psl)
return psl->dafsa ? -1 : psl->nexceptions;
2014-03-29 18:58:24 +01:00
else
return -1;
}
/**
 * psl_suffix_wildcard_count:
 * @psl: PSL context pointer
 *
 * This function returns number of public suffix wildcards maintained by @psl.
 *
 * If the information is not available, the return value is -1 (since 0.19).
 * This is the case with DAFSA blobs or if @psl is %NULL.
 *
 * Returns: Number of public suffix wildcards in PSL context or -1 if this information is not available.
 *
 * Since: 0.10.0
 */
int psl_suffix_wildcard_count(const psl_ctx_t *psl)
{
	if (!psl)
		return -1;

	/* the built-in data keeps its count in a separate variable */
	if (psl == &builtin_psl)
		return _psl_nwildcards;

	/* contexts holding a DAFSA blob have no count */
	if (psl->dafsa)
		return -1;

	return psl->nwildcards;
}
/**
* psl_builtin_file_time:
*
2016-12-05 15:01:27 +01:00
* This function returns the mtime of the Public Suffix List file that has been built in.
*
* If the generation of built-in data has been disabled during compilation, 0 will be returned.
*
* Returns: time_t value or 0.
*
* Since: 0.1
*/
2014-03-24 17:29:56 +01:00
time_t psl_builtin_file_time(void)
{
return _psl_file_time;
}
/**
* psl_builtin_sha1sum:
*
2016-12-05 15:01:27 +01:00
* This function returns the SHA1 checksum of the Public Suffix List file that has been built in.
* The returned string is in lowercase hex encoding, e.g. "2af1e9e3044eda0678bb05949d7cca2f769901d8".
*
* If the generation of built-in data has been disabled during compilation, an empty string will be returned.
*
* Returns: String containing SHA1 checksum or an empty string.
*
* Since: 0.1
*/
2014-03-24 17:29:56 +01:00
const char *psl_builtin_sha1sum(void)
{
return _psl_sha1_checksum;
2014-03-20 22:43:04 +01:00
}
2014-04-17 12:31:06 +02:00
/**
* psl_builtin_filename:
*
2016-12-05 15:01:27 +01:00
* This function returns the file name of the Public Suffix List file that has been built in.
2014-04-17 12:31:06 +02:00
*
* If the generation of built-in data has been disabled during compilation, an empty string will be returned.
*
* Returns: String containing the PSL file name or an empty string.
*
* Since: 0.1
*/
const char *psl_builtin_filename(void)
{
return _psl_filename;
}
/**
* psl_builtin_outdated:
*
* This function checks if the built-in data is older than the file it has been created from.
* If it is, it might be a good idea for the application to reload the PSL.
* The mtime is taken as reference.
*
* If the PSL file does not exist, it is assumed that the built-in data is not outdated.
*
* Returns: 1 if the built-in is outdated, 0 otherwise.
*
* Since: 0.10.0
*/
int psl_builtin_outdated(void)
{
struct stat st;
2015-12-12 04:59:15 +01:00
if (stat(_psl_filename, &st) == 0 && st.st_mtime > _psl_file_time)
2015-09-27 19:14:13 +02:00
return 1;
2015-09-27 19:14:13 +02:00
return 0;
}
/**
 * psl_dist_filename:
 *
 * This function returns the file name of the distribution/system PSL data file.
 * This file will be considered by psl_latest().
 *
 * Return the filename that is set by ./configure --with-psl-distfile, or an empty string.
 *
 * Returns: String containing a PSL file name or an empty string.
 *
 * Since: 0.16
 */
const char *psl_dist_filename(void)
{
	const char *filename = _psl_dist_filename;

	return filename;
}
/**
* psl_get_version:
*
* Get libpsl version.
*
* Returns: String containing version of libpsl.
*
* Since: 0.2.5
**/
2014-08-14 11:05:47 +02:00
const char *psl_get_version(void)
{
2014-06-17 17:14:02 +02:00
#ifdef WITH_LIBICU
return PACKAGE_VERSION " (+libicu/" U_ICU_VERSION ")";
#elif defined(WITH_LIBIDN2)
return PACKAGE_VERSION " (+libidn2/" IDN2_VERSION ")";
#elif defined(WITH_LIBIDN)
return PACKAGE_VERSION " (+libidn/" STRINGPREP_VERSION ")";
2014-06-17 17:14:02 +02:00
#else
return PACKAGE_VERSION " (no IDNA support)";
2014-06-17 17:14:02 +02:00
#endif
}
/**
 * psl_check_version_number:
 * @version: Version number (hex) to check against.
 *
 * Check the given version number is at minimum the current library version number.
 * The version number must be a hexadecimal number like 0x000a01 (V0.10.1).
 *
 * Returns: Returns the library version number if the given version number is at least
 * the version of the library, else return 0; If the argument is 0, the function returns
 * the library version number without performing a check.
 *
 * Since: 0.11.0
 **/
int psl_check_version_number(int version)
{
	int major, minor, patch;

	/* 0 means: just report the library version */
	if (version == 0)
		return PSL_VERSION_NUMBER;

	major = version >> 16;
	minor = (version >> 8) & 0xFF;
	patch = version & 0xFF;

	/* compare major, then minor, then patch; the first differing part decides */
	if (major != PSL_VERSION_MAJOR)
		return major < PSL_VERSION_MAJOR ? 0 : PSL_VERSION_NUMBER;

	if (minor != PSL_VERSION_MINOR)
		return minor < PSL_VERSION_MINOR ? 0 : PSL_VERSION_NUMBER;

	return patch < PSL_VERSION_PATCH ? 0 : PSL_VERSION_NUMBER;
}
/* return whether hostname is an IP address (IPv4 or IPv6 literal) or not */
static int isip(const char *hostname)
{
#ifdef _WIN32
	WCHAR wName[INET6_ADDRSTRLEN+1];

	struct sockaddr_in  addr  = {0};
	struct sockaddr_in6 addr6 = {0};

	INT size  = sizeof(addr);
	INT size6 = sizeof(addr6);

	/* the address parser below takes a wide string; a failed conversion counts as "not an IP" */
	if (!MultiByteToWideChar(CP_UTF8, 0, hostname, -1, wName, countof(wName)))
		return 0;

	return (WSAStringToAddressW(wName, AF_INET,  NULL, (struct sockaddr *)&addr,  &size) != SOCKET_ERROR) |
	       (WSAStringToAddressW(wName, AF_INET6, NULL, (struct sockaddr *)&addr6, &size6) != SOCKET_ERROR);
#else
	struct in_addr addr;
	struct in6_addr addr6;

	/*
	 * inet_pton() returns 1 on success, 0 if the string is not a valid
	 * address and -1 if the address family is not supported.
	 * Only 1 means "is an IP address", so -1 must not be taken as truthy.
	 */
	return inet_pton(AF_INET, hostname, &addr) == 1
		|| inet_pton(AF_INET6, hostname, &addr6) == 1;
#endif
}
/**
 * psl_is_cookie_domain_acceptable:
 * @psl: PSL context pointer
 * @hostname: The request hostname.
 * @cookie_domain: The domain value from a cookie
 *
 * This helper function checks whether @cookie_domain is an acceptable cookie domain value for the request
 * @hostname.
 *
 * For international domain names both, @hostname and @cookie_domain, have to be either in UTF-8 (lowercase + NFKC)
 * or in ASCII/ACE (punycode) format. Other encodings or mixing UTF-8 and punycode likely result in incorrect return values.
 *
 * Use helper function psl_str_to_utf8lower() for normalization of @hostname and @cookie_domain.
 *
 * Examples:
 * 1. Cookie domain 'example.com' would be acceptable for hostname 'www.example.com',
 * but '.com' or 'com' would NOT be acceptable since 'com' is a public suffix.
 *
 * 2. Cookie domain 'his.name' would be acceptable for hostname 'remember.his.name',
 *  but NOT for 'forgot.his.name' since 'forgot.his.name' is a public suffix.
 *
 * Returns: 1 if acceptable, 0 if not acceptable.
 *
 * Since: 0.1
 */
int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, const char *cookie_domain)
{
	const char *tail, *public_suffix;
	size_t host_len, cookie_len;

	if (!psl || !hostname || !cookie_domain)
		return 0;

	/* leading dots of the cookie domain are ignored */
	cookie_domain += strspn(cookie_domain, ".");

	if (strcmp(hostname, cookie_domain) == 0)
		return 1; /* an exact match is acceptable (and pretty common) */

	if (isip(hostname))
		return 0; /* Hostname is an IP address and these must match fully (RFC 6265, 5.1.3) */

	cookie_len = strlen(cookie_domain);
	host_len = strlen(hostname);

	if (cookie_len >= host_len)
		return 0; /* cookie_domain is too long */

	/* cookie_domain has to be the tail of hostname, starting right behind a dot */
	tail = hostname + (host_len - cookie_len);
	if (tail[-1] != '.' || strcmp(tail, cookie_domain) != 0)
		return 0;

	/* OK, cookie_domain matches, but it must be longer than the longest public suffix in 'hostname' */
	public_suffix = psl_unregistrable_domain(psl, hostname);
	if (!public_suffix)
		return 1;

	return cookie_len > strlen(public_suffix);
}
/**
 * psl_free_string:
 * @str: pointer to lowercase string returned by psl_str_to_utf8lower()
 *
 * This function free()'s the memory allocated by psl_str_to_utf8lower() when
 * returning a lowercase string. Passing %NULL is allowed and does nothing.
 *
 * Since: 0.19
 */
void psl_free_string(char *str)
{
	/* free(NULL) is a no-op, so no explicit NULL check is needed */
	free(str);
}
/**
* psl_str_to_utf8lower:
* @str: string to convert
* @encoding: charset encoding of @str, e.g. 'iso-8859-1' or %NULL
* @locale: locale of @str for to lowercase conversion, e.g. 'de' or %NULL
* @lower: return value containing the converted string
*
2016-11-29 14:49:35 +01:00
* This helper function converts a string to UTF-8 lowercase + NFKC representation.
* Lowercase + NFKC UTF-8 is needed as input to the domain checking functions.
2014-06-17 17:14:02 +02:00
*
* @lower stays unchanged on error.
2014-06-23 12:56:13 +02:00
*
* When returning PSL_SUCCESS, the return value 'lower' must be freed after usage.
2014-06-17 17:14:02 +02:00
*
2014-06-20 12:36:51 +02:00
* Returns: psl_error_t value.
* PSL_SUCCESS: Success
* PSL_ERR_INVALID_ARG: @str is a %NULL value.
* PSL_ERR_CONVERTER: Failed to open the unicode converter with name @encoding
* PSL_ERR_TO_UTF16: Failed to convert @str to unicode
* PSL_ERR_TO_LOWER: Failed to convert unicode to lowercase
* PSL_ERR_TO_UTF8: Failed to convert unicode to UTF-8
* PSL_ERR_NO_MEM: Failed to allocate memory
2014-06-17 17:14:02 +02:00
*
* Since: 0.4
*/
2018-10-29 11:53:41 +01:00
psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding PSL_UNUSED, const char *locale PSL_UNUSED, char **lower)
2014-06-17 17:14:02 +02:00
{
2014-06-20 12:36:51 +02:00
int ret = PSL_ERR_INVALID_ARG;
2014-06-17 17:14:02 +02:00
if (!str)
2014-06-20 12:36:51 +02:00
return PSL_ERR_INVALID_ARG;
/* shortcut to avoid costly conversion */
if (str_is_ascii(str)) {
if (lower) {
char *p, *tmp;
if (!(tmp = strdup(str)))
return PSL_ERR_NO_MEM;
*lower = tmp;
/* convert ASCII string to lowercase */
for (p = *lower; *p; p++)
if (isupper(*p))
*p = tolower(*p);
}
2014-06-20 12:36:51 +02:00
return PSL_SUCCESS;
}
2014-06-17 17:14:02 +02:00
#ifdef WITH_LIBICU
2014-06-18 16:27:29 +02:00
do {
2014-06-17 17:14:02 +02:00
size_t str_length = strlen(str);
UErrorCode status = 0;
UChar *utf16_dst, *utf16_lower;
int32_t utf16_dst_length;
char *utf8_lower;
UConverter *uconv;
if (str_length < 256) {
/* C89 allocation */
utf16_dst = alloca(sizeof(UChar) * (str_length * 2 + 1));
utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
utf8_lower = alloca(str_length * 6 + 1);
} else {
utf16_dst = malloc(sizeof(UChar) * (str_length * 2 + 1));
utf16_lower = malloc(sizeof(UChar) * (str_length * 2 + 1));
utf8_lower = malloc(str_length * 6 + 1);
if (!utf16_dst || !utf16_lower || !utf8_lower) {
ret = PSL_ERR_NO_MEM;
goto out;
}
}
2014-06-17 17:14:02 +02:00
uconv = ucnv_open(encoding, &status);
if (U_SUCCESS(status)) {
utf16_dst_length = ucnv_toUChars(uconv, utf16_dst, str_length * 2 + 1, str, str_length, &status);
ucnv_close(uconv);
if (U_SUCCESS(status)) {
int32_t utf16_lower_length = u_strToLower(utf16_lower, str_length * 2 + 1, utf16_dst, utf16_dst_length, locale, &status);
if (U_SUCCESS(status)) {
u_strToUTF8(utf8_lower, str_length * 6 + 1, NULL, utf16_lower, utf16_lower_length, &status);
2014-06-17 17:14:02 +02:00
if (U_SUCCESS(status)) {
ret = PSL_SUCCESS;
if (lower) {
char *tmp = strdup(utf8_lower);
if (tmp)
*lower = tmp;
else
ret = PSL_ERR_NO_MEM;
}
2014-06-17 17:14:02 +02:00
} else {
2014-06-20 12:36:51 +02:00
ret = PSL_ERR_TO_UTF8;
/* fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
2014-06-17 17:14:02 +02:00
}
} else {
2014-06-20 12:36:51 +02:00
ret = PSL_ERR_TO_LOWER;
/* fprintf(stderr, "Failed to convert UTF-16 to lowercase (status %d)\n", status); */
2014-06-17 17:14:02 +02:00
}
} else {
2014-06-20 12:36:51 +02:00
ret = PSL_ERR_TO_UTF16;
/* fprintf(stderr, "Failed to convert string to UTF-16 (status %d)\n", status); */
2014-06-17 17:14:02 +02:00
}
} else {
2014-06-20 12:36:51 +02:00
ret = PSL_ERR_CONVERTER;
/* fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status); */
2014-06-17 17:14:02 +02:00
}
out:
if (str_length >= 256) {
free(utf16_dst);
free(utf16_lower);
free(utf8_lower);
}
2014-06-18 16:27:29 +02:00
} while (0);
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
do {
/* find out local charset encoding */
if (!encoding) {
2018-04-21 11:30:22 +02:00
#ifdef HAVE_NL_LANGINFO
encoding = nl_langinfo(CODESET);
2018-04-21 11:30:22 +02:00
#elif defined _WIN32
static char buf[16];
snprintf(buf, sizeof(buf), "CP%u", GetACP());
encoding = buf;
#endif
if (!encoding || !*encoding)
encoding = "ASCII";
}
/* convert to UTF-8 */
if (strcasecmp(encoding, "utf-8")) {
iconv_t cd = iconv_open("utf-8", encoding);
if (cd != (iconv_t)-1) {
char *tmp = (char *)str; /* iconv won't change where str points to, but changes tmp itself */
size_t tmp_len = strlen(str) + 1;
size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
char *dst = malloc(dst_len + 1), *dst_tmp = dst;
if (!dst) {
ret = PSL_ERR_NO_MEM;
}
2018-04-21 11:30:22 +02:00
else if (iconv(cd, (WINICONV_CONST char **)&tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1
&& iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) != (size_t)-1)
{
/* start size for u8_tolower internal memory allocation.
* u8_tolower() does not terminate the result string. we have 0 byte included in above tmp_len
* and thus in len. */
size_t len = dst_len - dst_len_tmp;
if ((tmp = (char *)u8_tolower((uint8_t *)dst, len, 0, UNINORM_NFKC, NULL, &len))) {
ret = PSL_SUCCESS;
if (lower) {
*lower = tmp;
tmp = NULL;
} else
free(tmp);
} else {
ret = PSL_ERR_TO_LOWER;
/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
}
} else {
ret = PSL_ERR_TO_UTF8;
/* fprintf(stderr, "Failed to convert '%s' string into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
}
free(dst);
iconv_close(cd);
} else {
ret = PSL_ERR_TO_UTF8;
/* fprintf(stderr, "Failed to prepare encoding '%s' into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
}
} else {
/* we need a conversion to lowercase */
uint8_t *tmp;
/* start size for u8_tolower internal memory allocation.
* u8_tolower() does not terminate the result string, so include terminating 0 byte in len. */
size_t len = u8_strlen((uint8_t *)str) + 1;
if ((tmp = u8_tolower((uint8_t *)str, len, 0, UNINORM_NFKC, NULL, &len))) {
ret = PSL_SUCCESS;
if (lower) {
2016-09-26 15:15:34 +02:00
*lower = (char*)tmp;
tmp = NULL;
} else
free(tmp);
} else {
ret = PSL_ERR_TO_LOWER;
/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
}
}
} while (0);
2014-06-17 17:14:02 +02:00
#endif
return ret;
}
/*
 * If the file is newer than the builtin data, insert its name and mtime into
 * the arrays, keeping them sorted by mtime in descending order (latest first).
 * n is the number of entries already stored; the new number is returned.
 */
static int insert_file(const char *fname, const char **psl_fname, time_t *psl_mtime, int n)
{
	struct stat st;
	int pos;

	if (!fname || !*fname)
		return n;

	if (stat(fname, &st) != 0 || st.st_mtime <= _psl_file_time)
		return n;

	/* shift all entries that are older than the new one by one slot towards the end */
	for (pos = n; pos > 0 && st.st_mtime > psl_mtime[pos - 1]; pos--) {
		psl_fname[pos] = psl_fname[pos - 1];
		psl_mtime[pos] = psl_mtime[pos - 1];
	}

	/* put the new entry into the gap */
	psl_fname[pos] = fname;
	psl_mtime[pos] = st.st_mtime;

	return n + 1;
}
/**
* psl_latest:
* @fname: Name of PSL file or %NULL
*
* This function loads the the latest available PSL data from either
* - @fname (application specific filename, may be %NULL)
* - location specified during built-time (filename from ./configure --with-psl-distfile)
* - built-in PSL data (generated from ./configure --with-psl-file)
* - location of built-in data (filename from ./configure --with-psl-file)
*
* If none of the above is available, the function returns %NULL.
*
* To free the allocated resources, call psl_free().
*
* Returns: Pointer to a PSL context or %NULL on failure.
*
* Since: 0.16
*/
psl_ctx_t *psl_latest(const char *fname)
{
psl_ctx_t *psl;
const char *psl_fname[3];
time_t psl_mtime[3];
int it, ntimes;
psl_fname[0] = NULL; /* silence gcc 6.2 false warning */
/* create array of PSL files reverse sorted by mtime (latest first) */
ntimes = insert_file(fname, psl_fname, psl_mtime, 0);
ntimes = insert_file(_psl_dist_filename, psl_fname, psl_mtime, ntimes);
ntimes = insert_file(_psl_filename, psl_fname, psl_mtime, ntimes);
/* load PSL data from the latest file, falling back to the second recent, ... */
for (psl = NULL, it = 0; it < ntimes; it++) {
if (psl_mtime[it] > _psl_file_time)
if ((psl = psl_load_file(psl_fname[it])))
break;
}
/* if file loading failed or there is no file newer than the builtin data,
* then return the builtin data. */
return psl ? psl : (psl_ctx_t *) psl_builtin();
}