/*
 * Copyright(c) 2014 Tim Ruehsen
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * This file is part of libpsl.
 *
 * Public Suffix List routines
 *
 * Changelog
 * 19.03.2014  Tim Ruehsen  created from libmget/cookie.c
 *
 */
2014-03-21 11:05:09 +01:00
// need _GNU_SOURCE for qsort_r()
#ifndef _GNU_SOURCE
# define _GNU_SOURCE
#endif
2014-03-20 22:43:04 +01:00
#if HAVE_CONFIG_H
# include <config.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <libpsl.h>
#define countof(a) (sizeof(a)/sizeof(*(a)))
2014-03-24 17:29:56 +01:00
// an invalid pointer
#define _PSL_INTERNAL 1
2014-03-20 22:43:04 +01:00
// One suffix rule. The same structure is also used as the search key
// that psl_is_public() builds on the stack for a lookup.
// Field order is kept exactly as before, in case the included table file
// relies on positional initializers.
typedef struct {
	char label_buf[48];     // storage for the (lowercased) rule text, 0-terminated
	const char *label;      // string used for comparisons; points at label_buf
	                        // for stored rules, at the caller's domain for search keys
	unsigned short length;  // strlen() of the text that 'label' points to
	unsigned char nlabels;  // number of labels
	unsigned char wildcard; // this is a wildcard rule (e.g. *.sapporo.jp)
} _psl_entry_t;
// stripped down version libmget vector routines
// Growable array of heap-allocated rule entries plus the ordering
// function used for sorting and searching it.
typedef struct {
	int (*cmp)(const _psl_entry_t *, const _psl_entry_t *); // comparison function
	_psl_entry_t **entry; // pointer to array of pointers to elements
	int max;              // allocated elements
	int cur;              // number of elements in use
} _psl_vector_t;
// A loaded Public Suffix List: the regular rules and the exception rules
// are kept in two separate vectors.
struct _psl_ctx_st {
	_psl_vector_t *suffixes;          // normal and wildcard rules
	_psl_vector_t *suffix_exceptions; // rules that were prefixed with '!'
};
2014-03-24 17:29:56 +01:00
#include "suffixes.c"
// Sentinel context: only its address is compared against. A psl_ctx_t
// pointer equal to &_builtin_psl makes the functions below use the
// compiled-in tables instead of loaded vectors.
static psl_ctx_t _builtin_psl;
2014-03-20 22:43:04 +01:00
// Creates an empty vector with room for 'max' entry pointers and the
// given ordering function.
// Returns NULL if either allocation fails.
static _psl_vector_t *_vector_alloc(int max, int (*cmp)(const _psl_entry_t *, const _psl_entry_t *))
{
	_psl_vector_t *vec = calloc(1, sizeof(_psl_vector_t));

	if (vec == NULL)
		return NULL;

	vec->entry = malloc(max * sizeof(_psl_entry_t *));
	if (vec->entry == NULL) {
		free(vec);
		return NULL;
	}

	// 'cur' is already 0 thanks to calloc()
	vec->cmp = cmp;
	vec->max = max;

	return vec;
}
// Releases every stored entry, the pointer array and the vector itself,
// then resets the caller's pointer to NULL so it cannot be used or freed
// again by accident.
// Accepts v == NULL and *v == NULL (both are no-ops).
static void _vector_free(_psl_vector_t **v)
{
	if (v && *v) {
		if ((*v)->entry) {
			int it;

			for (it = 0; it < (*v)->cur; it++)
				free((*v)->entry[it]);

			free((*v)->entry);
		}

		free(*v);
		*v = NULL; // the double pointer exists so we can clear the caller's reference
	}
}
// Returns the entry stored at index 'pos', or NULL when the vector is
// NULL or 'pos' is outside [0, cur).
static _psl_entry_t *_vector_get(const _psl_vector_t *v, int pos)
{
	if (!v)
		return NULL;

	if (pos < 0 || pos >= v->cur)
		return NULL;

	return v->entry[pos];
}
// Binary search for an entry that compares equal to 'elem' using the
// vector's own comparison function. The entries must already be sorted
// by that same function (see _vector_sort()).
// Returns the index of a matching entry, or -1 if there is none or 'v' is NULL.
static int _vector_find(const _psl_vector_t *v, const _psl_entry_t *elem)
{
	int lo, hi;

	if (!v)
		return -1;

	lo = 0;
	hi = v->cur - 1;

	while (lo <= hi) {
		int mid = (lo + hi) / 2;
		int order = v->cmp(elem, v->entry[mid]);

		if (order == 0)
			return mid; // exact match

		if (order > 0)
			lo = mid + 1;
		else
			hi = mid - 1;
	}

	return -1; // not found
}
// Appends a heap-allocated copy of '*elem' to the vector, growing the
// pointer array when it is full.
// Returns the index of the new entry, or -1 if 'v' or 'elem' is NULL or
// memory could not be allocated (the vector is left unchanged in that case).
static int _vector_add(_psl_vector_t *v, const _psl_entry_t *elem)
{
	_psl_entry_t *copy;

	if (!v || !elem)
		return -1;

	if (v->cur >= v->max) {
		_psl_entry_t **grown;
		int new_max;

		if (v->max > 0) {
			if (v->max > (int)(((unsigned)-1) >> 2))
				return -1; // doubling would overflow int
			new_max = v->max * 2;
		} else {
			new_max = 8; // doubling 0 would never grow
		}

		// keep the old array if realloc() fails
		grown = realloc(v->entry, new_max * sizeof(*grown));
		if (!grown)
			return -1;

		v->entry = grown;
		v->max = new_max;
	}

	copy = malloc(sizeof(*copy));
	if (!copy)
		return -1;

	memcpy(copy, elem, sizeof(*copy));

	// a label that referred to the source's own buffer must refer to the
	// copy's buffer, otherwise it dangles once the source goes away
	if (elem->label == elem->label_buf)
		copy->label = copy->label_buf;

	v->entry[v->cur] = copy;

	return v->cur++;
}
static int _compare(const void *p1, const void *p2, void *v)
{
return ((_psl_vector_t *)v)->cmp(*((_psl_entry_t **)p1), *((_psl_entry_t **)p2));
}
// Sorts the entry pointers with the vector's comparison function so that
// _vector_find() can binary-search them.
// Does nothing for a NULL vector, a vector without comparison function,
// or fewer than two entries.
static void _vector_sort(_psl_vector_t *v)
{
	// element size is that of one array slot (a _psl_entry_t pointer)
	if (v && v->cmp && v->cur > 1)
		qsort_r(v->entry, v->cur, sizeof(*v->entry), _compare, v);
}
2014-03-22 20:35:56 +01:00
// Number of entries currently stored; a NULL vector counts as empty.
static inline int _vector_size(_psl_vector_t *v)
{
	if (!v)
		return 0;

	return v->cur;
}
2014-03-20 22:43:04 +01:00
// by this kind of sorting, we can easily see if a domain matches or not (match = supercookie !)
static int _suffix_compare(const _psl_entry_t *s1, const _psl_entry_t *s2)
{
int n;
if ((n = s2->nlabels - s1->nlabels))
return n; // most labels first
2014-03-22 20:35:56 +01:00
if ((n = s1->length - s2->length))
2014-03-20 22:43:04 +01:00
return n; // shorter rules first
return strcmp(s1->label, s2->label);
}
// Fills '*suffix' from the textual rule 'rule' of 'length' bytes.
//
// - A leading "*." marks a wildcard rule; it is stripped and not counted
//   in suffix->length. Any other use of '*' at the start is rejected.
// - The text is stored lowercased in suffix->label_buf and suffix->label
//   is pointed at that buffer.
// - suffix->nlabels is the number of dot-separated labels.
//
// A rejected rule (too long, or unsupported wildcard form) is reported on
// stderr and leaves suffix->nlabels == 0 with an empty label; callers can
// test nlabels to detect this.
static void _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length)
{
	const char *src;
	char *dst, *end;

	// start from a fully defined, empty entry so that a rejected rule
	// never carries uninitialized fields
	suffix->label_buf[0] = 0;
	suffix->label = suffix->label_buf;
	suffix->length = 0;
	suffix->nlabels = 0;
	suffix->wildcard = 0;

	if (length >= sizeof(suffix->label_buf) - 1) {
		fprintf(stderr, _("Suffix rule too long (%zu, ignored): %s\n"), length, rule);
		return;
	}

	if (*rule == '*') {
		if (*++rule != '.') {
			fprintf(stderr, _("Unsupported kind of rule (ignored): %s\n"), rule);
			return;
		}
		rule++;
		suffix->wildcard = 1;
		suffix->length = (unsigned short)(length - 2); // without the "*." prefix
	} else {
		suffix->length = (unsigned short)length;
	}

	suffix->nlabels = 1;

	// never write past the buffer, even if 'rule' is longer than 'length' claims
	end = suffix->label_buf + sizeof(suffix->label_buf) - 1;

	for (dst = suffix->label_buf, src = rule; *src && dst < end; src++) {
		if (*src == '.')
			suffix->nlabels++;

		// <ctype.h> functions need an unsigned char value
		*dst++ = (char)tolower((unsigned char)*src);
	}
	*dst = 0;
}
int psl_is_public(const psl_ctx_t *psl, const char *domain)
2014-03-20 22:43:04 +01:00
{
_psl_entry_t suffix, *rule;
const char *p, *label_bak;
unsigned short length_bak;
2014-03-22 20:35:56 +01:00
// this function should be called without leading dots, just make sure
2014-03-20 22:43:04 +01:00
suffix.label = domain + (*domain == '.');
suffix.length = strlen(suffix.label);
suffix.wildcard = 0;
suffix.nlabels = 1;
for (p = suffix.label; *p; p++)
if (*p == '.')
suffix.nlabels++;
2014-03-22 22:19:20 +01:00
// if domain has enough labels, it is public
2014-03-24 17:29:56 +01:00
if (psl == &_builtin_psl)
rule = &suffixes[0];
else
rule = _vector_get(psl->suffixes, 0);
2014-03-20 22:43:04 +01:00
if (!rule || rule->nlabels < suffix.nlabels - 1)
2014-03-22 22:19:20 +01:00
return 1;
2014-03-20 22:43:04 +01:00
2014-03-24 17:29:56 +01:00
if (psl == &_builtin_psl)
rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare);
else
rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix));
2014-03-20 22:43:04 +01:00
if (rule) {
// definitely a match, no matter if the found rule is a wildcard or not
2014-03-22 22:19:20 +01:00
return 0;
2014-03-20 22:43:04 +01:00
}
label_bak = suffix.label;
length_bak = suffix.length;
if ((suffix.label = strchr(suffix.label, '.'))) {
suffix.label++;
suffix.length = strlen(suffix.label);
suffix.nlabels--;
2014-03-24 17:29:56 +01:00
if (psl == &_builtin_psl)
rule = bsearch(&suffix, suffixes, countof(suffixes), sizeof(suffixes[0]), (int(*)(const void *, const void *))_suffix_compare);
else
rule = _vector_get(psl->suffixes, _vector_find(psl->suffixes, &suffix));
2014-03-20 22:43:04 +01:00
if (rule) {
if (rule->wildcard) {
// now that we matched a wildcard, we have to check for an exception
suffix.label = label_bak;
suffix.length = length_bak;
suffix.nlabels++;
2014-03-24 17:29:56 +01:00
if (psl == &_builtin_psl) {
if (bsearch(&suffix, suffix_exceptions, countof(suffix_exceptions), sizeof(suffix_exceptions[0]), (int(*)(const void *, const void *))_suffix_compare))
return 1; // found an exception, so 'domain' is public
} else {
if (_vector_get(psl->suffix_exceptions, _vector_find(psl->suffix_exceptions, &suffix)) != 0)
return 1; // found an exception, so 'domain' is public
}
2014-03-20 22:43:04 +01:00
2014-03-22 22:19:20 +01:00
return 0;
2014-03-20 22:43:04 +01:00
}
}
}
2014-03-22 22:19:20 +01:00
return 1;
2014-03-20 22:43:04 +01:00
}
2014-03-24 17:29:56 +01:00
// Prepares the compiled-in tables for use: every entry's 'label' pointer
// is set to the entry's own 'label_buf'.
// Always returns 0 (= OK).
int psl_global_init(void)
{
	size_t idx;

	for (idx = 0; idx < countof(suffixes); idx++) {
		suffixes[idx].label = suffixes[idx].label_buf;
	}

	for (idx = 0; idx < countof(suffix_exceptions); idx++) {
		suffix_exceptions[idx].label = suffix_exceptions[idx].label_buf;
	}

	return 0; // 0 = OK
}
// Counterpart of psl_global_init(). Currently has nothing to do:
// psl_global_init() only sets pointers into static tables and does not
// acquire anything that would need releasing.
void psl_global_deinit(void)
{
}
2014-03-20 22:43:04 +01:00
// Loads a Public Suffix List from the file named 'fname'.
// Returns a new context (to be released with psl_free()), or NULL if
// 'fname' is NULL, the file cannot be opened, or psl_load_fp() fails.
psl_ctx_t *psl_load_file(const char *fname)
{
	FILE *fp;
	psl_ctx_t *psl = NULL;

	if (!fname)
		return NULL; // fopen(NULL, ...) is undefined behavior

	if ((fp = fopen(fname, "r"))) {
		psl = psl_load_fp(fp);
		fclose(fp);
	}

	return psl;
}
psl_ctx_t *psl_load_fp(FILE *fp)
2014-03-20 22:43:04 +01:00
{
psl_ctx_t *psl;
_psl_entry_t suffix, *suffixp;
int nsuffixes = 0;
char buf[256], *linep, *p;
2014-03-20 22:43:04 +01:00
2014-03-22 14:28:55 +01:00
if (!fp)
2014-03-20 22:43:04 +01:00
return NULL;
2014-03-22 14:28:55 +01:00
if (!(psl = calloc(1, sizeof(psl_ctx_t))))
return NULL;
2014-03-20 22:43:04 +01:00
2014-03-22 14:28:55 +01:00
// as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions.
// as of 19.02.2014, the list at http://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions.
psl->suffixes = _vector_alloc(8*1024, _suffix_compare);
psl->suffix_exceptions = _vector_alloc(64, _suffix_compare);
while ((linep = fgets(buf, sizeof(buf), fp))) {
while (isspace(*linep)) linep++; // ignore leading whitespace
if (!*linep) continue; // skip empty lines
if (*linep == '/' && linep[1] == '/')
continue; // skip comments
// parse suffix rule
for (p = linep; *linep && !isspace(*linep);) linep++;
*linep = 0;
if (*p == '!') {
// add to exceptions
_suffix_init(&suffix, p + 1, linep - p - 1);
suffixp = _vector_get(psl->suffix_exceptions, _vector_add(psl->suffix_exceptions, &suffix));
} else {
_suffix_init(&suffix, p, linep - p);
suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix));
2014-03-20 22:43:04 +01:00
}
2014-03-22 14:28:55 +01:00
if (suffixp)
suffixp->label = suffixp->label_buf; // set label to changed address
2014-03-20 22:43:04 +01:00
2014-03-22 14:28:55 +01:00
nsuffixes++;;
}
2014-03-20 22:43:04 +01:00
2014-03-22 14:28:55 +01:00
_vector_sort(psl->suffix_exceptions);
_vector_sort(psl->suffixes);
2014-03-20 22:43:04 +01:00
return psl;
}
2014-03-24 17:29:56 +01:00
// Returns the built-in PSL context.
// The result is the address of a static object; the other functions in
// this file recognize that address and use the compiled-in tables for it.
psl_ctx_t *psl_builtin(void)
{
return &_builtin_psl;
}
// Releases a context obtained from psl_load_file() / psl_load_fp() and
// sets '*psl' to NULL.
// The built-in context (psl_builtin()) is a static object: nothing is
// freed for it, only the caller's pointer is reset.
// 'psl' == NULL and '*psl' == NULL are accepted and ignored.
void psl_free(psl_ctx_t **psl)
{
	if (!psl || !*psl)
		return;

	if (*psl != &_builtin_psl) {
		_vector_free(&(*psl)->suffixes);
		_vector_free(&(*psl)->suffix_exceptions);
		free(*psl); // only heap-allocated contexts may be passed to free()
	}

	*psl = NULL;
}
/* does not include exceptions */
int psl_suffix_count(const psl_ctx_t *psl)
{
2014-03-24 17:29:56 +01:00
if (psl == &_builtin_psl)
return countof(suffixes);
else
return _vector_size(psl->suffixes);
}
2014-03-22 20:35:56 +01:00
/* just counts exceptions */
int psl_suffix_exception_count(const psl_ctx_t *psl)
{
2014-03-24 17:29:56 +01:00
if (psl == &_builtin_psl)
return countof(suffix_exceptions);
else
return _vector_size(psl->suffix_exceptions);
}
2014-03-24 17:29:56 +01:00
// returns compilation time
time_t psl_builtin_compile_time(void)
2014-03-20 22:43:04 +01:00
{
2014-03-24 17:29:56 +01:00
return _psl_compile_time;
}
// Returns the mtime of the PSL source file that the built-in data was
// made from.
// The value comes from _psl_file_time, which is not defined in this part
// of the file (presumably provided by the included table file).
time_t psl_builtin_file_time(void)
{
return _psl_file_time;
}
// Returns the SHA1 checksum of the PSL source file that the built-in
// data was made from.
// The string comes from _psl_sha1_checksum, which is not defined in this
// part of the file; it is presumably hex-encoded lowercase - confirm
// against the generator of the table file.
const char *psl_builtin_sha1sum(void)
{
	return _psl_sha1_checksum;
}