diff --git a/src/psl.c b/src/psl.c index a1a7d3f..6f5947b 100644 --- a/src/psl.c +++ b/src/psl.c @@ -106,7 +106,17 @@ struct _psl_ctx_st { }; /* include the PSL data compiled by 'psl2c' */ -#include "suffixes.c" +#ifndef _LIBPSL_INCLUDED_BY_PSL2C +# include "suffixes.c" +#else + /* if this source file is included by psl2c.c, provide placeholder builtin data; each array holds one zero-initialized entry because ISO C forbids zero-length arrays */ + static _psl_entry_t suffixes[1]; + static _psl_entry_t suffix_exceptions[1]; + static time_t _psl_file_time; + static time_t _psl_compile_time; + static const char _psl_sha1_checksum[] = ""; + static const char _psl_filename[] = ""; +#endif /* references to this PSL will result in lookups to built-in data */ static const psl_ctx_t diff --git a/src/psl2c.c b/src/psl2c.c index 8b1ec95..a270307 100644 --- a/src/psl2c.c +++ b/src/psl2c.c @@ -28,6 +28,8 @@ * */ + + #if HAVE_CONFIG_H # include #endif @@ -39,235 +41,14 @@ #include #include -/* -#ifdef WITH_LIBIDN2 -# include -#endif -*/ - -#ifdef WITH_LIBICU -# include -# include -# include -#endif - #ifdef WITH_BUILTIN #include -typedef struct { - char - label_buf[48]; - const char * - label; - unsigned short - length; - unsigned char - nlabels, /* number of labels */ - wildcard; /* this is a wildcard rule (e.g. 
*.sapporo.jp) */ -} _psl_entry_t; - -/* stripped down version libmget vector routines */ -typedef struct { - int - (*cmp)(const _psl_entry_t *, const _psl_entry_t *); /* comparison function */ - _psl_entry_t - **entry; /* pointer to array of pointers to elements */ - int - max, /* allocated elements */ - cur; /* number of elements in use */ -} _psl_vector_t; - -struct _psl_ctx_st { - _psl_vector_t - *suffixes, - *suffix_exceptions; -}; - -static _psl_vector_t *_vector_alloc(int max, int (*cmp)(const _psl_entry_t *, const _psl_entry_t *)) -{ - _psl_vector_t *v; - - if (!(v = calloc(1, sizeof(_psl_vector_t)))) - return NULL; - - if (!(v->entry = malloc(max * sizeof(_psl_entry_t *)))) { - free(v); - return NULL; - } - - v->max = max; - v->cmp = cmp; - return v; -} - -static void _vector_free(_psl_vector_t **v) -{ - if (v && *v) { - if ((*v)->entry) { - int it; - - for (it = 0; it < (*v)->cur; it++) - free((*v)->entry[it]); - - free((*v)->entry); - } - free(*v); - } -} - -static _psl_entry_t *_vector_get(const _psl_vector_t *v, int pos) -{ - if (pos < 0 || !v || pos >= v->cur) return NULL; - - return v->entry[pos]; -} - -static int _vector_add(_psl_vector_t *v, const _psl_entry_t *elem) -{ - if (v) { - void *elemp; - - elemp = malloc(sizeof(_psl_entry_t)); - memcpy(elemp, elem, sizeof(_psl_entry_t)); - - if (v->max == v->cur) - v->entry = realloc(v->entry, (v->max *= 2) * sizeof(_psl_entry_t *)); - - v->entry[v->cur++] = elemp; - return v->cur - 1; - } - - return -1; -} - -static int _compare(const void *p1, const void *p2, void *v) -{ - return ((_psl_vector_t *)v)->cmp(*((_psl_entry_t **)p1), *((_psl_entry_t **)p2)); -} - -static void _vector_sort(_psl_vector_t *v) -{ - if (v && v->cmp) - qsort_r(v->entry, v->cur, sizeof(_psl_vector_t *), _compare, v); -} - -/* by this kind of sorting, we can easily see if a domain matches or not (match = supercookie !) 
*/ - -static int _suffix_compare(const _psl_entry_t *s1, const _psl_entry_t *s2) -{ - int n; - - if ((n = s2->nlabels - s1->nlabels)) - return n; /* most labels first */ - - if ((n = s1->length - s2->length)) - return n; /* shorter rules first */ - - return strcmp(s1->label, s2->label); -} - -static void _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length) -{ - const char *src; - char *dst; - - suffix->label = suffix->label_buf; - - if (length >= sizeof(suffix->label_buf) - 1) { - suffix->nlabels = 0; - fprintf(stderr, "Suffix rule too long (%d, ignored): %s\n", (int) length, rule); - return; - } - - if (*rule == '*') { - if (*++rule != '.') { - suffix->nlabels = 0; - fprintf(stderr, "Unsupported kind of rule (ignored): %s\n", rule); - return; - } - rule++; - suffix->wildcard = 1; - suffix->length = (unsigned char)length - 2; - } else { - suffix->wildcard = 0; - suffix->length = (unsigned char)length; - } - - suffix->nlabels = 1; - - for (dst = suffix->label_buf, src = rule; *src;) { - if (*src == '.') - suffix->nlabels++; - *dst++ = tolower(*src++); - } - *dst = 0; -} - -psl_ctx_t *psl_load_file(const char *fname) -{ - FILE *fp; - psl_ctx_t *psl = NULL; - - if ((fp = fopen(fname, "r"))) { - psl = psl_load_fp(fp); - fclose(fp); - } - - return psl; -} - -psl_ctx_t *psl_load_fp(FILE *fp) -{ - psl_ctx_t *psl; - _psl_entry_t suffix, *suffixp; - int nsuffixes = 0; - char buf[256], *linep, *p; - - if (!fp) - return NULL; - - if (!(psl = calloc(1, sizeof(psl_ctx_t)))) - return NULL; - - /* - * as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions. - * as of 19.02.2014, the list at http://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions. 
- */ - psl->suffixes = _vector_alloc(8*1024, _suffix_compare); - psl->suffix_exceptions = _vector_alloc(64, _suffix_compare); - - while ((linep = fgets(buf, sizeof(buf), fp))) { - while (isspace(*linep)) linep++; /* ignore leading whitespace */ - if (!*linep) continue; /* skip empty lines */ - - if (*linep == '/' && linep[1] == '/') - continue; /* skip comments */ - - /* parse suffix rule */ - for (p = linep; *linep && !isspace(*linep);) linep++; - *linep = 0; - - if (*p == '!') { - /* add to exceptions */ - _suffix_init(&suffix, p + 1, linep - p - 1); - suffixp = _vector_get(psl->suffix_exceptions, _vector_add(psl->suffix_exceptions, &suffix)); - } else { - _suffix_init(&suffix, p, linep - p); - suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix)); - } - - if (suffixp) - suffixp->label = suffixp->label_buf; /* set label to changed address */ - - nsuffixes++;; - } - - _vector_sort(psl->suffix_exceptions); - _vector_sort(psl->suffixes); - - return psl; -} +/* here we include the library source code to have access to internal functions and data structures */ +#define _LIBPSL_INCLUDED_BY_PSL2C +# include "psl.c" +#undef _LIBPSL_INCLUDED_BY_PSL2C static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *varname) { @@ -283,7 +64,7 @@ static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char * fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libicu/%s) */\n", version); } while (0); #else - fprintf(fpout, "/* automatically generated by psl2c (punycode generated with idn2) */\n"); + fprintf(fpout, "/* automatically generated by psl2c (without punycode support) */\n"); #endif fprintf(fpout, "static _psl_entry_t %s[] = {\n", varname); @@ -298,109 +79,6 @@ static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char * fprintf(fpout, "};\n"); } -void psl_free(psl_ctx_t *psl) -{ - if (psl) { - _vector_free(&psl->suffixes); - _vector_free(&psl->suffix_exceptions); - 
free(psl); - } -} - -static int _str_needs_encoding(const char *s) -{ - while (*s > 0) s++; - - return !!*s; -} - -static void _add_punycode_if_needed(_psl_vector_t *v) -{ - int it, n; - - /* do not use 'it < v->cur' since v->cur is changed by _vector_add() ! */ - for (it = 0, n = v->cur; it < n; it++) { - _psl_entry_t *e = _vector_get(v, it); - - if (_str_needs_encoding(e->label_buf)) { - _psl_entry_t suffix, *suffixp; - char lookupname[64] = ""; - - /* the following lines will have GPL3+ license issues */ -/* char *asc = NULL; - int rc; - - if ((rc = idn2_lookup_u8((uint8_t *)e->label_buf, (uint8_t **)&asc, 0)) == IDN2_OK) { - // fprintf(stderr, "idn2 '%s' -> '%s'\n", e->label_buf, asc); - _suffix_init(&suffix, asc, strlen(asc)); - suffix.wildcard = e->wildcard; - suffixp = _vector_get(v, _vector_add(v, &suffix)); - suffixp->label = suffixp->e_label_buf; // set label to changed address - } else - fprintf(stderr, "toASCII(%s) failed (%d): %s\n", e->label_buf, rc, idn2_strerror(rc)); -*/ - -#ifdef WITH_LIBICU - UIDNA *idna; - UErrorCode status = 0; - - /* IDNA2003 punycode conversion */ - /* destLen = uidna_toASCII(e->label_buf, (int32_t) strlen(e->label_buf), lookupname, (int32_t) sizeof(lookupname), - UIDNA_DEFAULT, NULL, &status); - */ - - /* IDNA2008 UTS#46 punycode conversion */ - if ((idna = uidna_openUTS46(UIDNA_USE_STD3_RULES, &status))) { - UChar utf16_dst[64], utf16_src[64]; - int32_t utf16_src_length; - UIDNAInfo info = UIDNA_INFO_INITIALIZER; - - u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, e->label_buf, (int32_t) strlen(e->label_buf), &status); - if (U_SUCCESS(status)) { - int32_t dst_length = uidna_nameToASCII(idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status); - if (U_SUCCESS(status)) { - u_strToUTF8(lookupname, (int32_t) sizeof(lookupname), NULL, utf16_dst, dst_length, &status); - if (U_SUCCESS(status)) { - if (strcmp(e->label_buf, lookupname)) { - /* 
fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */ - _suffix_init(&suffix, lookupname, strlen(lookupname)); - suffix.wildcard = e->wildcard; - suffixp = _vector_get(v, _vector_add(v, &suffix)); - suffixp->label = suffixp->label_buf; /* set label to changed address */ - } /* else ignore */ - } else - fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); - } else - fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); - } else - fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); - - uidna_close(idna); - } else - fprintf(stderr, "Failed to get UTS46 IDNA handle\n"); - -#else - /* this is much slower than the libidn2 API but should have no license issues */ - FILE *pp; - char cmd[16 + sizeof(e->label_buf)]; - snprintf(cmd, sizeof(cmd), "idn2 '%s'", e->label_buf); - if ((pp = popen(cmd, "r"))) { - if (fscanf(pp, "%63s", lookupname) >= 1 && strcmp(e->label_buf, lookupname)) { - /* fprintf(stderr, "idn2 '%s' -> '%s'\n", e->label_buf, lookupname); */ - _suffix_init(&suffix, lookupname, strlen(lookupname)); - suffix.wildcard = e->wildcard; - suffixp = _vector_get(v, _vector_add(v, &suffix)); - suffixp->label = suffixp->label_buf; /* set label to changed address */ - } - pclose(pp); - } else - fprintf(stderr, "Failed to call popen(%s, \"r\")\n", cmd); -#endif - } - } - - _vector_sort(v); -} #endif /* WITH_BUILTIN */ int main(int argc, const char **argv) @@ -413,7 +91,7 @@ int main(int argc, const char **argv) if (argc != 3) { fprintf(stderr, "Usage: psl2c \n"); - fprintf(stderr, " is the 'effective_tld_names.dat' (aka Public Suffix List)\n"); + fprintf(stderr, " is the 'effective_tld_names.dat' (aka Public Suffix List), lowercase UTF-8 encoded\n"); fprintf(stderr, " is the the C filename to be generated from \n"); return 1; } @@ -428,9 +106,6 @@ int main(int argc, const char **argv) size_t cmdsize = 16 + strlen(argv[1]); char *cmd = alloca(cmdsize), checksum[64] = ""; - 
_add_punycode_if_needed(psl->suffixes); - _add_punycode_if_needed(psl->suffix_exceptions); - _print_psl_entries(fpout, psl->suffixes, "suffixes"); _print_psl_entries(fpout, psl->suffix_exceptions, "suffix_exceptions");