diff --git a/.travis.yml b/.travis.yml index 27bcd14..6976291 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,8 +3,15 @@ compiler: - gcc - clang # Change this to your needs -script: ./autogen.sh && ./configure --enable-gtk-doc && make -j4 && make check -j4 && make distcheck +script: + - ./autogen.sh + - ./configure && make -j4 && make check -j4 + - ./configure --without-libicu && make clean && make -j4 && make check -j4 + - ./configure --disable-builtin && make clean && make -j4 && make check -j4 + - ./configure --disable-builtin --without-libicu && make clean && make -j4 && make check -j4 + - ./configure --enable-gtk-doc && make -j4 && make check -j4 + - make distcheck before_install: - apt-cache search libicu | grep icu - sudo apt-get -qq update - - sudo apt-get -q install autoconf automake autopoint libtool gtk-doc-tools gettext idn2 libidn2-0 libidn2-0-dev libicu-dev + - sudo apt-get -q install autoconf automake autopoint libtool gtk-doc-tools gettext idn2 libidn2-0 libidn2-0-dev libicu48 libicu-dev diff --git a/AUTHORS b/AUTHORS index b1bf772..562d9b7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -8,4 +8,6 @@ Please drop me a note if you feel you should have been mentioned here. 
Tim Ruehsen (Implementation of libpsl) -Daniel Kahn Gillmor (Discussion, Ideas, Organization) +Daniel Kahn Gillmor (Discussion, Ideas, Organization, Code) +Daniel Stenberg (Discussion, Ideas) +Darshit Shah (Patching Wget to work with libpsl) diff --git a/NEWS b/NEWS index 8323009..df6d258 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,14 @@ Copyright (C) 2014 Tim Ruehsen +23.06.2014 Release V0.4.0 + * depend on libicu for punycode, utf-8 and lowercase conversions + * added function psl_str_to_utf8lower() + * fixed locale issues + * introducing psl_error_t for error codes + defines + * removed redundant code from psl2c.c + * updated docs + * psl utility reads from stdin if no argument specified + 10.06.2014 Release V0.3.1 * link psl utility dynamically * fix output of psl_filename() diff --git a/configure.ac b/configure.ac index 95f2df7..f89b1e1 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ -AC_INIT([libpsl], [0.3.1], [tim.ruehsen@gmx.de], [libpsl], [http://github.com/rockdaboot/libpsl]) +AC_INIT([libpsl], [0.4.0], [tim.ruehsen@gmx.de], [libpsl], [http://github.com/rockdaboot/libpsl]) AC_PREREQ([2.59]) AM_INIT_AUTOMAKE([1.10 -Wall no-define]) @@ -62,10 +62,35 @@ AS_IF([ test "$enable_man" != no ], [ # 3. If the library source code has changed at all since the last update, then increment revision (‘c:r:a’ becomes ‘c:r+1:a’). # 4. If any interfaces have been added, removed, or changed since the last update, increment current, and set revision to 0. # 5. If any interfaces have been added since the last public release, then increment age. -# 6. If any interfaces have been removed or changed since the last public release, then set age to 0. -AC_SUBST([LIBPSL_SO_VERSION], [1:1:1]) +# 6. If any existing interfaces have been removed or changed since the last public release, then set age to 0. 
+AC_SUBST([LIBPSL_SO_VERSION], [2:0:2]) AC_SUBST([LIBPSL_VERSION], $VERSION) +# Check for libicu +HAVE_LIBICU=no +AC_ARG_WITH(libicu, + AC_HELP_STRING([--without-libicu], [build libpsl without IDNA/Punycode support]), + [], + [ + # using pkg-config won't work on older systems like Ubuntu 12.04 LTS Server Edition 64bit + OLDLIBS=$LIBS + LIBS="-licuuc $LIBS" + AC_MSG_CHECKING([for ICU unicode library]) + AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [[#include <unicode/ustring.h>]], + [[u_strToUTF8(NULL, 0, NULL, NULL, 0, NULL);]])], + [HAVE_LIBICU=yes; AC_MSG_RESULT([yes]) AC_DEFINE([WITH_LIBICU], [1], [generate PSL data with IDNA2008 UTS#46 punycode])], + [LIBS=$OLDLIBS; AC_MSG_ERROR([no working ICU unicode library was found])]) + +# AC_SEARCH_LIBS(uidna_close, icuuc, +# [HAVE_LIBICU=yes; AC_DEFINE([WITH_LIBICU], [1], [generate PSL data with IDNA2008 UTS#46 punycode])], +# [AC_MSG_ERROR(*** libicu was not found. Aborting.)], +# -licudata ) +# PKG_CHECK_MODULES(LIBICU, [icu-uc], +# [HAVE_LIBICU=yes; AC_DEFINE([WITH_LIBICU], [1], [generate PSL data with IDNA2008 UTS#46 punycode])]) + ]) + # Check for enable/disable builtin PSL data AC_ARG_ENABLE(builtin, AS_HELP_STRING([--disable-builtin], [do not compile PSL data into library]), @@ -74,10 +99,11 @@ AC_ARG_ENABLE(builtin, ], [ enable_builtin=yes AC_DEFINE([WITH_BUILTIN], [1], [compile PSL data into library]) - - PKG_CHECK_MODULES(LIBICU, [icu-uc], - [AC_DEFINE([WITH_LIBICU], [1], [generate PSL data with IDNA2008 UTS#46 punycode])], - [AC_CHECK_PROG(HAVE_IDN2, idn2, yes, AC_MSG_ERROR(Cannot find required tool 'idn2'.))]) + AS_IF([test $HAVE_LIBICU != yes], + [ + # Check for idn2 fallback to generate punycode + AC_CHECK_PROG(HAVE_IDN2, idn2, yes, AC_MSG_ERROR(Cannot find required tool 'idn2' as fallback.)) + ]) ]) AM_CONDITIONAL([WITH_BUILTIN], [test $enable_builtin = yes]) @@ -102,16 +128,14 @@ fi # Check for custom PSL file AC_ARG_WITH(psl-file, - AC_HELP_STRING([--with-psl-file=[PATH]], - [path to PSL file]), + 
AC_HELP_STRING([--with-psl-file=[PATH]], [path to PSL file]), PSL_FILE=$withval, PSL_FILE="\$(top_srcdir)/data/effective_tld_names.dat") AC_SUBST(PSL_FILE) # Check for custom PSL test file AC_ARG_WITH(psl-testfile, - AC_HELP_STRING([--with-psl-testfile=[PATH]], - [path to PSL test file]), + AC_HELP_STRING([--with-psl-testfile=[PATH]], [path to PSL test file]), PSL_TESTFILE=$withval, PSL_TESTFILE="\$(top_srcdir)/data/test_psl.txt") AC_SUBST(PSL_TESTFILE) @@ -138,6 +162,7 @@ AC_MSG_NOTICE([Summary of build options: Compiler: ${CC} CFlags: ${CFLAGS} ${CPPFLAGS} LDFlags: ${LDFLAGS} + ICU: ${HAVE_LIBICU} Builtin PSL: ${enable_builtin} PSL File: ${PSL_FILE} PSL Test File: ${PSL_TESTFILE} diff --git a/data/effective_tld_names.dat b/data/effective_tld_names.dat index 989ec21..fd84dc6 100644 --- a/data/effective_tld_names.dat +++ b/data/effective_tld_names.dat @@ -180,6 +180,7 @@ ar com.ar edu.ar gob.ar +gov.ar int.ar mil.ar net.ar @@ -222,7 +223,6 @@ edu.au gov.au asn.au id.au -csiro.au // Historic 2LDs (closed to new registration, but sites still exist) info.au conf.au @@ -245,7 +245,7 @@ sa.edu.au tas.edu.au vic.edu.au wa.edu.au -act.gov.au +// act.gov.au Bug 984824 - Removed at request of Greg Tankard // nsw.gov.au Bug 547985 - Removed at request of // nt.gov.au Bug 940478 - Removed at request of Greg Connors qld.gov.au @@ -292,6 +292,7 @@ rs.ba // bb : http://en.wikipedia.org/wiki/.bb bb biz.bb +co.bb com.bb edu.bb gov.bb @@ -299,6 +300,7 @@ info.bb net.bb org.bb store.bb +tv.bb // bd : http://en.wikipedia.org/wiki/.bd *.bd @@ -596,9 +598,12 @@ gob.cl co.cl mil.cl -// cm : http://en.wikipedia.org/wiki/.cm +// cm : http://en.wikipedia.org/wiki/.cm plus bug 981927 cm +co.cm +com.cm gov.cm +net.cm // cn : http://en.wikipedia.org/wiki/.cn // Submitted by registry 2008-06-11 @@ -5146,7 +5151,24 @@ com.nr nu // nz : http://en.wikipedia.org/wiki/.nz -*.nz +// Confirmed by registry 2014-05-19 +nz +ac.nz +co.nz +cri.nz +geek.nz +gen.nz +govt.nz +health.nz +iwi.nz +kiwi.nz 
+maori.nz +mil.nz +māori.nz +net.nz +org.nz +parliament.nz +school.nz // om : http://en.wikipedia.org/wiki/.om om @@ -5613,7 +5635,6 @@ oryol.ru palana.ru penza.ru perm.ru -pskov.ru ptz.ru rnd.ru ryazan.ru @@ -6150,19 +6171,19 @@ com.ug org.ug // uk : http://en.wikipedia.org/wiki/.uk -// Submitted by registry 2012-10-02 -// and tweaked by us pending further consultation. -*.uk +// Submitted by registry +uk +ac.uk +co.uk +gov.uk +ltd.uk +me.uk +net.uk +nhs.uk +org.uk +plc.uk +police.uk *.sch.uk -!bl.uk -!british-library.uk -!jet.uk -!mod.uk -!national-library-scotland.uk -!nel.uk -!nic.uk -!nls.uk -!parliament.uk // us : http://en.wikipedia.org/wiki/.us us @@ -6440,16 +6461,24 @@ edu.vc // ve : https://registro.nic.ve/ // Confirmed by registry 2012-10-04 +// Updated 2014-05-20 - Bug 940478 ve +arts.ve co.ve com.ve e12.ve edu.ve +firm.ve +gob.ve gov.ve info.ve +int.ve mil.ve net.ve org.ve +rec.ve +store.ve +tec.ve web.ve // vg : http://en.wikipedia.org/wiki/.vg @@ -6482,8 +6511,12 @@ pro.vn health.vn // vu : http://en.wikipedia.org/wiki/.vu -// list of 2nd level tlds ? +// http://www.vunic.vu/ vu +com.vu +edu.vu +net.vu +org.vu // wf : http://www.afnic.fr/medias/documents/AFNIC-naming-policy2012.pdf wf @@ -6609,7 +6642,14 @@ yt فلسطين // xn--90a3ac ("srb" Cyrillic) : RS +// http://www.rnids.rs/en/the-.срб-domain срб +пр.срб +орг.срб +обр.срб +од.срб +упр.срб +ак.срб // xn--p1ai ("rf" Russian-Cyrillic) : RU // http://www.cctld.ru/en/docs/rulesrf.php @@ -7651,9 +7691,302 @@ hiv // sca : 2014-03-13 SVENSKA CELLULOSA AKTIEBOLAGET SCA (publ) sca -// reise : 2014-03-13 dotreise GmbH +// reise : 2014-03-13 dotreise GmbH reise +// accountants : 2014-03-20 Knob Town, LLC +accountants + +// clinic : 2014-03-20 Goose Park, LLC +clinic + +// versicherung : 2014-03-20 dotversicherung-registry GmbH +versicherung + +// top : 2014-03-20 Jiangsu Bangning Science & Technology Co.,Ltd. 
+top + +// furniture : 2014-03-20 Lone Fields, LLC +furniture + +// dental : 2014-03-20 Tin Birch, LLC +dental + +// fund : 2014-03-20 John Castle, LLC +fund + +// creditcard : 2014-03-20 Binky Frostbite, LLC +creditcard + +// insure : 2014-03-20 Pioneer Willow, LLC +insure + +// audio : 2014-03-20 Uniregistry, Corp. +audio + +// claims : 2014-03-20 Black Corner, LLC +claims + +// loans : 2014-03-20 June Woods, LLC +loans + +// auction : 2014-03-20 Sand Galley, LLC +auction + +// attorney : 2014-03-20 Victor North, LLC +attorney + +// finance : 2014-03-20 Cotton Cypress, LLC +finance + +// investments : 2014-03-20 Holly Glen, LLC +investments + +// juegos : 2014-03-20 Uniregistry, Corp. +juegos + +// dentist : 2014-03-20 Outer Lake, LLC +dentist + +// lds : 2014-03-20 IRI Domain Management, LLC +lds + +// lawyer : 2014-03-20 Atomic Station, LLC +lawyer + +// surgery : 2014-03-20 Tin Avenue, LLC +surgery + +// gratis : 2014-03-20 Pioneer Tigers, LLC +gratis + +// software : 2014-03-20 Over Birch, LLC +software + +// mortgage : 2014-03-20 Outer Gardens, LLC +mortgage + +// republican : 2014-03-20 United TLD Holdco Ltd. +republican + +// credit : 2014-03-20 Snow Shadow, LLC +credit + +// tax : 2014-03-20 Storm Orchard, LLC +tax + +// africa : 2014-03-24 ZA Central Registry NPC trading as Registry.Africa +africa + +// joburg : 2014-03-24 ZA Central Registry NPC trading as ZA Central Registry +joburg + +// durban : 2014-03-24 ZA Central Registry NPC trading as ZA Central Registry +durban + +// capetown : 2014-03-24 ZA Central Registry NPC trading as ZA Central Registry +capetown + +// sap : 2014-03-27 SAP AG +sap + +// datsun : 2014-03-27 NISSAN MOTOR CO., LTD. +datsun + +// infiniti : 2014-03-27 NISSAN MOTOR CO., LTD. +infiniti + +// firmdale : 2014-03-27 Firmdale Holdings Limited +firmdale + +// organic : 2014-03-27 Afilias Limited +organic + +// nissan : 2014-03-27 NISSAN MOTOR CO., LTD. +nissan + +// website : 2014-04-03 DotWebsite Inc. 
+website + +// space : 2014-04-03 DotSpace Inc. +space + +// schmidt : 2014-04-03 SALM S.A.S. +schmidt + +// cuisinella : 2014-04-03 SALM S.A.S. +cuisinella + +// samsung : 2014-04-03 SAMSUNG SDS CO., LTD +samsung + +// crs : 2014-04-03 Federated Co operatives Limited +crs + +// doosan : 2014-04-03 Doosan Corporation +doosan + +// press : 2014-04-03 DotPress Inc. +press + +// emerck : 2014-04-03 Merck KGaA +emerck + +// erni : 2014-04-03 ERNI Group Holding AG +erni + +// direct : 2014-04-10 Half Trail, LLC +direct + +// yandex : 2014-04-10 YANDEX, LLC +yandex + +// lotto : 2014-04-10 Afilias Limited +lotto + +// toshiba : 2014-04-10 TOSHIBA Corporation +toshiba + +// bauhaus : 2014-04-17 Werkhaus GmbH +bauhaus + +// host : 2014-04-17 DotHost Inc. +host + +// ltda : 2014-04-17 DOMAIN ROBOT SERVICOS DE HOSPEDAGEM NA INTERNET LTDA +ltda + +// global : 2014-04-17 Dot GLOBAL AS +global + +// abogado : 2014-04-24 Top Level Domain Holdings Limited +abogado + +// place : 2014-04-24 Snow Galley, LLC +place + +// tirol : 2014-04-24 punkt Tirol GmbH +tirol + +// gmx : 2014-04-24 1&1 Mail & Media GmbH +gmx + +// tatar : 2014-04-24 Limited Liability Company "Coordination Center of Regional Domain of Tatarstan Republic" +tatar + +// scholarships : 2014-04-24 Scholarships.com, LLC +scholarships + +// eurovision : 2014-04-24 European Broadcasting Union (EBU) +eurovision + +// wedding : 2014-04-24 Top Level Domain Holdings Limited +wedding + +// active : 2014-05-01 The Active Network, Inc +active + +// madrid : 2014-05-01 Comunidad de Madrid +madrid + +// youtube : 2014-05-01 Charleston Road Registry Inc. +youtube + +// sharp : 2014-05-01 Sharp Corporation +sharp + +// uol : 2014-05-01 UBN INTERNET LTDA. +uol + +// physio : 2014-05-01 PhysBiz Pty Ltd +physio + +// gmail : 2014-05-01 Charleston Road Registry Inc. +gmail + +// channel : 2014-05-08 Charleston Road Registry Inc. +channel + +// fly : 2014-05-08 Charleston Road Registry Inc. 
+fly + +// zip : 2014-05-08 Charleston Road Registry Inc. +zip + +// esq : 2014-05-08 Charleston Road Registry Inc. +esq + +// rsvp : 2014-05-08 Charleston Road Registry Inc. +rsvp + +// wales : 2014-05-08 Nominet UK +wales + +// cymru : 2014-05-08 Nominet UK +cymru + +// green : 2014-05-08 Afilias Limited +green + +// lgbt : 2014-05-08 Afilias Limited +lgbt + +// xn--hxt814e : 2014-05-15 Zodiac Libra Limited +网店 + +// cancerresearch : 2014-05-15 Australian Cancer Research Foundation +cancerresearch + +// everbank : 2014-05-15 EverBank +everbank + +// frl : 2014-05-15 FRLregistry B.V. +frl + +// property : 2014-05-22 Uniregistry, Corp. +property + +// forsale : 2014-05-22 Sea Oaks, LLC +forsale + +// seat : 2014-05-22 SEAT, S.A. (Sociedad Unipersonal) +seat + +// deals : 2014-05-22 Sand Sunset, LLC +deals + +// nra : 2014-05-22 NRA Holdings Company, INC. +nra + +// xn--fjq720a : 2014-05-22 Will Bloom, LLC +娱乐 + +// realtor : 2014-05-29 Real Estate Domains LLC +realtor + +// bnpparibas : 2014-05-29 BNP Paribas +bnpparibas + +// melbourne : 2014-05-29 The Crown in right of the State of Victoria, represented by its Department of State Development, Business and Innovation +melbourne + +// hosting : 2014-05-29 Uniregistry, Corp. +hosting + +// yoga : 2014-05-29 Top Level Domain Holdings Limited +yoga + +// city : 2014-05-29 Snow Sky, LLC +city + +// bond : 2014-06-05 Bond University Limited +bond + +// click : 2014-06-05 Uniregistry, Corp. 
+click + +// cern : 2014-06-05 European Organization for Nuclear Research ("CERN") +cern // ===END ICANN DOMAINS=== // ===BEGIN PRIVATE DOMAINS=== @@ -7663,20 +7996,22 @@ reise cloudfront.net // Amazon Elastic Compute Cloud: https://aws.amazon.com/ec2/ -// Submitted by Osman Surkatty 2013-04-02 -compute.amazonaws.com -us-east-1.amazonaws.com -compute-1.amazonaws.com -z-1.compute-1.amazonaws.com -z-2.compute-1.amazonaws.com +// Submitted by Osman Surkatty 2014-05-20 ap-northeast-1.compute.amazonaws.com ap-southeast-1.compute.amazonaws.com ap-southeast-2.compute.amazonaws.com +cn-north-1.compute.amazonaws.cn +compute.amazonaws.cn +compute.amazonaws.com +compute-1.amazonaws.com eu-west-1.compute.amazonaws.com sa-east-1.compute.amazonaws.com +us-east-1.amazonaws.com us-gov-west-1.compute.amazonaws.com us-west-1.compute.amazonaws.com us-west-2.compute.amazonaws.com +z-1.compute-1.amazonaws.com +z-2.compute-1.amazonaws.com // Amazon Elastic Beanstalk : https://aws.amazon.com/elasticbeanstalk/ // Submitted by Adam Stein 2013-04-02 @@ -7719,6 +8054,7 @@ ar.com br.com cn.com com.de +com.se de.com eu.com gb.com @@ -8074,6 +8410,10 @@ global.ssl.fastly.net a.prod.fastly.net global.prod.fastly.net +// Firebase, Inc. +// Submitted by Chris Raynor 2014-01-21 +firebaseapp.com + // GitHub, Inc. // Submitted by Ben Toews 2014-02-06 github.io @@ -8153,10 +8493,18 @@ azurewebsites.net azure-mobile.net cloudapp.net +// NFSN, Inc. : https://www.NearlyFreeSpeech.NET/ +// Submitted by Jeff Wheelhouse 2014-02-02 +nfshost.com + // NYC.mn : http://www.information.nyc.mn // Submitted by Matthew Brown 2013-03-11 nyc.mn +// One Fold Media : http://www.onefoldmedia.com/ +// Submitted by Eddie Jones 2014-06-10 +nid.io + // Opera Software, A.S.A. 
// Submitted by Yngve Pettersen 2009-11-26 operaunite.com diff --git a/docs/libpsl/libpsl-sections.txt b/docs/libpsl/libpsl-sections.txt index b2b1617..4c73e47 100644 --- a/docs/libpsl/libpsl-sections.txt +++ b/docs/libpsl/libpsl-sections.txt @@ -1,6 +1,7 @@
libpsl Public Suffix List functions +psl_error_t psl_ctx_t psl_load_file psl_load_fp @@ -17,4 +18,5 @@ psl_builtin_sha1sum psl_builtin_filename psl_is_cookie_domain_acceptable psl_get_version +psl_str_to_utf8lower
diff --git a/include/libpsl.h b/include/libpsl.h index 265bdf6..b7fe952 100644 --- a/include/libpsl.h +++ b/include/libpsl.h @@ -38,6 +38,27 @@ extern "C" { #endif +/** + * psl_error_t: + * @PSL_SUCCESS: Successful return. + * @PSL_ERR_INVALID_ARG: Invalid argument. + * @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter. + * @PSL_ERR_TO_UTF16: Failed to convert to utf-16. + * @PSL_ERR_TO_LOWER: Failed to convert utf-16 to lowercase. + * @PSL_ERR_TO_UTF8: Failed to convert utf-16 to utf-8. + * + * Return codes for PSL functions. + * Negative return codes mean failure. + * Positive values are reserved for non-error return codes. + */ +typedef enum { + PSL_SUCCESS = 0, + PSL_ERR_INVALID_ARG = -1, + PSL_ERR_CONVERTER = -2, /* failed to open libicu utf-16 converter */ + PSL_ERR_TO_UTF16 = -3, /* failed to convert to utf-16 */ + PSL_ERR_TO_LOWER = -4, /* failed to convert utf-16 to lowercase */ + PSL_ERR_TO_UTF8 = -5 /* failed to convert utf-16 to utf-8 */ +} psl_error_t; typedef struct _psl_ctx_st psl_ctx_t; @@ -65,6 +86,9 @@ const char * /* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */ const char * psl_registrable_domain(const psl_ctx_t *psl, const char *domain); +/* convert a string into lowercase UTF-8; returns a psl_error_t code (same type as the definition in psl.c), *lower must be freed by the caller */ +psl_error_t + psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower); /* does not include exceptions */ int psl_suffix_count(const psl_ctx_t *psl); diff --git a/src/psl.c b/src/psl.c index 881052e..2875c35 100644 --- a/src/psl.c +++ b/src/psl.c @@ -49,9 +49,20 @@ #include #include #include +#include + +#ifdef WITH_LIBICU +# include +# include +# include +# include +#endif #include +/* number of elements within an array */ +#define countof(a) (sizeof(a)/sizeof(*(a))) + /** * SECTION:libpsl * @short_description: Public Suffix List library functions @@ -95,7 +106,17 @@ struct _psl_ctx_st { }; /* include the PSL data compiled by 'psl2c' */ -#include "suffixes.c" +#ifndef 
_LIBPSL_INCLUDED_BY_PSL2C +# include "suffixes.c" +#else + /* if this source file is included by psl2c.c, provide empty builtin data */ + static _psl_entry_t suffixes[1]; + static _psl_entry_t suffix_exceptions[1]; + static time_t _psl_file_time; + static time_t _psl_compile_time; + static const char _psl_sha1_checksum[] = ""; + static const char _psl_filename[] = ""; +#endif /* references to this PSL will result in lookups to built-in data */ static const psl_ctx_t @@ -239,39 +260,19 @@ static int _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length) for (dst = suffix->label_buf, src = rule; *src;) { if (*src == '.') suffix->nlabels++; - *dst++ = tolower(*src++); + *dst++ = *src++; } *dst = 0; return 0; } -/** - * psl_is_public_suffix: - * @psl: PSL context - * @domain: Domain string - * - * This function checks if @domain is a public suffix by the means of the - * [Mozilla Public Suffix List](http://publicsuffix.org). - * - * For cookie domain checking see psl_is_cookie_domain_acceptable(). - * - * @psl is a context returned by either psl_load_file(), psl_load_fp() or - * psl_builtin(). - * - * Returns: 1 if domain is a public suffix, 0 if not. - * - * Since: 0.1 - */ -int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) +static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) { _psl_entry_t suffix, *rule; const char *p, *label_bak; unsigned short length_bak; - if (!psl || !domain) - return 1; - /* this function should be called without leading dots, just make sure */ suffix.label = domain + (*domain == '.'); suffix.length = strlen(suffix.label); @@ -340,6 +341,34 @@ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) return 0; } +/** + * psl_is_public_suffix: + * @psl: PSL context + * @domain: Domain string + * + * This function checks if @domain is a public suffix by the means of the + * [Mozilla Public Suffix List](http://publicsuffix.org). 
+ * + * For cookie domain checking see psl_is_cookie_domain_acceptable(). + * + * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode). + * Other encodings result in unexpected behavior. + * + * @psl is a context returned by either psl_load_file(), psl_load_fp() or + * psl_builtin(). + * + * Returns: 1 if domain is a public suffix, 0 if not. + * + * Since: 0.1 + */ +int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) +{ + if (!psl || !domain) + return 1; + + return _psl_is_public_suffix(psl, domain); +} + /** * psl_unregistrable_domain: * @psl: PSL context @@ -348,6 +377,9 @@ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain) * This function finds the longest publix suffix part of @domain by the means * of the [Mozilla Public Suffix List](http://publicsuffix.org). * + * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode). + * Other encodings result in unexpected behavior. + * * @psl is a context returned by either psl_load_file(), psl_load_fp() or * psl_builtin(). * @@ -366,7 +398,7 @@ const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain) * 'forgot.his.name' and 'name' are in the PSL while 'his.name' is not. */ - while (!psl_is_public_suffix(psl, domain)) { + while (!_psl_is_public_suffix(psl, domain)) { if ((domain = strchr(domain, '.'))) domain++; else @@ -384,6 +416,9 @@ const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain) * This function finds the shortest private suffix part of @domain by the means * of the [Mozilla Public Suffix List](http://publicsuffix.org). * + * International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode). + * Other encodings result in unexpected behavior. + * * @psl is a context returned by either psl_load_file(), psl_load_fp() or * psl_builtin(). 
* @@ -404,7 +439,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain) * 'forgot.his.name' and 'name' are in the PSL while 'his.name' is not. */ - while (!psl_is_public_suffix(psl, domain)) { + while (!_psl_is_public_suffix(psl, domain)) { if ((p = strchr(domain, '.'))) { regdom = domain; domain = p + 1; @@ -415,6 +450,51 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain) return regdom; } +static int _str_is_ascii(const char *s) +{ + while (*s && !(*s & 0x80)) s++; /* test the high bit explicitly: plain char may be unsigned, so '*s > 0' is not portable */ + + return !*s; /* 1 if the terminating NUL was reached, i.e. all bytes are 7-bit ASCII */ +} + +#ifdef WITH_LIBICU +static void _add_punycode_if_needed(UIDNA *idna, _psl_vector_t *v, _psl_entry_t *e) +{ + if (_str_is_ascii(e->label_buf)) + return; + + /* IDNA2008 UTS#46 punycode conversion */ + if (idna) { + _psl_entry_t suffix, *suffixp; + char lookupname[128] = ""; + UErrorCode status = 0; + UIDNAInfo info = UIDNA_INFO_INITIALIZER; + UChar utf16_dst[128], utf16_src[128]; + int32_t utf16_src_length; + + u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, e->label_buf, -1, &status); + if (U_SUCCESS(status)) { + int32_t dst_length = uidna_nameToASCII(idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status); + if (U_SUCCESS(status)) { + u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status); + if (U_SUCCESS(status)) { + if (strcmp(e->label_buf, lookupname)) { + /* fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */ + _suffix_init(&suffix, lookupname, strlen(lookupname)); + suffix.wildcard = e->wildcard; + suffixp = _vector_get(v, _vector_add(v, &suffix)); + suffixp->label = suffixp->label_buf; /* set label to changed address */ + } /* else ignore */ + } /* else + fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */ + } /* else + fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); */ + } /* else + fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); 
*/ + } +} +#endif + /** * psl_load_file: * @fname: Name of PSL file @@ -422,13 +502,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain) * This function loads the public suffixes file named @fname. * To free the allocated resources, call psl_free(). * - * If you want to use punycode representations for functions like psl_is_public_suffix(), - * these have to exist as entries within @fname. This is a design decision to not pull in - * dependencies for UTF-8 case-handling and IDNA libraries. - * - * On the contrary, the builtin data already contains punycode entries. - * - * Have a look into psl2c.c for example code on how to convert UTF-8 to lowercase and to punycode. + * The suffixes are expected to be lowercase UTF-8 encoded if they are international. * * Returns: Pointer to a PSL context or %NULL on failure. * @@ -457,7 +531,7 @@ psl_ctx_t *psl_load_file(const char *fname) * This function loads the public suffixes from a FILE pointer. * To free the allocated resources, call psl_free(). * - * Have a look at psl_load_fp() for punycode considerations. + * The suffixes are expected to be lowercase UTF-8 encoded if they are international. * * Returns: Pointer to a PSL context or %NULL on failure. * @@ -467,8 +541,11 @@ psl_ctx_t *psl_load_fp(FILE *fp) { psl_ctx_t *psl; _psl_entry_t suffix, *suffixp; - int nsuffixes = 0; char buf[256], *linep, *p; +#ifdef WITH_LIBICU + UIDNA *idna; + UErrorCode status = 0; +#endif if (!fp) return NULL; @@ -476,6 +553,10 @@ psl_ctx_t *psl_load_fp(FILE *fp) if (!(psl = calloc(1, sizeof(psl_ctx_t)))) return NULL; +#ifdef WITH_LIBICU + idna = uidna_openUTS46(UIDNA_USE_STD3_RULES, &status); +#endif + /* * as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions. * as of 19.02.2014, the list at http://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions. 
@@ -496,26 +577,33 @@ psl_ctx_t *psl_load_fp(FILE *fp) if (*p == '!') { /* add to exceptions */ - if (_suffix_init(&suffix, p + 1, linep - p - 1) == 0) + if (_suffix_init(&suffix, p + 1, linep - p - 1) == 0) { suffixp = _vector_get(psl->suffix_exceptions, _vector_add(psl->suffix_exceptions, &suffix)); - else - suffixp = NULL; + suffixp->label = suffixp->label_buf; /* set label to changed address */ +#ifdef WITH_LIBICU + _add_punycode_if_needed(idna, psl->suffix_exceptions, suffixp); +#endif + } } else { - if (_suffix_init(&suffix, p, linep - p) == 0) + /* add to suffixes */ + if (_suffix_init(&suffix, p, linep - p) == 0) { suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix)); - else - suffixp = NULL; + suffixp->label = suffixp->label_buf; /* set label to changed address */ +#ifdef WITH_LIBICU + _add_punycode_if_needed(idna, psl->suffixes, suffixp); +#endif + } } - - if (suffixp) - suffixp->label = suffixp->label_buf; /* set label to changed address */ - - nsuffixes++;; } _vector_sort(psl->suffix_exceptions); _vector_sort(psl->suffixes); +#ifdef WITH_LIBICU + if (idna) + uidna_close(idna); +#endif + return psl; } @@ -685,7 +773,13 @@ const char *psl_builtin_filename(void) **/ const char *psl_get_version (void) { - return PACKAGE_VERSION; + return PACKAGE_VERSION +#ifdef WITH_LIBICU + " (+libicu/" U_ICU_VERSION ")" +#else + " (limited IDNA support)" +#endif + ; } /** @@ -697,6 +791,9 @@ const char *psl_get_version (void) * This helper function checks whether @cookie_domain is an acceptable cookie domain value for the request * @hostname. * + * For international domain names both, @hostname and @cookie_domain, have to be either in lowercase UTF-8 + * or in ASCII form (punycode). Other encodings or mixing UTF-8 and punycode result in unexpected behavior. + * * Examples: * 1. Cookie domain 'example.com' would be acceptable for hostname 'www.example.com', * but '.com' or 'com' would NOT be acceptable since 'com' is a public suffix. 
@@ -741,3 +838,100 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, return 0; } + +/** + * psl_str_to_utf8lower: + * @str: string to convert + * @encoding: charset encoding of @str, e.g. 'iso-8859-1' or %NULL + * @locale: locale of @str for to lowercase conversion, e.g. 'de' or %NULL + * @lower: return value containing the converted string + * + * This helper function converts a string to lowercase UTF-8 representation. + * Lowercase UTF-8 is needed as input to the domain checking functions. + * + * @lower is %NULL on error. + * The return value 'lower' must be freed after usage. + * + * Returns: psl_error_t value. + * PSL_SUCCESS: Success + * PSL_ERR_INVALID_ARG: @str is a %NULL value. + * PSL_ERR_CONVERTER: Failed to open the unicode converter with name @encoding + * PSL_ERR_TO_UTF16: Failed to convert @str to unicode + * PSL_ERR_TO_LOWER: Failed to convert unicode to lowercase + * PSL_ERR_TO_UTF8: Failed to convert unicode to UTF-8 + * + * Since: 0.4 + */ +psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower) +{ + int ret = PSL_ERR_INVALID_ARG; + + if (lower) + *lower = NULL; + + if (!str) + return PSL_ERR_INVALID_ARG; + + /* shortcut to avoid costly conversion */ + if (_str_is_ascii(str)) { + if (lower) { + char *p; + + *lower = strdup(str); + + /* convert ASCII string to lowercase */ + for (p = *lower; *p; p++) + if (isupper(*p)) + *p = tolower(*p); + } + return PSL_SUCCESS; + } + +#ifdef WITH_LIBICU + do { + size_t str_length = strlen(str); + UErrorCode status = 0; + UChar *utf16_dst, *utf16_lower; + int32_t utf16_dst_length; + char *utf8_lower; + UConverter *uconv; + + /* C89 allocation */ + utf16_dst = alloca(sizeof(UChar) * (str_length * 2 + 1)); + utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1)); + utf8_lower = alloca(str_length * 2 + 1); + + uconv = ucnv_open(encoding, &status); + if (U_SUCCESS(status)) { + utf16_dst_length = ucnv_toUChars(uconv, utf16_dst, 
str_length * 2 + 1, str, str_length, &status); + ucnv_close(uconv); + + if (U_SUCCESS(status)) { + int32_t utf16_lower_length = u_strToLower(utf16_lower, str_length * 2 + 1, utf16_dst, utf16_dst_length, locale, &status); + if (U_SUCCESS(status)) { + u_strToUTF8(utf8_lower, str_length * 8 + 1, NULL, utf16_lower, utf16_lower_length, &status); + if (U_SUCCESS(status)) { + if (lower) + *lower = strdup(utf8_lower); + ret = PSL_SUCCESS; + } else { + ret = PSL_ERR_TO_UTF8; + /* fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */ + } + } else { + ret = PSL_ERR_TO_LOWER; + /* fprintf(stderr, "Failed to convert UTF-16 to lowercase (status %d)\n", status); */ + } + } else { + ret = PSL_ERR_TO_UTF16; + /* fprintf(stderr, "Failed to convert string to UTF-16 (status %d)\n", status); */ + } + } else { + ret = PSL_ERR_CONVERTER; + /* fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status); */ + } + } while (0); +#endif + + return ret; +} diff --git a/src/psl2c.c b/src/psl2c.c index 8b1ec95..daeec83 100644 --- a/src/psl2c.c +++ b/src/psl2c.c @@ -39,235 +39,14 @@ #include #include -/* -#ifdef WITH_LIBIDN2 -# include -#endif -*/ - -#ifdef WITH_LIBICU -# include -# include -# include -#endif - #ifdef WITH_BUILTIN #include -typedef struct { - char - label_buf[48]; - const char * - label; - unsigned short - length; - unsigned char - nlabels, /* number of labels */ - wildcard; /* this is a wildcard rule (e.g. 
*.sapporo.jp) */ -} _psl_entry_t; - -/* stripped down version libmget vector routines */ -typedef struct { - int - (*cmp)(const _psl_entry_t *, const _psl_entry_t *); /* comparison function */ - _psl_entry_t - **entry; /* pointer to array of pointers to elements */ - int - max, /* allocated elements */ - cur; /* number of elements in use */ -} _psl_vector_t; - -struct _psl_ctx_st { - _psl_vector_t - *suffixes, - *suffix_exceptions; -}; - -static _psl_vector_t *_vector_alloc(int max, int (*cmp)(const _psl_entry_t *, const _psl_entry_t *)) -{ - _psl_vector_t *v; - - if (!(v = calloc(1, sizeof(_psl_vector_t)))) - return NULL; - - if (!(v->entry = malloc(max * sizeof(_psl_entry_t *)))) { - free(v); - return NULL; - } - - v->max = max; - v->cmp = cmp; - return v; -} - -static void _vector_free(_psl_vector_t **v) -{ - if (v && *v) { - if ((*v)->entry) { - int it; - - for (it = 0; it < (*v)->cur; it++) - free((*v)->entry[it]); - - free((*v)->entry); - } - free(*v); - } -} - -static _psl_entry_t *_vector_get(const _psl_vector_t *v, int pos) -{ - if (pos < 0 || !v || pos >= v->cur) return NULL; - - return v->entry[pos]; -} - -static int _vector_add(_psl_vector_t *v, const _psl_entry_t *elem) -{ - if (v) { - void *elemp; - - elemp = malloc(sizeof(_psl_entry_t)); - memcpy(elemp, elem, sizeof(_psl_entry_t)); - - if (v->max == v->cur) - v->entry = realloc(v->entry, (v->max *= 2) * sizeof(_psl_entry_t *)); - - v->entry[v->cur++] = elemp; - return v->cur - 1; - } - - return -1; -} - -static int _compare(const void *p1, const void *p2, void *v) -{ - return ((_psl_vector_t *)v)->cmp(*((_psl_entry_t **)p1), *((_psl_entry_t **)p2)); -} - -static void _vector_sort(_psl_vector_t *v) -{ - if (v && v->cmp) - qsort_r(v->entry, v->cur, sizeof(_psl_vector_t *), _compare, v); -} - -/* by this kind of sorting, we can easily see if a domain matches or not (match = supercookie !) 
*/ - -static int _suffix_compare(const _psl_entry_t *s1, const _psl_entry_t *s2) -{ - int n; - - if ((n = s2->nlabels - s1->nlabels)) - return n; /* most labels first */ - - if ((n = s1->length - s2->length)) - return n; /* shorter rules first */ - - return strcmp(s1->label, s2->label); -} - -static void _suffix_init(_psl_entry_t *suffix, const char *rule, size_t length) -{ - const char *src; - char *dst; - - suffix->label = suffix->label_buf; - - if (length >= sizeof(suffix->label_buf) - 1) { - suffix->nlabels = 0; - fprintf(stderr, "Suffix rule too long (%d, ignored): %s\n", (int) length, rule); - return; - } - - if (*rule == '*') { - if (*++rule != '.') { - suffix->nlabels = 0; - fprintf(stderr, "Unsupported kind of rule (ignored): %s\n", rule); - return; - } - rule++; - suffix->wildcard = 1; - suffix->length = (unsigned char)length - 2; - } else { - suffix->wildcard = 0; - suffix->length = (unsigned char)length; - } - - suffix->nlabels = 1; - - for (dst = suffix->label_buf, src = rule; *src;) { - if (*src == '.') - suffix->nlabels++; - *dst++ = tolower(*src++); - } - *dst = 0; -} - -psl_ctx_t *psl_load_file(const char *fname) -{ - FILE *fp; - psl_ctx_t *psl = NULL; - - if ((fp = fopen(fname, "r"))) { - psl = psl_load_fp(fp); - fclose(fp); - } - - return psl; -} - -psl_ctx_t *psl_load_fp(FILE *fp) -{ - psl_ctx_t *psl; - _psl_entry_t suffix, *suffixp; - int nsuffixes = 0; - char buf[256], *linep, *p; - - if (!fp) - return NULL; - - if (!(psl = calloc(1, sizeof(psl_ctx_t)))) - return NULL; - - /* - * as of 02.11.2012, the list at http://publicsuffix.org/list/ contains ~6000 rules and 40 exceptions. - * as of 19.02.2014, the list at http://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions. 
- */ - psl->suffixes = _vector_alloc(8*1024, _suffix_compare); - psl->suffix_exceptions = _vector_alloc(64, _suffix_compare); - - while ((linep = fgets(buf, sizeof(buf), fp))) { - while (isspace(*linep)) linep++; /* ignore leading whitespace */ - if (!*linep) continue; /* skip empty lines */ - - if (*linep == '/' && linep[1] == '/') - continue; /* skip comments */ - - /* parse suffix rule */ - for (p = linep; *linep && !isspace(*linep);) linep++; - *linep = 0; - - if (*p == '!') { - /* add to exceptions */ - _suffix_init(&suffix, p + 1, linep - p - 1); - suffixp = _vector_get(psl->suffix_exceptions, _vector_add(psl->suffix_exceptions, &suffix)); - } else { - _suffix_init(&suffix, p, linep - p); - suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix)); - } - - if (suffixp) - suffixp->label = suffixp->label_buf; /* set label to changed address */ - - nsuffixes++;; - } - - _vector_sort(psl->suffix_exceptions); - _vector_sort(psl->suffixes); - - return psl; -} +/* here we include the library source code to have access to internal functions and data structures */ +#define _LIBPSL_INCLUDED_BY_PSL2C +# include "psl.c" +#undef _LIBPSL_INCLUDED_BY_PSL2C static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char *varname) { @@ -283,8 +62,8 @@ static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char * fprintf(fpout, "/* automatically generated by psl2c (punycode generated with libicu/%s) */\n", version); } while (0); #else - fprintf(fpout, "/* automatically generated by psl2c (punycode generated with idn2) */\n"); -#endif + fprintf(fpout, "/* automatically generated by psl2c (without punycode support) */\n"); +#endif /* WITH_LIBICU */ fprintf(fpout, "static _psl_entry_t %s[] = {\n", varname); @@ -298,15 +77,7 @@ static void _print_psl_entries(FILE *fpout, const _psl_vector_t *v, const char * fprintf(fpout, "};\n"); } -void psl_free(psl_ctx_t *psl) -{ - if (psl) { - _vector_free(&psl->suffixes); - 
_vector_free(&psl->suffix_exceptions); - free(psl); - } -} - +#ifndef WITH_LIBICU static int _str_needs_encoding(const char *s) { while (*s > 0) s++; @@ -326,60 +97,6 @@ static void _add_punycode_if_needed(_psl_vector_t *v) _psl_entry_t suffix, *suffixp; char lookupname[64] = ""; - /* the following lines will have GPL3+ license issues */ -/* char *asc = NULL; - int rc; - - if ((rc = idn2_lookup_u8((uint8_t *)e->label_buf, (uint8_t **)&asc, 0)) == IDN2_OK) { - // fprintf(stderr, "idn2 '%s' -> '%s'\n", e->label_buf, asc); - _suffix_init(&suffix, asc, strlen(asc)); - suffix.wildcard = e->wildcard; - suffixp = _vector_get(v, _vector_add(v, &suffix)); - suffixp->label = suffixp->e_label_buf; // set label to changed address - } else - fprintf(stderr, "toASCII(%s) failed (%d): %s\n", e->label_buf, rc, idn2_strerror(rc)); -*/ - -#ifdef WITH_LIBICU - UIDNA *idna; - UErrorCode status = 0; - - /* IDNA2003 punycode conversion */ - /* destLen = uidna_toASCII(e->label_buf, (int32_t) strlen(e->label_buf), lookupname, (int32_t) sizeof(lookupname), - UIDNA_DEFAULT, NULL, &status); - */ - - /* IDNA2008 UTS#46 punycode conversion */ - if ((idna = uidna_openUTS46(UIDNA_USE_STD3_RULES, &status))) { - UChar utf16_dst[64], utf16_src[64]; - int32_t utf16_src_length; - UIDNAInfo info = UIDNA_INFO_INITIALIZER; - - u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, e->label_buf, (int32_t) strlen(e->label_buf), &status); - if (U_SUCCESS(status)) { - int32_t dst_length = uidna_nameToASCII(idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status); - if (U_SUCCESS(status)) { - u_strToUTF8(lookupname, (int32_t) sizeof(lookupname), NULL, utf16_dst, dst_length, &status); - if (U_SUCCESS(status)) { - if (strcmp(e->label_buf, lookupname)) { - /* fprintf(stderr, "libicu '%s' -> '%s'\n", e->label_buf, lookupname); */ - _suffix_init(&suffix, lookupname, strlen(lookupname)); - suffix.wildcard = e->wildcard; - suffixp = 
_vector_get(v, _vector_add(v, &suffix)); - suffixp->label = suffixp->label_buf; /* set label to changed address */ - } /* else ignore */ - } else - fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); - } else - fprintf(stderr, "Failed to convert to ASCII (status %d)\n", status); - } else - fprintf(stderr, "Failed to convert UTF-8 to UTF-16 (status %d)\n", status); - - uidna_close(idna); - } else - fprintf(stderr, "Failed to get UTS46 IDNA handle\n"); - -#else /* this is much slower than the libidn2 API but should have no license issues */ FILE *pp; char cmd[16 + sizeof(e->label_buf)]; @@ -395,12 +112,13 @@ static void _add_punycode_if_needed(_psl_vector_t *v) pclose(pp); } else fprintf(stderr, "Failed to call popen(%s, \"r\")\n", cmd); -#endif } } _vector_sort(v); } +#endif /* ! WITH_LIBICU */ + #endif /* WITH_BUILTIN */ int main(int argc, const char **argv) @@ -413,7 +131,7 @@ int main(int argc, const char **argv) if (argc != 3) { fprintf(stderr, "Usage: psl2c \n"); - fprintf(stderr, " is the 'effective_tld_names.dat' (aka Public Suffix List)\n"); + fprintf(stderr, " is the 'effective_tld_names.dat' (aka Public Suffix List), lowercase UTF-8 encoded\n"); fprintf(stderr, " is the the C filename to be generated from \n"); return 1; } @@ -428,8 +146,12 @@ int main(int argc, const char **argv) size_t cmdsize = 16 + strlen(argv[1]); char *cmd = alloca(cmdsize), checksum[64] = ""; +#ifndef WITH_LIBICU + /* If libicu is not configured, we still need to have punycode in our built-in data. */ + /* Else the test suite fails. 
*/ _add_punycode_if_needed(psl->suffixes); _add_punycode_if_needed(psl->suffix_exceptions); +#endif _print_psl_entries(fpout, psl->suffixes, "suffixes"); _print_psl_entries(fpout, psl->suffix_exceptions, "suffix_exceptions"); @@ -458,8 +180,8 @@ int main(int argc, const char **argv) psl_free(psl); #else if ((fpout = fopen(argv[2], "w"))) { - fprintf(fpout, "static _psl_entry_t suffixes[0];\n"); - fprintf(fpout, "static _psl_entry_t suffix_exceptions[0];\n"); + fprintf(fpout, "static _psl_entry_t suffixes[1];\n"); + fprintf(fpout, "static _psl_entry_t suffix_exceptions[1];\n"); fprintf(fpout, "static time_t _psl_file_time;\n"); fprintf(fpout, "static time_t _psl_compile_time;\n"); fprintf(fpout, "static const char _psl_sha1_checksum[] = \"\";\n"); diff --git a/tests/test-is-public-builtin.c b/tests/test-is-public-builtin.c index ffcac12..2ccde36 100644 --- a/tests/test-is-public-builtin.c +++ b/tests/test-is-public-builtin.c @@ -47,8 +47,8 @@ static int static void test_psl(void) { - /* punycode generation: idn 商标 */ - /* octal code generation: echo -n "商标" | od -b */ + /* punycode generation: idn ?? */ + /* octal code generation: echo -n "??" | od -b */ static const struct test_data { const char *domain; @@ -65,7 +65,7 @@ static void test_psl(void) { "abc.www.ck", 0 }, { "xxx.ck", 1 }, { "www.xxx.ck", 0 }, - { "\345\225\206\346\240\207", 1 }, /* xn--czr694b oder 商标 */ + { "\345\225\206\346\240\207", 1 }, /* xn--czr694b oder ?? */ { "www.\345\225\206\346\240\207", 0 }, { "xn--czr694b", 1 }, { "www.xn--czr694b", 0 }, diff --git a/tests/test-is-public.c b/tests/test-is-public.c index 1f3b8b3..0afbdd7 100644 --- a/tests/test-is-public.c +++ b/tests/test-is-public.c @@ -47,8 +47,8 @@ static int static void test_psl(void) { - /* punycode generation: idn 商标 */ - /* octal code generation: echo -n "商标" | od -b */ + /* punycode generation: idn ?? */ + /* octal code generation: echo -n "??" 
| od -b */ static const struct test_data { const char *domain; @@ -65,7 +65,7 @@ static void test_psl(void) { "abc.www.ck", 0 }, { "xxx.ck", 1 }, { "www.xxx.ck", 0 }, - { "\345\225\206\346\240\207", 1 }, /* xn--czr694b oder 商标 */ + { "\345\225\206\346\240\207", 1 }, /* xn--czr694b or ?? */ { "www.\345\225\206\346\240\207", 0 }, /* some special test follow ('name' and 'forgot.his.name' are public, but e.g. his.name is not) */ { "name", 1 }, diff --git a/tests/test-registrable-domain.c b/tests/test-registrable-domain.c index c075500..8bc06b1 100644 --- a/tests/test-registrable-domain.c +++ b/tests/test-registrable-domain.c @@ -38,6 +38,11 @@ #include #include +#ifdef WITH_LIBICU +# include +# include +#endif + #include static int @@ -47,32 +52,11 @@ static int static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result) { const char *result; - char lookupname[128]; + char *lower; - /* check if there might be some utf-8 characters */ - if (domain) { - int utf8; - const char *p; - - for (p = domain, utf8 = 0; *p && !utf8; p++) - if (*p < 0) - utf8 = 1; - - /* if we found utf-8, make sure to convert domain correctly to lowercase */ - /* does it work, if we are not in a utf-8 env ? */ - if (utf8) { - FILE *pp; - size_t cmdsize = 48 + strlen(domain); - char *cmd = alloca(cmdsize); - - snprintf(cmd, cmdsize, "echo -n '%s' | sed -e 's/./\\L\\0/g'", domain); - if ((pp = popen(cmd, "r"))) { - if (fscanf(pp, "%127s", lookupname) >= 1) - domain = lookupname; - pclose(pp); - } - } - } + /* our test data is fixed to UTF-8 (english), so provide it here */ + if (psl_str_to_utf8lower(domain, "utf-8", "en", &lower) == PSL_SUCCESS) + domain = lower; result = psl_registrable_domain(psl, domain); @@ -83,13 +67,15 @@ static void test(const psl_ctx_t *psl, const char *domain, const char *expected_ printf("psl_registrable_domain(%s)=%s (expected %s)\n", domain, result ? result : "NULL", expected_result ? 
expected_result : "NULL"); } + + free(lower); } static void test_psl(void) { FILE *fp; const psl_ctx_t *psl; - char buf[256], domain[128], expected_regdom[128], *p; + char buf[256], domain[128], expected_regdom[128]; psl = psl_builtin(); @@ -105,7 +91,9 @@ static void test_psl(void) test(NULL, "com", NULL); /* Norwegian with uppercase oe */ +#ifdef WITH_LIBICU test(psl, "www.\303\230yer.no", "www.\303\270yer.no"); +#endif /* Norwegian with lowercase oe */ test(psl, "www.\303\270yer.no", "www.\303\270yer.no"); @@ -126,11 +114,6 @@ static void test_psl(void) continue; } - /* we have to lowercase the domain - the PSL API just takes lowercase */ - for (p = domain; *p; p++) - if (*p > 0 && isupper(*p)) - *p = tolower(*p); - if (!strcmp(expected_regdom, "null")) test(psl, domain, NULL); else diff --git a/tools/psl.c b/tools/psl.c index 3764599..976ada6 100644 --- a/tools/psl.c +++ b/tools/psl.c @@ -32,8 +32,16 @@ # include #endif +#include #include #include +#include + +#ifdef WITH_LIBICU +# include +# include +#endif + #include static void usage(int err, FILE* f) @@ -71,6 +79,10 @@ int main(int argc, const char *const *argv) const char *const *arg, *psl_file = NULL, *cookie_domain = NULL; psl_ctx_t *psl = (psl_ctx_t *) psl_builtin(); + /* set current locale according to the environment variables */ + #include + setlocale(LC_ALL, ""); + for (arg = argv + 1; arg < argv + argc; arg++) { if (!strncmp(*arg, "--", 2)) { if (!strcmp(*arg, "--is-public-suffix")) @@ -135,8 +147,41 @@ int main(int argc, const char *const *argv) exit(2); } if (arg >= argv + argc) { - fprintf(stderr, "No domains given - aborting\n"); - exit(3); + char buf[256], *domain, *lower; + size_t len; + psl_error_t rc; + + /* read URLs from STDIN */ + while (fgets(buf, sizeof(buf), stdin)) { + for (domain = buf; isspace(*domain); domain++); /* skip leading spaces */ + if (*domain == '#' || !*domain) continue; /* skip empty lines and comments */ + for (len = strlen(domain); len && isspace(domain[len - 1]); 
len--); /* skip trailing spaces */ + domain[len] = 0; + + if ((rc = psl_str_to_utf8lower(domain, NULL, NULL, &lower)) != PSL_SUCCESS) + fprintf(stderr, "%s: Failed to convert to lowercase UTF-8 (%d)\n", domain, rc); + else if (mode == 1) + printf("%s: %d (%s)\n", domain, psl_is_public_suffix(psl, lower), lower); + else if (mode == 2) + printf("%s: %s\n", domain, psl_unregistrable_domain(psl, lower)); + else if (mode == 3) + printf("%s: %s\n", domain, psl_registrable_domain(psl, lower)); + else if (mode == 4) { + char *cookie_domain_lower; + + if ((rc = psl_str_to_utf8lower(domain, NULL, NULL, &cookie_domain_lower)) != PSL_SUCCESS) + fprintf(stderr, "%s: Failed to convert cookie domain '%s' to lowercase UTF-8 (%d)\n", domain, cookie_domain, rc); + else + printf("%s: %d\n", domain, psl_is_cookie_domain_acceptable(psl, lower, cookie_domain)); + + free(cookie_domain_lower); + } + + free(lower); + } + + psl_free(psl); + exit(0); } } @@ -172,6 +217,11 @@ int main(int argc, const char *const *argv) printf("builtin compile time: %ld (%s)\n", psl_builtin_compile_time(), time2str(psl_builtin_compile_time())); printf("builtin file time: %ld (%s)\n", psl_builtin_file_time(), time2str(psl_builtin_file_time())); printf("builtin SHA1 file hash: %s\n", psl_builtin_sha1sum()); + +#ifdef WITH_LIBICU + printf("uloc_getDefault=%s\n", uloc_getDefault()); + printf("ucnv_getDefaultName=%s\n", ucnv_getDefaultName()); +#endif } else printf("No builtin PSL data available\n"); }