Release v0.15.0

-----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEHLJ9vJhhSy1YQWRtCDAttqJnBCgFAlgppfATHHRpbS5ydWVo
 c2VuQGdteC5kZQAKCRAIMC22omcEKGy/D/9iduEEwzSDt22U6MxmqD77hvgB9hQn
 8Xn7CsTye408EUlw2ENYg4H/V3xNQN7ZbA4wJi20FmcniFhSUbSv9UD5Vr2FSTZS
 NJ1EpAbqljswE5x49u3lWRyo8XOEbVdWZS66+E5W9T/0Nl6kLUk4nYkBE6LBQGhp
 vd6+p74kqpjJGHhrZ4uYV5bkttoeSee/arGzvWTR3kmgERVCm9Qr90ldOx3Sp91s
 iqwb6RpDVkL3q5sA9bOfrpEDdADJdQYLr1BkkTOb7ZA52uEhdU6nEyfswoJsaBuI
 aj1hOgspekVqEs7ZUpltnT2GPbFyXtj338SA0738xxZaTm/eYzvNea5Fnpg4fnQb
 /w7I++IZGmdXljQnk1gtqzIgxCwia34u2/T4XgEpyd/h9A5PUdjo2EKPtBgHRFG7
 GnK9IRgLHqdxZFpfiUyp2zIZL8+/PUlD5Ekwi1D3Wgc5PSOO0rMHR1IWzCmpopbU
 Mo9E511RcIdsn+IStB1gwclT5qk1fo3n5dcQBBXtpPTEJ6CRedLK+WcbLyhh3R0Z
 ham1D8t3kVDQgfg57mEJOIS5sgcLj5LR3ydya5ELf3pS6FVo4qvBO4Sp3E6wbgpE
 9n5D150bKyv+RkTuNTgW8uahhYdR++bXUPWbaZReGVxKy3VB7VikDusRfnVFej9c
 cJP1HAskz6qTwA==
 =ksJN
 -----END PGP SIGNATURE-----

Merge tag 'libpsl-0.15.0' into debian

Release v0.15.0
This commit is contained in:
Daniel Kahn Gillmor 2016-11-15 08:38:51 +09:00
commit 4ef2e7c54b
23 changed files with 630 additions and 206 deletions

9
.gitignore vendored
View File

@ -1,4 +1,7 @@
*.exe
*.gcda
*.gcno
*.gcov
*.gz
*.la
*.lo
@ -10,6 +13,7 @@
*.cache
*.plist
*.stamp
ABOUT-NLS
aclocal.m4
ar-lib
autom4te.cache/
@ -43,6 +47,8 @@ gtk-doc.m4
gtk-doc.make
include/libpsl.h
install-sh
lcov/
libpsl.info
libpsl.pc
libtool
ltmain.sh
@ -67,10 +73,13 @@ po/remove-potcdate.sed
po/stamp-po
src/psl2c
src/suffixes.c
src/suffixes_dafsa.c
stamp-h1
test-driver
tests/*.log
tests/*.trs
tests/psl.dafsa
tests/psl_ascii.dafsa
tests/test-is-cookie-domain-acceptable
tests/test-is-public
tests/test-is-public-all

View File

@ -34,6 +34,7 @@ addons:
- libicu-dev
- libunistring0
- libunistring-dev
- lcov
script:
- ./autogen.sh
@ -44,3 +45,4 @@ script:
- ./configure --enable-runtime=$RUNTIME --disable-builtin && make clean && make -j4 && make check -j4
- ./configure --enable-gtk-doc && make -j4 && make check -j4
- make distcheck
- if [[ $CC == "gcc" && $RUNTIME == "libicu" ]]; then ./.travis_coveralls.sh; fi

5
.travis_coveralls.sh Executable file
View File

@ -0,0 +1,5 @@
#!/bin/bash
make check-coverage-libicu
pip install --user cpp-coveralls
coveralls --include libwget/ --include src/ -e "src/psl2c.c"

View File

@ -16,3 +16,5 @@ Christopher Meng (Fedora building)
Jakub Čajka
Giuseppe Scrivano
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
Daurnimator (Code review, discussion, reports)
Olle Liljenzin (Original DAFSA implementation and UTF-8 patch)

View File

@ -19,3 +19,28 @@ dist-hook:
mkdir -p $(distdir)/list/tests
cp -p $(PSL_FILE) $(distdir)/list
cp -p $(PSL_TESTFILE) $(distdir)/list/tests
clean-local:
rm -rf */*.gc?? */*/*.gc?? libpsl.info lcov
check-coverage:
if test -z "$(XLIB)"; then \
CFLAGS=$$CFLAGS" --coverage -O0" LDFLAGS=$$LDFLAGS" --coverage" ./configure --disable-runtime --disable-builtin; \
else \
CFLAGS=$$CFLAGS" --coverage -O0" LDFLAGS=$$LDFLAGS" --coverage" ./configure --enable-runtime=$(XLIB) --enable-builtin=$(XLIB); \
fi
$(MAKE) clean && $(MAKE)
lcov --capture --initial --directory src --output-file libpsl.info
$(MAKE) check
lcov --capture --directory src --output-file libpsl.info
lcov --remove libpsl.info 'src/psl2c.c' -o libpsl.info
genhtml --prefix . libpsl.info --legend --title "libpsl" --output-directory=lcov
check-coverage-libidn:
XLIB=libidn $(MAKE) check-coverage
check-coverage-libidn2:
XLIB=libidn2 $(MAKE) check-coverage
check-coverage-libicu:
XLIB=libicu $(MAKE) check-coverage

9
NEWS
View File

@ -1,5 +1,14 @@
Copyright (C) 2014-2016 Tim Rühsen
14.11.2016 Release V0.15.0
* Python3 compatibility for psl-make-dafsa
* Support for UTF-8 in DAFSA data
* Skip punycode conversion if DAFSA has UTF-8
* Better code coverage by test suite
* Code cleanup and enhancements
* Install man pages for psl-make-dafsa and psl
* Enhancements to the documentation
30.07.2016 Release V0.14.0
* Remove unneeded libraries from tools/psl link step
* Use https instead of http where possible

View File

@ -1,4 +1,12 @@
[![Build Status](https://travis-ci.org/rockdaboot/libpsl.png?branch=master)](https://travis-ci.org/rockdaboot/libpsl)
[![Travis-CI Status](https://travis-ci.org/rockdaboot/libpsl.png?branch=master)](https://travis-ci.org/rockdaboot/libpsl)
[![Coverity Scan](https://img.shields.io/coverity/scan/10227.svg)](https://scan.coverity.com/projects/rockdaboot-libpsl)
[![Coverage Status](https://coveralls.io/repos/github/rockdaboot/libpsl/badge.svg?branch=master)](https://coveralls.io/github/rockdaboot/libpsl?branch=master)
Solaris OpenCSW [![Build Status Solaris amd64](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-amd64)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-amd64)
[![Build Status Solaris i386](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-i386)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-i386)
[![Build Status Solaris Sparc](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-sparc)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-sparc)
[![Build Status Solaris SparcV9](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-sparcv9)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-sparcv9)
libpsl - C library to handle the Public Suffix List
===================================================
@ -116,7 +124,7 @@ Mailing List
To join the mailing list send an email to
<libpsl-bugs+subscribe@googlegroups.com>
libpsl-bugs+subscribe@googlegroups.com
and follow the instructions provided by the answer mail.

View File

@ -1,7 +1,7 @@
AC_INIT([libpsl], [0.14.0], [tim.ruehsen@gmx.de], [libpsl], [https://github.com/rockdaboot/libpsl])
AC_INIT([libpsl], [0.15.0], [tim.ruehsen@gmx.de], [libpsl], [https://github.com/rockdaboot/libpsl])
AC_PREREQ([2.59])
AM_INIT_AUTOMAKE([1.10 -Wall no-define foreign])
AM_INIT_AUTOMAKE([1.10 no-define foreign])
# Generate two configuration headers; one for building the library itself with
# an autogenerated template, and a second one that will be installed alongside
@ -88,7 +88,7 @@ PKG_PROG_PKG_CONFIG
# 4. If any interfaces have been added, removed, or changed since the last update, increment current, and set revision to 0.
# 5. If any interfaces have been added since the last public release, then increment age.
# 6. If any existing interfaces have been removed or changed since the last public release, then set age to 0.
AC_SUBST([LIBPSL_SO_VERSION], [5:1:0])
AC_SUBST([LIBPSL_SO_VERSION], [5:2:0])
AC_SUBST([LIBPSL_VERSION], $VERSION)
# Check for enable/disable builtin PSL data
@ -168,7 +168,7 @@ if test "$enable_runtime" = "libicu" -o "$enable_builtin" = "libicu"; then
[AC_LANG_PROGRAM(
[[#include <unicode/ustring.h>]],
[[u_strToUTF8(NULL, 0, NULL, NULL, 0, NULL);]])],
[HAVE_LIBICU=yes; AC_MSG_RESULT([yes])],
[HAVE_LIBICU=yes; LIBICU_LIBS="-licuuc"; AC_MSG_RESULT([yes])],
[AC_MSG_RESULT([no]); AC_MSG_ERROR(You requested libicu but it is not installed.)])
LIBS=$OLDLIBS
])
@ -191,7 +191,7 @@ fi
if test "x$HAVE_LIBIDN2" = "xyes" -o "x$HAVE_LIBIDN" = "xyes"; then
# Check for libunistring, we need it for psl_str_to_utf8lower()
OLDLIBS=$LIBS
AC_SEARCH_LIBS(u8_tolower, unistring, HAVE_UNISTRING=yes, AC_MSG_ERROR(You requested libidn2 but libunistring is not installed.))
AC_SEARCH_LIBS(u8_tolower, unistring, HAVE_UNISTRING=yes, AC_MSG_ERROR(You requested libidn2|libidn but libunistring is not installed.))
LIBS=$OLDLIBS
fi

View File

@ -51,6 +51,7 @@ for CC in gcc clang; do
for xLCALL in C tr_TR.utf8; do
export TESTS_ENVIRONMENT="LC_ALL=$xLCALL VALGRIND_TESTS=$xVALGRIND"
echo " *** TESTS_ENVIRONMENT=\"$TESTS_ENVIRONMENT\"" make check -j$CORES
make clean > /dev/null
make check -j$CORES > /dev/null
done
done

View File

@ -53,10 +53,11 @@ extern "C" {
* psl_error_t:
* @PSL_SUCCESS: Successful return.
* @PSL_ERR_INVALID_ARG: Invalid argument.
* @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter
* @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter.
* @PSL_ERR_TO_UTF16: Failed to convert to utf-16.
* @PSL_ERR_TO_LOWER: Failed to convert utf-16 to lowercase.
* @PSL_ERR_TO_UTF8: Failed to convert utf-16 to utf-8.
* @PSL_ERR_NO_MEM: Failed to allocate memory.
*
* Return codes for PSL functions.
* Negative return codes mean failure.
@ -66,9 +67,10 @@ typedef enum {
PSL_SUCCESS = 0,
PSL_ERR_INVALID_ARG = -1,
PSL_ERR_CONVERTER = -2, /* failed to open libicu utf-16 converter */
PSL_ERR_TO_UTF16 = -3, /* failed to convert to utf-16 */
PSL_ERR_TO_LOWER = -4, /* failed to convert utf-16 to lowercase */
PSL_ERR_TO_UTF8 = -5 /* failed to convert utf-16 to utf-8 */
PSL_ERR_TO_UTF16 = -3, /* failed to convert to utf-16 */
PSL_ERR_TO_LOWER = -4, /* failed to convert utf-16 to lowercase */
PSL_ERR_TO_UTF8 = -5, /* failed to convert utf-16 to utf-8 */
PSL_ERR_NO_MEM = -6 /* failed to allocate memory */
} psl_error_t;
typedef struct _psl_ctx_st psl_ctx_t;

2
list

@ -1 +1 @@
Subproject commit 1df90f84db1a041991a48e46e786705f7161ab4c
Subproject commit 41a519ad34cf86ff4470b967d9e4755d72b63a6c

View File

@ -11,7 +11,7 @@ libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
# include ABI version information
libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
if WITH_LIBICU
libpsl_la_LDFLAGS += -licuuc
libpsl_la_LDFLAGS += $(LIBICU_LIBS)
endif
if WITH_LIBIDN2
libpsl_la_LDFLAGS += -lidn2 -lunistring
@ -24,7 +24,7 @@ noinst_PROGRAMS = psl2c
psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/psl-make-dafsa\"
if BUILTIN_GENERATOR_LIBICU
psl2c_LDADD = -licuuc
psl2c_LDADD = $(LIBICU_LIBS)
endif
if BUILTIN_GENERATOR_LIBIDN2
psl2c_LDADD = @LTLIBICONV@ -lidn2 -lunistring
@ -39,3 +39,5 @@ suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
EXTRA_DIST = psl-make-dafsa LICENSE.chromium
dist_man_MANS = psl-make-dafsa.1

View File

@ -21,6 +21,48 @@
#define CHECK_LT(a, b) if ((a) >= b) return 0
/*
 * UTF-8 sequence length, indexed by the high nibble of the leading byte.
 * A zero entry means the byte does not open a multibyte sequence.
 */
static const char multibyte_length_table[16] = {
  /* 0x00-0x7F: single byte characters */
  0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x80-0xBF: continuation bytes */
  0, 0, 0, 0,
  /* 0xC0-0xDF: leading byte of a 2-byte sequence */
  2, 2,
  /* 0xE0-0xEF: leading byte of a 3-byte sequence */
  3,
  /* 0xF0-0xFF: treated as leading byte of a 4-byte sequence */
  4,
};

/**
 * Get length of the multibyte character sequence starting at a given byte.
 * Returns zero if the byte does not start a multibyte sequence.
 * Only the high nibble is inspected, so every byte in 0xF0-0xFF yields 4.
 */
static int GetMultibyteLength(char c) {
  const unsigned int high_nibble = ((unsigned char)c) >> 4;

  return multibyte_length_table[high_nibble];
}
/**
 * Steps the DAFSA cursor one byte forward and keeps the key cursor and the
 * multibyte state in sync with it.
 *
 * pos:             cursor into the DAFSA, always advanced by one byte.
 * key:             cursor into the lookup key.
 * multibyte_start: start of the multibyte sequence currently being matched,
 *                  or 0 when matching single byte characters.
 */
static void NextPos(const unsigned char** pos,
                    const char** key,
                    const char** multibyte_start)
{
  ++*pos;

  if (!*multibyte_start) {
    if (GetMultibyteLength(**key)) {
      /* The dafsa byte just consumed was the multibyte prefix. The key stays
       * where it is; its leading byte is matched in the next round. */
      *multibyte_start = *key;
    } else {
      /* A single byte character was matched, consume it. */
      ++*key;
    }
    return;
  }

  /* Inside a multibyte sequence: consume one byte of it. */
  ++*key;

  /* Leave multibyte mode once the whole sequence has been consumed. */
  if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
    *multibyte_start = 0;
}
/*
* Read next offset from pos.
* Returns true if an offset could be read, false otherwise.
@ -71,6 +113,35 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
return(*offset & 0x80) != 0;
}
/*
 * Check if a (decoded) dafsa byte matches the first character in key.
 * No range check is done here; the caller must already have verified that
 * the byte was read from inside the graph.
 *
 * Outside multibyte mode an ASCII key byte must equal the matcher, while a
 * key byte that starts a multibyte sequence must be announced in the dafsa
 * by the special mode switch byte 0x1F.
 * Inside multibyte mode the leading byte is compared after flipping 0x80 in
 * the matcher and every following byte after flipping 0xC0.
 */
static int IsMatchUnchecked(const unsigned char matcher,
                            const char* key,
                            const char* multibyte_start)
{
  const unsigned char key_byte = (const unsigned char)*key;

  if (!multibyte_start) {
    /* Not in multibyte mode yet: expect the mode switch for a leading byte,
     * otherwise do a plain single byte comparison. */
    if (GetMultibyteLength(*key))
      return matcher == 0x1F;
    return matcher == key_byte;
  }

  /* Multibyte matching mode. Matching the leading byte also matches the
   * sequence length. */
  if (multibyte_start == key)
    return (matcher ^ 0x80) == key_byte;

  return (matcher ^ 0xC0) == key_byte;
}
/*
* Check if byte at offset matches first character in key.
* This version matches characters not last in label.
@ -78,10 +149,11 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
static int IsMatch(const unsigned char* offset,
const unsigned char* end,
const char* key)
const char* key,
const char* multibyte_start)
{
CHECK_LT(offset, end);
return *offset == *key;
return IsMatchUnchecked(*offset, key, multibyte_start);
}
/*
@ -91,10 +163,11 @@ static int IsMatch(const unsigned char* offset,
static int IsEndCharMatch(const unsigned char* offset,
const unsigned char* end,
const char* key)
const char* key,
const char* multibyte_start)
{
CHECK_LT(offset, end);
return *offset == (*key | 0x80);
return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
}
/*
@ -104,10 +177,11 @@ static int IsEndCharMatch(const unsigned char* offset,
static int GetReturnValue(const unsigned char* offset,
const unsigned char* end,
const char* multibyte_start,
int* return_value)
{
CHECK_LT(offset, end);
if ((*offset & 0xE0) == 0x80) {
if (!multibyte_start && (*offset & 0xE0) == 0x80) {
*return_value = *offset & 0x0F;
return 1;
}
@ -140,6 +214,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
const unsigned char* end = graph + length;
const unsigned char* offset = pos;
const char* key_end = key + key_length;
const char* multibyte_start = 0;
while (GetNextOffset(&pos, end, &offset)) {
/*char <char>+ end_char offsets
@ -153,11 +228,10 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
if (key != key_end && !IsEOL(offset, end)) {
/* Leading <char> is not a match. Don't dive into this child */
if (!IsMatch(offset, end, key))
if (!IsMatch(offset, end, key, multibyte_start))
continue;
did_consume = 1;
++offset;
++key;
NextPos(&offset, &key, &multibyte_start);
/* Possible matches at this point:
* <char>+ end_char offsets
* <char>+ return value
@ -167,10 +241,9 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
/* Remove all remaining <char> nodes possible */
while (!IsEOL(offset, end) && key != key_end) {
if (!IsMatch(offset, end, key))
if (!IsMatch(offset, end, key, multibyte_start))
return -1;
++key;
++offset;
NextPos(&offset, &key, &multibyte_start);
}
}
/* Possible matches at this point:
@ -182,7 +255,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
if (key == key_end) {
int return_value;
if (GetReturnValue(offset, end, &return_value))
if (GetReturnValue(offset, end, multibyte_start, &return_value))
return return_value;
/* The DAFSA guarantees that if the first char is a match, all
* remaining char elements MUST match if the key is truly present.
@ -191,14 +264,22 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
return -1;
continue;
}
if (!IsEndCharMatch(offset, end, key)) {
if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
if (did_consume)
return -1; /* Unexpected */
continue;
}
++key;
pos = ++offset; /* Dive into child */
NextPos(&offset, &key, &multibyte_start);
pos = offset; /* Dive into child */
}
return -1; /* No match */
}
/* Declared ahead of the definition so that -Wmissing-prototypes stays quiet. */
int _HIDDEN GetUtfMode(const unsigned char *graph, size_t length);

/*
 * Returns 1 if the graph is non-empty and its last byte is below 0x80,
 * 0 otherwise. psl-make-dafsa appends a trailing 0x01 byte when it writes
 * the DAFSA in UTF-8 mode, which is presumably what this detects.
 */
int _HIDDEN GetUtfMode(const unsigned char *graph, size_t length)
{
  if (length == 0)
    return 0;

  return graph[length - 1] < 0x80;
}

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python2
#!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE.chromium file.
@ -14,8 +14,9 @@ This python program fetches strings and return values from a gperf file
and generates a C++ file with a byte array representing graph that can be
used as a memory efficient replacement for the perfect hash table.
The input strings are assumed to consist of printable 7-bit ASCII characters
and the return values are assumed to be one digit integers.
The input strings must consist of printable 7-bit ASCII characters or UTF-8
multibyte sequences. Control characters in the range [0x00-0x1F] are not
allowed. The return values must be one digit integers.
In this program a DAFSA is a diamond shaped graph starting at a common
source node and ending at a common sink node. All internal nodes contain
@ -47,8 +48,8 @@ The generated byte array can described by the following BNF:
<byte> ::= < 8-bit value in range [0x00-0xFF] >
<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
<char> ::= < byte in range [0x1F-0x7F] >
<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
<offset1> ::= < byte in range [0x00-0x3F] >
@ -84,13 +85,18 @@ The generated byte array can described by the following BNF:
| <prefix> <node>
| <end_label>
<dafsa> ::= <source>
| <dafsa> <node>
<graph> ::= <source>
| <graph> <node>
<version> ::= <empty> # The DAFSA was generated in ASCII mode.
| < byte value 0x01 > # The DAFSA was generated in UTF-8 mode.
<dafsa> ::= <graph> <version>
Decoding:
<char> -> printable 7-bit ASCII character
<end_char> & 0x7F -> printable 7-bit ASCII character
<char> -> character
<end_char> & 0x7F -> character
<return value> & 0x0F -> integer
<offset1> & 0x3F -> integer
((<offset2> & 0x1F) << 8) + <byte> -> integer
@ -105,6 +111,28 @@ between previous child node and next child node. Thus each offset links a node
to a child node. The distance is always counted between start addresses, i.e.
first byte in decoded offset or first byte in child node.
Transcoding of UTF-8 multibyte sequences:
The original DAFSA format was limited to 7-bit printable ASCII characters in
range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
By transcoding of such characters the new format preserves compatibility with
old parsers, so that a DAFSA in the extended format can be used by an old
parser without false positives, although strings containing transcoded
characters will never match. Since the format is extended rather than being
changed, a parser supporting the new format will automatically support data
generated in the old format.
Transcoding is performed by insertion of a start byte with the special value
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
the range of printable ASCII.
2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
Example 1:
%%
@ -197,8 +225,29 @@ import sys
class InputError(Exception):
"""Exception raised for errors in the input file."""
# Length of a character starting at a given byte, indexed by the byte value.
#   0:   byte must not start a character (control characters 0x00-0x1F,
#        bytes 0x80-0xBF and bytes 0xF8-0xFF)
#   1:   printable 7-bit ASCII character
#   2-4: leading byte of a UTF-8 multibyte sequence of that length
char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x0F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x10-0x1F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x20-0x2F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x30-0x3F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x40-0x4F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x50-0x5F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x60-0x6F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x70-0x7F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x80-0x8F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x90-0x9F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xA0-0xAF
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xB0-0xBF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xC0-0xCF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xD0-0xDF
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
def to_dafsa(words):
def to_bytes(n):
    """Returns a bytes object of length one whose only byte has the value n."""
    single = bytearray([n])
    return bytes(single)
def to_dafsa(words, utf_mode):
"""Generates a DAFSA from a word list and returns the source node.
Each word is split into characters so that each character is represented by
@ -206,20 +255,36 @@ def to_dafsa(words):
"""
if not words:
raise InputError('The domain list must not be empty')
def to_nodes(word):
def to_nodes(word, multibyte_length):
"""Split words into characters"""
if not 0x1F < ord(word[0]) < 0x80:
raise InputError('Domain names must be printable 7-bit ASCII')
if len(word) == 1:
return chr(int(word[0], 16) & 0x0F), [None]
return word[0], [to_nodes(word[1:])]
return [to_nodes(word) for word in words]
byte = ord(word[:1])
if multibyte_length:
# Consume next byte in multibyte sequence.
if byte & 0xC0 != 0x80:
raise InputError('Invalid UTF-8 multibyte sequence')
return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
char_length = char_length_table[byte]
if char_length == 1:
# 7-bit printable ASCII.
if len(word) == 1:
return to_bytes(int(word[:1], 16) & 0x0F), [None]
return word[:1], [to_nodes(word[1:], 0)]
elif char_length > 1:
# Leading byte in multibyte sequence.
if not utf_mode:
raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
if len(word) <= char_length:
raise InputError('Unterminated UTF-8 multibyte sequence')
return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
# Unexpected character.
raise InputError('Domain names must be printable ASCII or UTF-8')
return [to_nodes(word, 0) for word in words]
def to_words(node):
"""Generates a word list from all paths starting from an internal node."""
if not node:
return ['']
return [b'']
return [(node[0] + word) for child in node[1] for word in to_words(child)]
@ -286,7 +351,7 @@ def join_suffixes(dafsa):
"""Generates a new DAFSA where nodes that represent the same word lists
towards the sink are merged.
"""
nodemap = {frozenset(('',)): None}
nodemap = {frozenset((b'',)): None}
def join(node):
"""Returns a matching node. A new node is created if no matching node
@ -384,7 +449,7 @@ def encode_prefix(label):
will then be a prefix to the label in the child node.
"""
assert label
return [ord(c) for c in reversed(label)]
return [c for c in bytearray(reversed(label))]
def encode_label(label):
@ -396,7 +461,7 @@ def encode_label(label):
return buf
def encode(dafsa):
def encode(dafsa, utf_mode):
"""Encodes a DAFSA to a list of bytes"""
output = []
offsets = {}
@ -412,62 +477,66 @@ def encode(dafsa):
output.extend(encode_links(dafsa, offsets, len(output)))
output.reverse()
if utf_mode:
output.append(0x01)
return output
def to_cxx(data):
def to_cxx(data, codecs):
"""Generates C++ code from a list of encoded bytes."""
text = '/* This file is generated. DO NOT EDIT!\n\n'
text += 'The byte array encodes effective tld names. See psl-make-dafsa source for'
text += ' documentation.'
text += '*/\n\n'
text += 'static const unsigned char kDafsa[%s] = {\n' % len(data)
text = b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
text += b'The byte array encodes effective tld names. See psl-make-dafsa source for'
text += b' documentation.'
text += b'*/\n\n'
text += b'static const unsigned char kDafsa['
text += bytes(str(len(data)), **codecs)
text += b'] = {\n'
for i in range(0, len(data), 12):
text += ' '
text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
text += ',\n'
text += '};\n'
text += b' '
text += bytes(', '.join('0x%02x' % byte for byte in data[i:i + 12]), **codecs)
text += b',\n'
text += b'};\n'
return text
def words_to_whatever(words, converter):
def words_to_whatever(words, converter, utf_mode, codecs):
"""Generates C++ code from a word list"""
dafsa = to_dafsa(words)
dafsa = to_dafsa(words, utf_mode)
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
dafsa = fun(dafsa)
return converter(encode(dafsa))
return converter(encode(dafsa, utf_mode), codecs)
def words_to_cxx(words):
def words_to_cxx(words, utf_mode, codecs):
"""Generates C++ code from a word list"""
return words_to_whatever(words, to_cxx)
return words_to_whatever(words, to_cxx, utf_mode, codecs)
def words_to_binary(words):
def words_to_binary(words, utf_mode, codecs):
"""Generates C++ code from a word list"""
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray)
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs)
def parse_psl2c(infile):
def parse_psl2c(infile, utf_mode, codecs):
"""Parses file generated by psl2c and extract strings and return code"""
lines = [line.strip() for line in infile]
lines = [bytes(line.strip(), **codecs) for line in infile]
for line in lines:
if line[-3:-1] != ', ':
if line[-3:-1] != b', ':
raise InputError('Expected "domainname, <digit>", found "%s"' % line)
# Technically the DAFSA format could support return values in range [0-31],
# Technically the DAFSA format could support return values in range [0x00-0x1E],
# but the values below are the only with a defined meaning.
if line[-1] not in '0123456789ABCDEF':
raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' % line[-1])
if line[-1] not in b'0123456789ABCDEF':
raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % line[-1:])
# with open("gperf.out", 'w') as outfile:
# for line in sorted(lines):
# outfile.write(line[:-3] + line[-1] + "\n")
return [line[:-3] + line[-1] for line in sorted(lines)]
return [line[:-3] + line[-1:] for line in sorted(lines)]
def parse_psl(infile):
def parse_psl(infile, utf_mode, codecs):
"""Parses PSL file and extract strings and return code"""
PSL_FLAG_EXCEPTION = (1<<0)
PSL_FLAG_WILDCARD = (1<<1)
@ -479,39 +548,39 @@ def parse_psl(infile):
section = 0
for line in infile:
line = line.strip()
line = bytes(line.strip(), **codecs)
if not line:
continue
if line.startswith("//"):
if line.startswith(b'//'):
if section == 0:
if "===BEGIN ICANN DOMAINS===" in line:
if b'===BEGIN ICANN DOMAINS===' in line:
section = PSL_FLAG_ICANN
elif section == 0 and "===BEGIN PRIVATE DOMAINS===" in line:
elif section == 0 and b'===BEGIN PRIVATE DOMAINS===' in line:
section = PSL_FLAG_PRIVATE
elif section == PSL_FLAG_ICANN and "===END ICANN DOMAINS===" in line:
elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line:
section = 0
elif section == PSL_FLAG_PRIVATE and "===END PRIVATE DOMAINS===" in line:
elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line:
section = 0
continue # skip comments
if line[0] == '!':
if line[:1] == b'!':
flags = PSL_FLAG_EXCEPTION | section
line = line[1:]
elif line[0] == '*':
if line[1] != '.':
elif line[:1] == b'*':
if line[1:2] != b'.':
print('Unsupported kind of rule (ignored): %s' % line)
continue
flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
line = line[2:]
else:
if not '.' in line:
if not b'.' in line:
continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
flags = PSL_FLAG_PLAIN | section
line = line.decode('utf-8').encode("idna")
punycode = line.decode('utf-8').encode('idna')
if line in psl:
if punycode in psl:
"""Found existing entry:
Combination of exception and plain rule is ambiguous
!foo.bar
@ -521,16 +590,18 @@ def parse_psl(infile):
!foo.bar + *.foo.bar
foo.bar + *.foo.bar
"""
print('Found %s/%X (now %X)' % line, psl[line], flags)
# All three format arguments must be passed as one tuple: '%' binds tighter
# than ',', so without the parentheses only the first value is formatted and
# the statement raises TypeError.
print('Found %s/%X (now %X)' % (punycode, psl[punycode], flags))
continue
psl[line] = flags
if utf_mode:
psl[line] = flags
psl[punycode] = flags
# with open("psl.out", 'w') as outfile:
# for (domain, flags) in sorted(psl.iteritems()):
# outfile.write(domain + "%X" % (flags & 0x0F) + "\n")
return [domain + "%X" % (flags & 0x0F) for (domain, flags) in sorted(psl.iteritems())]
return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())]
def usage():
@ -538,8 +609,10 @@ def usage():
print('usage: %s [options] infile outfile' % sys.argv[0])
print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
print(' --input-format=psl infile is a Public Suffix List file')
print(' --output-format=cxx Write DAFSA as C/C++ code')
print(' --output-format=cxx Write DAFSA as C/C++ code (default)')
print(' --output-format=binary Write DAFSA binary data')
print(' --encoding=ascii 7-bit ASCII mode')
print(' --encoding=utf-8 UTF-8 mode (default)')
exit(1)
@ -550,6 +623,11 @@ def main():
converter = words_to_cxx
parser = parse_psl2c
utf_mode = True
codecs = dict()
if sys.version_info.major > 2:
codecs['encoding'] = 'utf-8'
for arg in sys.argv[1:-2]:
if arg.startswith('--input-format='):
@ -570,15 +648,24 @@ def main():
else:
print("Unknown output format '%s'" % value)
return 1
elif arg.startswith('--encoding='):
value = arg[11:].lower()
if value == 'ascii':
utf_mode = False
elif value == 'utf-8':
utf_mode = True
else:
print("Unknown encoding '%s'" % value)
return 1
else:
usage()
if sys.argv[-2] == '-':
with open(sys.argv[-1], 'w') as outfile:
outfile.write(converter(parser(sys.stdin)))
with open(sys.argv[-1], 'wb') as outfile:
outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
else:
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
outfile.write(converter(parser(infile)))
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'wb') as outfile:
outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))
return 0

View File

@ -28,9 +28,14 @@ depends on options passed to it.
\fBcxx\fR: (default) output is C/C++ code
.br
\fBbinary\fR: output is an architecture-independent binary format
.TP
\fB\-\-encoding=\fR[\fIutf-8\fR|\fIascii\fR]
\fButf-8\fR: (default) UTF-8 mode (output contains UTF-8 + punycode)
.br
\fBascii\fR: (deprecated) 7-bit ASCII mode (output contains punycode only)
.SH SEE ALSO
.IR https://publicsuffix.org/ ", " https://github.com/rockdaboot/libpsl
.SH COPYRIGHT
\fBpsl-make-dafsa\fR was originally part of the Chromium project, and
\fBpsl-make-dafsa\fR was written by Olle Liljenzin as part of the Chromium project and
has been modified by Tim Ruehsen and Daniel Kahn Gillmor. The code
and its documentation is governed by a BSD-style license.

237
src/psl.c
View File

@ -73,6 +73,7 @@
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <time.h>
#include <errno.h>
#include <limits.h> /* for UINT_MAX */
#include <langinfo.h>
@ -101,9 +102,6 @@
#include <libpsl.h>
/* number of elements within an array */
#define countof(a) (sizeof(a)/sizeof(*(a)))
#ifndef HAVE_STRNDUP
/* I found no strndup on my old SUSE 7.3 test system (gcc 2.95) */
@ -176,10 +174,11 @@ struct _psl_ctx_st {
size_t
dafsa_size;
int
mode,
nsuffixes,
nexceptions,
nwildcards;
unsigned
utf8 : 1; /* 1: data contains UTF-8 + punycode encoded rules */
};
/* include the PSL data compiled by 'psl2c' */
@ -263,11 +262,21 @@ static int _vector_add(_psl_vector_t *v, const _psl_entry_t *elem)
if (v) {
void *elemp;
elemp = malloc(sizeof(_psl_entry_t));
if (!(elemp = malloc(sizeof(_psl_entry_t))))
return -1;
memcpy(elemp, elem, sizeof(_psl_entry_t));
if (v->max == v->cur)
v->entry = realloc(v->entry, (v->max *= 2) * sizeof(_psl_entry_t *));
if (v->max == v->cur) {
	/* Grow to twice the current capacity. The new capacity is recorded
	 * only after realloc() succeeded, so v->max never claims more slots
	 * than v->entry really has. */
	void *m = realloc(v->entry, (v->max * 2) * sizeof(_psl_entry_t *));

	if (!m) {
		/* v->entry is still valid and unchanged; drop the new element. */
		free(elemp);
		return -1;
	}

	v->entry = m;
	v->max *= 2;
}
v->entry[v->cur++] = elemp;
return v->cur - 1;
@ -517,36 +526,37 @@ static enum punycode_status punycode_encode(
static ssize_t _utf8_to_utf32(const char *in, size_t inlen, punycode_uint *out, size_t outlen)
{
size_t n = 0;
unsigned char *s;
const unsigned char *s = (void *)in;
const unsigned char *e = (void *)(in + inlen);
if (!outlen)
return -1;
outlen--;
s = alloca(inlen + 1);
memcpy(s, in, inlen);
s[inlen] = 0;
while (n < outlen) {
size_t inleft = e - s;
while (*s && n < outlen) {
if ((*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
if (inleft >= 1 && (*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
out[n++] = *s;
s++;
} else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
} else if (inleft >= 2 && (*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
if ((s[1] & 0xC0) != 0x80)
return -1;
out[n++] = ((*s & 0x1F) << 6) | (s[1] & 0x3F);
s += 2;
} else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
} else if (inleft >= 3 && (*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
return -1;
out[n++] = ((*s & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
s += 3;
} else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
} else if (inleft >= 4 && (*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
return -1;
/* 4-byte sequence: 3 payload bits from the lead byte, then 6 bits from each of
 * the three continuation bytes s[1], s[2] and s[3] (all validated just above). */
out[n++] = ((*s & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
s += 4;
} else if (!inleft) {
break;
} else
return -1;
}
@ -575,7 +585,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
/* printf("s=%s inlen=%zd\n", label, labellen); */
if (_mem_is_ascii(label, labellen)) {
if (outlen + labellen + (e != NULL)>= outsize)
if (outlen + labellen + (e != NULL) >= outsize)
return 1;
/* printf("outlen=%zd labellen=%zd\n", outlen, labellen); */
@ -587,7 +597,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
if (outlen + labellen + (e != NULL) + 4 >= outsize)
return 1;
if ((inputlen = _utf8_to_utf32(label, labellen, input, sizeof (input) / sizeof (input[0]))) < 0)
if ((inputlen = _utf8_to_utf32(label, labellen, input, countof(input))) < 0)
return 1;
memcpy(out + outlen, "xn--", 4);
@ -609,7 +619,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
}
#endif
static inline int _isspace_ascii(const char c)
static int _isspace_ascii(const char c)
{
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
@ -691,15 +701,15 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
UChar utf16_dst[128], utf16_src[128];
int32_t utf16_src_length;
u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, utf8, -1, &status);
u_strFromUTF8(utf16_src, countof(utf16_src), &utf16_src_length, utf8, -1, &status);
if (U_SUCCESS(status)) {
int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, countof(utf16_dst), &info, &status);
if (U_SUCCESS(status)) {
u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
if (U_SUCCESS(status)) {
if (ascii)
*ascii = strdup(lookupname);
ret = 0;
if ((*ascii = strdup(lookupname)))
ret = 0;
} /* else
fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
} /* else
@ -709,32 +719,21 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
}
#elif defined(WITH_LIBIDN2)
int rc;
uint8_t *lower, resbuf[256];
size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
uint8_t *lower;
size_t len = u8_strlen((uint8_t *)utf8) + 1;
/* we need a conversion to lowercase */
lower = u8_tolower((uint8_t *)utf8, u8_strlen((uint8_t *)utf8), 0, UNINORM_NFKC, resbuf, &len);
if (!lower) {
if (!(lower = u8_tolower((uint8_t *)utf8, len, 0, UNINORM_NFKC, NULL, &len))) {
/* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", utf8, errno); */
return -1;
}
/* u8_tolower() does not terminate the result string */
if (lower == resbuf) {
lower[len]=0;
} else {
uint8_t *tmp = lower;
lower = (uint8_t *)strndup((char *)lower, len);
free(tmp);
}
if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK) {
ret = 0;
} /* else
fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */
if (lower != resbuf)
free(lower);
free(lower);
#elif defined(WITH_LIBIDN)
int rc;
@ -754,8 +753,8 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
if (_domain_to_punycode(utf8, lookupname, sizeof(lookupname)) == 0) {
if (ascii)
*ascii = strdup(lookupname);
ret = 0;
if ((*ascii = strdup(lookupname)))
ret = 0;
}
#endif
@ -776,16 +775,17 @@ static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_en
/* fprintf(stderr, "toASCII '%s' -> '%s'\n", e->label_buf, lookupname); */
_suffix_init(&suffix, lookupname, strlen(lookupname));
suffix.flags = e->flags;
suffixp = _vector_get(v, _vector_add(v, &suffix));
suffixp->label = suffixp->label_buf; /* set label to changed address */
if ((suffixp = _vector_get(v, _vector_add(v, &suffix))))
suffixp->label = suffixp->label_buf; /* set label to changed address */
} /* else ignore */
free(lookupname);
}
}
/* prototype */
/* prototypes */
int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
int GetUtfMode(const unsigned char *graph, size_t length);
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
{
@ -814,6 +814,14 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
return 1;
}
if (psl->utf8 || psl == &_builtin_psl)
need_conversion = 0;
#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
if (psl == &_builtin_psl)
need_conversion = 0;
#endif
if (need_conversion) {
_psl_idna_t *idna = _psl_idna_open();
@ -934,8 +942,9 @@ suffix_yes:
*
* For cookie domain checking see psl_is_cookie_domain_acceptable().
*
* International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
* Other encodings result in unexpected behavior.
* International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
* Other encodings likely result in incorrect return values.
* Use helper function psl_str_to_utf8lower() for normalization of @domain.
*
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
* psl_builtin().
@ -964,8 +973,9 @@ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
* @type specifies the PSL section where to perform the lookup. Valid values are
* %PSL_TYPE_PRIVATE, %PSL_TYPE_ICANN and %PSL_TYPE_ANY.
*
* International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
* Other encodings result in unexpected behavior.
* International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
* Other encodings likely result in incorrect return values.
* Use helper function psl_str_to_utf8lower() for normalization of @domain.
*
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
* psl_builtin().
@ -990,8 +1000,9 @@ int psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type)
* This function finds the longest public suffix part of @domain by the means
* of the [Mozilla Public Suffix List](https://publicsuffix.org).
*
* International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
* Other encodings result in unexpected behavior.
* International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
* Other encodings likely result in incorrect return values.
* Use helper function psl_str_to_utf8lower() for normalization of @domain.
*
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
* psl_builtin().
@ -1029,8 +1040,9 @@ const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain)
* This function finds the shortest private suffix part of @domain by the means
* of the [Mozilla Public Suffix List](https://publicsuffix.org).
*
* International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
* Other encodings result in unexpected behavior.
* International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
* Other encodings likely result in incorrect return values.
* Use helper function psl_str_to_utf8lower() for normalization of @domain.
*
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
* psl_builtin().
@ -1070,7 +1082,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
* This function loads the public suffixes file named @fname.
* To free the allocated resources, call psl_free().
*
* The suffixes are expected to be lowercase UTF-8 encoded if they are international.
* The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
*
* Returns: Pointer to a PSL context or %NULL on failure.
*
@ -1099,7 +1111,7 @@ psl_ctx_t *psl_load_file(const char *fname)
* This function loads the public suffixes from a FILE pointer.
* To free the allocated resources, call psl_free().
*
* The suffixes are expected to be lowercase UTF-8 encoded if they are international.
* The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
*
* Returns: Pointer to a PSL context or %NULL on failure.
*
@ -1152,6 +1164,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
psl->dafsa = m;
psl->dafsa_size = len;
psl->utf8 = !!GetUtfMode(psl->dafsa, len);
return psl;
}
@ -1163,6 +1176,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
* as of 19.02.2014, the list at https://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions.
*/
psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array);
psl->utf8 = 1; /* we put UTF-8 and punycode rules in the lookup vector */
do {
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
@ -1231,9 +1245,10 @@ psl_ctx_t *psl_load_fp(FILE *fp)
suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix));
}
suffixp->label = suffixp->label_buf; /* set label to changed address */
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
if (suffixp) {
suffixp->label = suffixp->label_buf; /* set label to changed address */
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
}
}
} while ((linep = fgets(buf, sizeof(buf), fp)));
@ -1275,8 +1290,8 @@ void psl_free(psl_ctx_t *psl)
* The builtin data also contains punycode entries, one for each international domain name.
*
* If the generation of built-in data has been disabled during compilation, %NULL will be returned.
* When using the builtin psl context, you can provide UTF-8 or punycode representations of domains to
* functions like psl_is_public_suffix().
* When using the builtin psl context, you can provide UTF-8 (lowercase + NFKC) or ASCII/ACE (punycode)
* representations of domains to functions like psl_is_public_suffix().
*
* Returns: Pointer to the built in PSL data or NULL if this data is not available.
*
@ -1495,8 +1510,10 @@ static int _isip(const char *hostname)
* This helper function checks whether @cookie_domain is an acceptable cookie domain value for the request
* @hostname.
*
* For international domain names both, @hostname and @cookie_domain, have to be either in lowercase UTF-8
* or in ASCII form (punycode). Other encodings or mixing UTF-8 and punycode result in unexpected behavior.
* For international domain names, both @hostname and @cookie_domain have to be either in UTF-8 (lowercase + NFKC)
* or in ASCII/ACE (punycode) format. Other encodings or mixing UTF-8 and punycode likely result in incorrect return values.
*
* Use helper function psl_str_to_utf8lower() for normalization of @hostname and @cookie_domain.
*
* Examples:
* 1. Cookie domain 'example.com' would be acceptable for hostname 'www.example.com',
@ -1553,8 +1570,8 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
* @locale: locale of @str for lowercase conversion, e.g. 'de' or %NULL
* @lower: return value containing the converted string
*
* This helper function converts a string to lowercase UTF-8 representation.
* Lowercase UTF-8 is needed as input to the domain checking functions.
* This helper function converts a string to UTF-8 lowercase + NFKC representation.
* Lowercase + NFKC UTF-8 is needed as input to the domain checking functions.
*
* @lower is set to %NULL on error.
*
@ -1567,6 +1584,7 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
* PSL_ERR_TO_UTF16: Failed to convert @str to unicode
* PSL_ERR_TO_LOWER: Failed to convert unicode to lowercase
* PSL_ERR_TO_UTF8: Failed to convert unicode to UTF-8
* PSL_ERR_NO_MEM: Failed to allocate memory
*
* Since: 0.4
*/
@ -1585,7 +1603,8 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
if (lower) {
char *p;
*lower = strdup(str);
if (!(*lower = strdup(str)))
return PSL_ERR_NO_MEM;
/* convert ASCII string to lowercase */
for (p = *lower; *p; p++)
@ -1604,10 +1623,21 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
char *utf8_lower;
UConverter *uconv;
/* C89 allocation */
utf16_dst = alloca(sizeof(UChar) * (str_length * 2 + 1));
utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
utf8_lower = alloca(str_length * 2 + 1);
if (str_length < 256) {
/* C89 allocation */
utf16_dst = alloca(sizeof(UChar) * (str_length * 2 + 1));
utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
utf8_lower = alloca(str_length * 2 + 1);
} else {
utf16_dst = malloc(sizeof(UChar) * (str_length * 2 + 1));
utf16_lower = malloc(sizeof(UChar) * (str_length * 2 + 1));
utf8_lower = malloc(str_length * 2 + 1);
if (!utf16_dst || !utf16_lower || !utf8_lower) {
ret = PSL_ERR_NO_MEM;
goto out;
}
}
uconv = ucnv_open(encoding, &status);
if (U_SUCCESS(status)) {
@ -1619,9 +1649,16 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
if (U_SUCCESS(status)) {
/* The capacity passed here must match the size utf8_lower was allocated with
 * (str_length * 2 + 1 bytes). Claiming more would let u_strToUTF8() write past
 * the end of the buffer; with the correct capacity an oversized result is
 * reported through status and ends up as PSL_ERR_TO_UTF8 instead. */
u_strToUTF8(utf8_lower, str_length * 2 + 1, NULL, utf16_lower, utf16_lower_length, &status);
if (U_SUCCESS(status)) {
if (lower)
*lower = strdup(utf8_lower);
ret = PSL_SUCCESS;
if (lower) {
if (str_length < 256) {
if (!(*lower = strdup(utf8_lower)))
ret = PSL_ERR_NO_MEM;
} else {
*lower = utf8_lower;
utf8_lower = NULL;
}
}
} else {
ret = PSL_ERR_TO_UTF8;
/* fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
@ -1638,6 +1675,12 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
ret = PSL_ERR_CONVERTER;
/* fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status); */
}
out:
if (str_length >= 256) {
free(utf16_dst);
free(utf16_lower);
free(utf8_lower);
}
} while (0);
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
do {
@ -1655,26 +1698,32 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
if (cd != (iconv_t)-1) {
char *tmp = (char *)str; /* iconv won't change where str points to, but changes tmp itself */
size_t tmp_len = strlen(str);
size_t tmp_len = strlen(str) + 1;
size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
char *dst = malloc(dst_len + 1), *dst_tmp = dst;
if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1) {
uint8_t *resbuf = malloc(dst_len * 2 + 1);
size_t len = dst_len * 2; /* leave space for additional \0 byte */
if (!dst) {
ret = PSL_ERR_NO_MEM;
}
else if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1
&& iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) != (size_t)-1)
{
/* start size for u8_tolower internal memory allocation.
* u8_tolower() does not terminate the result string. we have 0 byte included in above tmp_len
* and thus in len. */
size_t len = dst_len - dst_len_tmp;
if ((dst = (char *)u8_tolower((uint8_t *)dst, dst_len - dst_len_tmp, 0, UNINORM_NFKC, resbuf, &len))) {
/* u8_tolower() does not terminate the result string */
if (lower)
*lower = strndup((char *)dst, len);
if ((tmp = (char *)u8_tolower((uint8_t *)dst, len, 0, UNINORM_NFKC, NULL, &len))) {
ret = PSL_SUCCESS;
if (lower) {
*lower = tmp;
tmp = NULL;
} else
free(tmp);
} else {
ret = PSL_ERR_TO_LOWER;
/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
}
if (lower)
*lower = strndup(dst, dst_len - dst_len_tmp);
ret = PSL_SUCCESS;
} else {
ret = PSL_ERR_TO_UTF8;
/* fprintf(stderr, "Failed to convert '%s' string into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
@ -1686,19 +1735,21 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
ret = PSL_ERR_TO_UTF8;
/* fprintf(stderr, "Failed to prepare encoding '%s' into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
}
} else
ret = PSL_SUCCESS;
/* convert to lowercase */
if (ret == PSL_SUCCESS) {
uint8_t *dst, resbuf[256];
size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
} else {
/* we need a conversion to lowercase */
if ((dst = u8_tolower((uint8_t *)str, u8_strlen((uint8_t *)str), 0, UNINORM_NFKC, resbuf, &len))) {
/* u8_tolower() does not terminate the result string */
if (lower)
*lower = strndup((char *)dst, len);
uint8_t *tmp;
/* start size for u8_tolower internal memory allocation.
* u8_tolower() does not terminate the result string, so include terminating 0 byte in len. */
size_t len = u8_strlen((uint8_t *)str) + 1;
if ((tmp = u8_tolower((uint8_t *)str, len, 0, UNINORM_NFKC, NULL, &len))) {
ret = PSL_SUCCESS;
if (lower) {
*lower = (char*)tmp;
tmp = NULL;
} else
free(tmp);
} else {
ret = PSL_ERR_TO_LOWER;
/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */

View File

@ -153,11 +153,6 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
if ((fp = fopen("in.tmp", "w"))) {
for (it = 0; it < v->cur; it++) {
_psl_entry_t *e = _vector_get(v, it);
unsigned char *s = (unsigned char *)e->label_buf;
/* search for non-ASCII label and skip it */
while (*s && *s < 128) s++;
if (*s) continue;
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
}
@ -191,11 +186,6 @@ static int _print_psl_entries_dafsa_binary(const char *fname, const _psl_vector_
if ((fp = fopen("in.tmp", "w"))) {
for (it = 0; it < v->cur; it++) {
_psl_entry_t *e = _vector_get(v, it);
unsigned char *s = (unsigned char *)e->label_buf;
/* search for non-ASCII label and skip it */
while (*s && *s < 128) s++;
if (*s) continue;
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
}

View File

@ -23,3 +23,14 @@ check_PROGRAMS = $(PSL_TESTS)
TESTS_ENVIRONMENT = TESTS_VALGRIND="@VALGRIND_ENVIRONMENT@"
TESTS = $(PSL_TESTS)
# psl.dafsa and psl_ascii.dafsa must be created before any test is executed.
# A check-local target would run in parallel to the tests, so the test suite would likely fail.
BUILT_SOURCES = psl.dafsa psl_ascii.dafsa
psl.dafsa:
$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary "$(PSL_FILE)" psl.dafsa
psl_ascii.dafsa:
$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary --encoding=ascii "$(PSL_FILE)" psl_ascii.dafsa
clean-local:
rm -f psl.dafsa psl_ascii.dafsa

View File

@ -65,6 +65,7 @@ static void test_psl(void)
{ "www.his.name", "his.name", 1 },
{ "www.his.name", "name", 0 },
{ "www.example.com", "www.example.com", 1 },
{ "www.example.com", "wwww.example.com", 0 },
{ "www.example.com", "example.com", 1 },
{ "www.example.com", "com", 0 }, /* not accepted by normalization (PSL rule 'com') */
{ "www.example.com", "example.org", 0 },
@ -77,6 +78,8 @@ static void test_psl(void)
{ "2a00:1450:4013:c01::8b", ":1450:4013:c01::8b", 0 }, /* IPv6 address, partial match */
{ "::ffff:192.1.123.2", "::ffff:192.1.123.2", 1 }, /* IPv6 address dotted-quad, full match */
{ "::ffff:192.1.123.2", ".1.123.2", 0 }, /* IPv6 address dotted-quad, partial match */
{ NULL, ".1.123.2", 0 },
{ "hiho", NULL, 0 },
};
unsigned it;
psl_ctx_t *psl;
@ -98,6 +101,9 @@ static void test_psl(void)
}
}
/* do checks to cover more code paths in libpsl */
psl_is_cookie_domain_acceptable(NULL, "example.com", "example.com");
psl_free(psl);
}

View File

@ -49,7 +49,7 @@ static int
struct timespec ts1, ts2;
#endif
static inline int _isspace_ascii(const char c)
static int _isspace_ascii(const char c)
{
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
@ -131,7 +131,7 @@ static void test_psl_entry(const psl_ctx_t *psl, const char *domain, int type)
static void test_psl(void)
{
FILE *fp;
psl_ctx_t *psl;
psl_ctx_t *psl, *psl3, *psl4;
const psl_ctx_t *psl2;
int type = 0;
char buf[256], *linep, *p;
@ -142,6 +142,16 @@ static void test_psl(void)
psl2 = psl_builtin();
printf("builtin PSL has %d suffixes and %d exceptions\n", psl_suffix_count(psl2), psl_suffix_exception_count(psl2));
if (!(psl3 = psl_load_file("psl.dafsa"))) {
fprintf(stderr, "Failed to load 'psl.dafsa'\n");
failed++;
}
if (!(psl4 = psl_load_file("psl_ascii.dafsa"))) {
fprintf(stderr, "Failed to load 'psl_ascii.dafsa'\n");
failed++;
}
if ((fp = fopen(PSL_FILE, "r"))) {
#ifdef HAVE_CLOCK_GETTIME
clock_gettime(CLOCK_REALTIME, &ts1);
@ -174,6 +184,12 @@ static void test_psl(void)
if (psl2)
test_psl_entry(psl2, p, type);
if (psl3)
test_psl_entry(psl3, p, type);
if (psl4)
test_psl_entry(psl4, p, type);
}
#ifdef HAVE_CLOCK_GETTIME
@ -185,8 +201,10 @@ static void test_psl(void)
failed++;
}
psl_free(psl);
psl_free(psl4);
psl_free(psl3);
psl_free((psl_ctx_t *)psl2);
psl_free(psl);
}
int main(int argc, const char * const *argv)

View File

@ -84,6 +84,7 @@ static void test_psl(void)
{ "adfhoweirh", 1 }, /* unknown TLD */
};
unsigned it;
int result, ver;
psl_ctx_t *psl;
psl = psl_load_file(PSL_FILE);
@ -92,7 +93,7 @@ static void test_psl(void)
for (it = 0; it < countof(test_data); it++) {
const struct test_data *t = &test_data[it];
int result = psl_is_public_suffix(psl, t->domain);
result = psl_is_public_suffix(psl, t->domain);
if (result == t->result) {
ok++;
@ -102,6 +103,68 @@ static void test_psl(void)
}
}
/* do some checks to cover more code paths in libpsl */
psl_is_public_suffix(NULL, "xxx");
if ((ver = psl_check_version_number(0)) == 0) {
printf("psl_check_version_number(0) is 0\n");
failed++;
} else {
if (((result = psl_check_version_number(ver)) != ver)) {
printf("psl_check_version_number(%06X) is %06X\n", ver, result);
failed++;
}
if (((result = psl_check_version_number(ver - 1)) != 0)) {
printf("psl_check_version_number(%06X) is %06X\n", ver - 1, result);
failed++;
}
if (((result = psl_check_version_number(ver + 1)) != ver)) {
printf("psl_check_version_number(%06X) is %06X\n", ver, result);
failed++;
}
}
psl_str_to_utf8lower("www.example.com", "utf-8", "en", NULL);
psl_str_to_utf8lower(NULL, "utf-8", "en", NULL);
{
char *lower = NULL;
psl_str_to_utf8lower("www.example.com", NULL, "de", &lower);
free(lower); lower = NULL;
psl_str_to_utf8lower("\374bel.de", NULL, "de", &lower);
free(lower); lower = NULL;
psl_str_to_utf8lower("\374bel.de", "iso-8859-1", NULL, &lower);
free(lower); lower = NULL;
psl_str_to_utf8lower(NULL, "utf-8", "en", &lower);
free(lower); lower = NULL;
}
psl_get_version();
psl_builtin_filename();
psl_builtin_outdated();
psl_builtin_file_time();
psl_builtin_sha1sum();
psl_suffix_wildcard_count(NULL);
psl_suffix_wildcard_count(psl);
psl_suffix_wildcard_count(psl_builtin());
psl_suffix_count(NULL);
psl_suffix_exception_count(NULL);
psl_load_file(NULL);
psl_load_fp(NULL);
psl_registrable_domain(NULL, "");
psl_registrable_domain(psl, NULL);
psl_registrable_domain(psl, "www.example.com");
psl_unregistrable_domain(NULL, "");
psl_unregistrable_domain(psl, NULL);
psl_is_public_suffix2(NULL, "", PSL_TYPE_ANY);
psl_is_public_suffix2(psl, NULL, PSL_TYPE_ANY);
psl_free(psl);
}

View File

@ -50,14 +50,28 @@ static int
ok,
failed;
static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result)
static void testx(const psl_ctx_t *psl, const char *domain, const char *encoding, const char *lang, const char *expected_result)
{
const char *result;
char *lower;
int rc;
/* our test data is fixed to UTF-8 (english), so provide it here */
if (psl_str_to_utf8lower(domain, "utf-8", "en", &lower) == PSL_SUCCESS)
/* just to cover special code paths for valgrind checking */
psl_str_to_utf8lower(domain, encoding, lang, NULL);
if ((rc = psl_str_to_utf8lower(domain, encoding, lang, &lower)) == PSL_SUCCESS)
domain = lower;
/* non-ASCII domains fail here if no runtime IDN library is configured, so skip it */
#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
else if (domain) {
/* if we do not have runtime support, test failures have to be skipped */
failed++;
printf("psl_str_to_utf8lower(%s)=%d\n", domain ? domain : "NULL", rc);
free(lower);
return;
}
#endif
result = psl_registrable_domain(psl, domain);
@ -72,13 +86,28 @@ static void test(const psl_ctx_t *psl, const char *domain, const char *expected_
free(lower);
}
static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result)
{
testx(psl, domain, "utf-8", "en", expected_result);
}
static void test_iso(const psl_ctx_t *psl, const char *domain, const char *expected_result)
{
/* makes only sense with a runtime IDN library configured */
#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
testx(psl, domain, "iso-8859-15", "de", expected_result);
#endif
}
static void test_psl(void)
{
FILE *fp;
const psl_ctx_t *psl;
const char *p;
char buf[256], domain[128], expected_regdom[128], semicolon[2];
char lbuf[258];
int er_is_null, d_is_null;
unsigned it;
psl = psl_builtin();
@ -101,6 +130,22 @@ static void test_psl(void)
/* Norwegian with lowercase oe */
test(psl, "www.\303\270yer.no", "www.\303\270yer.no");
/* Norwegian with lowercase oe, encoded as ISO-8859-15 */
test_iso(psl, "www.\370yer.no", "www.\303\270yer.no");
/* Testing special code paths of psl_str_to_utf8lower() */
for (it = 254; it <= 257; it++) {
memset(lbuf, 'a', it);
lbuf[it] = 0;
lbuf[0] = '\370';
test_iso(psl, lbuf, NULL);
lbuf[0] = '\303';
lbuf[1] = '\270';
test(psl, lbuf, NULL);
}
/* special check with NULL psl context and TLD */
test(psl, "whoever.forgot.his.name", "whoever.forgot.his.name");

View File

@ -12,3 +12,5 @@ LDADD = ../src/libpsl.la
#if WITH_LIBIDN
# LDADD += -lidn
#endif
dist_man_MANS = psl.1