Release v0.15.0
-----BEGIN PGP SIGNATURE----- iQJHBAABCgAxFiEEHLJ9vJhhSy1YQWRtCDAttqJnBCgFAlgppfATHHRpbS5ydWVo c2VuQGdteC5kZQAKCRAIMC22omcEKGy/D/9iduEEwzSDt22U6MxmqD77hvgB9hQn 8Xn7CsTye408EUlw2ENYg4H/V3xNQN7ZbA4wJi20FmcniFhSUbSv9UD5Vr2FSTZS NJ1EpAbqljswE5x49u3lWRyo8XOEbVdWZS66+E5W9T/0Nl6kLUk4nYkBE6LBQGhp vd6+p74kqpjJGHhrZ4uYV5bkttoeSee/arGzvWTR3kmgERVCm9Qr90ldOx3Sp91s iqwb6RpDVkL3q5sA9bOfrpEDdADJdQYLr1BkkTOb7ZA52uEhdU6nEyfswoJsaBuI aj1hOgspekVqEs7ZUpltnT2GPbFyXtj338SA0738xxZaTm/eYzvNea5Fnpg4fnQb /w7I++IZGmdXljQnk1gtqzIgxCwia34u2/T4XgEpyd/h9A5PUdjo2EKPtBgHRFG7 GnK9IRgLHqdxZFpfiUyp2zIZL8+/PUlD5Ekwi1D3Wgc5PSOO0rMHR1IWzCmpopbU Mo9E511RcIdsn+IStB1gwclT5qk1fo3n5dcQBBXtpPTEJ6CRedLK+WcbLyhh3R0Z ham1D8t3kVDQgfg57mEJOIS5sgcLj5LR3ydya5ELf3pS6FVo4qvBO4Sp3E6wbgpE 9n5D150bKyv+RkTuNTgW8uahhYdR++bXUPWbaZReGVxKy3VB7VikDusRfnVFej9c cJP1HAskz6qTwA== =ksJN -----END PGP SIGNATURE----- Merge tag 'libpsl-0.15.0' into debian Release v0.15.0
This commit is contained in:
commit
4ef2e7c54b
|
@ -1,4 +1,7 @@
|
|||
*.exe
|
||||
*.gcda
|
||||
*.gcno
|
||||
*.gcov
|
||||
*.gz
|
||||
*.la
|
||||
*.lo
|
||||
|
@ -10,6 +13,7 @@
|
|||
*.cache
|
||||
*.plist
|
||||
*.stamp
|
||||
ABOUT-NLS
|
||||
aclocal.m4
|
||||
ar-lib
|
||||
autom4te.cache/
|
||||
|
@ -43,6 +47,8 @@ gtk-doc.m4
|
|||
gtk-doc.make
|
||||
include/libpsl.h
|
||||
install-sh
|
||||
lcov/
|
||||
libpsl.info
|
||||
libpsl.pc
|
||||
libtool
|
||||
ltmain.sh
|
||||
|
@ -67,10 +73,13 @@ po/remove-potcdate.sed
|
|||
po/stamp-po
|
||||
src/psl2c
|
||||
src/suffixes.c
|
||||
src/suffixes_dafsa.c
|
||||
stamp-h1
|
||||
test-driver
|
||||
tests/*.log
|
||||
tests/*.trs
|
||||
tests/psl.dafsa
|
||||
tests/psl_ascii.dafsa
|
||||
tests/test-is-cookie-domain-acceptable
|
||||
tests/test-is-public
|
||||
tests/test-is-public-all
|
||||
|
|
|
@ -34,6 +34,7 @@ addons:
|
|||
- libicu-dev
|
||||
- libunistring0
|
||||
- libunistring-dev
|
||||
- lcov
|
||||
|
||||
script:
|
||||
- ./autogen.sh
|
||||
|
@ -44,3 +45,4 @@ script:
|
|||
- ./configure --enable-runtime=$RUNTIME --disable-builtin && make clean && make -j4 && make check -j4
|
||||
- ./configure --enable-gtk-doc && make -j4 && make check -j4
|
||||
- make distcheck
|
||||
- if [[ $CC == "gcc" && $RUNTIME == "libicu" ]]; then ./.travis_coveralls.sh; fi
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
#!/bin/bash
|
||||
|
||||
make check-coverage-libicu
|
||||
pip install --user cpp-coveralls
|
||||
coveralls --include libwget/ --include src/ -e "src/psl2c.c"
|
2
AUTHORS
2
AUTHORS
|
@ -16,3 +16,5 @@ Christopher Meng (Fedora building)
|
|||
Jakub Čajka
|
||||
Giuseppe Scrivano
|
||||
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
|
||||
Daurnimator (Code review, discussion, reports)
|
||||
Olle Liljenzin (Original DAFSA implementation and UTF-8 patch)
|
||||
|
|
25
Makefile.am
25
Makefile.am
|
@ -19,3 +19,28 @@ dist-hook:
|
|||
mkdir -p $(distdir)/list/tests
|
||||
cp -p $(PSL_FILE) $(distdir)/list
|
||||
cp -p $(PSL_TESTFILE) $(distdir)/list/tests
|
||||
|
||||
clean-local:
|
||||
rm -rf */*.gc?? */*/*.gc?? libpsl.info lcov
|
||||
|
||||
check-coverage:
|
||||
if test -z "$(XLIB)"; then \
|
||||
CFLAGS=$$CFLAGS" --coverage -O0" LDFLAGS=$$LDFLAGS" --coverage" ./configure --disable-runtime --disable-builtin; \
|
||||
else \
|
||||
CFLAGS=$$CFLAGS" --coverage -O0" LDFLAGS=$$LDFLAGS" --coverage" ./configure --enable-runtime=$(XLIB) --enable-builtin=$(XLIB); \
|
||||
fi
|
||||
$(MAKE) clean && $(MAKE)
|
||||
lcov --capture --initial --directory src --output-file libpsl.info
|
||||
$(MAKE) check
|
||||
lcov --capture --directory src --output-file libpsl.info
|
||||
lcov --remove libpsl.info 'src/psl2c.c' -o libpsl.info
|
||||
genhtml --prefix . libpsl.info --legend --title "libpsl" --output-directory=lcov
|
||||
|
||||
check-coverage-libidn:
|
||||
XLIB=libidn $(MAKE) check-coverage
|
||||
|
||||
check-coverage-libidn2:
|
||||
XLIB=libidn2 $(MAKE) check-coverage
|
||||
|
||||
check-coverage-libicu:
|
||||
XLIB=libicu $(MAKE) check-coverage
|
||||
|
|
9
NEWS
9
NEWS
|
@ -1,5 +1,14 @@
|
|||
Copyright (C) 2014-2016 Tim Rühsen
|
||||
|
||||
14.11.2016 Release V0.15.0
|
||||
* Python3 compatibility for psl-make-dafsa
|
||||
* Support for UTF-8 in DAFSA data
|
||||
* Skip punycode conversion if DAFSA has UTF-8
|
||||
* Better code coverage by test suite
|
||||
* Code cleanup and enhancements
|
||||
* Install man pages for psl-make-dafsa and psl
|
||||
* Enhancements to the documentation
|
||||
|
||||
30.07.2016 Release V0.14.0
|
||||
* Remove unneeded libraries from tools/psl link step
|
||||
* Use https instead of http where possible
|
||||
|
|
12
README.md
12
README.md
|
@ -1,4 +1,12 @@
|
|||
[](https://travis-ci.org/rockdaboot/libpsl)
|
||||
[](https://travis-ci.org/rockdaboot/libpsl)
|
||||
[](https://scan.coverity.com/projects/rockdaboot-libpsl)
|
||||
[](https://coveralls.io/github/rockdaboot/libpsl?branch=master)
|
||||
|
||||
Solaris OpenCSW [](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-amd64)
|
||||
[](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-i386)
|
||||
[](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-sparc)
|
||||
[](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-sparcv9)
|
||||
|
||||
|
||||
libpsl - C library to handle the Public Suffix List
|
||||
===================================================
|
||||
|
@ -116,7 +124,7 @@ Mailing List
|
|||
|
||||
To join the mailing list send an email to
|
||||
|
||||
<libpsl-bugs+subscribe@googlegroups.com>
|
||||
libpsl-bugs+subscribe@googlegroups.com
|
||||
|
||||
and follow the instructions provided by the answer mail.
|
||||
|
||||
|
|
10
configure.ac
10
configure.ac
|
@ -1,7 +1,7 @@
|
|||
|
||||
AC_INIT([libpsl], [0.14.0], [tim.ruehsen@gmx.de], [libpsl], [https://github.com/rockdaboot/libpsl])
|
||||
AC_INIT([libpsl], [0.15.0], [tim.ruehsen@gmx.de], [libpsl], [https://github.com/rockdaboot/libpsl])
|
||||
AC_PREREQ([2.59])
|
||||
AM_INIT_AUTOMAKE([1.10 -Wall no-define foreign])
|
||||
AM_INIT_AUTOMAKE([1.10 no-define foreign])
|
||||
|
||||
# Generate two configuration headers; one for building the library itself with
|
||||
# an autogenerated template, and a second one that will be installed alongside
|
||||
|
@ -88,7 +88,7 @@ PKG_PROG_PKG_CONFIG
|
|||
# 4. If any interfaces have been added, removed, or changed since the last update, increment current, and set revision to 0.
|
||||
# 5. If any interfaces have been added since the last public release, then increment age.
|
||||
# 6. If any existing interfaces have been removed or changed since the last public release, then set age to 0.
|
||||
AC_SUBST([LIBPSL_SO_VERSION], [5:1:0])
|
||||
AC_SUBST([LIBPSL_SO_VERSION], [5:2:0])
|
||||
AC_SUBST([LIBPSL_VERSION], $VERSION)
|
||||
|
||||
# Check for enable/disable builtin PSL data
|
||||
|
@ -168,7 +168,7 @@ if test "$enable_runtime" = "libicu" -o "$enable_builtin" = "libicu"; then
|
|||
[AC_LANG_PROGRAM(
|
||||
[[#include <unicode/ustring.h>]],
|
||||
[[u_strToUTF8(NULL, 0, NULL, NULL, 0, NULL);]])],
|
||||
[HAVE_LIBICU=yes; AC_MSG_RESULT([yes])],
|
||||
[HAVE_LIBICU=yes; LIBICU_LIBS="-licuuc"; AC_MSG_RESULT([yes])],
|
||||
[AC_MSG_RESULT([no]); AC_MSG_ERROR(You requested libicu but it is not installed.)])
|
||||
LIBS=$OLDLIBS
|
||||
])
|
||||
|
@ -191,7 +191,7 @@ fi
|
|||
if test "x$HAVE_LIBIDN2" = "xyes" -o "x$HAVE_LIBIDN" = "xyes"; then
|
||||
# Check for libunistring, we need it for psl_str_to_utf8lower()
|
||||
OLDLIBS=$LIBS
|
||||
AC_SEARCH_LIBS(u8_tolower, unistring, HAVE_UNISTRING=yes, AC_MSG_ERROR(You requested libidn2 but libunistring is not installed.))
|
||||
AC_SEARCH_LIBS(u8_tolower, unistring, HAVE_UNISTRING=yes, AC_MSG_ERROR(You requested libidn2|libidn but libunistring is not installed.))
|
||||
LIBS=$OLDLIBS
|
||||
fi
|
||||
|
||||
|
|
|
@ -51,6 +51,7 @@ for CC in gcc clang; do
|
|||
for xLCALL in C tr_TR.utf8; do
|
||||
export TESTS_ENVIRONMENT="LC_ALL=$xLCALL VALGRIND_TESTS=$xVALGRIND"
|
||||
echo " *** TESTS_ENVIRONMENT=\"$TESTS_ENVIRONMENT\"" make check -j$CORES
|
||||
make clean > /dev/null
|
||||
make check -j$CORES > /dev/null
|
||||
done
|
||||
done
|
||||
|
|
|
@ -53,10 +53,11 @@ extern "C" {
|
|||
* psl_error_t:
|
||||
* @PSL_SUCCESS: Successful return.
|
||||
* @PSL_ERR_INVALID_ARG: Invalid argument.
|
||||
* @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter
|
||||
* @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter.
|
||||
* @PSL_ERR_TO_UTF16: Failed to convert to utf-16.
|
||||
* @PSL_ERR_TO_LOWER: Failed to convert utf-16 to lowercase.
|
||||
* @PSL_ERR_TO_UTF8: Failed to convert utf-16 to utf-8.
|
||||
* @PSL_ERR_NO_MEM: Failed to allocate memory.
|
||||
*
|
||||
* Return codes for PSL functions.
|
||||
* Negative return codes mean failure.
|
||||
|
@ -66,9 +67,10 @@ typedef enum {
|
|||
PSL_SUCCESS = 0,
|
||||
PSL_ERR_INVALID_ARG = -1,
|
||||
PSL_ERR_CONVERTER = -2, /* failed to open libicu utf-16 converter */
|
||||
PSL_ERR_TO_UTF16 = -3, /* failed to convert to utf-16 */
|
||||
PSL_ERR_TO_LOWER = -4, /* failed to convert utf-16 to lowercase */
|
||||
PSL_ERR_TO_UTF8 = -5 /* failed to convert utf-16 to utf-8 */
|
||||
PSL_ERR_TO_UTF16 = -3, /* failed to convert to utf-16 */
|
||||
PSL_ERR_TO_LOWER = -4, /* failed to convert utf-16 to lowercase */
|
||||
PSL_ERR_TO_UTF8 = -5, /* failed to convert utf-16 to utf-8 */
|
||||
PSL_ERR_NO_MEM = -6 /* failed to allocate memory */
|
||||
} psl_error_t;
|
||||
|
||||
typedef struct _psl_ctx_st psl_ctx_t;
|
||||
|
|
2
list
2
list
|
@ -1 +1 @@
|
|||
Subproject commit 1df90f84db1a041991a48e46e786705f7161ab4c
|
||||
Subproject commit 41a519ad34cf86ff4470b967d9e4755d72b63a6c
|
|
@ -11,7 +11,7 @@ libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
|
|||
# include ABI version information
|
||||
libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
|
||||
if WITH_LIBICU
|
||||
libpsl_la_LDFLAGS += -licuuc
|
||||
libpsl_la_LDFLAGS += $(LIBICU_LIBS)
|
||||
endif
|
||||
if WITH_LIBIDN2
|
||||
libpsl_la_LDFLAGS += -lidn2 -lunistring
|
||||
|
@ -24,7 +24,7 @@ noinst_PROGRAMS = psl2c
|
|||
psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
|
||||
psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/psl-make-dafsa\"
|
||||
if BUILTIN_GENERATOR_LIBICU
|
||||
psl2c_LDADD = -licuuc
|
||||
psl2c_LDADD = $(LIBICU_LIBS)
|
||||
endif
|
||||
if BUILTIN_GENERATOR_LIBIDN2
|
||||
psl2c_LDADD = @LTLIBICONV@ -lidn2 -lunistring
|
||||
|
@ -39,3 +39,5 @@ suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
|
|||
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
|
||||
|
||||
EXTRA_DIST = psl-make-dafsa LICENSE.chromium
|
||||
|
||||
dist_man_MANS = psl-make-dafsa.1
|
||||
|
|
|
@ -21,6 +21,48 @@
|
|||
|
||||
#define CHECK_LT(a, b) if ((a) >= b) return 0
|
||||
|
||||
static const char multibyte_length_table[16] = {
|
||||
0, 0, 0, 0, /* 0x00-0x3F */
|
||||
0, 0, 0, 0, /* 0x40-0x7F */
|
||||
0, 0, 0, 0, /* 0x80-0xBF */
|
||||
2, 2, 3, 4, /* 0xC0-0xFF */
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Get length of multibyte character sequence starting at a given byte.
|
||||
* Returns zero if the byte is not a valid leading byte in UTF-8.
|
||||
*/
|
||||
static int GetMultibyteLength(char c) {
|
||||
return multibyte_length_table[((unsigned char)c) >> 4];
|
||||
}
|
||||
|
||||
/**
|
||||
* Moves pointers one byte forward.
|
||||
*/
|
||||
static void NextPos(const unsigned char** pos,
|
||||
const char** key,
|
||||
const char** multibyte_start)
|
||||
{
|
||||
++*pos;
|
||||
if (*multibyte_start) {
|
||||
/* Advance key to next byte in multibyte sequence. */
|
||||
++*key;
|
||||
/* Reset multibyte_start if last byte in multibyte sequence was consumed. */
|
||||
if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
|
||||
*multibyte_start = 0;
|
||||
} else {
|
||||
if (GetMultibyteLength(**key)) {
|
||||
/* Multibyte prefix was matched in the dafsa, start matching multibyte
|
||||
* content in next round. */
|
||||
*multibyte_start = *key;
|
||||
} else {
|
||||
/* Advance key as a single byte character was matched. */
|
||||
++*key;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Read next offset from pos.
|
||||
* Returns true if an offset could be read, false otherwise.
|
||||
|
@ -71,6 +113,35 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
|
|||
return(*offset & 0x80) != 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if byte at offset matches first character in key.
|
||||
* This version assumes a range check was already performed by the caller.
|
||||
*/
|
||||
|
||||
static int IsMatchUnchecked(const unsigned char matcher,
|
||||
const char* key,
|
||||
const char* multibyte_start)
|
||||
{
|
||||
if (multibyte_start) {
|
||||
/* Multibyte matching mode. */
|
||||
if (multibyte_start == key) {
|
||||
/* Match leading byte, which will also match the sequence length. */
|
||||
return (matcher ^ 0x80) == (const unsigned char)*key;
|
||||
} else {
|
||||
/* Match following bytes. */
|
||||
return (matcher ^ 0xC0) == (const unsigned char)*key;
|
||||
}
|
||||
}
|
||||
/* If key points at a leading byte in a multibyte sequence, but we are not yet
|
||||
* in multibyte mode, then the dafsa should contain a special byte to indicate
|
||||
* a mode switch. */
|
||||
if (GetMultibyteLength(*key)) {
|
||||
return matcher == 0x1F;
|
||||
}
|
||||
/* Normal matching of a single byte character. */
|
||||
return matcher == (const unsigned char)*key;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if byte at offset matches first character in key.
|
||||
* This version matches characters not last in label.
|
||||
|
@ -78,10 +149,11 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
|
|||
|
||||
static int IsMatch(const unsigned char* offset,
|
||||
const unsigned char* end,
|
||||
const char* key)
|
||||
const char* key,
|
||||
const char* multibyte_start)
|
||||
{
|
||||
CHECK_LT(offset, end);
|
||||
return *offset == *key;
|
||||
return IsMatchUnchecked(*offset, key, multibyte_start);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -91,10 +163,11 @@ static int IsMatch(const unsigned char* offset,
|
|||
|
||||
static int IsEndCharMatch(const unsigned char* offset,
|
||||
const unsigned char* end,
|
||||
const char* key)
|
||||
const char* key,
|
||||
const char* multibyte_start)
|
||||
{
|
||||
CHECK_LT(offset, end);
|
||||
return *offset == (*key | 0x80);
|
||||
return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -104,10 +177,11 @@ static int IsEndCharMatch(const unsigned char* offset,
|
|||
|
||||
static int GetReturnValue(const unsigned char* offset,
|
||||
const unsigned char* end,
|
||||
const char* multibyte_start,
|
||||
int* return_value)
|
||||
{
|
||||
CHECK_LT(offset, end);
|
||||
if ((*offset & 0xE0) == 0x80) {
|
||||
if (!multibyte_start && (*offset & 0xE0) == 0x80) {
|
||||
*return_value = *offset & 0x0F;
|
||||
return 1;
|
||||
}
|
||||
|
@ -140,6 +214,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
|||
const unsigned char* end = graph + length;
|
||||
const unsigned char* offset = pos;
|
||||
const char* key_end = key + key_length;
|
||||
const char* multibyte_start = 0;
|
||||
|
||||
while (GetNextOffset(&pos, end, &offset)) {
|
||||
/*char <char>+ end_char offsets
|
||||
|
@ -153,11 +228,10 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
|||
|
||||
if (key != key_end && !IsEOL(offset, end)) {
|
||||
/* Leading <char> is not a match. Don't dive into this child */
|
||||
if (!IsMatch(offset, end, key))
|
||||
if (!IsMatch(offset, end, key, multibyte_start))
|
||||
continue;
|
||||
did_consume = 1;
|
||||
++offset;
|
||||
++key;
|
||||
NextPos(&offset, &key, &multibyte_start);
|
||||
/* Possible matches at this point:
|
||||
* <char>+ end_char offsets
|
||||
* <char>+ return value
|
||||
|
@ -167,10 +241,9 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
|||
|
||||
/* Remove all remaining <char> nodes possible */
|
||||
while (!IsEOL(offset, end) && key != key_end) {
|
||||
if (!IsMatch(offset, end, key))
|
||||
if (!IsMatch(offset, end, key, multibyte_start))
|
||||
return -1;
|
||||
++key;
|
||||
++offset;
|
||||
NextPos(&offset, &key, &multibyte_start);
|
||||
}
|
||||
}
|
||||
/* Possible matches at this point:
|
||||
|
@ -182,7 +255,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
|||
if (key == key_end) {
|
||||
int return_value;
|
||||
|
||||
if (GetReturnValue(offset, end, &return_value))
|
||||
if (GetReturnValue(offset, end, multibyte_start, &return_value))
|
||||
return return_value;
|
||||
/* The DAFSA guarantees that if the first char is a match, all
|
||||
* remaining char elements MUST match if the key is truly present.
|
||||
|
@ -191,14 +264,22 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
|||
return -1;
|
||||
continue;
|
||||
}
|
||||
if (!IsEndCharMatch(offset, end, key)) {
|
||||
if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
|
||||
if (did_consume)
|
||||
return -1; /* Unexpected */
|
||||
continue;
|
||||
}
|
||||
++key;
|
||||
pos = ++offset; /* Dive into child */
|
||||
NextPos(&offset, &key, &multibyte_start);
|
||||
pos = offset; /* Dive into child */
|
||||
}
|
||||
|
||||
return -1; /* No match */
|
||||
}
|
||||
|
||||
/* prototype to skip warning with -Wmissing-prototypes */
|
||||
int _HIDDEN GetUtfMode(const unsigned char *graph, size_t length);
|
||||
|
||||
int _HIDDEN GetUtfMode(const unsigned char *graph, size_t length)
|
||||
{
|
||||
return length > 0 && graph[length - 1] < 0x80;
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python2
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2014 The Chromium Authors. All rights reserved.
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE.chromium file.
|
||||
|
@ -14,8 +14,9 @@ This python program fetches strings and return values from a gperf file
|
|||
and generates a C++ file with a byte array representing graph that can be
|
||||
used as a memory efficient replacement for the perfect hash table.
|
||||
|
||||
The input strings are assumed to consist of printable 7-bit ASCII characters
|
||||
and the return values are assumed to be one digit integers.
|
||||
The input strings must consist of printable 7-bit ASCII characters or UTF-8
|
||||
multibyte sequences. Control characters in the range [0x00-0x1F] are not
|
||||
allowed. The return values must be one digit integers.
|
||||
|
||||
In this program a DAFSA is a diamond shaped graph starting at a common
|
||||
source node and ending at a common sink node. All internal nodes contain
|
||||
|
@ -47,8 +48,8 @@ The generated byte array can described by the following BNF:
|
|||
|
||||
<byte> ::= < 8-bit value in range [0x00-0xFF] >
|
||||
|
||||
<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
|
||||
<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
|
||||
<char> ::= < byte in range [0x1F-0x7F] >
|
||||
<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
|
||||
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
|
||||
|
||||
<offset1> ::= < byte in range [0x00-0x3F] >
|
||||
|
@ -84,13 +85,18 @@ The generated byte array can described by the following BNF:
|
|||
| <prefix> <node>
|
||||
| <end_label>
|
||||
|
||||
<dafsa> ::= <source>
|
||||
| <dafsa> <node>
|
||||
<graph> ::= <source>
|
||||
| <graph> <node>
|
||||
|
||||
<version> ::= <empty> # The DAFSA was generated in ASCII mode.
|
||||
| < byte value 0x01 > # The DAFSA was generated in UTF-8 mode.
|
||||
|
||||
<dafsa> ::= <graph> <version>
|
||||
|
||||
Decoding:
|
||||
|
||||
<char> -> printable 7-bit ASCII character
|
||||
<end_char> & 0x7F -> printable 7-bit ASCII character
|
||||
<char> -> character
|
||||
<end_char> & 0x7F -> character
|
||||
<return value> & 0x0F -> integer
|
||||
<offset1 & 0x3F> -> integer
|
||||
((<offset2> & 0x1F>) << 8) + <byte> -> integer
|
||||
|
@ -105,6 +111,28 @@ between previous child node and next child node. Thus each offset links a node
|
|||
to a child node. The distance is always counted between start addresses, i.e.
|
||||
first byte in decoded offset or first byte in child node.
|
||||
|
||||
Transcoding of UTF-8 multibyte sequences:
|
||||
|
||||
The original DAFSA format was limited to 7-bit printable ASCII characters in
|
||||
range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
|
||||
By transcoding of such characters the new format preserves compatibility with
|
||||
old parsers, so that a DAFSA in the extended format can be used by an old
|
||||
parser without false positives, although strings containing transcoded
|
||||
characters will never match. Since the format is extended rather than being
|
||||
changed, a parser supporting the new format will automatically support data
|
||||
generated in the old format.
|
||||
|
||||
Transcoding is performed by insertion of a start byte with the special value
|
||||
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
|
||||
the range of printable ASCII.
|
||||
|
||||
2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
|
||||
|
||||
3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
|
||||
|
||||
4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
|
||||
00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
|
||||
|
||||
Example 1:
|
||||
|
||||
%%
|
||||
|
@ -197,8 +225,29 @@ import sys
|
|||
class InputError(Exception):
|
||||
"""Exception raised for errors in the input file."""
|
||||
|
||||
# Length of a character starting at a given byte.
|
||||
char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x0F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x10-0x1F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x20-0x2F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x30-x03F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x40-0x4F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x50-x05F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x60-0x6F
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x70-x07F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x80-0x8F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x90-0x9F
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xA0-0xAF
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xB0-0xBF
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xC0-0xCF
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xD0-0xDF
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
|
||||
|
||||
def to_dafsa(words):
|
||||
def to_bytes(n):
|
||||
"""Converts an integer value to a bytes object."""
|
||||
return bytes(bytearray((n,)))
|
||||
|
||||
def to_dafsa(words, utf_mode):
|
||||
"""Generates a DAFSA from a word list and returns the source node.
|
||||
|
||||
Each word is split into characters so that each character is represented by
|
||||
|
@ -206,20 +255,36 @@ def to_dafsa(words):
|
|||
"""
|
||||
if not words:
|
||||
raise InputError('The domain list must not be empty')
|
||||
def to_nodes(word):
|
||||
def to_nodes(word, multibyte_length):
|
||||
"""Split words into characters"""
|
||||
if not 0x1F < ord(word[0]) < 0x80:
|
||||
raise InputError('Domain names must be printable 7-bit ASCII')
|
||||
if len(word) == 1:
|
||||
return chr(int(word[0], 16) & 0x0F), [None]
|
||||
return word[0], [to_nodes(word[1:])]
|
||||
return [to_nodes(word) for word in words]
|
||||
byte = ord(word[:1])
|
||||
if multibyte_length:
|
||||
# Consume next byte in multibyte sequence.
|
||||
if byte & 0xC0 != 0x80:
|
||||
raise InputError('Invalid UTF-8 multibyte sequence')
|
||||
return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
|
||||
char_length = char_length_table[byte]
|
||||
if char_length == 1:
|
||||
# 7-bit printable ASCII.
|
||||
if len(word) == 1:
|
||||
return to_bytes(int(word[:1], 16) & 0x0F), [None]
|
||||
return word[:1], [to_nodes(word[1:], 0)]
|
||||
elif char_length > 1:
|
||||
# Leading byte in multibyte sequence.
|
||||
if not utf_mode:
|
||||
raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
|
||||
if len(word) <= char_length:
|
||||
raise InputError('Unterminated UTF-8 multibyte sequence')
|
||||
return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
|
||||
# Unexpected character.
|
||||
raise InputError('Domain names must be printable ASCII or UTF-8')
|
||||
|
||||
return [to_nodes(word, 0) for word in words]
|
||||
|
||||
def to_words(node):
|
||||
"""Generates a word list from all paths starting from an internal node."""
|
||||
if not node:
|
||||
return ['']
|
||||
return [b'']
|
||||
return [(node[0] + word) for child in node[1] for word in to_words(child)]
|
||||
|
||||
|
||||
|
@ -286,7 +351,7 @@ def join_suffixes(dafsa):
|
|||
"""Generates a new DAFSA where nodes that represent the same word lists
|
||||
towards the sink are merged.
|
||||
"""
|
||||
nodemap = {frozenset(('',)): None}
|
||||
nodemap = {frozenset((b'',)): None}
|
||||
|
||||
def join(node):
|
||||
"""Returns a matching node. A new node is created if no matching node
|
||||
|
@ -384,7 +449,7 @@ def encode_prefix(label):
|
|||
will then be a prefix to the label in the child node.
|
||||
"""
|
||||
assert label
|
||||
return [ord(c) for c in reversed(label)]
|
||||
return [c for c in bytearray(reversed(label))]
|
||||
|
||||
|
||||
def encode_label(label):
|
||||
|
@ -396,7 +461,7 @@ def encode_label(label):
|
|||
return buf
|
||||
|
||||
|
||||
def encode(dafsa):
|
||||
def encode(dafsa, utf_mode):
|
||||
"""Encodes a DAFSA to a list of bytes"""
|
||||
output = []
|
||||
offsets = {}
|
||||
|
@ -412,62 +477,66 @@ def encode(dafsa):
|
|||
|
||||
output.extend(encode_links(dafsa, offsets, len(output)))
|
||||
output.reverse()
|
||||
if utf_mode:
|
||||
output.append(0x01)
|
||||
return output
|
||||
|
||||
|
||||
def to_cxx(data):
|
||||
def to_cxx(data, codecs):
|
||||
"""Generates C++ code from a list of encoded bytes."""
|
||||
text = '/* This file is generated. DO NOT EDIT!\n\n'
|
||||
text += 'The byte array encodes effective tld names. See psl-make-dafsa source for'
|
||||
text += ' documentation.'
|
||||
text += '*/\n\n'
|
||||
text += 'static const unsigned char kDafsa[%s] = {\n' % len(data)
|
||||
text = b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
|
||||
text += b'The byte array encodes effective tld names. See psl-make-dafsa source for'
|
||||
text += b' documentation.'
|
||||
text += b'*/\n\n'
|
||||
text += b'static const unsigned char kDafsa['
|
||||
text += bytes(str(len(data)), **codecs)
|
||||
text += b'] = {\n'
|
||||
for i in range(0, len(data), 12):
|
||||
text += ' '
|
||||
text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
|
||||
text += ',\n'
|
||||
text += '};\n'
|
||||
text += b' '
|
||||
text += bytes(', '.join('0x%02x' % byte for byte in data[i:i + 12]), **codecs)
|
||||
text += b',\n'
|
||||
text += b'};\n'
|
||||
return text
|
||||
|
||||
|
||||
def words_to_whatever(words, converter):
|
||||
def words_to_whatever(words, converter, utf_mode, codecs):
|
||||
"""Generates C++ code from a word list"""
|
||||
dafsa = to_dafsa(words)
|
||||
dafsa = to_dafsa(words, utf_mode)
|
||||
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
|
||||
dafsa = fun(dafsa)
|
||||
return converter(encode(dafsa))
|
||||
return converter(encode(dafsa, utf_mode), codecs)
|
||||
|
||||
|
||||
def words_to_cxx(words):
|
||||
def words_to_cxx(words, utf_mode, codecs):
|
||||
"""Generates C++ code from a word list"""
|
||||
return words_to_whatever(words, to_cxx)
|
||||
return words_to_whatever(words, to_cxx, utf_mode, codecs)
|
||||
|
||||
|
||||
def words_to_binary(words):
|
||||
def words_to_binary(words, utf_mode, codecs):
|
||||
"""Generates binary DAFSA data from a word list"""
|
||||
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray)
|
||||
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs)
|
||||
|
||||
|
||||
def parse_psl2c(infile):
|
||||
def parse_psl2c(infile, utf_mode, codecs):
|
||||
"""Parses file generated by psl2c and extract strings and return code"""
|
||||
lines = [line.strip() for line in infile]
|
||||
lines = [bytes(line.strip(), **codecs) for line in infile]
|
||||
|
||||
for line in lines:
|
||||
if line[-3:-1] != ', ':
|
||||
if line[-3:-1] != b', ':
|
||||
raise InputError('Expected "domainname, <digit>", found "%s"' % line)
|
||||
# Technically the DAFSA format could support return values in range [0-31],
|
||||
# Technically the DAFSA format could support return values in range [0x00-0x1E],
|
||||
# but the values below are the only with a defined meaning.
|
||||
if line[-1] not in '0123456789ABCDEF':
|
||||
raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' % line[-1])
|
||||
if line[-1] not in b'0123456789ABCDEF':
|
||||
raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % line[-1:])
|
||||
|
||||
# with open("gperf.out", 'w') as outfile:
|
||||
# for line in sorted(lines):
|
||||
# outfile.write(line[:-3] + line[-1] + "\n")
|
||||
|
||||
return [line[:-3] + line[-1] for line in sorted(lines)]
|
||||
return [line[:-3] + line[-1:] for line in sorted(lines)]
|
||||
|
||||
|
||||
def parse_psl(infile):
|
||||
def parse_psl(infile, utf_mode, codecs):
|
||||
"""Parses PSL file and extract strings and return code"""
|
||||
PSL_FLAG_EXCEPTION = (1<<0)
|
||||
PSL_FLAG_WILDCARD = (1<<1)
|
||||
|
@ -479,39 +548,39 @@ def parse_psl(infile):
|
|||
section = 0
|
||||
|
||||
for line in infile:
|
||||
line = line.strip()
|
||||
line = bytes(line.strip(), **codecs)
|
||||
if not line:
|
||||
continue
|
||||
|
||||
if line.startswith("//"):
|
||||
if line.startswith(b'//'):
|
||||
if section == 0:
|
||||
if "===BEGIN ICANN DOMAINS===" in line:
|
||||
if b'===BEGIN ICANN DOMAINS===' in line:
|
||||
section = PSL_FLAG_ICANN
|
||||
elif section == 0 and "===BEGIN PRIVATE DOMAINS===" in line:
|
||||
elif section == 0 and b'===BEGIN PRIVATE DOMAINS===' in line:
|
||||
section = PSL_FLAG_PRIVATE
|
||||
elif section == PSL_FLAG_ICANN and "===END ICANN DOMAINS===" in line:
|
||||
elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line:
|
||||
section = 0
|
||||
elif section == PSL_FLAG_PRIVATE and "===END PRIVATE DOMAINS===" in line:
|
||||
elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line:
|
||||
section = 0
|
||||
continue # skip comments
|
||||
|
||||
if line[0] == '!':
|
||||
if line[:1] == b'!':
|
||||
flags = PSL_FLAG_EXCEPTION | section
|
||||
line = line[1:]
|
||||
elif line[0] == '*':
|
||||
if line[1] != '.':
|
||||
elif line[:1] == b'*':
|
||||
if line[1:2] != b'.':
|
||||
print('Unsupported kind of rule (ignored): %s' % line)
|
||||
continue
|
||||
flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
|
||||
line = line[2:]
|
||||
else:
|
||||
if not '.' in line:
|
||||
if not b'.' in line:
|
||||
continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
|
||||
flags = PSL_FLAG_PLAIN | section
|
||||
|
||||
line = line.decode('utf-8').encode("idna")
|
||||
punycode = line.decode('utf-8').encode('idna')
|
||||
|
||||
if line in psl:
|
||||
if punycode in psl:
|
||||
"""Found existing entry:
|
||||
Combination of exception and plain rule is ambiguous
|
||||
!foo.bar
|
||||
|
@ -521,16 +590,18 @@ def parse_psl(infile):
|
|||
!foo.bar + *.foo.bar
|
||||
foo.bar + *.foo.bar
|
||||
"""
|
||||
print('Found %s/%X (now %X)' % line, psl[line], flags)
|
||||
print('Found %s/%X (now %X)' % punycode, psl[punycode], flags)
|
||||
continue
|
||||
|
||||
psl[line] = flags
|
||||
if utf_mode:
|
||||
psl[line] = flags
|
||||
psl[punycode] = flags
|
||||
|
||||
# with open("psl.out", 'w') as outfile:
|
||||
# for (domain, flags) in sorted(psl.iteritems()):
|
||||
# outfile.write(domain + "%X" % (flags & 0x0F) + "\n")
|
||||
|
||||
return [domain + "%X" % (flags & 0x0F) for (domain, flags) in sorted(psl.iteritems())]
|
||||
return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())]
|
||||
|
||||
|
||||
def usage():
|
||||
|
@ -538,8 +609,10 @@ def usage():
|
|||
print('usage: %s [options] infile outfile' % sys.argv[0])
|
||||
print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
|
||||
print(' --input-format=psl infile is a Public Suffix List file')
|
||||
print(' --output-format=cxx Write DAFSA as C/C++ code')
|
||||
print(' --output-format=cxx Write DAFSA as C/C++ code (default)')
|
||||
print(' --output-format=binary Write DAFSA binary data')
|
||||
print(' --encoding=ascii 7-bit ASCII mode')
|
||||
print(' --encoding=utf-8 UTF-8 mode (default)')
|
||||
exit(1)
|
||||
|
||||
|
||||
|
@ -550,6 +623,11 @@ def main():
|
|||
|
||||
converter = words_to_cxx
|
||||
parser = parse_psl2c
|
||||
utf_mode = True
|
||||
|
||||
codecs = dict()
|
||||
if sys.version_info.major > 2:
|
||||
codecs['encoding'] = 'utf-8'
|
||||
|
||||
for arg in sys.argv[1:-2]:
|
||||
if arg.startswith('--input-format='):
|
||||
|
@ -570,15 +648,24 @@ def main():
|
|||
else:
|
||||
print("Unknown output format '%s'" % value)
|
||||
return 1
|
||||
elif arg.startswith('--encoding='):
|
||||
value = arg[11:].lower()
|
||||
if value == 'ascii':
|
||||
utf_mode = False
|
||||
elif value == 'utf-8':
|
||||
utf_mode = True
|
||||
else:
|
||||
print("Unknown encoding '%s'" % value)
|
||||
return 1
|
||||
else:
|
||||
usage()
|
||||
|
||||
if sys.argv[-2] == '-':
|
||||
with open(sys.argv[-1], 'w') as outfile:
|
||||
outfile.write(converter(parser(sys.stdin)))
|
||||
with open(sys.argv[-1], 'wb') as outfile:
|
||||
outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
|
||||
else:
|
||||
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
|
||||
outfile.write(converter(parser(infile)))
|
||||
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'wb') as outfile:
|
||||
outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))
|
||||
|
||||
return 0
|
||||
|
||||
|
|
|
@ -28,9 +28,14 @@ depends on options passed to it.
|
|||
\fBcxx\fR: (default) output is C/C++ code
|
||||
.br
|
||||
\fBbinary\fR: output is an architecture-independent binary format
|
||||
.TP
|
||||
\fB\-\-encoding=\fR[\fIutf-8\fR|\fIascii\fR]
|
||||
\fButf-8\fR: (default) UTF-8 mode (output contains UTF-8 + punycode)
|
||||
.br
|
||||
\fBascii\fR: (deprecated) 7-bit ASCII mode (output contains punycode only)
|
||||
.SH SEE ALSO
|
||||
.IR https://publicsuffix.org/ ", " https://github.com/rockdaboot/libpsl
|
||||
.SH COPYRIGHT
|
||||
\fBpsl-make-dafsa\fR was originally part of the Chromium project, and
|
||||
\fBpsl-make-dafsa\fR was written by Olle Liljenzin as part of the Chromium project and
|
||||
has been modified by Tim Ruehsen and Daniel Kahn Gillmor. The code
|
||||
and its documentation is governed by a BSD-style license.
|
||||
|
|
237
src/psl.c
237
src/psl.c
|
@ -73,6 +73,7 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <time.h>
|
||||
#include <errno.h>
|
||||
#include <limits.h> /* for UINT_MAX */
|
||||
#include <langinfo.h>
|
||||
|
@ -101,9 +102,6 @@
|
|||
|
||||
#include <libpsl.h>
|
||||
|
||||
/* number of elements within an array */
|
||||
#define countof(a) (sizeof(a)/sizeof(*(a)))
|
||||
|
||||
#ifndef HAVE_STRNDUP
|
||||
/* I found no strndup on my old SUSE 7.3 test system (gcc 2.95) */
|
||||
|
||||
|
@ -176,10 +174,11 @@ struct _psl_ctx_st {
|
|||
size_t
|
||||
dafsa_size;
|
||||
int
|
||||
mode,
|
||||
nsuffixes,
|
||||
nexceptions,
|
||||
nwildcards;
|
||||
unsigned
|
||||
utf8 : 1; /* 1: data contains UTF-8 + punycode encoded rules */
|
||||
};
|
||||
|
||||
/* include the PSL data compiled by 'psl2c' */
|
||||
|
@ -263,11 +262,21 @@ static int _vector_add(_psl_vector_t *v, const _psl_entry_t *elem)
|
|||
if (v) {
|
||||
void *elemp;
|
||||
|
||||
elemp = malloc(sizeof(_psl_entry_t));
|
||||
if (!(elemp = malloc(sizeof(_psl_entry_t))))
|
||||
return -1;
|
||||
|
||||
memcpy(elemp, elem, sizeof(_psl_entry_t));
|
||||
|
||||
if (v->max == v->cur)
|
||||
v->entry = realloc(v->entry, (v->max *= 2) * sizeof(_psl_entry_t *));
|
||||
if (v->max == v->cur) {
|
||||
void *m = realloc(v->entry, (v->max *= 2) * sizeof(_psl_entry_t *));
|
||||
|
||||
if (m)
|
||||
v->entry = m;
|
||||
else {
|
||||
free(elemp);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
v->entry[v->cur++] = elemp;
|
||||
return v->cur - 1;
|
||||
|
@ -517,36 +526,37 @@ static enum punycode_status punycode_encode(
|
|||
static ssize_t _utf8_to_utf32(const char *in, size_t inlen, punycode_uint *out, size_t outlen)
|
||||
{
|
||||
size_t n = 0;
|
||||
unsigned char *s;
|
||||
const unsigned char *s = (void *)in;
|
||||
const unsigned char *e = (void *)(in + inlen);
|
||||
|
||||
if (!outlen)
|
||||
return -1;
|
||||
|
||||
outlen--;
|
||||
|
||||
s = alloca(inlen + 1);
|
||||
memcpy(s, in, inlen);
|
||||
s[inlen] = 0;
|
||||
while (n < outlen) {
|
||||
size_t inleft = e - s;
|
||||
|
||||
while (*s && n < outlen) {
|
||||
if ((*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
|
||||
if (inleft >= 1 && (*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
|
||||
out[n++] = *s;
|
||||
s++;
|
||||
} else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
|
||||
} else if (inleft >= 2 && (*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
|
||||
if ((s[1] & 0xC0) != 0x80)
|
||||
return -1;
|
||||
out[n++] = ((*s & 0x1F) << 6) | (s[1] & 0x3F);
|
||||
s += 2;
|
||||
} else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
|
||||
} else if (inleft >= 3 && (*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
|
||||
if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
|
||||
return -1;
|
||||
out[n++] = ((*s & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
|
||||
s += 3;
|
||||
} else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
|
||||
} else if (inleft >= 4 && (*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
|
||||
if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
|
||||
return -1;
|
||||
out[n++] = ((*s & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
|
||||
s += 4;
|
||||
} else if (!inleft) {
|
||||
break;
|
||||
} else
|
||||
return -1;
|
||||
}
|
||||
|
@ -575,7 +585,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
|
|||
/* printf("s=%s inlen=%zd\n", label, labellen); */
|
||||
|
||||
if (_mem_is_ascii(label, labellen)) {
|
||||
if (outlen + labellen + (e != NULL)>= outsize)
|
||||
if (outlen + labellen + (e != NULL) >= outsize)
|
||||
return 1;
|
||||
|
||||
/* printf("outlen=%zd labellen=%zd\n", outlen, labellen); */
|
||||
|
@ -587,7 +597,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
|
|||
if (outlen + labellen + (e != NULL) + 4 >= outsize)
|
||||
return 1;
|
||||
|
||||
if ((inputlen = _utf8_to_utf32(label, labellen, input, sizeof (input) / sizeof (input[0]))) < 0)
|
||||
if ((inputlen = _utf8_to_utf32(label, labellen, input, countof(input))) < 0)
|
||||
return 1;
|
||||
|
||||
memcpy(out + outlen, "xn--", 4);
|
||||
|
@ -609,7 +619,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
|
|||
}
|
||||
#endif
|
||||
|
||||
static inline int _isspace_ascii(const char c)
|
||||
static int _isspace_ascii(const char c)
|
||||
{
|
||||
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
|
||||
}
|
||||
|
@ -691,15 +701,15 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
|
|||
UChar utf16_dst[128], utf16_src[128];
|
||||
int32_t utf16_src_length;
|
||||
|
||||
u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, utf8, -1, &status);
|
||||
u_strFromUTF8(utf16_src, countof(utf16_src), &utf16_src_length, utf8, -1, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
|
||||
int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, countof(utf16_dst), &info, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
if (ascii)
|
||||
*ascii = strdup(lookupname);
|
||||
ret = 0;
|
||||
if ((*ascii = strdup(lookupname)))
|
||||
ret = 0;
|
||||
} /* else
|
||||
fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
|
||||
} /* else
|
||||
|
@ -709,32 +719,21 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
|
|||
}
|
||||
#elif defined(WITH_LIBIDN2)
|
||||
int rc;
|
||||
uint8_t *lower, resbuf[256];
|
||||
size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
|
||||
uint8_t *lower;
|
||||
size_t len = u8_strlen((uint8_t *)utf8) + 1;
|
||||
|
||||
/* we need a conversion to lowercase */
|
||||
lower = u8_tolower((uint8_t *)utf8, u8_strlen((uint8_t *)utf8), 0, UNINORM_NFKC, resbuf, &len);
|
||||
if (!lower) {
|
||||
if (!(lower = u8_tolower((uint8_t *)utf8, len, 0, UNINORM_NFKC, NULL, &len))) {
|
||||
/* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", utf8, errno); */
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* u8_tolower() does not terminate the result string */
|
||||
if (lower == resbuf) {
|
||||
lower[len]=0;
|
||||
} else {
|
||||
uint8_t *tmp = lower;
|
||||
lower = (uint8_t *)strndup((char *)lower, len);
|
||||
free(tmp);
|
||||
}
|
||||
|
||||
if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK) {
|
||||
ret = 0;
|
||||
} /* else
|
||||
fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */
|
||||
|
||||
if (lower != resbuf)
|
||||
free(lower);
|
||||
free(lower);
|
||||
#elif defined(WITH_LIBIDN)
|
||||
int rc;
|
||||
|
||||
|
@ -754,8 +753,8 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
|
|||
|
||||
if (_domain_to_punycode(utf8, lookupname, sizeof(lookupname)) == 0) {
|
||||
if (ascii)
|
||||
*ascii = strdup(lookupname);
|
||||
ret = 0;
|
||||
if ((*ascii = strdup(lookupname)))
|
||||
ret = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -776,16 +775,17 @@ static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_en
|
|||
/* fprintf(stderr, "toASCII '%s' -> '%s'\n", e->label_buf, lookupname); */
|
||||
_suffix_init(&suffix, lookupname, strlen(lookupname));
|
||||
suffix.flags = e->flags;
|
||||
suffixp = _vector_get(v, _vector_add(v, &suffix));
|
||||
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
||||
if ((suffixp = _vector_get(v, _vector_add(v, &suffix))))
|
||||
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
||||
} /* else ignore */
|
||||
|
||||
free(lookupname);
|
||||
}
|
||||
}
|
||||
|
||||
/* prototype */
|
||||
/* prototypes */
|
||||
int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
|
||||
int GetUtfMode(const unsigned char *graph, size_t length);
|
||||
|
||||
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
|
||||
{
|
||||
|
@ -814,6 +814,14 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
|
|||
return 1;
|
||||
}
|
||||
|
||||
if (psl->utf8 || psl == &_builtin_psl)
|
||||
need_conversion = 0;
|
||||
|
||||
#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
|
||||
if (psl == &_builtin_psl)
|
||||
need_conversion = 0;
|
||||
#endif
|
||||
|
||||
if (need_conversion) {
|
||||
_psl_idna_t *idna = _psl_idna_open();
|
||||
|
||||
|
@ -934,8 +942,9 @@ suffix_yes:
|
|||
*
|
||||
* For cookie domain checking see psl_is_cookie_domain_acceptable().
|
||||
*
|
||||
* International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
|
||||
* Other encodings result in unexpected behavior.
|
||||
* International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
|
||||
* Other encodings likely result in incorrect return values.
|
||||
* Use helper function psl_str_to_utf8lower() for normalization of @domain.
|
||||
*
|
||||
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
|
||||
* psl_builtin().
|
||||
|
@ -964,8 +973,9 @@ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
|
|||
* @type specifies the PSL section where to perform the lookup. Valid values are
|
||||
* %PSL_TYPE_PRIVATE, %PSL_TYPE_ICANN and %PSL_TYPE_ANY.
|
||||
*
|
||||
* International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
|
||||
* Other encodings result in unexpected behavior.
|
||||
* International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
|
||||
* Other encodings likely result in incorrect return values.
|
||||
* Use helper function psl_str_to_utf8lower() for normalization of @domain.
|
||||
*
|
||||
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
|
||||
* psl_builtin().
|
||||
|
@ -990,8 +1000,9 @@ int psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type)
|
|||
* This function finds the longest public suffix part of @domain by the means
|
||||
* of the [Mozilla Public Suffix List](https://publicsuffix.org).
|
||||
*
|
||||
* International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
|
||||
* Other encodings result in unexpected behavior.
|
||||
* International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
|
||||
* Other encodings likely result in incorrect return values.
|
||||
* Use helper function psl_str_to_utf8lower() for normalization of @domain.
|
||||
*
|
||||
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
|
||||
* psl_builtin().
|
||||
|
@ -1029,8 +1040,9 @@ const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain)
|
|||
* This function finds the shortest private suffix part of @domain by the means
|
||||
* of the [Mozilla Public Suffix List](https://publicsuffix.org).
|
||||
*
|
||||
* International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
|
||||
* Other encodings result in unexpected behavior.
|
||||
* International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
|
||||
* Other encodings likely result in incorrect return values.
|
||||
* Use helper function psl_str_to_utf8lower() for normalization of @domain.
|
||||
*
|
||||
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
|
||||
* psl_builtin().
|
||||
|
@ -1070,7 +1082,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
|
|||
* This function loads the public suffixes file named @fname.
|
||||
* To free the allocated resources, call psl_free().
|
||||
*
|
||||
* The suffixes are expected to be lowercase UTF-8 encoded if they are international.
|
||||
* The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
|
||||
*
|
||||
* Returns: Pointer to a PSL context or %NULL on failure.
|
||||
*
|
||||
|
@ -1099,7 +1111,7 @@ psl_ctx_t *psl_load_file(const char *fname)
|
|||
* This function loads the public suffixes from a FILE pointer.
|
||||
* To free the allocated resources, call psl_free().
|
||||
*
|
||||
* The suffixes are expected to be lowercase UTF-8 encoded if they are international.
|
||||
* The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
|
||||
*
|
||||
* Returns: Pointer to a PSL context or %NULL on failure.
|
||||
*
|
||||
|
@ -1152,6 +1164,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
|||
psl->dafsa = m;
|
||||
|
||||
psl->dafsa_size = len;
|
||||
psl->utf8 = !!GetUtfMode(psl->dafsa, len);
|
||||
|
||||
return psl;
|
||||
}
|
||||
|
@ -1163,6 +1176,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
|||
* as of 19.02.2014, the list at https://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions.
|
||||
*/
|
||||
psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array);
|
||||
psl->utf8 = 1; /* we put UTF-8 and punycode rules in the lookup vector */
|
||||
|
||||
do {
|
||||
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
|
||||
|
@ -1231,9 +1245,10 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
|||
suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix));
|
||||
}
|
||||
|
||||
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
||||
|
||||
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
|
||||
if (suffixp) {
|
||||
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
||||
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
|
||||
}
|
||||
}
|
||||
} while ((linep = fgets(buf, sizeof(buf), fp)));
|
||||
|
||||
|
@ -1275,8 +1290,8 @@ void psl_free(psl_ctx_t *psl)
|
|||
* The builtin data also contains punycode entries, one for each international domain name.
|
||||
*
|
||||
* If the generation of built-in data has been disabled during compilation, %NULL will be returned.
|
||||
* When using the builtin psl context, you can provide UTF-8 or punycode representations of domains to
|
||||
* functions like psl_is_public_suffix().
|
||||
* When using the builtin psl context, you can provide UTF-8 (lowercase + NFKC) or ASCII/ACE (punycode)
|
||||
* representations of domains to functions like psl_is_public_suffix().
|
||||
*
|
||||
* Returns: Pointer to the built in PSL data or NULL if this data is not available.
|
||||
*
|
||||
|
@ -1495,8 +1510,10 @@ static int _isip(const char *hostname)
|
|||
* This helper function checks whether @cookie_domain is an acceptable cookie domain value for the request
|
||||
* @hostname.
|
||||
*
|
||||
* For international domain names both, @hostname and @cookie_domain, have to be either in lowercase UTF-8
|
||||
* or in ASCII form (punycode). Other encodings or mixing UTF-8 and punycode result in unexpected behavior.
|
||||
* For international domain names, both @hostname and @cookie_domain have to be either in UTF-8 (lowercase + NFKC)
|
||||
* or in ASCII/ACE (punycode) format. Other encodings or mixing UTF-8 and punycode likely result in incorrect return values.
|
||||
*
|
||||
* Use helper function psl_str_to_utf8lower() for normalization of @hostname and @cookie_domain.
|
||||
*
|
||||
* Examples:
|
||||
* 1. Cookie domain 'example.com' would be acceptable for hostname 'www.example.com',
|
||||
|
@ -1553,8 +1570,8 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
|
|||
* @locale: locale of @str used for the lowercase conversion, e.g. 'de' or %NULL
|
||||
* @lower: return value containing the converted string
|
||||
*
|
||||
* This helper function converts a string to lowercase UTF-8 representation.
|
||||
* Lowercase UTF-8 is needed as input to the domain checking functions.
|
||||
* This helper function converts a string to UTF-8 lowercase + NFKC representation.
|
||||
* Lowercase + NFKC UTF-8 is needed as input to the domain checking functions.
|
||||
*
|
||||
* @lower is set to %NULL on error.
|
||||
*
|
||||
|
@ -1567,6 +1584,7 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
|
|||
* PSL_ERR_TO_UTF16: Failed to convert @str to unicode
|
||||
* PSL_ERR_TO_LOWER: Failed to convert unicode to lowercase
|
||||
* PSL_ERR_TO_UTF8: Failed to convert unicode to UTF-8
|
||||
* PSL_ERR_NO_MEM: Failed to allocate memory
|
||||
*
|
||||
* Since: 0.4
|
||||
*/
|
||||
|
@ -1585,7 +1603,8 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
|
|||
if (lower) {
|
||||
char *p;
|
||||
|
||||
*lower = strdup(str);
|
||||
if (!(*lower = strdup(str)))
|
||||
return PSL_ERR_NO_MEM;
|
||||
|
||||
/* convert ASCII string to lowercase */
|
||||
for (p = *lower; *p; p++)
|
||||
|
@ -1604,10 +1623,21 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
|
|||
char *utf8_lower;
|
||||
UConverter *uconv;
|
||||
|
||||
/* C89 allocation */
|
||||
utf16_dst = alloca(sizeof(UChar) * (str_length * 2 + 1));
|
||||
utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
|
||||
utf8_lower = alloca(str_length * 2 + 1);
|
||||
if (str_length < 256) {
|
||||
/* C89 allocation */
|
||||
utf16_dst = alloca(sizeof(UChar) * (str_length * 2 + 1));
|
||||
utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
|
||||
utf8_lower = alloca(str_length * 2 + 1);
|
||||
} else {
|
||||
utf16_dst = malloc(sizeof(UChar) * (str_length * 2 + 1));
|
||||
utf16_lower = malloc(sizeof(UChar) * (str_length * 2 + 1));
|
||||
utf8_lower = malloc(str_length * 2 + 1);
|
||||
|
||||
if (!utf16_dst || !utf16_lower || !utf8_lower) {
|
||||
ret = PSL_ERR_NO_MEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
uconv = ucnv_open(encoding, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
|
@ -1619,9 +1649,16 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
|
|||
if (U_SUCCESS(status)) {
|
||||
u_strToUTF8(utf8_lower, str_length * 8 + 1, NULL, utf16_lower, utf16_lower_length, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
if (lower)
|
||||
*lower = strdup(utf8_lower);
|
||||
ret = PSL_SUCCESS;
|
||||
if (lower) {
|
||||
if (str_length < 256) {
|
||||
if (!(*lower = strdup(utf8_lower)))
|
||||
ret = PSL_ERR_NO_MEM;
|
||||
} else {
|
||||
*lower = utf8_lower;
|
||||
utf8_lower = NULL;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ret = PSL_ERR_TO_UTF8;
|
||||
/* fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
|
||||
|
@ -1638,6 +1675,12 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
|
|||
ret = PSL_ERR_CONVERTER;
|
||||
/* fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status); */
|
||||
}
|
||||
out:
|
||||
if (str_length >= 256) {
|
||||
free(utf16_dst);
|
||||
free(utf16_lower);
|
||||
free(utf8_lower);
|
||||
}
|
||||
} while (0);
|
||||
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
|
||||
do {
|
||||
|
@ -1655,26 +1698,32 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
|
|||
|
||||
if (cd != (iconv_t)-1) {
|
||||
char *tmp = (char *)str; /* iconv won't change where str points to, but changes tmp itself */
|
||||
size_t tmp_len = strlen(str);
|
||||
size_t tmp_len = strlen(str) + 1;
|
||||
size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
|
||||
char *dst = malloc(dst_len + 1), *dst_tmp = dst;
|
||||
|
||||
if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1) {
|
||||
uint8_t *resbuf = malloc(dst_len * 2 + 1);
|
||||
size_t len = dst_len * 2; /* leave space for additional \0 byte */
|
||||
if (!dst) {
|
||||
ret = PSL_ERR_NO_MEM;
|
||||
}
|
||||
else if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1
|
||||
&& iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) != (size_t)-1)
|
||||
{
|
||||
/* start size for u8_tolower internal memory allocation.
|
||||
* u8_tolower() does not terminate the result string. we have 0 byte included in above tmp_len
|
||||
* and thus in len. */
|
||||
size_t len = dst_len - dst_len_tmp;
|
||||
|
||||
if ((dst = (char *)u8_tolower((uint8_t *)dst, dst_len - dst_len_tmp, 0, UNINORM_NFKC, resbuf, &len))) {
|
||||
/* u8_tolower() does not terminate the result string */
|
||||
if (lower)
|
||||
*lower = strndup((char *)dst, len);
|
||||
if ((tmp = (char *)u8_tolower((uint8_t *)dst, len, 0, UNINORM_NFKC, NULL, &len))) {
|
||||
ret = PSL_SUCCESS;
|
||||
if (lower) {
|
||||
*lower = tmp;
|
||||
tmp = NULL;
|
||||
} else
|
||||
free(tmp);
|
||||
} else {
|
||||
ret = PSL_ERR_TO_LOWER;
|
||||
/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
|
||||
}
|
||||
|
||||
if (lower)
|
||||
*lower = strndup(dst, dst_len - dst_len_tmp);
|
||||
ret = PSL_SUCCESS;
|
||||
} else {
|
||||
ret = PSL_ERR_TO_UTF8;
|
||||
/* fprintf(stderr, "Failed to convert '%s' string into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
|
||||
|
@ -1686,19 +1735,21 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
|
|||
ret = PSL_ERR_TO_UTF8;
|
||||
/* fprintf(stderr, "Failed to prepare encoding '%s' into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
|
||||
}
|
||||
} else
|
||||
ret = PSL_SUCCESS;
|
||||
|
||||
/* convert to lowercase */
|
||||
if (ret == PSL_SUCCESS) {
|
||||
uint8_t *dst, resbuf[256];
|
||||
size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
|
||||
|
||||
} else {
|
||||
/* we need a conversion to lowercase */
|
||||
if ((dst = u8_tolower((uint8_t *)str, u8_strlen((uint8_t *)str), 0, UNINORM_NFKC, resbuf, &len))) {
|
||||
/* u8_tolower() does not terminate the result string */
|
||||
if (lower)
|
||||
*lower = strndup((char *)dst, len);
|
||||
uint8_t *tmp;
|
||||
|
||||
/* start size for u8_tolower internal memory allocation.
|
||||
* u8_tolower() does not terminate the result string, so include terminating 0 byte in len. */
|
||||
size_t len = u8_strlen((uint8_t *)str) + 1;
|
||||
|
||||
if ((tmp = u8_tolower((uint8_t *)str, len, 0, UNINORM_NFKC, NULL, &len))) {
|
||||
ret = PSL_SUCCESS;
|
||||
if (lower) {
|
||||
*lower = (char*)tmp;
|
||||
tmp = NULL;
|
||||
} else
|
||||
free(tmp);
|
||||
} else {
|
||||
ret = PSL_ERR_TO_LOWER;
|
||||
/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
|
||||
|
|
10
src/psl2c.c
10
src/psl2c.c
|
@ -153,11 +153,6 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
|
|||
if ((fp = fopen("in.tmp", "w"))) {
|
||||
for (it = 0; it < v->cur; it++) {
|
||||
_psl_entry_t *e = _vector_get(v, it);
|
||||
unsigned char *s = (unsigned char *)e->label_buf;
|
||||
|
||||
/* search for non-ASCII label and skip it */
|
||||
while (*s && *s < 128) s++;
|
||||
if (*s) continue;
|
||||
|
||||
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
|
||||
}
|
||||
|
@ -191,11 +186,6 @@ static int _print_psl_entries_dafsa_binary(const char *fname, const _psl_vector_
|
|||
if ((fp = fopen("in.tmp", "w"))) {
|
||||
for (it = 0; it < v->cur; it++) {
|
||||
_psl_entry_t *e = _vector_get(v, it);
|
||||
unsigned char *s = (unsigned char *)e->label_buf;
|
||||
|
||||
/* search for non-ASCII label and skip it */
|
||||
while (*s && *s < 128) s++;
|
||||
if (*s) continue;
|
||||
|
||||
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
|
||||
}
|
||||
|
|
|
@ -23,3 +23,14 @@ check_PROGRAMS = $(PSL_TESTS)
|
|||
|
||||
TESTS_ENVIRONMENT = TESTS_VALGRIND="@VALGRIND_ENVIRONMENT@"
|
||||
TESTS = $(PSL_TESTS)
|
||||
|
||||
# psl.dafsa and psl_ascii.dafsa must be created before any test is executed
|
||||
# check-local target works in parallel to the tests, so the test suite will likely fail
|
||||
BUILT_SOURCES = psl.dafsa psl_ascii.dafsa
|
||||
psl.dafsa:
|
||||
$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary "$(PSL_FILE)" psl.dafsa
|
||||
psl_ascii.dafsa:
|
||||
$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary --encoding=ascii "$(PSL_FILE)" psl_ascii.dafsa
|
||||
|
||||
clean-local:
|
||||
rm -f psl.dafsa psl_ascii.dafsa
|
||||
|
|
|
@ -65,6 +65,7 @@ static void test_psl(void)
|
|||
{ "www.his.name", "his.name", 1 },
|
||||
{ "www.his.name", "name", 0 },
|
||||
{ "www.example.com", "www.example.com", 1 },
|
||||
{ "www.example.com", "wwww.example.com", 0 },
|
||||
{ "www.example.com", "example.com", 1 },
|
||||
{ "www.example.com", "com", 0 }, /* not accepted by normalization (PSL rule 'com') */
|
||||
{ "www.example.com", "example.org", 0 },
|
||||
|
@ -77,6 +78,8 @@ static void test_psl(void)
|
|||
{ "2a00:1450:4013:c01::8b", ":1450:4013:c01::8b", 0 }, /* IPv6 address, partial match */
|
||||
{ "::ffff:192.1.123.2", "::ffff:192.1.123.2", 1 }, /* IPv6 address dotted-quad, full match */
|
||||
{ "::ffff:192.1.123.2", ".1.123.2", 0 }, /* IPv6 address dotted-quad, partial match */
|
||||
{ NULL, ".1.123.2", 0 },
|
||||
{ "hiho", NULL, 0 },
|
||||
};
|
||||
unsigned it;
|
||||
psl_ctx_t *psl;
|
||||
|
@ -98,6 +101,9 @@ static void test_psl(void)
|
|||
}
|
||||
}
|
||||
|
||||
/* do checks to cover more code paths in libpsl */
|
||||
psl_is_cookie_domain_acceptable(NULL, "example.com", "example.com");
|
||||
|
||||
psl_free(psl);
|
||||
}
|
||||
|
||||
|
|
|
@ -49,7 +49,7 @@ static int
|
|||
struct timespec ts1, ts2;
|
||||
#endif
|
||||
|
||||
static inline int _isspace_ascii(const char c)
|
||||
static int _isspace_ascii(const char c)
|
||||
{
|
||||
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
|
||||
}
|
||||
|
@ -131,7 +131,7 @@ static void test_psl_entry(const psl_ctx_t *psl, const char *domain, int type)
|
|||
static void test_psl(void)
|
||||
{
|
||||
FILE *fp;
|
||||
psl_ctx_t *psl;
|
||||
psl_ctx_t *psl, *psl3, *psl4;
|
||||
const psl_ctx_t *psl2;
|
||||
int type = 0;
|
||||
char buf[256], *linep, *p;
|
||||
|
@ -142,6 +142,16 @@ static void test_psl(void)
|
|||
psl2 = psl_builtin();
|
||||
printf("builtin PSL has %d suffixes and %d exceptions\n", psl_suffix_count(psl2), psl_suffix_exception_count(psl2));
|
||||
|
||||
if (!(psl3 = psl_load_file("psl.dafsa"))) {
|
||||
fprintf(stderr, "Failed to load 'psl.dafsa'\n");
|
||||
failed++;
|
||||
}
|
||||
|
||||
if (!(psl4 = psl_load_file("psl_ascii.dafsa"))) {
|
||||
fprintf(stderr, "Failed to load 'psl_ascii.dafsa'\n");
|
||||
failed++;
|
||||
}
|
||||
|
||||
if ((fp = fopen(PSL_FILE, "r"))) {
|
||||
#ifdef HAVE_CLOCK_GETTIME
|
||||
clock_gettime(CLOCK_REALTIME, &ts1);
|
||||
|
@ -174,6 +184,12 @@ static void test_psl(void)
|
|||
|
||||
if (psl2)
|
||||
test_psl_entry(psl2, p, type);
|
||||
|
||||
if (psl3)
|
||||
test_psl_entry(psl3, p, type);
|
||||
|
||||
if (psl4)
|
||||
test_psl_entry(psl4, p, type);
|
||||
}
|
||||
|
||||
#ifdef HAVE_CLOCK_GETTIME
|
||||
|
@ -185,8 +201,10 @@ static void test_psl(void)
|
|||
failed++;
|
||||
}
|
||||
|
||||
psl_free(psl);
|
||||
psl_free(psl4);
|
||||
psl_free(psl3);
|
||||
psl_free((psl_ctx_t *)psl2);
|
||||
psl_free(psl);
|
||||
}
|
||||
|
||||
int main(int argc, const char * const *argv)
|
||||
|
|
|
@ -84,6 +84,7 @@ static void test_psl(void)
|
|||
{ "adfhoweirh", 1 }, /* unknown TLD */
|
||||
};
|
||||
unsigned it;
|
||||
int result, ver;
|
||||
psl_ctx_t *psl;
|
||||
|
||||
psl = psl_load_file(PSL_FILE);
|
||||
|
@ -92,7 +93,7 @@ static void test_psl(void)
|
|||
|
||||
for (it = 0; it < countof(test_data); it++) {
|
||||
const struct test_data *t = &test_data[it];
|
||||
int result = psl_is_public_suffix(psl, t->domain);
|
||||
result = psl_is_public_suffix(psl, t->domain);
|
||||
|
||||
if (result == t->result) {
|
||||
ok++;
|
||||
|
@ -102,6 +103,68 @@ static void test_psl(void)
|
|||
}
|
||||
}
|
||||
|
||||
/* do some checks to cover more code paths in libpsl */
|
||||
psl_is_public_suffix(NULL, "xxx");
|
||||
|
||||
if ((ver = psl_check_version_number(0)) == 0) {
|
||||
printf("psl_check_version_number(0) is 0\n");
|
||||
failed++;
|
||||
} else {
|
||||
if (((result = psl_check_version_number(ver)) != ver)) {
|
||||
printf("psl_check_version_number(%06X) is %06X\n", ver, result);
|
||||
failed++;
|
||||
}
|
||||
|
||||
if (((result = psl_check_version_number(ver - 1)) != 0)) {
|
||||
printf("psl_check_version_number(%06X) is %06X\n", ver - 1, result);
|
||||
failed++;
|
||||
}
|
||||
|
||||
if (((result = psl_check_version_number(ver + 1)) != ver)) {
|
||||
printf("psl_check_version_number(%06X) is %06X\n", ver, result);
|
||||
failed++;
|
||||
}
|
||||
}
|
||||
|
||||
psl_str_to_utf8lower("www.example.com", "utf-8", "en", NULL);
|
||||
psl_str_to_utf8lower(NULL, "utf-8", "en", NULL);
|
||||
|
||||
{
|
||||
char *lower = NULL;
|
||||
|
||||
psl_str_to_utf8lower("www.example.com", NULL, "de", &lower);
|
||||
free(lower); lower = NULL;
|
||||
|
||||
psl_str_to_utf8lower("\374bel.de", NULL, "de", &lower);
|
||||
free(lower); lower = NULL;
|
||||
|
||||
psl_str_to_utf8lower("\374bel.de", "iso-8859-1", NULL, &lower);
|
||||
free(lower); lower = NULL;
|
||||
|
||||
psl_str_to_utf8lower(NULL, "utf-8", "en", &lower);
|
||||
free(lower); lower = NULL;
|
||||
}
|
||||
|
||||
psl_get_version();
|
||||
psl_builtin_filename();
|
||||
psl_builtin_outdated();
|
||||
psl_builtin_file_time();
|
||||
psl_builtin_sha1sum();
|
||||
psl_suffix_wildcard_count(NULL);
|
||||
psl_suffix_wildcard_count(psl);
|
||||
psl_suffix_wildcard_count(psl_builtin());
|
||||
psl_suffix_count(NULL);
|
||||
psl_suffix_exception_count(NULL);
|
||||
psl_load_file(NULL);
|
||||
psl_load_fp(NULL);
|
||||
psl_registrable_domain(NULL, "");
|
||||
psl_registrable_domain(psl, NULL);
|
||||
psl_registrable_domain(psl, "www.example.com");
|
||||
psl_unregistrable_domain(NULL, "");
|
||||
psl_unregistrable_domain(psl, NULL);
|
||||
psl_is_public_suffix2(NULL, "", PSL_TYPE_ANY);
|
||||
psl_is_public_suffix2(psl, NULL, PSL_TYPE_ANY);
|
||||
|
||||
psl_free(psl);
|
||||
}
|
||||
|
||||
|
|
|
@ -50,14 +50,28 @@ static int
|
|||
ok,
|
||||
failed;
|
||||
|
||||
static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result)
|
||||
static void testx(const psl_ctx_t *psl, const char *domain, const char *encoding, const char *lang, const char *expected_result)
|
||||
{
|
||||
const char *result;
|
||||
char *lower;
|
||||
int rc;
|
||||
|
||||
/* our test data is fixed to UTF-8 (english), so provide it here */
|
||||
if (psl_str_to_utf8lower(domain, "utf-8", "en", &lower) == PSL_SUCCESS)
|
||||
/* just to cover special code paths for valgrind checking */
|
||||
psl_str_to_utf8lower(domain, encoding, lang, NULL);
|
||||
|
||||
if ((rc = psl_str_to_utf8lower(domain, encoding, lang, &lower)) == PSL_SUCCESS)
|
||||
domain = lower;
|
||||
/* non-ASCII domains fail here if no runtime IDN library is configured, so skip it */
|
||||
#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
|
||||
else if (domain) {
|
||||
/* if we do not have runtime support, test failures have to be skipped */
|
||||
failed++;
|
||||
printf("psl_str_to_utf8lower(%s)=%d\n", domain ? domain : "NULL", rc);
|
||||
|
||||
free(lower);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
result = psl_registrable_domain(psl, domain);
|
||||
|
||||
|
@ -72,13 +86,28 @@ static void test(const psl_ctx_t *psl, const char *domain, const char *expected_
|
|||
free(lower);
|
||||
}
|
||||
|
||||
/* Convenience wrapper: forwards to testx() with encoding "utf-8" and language "en". */
static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result)
|
||||
{
|
||||
testx(psl, domain, "utf-8", "en", expected_result);
|
||||
}
|
||||
|
||||
/*
 * Convenience wrapper: forwards to testx() with encoding "iso-8859-15" and
 * language "de". The call is compiled in only when WITH_LIBIDN, WITH_LIBIDN2
 * or WITH_LIBICU is defined; otherwise this function does nothing.
 */
static void test_iso(const psl_ctx_t *psl, const char *domain, const char *expected_result)
|
||||
{
|
||||
/* only makes sense with a runtime IDN library configured */
|
||||
#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
|
||||
testx(psl, domain, "iso-8859-15", "de", expected_result);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void test_psl(void)
|
||||
{
|
||||
FILE *fp;
|
||||
const psl_ctx_t *psl;
|
||||
const char *p;
|
||||
char buf[256], domain[128], expected_regdom[128], semicolon[2];
|
||||
char lbuf[258];
|
||||
int er_is_null, d_is_null;
|
||||
unsigned it;
|
||||
|
||||
psl = psl_builtin();
|
||||
|
||||
|
@ -101,6 +130,22 @@ static void test_psl(void)
|
|||
/* Norwegian with lowercase oe */
|
||||
test(psl, "www.\303\270yer.no", "www.\303\270yer.no");
|
||||
|
||||
/* Norwegian with lowercase oe, encoded as ISO-8859-15 */
|
||||
test_iso(psl, "www.\370yer.no", "www.\303\270yer.no");
|
||||
|
||||
/* Testing special code paths of psl_str_to_utf8lower() */
|
||||
for (it = 254; it <= 257; it++) {
|
||||
memset(lbuf, 'a', it);
|
||||
lbuf[it] = 0;
|
||||
|
||||
lbuf[0] = '\370';
|
||||
test_iso(psl, lbuf, NULL);
|
||||
|
||||
lbuf[0] = '\303';
|
||||
lbuf[1] = '\270';
|
||||
test(psl, lbuf, NULL);
|
||||
}
|
||||
|
||||
/* special check with NULL psl context and TLD */
|
||||
test(psl, "whoever.forgot.his.name", "whoever.forgot.his.name");
|
||||
|
||||
|
|
|
@ -12,3 +12,5 @@ LDADD = ../src/libpsl.la
|
|||
#if WITH_LIBIDN
|
||||
# LDADD += -lidn
|
||||
#endif
|
||||
|
||||
dist_man_MANS = psl.1
|
||||
|
|
Loading…
Reference in New Issue