Release v0.15.0
-----BEGIN PGP SIGNATURE----- iQJHBAABCgAxFiEEHLJ9vJhhSy1YQWRtCDAttqJnBCgFAlgppfATHHRpbS5ydWVo c2VuQGdteC5kZQAKCRAIMC22omcEKGy/D/9iduEEwzSDt22U6MxmqD77hvgB9hQn 8Xn7CsTye408EUlw2ENYg4H/V3xNQN7ZbA4wJi20FmcniFhSUbSv9UD5Vr2FSTZS NJ1EpAbqljswE5x49u3lWRyo8XOEbVdWZS66+E5W9T/0Nl6kLUk4nYkBE6LBQGhp vd6+p74kqpjJGHhrZ4uYV5bkttoeSee/arGzvWTR3kmgERVCm9Qr90ldOx3Sp91s iqwb6RpDVkL3q5sA9bOfrpEDdADJdQYLr1BkkTOb7ZA52uEhdU6nEyfswoJsaBuI aj1hOgspekVqEs7ZUpltnT2GPbFyXtj338SA0738xxZaTm/eYzvNea5Fnpg4fnQb /w7I++IZGmdXljQnk1gtqzIgxCwia34u2/T4XgEpyd/h9A5PUdjo2EKPtBgHRFG7 GnK9IRgLHqdxZFpfiUyp2zIZL8+/PUlD5Ekwi1D3Wgc5PSOO0rMHR1IWzCmpopbU Mo9E511RcIdsn+IStB1gwclT5qk1fo3n5dcQBBXtpPTEJ6CRedLK+WcbLyhh3R0Z ham1D8t3kVDQgfg57mEJOIS5sgcLj5LR3ydya5ELf3pS6FVo4qvBO4Sp3E6wbgpE 9n5D150bKyv+RkTuNTgW8uahhYdR++bXUPWbaZReGVxKy3VB7VikDusRfnVFej9c cJP1HAskz6qTwA== =ksJN -----END PGP SIGNATURE----- Merge tag 'libpsl-0.15.0' into debian Release v0.15.0
This commit is contained in:
commit
4ef2e7c54b
|
@ -1,4 +1,7 @@
|
||||||
*.exe
|
*.exe
|
||||||
|
*.gcda
|
||||||
|
*.gcno
|
||||||
|
*.gcov
|
||||||
*.gz
|
*.gz
|
||||||
*.la
|
*.la
|
||||||
*.lo
|
*.lo
|
||||||
|
@ -10,6 +13,7 @@
|
||||||
*.cache
|
*.cache
|
||||||
*.plist
|
*.plist
|
||||||
*.stamp
|
*.stamp
|
||||||
|
ABOUT-NLS
|
||||||
aclocal.m4
|
aclocal.m4
|
||||||
ar-lib
|
ar-lib
|
||||||
autom4te.cache/
|
autom4te.cache/
|
||||||
|
@ -43,6 +47,8 @@ gtk-doc.m4
|
||||||
gtk-doc.make
|
gtk-doc.make
|
||||||
include/libpsl.h
|
include/libpsl.h
|
||||||
install-sh
|
install-sh
|
||||||
|
lcov/
|
||||||
|
libpsl.info
|
||||||
libpsl.pc
|
libpsl.pc
|
||||||
libtool
|
libtool
|
||||||
ltmain.sh
|
ltmain.sh
|
||||||
|
@ -67,10 +73,13 @@ po/remove-potcdate.sed
|
||||||
po/stamp-po
|
po/stamp-po
|
||||||
src/psl2c
|
src/psl2c
|
||||||
src/suffixes.c
|
src/suffixes.c
|
||||||
|
src/suffixes_dafsa.c
|
||||||
stamp-h1
|
stamp-h1
|
||||||
test-driver
|
test-driver
|
||||||
tests/*.log
|
tests/*.log
|
||||||
tests/*.trs
|
tests/*.trs
|
||||||
|
tests/psl.dafsa
|
||||||
|
tests/psl_ascii.dafsa
|
||||||
tests/test-is-cookie-domain-acceptable
|
tests/test-is-cookie-domain-acceptable
|
||||||
tests/test-is-public
|
tests/test-is-public
|
||||||
tests/test-is-public-all
|
tests/test-is-public-all
|
||||||
|
|
|
@ -34,6 +34,7 @@ addons:
|
||||||
- libicu-dev
|
- libicu-dev
|
||||||
- libunistring0
|
- libunistring0
|
||||||
- libunistring-dev
|
- libunistring-dev
|
||||||
|
- lcov
|
||||||
|
|
||||||
script:
|
script:
|
||||||
- ./autogen.sh
|
- ./autogen.sh
|
||||||
|
@ -44,3 +45,4 @@ script:
|
||||||
- ./configure --enable-runtime=$RUNTIME --disable-builtin && make clean && make -j4 && make check -j4
|
- ./configure --enable-runtime=$RUNTIME --disable-builtin && make clean && make -j4 && make check -j4
|
||||||
- ./configure --enable-gtk-doc && make -j4 && make check -j4
|
- ./configure --enable-gtk-doc && make -j4 && make check -j4
|
||||||
- make distcheck
|
- make distcheck
|
||||||
|
- if [[ $CC == "gcc" && $RUNTIME == "libicu" ]]; then ./.travis_coveralls.sh; fi
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
make check-coverage-libicu
|
||||||
|
pip install --user cpp-coveralls
|
||||||
|
coveralls --include libwget/ --include src/ -e "src/psl2c.c"
|
2
AUTHORS
2
AUTHORS
|
@ -16,3 +16,5 @@ Christopher Meng (Fedora building)
|
||||||
Jakub Čajka
|
Jakub Čajka
|
||||||
Giuseppe Scrivano
|
Giuseppe Scrivano
|
||||||
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
|
Ryan Sleevi (Discussion, Requested DAFSA format and ICANN/PRIVATE support)
|
||||||
|
Daurnimator (Code review, discussion, reports)
|
||||||
|
Olle Liljenzin (Original DAFSA implementation and UTF-8 patch)
|
||||||
|
|
25
Makefile.am
25
Makefile.am
|
@ -19,3 +19,28 @@ dist-hook:
|
||||||
mkdir -p $(distdir)/list/tests
|
mkdir -p $(distdir)/list/tests
|
||||||
cp -p $(PSL_FILE) $(distdir)/list
|
cp -p $(PSL_FILE) $(distdir)/list
|
||||||
cp -p $(PSL_TESTFILE) $(distdir)/list/tests
|
cp -p $(PSL_TESTFILE) $(distdir)/list/tests
|
||||||
|
|
||||||
|
clean-local:
|
||||||
|
rm -rf */*.gc?? */*/*.gc?? libpsl.info lcov
|
||||||
|
|
||||||
|
check-coverage:
|
||||||
|
if test -z "$(XLIB)"; then \
|
||||||
|
CFLAGS=$$CFLAGS" --coverage -O0" LDFLAGS=$$LDFLAGS" --coverage" ./configure --disable-runtime --disable-builtin; \
|
||||||
|
else \
|
||||||
|
CFLAGS=$$CFLAGS" --coverage -O0" LDFLAGS=$$LDFLAGS" --coverage" ./configure --enable-runtime=$(XLIB) --enable-builtin=$(XLIB); \
|
||||||
|
fi
|
||||||
|
$(MAKE) clean && $(MAKE)
|
||||||
|
lcov --capture --initial --directory src --output-file libpsl.info
|
||||||
|
$(MAKE) check
|
||||||
|
lcov --capture --directory src --output-file libpsl.info
|
||||||
|
lcov --remove libpsl.info 'src/psl2c.c' -o libpsl.info
|
||||||
|
genhtml --prefix . libpsl.info --legend --title "libpsl" --output-directory=lcov
|
||||||
|
|
||||||
|
check-coverage-libidn:
|
||||||
|
XLIB=libidn $(MAKE) check-coverage
|
||||||
|
|
||||||
|
check-coverage-libidn2:
|
||||||
|
XLIB=libidn2 $(MAKE) check-coverage
|
||||||
|
|
||||||
|
check-coverage-libicu:
|
||||||
|
XLIB=libicu $(MAKE) check-coverage
|
||||||
|
|
9
NEWS
9
NEWS
|
@ -1,5 +1,14 @@
|
||||||
Copyright (C) 2014-2016 Tim Rühsen
|
Copyright (C) 2014-2016 Tim Rühsen
|
||||||
|
|
||||||
|
14.11.2016 Release V0.15.0
|
||||||
|
* Python3 compatibility for psl-make-dafsa
|
||||||
|
* Support for UTF-8 in DAFSA data
|
||||||
|
* Skip punycode conversion if DAFSA has UTF-8
|
||||||
|
* Better code coverage by test suite
|
||||||
|
* Code cleanup and enhancements
|
||||||
|
* Install man pages for psl-make-dafsa and psl
|
||||||
|
* Enhancements to the documentation
|
||||||
|
|
||||||
30.07.2016 Release V0.14.0
|
30.07.2016 Release V0.14.0
|
||||||
* Remove unneeded libraries from tools/psl link step
|
* Remove unneeded libraries from tools/psl link step
|
||||||
* Use https instead of http where possible
|
* Use https instead of http where possible
|
||||||
|
|
12
README.md
12
README.md
|
@ -1,4 +1,12 @@
|
||||||
[![Build Status](https://travis-ci.org/rockdaboot/libpsl.png?branch=master)](https://travis-ci.org/rockdaboot/libpsl)
|
[![Travis-CI Status](https://travis-ci.org/rockdaboot/libpsl.png?branch=master)](https://travis-ci.org/rockdaboot/libpsl)
|
||||||
|
[![Coverity Scan](https://img.shields.io/coverity/scan/10227.svg)](https://scan.coverity.com/projects/rockdaboot-libpsl)
|
||||||
|
[![Coverage Status](https://coveralls.io/repos/github/rockdaboot/libpsl/badge.svg?branch=master)](https://coveralls.io/github/rockdaboot/libpsl?branch=master)
|
||||||
|
|
||||||
|
Solaris OpenCSW [![Build Status Solaris amd64](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-amd64)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-amd64)
|
||||||
|
[![Build Status Solaris i386](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-i386)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-i386)
|
||||||
|
[![Build Status Solaris Sparc](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-sparc)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-sparc)
|
||||||
|
[![Build Status Solaris SparcV9](https://buildfarm.opencsw.org/buildbot/png?builder=libpsl-solaris10-sparcv9)](https://buildfarm.opencsw.org/buildbot/builders/libpsl-solaris10-sparcv9)
|
||||||
|
|
||||||
|
|
||||||
libpsl - C library to handle the Public Suffix List
|
libpsl - C library to handle the Public Suffix List
|
||||||
===================================================
|
===================================================
|
||||||
|
@ -116,7 +124,7 @@ Mailing List
|
||||||
|
|
||||||
To join the mailing list send an email to
|
To join the mailing list send an email to
|
||||||
|
|
||||||
<libpsl-bugs+subscribe@googlegroups.com>
|
libpsl-bugs+subscribe@googlegroups.com
|
||||||
|
|
||||||
and follow the instructions provided by the answer mail.
|
and follow the instructions provided by the answer mail.
|
||||||
|
|
||||||
|
|
10
configure.ac
10
configure.ac
|
@ -1,7 +1,7 @@
|
||||||
|
|
||||||
AC_INIT([libpsl], [0.14.0], [tim.ruehsen@gmx.de], [libpsl], [https://github.com/rockdaboot/libpsl])
|
AC_INIT([libpsl], [0.15.0], [tim.ruehsen@gmx.de], [libpsl], [https://github.com/rockdaboot/libpsl])
|
||||||
AC_PREREQ([2.59])
|
AC_PREREQ([2.59])
|
||||||
AM_INIT_AUTOMAKE([1.10 -Wall no-define foreign])
|
AM_INIT_AUTOMAKE([1.10 no-define foreign])
|
||||||
|
|
||||||
# Generate two configuration headers; one for building the library itself with
|
# Generate two configuration headers; one for building the library itself with
|
||||||
# an autogenerated template, and a second one that will be installed alongside
|
# an autogenerated template, and a second one that will be installed alongside
|
||||||
|
@ -88,7 +88,7 @@ PKG_PROG_PKG_CONFIG
|
||||||
# 4. If any interfaces have been added, removed, or changed since the last update, increment current, and set revision to 0.
|
# 4. If any interfaces have been added, removed, or changed since the last update, increment current, and set revision to 0.
|
||||||
# 5. If any interfaces have been added since the last public release, then increment age.
|
# 5. If any interfaces have been added since the last public release, then increment age.
|
||||||
# 6. If any existing interfaces have been removed or changed since the last public release, then set age to 0.
|
# 6. If any existing interfaces have been removed or changed since the last public release, then set age to 0.
|
||||||
AC_SUBST([LIBPSL_SO_VERSION], [5:1:0])
|
AC_SUBST([LIBPSL_SO_VERSION], [5:2:0])
|
||||||
AC_SUBST([LIBPSL_VERSION], $VERSION)
|
AC_SUBST([LIBPSL_VERSION], $VERSION)
|
||||||
|
|
||||||
# Check for enable/disable builtin PSL data
|
# Check for enable/disable builtin PSL data
|
||||||
|
@ -168,7 +168,7 @@ if test "$enable_runtime" = "libicu" -o "$enable_builtin" = "libicu"; then
|
||||||
[AC_LANG_PROGRAM(
|
[AC_LANG_PROGRAM(
|
||||||
[[#include <unicode/ustring.h>]],
|
[[#include <unicode/ustring.h>]],
|
||||||
[[u_strToUTF8(NULL, 0, NULL, NULL, 0, NULL);]])],
|
[[u_strToUTF8(NULL, 0, NULL, NULL, 0, NULL);]])],
|
||||||
[HAVE_LIBICU=yes; AC_MSG_RESULT([yes])],
|
[HAVE_LIBICU=yes; LIBICU_LIBS="-licuuc"; AC_MSG_RESULT([yes])],
|
||||||
[AC_MSG_RESULT([no]); AC_MSG_ERROR(You requested libicu but it is not installed.)])
|
[AC_MSG_RESULT([no]); AC_MSG_ERROR(You requested libicu but it is not installed.)])
|
||||||
LIBS=$OLDLIBS
|
LIBS=$OLDLIBS
|
||||||
])
|
])
|
||||||
|
@ -191,7 +191,7 @@ fi
|
||||||
if test "x$HAVE_LIBIDN2" = "xyes" -o "x$HAVE_LIBIDN" = "xyes"; then
|
if test "x$HAVE_LIBIDN2" = "xyes" -o "x$HAVE_LIBIDN" = "xyes"; then
|
||||||
# Check for libunistring, we need it for psl_str_to_utf8lower()
|
# Check for libunistring, we need it for psl_str_to_utf8lower()
|
||||||
OLDLIBS=$LIBS
|
OLDLIBS=$LIBS
|
||||||
AC_SEARCH_LIBS(u8_tolower, unistring, HAVE_UNISTRING=yes, AC_MSG_ERROR(You requested libidn2 but libunistring is not installed.))
|
AC_SEARCH_LIBS(u8_tolower, unistring, HAVE_UNISTRING=yes, AC_MSG_ERROR(You requested libidn2|libidn but libunistring is not installed.))
|
||||||
LIBS=$OLDLIBS
|
LIBS=$OLDLIBS
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
|
@ -51,6 +51,7 @@ for CC in gcc clang; do
|
||||||
for xLCALL in C tr_TR.utf8; do
|
for xLCALL in C tr_TR.utf8; do
|
||||||
export TESTS_ENVIRONMENT="LC_ALL=$xLCALL VALGRIND_TESTS=$xVALGRIND"
|
export TESTS_ENVIRONMENT="LC_ALL=$xLCALL VALGRIND_TESTS=$xVALGRIND"
|
||||||
echo " *** TESTS_ENVIRONMENT=\"$TESTS_ENVIRONMENT\"" make check -j$CORES
|
echo " *** TESTS_ENVIRONMENT=\"$TESTS_ENVIRONMENT\"" make check -j$CORES
|
||||||
|
make clean > /dev/null
|
||||||
make check -j$CORES > /dev/null
|
make check -j$CORES > /dev/null
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
|
|
@ -53,10 +53,11 @@ extern "C" {
|
||||||
* psl_error_t:
|
* psl_error_t:
|
||||||
* @PSL_SUCCESS: Successful return.
|
* @PSL_SUCCESS: Successful return.
|
||||||
* @PSL_ERR_INVALID_ARG: Invalid argument.
|
* @PSL_ERR_INVALID_ARG: Invalid argument.
|
||||||
* @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter
|
* @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter.
|
||||||
* @PSL_ERR_TO_UTF16: Failed to convert to utf-16.
|
* @PSL_ERR_TO_UTF16: Failed to convert to utf-16.
|
||||||
* @PSL_ERR_TO_LOWER: Failed to convert utf-16 to lowercase.
|
* @PSL_ERR_TO_LOWER: Failed to convert utf-16 to lowercase.
|
||||||
* @PSL_ERR_TO_UTF8: Failed to convert utf-16 to utf-8.
|
* @PSL_ERR_TO_UTF8: Failed to convert utf-16 to utf-8.
|
||||||
|
* @PSL_ERR_NO_MEM: Failed to allocate memory.
|
||||||
*
|
*
|
||||||
* Return codes for PSL functions.
|
* Return codes for PSL functions.
|
||||||
* Negative return codes mean failure.
|
* Negative return codes mean failure.
|
||||||
|
@ -68,7 +69,8 @@ typedef enum {
|
||||||
PSL_ERR_CONVERTER = -2, /* failed to open libicu utf-16 converter */
|
PSL_ERR_CONVERTER = -2, /* failed to open libicu utf-16 converter */
|
||||||
PSL_ERR_TO_UTF16 = -3, /* failed to convert to utf-16 */
|
PSL_ERR_TO_UTF16 = -3, /* failed to convert to utf-16 */
|
||||||
PSL_ERR_TO_LOWER = -4, /* failed to convert utf-16 to lowercase */
|
PSL_ERR_TO_LOWER = -4, /* failed to convert utf-16 to lowercase */
|
||||||
PSL_ERR_TO_UTF8 = -5 /* failed to convert utf-16 to utf-8 */
|
PSL_ERR_TO_UTF8 = -5, /* failed to convert utf-16 to utf-8 */
|
||||||
|
PSL_ERR_NO_MEM = -6 /* failed to allocate memory */
|
||||||
} psl_error_t;
|
} psl_error_t;
|
||||||
|
|
||||||
typedef struct _psl_ctx_st psl_ctx_t;
|
typedef struct _psl_ctx_st psl_ctx_t;
|
||||||
|
|
2
list
2
list
|
@ -1 +1 @@
|
||||||
Subproject commit 1df90f84db1a041991a48e46e786705f7161ab4c
|
Subproject commit 41a519ad34cf86ff4470b967d9e4755d72b63a6c
|
|
@ -11,7 +11,7 @@ libpsl_la_CPPFLAGS = -I$(top_srcdir)/include
|
||||||
# include ABI version information
|
# include ABI version information
|
||||||
libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
|
libpsl_la_LDFLAGS = -version-info $(LIBPSL_SO_VERSION)
|
||||||
if WITH_LIBICU
|
if WITH_LIBICU
|
||||||
libpsl_la_LDFLAGS += -licuuc
|
libpsl_la_LDFLAGS += $(LIBICU_LIBS)
|
||||||
endif
|
endif
|
||||||
if WITH_LIBIDN2
|
if WITH_LIBIDN2
|
||||||
libpsl_la_LDFLAGS += -lidn2 -lunistring
|
libpsl_la_LDFLAGS += -lidn2 -lunistring
|
||||||
|
@ -24,7 +24,7 @@ noinst_PROGRAMS = psl2c
|
||||||
psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
|
psl2c_SOURCES = psl2c.c lookup_string_in_fixed_set.c
|
||||||
psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/psl-make-dafsa\"
|
psl2c_CPPFLAGS = -I$(top_srcdir)/include -DMAKE_DAFSA=\"$(top_srcdir)/src/psl-make-dafsa\"
|
||||||
if BUILTIN_GENERATOR_LIBICU
|
if BUILTIN_GENERATOR_LIBICU
|
||||||
psl2c_LDADD = -licuuc
|
psl2c_LDADD = $(LIBICU_LIBS)
|
||||||
endif
|
endif
|
||||||
if BUILTIN_GENERATOR_LIBIDN2
|
if BUILTIN_GENERATOR_LIBIDN2
|
||||||
psl2c_LDADD = @LTLIBICONV@ -lidn2 -lunistring
|
psl2c_LDADD = @LTLIBICONV@ -lidn2 -lunistring
|
||||||
|
@ -39,3 +39,5 @@ suffixes_dafsa.c: $(PSL_FILE) psl2c$(EXEEXT)
|
||||||
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
|
./psl2c$(EXEEXT) "$(PSL_FILE)" suffixes_dafsa.c
|
||||||
|
|
||||||
EXTRA_DIST = psl-make-dafsa LICENSE.chromium
|
EXTRA_DIST = psl-make-dafsa LICENSE.chromium
|
||||||
|
|
||||||
|
dist_man_MANS = psl-make-dafsa.1
|
||||||
|
|
|
@ -21,6 +21,48 @@
|
||||||
|
|
||||||
#define CHECK_LT(a, b) if ((a) >= b) return 0
|
#define CHECK_LT(a, b) if ((a) >= b) return 0
|
||||||
|
|
||||||
|
static const char multibyte_length_table[16] = {
|
||||||
|
0, 0, 0, 0, /* 0x00-0x3F */
|
||||||
|
0, 0, 0, 0, /* 0x40-0x7F */
|
||||||
|
0, 0, 0, 0, /* 0x80-0xBF */
|
||||||
|
2, 2, 3, 4, /* 0xC0-0xFF */
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get length of multibyte character sequence starting at a given byte.
|
||||||
|
* Returns zero if the byte is not a valid leading byte in UTF-8.
|
||||||
|
*/
|
||||||
|
static int GetMultibyteLength(char c) {
|
||||||
|
return multibyte_length_table[((unsigned char)c) >> 4];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Moves pointers one byte forward.
|
||||||
|
*/
|
||||||
|
static void NextPos(const unsigned char** pos,
|
||||||
|
const char** key,
|
||||||
|
const char** multibyte_start)
|
||||||
|
{
|
||||||
|
++*pos;
|
||||||
|
if (*multibyte_start) {
|
||||||
|
/* Advance key to next byte in multibyte sequence. */
|
||||||
|
++*key;
|
||||||
|
/* Reset multibyte_start if last byte in multibyte sequence was consumed. */
|
||||||
|
if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
|
||||||
|
*multibyte_start = 0;
|
||||||
|
} else {
|
||||||
|
if (GetMultibyteLength(**key)) {
|
||||||
|
/* Multibyte prefix was matched in the dafsa, start matching multibyte
|
||||||
|
* content in next round. */
|
||||||
|
*multibyte_start = *key;
|
||||||
|
} else {
|
||||||
|
/* Advance key as a single byte character was matched. */
|
||||||
|
++*key;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Read next offset from pos.
|
* Read next offset from pos.
|
||||||
* Returns true if an offset could be read, false otherwise.
|
* Returns true if an offset could be read, false otherwise.
|
||||||
|
@ -71,6 +113,35 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
|
||||||
return(*offset & 0x80) != 0;
|
return(*offset & 0x80) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check if byte at offset matches first character in key.
|
||||||
|
* This version assumes a range check was already performed by the caller.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static int IsMatchUnchecked(const unsigned char matcher,
|
||||||
|
const char* key,
|
||||||
|
const char* multibyte_start)
|
||||||
|
{
|
||||||
|
if (multibyte_start) {
|
||||||
|
/* Multibyte matching mode. */
|
||||||
|
if (multibyte_start == key) {
|
||||||
|
/* Match leading byte, which will also match the sequence length. */
|
||||||
|
return (matcher ^ 0x80) == (const unsigned char)*key;
|
||||||
|
} else {
|
||||||
|
/* Match following bytes. */
|
||||||
|
return (matcher ^ 0xC0) == (const unsigned char)*key;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* If key points at a leading byte in a multibyte sequence, but we are not yet
|
||||||
|
* in multibyte mode, then the dafsa should contain a special byte to indicate
|
||||||
|
* a mode switch. */
|
||||||
|
if (GetMultibyteLength(*key)) {
|
||||||
|
return matcher == 0x1F;
|
||||||
|
}
|
||||||
|
/* Normal matching of a single byte character. */
|
||||||
|
return matcher == (const unsigned char)*key;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check if byte at offset matches first character in key.
|
* Check if byte at offset matches first character in key.
|
||||||
* This version matches characters not last in label.
|
* This version matches characters not last in label.
|
||||||
|
@ -78,10 +149,11 @@ static int IsEOL(const unsigned char* offset, const unsigned char* end)
|
||||||
|
|
||||||
static int IsMatch(const unsigned char* offset,
|
static int IsMatch(const unsigned char* offset,
|
||||||
const unsigned char* end,
|
const unsigned char* end,
|
||||||
const char* key)
|
const char* key,
|
||||||
|
const char* multibyte_start)
|
||||||
{
|
{
|
||||||
CHECK_LT(offset, end);
|
CHECK_LT(offset, end);
|
||||||
return *offset == *key;
|
return IsMatchUnchecked(*offset, key, multibyte_start);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -91,10 +163,11 @@ static int IsMatch(const unsigned char* offset,
|
||||||
|
|
||||||
static int IsEndCharMatch(const unsigned char* offset,
|
static int IsEndCharMatch(const unsigned char* offset,
|
||||||
const unsigned char* end,
|
const unsigned char* end,
|
||||||
const char* key)
|
const char* key,
|
||||||
|
const char* multibyte_start)
|
||||||
{
|
{
|
||||||
CHECK_LT(offset, end);
|
CHECK_LT(offset, end);
|
||||||
return *offset == (*key | 0x80);
|
return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -104,10 +177,11 @@ static int IsEndCharMatch(const unsigned char* offset,
|
||||||
|
|
||||||
static int GetReturnValue(const unsigned char* offset,
|
static int GetReturnValue(const unsigned char* offset,
|
||||||
const unsigned char* end,
|
const unsigned char* end,
|
||||||
|
const char* multibyte_start,
|
||||||
int* return_value)
|
int* return_value)
|
||||||
{
|
{
|
||||||
CHECK_LT(offset, end);
|
CHECK_LT(offset, end);
|
||||||
if ((*offset & 0xE0) == 0x80) {
|
if (!multibyte_start && (*offset & 0xE0) == 0x80) {
|
||||||
*return_value = *offset & 0x0F;
|
*return_value = *offset & 0x0F;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
@ -140,6 +214,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
||||||
const unsigned char* end = graph + length;
|
const unsigned char* end = graph + length;
|
||||||
const unsigned char* offset = pos;
|
const unsigned char* offset = pos;
|
||||||
const char* key_end = key + key_length;
|
const char* key_end = key + key_length;
|
||||||
|
const char* multibyte_start = 0;
|
||||||
|
|
||||||
while (GetNextOffset(&pos, end, &offset)) {
|
while (GetNextOffset(&pos, end, &offset)) {
|
||||||
/*char <char>+ end_char offsets
|
/*char <char>+ end_char offsets
|
||||||
|
@ -153,11 +228,10 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
||||||
|
|
||||||
if (key != key_end && !IsEOL(offset, end)) {
|
if (key != key_end && !IsEOL(offset, end)) {
|
||||||
/* Leading <char> is not a match. Don't dive into this child */
|
/* Leading <char> is not a match. Don't dive into this child */
|
||||||
if (!IsMatch(offset, end, key))
|
if (!IsMatch(offset, end, key, multibyte_start))
|
||||||
continue;
|
continue;
|
||||||
did_consume = 1;
|
did_consume = 1;
|
||||||
++offset;
|
NextPos(&offset, &key, &multibyte_start);
|
||||||
++key;
|
|
||||||
/* Possible matches at this point:
|
/* Possible matches at this point:
|
||||||
* <char>+ end_char offsets
|
* <char>+ end_char offsets
|
||||||
* <char>+ return value
|
* <char>+ return value
|
||||||
|
@ -167,10 +241,9 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
||||||
|
|
||||||
/* Remove all remaining <char> nodes possible */
|
/* Remove all remaining <char> nodes possible */
|
||||||
while (!IsEOL(offset, end) && key != key_end) {
|
while (!IsEOL(offset, end) && key != key_end) {
|
||||||
if (!IsMatch(offset, end, key))
|
if (!IsMatch(offset, end, key, multibyte_start))
|
||||||
return -1;
|
return -1;
|
||||||
++key;
|
NextPos(&offset, &key, &multibyte_start);
|
||||||
++offset;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* Possible matches at this point:
|
/* Possible matches at this point:
|
||||||
|
@ -182,7 +255,7 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
||||||
if (key == key_end) {
|
if (key == key_end) {
|
||||||
int return_value;
|
int return_value;
|
||||||
|
|
||||||
if (GetReturnValue(offset, end, &return_value))
|
if (GetReturnValue(offset, end, multibyte_start, &return_value))
|
||||||
return return_value;
|
return return_value;
|
||||||
/* The DAFSA guarantees that if the first char is a match, all
|
/* The DAFSA guarantees that if the first char is a match, all
|
||||||
* remaining char elements MUST match if the key is truly present.
|
* remaining char elements MUST match if the key is truly present.
|
||||||
|
@ -191,14 +264,22 @@ int _HIDDEN LookupStringInFixedSet(const unsigned char* graph,
|
||||||
return -1;
|
return -1;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (!IsEndCharMatch(offset, end, key)) {
|
if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
|
||||||
if (did_consume)
|
if (did_consume)
|
||||||
return -1; /* Unexpected */
|
return -1; /* Unexpected */
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
++key;
|
NextPos(&offset, &key, &multibyte_start);
|
||||||
pos = ++offset; /* Dive into child */
|
pos = offset; /* Dive into child */
|
||||||
}
|
}
|
||||||
|
|
||||||
return -1; /* No match */
|
return -1; /* No match */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* prototype to skip warning with -Wmissing-prototypes */
|
||||||
|
int _HIDDEN GetUtfMode(const unsigned char *graph, size_t length);
|
||||||
|
|
||||||
|
int _HIDDEN GetUtfMode(const unsigned char *graph, size_t length)
|
||||||
|
{
|
||||||
|
return length > 0 && graph[length - 1] < 0x80;
|
||||||
|
}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python
|
||||||
# Copyright 2014 The Chromium Authors. All rights reserved.
|
# Copyright 2014 The Chromium Authors. All rights reserved.
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
# Use of this source code is governed by a BSD-style license that can be
|
||||||
# found in the LICENSE.chromium file.
|
# found in the LICENSE.chromium file.
|
||||||
|
@ -14,8 +14,9 @@ This python program fetches strings and return values from a gperf file
|
||||||
and generates a C++ file with a byte array representing graph that can be
|
and generates a C++ file with a byte array representing graph that can be
|
||||||
used as a memory efficient replacement for the perfect hash table.
|
used as a memory efficient replacement for the perfect hash table.
|
||||||
|
|
||||||
The input strings are assumed to consist of printable 7-bit ASCII characters
|
The input strings must consist of printable 7-bit ASCII characters or UTF-8
|
||||||
and the return values are assumed to be one digit integers.
|
multibyte sequences. Control characters in the range [0x00-0x1F] are not
|
||||||
|
allowed. The return values must be one digit integers.
|
||||||
|
|
||||||
In this program a DAFSA is a diamond shaped graph starting at a common
|
In this program a DAFSA is a diamond shaped graph starting at a common
|
||||||
source node and ending at a common sink node. All internal nodes contain
|
source node and ending at a common sink node. All internal nodes contain
|
||||||
|
@ -47,8 +48,8 @@ The generated byte array can described by the following BNF:
|
||||||
|
|
||||||
<byte> ::= < 8-bit value in range [0x00-0xFF] >
|
<byte> ::= < 8-bit value in range [0x00-0xFF] >
|
||||||
|
|
||||||
<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
|
<char> ::= < byte in range [0x1F-0x7F] >
|
||||||
<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
|
<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
|
||||||
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
|
<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
|
||||||
|
|
||||||
<offset1> ::= < byte in range [0x00-0x3F] >
|
<offset1> ::= < byte in range [0x00-0x3F] >
|
||||||
|
@ -84,13 +85,18 @@ The generated byte array can described by the following BNF:
|
||||||
| <prefix> <node>
|
| <prefix> <node>
|
||||||
| <end_label>
|
| <end_label>
|
||||||
|
|
||||||
<dafsa> ::= <source>
|
<graph> ::= <source>
|
||||||
| <dafsa> <node>
|
| <graph> <node>
|
||||||
|
|
||||||
|
<version> ::= <empty> # The DAFSA was generated in ASCII mode.
|
||||||
|
| < byte value 0x01 > # The DAFSA was generated in UTF-8 mode.
|
||||||
|
|
||||||
|
<dafsa> ::= <graph> <version>
|
||||||
|
|
||||||
Decoding:
|
Decoding:
|
||||||
|
|
||||||
<char> -> printable 7-bit ASCII character
|
<char> -> character
|
||||||
<end_char> & 0x7F -> printable 7-bit ASCII character
|
<end_char> & 0x7F -> character
|
||||||
<return value> & 0x0F -> integer
|
<return value> & 0x0F -> integer
|
||||||
<offset1 & 0x3F> -> integer
|
<offset1 & 0x3F> -> integer
|
||||||
((<offset2> & 0x1F>) << 8) + <byte> -> integer
|
((<offset2> & 0x1F>) << 8) + <byte> -> integer
|
||||||
|
@ -105,6 +111,28 @@ between previous child node and next child node. Thus each offset links a node
|
||||||
to a child node. The distance is always counted between start addresses, i.e.
|
to a child node. The distance is always counted between start addresses, i.e.
|
||||||
first byte in decoded offset or first byte in child node.
|
first byte in decoded offset or first byte in child node.
|
||||||
|
|
||||||
|
Transcoding of UTF-8 multibyte sequences:
|
||||||
|
|
||||||
|
The original DAFSA format was limited to 7-bit printable ASCII characters in
|
||||||
|
range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
|
||||||
|
By transcoding of such characters the new format preserves compatibility with
|
||||||
|
old parsers, so that a DAFSA in the extended format can be used by an old
|
||||||
|
parser without false positives, although strings containing transcoded
|
||||||
|
characters will never match. Since the format is extended rather than being
|
||||||
|
changed, a parser supporting the new format will automatically support data
|
||||||
|
generated in the old format.
|
||||||
|
|
||||||
|
Transcoding is performed by insertion of a start byte with the special value
|
||||||
|
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
|
||||||
|
the range of printable ASCII.
|
||||||
|
|
||||||
|
2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn
|
||||||
|
|
||||||
|
3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn
|
||||||
|
|
||||||
|
4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
|
||||||
|
00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn
|
||||||
|
|
||||||
Example 1:
|
Example 1:
|
||||||
|
|
||||||
%%
|
%%
|
||||||
|
@ -197,8 +225,29 @@ import sys
|
||||||
class InputError(Exception):
|
class InputError(Exception):
|
||||||
"""Exception raised for errors in the input file."""
|
"""Exception raised for errors in the input file."""
|
||||||
|
|
||||||
|
# Length of a character starting at a given byte.
|
||||||
|
char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x0F
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x10-0x1F
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x20-0x2F
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x30-0x3F
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x40-0x4F
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x50-0x5F
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x60-0x6F
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x70-0x7F
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x80-0x8F
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x90-0x9F
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xA0-0xAF
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xB0-0xBF
|
||||||
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xC0-0xCF
|
||||||
|
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xD0-0xDF
|
||||||
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF
|
||||||
|
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF
|
||||||
|
|
||||||
def to_dafsa(words):
|
def to_bytes(n):
|
||||||
|
"""Converts an integer value to a bytes object."""
|
||||||
|
return bytes(bytearray((n,)))
|
||||||
|
|
||||||
|
def to_dafsa(words, utf_mode):
|
||||||
"""Generates a DAFSA from a word list and returns the source node.
|
"""Generates a DAFSA from a word list and returns the source node.
|
||||||
|
|
||||||
Each word is split into characters so that each character is represented by
|
Each word is split into characters so that each character is represented by
|
||||||
|
@ -206,20 +255,36 @@ def to_dafsa(words):
|
||||||
"""
|
"""
|
||||||
if not words:
|
if not words:
|
||||||
raise InputError('The domain list must not be empty')
|
raise InputError('The domain list must not be empty')
|
||||||
def to_nodes(word):
|
def to_nodes(word, multibyte_length):
|
||||||
"""Split words into characters"""
|
"""Split words into characters"""
|
||||||
if not 0x1F < ord(word[0]) < 0x80:
|
byte = ord(word[:1])
|
||||||
raise InputError('Domain names must be printable 7-bit ASCII')
|
if multibyte_length:
|
||||||
|
# Consume next byte in multibyte sequence.
|
||||||
|
if byte & 0xC0 != 0x80:
|
||||||
|
raise InputError('Invalid UTF-8 multibyte sequence')
|
||||||
|
return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
|
||||||
|
char_length = char_length_table[byte]
|
||||||
|
if char_length == 1:
|
||||||
|
# 7-bit printable ASCII.
|
||||||
if len(word) == 1:
|
if len(word) == 1:
|
||||||
return chr(int(word[0], 16) & 0x0F), [None]
|
return to_bytes(int(word[:1], 16) & 0x0F), [None]
|
||||||
return word[0], [to_nodes(word[1:])]
|
return word[:1], [to_nodes(word[1:], 0)]
|
||||||
return [to_nodes(word) for word in words]
|
elif char_length > 1:
|
||||||
|
# Leading byte in multibyte sequence.
|
||||||
|
if not utf_mode:
|
||||||
|
raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
|
||||||
|
if len(word) <= char_length:
|
||||||
|
raise InputError('Unterminated UTF-8 multibyte sequence')
|
||||||
|
return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
|
||||||
|
# Unexpected character.
|
||||||
|
raise InputError('Domain names must be printable ASCII or UTF-8')
|
||||||
|
|
||||||
|
return [to_nodes(word, 0) for word in words]
|
||||||
|
|
||||||
def to_words(node):
|
def to_words(node):
|
||||||
"""Generates a word list from all paths starting from an internal node."""
|
"""Generates a word list from all paths starting from an internal node."""
|
||||||
if not node:
|
if not node:
|
||||||
return ['']
|
return [b'']
|
||||||
return [(node[0] + word) for child in node[1] for word in to_words(child)]
|
return [(node[0] + word) for child in node[1] for word in to_words(child)]
|
||||||
|
|
||||||
|
|
||||||
|
@ -286,7 +351,7 @@ def join_suffixes(dafsa):
|
||||||
"""Generates a new DAFSA where nodes that represent the same word lists
|
"""Generates a new DAFSA where nodes that represent the same word lists
|
||||||
towards the sink are merged.
|
towards the sink are merged.
|
||||||
"""
|
"""
|
||||||
nodemap = {frozenset(('',)): None}
|
nodemap = {frozenset((b'',)): None}
|
||||||
|
|
||||||
def join(node):
|
def join(node):
|
||||||
"""Returns a matching node. A new node is created if no matching node
|
"""Returns a matching node. A new node is created if no matching node
|
||||||
|
@ -384,7 +449,7 @@ def encode_prefix(label):
|
||||||
will then be a prefix to the label in the child node.
|
will then be a prefix to the label in the child node.
|
||||||
"""
|
"""
|
||||||
assert label
|
assert label
|
||||||
return [ord(c) for c in reversed(label)]
|
return [c for c in bytearray(reversed(label))]
|
||||||
|
|
||||||
|
|
||||||
def encode_label(label):
|
def encode_label(label):
|
||||||
|
@ -396,7 +461,7 @@ def encode_label(label):
|
||||||
return buf
|
return buf
|
||||||
|
|
||||||
|
|
||||||
def encode(dafsa):
|
def encode(dafsa, utf_mode):
|
||||||
"""Encodes a DAFSA to a list of bytes"""
|
"""Encodes a DAFSA to a list of bytes"""
|
||||||
output = []
|
output = []
|
||||||
offsets = {}
|
offsets = {}
|
||||||
|
@ -412,62 +477,66 @@ def encode(dafsa):
|
||||||
|
|
||||||
output.extend(encode_links(dafsa, offsets, len(output)))
|
output.extend(encode_links(dafsa, offsets, len(output)))
|
||||||
output.reverse()
|
output.reverse()
|
||||||
|
if utf_mode:
|
||||||
|
output.append(0x01)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
def to_cxx(data):
|
def to_cxx(data, codecs):
|
||||||
"""Generates C++ code from a list of encoded bytes."""
|
"""Generates C++ code from a list of encoded bytes."""
|
||||||
text = '/* This file is generated. DO NOT EDIT!\n\n'
|
text = b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n'
|
||||||
text += 'The byte array encodes effective tld names. See psl-make-dafsa source for'
|
text += b'The byte array encodes effective tld names. See psl-make-dafsa source for'
|
||||||
text += ' documentation.'
|
text += b' documentation.'
|
||||||
text += '*/\n\n'
|
text += b'*/\n\n'
|
||||||
text += 'static const unsigned char kDafsa[%s] = {\n' % len(data)
|
text += b'static const unsigned char kDafsa['
|
||||||
|
text += bytes(str(len(data)), **codecs)
|
||||||
|
text += b'] = {\n'
|
||||||
for i in range(0, len(data), 12):
|
for i in range(0, len(data), 12):
|
||||||
text += ' '
|
text += b' '
|
||||||
text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
|
text += bytes(', '.join('0x%02x' % byte for byte in data[i:i + 12]), **codecs)
|
||||||
text += ',\n'
|
text += b',\n'
|
||||||
text += '};\n'
|
text += b'};\n'
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def words_to_whatever(words, converter):
|
def words_to_whatever(words, converter, utf_mode, codecs):
|
||||||
"""Generates C++ code from a word list"""
|
"""Generates C++ code from a word list"""
|
||||||
dafsa = to_dafsa(words)
|
dafsa = to_dafsa(words, utf_mode)
|
||||||
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
|
for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
|
||||||
dafsa = fun(dafsa)
|
dafsa = fun(dafsa)
|
||||||
return converter(encode(dafsa))
|
return converter(encode(dafsa, utf_mode), codecs)
|
||||||
|
|
||||||
|
|
||||||
def words_to_cxx(words):
|
def words_to_cxx(words, utf_mode, codecs):
|
||||||
"""Generates C++ code from a word list"""
|
"""Generates C++ code from a word list"""
|
||||||
return words_to_whatever(words, to_cxx)
|
return words_to_whatever(words, to_cxx, utf_mode, codecs)
|
||||||
|
|
||||||
|
|
||||||
def words_to_binary(words):
|
def words_to_binary(words, utf_mode, codecs):
|
||||||
"""Generates DAFSA binary data from a word list"""
|
"""Generates DAFSA binary data from a word list"""
|
||||||
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, bytearray)
|
return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs)
|
||||||
|
|
||||||
|
|
||||||
def parse_psl2c(infile):
|
def parse_psl2c(infile, utf_mode, codecs):
|
||||||
"""Parses file generated by psl2c and extract strings and return code"""
|
"""Parses file generated by psl2c and extract strings and return code"""
|
||||||
lines = [line.strip() for line in infile]
|
lines = [bytes(line.strip(), **codecs) for line in infile]
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
if line[-3:-1] != ', ':
|
if line[-3:-1] != b', ':
|
||||||
raise InputError('Expected "domainname, <digit>", found "%s"' % line)
|
raise InputError('Expected "domainname, <digit>", found "%s"' % line)
|
||||||
# Technically the DAFSA format could support return values in range [0-31],
|
# Technically the DAFSA format could support return values in range [0x00-0x1E],
|
||||||
# but the values below are the only with a defined meaning.
|
# but the values below are the only with a defined meaning.
|
||||||
if line[-1] not in '0123456789ABCDEF':
|
if line[-1] not in b'0123456789ABCDEF':
|
||||||
raise InputError('Expected value to be one of {0,1,2,4,5}, found "%s"' % line[-1])
|
raise InputError('Expected value to be in range [0-9] or [A-F], found "%s"' % line[-1:])
|
||||||
|
|
||||||
# with open("gperf.out", 'w') as outfile:
|
# with open("gperf.out", 'w') as outfile:
|
||||||
# for line in sorted(lines):
|
# for line in sorted(lines):
|
||||||
# outfile.write(line[:-3] + line[-1] + "\n")
|
# outfile.write(line[:-3] + line[-1] + "\n")
|
||||||
|
|
||||||
return [line[:-3] + line[-1] for line in sorted(lines)]
|
return [line[:-3] + line[-1:] for line in sorted(lines)]
|
||||||
|
|
||||||
|
|
||||||
def parse_psl(infile):
|
def parse_psl(infile, utf_mode, codecs):
|
||||||
"""Parses PSL file and extract strings and return code"""
|
"""Parses PSL file and extract strings and return code"""
|
||||||
PSL_FLAG_EXCEPTION = (1<<0)
|
PSL_FLAG_EXCEPTION = (1<<0)
|
||||||
PSL_FLAG_WILDCARD = (1<<1)
|
PSL_FLAG_WILDCARD = (1<<1)
|
||||||
|
@ -479,39 +548,39 @@ def parse_psl(infile):
|
||||||
section = 0
|
section = 0
|
||||||
|
|
||||||
for line in infile:
|
for line in infile:
|
||||||
line = line.strip()
|
line = bytes(line.strip(), **codecs)
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if line.startswith("//"):
|
if line.startswith(b'//'):
|
||||||
if section == 0:
|
if section == 0:
|
||||||
if "===BEGIN ICANN DOMAINS===" in line:
|
if b'===BEGIN ICANN DOMAINS===' in line:
|
||||||
section = PSL_FLAG_ICANN
|
section = PSL_FLAG_ICANN
|
||||||
elif section == 0 and "===BEGIN PRIVATE DOMAINS===" in line:
|
elif section == 0 and b'===BEGIN PRIVATE DOMAINS===' in line:
|
||||||
section = PSL_FLAG_PRIVATE
|
section = PSL_FLAG_PRIVATE
|
||||||
elif section == PSL_FLAG_ICANN and "===END ICANN DOMAINS===" in line:
|
elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line:
|
||||||
section = 0
|
section = 0
|
||||||
elif section == PSL_FLAG_PRIVATE and "===END PRIVATE DOMAINS===" in line:
|
elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line:
|
||||||
section = 0
|
section = 0
|
||||||
continue # skip comments
|
continue # skip comments
|
||||||
|
|
||||||
if line[0] == '!':
|
if line[:1] == b'!':
|
||||||
flags = PSL_FLAG_EXCEPTION | section
|
flags = PSL_FLAG_EXCEPTION | section
|
||||||
line = line[1:]
|
line = line[1:]
|
||||||
elif line[0] == '*':
|
elif line[:1] == b'*':
|
||||||
if line[1] != '.':
|
if line[1:2] != b'.':
|
||||||
print('Unsupported kind of rule (ignored): %s' % line)
|
print('Unsupported kind of rule (ignored): %s' % line)
|
||||||
continue
|
continue
|
||||||
flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
|
flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section
|
||||||
line = line[2:]
|
line = line[2:]
|
||||||
else:
|
else:
|
||||||
if not '.' in line:
|
if not b'.' in line:
|
||||||
continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
|
continue # we do not need an explicit plain TLD rule, already covered by implicit '*' rule
|
||||||
flags = PSL_FLAG_PLAIN | section
|
flags = PSL_FLAG_PLAIN | section
|
||||||
|
|
||||||
line = line.decode('utf-8').encode("idna")
|
punycode = line.decode('utf-8').encode('idna')
|
||||||
|
|
||||||
if line in psl:
|
if punycode in psl:
|
||||||
"""Found existing entry:
|
"""Found existing entry:
|
||||||
Combination of exception and plain rule is ambiguous
|
Combination of exception and plain rule is ambiguous
|
||||||
!foo.bar
|
!foo.bar
|
||||||
|
@ -521,16 +590,18 @@ def parse_psl(infile):
|
||||||
!foo.bar + *.foo.bar
|
!foo.bar + *.foo.bar
|
||||||
foo.bar + *.foo.bar
|
foo.bar + *.foo.bar
|
||||||
"""
|
"""
|
||||||
print('Found %s/%X (now %X)' % line, psl[line], flags)
|
print('Found %s/%X (now %X)' % punycode, psl[punycode], flags)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if utf_mode:
|
||||||
psl[line] = flags
|
psl[line] = flags
|
||||||
|
psl[punycode] = flags
|
||||||
|
|
||||||
# with open("psl.out", 'w') as outfile:
|
# with open("psl.out", 'w') as outfile:
|
||||||
# for (domain, flags) in sorted(psl.iteritems()):
|
# for (domain, flags) in sorted(psl.iteritems()):
|
||||||
# outfile.write(domain + "%X" % (flags & 0x0F) + "\n")
|
# outfile.write(domain + "%X" % (flags & 0x0F) + "\n")
|
||||||
|
|
||||||
return [domain + "%X" % (flags & 0x0F) for (domain, flags) in sorted(psl.iteritems())]
|
return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())]
|
||||||
|
|
||||||
|
|
||||||
def usage():
|
def usage():
|
||||||
|
@ -538,8 +609,10 @@ def usage():
|
||||||
print('usage: %s [options] infile outfile' % sys.argv[0])
|
print('usage: %s [options] infile outfile' % sys.argv[0])
|
||||||
print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
|
print(' --input-format=psl2c infile has been generated by libpsl/psl2c utility (default)')
|
||||||
print(' --input-format=psl infile is a Public Suffix List file')
|
print(' --input-format=psl infile is a Public Suffix List file')
|
||||||
print(' --output-format=cxx Write DAFSA as C/C++ code')
|
print(' --output-format=cxx Write DAFSA as C/C++ code (default)')
|
||||||
print(' --output-format=binary Write DAFSA binary data')
|
print(' --output-format=binary Write DAFSA binary data')
|
||||||
|
print(' --encoding=ascii 7-bit ASCII mode')
|
||||||
|
print(' --encoding=utf-8 UTF-8 mode (default)')
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
@ -550,6 +623,11 @@ def main():
|
||||||
|
|
||||||
converter = words_to_cxx
|
converter = words_to_cxx
|
||||||
parser = parse_psl2c
|
parser = parse_psl2c
|
||||||
|
utf_mode = True
|
||||||
|
|
||||||
|
codecs = dict()
|
||||||
|
if sys.version_info.major > 2:
|
||||||
|
codecs['encoding'] = 'utf-8'
|
||||||
|
|
||||||
for arg in sys.argv[1:-2]:
|
for arg in sys.argv[1:-2]:
|
||||||
if arg.startswith('--input-format='):
|
if arg.startswith('--input-format='):
|
||||||
|
@ -570,15 +648,24 @@ def main():
|
||||||
else:
|
else:
|
||||||
print("Unknown output format '%s'" % value)
|
print("Unknown output format '%s'" % value)
|
||||||
return 1
|
return 1
|
||||||
|
elif arg.startswith('--encoding='):
|
||||||
|
value = arg[11:].lower()
|
||||||
|
if value == 'ascii':
|
||||||
|
utf_mode = False
|
||||||
|
elif value == 'utf-8':
|
||||||
|
utf_mode = True
|
||||||
|
else:
|
||||||
|
print("Unknown encoding '%s'" % value)
|
||||||
|
return 1
|
||||||
else:
|
else:
|
||||||
usage()
|
usage()
|
||||||
|
|
||||||
if sys.argv[-2] == '-':
|
if sys.argv[-2] == '-':
|
||||||
with open(sys.argv[-1], 'w') as outfile:
|
with open(sys.argv[-1], 'wb') as outfile:
|
||||||
outfile.write(converter(parser(sys.stdin)))
|
outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
|
||||||
else:
|
else:
|
||||||
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'w') as outfile:
|
with open(sys.argv[-2], 'r') as infile, open(sys.argv[-1], 'wb') as outfile:
|
||||||
outfile.write(converter(parser(infile)))
|
outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
|
@ -28,9 +28,14 @@ depends on options passed to it.
|
||||||
\fBcxx\fR: (default) output is C/C++ code
|
\fBcxx\fR: (default) output is C/C++ code
|
||||||
.br
|
.br
|
||||||
\fBbinary\fR: output is an architecture-independent binary format
|
\fBbinary\fR: output is an architecture-independent binary format
|
||||||
|
.TP
|
||||||
|
\fB\-\-encoding=\fR[\fIutf-8\fR|\fIascii\fR]
|
||||||
|
\fButf-8\fR: (default) UTF-8 mode (output contains UTF-8 + punycode)
|
||||||
|
.br
|
||||||
|
\fBascii\fR: (deprecated) 7-bit ASCII mode (output contains punycode only)
|
||||||
.SH SEE ALSO
|
.SH SEE ALSO
|
||||||
.IR https://publicsuffix.org/ ", " https://github.com/rockdaboot/libpsl
|
.IR https://publicsuffix.org/ ", " https://github.com/rockdaboot/libpsl
|
||||||
.SH COPYRIGHT
|
.SH COPYRIGHT
|
||||||
\fBpsl-make-dafsa\fR was originally part of the Chromium project, and
|
\fBpsl-make-dafsa\fR was written by Olle Liljenzin as part of the Chromium project and
|
||||||
has been modified by Tim Ruehsen and Daniel Kahn Gillmor. The code
|
has been modified by Tim Ruehsen and Daniel Kahn Gillmor. The code
|
||||||
and its documentation is governed by a BSD-style license.
|
and its documentation is governed by a BSD-style license.
|
||||||
|
|
215
src/psl.c
215
src/psl.c
|
@ -73,6 +73,7 @@
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
|
#include <time.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <limits.h> /* for UINT_MAX */
|
#include <limits.h> /* for UINT_MAX */
|
||||||
#include <langinfo.h>
|
#include <langinfo.h>
|
||||||
|
@ -101,9 +102,6 @@
|
||||||
|
|
||||||
#include <libpsl.h>
|
#include <libpsl.h>
|
||||||
|
|
||||||
/* number of elements within an array */
|
|
||||||
#define countof(a) (sizeof(a)/sizeof(*(a)))
|
|
||||||
|
|
||||||
#ifndef HAVE_STRNDUP
|
#ifndef HAVE_STRNDUP
|
||||||
/* I found no strndup on my old SUSE 7.3 test system (gcc 2.95) */
|
/* I found no strndup on my old SUSE 7.3 test system (gcc 2.95) */
|
||||||
|
|
||||||
|
@ -176,10 +174,11 @@ struct _psl_ctx_st {
|
||||||
size_t
|
size_t
|
||||||
dafsa_size;
|
dafsa_size;
|
||||||
int
|
int
|
||||||
mode,
|
|
||||||
nsuffixes,
|
nsuffixes,
|
||||||
nexceptions,
|
nexceptions,
|
||||||
nwildcards;
|
nwildcards;
|
||||||
|
unsigned
|
||||||
|
utf8 : 1; /* 1: data contains UTF-8 + punycode encoded rules */
|
||||||
};
|
};
|
||||||
|
|
||||||
/* include the PSL data compiled by 'psl2c' */
|
/* include the PSL data compiled by 'psl2c' */
|
||||||
|
@ -263,11 +262,21 @@ static int _vector_add(_psl_vector_t *v, const _psl_entry_t *elem)
|
||||||
if (v) {
|
if (v) {
|
||||||
void *elemp;
|
void *elemp;
|
||||||
|
|
||||||
elemp = malloc(sizeof(_psl_entry_t));
|
if (!(elemp = malloc(sizeof(_psl_entry_t))))
|
||||||
|
return -1;
|
||||||
|
|
||||||
memcpy(elemp, elem, sizeof(_psl_entry_t));
|
memcpy(elemp, elem, sizeof(_psl_entry_t));
|
||||||
|
|
||||||
if (v->max == v->cur)
|
if (v->max == v->cur) {
|
||||||
v->entry = realloc(v->entry, (v->max *= 2) * sizeof(_psl_entry_t *));
|
void *m = realloc(v->entry, (v->max *= 2) * sizeof(_psl_entry_t *));
|
||||||
|
|
||||||
|
if (m)
|
||||||
|
v->entry = m;
|
||||||
|
else {
|
||||||
|
free(elemp);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
v->entry[v->cur++] = elemp;
|
v->entry[v->cur++] = elemp;
|
||||||
return v->cur - 1;
|
return v->cur - 1;
|
||||||
|
@ -517,36 +526,37 @@ static enum punycode_status punycode_encode(
|
||||||
static ssize_t _utf8_to_utf32(const char *in, size_t inlen, punycode_uint *out, size_t outlen)
|
static ssize_t _utf8_to_utf32(const char *in, size_t inlen, punycode_uint *out, size_t outlen)
|
||||||
{
|
{
|
||||||
size_t n = 0;
|
size_t n = 0;
|
||||||
unsigned char *s;
|
const unsigned char *s = (void *)in;
|
||||||
|
const unsigned char *e = (void *)(in + inlen);
|
||||||
|
|
||||||
if (!outlen)
|
if (!outlen)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
outlen--;
|
outlen--;
|
||||||
|
|
||||||
s = alloca(inlen + 1);
|
while (n < outlen) {
|
||||||
memcpy(s, in, inlen);
|
size_t inleft = e - s;
|
||||||
s[inlen] = 0;
|
|
||||||
|
|
||||||
while (*s && n < outlen) {
|
if (inleft >= 1 && (*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
|
||||||
if ((*s & 0x80) == 0) { /* 0xxxxxxx ASCII char */
|
|
||||||
out[n++] = *s;
|
out[n++] = *s;
|
||||||
s++;
|
s++;
|
||||||
} else if ((*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
|
} else if (inleft >= 2 && (*s & 0xE0) == 0xC0) /* 110xxxxx 10xxxxxx */ {
|
||||||
if ((s[1] & 0xC0) != 0x80)
|
if ((s[1] & 0xC0) != 0x80)
|
||||||
return -1;
|
return -1;
|
||||||
out[n++] = ((*s & 0x1F) << 6) | (s[1] & 0x3F);
|
out[n++] = ((*s & 0x1F) << 6) | (s[1] & 0x3F);
|
||||||
s += 2;
|
s += 2;
|
||||||
} else if ((*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
|
} else if (inleft >= 3 && (*s & 0xF0) == 0xE0) /* 1110xxxx 10xxxxxx 10xxxxxx */ {
|
||||||
if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
|
if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
|
||||||
return -1;
|
return -1;
|
||||||
out[n++] = ((*s & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
|
out[n++] = ((*s & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
|
||||||
s += 3;
|
s += 3;
|
||||||
} else if ((*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
|
} else if (inleft >= 4 && (*s & 0xF8) == 0xF0) /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ {
|
||||||
if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
|
if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
|
||||||
return -1;
|
return -1;
|
||||||
out[n++] = ((*s & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
|
out[n++] = ((*s & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
|
||||||
s += 4;
|
s += 4;
|
||||||
|
} else if (!inleft) {
|
||||||
|
break;
|
||||||
} else
|
} else
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -587,7 +597,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
|
||||||
if (outlen + labellen + (e != NULL) + 4 >= outsize)
|
if (outlen + labellen + (e != NULL) + 4 >= outsize)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if ((inputlen = _utf8_to_utf32(label, labellen, input, sizeof (input) / sizeof (input[0]))) < 0)
|
if ((inputlen = _utf8_to_utf32(label, labellen, input, countof(input))) < 0)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
memcpy(out + outlen, "xn--", 4);
|
memcpy(out + outlen, "xn--", 4);
|
||||||
|
@ -609,7 +619,7 @@ static int _domain_to_punycode(const char *domain, char *out, size_t outsize)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static inline int _isspace_ascii(const char c)
|
static int _isspace_ascii(const char c)
|
||||||
{
|
{
|
||||||
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
|
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
|
||||||
}
|
}
|
||||||
|
@ -691,14 +701,14 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
|
||||||
UChar utf16_dst[128], utf16_src[128];
|
UChar utf16_dst[128], utf16_src[128];
|
||||||
int32_t utf16_src_length;
|
int32_t utf16_src_length;
|
||||||
|
|
||||||
u_strFromUTF8(utf16_src, sizeof(utf16_src)/sizeof(utf16_src[0]), &utf16_src_length, utf8, -1, &status);
|
u_strFromUTF8(utf16_src, countof(utf16_src), &utf16_src_length, utf8, -1, &status);
|
||||||
if (U_SUCCESS(status)) {
|
if (U_SUCCESS(status)) {
|
||||||
int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, sizeof(utf16_dst)/sizeof(utf16_dst[0]), &info, &status);
|
int32_t dst_length = uidna_nameToASCII((UIDNA *)idna, utf16_src, utf16_src_length, utf16_dst, countof(utf16_dst), &info, &status);
|
||||||
if (U_SUCCESS(status)) {
|
if (U_SUCCESS(status)) {
|
||||||
u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
|
u_strToUTF8(lookupname, sizeof(lookupname), NULL, utf16_dst, dst_length, &status);
|
||||||
if (U_SUCCESS(status)) {
|
if (U_SUCCESS(status)) {
|
||||||
if (ascii)
|
if (ascii)
|
||||||
*ascii = strdup(lookupname);
|
if ((*ascii = strdup(lookupname)))
|
||||||
ret = 0;
|
ret = 0;
|
||||||
} /* else
|
} /* else
|
||||||
fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
|
fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
|
||||||
|
@ -709,31 +719,20 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
|
||||||
}
|
}
|
||||||
#elif defined(WITH_LIBIDN2)
|
#elif defined(WITH_LIBIDN2)
|
||||||
int rc;
|
int rc;
|
||||||
uint8_t *lower, resbuf[256];
|
uint8_t *lower;
|
||||||
size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
|
size_t len = u8_strlen((uint8_t *)utf8) + 1;
|
||||||
|
|
||||||
/* we need a conversion to lowercase */
|
/* we need a conversion to lowercase */
|
||||||
lower = u8_tolower((uint8_t *)utf8, u8_strlen((uint8_t *)utf8), 0, UNINORM_NFKC, resbuf, &len);
|
if (!(lower = u8_tolower((uint8_t *)utf8, len, 0, UNINORM_NFKC, NULL, &len))) {
|
||||||
if (!lower) {
|
|
||||||
/* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", utf8, errno); */
|
/* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", utf8, errno); */
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* u8_tolower() does not terminate the result string */
|
|
||||||
if (lower == resbuf) {
|
|
||||||
lower[len]=0;
|
|
||||||
} else {
|
|
||||||
uint8_t *tmp = lower;
|
|
||||||
lower = (uint8_t *)strndup((char *)lower, len);
|
|
||||||
free(tmp);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK) {
|
if ((rc = idn2_lookup_u8(lower, (uint8_t **)ascii, 0)) == IDN2_OK) {
|
||||||
ret = 0;
|
ret = 0;
|
||||||
} /* else
|
} /* else
|
||||||
fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */
|
fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */
|
||||||
|
|
||||||
if (lower != resbuf)
|
|
||||||
free(lower);
|
free(lower);
|
||||||
#elif defined(WITH_LIBIDN)
|
#elif defined(WITH_LIBIDN)
|
||||||
int rc;
|
int rc;
|
||||||
|
@ -754,7 +753,7 @@ static int _psl_idna_toASCII(_psl_idna_t *idna _UNUSED, const char *utf8, char *
|
||||||
|
|
||||||
if (_domain_to_punycode(utf8, lookupname, sizeof(lookupname)) == 0) {
|
if (_domain_to_punycode(utf8, lookupname, sizeof(lookupname)) == 0) {
|
||||||
if (ascii)
|
if (ascii)
|
||||||
*ascii = strdup(lookupname);
|
if ((*ascii = strdup(lookupname)))
|
||||||
ret = 0;
|
ret = 0;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -776,7 +775,7 @@ static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_en
|
||||||
/* fprintf(stderr, "toASCII '%s' -> '%s'\n", e->label_buf, lookupname); */
|
/* fprintf(stderr, "toASCII '%s' -> '%s'\n", e->label_buf, lookupname); */
|
||||||
_suffix_init(&suffix, lookupname, strlen(lookupname));
|
_suffix_init(&suffix, lookupname, strlen(lookupname));
|
||||||
suffix.flags = e->flags;
|
suffix.flags = e->flags;
|
||||||
suffixp = _vector_get(v, _vector_add(v, &suffix));
|
if ((suffixp = _vector_get(v, _vector_add(v, &suffix))))
|
||||||
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
||||||
} /* else ignore */
|
} /* else ignore */
|
||||||
|
|
||||||
|
@ -784,8 +783,9 @@ static void _add_punycode_if_needed(_psl_idna_t *idna, _psl_vector_t *v, _psl_en
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* prototype */
|
/* prototypes */
|
||||||
int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
|
int LookupStringInFixedSet(const unsigned char* graph, size_t length, const char* key, size_t key_length);
|
||||||
|
int GetUtfMode(const unsigned char *graph, size_t length);
|
||||||
|
|
||||||
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
|
static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int type)
|
||||||
{
|
{
|
||||||
|
@ -814,6 +814,14 @@ static int _psl_is_public_suffix(const psl_ctx_t *psl, const char *domain, int t
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (psl->utf8 || psl == &_builtin_psl)
|
||||||
|
need_conversion = 0;
|
||||||
|
|
||||||
|
#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
|
||||||
|
if (psl == &_builtin_psl)
|
||||||
|
need_conversion = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
if (need_conversion) {
|
if (need_conversion) {
|
||||||
_psl_idna_t *idna = _psl_idna_open();
|
_psl_idna_t *idna = _psl_idna_open();
|
||||||
|
|
||||||
|
@ -934,8 +942,9 @@ suffix_yes:
|
||||||
*
|
*
|
||||||
* For cookie domain checking see psl_is_cookie_domain_acceptable().
|
* For cookie domain checking see psl_is_cookie_domain_acceptable().
|
||||||
*
|
*
|
||||||
* International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
|
* International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
|
||||||
* Other encodings result in unexpected behavior.
|
* Other encodings likely result in incorrect return values.
|
||||||
|
* Use helper function psl_str_to_utf8lower() for normalization of @domain.
|
||||||
*
|
*
|
||||||
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
|
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
|
||||||
* psl_builtin().
|
* psl_builtin().
|
||||||
|
@ -964,8 +973,9 @@ int psl_is_public_suffix(const psl_ctx_t *psl, const char *domain)
|
||||||
* @type specifies the PSL section where to perform the lookup. Valid values are
|
* @type specifies the PSL section where to perform the lookup. Valid values are
|
||||||
* %PSL_TYPE_PRIVATE, %PSL_TYPE_ICANN and %PSL_TYPE_ANY.
|
* %PSL_TYPE_PRIVATE, %PSL_TYPE_ICANN and %PSL_TYPE_ANY.
|
||||||
*
|
*
|
||||||
* International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
|
* International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
|
||||||
* Other encodings result in unexpected behavior.
|
* Other encodings likely result in incorrect return values.
|
||||||
|
* Use helper function psl_str_to_utf8lower() for normalization of @domain.
|
||||||
*
|
*
|
||||||
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
|
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
|
||||||
* psl_builtin().
|
* psl_builtin().
|
||||||
|
@ -990,8 +1000,9 @@ int psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type)
|
||||||
* This function finds the longest public suffix part of @domain by the means
|
* This function finds the longest public suffix part of @domain by the means
|
||||||
* of the [Mozilla Public Suffix List](https://publicsuffix.org).
|
* of the [Mozilla Public Suffix List](https://publicsuffix.org).
|
||||||
*
|
*
|
||||||
* International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
|
* International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
|
||||||
* Other encodings result in unexpected behavior.
|
* Other encodings likely result in incorrect return values.
|
||||||
|
* Use helper function psl_str_to_utf8lower() for normalization of @domain.
|
||||||
*
|
*
|
||||||
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
|
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
|
||||||
* psl_builtin().
|
* psl_builtin().
|
||||||
|
@ -1029,8 +1040,9 @@ const char *psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain)
|
||||||
* This function finds the shortest private suffix part of @domain by the means
|
* This function finds the shortest private suffix part of @domain by the means
|
||||||
* of the [Mozilla Public Suffix List](https://publicsuffix.org).
|
* of the [Mozilla Public Suffix List](https://publicsuffix.org).
|
||||||
*
|
*
|
||||||
* International @domain names have to be either in lowercase UTF-8 or in ASCII form (punycode).
|
* International @domain names have to be either in UTF-8 (lowercase + NFKC) or in ASCII/ACE format (punycode).
|
||||||
* Other encodings result in unexpected behavior.
|
* Other encodings likely result in incorrect return values.
|
||||||
|
* Use helper function psl_str_to_utf8lower() for normalization of @domain.
|
||||||
*
|
*
|
||||||
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
|
* @psl is a context returned by either psl_load_file(), psl_load_fp() or
|
||||||
* psl_builtin().
|
* psl_builtin().
|
||||||
|
@ -1070,7 +1082,7 @@ const char *psl_registrable_domain(const psl_ctx_t *psl, const char *domain)
|
||||||
* This function loads the public suffixes file named @fname.
|
* This function loads the public suffixes file named @fname.
|
||||||
* To free the allocated resources, call psl_free().
|
* To free the allocated resources, call psl_free().
|
||||||
*
|
*
|
||||||
* The suffixes are expected to be lowercase UTF-8 encoded if they are international.
|
* The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
|
||||||
*
|
*
|
||||||
* Returns: Pointer to a PSL context or %NULL on failure.
|
* Returns: Pointer to a PSL context or %NULL on failure.
|
||||||
*
|
*
|
||||||
|
@ -1099,7 +1111,7 @@ psl_ctx_t *psl_load_file(const char *fname)
|
||||||
* This function loads the public suffixes from a FILE pointer.
|
* This function loads the public suffixes from a FILE pointer.
|
||||||
* To free the allocated resources, call psl_free().
|
* To free the allocated resources, call psl_free().
|
||||||
*
|
*
|
||||||
* The suffixes are expected to be lowercase UTF-8 encoded if they are international.
|
* The suffixes are expected to be UTF-8 encoded (lowercase + NFKC) if they are international.
|
||||||
*
|
*
|
||||||
* Returns: Pointer to a PSL context or %NULL on failure.
|
* Returns: Pointer to a PSL context or %NULL on failure.
|
||||||
*
|
*
|
||||||
|
@ -1152,6 +1164,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
||||||
psl->dafsa = m;
|
psl->dafsa = m;
|
||||||
|
|
||||||
psl->dafsa_size = len;
|
psl->dafsa_size = len;
|
||||||
|
psl->utf8 = !!GetUtfMode(psl->dafsa, len);
|
||||||
|
|
||||||
return psl;
|
return psl;
|
||||||
}
|
}
|
||||||
|
@ -1163,6 +1176,7 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
||||||
* as of 19.02.2014, the list at https://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions.
|
* as of 19.02.2014, the list at https://publicsuffix.org/list/ contains ~6500 rules and 19 exceptions.
|
||||||
*/
|
*/
|
||||||
psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array);
|
psl->suffixes = _vector_alloc(8*1024, _suffix_compare_array);
|
||||||
|
psl->utf8 = 1; /* we put UTF-8 and punycode rules in the lookup vector */
|
||||||
|
|
||||||
do {
|
do {
|
||||||
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
|
while (_isspace_ascii(*linep)) linep++; /* ignore leading whitespace */
|
||||||
|
@ -1231,10 +1245,11 @@ psl_ctx_t *psl_load_fp(FILE *fp)
|
||||||
suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix));
|
suffixp = _vector_get(psl->suffixes, _vector_add(psl->suffixes, &suffix));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (suffixp) {
|
||||||
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
suffixp->label = suffixp->label_buf; /* set label to changed address */
|
||||||
|
|
||||||
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
|
_add_punycode_if_needed(idna, psl->suffixes, suffixp);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
} while ((linep = fgets(buf, sizeof(buf), fp)));
|
} while ((linep = fgets(buf, sizeof(buf), fp)));
|
||||||
|
|
||||||
_vector_sort(psl->suffixes);
|
_vector_sort(psl->suffixes);
|
||||||
|
@ -1275,8 +1290,8 @@ void psl_free(psl_ctx_t *psl)
|
||||||
* The builtin data also contains punycode entries, one for each international domain name.
|
* The builtin data also contains punycode entries, one for each international domain name.
|
||||||
*
|
*
|
||||||
* If the generation of built-in data has been disabled during compilation, %NULL will be returned.
|
* If the generation of built-in data has been disabled during compilation, %NULL will be returned.
|
||||||
* When using the builtin psl context, you can provide UTF-8 or punycode representations of domains to
|
* When using the builtin psl context, you can provide UTF-8 (lowercase + NFKC) or ASCII/ACE (punycode)
|
||||||
* functions like psl_is_public_suffix().
|
* representations of domains to functions like psl_is_public_suffix().
|
||||||
*
|
*
|
||||||
* Returns: Pointer to the built in PSL data or NULL if this data is not available.
|
* Returns: Pointer to the built in PSL data or NULL if this data is not available.
|
||||||
*
|
*
|
||||||
|
@ -1495,8 +1510,10 @@ static int _isip(const char *hostname)
|
||||||
* This helper function checks whether @cookie_domain is an acceptable cookie domain value for the request
|
* This helper function checks whether @cookie_domain is an acceptable cookie domain value for the request
|
||||||
* @hostname.
|
* @hostname.
|
||||||
*
|
*
|
||||||
* For international domain names both, @hostname and @cookie_domain, have to be either in lowercase UTF-8
|
* For international domain names, both @hostname and @cookie_domain have to be either in UTF-8 (lowercase + NFKC)
|
||||||
* or in ASCII form (punycode). Other encodings or mixing UTF-8 and punycode result in unexpected behavior.
|
* or in ASCII/ACE (punycode) format. Other encodings or mixing UTF-8 and punycode likely result in incorrect return values.
|
||||||
|
*
|
||||||
|
* Use helper function psl_str_to_utf8lower() for normalization of @hostname and @cookie_domain.
|
||||||
*
|
*
|
||||||
* Examples:
|
* Examples:
|
||||||
* 1. Cookie domain 'example.com' would be acceptable for hostname 'www.example.com',
|
* 1. Cookie domain 'example.com' would be acceptable for hostname 'www.example.com',
|
||||||
|
@ -1553,8 +1570,8 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
|
||||||
* @locale: locale of @str for to lowercase conversion, e.g. 'de' or %NULL
|
* @locale: locale of @str for to lowercase conversion, e.g. 'de' or %NULL
|
||||||
* @lower: return value containing the converted string
|
* @lower: return value containing the converted string
|
||||||
*
|
*
|
||||||
* This helper function converts a string to lowercase UTF-8 representation.
|
* This helper function converts a string to UTF-8 lowercase + NFKC representation.
|
||||||
* Lowercase UTF-8 is needed as input to the domain checking functions.
|
* Lowercase + NFKC UTF-8 is needed as input to the domain checking functions.
|
||||||
*
|
*
|
||||||
* @lower is set to %NULL on error.
|
* @lower is set to %NULL on error.
|
||||||
*
|
*
|
||||||
|
@ -1567,6 +1584,7 @@ int psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname,
|
||||||
* PSL_ERR_TO_UTF16: Failed to convert @str to unicode
|
* PSL_ERR_TO_UTF16: Failed to convert @str to unicode
|
||||||
* PSL_ERR_TO_LOWER: Failed to convert unicode to lowercase
|
* PSL_ERR_TO_LOWER: Failed to convert unicode to lowercase
|
||||||
* PSL_ERR_TO_UTF8: Failed to convert unicode to UTF-8
|
* PSL_ERR_TO_UTF8: Failed to convert unicode to UTF-8
|
||||||
|
* PSL_ERR_NO_MEM: Failed to allocate memory
|
||||||
*
|
*
|
||||||
* Since: 0.4
|
* Since: 0.4
|
||||||
*/
|
*/
|
||||||
|
@ -1585,7 +1603,8 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
|
||||||
if (lower) {
|
if (lower) {
|
||||||
char *p;
|
char *p;
|
||||||
|
|
||||||
*lower = strdup(str);
|
if (!(*lower = strdup(str)))
|
||||||
|
return PSL_ERR_NO_MEM;
|
||||||
|
|
||||||
/* convert ASCII string to lowercase */
|
/* convert ASCII string to lowercase */
|
||||||
for (p = *lower; *p; p++)
|
for (p = *lower; *p; p++)
|
||||||
|
@ -1604,10 +1623,21 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
|
||||||
char *utf8_lower;
|
char *utf8_lower;
|
||||||
UConverter *uconv;
|
UConverter *uconv;
|
||||||
|
|
||||||
|
if (str_length < 256) {
|
||||||
/* C89 allocation */
|
/* C89 allocation */
|
||||||
utf16_dst = alloca(sizeof(UChar) * (str_length * 2 + 1));
|
utf16_dst = alloca(sizeof(UChar) * (str_length * 2 + 1));
|
||||||
utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
|
utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
|
||||||
utf8_lower = alloca(str_length * 2 + 1);
|
utf8_lower = alloca(str_length * 2 + 1);
|
||||||
|
} else {
|
||||||
|
utf16_dst = malloc(sizeof(UChar) * (str_length * 2 + 1));
|
||||||
|
utf16_lower = malloc(sizeof(UChar) * (str_length * 2 + 1));
|
||||||
|
utf8_lower = malloc(str_length * 2 + 1);
|
||||||
|
|
||||||
|
if (!utf16_dst || !utf16_lower || !utf8_lower) {
|
||||||
|
ret = PSL_ERR_NO_MEM;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
uconv = ucnv_open(encoding, &status);
|
uconv = ucnv_open(encoding, &status);
|
||||||
if (U_SUCCESS(status)) {
|
if (U_SUCCESS(status)) {
|
||||||
|
@ -1619,9 +1649,16 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
|
||||||
if (U_SUCCESS(status)) {
|
if (U_SUCCESS(status)) {
|
||||||
u_strToUTF8(utf8_lower, str_length * 8 + 1, NULL, utf16_lower, utf16_lower_length, &status);
|
u_strToUTF8(utf8_lower, str_length * 8 + 1, NULL, utf16_lower, utf16_lower_length, &status);
|
||||||
if (U_SUCCESS(status)) {
|
if (U_SUCCESS(status)) {
|
||||||
if (lower)
|
|
||||||
*lower = strdup(utf8_lower);
|
|
||||||
ret = PSL_SUCCESS;
|
ret = PSL_SUCCESS;
|
||||||
|
if (lower) {
|
||||||
|
if (str_length < 256) {
|
||||||
|
if (!(*lower = strdup(utf8_lower)))
|
||||||
|
ret = PSL_ERR_NO_MEM;
|
||||||
|
} else {
|
||||||
|
*lower = utf8_lower;
|
||||||
|
utf8_lower = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
ret = PSL_ERR_TO_UTF8;
|
ret = PSL_ERR_TO_UTF8;
|
||||||
/* fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
|
/* fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
|
||||||
|
@ -1638,6 +1675,12 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
|
||||||
ret = PSL_ERR_CONVERTER;
|
ret = PSL_ERR_CONVERTER;
|
||||||
/* fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status); */
|
/* fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status); */
|
||||||
}
|
}
|
||||||
|
out:
|
||||||
|
if (str_length >= 256) {
|
||||||
|
free(utf16_dst);
|
||||||
|
free(utf16_lower);
|
||||||
|
free(utf8_lower);
|
||||||
|
}
|
||||||
} while (0);
|
} while (0);
|
||||||
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
|
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
|
||||||
do {
|
do {
|
||||||
|
@ -1655,26 +1698,32 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
|
||||||
|
|
||||||
if (cd != (iconv_t)-1) {
|
if (cd != (iconv_t)-1) {
|
||||||
char *tmp = (char *)str; /* iconv won't change where str points to, but changes tmp itself */
|
char *tmp = (char *)str; /* iconv won't change where str points to, but changes tmp itself */
|
||||||
size_t tmp_len = strlen(str);
|
size_t tmp_len = strlen(str) + 1;
|
||||||
size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
|
size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
|
||||||
char *dst = malloc(dst_len + 1), *dst_tmp = dst;
|
char *dst = malloc(dst_len + 1), *dst_tmp = dst;
|
||||||
|
|
||||||
if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1) {
|
if (!dst) {
|
||||||
uint8_t *resbuf = malloc(dst_len * 2 + 1);
|
ret = PSL_ERR_NO_MEM;
|
||||||
size_t len = dst_len * 2; /* leave space for additional \0 byte */
|
}
|
||||||
|
else if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1
|
||||||
|
&& iconv(cd, NULL, NULL, &dst_tmp, &dst_len_tmp) != (size_t)-1)
|
||||||
|
{
|
||||||
|
/* start size for u8_tolower internal memory allocation.
|
||||||
|
* u8_tolower() does not terminate the result string. we have 0 byte included in above tmp_len
|
||||||
|
* and thus in len. */
|
||||||
|
size_t len = dst_len - dst_len_tmp;
|
||||||
|
|
||||||
if ((dst = (char *)u8_tolower((uint8_t *)dst, dst_len - dst_len_tmp, 0, UNINORM_NFKC, resbuf, &len))) {
|
if ((tmp = (char *)u8_tolower((uint8_t *)dst, len, 0, UNINORM_NFKC, NULL, &len))) {
|
||||||
/* u8_tolower() does not terminate the result string */
|
ret = PSL_SUCCESS;
|
||||||
if (lower)
|
if (lower) {
|
||||||
*lower = strndup((char *)dst, len);
|
*lower = tmp;
|
||||||
|
tmp = NULL;
|
||||||
|
} else
|
||||||
|
free(tmp);
|
||||||
} else {
|
} else {
|
||||||
ret = PSL_ERR_TO_LOWER;
|
ret = PSL_ERR_TO_LOWER;
|
||||||
/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
|
/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lower)
|
|
||||||
*lower = strndup(dst, dst_len - dst_len_tmp);
|
|
||||||
ret = PSL_SUCCESS;
|
|
||||||
} else {
|
} else {
|
||||||
ret = PSL_ERR_TO_UTF8;
|
ret = PSL_ERR_TO_UTF8;
|
||||||
/* fprintf(stderr, "Failed to convert '%s' string into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
|
/* fprintf(stderr, "Failed to convert '%s' string into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
|
||||||
|
@ -1686,19 +1735,21 @@ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding _UNUSED,
|
||||||
ret = PSL_ERR_TO_UTF8;
|
ret = PSL_ERR_TO_UTF8;
|
||||||
/* fprintf(stderr, "Failed to prepare encoding '%s' into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
|
/* fprintf(stderr, "Failed to prepare encoding '%s' into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
|
||||||
}
|
}
|
||||||
} else
|
} else {
|
||||||
ret = PSL_SUCCESS;
|
|
||||||
|
|
||||||
/* convert to lowercase */
|
|
||||||
if (ret == PSL_SUCCESS) {
|
|
||||||
uint8_t *dst, resbuf[256];
|
|
||||||
size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */
|
|
||||||
|
|
||||||
/* we need a conversion to lowercase */
|
/* we need a conversion to lowercase */
|
||||||
if ((dst = u8_tolower((uint8_t *)str, u8_strlen((uint8_t *)str), 0, UNINORM_NFKC, resbuf, &len))) {
|
uint8_t *tmp;
|
||||||
/* u8_tolower() does not terminate the result string */
|
|
||||||
if (lower)
|
/* start size for u8_tolower internal memory allocation.
|
||||||
*lower = strndup((char *)dst, len);
|
* u8_tolower() does not terminate the result string, so include terminating 0 byte in len. */
|
||||||
|
size_t len = u8_strlen((uint8_t *)str) + 1;
|
||||||
|
|
||||||
|
if ((tmp = u8_tolower((uint8_t *)str, len, 0, UNINORM_NFKC, NULL, &len))) {
|
||||||
|
ret = PSL_SUCCESS;
|
||||||
|
if (lower) {
|
||||||
|
*lower = (char*)tmp;
|
||||||
|
tmp = NULL;
|
||||||
|
} else
|
||||||
|
free(tmp);
|
||||||
} else {
|
} else {
|
||||||
ret = PSL_ERR_TO_LOWER;
|
ret = PSL_ERR_TO_LOWER;
|
||||||
/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
|
/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */
|
||||||
|
|
10
src/psl2c.c
10
src/psl2c.c
|
@ -153,11 +153,6 @@ static void _print_psl_entries_dafsa(FILE *fpout, const _psl_vector_t *v)
|
||||||
if ((fp = fopen("in.tmp", "w"))) {
|
if ((fp = fopen("in.tmp", "w"))) {
|
||||||
for (it = 0; it < v->cur; it++) {
|
for (it = 0; it < v->cur; it++) {
|
||||||
_psl_entry_t *e = _vector_get(v, it);
|
_psl_entry_t *e = _vector_get(v, it);
|
||||||
unsigned char *s = (unsigned char *)e->label_buf;
|
|
||||||
|
|
||||||
/* search for non-ASCII label and skip it */
|
|
||||||
while (*s && *s < 128) s++;
|
|
||||||
if (*s) continue;
|
|
||||||
|
|
||||||
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
|
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
|
||||||
}
|
}
|
||||||
|
@ -191,11 +186,6 @@ static int _print_psl_entries_dafsa_binary(const char *fname, const _psl_vector_
|
||||||
if ((fp = fopen("in.tmp", "w"))) {
|
if ((fp = fopen("in.tmp", "w"))) {
|
||||||
for (it = 0; it < v->cur; it++) {
|
for (it = 0; it < v->cur; it++) {
|
||||||
_psl_entry_t *e = _vector_get(v, it);
|
_psl_entry_t *e = _vector_get(v, it);
|
||||||
unsigned char *s = (unsigned char *)e->label_buf;
|
|
||||||
|
|
||||||
/* search for non-ASCII label and skip it */
|
|
||||||
while (*s && *s < 128) s++;
|
|
||||||
if (*s) continue;
|
|
||||||
|
|
||||||
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
|
fprintf(fp, "%s, %X\n", e->label_buf, (int) (e->flags & 0x0F));
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,3 +23,14 @@ check_PROGRAMS = $(PSL_TESTS)
|
||||||
|
|
||||||
TESTS_ENVIRONMENT = TESTS_VALGRIND="@VALGRIND_ENVIRONMENT@"
|
TESTS_ENVIRONMENT = TESTS_VALGRIND="@VALGRIND_ENVIRONMENT@"
|
||||||
TESTS = $(PSL_TESTS)
|
TESTS = $(PSL_TESTS)
|
||||||
|
|
||||||
|
# psl.dafsa and psl_ascii.dafsa must be created before any test is executed.
|
||||||
|
# A check-local target would run in parallel with the tests, so the test suite would likely fail.
|
||||||
|
BUILT_SOURCES = psl.dafsa psl_ascii.dafsa
|
||||||
|
psl.dafsa:
|
||||||
|
$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary "$(PSL_FILE)" psl.dafsa
|
||||||
|
psl_ascii.dafsa:
|
||||||
|
$(top_srcdir)/src/psl-make-dafsa --input-format=psl --output-format=binary --encoding=ascii "$(PSL_FILE)" psl_ascii.dafsa
|
||||||
|
|
||||||
|
clean-local:
|
||||||
|
rm -f psl.dafsa psl_ascii.dafsa
|
||||||
|
|
|
@ -65,6 +65,7 @@ static void test_psl(void)
|
||||||
{ "www.his.name", "his.name", 1 },
|
{ "www.his.name", "his.name", 1 },
|
||||||
{ "www.his.name", "name", 0 },
|
{ "www.his.name", "name", 0 },
|
||||||
{ "www.example.com", "www.example.com", 1 },
|
{ "www.example.com", "www.example.com", 1 },
|
||||||
|
{ "www.example.com", "wwww.example.com", 0 },
|
||||||
{ "www.example.com", "example.com", 1 },
|
{ "www.example.com", "example.com", 1 },
|
||||||
{ "www.example.com", "com", 0 }, /* not accepted by normalization (PSL rule 'com') */
|
{ "www.example.com", "com", 0 }, /* not accepted by normalization (PSL rule 'com') */
|
||||||
{ "www.example.com", "example.org", 0 },
|
{ "www.example.com", "example.org", 0 },
|
||||||
|
@ -77,6 +78,8 @@ static void test_psl(void)
|
||||||
{ "2a00:1450:4013:c01::8b", ":1450:4013:c01::8b", 0 }, /* IPv6 address, partial match */
|
{ "2a00:1450:4013:c01::8b", ":1450:4013:c01::8b", 0 }, /* IPv6 address, partial match */
|
||||||
{ "::ffff:192.1.123.2", "::ffff:192.1.123.2", 1 }, /* IPv6 address dotted-quad, full match */
|
{ "::ffff:192.1.123.2", "::ffff:192.1.123.2", 1 }, /* IPv6 address dotted-quad, full match */
|
||||||
{ "::ffff:192.1.123.2", ".1.123.2", 0 }, /* IPv6 address dotted-quad, partial match */
|
{ "::ffff:192.1.123.2", ".1.123.2", 0 }, /* IPv6 address dotted-quad, partial match */
|
||||||
|
{ NULL, ".1.123.2", 0 },
|
||||||
|
{ "hiho", NULL, 0 },
|
||||||
};
|
};
|
||||||
unsigned it;
|
unsigned it;
|
||||||
psl_ctx_t *psl;
|
psl_ctx_t *psl;
|
||||||
|
@ -98,6 +101,9 @@ static void test_psl(void)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* do checks to cover more code paths in libpsl */
|
||||||
|
psl_is_cookie_domain_acceptable(NULL, "example.com", "example.com");
|
||||||
|
|
||||||
psl_free(psl);
|
psl_free(psl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -49,7 +49,7 @@ static int
|
||||||
struct timespec ts1, ts2;
|
struct timespec ts1, ts2;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static inline int _isspace_ascii(const char c)
|
static int _isspace_ascii(const char c)
|
||||||
{
|
{
|
||||||
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
|
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
|
||||||
}
|
}
|
||||||
|
@ -131,7 +131,7 @@ static void test_psl_entry(const psl_ctx_t *psl, const char *domain, int type)
|
||||||
static void test_psl(void)
|
static void test_psl(void)
|
||||||
{
|
{
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
psl_ctx_t *psl;
|
psl_ctx_t *psl, *psl3, *psl4;
|
||||||
const psl_ctx_t *psl2;
|
const psl_ctx_t *psl2;
|
||||||
int type = 0;
|
int type = 0;
|
||||||
char buf[256], *linep, *p;
|
char buf[256], *linep, *p;
|
||||||
|
@ -142,6 +142,16 @@ static void test_psl(void)
|
||||||
psl2 = psl_builtin();
|
psl2 = psl_builtin();
|
||||||
printf("builtin PSL has %d suffixes and %d exceptions\n", psl_suffix_count(psl2), psl_suffix_exception_count(psl2));
|
printf("builtin PSL has %d suffixes and %d exceptions\n", psl_suffix_count(psl2), psl_suffix_exception_count(psl2));
|
||||||
|
|
||||||
|
if (!(psl3 = psl_load_file("psl.dafsa"))) {
|
||||||
|
fprintf(stderr, "Failed to load 'psl.dafsa'\n");
|
||||||
|
failed++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!(psl4 = psl_load_file("psl_ascii.dafsa"))) {
|
||||||
|
fprintf(stderr, "Failed to load 'psl_ascii.dafsa'\n");
|
||||||
|
failed++;
|
||||||
|
}
|
||||||
|
|
||||||
if ((fp = fopen(PSL_FILE, "r"))) {
|
if ((fp = fopen(PSL_FILE, "r"))) {
|
||||||
#ifdef HAVE_CLOCK_GETTIME
|
#ifdef HAVE_CLOCK_GETTIME
|
||||||
clock_gettime(CLOCK_REALTIME, &ts1);
|
clock_gettime(CLOCK_REALTIME, &ts1);
|
||||||
|
@ -174,6 +184,12 @@ static void test_psl(void)
|
||||||
|
|
||||||
if (psl2)
|
if (psl2)
|
||||||
test_psl_entry(psl2, p, type);
|
test_psl_entry(psl2, p, type);
|
||||||
|
|
||||||
|
if (psl3)
|
||||||
|
test_psl_entry(psl3, p, type);
|
||||||
|
|
||||||
|
if (psl4)
|
||||||
|
test_psl_entry(psl4, p, type);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAVE_CLOCK_GETTIME
|
#ifdef HAVE_CLOCK_GETTIME
|
||||||
|
@ -185,8 +201,10 @@ static void test_psl(void)
|
||||||
failed++;
|
failed++;
|
||||||
}
|
}
|
||||||
|
|
||||||
psl_free(psl);
|
psl_free(psl4);
|
||||||
|
psl_free(psl3);
|
||||||
psl_free((psl_ctx_t *)psl2);
|
psl_free((psl_ctx_t *)psl2);
|
||||||
|
psl_free(psl);
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, const char * const *argv)
|
int main(int argc, const char * const *argv)
|
||||||
|
|
|
@ -84,6 +84,7 @@ static void test_psl(void)
|
||||||
{ "adfhoweirh", 1 }, /* unknown TLD */
|
{ "adfhoweirh", 1 }, /* unknown TLD */
|
||||||
};
|
};
|
||||||
unsigned it;
|
unsigned it;
|
||||||
|
int result, ver;
|
||||||
psl_ctx_t *psl;
|
psl_ctx_t *psl;
|
||||||
|
|
||||||
psl = psl_load_file(PSL_FILE);
|
psl = psl_load_file(PSL_FILE);
|
||||||
|
@ -92,7 +93,7 @@ static void test_psl(void)
|
||||||
|
|
||||||
for (it = 0; it < countof(test_data); it++) {
|
for (it = 0; it < countof(test_data); it++) {
|
||||||
const struct test_data *t = &test_data[it];
|
const struct test_data *t = &test_data[it];
|
||||||
int result = psl_is_public_suffix(psl, t->domain);
|
result = psl_is_public_suffix(psl, t->domain);
|
||||||
|
|
||||||
if (result == t->result) {
|
if (result == t->result) {
|
||||||
ok++;
|
ok++;
|
||||||
|
@ -102,6 +103,68 @@ static void test_psl(void)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* do some checks to cover more code paths in libpsl */
|
||||||
|
psl_is_public_suffix(NULL, "xxx");
|
||||||
|
|
||||||
|
if ((ver = psl_check_version_number(0)) == 0) {
|
||||||
|
printf("psl_check_version_number(0) is 0\n");
|
||||||
|
failed++;
|
||||||
|
} else {
|
||||||
|
if (((result = psl_check_version_number(ver)) != ver)) {
|
||||||
|
printf("psl_check_version_number(%06X) is %06X\n", ver, result);
|
||||||
|
failed++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (((result = psl_check_version_number(ver - 1)) != 0)) {
|
||||||
|
printf("psl_check_version_number(%06X) is %06X\n", ver - 1, result);
|
||||||
|
failed++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (((result = psl_check_version_number(ver + 1)) != ver)) {
|
||||||
|
printf("psl_check_version_number(%06X) is %06X\n", ver, result);
|
||||||
|
failed++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
psl_str_to_utf8lower("www.example.com", "utf-8", "en", NULL);
|
||||||
|
psl_str_to_utf8lower(NULL, "utf-8", "en", NULL);
|
||||||
|
|
||||||
|
{
|
||||||
|
char *lower = NULL;
|
||||||
|
|
||||||
|
psl_str_to_utf8lower("www.example.com", NULL, "de", &lower);
|
||||||
|
free(lower); lower = NULL;
|
||||||
|
|
||||||
|
psl_str_to_utf8lower("\374bel.de", NULL, "de", &lower);
|
||||||
|
free(lower); lower = NULL;
|
||||||
|
|
||||||
|
psl_str_to_utf8lower("\374bel.de", "iso-8859-1", NULL, &lower);
|
||||||
|
free(lower); lower = NULL;
|
||||||
|
|
||||||
|
psl_str_to_utf8lower(NULL, "utf-8", "en", &lower);
|
||||||
|
free(lower); lower = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
psl_get_version();
|
||||||
|
psl_builtin_filename();
|
||||||
|
psl_builtin_outdated();
|
||||||
|
psl_builtin_file_time();
|
||||||
|
psl_builtin_sha1sum();
|
||||||
|
psl_suffix_wildcard_count(NULL);
|
||||||
|
psl_suffix_wildcard_count(psl);
|
||||||
|
psl_suffix_wildcard_count(psl_builtin());
|
||||||
|
psl_suffix_count(NULL);
|
||||||
|
psl_suffix_exception_count(NULL);
|
||||||
|
psl_load_file(NULL);
|
||||||
|
psl_load_fp(NULL);
|
||||||
|
psl_registrable_domain(NULL, "");
|
||||||
|
psl_registrable_domain(psl, NULL);
|
||||||
|
psl_registrable_domain(psl, "www.example.com");
|
||||||
|
psl_unregistrable_domain(NULL, "");
|
||||||
|
psl_unregistrable_domain(psl, NULL);
|
||||||
|
psl_is_public_suffix2(NULL, "", PSL_TYPE_ANY);
|
||||||
|
psl_is_public_suffix2(psl, NULL, PSL_TYPE_ANY);
|
||||||
|
|
||||||
psl_free(psl);
|
psl_free(psl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -50,14 +50,28 @@ static int
|
||||||
ok,
|
ok,
|
||||||
failed;
|
failed;
|
||||||
|
|
||||||
static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result)
|
static void testx(const psl_ctx_t *psl, const char *domain, const char *encoding, const char *lang, const char *expected_result)
|
||||||
{
|
{
|
||||||
const char *result;
|
const char *result;
|
||||||
char *lower;
|
char *lower;
|
||||||
|
int rc;
|
||||||
|
|
||||||
/* our test data is fixed to UTF-8 (english), so provide it here */
|
/* just to cover special code paths for valgrind checking */
|
||||||
if (psl_str_to_utf8lower(domain, "utf-8", "en", &lower) == PSL_SUCCESS)
|
psl_str_to_utf8lower(domain, encoding, lang, NULL);
|
||||||
|
|
||||||
|
if ((rc = psl_str_to_utf8lower(domain, encoding, lang, &lower)) == PSL_SUCCESS)
|
||||||
domain = lower;
|
domain = lower;
|
||||||
|
/* non-ASCII domains fail here if no runtime IDN library is configured, so skip it */
|
||||||
|
#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
|
||||||
|
else if (domain) {
|
||||||
|
/* if we do not have runtime support, test failures have to be skipped */
|
||||||
|
failed++;
|
||||||
|
printf("psl_str_to_utf8lower(%s)=%d\n", domain ? domain : "NULL", rc);
|
||||||
|
|
||||||
|
free(lower);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
result = psl_registrable_domain(psl, domain);
|
result = psl_registrable_domain(psl, domain);
|
||||||
|
|
||||||
|
@ -72,13 +86,28 @@ static void test(const psl_ctx_t *psl, const char *domain, const char *expected_
|
||||||
free(lower);
|
free(lower);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void test(const psl_ctx_t *psl, const char *domain, const char *expected_result)
|
||||||
|
{
|
||||||
|
testx(psl, domain, "utf-8", "en", expected_result);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void test_iso(const psl_ctx_t *psl, const char *domain, const char *expected_result)
|
||||||
|
{
|
||||||
|
/* makes only sense with a runtime IDN library configured */
|
||||||
|
#if defined(WITH_LIBIDN) || defined(WITH_LIBIDN2) || defined(WITH_LIBICU)
|
||||||
|
testx(psl, domain, "iso-8859-15", "de", expected_result);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
static void test_psl(void)
|
static void test_psl(void)
|
||||||
{
|
{
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
const psl_ctx_t *psl;
|
const psl_ctx_t *psl;
|
||||||
const char *p;
|
const char *p;
|
||||||
char buf[256], domain[128], expected_regdom[128], semicolon[2];
|
char buf[256], domain[128], expected_regdom[128], semicolon[2];
|
||||||
|
char lbuf[258];
|
||||||
int er_is_null, d_is_null;
|
int er_is_null, d_is_null;
|
||||||
|
unsigned it;
|
||||||
|
|
||||||
psl = psl_builtin();
|
psl = psl_builtin();
|
||||||
|
|
||||||
|
@ -101,6 +130,22 @@ static void test_psl(void)
|
||||||
/* Norwegian with lowercase oe */
|
/* Norwegian with lowercase oe */
|
||||||
test(psl, "www.\303\270yer.no", "www.\303\270yer.no");
|
test(psl, "www.\303\270yer.no", "www.\303\270yer.no");
|
||||||
|
|
||||||
|
/* Norwegian with lowercase oe, encoded as ISO-8859-15 */
|
||||||
|
test_iso(psl, "www.\370yer.no", "www.\303\270yer.no");
|
||||||
|
|
||||||
|
/* Testing special code paths of psl_str_to_utf8lower() */
|
||||||
|
for (it = 254; it <= 257; it++) {
|
||||||
|
memset(lbuf, 'a', it);
|
||||||
|
lbuf[it] = 0;
|
||||||
|
|
||||||
|
lbuf[0] = '\370';
|
||||||
|
test_iso(psl, lbuf, NULL);
|
||||||
|
|
||||||
|
lbuf[0] = '\303';
|
||||||
|
lbuf[1] = '\270';
|
||||||
|
test(psl, lbuf, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
/* special check with NULL psl context and TLD */
|
/* special check with NULL psl context and TLD */
|
||||||
test(psl, "whoever.forgot.his.name", "whoever.forgot.his.name");
|
test(psl, "whoever.forgot.his.name", "whoever.forgot.his.name");
|
||||||
|
|
||||||
|
|
|
@ -12,3 +12,5 @@ LDADD = ../src/libpsl.la
|
||||||
#if WITH_LIBIDN
|
#if WITH_LIBIDN
|
||||||
# LDADD += -lidn
|
# LDADD += -lidn
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
dist_man_MANS = psl.1
|
||||||
|
|
Loading…
Reference in New Issue