Further work on pcre2test (can now display compiled code).
This commit is contained in:
parent
9812ca8b0a
commit
225992aa3a
17
Makefile.am
17
Makefile.am
|
@ -269,25 +269,26 @@ COMMON_SOURCES = \
|
|||
src/pcre2_error.c \
|
||||
src/pcre2_match.c \
|
||||
src/pcre2_internal.h \
|
||||
src/pcre2_intmodedep.h \
|
||||
src/pcre2_jit_compile.c \
|
||||
src/pcre2_jit_match.c \
|
||||
src/pcre2_jit_misc.c \
|
||||
src/pcre2_maketables.c \
|
||||
src/pcre2_match_data.c \
|
||||
src/pcre2_pattern_info.c \
|
||||
src/pcre2_string_utils.c \
|
||||
src/pcre2_substring.c \
|
||||
src/pcre2_tables.c \
|
||||
src/pcre2_ucd.c \
|
||||
src/pcre2_ucp.h \
|
||||
src/pcre2_version.c
|
||||
|
||||
# src/pcre2_newline.c \
|
||||
# src/pcre2_ord2utf8.c \
|
||||
# src/pcre2_refcount.c \
|
||||
# src/pcre2_string_utils.c \
|
||||
# src/pcre2_study.c \
|
||||
# src/pcre2_tables.c \
|
||||
# src/pcre2_ucd.c \
|
||||
# src/pcre2_valid_utf8.c \
|
||||
# src/pcre2_xclass.c \
|
||||
# src/ucp.h
|
||||
# src/pcre2_xclass.c
|
||||
|
||||
|
||||
if WITH_PCRE8
|
||||
|
@ -450,7 +451,7 @@ endif # WITH_GCOV
|
|||
endif # WITH_JIT
|
||||
|
||||
# Build the general pcre2test program. The file src/pcre2_printint.c is
|
||||
# #included by pcre2test as many times as needed, at different code unit
|
||||
# #included by pcre2test as many times as needed, at different code unit
|
||||
# widths.
|
||||
|
||||
bin_PROGRAMS += pcre2test
|
||||
|
@ -593,8 +594,8 @@ CLEANFILES += \
|
|||
testtemp* \
|
||||
testtry \
|
||||
testNinput
|
||||
|
||||
## ------------ End of testing -------------
|
||||
|
||||
## ------------ End of testing -------------
|
||||
|
||||
|
||||
# PCRE demonstration program. Not built automatically. The point is that the
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# Generate utt tables. Note: this script is written in Python 2 and is
|
||||
# incompatible with Python 3. However, the 2to3 conversion script has been
|
||||
# successfully tested on it.
|
||||
|
||||
# The source file pcre2_tables.c contains (amongst other things), a table that
|
||||
# is indexed by script name. In order to reduce the number of relocations when
|
||||
# loading the library, the names are held as a single large string, with
|
||||
# offsets in the table. This is tedious to maintain by hand. Therefore, this
|
||||
# script is used to generate the table. The output is sent to stdout; usually
|
||||
# that should be directed to a temporary file. Then pcre2_tables.c can be
|
||||
# edited by replacing the relevant definitions and table therein with the
|
||||
# temporary file.
|
||||
|
||||
# Modified by PH 17-March-2009 to generate the more verbose form that works
|
||||
# for UTF-support in EBCDIC as well as ASCII environments.
|
||||
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
|
||||
# Modified by PH 04-May-2010 to add new "X.." special categories.
|
||||
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
|
||||
# Modified by ChPe 30-September-2012 to add this note; no other changes were
|
||||
# necessary for Unicode 6.2.0 support.
|
||||
# Modified by PH 26-February-2013 to add the Xuc special category.
|
||||
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
|
||||
|
||||
# Unicode script names. Each one becomes a PT_SC entry in the utt table.
script_names = [
    'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese',
    'Buhid', 'Canadian_Aboriginal',
    'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret',
    'Devanagari', 'Ethiopic', 'Georgian',
    'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul',
    'Hanunoo', 'Hebrew', 'Hiragana',
    'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin',
    'Limbu', 'Linear_B', 'Malayalam',
    'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic',
    'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
    'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa',
    'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
    'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
    # New for Unicode 5.0
    'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
    # New for Unicode 5.1
    'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki',
    'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
    # New for Unicode 5.2
    'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
    'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
    'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
    'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
    # New for Unicode 6.0.0
    'Batak', 'Brahmi', 'Mandaic',
    # New for Unicode 6.1.0
    'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada',
    'Sora_Sompeng', 'Takri',
]

# Two-letter Unicode categories. Each one becomes a PT_PC entry.
category_names = [
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
    'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
]

# One-letter general categories. Each one becomes a PT_GC entry.
general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']

# First add the Unicode script and category names. The zip() results are
# wrapped in list() so that the concatenation, append() and sort() below work
# whether zip() returns a list (Python 2) or an iterator (Python 3).

utt_table = list(zip(script_names, ['PT_SC'] * len(script_names)))
utt_table += list(zip(category_names, ['PT_PC'] * len(category_names)))
utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names)))

# Now add our own specials.

utt_table.append(('Any', 'PT_ANY'))
utt_table.append(('L&', 'PT_LAMP'))
utt_table.append(('Xan', 'PT_ALNUM'))
utt_table.append(('Xps', 'PT_PXSPACE'))
utt_table.append(('Xsp', 'PT_SPACE'))
utt_table.append(('Xuc', 'PT_UCNC'))
utt_table.append(('Xwd', 'PT_WORD'))

# Sort the table.

utt_table.sort()

# We have to use STR_ macros to define the strings so that it all works in
# UTF-8 mode on EBCDIC platforms. Each output line is assembled as a list of
# words and joined with single spaces, which is exactly what the former chain
# of "print x," statements produced.

for utt in utt_table:
    words = ['#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND'))]
    for c in utt[0]:
        if c == '_':
            words.append('STR_UNDERSCORE')
        elif c == '&':
            words.append('STR_AMPERSAND')
        else:
            words.append('STR_%s' % c)
    words.append('"\\0"')
    print(' '.join(words))

# Print the actual table, using the string names

print('')
print('const char PRIV(utt_names)[] =')
last = ''
for utt in utt_table:
    # Only the final entry is followed by the terminating semicolon.
    if utt == utt_table[-1]:
        last = ';'
    print(' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
    # This was how it was done before the EBCDIC-compatible modification.
    # print ' "%s\\0"%s' % (utt[0], last)

print('\nconst ucp_type_table PRIV(utt)[] = {')
offset = 0
last = ','
for utt in utt_table:
    # The special property types carry no ucp_ value; everything else refers
    # to the ucp_ constant named after the property.
    if utt[1] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
                  'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
        value = '0'
    else:
        value = 'ucp_' + utt[0]
    # Every entry except the final one is followed by a comma.
    if utt == utt_table[-1]:
        last = ''
    print(' { %3d, %s, %s }%s' % (offset, utt[1], value, last))
    # Each name occupies its own length plus one terminating zero in the
    # single large string, so that is how far the next offset advances.
    offset += len(utt[0]) + 1
print('};')
|
|
@ -0,0 +1,305 @@
|
|||
#! /bin/sh
|
||||
|
||||
# This is a script for the use of PCRE maintainers. It configures and rebuilds
|
||||
# PCRE2 with a variety of configuration options, and in each case runs the
|
||||
# tests to ensure that all goes well. Every possible combination would take far
|
||||
# too long, so we use a representative sample. This script should be run in the
|
||||
# PCRE2 source directory.
|
||||
|
||||
# Some of the tests have to be skipped when PCRE2 is built with non-Unix
|
||||
# newline recognition. I am planning to reduce this as much as possible in due
|
||||
# course.
|
||||
|
||||
|
||||
# This is in case the caller has set aliases (as I do - PH)
|
||||
|
||||
unset cp ls mv rm

# Use -v to make the output more verbose

verbose=0
if [ "$1" = "-v" ] ; then verbose=1; fi

# This is a temporary directory for testing out-of-line builds

tmp=/tmp/pcretesting

# Don't bother with compiler optimization for most tests; it just slows down
# compilation a lot (and running the tests themselves is quick). However, a
# few specific tests turn optimization on, because it can provoke some compiler
# warnings.

CFLAGS="-g -O0"
CXXFLAGS="$CFLAGS"
ISGCC="no"

# If the compiler is gcc, add a lot of warning switches. The version banner is
# captured in a shell variable rather than in a scratch file, so that nothing
# is left behind in the source directory.

ccversion=`cc --version 2>/dev/null`
if [ $? -eq 0 ] && echo "$ccversion" | grep GCC >/dev/null; then
  ISGCC="yes"
  CFLAGS="$CFLAGS -Wall"
  CFLAGS="$CFLAGS -Wno-overlength-strings"
  CFLAGS="$CFLAGS -Wpointer-arith"
  CFLAGS="$CFLAGS -Wwrite-strings"
  CFLAGS="$CFLAGS -Wundef -Wshadow"
  CFLAGS="$CFLAGS -Wmissing-field-initializers"
  CFLAGS="$CFLAGS -Wunused-parameter"
  CFLAGS="$CFLAGS -Wextra -Wformat"
  CFLAGS="$CFLAGS -Wbad-function-cast"
  CFLAGS="$CFLAGS -Wmissing-declarations"
  CFLAGS="$CFLAGS -Wnested-externs"
  CFLAGS="$CFLAGS -pedantic"
  CFLAGS="$CFLAGS -Wuninitialized"
  CFLAGS="$CFLAGS -Wmissing-prototypes"
  CFLAGS="$CFLAGS -Wstrict-prototypes"
fi
|
||||
|
||||
|
||||
# This function runs a single test with the set of configuration options that
|
||||
# are in $opts. The source directory must be set in srcdir.
|
||||
|
||||
# Configure, build and test PCRE2 once, using the configure options in $opts.
# Reads: $opts, $srcdir, $CFLAGS, $CXXFLAGS, $verbose, $testtotal, $valgrind,
# $cvalgrind and $withvalgrind. Increments $testcount. Any failure prints the
# captured output and exits the whole script with status 1.
#
# The function is defined with the portable "name()" form because this script
# runs under /bin/sh, which is not guaranteed to accept the "function" keyword.

runtest()
{
  rm -f *_unittest
  testcount=`expr $testcount + 1`

  if [ "$opts" = "" ] ; then
    echo "[$testcount/$testtotal] Configuring with: default settings"
  else
    echo "[$testcount/$testtotal] Configuring with:"
    echo " $opts"
  fi

  CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" \
    $srcdir/configure $opts >/dev/null 2>teststderr

  if [ $? -ne 0 ]; then
    echo " "
    echo "**** Error while configuring ****"
    cat teststderr
    exit 1
  fi

  # Anything at all on stderr (that is, any compiler warning) counts as a
  # failure, not just a non-zero exit status from make.

  echo "Making"
  make -j >/dev/null 2>teststderr
  if [ $? -ne 0 -o -s teststderr ]; then
    echo " "
    echo "**** Errors or warnings while making ****"
    echo " "
    cat teststderr
    exit 1
  fi

  if [ $verbose -eq 1 ]; then
    ./pcre2test -C
  fi

  # Ask the freshly built pcre2test how it was configured. The newline setting
  # is read from its output; JIT and UTF support are read from its exit status
  # (the test below treats a value greater than zero as "enabled").

  nl=`./pcre2test -C newline`
  ./pcre2test -C jit >/dev/null
  jit=$?
  ./pcre2test -C utf >/dev/null
  utf=$?

  if [ "$nl" = "LF" -o "$nl" = "ANY" ]; then
    echo "Running C library tests $withvalgrind"
    $srcdir/RunTest $valgrind >teststdout
    if [ $? -ne 0 ]; then
      echo " "
      echo "**** Test failed ****"
      cat teststdout
      exit 1
    fi
  else
    echo "Skipping C library tests: newline is $nl"
  fi

  if [ "$nl" = "LF" ]; then
    echo "Running pcre2grep tests $withvalgrind"
    $srcdir/RunGrepTest $valgrind >teststdout 2>teststderr
    if [ $? -ne 0 ]; then
      echo " "
      echo "**** Test failed ****"
      cat teststderr
      cat teststdout
      exit 1
    fi
  else
    echo "Skipping pcre2grep tests: newline is $nl"
  fi

  # NOTE(review): pcre2_jit_test is looked up in $srcdir, whereas pcre2test is
  # run from the build directory - confirm this is right for out-of-tree
  # builds that enable JIT.

  if [ "$jit" -gt 0 -a $utf -gt 0 ]; then
    echo "Running JIT regression tests $withvalgrind"
    $cvalgrind $srcdir/pcre2_jit_test >teststdout 2>teststderr
    if [ $? -ne 0 ]; then
      echo " "
      echo "**** Test failed ****"
      cat teststderr
      cat teststdout
      exit 1
    fi
  else
    echo "Skipping JIT regression tests: JIT or UTF not enabled"
  fi

# if [ "$nl" = "LF" -o "$nl" = "ANY" ]; then
# if [ -f pcrecpp_unittest ] ; then
# for utest in pcrecpp_unittest \
# pcre_scanner_unittest \
# pcre_stringpiece_unittest
# do
# echo "Running $utest $withvalgrind"
# $cvalgrind $utest >teststdout
# if [ $? -ne 0 ]; then
# echo " "
# echo "**** Test failed ****"
# cat teststdout
# exit 1
# fi
# done
# else
# echo "Skipping C++ tests: pcrecpp_unittest does not exist"
# fi
# else
# echo "Skipping C++ tests: newline is $nl"
# fi
}
|
||||
|
||||
|
||||
# Update the total count whenever a new test is added; it is used to show
|
||||
# progress as each test is run.
|
||||
|
||||
# Base count: 33 general configurations + 4 valgrind configurations + 1
# out-of-tree build. The optional gcc -O2 run adds one more below.

testtotal=38
testcount=0

# This set of tests builds PCRE and runs the tests with a variety of configure
# options, in the current (source) directory. The empty configuration builds
# with all the default settings. As well as testing that these options work, we
# use --disable-shared or --disable-static after the default test (which builds
# both) to save a bit of time by building only one version of the library for
# the subsequent tests.

valgrind=
cvalgrind=
withvalgrind=
srcdir=.
export srcdir

# If gcc is in use, run a maximally configured test with -O2, because that can
# throw up warnings that are not detected with -O0. The original flags are
# saved and restored around the run so that later tests go back to -O0.

if [ "$ISGCC" = "yes" ]; then
  testtotal=`expr $testtotal + 1`
  echo "Maximally configured test with -O2"
  SAVECFLAGS="$CFLAGS"
  CFLAGS="$CFLAGS -O2"
  opts="--disable-shared --enable-utf --enable-jit --enable-pcre16 --enable-pcre32"
  runtest
  CFLAGS="$SAVECFLAGS"
fi

echo "General tests in the current directory"
for opts in \
  "" \
  "--enable-utf --disable-static" \
  "--disable-stack-for-recursion --disable-shared" \
  "--enable-utf --disable-shared" \
  "--enable-utf --disable-stack-for-recursion --disable-shared" \
  "--enable-utf --with-link-size=3 --disable-shared" \
  "--enable-rebuild-chartables --disable-shared" \
  "--enable-newline-is-any --disable-shared" \
  "--enable-newline-is-cr --disable-shared" \
  "--enable-newline-is-crlf --disable-shared" \
  "--enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" \
  "--enable-utf --enable-newline-is-any --disable-stack-for-recursion --disable-static" \
  "--enable-jit --disable-shared" \
  "--enable-jit --enable-utf --disable-shared" \
  "--enable-jit --enable-utf --with-link-size=3 --disable-shared" \
  "--enable-pcre16" \
  "--enable-pcre16 --enable-jit --enable-utf --disable-shared" \
  "--enable-pcre16 --enable-jit --disable-pcre8 --disable-shared" \
  "--enable-pcre16 --enable-jit --disable-pcre8 --enable-utf --disable-shared" \
  "--enable-pcre16 --disable-stack-for-recursion --disable-shared" \
  "--enable-pcre16 --enable-utf --disable-stack-for-recursion --disable-shared" \
  "--enable-pcre16 --enable-jit --enable-utf --with-link-size=3 --disable-shared" \
  "--enable-pcre16 --enable-jit --enable-utf --with-link-size=4 --disable-shared" \
  "--enable-pcre32" \
  "--enable-pcre32 --enable-jit --enable-utf --disable-shared" \
  "--enable-pcre32 --enable-jit --disable-pcre8 --disable-shared" \
  "--enable-pcre32 --enable-jit --disable-pcre8 --enable-utf --disable-shared" \
  "--enable-pcre32 --disable-stack-for-recursion --disable-shared" \
  "--enable-pcre32 --enable-utf --disable-stack-for-recursion --disable-shared" \
  "--enable-pcre32 --enable-jit --enable-utf --with-link-size=4 --disable-shared" \
  "--enable-pcre32 --enable-pcre16 --disable-shared" \
  "--enable-pcre32 --enable-pcre16 --disable-pcre8 --disable-shared" \
  "--enable-pcre32 --enable-pcre16 --disable-pcre8 --enable-jit --enable-utf --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared"
do
  runtest
done

# Now re-run some of the tests under valgrind.

echo "Tests in the current directory using valgrind"
valgrind=valgrind
cvalgrind="valgrind -q --smc-check=all"
withvalgrind="with valgrind"

# The last option set is a single string. Two adjacent quoted strings would be
# two separate loop items, giving one run without --disable-shared and one run
# with nothing but --disable-shared.

for opts in \
  "--enable-utf --disable-stack-for-recursion --disable-shared" \
  "--enable-utf --with-link-size=3 --disable-shared" \
  "--enable-jit --enable-utf --disable-shared" \
  "--enable-pcre16 --enable-pcre32 --enable-jit --enable-utf --disable-shared"
do
  opts="--enable-valgrind $opts"
  runtest
done

valgrind=
cvalgrind=
withvalgrind=

# Clean up the distribution and then do at least one build and test in a
# directory other than the source directory. It doesn't work unless the
# source directory is cleaned up first.

if [ -f Makefile ]; then
  echo "Running 'make distclean'"
  make distclean >/dev/null 2>&1
  if [ $? -ne 0 ]; then
    echo "** 'make distclean' failed"
    exit 1
  fi
fi

echo "Tests in the $tmp directory"
srcdir=`pwd`
export srcdir

if [ ! -e $tmp ]; then
  mkdir $tmp
fi

if [ ! -d $tmp ]; then
  echo "** Failed to create $tmp or it is not a directory"
  exit 1
fi

cd $tmp
if [ $? -ne 0 ]; then
  echo "** Failed to cd to $tmp"
  exit 1
fi

for opts in \
  "--enable-utf --disable-shared"
do
  runtest
done

echo "Removing $tmp"

rm -rf $tmp

echo "All done"
|
||||
|
||||
# End
|
|
@ -0,0 +1,505 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
# Multistage table builder
|
||||
# (c) Peter Kankowski, 2008
|
||||
|
||||
##############################################################################
|
||||
# This script was submitted to the PCRE project by Peter Kankowski as part of
|
||||
# the upgrading of Unicode property support. The new code speeds up property
|
||||
# matching many times. The script is for the use of PCRE maintainers, to
|
||||
# generate the pcre2_ucd.c file that contains a digested form of the Unicode
|
||||
# data tables.
|
||||
#
|
||||
# The script should be run in the maint subdirectory, using the command
|
||||
#
|
||||
# [python2] ./MultiStage2.py >../src/pcre2_ucd.c
|
||||
#
|
||||
# It requires four Unicode data tables, DerivedGeneralCategory.txt,
|
||||
# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
|
||||
# Unicode.tables subdirectory. The first of these is found in the "extracted"
|
||||
# subdirectory of the Unicode database (UCD) on the Unicode web site; the
|
||||
# second is in the "auxiliary" subdirectory; the other two are directly in the
|
||||
# UCD directory.
|
||||
#
|
||||
# Minor modifications made to this script:
|
||||
# Added #! line at start
|
||||
# Removed tabs
|
||||
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
|
||||
# Consequent code tidy
|
||||
# Adjusted data file names to take from the Unicode.tables directory
|
||||
# Adjusted global table names by prefixing _pcre_.
|
||||
# Commented out stuff relating to the casefolding table, which isn't used;
|
||||
# removed completely in 2012.
|
||||
# Corrected size calculation
|
||||
# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
|
||||
# Update for PCRE2: name changes and SUPPORT_UCP is abolished.
|
||||
#
|
||||
# Major modifications made to this script:
|
||||
# Added code to add a grapheme break property field to records.
|
||||
#
|
||||
# Added code to search for sets of more than two characters that must match
|
||||
# each other caselessly. A new table is output containing these sets, and
|
||||
# offsets into the table are added to the main output records. This new
|
||||
# code scans CaseFolding.txt instead of UnicodeData.txt.
|
||||
#
|
||||
# The main tables generated by this script are used by macros defined in
|
||||
# pcre2_internal.h. They look up Unicode character properties using short
|
||||
# sequences of code that contains no branches, which makes for greater speed.
|
||||
#
|
||||
# Conceptually, there is a table of records (of type ucd_record), containing a
|
||||
# script number, character type, grapheme break type, offset to caseless
|
||||
# matching set, and offset to the character's other case for every character.
|
||||
# However, a real table covering all Unicode characters would be far too big.
|
||||
# It can be efficiently compressed by observing that many characters have the
|
||||
# same record, and many blocks of characters (taking 128 characters in a block)
|
||||
# have the same set of records as other blocks. This leads to a 2-stage lookup
|
||||
# process.
|
||||
#
|
||||
# This script constructs four tables. The ucd_caseless_sets table contains
|
||||
# lists of characters that all match each other caselessly. Each list is
|
||||
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
|
||||
# any valid character. The first list is empty; this is used for characters
|
||||
# that are not part of any list.
|
||||
#
|
||||
# The ucd_records table contains one instance of every unique record that is
|
||||
# required. The ucd_stage1 table is indexed by a character's block number, and
|
||||
# yields what is in effect a "virtual" block number. The ucd_stage2 table is a
|
||||
# table of "virtual" blocks; each block is indexed by the offset of a character
|
||||
# within its own block, and the result is the offset of the required record.
|
||||
#
|
||||
# Example: lowercase "a" (U+0061) is in block 0
|
||||
# lookup 0 in stage1 table yields 0
|
||||
# lookup 97 in the first table in stage2 yields 16
|
||||
# record 17 is { 33, 5, 11, 0, -32 }
|
||||
# 33 = ucp_Latin => Latin script
|
||||
# 5 = ucp_Ll => Lower case letter
|
||||
# 11 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => not part of a caseless set
|
||||
# -32 => Other case is U+0041
|
||||
#
|
||||
# Almost all lowercase latin characters resolve to the same record. One or two
|
||||
# are different because they are part of a multi-character caseless set (for
|
||||
# example, k, K and the Kelvin symbol are such a set).
|
||||
#
|
||||
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
||||
# lookup 96 in stage1 table yields 88
|
||||
# lookup 66 in the 88th table in stage2 yields 467
|
||||
# record 470 is { 26, 7, 11, 0, 0 }
|
||||
# 26 = ucp_Hiragana => Hiragana script
|
||||
# 7 = ucp_Lo => Other letter
|
||||
# 11 = ucp_gbOther => Grapheme break property "Other"
|
||||
# 0 => not part of a caseless set
|
||||
# 0 => No other case
|
||||
#
|
||||
# In these examples, no other blocks resolve to the same "virtual" block, as it
|
||||
# happens, but plenty of other blocks do share "virtual" blocks.
|
||||
#
|
||||
# There is a fourth table, maintained by hand, which translates from the
|
||||
# individual character types such as ucp_Cc to the general types like ucp_C.
|
||||
#
|
||||
# Philip Hazel, 03 July 2008
|
||||
#
|
||||
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
|
||||
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
|
||||
# July-2012: Updated list of scripts for Unicode 6.1.0
|
||||
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
|
||||
# field in the record to hold the value. Luckily, the
|
||||
# structure had a hole in it, so the resulting table is
|
||||
# not much bigger than before.
|
||||
# 18-September-2012: Added code for multiple caseless sets. This uses the
|
||||
# final hole in the structure.
|
||||
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
|
||||
# 13-May-2014: Updated for PCRE2
|
||||
##############################################################################
|
||||
|
||||
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
|
||||
MAX_UNICODE = 0x110000
|
||||
NOTACHAR = 0xffffffff
|
||||
|
||||
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
|
||||
def make_get_names(enum):
    """Return a parser that maps a data-file line to an index into *enum*.

    The returned callable takes the list of fields of one line (as split on
    ';') and returns the position of the second field within *enum*. It
    raises ValueError if that field is not one of the names in *enum*.
    """
    return lambda chardata: enum.index(chardata[1])
|
||||
|
||||
# Parse a line of CaseFolding.txt
|
||||
def get_other_case(chardata):
    """Return the other-case offset described by one CaseFolding.txt line.

    *chardata* is the list of fields of the line: code point, status and
    mapping, with the numbers in hexadecimal. For status 'C' or 'S' the result
    is the mapping minus the code point; every other status yields 0.
    """
    if chardata[1] == 'C' or chardata[1] == 'S':
        return int(chardata[2], 16) - int(chardata[0], 16)
    return 0
|
||||
|
||||
|
||||
# Read the whole table in memory
|
||||
def read_table(file_name, get_value, default_value):
    """Read a Unicode data file into a table indexed by code point.

    file_name     -- path of the data file to read
    get_value     -- callable that receives the list of stripped,
                     ';'-separated fields of one line and returns the value
                     to store for the code point(s) on that line
    default_value -- value for code points that the file does not set

    Returns a list of MAX_UNICODE entries. Raises ValueError if the first
    field of a data line is neither a hexadecimal code point nor a
    "first..last" range.
    """
    table = [default_value] * MAX_UNICODE
    infile = open(file_name, 'r')
    # try/finally rather than "with", so that the file is closed even when a
    # line fails to parse while the script still runs on old Python versions.
    try:
        for line in infile:
            # Drop comments; lines with no ';' left carry no data.
            line = re.sub(r'#.*', '', line)
            chardata = [field.strip() for field in line.split(';')]
            if len(chardata) <= 1:
                continue
            value = get_value(chardata)
            m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
            if m is None:
                raise ValueError("%s: cannot parse code point range %r"
                                 % (file_name, chardata[0]))
            char = int(m.group(1), 16)
            if m.group(3) is None:
                last = char
            else:
                last = int(m.group(3), 16)
            for i in range(char, last + 1):
                # It is important not to overwrite a previously set
                # value because in the CaseFolding file there are lines
                # to be ignored (returning the default value of 0)
                # which often come after a line which has already set
                # data.
                if table[i] == default_value:
                    table[i] = value
    finally:
        infile.close()
    return table
|
||||
|
||||
# Get the smallest possible C language type for the values
|
||||
def get_type_size(table):
    """Return (C type name, size in bytes) of the smallest type for *table*.

    The unsigned types are tried first, in increasing size, then the signed
    ones; the first whose range covers both min(table) and max(table) wins.
    Raises OverflowError if no candidate type can hold every value.
    """
    # NOTE(review): the signed 16/32-bit names are still the PCRE1-style
    # pcre_int16/pcre_int32 while the unsigned ones are <stdint.h> names -
    # confirm that these are what the PCRE2 sources expect.
    type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4),
                 ("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
    limits = [(0, 255), (0, 65535), (0, 4294967295),
              (-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
    minval = min(table)
    maxval = max(table)
    for num, (minlimit, maxlimit) in enumerate(limits):
        if minlimit <= minval and maxval <= maxlimit:
            return type_size[num]
    # Only reached when none of the candidate ranges fits.
    raise OverflowError("Too large to fit into C types")
|
||||
|
||||
def get_tables_size(*tables):
    """Return the total size in bytes of all *tables*.

    Each table is costed as its length times the size of the smallest C type
    that get_type_size() selects for its values.
    """
    total_size = 0
    for table in tables:
        # Only the element size matters here; the type name is not needed.
        size = get_type_size(table)[1]
        total_size += size * len(table)
    return total_size
|
||||
|
||||
# Compress the table into the two stages
|
||||
def compress_table(table, block_size):
    """Split *table* into blocks of *block_size* and share identical blocks.

    Returns (stage1, stage2). stage2 is the concatenation of the distinct
    blocks in order of first appearance; stage1 holds, for each block of the
    input, the number of the stage2 block that contains its values.
    """
    blocks = {}  # Dictionary for finding identical blocks
    stage1 = []  # Stage 1 table contains block numbers (indices into stage 2 table)
    stage2 = []  # Stage 2 table contains the blocks with property values
    table = tuple(table)  # tuple slices are hashable, so usable as dict keys
    for i in range(0, len(table), block_size):
        block = table[i:i+block_size]
        start = blocks.get(block)
        if start is None:
            # Allocate a new block. Floor division keeps the block number an
            # integer on every Python version.
            start = len(stage2) // block_size
            stage2 += block
            blocks[block] = start
        stage1.append(start)

    return stage1, stage2
|
||||
|
||||
# Print a table
|
||||
def print_table(table, table_name, block_size = None):
    """Print *table* on stdout as a C array definition named *table_name*.

    The element type is the smallest one chosen by get_type_size(). Without
    *block_size*, the values are printed 16 per line, each line followed by a
    comment with the code point it starts at. With *block_size*, the values
    are printed block by block, each block preceded by a comment giving its
    number and followed by an empty line.
    """
    ctype, size = get_type_size(table)
    ELEMS_PER_LINE = 16

    s = "const %s %s[] = { /* %d bytes" % (ctype, table_name, size * len(table))
    if block_size:
        s += ", block = %d" % block_size
    print(s + " */")
    table = tuple(table)
    if block_size is None:
        fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
        # Number of code points that each table entry stands for. Floor
        # division keeps it an integer, as the %04X conversion requires.
        mult = MAX_UNICODE // len(table)
        for i in range(0, len(table), ELEMS_PER_LINE):
            print(fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,)))
    else:
        if block_size > ELEMS_PER_LINE:
            el = ELEMS_PER_LINE
        else:
            el = block_size
        # One format line holds "el" values; a block larger than a line is
        # printed as several such lines.
        fmt = "%3d," * el + "\n"
        if block_size > ELEMS_PER_LINE:
            fmt = fmt * (block_size // ELEMS_PER_LINE)
        for i in range(0, len(table), block_size):
            print(("/* block %d */\n" + fmt) % ((i // block_size,) + table[i:i+block_size]))
    print("};\n")
|
||||
|
||||
# Extract the unique combinations of properties into records
|
||||
def combine_tables(*tables):
    """Merge parallel property tables into unique records.

    The tables are read in lockstep; the tuple of values found at one position
    is a record. Returns (index, records): records maps each distinct record
    to a number assigned in order of first appearance, and index lists the
    record number for every position.
    """
    records = {}
    index = []
    for t in zip(*tables):
        i = records.get(t)
        if i is None:
            # First occurrence: the next free record number is the current
            # count of records.
            i = records[t] = len(records)
        index.append(i)
    return index, records
|
||||
|
||||
def get_record_size_struct(records):
    """Work out the size and C layout of one record.

    *records* is an indexable sequence of equal-length tuples. Each field
    position gets the smallest C type that holds every value found at that
    position. Returns (size, structure): size is the record size in bytes
    including alignment padding, and structure is a C comment containing the
    matching "typedef struct" text.
    """
    size = 0
    structure = '/* When recompiling tables with a new Unicode version, please check the\n' + \
        'types in this structure definition from pcre2_internal.h (the actual\n' + \
        'field names will be different):\n\ntypedef struct {\n'
    for i in range(len(records[0])):
        # A real list, because get_type_size() scans the values twice.
        record_slice = [record[i] for record in records]
        slice_type, slice_size = get_type_size(record_slice)
        # add padding: round up to the nearest multiple of slice_size
        size = (size + slice_size - 1) & -slice_size
        size += slice_size
        structure += '%s property_%d;\n' % (slice_type, i)

    # round up to the first item of the next structure in array
    record_slice = [record[0] for record in records]
    slice_type, slice_size = get_type_size(record_slice)
    size = (size + slice_size - 1) & -slice_size

    structure += '} ucd_record;\n*/\n\n'
    return size, structure
|
||||
|
||||
def test_record_size():
    """Self-test for get_record_size_struct().

    Each case pairs a list of sample records with the record size in bytes
    that is expected for it; a mismatch raises AssertionError.
    """
    tests = [
        ( [(3,), (6,), (6,), (1,)], 1 ),
        ( [(300,), (600,), (600,), (100,)], 2 ),
        ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ),
        ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ),
        ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ),
        ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ),
        ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ),
        ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ),
    ]
    for test in tests:
        size, struct = get_record_size_struct(test[0])
        assert size == test[1]
        #print struct
|
||||
|
||||
def print_records(records, record_size):
    """Write the C definition of the ucd_records[] array to standard output.

    records     -- dict mapping each tuple of property values to its record
                   number (the second value returned by combine_tables)
    record_size -- size in bytes of one record; used only in the header
                   comment

    Records are emitted in ascending order of record number, and each row is
    followed by a comment giving its position in the emitted array.

    Only single-argument print(...) calls are used, so the output is the same
    under Python 2 (where this is a print statement with a parenthesised
    expression) and Python 3.
    """
    print('const ucd_record PRIV(ucd_records)[] = { ' +
          '/* %d bytes, record size %d */' % (len(records) * record_size, record_size))

    # Order by record number so the array position matches the index table.
    ordered = sorted(records.items(), key=lambda item: item[1])
    for i, (fields, _) in enumerate(ordered):
        row_format = ' {' + '%6d, ' * len(fields) + '}, /* %3d */'
        # Apply the format *before* printing: "print (fmt) % args" only works
        # as a Python 2 print statement.
        print(row_format % (fields + (i,)))
    print('};\n')
|
||||
|
||||
# Names of the Unicode scripts. The order matters: positions in this list are
# used as numeric values (see the .index() calls where the tables are read),
# so new names must only ever be added at the end.
script_names = [
    'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese',
    'Buhid', 'Canadian_Aboriginal', 'Cherokee', 'Common', 'Coptic',
    'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian',
    'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han',
    'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', 'Inherited', 'Kannada',
    'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B',
    'Malayalam', 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham',
    'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', 'Shavian',
    'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le',
    'Tamil', 'Telugu', 'Thaana', 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic',
    'Yi',
    # New for Unicode 5.0
    'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
    # New for Unicode 5.1
    'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki',
    'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
    # New for Unicode 5.2
    'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
    'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
    'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
    'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
    # New for Unicode 6.0.0
    'Batak', 'Brahmi', 'Mandaic',
    # New for Unicode 6.1.0
    'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada',
    'Sora_Sompeng', 'Takri',
]

# Two-letter Unicode general category names; as above, the list position is
# what identifies a category.
category_names = [
    'Cc', 'Cf', 'Cn', 'Co', 'Cs',
    'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn',
    'Nd', 'Nl', 'No',
    'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
    'Sc', 'Sk', 'Sm', 'So',
    'Zl', 'Zp', 'Zs',
]

# Grapheme break property names, again identified by list position.
break_property_names = [
    'CR', 'LF', 'Control', 'Extend', 'Prepend', 'SpacingMark',
    'L', 'V', 'T', 'LV', 'LVT',
    'Regional_Indicator', 'Other',
]
|
||||
|
||||
# Run the record-size self-check before doing any real work.
test_record_size()

# Read the four Unicode data files, one table per property. The final
# argument of each call is the default value given to read_table().
# NOTE(review): presumably that default is used for code points the data
# file does not mention - confirm against read_table.
script = read_table(
    'Unicode.tables/Scripts.txt',
    make_get_names(script_names),
    script_names.index('Common'))
category = read_table(
    'Unicode.tables/DerivedGeneralCategory.txt',
    make_get_names(category_names),
    category_names.index('Cn'))
break_props = read_table(
    'Unicode.tables/GraphemeBreakProperty.txt',
    make_get_names(break_property_names),
    break_property_names.index('Other'))
other_case = read_table(
    'Unicode.tables/CaseFolding.txt',
    get_other_case,
    0)
|
||||
|
||||
|
||||
# This block of code was added by PH in September 2012. I am not a Python
|
||||
# programmer, so the style is probably dreadful, but it does the job. It scans
|
||||
# the other_case table to find sets of more than two characters that must all
|
||||
# match each other caselessly. Later in this script a table of these sets is
|
||||
# written out. However, we have to do this work here in order to compute the
|
||||
# offsets in the table that are inserted into the main table.
|
||||
|
||||
# The CaseFolding.txt file lists pairs, but the common logic for reading data
|
||||
# sets only one value, so first we go through the table and set "return"
|
||||
# offsets for those that are not already set.
|
||||
|
||||
# Visit every code point. The bound is MAX_UNICODE, the same value that sizes
# caseless_offsets below; a bound of 0x10ffff would leave out the last code
# point, U+10FFFF.
for c in range(MAX_UNICODE):
    if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
        other_case[c + other_case[c]] = -other_case[c]

# Now scan again and create equivalence sets.

sets = []

for c in range(MAX_UNICODE):
    o = c + other_case[c]

    # Nothing to do when this character's other case points straight back
    # here (this also covers characters that have no other case at all).
    if other_case[o] == -other_case[c]:
        continue

    # Otherwise we now have three characters that are case-equivalent.
    t = o + other_case[o]
    triple = [c, o, t]

    # Scan the existing sets to see if any of the three characters are
    # already part of a set. If so, unite the existing set with the new one.
    appended = False
    for s in sets:
        if any(x in triple for x in s):
            # Test each of the three characters separately, so that every
            # missing one is added even when another is already present.
            for y in triple:
                if y not in s:
                    s.append(y)
            appended = True

    # If we have not added to an existing set, create a new one.
    if not appended:
        sets.append(triple)

# End of loop looking for caseless sets.

# Now scan the sets and set appropriate offsets for the characters. The sets
# are later written out as one flat table that starts with a NOTACHAR entry
# and has a NOTACHAR terminator after each set; hence the first set starts at
# offset 1 and each set occupies len(s) + 1 entries.

caseless_offsets = [0] * MAX_UNICODE

offset = 1
for s in sets:
    for x in s:
        caseless_offsets[x] = offset
    offset += len(s) + 1
|
||||
|
||||
# End of block of code for creating offsets for caseless matching sets.
|
||||
|
||||
|
||||
# Combine the tables
|
||||
|
||||
# Fold the five per-code-point tables into one index table plus the set of
# distinct property records that it refers to.
table, records = combine_tables(
    script, category, break_props, caseless_offsets, other_case)

record_size, record_struct = get_record_size_struct(records.keys())

# Find the optimum block size for the two-stage table: try each power of two
# from 32 to 512 and keep the split that gives the smallest total size
# (records plus both stages).
min_size = sys.maxint
for block_size in (32, 64, 128, 256, 512):
    stage1, stage2 = compress_table(table, block_size)
    size = len(records) * record_size + get_tables_size(stage1, stage2)
    #print "/* block size %5d => %5d bytes */" % (block_size, size)
    if size >= min_size:
        continue
    min_size = size
    min_stage1, min_stage2 = stage1, stage2
    min_block_size = block_size
|
||||
|
||||
# Write the generated C source to standard output.
#
# Every print(...) call below has exactly one string argument, which behaves
# the same as the Python 2 print statement, so the output is unchanged. An
# empty string stands for a blank output line.

# Fixed preamble, the dummy tables used without UTF support, and the comment
# that documents the record layout.
for line in (
        "/* This module is generated by the maint/MultiStage2.py script.",
        "Do not modify it by hand. Instead modify the script and run it",
        "to regenerate this code.",
        "",
        "As well as being part of the PCRE2 library, this module is #included",
        "by the pcre2test program, which redefines the PRIV macro to change",
        "table names from _pcre2_xxx to xxxx, thereby avoiding name clashes",
        "with the library. At present, just one of these tables is actually",
        "needed. */",
        "",
        "#ifndef PCRE2_INCLUDED",
        "",
        "#ifdef HAVE_CONFIG_H",
        '#include "config.h"',
        "#endif",
        "",
        '#include "pcre2_internal.h"',
        "",
        "#endif /* PCRE2_INCLUDED */",
        "",
        "/* Unicode character database. */",
        "/* This file was autogenerated by the MultiStage2.py script. */",
        "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size),
        "",
        "/* The tables herein are needed only when UCP support is built,",
        "and in PCRE2 that happens automatically with UTF support.",
        "This module should not be referenced otherwise, so",
        "it should not matter whether it is compiled or not. However",
        "a comment was received about space saving - maybe the guy linked",
        "all the modules rather than using a library - so we include a",
        "condition to cut out the tables when not needed. But don't leave",
        "a totally empty module because some compilers barf at that.",
        "Instead, just supply small dummy tables. */",
        "",
        "#ifndef SUPPORT_UTF",
        "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};",
        "const uint8_t PRIV(ucd_stage1)[] = {0};",
        "const uint16_t PRIV(ucd_stage2)[] = {0};",
        "const uint32_t PRIV(ucd_caseless_sets)[] = {0};",
        "#else",
        "",
        record_struct,
        ):
    print(line)

# --- Added by PH: output the table of caseless character sets ---

# One output line per set: its members in ascending order, then a NOTACHAR
# terminator. Joining the items with a single space reproduces what a chain
# of Python 2 "print item," statements writes.
print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
print(" NOTACHAR,")
for s in sets:
    print(' '.join([' 0x%04x,' % x for x in sorted(s)] + [' NOTACHAR,']))
print('};')
print("")

# ------

print("/* When #included in pcre2test, we don't need this large table. */")
print("")
print("#ifndef PCRE2_INCLUDED")
print("")
print_records(records, record_size)
print_table(min_stage1, 'PRIV(ucd_stage1)')
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
print("#endif")
# The conditionals close innermost first: "#ifndef PCRE2_INCLUDED" was opened
# inside the "#else" branch of "#ifndef SUPPORT_UTF", so it is closed first.
# The comments on the two #endif lines are labelled accordingly.
print("#endif /* PCRE2_INCLUDED */")
print("")
print("#endif /* SUPPORT_UTF */")
|
||||
|
||||
"""
|
||||
|
||||
# Three-stage tables:
|
||||
|
||||
# Find the optimum block size for 3-stage table
|
||||
min_size = sys.maxint
|
||||
for stage3_block in [2 ** i for i in range(2,6)]:
|
||||
stage_i, stage3 = compress_table(table, stage3_block)
|
||||
for stage2_block in [2 ** i for i in range(5,10)]:
|
||||
size = len(records) * 4
|
||||
stage1, stage2 = compress_table(stage_i, stage2_block)
|
||||
size += get_tables_size(stage1, stage2, stage3)
|
||||
# print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
|
||||
if size < min_size:
|
||||
min_size = size
|
||||
min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
|
||||
min_stage2_block, min_stage3_block = stage2_block, stage3_block
|
||||
|
||||
print "/* Total size: %d bytes" % min_size */
|
||||
print_records(records)
|
||||
print_table(min_stage1, 'ucd_stage1')
|
||||
print_table(min_stage2, 'ucd_stage2', min_stage2_block)
|
||||
print_table(min_stage3, 'ucd_stage3', min_stage3_block)
|
||||
|
||||
"""
|
|
@ -0,0 +1,324 @@
|
|||
MAINTENANCE README FOR PCRE2
|
||||
============================
|
||||
|
||||
The files in the "maint" directory of the PCRE2 source contain data, scripts,
|
||||
and programs that are used for the maintenance of PCRE2, but which do not form
|
||||
part of the PCRE2 distribution tarballs. This document describes these files
|
||||
and also contains some notes for maintainers. Its contents are:
|
||||
|
||||
Files in the maint directory
|
||||
Updating to a new Unicode release
|
||||
Preparing for a PCRE2 release
|
||||
Making a PCRE2 release
|
||||
Future ideas (wish list)
|
||||
|
||||
|
||||
Files in the maint directory
|
||||
============================
|
||||
|
||||
GenerateUtt.py A Python script to generate part of the pcre2_tables.c file
|
||||
that contains Unicode script names in a long string with
|
||||
offsets, which is tedious to maintain by hand.
|
||||
|
||||
ManyConfigTests A shell script that runs "configure, make, test" a number of
|
||||
times with different configuration settings.
|
||||
|
||||
MultiStage2.py A Python script that generates the file pcre2_ucd.c from four
|
||||
Unicode data tables, which are themselves downloaded from the
|
||||
Unicode web site. Run this script in the "maint" directory.
|
||||
The generated file contains the tables for a 2-stage lookup
|
||||
of Unicode properties.
|
||||
|
||||
pcre2_chartables.c.non-standard
|
||||
This is a set of character tables that came from a Windows
|
||||
system. It has characters greater than 128 that are set as
|
||||
spaces, amongst other things. I kept it so that it can be
|
||||
used for testing from time to time.
|
||||
|
||||
README This file.
|
||||
|
||||
Unicode.tables The files in this directory (CaseFolding.txt,
|
||||
DerivedGeneralCategory.txt, GraphemeBreakProperty.txt,
|
||||
Scripts.txt and UnicodeData.txt) were downloaded from the
|
||||
Unicode web site. They contain information about Unicode
|
||||
characters and scripts.
|
||||
|
||||
ucptest.c A short C program for testing the Unicode property macros
|
||||
that do lookups in the pcre2_ucd.c data, mainly useful after
|
||||
rebuilding the Unicode property table. Compile and run this in
|
||||
the "maint" directory (see comments at its head).
|
||||
|
||||
ucptestdata A directory containing two files, testinput1 and testoutput1,
|
||||
to use in conjunction with the ucptest program.
|
||||
|
||||
utf8.c A short, freestanding C program for converting a Unicode code
|
||||
point into a sequence of bytes in the UTF-8 encoding, and vice
|
||||
versa. If its argument is a hex number such as 0x1234, it
|
||||
outputs a list of the equivalent UTF-8 bytes. If its argument
|
||||
is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it
|
||||
treats them as a UTF-8 character and outputs the equivalent
|
||||
code point in hex.
|
||||
|
||||
|
||||
Updating to a new Unicode release
|
||||
=================================
|
||||
|
||||
When there is a new release of Unicode, the files in Unicode.tables must be
|
||||
refreshed from the web site. If the new version of Unicode adds new character
|
||||
scripts, the source file pcre2_ucp.h and both the MultiStage2.py and the
|
||||
GenerateUtt.py scripts must be edited to add the new names. Then MultiStage2.py
|
||||
can be run to generate a new version of pcre2_ucd.c, and GenerateUtt.py can be
|
||||
run to generate the tricky tables for inclusion in pcre2_tables.c.
|
||||
|
||||
If MultiStage2.py gives the error "ValueError: list.index(x): x not in list",
|
||||
the cause is usually a missing (or misspelt) name in the list of scripts. I
|
||||
couldn't find a straightforward list of scripts on the Unicode site, but
|
||||
there's a useful Wikipedia page that lists them, and notes the Unicode version
|
||||
in which they were introduced:
|
||||
|
||||
http://en.wikipedia.org/wiki/Unicode_scripts#Table_of_Unicode_scripts
|
||||
|
||||
The ucptest program can be compiled and used to check that the new tables in
|
||||
pcre2_ucd.c work properly, using the data files in ucptestdata to check a
|
||||
number of test characters. The source file ucptest.c must be updated whenever
|
||||
new Unicode script names are added.
|
||||
|
||||
Note also that both the pcre2syntax.3 and pcre2pattern.3 man pages contain
|
||||
lists of Unicode script names.
|
||||
|
||||
|
||||
Preparing for a PCRE2 release
=============================
|
||||
|
||||
This section contains a checklist of things that I consult before building a
|
||||
distribution for a new release.
|
||||
|
||||
. Ensure that the version number and version date are correct in configure.ac.
|
||||
|
||||
. Update the library version numbers in configure.ac according to the rules
|
||||
given below.
|
||||
|
||||
. If new build options have been added, ensure that they are added to the CMake
|
||||
files as well as to the autoconf files. The relevant files are CMakeLists.txt
|
||||
and config-cmake.h.in. After making a release tarball, test it out with CMake
|
||||
if there have been changes here.
|
||||
|
||||
. Run ./autogen.sh to ensure everything is up-to-date.
|
||||
|
||||
. Compile and test with many different config options, and combinations of
|
||||
options. Also, test with valgrind by running "RunTest valgrind" and
|
||||
"RunGrepTest valgrind" (which takes quite a long time). The script
|
||||
maint/ManyConfigTests now encapsulates this testing. It runs tests with
|
||||
different configurations, and it also runs some of them with valgrind, all of
|
||||
which can take quite some time.
|
||||
|
||||
. Run perltest.pl on the test data for tests 1, 4, and 6. The output
|
||||
should match the PCRE2 test output, apart from the version identification at
|
||||
the start of each test. The other tests are not Perl-compatible (they use
|
||||
various PCRE2-specific features or options).
|
||||
|
||||
. It is possible to test with the emulated memmove() function by undefining
|
||||
HAVE_MEMMOVE and HAVE_BCOPY in config.h, though I do not do this often. You
|
||||
may see a number of "pcre2_memmove defined but not used" warnings for the
|
||||
modules in which there is no call to memmove(). These can be ignored.
|
||||
|
||||
. Documentation: check AUTHORS, ChangeLog (check version and date), LICENCE,
|
||||
NEWS (check version and date), NON-AUTOTOOLS-BUILD, and README. Many of these
|
||||
won't need changing, but over the long term things do change.
|
||||
|
||||
. I used to test new releases myself on a number of different operating
|
||||
systems, using different compilers as well. For example, on Solaris it is
|
||||
helpful to test using Sun's cc compiler as a change from gcc. Adding
|
||||
-xarch=v9 to the cc options does a 64-bit test, but it also needs -S 64 for
|
||||
pcretest to increase the stack size for test 2. Since I retired I can no
|
||||
longer do this, but instead I rely on putting out release candidates for
|
||||
folks on the pcre-dev list to test.
|
||||
|
||||
|
||||
Updating version info for libtool
|
||||
=================================
|
||||
|
||||
This set of rules for updating library version information came from a web page
|
||||
whose URL I have forgotten. The version information consists of three parts:
|
||||
(current, revision, age).
|
||||
|
||||
1. Start with version information of 0:0:0 for each libtool library.
|
||||
|
||||
2. Update the version information only immediately before a public release of
|
||||
your software. More frequent updates are unnecessary, and only guarantee
|
||||
that the current interface number gets larger faster.
|
||||
|
||||
3. If the library source code has changed at all since the last update, then
|
||||
increment revision; c:r:a becomes c:r+1:a.
|
||||
|
||||
4. If any interfaces have been added, removed, or changed since the last
|
||||
update, increment current, and set revision to 0.
|
||||
|
||||
5. If any interfaces have been added since the last public release, then
|
||||
increment age.
|
||||
|
||||
6. If any interfaces have been removed or changed since the last public
|
||||
release, then set age to 0.
|
||||
|
||||
The following explanation may help in understanding the above rules a bit
|
||||
better. Consider that there are three possible kinds of reaction from users to
|
||||
changes in a shared library:
|
||||
|
||||
1. Programs using the previous version may use the new version as a drop-in
|
||||
replacement, and programs using the new version can also work with the
|
||||
previous one. In other words, no recompiling nor relinking is needed. In
|
||||
this case, increment revision only, don't touch current or age.
|
||||
|
||||
2. Programs using the previous version may use the new version as a drop-in
|
||||
replacement, but programs using the new version may use APIs not present in
|
||||
the previous one. In other words, a program linking against the new version
|
||||
may fail if linked against the old version at run time. In this case, set
|
||||
revision to 0, increment current and age.
|
||||
|
||||
3. Programs may need to be changed, recompiled, relinked in order to use the
|
||||
new version. Increment current, set revision and age to 0.
|
||||
|
||||
|
||||
Making a PCRE2 release
======================
|
||||
|
||||
Run PrepareRelease and commit the files that it changes (by removing trailing
|
||||
spaces). The first thing this script does is to run CheckMan on the man pages;
|
||||
if it finds any markup errors, it reports them and then aborts.
|
||||
|
||||
Once PrepareRelease has run clean, run "make distcheck" to create the tarballs
|
||||
and the zipball. Double-check with "svn status", then create an SVN tagged
|
||||
copy:
|
||||
|
||||
svn copy svn://vcs.exim.org/pcre2/code/trunk \
|
||||
svn://vcs.exim.org/pcre2/code/tags/pcre-8.xx
|
||||
|
||||
Don't forget to update Freecode (fka Freshmeat) when the new release is out,
|
||||
and to tell webmaster@pcre.org and the mailing list. Also, update the list of
|
||||
version numbers in Bugzilla (edit products).
|
||||
|
||||
|
||||
Future ideas (wish list)
|
||||
========================
|
||||
|
||||
This section records a list of ideas so that they do not get forgotten. They
|
||||
vary enormously in their usefulness and potential for implementation. Some are
|
||||
very sensible; some are rather wacky. Some have been on this list for years;
|
||||
others are relatively new.
|
||||
|
||||
. Optimization
|
||||
|
||||
There are always ideas for new optimizations so as to speed up pattern
|
||||
matching. Most of them try to save work by recognizing a non-match without
|
||||
having to scan all the possibilities. These are some that I've recorded:
|
||||
|
||||
* /((A{0,5}){0,5}){0,5}(something complex)/ on a non-matching string is very
|
||||
slow, though Perl is fast. Can we speed up somehow? Convert to {0,125}?
|
||||
OTOH, this is pathological - the user could easily fix it.
|
||||
|
||||
* Turn ={4} into ==== ? (for speed). I once did an experiment, and it seems
|
||||
to have little effect, and maybe makes things worse.
|
||||
|
||||
* "Ends with literal string" - note that a single character doesn't gain much
|
||||
over the existing "required byte" (reqbyte) feature that just remembers one
|
||||
data unit.
|
||||
|
||||
* Remember an initial string rather than just 1 code unit?
|
||||
|
||||
* A required code unit from alternatives - not just the last unit, but an
|
||||
earlier one if common to all alternatives.
|
||||
|
||||
o Friedl contains other ideas.
|
||||
|
||||
* The code does not set initial code unit flags for Unicode property types
|
||||
such as \p; I don't know how much benefit there would be for, for example,
|
||||
setting the bits for 0-9 and all values >= xC0 (in 8-bit mode) when a
|
||||
pattern starts with \p{N}.
|
||||
|
||||
* There is scope for more "auto-possessifying" in connection with \p and \P.
|
||||
|
||||
. If Perl gets to a consistent state over the settings of capturing sub-
|
||||
patterns inside repeats, see if we can match it. One example of the
|
||||
difference is the matching of /(main(O)?)+/ against mainOmain, where PCRE
|
||||
leaves $2 set. In Perl, it's unset. Changing this in PCRE will be very hard
|
||||
because I think it needs much more state to be remembered.
|
||||
|
||||
. Perl 6 will be a revolution. Is it a revolution too far for PCRE?
|
||||
|
||||
. Allow errorptr and erroroffset to be NULL. I don't like this idea.
|
||||
|
||||
. Line endings:
|
||||
|
||||
* Option to use NUL as a line terminator in subject strings. This could now
|
||||
be done relatively easily since the extension to support LF, CR, and CRLF.
|
||||
If it is done, a suitable option for pcregrep is also required.
|
||||
|
||||
. Catch SIGSEGV for stack overflows?
|
||||
|
||||
. A feature to suspend a match via a callout was once requested.
|
||||
|
||||
. Option to convert results into character offsets and character lengths.
|
||||
|
||||
. Option for pcregrep to scan only the start of a file. I am not keen - this is
|
||||
the job of "head".
|
||||
|
||||
. A (non-Unix) user wanted pcregrep options to (a) list a file name just once,
|
||||
preceded by a blank line, instead of adding it to every matched line, and (b)
|
||||
support --outputfile=name.
|
||||
|
||||
. Consider making UTF and UCP the default for PCRE n.0 for some n > 8.
|
||||
|
||||
. Define a union for the results from pcre2_pattern_info().
|
||||
|
||||
. Provide a "random access to the subject" facility so that the way in which it
|
||||
is stored is independent of PCRE. For efficiency, it probably isn't possible
|
||||
to switch this dynamically. It would have to be specified when PCRE was
|
||||
compiled. PCRE would then call a function every time it wanted a character.
|
||||
|
||||
. Wild thought: the ability to compile from PCRE's internal byte code to a real
|
||||
FSM and a very fast (third) matcher to process the result. There would be
|
||||
even more restrictions than for pcre_dfa_exec(), however. This is not easy.
|
||||
This is probably obsolete now that we have the JIT support.
|
||||
|
||||
. Should pcretest have some private locale data, to avoid relying on the
|
||||
available locales for the test data, since different OS have different ideas?
|
||||
This won't be as thorough a test, but perhaps that doesn't really matter.
|
||||
|
||||
. pcregrep: add -rs for a sorted recurse? Having to store file names and sort
|
||||
them will of course slow it down.
|
||||
|
||||
. Someone suggested --disable-callout to save code space when callouts are
|
||||
never wanted. This seems rather marginal.
|
||||
|
||||
. A user suggested a parameter to limit the length of string matched, for
|
||||
example if the parameter is N, the current match should fail if the matched
|
||||
substring exceeds N. This could apply to both match functions. The value
|
||||
could be a new field in the extra block.
|
||||
|
||||
. Callouts with arguments: (?Cn:ARG) for instance.
|
||||
|
||||
. Write a function that generates random matching strings for a compiled regex.
|
||||
|
||||
. Pcregrep: an option to specify the output line separator, either as a string
|
||||
or select from a fixed list. This is not dead easy, because at the moment it
|
||||
outputs whatever is in the input file.
|
||||
|
||||
. Improve the code for duplicate checking in pcre_dfa_exec(). An incomplete,
|
||||
non-thread-safe patch showed that this can help performance for patterns
|
||||
where there are many alternatives. However, a simple thread-safe
|
||||
implementation that I tried made things worse in many simple cases, so this
|
||||
is not an obviously good thing.
|
||||
|
||||
. PCRE cannot at present distinguish between subpatterns with different names,
|
||||
but the same number (created by the use of ?|). In order to do so, a way of
|
||||
remembering *which* subpattern numbered n matched is needed. Bugzilla #760.
|
||||
Now that (*MARK) has been implemented, it can perhaps be used as a way round
|
||||
this problem.
|
||||
|
||||
. Instead of having #ifdef HAVE_CONFIG_H in each module, put #include
|
||||
"something" and then the #ifdef appears only in one place, in "something".
|
||||
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 13 May 2014
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,138 @@
|
|||
const unsigned char _pcre_default_tables[] = {
|
||||
0,1,2,3,4,5,6,7,
|
||||
8,9,10,11,12,13,14,15,
|
||||
16,17,18,19,20,21,22,23,
|
||||
24,25,26,27,28,29,30,31,
|
||||
32,33,34,35,36,37,38,39,
|
||||
40,41,42,43,44,45,46,47,
|
||||
48,49,50,51,52,53,54,55,
|
||||
56,57,58,59,60,61,62,63,
|
||||
64,97,98,99,100,101,102,103,
|
||||
104,105,106,107,108,109,110,111,
|
||||
112,113,114,115,116,117,118,119,
|
||||
120,121,122,91,92,93,94,95,
|
||||
96,97,98,99,100,101,102,103,
|
||||
104,105,106,107,108,109,110,111,
|
||||
112,113,114,115,116,117,118,119,
|
||||
120,121,122,123,124,125,126,127,
|
||||
128,129,130,131,132,133,134,135,
|
||||
136,137,138,139,140,141,142,143,
|
||||
144,145,146,147,148,149,150,151,
|
||||
152,153,154,155,156,157,158,159,
|
||||
160,161,162,163,164,165,166,167,
|
||||
168,169,170,171,172,173,174,175,
|
||||
176,177,178,179,180,181,182,183,
|
||||
184,185,186,187,188,189,190,191,
|
||||
224,225,226,227,228,229,230,231,
|
||||
232,233,234,235,236,237,238,239,
|
||||
240,241,242,243,244,245,246,215,
|
||||
248,249,250,251,252,253,254,223,
|
||||
224,225,226,227,228,229,230,231,
|
||||
232,233,234,235,236,237,238,239,
|
||||
240,241,242,243,244,245,246,247,
|
||||
248,249,250,251,252,253,254,255,
|
||||
0,1,2,3,4,5,6,7,
|
||||
8,9,10,11,12,13,14,15,
|
||||
16,17,18,19,20,21,22,23,
|
||||
24,25,26,27,28,29,30,31,
|
||||
32,33,34,35,36,37,38,39,
|
||||
40,41,42,43,44,45,46,47,
|
||||
48,49,50,51,52,53,54,55,
|
||||
56,57,58,59,60,61,62,63,
|
||||
64,97,98,99,100,101,102,103,
|
||||
104,105,106,107,108,109,110,111,
|
||||
112,113,114,115,116,117,118,119,
|
||||
120,121,122,91,92,93,94,95,
|
||||
96,65,66,67,68,69,70,71,
|
||||
72,73,74,75,76,77,78,79,
|
||||
80,81,82,83,84,85,86,87,
|
||||
88,89,90,123,124,125,126,127,
|
||||
128,129,130,131,132,133,134,135,
|
||||
136,137,138,139,140,141,142,143,
|
||||
144,145,146,147,148,149,150,151,
|
||||
152,153,154,155,156,157,158,159,
|
||||
160,161,162,163,164,165,166,167,
|
||||
168,169,170,171,172,173,174,175,
|
||||
176,177,178,179,180,181,182,183,
|
||||
184,185,186,187,188,189,190,191,
|
||||
224,225,226,227,228,229,230,231,
|
||||
232,233,234,235,236,237,238,239,
|
||||
240,241,242,243,244,245,246,215,
|
||||
248,249,250,251,252,253,254,223,
|
||||
192,193,194,195,196,197,198,199,
|
||||
200,201,202,203,204,205,206,207,
|
||||
208,209,210,211,212,213,214,247,
|
||||
216,217,218,219,220,221,222,255,
|
||||
0,62,0,0,1,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
32,0,0,0,1,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,255,3,
|
||||
126,0,0,0,126,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,255,3,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,12,2,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
254,255,255,7,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
255,255,127,127,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,254,255,255,7,
|
||||
0,0,0,0,0,4,32,4,
|
||||
0,0,0,128,255,255,127,255,
|
||||
0,0,0,0,0,0,255,3,
|
||||
254,255,255,135,254,255,255,7,
|
||||
0,0,0,0,0,4,44,6,
|
||||
255,255,127,255,255,255,127,255,
|
||||
0,0,0,0,254,255,255,255,
|
||||
255,255,255,255,255,255,255,127,
|
||||
0,0,0,0,254,255,255,255,
|
||||
255,255,255,255,255,255,255,255,
|
||||
0,2,0,0,255,255,255,255,
|
||||
255,255,255,255,255,255,255,127,
|
||||
0,0,0,0,255,255,255,255,
|
||||
255,255,255,255,255,255,255,255,
|
||||
0,0,0,0,254,255,0,252,
|
||||
1,0,0,248,1,0,0,120,
|
||||
0,0,0,0,254,255,255,255,
|
||||
0,0,128,0,0,0,128,0,
|
||||
255,255,255,255,0,0,0,0,
|
||||
0,0,0,0,0,0,0,128,
|
||||
255,255,255,255,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
128,0,0,0,0,0,0,0,
|
||||
0,1,1,0,1,1,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,128,0,0,0,
|
||||
128,128,128,128,0,0,128,0,
|
||||
28,28,28,28,28,28,28,28,
|
||||
28,28,0,0,0,0,0,128,
|
||||
0,26,26,26,26,26,26,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,128,128,0,128,16,
|
||||
0,26,26,26,26,26,26,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,128,128,0,0,0,
|
||||
0,0,0,0,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,
|
||||
1,0,0,0,0,0,0,0,
|
||||
0,0,18,0,0,0,0,0,
|
||||
0,0,20,20,0,18,0,0,
|
||||
0,20,18,0,0,0,0,0,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,0,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,18,
|
||||
18,18,18,18,18,18,18,0,
|
||||
18,18,18,18,18,18,18,18
|
||||
};
|
|
@ -0,0 +1,297 @@
|
|||
/***************************************************
|
||||
* A program for testing the Unicode property table *
|
||||
***************************************************/
|
||||
|
||||
/* Copyright (c) University of Cambridge 2008 */
|
||||
|
||||
/* Compile thus:
|
||||
gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
|
||||
ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
|
||||
*/
|
||||
|
||||
/* The program expects to read commands on stdin, and it writes output
|
||||
to stdout. There is only one command, "findprop", followed by a list of Unicode
|
||||
code points as hex numbers (without any prefixes). The output is one line per
|
||||
character, giving its Unicode properties followed by its other case if there is
|
||||
one. */
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "../src/config.h"
|
||||
#endif
|
||||
|
||||
#ifndef SUPPORT_UTF
|
||||
#define SUPPORT_UTF
|
||||
#endif
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "../src/pcre2_internal.h"
|
||||
#include "../src/pcre2_ucp.h"
|
||||
|
||||
|
||||
|
||||
/* -------------------------------------------------------------------*/
|
||||
|
||||
#define CS (char *)
|
||||
#define CCS (const char *)
|
||||
#define CSS (char **)
|
||||
#define US (unsigned char *)
|
||||
#define CUS (const unsigned char *)
|
||||
#define USS (unsigned char **)
|
||||
|
||||
/* -------------------------------------------------------------------*/
|
||||
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print Unicode property info for a char *
|
||||
*************************************************/
|
||||
|
||||
static void
|
||||
print_prop(int c)
|
||||
{
|
||||
int type = UCD_CATEGORY(c);
|
||||
int fulltype = UCD_CHARTYPE(c);
|
||||
int script = UCD_SCRIPT(c);
|
||||
int gbprop = UCD_GRAPHBREAK(c);
|
||||
int othercase = UCD_OTHERCASE(c);
|
||||
int caseset = UCD_CASESET(c);
|
||||
|
||||
unsigned char *fulltypename = US"??";
|
||||
unsigned char *typename = US"??";
|
||||
unsigned char *scriptname = US"??";
|
||||
unsigned char *graphbreak = US"??";
|
||||
|
||||
switch (type)
|
||||
{
|
||||
case ucp_C: typename = US"Control"; break;
|
||||
case ucp_L: typename = US"Letter"; break;
|
||||
case ucp_M: typename = US"Mark"; break;
|
||||
case ucp_N: typename = US"Number"; break;
|
||||
case ucp_P: typename = US"Punctuation"; break;
|
||||
case ucp_S: typename = US"Symbol"; break;
|
||||
case ucp_Z: typename = US"Separator"; break;
|
||||
}
|
||||
|
||||
switch (fulltype)
|
||||
{
|
||||
case ucp_Cc: fulltypename = US"Control"; break;
|
||||
case ucp_Cf: fulltypename = US"Format"; break;
|
||||
case ucp_Cn: fulltypename = US"Unassigned"; break;
|
||||
case ucp_Co: fulltypename = US"Private use"; break;
|
||||
case ucp_Cs: fulltypename = US"Surrogate"; break;
|
||||
case ucp_Ll: fulltypename = US"Lower case letter"; break;
|
||||
case ucp_Lm: fulltypename = US"Modifier letter"; break;
|
||||
case ucp_Lo: fulltypename = US"Other letter"; break;
|
||||
case ucp_Lt: fulltypename = US"Title case letter"; break;
|
||||
case ucp_Lu: fulltypename = US"Upper case letter"; break;
|
||||
case ucp_Mc: fulltypename = US"Spacing mark"; break;
|
||||
case ucp_Me: fulltypename = US"Enclosing mark"; break;
|
||||
case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
|
||||
case ucp_Nd: fulltypename = US"Decimal number"; break;
|
||||
case ucp_Nl: fulltypename = US"Letter number"; break;
|
||||
case ucp_No: fulltypename = US"Other number"; break;
|
||||
case ucp_Pc: fulltypename = US"Connector punctuation"; break;
|
||||
case ucp_Pd: fulltypename = US"Dash punctuation"; break;
|
||||
case ucp_Pe: fulltypename = US"Close punctuation"; break;
|
||||
case ucp_Pf: fulltypename = US"Final punctuation"; break;
|
||||
case ucp_Pi: fulltypename = US"Initial punctuation"; break;
|
||||
case ucp_Po: fulltypename = US"Other punctuation"; break;
|
||||
case ucp_Ps: fulltypename = US"Open punctuation"; break;
|
||||
case ucp_Sc: fulltypename = US"Currency symbol"; break;
|
||||
case ucp_Sk: fulltypename = US"Modifier symbol"; break;
|
||||
case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
|
||||
case ucp_So: fulltypename = US"Other symbol"; break;
|
||||
case ucp_Zl: fulltypename = US"Line separator"; break;
|
||||
case ucp_Zp: fulltypename = US"Paragraph separator"; break;
|
||||
case ucp_Zs: fulltypename = US"Space separator"; break;
|
||||
}
|
||||
|
||||
switch(gbprop)
|
||||
{
|
||||
case ucp_gbCR: graphbreak = US"CR"; break;
|
||||
case ucp_gbLF: graphbreak = US"LF"; break;
|
||||
case ucp_gbControl: graphbreak = US"Control"; break;
|
||||
case ucp_gbExtend: graphbreak = US"Extend"; break;
|
||||
case ucp_gbPrepend: graphbreak = US"Prepend"; break;
|
||||
case ucp_gbSpacingMark: graphbreak = US"SpacingMark"; break;
|
||||
case ucp_gbL: graphbreak = US"Hangul syllable type L"; break;
|
||||
case ucp_gbV: graphbreak = US"Hangul syllable type V"; break;
|
||||
case ucp_gbT: graphbreak = US"Hangul syllable type T"; break;
|
||||
case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break;
|
||||
case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break;
|
||||
case ucp_gbOther: graphbreak = US"Other"; break;
|
||||
}
|
||||
|
||||
switch(script)
|
||||
{
|
||||
case ucp_Arabic: scriptname = US"Arabic"; break;
|
||||
case ucp_Armenian: scriptname = US"Armenian"; break;
|
||||
case ucp_Balinese: scriptname = US"Balinese"; break;
|
||||
case ucp_Bengali: scriptname = US"Bengali"; break;
|
||||
case ucp_Bopomofo: scriptname = US"Bopomofo"; break;
|
||||
case ucp_Braille: scriptname = US"Braille"; break;
|
||||
case ucp_Buginese: scriptname = US"Buginese"; break;
|
||||
case ucp_Buhid: scriptname = US"Buhid"; break;
|
||||
case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
|
||||
case ucp_Cherokee: scriptname = US"Cherokee"; break;
|
||||
case ucp_Common: scriptname = US"Common"; break;
|
||||
case ucp_Coptic: scriptname = US"Coptic"; break;
|
||||
case ucp_Cuneiform: scriptname = US"Cuneiform"; break;
|
||||
case ucp_Cypriot: scriptname = US"Cypriot"; break;
|
||||
case ucp_Cyrillic: scriptname = US"Cyrillic"; break;
|
||||
case ucp_Deseret: scriptname = US"Deseret"; break;
|
||||
case ucp_Devanagari: scriptname = US"Devanagari"; break;
|
||||
case ucp_Ethiopic: scriptname = US"Ethiopic"; break;
|
||||
case ucp_Georgian: scriptname = US"Georgian"; break;
|
||||
case ucp_Glagolitic: scriptname = US"Glagolitic"; break;
|
||||
case ucp_Gothic: scriptname = US"Gothic"; break;
|
||||
case ucp_Greek: scriptname = US"Greek"; break;
|
||||
case ucp_Gujarati: scriptname = US"Gujarati"; break;
|
||||
case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break;
|
||||
case ucp_Han: scriptname = US"Han"; break;
|
||||
case ucp_Hangul: scriptname = US"Hangul"; break;
|
||||
case ucp_Hanunoo: scriptname = US"Hanunoo"; break;
|
||||
case ucp_Hebrew: scriptname = US"Hebrew"; break;
|
||||
case ucp_Hiragana: scriptname = US"Hiragana"; break;
|
||||
case ucp_Inherited: scriptname = US"Inherited"; break;
|
||||
case ucp_Kannada: scriptname = US"Kannada"; break;
|
||||
case ucp_Katakana: scriptname = US"Katakana"; break;
|
||||
case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break;
|
||||
case ucp_Khmer: scriptname = US"Khmer"; break;
|
||||
case ucp_Lao: scriptname = US"Lao"; break;
|
||||
case ucp_Latin: scriptname = US"Latin"; break;
|
||||
case ucp_Limbu: scriptname = US"Limbu"; break;
|
||||
case ucp_Linear_B: scriptname = US"Linear_B"; break;
|
||||
case ucp_Malayalam: scriptname = US"Malayalam"; break;
|
||||
case ucp_Mongolian: scriptname = US"Mongolian"; break;
|
||||
case ucp_Myanmar: scriptname = US"Myanmar"; break;
|
||||
case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
|
||||
case ucp_Nko: scriptname = US"Nko"; break;
|
||||
case ucp_Ogham: scriptname = US"Ogham"; break;
|
||||
case ucp_Old_Italic: scriptname = US"Old_Italic"; break;
|
||||
case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
|
||||
case ucp_Oriya: scriptname = US"Oriya"; break;
|
||||
case ucp_Osmanya: scriptname = US"Osmanya"; break;
|
||||
case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break;
|
||||
case ucp_Phoenician: scriptname = US"Phoenician"; break;
|
||||
case ucp_Runic: scriptname = US"Runic"; break;
|
||||
case ucp_Shavian: scriptname = US"Shavian"; break;
|
||||
case ucp_Sinhala: scriptname = US"Sinhala"; break;
|
||||
case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
|
||||
case ucp_Syriac: scriptname = US"Syriac"; break;
|
||||
case ucp_Tagalog: scriptname = US"Tagalog"; break;
|
||||
case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break;
|
||||
case ucp_Tai_Le: scriptname = US"Tai_Le"; break;
|
||||
case ucp_Tamil: scriptname = US"Tamil"; break;
|
||||
case ucp_Telugu: scriptname = US"Telugu"; break;
|
||||
case ucp_Thaana: scriptname = US"Thaana"; break;
|
||||
case ucp_Thai: scriptname = US"Thai"; break;
|
||||
case ucp_Tibetan: scriptname = US"Tibetan"; break;
|
||||
case ucp_Tifinagh: scriptname = US"Tifinagh"; break;
|
||||
case ucp_Ugaritic: scriptname = US"Ugaritic"; break;
|
||||
case ucp_Yi: scriptname = US"Yi"; break;
|
||||
/* New for Unicode 5.1: */
|
||||
case ucp_Carian: scriptname = US"Carian"; break;
|
||||
case ucp_Cham: scriptname = US"Cham"; break;
|
||||
case ucp_Kayah_Li: scriptname = US"Kayah_Li"; break;
|
||||
case ucp_Lepcha: scriptname = US"Lepcha"; break;
|
||||
case ucp_Lycian: scriptname = US"Lycian"; break;
|
||||
case ucp_Lydian: scriptname = US"Lydian"; break;
|
||||
case ucp_Ol_Chiki: scriptname = US"Ol_Chiki"; break;
|
||||
case ucp_Rejang: scriptname = US"Rejang"; break;
|
||||
case ucp_Saurashtra: scriptname = US"Saurashtra"; break;
|
||||
case ucp_Sundanese: scriptname = US"Sundanese"; break;
|
||||
case ucp_Vai: scriptname = US"Vai"; break;
|
||||
/* New for Unicode 5.2: */
|
||||
case ucp_Avestan: scriptname = US"Avestan"; break;
|
||||
case ucp_Bamum: scriptname = US"Bamum"; break;
|
||||
case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
|
||||
case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
|
||||
case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
|
||||
case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
|
||||
case ucp_Javanese: scriptname = US"Javanese"; break;
|
||||
case ucp_Kaithi: scriptname = US"Kaithi"; break;
|
||||
case ucp_Lisu: scriptname = US"Lisu"; break;
|
||||
case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
|
||||
case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
|
||||
case ucp_Old_Turkic: scriptname = US"Old_Turkic"; break;
|
||||
case ucp_Samaritan: scriptname = US"Samaritan"; break;
|
||||
case ucp_Tai_Tham: scriptname = US"Tai_Tham"; break;
|
||||
case ucp_Tai_Viet: scriptname = US"Tai_Viet"; break;
|
||||
/* New for Unicode 6.0.0 */
|
||||
case ucp_Batak: scriptname = US"Batak"; break;
|
||||
case ucp_Brahmi: scriptname = US"Brahmi"; break;
|
||||
case ucp_Mandaic: scriptname = US"Mandaic"; break;
|
||||
|
||||
/* New for Unicode 6.1.0 */
|
||||
case ucp_Chakma: scriptname = US"Chakma"; break;
|
||||
case ucp_Meroitic_Cursive: scriptname = US"Meroitic_Cursive"; break;
|
||||
case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
|
||||
case ucp_Miao: scriptname = US"Miao"; break;
|
||||
case ucp_Sharada: scriptname = US"Sharada"; break;
|
||||
case ucp_Sora_Sompeng: scriptname = US"Sora Sompent"; break;
|
||||
case ucp_Takri: scriptname = US"Takri"; break;
|
||||
|
||||
}
|
||||
|
||||
printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
|
||||
if (othercase != c)
|
||||
{
|
||||
printf(", %04x", othercase);
|
||||
if (caseset != 0)
|
||||
{
|
||||
const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
|
||||
while (*(++p) < NOTACHAR)
|
||||
if (*p != othercase && *p != c) printf(", %04x", *p);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Main program *
|
||||
*************************************************/
|
||||
|
||||
int
|
||||
main(void)
|
||||
{
|
||||
unsigned char buffer[1024];
|
||||
while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
|
||||
{
|
||||
unsigned char name[24];
|
||||
unsigned char *s, *t;
|
||||
|
||||
printf("%s", buffer);
|
||||
s = buffer;
|
||||
while (isspace(*s)) s++;
|
||||
if (*s == 0) continue;
|
||||
|
||||
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
||||
*t = 0;
|
||||
while (isspace(*s)) s++;
|
||||
|
||||
if (strcmp(CS name, "findprop") == 0)
|
||||
{
|
||||
while (*s != 0)
|
||||
{
|
||||
unsigned char *endptr;
|
||||
int c = strtoul(CS s, CSS(&endptr), 16);
|
||||
print_prop(c);
|
||||
s = endptr;
|
||||
while (isspace(*s)) s++;
|
||||
}
|
||||
}
|
||||
|
||||
else printf("Unknown test command %s\n", name);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* End */
|
|
@ -0,0 +1,34 @@
|
|||
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
|
||||
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
|
||||
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
|
||||
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
|
||||
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
|
||||
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
|
||||
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
|
||||
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
|
||||
|
||||
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
|
||||
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
|
||||
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
|
||||
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
|
||||
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
|
||||
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
|
||||
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
|
||||
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
|
||||
|
||||
findprop 0100 0101 0102 0103 0104 0105 0106
|
||||
|
||||
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
|
||||
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
|
||||
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
|
||||
findprop 10000 10001 e01ef f0000 100000
|
||||
|
||||
findprop 1b00 12000 7c0 a840 10900
|
||||
findprop 1d79 a77d
|
||||
|
||||
findprop 0800 083e a4d0 a4f7 aa80 aadf
|
||||
findprop 10b00 10b35 13000 1342e 10840 10855
|
||||
|
||||
findprop 11100 1113c 11680 116c0
|
||||
|
||||
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
|
|
@ -0,0 +1,359 @@
|
|||
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
|
||||
0000 Control: Control, Common, Control
|
||||
0001 Control: Control, Common, Control
|
||||
0002 Control: Control, Common, Control
|
||||
0003 Control: Control, Common, Control
|
||||
0004 Control: Control, Common, Control
|
||||
0005 Control: Control, Common, Control
|
||||
0006 Control: Control, Common, Control
|
||||
0007 Control: Control, Common, Control
|
||||
0008 Control: Control, Common, Control
|
||||
0009 Control: Control, Common, Control
|
||||
000a Control: Control, Common, LF
|
||||
000b Control: Control, Common, Control
|
||||
000c Control: Control, Common, Control
|
||||
000d Control: Control, Common, CR
|
||||
000e Control: Control, Common, Control
|
||||
000f Control: Control, Common, Control
|
||||
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
|
||||
0010 Control: Control, Common, Control
|
||||
0011 Control: Control, Common, Control
|
||||
0012 Control: Control, Common, Control
|
||||
0013 Control: Control, Common, Control
|
||||
0014 Control: Control, Common, Control
|
||||
0015 Control: Control, Common, Control
|
||||
0016 Control: Control, Common, Control
|
||||
0017 Control: Control, Common, Control
|
||||
0018 Control: Control, Common, Control
|
||||
0019 Control: Control, Common, Control
|
||||
001a Control: Control, Common, Control
|
||||
001b Control: Control, Common, Control
|
||||
001c Control: Control, Common, Control
|
||||
001d Control: Control, Common, Control
|
||||
001e Control: Control, Common, Control
|
||||
001f Control: Control, Common, Control
|
||||
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
|
||||
0020 Separator: Space separator, Common, Other
|
||||
0021 Punctuation: Other punctuation, Common, Other
|
||||
0022 Punctuation: Other punctuation, Common, Other
|
||||
0023 Punctuation: Other punctuation, Common, Other
|
||||
0024 Symbol: Currency symbol, Common, Other
|
||||
0025 Punctuation: Other punctuation, Common, Other
|
||||
0026 Punctuation: Other punctuation, Common, Other
|
||||
0027 Punctuation: Other punctuation, Common, Other
|
||||
0028 Punctuation: Open punctuation, Common, Other
|
||||
0029 Punctuation: Close punctuation, Common, Other
|
||||
002a Punctuation: Other punctuation, Common, Other
|
||||
002b Symbol: Mathematical symbol, Common, Other
|
||||
002c Punctuation: Other punctuation, Common, Other
|
||||
002d Punctuation: Dash punctuation, Common, Other
|
||||
002e Punctuation: Other punctuation, Common, Other
|
||||
002f Punctuation: Other punctuation, Common, Other
|
||||
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
|
||||
0030 Number: Decimal number, Common, Other
|
||||
0031 Number: Decimal number, Common, Other
|
||||
0032 Number: Decimal number, Common, Other
|
||||
0033 Number: Decimal number, Common, Other
|
||||
0034 Number: Decimal number, Common, Other
|
||||
0035 Number: Decimal number, Common, Other
|
||||
0036 Number: Decimal number, Common, Other
|
||||
0037 Number: Decimal number, Common, Other
|
||||
0038 Number: Decimal number, Common, Other
|
||||
0039 Number: Decimal number, Common, Other
|
||||
003a Punctuation: Other punctuation, Common, Other
|
||||
003b Punctuation: Other punctuation, Common, Other
|
||||
003c Symbol: Mathematical symbol, Common, Other
|
||||
003d Symbol: Mathematical symbol, Common, Other
|
||||
003e Symbol: Mathematical symbol, Common, Other
|
||||
003f Punctuation: Other punctuation, Common, Other
|
||||
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
|
||||
0040 Punctuation: Other punctuation, Common, Other
|
||||
0041 Letter: Upper case letter, Latin, Other, 0061
|
||||
0042 Letter: Upper case letter, Latin, Other, 0062
|
||||
0043 Letter: Upper case letter, Latin, Other, 0063
|
||||
0044 Letter: Upper case letter, Latin, Other, 0064
|
||||
0045 Letter: Upper case letter, Latin, Other, 0065
|
||||
0046 Letter: Upper case letter, Latin, Other, 0066
|
||||
0047 Letter: Upper case letter, Latin, Other, 0067
|
||||
0048 Letter: Upper case letter, Latin, Other, 0068
|
||||
0049 Letter: Upper case letter, Latin, Other, 0069
|
||||
004a Letter: Upper case letter, Latin, Other, 006a
|
||||
004b Letter: Upper case letter, Latin, Other, 006b, 212a
|
||||
004c Letter: Upper case letter, Latin, Other, 006c
|
||||
004d Letter: Upper case letter, Latin, Other, 006d
|
||||
004e Letter: Upper case letter, Latin, Other, 006e
|
||||
004f Letter: Upper case letter, Latin, Other, 006f
|
||||
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
|
||||
0050 Letter: Upper case letter, Latin, Other, 0070
|
||||
0051 Letter: Upper case letter, Latin, Other, 0071
|
||||
0052 Letter: Upper case letter, Latin, Other, 0072
|
||||
0053 Letter: Upper case letter, Latin, Other, 0073, 017f
|
||||
0054 Letter: Upper case letter, Latin, Other, 0074
|
||||
0055 Letter: Upper case letter, Latin, Other, 0075
|
||||
0056 Letter: Upper case letter, Latin, Other, 0076
|
||||
0057 Letter: Upper case letter, Latin, Other, 0077
|
||||
0058 Letter: Upper case letter, Latin, Other, 0078
|
||||
0059 Letter: Upper case letter, Latin, Other, 0079
|
||||
005a Letter: Upper case letter, Latin, Other, 007a
|
||||
005b Punctuation: Open punctuation, Common, Other
|
||||
005c Punctuation: Other punctuation, Common, Other
|
||||
005d Punctuation: Close punctuation, Common, Other
|
||||
005e Symbol: Modifier symbol, Common, Other
|
||||
005f Punctuation: Connector punctuation, Common, Other
|
||||
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
|
||||
0060 Symbol: Modifier symbol, Common, Other
|
||||
0061 Letter: Lower case letter, Latin, Other, 0041
|
||||
0062 Letter: Lower case letter, Latin, Other, 0042
|
||||
0063 Letter: Lower case letter, Latin, Other, 0043
|
||||
0064 Letter: Lower case letter, Latin, Other, 0044
|
||||
0065 Letter: Lower case letter, Latin, Other, 0045
|
||||
0066 Letter: Lower case letter, Latin, Other, 0046
|
||||
0067 Letter: Lower case letter, Latin, Other, 0047
|
||||
0068 Letter: Lower case letter, Latin, Other, 0048
|
||||
0069 Letter: Lower case letter, Latin, Other, 0049
|
||||
006a Letter: Lower case letter, Latin, Other, 004a
|
||||
006b Letter: Lower case letter, Latin, Other, 004b, 212a
|
||||
006c Letter: Lower case letter, Latin, Other, 004c
|
||||
006d Letter: Lower case letter, Latin, Other, 004d
|
||||
006e Letter: Lower case letter, Latin, Other, 004e
|
||||
006f Letter: Lower case letter, Latin, Other, 004f
|
||||
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
|
||||
0070 Letter: Lower case letter, Latin, Other, 0050
|
||||
0071 Letter: Lower case letter, Latin, Other, 0051
|
||||
0072 Letter: Lower case letter, Latin, Other, 0052
|
||||
0073 Letter: Lower case letter, Latin, Other, 0053, 017f
|
||||
0074 Letter: Lower case letter, Latin, Other, 0054
|
||||
0075 Letter: Lower case letter, Latin, Other, 0055
|
||||
0076 Letter: Lower case letter, Latin, Other, 0056
|
||||
0077 Letter: Lower case letter, Latin, Other, 0057
|
||||
0078 Letter: Lower case letter, Latin, Other, 0058
|
||||
0079 Letter: Lower case letter, Latin, Other, 0059
|
||||
007a Letter: Lower case letter, Latin, Other, 005a
|
||||
007b Punctuation: Open punctuation, Common, Other
|
||||
007c Symbol: Mathematical symbol, Common, Other
|
||||
007d Punctuation: Close punctuation, Common, Other
|
||||
007e Symbol: Mathematical symbol, Common, Other
|
||||
007f Control: Control, Common, Control
|
||||
|
||||
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
|
||||
0080 Control: Control, Common, Control
|
||||
0081 Control: Control, Common, Control
|
||||
0082 Control: Control, Common, Control
|
||||
0083 Control: Control, Common, Control
|
||||
0084 Control: Control, Common, Control
|
||||
0085 Control: Control, Common, Control
|
||||
0086 Control: Control, Common, Control
|
||||
0087 Control: Control, Common, Control
|
||||
0088 Control: Control, Common, Control
|
||||
0089 Control: Control, Common, Control
|
||||
008a Control: Control, Common, Control
|
||||
008b Control: Control, Common, Control
|
||||
008c Control: Control, Common, Control
|
||||
008d Control: Control, Common, Control
|
||||
008e Control: Control, Common, Control
|
||||
008f Control: Control, Common, Control
|
||||
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
|
||||
0090 Control: Control, Common, Control
|
||||
0091 Control: Control, Common, Control
|
||||
0092 Control: Control, Common, Control
|
||||
0093 Control: Control, Common, Control
|
||||
0094 Control: Control, Common, Control
|
||||
0095 Control: Control, Common, Control
|
||||
0096 Control: Control, Common, Control
|
||||
0097 Control: Control, Common, Control
|
||||
0098 Control: Control, Common, Control
|
||||
0099 Control: Control, Common, Control
|
||||
009a Control: Control, Common, Control
|
||||
009b Control: Control, Common, Control
|
||||
009c Control: Control, Common, Control
|
||||
009d Control: Control, Common, Control
|
||||
009e Control: Control, Common, Control
|
||||
009f Control: Control, Common, Control
|
||||
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
|
||||
00a0 Separator: Space separator, Common, Other
|
||||
00a1 Punctuation: Other punctuation, Common, Other
|
||||
00a2 Symbol: Currency symbol, Common, Other
|
||||
00a3 Symbol: Currency symbol, Common, Other
|
||||
00a4 Symbol: Currency symbol, Common, Other
|
||||
00a5 Symbol: Currency symbol, Common, Other
|
||||
00a6 Symbol: Other symbol, Common, Other
|
||||
00a7 Punctuation: Other punctuation, Common, Other
|
||||
00a8 Symbol: Modifier symbol, Common, Other
|
||||
00a9 Symbol: Other symbol, Common, Other
|
||||
00aa Letter: Other letter, Latin, Other
|
||||
00ab Punctuation: Initial punctuation, Common, Other
|
||||
00ac Symbol: Mathematical symbol, Common, Other
|
||||
00ad Control: Format, Common, Control
|
||||
00ae Symbol: Other symbol, Common, Other
|
||||
00af Symbol: Modifier symbol, Common, Other
|
||||
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
|
||||
00b0 Symbol: Other symbol, Common, Other
|
||||
00b1 Symbol: Mathematical symbol, Common, Other
|
||||
00b2 Number: Other number, Common, Other
|
||||
00b3 Number: Other number, Common, Other
|
||||
00b4 Symbol: Modifier symbol, Common, Other
|
||||
00b5 Letter: Lower case letter, Common, Other, 03bc, 039c
|
||||
00b6 Punctuation: Other punctuation, Common, Other
|
||||
00b7 Punctuation: Other punctuation, Common, Other
|
||||
00b8 Symbol: Modifier symbol, Common, Other
|
||||
00b9 Number: Other number, Common, Other
|
||||
00ba Letter: Other letter, Latin, Other
|
||||
00bb Punctuation: Final punctuation, Common, Other
|
||||
00bc Number: Other number, Common, Other
|
||||
00bd Number: Other number, Common, Other
|
||||
00be Number: Other number, Common, Other
|
||||
00bf Punctuation: Other punctuation, Common, Other
|
||||
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
|
||||
00c0 Letter: Upper case letter, Latin, Other, 00e0
|
||||
00c1 Letter: Upper case letter, Latin, Other, 00e1
|
||||
00c2 Letter: Upper case letter, Latin, Other, 00e2
|
||||
00c3 Letter: Upper case letter, Latin, Other, 00e3
|
||||
00c4 Letter: Upper case letter, Latin, Other, 00e4
|
||||
00c5 Letter: Upper case letter, Latin, Other, 00e5, 212b
|
||||
00c6 Letter: Upper case letter, Latin, Other, 00e6
|
||||
00c7 Letter: Upper case letter, Latin, Other, 00e7
|
||||
00c8 Letter: Upper case letter, Latin, Other, 00e8
|
||||
00c9 Letter: Upper case letter, Latin, Other, 00e9
|
||||
00ca Letter: Upper case letter, Latin, Other, 00ea
|
||||
00cb Letter: Upper case letter, Latin, Other, 00eb
|
||||
00cc Letter: Upper case letter, Latin, Other, 00ec
|
||||
00cd Letter: Upper case letter, Latin, Other, 00ed
|
||||
00ce Letter: Upper case letter, Latin, Other, 00ee
|
||||
00cf Letter: Upper case letter, Latin, Other, 00ef
|
||||
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
|
||||
00d0 Letter: Upper case letter, Latin, Other, 00f0
|
||||
00d1 Letter: Upper case letter, Latin, Other, 00f1
|
||||
00d2 Letter: Upper case letter, Latin, Other, 00f2
|
||||
00d3 Letter: Upper case letter, Latin, Other, 00f3
|
||||
00d4 Letter: Upper case letter, Latin, Other, 00f4
|
||||
00d5 Letter: Upper case letter, Latin, Other, 00f5
|
||||
00d6 Letter: Upper case letter, Latin, Other, 00f6
|
||||
00d7 Symbol: Mathematical symbol, Common, Other
|
||||
00d8 Letter: Upper case letter, Latin, Other, 00f8
|
||||
00d9 Letter: Upper case letter, Latin, Other, 00f9
|
||||
00da Letter: Upper case letter, Latin, Other, 00fa
|
||||
00db Letter: Upper case letter, Latin, Other, 00fb
|
||||
00dc Letter: Upper case letter, Latin, Other, 00fc
|
||||
00dd Letter: Upper case letter, Latin, Other, 00fd
|
||||
00de Letter: Upper case letter, Latin, Other, 00fe
|
||||
00df Letter: Lower case letter, Latin, Other, 1e9e
|
||||
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
|
||||
00e0 Letter: Lower case letter, Latin, Other, 00c0
|
||||
00e1 Letter: Lower case letter, Latin, Other, 00c1
|
||||
00e2 Letter: Lower case letter, Latin, Other, 00c2
|
||||
00e3 Letter: Lower case letter, Latin, Other, 00c3
|
||||
00e4 Letter: Lower case letter, Latin, Other, 00c4
|
||||
00e5 Letter: Lower case letter, Latin, Other, 00c5, 212b
|
||||
00e6 Letter: Lower case letter, Latin, Other, 00c6
|
||||
00e7 Letter: Lower case letter, Latin, Other, 00c7
|
||||
00e8 Letter: Lower case letter, Latin, Other, 00c8
|
||||
00e9 Letter: Lower case letter, Latin, Other, 00c9
|
||||
00ea Letter: Lower case letter, Latin, Other, 00ca
|
||||
00eb Letter: Lower case letter, Latin, Other, 00cb
|
||||
00ec Letter: Lower case letter, Latin, Other, 00cc
|
||||
00ed Letter: Lower case letter, Latin, Other, 00cd
|
||||
00ee Letter: Lower case letter, Latin, Other, 00ce
|
||||
00ef Letter: Lower case letter, Latin, Other, 00cf
|
||||
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
|
||||
00f0 Letter: Lower case letter, Latin, Other, 00d0
|
||||
00f1 Letter: Lower case letter, Latin, Other, 00d1
|
||||
00f2 Letter: Lower case letter, Latin, Other, 00d2
|
||||
00f3 Letter: Lower case letter, Latin, Other, 00d3
|
||||
00f4 Letter: Lower case letter, Latin, Other, 00d4
|
||||
00f5 Letter: Lower case letter, Latin, Other, 00d5
|
||||
00f6 Letter: Lower case letter, Latin, Other, 00d6
|
||||
00f7 Symbol: Mathematical symbol, Common, Other
|
||||
00f8 Letter: Lower case letter, Latin, Other, 00d8
|
||||
00f9 Letter: Lower case letter, Latin, Other, 00d9
|
||||
00fa Letter: Lower case letter, Latin, Other, 00da
|
||||
00fb Letter: Lower case letter, Latin, Other, 00db
|
||||
00fc Letter: Lower case letter, Latin, Other, 00dc
|
||||
00fd Letter: Lower case letter, Latin, Other, 00dd
|
||||
00fe Letter: Lower case letter, Latin, Other, 00de
|
||||
00ff Letter: Lower case letter, Latin, Other, 0178
|
||||
|
||||
findprop 0100 0101 0102 0103 0104 0105 0106
|
||||
0100 Letter: Upper case letter, Latin, Other, 0101
|
||||
0101 Letter: Lower case letter, Latin, Other, 0100
|
||||
0102 Letter: Upper case letter, Latin, Other, 0103
|
||||
0103 Letter: Lower case letter, Latin, Other, 0102
|
||||
0104 Letter: Upper case letter, Latin, Other, 0105
|
||||
0105 Letter: Lower case letter, Latin, Other, 0104
|
||||
0106 Letter: Upper case letter, Latin, Other, 0107
|
||||
|
||||
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
|
||||
ffe0 Symbol: Currency symbol, Common, Other
|
||||
ffe1 Symbol: Currency symbol, Common, Other
|
||||
ffe2 Symbol: Mathematical symbol, Common, Other
|
||||
ffe3 Symbol: Modifier symbol, Common, Other
|
||||
ffe4 Symbol: Other symbol, Common, Other
|
||||
ffe5 Symbol: Currency symbol, Common, Other
|
||||
ffe6 Symbol: Currency symbol, Common, Other
|
||||
ffe7 Control: Unassigned, Common, Other
|
||||
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
|
||||
ffe8 Symbol: Other symbol, Common, Other
|
||||
ffe9 Symbol: Mathematical symbol, Common, Other
|
||||
ffea Symbol: Mathematical symbol, Common, Other
|
||||
ffeb Symbol: Mathematical symbol, Common, Other
|
||||
ffec Symbol: Mathematical symbol, Common, Other
|
||||
ffed Symbol: Other symbol, Common, Other
|
||||
ffee Symbol: Other symbol, Common, Other
|
||||
ffef Control: Unassigned, Common, Other
|
||||
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
|
||||
fff8 Control: Unassigned, Common, Control
|
||||
fff9 Control: Format, Common, Control
|
||||
fffa Control: Format, Common, Control
|
||||
fffb Control: Format, Common, Control
|
||||
fffc Symbol: Other symbol, Common, Other
|
||||
fffd Symbol: Other symbol, Common, Other
|
||||
fffe Control: Unassigned, Common, Other
|
||||
ffff Control: Unassigned, Common, Other
|
||||
findprop 10000 10001 e01ef f0000 100000
|
||||
10000 Letter: Other letter, Linear_B, Other
|
||||
10001 Letter: Other letter, Linear_B, Other
|
||||
e01ef Mark: Non-spacing mark, Inherited, Extend
|
||||
f0000 Control: Private use, Common, Other
|
||||
100000 Control: Private use, Common, Other
|
||||
|
||||
findprop 1b00 12000 7c0 a840 10900
|
||||
1b00 Mark: Non-spacing mark, Balinese, Extend
|
||||
12000 Letter: Other letter, Cuneiform, Other
|
||||
07c0 Number: Decimal number, Nko, Other
|
||||
a840 Letter: Other letter, Phags_Pa, Other
|
||||
10900 Letter: Other letter, Phoenician, Other
|
||||
findprop 1d79 a77d
|
||||
1d79 Letter: Lower case letter, Latin, Other, a77d
|
||||
a77d Letter: Upper case letter, Latin, Other, 1d79
|
||||
|
||||
findprop 0800 083e a4d0 a4f7 aa80 aadf
|
||||
0800 Letter: Other letter, Samaritan, Other
|
||||
083e Punctuation: Other punctuation, Samaritan, Other
|
||||
a4d0 Letter: Other letter, Lisu, Other
|
||||
a4f7 Letter: Other letter, Lisu, Other
|
||||
aa80 Letter: Other letter, Tai_Viet, Other
|
||||
aadf Punctuation: Other punctuation, Tai_Viet, Other
|
||||
findprop 10b00 10b35 13000 1342e 10840 10855
|
||||
10b00 Letter: Other letter, Avestan, Other
|
||||
10b35 Letter: Other letter, Avestan, Other
|
||||
13000 Letter: Other letter, Egyptian_Hieroglyphs, Other
|
||||
1342e Letter: Other letter, Egyptian_Hieroglyphs, Other
|
||||
10840 Letter: Other letter, Imperial_Aramaic, Other
|
||||
10855 Letter: Other letter, Imperial_Aramaic, Other
|
||||
|
||||
findprop 11100 1113c 11680 116c0
|
||||
11100 Mark: Non-spacing mark, Chakma, Extend
|
||||
1113c Number: Decimal number, Chakma, Other
|
||||
11680 Letter: Other letter, Takri, Other
|
||||
116c0 Number: Decimal number, Takri, Other
|
||||
|
||||
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
|
||||
000d Control: Control, Common, CR
|
||||
000a Control: Control, Common, LF
|
||||
000e Control: Control, Common, Control
|
||||
0711 Mark: Non-spacing mark, Syriac, Extend
|
||||
1b04 Mark: Spacing mark, Balinese, SpacingMark
|
||||
1111 Letter: Other letter, Hangul, Hangul syllable type L
|
||||
1169 Letter: Other letter, Hangul, Hangul syllable type V
|
||||
11fe Letter: Other letter, Hangul, Hangul syllable type T
|
||||
ae4c Letter: Other letter, Hangul, Hangul syllable type LV
|
||||
ad89 Letter: Other letter, Hangul, Hangul syllable type LVT
|
|
@ -0,0 +1,253 @@
|
|||
/* A test program for converting characters to UTF-8 and vice versa. Note that
|
||||
this program conforms to the original definition of UTF-8, which allows
|
||||
codepoints up to 7fffffff. The more recent definition limits the validity of
|
||||
UTF-8 codepoints to a maximum of 10ffff.
|
||||
|
||||
The arguments are either single codepoint values, written as 0xhhhh, for
|
||||
conversion to UTF-8, or sequences of hex values, written without 0x and
|
||||
optionally including spaces (but such arguments must be quoted), for conversion
|
||||
from UTF-8 to codepoints. For example:
|
||||
|
||||
./utf8 0x1234
|
||||
0x00001234 => e1 88 b4
|
||||
|
||||
./utf8 "e1 88 b4"
|
||||
0x00001234 <= e1 88 b4
|
||||
|
||||
In the second case, a number of characters can be present in one argument:
|
||||
|
||||
./utf8 "65 e188b4 77"
|
||||
0x00000065 <= 65
|
||||
0x00001234 <= e1 88 b4
|
||||
0x00000077 <= 77
|
||||
|
||||
If the option -s is given, the sequence of UTF-8 bytes is written out between
|
||||
angle brackets at the end of the line. On a UTF-8 terminal, this will show the
|
||||
appropriate graphic for the codepoint. */
|
||||
|
||||
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
|
||||
|
||||
/* The valid ranges for UTF-8 characters are:
|
||||
|
||||
0000 0000 to 0000 007f 1 byte (ascii)
|
||||
0000 0080 to 0000 07ff 2 bytes
|
||||
0000 0800 to 0000 ffff 3 bytes
|
||||
0001 0000 to 001f ffff 4 bytes
|
||||
0020 0000 to 03ff ffff 5 bytes
|
||||
0400 0000 to 7fff ffff 6 bytes
|
||||
*/
|
||||
|
||||
|
||||
/* Largest code point that can be encoded in (index + 1) bytes. ord2utf8() and
utf82ord() search this table to find the length that a value requires. */

static const int utf8_table1[] = {
  0x0000007f,   /* 1 byte */
  0x000007ff,   /* 2 bytes */
  0x0000ffff,   /* 3 bytes */
  0x001fffff,   /* 4 bytes */
  0x03ffffff,   /* 5 bytes */
  0x7fffffff    /* 6 bytes */
};

/* Marker bits that ord2utf8() ORs into the first byte of a sequence that has
(index) continuation bytes. */

static const int utf8_table2[] = {
  0,
  0xc0,
  0xe0,
  0xf0,
  0xf8,
  0xfc
};

/* Mask that utf82ord() applies to the first byte of a sequence that has
(index) continuation bytes, leaving only its data bits. */

static const int utf8_table3[] = {
  0xff,
  0x1f,
  0x0f,
  0x07,
  0x03,
  0x01
};

/* Not referenced by the functions in this file. NOTE(review): presumably the
number of additional bytes, indexed by the low six bits of a lead byte -
confirm before relying on it. */

static const unsigned char utf8_table4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,6
};
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Convert character value to UTF-8 *
|
||||
*************************************************/
|
||||
|
||||
/* This function takes an integer value in the range 0 - 0x7fffffff
|
||||
and encodes it as a UTF-8 character in 1 to 6 bytes.
|
||||
|
||||
Arguments:
|
||||
cvalue the character value
|
||||
buffer pointer to buffer for result - at least 6 bytes long
|
||||
|
||||
Returns: number of bytes placed in the buffer
|
||||
-1 if input character is negative
|
||||
0 if input character is positive but too big (only when
|
||||
int is longer than 32 bits)
|
||||
*/
|
||||
|
||||
int
|
||||
ord2utf8(int cvalue, unsigned char *buffer)
|
||||
{
|
||||
register int i, j;
|
||||
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
|
||||
if (cvalue <= utf8_table1[i]) break;
|
||||
if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
|
||||
if (cvalue < 0) return -1;
|
||||
buffer += i;
|
||||
for (j = i; j > 0; j--)
|
||||
{
|
||||
*buffer-- = 0x80 | (cvalue & 0x3f);
|
||||
cvalue >>= 6;
|
||||
}
|
||||
*buffer = utf8_table2[i] | cvalue;
|
||||
return i + 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Convert UTF-8 string to value *
|
||||
*************************************************/
|
||||
|
||||
/* This function takes one or more bytes that represent a UTF-8 character,
|
||||
and returns the value of the character.
|
||||
|
||||
Arguments:
|
||||
buffer a pointer to the byte vector
|
||||
vptr a pointer to an int to receive the value
|
||||
|
||||
Returns: > 0 => the number of bytes consumed
|
||||
-6 to 0 => malformed UTF-8 character at offset = (-return)
|
||||
*/
|
||||
|
||||
int
|
||||
utf82ord(unsigned char *buffer, int *vptr)
|
||||
{
|
||||
int c = *buffer++;
|
||||
int d = c;
|
||||
int i, j, s;
|
||||
|
||||
for (i = -1; i < 6; i++) /* i is number of additional bytes */
|
||||
{
|
||||
if ((d & 0x80) == 0) break;
|
||||
d <<= 1;
|
||||
}
|
||||
|
||||
if (i == -1) { *vptr = c; return 1; } /* ascii character */
|
||||
if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
|
||||
|
||||
/* i now has a value in the range 1-5 */
|
||||
|
||||
s = 6*i;
|
||||
d = (c & utf8_table3[i]) << s;
|
||||
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
c = *buffer++;
|
||||
if ((c & 0xc0) != 0x80) return -(j+1);
|
||||
s -= 6;
|
||||
d |= (c & 0x3f) << s;
|
||||
}
|
||||
|
||||
/* Check that encoding was the correct unique one */
|
||||
|
||||
for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
|
||||
if (d <= utf8_table1[j]) break;
|
||||
if (j != i) return -(i+1);
|
||||
|
||||
/* Valid value */
|
||||
|
||||
*vptr = d;
|
||||
return i+1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Main Program *
|
||||
*************************************************/
|
||||
|
||||
|
||||
int
main(int argc, char **argv)
{
int i = 1;
int show = 0;
unsigned char buffer[64];

/* An initial -s argument requests that the raw bytes of each character also
be output, between > and <. */

if (argc > 1 && strcmp(argv[1], "-s") == 0)
  {
  show = 1;
  i = 2;
  }

for (; i < argc; i++)
  {
  const char *arg = argv[i];

  /* An argument that starts with 0x is a code point to be encoded. */

  if (strncmp(arg, "0x", 2) == 0)
    {
    int j, d, rc;
    long v = strtol(arg + 2, NULL, 16);

    /* Where long is wider than int, a large value must not be silently
    truncated into a different, valid-looking code point. */

    if (v > 0x7fffffffL || v < -0x7fffffffL - 1)
      {
      printf("%s => *** Error: value out of range ***\n", arg);
      continue;
      }

    d = (int)v;
    rc = ord2utf8(d, buffer);
    printf("0x%08x => ", (unsigned int)d);
    if (rc <= 0) printf("*** Error %d ***", rc); else
      {
      for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
      if (show)
        {
        printf(">");
        for (j = 0; j < rc; j++) printf("%c", buffer[j]);
        printf("<");
        }
      }
    printf("\n");
    }

  /* Anything else is a string of hex digit pairs, optionally separated by
  spaces, giving one or more UTF-8 characters to be decoded. */

  else
    {
    const unsigned char *x = (const unsigned char *)arg;
    const int maxbytes = (int)sizeof(buffer) - 1;  /* keep room for sentinel */
    unsigned char *bptr, *end;
    int d, rc, k;
    int count = 0;      /* bytes stored in buffer */
    int y = 0;          /* byte being assembled */
    int z = 0;          /* non-zero when the first digit of a pair is held */
    int bad = 0;        /* set when the argument cannot be used */

    for (;;)
      {
      while (*x == ' ') x++;
      if (*x == 0 && !z) break;

      /* This also catches the end of the string after an odd number of
      digits, because the terminating zero is not a hex digit. */

      if (!isxdigit(*x))
        {
        printf("Malformed hex string: %s\n", argv[i]);
        bad = 1;
        break;
        }

      /* 'W' is 'a' - 10, so lower case a-f map to 10-15. */

      y = y * 16 + tolower(*x) - ((isdigit(*x))? '0' : 'W');
      x++;
      if (z)
        {
        if (count >= maxbytes)
          {
          printf("Hex string too long (maximum %d bytes): %s\n", maxbytes,
            argv[i]);
          bad = 1;
          break;
          }
        buffer[count++] = (unsigned char)y;
        y = 0;
        }
      z ^= 1;
      }

    /* Do not attempt to decode a partial or oversized buffer. */

    if (bad) continue;

    /* The zero sentinel is not a continuation byte, so utf82ord() cannot read
    beyond the stored data. The loop is bounded by length rather than by the
    sentinel so that a genuine 00 byte in the input is decoded too. */

    buffer[count] = 0;
    end = buffer + count;
    bptr = buffer;

    while (bptr < end)
      {
      rc = utf82ord(bptr, &d);
      if (rc > 0)
        {
        printf("0x%08x <= ", (unsigned int)d);
        for (k = 0; k < rc; k++) printf("%02x ", bptr[k]);
        if (show)
          {
          printf(">");
          for (k = 0; k < rc; k++) printf("%c", bptr[k]);
          printf("<");
          }
        printf("\n");
        bptr += rc;
        }
      else
        {
        printf("Malformed UTF-8 at offset %d <= ", -rc);
        while (bptr < end) printf("%02x ", *bptr++);
        printf("\n");
        break;
        }
      }
    }
  }

return 0;
}
|
||||
|
||||
/* End */
|
|
@ -451,7 +451,7 @@ PCRE2_EXP_DECL int pcre2_substring_length_byname(pcre2_match_data *, \
|
|||
PCRE2_EXP_DECL int pcre2_substring_length_bynumber(pcre2_match_data *, \
|
||||
int); \
|
||||
PCRE2_EXP_DECL int pcre2_substring_nametable_scan(const pcre2_code *, \
|
||||
PCRE2_SPTR, PCRE2_UCHAR **, PCRE2_UCHAR **); \
|
||||
PCRE2_SPTR, PCRE2_SPTR *, PCRE2_SPTR *); \
|
||||
PCRE2_EXP_DECL int pcre2_substring_number_from_name(\
|
||||
const pcre2_code *, PCRE2_SPTR); \
|
||||
PCRE2_EXP_DECL void pcre2_substring_list_free(PCRE2_SPTR *); \
|
||||
|
|
|
@ -451,7 +451,7 @@ PCRE2_EXP_DECL int pcre2_substring_length_byname(pcre2_match_data *, \
|
|||
PCRE2_EXP_DECL int pcre2_substring_length_bynumber(pcre2_match_data *, \
|
||||
int); \
|
||||
PCRE2_EXP_DECL int pcre2_substring_nametable_scan(const pcre2_code *, \
|
||||
PCRE2_SPTR, PCRE2_UCHAR **, PCRE2_UCHAR **); \
|
||||
PCRE2_SPTR, PCRE2_SPTR *, PCRE2_SPTR *); \
|
||||
PCRE2_EXP_DECL int pcre2_substring_number_from_name(\
|
||||
const pcre2_code *, PCRE2_SPTR); \
|
||||
PCRE2_EXP_DECL void pcre2_substring_list_free(PCRE2_SPTR *); \
|
||||
|
|
|
@ -102,23 +102,57 @@ if (ccontext == NULL)
|
|||
|
||||
if (pattern[0] == 'Y')
|
||||
{
|
||||
c = ccontext->memctl.malloc(sizeof(pcre2_real_code), NULL);
|
||||
PCRE2_UCHAR *n;
|
||||
int lennumber = (PCRE2_CODE_UNIT_WIDTH == 8)? 2 : 1;
|
||||
size_t size = sizeof(pcre2_real_code) +
|
||||
(12 + 3*lennumber)*(PCRE2_CODE_UNIT_WIDTH/8) + CU2BYTES(20);
|
||||
c = ccontext->memctl.malloc(size, NULL);
|
||||
c->memctl = ccontext->memctl;
|
||||
c->magic_number = MAGIC_NUMBER;
|
||||
c->size = sizeof(pcre2_real_code);
|
||||
c->name_table_offset = sizeof(pcre2_real_code);
|
||||
c->size = size;
|
||||
c->compile_options = options;
|
||||
c->flags = PCRE2_CODE_UNIT_WIDTH/8;
|
||||
c->limit_match = 0;
|
||||
c->limit_recursion = 0;
|
||||
c->max_lookbehind = 0;
|
||||
c->minlength = 3;
|
||||
c->top_bracket = 1;
|
||||
c->top_bracket = 5;
|
||||
c->top_backref = 1;
|
||||
c->bsr_convention = ccontext->bsr_convention;
|
||||
c->newline_convention = ccontext->newline_convention;
|
||||
c->name_count = 0;
|
||||
c->name_entry_size = 0;
|
||||
c->name_count = 3;
|
||||
c->name_entry_size = 4 + lennumber;
|
||||
|
||||
n = (PCRE2_UCHAR *)((char *)c + sizeof(pcre2_real_code));
|
||||
|
||||
if (lennumber == 2) *n++ = 0 ;
|
||||
*n++ = 1;
|
||||
*n++ = 'x'; *n++ = 'x'; *n++ = 'x'; *n++ = 0;
|
||||
|
||||
if (lennumber == 2) *n++ = 0 ;
|
||||
*n++ = 2;
|
||||
*n++ = 'y'; *n++ = 'y'; *n++ = 'y'; *n++ = 0;
|
||||
|
||||
if (lennumber == 2) *n++ = 0 ;
|
||||
*n++ = 3;
|
||||
*n++ = 'y'; *n++ = 'y'; *n++ = 'y'; *n++ = 0;
|
||||
|
||||
|
||||
*n++ = OP_CHAR;
|
||||
*n++ = 'x';
|
||||
*n++ = OP_CHARI;
|
||||
*n++ = 'Y';
|
||||
|
||||
*n++ = OP_PROP;
|
||||
*n++ = PT_SC;
|
||||
*n++ = 0;
|
||||
|
||||
*n++ = OP_DNRREF;
|
||||
*n++ = 0;
|
||||
|
||||
*n++ = OP_END;
|
||||
|
||||
|
||||
}
|
||||
|
||||
else
|
||||
|
|
|
@ -78,27 +78,26 @@ memory control data is to be stored for future use.
|
|||
Arguments:
|
||||
size amount of memory required
|
||||
offset offset in memory block to memctl structure
|
||||
gcontext a general context or NULL
|
||||
memctl pointer to a memctl block or NULL
|
||||
|
||||
Returns: pointer to memory or NULL on failure
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN void *
|
||||
PRIV(memctl_malloc)(size_t size, size_t offset,
|
||||
pcre2_general_context *gcontext)
|
||||
PRIV(memctl_malloc)(size_t size, size_t offset, pcre2_memctl *memctl)
|
||||
{
|
||||
pcre2_memctl *memctl;
|
||||
void *yield = (gcontext == NULL)? malloc(size) :
|
||||
gcontext->memctl.malloc(size, gcontext->memctl.memory_data);
|
||||
pcre2_memctl *newmemctl;
|
||||
void *yield = (memctl == NULL)? malloc(size) :
|
||||
memctl->malloc(size, memctl->memory_data);
|
||||
if (yield == NULL) return NULL;
|
||||
memctl = (pcre2_memctl *)(((uint8_t *)yield) + offset);
|
||||
if (gcontext == NULL)
|
||||
newmemctl = (pcre2_memctl *)(((uint8_t *)yield) + offset);
|
||||
if (memctl == NULL)
|
||||
{
|
||||
memctl->malloc = default_malloc;
|
||||
memctl->free = default_free;
|
||||
memctl->memory_data = NULL;
|
||||
newmemctl->malloc = default_malloc;
|
||||
newmemctl->free = default_free;
|
||||
newmemctl->memory_data = NULL;
|
||||
}
|
||||
else *memctl = gcontext->memctl;
|
||||
else *newmemctl = *memctl;
|
||||
return yield;
|
||||
}
|
||||
|
||||
|
@ -152,7 +151,7 @@ pcre2_compile_context_create(pcre2_general_context *gcontext)
|
|||
pcre2_compile_context *ccontext = PRIV(memctl_malloc)(
|
||||
sizeof(pcre2_real_compile_context),
|
||||
offsetof(pcre2_real_compile_context, memctl),
|
||||
gcontext);
|
||||
&(gcontext->memctl));
|
||||
if (ccontext == NULL) return NULL;
|
||||
PRIV(compile_context_init)(ccontext, FALSE);
|
||||
return ccontext;
|
||||
|
@ -184,7 +183,7 @@ pcre2_match_context_create(pcre2_general_context *gcontext)
|
|||
pcre2_match_context *mcontext = PRIV(memctl_malloc)(
|
||||
sizeof(pcre2_real_match_context),
|
||||
offsetof(pcre2_real_compile_context, memctl),
|
||||
gcontext);
|
||||
&(gcontext->memctl));
|
||||
if (mcontext == NULL) return NULL;
|
||||
PRIV(match_context_init)(mcontext, FALSE);
|
||||
return mcontext;
|
||||
|
@ -240,21 +239,24 @@ return new;
|
|||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_general_context_free(pcre2_general_context *gcontext)
|
||||
{
|
||||
gcontext->memctl.free(gcontext, gcontext->memctl.memory_data);
|
||||
if (gcontext != NULL)
|
||||
gcontext->memctl.free(gcontext, gcontext->memctl.memory_data);
|
||||
}
|
||||
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_compile_context_free(pcre2_compile_context *ccontext)
|
||||
{
|
||||
ccontext->memctl.free(ccontext, ccontext->memctl.memory_data);
|
||||
if (ccontext != NULL)
|
||||
ccontext->memctl.free(ccontext, ccontext->memctl.memory_data);
|
||||
}
|
||||
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_match_context_free(pcre2_match_context *mcontext)
|
||||
{
|
||||
mcontext->memctl.free(mcontext, mcontext->memctl.memory_data);
|
||||
if (mcontext != NULL)
|
||||
mcontext->memctl.free(mcontext, mcontext->memctl.memory_data);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -49,6 +49,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <string.h>
|
||||
|
||||
#include "pcre2.h"
|
||||
#include "pcre2_ucp.h"
|
||||
|
||||
#define PUBL(name) pcre2_##name
|
||||
|
||||
|
@ -77,6 +78,11 @@ typedef int BOOL;
|
|||
#include <valgrind/memcheck.h>
|
||||
#endif
|
||||
|
||||
/* This is an unsigned int value that no character can ever have, as
|
||||
Unicode doesn't go beyond 0x0010ffff. */
|
||||
|
||||
#define NOTACHAR 0xffffffff
|
||||
|
||||
/* When UTF encoding is being used, a character is no longer just a single
|
||||
byte in 8-bit mode or a single short in 16-bit mode. The macros for character
|
||||
handling generate simple sequences when used in the basic mode, and more
|
||||
|
@ -165,6 +171,109 @@ the pointer. */
|
|||
#endif /* SUPPORT_UTF */
|
||||
|
||||
|
||||
/* Tests for Unicode horizontal and vertical whitespace characters must check a
|
||||
number of different values. Using a switch statement for this generates the
|
||||
fastest code (no loop, no memory access), and there are several places in the
|
||||
interpreter code where this happens. In order to ensure that all the case lists
|
||||
remain in step, we use macros so that there is only one place where the lists
|
||||
are defined.
|
||||
|
||||
These values are also required as lists in pcre2_compile.c when processing \h,
|
||||
\H, \v and \V in a character class. The lists are defined in pcre2_tables.c,
|
||||
but macros that define the values are here so that all the definitions are
|
||||
together. The lists must be in ascending character order, terminated by
|
||||
NOTACHAR (which is 0xffffffff).
|
||||
|
||||
Any changes should ensure that the various macros are kept in step with each
|
||||
other. NOTE: The values also appear in pcre2_jit_compile.c. */
|
||||
|
||||
/* ------ ASCII/Unicode environments ------ */
|
||||
|
||||
#ifndef EBCDIC
|
||||
|
||||
#define HSPACE_LIST \
|
||||
CHAR_HT, CHAR_SPACE, 0xa0, \
|
||||
0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
|
||||
0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \
|
||||
NOTACHAR
|
||||
|
||||
#define HSPACE_MULTIBYTE_CASES \
|
||||
case 0x1680: /* OGHAM SPACE MARK */ \
|
||||
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ \
|
||||
case 0x2000: /* EN QUAD */ \
|
||||
case 0x2001: /* EM QUAD */ \
|
||||
case 0x2002: /* EN SPACE */ \
|
||||
case 0x2003: /* EM SPACE */ \
|
||||
case 0x2004: /* THREE-PER-EM SPACE */ \
|
||||
case 0x2005: /* FOUR-PER-EM SPACE */ \
|
||||
case 0x2006: /* SIX-PER-EM SPACE */ \
|
||||
case 0x2007: /* FIGURE SPACE */ \
|
||||
case 0x2008: /* PUNCTUATION SPACE */ \
|
||||
case 0x2009: /* THIN SPACE */ \
|
||||
case 0x200A: /* HAIR SPACE */ \
|
||||
case 0x202f: /* NARROW NO-BREAK SPACE */ \
|
||||
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ \
|
||||
case 0x3000 /* IDEOGRAPHIC SPACE */
|
||||
|
||||
#define HSPACE_BYTE_CASES \
|
||||
case CHAR_HT: \
|
||||
case CHAR_SPACE: \
|
||||
case 0xa0 /* NBSP */
|
||||
|
||||
#define HSPACE_CASES \
|
||||
HSPACE_BYTE_CASES: \
|
||||
HSPACE_MULTIBYTE_CASES
|
||||
|
||||
#define VSPACE_LIST \
|
||||
CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, 0x2028, 0x2029, NOTACHAR
|
||||
|
||||
#define VSPACE_MULTIBYTE_CASES \
|
||||
case 0x2028: /* LINE SEPARATOR */ \
|
||||
case 0x2029 /* PARAGRAPH SEPARATOR */
|
||||
|
||||
#define VSPACE_BYTE_CASES \
|
||||
case CHAR_LF: \
|
||||
case CHAR_VT: \
|
||||
case CHAR_FF: \
|
||||
case CHAR_CR: \
|
||||
case CHAR_NEL
|
||||
|
||||
#define VSPACE_CASES \
|
||||
VSPACE_BYTE_CASES: \
|
||||
VSPACE_MULTIBYTE_CASES
|
||||
|
||||
/* ------ EBCDIC environments ------ */
|
||||
|
||||
#else
|
||||
/* Like every other list here, this one must end with NOTACHAR so that code
scanning the list knows where it stops. */
#define HSPACE_LIST CHAR_HT, CHAR_SPACE, NOTACHAR
|
||||
|
||||
#define HSPACE_BYTE_CASES \
|
||||
case CHAR_HT: \
|
||||
case CHAR_SPACE
|
||||
|
||||
#define HSPACE_CASES HSPACE_BYTE_CASES
|
||||
|
||||
#ifdef EBCDIC_NL25
|
||||
#define VSPACE_LIST \
|
||||
CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, CHAR_LF, NOTACHAR
|
||||
#else
|
||||
#define VSPACE_LIST \
|
||||
CHAR_VT, CHAR_FF, CHAR_CR, CHAR_LF, CHAR_NEL, NOTACHAR
|
||||
#endif
|
||||
|
||||
#define VSPACE_BYTE_CASES \
|
||||
case CHAR_LF: \
|
||||
case CHAR_VT: \
|
||||
case CHAR_FF: \
|
||||
case CHAR_CR: \
|
||||
case CHAR_NEL
|
||||
|
||||
#define VSPACE_CASES VSPACE_BYTE_CASES
|
||||
#endif /* EBCDIC */
|
||||
|
||||
/* ------ End of whitespace macros ------ */
|
||||
|
||||
|
||||
/* Private flags containing information about the compiled pattern. The first
|
||||
three must not be changed, because whichever is set is actually the number of
|
||||
bytes in a code unit in that mode. */
|
||||
|
@ -801,7 +910,519 @@ only. */
|
|||
|
||||
/* -------------------- End of character and string names -------------------*/
|
||||
|
||||
/* Private structures that are mode-independent. */
|
||||
/* -------------------- Definitions for compiled patterns -------------------*/
|
||||
|
||||
/* Escape items that are just an encoding of a particular data value. */
|
||||
|
||||
#ifndef ESC_e
|
||||
#define ESC_e CHAR_ESC
|
||||
#endif
|
||||
|
||||
#ifndef ESC_f
|
||||
#define ESC_f CHAR_FF
|
||||
#endif
|
||||
|
||||
#ifndef ESC_n
|
||||
#define ESC_n CHAR_LF
|
||||
#endif
|
||||
|
||||
#ifndef ESC_r
|
||||
#define ESC_r CHAR_CR
|
||||
#endif
|
||||
|
||||
/* We can't officially use ESC_t because it is a POSIX reserved identifier
|
||||
(presumably because of all the others like size_t). */
|
||||
|
||||
#ifndef ESC_tee
|
||||
#define ESC_tee CHAR_HT
|
||||
#endif
|
||||
|
||||
/* Codes for different types of Unicode property */
|
||||
|
||||
#define PT_ANY 0 /* Any property - matches all chars */
|
||||
#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
|
||||
#define PT_GC 2 /* Specified general characteristic (e.g. L) */
|
||||
#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
|
||||
#define PT_SC 4 /* Script (e.g. Han) */
|
||||
#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
|
||||
#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */
|
||||
#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */
|
||||
#define PT_WORD 8 /* Word - L plus N plus underscore */
|
||||
#define PT_CLIST 9 /* Pseudo-property: match character list */
|
||||
#define PT_UCNC 10 /* Universal Character nameable character */
|
||||
#define PT_TABSIZE 11 /* Size of square table for autopossessify tests */
|
||||
|
||||
/* The following special properties are used only in XCLASS items, when POSIX
|
||||
classes are specified and PCRE_UCP is set - in other words, for Unicode
|
||||
handling of these classes. They are not available via the \p or \P escapes like
|
||||
those in the above list, and so they do not take part in the autopossessifying
|
||||
table. */
|
||||
|
||||
#define PT_PXGRAPH 11 /* [:graph:] - characters that mark the paper */
|
||||
#define PT_PXPRINT 12 /* [:print:] - [:graph:] plus non-control spaces */
|
||||
#define PT_PXPUNCT 13 /* [:punct:] - punctuation characters */
|
||||
|
||||
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
|
||||
contain characters with values greater than 255. */
|
||||
|
||||
#define XCL_NOT 0x01 /* Flag: this is a negative class */
|
||||
#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
|
||||
#define XCL_HASPROP 0x04 /* Flag: property checks are present. */
|
||||
|
||||
#define XCL_END 0 /* Marks end of individual items */
|
||||
#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
|
||||
#define XCL_RANGE 2 /* A range (two multibyte chars) follows */
|
||||
#define XCL_PROP 3 /* Unicode property (2-byte property code follows) */
|
||||
#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
|
||||
|
||||
/* These are escaped items that aren't just an encoding of a particular data
|
||||
value such as \n. They must have non-zero values, as check_escape() returns 0
|
||||
for a data character. Also, they must appear in the same order as in the
|
||||
opcode definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
|
||||
corresponds to "." in DOTALL mode rather than an escape sequence. It is also
|
||||
used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In
|
||||
non-DOTALL mode, "." behaves like \N.
|
||||
|
||||
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
|
||||
when PCRE_UCP is set and replacement of \d etc by \p sequences is required.
|
||||
They must be contiguous, and remain in order so that the replacements can be
|
||||
looked up from a table.
|
||||
|
||||
Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in
|
||||
check_escape(). There are two tests in the code for an escape
|
||||
greater than ESC_b and less than ESC_Z to detect the types that may be
|
||||
repeated. These are the types that consume characters. If any new escapes are
|
||||
put in between that don't consume a character, that code will have to change.
|
||||
*/
|
||||
|
||||
enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
|
||||
ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
|
||||
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
|
||||
ESC_E, ESC_Q, ESC_g, ESC_k,
|
||||
ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu };
|
||||
|
||||
|
||||
/********************** Opcode definitions ******************/
|
||||
|
||||
/****** NOTE NOTE NOTE ******
|
||||
|
||||
Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in
|
||||
order to the list of escapes immediately above. Furthermore, values up to
|
||||
OP_DOLLM must not be changed without adjusting the table called autoposstab in
|
||||
pcre_compile.c
|
||||
|
||||
Whenever this list is updated, the two macro definitions that follow must be
|
||||
updated to match. The possessification table called "opcode_possessify" in
|
||||
pcre_compile.c must also be updated, and also the tables called "coptable"
|
||||
and "poptable" in pcre_dfa_exec.c.
|
||||
|
||||
****** NOTE NOTE NOTE ******/
|
||||
|
||||
|
||||
/* The values between FIRST_AUTOTAB_OP and LAST_AUTOTAB_RIGHT_OP, inclusive,
|
||||
are used in a table for deciding whether a repeated character type can be
|
||||
auto-possessified. */
|
||||
|
||||
#define FIRST_AUTOTAB_OP OP_NOT_DIGIT
|
||||
#define LAST_AUTOTAB_LEFT_OP OP_EXTUNI
|
||||
#define LAST_AUTOTAB_RIGHT_OP OP_DOLLM
|
||||
|
||||
enum {
|
||||
OP_END, /* 0 End of pattern */
|
||||
|
||||
/* Values corresponding to backslashed metacharacters */
|
||||
|
||||
OP_SOD, /* 1 Start of data: \A */
|
||||
OP_SOM, /* 2 Start of match (subject + offset): \G */
|
||||
OP_SET_SOM, /* 3 Set start of match (\K) */
|
||||
OP_NOT_WORD_BOUNDARY, /* 4 \B */
|
||||
OP_WORD_BOUNDARY, /* 5 \b */
|
||||
OP_NOT_DIGIT, /* 6 \D */
|
||||
OP_DIGIT, /* 7 \d */
|
||||
OP_NOT_WHITESPACE, /* 8 \S */
|
||||
OP_WHITESPACE, /* 9 \s */
|
||||
OP_NOT_WORDCHAR, /* 10 \W */
|
||||
OP_WORDCHAR, /* 11 \w */
|
||||
|
||||
OP_ANY, /* 12 Match any character except newline (\N) */
|
||||
OP_ALLANY, /* 13 Match any character */
|
||||
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
|
||||
OP_NOTPROP, /* 15 \P (not Unicode property) */
|
||||
OP_PROP, /* 16 \p (Unicode property) */
|
||||
OP_ANYNL, /* 17 \R (any newline sequence) */
|
||||
OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */
|
||||
OP_HSPACE, /* 19 \h (horizontal whitespace) */
|
||||
OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */
|
||||
OP_VSPACE, /* 21 \v (vertical whitespace) */
|
||||
OP_EXTUNI, /* 22 \X (extended Unicode sequence */
|
||||
OP_EODN, /* 23 End of data or \n at end of data (\Z) */
|
||||
OP_EOD, /* 24 End of data (\z) */
|
||||
|
||||
/* Line end assertions */
|
||||
|
||||
OP_DOLL, /* 25 End of line - not multiline */
|
||||
OP_DOLLM, /* 26 End of line - multiline */
|
||||
OP_CIRC, /* 27 Start of line - not multiline */
|
||||
OP_CIRCM, /* 28 Start of line - multiline */
|
||||
|
||||
/* Single characters; caseful must precede the caseless ones */
|
||||
|
||||
OP_CHAR, /* 29 Match one character, casefully */
|
||||
OP_CHARI, /* 30 Match one character, caselessly */
|
||||
OP_NOT, /* 31 Match one character, not the given one, casefully */
|
||||
OP_NOTI, /* 32 Match one character, not the given one, caselessly */
|
||||
|
||||
/* The following sets of 13 opcodes must always be kept in step because
|
||||
the offset from the first one is used to generate the others. */
|
||||
|
||||
/* Repeated characters; caseful must precede the caseless ones */
|
||||
|
||||
OP_STAR, /* 33 The maximizing and minimizing versions of */
|
||||
OP_MINSTAR, /* 34 these six opcodes must come in pairs, with */
|
||||
OP_PLUS, /* 35 the minimizing one second. */
|
||||
OP_MINPLUS, /* 36 */
|
||||
OP_QUERY, /* 37 */
|
||||
OP_MINQUERY, /* 38 */
|
||||
|
||||
OP_UPTO, /* 39 From 0 to n matches of one character, caseful*/
|
||||
OP_MINUPTO, /* 40 */
|
||||
OP_EXACT, /* 41 Exactly n matches */
|
||||
|
||||
OP_POSSTAR, /* 42 Possessified star, caseful */
|
||||
OP_POSPLUS, /* 43 Possessified plus, caseful */
|
||||
OP_POSQUERY, /* 44 Posesssified query, caseful */
|
||||
OP_POSUPTO, /* 45 Possessified upto, caseful */
|
||||
|
||||
/* Repeated characters; caseless must follow the caseful ones */
|
||||
|
||||
OP_STARI, /* 46 */
|
||||
OP_MINSTARI, /* 47 */
|
||||
OP_PLUSI, /* 48 */
|
||||
OP_MINPLUSI, /* 49 */
|
||||
OP_QUERYI, /* 50 */
|
||||
OP_MINQUERYI, /* 51 */
|
||||
|
||||
OP_UPTOI, /* 52 From 0 to n matches of one character, caseless */
|
||||
OP_MINUPTOI, /* 53 */
|
||||
OP_EXACTI, /* 54 */
|
||||
|
||||
OP_POSSTARI, /* 55 Possessified star, caseless */
|
||||
OP_POSPLUSI, /* 56 Possessified plus, caseless */
|
||||
OP_POSQUERYI, /* 57 Posesssified query, caseless */
|
||||
OP_POSUPTOI, /* 58 Possessified upto, caseless */
|
||||
|
||||
/* The negated ones must follow the non-negated ones, and match them */
|
||||
/* Negated repeated character, caseful; must precede the caseless ones */
|
||||
|
||||
OP_NOTSTAR, /* 59 The maximizing and minimizing versions of */
|
||||
OP_NOTMINSTAR, /* 60 these six opcodes must come in pairs, with */
|
||||
OP_NOTPLUS, /* 61 the minimizing one second. They must be in */
|
||||
OP_NOTMINPLUS, /* 62 exactly the same order as those above. */
|
||||
OP_NOTQUERY, /* 63 */
|
||||
OP_NOTMINQUERY, /* 64 */
|
||||
|
||||
OP_NOTUPTO, /* 65 From 0 to n matches, caseful */
|
||||
OP_NOTMINUPTO, /* 66 */
|
||||
OP_NOTEXACT, /* 67 Exactly n matches */
|
||||
|
||||
OP_NOTPOSSTAR, /* 68 Possessified versions, caseful */
|
||||
OP_NOTPOSPLUS, /* 69 */
|
||||
OP_NOTPOSQUERY, /* 70 */
|
||||
OP_NOTPOSUPTO, /* 71 */
|
||||
|
||||
/* Negated repeated character, caseless; must follow the caseful ones */
|
||||
|
||||
OP_NOTSTARI, /* 72 */
|
||||
OP_NOTMINSTARI, /* 73 */
|
||||
OP_NOTPLUSI, /* 74 */
|
||||
OP_NOTMINPLUSI, /* 75 */
|
||||
OP_NOTQUERYI, /* 76 */
|
||||
OP_NOTMINQUERYI, /* 77 */
|
||||
|
||||
OP_NOTUPTOI, /* 78 From 0 to n matches, caseless */
|
||||
OP_NOTMINUPTOI, /* 79 */
|
||||
OP_NOTEXACTI, /* 80 Exactly n matches */
|
||||
|
||||
OP_NOTPOSSTARI, /* 81 Possessified versions, caseless */
|
||||
OP_NOTPOSPLUSI, /* 82 */
|
||||
OP_NOTPOSQUERYI, /* 83 */
|
||||
OP_NOTPOSUPTOI, /* 84 */
|
||||
|
||||
/* Character types */
|
||||
|
||||
OP_TYPESTAR, /* 85 The maximizing and minimizing versions of */
|
||||
OP_TYPEMINSTAR, /* 86 these six opcodes must come in pairs, with */
|
||||
OP_TYPEPLUS, /* 87 the minimizing one second. These codes must */
|
||||
OP_TYPEMINPLUS, /* 88 be in exactly the same order as those above. */
|
||||
OP_TYPEQUERY, /* 89 */
|
||||
OP_TYPEMINQUERY, /* 90 */
|
||||
|
||||
OP_TYPEUPTO, /* 91 From 0 to n matches */
|
||||
OP_TYPEMINUPTO, /* 92 */
|
||||
OP_TYPEEXACT, /* 93 Exactly n matches */
|
||||
|
||||
OP_TYPEPOSSTAR, /* 94 Possessified versions */
|
||||
OP_TYPEPOSPLUS, /* 95 */
|
||||
OP_TYPEPOSQUERY, /* 96 */
|
||||
OP_TYPEPOSUPTO, /* 97 */
|
||||
|
||||
/* These are used for character classes and back references; only the
|
||||
first six are the same as the sets above. */
|
||||
|
||||
OP_CRSTAR, /* 98 The maximizing and minimizing versions of */
|
||||
OP_CRMINSTAR, /* 99 all these opcodes must come in pairs, with */
|
||||
OP_CRPLUS, /* 100 the minimizing one second. These codes must */
|
||||
OP_CRMINPLUS, /* 101 be in exactly the same order as those above. */
|
||||
OP_CRQUERY, /* 102 */
|
||||
OP_CRMINQUERY, /* 103 */
|
||||
|
||||
OP_CRRANGE, /* 104 These are different to the three sets above. */
|
||||
OP_CRMINRANGE, /* 105 */
|
||||
|
||||
OP_CRPOSSTAR, /* 106 Possessified versions */
|
||||
OP_CRPOSPLUS, /* 107 */
|
||||
OP_CRPOSQUERY, /* 108 */
|
||||
OP_CRPOSRANGE, /* 109 */
|
||||
|
||||
/* End of quantifier opcodes */
|
||||
|
||||
OP_CLASS, /* 110 Match a character class, chars < 256 only */
|
||||
OP_NCLASS, /* 111 Same, but the bitmap was created from a negative
|
||||
class - the difference is relevant only when a
|
||||
character > 255 is encountered. */
|
||||
OP_XCLASS, /* 112 Extended class for handling > 255 chars within the
|
||||
class. This does both positive and negative. */
|
||||
OP_REF, /* 113 Match a back reference, casefully */
|
||||
OP_REFI, /* 114 Match a back reference, caselessly */
|
||||
OP_DNREF, /* 115 Match a duplicate name backref, casefully */
|
||||
OP_DNREFI, /* 116 Match a duplicate name backref, caselessly */
|
||||
OP_RECURSE, /* 117 Match a numbered subpattern (possibly recursive) */
|
||||
OP_CALLOUT, /* 118 Call out to external function if provided */
|
||||
|
||||
OP_ALT, /* 119 Start of alternation */
|
||||
OP_KET, /* 120 End of group that doesn't have an unbounded repeat */
|
||||
OP_KETRMAX, /* 121 These two must remain together and in this */
|
||||
OP_KETRMIN, /* 122 order. They are for groups that repeat for ever. */
|
||||
OP_KETRPOS, /* 123 Possessive unlimited repeat. */
|
||||
|
||||
/* The assertions must come before BRA, CBRA, ONCE, and COND, and the four
|
||||
asserts must remain in order. */
|
||||
|
||||
OP_REVERSE, /* 124 Move pointer back - used in lookbehind assertions */
|
||||
OP_ASSERT, /* 125 Positive lookahead */
|
||||
OP_ASSERT_NOT, /* 126 Negative lookahead */
|
||||
OP_ASSERTBACK, /* 127 Positive lookbehind */
|
||||
OP_ASSERTBACK_NOT, /* 128 Negative lookbehind */
|
||||
|
||||
/* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately
|
||||
after the assertions, with ONCE first, as there's a test for >= ONCE for a
|
||||
subpattern that isn't an assertion. The POS versions must immediately follow
|
||||
the non-POS versions in each case. */
|
||||
|
||||
OP_ONCE, /* 129 Atomic group, contains captures */
|
||||
OP_ONCE_NC, /* 130 Atomic group containing no captures */
|
||||
OP_BRA, /* 131 Start of non-capturing bracket */
|
||||
OP_BRAPOS, /* 132 Ditto, with unlimited, possessive repeat */
|
||||
OP_CBRA, /* 133 Start of capturing bracket */
|
||||
OP_CBRAPOS, /* 134 Ditto, with unlimited, possessive repeat */
|
||||
OP_COND, /* 135 Conditional group */
|
||||
|
||||
/* These five must follow the previous five, in the same order. There's a
|
||||
check for >= SBRA to distinguish the two sets. */
|
||||
|
||||
OP_SBRA, /* 136 Start of non-capturing bracket, check empty */
|
||||
OP_SBRAPOS, /* 137 Ditto, with unlimited, possessive repeat */
|
||||
OP_SCBRA, /* 138 Start of capturing bracket, check empty */
|
||||
OP_SCBRAPOS, /* 139 Ditto, with unlimited, possessive repeat */
|
||||
OP_SCOND, /* 140 Conditional group, check empty */
|
||||
|
||||
/* The next two pairs must (respectively) be kept together. */
|
||||
|
||||
OP_CREF, /* 141 Used to hold a capture number as condition */
|
||||
OP_DNCREF, /* 142 Used to point to duplicate names as a condition */
|
||||
OP_RREF, /* 143 Used to hold a recursion number as condition */
|
||||
OP_DNRREF, /* 144 Used to point to duplicate names as a condition */
|
||||
OP_DEF, /* 145 The DEFINE condition */
|
||||
|
||||
OP_BRAZERO, /* 146 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 147 order. */
|
||||
OP_BRAPOSZERO, /* 148 */
|
||||
|
||||
/* These are backtracking control verbs */
|
||||
|
||||
OP_MARK, /* 149 always has an argument */
|
||||
OP_PRUNE, /* 150 */
|
||||
OP_PRUNE_ARG, /* 151 same, but with argument */
|
||||
OP_SKIP, /* 152 */
|
||||
OP_SKIP_ARG, /* 153 same, but with argument */
|
||||
OP_THEN, /* 154 */
|
||||
OP_THEN_ARG, /* 155 same, but with argument */
|
||||
OP_COMMIT, /* 156 */
|
||||
|
||||
/* These are forced failure and success verbs */
|
||||
|
||||
OP_FAIL, /* 157 */
|
||||
OP_ACCEPT, /* 158 */
|
||||
OP_ASSERT_ACCEPT, /* 159 Used inside assertions */
|
||||
OP_CLOSE, /* 160 Used before OP_ACCEPT to close open captures */
|
||||
|
||||
/* This is used to skip a subpattern with a {0} quantifier */
|
||||
|
||||
OP_SKIPZERO, /* 161 */
|
||||
|
||||
/* This is not an opcode, but is used to check that tables indexed by opcode
|
||||
are the correct length, in order to catch updating errors - there have been
|
||||
some in the past. */
|
||||
|
||||
OP_TABLE_LENGTH
|
||||
};
|
||||
|
||||
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
|
||||
definitions that follow must also be updated to match. There are also tables
|
||||
called "opcode_possessify" in pcre_compile.c and "coptable" and "poptable" in
|
||||
pcre_dfa_exec.c that must be updated. */
|
||||
|
||||
|
||||
/* This macro defines textual names for all the opcodes. These are used only
|
||||
for debugging, and some of them are only partial names. The macro is referenced
|
||||
only in pcre2_printint.c, which fills out the full names in many cases (and in
|
||||
some cases doesn't actually use these names at all). */
|
||||
|
||||
#define OP_NAME_LIST \
|
||||
"End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \
|
||||
"\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \
|
||||
"notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \
|
||||
"extuni", "\\Z", "\\z", \
|
||||
"$", "$", "^", "^", "char", "chari", "not", "noti", \
|
||||
"*", "*?", "+", "+?", "?", "??", \
|
||||
"{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", \
|
||||
"{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", \
|
||||
"{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", \
|
||||
"{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
"class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \
|
||||
"Recurse", "Callout", \
|
||||
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \
|
||||
"Reverse", "Assert", "Assert not", "AssertB", "AssertB not", \
|
||||
"Once", "Once_NC", \
|
||||
"Bra", "BraPos", "CBra", "CBraPos", \
|
||||
"Cond", \
|
||||
"SBra", "SBraPos", "SCBra", "SCBraPos", \
|
||||
"SCond", \
|
||||
"Cond ref", "Cond dnref", "Cond rec", "Cond dnrec", "Cond def", \
|
||||
"Brazero", "Braminzero", "Braposzero", \
|
||||
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
|
||||
"*THEN", "*THEN", "*COMMIT", "*FAIL", \
|
||||
"*ACCEPT", "*ASSERT_ACCEPT", \
|
||||
"Close", "Skip zero"
|
||||
|
||||
|
||||
/* This macro defines the length of fixed length operations in the compiled
|
||||
regex. The lengths are used when searching for specific things, and also in the
|
||||
debugging printing of a compiled regex. We use a macro so that it can be
|
||||
defined close to the definitions of the opcodes themselves.
|
||||
|
||||
As things have been extended, some of these are no longer fixed lengths, but are
|
||||
minima instead. For example, the length of a single-character repeat may vary
|
||||
in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
|
||||
#define OP_LENGTHS \
|
||||
1, /* End */ \
|
||||
1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \
|
||||
1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \
|
||||
1, 1, 1, /* Any, AllAny, Anybyte */ \
|
||||
3, 3, /* \P, \p */ \
|
||||
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \
|
||||
1, /* \X */ \
|
||||
1, 1, 1, 1, 1, 1, /* \Z, \z, $, $M ^, ^M */ \
|
||||
2, /* Char - the minimum length */ \
|
||||
2, /* Chari - the minimum length */ \
|
||||
2, /* not */ \
|
||||
2, /* noti */ \
|
||||
/* Positive single-char repeats ** These are */ \
|
||||
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
|
||||
2+IMM2_SIZE, 2+IMM2_SIZE, /* upto, minupto ** mode */ \
|
||||
2+IMM2_SIZE, /* exact */ \
|
||||
2, 2, 2, 2+IMM2_SIZE, /* *+, ++, ?+, upto+ */ \
|
||||
2, 2, 2, 2, 2, 2, /* *I, *?I, +I, +?I, ?I, ??I ** UTF-8 */ \
|
||||
2+IMM2_SIZE, 2+IMM2_SIZE, /* upto I, minupto I */ \
|
||||
2+IMM2_SIZE, /* exact I */ \
|
||||
2, 2, 2, 2+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ \
|
||||
/* Negative single-char repeats - only for chars < 256 */ \
|
||||
2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
|
||||
2+IMM2_SIZE, 2+IMM2_SIZE, /* NOT upto, minupto */ \
|
||||
2+IMM2_SIZE, /* NOT exact */ \
|
||||
2, 2, 2, 2+IMM2_SIZE, /* Possessive NOT *, +, ?, upto */ \
|
||||
2, 2, 2, 2, 2, 2, /* NOT *I, *?I, +I, +?I, ?I, ??I */ \
|
||||
2+IMM2_SIZE, 2+IMM2_SIZE, /* NOT upto I, minupto I */ \
|
||||
2+IMM2_SIZE, /* NOT exact I */ \
|
||||
2, 2, 2, 2+IMM2_SIZE, /* Possessive NOT *I, +I, ?I, upto I */ \
|
||||
/* Positive type repeats */ \
|
||||
2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
|
||||
2+IMM2_SIZE, 2+IMM2_SIZE, /* Type upto, minupto */ \
|
||||
2+IMM2_SIZE, /* Type exact */ \
|
||||
2, 2, 2, 2+IMM2_SIZE, /* Possessive *+, ++, ?+, upto+ */ \
|
||||
/* Character class & ref repeats */ \
|
||||
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
|
||||
1+2*IMM2_SIZE, 1+2*IMM2_SIZE, /* CRRANGE, CRMINRANGE */ \
|
||||
1, 1, 1, 1+2*IMM2_SIZE, /* Possessive *+, ++, ?+, CRPOSRANGE */ \
|
||||
1+(32/sizeof(PCRE2_UCHAR)), /* CLASS */ \
|
||||
1+(32/sizeof(PCRE2_UCHAR)), /* NCLASS */ \
|
||||
0, /* XCLASS - variable length */ \
|
||||
1+IMM2_SIZE, /* REF */ \
|
||||
1+IMM2_SIZE, /* REFI */ \
|
||||
1+2*IMM2_SIZE, /* DNREF */ \
|
||||
1+2*IMM2_SIZE, /* DNREFI */ \
|
||||
1+LINK_SIZE, /* RECURSE */ \
|
||||
2+2*LINK_SIZE, /* CALLOUT */ \
|
||||
1+LINK_SIZE, /* Alt */ \
|
||||
1+LINK_SIZE, /* Ket */ \
|
||||
1+LINK_SIZE, /* KetRmax */ \
|
||||
1+LINK_SIZE, /* KetRmin */ \
|
||||
1+LINK_SIZE, /* KetRpos */ \
|
||||
1+LINK_SIZE, /* Reverse */ \
|
||||
1+LINK_SIZE, /* Assert */ \
|
||||
1+LINK_SIZE, /* Assert not */ \
|
||||
1+LINK_SIZE, /* Assert behind */ \
|
||||
1+LINK_SIZE, /* Assert behind not */ \
|
||||
1+LINK_SIZE, /* ONCE */ \
|
||||
1+LINK_SIZE, /* ONCE_NC */ \
|
||||
1+LINK_SIZE, /* BRA */ \
|
||||
1+LINK_SIZE, /* BRAPOS */ \
|
||||
1+LINK_SIZE+IMM2_SIZE, /* CBRA */ \
|
||||
1+LINK_SIZE+IMM2_SIZE, /* CBRAPOS */ \
|
||||
1+LINK_SIZE, /* COND */ \
|
||||
1+LINK_SIZE, /* SBRA */ \
|
||||
1+LINK_SIZE, /* SBRAPOS */ \
|
||||
1+LINK_SIZE+IMM2_SIZE, /* SCBRA */ \
|
||||
1+LINK_SIZE+IMM2_SIZE, /* SCBRAPOS */ \
|
||||
1+LINK_SIZE, /* SCOND */ \
|
||||
1+IMM2_SIZE, 1+2*IMM2_SIZE, /* CREF, DNCREF */ \
|
||||
1+IMM2_SIZE, 1+2*IMM2_SIZE, /* RREF, DNRREF */ \
|
||||
1, /* DEF */ \
|
||||
1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \
|
||||
3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
|
||||
1, 3, /* SKIP, SKIP_ARG */ \
|
||||
1, 3, /* THEN, THEN_ARG */ \
|
||||
1, 1, 1, 1, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ \
|
||||
1+IMM2_SIZE, 1 /* CLOSE, SKIPZERO */
|
||||
|
||||
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
|
||||
|
||||
#define RREF_ANY 0xffff
|
||||
|
||||
|
||||
/* ---------- Private structures that are mode-independent. ---------- */
|
||||
|
||||
/* Structure to hold data for custom memory management. */
|
||||
|
||||
|
@ -811,15 +1432,64 @@ typedef struct pcre2_memctl {
|
|||
void *memory_data;
|
||||
} pcre2_memctl;
|
||||
|
||||
/* The other private structures used by PCRE are defined in a separate file.
|
||||
/* Layout of the UCP type table that translates property names into types and
|
||||
codes. Each entry used to point directly to a name, but to reduce the number of
|
||||
relocations in shared libraries, it now has an offset into a single string
|
||||
instead. */
|
||||
|
||||
typedef struct {
|
||||
uint16_t name_offset;
|
||||
uint16_t type;
|
||||
uint16_t value;
|
||||
} ucp_type_table;
|
||||
|
||||
/* Unicode character database (UCD) */
|
||||
|
||||
typedef struct {
|
||||
uint8_t script; /* ucp_Arabic, etc. */
|
||||
uint8_t chartype; /* ucp_Cc, etc. (general categories) */
|
||||
uint8_t gbprop; /* ucp_gbControl, etc. (grapheme break property) */
|
||||
uint8_t caseset; /* offset to multichar other cases or zero */
|
||||
int32_t other_case; /* offset to other case, or zero if none */
|
||||
} ucd_record;
|
||||
|
||||
extern const uint32_t PRIV(ucd_caseless_sets)[];
|
||||
extern const ucd_record PRIV(ucd_records)[];
|
||||
extern const uint8_t PRIV(ucd_stage1)[];
|
||||
extern const uint16_t PRIV(ucd_stage2)[];
|
||||
extern const uint32_t PRIV(ucp_gentype)[];
|
||||
extern const uint32_t PRIV(ucp_gbtable)[];
|
||||
#ifdef SUPPORT_JIT
|
||||
extern const int PRIV(ucp_typerange)[];
|
||||
#endif
|
||||
|
||||
/* UCD access macros */
|
||||
|
||||
#define UCD_BLOCK_SIZE 128
|
||||
#define GET_UCD(ch) (PRIV(ucd_records) + \
|
||||
PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \
|
||||
UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE])
|
||||
|
||||
#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
|
||||
#define UCD_SCRIPT(ch) GET_UCD(ch)->script
|
||||
#define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
|
||||
#define UCD_GRAPHBREAK(ch) GET_UCD(ch)->gbprop
|
||||
#define UCD_CASESET(ch) GET_UCD(ch)->caseset
|
||||
/* Other-case code point for ch: ch plus the signed other_case offset held in
its UCD record. The argument is parenthesized under the cast so that an
expression argument is converted as a whole. */
#define UCD_OTHERCASE(ch) ((uint32_t)((int)(ch) + (int)(GET_UCD(ch)->other_case)))
|
||||
|
||||
|
||||
|
||||
/* ----------------- Items that need PCRE2_CODE_UNIT_WIDTH ----------------- */
|
||||
|
||||
#ifdef PCRE2_CODE_UNIT_WIDTH
|
||||
|
||||
/* Mode-dependent macros and private structures are defined in a separate file.
|
||||
When compiling the library, PCRE2_CODE_UNIT_WIDTH will be defined, and we
|
||||
include them at the appropriate width. When compiling pcre2test, however, that
|
||||
macro is not set at this point because pcre2test needs to include them at all
|
||||
supported widths. */
|
||||
|
||||
#ifdef PCRE2_CODE_UNIT_WIDTH
|
||||
#include "pcre2_intstructs.h"
|
||||
#endif
|
||||
#include "pcre2_intmodedep.h"
|
||||
|
||||
/* Internal shared functions. These are functions that are used by more than
|
||||
one of the library's exported public functions. They have to be "external" in
|
||||
|
@ -827,14 +1497,15 @@ the C sense, but are not part of the PCRE public API. They are not referenced
|
|||
from pcre2test, and must not be defined when no code unit width is available.
|
||||
*/
|
||||
|
||||
#ifdef PCRE2_CODE_UNIT_WIDTH
|
||||
#define _pcre2_compile_context_init PCRE2_SUFFIX(_pcre2_compile_context_init_)
|
||||
#define _pcre2_match_context_init PCRE2_SUFFIX(_pcre2_match_context_init_)
|
||||
#define _pcre2_memctl_malloc PCRE2_SUFFIX(_pcre2_memctl_malloc_)
|
||||
#define _pcre2_strcmp PCRE2_SUFFIX(_pcre_strcmp_)
|
||||
|
||||
extern void _pcre2_compile_context_init(pcre2_compile_context *, BOOL);
|
||||
extern void _pcre2_match_context_init(pcre2_match_context *, BOOL);
|
||||
extern void *_pcre2_memctl_malloc(size_t, size_t, pcre2_general_context *);
|
||||
extern void *_pcre2_memctl_malloc(size_t, size_t, pcre2_memctl *);
|
||||
extern int _pcre2_strcmp(PCRE2_SPTR, PCRE2_SPTR);
|
||||
#endif
|
||||
|
||||
/* End of pcre2_internal.h */
|
||||
|
|
|
@ -0,0 +1,258 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2014 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains mode-dependent macro and structure definitions. The
|
||||
file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
|
||||
These mode-dependent items are kept in a separate file so that they can also be
|
||||
#included multiple times for different code unit widths by pcre2test. Start by
|
||||
undefining all the new macros defined herein so that they can be redefined for
|
||||
multiple inclusions. */
|
||||
|
||||
#undef CU2BYTES
|
||||
#undef GET
|
||||
#undef GET2
|
||||
#undef IMM2_SIZE
|
||||
#undef MAX_PATTERN_SIZE
|
||||
#undef PUT
|
||||
#undef PUT2
|
||||
#undef PUTINC
|
||||
|
||||
|
||||
/* ---------------------------MACROS ----------------------------- */
|
||||
|
||||
/* PCRE keeps offsets in its compiled code as at least 16-bit quantities
|
||||
(always stored in big-endian order in 8-bit mode) by default. These are used,
|
||||
for example, to link from the start of a subpattern to its alternatives and its
|
||||
end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
|
||||
to around 64K, which is big enough for almost everybody. However, I received a
|
||||
request for an even bigger limit. For this reason, and also to make the code
|
||||
easier to maintain, the storing and loading of offsets from the compiled code
|
||||
unit string is now handled by the macros that are defined here.
|
||||
|
||||
The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
|
||||
values of 3 or 4 are also supported. */
|
||||
|
||||
|
||||
/* ------------------- 8-bit support ------------------ */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
|
||||
#if LINK_SIZE == 2
|
||||
#define PUT(a,n,d) \
|
||||
(a[n] = (d) >> 8), \
|
||||
(a[(n)+1] = (d) & 255)
|
||||
#define GET(a,n) \
|
||||
(((a)[n] << 8) | (a)[(n)+1])
|
||||
#define MAX_PATTERN_SIZE (1 << 16)
|
||||
|
||||
#elif LINK_SIZE == 3
|
||||
#define PUT(a,n,d) \
|
||||
(a[n] = (d) >> 16), \
|
||||
(a[(n)+1] = (d) >> 8), \
|
||||
(a[(n)+2] = (d) & 255)
|
||||
#define GET(a,n) \
|
||||
(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
|
||||
#define MAX_PATTERN_SIZE (1 << 24)
|
||||
|
||||
#elif LINK_SIZE == 4
|
||||
#define PUT(a,n,d) \
|
||||
(a[n] = (d) >> 24), \
|
||||
(a[(n)+1] = (d) >> 16), \
|
||||
(a[(n)+2] = (d) >> 8), \
|
||||
(a[(n)+3] = (d) & 255)
|
||||
#define GET(a,n) \
|
||||
(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
|
||||
#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
|
||||
|
||||
#else
|
||||
#error LINK_SIZE must be either 2, 3, or 4
|
||||
#endif
|
||||
|
||||
|
||||
/* ------------------- 16-bit support ------------------ */
|
||||
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||
|
||||
#if LINK_SIZE == 2
|
||||
#undef LINK_SIZE
|
||||
#define LINK_SIZE 1
|
||||
#define PUT(a,n,d) \
|
||||
(a[n] = (d))
|
||||
#define GET(a,n) \
|
||||
(a[n])
|
||||
#define MAX_PATTERN_SIZE (1 << 16)
|
||||
|
||||
#elif LINK_SIZE == 3 || LINK_SIZE == 4
|
||||
#undef LINK_SIZE
|
||||
#define LINK_SIZE 2
|
||||
#define PUT(a,n,d) \
|
||||
(a[n] = (d) >> 16), \
|
||||
(a[(n)+1] = (d) & 65535)
|
||||
#define GET(a,n) \
|
||||
(((a)[n] << 16) | (a)[(n)+1])
|
||||
#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
|
||||
|
||||
#else
|
||||
#error LINK_SIZE must be either 2, 3, or 4
|
||||
#endif
|
||||
|
||||
|
||||
/* ------------------- 32-bit support ------------------ */
|
||||
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 32
|
||||
#undef LINK_SIZE
|
||||
#define LINK_SIZE 1
|
||||
#define PUT(a,n,d) \
|
||||
(a[n] = (d))
|
||||
#define GET(a,n) \
|
||||
(a[n])
|
||||
#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */
|
||||
|
||||
#else
|
||||
#error Unsupported compiling mode
|
||||
#endif
|
||||
|
||||
/* -------------------------------------------------------*/
|
||||
|
||||
|
||||
/* PCRE uses some other (at least) 16-bit quantities that do not change when
|
||||
the size of offsets changes. These are used for repeat counts and for other
|
||||
things such as capturing parenthesis numbers in back references.
|
||||
|
||||
Define the number of code units required to hold a 16-bit count/offset, and
|
||||
macros to load and store such a value. For reasons that I do not understand,
|
||||
the expression in the 8-bit GET2 macro is treated by gcc as a signed
|
||||
expression, even when a is declared as unsigned. It seems that any kind of
|
||||
arithmetic results in a signed value. Hence the cast. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
#define IMM2_SIZE 2
|
||||
#define GET2(a,n) (unsigned int)(((a)[n] << 8) | (a)[(n)+1])
|
||||
#define PUT2(a,n,d) { a[n] = (d) >> 8; a[(n)+1] = (d) & 255; }
|
||||
|
||||
#else /* Code units are 16 or 32 bits */
|
||||
#define IMM2_SIZE 1
|
||||
#define GET2(a,n) a[n]
|
||||
#define PUT2(a,n,d) a[n] = d
|
||||
#endif
|
||||
|
||||
|
||||
/* Mode-dependent macros that have the same definition in all modes. */
|
||||
|
||||
/* Convert a count of code units into a count of bytes. The whole expansion is
parenthesized so that it remains a single operand inside larger expressions
(for example on the right of a division). */
#define CU2BYTES(x) ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))
|
||||
#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE
|
||||
|
||||
|
||||
/* --------------------------- STRUCTURES ----------------------------- */
|
||||
|
||||
/* The real general context structure. At present it holds only data for custom
|
||||
memory control. */
|
||||
|
||||
typedef struct pcre2_real_general_context {
|
||||
pcre2_memctl memctl;
|
||||
} pcre2_real_general_context;
|
||||
|
||||
/* The real compile context structure */
|
||||
|
||||
typedef struct pcre2_real_compile_context {
|
||||
pcre2_memctl memctl;
|
||||
int (*stack_guard)(uint32_t);
|
||||
const unsigned char *tables;
|
||||
uint16_t bsr_convention;
|
||||
uint16_t newline_convention;
|
||||
uint32_t parens_nest_limit;
|
||||
} pcre2_real_compile_context;
|
||||
|
||||
/* The real match context structure. */
|
||||
|
||||
typedef struct pcre2_real_match_context {
|
||||
pcre2_memctl memctl;
|
||||
#ifdef NO_RECURSE
|
||||
void * (*stack_malloc)(size_t, void *);
|
||||
void (*stack_free)(void *, void *);
|
||||
#endif
|
||||
int (*callout)(pcre2_callout_block *, void *);
|
||||
uint32_t match_limit;
|
||||
uint32_t recursion_limit;
|
||||
} pcre2_real_match_context;
|
||||
|
||||
/* The real compiled code structure */
|
||||
|
||||
typedef struct pcre2_real_code {
|
||||
pcre2_memctl memctl;
|
||||
void *executable_jit; /* Pointer to JIT code */
|
||||
uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */
|
||||
uint32_t magic_number; /* Paranoid and endianness check */
|
||||
uint32_t size; /* Total (bytes) that was malloc-ed */
|
||||
uint32_t compile_options; /* Options passed to pcre2_compile() */
|
||||
uint32_t pattern_options; /* Options taken from the pattern */
|
||||
uint32_t flags; /* Various state flags */
|
||||
uint32_t limit_match; /* Limit set in the pattern */
|
||||
uint32_t limit_recursion; /* Limit set in the pattern */
|
||||
uint32_t first_codeunit; /* Starting code unit */
|
||||
uint32_t last_codeunit; /* This codeunit must be seen */
|
||||
uint16_t bsr_convention; /* What \R matches */
|
||||
uint16_t newline_convention; /* What is a newline? */
|
||||
uint16_t max_lookbehind; /* Longest lookbehind (characters) */
|
||||
uint16_t minlength; /* Minimum length of match */
|
||||
uint16_t top_bracket; /* Highest numbered group */
|
||||
uint16_t top_backref; /* Highest numbered back reference */
|
||||
uint16_t name_entry_size; /* Size (code units) of table entries */
|
||||
uint16_t name_count; /* Number of name entries in the table */
|
||||
} pcre2_real_code;
|
||||
|
||||
/* The real match data structure. */
|
||||
|
||||
typedef struct pcre2_real_match_data {
|
||||
pcre2_memctl memctl;
|
||||
const pcre2_real_code *code; /* The pattern used for the match */
|
||||
PCRE2_SPTR subject; /* The subject that was matched */
|
||||
int rc; /* The return code from the match */
|
||||
int utf_reason; /* Reason code for bad UTF */
|
||||
size_t leftchar; /* Offset to leftmost code unit */
|
||||
size_t rightchar; /* Offset to rightmost code unit */
|
||||
size_t startchar; /* Offset to starting code unit */
|
||||
PCRE2_SPTR mark; /* Pointer to last mark */
|
||||
uint16_t oveccount; /* Number of pairs */
|
||||
size_t ovector[1]; /* The first field */
|
||||
} pcre2_real_match_data;
|
||||
|
||||
/* End of pcre2_intmodedep.h */
|
|
@ -1,114 +0,0 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2014 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains the private mode-dependent structures needed by
|
||||
pcre2_internal.h. They are kept separate so that they can be #included multiple
|
||||
times for different code unit widths by pcre2test. */
|
||||
|
||||
/* The real general context structure. At present it holds only data for custom
|
||||
memory control. */
|
||||
|
||||
typedef struct pcre2_real_general_context {
|
||||
pcre2_memctl memctl;
|
||||
} pcre2_real_general_context;
|
||||
|
||||
/* The real compile context structure */
|
||||
|
||||
typedef struct pcre2_real_compile_context {
|
||||
pcre2_memctl memctl;
|
||||
int (*stack_guard)(uint32_t);
|
||||
const unsigned char *tables;
|
||||
uint16_t bsr_convention;
|
||||
uint16_t newline_convention;
|
||||
uint32_t parens_nest_limit;
|
||||
} pcre2_real_compile_context;
|
||||
|
||||
/* The real match context structure. */
|
||||
|
||||
typedef struct pcre2_real_match_context {
|
||||
pcre2_memctl memctl;
|
||||
#ifdef NO_RECURSE
|
||||
void * (*stack_malloc)(size_t, void *);
|
||||
void (*stack_free)(void *, void *);
|
||||
#endif
|
||||
int (*callout)(pcre2_callout_block *, void *);
|
||||
uint32_t match_limit;
|
||||
uint32_t recursion_limit;
|
||||
} pcre2_real_match_context;
|
||||
|
||||
/* The real match data structure. */
|
||||
|
||||
typedef struct pcre2_real_match_data {
|
||||
pcre2_memctl memctl;
|
||||
size_t leftchar; /* Offset to leftmost code unit */
|
||||
size_t rightchar; /* Offset to rightmost code unit */
|
||||
size_t startchar; /* Offset to starting code unit */
|
||||
PCRE2_SPTR mark; /* Pointer to last mark */
|
||||
uint16_t oveccount; /* Number of pairs */
|
||||
size_t ovector[1]; /* The first field */
|
||||
} pcre2_real_match_data;
|
||||
|
||||
/* The real compiled code structure */
|
||||
|
||||
typedef struct pcre2_real_code {
|
||||
pcre2_memctl memctl;
|
||||
void *executable_jit; /* Pointer to JIT code */
|
||||
uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */
|
||||
uint32_t magic_number; /* Paranoid and endianness check */
|
||||
uint32_t size; /* Total that was malloc-ed */
|
||||
uint32_t compile_options; /* Options passed to pcre2_compile() */
|
||||
uint32_t pattern_options; /* Options taken from the pattern */
|
||||
uint32_t flags; /* Various state flags */
|
||||
uint32_t limit_match; /* Limit set in the pattern */
|
||||
uint32_t limit_recursion; /* Limit set in the pattern */
|
||||
uint32_t first_codeunit; /* Starting code unit */
|
||||
uint32_t last_codeunit; /* This codeunit must be seen */
|
||||
uint16_t bsr_convention; /* What \R matches */
|
||||
uint16_t newline_convention; /* What is a newline? */
|
||||
uint16_t max_lookbehind; /* Longest lookbehind (characters) */
|
||||
uint16_t minlength; /* Minimum length of match */
|
||||
uint16_t top_bracket; /* Highest numbered group */
|
||||
uint16_t top_backref; /* Highest numbered back reference */
|
||||
uint16_t name_table_offset; /* Offset to name table that follows */
|
||||
uint16_t name_entry_size; /* Size of name items in the table */
|
||||
uint16_t name_count; /* Number of name entries in the table */
|
||||
} pcre2_real_code;
|
||||
|
||||
/* End of pcre2_intstructs.h */
|
|
@ -76,11 +76,19 @@ pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, int length,
|
|||
size_t start_offset, uint32_t options, pcre2_match_data *match_data,
|
||||
pcre2_match_context *mcontext)
|
||||
{
|
||||
int rc = PCRE2_ERROR_NOMATCH;
|
||||
|
||||
/* Fudge for testing pcre2test */
|
||||
mcontext=mcontext;length=length;
|
||||
options=options;
|
||||
|
||||
if (subject[start_offset] == 'Y')
|
||||
|
||||
/* Fudges for testing pcre2test */
|
||||
|
||||
if (subject[0] == 'Y')
|
||||
{
|
||||
rc = 0;
|
||||
match_data->code = code;
|
||||
match_data->subject = subject;
|
||||
match_data->leftchar = 0;
|
||||
match_data->rightchar = 3;
|
||||
match_data->startchar = 0;
|
||||
|
@ -88,24 +96,51 @@ if (subject[start_offset] == 'Y')
|
|||
|
||||
switch (match_data->oveccount)
|
||||
{
|
||||
case 0: return 0;
|
||||
case 0: break;
|
||||
|
||||
case 1: match_data->ovector[0] = start_offset;
|
||||
match_data->ovector[1] = start_offset + 4;
|
||||
return 0;
|
||||
break;
|
||||
|
||||
default: match_data->ovector[0] = start_offset;
|
||||
default:
|
||||
case 6: match_data->ovector[10] = PCRE2_UNSET;
|
||||
match_data->ovector[11] = PCRE2_UNSET;
|
||||
|
||||
case 5: match_data->ovector[8] = PCRE2_UNSET;
|
||||
match_data->ovector[9] = PCRE2_UNSET;
|
||||
|
||||
case 4: match_data->ovector[6] = start_offset + 3;
|
||||
match_data->ovector[7] = start_offset + 4;
|
||||
rc += 2;
|
||||
|
||||
case 3: match_data->ovector[4] = PCRE2_UNSET;
|
||||
match_data->ovector[5] = PCRE2_UNSET;
|
||||
|
||||
case 2: match_data->ovector[0] = start_offset;
|
||||
match_data->ovector[1] = start_offset + 4;
|
||||
match_data->ovector[2] = start_offset + 1;
|
||||
match_data->ovector[3] = start_offset + 3;
|
||||
return 2;
|
||||
match_data->mark = subject;
|
||||
rc += 2;
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
else if (subject[0] == 'P')
|
||||
{
|
||||
rc = PCRE2_ERROR_PARTIAL;
|
||||
match_data->code = code;
|
||||
match_data->subject = subject;
|
||||
match_data->leftchar = 0;
|
||||
match_data->rightchar = length;
|
||||
match_data->startchar = 1;
|
||||
match_data->mark = NULL;
|
||||
}
|
||||
|
||||
|
||||
mcontext=mcontext;code=code;subject=subject;length=length;
|
||||
start_offset=start_offset; options=options; match_data=match_data;
|
||||
return PCRE2_ERROR_NOMATCH;
|
||||
match_data->rc = rc;
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* End of pcre2_match.c */
|
||||
|
|
|
@ -56,7 +56,7 @@ pcre2_match_data_create(size_t oveccount, pcre2_general_context *gcontext)
|
|||
{
|
||||
pcre2_match_data *yield = PRIV(memctl_malloc)(
|
||||
sizeof(pcre2_match_data) + 3*oveccount*sizeof(size_t),
|
||||
offsetof(pcre2_real_match_data, memctl), gcontext);
|
||||
offsetof(pcre2_real_match_data, memctl), &(gcontext->memctl));
|
||||
yield->oveccount = oveccount;
|
||||
return yield;
|
||||
}
|
||||
|
|
|
@ -167,7 +167,7 @@ switch(what)
|
|||
break;
|
||||
|
||||
case PCRE2_INFO_NAMETABLE:
|
||||
*((PCRE2_SPTR*)where) = (PCRE2_SPTR)re + re->name_table_offset;
|
||||
*((PCRE2_SPTR*)where) = (PCRE2_SPTR)((char *)re + sizeof(pcre2_real_code));
|
||||
break;
|
||||
|
||||
case PCRE2_INFO_NEWLINE_CONVENTION:
|
||||
|
|
|
@ -1 +1,787 @@
|
|||
/* This is a placeholder for pcre2_printint.c */
|
||||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2014 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains a PCRE private debugging function for printing out the
|
||||
internal form of a compiled regular expression, along with some supporting
|
||||
local functions. This source file is #included in pcre2test.c at each supported
|
||||
code unit width, with PCRE2_SUFFIX set appropriately, just like the functions
|
||||
that comprise the library. */
|
||||
|
||||
|
||||
/* Tables of operator names. The same 8-bit table is used for all code unit
|
||||
widths, so it must be defined only once. The list itself is defined in
|
||||
pcre2_internal.h, which is #included by pcre2test before this file. */
|
||||
|
||||
#ifndef OP_LISTS_DEFINED
|
||||
static const char *OP_names[] = { OP_NAME_LIST };
|
||||
#define OP_LISTS_DEFINED
|
||||
#endif
|
||||
|
||||
/* The functions and tables herein must all have mode-dependent names. */
|
||||
|
||||
#define OP_lengths PCRE2_SUFFIX(OP_lengths_)
|
||||
#define get_ucpname PCRE2_SUFFIX(get_ucpname_)
|
||||
#define pcre2_printint PCRE2_SUFFIX(pcre2_printint_)
|
||||
#define print_char PCRE2_SUFFIX(print_char_)
|
||||
#define print_custring PCRE2_SUFFIX(print_custring_)
|
||||
#define print_prop PCRE2_SUFFIX(print_prop_)
|
||||
|
||||
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
|
||||
the definition is next to the definition of the opcodes in pcre2_internal.h.
|
||||
The contents of the table are, however, mode-dependent. */
|
||||
|
||||
static const uint8_t OP_lengths[] = { OP_LENGTHS };
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print one character from a string *
|
||||
*************************************************/
|
||||
|
||||
/* In UTF mode the character may occupy more than one code unit.
|
||||
|
||||
Arguments:
|
||||
f file to write to
|
||||
ptr pointer to first code unit of the character
|
||||
utf TRUE if string is UTF (will be FALSE if UTF is not supported)
|
||||
|
||||
Returns: number of additional code units used
|
||||
*/
|
||||
|
||||
static unsigned int
|
||||
print_char(FILE *f, PCRE2_SPTR ptr, BOOL utf)
|
||||
{
|
||||
uint32_t c = *ptr;
|
||||
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 8
|
||||
int a, i, s;
|
||||
#endif
|
||||
|
||||
/* If UTF is supported and requested, check for a one-code-unit character. The
|
||||
16-bit and 32-bit tests are for malformed UTF, and should only trigger if the
|
||||
sanity check is turned off. */
|
||||
|
||||
#ifdef SUPPORT_UTF
|
||||
if (utf)
|
||||
{
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
utf = (c & 0xc0) == 0xc0;
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||
utf = (c & 0xfc00) == 0xd800;
|
||||
#else
|
||||
utf = (c & 0xfffff800u) != 0xd800u;
|
||||
#endif
|
||||
}
|
||||
#endif /* SUPPORT_UTF */
|
||||
|
||||
/* Handle a one-code-unit character at any width. */
|
||||
|
||||
if (!utf)
|
||||
{
|
||||
if (PRINTABLE(c)) fprintf(f, "%c", (char)c);
|
||||
else if (c < 0x80) fprintf(f, "\\x%02x", c);
|
||||
else fprintf(f, "\\x{%02x}", c);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Per-width code for handling non-one-code-unit UTF characters. */
|
||||
|
||||
#ifdef SUPPORT_UTF
|
||||
|
||||
/* Handle a multi-byte UTF-8 character. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
a = utf8_table4[c & 0x3f]; /* Number of additional bytes */
|
||||
s = 6*a;
|
||||
c = (c & utf8_table3[a]) << s;
|
||||
for (i = 1; i <= a; i++)
|
||||
{
|
||||
/* This is a check for malformed UTF-8; it should only occur if the sanity
|
||||
check has been turned off. Rather than swallow random bytes, just stop if
|
||||
we hit a bad one. Print it with \X instead of \x as an indication. */
|
||||
|
||||
if ((ptr[i] & 0xc0) != 0x80)
|
||||
{
|
||||
fprintf(f, "\\X{%x}", c);
|
||||
return i - 1;
|
||||
}
|
||||
|
||||
/* The byte is OK */
|
||||
|
||||
s -= 6;
|
||||
c |= (ptr[i] & 0x3f) << s;
|
||||
}
|
||||
fprintf(f, "\\x{%x}", c);
|
||||
return a;
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
|
||||
|
||||
/* Handle a multi-code-unit UTF-16 character, starting with a check for
|
||||
malformed UTF-16; it should only occur if the sanity check has been turned off.
|
||||
Rather than swallow a low surrogate, just stop if we hit a bad one. Print it
|
||||
with \X instead of \x as an indication. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 16
|
||||
if ((ptr[1] & 0xfc00) != 0xdc00)
|
||||
{
|
||||
fprintf(f, "\\X{%x}", c);
|
||||
return 0;
|
||||
}
|
||||
c = (((c & 0x3ff) << 10) | (ptr[1] & 0x3ff)) + 0x10000;
|
||||
fprintf(f, "\\x{%x}", c);
|
||||
return 1;
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == 16 */
|
||||
|
||||
/* For UTF-32 we get here only for a malformed code unit, which should only
|
||||
occur if the sanity check has been turned off. Print it with \X instead of \x
|
||||
as an indication. */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
fprintf(f, "\\X{%x}", c);
|
||||
return 0;
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH == 16 */
|
||||
#endif /* SUPPORT_UTF */
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print string as a list of code units *
|
||||
*************************************************/
|
||||
|
||||
/* This takes no account of UTF as it always prints each individual code unit.
|
||||
The string is zero-terminated.
|
||||
|
||||
Arguments:
|
||||
f file to write to
|
||||
ptr point to the string
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
print_custring(FILE *f, PCRE2_SPTR ptr)
|
||||
{
|
||||
while (*ptr != '\0')
|
||||
{
|
||||
register uint32_t c = *ptr++;
|
||||
if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x{%x}", c);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find Unicode property name *
|
||||
*************************************************/
|
||||
|
||||
static const char *
|
||||
get_ucpname(unsigned int ptype, unsigned int pvalue)
|
||||
{
|
||||
int i;
|
||||
for (i = utt_size - 1; i >= 0; i--)
|
||||
{
|
||||
if (ptype == utt[i].type && pvalue == utt[i].value) break;
|
||||
}
|
||||
return (i >= 0)? utt_names + utt[i].name_offset : "??";
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print Unicode property value *
|
||||
*************************************************/
|
||||
|
||||
/* "Normal" properties can be printed from tables. The PT_CLIST property is a
|
||||
pseudo-property that contains a pointer to a list of case-equivalent
|
||||
characters.
|
||||
|
||||
Arguments:
|
||||
f file to write to
|
||||
code pointer in the compiled code
|
||||
before text to print before
|
||||
after text to print after
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
print_prop(FILE *f, PCRE2_SPTR code, const char *before, const char *after)
|
||||
{
|
||||
if (code[1] != PT_CLIST)
|
||||
{
|
||||
fprintf(f, "%s%s %s%s", before, OP_names[*code], get_ucpname(code[1],
|
||||
code[2]), after);
|
||||
}
|
||||
else
|
||||
{
|
||||
const char *not = (*code == OP_PROP)? "" : "not ";
|
||||
const uint32_t *p = ucd_caseless_sets + code[2];
|
||||
fprintf (f, "%s%sclist", before, not);
|
||||
while (*p < NOTACHAR) fprintf(f, " %04x", *p++);
|
||||
fprintf(f, "%s", after);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print compiled pattern *
|
||||
*************************************************/
|
||||
|
||||
/* The print_lengths flag controls whether offsets and lengths of items are
|
||||
printed. Lengths can be turned off from pcre2test so that automatic tests on
|
||||
bytecode can be written that do not depend on the value of LINK_SIZE.
|
||||
|
||||
Arguments:
|
||||
re a compiled pattern
|
||||
f the file to write to
|
||||
print_lengths show various lengths
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
pcre2_printint(pcre2_code *re, FILE *f, BOOL print_lengths)
|
||||
{
|
||||
PCRE2_SPTR codestart, nametable, code;
|
||||
uint32_t options = re->compile_options;
|
||||
size_t nesize = re->name_entry_size;
|
||||
BOOL utf = (options & PCRE2_UTF) != 0;
|
||||
|
||||
nametable = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code));
|
||||
code = codestart = nametable + re->name_count * re->name_entry_size;
|
||||
|
||||
for(;;)
|
||||
{
|
||||
PCRE2_SPTR ccode;
|
||||
uint32_t c;
|
||||
const char *flag = " ";
|
||||
unsigned int extra = 0;
|
||||
|
||||
if (print_lengths)
|
||||
fprintf(f, "%3d ", (int)(code - codestart));
|
||||
else
|
||||
fprintf(f, " ");
|
||||
|
||||
switch(*code)
|
||||
{
|
||||
/* ========================================================================== */
|
||||
/* These cases are never obeyed. This is a fudge that causes a compile-
|
||||
time error if the vectors OP_names or OP_lengths, which are indexed
|
||||
by opcode, are not the correct length. It seems to be the only way to do
|
||||
such a check at compile time, as the sizeof() operator does not work in
|
||||
the C preprocessor. */
|
||||
|
||||
case OP_TABLE_LENGTH:
|
||||
case OP_TABLE_LENGTH +
|
||||
((sizeof(OP_names)/sizeof(const char *) == OP_TABLE_LENGTH) &&
|
||||
(sizeof(OP_lengths) == OP_TABLE_LENGTH)):
|
||||
break;
|
||||
/* ========================================================================== */
|
||||
|
||||
case OP_END:
|
||||
fprintf(f, " %s\n", OP_names[*code]);
|
||||
fprintf(f, "------------------------------------------------------------------\n");
|
||||
return;
|
||||
|
||||
case OP_CHAR:
|
||||
fprintf(f, " ");
|
||||
do
|
||||
{
|
||||
code++;
|
||||
code += 1 + print_char(f, code, utf);
|
||||
}
|
||||
while (*code == OP_CHAR);
|
||||
fprintf(f, "\n");
|
||||
continue;
|
||||
|
||||
case OP_CHARI:
|
||||
fprintf(f, " /i ");
|
||||
do
|
||||
{
|
||||
code++;
|
||||
code += 1 + print_char(f, code, utf);
|
||||
}
|
||||
while (*code == OP_CHARI);
|
||||
fprintf(f, "\n");
|
||||
continue;
|
||||
|
||||
case OP_CBRA:
|
||||
case OP_CBRAPOS:
|
||||
case OP_SCBRA:
|
||||
case OP_SCBRAPOS:
|
||||
if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
|
||||
else fprintf(f, " ");
|
||||
fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE));
|
||||
break;
|
||||
|
||||
case OP_BRA:
|
||||
case OP_BRAPOS:
|
||||
case OP_SBRA:
|
||||
case OP_SBRAPOS:
|
||||
case OP_KETRMAX:
|
||||
case OP_KETRMIN:
|
||||
case OP_KETRPOS:
|
||||
case OP_ALT:
|
||||
case OP_KET:
|
||||
case OP_ASSERT:
|
||||
case OP_ASSERT_NOT:
|
||||
case OP_ASSERTBACK:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
case OP_ONCE:
|
||||
case OP_ONCE_NC:
|
||||
case OP_COND:
|
||||
case OP_SCOND:
|
||||
case OP_REVERSE:
|
||||
if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
|
||||
else fprintf(f, " ");
|
||||
fprintf(f, "%s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_CLOSE:
|
||||
fprintf(f, " %s %d", OP_names[*code], GET2(code, 1));
|
||||
break;
|
||||
|
||||
case OP_CREF:
|
||||
fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_DNCREF:
|
||||
{
|
||||
PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
|
||||
fprintf(f, " %s Cond ref <", flag);
|
||||
print_custring(f, entry);
|
||||
fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
|
||||
}
|
||||
break;
|
||||
|
||||
case OP_RREF:
|
||||
c = GET2(code, 1);
|
||||
if (c == RREF_ANY)
|
||||
fprintf(f, " Cond recurse any");
|
||||
else
|
||||
fprintf(f, " Cond recurse %d", c);
|
||||
break;
|
||||
|
||||
case OP_DNRREF:
|
||||
{
|
||||
PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
|
||||
fprintf(f, " %s Cond recurse <", flag);
|
||||
print_custring(f, entry);
|
||||
fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
|
||||
}
|
||||
break;
|
||||
|
||||
case OP_DEF:
|
||||
fprintf(f, " Cond def");
|
||||
break;
|
||||
|
||||
case OP_STARI:
|
||||
case OP_MINSTARI:
|
||||
case OP_POSSTARI:
|
||||
case OP_PLUSI:
|
||||
case OP_MINPLUSI:
|
||||
case OP_POSPLUSI:
|
||||
case OP_QUERYI:
|
||||
case OP_MINQUERYI:
|
||||
case OP_POSQUERYI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_STAR:
|
||||
case OP_MINSTAR:
|
||||
case OP_POSSTAR:
|
||||
case OP_PLUS:
|
||||
case OP_MINPLUS:
|
||||
case OP_POSPLUS:
|
||||
case OP_QUERY:
|
||||
case OP_MINQUERY:
|
||||
case OP_POSQUERY:
|
||||
case OP_TYPESTAR:
|
||||
case OP_TYPEMINSTAR:
|
||||
case OP_TYPEPOSSTAR:
|
||||
case OP_TYPEPLUS:
|
||||
case OP_TYPEMINPLUS:
|
||||
case OP_TYPEPOSPLUS:
|
||||
case OP_TYPEQUERY:
|
||||
case OP_TYPEMINQUERY:
|
||||
case OP_TYPEPOSQUERY:
|
||||
fprintf(f, " %s ", flag);
|
||||
|
||||
if (*code >= OP_TYPESTAR)
|
||||
{
|
||||
if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
|
||||
{
|
||||
print_prop(f, code + 1, "", " ");
|
||||
extra = 2;
|
||||
}
|
||||
else fprintf(f, "%s", OP_names[code[1]]);
|
||||
}
|
||||
else extra = print_char(f, code+1, utf);
|
||||
fprintf(f, "%s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_EXACTI:
|
||||
case OP_UPTOI:
|
||||
case OP_MINUPTOI:
|
||||
case OP_POSUPTOI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_EXACT:
|
||||
case OP_UPTO:
|
||||
case OP_MINUPTO:
|
||||
case OP_POSUPTO:
|
||||
fprintf(f, " %s ", flag);
|
||||
extra = print_char(f, code + 1 + IMM2_SIZE, utf);
|
||||
fprintf(f, "{");
|
||||
if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,");
|
||||
fprintf(f, "%d}", GET2(code,1));
|
||||
if (*code == OP_MINUPTO || *code == OP_MINUPTOI) fprintf(f, "?");
|
||||
else if (*code == OP_POSUPTO || *code == OP_POSUPTOI) fprintf(f, "+");
|
||||
break;
|
||||
|
||||
case OP_TYPEEXACT:
|
||||
case OP_TYPEUPTO:
|
||||
case OP_TYPEMINUPTO:
|
||||
case OP_TYPEPOSUPTO:
|
||||
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
|
||||
{
|
||||
print_prop(f, code + IMM2_SIZE + 1, " ", " ");
|
||||
extra = 2;
|
||||
}
|
||||
else fprintf(f, " %s", OP_names[code[1 + IMM2_SIZE]]);
|
||||
fprintf(f, "{");
|
||||
if (*code != OP_TYPEEXACT) fprintf(f, "0,");
|
||||
fprintf(f, "%d}", GET2(code,1));
|
||||
if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
|
||||
else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
|
||||
break;
|
||||
|
||||
case OP_NOTI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_NOT:
|
||||
fprintf(f, " %s [^", flag);
|
||||
extra = print_char(f, code + 1, utf);
|
||||
fprintf(f, "]");
|
||||
break;
|
||||
|
||||
case OP_NOTSTARI:
|
||||
case OP_NOTMINSTARI:
|
||||
case OP_NOTPOSSTARI:
|
||||
case OP_NOTPLUSI:
|
||||
case OP_NOTMINPLUSI:
|
||||
case OP_NOTPOSPLUSI:
|
||||
case OP_NOTQUERYI:
|
||||
case OP_NOTMINQUERYI:
|
||||
case OP_NOTPOSQUERYI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
|
||||
case OP_NOTSTAR:
|
||||
case OP_NOTMINSTAR:
|
||||
case OP_NOTPOSSTAR:
|
||||
case OP_NOTPLUS:
|
||||
case OP_NOTMINPLUS:
|
||||
case OP_NOTPOSPLUS:
|
||||
case OP_NOTQUERY:
|
||||
case OP_NOTMINQUERY:
|
||||
case OP_NOTPOSQUERY:
|
||||
fprintf(f, " %s [^", flag);
|
||||
extra = print_char(f, code + 1, utf);
|
||||
fprintf(f, "]%s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_NOTEXACTI:
|
||||
case OP_NOTUPTOI:
|
||||
case OP_NOTMINUPTOI:
|
||||
case OP_NOTPOSUPTOI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
|
||||
case OP_NOTEXACT:
|
||||
case OP_NOTUPTO:
|
||||
case OP_NOTMINUPTO:
|
||||
case OP_NOTPOSUPTO:
|
||||
fprintf(f, " %s [^", flag);
|
||||
extra = print_char(f, code + 1 + IMM2_SIZE, utf);
|
||||
fprintf(f, "]{");
|
||||
if (*code != OP_NOTEXACT && *code != OP_NOTEXACTI) fprintf(f, "0,");
|
||||
fprintf(f, "%d}", GET2(code,1));
|
||||
if (*code == OP_NOTMINUPTO || *code == OP_NOTMINUPTOI) fprintf(f, "?");
|
||||
else
|
||||
if (*code == OP_NOTPOSUPTO || *code == OP_NOTPOSUPTOI) fprintf(f, "+");
|
||||
break;
|
||||
|
||||
case OP_RECURSE:
|
||||
if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
|
||||
else fprintf(f, " ");
|
||||
fprintf(f, "%s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_REFI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_REF:
|
||||
fprintf(f, " %s \\%d", flag, GET2(code,1));
|
||||
ccode = code + OP_lengths[*code];
|
||||
goto CLASS_REF_REPEAT;
|
||||
|
||||
case OP_DNREFI:
|
||||
flag = "/i";
|
||||
/* Fall through */
|
||||
case OP_DNREF:
|
||||
{
|
||||
PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
|
||||
fprintf(f, " %s \\k<", flag);
|
||||
print_custring(f, entry);
|
||||
fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
|
||||
}
|
||||
ccode = code + OP_lengths[*code];
|
||||
goto CLASS_REF_REPEAT;
|
||||
|
||||
case OP_CALLOUT:
|
||||
fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2),
|
||||
GET(code, 2 + LINK_SIZE));
|
||||
break;
|
||||
|
||||
case OP_PROP:
|
||||
case OP_NOTPROP:
|
||||
print_prop(f, code, " ", "");
|
||||
break;
|
||||
|
||||
/* OP_XCLASS cannot occur in 8-bit, non-UTF mode. However, there's no harm
|
||||
in having this code always here, and it makes it less messy without all
|
||||
those #ifdefs. */
|
||||
|
||||
case OP_CLASS:
|
||||
case OP_NCLASS:
|
||||
case OP_XCLASS:
|
||||
{
|
||||
int i;
|
||||
unsigned int min, max;
|
||||
BOOL printmap;
|
||||
BOOL invertmap = FALSE;
|
||||
uint8_t *map;
|
||||
uint8_t inverted_map[32];
|
||||
|
||||
fprintf(f, " [");
|
||||
|
||||
if (*code == OP_XCLASS)
|
||||
{
|
||||
extra = GET(code, 1);
|
||||
ccode = code + LINK_SIZE + 1;
|
||||
printmap = (*ccode & XCL_MAP) != 0;
|
||||
if ((*ccode & XCL_NOT) != 0)
|
||||
{
|
||||
invertmap = (*ccode & XCL_HASPROP) == 0;
|
||||
fprintf(f, "^");
|
||||
}
|
||||
ccode++;
|
||||
}
|
||||
else
|
||||
{
|
||||
printmap = TRUE;
|
||||
ccode = code + 1;
|
||||
}
|
||||
|
||||
/* Print a bit map */
|
||||
|
||||
if (printmap)
|
||||
{
|
||||
map = (uint8_t *)ccode;
|
||||
if (invertmap)
|
||||
{
|
||||
for (i = 0; i < 32; i++) inverted_map[i] = ~map[i];
|
||||
map = inverted_map;
|
||||
}
|
||||
|
||||
for (i = 0; i < 256; i++)
|
||||
{
|
||||
if ((map[i/8] & (1 << (i&7))) != 0)
|
||||
{
|
||||
int j;
|
||||
for (j = i+1; j < 256; j++)
|
||||
if ((map[j/8] & (1 << (j&7))) == 0) break;
|
||||
if (i == '-' || i == ']') fprintf(f, "\\");
|
||||
if (PRINTABLE(i)) fprintf(f, "%c", i);
|
||||
else fprintf(f, "\\x%02x", i);
|
||||
if (--j > i)
|
||||
{
|
||||
if (j != i + 1) fprintf(f, "-");
|
||||
if (j == '-' || j == ']') fprintf(f, "\\");
|
||||
if (PRINTABLE(j)) fprintf(f, "%c", j);
|
||||
else fprintf(f, "\\x%02x", j);
|
||||
}
|
||||
i = j;
|
||||
}
|
||||
}
|
||||
ccode += 32 / sizeof(PCRE2_UCHAR);
|
||||
}
|
||||
|
||||
/* For an XCLASS there is always some additional data */
|
||||
|
||||
if (*code == OP_XCLASS)
|
||||
{
|
||||
PCRE2_UCHAR ch;
|
||||
while ((ch = *ccode++) != XCL_END)
|
||||
{
|
||||
BOOL not = FALSE;
|
||||
const char *notch = "";
|
||||
|
||||
switch(ch)
|
||||
{
|
||||
case XCL_NOTPROP:
|
||||
not = TRUE;
|
||||
notch = "^";
|
||||
/* Fall through */
|
||||
|
||||
case XCL_PROP:
|
||||
{
|
||||
unsigned int ptype = *ccode++;
|
||||
unsigned int pvalue = *ccode++;
|
||||
|
||||
switch(ptype)
|
||||
{
|
||||
case PT_PXGRAPH:
|
||||
fprintf(f, "[:%sgraph:]", notch);
|
||||
break;
|
||||
|
||||
case PT_PXPRINT:
|
||||
fprintf(f, "[:%sprint:]", notch);
|
||||
break;
|
||||
|
||||
case PT_PXPUNCT:
|
||||
fprintf(f, "[:%spunct:]", notch);
|
||||
break;
|
||||
|
||||
default:
|
||||
fprintf(f, "\\%c{%s}", (not? 'P':'p'),
|
||||
get_ucpname(ptype, pvalue));
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
ccode += 1 + print_char(f, ccode, utf);
|
||||
if (ch == XCL_RANGE)
|
||||
{
|
||||
fprintf(f, "-");
|
||||
ccode += 1 + print_char(f, ccode, utf);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Indicate a non-UTF class which was created by negation */
|
||||
|
||||
fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
|
||||
|
||||
/* Handle repeats after a class or a back reference */
|
||||
|
||||
CLASS_REF_REPEAT:
|
||||
switch(*ccode)
|
||||
{
|
||||
case OP_CRSTAR:
|
||||
case OP_CRMINSTAR:
|
||||
case OP_CRPLUS:
|
||||
case OP_CRMINPLUS:
|
||||
case OP_CRQUERY:
|
||||
case OP_CRMINQUERY:
|
||||
case OP_CRPOSSTAR:
|
||||
case OP_CRPOSPLUS:
|
||||
case OP_CRPOSQUERY:
|
||||
fprintf(f, "%s", OP_names[*ccode]);
|
||||
extra += OP_lengths[*ccode];
|
||||
break;
|
||||
|
||||
case OP_CRRANGE:
|
||||
case OP_CRMINRANGE:
|
||||
case OP_CRPOSRANGE:
|
||||
min = GET2(ccode,1);
|
||||
max = GET2(ccode,1 + IMM2_SIZE);
|
||||
if (max == 0) fprintf(f, "{%u,}", min);
|
||||
else fprintf(f, "{%u,%u}", min, max);
|
||||
if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
|
||||
else if (*ccode == OP_CRPOSRANGE) fprintf(f, "+");
|
||||
extra += OP_lengths[*ccode];
|
||||
break;
|
||||
|
||||
/* Do nothing if it's not a repeat; this code stops picky compilers
|
||||
warning about the lack of a default code path. */
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case OP_MARK:
|
||||
case OP_PRUNE_ARG:
|
||||
case OP_SKIP_ARG:
|
||||
case OP_THEN_ARG:
|
||||
fprintf(f, " %s ", OP_names[*code]);
|
||||
print_custring(f, code + 2);
|
||||
extra += code[1];
|
||||
break;
|
||||
|
||||
case OP_THEN:
|
||||
fprintf(f, " %s", OP_names[*code]);
|
||||
break;
|
||||
|
||||
case OP_CIRCM:
|
||||
case OP_DOLLM:
|
||||
flag = "/m";
|
||||
/* Fall through */
|
||||
|
||||
/* Anything else is just an item with no data, but possibly a flag. */
|
||||
|
||||
default:
|
||||
fprintf(f, " %s %s", flag, OP_names[*code]);
|
||||
break;
|
||||
}
|
||||
|
||||
code += OP_lengths[*code] + extra;
|
||||
fprintf(f, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
/* End of pcre2_printint.c */
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2014 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains internal functions for comparing and finding the length
|
||||
of strings. These are used instead of strcmp() etc because the standard
|
||||
functions work only on 8-bit data. */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
/* FIXME: this module is incomplete */
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Compare two strings *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Arguments:
|
||||
str1 first string
|
||||
str2 second string
|
||||
|
||||
Returns: 0, 1, or -1
|
||||
*/
|
||||
|
||||
int
|
||||
PRIV(strcmp)(PCRE2_SPTR str1, PCRE2_SPTR str2)
|
||||
{
|
||||
PCRE2_UCHAR c1, c2;
|
||||
while (*str1 != '\0' || *str2 != '\0')
|
||||
{
|
||||
c1 = *str1++;
|
||||
c2 = *str2++;
|
||||
if (c1 != c2) return ((c1 > c2) << 1) - 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* End of pcre2_string_utils.c */
|
|
@ -46,8 +46,6 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
/* FIXME: most of these are currently placeholder functions */
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Copy named captured string to given buffer *
|
||||
|
@ -75,7 +73,16 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
|||
pcre2_substring_copy_byname(pcre2_match_data *match_data, PCRE2_SPTR stringname,
|
||||
PCRE2_UCHAR *buffer, size_t size)
|
||||
{
|
||||
match_data=match_data;stringname=stringname;buffer=buffer;size=size;
|
||||
PCRE2_SPTR first, last, entry;
|
||||
int entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
|
||||
&first, &last);
|
||||
if (entrysize <= 0) return entrysize;
|
||||
for (entry = first; entry <= last; entry += entrysize)
|
||||
{
|
||||
uint16_t n = GET2(entry, 0);
|
||||
if (n < match_data->oveccount && match_data->ovector[n*2] != PCRE2_UNSET)
|
||||
return pcre2_substring_copy_bynumber(match_data, n, buffer, size);
|
||||
}
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
}
|
||||
|
||||
|
@ -106,55 +113,17 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
|||
pcre2_substring_copy_bynumber(pcre2_match_data *match_data, int stringnumber,
|
||||
PCRE2_UCHAR *buffer, size_t size)
|
||||
{
|
||||
match_data=match_data;stringnumber=stringnumber;buffer=buffer;size=size;
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Free memory obtained by get_substring *
|
||||
*************************************************/
|
||||
|
||||
/* This function exists for the benefit of people calling PCRE from non-C
|
||||
programs that can call its functions, but not free() itself.
|
||||
|
||||
Arguments:
|
||||
context points to a PCRE2 context
|
||||
string the result of a previous pcre2_get_substring()
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_free(PCRE2_UCHAR *string)
|
||||
{
|
||||
string=string;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Free memory obtained by get_substring_list *
|
||||
*************************************************/
|
||||
|
||||
/* This function exists for the benefit of people calling PCRE from non-C
|
||||
programs that can call its functions, but not free() itself.
|
||||
|
||||
Arguments:
|
||||
context points to a PCRE2 context
|
||||
list the result of a previous pcre2_get_substring_list()
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_list_free(PCRE2_SPTR *list)
|
||||
{
|
||||
list=list;
|
||||
return;
|
||||
size_t left, right;
|
||||
size_t p = 0;
|
||||
PCRE2_SPTR subject = match_data->subject;
|
||||
if (stringnumber >= match_data->oveccount ||
|
||||
(left = match_data->ovector[stringnumber*2]) == PCRE2_UNSET)
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
right = match_data->ovector[stringnumber*2+1];
|
||||
if (right - left + 1 > size) return PCRE2_ERROR_NOMEMORY;
|
||||
while (left < right) buffer[p++] = subject[left++];
|
||||
buffer[p] = 0;
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
|
@ -168,10 +137,9 @@ new memory. If the regex permits duplicate names, the first substring that is
|
|||
set is chosen.
|
||||
|
||||
Arguments:
|
||||
context points to a PCRE2 context
|
||||
match_data pointer to match_data
|
||||
stringname the name of the required substring
|
||||
stringptr where to put the pointer
|
||||
stringptr where to put the pointer to the new memory
|
||||
|
||||
Returns: if successful:
|
||||
the length of the copied string, not including the zero
|
||||
|
@ -185,7 +153,16 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
|||
pcre2_substring_get_byname(pcre2_match_data *match_data,
|
||||
PCRE2_SPTR stringname, PCRE2_UCHAR **stringptr)
|
||||
{
|
||||
match_data=match_data;stringname=stringname;stringptr=stringptr;
|
||||
PCRE2_SPTR first, last, entry;
|
||||
int entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
|
||||
&first, &last);
|
||||
if (entrysize <= 0) return entrysize;
|
||||
for (entry = first; entry <= last; entry += entrysize)
|
||||
{
|
||||
uint16_t n = GET2(entry, 0);
|
||||
if (n < match_data->oveccount && match_data->ovector[n*2] != PCRE2_UNSET)
|
||||
return pcre2_substring_get_bynumber(match_data, n, stringptr);
|
||||
}
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
}
|
||||
|
||||
|
@ -199,10 +176,9 @@ return PCRE2_ERROR_NOSUBSTRING;
|
|||
memory.
|
||||
|
||||
Arguments:
|
||||
context points to a PCRE2 context
|
||||
match_data points to match data
|
||||
stringnumber the number of the required substring
|
||||
stringptr where to put a pointer to the substring
|
||||
stringptr where to put a pointer to the new memory
|
||||
|
||||
Returns: if successful:
|
||||
the length of the string, not including the zero that
|
||||
|
@ -216,9 +192,44 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
|||
pcre2_substring_get_bynumber(pcre2_match_data *match_data, int stringnumber,
|
||||
PCRE2_UCHAR **stringptr)
|
||||
{
|
||||
match_data=match_data;stringnumber=stringnumber;
|
||||
stringptr=stringptr;
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
size_t left, right;
|
||||
size_t p = 0;
|
||||
void *block;
|
||||
PCRE2_UCHAR *yield;
|
||||
|
||||
PCRE2_SPTR subject = match_data->subject;
|
||||
if (stringnumber >= match_data->oveccount ||
|
||||
(left = match_data->ovector[stringnumber*2]) == PCRE2_UNSET)
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
right = match_data->ovector[stringnumber*2+1];
|
||||
|
||||
block = PRIV(memctl_malloc)(sizeof(pcre2_memctl) +
|
||||
(right-left+1)*PCRE2_CODE_UNIT_WIDTH, 0, &(match_data->memctl));
|
||||
if (block == NULL) return PCRE2_ERROR_NOMEMORY;
|
||||
|
||||
yield = (PCRE2_UCHAR *)((char *)block + sizeof(pcre2_memctl));
|
||||
while (left < right) yield[p++] = subject[left++];
|
||||
yield[p] = 0;
|
||||
*stringptr = yield;
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Free memory obtained by get_substring *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Argument: the result of a previous pcre2_substring_get_byxxx()
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_free(PCRE2_UCHAR *string)
|
||||
{
|
||||
pcre2_memctl *memctl = (pcre2_memctl *)((char *)string - sizeof(pcre2_memctl));
|
||||
memctl->free(memctl, memctl->memory_data);
|
||||
}
|
||||
|
||||
|
||||
|
@ -242,7 +253,16 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
|||
pcre2_substring_length_byname(pcre2_match_data *match_data,
|
||||
PCRE2_SPTR stringname)
|
||||
{
|
||||
match_data=match_data;stringname=stringname;
|
||||
PCRE2_SPTR first, last, entry;
|
||||
int entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
|
||||
&first, &last);
|
||||
if (entrysize <= 0) return entrysize;
|
||||
for (entry = first; entry <= last; entry += entrysize)
|
||||
{
|
||||
uint16_t n = GET2(entry, 0);
|
||||
if (n < match_data->oveccount && match_data->ovector[n*2] != PCRE2_UNSET)
|
||||
return pcre2_substring_length_bynumber(match_data, n);
|
||||
}
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
}
|
||||
|
||||
|
@ -266,8 +286,11 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
|||
pcre2_substring_length_bynumber(pcre2_match_data *match_data,
|
||||
int stringnumber)
|
||||
{
|
||||
match_data=match_data;stringnumber=stringnumber;
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
if (stringnumber >= match_data->oveccount ||
|
||||
match_data->ovector[stringnumber*2] == PCRE2_UNSET)
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
return match_data->ovector[stringnumber*2 + 1] -
|
||||
match_data->ovector[stringnumber*2];
|
||||
}
|
||||
|
||||
|
||||
|
@ -278,48 +301,88 @@ return PCRE2_ERROR_NOSUBSTRING;
|
|||
|
||||
/* This function gets one chunk of memory and builds a list of pointers and all
|
||||
the captured substrings in it. A NULL pointer is put on the end of the list.
|
||||
The substrings are zero-terminated, but also, if the final argument is
|
||||
non-NULL, a list of lengths is returned. This allows binary data to be
|
||||
handled.
|
||||
|
||||
Arguments:
|
||||
context points to a PCRE2 context
|
||||
match_data points to the match data
|
||||
listptr set to point to the list of pointers
|
||||
lengthsptr set to point to the list of lengths (may be NULL)
|
||||
|
||||
Returns: if successful: 0
|
||||
if not successful, a negative error code:
|
||||
PCRE2_ERROR_NOMEMORY: failed to get memory
|
||||
PCRE2_ERROR_NOMEMORY: failed to get memory,
|
||||
or a match failure code
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr,
|
||||
size_t **lengthsptr)
|
||||
{
|
||||
match_data=match_data;listptr=listptr;lengthsptr=lengthsptr;
|
||||
return PCRE2_ERROR_NOMEMORY;
|
||||
int i, count, count2;
|
||||
size_t size;
|
||||
size_t *lensp, *ovector;
|
||||
pcre2_memctl *memp;
|
||||
PCRE2_UCHAR **listp;
|
||||
PCRE2_UCHAR *sp;
|
||||
|
||||
if ((count = match_data->rc) < 0) return count;
|
||||
|
||||
count2 = 2*count;
|
||||
ovector = match_data->ovector;
|
||||
size = sizeof(pcre2_memctl) + sizeof(PCRE2_UCHAR *); /* For final NULL */
|
||||
if (lengthsptr != NULL) size += sizeof(size_t) * count; /* For lengths */
|
||||
|
||||
for (i = 0; i < count2; i += 2)
|
||||
size += sizeof(PCRE2_UCHAR *) + CU2BYTES(ovector[i+1] - ovector[i] + 1);
|
||||
memp = PRIV(memctl_malloc)(size, 0, &(match_data->memctl));
|
||||
if (memp == NULL) return PCRE2_ERROR_NOMEMORY;
|
||||
|
||||
*listptr = listp = (PCRE2_UCHAR **)((char *)memp + sizeof(pcre2_memctl));
|
||||
lensp = (size_t *)((char *)listp + sizeof(PCRE2_UCHAR *) * (count + 1));
|
||||
|
||||
if (lengthsptr == NULL)
|
||||
{
|
||||
sp = (PCRE2_UCHAR *)lensp;
|
||||
lensp = NULL;
|
||||
}
|
||||
else
|
||||
{
|
||||
*lengthsptr = lensp;
|
||||
sp = (PCRE2_UCHAR *)((char *)lensp + sizeof(size_t) * count);
|
||||
}
|
||||
|
||||
for (i = 0; i < count2; i += 2)
|
||||
{
|
||||
size = ovector[i+1] - ovector[i];
|
||||
memcpy(sp, match_data->subject + ovector[i], CU2BYTES(size));
|
||||
*listp++ = sp;
|
||||
if (lensp != NULL) *lensp++ = size;
|
||||
sp += size;
|
||||
*sp++ = 0;
|
||||
}
|
||||
|
||||
*listp = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find number for named string *
|
||||
* Free memory obtained by substring_list_get *
|
||||
*************************************************/
|
||||
|
||||
/* This function is used by the local get_first_set() function, as well
|
||||
as being generally available. It assumes that names are unique.
|
||||
|
||||
Arguments:
|
||||
code the compiled regex
|
||||
stringname the name whose number is required
|
||||
|
||||
Returns: the number of the named parentheses, or a negative number
|
||||
(PCRE2_ERROR_NOSUBSTRING) if not found
|
||||
/*
|
||||
Argument: the result of a previous pcre2_substring_list_get()
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_number_from_name(const pcre2_code *code,
|
||||
PCRE2_SPTR stringname)
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_list_free(PCRE2_SPTR *list)
|
||||
{
|
||||
code=code;stringname=stringname;
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
pcre2_memctl *memctl = (pcre2_memctl *)((char *)list - sizeof(pcre2_memctl));
|
||||
memctl->free(memctl, memctl->memory_data);
|
||||
}
|
||||
|
||||
|
||||
|
@ -328,8 +391,10 @@ return PCRE2_ERROR_NOSUBSTRING;
|
|||
* Find (multiple) entries for named string *
|
||||
*************************************************/
|
||||
|
||||
/* This is used by the local get_first_set() function, as well as being
|
||||
generally available. It is used when duplicated names are permitted.
|
||||
/* This function scans the nametable for a given name, using binary chop. It
|
||||
returns either two pointers to the entries in the table, or, if no pointers are
|
||||
given, the number of a group with the given name. If duplicate names are
|
||||
permitted, this may not be unique.
|
||||
|
||||
Arguments:
|
||||
code the compiled regex
|
||||
|
@ -337,17 +402,73 @@ Arguments:
|
|||
firstptr where to put the pointer to the first entry
|
||||
lastptr where to put the pointer to the last entry
|
||||
|
||||
Returns: the length of each entry, or a negative number
|
||||
Returns: if firstptr and lastptr are NULL, a group number;
|
||||
otherwise, the length of each entry, or a negative number
|
||||
(PCRE2_ERROR_NOSUBSTRING) if not found
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR stringname,
|
||||
PCRE2_UCHAR **firstptr, PCRE2_UCHAR **lastptr)
|
||||
PCRE2_SPTR *firstptr, PCRE2_SPTR *lastptr)
|
||||
{
|
||||
code=code;stringname=stringname;firstptr=firstptr;lastptr=lastptr;
|
||||
uint16_t bot = 0;
|
||||
uint16_t top = code->name_count;
|
||||
uint16_t entrysize = code->name_entry_size;
|
||||
PCRE2_SPTR nametable = (PCRE2_SPTR)((char *)code + sizeof(pcre2_real_code));
|
||||
|
||||
while (top > bot)
|
||||
{
|
||||
uint16_t mid = (top + bot) / 2;
|
||||
PCRE2_SPTR entry = nametable + entrysize*mid;
|
||||
int c = PRIV(strcmp)(stringname, entry + IMM2_SIZE);
|
||||
if (c == 0)
|
||||
{
|
||||
PCRE2_SPTR first, last, lastentry;
|
||||
if (firstptr == NULL) return GET2(entry, 0);
|
||||
lastentry = nametable + entrysize * (code->name_count - 1);
|
||||
first = last = entry;
|
||||
while (first > nametable)
|
||||
{
|
||||
if (PRIV(strcmp)(stringname, (first - entrysize + IMM2_SIZE)) != 0) break;
|
||||
first -= entrysize;
|
||||
}
|
||||
while (last < lastentry)
|
||||
{
|
||||
if (PRIV(strcmp)(stringname, (last + entrysize + IMM2_SIZE)) != 0) break;
|
||||
last += entrysize;
|
||||
}
|
||||
*firstptr = first;
|
||||
*lastptr = last;
|
||||
return entrysize;
|
||||
}
|
||||
if (c > 0) bot = mid + 1; else top = mid;
|
||||
}
|
||||
|
||||
return PCRE2_ERROR_NOSUBSTRING;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find number for named string *
|
||||
*************************************************/
|
||||
|
||||
/* This function is a convenience wrapper for pcre2_substring_nametable_scan()
|
||||
when it is known that names are unique. If there are duplicate names, it is not
|
||||
defined which number is returned.
|
||||
|
||||
Arguments:
|
||||
code the compiled regex
|
||||
stringname the name whose number is required
|
||||
|
||||
Returns: the number of the named parenthesis, or a negative number
|
||||
(PCRE2_ERROR_NOSUBSTRING) if not found
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_number_from_name(const pcre2_code *code,
|
||||
PCRE2_SPTR stringname)
|
||||
{
|
||||
return pcre2_substring_nametable_scan(code, stringname, NULL, NULL);
|
||||
}
|
||||
|
||||
/* End of pcre2_substring.c */
|
||||
|
|
|
@ -38,34 +38,33 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef PCRE2_INCLUDED
|
||||
|
||||
/* This module contains some fixed tables that are used by more than one of the
|
||||
PCRE code modules. The tables are also #included by the pcre2test program,
|
||||
which uses macros to change their names from _pcre2_xxx to xxxx, thereby
|
||||
avoiding name clashes with the library. */
|
||||
avoiding name clashes with the library. In this case, PCRE2_INCLUDED is
|
||||
defined. */
|
||||
|
||||
#ifndef PCRE2_INCLUDED /* We're compiling the library */
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
#endif /* PCRE2_INCLUDED */
|
||||
|
||||
|
||||
#ifdef FIXME
|
||||
|
||||
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
|
||||
the definition is next to the definition of the opcodes in pcre2_internal.h. */
|
||||
the definition is next to the definition of the opcodes in pcre2_internal.h.
|
||||
This is mode-dependent, so is skipped when this file is included by pcre2test. */
|
||||
|
||||
#ifndef PCRE2_INCLUDED
|
||||
const uint8_t PRIV(OP_lengths)[] = { OP_LENGTHS };
|
||||
#endif
|
||||
|
||||
/* Tables of horizontal and vertical whitespace characters, suitable for
|
||||
adding to classes. */
|
||||
|
||||
const uint32_t PRIV(hspace_list)[] = { HSPACE_LIST };
|
||||
const uint32_t PRIV(vspace_list)[] = { VSPACE_LIST };
|
||||
#endif /* FIXME */
|
||||
|
||||
|
||||
/*************************************************
|
||||
|
@ -103,8 +102,6 @@ const uint8_t PRIV(utf8_table4)[] = {
|
|||
#endif /* (SUPPORT_UTF && COMPILE_PCRE8) || (PCRE2_INCLUDED && SUPPORT_PCRE[16|32])*/
|
||||
|
||||
|
||||
#ifdef FIXME
|
||||
|
||||
#ifdef SUPPORT_UTF
|
||||
|
||||
/* Table to translate from particular type value to the general value. */
|
||||
|
@ -122,9 +119,9 @@ const uint32_t PRIV(ucp_gentype)[] = {
|
|||
|
||||
/* This table encodes the rules for finding the end of an extended grapheme
|
||||
cluster. Every code point has a grapheme break property which is one of the
|
||||
ucp_gbXX values defined in ucp.h. The 2-dimensional table is indexed by the
|
||||
properties of two adjacent code points. The left property selects a word from
|
||||
the table, and the right property selects a bit from that word like this:
|
||||
ucp_gbXX values defined in pcre2_ucp.h. The 2-dimensional table is indexed by
|
||||
the properties of two adjacent code points. The left property selects a word
|
||||
from the table, and the right property selects a bit from that word like this:
|
||||
|
||||
ucp_gbtable[left-property] & (1 << right-property)
|
||||
|
||||
|
@ -660,6 +657,4 @@ const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
|
|||
|
||||
#endif /* SUPPORT_UTF */
|
||||
|
||||
#endif /* FIXME */
|
||||
|
||||
/* End of pcre2_tables.c */
|
||||
|
|
|
@ -2,28 +2,29 @@
|
|||
Do not modify it by hand. Instead modify the script and run it
|
||||
to regenerate this code.
|
||||
|
||||
As well as being part of the PCRE library, this module is #included
|
||||
by the pcretest program, which redefines the PRIV macro to change
|
||||
table names from _pcre_xxx to xxxx, thereby avoiding name clashes
|
||||
As well as being part of the PCRE2 library, this module is #included
|
||||
by the pcre2test program, which redefines the PRIV macro to change
|
||||
table names from _pcre2_xxx to xxxx, thereby avoiding name clashes
|
||||
with the library. At present, just one of these tables is actually
|
||||
needed. */
|
||||
|
||||
#ifndef PCRE_INCLUDED
|
||||
#ifndef PCRE2_INCLUDED
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre_internal.h"
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
#endif /* PCRE_INCLUDED */
|
||||
#endif /* PCRE2_INCLUDED */
|
||||
|
||||
/* Unicode character database. */
|
||||
/* This file was autogenerated by the MultiStage2.py script. */
|
||||
/* Total size: 65688 bytes, block size: 128. */
|
||||
|
||||
/* The tables herein are needed only when UCP support is built
|
||||
into PCRE. This module should not be referenced otherwise, so
|
||||
/* The tables herein are needed only when UCP support is built,
|
||||
and in PCRE2 that happens automatically with UTF support.
|
||||
This module should not be referenced otherwise, so
|
||||
it should not matter whether it is compiled or not. However
|
||||
a comment was received about space saving - maybe the guy linked
|
||||
all the modules rather than using a library - so we include a
|
||||
|
@ -31,28 +32,28 @@ condition to cut out the tables when not needed. But don't leave
|
|||
a totally empty module because some compilers barf at that.
|
||||
Instead, just supply small dummy tables. */
|
||||
|
||||
#ifndef SUPPORT_UCP
|
||||
#ifndef SUPPORT_UTF
|
||||
const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};
|
||||
const pcre_uint8 PRIV(ucd_stage1)[] = {0};
|
||||
const pcre_uint16 PRIV(ucd_stage2)[] = {0};
|
||||
const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};
|
||||
const uint8_t PRIV(ucd_stage1)[] = {0};
|
||||
const uint16_t PRIV(ucd_stage2)[] = {0};
|
||||
const uint32_t PRIV(ucd_caseless_sets)[] = {0};
|
||||
#else
|
||||
|
||||
/* When recompiling tables with a new Unicode version, please check the
|
||||
types in this structure definition from pcre_internal.h (the actual
|
||||
types in this structure definition from pcre2_internal.h (the actual
|
||||
field names will be different):
|
||||
|
||||
typedef struct {
|
||||
pcre_uint8 property_0;
|
||||
pcre_uint8 property_1;
|
||||
pcre_uint8 property_2;
|
||||
pcre_uint8 property_3;
|
||||
uint8_t property_0;
|
||||
uint8_t property_1;
|
||||
uint8_t property_2;
|
||||
uint8_t property_3;
|
||||
int32_t property_4;
|
||||
} ucd_record;
|
||||
*/
|
||||
|
||||
|
||||
const pcre_uint32 PRIV(ucd_caseless_sets)[] = {
|
||||
const uint32_t PRIV(ucd_caseless_sets)[] = {
|
||||
NOTACHAR,
|
||||
0x0053, 0x0073, 0x017f, NOTACHAR,
|
||||
0x01c4, 0x01c5, 0x01c6, NOTACHAR,
|
||||
|
@ -75,9 +76,9 @@ const pcre_uint32 PRIV(ucd_caseless_sets)[] = {
|
|||
0x00c5, 0x00e5, 0x212b, NOTACHAR,
|
||||
};
|
||||
|
||||
/* When #included in pcretest, we don't need this large table. */
|
||||
/* When #included in pcre2test, we don't need this large table. */
|
||||
|
||||
#ifndef PCRE_INCLUDED
|
||||
#ifndef PCRE2_INCLUDED
|
||||
|
||||
const ucd_record PRIV(ucd_records)[] = { /* 5016 bytes, record size 8 */
|
||||
{ 9, 0, 2, 0, 0, }, /* 0 */
|
||||
|
@ -709,7 +710,7 @@ const ucd_record PRIV(ucd_records)[] = { /* 5016 bytes, record size 8 */
|
|||
{ 26, 26, 12, 0, 0, }, /* 626 */
|
||||
};
|
||||
|
||||
const pcre_uint8 PRIV(ucd_stage1)[] = { /* 8704 bytes */
|
||||
const uint8_t PRIV(ucd_stage1)[] = { /* 8704 bytes */
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* U+0000 */
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, /* U+0800 */
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 41, 41, 42, 43, 44, 45, /* U+1000 */
|
||||
|
@ -1256,7 +1257,7 @@ const pcre_uint8 PRIV(ucd_stage1)[] = { /* 8704 bytes */
|
|||
123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,202, /* U+10F800 */
|
||||
};
|
||||
|
||||
const pcre_uint16 PRIV(ucd_stage2)[] = { /* 51968 bytes, block = 128 */
|
||||
const uint16_t PRIV(ucd_stage2)[] = { /* 51968 bytes, block = 128 */
|
||||
/* block 0 */
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
@ -3290,8 +3291,8 @@ const pcre_uint16 PRIV(ucd_stage2)[] = { /* 51968 bytes, block = 128 */
|
|||
};
|
||||
|
||||
#if UCD_BLOCK_SIZE != 128
|
||||
#error Please correct UCD_BLOCK_SIZE in pcre_internal.h
|
||||
#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
|
||||
#endif
|
||||
#endif /* SUPPORT_UCP */
|
||||
#endif /* SUPPORT_UTF */
|
||||
|
||||
#endif /* PCRE_INCLUDED */
|
||||
#endif /* PCRE2_INCLUDED */
|
||||
|
|
|
@ -0,0 +1,237 @@
|
|||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2014 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
#ifndef _PCRE2_UCP_H
|
||||
#define _PCRE2_UCP_H
|
||||
|
||||
/* This file contains definitions of the property values that are returned by
|
||||
the UCD access macros. New values that are added for new releases of Unicode
|
||||
should always be at the end of each enum, for backwards compatibility.
|
||||
|
||||
IMPORTANT: Note also that the specific numeric values of the enums have to be
|
||||
the same as the values that are generated by the maint/MultiStage2.py script,
|
||||
where the equivalent property descriptive names are listed in vectors.
|
||||
|
||||
ALSO: The specific values of the first two enums are assumed for the table
|
||||
called catposstab in pcre2_compile.c. */
|
||||
|
||||
/* These are the general character categories. */
|
||||
|
||||
enum {
|
||||
ucp_C, /* Other */
|
||||
ucp_L, /* Letter */
|
||||
ucp_M, /* Mark */
|
||||
ucp_N, /* Number */
|
||||
ucp_P, /* Punctuation */
|
||||
ucp_S, /* Symbol */
|
||||
ucp_Z /* Separator */
|
||||
};
|
||||
|
||||
/* These are the particular character categories. */
|
||||
|
||||
enum {
|
||||
ucp_Cc, /* Control */
|
||||
ucp_Cf, /* Format */
|
||||
ucp_Cn, /* Unassigned */
|
||||
ucp_Co, /* Private use */
|
||||
ucp_Cs, /* Surrogate */
|
||||
ucp_Ll, /* Lower case letter */
|
||||
ucp_Lm, /* Modifier letter */
|
||||
ucp_Lo, /* Other letter */
|
||||
ucp_Lt, /* Title case letter */
|
||||
ucp_Lu, /* Upper case letter */
|
||||
ucp_Mc, /* Spacing mark */
|
||||
ucp_Me, /* Enclosing mark */
|
||||
ucp_Mn, /* Non-spacing mark */
|
||||
ucp_Nd, /* Decimal number */
|
||||
ucp_Nl, /* Letter number */
|
||||
ucp_No, /* Other number */
|
||||
ucp_Pc, /* Connector punctuation */
|
||||
ucp_Pd, /* Dash punctuation */
|
||||
ucp_Pe, /* Close punctuation */
|
||||
ucp_Pf, /* Final punctuation */
|
||||
ucp_Pi, /* Initial punctuation */
|
||||
ucp_Po, /* Other punctuation */
|
||||
ucp_Ps, /* Open punctuation */
|
||||
ucp_Sc, /* Currency symbol */
|
||||
ucp_Sk, /* Modifier symbol */
|
||||
ucp_Sm, /* Mathematical symbol */
|
||||
ucp_So, /* Other symbol */
|
||||
ucp_Zl, /* Line separator */
|
||||
ucp_Zp, /* Paragraph separator */
|
||||
ucp_Zs /* Space separator */
|
||||
};
|
||||
|
||||
/* These are grapheme break properties. Note that the code for processing them
|
||||
assumes that the values are less than 16. If more values are added that take
|
||||
the number to 16 or more, the code will have to be rewritten. */
|
||||
|
||||
enum {
|
||||
ucp_gbCR, /* 0 */
|
||||
ucp_gbLF, /* 1 */
|
||||
ucp_gbControl, /* 2 */
|
||||
ucp_gbExtend, /* 3 */
|
||||
ucp_gbPrepend, /* 4 */
|
||||
ucp_gbSpacingMark, /* 5 */
|
||||
ucp_gbL, /* 6 Hangul syllable type L */
|
||||
ucp_gbV, /* 7 Hangul syllable type V */
|
||||
ucp_gbT, /* 8 Hangul syllable type T */
|
||||
ucp_gbLV, /* 9 Hangul syllable type LV */
|
||||
ucp_gbLVT, /* 10 Hangul syllable type LVT */
|
||||
ucp_gbRegionalIndicator, /* 11 */
|
||||
ucp_gbOther /* 12 */
|
||||
};
|
||||
|
||||
/* These are the script identifications. */
|
||||
|
||||
enum {
|
||||
ucp_Arabic,
|
||||
ucp_Armenian,
|
||||
ucp_Bengali,
|
||||
ucp_Bopomofo,
|
||||
ucp_Braille,
|
||||
ucp_Buginese,
|
||||
ucp_Buhid,
|
||||
ucp_Canadian_Aboriginal,
|
||||
ucp_Cherokee,
|
||||
ucp_Common,
|
||||
ucp_Coptic,
|
||||
ucp_Cypriot,
|
||||
ucp_Cyrillic,
|
||||
ucp_Deseret,
|
||||
ucp_Devanagari,
|
||||
ucp_Ethiopic,
|
||||
ucp_Georgian,
|
||||
ucp_Glagolitic,
|
||||
ucp_Gothic,
|
||||
ucp_Greek,
|
||||
ucp_Gujarati,
|
||||
ucp_Gurmukhi,
|
||||
ucp_Han,
|
||||
ucp_Hangul,
|
||||
ucp_Hanunoo,
|
||||
ucp_Hebrew,
|
||||
ucp_Hiragana,
|
||||
ucp_Inherited,
|
||||
ucp_Kannada,
|
||||
ucp_Katakana,
|
||||
ucp_Kharoshthi,
|
||||
ucp_Khmer,
|
||||
ucp_Lao,
|
||||
ucp_Latin,
|
||||
ucp_Limbu,
|
||||
ucp_Linear_B,
|
||||
ucp_Malayalam,
|
||||
ucp_Mongolian,
|
||||
ucp_Myanmar,
|
||||
ucp_New_Tai_Lue,
|
||||
ucp_Ogham,
|
||||
ucp_Old_Italic,
|
||||
ucp_Old_Persian,
|
||||
ucp_Oriya,
|
||||
ucp_Osmanya,
|
||||
ucp_Runic,
|
||||
ucp_Shavian,
|
||||
ucp_Sinhala,
|
||||
ucp_Syloti_Nagri,
|
||||
ucp_Syriac,
|
||||
ucp_Tagalog,
|
||||
ucp_Tagbanwa,
|
||||
ucp_Tai_Le,
|
||||
ucp_Tamil,
|
||||
ucp_Telugu,
|
||||
ucp_Thaana,
|
||||
ucp_Thai,
|
||||
ucp_Tibetan,
|
||||
ucp_Tifinagh,
|
||||
ucp_Ugaritic,
|
||||
ucp_Yi,
|
||||
/* New for Unicode 5.0: */
|
||||
ucp_Balinese,
|
||||
ucp_Cuneiform,
|
||||
ucp_Nko,
|
||||
ucp_Phags_Pa,
|
||||
ucp_Phoenician,
|
||||
/* New for Unicode 5.1: */
|
||||
ucp_Carian,
|
||||
ucp_Cham,
|
||||
ucp_Kayah_Li,
|
||||
ucp_Lepcha,
|
||||
ucp_Lycian,
|
||||
ucp_Lydian,
|
||||
ucp_Ol_Chiki,
|
||||
ucp_Rejang,
|
||||
ucp_Saurashtra,
|
||||
ucp_Sundanese,
|
||||
ucp_Vai,
|
||||
/* New for Unicode 5.2: */
|
||||
ucp_Avestan,
|
||||
ucp_Bamum,
|
||||
ucp_Egyptian_Hieroglyphs,
|
||||
ucp_Imperial_Aramaic,
|
||||
ucp_Inscriptional_Pahlavi,
|
||||
ucp_Inscriptional_Parthian,
|
||||
ucp_Javanese,
|
||||
ucp_Kaithi,
|
||||
ucp_Lisu,
|
||||
ucp_Meetei_Mayek,
|
||||
ucp_Old_South_Arabian,
|
||||
ucp_Old_Turkic,
|
||||
ucp_Samaritan,
|
||||
ucp_Tai_Tham,
|
||||
ucp_Tai_Viet,
|
||||
/* New for Unicode 6.0.0: */
|
||||
ucp_Batak,
|
||||
ucp_Brahmi,
|
||||
ucp_Mandaic,
|
||||
/* New for Unicode 6.1.0: */
|
||||
ucp_Chakma,
|
||||
ucp_Meroitic_Cursive,
|
||||
ucp_Meroitic_Hieroglyphs,
|
||||
ucp_Miao,
|
||||
ucp_Sharada,
|
||||
ucp_Sora_Sompeng,
|
||||
ucp_Takri
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
/* End of pcre2_ucp.h */
|
1571
src/pcre2test.c
1571
src/pcre2test.c
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue