Further work on pcre2test (can now display compiled code).

This commit is contained in:
Philip.Hazel 2014-05-13 11:20:03 +00:00
parent 9812ca8b0a
commit 225992aa3a
32 changed files with 38225 additions and 937 deletions

View File

@ -269,25 +269,26 @@ COMMON_SOURCES = \
src/pcre2_error.c \
src/pcre2_match.c \
src/pcre2_internal.h \
src/pcre2_intmodedep.h \
src/pcre2_jit_compile.c \
src/pcre2_jit_match.c \
src/pcre2_jit_misc.c \
src/pcre2_maketables.c \
src/pcre2_match_data.c \
src/pcre2_pattern_info.c \
src/pcre2_string_utils.c \
src/pcre2_substring.c \
src/pcre2_tables.c \
src/pcre2_ucd.c \
src/pcre2_ucp.h \
src/pcre2_version.c
# src/pcre2_newline.c \
# src/pcre2_ord2utf8.c \
# src/pcre2_refcount.c \
# src/pcre2_string_utils.c \
# src/pcre2_study.c \
# src/pcre2_tables.c \
# src/pcre2_ucd.c \
# src/pcre2_valid_utf8.c \
# src/pcre2_xclass.c \
# src/ucp.h
# src/pcre2_xclass.c
if WITH_PCRE8
@ -450,7 +451,7 @@ endif # WITH_GCOV
endif # WITH_JIT
# Build the general pcre2test program. The file src/pcre2_printint.c is
# #included by pcre2test as many times as needed, at different code unit
# #included by pcre2test as many times as needed, at different code unit
# widths.
bin_PROGRAMS += pcre2test
@ -593,8 +594,8 @@ CLEANFILES += \
testtemp* \
testtry \
testNinput
## ------------ End of testing -------------
## ------------ End of testing -------------
# PCRE demonstration program. Not built automatically. The point is that the

113
maint/GenerateUtt.py Executable file
View File

@ -0,0 +1,113 @@
#! /usr/bin/python
# Generate utt tables. Note: this script is written in Python 2 and is
# incompatible with Python 3. However, the 2to3 conversion script has been
# successfully tested on it.
# The source file pcre2_tables.c contains (amongst other things), a table that
# is indexed by script name. In order to reduce the number of relocations when
# loading the library, the names are held as a single large string, with
# offsets in the table. This is tedious to maintain by hand. Therefore, this
# script is used to generate the table. The output is sent to stdout; usually
# that should be directed to a temporary file. Then pcre2_tables.c can be
# edited by replacing the relevant definitions and table therein with the
# temporary file.
# Modified by PH 17-March-2009 to generate the more verbose form that works
# for UTF-support in EBCDIC as well as ASCII environments.
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
# Modified by PH 04-May-2010 to add new "X.." special categories.
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
# Modified by ChPe 30-September-2012 to add this note; no other changes were
# necessary for Unicode 6.2.0 support.
# Modified by PH 26-February-2013 to add the Xuc special category.
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
# Unicode script names. Each of these is paired with the property type
# 'PT_SC' when the utt table is assembled further down in this script.
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
# New for Unicode 5.0
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
# New for Unicode 5.1
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
# New for Unicode 5.2
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
# New for Unicode 6.0.0
'Batak', 'Brahmi', 'Mandaic', \
# New for Unicode 6.1.0
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri'
]
# Two-letter category names; these are paired with 'PT_PC' below.
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
# One-letter general category names; these are paired with 'PT_GC' below.
general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']
# First add the Unicode script and category names. Each entry is a
# (name, property type) pair; real lists are built so that the table can be
# extended and sorted in place.
utt_table = [(name, 'PT_SC') for name in script_names]
utt_table += [(name, 'PT_PC') for name in category_names]
utt_table += [(name, 'PT_GC') for name in general_category_names]

# Now add our own specials.
utt_table.append(('Any', 'PT_ANY'))
utt_table.append(('L&', 'PT_LAMP'))
utt_table.append(('Xan', 'PT_ALNUM'))
utt_table.append(('Xps', 'PT_PXSPACE'))
utt_table.append(('Xsp', 'PT_SPACE'))
utt_table.append(('Xuc', 'PT_UCNC'))
utt_table.append(('Xwd', 'PT_WORD'))

# Sort the table.
utt_table.sort()

# We have to use STR_ macros to define the strings so that it all works in
# UTF-8 mode on EBCDIC platforms. Each output line is assembled as a list of
# space-separated pieces and written with a single-argument print(...), which
# produces the same text under the Python 2 print statement and the Python 3
# print function.
special_chars = {'_': 'STR_UNDERSCORE', '&': 'STR_AMPERSAND'}
for utt in utt_table:
    pieces = ['#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND'))]
    for c in utt[0]:
        # '_' and '&' cannot appear in a macro name, so they have their own
        # spelled-out STR_ macros; every other character maps to STR_<char>.
        pieces.append(special_chars.get(c, 'STR_%s' % c))
    pieces.append('"\\0"')
    print(' '.join(pieces))

# Print the actual table, using the string names
print('')
print('const char PRIV(utt_names)[] =')
last = ''
for utt in utt_table:
    # Only the final entry is followed by the terminating semicolon.
    if utt == utt_table[-1]:
        last = ';'
    print(' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last))
    # This was how it was done before the EBCDIC-compatible modification.
    # print ' "%s\\0"%s' % (utt[0], last)

print('\nconst ucp_type_table PRIV(utt)[] = {')
offset = 0
last = ','
for utt in utt_table:
    # The special property types carry no ucp_ value of their own.
    if utt[1] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
                  'PT_SPACE', 'PT_UCNC', 'PT_WORD'):
        value = '0'
    else:
        value = 'ucp_' + utt[0]
    # Every entry except the final one is followed by a comma.
    if utt == utt_table[-1]:
        last = ''
    print(' { %3d, %s, %s }%s' % (offset, utt[1], value, last))
    # Each name occupies its own length plus the trailing "\0" in utt_names.
    offset += len(utt[0]) + 1
print('};')

305
maint/ManyConfigTests Executable file
View File

@ -0,0 +1,305 @@
#! /bin/sh
# This is a script for the use of PCRE maintainers. It configures and rebuilds
# PCRE2 with a variety of configuration options, and in each case runs the
# tests to ensure that all goes well. Every possible combination would take far
# too long, so we use a representative sample. This script should be run in the
# PCRE2 source directory.
# Some of the tests have to be skipped when PCRE2 is built with non-Unix
# newline recognition. I am planning to reduce this as much as possible in due
# course.
# This is in case the caller has set aliases (as I do - PH)
# NOTE(review): "unset" removes variables and functions; aliases are removed
# with "unalias" - confirm which was intended.
unset cp ls mv rm

# Use -v to make the output more verbose
verbose=0
if [ "$1" = "-v" ] ; then verbose=1; fi

# This is a temporary directory for testing out-of-line builds
tmp=/tmp/pcretesting

# Don't bother with compiler optimization for most tests; it just slows down
# compilation a lot (and running the tests themselves is quick). However, a
# few specific tests turn optimization on, because it can provoke some compiler
# warnings.
CFLAGS="-g -O0"
CXXFLAGS="$CFLAGS"
ISGCC="no"

# If the compiler is gcc, add a lot of warning switches. The version text is
# captured in a variable rather than in a scratch file, so nothing is left
# behind in the current directory. $? after the assignment is the exit status
# of the "cc --version" command substitution.
ccversion=`cc --version 2>/dev/null`
if [ $? -eq 0 ] && echo "$ccversion" | grep GCC >/dev/null; then
  ISGCC="yes"
  CFLAGS="$CFLAGS -Wall"
  CFLAGS="$CFLAGS -Wno-overlength-strings"
  CFLAGS="$CFLAGS -Wpointer-arith"
  CFLAGS="$CFLAGS -Wwrite-strings"
  CFLAGS="$CFLAGS -Wundef -Wshadow"
  CFLAGS="$CFLAGS -Wmissing-field-initializers"
  CFLAGS="$CFLAGS -Wunused-parameter"
  CFLAGS="$CFLAGS -Wextra -Wformat"
  CFLAGS="$CFLAGS -Wbad-function-cast"
  CFLAGS="$CFLAGS -Wmissing-declarations"
  CFLAGS="$CFLAGS -Wnested-externs"
  CFLAGS="$CFLAGS -pedantic"
  CFLAGS="$CFLAGS -Wuninitialized"
  CFLAGS="$CFLAGS -Wmissing-prototypes"
  CFLAGS="$CFLAGS -Wstrict-prototypes"
fi
# This function runs a single test with the set of configuration options that
# are in $opts. The source directory must be set in srcdir.
# Configure, build and test once. Inputs are all shell variables set by the
# caller: $opts (configure options), $srcdir, $CFLAGS/$CXXFLAGS, $verbose,
# $valgrind, $cvalgrind, $withvalgrind, and the progress counters $testcount
# and $testtotal. Any failure prints the captured output and exits the whole
# script with status 1. The definition uses the POSIX "name()" form because
# the script runs under /bin/sh.
runtest()
{
  rm -f *_unittest

  testcount=`expr $testcount + 1`

  if [ "$opts" = "" ] ; then
    echo "[$testcount/$testtotal] Configuring with: default settings"
  else
    echo "[$testcount/$testtotal] Configuring with:"
    echo " $opts"
  fi

  CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" \
    $srcdir/configure $opts >/dev/null 2>teststderr

  if [ $? -ne 0 ]; then
    echo " "
    echo "**** Error while configuring ****"
    cat teststderr
    exit 1
  fi

  # Anything at all written to stderr by make (i.e. a compiler warning) is
  # treated as a failure, not just a non-zero exit status.
  echo "Making"
  make -j >/dev/null 2>teststderr
  if [ $? -ne 0 -o -s teststderr ]; then
    echo " "
    echo "**** Errors or warnings while making ****"
    echo " "
    cat teststderr
    exit 1
  fi

  if [ $verbose -eq 1 ]; then
    ./pcre2test -C
  fi

  # Query the freshly built pcre2test for its configuration. The newline
  # setting is read from its output; JIT and UTF support are read from its
  # exit status.
  nl=`./pcre2test -C newline`
  ./pcre2test -C jit >/dev/null
  jit=$?
  ./pcre2test -C utf >/dev/null
  utf=$?

  if [ "$nl" = "LF" -o "$nl" = "ANY" ]; then
    echo "Running C library tests $withvalgrind"
    $srcdir/RunTest $valgrind >teststdout
    if [ $? -ne 0 ]; then
      echo " "
      echo "**** Test failed ****"
      cat teststdout
      exit 1
    fi
  else
    echo "Skipping C library tests: newline is $nl"
  fi

  if [ "$nl" = "LF" ]; then
    echo "Running pcre2grep tests $withvalgrind"
    $srcdir/RunGrepTest $valgrind >teststdout 2>teststderr
    if [ $? -ne 0 ]; then
      echo " "
      echo "**** Test failed ****"
      cat teststderr
      cat teststdout
      exit 1
    fi
  else
    echo "Skipping pcre2grep tests: newline is $nl"
  fi

  if [ "$jit" -gt 0 -a $utf -gt 0 ]; then
    echo "Running JIT regression tests $withvalgrind"
    $cvalgrind $srcdir/pcre2_jit_test >teststdout 2>teststderr
    if [ $? -ne 0 ]; then
      echo " "
      echo "**** Test failed ****"
      cat teststderr
      cat teststdout
      exit 1
    fi
  else
    echo "Skipping JIT regression tests: JIT or UTF not enabled"
  fi

# if [ "$nl" = "LF" -o "$nl" = "ANY" ]; then
# if [ -f pcrecpp_unittest ] ; then
# for utest in pcrecpp_unittest \
# pcre_scanner_unittest \
# pcre_stringpiece_unittest
# do
# echo "Running $utest $withvalgrind"
# $cvalgrind $utest >teststdout
# if [ $? -ne 0 ]; then
# echo " "
# echo "**** Test failed ****"
# cat teststdout
# exit 1
# fi
# done
# else
# echo "Skipping C++ tests: pcrecpp_unittest does not exist"
# fi
# else
# echo "Skipping C++ tests: newline is $nl"
# fi
}
# Update the total count whenever a new test is added; it is used to show
# progress as each test is run.
testtotal=40
testcount=0
# This set of tests builds PCRE and runs the tests with a variety of configure
# options, in the current (source) directory. The empty configuration builds
# with all the default settings. As well as testing that these options work, we
# use --disable-shared or --disable-static after the default test (which builds
# both) to save a bit of time by building only one version of the library for
# the subsequent tests.
# The three valgrind variables start out empty, so the first runs are made
# without valgrind; they are filled in later for the valgrind pass.
valgrind=
cvalgrind=
withvalgrind=
srcdir=.
export srcdir
# If gcc is in use, run a maximally configured test with -O2, because that can
# throw up warnings that are not detected with -O0. CFLAGS is saved first and
# restored afterwards so that the remaining tests go back to the -O0 settings.
if [ "$ISGCC" = "yes" ]; then
  echo "Maximally configured test with -O2"
  SAVECFLAGS="$CFLAGS"
  CFLAGS="$CFLAGS -O2"
  opts="--disable-shared --enable-utf --enable-jit --enable-pcre16 --enable-pcre32"
  runtest
  CFLAGS="$SAVECFLAGS"
fi
# Run one configure/make/test cycle for each option string below. Each quoted
# string is one value of $opts, which runtest reads; the empty string selects
# the default configuration.
echo "General tests in the current directory"
for opts in \
"" \
"--enable-utf --disable-static" \
"--disable-stack-for-recursion --disable-shared" \
"--enable-utf --disable-shared" \
"--enable-utf --disable-stack-for-recursion --disable-shared" \
"--enable-utf --with-link-size=3 --disable-shared" \
"--enable-rebuild-chartables --disable-shared" \
"--enable-newline-is-any --disable-shared" \
"--enable-newline-is-cr --disable-shared" \
"--enable-newline-is-crlf --disable-shared" \
"--enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" \
"--enable-utf --enable-newline-is-any --disable-stack-for-recursion --disable-static" \
"--enable-jit --disable-shared" \
"--enable-jit --enable-utf --disable-shared" \
"--enable-jit --enable-utf --with-link-size=3 --disable-shared" \
"--enable-pcre16" \
"--enable-pcre16 --enable-jit --enable-utf --disable-shared" \
"--enable-pcre16 --enable-jit --disable-pcre8 --disable-shared" \
"--enable-pcre16 --enable-jit --disable-pcre8 --enable-utf --disable-shared" \
"--enable-pcre16 --disable-stack-for-recursion --disable-shared" \
"--enable-pcre16 --enable-utf --disable-stack-for-recursion --disable-shared" \
"--enable-pcre16 --enable-jit --enable-utf --with-link-size=3 --disable-shared" \
"--enable-pcre16 --enable-jit --enable-utf --with-link-size=4 --disable-shared" \
"--enable-pcre32" \
"--enable-pcre32 --enable-jit --enable-utf --disable-shared" \
"--enable-pcre32 --enable-jit --disable-pcre8 --disable-shared" \
"--enable-pcre32 --enable-jit --disable-pcre8 --enable-utf --disable-shared" \
"--enable-pcre32 --disable-stack-for-recursion --disable-shared" \
"--enable-pcre32 --enable-utf --disable-stack-for-recursion --disable-shared" \
"--enable-pcre32 --enable-jit --enable-utf --with-link-size=4 --disable-shared" \
"--enable-pcre32 --enable-pcre16 --disable-shared" \
"--enable-pcre32 --enable-pcre16 --disable-pcre8 --disable-shared" \
"--enable-pcre32 --enable-pcre16 --disable-pcre8 --enable-jit --enable-utf --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared"
do
runtest
done
# Now re-run some of the tests under valgrind.
echo "Tests in the current directory using valgrind"
# $valgrind is passed as an argument to RunTest and RunGrepTest; $cvalgrind is
# used as a command prefix for the JIT test program; $withvalgrind only
# decorates the progress messages.
valgrind=valgrind
cvalgrind="valgrind -q --smc-check=all"
withvalgrind="with valgrind"
# NOTE(review): the last two quoted strings below are separate words to the
# shell, so they give two loop iterations (one without --disable-shared and
# one with only --disable-shared) rather than one continued option string.
# Confirm that this is intended.
for opts in \
"--enable-utf --disable-stack-for-recursion --disable-shared" \
"--enable-utf --with-link-size=3 --disable-shared" \
"--enable-jit --enable-utf --disable-shared" \
"--enable-pcre16 --enable-pcre32 --enable-jit --enable-utf " \
"--disable-shared"
do
# Every valgrind run is also configured with --enable-valgrind.
opts="--enable-valgrind $opts"
runtest
done
# Switch valgrind off again for the remaining tests.
valgrind=
cvalgrind=
withvalgrind=
# Clean up the distribution and then do at least one build and test in a
# directory other than the source directory. It doesn't work unless the
# source directory is cleaned up first.
if [ -f Makefile ]; then
echo "Running 'make distclean'"
make distclean >/dev/null 2>&1
if [ $? -ne 0 ]; then
echo "** 'make distclean' failed"
exit 1
fi
fi
echo "Tests in the $tmp directory"
# From here on srcdir is the absolute path of the source tree, because the
# build itself runs from inside $tmp.
srcdir=`pwd`
export srcdir
# Create $tmp if nothing of that name exists, then insist that it is a
# directory before changing into it.
if [ ! -e $tmp ]; then
mkdir $tmp
fi
if [ ! -d $tmp ]; then
echo "** Failed to create $tmp or it is not a directory"
exit 1
fi
cd $tmp
if [ $? -ne 0 ]; then
echo "** Failed to cd to $tmp"
exit 1
fi
for opts in \
"--enable-utf --disable-shared"
do
runtest
done
# The whole temporary build directory is deleted once the test has passed
# (runtest exits the script on failure, so this is reached only on success).
echo "Removing $tmp"
rm -rf $tmp
echo "All done"
# End

505
maint/MultiStage2.py Executable file
View File

@ -0,0 +1,505 @@
#! /usr/bin/python
# Multistage table builder
# (c) Peter Kankowski, 2008
##############################################################################
# This script was submitted to the PCRE project by Peter Kankowski as part of
# the upgrading of Unicode property support. The new code speeds up property
# matching many times. The script is for the use of PCRE maintainers, to
# generate the pcre2_ucd.c file that contains a digested form of the Unicode
# data tables.
#
# The script should be run in the maint subdirectory, using the command
#
# [python2] ./MultiStage2.py >../src/pcre2_ucd.c
#
# It requires four Unicode data tables, DerivedGeneralCategory.txt,
# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
# Unicode.tables subdirectory. The first of these is found in the "extracted"
# subdirectory of the Unicode database (UCD) on the Unicode web site; the
# second is in the "auxiliary" subdirectory; the other two are directly in the
# UCD directory.
#
# Minor modifications made to this script:
# Added #! line at start
# Removed tabs
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
# Consequent code tidy
# Adjusted data file names to take from the Unicode.tables directory
# Adjusted global table names by prefixing _pcre_.
# Commented out stuff relating to the casefolding table, which isn't used;
# removed completely in 2012.
# Corrected size calculation
# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
# Update for PCRE2: name changes and SUPPORT_UCP is abolished.
#
# Major modifications made to this script:
# Added code to add a grapheme break property field to records.
#
# Added code to search for sets of more than two characters that must match
# each other caselessly. A new table is output containing these sets, and
# offsets into the table are added to the main output records. This new
# code scans CaseFolding.txt instead of UnicodeData.txt.
#
# The main tables generated by this script are used by macros defined in
# pcre2_internal.h. They look up Unicode character properties using short
# sequences of code that contains no branches, which makes for greater speed.
#
# Conceptually, there is a table of records (of type ucd_record), containing a
# script number, character type, grapheme break type, offset to caseless
# matching set, and offset to the character's other case for every character.
# However, a real table covering all Unicode characters would be far too big.
# It can be efficiently compressed by observing that many characters have the
# same record, and many blocks of characters (taking 128 characters in a block)
# have the same set of records as other blocks. This leads to a 2-stage lookup
# process.
#
# This script constructs four tables. The ucd_caseless_sets table contains
# lists of characters that all match each other caselessly. Each list is
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
# any valid character. The first list is empty; this is used for characters
# that are not part of any list.
#
# The ucd_records table contains one instance of every unique record that is
# required. The ucd_stage1 table is indexed by a character's block number, and
# yields what is in effect a "virtual" block number. The ucd_stage2 table is a
# table of "virtual" blocks; each block is indexed by the offset of a character
# within its own block, and the result is the offset of the required record.
#
# Example: lowercase "a" (U+0061) is in block 0
# lookup 0 in stage1 table yields 0
# lookup 97 in the first table in stage2 yields 16
# record 17 is { 33, 5, 11, 0, -32 }
# 33 = ucp_Latin => Latin script
# 5 = ucp_Ll => Lower case letter
# 11 = ucp_gbOther => Grapheme break property "Other"
# 0 => not part of a caseless set
# -32 => Other case is U+0041
#
# Almost all lowercase latin characters resolve to the same record. One or two
# are different because they are part of a multi-character caseless set (for
# example, k, K and the Kelvin symbol are such a set).
#
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
# lookup 96 in stage1 table yields 88
# lookup 66 in the 88th table in stage2 yields 467
# record 470 is { 26, 7, 11, 0, 0 }
# 26 = ucp_Hiragana => Hiragana script
# 7 = ucp_Lo => Other letter
# 11 = ucp_gbOther => Grapheme break property "Other"
# 0 => not part of a caseless set
# 0 => No other case
#
# In these examples, no other blocks resolve to the same "virtual" block, as it
# happens, but plenty of other blocks do share "virtual" blocks.
#
# There is a fourth table, maintained by hand, which translates from the
# individual character types such as ucp_Cc to the general types like ucp_C.
#
# Philip Hazel, 03 July 2008
#
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
# July-2012: Updated list of scripts for Unicode 6.1.0
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
# field in the record to hold the value. Luckily, the
# structure had a hole in it, so the resulting table is
# not much bigger than before.
# 18-September-2012: Added code for multiple caseless sets. This uses the
# final hole in the structure.
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
# 13-May-2014: Updated for PCRE2
##############################################################################
import re
import string
import sys
MAX_UNICODE = 0x110000
NOTACHAR = 0xffffffff
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
def make_get_names(enum):
    """Build a line parser that maps a property name to its index in enum.

    enum is the list of known property names. The returned function takes
    the list of ';'-separated fields of one data line and returns the
    position in enum of the name held in field 1. A name that is not in
    enum raises ValueError (from list.index).
    """
    def get_name_index(chardata):
        return enum.index(chardata[1])
    return get_name_index
# Parse a line of CaseFolding.txt
def get_other_case(chardata):
    """Return the offset from a character to its other case.

    chardata holds the fields of one CaseFolding.txt line: code point,
    status and mapping, the first and last in hexadecimal. Only lines with
    status 'C' or 'S' yield an offset (mapping minus code point); any other
    status yields 0.
    """
    if chardata[1] in ('C', 'S'):
        return int(chardata[2], 16) - int(chardata[0], 16)
    return 0
# Read the whole table in memory
def read_table(file_name, get_value, default_value):
    """Read a Unicode data file into a list indexed by code point.

    file_name      path of the data file to read
    get_value      function called with the list of stripped, ';'-separated
                   fields of each data line; its result is the value stored
    default_value  value for every code point the file does not set

    Returns a list of MAX_UNICODE values. Text from '#' to the end of a line
    is discarded, and lines with fewer than two fields are skipped. Field 0
    is either a single hexadecimal code point or a "first..last" range.
    Raises ValueError if field 0 has neither form.
    """
    range_pattern = re.compile(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$')
    table = [default_value] * MAX_UNICODE
    data_file = open(file_name, 'r')
    # try/finally (rather than "with") makes sure that the file is closed
    # even on error while staying usable on old Python 2 releases.
    try:
        for line in data_file:
            line = re.sub(r'#.*', '', line)
            chardata = [field.strip() for field in line.split(';')]
            if len(chardata) <= 1:
                continue
            value = get_value(chardata)
            m = range_pattern.match(chardata[0])
            if m is None:
                raise ValueError("%s: cannot parse code point range %r"
                                 % (file_name, chardata[0]))
            char = int(m.group(1), 16)
            if m.group(3) is None:
                last = char
            else:
                last = int(m.group(3), 16)
            for i in range(char, last + 1):
                # It is important not to overwrite a previously set
                # value because in the CaseFolding file there are lines
                # to be ignored (returning the default value of 0)
                # which often come after a line which has already set
                # data.
                if table[i] == default_value:
                    table[i] = value
    finally:
        data_file.close()
    return table
# Get the smallest possible C language type for the values
def get_type_size(table):
    """Return (C type name, size in bytes) of the smallest type for table.

    The candidates are tried in the order listed, unsigned types first, and
    the first one whose range covers both min(table) and max(table) wins.
    Raises OverflowError if no candidate is wide enough.
    """
    # (type name, size in bytes, smallest value, largest value)
    c_types = [("uint8_t", 1, 0, 255),
               ("uint16_t", 2, 0, 65535),
               ("uint32_t", 4, 0, 4294967295),
               ("signed char", 1, -128, 127),
               ("pcre_int16", 2, -32768, 32767),
               ("pcre_int32", 4, -2147483648, 2147483647)]
    minval = min(table)
    maxval = max(table)
    for name, size, minlimit, maxlimit in c_types:
        if minlimit <= minval and maxval <= maxlimit:
            return (name, size)
    raise OverflowError("Too large to fit into C types")
def get_tables_size(*tables):
    """Return the total size in bytes of all the given tables.

    Each table is costed as its length times the size of the smallest C
    type, as chosen by get_type_size, that can hold all of its values.
    """
    total_size = 0
    for table in tables:
        # Only the element size is needed here, not the type name.
        elem_size = get_type_size(table)[1]
        total_size += elem_size * len(table)
    return total_size
# Compress the table into the two stages
def compress_table(table, block_size):
    """Split table into blocks of block_size and share identical blocks.

    Returns (stage1, stage2). stage2 is the concatenation of the distinct
    blocks in order of first appearance. stage1 holds, for each block of the
    input, the number of the stage2 block with the same contents.
    """
    blocks = {}   # Dictionary for finding identical blocks
    stage1 = []   # Stage 1 table contains block numbers (indices into stage 2 table)
    stage2 = []   # Stage 2 table contains the blocks with property values
    table = tuple(table)
    for i in range(0, len(table), block_size):
        block = table[i:i+block_size]
        start = blocks.get(block)
        if start is None:
            # Allocate a new block. Floor division keeps the block number an
            # integer whichever Python version runs the script.
            start = len(stage2) // block_size
            stage2 += block
            blocks[block] = start
        stage1.append(start)
    return stage1, stage2
# Print a table
def print_table(table, table_name, block_size = None):
    """Print table on stdout as the initializer of a C array.

    table       sequence of integer values
    table_name  C name for the array
    block_size  if given, the values are printed in commented blocks of this
                many entries; if None, they are printed 16 per line, each
                line followed by a comment holding a U+ code point

    The element type is the smallest C type chosen by get_type_size.
    """
    type_name, size = get_type_size(table)
    ELEMS_PER_LINE = 16

    s = "const %s %s[] = { /* %d bytes" % (type_name, table_name, size * len(table))
    if block_size:
        s += ", block = %d" % block_size
    print(s + " */")
    table = tuple(table)
    if block_size is None:
        fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
        # Number of code points covered by each table entry.
        mult = MAX_UNICODE // len(table)
        for i in range(0, len(table), ELEMS_PER_LINE):
            print(fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,)))
    else:
        # One output line holds at most ELEMS_PER_LINE values; a larger
        # block is printed as several such lines.
        el = min(block_size, ELEMS_PER_LINE)
        fmt = "%3d," * el + "\n"
        if block_size > ELEMS_PER_LINE:
            fmt = fmt * (block_size // ELEMS_PER_LINE)
        for i in range(0, len(table), block_size):
            print(("/* block %d */\n" + fmt) % ((i // block_size,) + table[i:i+block_size]))
    print("};\n")
# Extract the unique combinations of properties into records
def combine_tables(*tables):
    """Merge parallel property tables into unique records.

    The tables are walked in lockstep; the tuple of values at each position
    is one record. Returns (index, records), where records maps each
    distinct tuple to a number assigned in order of first appearance, and
    index gives the record number for every position.
    """
    records = {}
    index = []
    for t in zip(*tables):
        i = records.get(t)
        if i is None:
            i = records[t] = len(records)
        index.append(i)
    return index, records
def get_record_size_struct(records):
    """Work out the size and C layout of one record.

    records is a non-empty collection of equal-length tuples. Each field
    position gets the smallest C type, as chosen by get_type_size, that
    holds every value seen in that position.

    Returns (size, structure): size is the record size in bytes including
    alignment padding, and structure is a C comment containing the matching
    typedef.
    """
    # Accept any iterable of records (for example a dictionary's keys), as
    # the records are indexed below.
    records = list(records)
    size = 0
    structure = '/* When recompiling tables with a new Unicode version, please check the\n' + \
        'types in this structure definition from pcre2_internal.h (the actual\n' + \
        'field names will be different):\n\ntypedef struct {\n'
    for i in range(len(records[0])):
        record_slice = [record[i] for record in records]
        slice_type, slice_size = get_type_size(record_slice)

        # add padding: round up to the nearest multiple of slice_size
        size = (size + slice_size - 1) & -slice_size
        size += slice_size
        structure += '%s property_%d;\n' % (slice_type, i)

    # round up to the first item of the next structure in array
    record_slice = [record[0] for record in records]
    slice_type, slice_size = get_type_size(record_slice)
    size = (size + slice_size - 1) & -slice_size

    structure += '} ucd_record;\n*/\n\n'
    return size, structure
def test_record_size():
    """Self-check of get_record_size_struct on a few small record sets.

    Each case pairs a list of records with the record size in bytes that is
    expected for it; an AssertionError is raised on a mismatch.
    """
    tests = [ \
        ( [(3,), (6,), (6,), (1,)], 1 ), \
        ( [(300,), (600,), (600,), (100,)], 2 ), \
        ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
        ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
        ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
        ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
        ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
        ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
    ]
    for records, expected_size in tests:
        size, struct = get_record_size_struct(records)
        assert size == expected_size
        #print struct
def print_records(records, record_size):
    """Print the table of unique records on stdout as a C array initializer.

    records      dictionary mapping each record tuple to its record number
    record_size  size of one record in bytes (used only in the comment)

    The records are written in order of record number, each followed by a
    comment giving its position in the output.
    """
    print('const ucd_record PRIV(ucd_records)[] = { ' + \
          '/* %d bytes, record size %d */' % (len(records) * record_size, record_size))
    # sorted() with a key function has the same meaning in Python 2 and 3.
    ordered = sorted(records.items(), key=lambda item: item[1])
    for i, record in enumerate(ordered):
        print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
    print('};\n')
# Names of the Unicode scripts. Scripts.txt is parsed against this list
# below, so a character's script value is the position of its name here.
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
# New for Unicode 5.0
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
# New for Unicode 5.1
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
# New for Unicode 5.2
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
# New for Unicode 6.0.0
'Batak', 'Brahmi', 'Mandaic', \
# New for Unicode 6.1.0
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri'
]
# Names of the character categories, used to parse
# DerivedGeneralCategory.txt in the same way.
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
# Names of the grapheme break properties, used to parse
# GraphemeBreakProperty.txt in the same way.
break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other' ]
# Run the record-size self-test before doing any real work.
test_record_size()
# Load one table per property, each covering every code point. Code points
# that a file does not mention get the default given as the last argument:
# the index of 'Common', 'Cn' or 'Other', or 0 for the other-case offset.
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
# This block of code was added by PH in September 2012. I am not a Python
# programmer, so the style is probably dreadful, but it does the job. It scans
# the other_case table to find sets of more than two characters that must all
# match each other caselessly. Later in this script a table of these sets is
# written out. However, we have to do this work here in order to compute the
# offsets in the table that are inserted into the main table.
# The CaseFolding.txt file lists pairs, but the common logic for reading data
# sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set.
# NOTE(review): range(0x10ffff) ends at U+10FFFE, one short of the last index
# of the MAX_UNICODE-sized tables - confirm that skipping U+10FFFF is
# deliberate.
for c in range(0x10ffff):
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
other_case[c + other_case[c]] = -other_case[c]
# Now scan again and create equivalence sets.
sets = []
for c in range(0x10ffff):
# o is the other case of c (o == c when c has no other case).
o = c + other_case[c]
# Trigger when this character's other case does not point back here. We
# now have three characters that are case-equivalent.
if other_case[o] != -other_case[c]:
# t is the character that o's other-case offset leads to.
t = o + other_case[o]
# Scan the existing sets to see if any of the three characters are already
# part of a set. If so, unite the existing set with the new set.
appended = 0
for s in sets:
found = 0
for x in s:
if x == c or x == o or x == t:
found = 1
# Add new characters to an existing set
# NOTE(review): "found" is cleared once here, before the loop over
# [c, o, t]. The original nesting is not visible in this copy of the
# script; check whether it is meant to be cleared for each y.
if found:
found = 0
for y in [c, o, t]:
for x in s:
if x == y:
found = 1
if not found:
s.append(y)
appended = 1
# If we have not added to an existing set, create a new one.
if not appended:
sets.append([c, o, t])
# End of loop looking for caseless sets.
# Now scan the sets and set appropriate offsets for the characters.
caseless_offsets = [0] * MAX_UNICODE
# The first offset is 1 and each set advances it by len(s) + 1. This matches
# the table printed later, which starts with one NOTACHAR entry and ends each
# set with another NOTACHAR.
offset = 1;
for s in sets:
for x in s:
caseless_offsets[x] = offset
offset += len(s) + 1
# End of block of code for creating offsets for caseless matching sets.
# Combine the tables
# The five per-character property tables are merged into one index table plus
# a dictionary of the unique records.
table, records = combine_tables(script, category, break_props,
caseless_offsets, other_case)
record_size, record_struct = get_record_size_struct(records.keys())
# Find the optimum block size for the two-stage table
# The candidate block sizes are 2**5 to 2**9, i.e. 32, 64, 128, 256 and 512.
# For each one the total is the size of the records plus the sizes of both
# stage tables, and the first candidate with the smallest total is kept.
min_size = sys.maxint
for block_size in [2 ** i for i in range(5,10)]:
size = len(records) * record_size
stage1, stage2 = compress_table(table, block_size)
size += get_tables_size(stage1, stage2)
#print "/* block size %5d => %5d bytes */" % (block_size, size)
if size < min_size:
min_size = size
min_stage1, min_stage2 = stage1, stage2
min_block_size = block_size
# Everything from here on writes the generated C source to stdout: a header
# comment, the includes, dummy tables for builds without SUPPORT_UTF, and
# then the real tables.
print "/* This module is generated by the maint/MultiStage2.py script."
print "Do not modify it by hand. Instead modify the script and run it"
print "to regenerate this code."
print
print "As well as being part of the PCRE2 library, this module is #included"
print "by the pcre2test program, which redefines the PRIV macro to change"
print "table names from _pcre2_xxx to xxxx, thereby avoiding name clashes"
print "with the library. At present, just one of these tables is actually"
print "needed. */"
print
print "#ifndef PCRE2_INCLUDED"
print
print "#ifdef HAVE_CONFIG_H"
print "#include \"config.h\""
print "#endif"
print
print "#include \"pcre2_internal.h\""
print
print "#endif /* PCRE2_INCLUDED */"
print
print "/* Unicode character database. */"
print "/* This file was autogenerated by the MultiStage2.py script. */"
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
print
print "/* The tables herein are needed only when UCP support is built,"
print "and in PCRE2 that happens automatically with UTF support."
print "This module should not be referenced otherwise, so"
print "it should not matter whether it is compiled or not. However"
print "a comment was received about space saving - maybe the guy linked"
print "all the modules rather than using a library - so we include a"
print "condition to cut out the tables when not needed. But don't leave"
print "a totally empty module because some compilers barf at that."
print "Instead, just supply small dummy tables. */"
print
print "#ifndef SUPPORT_UTF"
print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};"
print "const uint8_t PRIV(ucd_stage1)[] = {0};"
print "const uint16_t PRIV(ucd_stage2)[] = {0};"
print "const uint32_t PRIV(ucd_caseless_sets)[] = {0};"
print "#else"
print
print record_struct
# --- Added by PH: output the table of caseless character sets ---
# The table starts with a lone NOTACHAR (the empty set). Each set follows in
# sorted order on one output line, ending with its own NOTACHAR.
print "const uint32_t PRIV(ucd_caseless_sets)[] = {"
print " NOTACHAR,"
for s in sets:
s = sorted(s)
for x in s:
print ' 0x%04x,' % x,
print ' NOTACHAR,'
print '};'
print
# ------
print "/* When #included in pcre2test, we don't need this large table. */"
print
print "#ifndef PCRE2_INCLUDED"
print
print_records(records, record_size)
print_table(min_stage1, 'PRIV(ucd_stage1)')
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
# The generated file refuses to compile if UCD_BLOCK_SIZE differs from the
# block size chosen above.
print "#if UCD_BLOCK_SIZE != %d" % min_block_size
print "#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h"
print "#endif"
# NOTE(review): the comments on the next two #endif lines are in the opposite
# order to the nesting. The first closes the "#ifndef PCRE2_INCLUDED" opened
# just above and the second closes the "#ifndef SUPPORT_UTF ... #else" block.
# Confirm and swap the labels if so.
print "#endif /* SUPPORT_UTF */"
print
print "#endif /* PCRE2_INCLUDED */"
"""
# Three-stage tables:
# Find the optimum block size for 3-stage table
min_size = sys.maxint
for stage3_block in [2 ** i for i in range(2,6)]:
stage_i, stage3 = compress_table(table, stage3_block)
for stage2_block in [2 ** i for i in range(5,10)]:
size = len(records) * 4
stage1, stage2 = compress_table(stage_i, stage2_block)
size += get_tables_size(stage1, stage2, stage3)
# print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
if size < min_size:
min_size = size
min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
min_stage2_block, min_stage3_block = stage2_block, stage3_block
print "/* Total size: %d bytes" % min_size */
print_records(records)
print_table(min_stage1, 'ucd_stage1')
print_table(min_stage2, 'ucd_stage2', min_stage2_block)
print_table(min_stage3, 'ucd_stage3', min_stage3_block)
"""

324
maint/README Normal file
View File

@ -0,0 +1,324 @@
MAINTENANCE README FOR PCRE2
============================
The files in the "maint" directory of the PCRE2 source contain data, scripts,
and programs that are used for the maintenance of PCRE2, but which do not form
part of the PCRE2 distribution tarballs. This document describes these files
and also contains some notes for maintainers. Its contents are:
Files in the maint directory
Updating to a new Unicode release
Preparing for a PCRE2 release
Making a PCRE2 release
Future ideas (wish list)
Files in the maint directory
============================
GenerateUtt.py A Python script to generate part of the pcre2_tables.c file
that contains Unicode script names in a long string with
offsets, which is tedious to maintain by hand.
ManyConfigTests A shell script that runs "configure, make, test" a number of
times with different configuration settings.
MultiStage2.py A Python script that generates the file pcre2_ucd.c from three
Unicode data tables, which are themselves downloaded from the
Unicode web site. Run this script in the "maint" directory.
The generated file contains the tables for a 2-stage lookup
of Unicode properties.
pcre2_chartables.c.non-standard
This is a set of character tables that came from a Windows
system. It has characters greater than 128 that are set as
spaces, amongst other things. I kept it so that it can be
used for testing from time to time.
README This file.
Unicode.tables The files in this directory (CaseFolding.txt,
DerivedGeneralCategory.txt, GraphemeBreakProperty.txt,
Scripts.txt and UnicodeData.txt) were downloaded from the
Unicode web site. They contain information about Unicode
characters and scripts.
ucptest.c A short C program for testing the Unicode property macros
that do lookups in the pcre2_ucd.c data, mainly useful after
rebuilding the Unicode property table. Compile and run this in
the "maint" directory (see comments at its head).
ucptestdata A directory containing two files, testinput1 and testoutput1,
to use in conjunction with the ucptest program.
utf8.c A short, freestanding C program for converting a Unicode code
point into a sequence of bytes in the UTF-8 encoding, and vice
versa. If its argument is a hex number such as 0x1234, it
outputs a list of the equivalent UTF-8 bytes. If its argument
is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it
treats them as a UTF-8 character and outputs the equivalent
code point in hex.
Updating to a new Unicode release
=================================
When there is a new release of Unicode, the files in Unicode.tables must be
refreshed from the web site. If the new version of Unicode adds new character
scripts, the source file pcre2_ucp.h and both the MultiStage2.py and the
GenerateUtt.py scripts must be edited to add the new names. Then MultiStage2.py
can be run to generate a new version of pcre2_ucd.c, and GenerateUtt.py can be
run to generate the tricky tables for inclusion in pcre2_tables.c.
If MultiStage2.py gives the error "ValueError: list.index(x): x not in list",
the cause is usually a missing (or misspelt) name in the list of scripts. I
couldn't find a straightforward list of scripts on the Unicode site, but
there's a useful Wikipedia page that lists them, and notes the Unicode version
in which they were introduced:
http://en.wikipedia.org/wiki/Unicode_scripts#Table_of_Unicode_scripts
The ucptest program can be compiled and used to check that the new tables in
pcre2_ucd.c work properly, using the data files in ucptestdata to check a
number of test characters. The source file ucptest.c must be updated whenever
new Unicode script names are added.
Note also that both the pcre2syntax.3 and pcre2pattern.3 man pages contain
lists of Unicode script names.
Preparing for a PCRE2 release
=============================
This section contains a checklist of things that I consult before building a
distribution for a new release.
. Ensure that the version number and version date are correct in configure.ac.
. Update the library version numbers in configure.ac according to the rules
given below.
. If new build options have been added, ensure that they are added to the CMake
files as well as to the autoconf files. The relevant files are CMakeLists.txt
and config-cmake.h.in. After making a release tarball, test it out with CMake
if there have been changes here.
. Run ./autogen.sh to ensure everything is up-to-date.
. Compile and test with many different config options, and combinations of
options. Also, test with valgrind by running "RunTest valgrind" and
"RunGrepTest valgrind" (which takes quite a long time). The script
maint/ManyConfigTests now encapsulates this testing. It runs tests with
different configurations, and it also runs some of them with valgrind, all of
which can take quite some time.
. Run perltest.pl on the test data for tests 1, 4, and 6. The output
should match the PCRE2 test output, apart from the version identification at
the start of each test. The other tests are not Perl-compatible (they use
various PCRE2-specific features or options).
. It is possible to test with the emulated memmove() function by undefining
HAVE_MEMMOVE and HAVE_BCOPY in config.h, though I do not do this often. You
may see a number of "pcre2_memmove defined but not used" warnings for the
modules in which there is no call to memmove(). These can be ignored.
. Documentation: check AUTHORS, ChangeLog (check version and date), LICENCE,
NEWS (check version and date), NON-AUTOTOOLS-BUILD, and README. Many of these
won't need changing, but over the long term things do change.
. I used to test new releases myself on a number of different operating
systems, using different compilers as well. For example, on Solaris it is
helpful to test using Sun's cc compiler as a change from gcc. Adding
-xarch=v9 to the cc options does a 64-bit test, but it also needs -S 64 for
pcretest to increase the stack size for test 2. Since I retired I can no
longer do this, but instead I rely on putting out release candidates for
folks on the pcre-dev list to test.
Updating version info for libtool
=================================
This set of rules for updating library version information came from a web page
whose URL I have forgotten. The version information consists of three parts:
(current, revision, age).
1. Start with version information of 0:0:0 for each libtool library.
2. Update the version information only immediately before a public release of
your software. More frequent updates are unnecessary, and only guarantee
that the current interface number gets larger faster.
3. If the library source code has changed at all since the last update, then
increment revision; c:r:a becomes c:r+1:a.
4. If any interfaces have been added, removed, or changed since the last
update, increment current, and set revision to 0.
5. If any interfaces have been added since the last public release, then
increment age.
6. If any interfaces have been removed or changed since the last public
release, then set age to 0.
The following explanation may help in understanding the above rules a bit
better. Consider that there are three possible kinds of reaction from users to
changes in a shared library:
1. Programs using the previous version may use the new version as a drop-in
replacement, and programs using the new version can also work with the
previous one. In other words, no recompiling nor relinking is needed. In
this case, increment revision only, don't touch current or age.
2. Programs using the previous version may use the new version as a drop-in
replacement, but programs using the new version may use APIs not present in
the previous one. In other words, a program linking against the new version
may fail if linked against the old version at run time. In this case, set
revision to 0, increment current and age.
3. Programs may need to be changed, recompiled, relinked in order to use the
new version. Increment current, set revision and age to 0.
Making a PCRE2 release
======================
Run PrepareRelease and commit the files that it changes (by removing trailing
spaces). The first thing this script does is to run CheckMan on the man pages;
if it finds any markup errors, it reports them and then aborts.
Once PrepareRelease has run clean, run "make distcheck" to create the tarballs
and the zipball. Double-check with "svn status", then create an SVN tagged
copy:
svn copy svn://vcs.exim.org/pcre2/code/trunk \
svn://vcs.exim.org/pcre2/code/tags/pcre-8.xx
Don't forget to update Freecode (fka Freshmeat) when the new release is out,
and to tell webmaster@pcre.org and the mailing list. Also, update the list of
version numbers in Bugzilla (edit products).
Future ideas (wish list)
========================
This section records a list of ideas so that they do not get forgotten. They
vary enormously in their usefulness and potential for implementation. Some are
very sensible; some are rather wacky. Some have been on this list for years;
others are relatively new.
. Optimization
There are always ideas for new optimizations so as to speed up pattern
matching. Most of them try to save work by recognizing a non-match without
having to scan all the possibilities. These are some that I've recorded:
* /((A{0,5}){0,5}){0,5}(something complex)/ on a non-matching string is very
slow, though Perl is fast. Can we speed up somehow? Convert to {0,125}?
OTOH, this is pathological - the user could easily fix it.
* Turn ={4} into ==== ? (for speed). I once did an experiment, and it seems
to have little effect, and maybe makes things worse.
* "Ends with literal string" - note that a single character doesn't gain much
over the existing "required byte" (reqbyte) feature that just remembers one
data unit.
* Remember an initial string rather than just 1 code unit?
* A required code unit from alternatives - not just the last unit, but an
earlier one if common to all alternatives.
o Friedl contains other ideas.
* The code does not set initial code unit flags for Unicode property types
such as \p; I don't know how much benefit there would be for, for example,
setting the bits for 0-9 and all values >= xC0 (in 8-bit mode) when a
pattern starts with \p{N}.
* There is scope for more "auto-possessifying" in connection with \p and \P.
. If Perl gets to a consistent state over the settings of capturing sub-
patterns inside repeats, see if we can match it. One example of the
difference is the matching of /(main(O)?)+/ against mainOmain, where PCRE
leaves $2 set. In Perl, it's unset. Changing this in PCRE will be very hard
because I think it needs much more state to be remembered.
. Perl 6 will be a revolution. Is it a revolution too far for PCRE?
. Allow errorptr and erroroffset to be NULL. I don't like this idea.
. Line endings:
* Option to use NUL as a line terminator in subject strings. This could now
be done relatively easily since the extension to support LF, CR, and CRLF.
If it is done, a suitable option for pcregrep is also required.
. Catch SIGSEGV for stack overflows?
. A feature to suspend a match via a callout was once requested.
. Option to convert results into character offsets and character lengths.
. Option for pcregrep to scan only the start of a file. I am not keen - this is
the job of "head".
. A (non-Unix) user wanted pcregrep options to (a) list a file name just once,
preceded by a blank line, instead of adding it to every matched line, and (b)
support --outputfile=name.
. Consider making UTF and UCP the default for PCRE n.0 for some n > 8.
. Define a union for the results from pcre2_pattern_info().
. Provide a "random access to the subject" facility so that the way in which it
is stored is independent of PCRE. For efficiency, it probably isn't possible
to switch this dynamically. It would have to be specified when PCRE was
compiled. PCRE would then call a function every time it wanted a character.
. Wild thought: the ability to compile from PCRE's internal byte code to a real
FSM and a very fast (third) matcher to process the result. There would be
even more restrictions than for pcre_dfa_exec(), however. This is not easy.
This is probably obsolete now that we have the JIT support.
. Should pcretest have some private locale data, to avoid relying on the
available locales for the test data, since different OS have different ideas?
This won't be as thorough a test, but perhaps that doesn't really matter.
. pcregrep: add -rs for a sorted recurse? Having to store file names and sort
them will of course slow it down.
. Someone suggested --disable-callout to save code space when callouts are
never wanted. This seems rather marginal.
. A user suggested a parameter to limit the length of string matched, for
example if the parameter is N, the current match should fail if the matched
substring exceeds N. This could apply to both match functions. The value
could be a new field in the extra block.
. Callouts with arguments: (?Cn:ARG) for instance.
. Write a function that generates random matching strings for a compiled regex.
. Pcregrep: an option to specify the output line separator, either as a string
or select from a fixed list. This is not dead easy, because at the moment it
outputs whatever is in the input file.
. Improve the code for duplicate checking in pcre_dfa_exec(). An incomplete,
non-thread-safe patch showed that this can help performance for patterns
where there are many alternatives. However, a simple thread-safe
implementation that I tried made things worse in many simple cases, so this
is not an obviously good thing.
. PCRE cannot at present distinguish between subpatterns with different names,
but the same number (created by the use of ?|). In order to do so, a way of
remembering *which* subpattern numbered n matched is needed. Bugzilla #760.
Now that (*MARK) has been implemented, it can perhaps be used as a way round
this problem.
. Instead of having #ifdef HAVE_CONFIG_H in each module, put #include
"something" and then the #ifdef appears only in one place, in "something".
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
Last updated: 13 May 2014

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,138 @@
/* A fixed set of character tables, held as one array of 1088 bytes made up of
four consecutive sub-tables of 256, 256, 320 and 256 bytes. Each row below holds
eight entries, and entries within a sub-table are indexed by character value.

NOTE(review): the sizes match the usual PCRE layout of lower-casing table,
case-flipping table, character class bit maps and character type flags. The
first two roles can be read off the data; the last two are assumed - confirm
against the table-building code before relying on them. */

const unsigned char _pcre_default_tables[] = {

/* Sub-table 1 (256 bytes): lower-casing. Values 65-90 map to 97-122, and values
192-222 map to 224-254, except that 215 maps to itself. Every other value maps
to itself. */

0,1,2,3,4,5,6,7,
8,9,10,11,12,13,14,15,
16,17,18,19,20,21,22,23,
24,25,26,27,28,29,30,31,
32,33,34,35,36,37,38,39,
40,41,42,43,44,45,46,47,
48,49,50,51,52,53,54,55,
56,57,58,59,60,61,62,63,
64,97,98,99,100,101,102,103,
104,105,106,107,108,109,110,111,
112,113,114,115,116,117,118,119,
120,121,122,91,92,93,94,95,
96,97,98,99,100,101,102,103,
104,105,106,107,108,109,110,111,
112,113,114,115,116,117,118,119,
120,121,122,123,124,125,126,127,
128,129,130,131,132,133,134,135,
136,137,138,139,140,141,142,143,
144,145,146,147,148,149,150,151,
152,153,154,155,156,157,158,159,
160,161,162,163,164,165,166,167,
168,169,170,171,172,173,174,175,
176,177,178,179,180,181,182,183,
184,185,186,187,188,189,190,191,
224,225,226,227,228,229,230,231,
232,233,234,235,236,237,238,239,
240,241,242,243,244,245,246,215,
248,249,250,251,252,253,254,223,
224,225,226,227,228,229,230,231,
232,233,234,235,236,237,238,239,
240,241,242,243,244,245,246,247,
248,249,250,251,252,253,254,255,

/* Sub-table 2 (256 bytes): case flipping. Values 65-90 and 97-122 are swapped
with each other, as are 192-222 and 224-254, except that 215 and 247 map to
themselves. Every other value maps to itself. */

0,1,2,3,4,5,6,7,
8,9,10,11,12,13,14,15,
16,17,18,19,20,21,22,23,
24,25,26,27,28,29,30,31,
32,33,34,35,36,37,38,39,
40,41,42,43,44,45,46,47,
48,49,50,51,52,53,54,55,
56,57,58,59,60,61,62,63,
64,97,98,99,100,101,102,103,
104,105,106,107,108,109,110,111,
112,113,114,115,116,117,118,119,
120,121,122,91,92,93,94,95,
96,65,66,67,68,69,70,71,
72,73,74,75,76,77,78,79,
80,81,82,83,84,85,86,87,
88,89,90,123,124,125,126,127,
128,129,130,131,132,133,134,135,
136,137,138,139,140,141,142,143,
144,145,146,147,148,149,150,151,
152,153,154,155,156,157,158,159,
160,161,162,163,164,165,166,167,
168,169,170,171,172,173,174,175,
176,177,178,179,180,181,182,183,
184,185,186,187,188,189,190,191,
224,225,226,227,228,229,230,231,
232,233,234,235,236,237,238,239,
240,241,242,243,244,245,246,215,
248,249,250,251,252,253,254,223,
192,193,194,195,196,197,198,199,
200,201,202,203,204,205,206,207,
208,209,210,211,212,213,214,247,
216,217,218,219,220,221,222,255,

/* Sub-table 3 (320 bytes). NOTE(review): presumably ten 32-byte bit maps, one
per character class - confirm which class each map represents. */

0,62,0,0,1,0,0,0,
0,0,0,0,0,0,0,0,
32,0,0,0,1,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,255,3,
126,0,0,0,126,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,255,3,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,12,2,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
254,255,255,7,0,0,0,0,
0,0,0,0,0,0,0,0,
255,255,127,127,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,254,255,255,7,
0,0,0,0,0,4,32,4,
0,0,0,128,255,255,127,255,
0,0,0,0,0,0,255,3,
254,255,255,135,254,255,255,7,
0,0,0,0,0,4,44,6,
255,255,127,255,255,255,127,255,
0,0,0,0,254,255,255,255,
255,255,255,255,255,255,255,127,
0,0,0,0,254,255,255,255,
255,255,255,255,255,255,255,255,
0,2,0,0,255,255,255,255,
255,255,255,255,255,255,255,127,
0,0,0,0,255,255,255,255,
255,255,255,255,255,255,255,255,
0,0,0,0,254,255,0,252,
1,0,0,248,1,0,0,120,
0,0,0,0,254,255,255,255,
0,0,128,0,0,0,128,0,
255,255,255,255,0,0,0,0,
0,0,0,0,0,0,0,128,
255,255,255,255,0,0,0,0,
0,0,0,0,0,0,0,0,

/* Sub-table 4 (256 bytes). NOTE(review): presumably one byte of character type
flag bits per character value - confirm the meaning of each bit. */

128,0,0,0,0,0,0,0,
0,1,1,0,1,1,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
1,0,0,0,128,0,0,0,
128,128,128,128,0,0,128,0,
28,28,28,28,28,28,28,28,
28,28,0,0,0,0,0,128,
0,26,26,26,26,26,26,18,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,
18,18,18,128,128,0,128,16,
0,26,26,26,26,26,26,18,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,
18,18,18,128,128,0,0,0,
0,0,0,0,0,1,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,
0,0,18,0,0,0,0,0,
0,0,20,20,0,18,0,0,
0,20,18,0,0,0,0,0,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,0,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,18,
18,18,18,18,18,18,18,0,
18,18,18,18,18,18,18,18
};

297
maint/ucptest.c Normal file
View File

@ -0,0 +1,297 @@
/***************************************************
* A program for testing the Unicode property table *
***************************************************/
/* Copyright (c) University of Cambridge 2008 */
/* Compile thus:
gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
*/
/* The program expects to read commands on stdin, and it writes output
to stdout. There is only one command, "findprop", followed by a list of Unicode
code points as hex numbers (without any prefixes). The output is one line per
character, giving its Unicode properties followed by its other case if there is
one. */
#ifdef HAVE_CONFIG_H
#include "../src/config.h"
#endif
#ifndef SUPPORT_UTF
#define SUPPORT_UTF
#endif
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../src/pcre2_internal.h"
#include "../src/pcre2_ucp.h"
/* -------------------------------------------------------------------*/
#define CS (char *)
#define CCS (const char *)
#define CSS (char **)
#define US (unsigned char *)
#define CUS (const unsigned char *)
#define USS (unsigned char **)
/* -------------------------------------------------------------------*/
/*************************************************
*      Print Unicode property info for a char    *
*************************************************/

/* Looks up the properties of one code point via the UCD_xxx macros and writes
them to stdout as a single line:

  <hex code point> <general category>: <particular category>, <script>,
    <grapheme break property>

If the character has a different other case, that code point is appended. If it
also belongs to a caseless set, the remaining members of the set are appended
as well. Any property value for which there is no case below is shown as "??".

Argument:   c   the code point to look up
Returns:    nothing
*/

static void
print_prop(int c)
{
int type = UCD_CATEGORY(c);
int fulltype = UCD_CHARTYPE(c);
int script = UCD_SCRIPT(c);
int gbprop = UCD_GRAPHBREAK(c);
int othercase = UCD_OTHERCASE(c);
int caseset = UCD_CASESET(c);

unsigned char *fulltypename = US"??";
unsigned char *typename = US"??";
unsigned char *scriptname = US"??";
unsigned char *graphbreak = US"??";

switch (type)
  {
  case ucp_C: typename = US"Control"; break;
  case ucp_L: typename = US"Letter"; break;
  case ucp_M: typename = US"Mark"; break;
  case ucp_N: typename = US"Number"; break;
  case ucp_P: typename = US"Punctuation"; break;
  case ucp_S: typename = US"Symbol"; break;
  case ucp_Z: typename = US"Separator"; break;
  }

switch (fulltype)
  {
  case ucp_Cc: fulltypename = US"Control"; break;
  case ucp_Cf: fulltypename = US"Format"; break;
  case ucp_Cn: fulltypename = US"Unassigned"; break;
  case ucp_Co: fulltypename = US"Private use"; break;
  case ucp_Cs: fulltypename = US"Surrogate"; break;
  case ucp_Ll: fulltypename = US"Lower case letter"; break;
  case ucp_Lm: fulltypename = US"Modifier letter"; break;
  case ucp_Lo: fulltypename = US"Other letter"; break;
  case ucp_Lt: fulltypename = US"Title case letter"; break;
  case ucp_Lu: fulltypename = US"Upper case letter"; break;
  case ucp_Mc: fulltypename = US"Spacing mark"; break;
  case ucp_Me: fulltypename = US"Enclosing mark"; break;
  case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
  case ucp_Nd: fulltypename = US"Decimal number"; break;
  case ucp_Nl: fulltypename = US"Letter number"; break;
  case ucp_No: fulltypename = US"Other number"; break;
  case ucp_Pc: fulltypename = US"Connector punctuation"; break;
  case ucp_Pd: fulltypename = US"Dash punctuation"; break;
  case ucp_Pe: fulltypename = US"Close punctuation"; break;
  case ucp_Pf: fulltypename = US"Final punctuation"; break;
  case ucp_Pi: fulltypename = US"Initial punctuation"; break;
  case ucp_Po: fulltypename = US"Other punctuation"; break;
  case ucp_Ps: fulltypename = US"Open punctuation"; break;
  case ucp_Sc: fulltypename = US"Currency symbol"; break;
  case ucp_Sk: fulltypename = US"Modifier symbol"; break;
  case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
  case ucp_So: fulltypename = US"Other symbol"; break;
  case ucp_Zl: fulltypename = US"Line separator"; break;
  case ucp_Zp: fulltypename = US"Paragraph separator"; break;
  case ucp_Zs: fulltypename = US"Space separator"; break;
  }

switch(gbprop)
  {
  case ucp_gbCR: graphbreak = US"CR"; break;
  case ucp_gbLF: graphbreak = US"LF"; break;
  case ucp_gbControl: graphbreak = US"Control"; break;
  case ucp_gbExtend: graphbreak = US"Extend"; break;
  case ucp_gbPrepend: graphbreak = US"Prepend"; break;
  case ucp_gbSpacingMark: graphbreak = US"SpacingMark"; break;
  case ucp_gbL: graphbreak = US"Hangul syllable type L"; break;
  case ucp_gbV: graphbreak = US"Hangul syllable type V"; break;
  case ucp_gbT: graphbreak = US"Hangul syllable type T"; break;
  case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break;
  case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break;
  case ucp_gbOther: graphbreak = US"Other"; break;
  }

/* Script names are written with underscores, as in the identifiers. This
switch has to be extended whenever new Unicode script names are added. */

switch(script)
  {
  case ucp_Arabic: scriptname = US"Arabic"; break;
  case ucp_Armenian: scriptname = US"Armenian"; break;
  case ucp_Balinese: scriptname = US"Balinese"; break;
  case ucp_Bengali: scriptname = US"Bengali"; break;
  case ucp_Bopomofo: scriptname = US"Bopomofo"; break;
  case ucp_Braille: scriptname = US"Braille"; break;
  case ucp_Buginese: scriptname = US"Buginese"; break;
  case ucp_Buhid: scriptname = US"Buhid"; break;
  case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
  case ucp_Cherokee: scriptname = US"Cherokee"; break;
  case ucp_Common: scriptname = US"Common"; break;
  case ucp_Coptic: scriptname = US"Coptic"; break;
  case ucp_Cuneiform: scriptname = US"Cuneiform"; break;
  case ucp_Cypriot: scriptname = US"Cypriot"; break;
  case ucp_Cyrillic: scriptname = US"Cyrillic"; break;
  case ucp_Deseret: scriptname = US"Deseret"; break;
  case ucp_Devanagari: scriptname = US"Devanagari"; break;
  case ucp_Ethiopic: scriptname = US"Ethiopic"; break;
  case ucp_Georgian: scriptname = US"Georgian"; break;
  case ucp_Glagolitic: scriptname = US"Glagolitic"; break;
  case ucp_Gothic: scriptname = US"Gothic"; break;
  case ucp_Greek: scriptname = US"Greek"; break;
  case ucp_Gujarati: scriptname = US"Gujarati"; break;
  case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break;
  case ucp_Han: scriptname = US"Han"; break;
  case ucp_Hangul: scriptname = US"Hangul"; break;
  case ucp_Hanunoo: scriptname = US"Hanunoo"; break;
  case ucp_Hebrew: scriptname = US"Hebrew"; break;
  case ucp_Hiragana: scriptname = US"Hiragana"; break;
  case ucp_Inherited: scriptname = US"Inherited"; break;
  case ucp_Kannada: scriptname = US"Kannada"; break;
  case ucp_Katakana: scriptname = US"Katakana"; break;
  case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break;
  case ucp_Khmer: scriptname = US"Khmer"; break;
  case ucp_Lao: scriptname = US"Lao"; break;
  case ucp_Latin: scriptname = US"Latin"; break;
  case ucp_Limbu: scriptname = US"Limbu"; break;
  case ucp_Linear_B: scriptname = US"Linear_B"; break;
  case ucp_Malayalam: scriptname = US"Malayalam"; break;
  case ucp_Mongolian: scriptname = US"Mongolian"; break;
  case ucp_Myanmar: scriptname = US"Myanmar"; break;
  case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
  case ucp_Nko: scriptname = US"Nko"; break;
  case ucp_Ogham: scriptname = US"Ogham"; break;
  case ucp_Old_Italic: scriptname = US"Old_Italic"; break;
  case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
  case ucp_Oriya: scriptname = US"Oriya"; break;
  case ucp_Osmanya: scriptname = US"Osmanya"; break;
  case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break;
  case ucp_Phoenician: scriptname = US"Phoenician"; break;
  case ucp_Runic: scriptname = US"Runic"; break;
  case ucp_Shavian: scriptname = US"Shavian"; break;
  case ucp_Sinhala: scriptname = US"Sinhala"; break;
  case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
  case ucp_Syriac: scriptname = US"Syriac"; break;
  case ucp_Tagalog: scriptname = US"Tagalog"; break;
  case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break;
  case ucp_Tai_Le: scriptname = US"Tai_Le"; break;
  case ucp_Tamil: scriptname = US"Tamil"; break;
  case ucp_Telugu: scriptname = US"Telugu"; break;
  case ucp_Thaana: scriptname = US"Thaana"; break;
  case ucp_Thai: scriptname = US"Thai"; break;
  case ucp_Tibetan: scriptname = US"Tibetan"; break;
  case ucp_Tifinagh: scriptname = US"Tifinagh"; break;
  case ucp_Ugaritic: scriptname = US"Ugaritic"; break;
  case ucp_Yi: scriptname = US"Yi"; break;
  /* New for Unicode 5.1: */
  case ucp_Carian: scriptname = US"Carian"; break;
  case ucp_Cham: scriptname = US"Cham"; break;
  case ucp_Kayah_Li: scriptname = US"Kayah_Li"; break;
  case ucp_Lepcha: scriptname = US"Lepcha"; break;
  case ucp_Lycian: scriptname = US"Lycian"; break;
  case ucp_Lydian: scriptname = US"Lydian"; break;
  case ucp_Ol_Chiki: scriptname = US"Ol_Chiki"; break;
  case ucp_Rejang: scriptname = US"Rejang"; break;
  case ucp_Saurashtra: scriptname = US"Saurashtra"; break;
  case ucp_Sundanese: scriptname = US"Sundanese"; break;
  case ucp_Vai: scriptname = US"Vai"; break;
  /* New for Unicode 5.2: */
  case ucp_Avestan: scriptname = US"Avestan"; break;
  case ucp_Bamum: scriptname = US"Bamum"; break;
  case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
  case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
  case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
  case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
  case ucp_Javanese: scriptname = US"Javanese"; break;
  case ucp_Kaithi: scriptname = US"Kaithi"; break;
  case ucp_Lisu: scriptname = US"Lisu"; break;
  case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
  case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
  case ucp_Old_Turkic: scriptname = US"Old_Turkic"; break;
  case ucp_Samaritan: scriptname = US"Samaritan"; break;
  case ucp_Tai_Tham: scriptname = US"Tai_Tham"; break;
  case ucp_Tai_Viet: scriptname = US"Tai_Viet"; break;
  /* New for Unicode 6.0.0 */
  case ucp_Batak: scriptname = US"Batak"; break;
  case ucp_Brahmi: scriptname = US"Brahmi"; break;
  case ucp_Mandaic: scriptname = US"Mandaic"; break;
  /* New for Unicode 6.1.0 */
  case ucp_Chakma: scriptname = US"Chakma"; break;
  case ucp_Meroitic_Cursive: scriptname = US"Meroitic_Cursive"; break;
  case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
  case ucp_Miao: scriptname = US"Miao"; break;
  case ucp_Sharada: scriptname = US"Sharada"; break;
  case ucp_Sora_Sompeng: scriptname = US"Sora_Sompeng"; break;
  case ucp_Takri: scriptname = US"Takri"; break;
  }

printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);

if (othercase != c)
  {
  printf(", %04x", othercase);

  /* A non-zero caseset is an offset into the table of caseless sets. Starting
  one entry before it and pre-incrementing makes the scan begin at the first
  member; the scan stops at the NOTACHAR that closes the set. The character
  itself and its other case have already been shown, so they are skipped. */

  if (caseset != 0)
    {
    const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
    while (*(++p) < NOTACHAR)
      if (*p != othercase && *p != c) printf(", %04x", *p);
    }
  }

printf("\n");
}
/*************************************************
* Main program *
*************************************************/
int
main(void)
{
unsigned char buffer[1024];
while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
{
unsigned char name[24];
unsigned char *s, *t;
printf("%s", buffer);
s = buffer;
while (isspace(*s)) s++;
if (*s == 0) continue;
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
*t = 0;
while (isspace(*s)) s++;
if (strcmp(CS name, "findprop") == 0)
{
while (*s != 0)
{
unsigned char *endptr;
int c = strtoul(CS s, CSS(&endptr), 16);
print_prop(c);
s = endptr;
while (isspace(*s)) s++;
}
}
else printf("Unknown test command %s\n", name);
}
return 0;
}
/* End */

View File

@ -0,0 +1,34 @@
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
findprop 0100 0101 0102 0103 0104 0105 0106
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
findprop 10000 10001 e01ef f0000 100000
findprop 1b00 12000 7c0 a840 10900
findprop 1d79 a77d
findprop 0800 083e a4d0 a4f7 aa80 aadf
findprop 10b00 10b35 13000 1342e 10840 10855
findprop 11100 1113c 11680 116c0
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89

View File

@ -0,0 +1,359 @@
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
0000 Control: Control, Common, Control
0001 Control: Control, Common, Control
0002 Control: Control, Common, Control
0003 Control: Control, Common, Control
0004 Control: Control, Common, Control
0005 Control: Control, Common, Control
0006 Control: Control, Common, Control
0007 Control: Control, Common, Control
0008 Control: Control, Common, Control
0009 Control: Control, Common, Control
000a Control: Control, Common, LF
000b Control: Control, Common, Control
000c Control: Control, Common, Control
000d Control: Control, Common, CR
000e Control: Control, Common, Control
000f Control: Control, Common, Control
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
0010 Control: Control, Common, Control
0011 Control: Control, Common, Control
0012 Control: Control, Common, Control
0013 Control: Control, Common, Control
0014 Control: Control, Common, Control
0015 Control: Control, Common, Control
0016 Control: Control, Common, Control
0017 Control: Control, Common, Control
0018 Control: Control, Common, Control
0019 Control: Control, Common, Control
001a Control: Control, Common, Control
001b Control: Control, Common, Control
001c Control: Control, Common, Control
001d Control: Control, Common, Control
001e Control: Control, Common, Control
001f Control: Control, Common, Control
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
0020 Separator: Space separator, Common, Other
0021 Punctuation: Other punctuation, Common, Other
0022 Punctuation: Other punctuation, Common, Other
0023 Punctuation: Other punctuation, Common, Other
0024 Symbol: Currency symbol, Common, Other
0025 Punctuation: Other punctuation, Common, Other
0026 Punctuation: Other punctuation, Common, Other
0027 Punctuation: Other punctuation, Common, Other
0028 Punctuation: Open punctuation, Common, Other
0029 Punctuation: Close punctuation, Common, Other
002a Punctuation: Other punctuation, Common, Other
002b Symbol: Mathematical symbol, Common, Other
002c Punctuation: Other punctuation, Common, Other
002d Punctuation: Dash punctuation, Common, Other
002e Punctuation: Other punctuation, Common, Other
002f Punctuation: Other punctuation, Common, Other
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
0030 Number: Decimal number, Common, Other
0031 Number: Decimal number, Common, Other
0032 Number: Decimal number, Common, Other
0033 Number: Decimal number, Common, Other
0034 Number: Decimal number, Common, Other
0035 Number: Decimal number, Common, Other
0036 Number: Decimal number, Common, Other
0037 Number: Decimal number, Common, Other
0038 Number: Decimal number, Common, Other
0039 Number: Decimal number, Common, Other
003a Punctuation: Other punctuation, Common, Other
003b Punctuation: Other punctuation, Common, Other
003c Symbol: Mathematical symbol, Common, Other
003d Symbol: Mathematical symbol, Common, Other
003e Symbol: Mathematical symbol, Common, Other
003f Punctuation: Other punctuation, Common, Other
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
0040 Punctuation: Other punctuation, Common, Other
0041 Letter: Upper case letter, Latin, Other, 0061
0042 Letter: Upper case letter, Latin, Other, 0062
0043 Letter: Upper case letter, Latin, Other, 0063
0044 Letter: Upper case letter, Latin, Other, 0064
0045 Letter: Upper case letter, Latin, Other, 0065
0046 Letter: Upper case letter, Latin, Other, 0066
0047 Letter: Upper case letter, Latin, Other, 0067
0048 Letter: Upper case letter, Latin, Other, 0068
0049 Letter: Upper case letter, Latin, Other, 0069
004a Letter: Upper case letter, Latin, Other, 006a
004b Letter: Upper case letter, Latin, Other, 006b, 212a
004c Letter: Upper case letter, Latin, Other, 006c
004d Letter: Upper case letter, Latin, Other, 006d
004e Letter: Upper case letter, Latin, Other, 006e
004f Letter: Upper case letter, Latin, Other, 006f
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
0050 Letter: Upper case letter, Latin, Other, 0070
0051 Letter: Upper case letter, Latin, Other, 0071
0052 Letter: Upper case letter, Latin, Other, 0072
0053 Letter: Upper case letter, Latin, Other, 0073, 017f
0054 Letter: Upper case letter, Latin, Other, 0074
0055 Letter: Upper case letter, Latin, Other, 0075
0056 Letter: Upper case letter, Latin, Other, 0076
0057 Letter: Upper case letter, Latin, Other, 0077
0058 Letter: Upper case letter, Latin, Other, 0078
0059 Letter: Upper case letter, Latin, Other, 0079
005a Letter: Upper case letter, Latin, Other, 007a
005b Punctuation: Open punctuation, Common, Other
005c Punctuation: Other punctuation, Common, Other
005d Punctuation: Close punctuation, Common, Other
005e Symbol: Modifier symbol, Common, Other
005f Punctuation: Connector punctuation, Common, Other
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
0060 Symbol: Modifier symbol, Common, Other
0061 Letter: Lower case letter, Latin, Other, 0041
0062 Letter: Lower case letter, Latin, Other, 0042
0063 Letter: Lower case letter, Latin, Other, 0043
0064 Letter: Lower case letter, Latin, Other, 0044
0065 Letter: Lower case letter, Latin, Other, 0045
0066 Letter: Lower case letter, Latin, Other, 0046
0067 Letter: Lower case letter, Latin, Other, 0047
0068 Letter: Lower case letter, Latin, Other, 0048
0069 Letter: Lower case letter, Latin, Other, 0049
006a Letter: Lower case letter, Latin, Other, 004a
006b Letter: Lower case letter, Latin, Other, 004b, 212a
006c Letter: Lower case letter, Latin, Other, 004c
006d Letter: Lower case letter, Latin, Other, 004d
006e Letter: Lower case letter, Latin, Other, 004e
006f Letter: Lower case letter, Latin, Other, 004f
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
0070 Letter: Lower case letter, Latin, Other, 0050
0071 Letter: Lower case letter, Latin, Other, 0051
0072 Letter: Lower case letter, Latin, Other, 0052
0073 Letter: Lower case letter, Latin, Other, 0053, 017f
0074 Letter: Lower case letter, Latin, Other, 0054
0075 Letter: Lower case letter, Latin, Other, 0055
0076 Letter: Lower case letter, Latin, Other, 0056
0077 Letter: Lower case letter, Latin, Other, 0057
0078 Letter: Lower case letter, Latin, Other, 0058
0079 Letter: Lower case letter, Latin, Other, 0059
007a Letter: Lower case letter, Latin, Other, 005a
007b Punctuation: Open punctuation, Common, Other
007c Symbol: Mathematical symbol, Common, Other
007d Punctuation: Close punctuation, Common, Other
007e Symbol: Mathematical symbol, Common, Other
007f Control: Control, Common, Control
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
0080 Control: Control, Common, Control
0081 Control: Control, Common, Control
0082 Control: Control, Common, Control
0083 Control: Control, Common, Control
0084 Control: Control, Common, Control
0085 Control: Control, Common, Control
0086 Control: Control, Common, Control
0087 Control: Control, Common, Control
0088 Control: Control, Common, Control
0089 Control: Control, Common, Control
008a Control: Control, Common, Control
008b Control: Control, Common, Control
008c Control: Control, Common, Control
008d Control: Control, Common, Control
008e Control: Control, Common, Control
008f Control: Control, Common, Control
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
0090 Control: Control, Common, Control
0091 Control: Control, Common, Control
0092 Control: Control, Common, Control
0093 Control: Control, Common, Control
0094 Control: Control, Common, Control
0095 Control: Control, Common, Control
0096 Control: Control, Common, Control
0097 Control: Control, Common, Control
0098 Control: Control, Common, Control
0099 Control: Control, Common, Control
009a Control: Control, Common, Control
009b Control: Control, Common, Control
009c Control: Control, Common, Control
009d Control: Control, Common, Control
009e Control: Control, Common, Control
009f Control: Control, Common, Control
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
00a0 Separator: Space separator, Common, Other
00a1 Punctuation: Other punctuation, Common, Other
00a2 Symbol: Currency symbol, Common, Other
00a3 Symbol: Currency symbol, Common, Other
00a4 Symbol: Currency symbol, Common, Other
00a5 Symbol: Currency symbol, Common, Other
00a6 Symbol: Other symbol, Common, Other
00a7 Punctuation: Other punctuation, Common, Other
00a8 Symbol: Modifier symbol, Common, Other
00a9 Symbol: Other symbol, Common, Other
00aa Letter: Other letter, Latin, Other
00ab Punctuation: Initial punctuation, Common, Other
00ac Symbol: Mathematical symbol, Common, Other
00ad Control: Format, Common, Control
00ae Symbol: Other symbol, Common, Other
00af Symbol: Modifier symbol, Common, Other
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
00b0 Symbol: Other symbol, Common, Other
00b1 Symbol: Mathematical symbol, Common, Other
00b2 Number: Other number, Common, Other
00b3 Number: Other number, Common, Other
00b4 Symbol: Modifier symbol, Common, Other
00b5 Letter: Lower case letter, Common, Other, 03bc, 039c
00b6 Punctuation: Other punctuation, Common, Other
00b7 Punctuation: Other punctuation, Common, Other
00b8 Symbol: Modifier symbol, Common, Other
00b9 Number: Other number, Common, Other
00ba Letter: Other letter, Latin, Other
00bb Punctuation: Final punctuation, Common, Other
00bc Number: Other number, Common, Other
00bd Number: Other number, Common, Other
00be Number: Other number, Common, Other
00bf Punctuation: Other punctuation, Common, Other
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
00c0 Letter: Upper case letter, Latin, Other, 00e0
00c1 Letter: Upper case letter, Latin, Other, 00e1
00c2 Letter: Upper case letter, Latin, Other, 00e2
00c3 Letter: Upper case letter, Latin, Other, 00e3
00c4 Letter: Upper case letter, Latin, Other, 00e4
00c5 Letter: Upper case letter, Latin, Other, 00e5, 212b
00c6 Letter: Upper case letter, Latin, Other, 00e6
00c7 Letter: Upper case letter, Latin, Other, 00e7
00c8 Letter: Upper case letter, Latin, Other, 00e8
00c9 Letter: Upper case letter, Latin, Other, 00e9
00ca Letter: Upper case letter, Latin, Other, 00ea
00cb Letter: Upper case letter, Latin, Other, 00eb
00cc Letter: Upper case letter, Latin, Other, 00ec
00cd Letter: Upper case letter, Latin, Other, 00ed
00ce Letter: Upper case letter, Latin, Other, 00ee
00cf Letter: Upper case letter, Latin, Other, 00ef
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
00d0 Letter: Upper case letter, Latin, Other, 00f0
00d1 Letter: Upper case letter, Latin, Other, 00f1
00d2 Letter: Upper case letter, Latin, Other, 00f2
00d3 Letter: Upper case letter, Latin, Other, 00f3
00d4 Letter: Upper case letter, Latin, Other, 00f4
00d5 Letter: Upper case letter, Latin, Other, 00f5
00d6 Letter: Upper case letter, Latin, Other, 00f6
00d7 Symbol: Mathematical symbol, Common, Other
00d8 Letter: Upper case letter, Latin, Other, 00f8
00d9 Letter: Upper case letter, Latin, Other, 00f9
00da Letter: Upper case letter, Latin, Other, 00fa
00db Letter: Upper case letter, Latin, Other, 00fb
00dc Letter: Upper case letter, Latin, Other, 00fc
00dd Letter: Upper case letter, Latin, Other, 00fd
00de Letter: Upper case letter, Latin, Other, 00fe
00df Letter: Lower case letter, Latin, Other, 1e9e
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
00e0 Letter: Lower case letter, Latin, Other, 00c0
00e1 Letter: Lower case letter, Latin, Other, 00c1
00e2 Letter: Lower case letter, Latin, Other, 00c2
00e3 Letter: Lower case letter, Latin, Other, 00c3
00e4 Letter: Lower case letter, Latin, Other, 00c4
00e5 Letter: Lower case letter, Latin, Other, 00c5, 212b
00e6 Letter: Lower case letter, Latin, Other, 00c6
00e7 Letter: Lower case letter, Latin, Other, 00c7
00e8 Letter: Lower case letter, Latin, Other, 00c8
00e9 Letter: Lower case letter, Latin, Other, 00c9
00ea Letter: Lower case letter, Latin, Other, 00ca
00eb Letter: Lower case letter, Latin, Other, 00cb
00ec Letter: Lower case letter, Latin, Other, 00cc
00ed Letter: Lower case letter, Latin, Other, 00cd
00ee Letter: Lower case letter, Latin, Other, 00ce
00ef Letter: Lower case letter, Latin, Other, 00cf
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
00f0 Letter: Lower case letter, Latin, Other, 00d0
00f1 Letter: Lower case letter, Latin, Other, 00d1
00f2 Letter: Lower case letter, Latin, Other, 00d2
00f3 Letter: Lower case letter, Latin, Other, 00d3
00f4 Letter: Lower case letter, Latin, Other, 00d4
00f5 Letter: Lower case letter, Latin, Other, 00d5
00f6 Letter: Lower case letter, Latin, Other, 00d6
00f7 Symbol: Mathematical symbol, Common, Other
00f8 Letter: Lower case letter, Latin, Other, 00d8
00f9 Letter: Lower case letter, Latin, Other, 00d9
00fa Letter: Lower case letter, Latin, Other, 00da
00fb Letter: Lower case letter, Latin, Other, 00db
00fc Letter: Lower case letter, Latin, Other, 00dc
00fd Letter: Lower case letter, Latin, Other, 00dd
00fe Letter: Lower case letter, Latin, Other, 00de
00ff Letter: Lower case letter, Latin, Other, 0178
findprop 0100 0101 0102 0103 0104 0105 0106
0100 Letter: Upper case letter, Latin, Other, 0101
0101 Letter: Lower case letter, Latin, Other, 0100
0102 Letter: Upper case letter, Latin, Other, 0103
0103 Letter: Lower case letter, Latin, Other, 0102
0104 Letter: Upper case letter, Latin, Other, 0105
0105 Letter: Lower case letter, Latin, Other, 0104
0106 Letter: Upper case letter, Latin, Other, 0107
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
ffe0 Symbol: Currency symbol, Common, Other
ffe1 Symbol: Currency symbol, Common, Other
ffe2 Symbol: Mathematical symbol, Common, Other
ffe3 Symbol: Modifier symbol, Common, Other
ffe4 Symbol: Other symbol, Common, Other
ffe5 Symbol: Currency symbol, Common, Other
ffe6 Symbol: Currency symbol, Common, Other
ffe7 Control: Unassigned, Common, Other
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
ffe8 Symbol: Other symbol, Common, Other
ffe9 Symbol: Mathematical symbol, Common, Other
ffea Symbol: Mathematical symbol, Common, Other
ffeb Symbol: Mathematical symbol, Common, Other
ffec Symbol: Mathematical symbol, Common, Other
ffed Symbol: Other symbol, Common, Other
ffee Symbol: Other symbol, Common, Other
ffef Control: Unassigned, Common, Other
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
fff8 Control: Unassigned, Common, Control
fff9 Control: Format, Common, Control
fffa Control: Format, Common, Control
fffb Control: Format, Common, Control
fffc Symbol: Other symbol, Common, Other
fffd Symbol: Other symbol, Common, Other
fffe Control: Unassigned, Common, Other
ffff Control: Unassigned, Common, Other
findprop 10000 10001 e01ef f0000 100000
10000 Letter: Other letter, Linear_B, Other
10001 Letter: Other letter, Linear_B, Other
e01ef Mark: Non-spacing mark, Inherited, Extend
f0000 Control: Private use, Common, Other
100000 Control: Private use, Common, Other
findprop 1b00 12000 7c0 a840 10900
1b00 Mark: Non-spacing mark, Balinese, Extend
12000 Letter: Other letter, Cuneiform, Other
07c0 Number: Decimal number, Nko, Other
a840 Letter: Other letter, Phags_Pa, Other
10900 Letter: Other letter, Phoenician, Other
findprop 1d79 a77d
1d79 Letter: Lower case letter, Latin, Other, a77d
a77d Letter: Upper case letter, Latin, Other, 1d79
findprop 0800 083e a4d0 a4f7 aa80 aadf
0800 Letter: Other letter, Samaritan, Other
083e Punctuation: Other punctuation, Samaritan, Other
a4d0 Letter: Other letter, Lisu, Other
a4f7 Letter: Other letter, Lisu, Other
aa80 Letter: Other letter, Tai_Viet, Other
aadf Punctuation: Other punctuation, Tai_Viet, Other
findprop 10b00 10b35 13000 1342e 10840 10855
10b00 Letter: Other letter, Avestan, Other
10b35 Letter: Other letter, Avestan, Other
13000 Letter: Other letter, Egyptian_Hieroglyphs, Other
1342e Letter: Other letter, Egyptian_Hieroglyphs, Other
10840 Letter: Other letter, Imperial_Aramaic, Other
10855 Letter: Other letter, Imperial_Aramaic, Other
findprop 11100 1113c 11680 116c0
11100 Mark: Non-spacing mark, Chakma, Extend
1113c Number: Decimal number, Chakma, Other
11680 Letter: Other letter, Takri, Other
116c0 Number: Decimal number, Takri, Other
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
000d Control: Control, Common, CR
000a Control: Control, Common, LF
000e Control: Control, Common, Control
0711 Mark: Non-spacing mark, Syriac, Extend
1b04 Mark: Spacing mark, Balinese, SpacingMark
1111 Letter: Other letter, Hangul, Hangul syllable type L
1169 Letter: Other letter, Hangul, Hangul syllable type V
11fe Letter: Other letter, Hangul, Hangul syllable type T
ae4c Letter: Other letter, Hangul, Hangul syllable type LV
ad89 Letter: Other letter, Hangul, Hangul syllable type LVT

253
maint/utf8.c Normal file
View File

@ -0,0 +1,253 @@
/* A test program for converting characters to UTF-8 and vice versa. Note that
this program conforms to the original definition of UTF-8, which allows
codepoints up to 7fffffff. The more recent definition limits the validity of
UTF-8 codepoints to a maximum of 10ffff.
The arguments are either single codepoint values, written as 0xhhhh, for
conversion to UTF-8, or sequences of hex values, written without 0x and
optionally including spaces (but such arguments must be quoted), for conversion
from UTF-8 to codepoints. For example:
./utf8 0x1234
0x00001234 => e1 88 b4
./utf8 "e1 88 b4"
0x00001234 <= e1 88 b4
In the second case, a number of characters can be present in one argument:
./utf8 "65 e188b4 77"
0x00000065 <= 65
0x00001234 <= e1 88 b4
0x00000077 <= 77
If the option -s is given, the sequence of UTF-8 bytes is written out between
angle brackets at the end of the line. On a UTF-8 terminal, this will show the
appropriate graphic for the codepoint. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/* The valid ranges for UTF-8 characters are:
0000 0000 to 0000 007f 1 byte (ascii)
0000 0080 to 0000 07ff 2 bytes
0000 0800 to 0000 ffff 3 bytes
0001 0000 to 001f ffff 4 bytes
0020 0000 to 03ff ffff 5 bytes
0400 0000 to 7fff ffff 6 bytes
*/
/* Largest character value that fits in an encoding of 1, 2, ... 6 bytes.
ord2utf8() scans this to choose the encoded length, and utf82ord() scans it to
reject encodings that are longer than necessary. */
static const int utf8_table1[] = {
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
/* Marker bits that ord2utf8() ORs into the first byte, indexed by the number
of additional bytes (0 to 5). */
static const int utf8_table2[] = {
0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
/* Mask that utf82ord() applies to the first byte to extract its data bits,
indexed by the number of additional bytes (0 to 5). */
static const int utf8_table3[] = {
0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
/* NOTE(review): this 64-entry table is not referenced by any function visible
in this file. It looks like a length lookup keyed on the low six bits of a
first byte - confirm its meaning before relying on it. */
static const unsigned char utf8_table4[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,6 };
/*************************************************
* Convert character value to UTF-8 *
*************************************************/
/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 1 to 6 bytes.
Arguments:
cvalue the character value
buffer pointer to buffer for result - at least 6 bytes long
Returns: number of bytes placed in the buffer
-1 if input character is negative
0 if input character is positive but too big (only when
int is longer than 32 bits)
*/
int
ord2utf8(int cvalue, unsigned char *buffer)
{
const int ntable = (int)(sizeof(utf8_table1)/sizeof(utf8_table1[0]));
int extra = 0;            /* number of bytes that follow the first one */
unsigned char *p;

/* Negative values cannot be encoded. */

if (cvalue < 0) return -1;

/* Find the shortest encoding whose maximum value covers cvalue. Running off
the end of the table means that the value is too big to encode. */

while (extra < ntable && cvalue > utf8_table1[extra]) extra++;
if (extra >= ntable) return 0;

/* Fill in the trailing bytes from last to first, six bits at a time. What is
left of cvalue then goes into the first byte along with its marker bits. */

for (p = buffer + extra; p > buffer; p--)
  {
  *p = 0x80 | (cvalue & 0x3f);
  cvalue >>= 6;
  }

*buffer = utf8_table2[extra] | cvalue;
return extra + 1;
}
/*************************************************
* Convert UTF-8 string to value *
*************************************************/
/* This function takes one or more bytes that represent a UTF-8 character,
and returns the value of the character.
Argument:
buffer a pointer to the byte vector
vptr a pointer to an int to receive the value
Returns: > 0 => the number of bytes consumed
-6 to 0 => malformed UTF-8 character at offset = (-return)
*/
int
utf82ord(unsigned char *buffer, int *vptr)
{
const int ntable = (int)(sizeof(utf8_table1)/sizeof(utf8_table1[0]));
int first = *buffer++;
int extra = -1;           /* becomes the number of additional bytes */
int bits, value, shift, k;

/* Count the leading one-bits of the first byte, giving up after seven. */

for (bits = first; extra < 6 && (bits & 0x80) != 0; bits <<= 1) extra++;

/* No leading one-bit at all: a single-byte (ascii) character. */

if (extra == -1)
  {
  *vptr = first;
  return 1;
  }

/* One leading one-bit is a stray continuation byte; seven or more cannot
start any character. */

if (extra == 0 || extra == 6) return 0;

/* extra is now in the range 1-5. Start with the data bits of the first byte
and merge in six bits from each continuation byte. */

shift = 6 * extra;
value = (first & utf8_table3[extra]) << shift;

for (k = 1; k <= extra; k++)
  {
  int next = *buffer++;
  if ((next & 0xc0) != 0x80) return -k;
  shift -= 6;
  value |= (next & 0x3f) << shift;
  }

/* The encoding must be the shortest one that can hold the value. */

for (k = 0; k < ntable; k++)
  if (value <= utf8_table1[k]) break;
if (k != extra) return -(extra + 1);

/* Valid value */

*vptr = value;
return extra + 1;
}
/*************************************************
* Main Program *
*************************************************/
/* Process each command-line argument in turn. An optional leading "-s" makes
the program also print the raw UTF-8 bytes between '>' and '<'. An argument
starting with "0x" is a codepoint to be encoded by ord2utf8(); any other
argument is a string of hex digit pairs (spaces allowed) giving UTF-8 bytes to
be decoded by utf82ord().

Returns:   0 always; problems are reported on stdout
*/

int
main(int argc, char **argv)
{
int i = 1;
int show = 0;
unsigned char buffer[64];

if (argc > 1 && strcmp(argv[1], "-s") == 0)
  {
  show = 1;
  i = 2;
  }

for (; i < argc; i++)
  {
  unsigned char *x = (unsigned char *)argv[i];

  /* Codepoint => UTF-8 */

  if (strncmp(argv[i], "0x", 2) == 0)
    {
    int j;
    int d = strtol(argv[i] + 2, NULL, 16);
    int rc = ord2utf8(d, buffer);
    printf("0x%08x => ", d);
    if (rc <= 0) printf("*** Error %d ***", rc); else
      {
      for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
      if (show)
        {
        printf(">");
        for (j = 0; j < rc; j++) printf("%c", buffer[j]);
        printf("<");
        }
      }
    printf("\n");
    }

  /* UTF-8 => codepoint(s) */

  else
    {
    int d, rc;
    int j = 0;
    int y = 0;              /* value of the byte being assembled */
    int z = 0;              /* 1 when the first digit of a pair has been seen */
    int bad = 0;            /* set if the argument cannot be used */
    unsigned char *bptr;

    /* Convert pairs of hex digits into bytes. Spaces are allowed anywhere,
    but the argument must not end half way through a pair. */

    for (;;)
      {
      while (*x == ' ') x++;
      if (*x == 0 && !z) break;
      if (!isxdigit(*x))
        {
        printf("Malformed hex string: %s\n", argv[i]);
        bad = 1;
        break;
        }
      y = y * 16 + tolower(*x) - ((isdigit(*x))? '0' : 'W');  /* 'W' is 'a'-10 */
      x++;
      if (z)
        {
        /* Keep one byte free for the terminating zero. */
        if (j >= (int)sizeof(buffer) - 1)
          {
          printf("Hex string too long: %s\n", argv[i]);
          bad = 1;
          break;
          }
        buffer[j++] = y;
        y = 0;
        }
      z ^= 1;
      }

    /* Do not try to decode after an error: the buffer holds incomplete or
    stale data. */

    if (bad) continue;

    /* The byte string is zero-terminated, so a 00 byte in the input ends the
    decoding early. */

    buffer[j] = 0;
    bptr = buffer;

    while (*bptr != 0)
      {
      rc = utf82ord(bptr, &d);
      if (rc > 0)
        {
        printf("0x%08x <= ", d);
        for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
        if (show)
          {
          printf(">");
          for (j = 0; j < rc; j++) printf("%c", bptr[j]);
          printf("<");
          }
        printf("\n");
        bptr += rc;
        }
      else
        {
        printf("Malformed UTF-8 at offset %d <= ", -rc);
        while (*bptr != 0) printf("%02x ", *bptr++);
        printf("\n");
        break;
        }
      }
    }
  }

return 0;
}
/* End */

View File

@ -451,7 +451,7 @@ PCRE2_EXP_DECL int pcre2_substring_length_byname(pcre2_match_data *, \
PCRE2_EXP_DECL int pcre2_substring_length_bynumber(pcre2_match_data *, \
int); \
PCRE2_EXP_DECL int pcre2_substring_nametable_scan(const pcre2_code *, \
PCRE2_SPTR, PCRE2_UCHAR **, PCRE2_UCHAR **); \
PCRE2_SPTR, PCRE2_SPTR *, PCRE2_SPTR *); \
PCRE2_EXP_DECL int pcre2_substring_number_from_name(\
const pcre2_code *, PCRE2_SPTR); \
PCRE2_EXP_DECL void pcre2_substring_list_free(PCRE2_SPTR *); \

View File

@ -451,7 +451,7 @@ PCRE2_EXP_DECL int pcre2_substring_length_byname(pcre2_match_data *, \
PCRE2_EXP_DECL int pcre2_substring_length_bynumber(pcre2_match_data *, \
int); \
PCRE2_EXP_DECL int pcre2_substring_nametable_scan(const pcre2_code *, \
PCRE2_SPTR, PCRE2_UCHAR **, PCRE2_UCHAR **); \
PCRE2_SPTR, PCRE2_SPTR *, PCRE2_SPTR *); \
PCRE2_EXP_DECL int pcre2_substring_number_from_name(\
const pcre2_code *, PCRE2_SPTR); \
PCRE2_EXP_DECL void pcre2_substring_list_free(PCRE2_SPTR *); \

View File

@ -102,23 +102,57 @@ if (ccontext == NULL)
if (pattern[0] == 'Y')
{
c = ccontext->memctl.malloc(sizeof(pcre2_real_code), NULL);
PCRE2_UCHAR *n;
int lennumber = (PCRE2_CODE_UNIT_WIDTH == 8)? 2 : 1;
size_t size = sizeof(pcre2_real_code) +
(12 + 3*lennumber)*(PCRE2_CODE_UNIT_WIDTH/8) + CU2BYTES(20);
c = ccontext->memctl.malloc(size, NULL);
c->memctl = ccontext->memctl;
c->magic_number = MAGIC_NUMBER;
c->size = sizeof(pcre2_real_code);
c->name_table_offset = sizeof(pcre2_real_code);
c->size = size;
c->compile_options = options;
c->flags = PCRE2_CODE_UNIT_WIDTH/8;
c->limit_match = 0;
c->limit_recursion = 0;
c->max_lookbehind = 0;
c->minlength = 3;
c->top_bracket = 1;
c->top_bracket = 5;
c->top_backref = 1;
c->bsr_convention = ccontext->bsr_convention;
c->newline_convention = ccontext->newline_convention;
c->name_count = 0;
c->name_entry_size = 0;
c->name_count = 3;
c->name_entry_size = 4 + lennumber;
n = (PCRE2_UCHAR *)((char *)c + sizeof(pcre2_real_code));
if (lennumber == 2) *n++ = 0 ;
*n++ = 1;
*n++ = 'x'; *n++ = 'x'; *n++ = 'x'; *n++ = 0;
if (lennumber == 2) *n++ = 0 ;
*n++ = 2;
*n++ = 'y'; *n++ = 'y'; *n++ = 'y'; *n++ = 0;
if (lennumber == 2) *n++ = 0 ;
*n++ = 3;
*n++ = 'y'; *n++ = 'y'; *n++ = 'y'; *n++ = 0;
*n++ = OP_CHAR;
*n++ = 'x';
*n++ = OP_CHARI;
*n++ = 'Y';
*n++ = OP_PROP;
*n++ = PT_SC;
*n++ = 0;
*n++ = OP_DNRREF;
*n++ = 0;
*n++ = OP_END;
}
else

View File

@ -78,27 +78,26 @@ memory control data is to be stored for future use.
Arguments:
size amount of memory required
offset offset in memory block to memctl structure
gcontext a general context or NULL
memctl pointer to a memctl block or NULL
Returns: pointer to memory or NULL on failure
*/
PCRE2_EXP_DEFN void *
PRIV(memctl_malloc)(size_t size, size_t offset,
pcre2_general_context *gcontext)
PRIV(memctl_malloc)(size_t size, size_t offset, pcre2_memctl *memctl)
{
pcre2_memctl *memctl;
void *yield = (gcontext == NULL)? malloc(size) :
gcontext->memctl.malloc(size, gcontext->memctl.memory_data);
pcre2_memctl *newmemctl;
void *yield = (memctl == NULL)? malloc(size) :
memctl->malloc(size, memctl->memory_data);
if (yield == NULL) return NULL;
memctl = (pcre2_memctl *)(((uint8_t *)yield) + offset);
if (gcontext == NULL)
newmemctl = (pcre2_memctl *)(((uint8_t *)yield) + offset);
if (memctl == NULL)
{
memctl->malloc = default_malloc;
memctl->free = default_free;
memctl->memory_data = NULL;
newmemctl->malloc = default_malloc;
newmemctl->free = default_free;
newmemctl->memory_data = NULL;
}
else *memctl = gcontext->memctl;
else *newmemctl = *memctl;
return yield;
}
@ -152,7 +151,7 @@ pcre2_compile_context_create(pcre2_general_context *gcontext)
pcre2_compile_context *ccontext = PRIV(memctl_malloc)(
sizeof(pcre2_real_compile_context),
offsetof(pcre2_real_compile_context, memctl),
gcontext);
&(gcontext->memctl));
if (ccontext == NULL) return NULL;
PRIV(compile_context_init)(ccontext, FALSE);
return ccontext;
@ -184,7 +183,7 @@ pcre2_match_context_create(pcre2_general_context *gcontext)
pcre2_match_context *mcontext = PRIV(memctl_malloc)(
sizeof(pcre2_real_match_context),
offsetof(pcre2_real_compile_context, memctl),
gcontext);
&(gcontext->memctl));
if (mcontext == NULL) return NULL;
PRIV(match_context_init)(mcontext, FALSE);
return mcontext;
@ -240,21 +239,24 @@ return new;
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_general_context_free(pcre2_general_context *gcontext)
{
gcontext->memctl.free(gcontext, gcontext->memctl.memory_data);
if (gcontext != NULL)
gcontext->memctl.free(gcontext, gcontext->memctl.memory_data);
}
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_compile_context_free(pcre2_compile_context *ccontext)
{
ccontext->memctl.free(ccontext, ccontext->memctl.memory_data);
if (ccontext != NULL)
ccontext->memctl.free(ccontext, ccontext->memctl.memory_data);
}
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_match_context_free(pcre2_match_context *mcontext)
{
mcontext->memctl.free(mcontext, mcontext->memctl.memory_data);
if (mcontext != NULL)
mcontext->memctl.free(mcontext, mcontext->memctl.memory_data);
}

View File

@ -49,6 +49,7 @@ POSSIBILITY OF SUCH DAMAGE.
#include <string.h>
#include "pcre2.h"
#include "pcre2_ucp.h"
#define PUBL(name) pcre2_##name
@ -77,6 +78,11 @@ typedef int BOOL;
#include <valgrind/memcheck.h>
#endif
/* This is an unsigned int value that no character can ever have, as
Unicode doesn't go beyond 0x0010ffff. */
#define NOTACHAR 0xffffffff
/* When UTF encoding is being used, a character is no longer just a single
byte in 8-bit mode or a single short in 16-bit mode. The macros for character
handling generate simple sequences when used in the basic mode, and more
@ -165,6 +171,109 @@ the pointer. */
#endif /* SUPPORT_UTF */
/* Tests for Unicode horizontal and vertical whitespace characters must check a
number of different values. Using a switch statement for this generates the
fastest code (no loop, no memory access), and there are several places in the
interpreter code where this happens. In order to ensure that all the case lists
remain in step, we use macros so that there is only one place where the lists
are defined.
These values are also required as lists in pcre2_compile.c when processing \h,
\H, \v and \V in a character class. The lists are defined in pcre2_tables.c,
but macros that define the values are here so that all the definitions are
together. The lists must be in ascending character order, terminated by
NOTACHAR (which is 0xffffffff).
Any changes should ensure that the various macros are kept in step with each
other. NOTE: The values also appear in pcre2_jit_compile.c. */
/* ------ ASCII/Unicode environments ------ */
#ifndef EBCDIC
#define HSPACE_LIST \
CHAR_HT, CHAR_SPACE, 0xa0, \
0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \
NOTACHAR
#define HSPACE_MULTIBYTE_CASES \
case 0x1680: /* OGHAM SPACE MARK */ \
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ \
case 0x2000: /* EN QUAD */ \
case 0x2001: /* EM QUAD */ \
case 0x2002: /* EN SPACE */ \
case 0x2003: /* EM SPACE */ \
case 0x2004: /* THREE-PER-EM SPACE */ \
case 0x2005: /* FOUR-PER-EM SPACE */ \
case 0x2006: /* SIX-PER-EM SPACE */ \
case 0x2007: /* FIGURE SPACE */ \
case 0x2008: /* PUNCTUATION SPACE */ \
case 0x2009: /* THIN SPACE */ \
case 0x200A: /* HAIR SPACE */ \
case 0x202f: /* NARROW NO-BREAK SPACE */ \
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ \
case 0x3000 /* IDEOGRAPHIC SPACE */
#define HSPACE_BYTE_CASES \
case CHAR_HT: \
case CHAR_SPACE: \
case 0xa0 /* NBSP */
#define HSPACE_CASES \
HSPACE_BYTE_CASES: \
HSPACE_MULTIBYTE_CASES
#define VSPACE_LIST \
CHAR_LF, CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, 0x2028, 0x2029, NOTACHAR
#define VSPACE_MULTIBYTE_CASES \
case 0x2028: /* LINE SEPARATOR */ \
case 0x2029 /* PARAGRAPH SEPARATOR */
#define VSPACE_BYTE_CASES \
case CHAR_LF: \
case CHAR_VT: \
case CHAR_FF: \
case CHAR_CR: \
case CHAR_NEL
#define VSPACE_CASES \
VSPACE_BYTE_CASES: \
VSPACE_MULTIBYTE_CASES
/* ------ EBCDIC environments ------ */
#else
/* As with the other lists, this must be terminated by NOTACHAR. */
#define HSPACE_LIST CHAR_HT, CHAR_SPACE, NOTACHAR
#define HSPACE_BYTE_CASES \
case CHAR_HT: \
case CHAR_SPACE
#define HSPACE_CASES HSPACE_BYTE_CASES
#ifdef EBCDIC_NL25
#define VSPACE_LIST \
CHAR_VT, CHAR_FF, CHAR_CR, CHAR_NEL, CHAR_LF, NOTACHAR
#else
#define VSPACE_LIST \
CHAR_VT, CHAR_FF, CHAR_CR, CHAR_LF, CHAR_NEL, NOTACHAR
#endif
#define VSPACE_BYTE_CASES \
case CHAR_LF: \
case CHAR_VT: \
case CHAR_FF: \
case CHAR_CR: \
case CHAR_NEL
#define VSPACE_CASES VSPACE_BYTE_CASES
#endif /* EBCDIC */
/* ------ End of whitespace macros ------ */
/* Private flags containing information about the compiled pattern. The first
three must not be changed, because whichever is set is actually the number of
bytes in a code unit in that mode. */
@ -801,7 +910,519 @@ only. */
/* -------------------- End of character and string names -------------------*/
/* Private structures that are mode-independent. */
/* -------------------- Definitions for compiled patterns -------------------*/
/* Escape items that are just an encoding of a particular data value. */
#ifndef ESC_e
#define ESC_e CHAR_ESC
#endif
#ifndef ESC_f
#define ESC_f CHAR_FF
#endif
#ifndef ESC_n
#define ESC_n CHAR_LF
#endif
#ifndef ESC_r
#define ESC_r CHAR_CR
#endif
/* We can't officially use ESC_t because it is a POSIX reserved identifier
(presumably because of all the others like size_t). */
#ifndef ESC_tee
#define ESC_tee CHAR_HT
#endif
/* Codes for different types of Unicode property. The values 0 to
PT_TABSIZE - 1 are the ones that take part in the autopossessify table, so
PT_TABSIZE must be changed whenever a code is added to this first list. */
#define PT_ANY 0 /* Any property - matches all chars */
#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
#define PT_GC 2 /* Specified general characteristic (e.g. L) */
#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
#define PT_SC 4 /* Script (e.g. Han) */
#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */
#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */
#define PT_WORD 8 /* Word - L plus N plus underscore */
#define PT_CLIST 9 /* Pseudo-property: match character list */
#define PT_UCNC 10 /* Universal Character nameable character */
#define PT_TABSIZE 11 /* Size of square table for autopossessify tests */
/* The following special properties are used only in XCLASS items, when POSIX
classes are specified and PCRE_UCP is set - in other words, for Unicode
handling of these classes. They are not available via the \p or \P escapes like
those in the above list, and so they do not take part in the autopossessifying
table. Their numbering simply continues after PT_UCNC, which is why PT_PXGRAPH
has the same value as PT_TABSIZE: these codes lie outside the table. */
#define PT_PXGRAPH 11 /* [:graph:] - characters that mark the paper */
#define PT_PXPRINT 12 /* [:print:] - [:graph:] plus non-control spaces */
#define PT_PXPUNCT 13 /* [:punct:] - punctuation characters */
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain characters with values greater than 255. */
/* The first three are single-bit flags that can be combined. */
#define XCL_NOT 0x01 /* Flag: this is a negative class */
#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
#define XCL_HASPROP 0x04 /* Flag: property checks are present. */
/* The remaining values are item type codes (small integers, not bit flags);
XCL_END terminates the list of items. */
#define XCL_END 0 /* Marks end of individual items */
#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
#define XCL_RANGE 2 /* A range (two multibyte chars) follows */
#define XCL_PROP 3 /* Unicode property (2-byte property code follows) */
#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */
/* These are escaped items that aren't just an encoding of a particular data
value such as \n. They must have non-zero values, as check_escape() returns 0
for a data character. Also, they must appear in the same order as in the
opcode definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
corresponds to "." in DOTALL mode rather than an escape sequence. It is also
used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In
non-DOTALL mode, "." behaves like \N.
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
when PCRE_UCP is set and replacement of \d etc by \p sequences is required.
They must be contiguous, and remain in order so that the replacements can be
looked up from a table.
Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in
check_escape(). There are two tests in the code for an escape
greater than ESC_b and less than ESC_Z to detect the types that may be
repeated. These are the types that consume characters. If any new escapes are
put in between that don't consume a character, that code will have to change.
*/
/* Codes for escapes that are not plain data characters. Numbering starts at 1
because zero is reserved for a data character. The first 24 codes, ESC_A to
ESC_z, must stay in step with the opcodes that follow OP_END; ESC_dum is the
placeholder that occupies the position of OP_ALLANY. The "U" codes at the end
must remain contiguous and in this order. */
enum {
  /* Codes 1 to 5: \A \G \K \B \b */
  ESC_A = 1,
  ESC_G,
  ESC_K,
  ESC_B,
  ESC_b,
  /* Codes 6 to 11: \D \d \S \s \W \w */
  ESC_D,
  ESC_d,
  ESC_S,
  ESC_s,
  ESC_W,
  ESC_w,
  /* Codes 12 to 14: \N, the dummy for OP_ALLANY, \C */
  ESC_N,
  ESC_dum,
  ESC_C,
  /* Codes 15 to 22: \P \p \R \H \h \V \v \X */
  ESC_P,
  ESC_p,
  ESC_R,
  ESC_H,
  ESC_h,
  ESC_V,
  ESC_v,
  ESC_X,
  /* Codes 23 and 24: \Z \z - the last ones that line up with opcodes */
  ESC_Z,
  ESC_z,
  /* Codes 25 to 28 */
  ESC_E,
  ESC_Q,
  ESC_g,
  ESC_k,
  /* Codes 29 to 34: replacements used for \D, \d, \S, \s, \W, \w under UCP */
  ESC_DU,
  ESC_du,
  ESC_SU,
  ESC_su,
  ESC_WU,
  ESC_wu
};
/********************** Opcode definitions ******************/
/****** NOTE NOTE NOTE ******
Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in
order to the list of escapes immediately above. Furthermore, values up to
OP_DOLLM must not be changed without adjusting the table called autoposstab in
pcre_compile.c
Whenever this list is updated, the two macro definitions that follow must be
updated to match. The possessification table called "opcode_possessify" in
pcre_compile.c must also be updated, and also the tables called "coptable"
and "poptable" in pcre_dfa_exec.c.
****** NOTE NOTE NOTE ******/
/* The values between FIRST_AUTOTAB_OP and LAST_AUTOTAB_RIGHT_OP, inclusive,
are used in a table for deciding whether a repeated character type can be
auto-possessified. NOTE(review): the separate LEFT and RIGHT upper limits
suggest that the table is indexed by a pair of opcodes, with the first of the
pair limited to OP_NOT_DIGIT..OP_EXTUNI and the second allowed to run on to
OP_DOLLM - confirm against the table itself. */
#define FIRST_AUTOTAB_OP OP_NOT_DIGIT
#define LAST_AUTOTAB_LEFT_OP OP_EXTUNI
#define LAST_AUTOTAB_RIGHT_OP OP_DOLLM
/* The opcodes of the compiled pattern. The number at the start of each comment
is the value of that enumerator. The ordering constraints noted below are
relied on by tests and offset arithmetic elsewhere and must be preserved. */
enum {
OP_END, /* 0 End of pattern */
/* Values corresponding to backslashed metacharacters */
OP_SOD, /* 1 Start of data: \A */
OP_SOM, /* 2 Start of match (subject + offset): \G */
OP_SET_SOM, /* 3 Set start of match (\K) */
OP_NOT_WORD_BOUNDARY, /* 4 \B */
OP_WORD_BOUNDARY, /* 5 \b */
OP_NOT_DIGIT, /* 6 \D */
OP_DIGIT, /* 7 \d */
OP_NOT_WHITESPACE, /* 8 \S */
OP_WHITESPACE, /* 9 \s */
OP_NOT_WORDCHAR, /* 10 \W */
OP_WORDCHAR, /* 11 \w */
OP_ANY, /* 12 Match any character except newline (\N) */
OP_ALLANY, /* 13 Match any character */
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
OP_NOTPROP, /* 15 \P (not Unicode property) */
OP_PROP, /* 16 \p (Unicode property) */
OP_ANYNL, /* 17 \R (any newline sequence) */
OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */
OP_HSPACE, /* 19 \h (horizontal whitespace) */
OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */
OP_VSPACE, /* 21 \v (vertical whitespace) */
OP_EXTUNI, /* 22 \X (extended Unicode sequence) */
OP_EODN, /* 23 End of data or \n at end of data (\Z) */
OP_EOD, /* 24 End of data (\z) */
/* Line end assertions */
OP_DOLL, /* 25 End of line - not multiline */
OP_DOLLM, /* 26 End of line - multiline */
OP_CIRC, /* 27 Start of line - not multiline */
OP_CIRCM, /* 28 Start of line - multiline */
/* Single characters; caseful must precede the caseless ones */
OP_CHAR, /* 29 Match one character, casefully */
OP_CHARI, /* 30 Match one character, caselessly */
OP_NOT, /* 31 Match one character, not the given one, casefully */
OP_NOTI, /* 32 Match one character, not the given one, caselessly */
/* The following sets of 13 opcodes must always be kept in step because
the offset from the first one is used to generate the others. */
/* Repeated characters; caseful must precede the caseless ones */
OP_STAR, /* 33 The maximizing and minimizing versions of */
OP_MINSTAR, /* 34 these six opcodes must come in pairs, with */
OP_PLUS, /* 35 the minimizing one second. */
OP_MINPLUS, /* 36 */
OP_QUERY, /* 37 */
OP_MINQUERY, /* 38 */
OP_UPTO, /* 39 From 0 to n matches of one character, caseful*/
OP_MINUPTO, /* 40 */
OP_EXACT, /* 41 Exactly n matches */
OP_POSSTAR, /* 42 Possessified star, caseful */
OP_POSPLUS, /* 43 Possessified plus, caseful */
OP_POSQUERY, /* 44 Possessified query, caseful */
OP_POSUPTO, /* 45 Possessified upto, caseful */
/* Repeated characters; caseless must follow the caseful ones */
OP_STARI, /* 46 */
OP_MINSTARI, /* 47 */
OP_PLUSI, /* 48 */
OP_MINPLUSI, /* 49 */
OP_QUERYI, /* 50 */
OP_MINQUERYI, /* 51 */
OP_UPTOI, /* 52 From 0 to n matches of one character, caseless */
OP_MINUPTOI, /* 53 */
OP_EXACTI, /* 54 */
OP_POSSTARI, /* 55 Possessified star, caseless */
OP_POSPLUSI, /* 56 Possessified plus, caseless */
OP_POSQUERYI, /* 57 Possessified query, caseless */
OP_POSUPTOI, /* 58 Possessified upto, caseless */
/* The negated ones must follow the non-negated ones, and match them */
/* Negated repeated character, caseful; must precede the caseless ones */
OP_NOTSTAR, /* 59 The maximizing and minimizing versions of */
OP_NOTMINSTAR, /* 60 these six opcodes must come in pairs, with */
OP_NOTPLUS, /* 61 the minimizing one second. They must be in */
OP_NOTMINPLUS, /* 62 exactly the same order as those above. */
OP_NOTQUERY, /* 63 */
OP_NOTMINQUERY, /* 64 */
OP_NOTUPTO, /* 65 From 0 to n matches, caseful */
OP_NOTMINUPTO, /* 66 */
OP_NOTEXACT, /* 67 Exactly n matches */
OP_NOTPOSSTAR, /* 68 Possessified versions, caseful */
OP_NOTPOSPLUS, /* 69 */
OP_NOTPOSQUERY, /* 70 */
OP_NOTPOSUPTO, /* 71 */
/* Negated repeated character, caseless; must follow the caseful ones */
OP_NOTSTARI, /* 72 */
OP_NOTMINSTARI, /* 73 */
OP_NOTPLUSI, /* 74 */
OP_NOTMINPLUSI, /* 75 */
OP_NOTQUERYI, /* 76 */
OP_NOTMINQUERYI, /* 77 */
OP_NOTUPTOI, /* 78 From 0 to n matches, caseless */
OP_NOTMINUPTOI, /* 79 */
OP_NOTEXACTI, /* 80 Exactly n matches */
OP_NOTPOSSTARI, /* 81 Possessified versions, caseless */
OP_NOTPOSPLUSI, /* 82 */
OP_NOTPOSQUERYI, /* 83 */
OP_NOTPOSUPTOI, /* 84 */
/* Character types */
OP_TYPESTAR, /* 85 The maximizing and minimizing versions of */
OP_TYPEMINSTAR, /* 86 these six opcodes must come in pairs, with */
OP_TYPEPLUS, /* 87 the minimizing one second. These codes must */
OP_TYPEMINPLUS, /* 88 be in exactly the same order as those above. */
OP_TYPEQUERY, /* 89 */
OP_TYPEMINQUERY, /* 90 */
OP_TYPEUPTO, /* 91 From 0 to n matches */
OP_TYPEMINUPTO, /* 92 */
OP_TYPEEXACT, /* 93 Exactly n matches */
OP_TYPEPOSSTAR, /* 94 Possessified versions */
OP_TYPEPOSPLUS, /* 95 */
OP_TYPEPOSQUERY, /* 96 */
OP_TYPEPOSUPTO, /* 97 */
/* These are used for character classes and back references; only the
first six are the same as the sets above. */
OP_CRSTAR, /* 98 The maximizing and minimizing versions of */
OP_CRMINSTAR, /* 99 all these opcodes must come in pairs, with */
OP_CRPLUS, /* 100 the minimizing one second. These codes must */
OP_CRMINPLUS, /* 101 be in exactly the same order as those above. */
OP_CRQUERY, /* 102 */
OP_CRMINQUERY, /* 103 */
OP_CRRANGE, /* 104 These are different to the three sets above. */
OP_CRMINRANGE, /* 105 */
OP_CRPOSSTAR, /* 106 Possessified versions */
OP_CRPOSPLUS, /* 107 */
OP_CRPOSQUERY, /* 108 */
OP_CRPOSRANGE, /* 109 */
/* End of quantifier opcodes */
OP_CLASS, /* 110 Match a character class, chars < 256 only */
OP_NCLASS, /* 111 Same, but the bitmap was created from a negative
class - the difference is relevant only when a
character > 255 is encountered. */
OP_XCLASS, /* 112 Extended class for handling > 255 chars within the
class. This does both positive and negative. */
OP_REF, /* 113 Match a back reference, casefully */
OP_REFI, /* 114 Match a back reference, caselessly */
OP_DNREF, /* 115 Match a duplicate name backref, casefully */
OP_DNREFI, /* 116 Match a duplicate name backref, caselessly */
OP_RECURSE, /* 117 Match a numbered subpattern (possibly recursive) */
OP_CALLOUT, /* 118 Call out to external function if provided */
OP_ALT, /* 119 Start of alternation */
OP_KET, /* 120 End of group that doesn't have an unbounded repeat */
OP_KETRMAX, /* 121 These two must remain together and in this */
OP_KETRMIN, /* 122 order. They are for groups that repeat for ever. */
OP_KETRPOS, /* 123 Possessive unlimited repeat. */
/* The assertions must come before BRA, CBRA, ONCE, and COND, and the four
asserts must remain in order. */
OP_REVERSE, /* 124 Move pointer back - used in lookbehind assertions */
OP_ASSERT, /* 125 Positive lookahead */
OP_ASSERT_NOT, /* 126 Negative lookahead */
OP_ASSERTBACK, /* 127 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 128 Negative lookbehind */
/* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately
after the assertions, with ONCE first, as there's a test for >= ONCE for a
subpattern that isn't an assertion. The POS versions must immediately follow
the non-POS versions in each case. */
OP_ONCE, /* 129 Atomic group, contains captures */
OP_ONCE_NC, /* 130 Atomic group containing no captures */
OP_BRA, /* 131 Start of non-capturing bracket */
OP_BRAPOS, /* 132 Ditto, with unlimited, possessive repeat */
OP_CBRA, /* 133 Start of capturing bracket */
OP_CBRAPOS, /* 134 Ditto, with unlimited, possessive repeat */
OP_COND, /* 135 Conditional group */
/* These five must follow the previous five, in the same order. There's a
check for >= SBRA to distinguish the two sets. */
OP_SBRA, /* 136 Start of non-capturing bracket, check empty */
OP_SBRAPOS, /* 137 Ditto, with unlimited, possessive repeat */
OP_SCBRA, /* 138 Start of capturing bracket, check empty */
OP_SCBRAPOS, /* 139 Ditto, with unlimited, possessive repeat */
OP_SCOND, /* 140 Conditional group, check empty */
/* The next two pairs must (respectively) be kept together. */
OP_CREF, /* 141 Used to hold a capture number as condition */
OP_DNCREF, /* 142 Used to point to duplicate names as a condition */
OP_RREF, /* 143 Used to hold a recursion number as condition */
OP_DNRREF, /* 144 Used to point to duplicate names as a condition */
OP_DEF, /* 145 The DEFINE condition */
OP_BRAZERO, /* 146 These two must remain together and in this */
OP_BRAMINZERO, /* 147 order. */
OP_BRAPOSZERO, /* 148 */
/* These are backtracking control verbs */
OP_MARK, /* 149 always has an argument */
OP_PRUNE, /* 150 */
OP_PRUNE_ARG, /* 151 same, but with argument */
OP_SKIP, /* 152 */
OP_SKIP_ARG, /* 153 same, but with argument */
OP_THEN, /* 154 */
OP_THEN_ARG, /* 155 same, but with argument */
OP_COMMIT, /* 156 */
/* These are forced failure and success verbs */
OP_FAIL, /* 157 */
OP_ACCEPT, /* 158 */
OP_ASSERT_ACCEPT, /* 159 Used inside assertions */
OP_CLOSE, /* 160 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */
OP_SKIPZERO, /* 161 */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
some in the past. */
OP_TABLE_LENGTH
};
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
definitions that follow must also be updated to match. There are also tables
called "opcode_possessify" in pcre_compile.c and "coptable" and "poptable" in
pcre_dfa_exec.c that must be updated. */
/* This macro defines textual names for all the opcodes. These are used only
for debugging, and some of them are only partial names. The macro is referenced
only in pcre2_printint.c, which fills out the full names in many cases (and in
some cases doesn't actually use these names at all). There is one string for
each opcode, in the order of the opcode enum, so each run of repeat operators
below corresponds to one of the enum's groups of repeat opcodes (five groups of
13 followed by the group of 12 used for classes and back references). */
#define OP_NAME_LIST \
"End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \
"\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \
"notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \
"extuni", "\\Z", "\\z", \
"$", "$", "^", "^", "char", "chari", "not", "noti", \
"*", "*?", "+", "+?", "?", "??", \
"{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", \
"{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", \
"{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", \
"{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", \
"*+","++", "?+", "{", \
"class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \
"Recurse", "Callout", \
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \
"Reverse", "Assert", "Assert not", "AssertB", "AssertB not", \
"Once", "Once_NC", \
"Bra", "BraPos", "CBra", "CBraPos", \
"Cond", \
"SBra", "SBraPos", "SCBra", "SCBraPos", \
"SCond", \
"Cond ref", "Cond dnref", "Cond rec", "Cond dnrec", "Cond def", \
"Brazero", "Braminzero", "Braposzero", \
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
"*THEN", "*THEN", "*COMMIT", "*FAIL", \
"*ACCEPT", "*ASSERT_ACCEPT", \
"Close", "Skip zero"
/* This macro defines the length of fixed length operations in the compiled
regex. The lengths are used when searching for specific things, and also in the
debugging printing of a compiled regex. We use a macro so that it can be
defined close to the definitions of the opcodes themselves.
As things have been extended, some of these are no longer fixed lengths, but are
minima instead. For example, the length of a single-character repeat may vary
in UTF-8 mode. The code that uses this table must know about such things.
There is one entry for each opcode, in the order of the opcode enum. Lengths
are expressed in terms of IMM2_SIZE and LINK_SIZE where an opcode is followed
by a 16-bit value or a link, and the entry for XCLASS is zero because that item
has no fixed length. */
#define OP_LENGTHS \
1, /* End */ \
1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \
1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \
1, 1, 1, /* Any, AllAny, Anybyte */ \
3, 3, /* \P, \p */ \
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \
1, /* \X */ \
1, 1, 1, 1, 1, 1, /* \Z, \z, $, $M ^, ^M */ \
2, /* Char - the minimum length */ \
2, /* Chari - the minimum length */ \
2, /* not */ \
2, /* noti */ \
/* Positive single-char repeats ** These are */ \
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
2+IMM2_SIZE, 2+IMM2_SIZE, /* upto, minupto ** mode */ \
2+IMM2_SIZE, /* exact */ \
2, 2, 2, 2+IMM2_SIZE, /* *+, ++, ?+, upto+ */ \
2, 2, 2, 2, 2, 2, /* *I, *?I, +I, +?I, ?I, ??I ** UTF-8 */ \
2+IMM2_SIZE, 2+IMM2_SIZE, /* upto I, minupto I */ \
2+IMM2_SIZE, /* exact I */ \
2, 2, 2, 2+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */ \
/* Negative single-char repeats - only for chars < 256 */ \
2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
2+IMM2_SIZE, 2+IMM2_SIZE, /* NOT upto, minupto */ \
2+IMM2_SIZE, /* NOT exact */ \
2, 2, 2, 2+IMM2_SIZE, /* Possessive NOT *, +, ?, upto */ \
2, 2, 2, 2, 2, 2, /* NOT *I, *?I, +I, +?I, ?I, ??I */ \
2+IMM2_SIZE, 2+IMM2_SIZE, /* NOT upto I, minupto I */ \
2+IMM2_SIZE, /* NOT exact I */ \
2, 2, 2, 2+IMM2_SIZE, /* Possessive NOT *I, +I, ?I, upto I */ \
/* Positive type repeats */ \
2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
2+IMM2_SIZE, 2+IMM2_SIZE, /* Type upto, minupto */ \
2+IMM2_SIZE, /* Type exact */ \
2, 2, 2, 2+IMM2_SIZE, /* Possessive *+, ++, ?+, upto+ */ \
/* Character class & ref repeats */ \
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
1+2*IMM2_SIZE, 1+2*IMM2_SIZE, /* CRRANGE, CRMINRANGE */ \
1, 1, 1, 1+2*IMM2_SIZE, /* Possessive *+, ++, ?+, CRPOSRANGE */ \
1+(32/sizeof(PCRE2_UCHAR)), /* CLASS */ \
1+(32/sizeof(PCRE2_UCHAR)), /* NCLASS */ \
0, /* XCLASS - variable length */ \
1+IMM2_SIZE, /* REF */ \
1+IMM2_SIZE, /* REFI */ \
1+2*IMM2_SIZE, /* DNREF */ \
1+2*IMM2_SIZE, /* DNREFI */ \
1+LINK_SIZE, /* RECURSE */ \
2+2*LINK_SIZE, /* CALLOUT */ \
1+LINK_SIZE, /* Alt */ \
1+LINK_SIZE, /* Ket */ \
1+LINK_SIZE, /* KetRmax */ \
1+LINK_SIZE, /* KetRmin */ \
1+LINK_SIZE, /* KetRpos */ \
1+LINK_SIZE, /* Reverse */ \
1+LINK_SIZE, /* Assert */ \
1+LINK_SIZE, /* Assert not */ \
1+LINK_SIZE, /* Assert behind */ \
1+LINK_SIZE, /* Assert behind not */ \
1+LINK_SIZE, /* ONCE */ \
1+LINK_SIZE, /* ONCE_NC */ \
1+LINK_SIZE, /* BRA */ \
1+LINK_SIZE, /* BRAPOS */ \
1+LINK_SIZE+IMM2_SIZE, /* CBRA */ \
1+LINK_SIZE+IMM2_SIZE, /* CBRAPOS */ \
1+LINK_SIZE, /* COND */ \
1+LINK_SIZE, /* SBRA */ \
1+LINK_SIZE, /* SBRAPOS */ \
1+LINK_SIZE+IMM2_SIZE, /* SCBRA */ \
1+LINK_SIZE+IMM2_SIZE, /* SCBRAPOS */ \
1+LINK_SIZE, /* SCOND */ \
1+IMM2_SIZE, 1+2*IMM2_SIZE, /* CREF, DNCREF */ \
1+IMM2_SIZE, 1+2*IMM2_SIZE, /* RREF, DNRREF */ \
1, /* DEF */ \
1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \
3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
1, 3, /* SKIP, SKIP_ARG */ \
1, 3, /* THEN, THEN_ARG */ \
1, 1, 1, 1, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ \
1+IMM2_SIZE, 1 /* CLOSE, SKIPZERO */
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
#define RREF_ANY 0xffff
/* ---------- Private structures that are mode-independent. ---------- */
/* Structure to hold data for custom memory management. */
@ -811,15 +1432,64 @@ typedef struct pcre2_memctl {
void *memory_data;
} pcre2_memctl;
/* The other private structures used by PCRE are defined in a separate file.
/* Layout of the UCP type table that translates property names into types and
codes. Each entry used to point directly to a name, but to reduce the number of
relocations in shared libraries, it now has an offset into a single string
instead. */
typedef struct {
uint16_t name_offset; /* Offset of the property name in the single names string */
uint16_t type; /* Property type; NOTE(review): presumably a PT_ value - confirm */
uint16_t value; /* Property code that goes with the type */
} ucp_type_table;
/* Unicode character database (UCD). Each ucd_record holds the properties of a
character; the GET_UCD() macro yields a pointer to the record for a given
character, and the UCD_xxx() macros read individual fields from it. */
typedef struct {
uint8_t script; /* ucp_Arabic, etc. */
uint8_t chartype; /* ucp_Cc, etc. (general categories) */
uint8_t gbprop; /* ucp_gbControl, etc. (grapheme break property) */
uint8_t caseset; /* offset to multichar other cases or zero */
int32_t other_case; /* offset to other case, or zero if none */
} ucd_record;
/* Tables of Unicode data that are defined elsewhere. ucd_stage1 and ucd_stage2
form the two-stage index that GET_UCD() uses to select an entry in ucd_records,
and ucp_gentype is indexed by a record's chartype value (see UCD_CATEGORY). */
extern const uint32_t PRIV(ucd_caseless_sets)[];
extern const ucd_record PRIV(ucd_records)[];
extern const uint8_t PRIV(ucd_stage1)[];
extern const uint16_t PRIV(ucd_stage2)[];
extern const uint32_t PRIV(ucp_gentype)[];
extern const uint32_t PRIV(ucp_gbtable)[];
/* This table is declared only when JIT support is compiled in. */
#ifdef SUPPORT_JIT
extern const int PRIV(ucp_typerange)[];
#endif
/* UCD access macros. GET_UCD(ch) yields a pointer to the ucd_record for the
character ch: the block number (ch / UCD_BLOCK_SIZE) is looked up in
ucd_stage1, the result selects a block of UCD_BLOCK_SIZE entries in ucd_stage2,
and the entry at (ch % UCD_BLOCK_SIZE) within that block is the index of the
record in ucd_records. The other macros read one field of that record.
UCD_CATEGORY maps the character type through the ucp_gentype table, and
UCD_OTHERCASE adds the record's signed other_case offset to the character. The
argument is parenthesized everywhere it is used so that an expression such as
"c & 0xff" can be passed safely; note that UCD_OTHERCASE evaluates its argument
twice. */
#define UCD_BLOCK_SIZE 128
#define GET_UCD(ch) (PRIV(ucd_records) + \
  PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \
  UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE])
#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
#define UCD_SCRIPT(ch) GET_UCD(ch)->script
#define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
#define UCD_GRAPHBREAK(ch) GET_UCD(ch)->gbprop
#define UCD_CASESET(ch) GET_UCD(ch)->caseset
#define UCD_OTHERCASE(ch) ((uint32_t)((int)(ch) + (int)(GET_UCD(ch)->other_case)))
/* ----------------- Items that need PCRE2_CODE_UNIT_WIDTH ----------------- */
#ifdef PCRE2_CODE_UNIT_WIDTH
/* Mode-dependent macros and private structures are defined in a separate file.
When compiling the library, PCRE2_CODE_UNIT_WIDTH will be defined, and we
include them at the appropriate width. When compiling pcre2test, however, that
macro is not set at this point because pcre2test needs to include them at all
supported widths. */
#ifdef PCRE2_CODE_UNIT_WIDTH
#include "pcre2_intstructs.h"
#endif
#include "pcre2_intmodedep.h"
/* Internal shared functions. These are functions that are used by more than
one of the library's exported public functions. They have to be "external" in
@ -827,14 +1497,15 @@ the C sense, but are not part of the PCRE public API. They are not referenced
from pcre2test, and must not be defined when no code unit width is available.
*/
#ifdef PCRE2_CODE_UNIT_WIDTH
/* Map the generic names of the internal shared functions onto names that carry
the code unit width, by means of PCRE2_SUFFIX. Every mapped name uses the
_pcre2_ prefix so that the resulting symbols all live in the same private
namespace. All references to these functions must go through these macros. */
#define _pcre2_compile_context_init PCRE2_SUFFIX(_pcre2_compile_context_init_)
#define _pcre2_match_context_init PCRE2_SUFFIX(_pcre2_match_context_init_)
#define _pcre2_memctl_malloc PCRE2_SUFFIX(_pcre2_memctl_malloc_)
#define _pcre2_strcmp PCRE2_SUFFIX(_pcre2_strcmp_)
extern void _pcre2_compile_context_init(pcre2_compile_context *, BOOL);
extern void _pcre2_match_context_init(pcre2_match_context *, BOOL);
extern void *_pcre2_memctl_malloc(size_t, size_t, pcre2_general_context *);
extern void *_pcre2_memctl_malloc(size_t, size_t, pcre2_memctl *);
extern int _pcre2_strcmp(PCRE2_SPTR, PCRE2_SPTR);
#endif
/* End of pcre2_internal.h */

258
src/pcre2_intmodedep.h Normal file
View File

@ -0,0 +1,258 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains mode-dependent macro and structure definitions. The
file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
These mode-dependent items are kept in a separate file so that they can also be
#included multiple times for different code unit widths by pcre2test. Start by
undefining all the new macros defined herein so that they can be redefined for
multiple inclusions. */
#undef CU2BYTES
#undef GET
#undef GET2
#undef IMM2_SIZE
#undef MAX_PATTERN_SIZE
#undef PUT
#undef PUT2
#undef PUTINC
/* ---------------------------MACROS ----------------------------- */
/* PCRE keeps offsets in its compiled code as at least 16-bit quantities
(always stored in big-endian order in 8-bit mode) by default. These are used,
for example, to link from the start of a subpattern to its alternatives and its
end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
to around 64K, which is big enough for almost everybody. However, I received a
request for an even bigger limit. For this reason, and also to make the code
easier to maintain, the storing and loading of offsets from the compiled code
unit string is now handled by the macros that are defined here.

The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
values of 3 or 4 are also supported.

PUT(a,n,d) stores the offset d at position n of the code unit vector a, and
GET(a,n) loads it back. MAX_PATTERN_SIZE is the corresponding size limit. The
vector argument is parenthesized wherever it is used, so that a pointer
expression such as "code + 1" can be passed; the arguments may be evaluated
more than once.

In the 16-bit and 32-bit branches LINK_SIZE is redefined as a number of code
units instead of a number of bytes, so after this file has been processed for
one of those widths LINK_SIZE no longer holds the value it had on entry.
NOTE(review): when this file is included more than once, an 8-bit inclusion
that follows a 16-bit or 32-bit one would see the reduced value and could hit
the #error below - confirm that the order of inclusion takes care of this. */

/* ------------------- 8-bit support ------------------ */

#if PCRE2_CODE_UNIT_WIDTH == 8

#if LINK_SIZE == 2
#define PUT(a,n,d) \
  ((a)[n] = (d) >> 8), \
  ((a)[(n)+1] = (d) & 255)
#define GET(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])
#define MAX_PATTERN_SIZE (1 << 16)

#elif LINK_SIZE == 3
#define PUT(a,n,d) \
  ((a)[n] = (d) >> 16), \
  ((a)[(n)+1] = (d) >> 8), \
  ((a)[(n)+2] = (d) & 255)
#define GET(a,n) \
  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
#define MAX_PATTERN_SIZE (1 << 24)

#elif LINK_SIZE == 4
#define PUT(a,n,d) \
  ((a)[n] = (d) >> 24), \
  ((a)[(n)+1] = (d) >> 16), \
  ((a)[(n)+2] = (d) >> 8), \
  ((a)[(n)+3] = (d) & 255)
#define GET(a,n) \
  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */

#else
#error LINK_SIZE must be either 2, 3, or 4
#endif

/* ------------------- 16-bit support ------------------ */

#elif PCRE2_CODE_UNIT_WIDTH == 16

/* A 2-byte link fits in one 16-bit code unit. */
#if LINK_SIZE == 2
#undef LINK_SIZE
#define LINK_SIZE 1
#define PUT(a,n,d) \
  ((a)[n] = (d))
#define GET(a,n) \
  ((a)[n])
#define MAX_PATTERN_SIZE (1 << 16)

/* A 3-byte or 4-byte link needs two 16-bit code units. */
#elif LINK_SIZE == 3 || LINK_SIZE == 4
#undef LINK_SIZE
#define LINK_SIZE 2
#define PUT(a,n,d) \
  ((a)[n] = (d) >> 16), \
  ((a)[(n)+1] = (d) & 65535)
#define GET(a,n) \
  (((a)[n] << 16) | (a)[(n)+1])
#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */

#else
#error LINK_SIZE must be either 2, 3, or 4
#endif

/* ------------------- 32-bit support ------------------ */

#elif PCRE2_CODE_UNIT_WIDTH == 32

/* Any link fits in one 32-bit code unit, whatever LINK_SIZE was on entry. */
#undef LINK_SIZE
#define LINK_SIZE 1
#define PUT(a,n,d) \
  ((a)[n] = (d))
#define GET(a,n) \
  ((a)[n])
#define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */

#else
#error Unsupported compiling mode
#endif
/* -------------------------------------------------------*/
/* PCRE uses some other (at least) 16-bit quantities that do not change when
the size of offsets changes. These are used for repeat counts and for other
things such as capturing parenthesis numbers in back references.

Define the number of code units required to hold a 16-bit count/offset, and
macros to load and store such a value. For reasons that I do not understand,
the expression in the 8-bit GET2 macro is treated by gcc as a signed
expression, even when a is declared as unsigned. It seems that any kind of
arithmetic results in a signed value. Hence the cast.

The vector and value arguments are parenthesized wherever they are used, so
that expressions such as "code + 1" can be passed. PUT2 is a statement, not an
expression: in 8-bit mode it is a braced block, and in the other modes it is a
plain assignment. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define IMM2_SIZE 2
#define GET2(a,n) (unsigned int)(((a)[n] << 8) | (a)[(n)+1])
#define PUT2(a,n,d) { (a)[n] = (d) >> 8; (a)[(n)+1] = (d) & 255; }

#else /* Code units are 16 or 32 bits */
#define IMM2_SIZE 1
#define GET2(a,n) (a)[n]
#define PUT2(a,n,d) (a)[n] = (d)
#endif
/* Mode-dependent macros that have the same definition in all modes. CU2BYTES
converts a number of code units into a number of bytes; the whole expansion is
parenthesized so that the macro can be used as an operand of any operator (for
example as a divisor). PUTINC stores an offset by means of PUT and then
advances the pointer a by LINK_SIZE code units. */
#define CU2BYTES(x) ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))
#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE
/* --------------------------- STRUCTURES ----------------------------- */
/* The real general context structure. At present it holds only data for custom
memory control. */
typedef struct pcre2_real_general_context {
pcre2_memctl memctl; /* Data for custom memory management */
} pcre2_real_general_context;
/* The real compile context structure. Like the other context structures, it
starts with the custom memory management data. */
typedef struct pcre2_real_compile_context {
pcre2_memctl memctl; /* Data for custom memory management */
int (*stack_guard)(uint32_t); /* Caller-supplied function; NOTE(review): presumably called to check stack use during compilation - confirm */
const unsigned char *tables; /* NOTE(review): presumably the character tables to compile with - confirm */
uint16_t bsr_convention; /* What \R matches */
uint16_t newline_convention; /* What is a newline? */
uint32_t parens_nest_limit; /* NOTE(review): presumably the maximum depth of nested parentheses - confirm */
} pcre2_real_compile_context;
/* The real match context structure. The two stack function pointers exist only
when NO_RECURSE is defined, so the layout of this structure depends on that
build option. */
typedef struct pcre2_real_match_context {
pcre2_memctl memctl; /* Data for custom memory management */
#ifdef NO_RECURSE
void * (*stack_malloc)(size_t, void *); /* Custom get-memory function for the NO_RECURSE build */
void (*stack_free)(void *, void *); /* Matching free function */
#endif
int (*callout)(pcre2_callout_block *, void *); /* Callout function, passed a callout block and a user pointer */
uint32_t match_limit; /* NOTE(review): presumably the limit on match() calls - confirm */
uint32_t recursion_limit; /* NOTE(review): presumably the limit on recursion depth - confirm */
} pcre2_real_match_context;
/* The real compiled code structure. The fields are ordered by decreasing size
of type (pointer, 32-bit values, then 16-bit values) after the memory control
block and the start bitmap. */
typedef struct pcre2_real_code {
pcre2_memctl memctl; /* Data for custom memory management */
void *executable_jit; /* Pointer to JIT code */
uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */
uint32_t magic_number; /* Paranoid and endianness check */
uint32_t size; /* Total (bytes) that was malloc-ed */
uint32_t compile_options; /* Options passed to pcre2_compile() */
uint32_t pattern_options; /* Options taken from the pattern */
uint32_t flags; /* Various state flags */
uint32_t limit_match; /* Limit set in the pattern */
uint32_t limit_recursion; /* Limit set in the pattern */
uint32_t first_codeunit; /* Starting code unit */
uint32_t last_codeunit; /* This codeunit must be seen */
uint16_t bsr_convention; /* What \R matches */
uint16_t newline_convention; /* What is a newline? */
uint16_t max_lookbehind; /* Longest lookbehind (characters) */
uint16_t minlength; /* Minimum length of match */
uint16_t top_bracket; /* Highest numbered group */
uint16_t top_backref; /* Highest numbered back reference */
uint16_t name_entry_size; /* Size (code units) of table entries */
uint16_t name_count; /* Number of name entries in the table */
} pcre2_real_code;
/* The real match data structure. It records which pattern and subject were
used for a match, the outcome, and the offsets that were found. */
typedef struct pcre2_real_match_data {
pcre2_memctl memctl; /* Data for custom memory management */
const pcre2_real_code *code; /* The pattern used for the match */
PCRE2_SPTR subject; /* The subject that was matched */
int rc; /* The return code from the match */
int utf_reason; /* Reason code for bad UTF */
size_t leftchar; /* Offset to leftmost code unit */
size_t rightchar; /* Offset to rightmost code unit */
size_t startchar; /* Offset to starting code unit */
PCRE2_SPTR mark; /* Pointer to last mark */
uint16_t oveccount; /* Number of pairs */
size_t ovector[1]; /* The first field; NOTE(review): presumably the start of a vector of oveccount pairs that extends beyond the declared size - confirm against the allocation code */
} pcre2_real_match_data;
/* End of pcre2_intmodedep.h */

View File

@ -1,114 +0,0 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains the private mode-dependent structures needed by
pcre2_internal.h. They are kept separate so that they can be #included multiple
times for different code unit widths by pcre2test. */
/* The real general context structure. At present it holds only data for custom
memory control. */
typedef struct pcre2_real_general_context {
pcre2_memctl memctl;
} pcre2_real_general_context;
/* The real compile context structure. Like every context it starts with a
memory control block; the remaining fields are compile-time settings. */
typedef struct pcre2_real_compile_context {
pcre2_memctl memctl; /* Memory control block */
int (*stack_guard)(uint32_t); /* Optional callback; NOTE(review): meaning of argument and result not visible here */
const unsigned char *tables; /* Character tables; presumably NULL means built-in defaults - confirm */
uint16_t bsr_convention; /* Same name as the pcre2_real_code field for "what \R matches" */
uint16_t newline_convention; /* Same name as the pcre2_real_code field for "what is a newline" */
uint32_t parens_nest_limit; /* Presumably the maximum nesting depth of parentheses - confirm */
} pcre2_real_compile_context;
/* The real match context structure. The two stack_xxx function pointers are
present only when the library is built with NO_RECURSE defined, so the layout
of this structure depends on that build option. */
typedef struct pcre2_real_match_context {
pcre2_memctl memctl; /* Memory control block */
#ifdef NO_RECURSE
void * (*stack_malloc)(size_t, void *); /* Allocator taking a size and an opaque pointer */
void (*stack_free)(void *, void *); /* Matching release function */
#endif
int (*callout)(pcre2_callout_block *, void *); /* Callout function; second argument is an opaque pointer */
uint32_t match_limit; /* NOTE(review): units and default not visible here */
uint32_t recursion_limit; /* NOTE(review): units and default not visible here */
} pcre2_real_match_context;
/* The real match data structure. The block is allocated with additional space
after the structure (see pcre2_match_data_create()), so the offset vector
extends beyond the single element declared here. */
typedef struct pcre2_real_match_data {
pcre2_memctl memctl; /* Memory control block */
size_t leftchar; /* Offset to leftmost code unit */
size_t rightchar; /* Offset to rightmost code unit */
size_t startchar; /* Offset to starting code unit */
PCRE2_SPTR mark; /* Pointer to last mark */
uint16_t oveccount; /* Number of pairs */
size_t ovector[1]; /* First element of the offset vector; the rest follows in the same block */
} pcre2_real_match_data;
/* The real compiled code structure. The name table starts immediately after
this structure in the same memory block, and the compiled code follows the
name table (name_count entries of name_entry_size code units each). */
typedef struct pcre2_real_code {
pcre2_memctl memctl; /* Memory control block */
void *executable_jit; /* Pointer to JIT code */
uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */
uint32_t magic_number; /* Paranoid and endianness check */
uint32_t size; /* Total that was malloc-ed */
uint32_t compile_options; /* Options passed to pcre2_compile() */
uint32_t pattern_options; /* Options taken from the pattern */
uint32_t flags; /* Various state flags */
uint32_t limit_match; /* Limit set in the pattern */
uint32_t limit_recursion; /* Limit set in the pattern */
uint32_t first_codeunit; /* Starting code unit */
uint32_t last_codeunit; /* This codeunit must be seen */
uint16_t bsr_convention; /* What \R matches */
uint16_t newline_convention; /* What is a newline? */
uint16_t max_lookbehind; /* Longest lookbehind (characters) */
uint16_t minlength; /* Minimum length of match */
uint16_t top_bracket; /* Highest numbered group */
uint16_t top_backref; /* Highest numbered back reference */
uint16_t name_table_offset; /* Offset to name table that follows */
uint16_t name_entry_size; /* Size of name items in the table */
uint16_t name_count; /* Number of name entries in the table */
} pcre2_real_code;
/* End of pcre2_intstructs.h */

View File

@ -76,11 +76,19 @@ pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, int length,
size_t start_offset, uint32_t options, pcre2_match_data *match_data,
pcre2_match_context *mcontext)
{
int rc = PCRE2_ERROR_NOMATCH;
/* Fudge for testing pcre2test */
mcontext=mcontext;length=length;
options=options;
if (subject[start_offset] == 'Y')
/* Fudges for testing pcre2test */
if (subject[0] == 'Y')
{
rc = 0;
match_data->code = code;
match_data->subject = subject;
match_data->leftchar = 0;
match_data->rightchar = 3;
match_data->startchar = 0;
@ -88,24 +96,51 @@ if (subject[start_offset] == 'Y')
switch (match_data->oveccount)
{
case 0: return 0;
case 0: break;
case 1: match_data->ovector[0] = start_offset;
match_data->ovector[1] = start_offset + 4;
return 0;
break;
default: match_data->ovector[0] = start_offset;
default:
case 6: match_data->ovector[10] = PCRE2_UNSET;
match_data->ovector[11] = PCRE2_UNSET;
case 5: match_data->ovector[8] = PCRE2_UNSET;
match_data->ovector[9] = PCRE2_UNSET;
case 4: match_data->ovector[6] = start_offset + 3;
match_data->ovector[7] = start_offset + 4;
rc += 2;
case 3: match_data->ovector[4] = PCRE2_UNSET;
match_data->ovector[5] = PCRE2_UNSET;
case 2: match_data->ovector[0] = start_offset;
match_data->ovector[1] = start_offset + 4;
match_data->ovector[2] = start_offset + 1;
match_data->ovector[3] = start_offset + 3;
return 2;
match_data->mark = subject;
rc += 2;
break;
}
}
else if (subject[0] == 'P')
{
rc = PCRE2_ERROR_PARTIAL;
match_data->code = code;
match_data->subject = subject;
match_data->leftchar = 0;
match_data->rightchar = length;
match_data->startchar = 1;
match_data->mark = NULL;
}
mcontext=mcontext;code=code;subject=subject;length=length;
start_offset=start_offset; options=options; match_data=match_data;
return PCRE2_ERROR_NOMATCH;
match_data->rc = rc;
return rc;
}
/* End of pcre2_match.c */

View File

@ -56,7 +56,7 @@ pcre2_match_data_create(size_t oveccount, pcre2_general_context *gcontext)
{
pcre2_match_data *yield = PRIV(memctl_malloc)(
sizeof(pcre2_match_data) + 3*oveccount*sizeof(size_t),
offsetof(pcre2_real_match_data, memctl), gcontext);
offsetof(pcre2_real_match_data, memctl), &(gcontext->memctl));
yield->oveccount = oveccount;
return yield;
}

View File

@ -167,7 +167,7 @@ switch(what)
break;
case PCRE2_INFO_NAMETABLE:
*((PCRE2_SPTR*)where) = (PCRE2_SPTR)re + re->name_table_offset;
*((PCRE2_SPTR*)where) = (PCRE2_SPTR)((char *)re + sizeof(pcre2_real_code));
break;
case PCRE2_INFO_NEWLINE_CONVENTION:

View File

@ -1 +1,787 @@
/* This is a placeholder for pcre2_printint.c */
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains a PCRE private debugging function for printing out the
internal form of a compiled regular expression, along with some supporting
local functions. This source file is #included in pcre2test.c at each supported
code unit width, with PCRE2_SUFFIX set appropriately, just like the functions
that comprise the library. */
/* Tables of operator names. The same 8-bit table is used for all code unit
widths, so it must be defined only once. The list itself is defined in
pcre2_internal.h, which is #included by pcre2test before this file. */
#ifndef OP_LISTS_DEFINED
static const char *OP_names[] = { OP_NAME_LIST };
/* Setting the guard makes later inclusions of this file (at other code unit
widths) skip the definition above. */
#define OP_LISTS_DEFINED
#endif
/* The functions and tables herein must all have mode-dependent names. Each
macro below maps the plain name used in this file onto a name built with
PCRE2_SUFFIX, so that every inclusion defines a distinct set of symbols. */
#define OP_lengths PCRE2_SUFFIX(OP_lengths_)
#define get_ucpname PCRE2_SUFFIX(get_ucpname_)
#define pcre2_printint PCRE2_SUFFIX(pcre2_printint_)
#define print_char PCRE2_SUFFIX(print_char_)
#define print_custring PCRE2_SUFFIX(print_custring_)
#define print_prop PCRE2_SUFFIX(print_prop_)
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
the definition is next to the definition of the opcodes in pcre2_internal.h.
The contents of the table are, however, mode-dependent. The table is indexed by
opcode and is used to step from one item to the next in the compiled code. */
static const uint8_t OP_lengths[] = { OP_LENGTHS };
/*************************************************
* Print one character from a string *
*************************************************/
/* In UTF mode the character may occupy more than one code unit.
Arguments:
f file to write to
ptr pointer to first code unit of the character
utf TRUE if string is UTF (will be FALSE if UTF is not supported)
Returns: number of additional code units used
*/
static unsigned int
print_char(FILE *f, PCRE2_SPTR ptr, BOOL utf)
{
uint32_t c = *ptr;
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 8
int a, i, s;
#endif

/* From here on, "utf" means "this character needs the special UTF handling
below". If UTF is supported and requested, that is the case only when the first
code unit is not a complete character on its own:

  8-bit:   the lead byte of a multi-byte sequence (top two bits set);
  16-bit:  a high surrogate, which should be followed by a low surrogate;
  32-bit:  a surrogate code point, which is malformed UTF-32 and should only
           be seen if the sanity check is turned off.

Without UTF support every code unit is a complete character. */

#ifdef SUPPORT_UTF
if (utf)
  {
#if PCRE2_CODE_UNIT_WIDTH == 8
  utf = (c & 0xc0) == 0xc0;
#elif PCRE2_CODE_UNIT_WIDTH == 16
  utf = (c & 0xfc00) == 0xd800;
#else
  /* Only the surrogate range 0xd800-0xdfff is special in UTF-32; all other
  values are ordinary one-code-unit characters. */
  utf = (c & 0xfffff800u) == 0xd800u;
#endif
  }
#else
utf = FALSE;
#endif  /* SUPPORT_UTF */

/* Handle a one-code-unit character at any width. */

if (!utf)
  {
  if (PRINTABLE(c)) fprintf(f, "%c", (char)c);
  else if (c < 0x80) fprintf(f, "\\x%02x", c);
  else fprintf(f, "\\x{%02x}", c);
  return 0;
  }

/* Per-width code for handling non-one-code-unit UTF characters. */

#ifdef SUPPORT_UTF

/* Handle a multi-byte UTF-8 character. */

#if PCRE2_CODE_UNIT_WIDTH == 8
a = utf8_table4[c & 0x3f];  /* Number of additional bytes */
s = 6*a;
c = (c & utf8_table3[a]) << s;
for (i = 1; i <= a; i++)
  {
  /* This is a check for malformed UTF-8; it should only occur if the sanity
  check has been turned off. Rather than swallow random bytes, just stop if
  we hit a bad one. Print it with \X instead of \x as an indication. */

  if ((ptr[i] & 0xc0) != 0x80)
    {
    fprintf(f, "\\X{%x}", c);
    return i - 1;
    }

  /* The byte is OK */

  s -= 6;
  c |= (ptr[i] & 0x3f) << s;
  }
fprintf(f, "\\x{%x}", c);
return a;
#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */

/* Handle a multi-code-unit UTF-16 character, starting with a check for
malformed UTF-16; it should only occur if the sanity check has been turned off.
Rather than swallow a low surrogate, just stop if we hit a bad one. Print it
with \X instead of \x as an indication. */

#if PCRE2_CODE_UNIT_WIDTH == 16
if ((ptr[1] & 0xfc00) != 0xdc00)
  {
  fprintf(f, "\\X{%x}", c);
  return 0;
  }
c = (((c & 0x3ff) << 10) | (ptr[1] & 0x3ff)) + 0x10000;
fprintf(f, "\\x{%x}", c);
return 1;
#endif  /* PCRE2_CODE_UNIT_WIDTH == 16 */

/* For UTF-32 we get here only for a malformed code unit, which should only
occur if the sanity check has been turned off. Print it with \X instead of \x
as an indication. */

#if PCRE2_CODE_UNIT_WIDTH == 32
fprintf(f, "\\X{%x}", c);
return 0;
#endif  /* PCRE2_CODE_UNIT_WIDTH == 32 */

#else   /* SUPPORT_UTF */

/* Not reached, because utf was forced to FALSE above; this keeps every path
through a non-void function returning a value. */

return 0;
#endif  /* SUPPORT_UTF */
}
/*************************************************
* Print string as a list of code units *
*************************************************/
/* This takes no account of UTF as it always prints each individual code unit.
The string is zero-terminated.
Arguments:
f file to write to
ptr pointer to the string
Returns: nothing
*/
static void
print_custring(FILE *f, PCRE2_SPTR ptr)
{
/* Walk the string one code unit at a time up to the terminating zero. A
printable unit is written as itself; anything else as a braced hex escape. */

for (; *ptr != '\0'; ptr++)
  {
  uint32_t cu = *ptr;
  if (PRINTABLE(cu))
    {
    fprintf(f, "%c", cu);
    }
  else
    {
    fprintf(f, "\\x{%x}", cu);
    }
  }
}
/*************************************************
* Find Unicode property name *
*************************************************/
static const char *
get_ucpname(unsigned int ptype, unsigned int pvalue)
{
/* Search the property table from the last entry towards the first, returning
the name of the first entry found whose type and value both match. */

int idx = utt_size - 1;
while (idx >= 0)
  {
  if (ptype == utt[idx].type && pvalue == utt[idx].value)
    return utt_names + utt[idx].name_offset;
  idx--;
  }

/* No table entry matches this type/value pair. */

return "??";
}
/*************************************************
* Print Unicode property value *
*************************************************/
/* "Normal" properties can be printed from tables. The PT_CLIST property is a
pseudo-property that contains a pointer to a list of case-equivalent
characters.
Arguments:
f file to write to
code pointer in the compiled code
before text to print before
after text to print after
Returns: nothing
*/
static void
print_prop(FILE *f, PCRE2_SPTR code, const char *before, const char *after)
{
const uint32_t *cp;

/* An ordinary property: the opcode name followed by the property name that
is looked up from the type and value that follow the opcode. */

if (code[1] != PT_CLIST)
  {
  fprintf(f, "%s%s %s%s", before, OP_names[*code], get_ucpname(code[1],
    code[2]), after);
  return;
  }

/* The PT_CLIST pseudo-property: code[2] is an offset into ucd_caseless_sets,
where a list of characters is terminated by a value that is not less than
NOTACHAR. */

fprintf (f, "%s%sclist", before, (*code == OP_PROP)? "" : "not ");
for (cp = ucd_caseless_sets + code[2]; *cp < NOTACHAR; cp++)
  fprintf(f, " %04x", *cp);
fprintf(f, "%s", after);
}
/*************************************************
* Print compiled pattern *
*************************************************/
/* The print_lengths flag controls whether offsets and lengths of items are
printed. Lengths can be turned off from pcre2test so that automatic tests on
bytecode can be written that do not depend on the value of LINK_SIZE.
Arguments:
re a compiled pattern
f the file to write to
print_lengths show various lengths
Returns: nothing
*/
static void
pcre2_printint(pcre2_code *re, FILE *f, BOOL print_lengths)
{
PCRE2_SPTR codestart, nametable, code;
uint32_t options = re->compile_options;
size_t nesize = re->name_entry_size;
BOOL utf = (options & PCRE2_UTF) != 0;
/* The name table starts immediately after the fixed-size header structure,
and the compiled code follows the name table. The second calculation is
pointer arithmetic on PCRE2_SPTR, so it advances in code units. */
nametable = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code));
code = codestart = nametable + re->name_count * re->name_entry_size;
/* Each pass round this loop prints one item on one line. It is left only by
the return in the OP_END case. */
for(;;)
{
PCRE2_SPTR ccode;
uint32_t c;
const char *flag = " ";
/* "extra" counts code units used by this item over and above the fixed length
in OP_lengths[]; it is added in when stepping to the next item at the bottom of
the loop. */
unsigned int extra = 0;
/* Start the line with the offset of this item from the start of the code, or
with padding when lengths are not wanted. */
if (print_lengths)
fprintf(f, "%3d ", (int)(code - codestart));
else
fprintf(f, " ");
switch(*code)
{
/* ========================================================================== */
/* These cases are never obeyed. This is a fudge that causes a compile-
time error if the vectors OP_names or OP_lengths, which are indexed
by opcode, are not the correct length. It seems to be the only way to do
such a check at compile time, as the sizeof() operator does not work in
the C preprocessor. */
case OP_TABLE_LENGTH:
case OP_TABLE_LENGTH +
((sizeof(OP_names)/sizeof(const char *) == OP_TABLE_LENGTH) &&
(sizeof(OP_lengths) == OP_TABLE_LENGTH)):
break;
/* ========================================================================== */
case OP_END:
fprintf(f, " %s\n", OP_names[*code]);
fprintf(f, "------------------------------------------------------------------\n");
return;
/* A run of consecutive OP_CHAR items is printed as a single string on one
line. The pointer is advanced inside the loop, so "continue" is used to bypass
the generic step at the bottom of the main loop. */
case OP_CHAR:
fprintf(f, " ");
do
{
code++;
code += 1 + print_char(f, code, utf);
}
while (*code == OP_CHAR);
fprintf(f, "\n");
continue;
/* The same, for a run of caseless characters. */
case OP_CHARI:
fprintf(f, " /i ");
do
{
code++;
code += 1 + print_char(f, code, utf);
}
while (*code == OP_CHARI);
fprintf(f, "\n");
continue;
/* Capturing brackets: a link value follows the opcode, and the group number
follows the link. */
case OP_CBRA:
case OP_CBRAPOS:
case OP_SCBRA:
case OP_SCBRAPOS:
if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
else fprintf(f, " ");
fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE));
break;
/* Items that carry only a link value after the opcode. */
case OP_BRA:
case OP_BRAPOS:
case OP_SBRA:
case OP_SBRAPOS:
case OP_KETRMAX:
case OP_KETRMIN:
case OP_KETRPOS:
case OP_ALT:
case OP_KET:
case OP_ASSERT:
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
case OP_ONCE:
case OP_ONCE_NC:
case OP_COND:
case OP_SCOND:
case OP_REVERSE:
if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
else fprintf(f, " ");
fprintf(f, "%s", OP_names[*code]);
break;
case OP_CLOSE:
fprintf(f, " %s %d", OP_names[*code], GET2(code, 1));
break;
case OP_CREF:
fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
break;
/* References by name: the first 2-byte value is the index of an entry in the
name table, and the name itself starts IMM2_SIZE code units into the entry. The
second 2-byte value is printed after the name. */
case OP_DNCREF:
{
PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
fprintf(f, " %s Cond ref <", flag);
print_custring(f, entry);
fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
}
break;
case OP_RREF:
c = GET2(code, 1);
if (c == RREF_ANY)
fprintf(f, " Cond recurse any");
else
fprintf(f, " Cond recurse %d", c);
break;
case OP_DNRREF:
{
PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
fprintf(f, " %s Cond recurse <", flag);
print_custring(f, entry);
fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
}
break;
case OP_DEF:
fprintf(f, " Cond def");
break;
/* Single-item repeats without a count. The caseless variants only set the
flag before joining the common code. */
case OP_STARI:
case OP_MINSTARI:
case OP_POSSTARI:
case OP_PLUSI:
case OP_MINPLUSI:
case OP_POSPLUSI:
case OP_QUERYI:
case OP_MINQUERYI:
case OP_POSQUERYI:
flag = "/i";
/* Fall through */
case OP_STAR:
case OP_MINSTAR:
case OP_POSSTAR:
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
case OP_TYPEPOSSTAR:
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
case OP_TYPEPOSPLUS:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
case OP_TYPEPOSQUERY:
fprintf(f, " %s ", flag);
/* For opcodes from OP_TYPESTAR upwards the repeated item is itself an opcode
(a property opcode has two further data units); otherwise it is a character. */
if (*code >= OP_TYPESTAR)
{
if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
{
print_prop(f, code + 1, "", " ");
extra = 2;
}
else fprintf(f, "%s", OP_names[code[1]]);
}
else extra = print_char(f, code+1, utf);
fprintf(f, "%s", OP_names[*code]);
break;
/* Character repeats with a count: the 2-byte count follows the opcode and the
character follows the count. */
case OP_EXACTI:
case OP_UPTOI:
case OP_MINUPTOI:
case OP_POSUPTOI:
flag = "/i";
/* Fall through */
case OP_EXACT:
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
fprintf(f, " %s ", flag);
extra = print_char(f, code + 1 + IMM2_SIZE, utf);
fprintf(f, "{");
if (*code != OP_EXACT && *code != OP_EXACTI) fprintf(f, "0,");
fprintf(f, "%d}", GET2(code,1));
if (*code == OP_MINUPTO || *code == OP_MINUPTOI) fprintf(f, "?");
else if (*code == OP_POSUPTO || *code == OP_POSUPTOI) fprintf(f, "+");
break;
/* Type repeats with a count: as above, but the repeated item is an opcode. */
case OP_TYPEEXACT:
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
{
print_prop(f, code + IMM2_SIZE + 1, " ", " ");
extra = 2;
}
else fprintf(f, " %s", OP_names[code[1 + IMM2_SIZE]]);
fprintf(f, "{");
if (*code != OP_TYPEEXACT) fprintf(f, "0,");
fprintf(f, "%d}", GET2(code,1));
if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
break;
/* Negated single characters, shown as a one-character negated class. */
case OP_NOTI:
flag = "/i";
/* Fall through */
case OP_NOT:
fprintf(f, " %s [^", flag);
extra = print_char(f, code + 1, utf);
fprintf(f, "]");
break;
case OP_NOTSTARI:
case OP_NOTMINSTARI:
case OP_NOTPOSSTARI:
case OP_NOTPLUSI:
case OP_NOTMINPLUSI:
case OP_NOTPOSPLUSI:
case OP_NOTQUERYI:
case OP_NOTMINQUERYI:
case OP_NOTPOSQUERYI:
flag = "/i";
/* Fall through */
case OP_NOTSTAR:
case OP_NOTMINSTAR:
case OP_NOTPOSSTAR:
case OP_NOTPLUS:
case OP_NOTMINPLUS:
case OP_NOTPOSPLUS:
case OP_NOTQUERY:
case OP_NOTMINQUERY:
case OP_NOTPOSQUERY:
fprintf(f, " %s [^", flag);
extra = print_char(f, code + 1, utf);
fprintf(f, "]%s", OP_names[*code]);
break;
case OP_NOTEXACTI:
case OP_NOTUPTOI:
case OP_NOTMINUPTOI:
case OP_NOTPOSUPTOI:
flag = "/i";
/* Fall through */
case OP_NOTEXACT:
case OP_NOTUPTO:
case OP_NOTMINUPTO:
case OP_NOTPOSUPTO:
fprintf(f, " %s [^", flag);
extra = print_char(f, code + 1 + IMM2_SIZE, utf);
fprintf(f, "]{");
if (*code != OP_NOTEXACT && *code != OP_NOTEXACTI) fprintf(f, "0,");
fprintf(f, "%d}", GET2(code,1));
if (*code == OP_NOTMINUPTO || *code == OP_NOTMINUPTOI) fprintf(f, "?");
else
if (*code == OP_NOTPOSUPTO || *code == OP_NOTPOSUPTOI) fprintf(f, "+");
break;
case OP_RECURSE:
if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
else fprintf(f, " ");
fprintf(f, "%s", OP_names[*code]);
break;
/* Back references. After printing the reference, ccode is set to the item
that follows and control jumps into the class-handling block below, which
prints any repeat that is attached. */
case OP_REFI:
flag = "/i";
/* Fall through */
case OP_REF:
fprintf(f, " %s \\%d", flag, GET2(code,1));
ccode = code + OP_lengths[*code];
goto CLASS_REF_REPEAT;
case OP_DNREFI:
flag = "/i";
/* Fall through */
case OP_DNREF:
{
PCRE2_SPTR entry = nametable + (GET2(code, 1) * nesize) + IMM2_SIZE;
fprintf(f, " %s \\k<", flag);
print_custring(f, entry);
fprintf(f, ">%d", GET2(code, 1 + IMM2_SIZE));
}
ccode = code + OP_lengths[*code];
goto CLASS_REF_REPEAT;
case OP_CALLOUT:
fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2),
GET(code, 2 + LINK_SIZE));
break;
case OP_PROP:
case OP_NOTPROP:
print_prop(f, code, " ", "");
break;
/* OP_XCLASS cannot occur in 8-bit, non-UTF mode. However, there's no harm
in having this code always here, and it makes it less messy without all
those #ifdefs. */
case OP_CLASS:
case OP_NCLASS:
case OP_XCLASS:
{
int i;
unsigned int min, max;
BOOL printmap;
BOOL invertmap = FALSE;
uint8_t *map;
uint8_t inverted_map[32];
fprintf(f, " [");
/* For an extended class, the value after the opcode is taken as the extra
length of the item (NOTE(review): this relies on the OP_lengths[] entry for
OP_XCLASS not also counting it - confirm). A flags unit follows, which says
whether a bit map is present and whether the class is negated. */
if (*code == OP_XCLASS)
{
extra = GET(code, 1);
ccode = code + LINK_SIZE + 1;
printmap = (*ccode & XCL_MAP) != 0;
if ((*ccode & XCL_NOT) != 0)
{
invertmap = (*ccode & XCL_HASPROP) == 0;
fprintf(f, "^");
}
ccode++;
}
else
{
printmap = TRUE;
ccode = code + 1;
}
/* Print a bit map */
if (printmap)
{
map = (uint8_t *)ccode;
if (invertmap)
{
for (i = 0; i < 32; i++) inverted_map[i] = ~map[i];
map = inverted_map;
}
/* Scan the 256 bits. For each set bit, find the end of the run of set bits
that starts there and print the run as a single character, a pair, or a range,
escaping '-' and ']'. Setting i to j skips the rest of the run. */
for (i = 0; i < 256; i++)
{
if ((map[i/8] & (1 << (i&7))) != 0)
{
int j;
for (j = i+1; j < 256; j++)
if ((map[j/8] & (1 << (j&7))) == 0) break;
if (i == '-' || i == ']') fprintf(f, "\\");
if (PRINTABLE(i)) fprintf(f, "%c", i);
else fprintf(f, "\\x%02x", i);
if (--j > i)
{
if (j != i + 1) fprintf(f, "-");
if (j == '-' || j == ']') fprintf(f, "\\");
if (PRINTABLE(j)) fprintf(f, "%c", j);
else fprintf(f, "\\x%02x", j);
}
i = j;
}
}
/* The map occupies 32 bytes; step over it in code units. */
ccode += 32 / sizeof(PCRE2_UCHAR);
}
/* For an XCLASS there is always some additional data */
if (*code == OP_XCLASS)
{
PCRE2_UCHAR ch;
while ((ch = *ccode++) != XCL_END)
{
BOOL not = FALSE;
const char *notch = "";
switch(ch)
{
case XCL_NOTPROP:
not = TRUE;
notch = "^";
/* Fall through */
case XCL_PROP:
{
unsigned int ptype = *ccode++;
unsigned int pvalue = *ccode++;
switch(ptype)
{
case PT_PXGRAPH:
fprintf(f, "[:%sgraph:]", notch);
break;
case PT_PXPRINT:
fprintf(f, "[:%sprint:]", notch);
break;
case PT_PXPUNCT:
fprintf(f, "[:%spunct:]", notch);
break;
default:
fprintf(f, "\\%c{%s}", (not? 'P':'p'),
get_ucpname(ptype, pvalue));
break;
}
}
break;
/* Anything else is followed by one character, or by two characters when it
is a range. */
default:
ccode += 1 + print_char(f, ccode, utf);
if (ch == XCL_RANGE)
{
fprintf(f, "-");
ccode += 1 + print_char(f, ccode, utf);
}
break;
}
}
}
/* Indicate a non-UTF class which was created by negation */
fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
/* Handle repeats after a class or a back reference */
CLASS_REF_REPEAT:
/* At this point ccode points to the item following the class or reference. If
it is a repeat, it is printed on the same line and its length is added to
"extra" so that the main loop steps over it as well. */
switch(*ccode)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
case OP_CRPLUS:
case OP_CRMINPLUS:
case OP_CRQUERY:
case OP_CRMINQUERY:
case OP_CRPOSSTAR:
case OP_CRPOSPLUS:
case OP_CRPOSQUERY:
fprintf(f, "%s", OP_names[*ccode]);
extra += OP_lengths[*ccode];
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
case OP_CRPOSRANGE:
min = GET2(ccode,1);
max = GET2(ccode,1 + IMM2_SIZE);
if (max == 0) fprintf(f, "{%u,}", min);
else fprintf(f, "{%u,%u}", min, max);
if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
else if (*ccode == OP_CRPOSRANGE) fprintf(f, "+");
extra += OP_lengths[*ccode];
break;
/* Do nothing if it's not a repeat; this code stops picky compilers
warning about the lack of a default code path. */
default:
break;
}
}
break;
/* Verbs with an argument: the unit after the opcode is added to the item
length, and the zero-terminated argument string starts after that unit. */
case OP_MARK:
case OP_PRUNE_ARG:
case OP_SKIP_ARG:
case OP_THEN_ARG:
fprintf(f, " %s ", OP_names[*code]);
print_custring(f, code + 2);
extra += code[1];
break;
case OP_THEN:
fprintf(f, " %s", OP_names[*code]);
break;
case OP_CIRCM:
case OP_DOLLM:
flag = "/m";
/* Fall through */
/* Anything else is just an item with no data, but possibly a flag. */
default:
fprintf(f, " %s %s", flag, OP_names[*code]);
break;
}
/* Step to the next item: the fixed length for this opcode plus any additional
code units noted while printing it. */
code += OP_lengths[*code] + extra;
fprintf(f, "\n");
}
}
/* End of pcre2_printint.c */

80
src/pcre2_string_utils.c Normal file
View File

@ -0,0 +1,80 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains internal functions for comparing and finding the length
of strings. These are used instead of strcmp() etc because the standard
functions work only on 8-bit data. */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_internal.h"
/* FIXME: this module is incomplete */
/*************************************************
* Compare two strings *
*************************************************/
/*
Arguments:
str1 first string
str2 second string
Returns: 0, 1, or -1
*/
int
PRIV(strcmp)(PCRE2_SPTR str1, PCRE2_SPTR str2)
{
/* Step through both strings together until both are at their terminating
zero. The first position at which the code units differ decides the result;
that includes the case where only one of the strings has ended. */

for (; *str1 != '\0' || *str2 != '\0'; str1++, str2++)
  {
  PCRE2_UCHAR a = *str1;
  PCRE2_UCHAR b = *str2;
  if (a != b) return (a > b)? 1 : -1;
  }

/* Both strings ended together without any difference. */

return 0;
}
/* End of pcre2_string_utils.c */

View File

@ -46,8 +46,6 @@ POSSIBILITY OF SUCH DAMAGE.
#include "pcre2_internal.h"
/* FIXME: most of these are currently placeholder functions */
/*************************************************
* Copy named captured string to given buffer *
@ -75,7 +73,16 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_copy_byname(pcre2_match_data *match_data, PCRE2_SPTR stringname,
PCRE2_UCHAR *buffer, size_t size)
{
match_data=match_data;stringname=stringname;buffer=buffer;size=size;
PCRE2_SPTR first, last, entry;
int entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
&first, &last);
if (entrysize <= 0) return entrysize;
for (entry = first; entry <= last; entry += entrysize)
{
uint16_t n = GET2(entry, 0);
if (n < match_data->oveccount && match_data->ovector[n*2] != PCRE2_UNSET)
return pcre2_substring_copy_bynumber(match_data, n, buffer, size);
}
return PCRE2_ERROR_NOSUBSTRING;
}
@ -106,55 +113,17 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_copy_bynumber(pcre2_match_data *match_data, int stringnumber,
PCRE2_UCHAR *buffer, size_t size)
{
match_data=match_data;stringnumber=stringnumber;buffer=buffer;size=size;
return PCRE2_ERROR_NOSUBSTRING;
}
/*************************************************
* Free memory obtained by get_substring *
*************************************************/
/* This function exists for the benefit of people calling PCRE from non-C
programs that can call its functions, but not free() itself.
Arguments:
context points to a PCRE2 context
string the result of a previous pcre2_get_substring()
Returns: nothing
*/
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_substring_free(PCRE2_UCHAR *string)
{
string=string;
return;
}
/*************************************************
* Free memory obtained by get_substring_list *
*************************************************/
/* This function exists for the benefit of people calling PCRE from non-C
programs that can call its functions, but not free() itself.
Arguments:
context points to a PCRE2 context
list the result of a previous pcre2_get_substring_list()
Returns: nothing
*/
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_substring_list_free(PCRE2_SPTR *list)
{
list=list;
return;
size_t left, right;
size_t p = 0;
PCRE2_SPTR subject = match_data->subject;
if (stringnumber >= match_data->oveccount ||
(left = match_data->ovector[stringnumber*2]) == PCRE2_UNSET)
return PCRE2_ERROR_NOSUBSTRING;
right = match_data->ovector[stringnumber*2+1];
if (right - left + 1 > size) return PCRE2_ERROR_NOMEMORY;
while (left < right) buffer[p++] = subject[left++];
buffer[p] = 0;
return p;
}
@ -168,10 +137,9 @@ new memory. If the regex permits duplicate names, the first substring that is
set is chosen.
Arguments:
context points to a PCRE2 context
match_data pointer to match_data
stringname the name of the required substring
stringptr where to put the pointer
stringptr where to put the pointer to the new memory
Returns: if successful:
the length of the copied string, not including the zero
@ -185,7 +153,16 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_get_byname(pcre2_match_data *match_data,
PCRE2_SPTR stringname, PCRE2_UCHAR **stringptr)
{
match_data=match_data;stringname=stringname;stringptr=stringptr;
PCRE2_SPTR first, last, entry;
int entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
&first, &last);
if (entrysize <= 0) return entrysize;
for (entry = first; entry <= last; entry += entrysize)
{
uint16_t n = GET2(entry, 0);
if (n < match_data->oveccount && match_data->ovector[n*2] != PCRE2_UNSET)
return pcre2_substring_get_bynumber(match_data, n, stringptr);
}
return PCRE2_ERROR_NOSUBSTRING;
}
@ -199,10 +176,9 @@ return PCRE2_ERROR_NOSUBSTRING;
memory.
Arguments:
context points to a PCRE2 context
match_data points to match data
stringnumber the number of the required substring
stringptr where to put a pointer to the substring
stringptr where to put a pointer to the new memory
Returns: if successful:
the length of the string, not including the zero that
@ -216,9 +192,44 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_get_bynumber(pcre2_match_data *match_data, int stringnumber,
PCRE2_UCHAR **stringptr)
{
match_data=match_data;stringnumber=stringnumber;
stringptr=stringptr;
return PCRE2_ERROR_NOSUBSTRING;
size_t left, right;
size_t p = 0;
void *block;
PCRE2_UCHAR *yield;
PCRE2_SPTR subject = match_data->subject;
if (stringnumber >= match_data->oveccount ||
(left = match_data->ovector[stringnumber*2]) == PCRE2_UNSET)
return PCRE2_ERROR_NOSUBSTRING;
right = match_data->ovector[stringnumber*2+1];
block = PRIV(memctl_malloc)(sizeof(pcre2_memctl) +
(right-left+1)*PCRE2_CODE_UNIT_WIDTH, 0, &(match_data->memctl));
if (block == NULL) return PCRE2_ERROR_NOMEMORY;
yield = (PCRE2_UCHAR *)((char *)block + sizeof(pcre2_memctl));
while (left < right) yield[p++] = subject[left++];
yield[p] = 0;
*stringptr = yield;
return p;
}
/*************************************************
* Free memory obtained by get_substring *
*************************************************/
/*
Argument: the result of a previous pcre2_substring_get_byxxx()
Returns: nothing
*/
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_substring_free(PCRE2_UCHAR *string)
{
/* The string handed to the caller lives immediately after a pcre2_memctl
header in its memory block (see pcre2_substring_get_bynumber()). Step back over
the header to find the start of the block and the free function to use. A NULL
argument is ignored, as with free(), rather than being used to compute and
dereference an invalid header address. */

if (string != NULL)
  {
  pcre2_memctl *memctl = (pcre2_memctl *)((char *)string - sizeof(pcre2_memctl));
  memctl->free(memctl, memctl->memory_data);
  }
}
@ -242,7 +253,16 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_length_byname(pcre2_match_data *match_data,
PCRE2_SPTR stringname)
{
match_data=match_data;stringname=stringname;
PCRE2_SPTR first, last, entry;
int entrysize = pcre2_substring_nametable_scan(match_data->code, stringname,
&first, &last);
if (entrysize <= 0) return entrysize;
for (entry = first; entry <= last; entry += entrysize)
{
uint16_t n = GET2(entry, 0);
if (n < match_data->oveccount && match_data->ovector[n*2] != PCRE2_UNSET)
return pcre2_substring_length_bynumber(match_data, n);
}
return PCRE2_ERROR_NOSUBSTRING;
}
@ -266,8 +286,11 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_length_bynumber(pcre2_match_data *match_data,
int stringnumber)
{
match_data=match_data;stringnumber=stringnumber;
return PCRE2_ERROR_NOSUBSTRING;
/* NOTE(review): the two statements above look like the old placeholder body
carried along by this diff rendering - confirm they are absent from the real
file, since the unconditional return would make the code below unreachable. */

/* Fail if the group number is beyond the ovector or the group's start offset
is unset. NOTE(review): no explicit check for a negative stringnumber is
visible here - confirm. */
if (stringnumber >= match_data->oveccount ||
match_data->ovector[stringnumber*2] == PCRE2_UNSET)
return PCRE2_ERROR_NOSUBSTRING;
/* The length is the end offset minus the start offset of the pair. */
return match_data->ovector[stringnumber*2 + 1] -
match_data->ovector[stringnumber*2];
}
@ -278,48 +301,88 @@ return PCRE2_ERROR_NOSUBSTRING;
/* This function gets one chunk of memory and builds a list of pointers and all
the captured substrings in it. A NULL pointer is put on the end of the list.
The substrings are zero-terminated. In addition, if the final argument is
non-NULL, a list of lengths is returned, which allows binary data to be
handled.
Arguments:
context points to a PCRE2 context
match_data points to the match data
listptr set to point to the list of pointers
lengthsptr set to point to the list of lengths (may be NULL)
Returns: if successful: 0
if not successful, a negative error code:
PCRE2_ERROR_NOMEMORY: failed to get memory
PCRE2_ERROR_NOMEMORY: failed to get memory,
or a match failure code
*/
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr,
size_t **lengthsptr)
{
match_data=match_data;listptr=listptr;lengthsptr=lengthsptr;
return PCRE2_ERROR_NOMEMORY;
/* NOTE(review): the two statements above look like the old placeholder body
carried along by this diff rendering - confirm they are absent from the real
file, since the unconditional return would make the code below unreachable. */
int i, count, count2;
size_t size;
size_t *lensp, *ovector;
pcre2_memctl *memp;
PCRE2_UCHAR **listp;
PCRE2_UCHAR *sp;
/* A negative stored return code is handed back unchanged; otherwise it is
used as the number of ovector pairs to extract. */
if ((count = match_data->rc) < 0) return count;
count2 = 2*count;
ovector = match_data->ovector;
/* Work out the size of the single block: the memctl header, count+1 pointers
(the extra one is the terminating NULL), an optional array of count lengths,
and each substring plus its terminating zero. */
size = sizeof(pcre2_memctl) + sizeof(PCRE2_UCHAR *); /* For final NULL */
if (lengthsptr != NULL) size += sizeof(size_t) * count; /* For lengths */
for (i = 0; i < count2; i += 2)
size += sizeof(PCRE2_UCHAR *) + CU2BYTES(ovector[i+1] - ovector[i] + 1);
memp = PRIV(memctl_malloc)(size, 0, &(match_data->memctl));
if (memp == NULL) return PCRE2_ERROR_NOMEMORY;
/* Block layout after the header: pointer list, then (optionally) the lengths,
then the string data. */
*listptr = listp = (PCRE2_UCHAR **)((char *)memp + sizeof(pcre2_memctl));
lensp = (size_t *)((char *)listp + sizeof(PCRE2_UCHAR *) * (count + 1));
if (lengthsptr == NULL)
{
/* No lengths wanted: the string data starts where the lengths would have. */
sp = (PCRE2_UCHAR *)lensp;
lensp = NULL;
}
else
{
*lengthsptr = lensp;
sp = (PCRE2_UCHAR *)((char *)lensp + sizeof(size_t) * count);
}
/* Copy each substring, record its pointer (and length if requested), and add
a terminating zero. NOTE(review): no test for PCRE2_UNSET offsets is visible in
this loop or in the size calculation - confirm how unset groups are meant to be
handled. */
for (i = 0; i < count2; i += 2)
{
size = ovector[i+1] - ovector[i];
memcpy(sp, match_data->subject + ovector[i], CU2BYTES(size));
*listp++ = sp;
if (lensp != NULL) *lensp++ = size;
sp += size;
*sp++ = 0;
}
*listp = NULL;
return 0;
}
/*************************************************
* Find number for named string *
* Free memory obtained by substring_list_get *
*************************************************/
/* This function is used by the local get_first_set() function, as well
as being generally available. It assumes that names are unique.
Arguments:
code the compiled regex
stringname the name whose number is required
Returns: the number of the named parentheses, or a negative number
(PCRE2_ERROR_NOSUBSTRING) if not found
/*
Argument: the result of a previous pcre2_substring_list_get()
Returns: nothing
*/
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_number_from_name(const pcre2_code *code,
PCRE2_SPTR stringname)
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_substring_list_free(PCRE2_SPTR *list)
{
code=code;stringname=stringname;
return PCRE2_ERROR_NOSUBSTRING;
/* NOTE(review): the two statements above refer to names that are not
parameters of this function; they look like the removed placeholder body of
another function carried along by this diff rendering - confirm they are absent
from the real file. */

/* The list pointer is taken to sit immediately after a pcre2_memctl header, as
laid out by pcre2_substring_list_get(); step back over the header and release
the whole block with the free function stored in it. NOTE(review): the
parameter type here is PCRE2_SPTR *, while pcre2_substring_list_get() hands
back a PCRE2_UCHAR ** - confirm the intended type. */
pcre2_memctl *memctl = (pcre2_memctl *)((char *)list - sizeof(pcre2_memctl));
memctl->free(memctl, memctl->memory_data);
}
@ -328,8 +391,10 @@ return PCRE2_ERROR_NOSUBSTRING;
* Find (multiple) entries for named string *
*************************************************/
/* This is used by the local get_first_set() function, as well as being
generally available. It is used when duplicated names are permitted.
/* This function scans the nametable for a given name, using binary chop. It
returns either two pointers to the entries in the table, or, if no pointers are
given, the number of a group with the given name. If duplicate names are
permitted, this may not be unique.
Arguments:
code the compiled regex
@ -337,17 +402,73 @@ Arguments:
firstptr where to put the pointer to the first entry
lastptr where to put the pointer to the last entry
Returns: the length of each entry, or a negative number
Returns: if firstptr and lastptr are NULL, a group number;
otherwise, the length of each entry, or a negative number
(PCRE2_ERROR_NOSUBSTRING) if not found
*/
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR stringname,
PCRE2_UCHAR **firstptr, PCRE2_UCHAR **lastptr)
PCRE2_SPTR *firstptr, PCRE2_SPTR *lastptr)
{
code=code;stringname=stringname;firstptr=firstptr;lastptr=lastptr;
/* NOTE(review): two parameter lines and a self-assignment line appear above;
this looks like old and new text interleaved by the diff rendering - confirm
that only the PCRE2_SPTR * signature remains in the real file. */
uint16_t bot = 0;
uint16_t top = code->name_count;
uint16_t entrysize = code->name_entry_size;
/* The name table is addressed as starting directly after the fixed-size
pcre2_real_code structure. */
PCRE2_SPTR nametable = (PCRE2_SPTR)((char *)code + sizeof(pcre2_real_code));
/* Binary chop over the half-open range [bot, top) of table entries. Each entry
is entrysize code units: a group number read with GET2(), then the name
starting IMM2_SIZE code units in. */
while (top > bot)
{
uint16_t mid = (top + bot) / 2;
PCRE2_SPTR entry = nametable + entrysize*mid;
int c = PRIV(strcmp)(stringname, entry + IMM2_SIZE);
if (c == 0)
{
PCRE2_SPTR first, last, lastentry;
/* Without output pointers, just return the number from the entry that the
chop happened to land on. Only firstptr is tested for NULL here. */
if (firstptr == NULL) return GET2(entry, 0);
lastentry = nametable + entrysize * (code->name_count - 1);
first = last = entry;
/* Entries with the same name are adjacent: widen the range downwards and then
upwards while the neighbouring entry still compares equal. */
while (first > nametable)
{
if (PRIV(strcmp)(stringname, (first - entrysize + IMM2_SIZE)) != 0) break;
first -= entrysize;
}
while (last < lastentry)
{
if (PRIV(strcmp)(stringname, (last + entrysize + IMM2_SIZE)) != 0) break;
last += entrysize;
}
*firstptr = first;
*lastptr = last;
return entrysize;
}
/* Not found at mid: continue in the upper or the lower half. */
if (c > 0) bot = mid + 1; else top = mid;
}
return PCRE2_ERROR_NOSUBSTRING;
}
/*************************************************
* Find number for named string *
*************************************************/
/* This function is a convenience wrapper for pcre2_substring_nametable_scan()
when it is known that names are unique. If there are duplicate names, it is not
defined which number is returned.
Arguments:
code the compiled regex
stringname the name whose number is required
Returns: the number of the named parenthesis, or a negative number
(PCRE2_ERROR_NOSUBSTRING) if not found
*/
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_number_from_name(const pcre2_code *code,
  PCRE2_SPTR stringname)
{
/* Passing NULL for both range pointers asks the scanner for a group number
(or a negative error code) instead of a pair of table entries. */

int groupnumber = pcre2_substring_nametable_scan(code, stringname, NULL, NULL);
return groupnumber;
}
/* End of pcre2_substring.c */

View File

@ -38,34 +38,33 @@ POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
#ifndef PCRE2_INCLUDED
/* This module contains some fixed tables that are used by more than one of the
PCRE code modules. The tables are also #included by the pcre2test program,
which uses macros to change their names from _pcre2_xxx to xxxx, thereby
avoiding name clashes with the library. */
avoiding name clashes with the library. In this case, PCRE2_INCLUDED is
defined. */
#ifndef PCRE2_INCLUDED /* We're compiling the library */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_internal.h"
#endif /* PCRE2_INCLUDED */
#ifdef FIXME
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
the definition is next to the definition of the opcodes in pcre2_internal.h. */
the definition is next to the definition of the opcodes in pcre2_internal.h.
This is mode-dependent, so is skipped when this file is included by pcre2test. */
#ifndef PCRE2_INCLUDED
const uint8_t PRIV(OP_lengths)[] = { OP_LENGTHS };
#endif
/* Tables of horizontal and vertical whitespace characters, suitable for
adding to classes. */
const uint32_t PRIV(hspace_list)[] = { HSPACE_LIST };
const uint32_t PRIV(vspace_list)[] = { VSPACE_LIST };
#endif /* FIXME */
/*************************************************
@ -103,8 +102,6 @@ const uint8_t PRIV(utf8_table4)[] = {
#endif /* (SUPPORT_UTF && COMPILE_PCRE8) || (PCRE2_INCLUDED && SUPPORT_PCRE[16|32])*/
#ifdef FIXME
#ifdef SUPPORT_UTF
/* Table to translate from particular type value to the general value. */
@ -122,9 +119,9 @@ const uint32_t PRIV(ucp_gentype)[] = {
/* This table encodes the rules for finding the end of an extended grapheme
cluster. Every code point has a grapheme break property which is one of the
ucp_gbXX values defined in ucp.h. The 2-dimensional table is indexed by the
properties of two adjacent code points. The left property selects a word from
the table, and the right property selects a bit from that word like this:
ucp_gbXX values defined in pcre2_ucp.h. The 2-dimensional table is indexed by
the properties of two adjacent code points. The left property selects a word
from the table, and the right property selects a bit from that word like this:
ucp_gbtable[left-property] & (1 << right-property)
@ -660,6 +657,4 @@ const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
#endif /* SUPPORT_UTF */
#endif /* FIXME */
/* End of pcre2_tables.c */

View File

@ -2,28 +2,29 @@
Do not modify it by hand. Instead modify the script and run it
to regenerate this code.
As well as being part of the PCRE library, this module is #included
by the pcretest program, which redefines the PRIV macro to change
table names from _pcre_xxx to xxxx, thereby avoiding name clashes
As well as being part of the PCRE2 library, this module is #included
by the pcre2test program, which redefines the PRIV macro to change
table names from _pcre2_xxx to xxxx, thereby avoiding name clashes
with the library. At present, just one of these tables is actually
needed. */
#ifndef PCRE_INCLUDED
#ifndef PCRE2_INCLUDED
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre_internal.h"
#include "pcre2_internal.h"
#endif /* PCRE_INCLUDED */
#endif /* PCRE2_INCLUDED */
/* Unicode character database. */
/* This file was autogenerated by the MultiStage2.py script. */
/* Total size: 65688 bytes, block size: 128. */
/* The tables herein are needed only when UCP support is built
into PCRE. This module should not be referenced otherwise, so
/* The tables herein are needed only when UCP support is built,
and in PCRE2 that happens automatically with UTF support.
This module should not be referenced otherwise, so
it should not matter whether it is compiled or not. However
a comment was received about space saving - maybe the guy linked
all the modules rather than using a library - so we include a
@ -31,28 +32,28 @@ condition to cut out the tables when not needed. But don't leave
a totally empty module because some compilers barf at that.
Instead, just supply small dummy tables. */
#ifndef SUPPORT_UCP
#ifndef SUPPORT_UTF
const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};
const pcre_uint8 PRIV(ucd_stage1)[] = {0};
const pcre_uint16 PRIV(ucd_stage2)[] = {0};
const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};
const uint8_t PRIV(ucd_stage1)[] = {0};
const uint16_t PRIV(ucd_stage2)[] = {0};
const uint32_t PRIV(ucd_caseless_sets)[] = {0};
#else
/* When recompiling tables with a new Unicode version, please check the
types in this structure definition from pcre_internal.h (the actual
types in this structure definition from pcre2_internal.h (the actual
field names will be different):
typedef struct {
pcre_uint8 property_0;
pcre_uint8 property_1;
pcre_uint8 property_2;
pcre_uint8 property_3;
uint8_t property_0;
uint8_t property_1;
uint8_t property_2;
uint8_t property_3;
int32_t property_4;
} ucd_record;
*/
const pcre_uint32 PRIV(ucd_caseless_sets)[] = {
const uint32_t PRIV(ucd_caseless_sets)[] = {
NOTACHAR,
0x0053, 0x0073, 0x017f, NOTACHAR,
0x01c4, 0x01c5, 0x01c6, NOTACHAR,
@ -75,9 +76,9 @@ const pcre_uint32 PRIV(ucd_caseless_sets)[] = {
0x00c5, 0x00e5, 0x212b, NOTACHAR,
};
/* When #included in pcretest, we don't need this large table. */
/* When #included in pcre2test, we don't need this large table. */
#ifndef PCRE_INCLUDED
#ifndef PCRE2_INCLUDED
const ucd_record PRIV(ucd_records)[] = { /* 5016 bytes, record size 8 */
{ 9, 0, 2, 0, 0, }, /* 0 */
@ -709,7 +710,7 @@ const ucd_record PRIV(ucd_records)[] = { /* 5016 bytes, record size 8 */
{ 26, 26, 12, 0, 0, }, /* 626 */
};
const pcre_uint8 PRIV(ucd_stage1)[] = { /* 8704 bytes */
const uint8_t PRIV(ucd_stage1)[] = { /* 8704 bytes */
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* U+0000 */
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, /* U+0800 */
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 41, 41, 42, 43, 44, 45, /* U+1000 */
@ -1256,7 +1257,7 @@ const pcre_uint8 PRIV(ucd_stage1)[] = { /* 8704 bytes */
123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,202, /* U+10F800 */
};
const pcre_uint16 PRIV(ucd_stage2)[] = { /* 51968 bytes, block = 128 */
const uint16_t PRIV(ucd_stage2)[] = { /* 51968 bytes, block = 128 */
/* block 0 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@ -3290,8 +3291,8 @@ const pcre_uint16 PRIV(ucd_stage2)[] = { /* 51968 bytes, block = 128 */
};
#if UCD_BLOCK_SIZE != 128
#error Please correct UCD_BLOCK_SIZE in pcre_internal.h
#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
#endif
#endif /* SUPPORT_UCP */
#endif /* SUPPORT_UTF */
#endif /* PCRE_INCLUDED */
#endif /* PCRE2_INCLUDED */

237
src/pcre2_ucp.h Normal file
View File

@ -0,0 +1,237 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* Include guard. The name deliberately avoids a leading underscore followed by
an upper case letter, because identifiers of that form are reserved for the C
implementation. */
#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
#define PCRE2_UCP_H_IDEMPOTENT_GUARD
/* This file contains definitions of the property values that are returned by
the UCD access macros. New values that are added for new releases of Unicode
should always be at the end of each enum, for backwards compatibility.
IMPORTANT: Note also that the specific numeric values of the enums have to be
the same as the values that are generated by the maint/MultiStage2.py script,
where the equivalent property descriptive names are listed in vectors.
ALSO: The specific values of the first two enums are assumed for the table
called catposstab in pcre2_compile.c. */
/* These are the general character categories. */
/* General categories, with the numeric values spelled out because other
tables depend on them. */
enum {
  ucp_C = 0,    /* Other */
  ucp_L = 1,    /* Letter */
  ucp_M = 2,    /* Mark */
  ucp_N = 3,    /* Number */
  ucp_P = 4,    /* Punctuation */
  ucp_S = 5,    /* Symbol */
  ucp_Z = 6     /* Separator */
};
/* These are the particular character categories. */
/* Particular categories, with the numeric values spelled out because other
tables depend on them. */
enum {
  ucp_Cc = 0,    /* Control */
  ucp_Cf = 1,    /* Format */
  ucp_Cn = 2,    /* Unassigned */
  ucp_Co = 3,    /* Private use */
  ucp_Cs = 4,    /* Surrogate */
  ucp_Ll = 5,    /* Lower case letter */
  ucp_Lm = 6,    /* Modifier letter */
  ucp_Lo = 7,    /* Other letter */
  ucp_Lt = 8,    /* Title case letter */
  ucp_Lu = 9,    /* Upper case letter */
  ucp_Mc = 10,   /* Spacing mark */
  ucp_Me = 11,   /* Enclosing mark */
  ucp_Mn = 12,   /* Non-spacing mark */
  ucp_Nd = 13,   /* Decimal number */
  ucp_Nl = 14,   /* Letter number */
  ucp_No = 15,   /* Other number */
  ucp_Pc = 16,   /* Connector punctuation */
  ucp_Pd = 17,   /* Dash punctuation */
  ucp_Pe = 18,   /* Close punctuation */
  ucp_Pf = 19,   /* Final punctuation */
  ucp_Pi = 20,   /* Initial punctuation */
  ucp_Po = 21,   /* Other punctuation */
  ucp_Ps = 22,   /* Open punctuation */
  ucp_Sc = 23,   /* Currency symbol */
  ucp_Sk = 24,   /* Modifier symbol */
  ucp_Sm = 25,   /* Mathematical symbol */
  ucp_So = 26,   /* Other symbol */
  ucp_Zl = 27,   /* Line separator */
  ucp_Zp = 28,   /* Paragraph separator */
  ucp_Zs = 29    /* Space separator */
};
/* These are grapheme break properties. Note that the code for processing them
assumes that the values are less than 16. If more values are added that take
the number to 16 or more, the code will have to be rewritten. */
/* Grapheme break properties; the values are written out explicitly instead of
being noted in comments. */
enum {
  ucp_gbCR                = 0,
  ucp_gbLF                = 1,
  ucp_gbControl           = 2,
  ucp_gbExtend            = 3,
  ucp_gbPrepend           = 4,
  ucp_gbSpacingMark       = 5,
  ucp_gbL                 = 6,    /* Hangul syllable type L */
  ucp_gbV                 = 7,    /* Hangul syllable type V */
  ucp_gbT                 = 8,    /* Hangul syllable type T */
  ucp_gbLV                = 9,    /* Hangul syllable type LV */
  ucp_gbLVT               = 10,   /* Hangul syllable type LVT */
  ucp_gbRegionalIndicator = 11,
  ucp_gbOther             = 12
};
/* These are the script identifications. */
enum {
/* No explicit values are given, so the enumerators number consecutively from
zero in the order listed. Inserting or reordering entries would therefore
change every later script number; new scripts are appended at the end, grouped
under the Unicode release that introduced them. */
ucp_Arabic,
ucp_Armenian,
ucp_Bengali,
ucp_Bopomofo,
ucp_Braille,
ucp_Buginese,
ucp_Buhid,
ucp_Canadian_Aboriginal,
ucp_Cherokee,
ucp_Common,
ucp_Coptic,
ucp_Cypriot,
ucp_Cyrillic,
ucp_Deseret,
ucp_Devanagari,
ucp_Ethiopic,
ucp_Georgian,
ucp_Glagolitic,
ucp_Gothic,
ucp_Greek,
ucp_Gujarati,
ucp_Gurmukhi,
ucp_Han,
ucp_Hangul,
ucp_Hanunoo,
ucp_Hebrew,
ucp_Hiragana,
ucp_Inherited,
ucp_Kannada,
ucp_Katakana,
ucp_Kharoshthi,
ucp_Khmer,
ucp_Lao,
ucp_Latin,
ucp_Limbu,
ucp_Linear_B,
ucp_Malayalam,
ucp_Mongolian,
ucp_Myanmar,
ucp_New_Tai_Lue,
ucp_Ogham,
ucp_Old_Italic,
ucp_Old_Persian,
ucp_Oriya,
ucp_Osmanya,
ucp_Runic,
ucp_Shavian,
ucp_Sinhala,
ucp_Syloti_Nagri,
ucp_Syriac,
ucp_Tagalog,
ucp_Tagbanwa,
ucp_Tai_Le,
ucp_Tamil,
ucp_Telugu,
ucp_Thaana,
ucp_Thai,
ucp_Tibetan,
ucp_Tifinagh,
ucp_Ugaritic,
ucp_Yi,
/* New for Unicode 5.0: */
ucp_Balinese,
ucp_Cuneiform,
ucp_Nko,
ucp_Phags_Pa,
ucp_Phoenician,
/* New for Unicode 5.1: */
ucp_Carian,
ucp_Cham,
ucp_Kayah_Li,
ucp_Lepcha,
ucp_Lycian,
ucp_Lydian,
ucp_Ol_Chiki,
ucp_Rejang,
ucp_Saurashtra,
ucp_Sundanese,
ucp_Vai,
/* New for Unicode 5.2: */
ucp_Avestan,
ucp_Bamum,
ucp_Egyptian_Hieroglyphs,
ucp_Imperial_Aramaic,
ucp_Inscriptional_Pahlavi,
ucp_Inscriptional_Parthian,
ucp_Javanese,
ucp_Kaithi,
ucp_Lisu,
ucp_Meetei_Mayek,
ucp_Old_South_Arabian,
ucp_Old_Turkic,
ucp_Samaritan,
ucp_Tai_Tham,
ucp_Tai_Viet,
/* New for Unicode 6.0.0: */
ucp_Batak,
ucp_Brahmi,
ucp_Mandaic,
/* New for Unicode 6.1.0: */
ucp_Chakma,
ucp_Meroitic_Cursive,
ucp_Meroitic_Hieroglyphs,
ucp_Miao,
ucp_Sharada,
ucp_Sora_Sompeng,
ucp_Takri
};
#endif
/* End of pcre2_ucp.h */

File diff suppressed because it is too large Load Diff