Implement --never-backslash-C

This commit is contained in:
Philip.Hazel 2015-10-17 13:50:56 +00:00
parent 5923caf05e
commit 3263d44b97
58 changed files with 2060 additions and 1479 deletions

View File

@ -70,6 +70,7 @@
# 2015-04-24 PH added support for PCRE2_DEBUG
# 2015-07-16 PH updated for new pcre2_find_bracket source module
# 2015-08-24 PH correct C_FLAGS setting (patch from Roy Ivy III)
# 2015-10-16 PH added support for never-backslash-C
PROJECT(PCRE2 C)
@ -162,6 +163,9 @@ SET(PCRE2_SUPPORT_UNICODE ON CACHE BOOL
SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
"ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks")
# User-visible cache option, OFF by default. Turning it ON requests a build in
# which the \C escape (upper case C) is locked out.
SET(PCRE2_NEVER_BACKSLASH_C OFF CACHE BOOL
"If ON, backslash-C (upper case C) is locked out.")
SET(PCRE2_SUPPORT_VALGRIND OFF CACHE BOOL
"Enable Valgrind support.")
@ -252,6 +256,10 @@ IF(PCRE2_SUPPORT_BSR_ANYCRLF)
SET(BSR_ANYCRLF 1)
ENDIF(PCRE2_SUPPORT_BSR_ANYCRLF)
# Map the user-facing PCRE2_NEVER_BACKSLASH_C option onto the NEVER_BACKSLASH_C
# variable. It is left unset when the option is OFF, so a "#cmakedefine
# NEVER_BACKSLASH_C 1" template line only produces a #define when it is ON.
IF(PCRE2_NEVER_BACKSLASH_C)
SET(NEVER_BACKSLASH_C 1)
ENDIF(PCRE2_NEVER_BACKSLASH_C)
IF(PCRE2_SUPPORT_UNICODE)
SET(SUPPORT_UNICODE 1)
ENDIF(PCRE2_SUPPORT_UNICODE)
@ -719,6 +727,7 @@ IF(PCRE2_SHOW_REPORT)
MESSAGE(STATUS " Enable Unicode support .......... : ${PCRE2_SUPPORT_UNICODE}")
MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE2_NEWLINE}")
MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE2_SUPPORT_BSR_ANYCRLF}")
MESSAGE(STATUS " \\C is disabled .................. : ${PCRE2_NEVER_BACKSLASH_C}")
MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE2_EBCDIC}")
MESSAGE(STATUS " EBCDIC coding with NL=0x25 ...... : ${PCRE2_EBCDIC_NL25}")
MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE2_REBUILD_CHARTABLES}")

View File

@ -201,6 +201,8 @@ escape was being ignored.
57. Fixed integer overflow for patterns whose minimum matching length is very,
very large.
58. Implemented --never-backslash-C.
Version 10.20 30-June-2015
--------------------------

9
README
View File

@ -220,6 +220,13 @@ library. They are also documented in the pcre2build man page.
restrict \R to match only CR, LF, or CRLF. You can make this the default by
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
. In a pattern, the escape sequence \C matches a single code unit, even in a
UTF mode. This can be dangerous because it breaks up multi-code-unit
characters. You can build PCRE2 with the use of \C permanently locked out by
adding --enable-never-backslash-C (note the upper case C) to the "configure"
command. When \C is allowed by the library, individual applications can lock
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
pattern. This limits the amount of system stack that a pattern uses when it
is compiled. The default is 250, but you can change it by setting, for
@ -833,4 +840,4 @@ The distribution should contain the files listed below.
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
Last updated: 16 July 2015
Last updated: 16 October 2015

64
RunTest
View File

@ -75,7 +75,10 @@ title17="Test 17: JIT-specific features when JIT is available"
title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP"
title19="Test 19: Tests of the POSIX interface with UTF/UCP"
title20="Test 20: Serialization tests"
maxtest=20
title21="Test 21: \C tests without UTF (supported for DFA matching)"
title22="Test 22: \C tests with UTF (not supported for DFA matching)"
title23="Test 23: \C disabled test"
maxtest=23
if [ $# -eq 1 -a "$1" = "list" ]; then
echo $title0
@ -99,6 +102,9 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
echo $title18
echo $title19
echo $title20
echo $title21
echo $title22
echo $title23
exit 0
fi
@ -223,6 +229,9 @@ do17=no
do18=no
do19=no
do20=no
do21=no
do22=no
do23=no
while [ $# -gt 0 ] ; do
case $1 in
@ -247,6 +256,9 @@ while [ $# -gt 0 ] ; do
18) do18=yes;;
19) do19=yes;;
20) do20=yes;;
21) do21=yes;;
22) do22=yes;;
23) do23=yes;;
-8) arg8=yes;;
-16) arg16=yes;;
-32) arg32=yes;;
@ -326,6 +338,11 @@ support16=$?
$sim ./pcre2test -C pcre2-32 >/dev/null
support32=$?
# \C may be disabled
$sim ./pcre2test -C backslash-C >/dev/null
supportBSC=$?
# Initialize all bitsizes skipped
test8=skip
@ -400,7 +417,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
$do8 = no -a $do9 = no -a $do10 = no -a $do11 = no -a \
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
$do20 = no \
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no \
]; then
do0=yes
do1=yes
@ -423,6 +440,9 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
do18=yes
do19=yes
do20=yes
do21=yes
do22=yes
do23=yes
fi
# Handle any explicit skips at this stage, so that an argument list may consist
@ -781,6 +801,46 @@ for bmode in "$test8" "$test16" "$test32"; do
checkresult $? 20 ""
fi
# \C tests without UTF - DFA matching is supported, so testinput21 is run once
# per entry in the option list: no option, whatever $jitopt expands to, and
# -dfa. supportBSC holds the exit status of "pcre2test -C backslash-C"; zero is
# treated here as "\C is disabled", in which case the test is skipped.
if [ "$do21" = yes ] ; then
echo $title21
if [ $supportBSC -eq 0 ] ; then
echo " Skipped because \C is disabled"
else
for opt in "" $jitopt -dfa; do
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput21 testtry
checkresult $? 21 "$opt"
done
fi
fi
# \C tests with UTF - DFA matching is not supported for \C in UTF mode, so the
# option list has no -dfa entry: testinput22 is run with no option and with
# whatever $jitopt expands to. The result is checked against a bit-size
# specific name (22-$bits). Skipped when supportBSC is zero, which is treated
# here as "\C is disabled".
if [ "$do22" = yes ] ; then
echo $title22
if [ $supportBSC -eq 0 ] ; then
echo " Skipped because \C is disabled"
else
for opt in "" $jitopt; do
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput22 testtry
checkresult $? 22-$bits "$opt"
done
fi
fi
# Test when \C is disabled. This is the inverse of the other \C tests: it runs
# only when supportBSC is zero (treated as "\C is disabled") and is skipped
# otherwise. It is run exactly once per bit mode with no extra pcre2test
# option. $opt is deliberately not used on the command line: there is no option
# loop in this block, so $opt would only be a stale value left behind by
# whichever earlier test last iterated over options, which would contradict the
# empty option string reported to checkresult.
if [ "$do23" = yes ] ; then
  echo $title23
  if [ $supportBSC -ne 0 ] ; then
    echo " Skipped because \C is not disabled"
  else
    $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput23 testtry
    checkresult $? 23 ""
  fi
fi
# End of loop for 8/16/32-bit tests
done

View File

@ -13,11 +13,10 @@
@rem line. Added argument validation and added error reporting.
@rem
@rem Sheri Pierce added logic to skip feature dependent tests
@rem tests 4 5 9 15 and 18 require utf support
@rem tests 6 7 10 16 and 19 require ucp support
@rem 11 requires ucp and link size 2
@rem 12 requires presence of jit support
@rem 13 requires absence of jit support
@rem tests 4 5 7 10 12 14 19 and 22 require Unicode support
@rem 8 requires Unicode and link size 2
@rem 16 requires absence of jit support
@rem 17 requires presence of jit support
@rem Sheri P also added override tests for study and jit testing
@rem Zoltan Herczeg added libpcre16 support
@rem Zoltan Herczeg added libpcre32 support
@ -25,6 +24,7 @@
@rem
@rem The file was converted for PCRE2 by PH, February 2015.
@rem Updated for new test 14 (moving others up a number), August 2015.
@rem Tidied and updated for new tests 21, 22, 23 by PH, October 2015.
setlocal enabledelayedexpansion
@ -65,6 +65,8 @@ set support32=%ERRORLEVEL%
set unicode=%ERRORLEVEL%
%pcre2test% -C jit >NUL
set jit=%ERRORLEVEL%
%pcre2test% -C backslash-C >NUL
set supportBSC=%ERRORLEVEL%
if %support8% EQU 1 (
if not exist testout8 md testout8
@ -101,18 +103,21 @@ set do17=no
set do18=no
set do19=no
set do20=no
set do21=no
set do22=no
set do23=no
set all=yes
for %%a in (%*) do (
set valid=no
for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20) do if %%v == %%a set valid=yes
for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23) do if %%v == %%a set valid=yes
if "!valid!" == "yes" (
set do%%a=yes
set all=no
) else (
echo Invalid test number - %%a!
echo Usage %0 [ test_number ] ...
echo Where test_number is one or more optional test numbers 1 through 20, default is all tests.
echo Where test_number is one or more optional test numbers 1 through 23, default is all tests.
exit /b 1
)
)
@ -139,6 +144,9 @@ if "%all%" == "yes" (
set do18=yes
set do19=yes
set do20=yes
set do21=yes
set do22=yes
set do23=yes
)
@echo RunTest.bat's pcre2test output is written to newly created subfolders
@ -187,6 +195,9 @@ if "%do17%" == "yes" call :do17
if "%do18%" == "yes" call :do18
if "%do19%" == "yes" call :do19
if "%do20%" == "yes" call :do20
if "%do21%" == "yes" call :do21
if "%do22%" == "yes" call :do22
if "%do23%" == "yes" call :do23
:modeSkip
if "%mode%" == "" (
set mode=-16
@ -323,7 +334,7 @@ if %unicode% EQU 0 (
goto :eof
:do6
call :runsub 6 testout "DFA matching main non-UTF, non-UCP functionality" -q -dfa
call :runsub 6 testout "DFA matching main non-UTF, non-UCP functionality" -q
goto :eof
:do7
@ -331,7 +342,7 @@ if %unicode% EQU 0 (
echo Test 7 Skipped due to absence of Unicode support.
goto :eof
)
call :runsub 7 testout "DFA matching with UTF-%bits% and Unicode property support" -q -dfa
call :runsub 7 testout "DFA matching with UTF-%bits% and Unicode property support" -q
goto :eof
:do8
@ -395,10 +406,14 @@ if %bits% EQU 8 (
echo Test 13 Skipped when running 8-bit tests.
goto :eof
)
call :runsub 13 testout "DFA specials for the basic 16/32-bit library" -q -dfa
call :runsub 13 testout "DFA specials for the basic 16/32-bit library" -q
goto :eof
:do14
if %unicode% EQU 0 (
echo Test 14 Skipped due to absence of Unicode support.
goto :eof
)
call :runsub 14 testout "DFA specials for UTF and UCP support" -q
goto :eof
@ -442,6 +457,10 @@ if %bits% EQU 16 (
if %bits% EQU 32 (
echo Test 19 Skipped when running 32-bit tests.
goto :eof
)
if %unicode% EQU 0 (
echo Test 19 Skipped due to absence of Unicode support.
goto :eof
)
call :runsub 19 testout "POSIX interface with UTF-8 and UCP" -q
goto :eof
@ -450,6 +469,37 @@ goto :eof
call :runsub 20 testout "Serialization tests" -q
goto :eof
@rem Test 21: backslash-C tests without UTF. Skipped when supportBSC (the
@rem ERRORLEVEL saved after running pcre2test with -C backslash-C) is 0.
@rem Otherwise :runsub is called for a plain run, for a -dfa run, and, only
@rem when jit is 1, for a -jit run that uses testoutjit instead of testout.
:do21
if %supportBSC% EQU 0 (
echo Test 21 Skipped due to absence of backslash-C support.
goto :eof
)
call :runsub 21 testout "Backslash-C tests without UTF" -q
call :runsub 21 testout "Backslash-C tests without UTF (DFA)" -q -dfa
if %jit% EQU 1 call :runsub 21 testoutjit "Test with JIT Override" -q -jit
goto :eof
@rem Test 22: backslash-C tests with UTF. Needs both backslash-C support and
@rem Unicode support; it is skipped with a message if either supportBSC or
@rem unicode is 0. There is no -dfa run here, only a plain run and, when jit
@rem is 1, a -jit run that uses testoutjit instead of testout.
:do22
if %supportBSC% EQU 0 (
echo Test 22 Skipped due to absence of backslash-C support.
goto :eof
)
if %unicode% EQU 0 (
echo Test 22 Skipped due to absence of Unicode support.
goto :eof
)
call :runsub 22 testout "Backslash-C tests with UTF" -q
if %jit% EQU 1 call :runsub 22 testoutjit "Test with JIT Override" -q -jit
goto :eof
@rem Test 23: checks behaviour when backslash-C is disabled, so the condition
@rem is the inverse of tests 21 and 22: it is skipped when supportBSC is 1
@rem and otherwise performs a single plain run via :runsub.
:do23
if %supportBSC% EQU 1 (
echo Test 23 Skipped due to presence of backslash-C support.
goto :eof
)
call :runsub 23 testout "Backslash-C disabled test" -q
goto :eof
:conferror
@echo.
@echo Either your build is incomplete or you have a configuration error.

View File

@ -33,6 +33,7 @@
#cmakedefine EBCDIC 1
#cmakedefine EBCDIC_NL25 1
#cmakedefine HEAP_MATCH_RECURSE 1
#cmakedefine NEVER_BACKSLASH_C 1
#define LINK_SIZE @PCRE2_LINK_SIZE@
#define MATCH_LIMIT @PCRE2_MATCH_LIMIT@

View File

@ -190,6 +190,12 @@ AC_ARG_ENABLE(bsr-anycrlf,
[\R matches only CR, LF, CRLF by default]),
, enable_bsr_anycrlf=no)
# Handle --enable-never-backslash-C
# No action is taken when the option is given; when it is not given,
# enable_never_backslash_C is set to "no", so the lock-out is off by default.
AC_ARG_ENABLE(never-backslash-C,
AS_HELP_STRING([--enable-never-backslash-C],
[use of \C causes an error]),
, enable_never_backslash_C=no)
# Handle --enable-ebcdic
AC_ARG_ENABLE(ebcdic,
AS_HELP_STRING([--enable-ebcdic],
@ -604,6 +610,11 @@ if test "$enable_bsr_anycrlf" = "yes"; then
The build-time default can be overridden by the user of PCRE2 at runtime.])
fi
# Define NEVER_BACKSLASH_C (with an empty value) only when the user explicitly
# asked for --enable-never-backslash-C; otherwise it is left undefined.
if test "$enable_never_backslash_C" = "yes"; then
AC_DEFINE([NEVER_BACKSLASH_C], [], [
Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns.])
fi
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
The value of LINK_SIZE determines the number of bytes used to store
links as offsets within the compiled regex. The default is 2, which
@ -881,6 +892,7 @@ $PACKAGE-$VERSION configuration summary:
Enable Unicode support .......... : ${enable_unicode}
Newline char/sequence ........... : ${enable_newline}
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
\C is disabled .................. : ${enable_never_backslash_C}
EBCDIC coding ................... : ${enable_ebcdic}
EBCDIC code for NL .............. : ${ebcdic_nl_code}
Rebuild char tables ............. : ${enable_rebuild_chartables}

View File

@ -220,6 +220,13 @@ library. They are also documented in the pcre2build man page.
restrict \R to match only CR, LF, or CRLF. You can make this the default by
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
. In a pattern, the escape sequence \C matches a single code unit, even in a
UTF mode. This can be dangerous because it breaks up multi-code-unit
characters. You can build PCRE2 with the use of \C permanently locked out by
adding --enable-never-backslash-C (note the upper case C) to the "configure"
command. When \C is allowed by the library, individual applications can lock
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
pattern. This limits the amount of system stack that a pattern uses when it
is compiled. The default is 250, but you can change it by setting, for
@ -833,4 +840,4 @@ The distribution should contain the files listed below.
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
Last updated: 16 July 2015
Last updated: 16 October 2015

View File

@ -126,8 +126,10 @@ running redundant checks.
<P>
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to
problems, because it may leave the current matching point in the middle of a
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to
lock out the use of \C, causing a compile-time error if it is encountered.
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
application to lock out the use of \C, causing a compile-time error if it is
encountered. It is also possible to build PCRE2 with the use of \C permanently
disabled.
</P>
<P>
Another way that performance can be hit is by running a pattern that has a very
@ -187,7 +189,7 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
</P>
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
<P>
Last updated: 13 April 2015
Last updated: 16 October 2015
<br>
Copyright &copy; 1997-2015 University of Cambridge.
<br>

View File

@ -60,19 +60,21 @@ units, not characters, as is the contents of the variable pointed at by
The options are:
<pre>
PCRE2_ANCHORED Match only at the first position
PCRE2_NOTBOL Subject string is not the beginning of a line
PCRE2_NOTEOL Subject string is not the end of a line
PCRE2_NOTBOL Subject is not the beginning of a line
PCRE2_NOTEOL Subject is not the end of a line
PCRE2_NOTEMPTY An empty string is not a valid match
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject
is not a valid match
PCRE2_NO_UTF_CHECK Do not check the subject or replacement for
UTF validity (only relevant if PCRE2_UTF
was set at compile time)
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the
subject is not a valid match
PCRE2_NO_UTF_CHECK Do not check the subject or replacement
for UTF validity (only relevant if
PCRE2_UTF was set at compile time)
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
</pre>
The function returns the number of substitutions, which may be zero if there
were no matches. The result can be greater than one only when
PCRE2_SUBSTITUTE_GLOBAL is set.
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
is returned.
</P>
<P>
There is a complete description of the PCRE2 native API in the

View File

@ -1237,7 +1237,8 @@ This option locks out the use of \C in the pattern that is being compiled.
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
it may leave the current matching point in the middle of a multi-code-unit
character. This option may be useful in applications that process patterns from
external sources.
external sources. Note that there is also a build-time option that permanently
locks out the use of \C.
<pre>
PCRE2_NEVER_UCP
</pre>
@ -2613,43 +2614,17 @@ same number causes an error at compile time.
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR \fIreplacementzfP,</b>
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR <i>replacement</i>,</b>
<b> PCRE2_SIZE <i>rlength</i>, PCRE2_UCHAR *<i>outputbuffer</i>,</b>
<b> PCRE2_SIZE *<i>outlengthptr</i>);</b>
</P>
<P>
This function calls <b>pcre2_match()</b> and then makes a copy of the subject
string in <i>outputbuffer</i>, replacing the part that was matched with the
<i>replacement</i> string, whose length is supplied in <b>rlength</b>. This can
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
</P>
<P>
In the replacement string, which is interpreted as a UTF string in UTF mode,
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
dollar character is an escape character that can specify the insertion of
characters from capturing groups or (*MARK) items in the pattern. The following
forms are recognized:
<pre>
$$ insert a dollar character
$&#60;n&#62; or ${&#60;n&#62;} insert the contents of group &#60;n&#62;
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
</pre>
Either a group number or a group name can be given for &#60;n&#62;. Curly brackets are
required only if the following character would be interpreted as part of the
number or name. The number may be zero to include the entire matched string.
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
string "+$1$0$1+", the result is "=+babcb+=". Group insertion is done by
calling <b>pcre2_copy_byname()</b> or <b>pcre2_copy_bynumber()</b> as
appropriate.
</P>
<P>
The facility for inserting a (*MARK) name can be used to perform simple
simultaneous substitutions, as this <b>pcre2test</b> example shows:
<pre>
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
apple lemon
2: pear orange
</PRE>
</P>
<P>
The first seven arguments of <b>pcre2_substitute()</b> are the same as for
<b>pcre2_match()</b>, except that the partial matching options are not
permitted, and <i>match_data</i> may be passed as NULL, in which case a match
@ -2658,25 +2633,112 @@ functions from the match context, if provided, or else those that were used to
allocate memory for the compiled code.
</P>
<P>
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the
The <i>outlengthptr</i> argument must point to a variable that contains the
length, in code units, of the output buffer. If the function is successful,
the value is updated to contain the length of the new string, excluding the
trailing zero that is automatically added. If the function is not successful,
the value is set to PCRE2_UNSET for general errors (such as output buffer too
small). For syntax errors in the replacement string, the value is set to the
offset in the replacement string where the error was detected.
</P>
<P>
In the replacement string, which is interpreted as a UTF string in UTF mode,
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
dollar character is an escape character that can specify the insertion of
characters from capturing groups or (*MARK) items in the pattern. The following
forms are always recognized:
<pre>
$$ insert a dollar character
$&#60;n&#62; or ${&#60;n&#62;} insert the contents of group &#60;n&#62;
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
</pre>
Either a group number or a group name can be given for &#60;n&#62;. Curly brackets are
required only if the following character would be interpreted as part of the
number or name. The number may be zero to include the entire matched string.
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
string "+$1$0$1+", the result is "=+babcb+=".
</P>
<P>
The facility for inserting a (*MARK) name can be used to perform simple
simultaneous substitutions, as this <b>pcre2test</b> example shows:
<pre>
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
apple lemon
2: pear orange
</pre>
There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the
function to iterate over the subject string, replacing every matching
substring. If this is not set, only the first matching substring is replaced.
</P>
<P>
The <i>outlengthptr</i> argument must point to a variable that contains the
length, in code units, of the output buffer. It is updated to contain the
length of the new string, excluding the trailing zero that is automatically
added.
A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra processing
to be applied to the replacement string. Without this option, only the dollar
character is special, and only the group insertion forms listed above are
valid. When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
</P>
<P>
The function returns the number of replacements that were made. This may be
zero if no matches were found, and is never greater than 1 unless
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
is returned. Except for PCRE2_ERROR_NOMATCH (which is never returned), any
errors from <b>pcre2_match()</b> or the substring copying functions are passed
straight back. PCRE2_ERROR_BADREPLACEMENT is returned for an invalid
replacement string (unrecognized sequence following a dollar sign), and
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough.
Firstly, backslash in a replacement string is interpreted as an escape
character. The usual forms such as \n or \x{ddd} can be used to specify
particular character codes, and backslash followed by any non-alphanumeric
character quotes that character. Extended quoting can be coded using \Q...\E,
exactly as in pattern strings.
</P>
<P>
There are also four escape sequences for forcing the case of inserted letters.
The insertion mechanism has three states: no case forcing, force upper case,
and force lower case. The escape sequences change the current state: \U and
\L change to upper or lower case forcing, respectively, and \E (when not
terminating a \Q quoted sequence) reverts to no case forcing. The sequences
\u and \l force the next character (if it is a letter) to upper or lower
case, respectively, and then the state automatically reverts to no case
forcing. Case forcing applies to all inserted characters, including those from
captured groups and letters within \Q...\E quoted sequences.
</P>
<P>
Note that case forcing sequences such as \U...\E do not nest. For example,
the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no
effect.
</P>
<P>
The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
flexibility to group substitution. The syntax is similar to that used by Bash:
<pre>
${&#60;n&#62;:-&#60;string&#62;}
${&#60;n&#62;:+&#60;string1&#62;:&#60;string2&#62;}
</pre>
As before, &#60;n&#62; may be a group number or a name. The first form specifies a
default value. If group &#60;n&#62; is set, its value is inserted; if not, &#60;string&#62; is
expanded and the result inserted. The second form specifies strings that are
expanded and inserted when group &#60;n&#62; is set or unset, respectively. The first
form is just a convenient shorthand for
<pre>
${&#60;n&#62;:+${&#60;n&#62;}:&#60;string&#62;}
</pre>
Backslash can be used to escape colons and closing curly brackets in the
replacement strings. A change of the case forcing state within a replacement
string remains in force afterwards, as shown in this <b>pcre2test</b> example:
<pre>
/(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
body
1: hello
somebody
1: HELLO
</pre>
If successful, the function returns the number of replacements that were made.
This may be zero if no matches were found, and is never greater than 1 unless
PCRE2_SUBSTITUTE_GLOBAL is set.
</P>
<P>
In the event of an error, a negative error code is returned. Except for
PCRE2_ERROR_NOMATCH (which is never returned), errors from <b>pcre2_match()</b>
are passed straight back. PCRE2_ERROR_NOMEMORY is returned if the output buffer
is not big enough. PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax
errors in the replacement string, with more particular errors being
PCRE2_ERROR_BADREPESCAPE (invalid escape sequence),
PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket not found), and
PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group substitution). As for all
PCRE2 errors, a text message that describes the error can be obtained by
calling <b>pcre2_get_error_message()</b>.
</P>
<br><a name="SEC35" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
<P>
@ -2908,8 +2970,8 @@ There are in addition the following errors that are specific to
PCRE2_ERROR_DFA_UITEM
</pre>
This return is given if <b>pcre2_dfa_match()</b> encounters an item in the
pattern that it does not support, for instance, the use of \C or a back
reference.
pattern that it does not support, for instance, the use of \C in a UTF mode or
a back reference.
<pre>
PCRE2_ERROR_DFA_UCOND
</pre>
@ -2953,7 +3015,7 @@ Cambridge, England.
</P>
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
<P>
Last updated: 22 September 2015
Last updated: 16 October 2015
<br>
Copyright &copy; 1997-2015 University of Cambridge.
<br>

View File

@ -18,23 +18,24 @@ please consult the man page, in case the conversion went wrong.
<li><a name="TOC3" href="#SEC3">BUILDING 8-BIT, 16-BIT AND 32-BIT LIBRARIES</a>
<li><a name="TOC4" href="#SEC4">BUILDING SHARED AND STATIC LIBRARIES</a>
<li><a name="TOC5" href="#SEC5">UNICODE AND UTF SUPPORT</a>
<li><a name="TOC6" href="#SEC6">JUST-IN-TIME COMPILER SUPPORT</a>
<li><a name="TOC7" href="#SEC7">NEWLINE RECOGNITION</a>
<li><a name="TOC8" href="#SEC8">WHAT \R MATCHES</a>
<li><a name="TOC9" href="#SEC9">HANDLING VERY LARGE PATTERNS</a>
<li><a name="TOC10" href="#SEC10">AVOIDING EXCESSIVE STACK USAGE</a>
<li><a name="TOC11" href="#SEC11">LIMITING PCRE2 RESOURCE USAGE</a>
<li><a name="TOC12" href="#SEC12">CREATING CHARACTER TABLES AT BUILD TIME</a>
<li><a name="TOC13" href="#SEC13">USING EBCDIC CODE</a>
<li><a name="TOC14" href="#SEC14">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
<li><a name="TOC15" href="#SEC15">PCRE2GREP BUFFER SIZE</a>
<li><a name="TOC16" href="#SEC16">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a>
<li><a name="TOC17" href="#SEC17">INCLUDING DEBUGGING CODE</a>
<li><a name="TOC18" href="#SEC18">DEBUGGING WITH VALGRIND SUPPORT</a>
<li><a name="TOC19" href="#SEC19">CODE COVERAGE REPORTING</a>
<li><a name="TOC20" href="#SEC20">SEE ALSO</a>
<li><a name="TOC21" href="#SEC21">AUTHOR</a>
<li><a name="TOC22" href="#SEC22">REVISION</a>
<li><a name="TOC6" href="#SEC6">DISABLING THE USE OF \C</a>
<li><a name="TOC7" href="#SEC7">JUST-IN-TIME COMPILER SUPPORT</a>
<li><a name="TOC8" href="#SEC8">NEWLINE RECOGNITION</a>
<li><a name="TOC9" href="#SEC9">WHAT \R MATCHES</a>
<li><a name="TOC10" href="#SEC10">HANDLING VERY LARGE PATTERNS</a>
<li><a name="TOC11" href="#SEC11">AVOIDING EXCESSIVE STACK USAGE</a>
<li><a name="TOC12" href="#SEC12">LIMITING PCRE2 RESOURCE USAGE</a>
<li><a name="TOC13" href="#SEC13">CREATING CHARACTER TABLES AT BUILD TIME</a>
<li><a name="TOC14" href="#SEC14">USING EBCDIC CODE</a>
<li><a name="TOC15" href="#SEC15">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
<li><a name="TOC16" href="#SEC16">PCRE2GREP BUFFER SIZE</a>
<li><a name="TOC17" href="#SEC17">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a>
<li><a name="TOC18" href="#SEC18">INCLUDING DEBUGGING CODE</a>
<li><a name="TOC19" href="#SEC19">DEBUGGING WITH VALGRIND SUPPORT</a>
<li><a name="TOC20" href="#SEC20">CODE COVERAGE REPORTING</a>
<li><a name="TOC21" href="#SEC21">SEE ALSO</a>
<li><a name="TOC22" href="#SEC22">AUTHOR</a>
<li><a name="TOC23" href="#SEC23">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">BUILDING PCRE2</a><br>
<P>
@ -148,13 +149,19 @@ properties. The application can request that they do by setting the PCRE2_UCP
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
request this by starting with (*UCP).
</P>
<br><a name="SEC6" href="#TOC1">DISABLING THE USE OF \C</a><br>
<P>
The \C escape sequence, which matches a single code unit, even in a UTF mode,
can cause unpredictable behaviour because it may leave the current matching
point in the middle of a multi-code-unit character. It can be locked out by
setting the PCRE2_NEVER_BACKSLASH_C option.
point in the middle of a multi-code-unit character. The application can lock it
out by setting the PCRE2_NEVER_BACKSLASH_C option when calling
<b>pcre2_compile()</b>. There is also a build-time option
<pre>
--enable-never-backslash-C
</pre>
(note the upper case C) which locks out the use of \C entirely.
</P>
<br><a name="SEC6" href="#TOC1">JUST-IN-TIME COMPILER SUPPORT</a><br>
<br><a name="SEC7" href="#TOC1">JUST-IN-TIME COMPILER SUPPORT</a><br>
<P>
Just-in-time compiler support is included in the build by specifying
<pre>
@ -171,7 +178,7 @@ pcre2grep automatically makes use of it, unless you add
</pre>
to the "configure" command.
</P>
<br><a name="SEC7" href="#TOC1">NEWLINE RECOGNITION</a><br>
<br><a name="SEC8" href="#TOC1">NEWLINE RECOGNITION</a><br>
<P>
By default, PCRE2 interprets the linefeed (LF) character as indicating the end
of a line. This is the normal newline character on Unix-like systems. You can
@ -208,7 +215,7 @@ Whatever default line ending convention is selected when PCRE2 is built can be
overridden by applications that use the library. At build time it is
conventional to use the standard for your operating system.
</P>
<br><a name="SEC8" href="#TOC1">WHAT \R MATCHES</a><br>
<br><a name="SEC9" href="#TOC1">WHAT \R MATCHES</a><br>
<P>
By default, the sequence \R in a pattern matches any Unicode newline sequence,
independently of what has been selected as the line ending sequence. If you
@ -220,7 +227,7 @@ the default is changed so that \R matches only CR, LF, or CRLF. Whatever is
selected when PCRE2 is built can be overridden by applications that use the
library.
</P>
<br><a name="SEC9" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
<br><a name="SEC10" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
<P>
Within a compiled pattern, offset values are used to point from one part to
another (for example, from an opening parenthesis to an alternation
@ -239,7 +246,7 @@ longer offsets slows down the operation of PCRE2 because it has to load
additional data when handling them. For the 32-bit library the value is always
4 and cannot be overridden; the value of --with-link-size is ignored.
</P>
<br><a name="SEC10" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
<br><a name="SEC11" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
<P>
When matching with the <b>pcre2_match()</b> function, PCRE2 implements
backtracking by making recursive calls to an internal function called
@ -261,7 +268,7 @@ custom memory management functions can be called instead. PCRE2 runs noticeably
more slowly when built in this way. This option affects only the
<b>pcre2_match()</b> function; it is not relevant for <b>pcre2_dfa_match()</b>.
</P>
<br><a name="SEC11" href="#TOC1">LIMITING PCRE2 RESOURCE USAGE</a><br>
<br><a name="SEC12" href="#TOC1">LIMITING PCRE2 RESOURCE USAGE</a><br>
<P>
Internally, PCRE2 has a function called <b>match()</b>, which it calls
repeatedly (sometimes recursively) when matching a pattern with the
@ -290,7 +297,7 @@ constraints. However, you can set a lower limit by adding, for example,
</pre>
to the <b>configure</b> command. This value can also be overridden at run time.
</P>
<br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
<br><a name="SEC13" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
<P>
PCRE2 uses fixed tables for processing characters whose code points are less
than 256. By default, PCRE2 is built with a set of tables that are distributed
@ -307,7 +314,7 @@ compiling, because <b>dftables</b> is run on the local host. If you need to
create alternative tables when cross compiling, you will have to do so "by
hand".)
</P>
<br><a name="SEC13" href="#TOC1">USING EBCDIC CODE</a><br>
<br><a name="SEC14" href="#TOC1">USING EBCDIC CODE</a><br>
<P>
PCRE2 assumes by default that it will run in an environment where the character
code is ASCII or Unicode, which is a superset of ASCII. This is the case for
@ -342,7 +349,7 @@ The options that select newline behaviour, such as --enable-newline-is-cr,
and equivalent run-time options, refer to these character values in an EBCDIC
environment.
</P>
<br><a name="SEC14" href="#TOC1">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
<br><a name="SEC15" href="#TOC1">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
<P>
By default, <b>pcre2grep</b> reads all files as plain text. You can build it so
that it recognizes files whose names end in <b>.gz</b> or <b>.bz2</b>, and reads
@ -355,7 +362,7 @@ to the <b>configure</b> command. These options naturally require that the
relevant libraries are installed on your system. Configuration will fail if
they are not.
</P>
<br><a name="SEC15" href="#TOC1">PCRE2GREP BUFFER SIZE</a><br>
<br><a name="SEC16" href="#TOC1">PCRE2GREP BUFFER SIZE</a><br>
<P>
<b>pcre2grep</b> uses an internal buffer to hold a "window" on the file it is
scanning, in order to be able to output "before" and "after" lines when it
@ -370,7 +377,7 @@ parameter value by adding, for example,
to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override this
value by using --buffer-size on the command line.
</P>
<br><a name="SEC16" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
<br><a name="SEC17" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
<P>
If you add one of
<pre>
@ -404,7 +411,7 @@ automatically included, you may need to add something like
</pre>
immediately before the <b>configure</b> command.
</P>
<br><a name="SEC17" href="#TOC1">INCLUDING DEBUGGING CODE</a><br>
<br><a name="SEC18" href="#TOC1">INCLUDING DEBUGGING CODE</a><br>
<P>
If you add
<pre>
@ -413,7 +420,7 @@ If you add
to the <b>configure</b> command, additional debugging code is included in the
build. This feature is intended for use by the PCRE2 maintainers.
</P>
<br><a name="SEC18" href="#TOC1">DEBUGGING WITH VALGRIND SUPPORT</a><br>
<br><a name="SEC19" href="#TOC1">DEBUGGING WITH VALGRIND SUPPORT</a><br>
<P>
If you add
<pre>
@ -423,7 +430,7 @@ to the <b>configure</b> command, PCRE2 will use valgrind annotations to mark
certain memory regions as unaddressable. This allows it to detect invalid
memory accesses, and is mostly useful for debugging PCRE2 itself.
</P>
<br><a name="SEC19" href="#TOC1">CODE COVERAGE REPORTING</a><br>
<br><a name="SEC20" href="#TOC1">CODE COVERAGE REPORTING</a><br>
<P>
If your C compiler is gcc, you can build a version of PCRE2 that can generate a
code coverage report for its test suite. To enable this, you must install
@ -480,11 +487,11 @@ This cleans all coverage data including the generated coverage report. For more
information about code coverage, see the <b>gcov</b> and <b>lcov</b>
documentation.
</P>
<br><a name="SEC20" href="#TOC1">SEE ALSO</a><br>
<br><a name="SEC21" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcre2api</b>(3), <b>pcre2-config</b>(3).
</P>
<br><a name="SEC21" href="#TOC1">AUTHOR</a><br>
<br><a name="SEC22" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
@ -493,9 +500,9 @@ University Computing Service
Cambridge, England.
<br>
</P>
<br><a name="SEC22" href="#TOC1">REVISION</a><br>
<br><a name="SEC23" href="#TOC1">REVISION</a><br>
<P>
Last updated: 24 April 2015
Last updated: 16 October 2015
<br>
Copyright &copy; 1997-2015 University of Cambridge.
<br>

View File

@ -1236,14 +1236,21 @@ with \C in UTF-8 or UTF-16 mode means that the rest of the string may start
with a malformed UTF character. This has undefined results, because PCRE2
assumes that it is matching character by character in a valid UTF string (by
default it checks the subject string's validity at the start of processing
unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the
use of \C by setting the PCRE2_NEVER_BACKSLASH_C option.
unless the PCRE2_NO_UTF_CHECK option is used).
</P>
<P>
An application can lock out the use of \C by setting the
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
build PCRE2 with the use of \C permanently disabled.
</P>
<P>
PCRE2 does not allow \C to appear in lookbehind assertions
<a href="#lookbehind">(described below)</a>
in a UTF mode, because this would make it impossible to calculate the length of
the lookbehind.
the lookbehind. Neither the alternative matching function
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
former gives a match-time error; the latter fails to optimize and so the match
is always run using the interpreter.
</P>
<P>
In general, the \C escape sequence is best avoided. However, one way of using
@ -3351,7 +3358,7 @@ Cambridge, England.
</P>
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
<P>
Last updated: 01 September 2015
Last updated: 16 October 2015
<br>
Copyright &copy; 1997-2015 University of Cambridge.
<br>

View File

@ -12,17 +12,21 @@ This page is part of the PCRE2 HTML documentation. It was generated
automatically from the original man page. If there is any nonsense in it,
please consult the man page, in case the conversion went wrong.
<br>
<br><b>
PCRE2 PERFORMANCE
</b><br>
<ul>
<li><a name="TOC1" href="#SEC1">PCRE2 PERFORMANCE</a>
<li><a name="TOC2" href="#SEC2">COMPILED PATTERN MEMORY USAGE</a>
<li><a name="TOC3" href="#SEC3">STACK USAGE AT RUN TIME</a>
<li><a name="TOC4" href="#SEC4">PROCESSING TIME</a>
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
<li><a name="TOC6" href="#SEC6">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE2 PERFORMANCE</a><br>
<P>
Two aspects of performance are discussed below: memory usage and processing
time. The way you express your pattern as a regular expression can affect both
of them.
</P>
<br><b>
COMPILED PATTERN MEMORY USAGE
</b><br>
<br><a name="SEC2" href="#TOC1">COMPILED PATTERN MEMORY USAGE</a><br>
<P>
Patterns are compiled by PCRE2 into a reasonably efficient interpretive code,
so that most simple patterns do not use much memory. However, there is one case
@ -75,9 +79,7 @@ pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
speed is acceptable, this kind of rewriting will allow you to process patterns
that PCRE2 cannot otherwise handle.
</P>
<br><b>
STACK USAGE AT RUN TIME
</b><br>
<br><a name="SEC3" href="#TOC1">STACK USAGE AT RUN TIME</a><br>
<P>
When <b>pcre2_match()</b> is used for matching, certain kinds of pattern can
cause it to use large amounts of the process stack. In some environments the
@ -86,9 +88,7 @@ SIGSEGV. Rewriting your pattern can often help. The
<a href="pcre2stack.html"><b>pcre2stack</b></a>
documentation discusses this issue in detail.
</P>
<br><b>
PROCESSING TIME
</b><br>
<br><a name="SEC4" href="#TOC1">PROCESSING TIME</a><br>
<P>
Certain items in regular expression patterns are processed more efficiently
than others. It is more efficient to use a character class like [aeiou] than a
@ -177,9 +177,7 @@ appreciable time with strings longer than about 20 characters.
In many cases, the solution to this kind of performance issue is to use an
atomic group or a possessive quantifier.
</P>
<br><b>
AUTHOR
</b><br>
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
@ -188,9 +186,7 @@ University Computing Service
Cambridge, England.
<br>
</P>
<br><b>
REVISION
</b><br>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
Last updated: 02 January 2015
<br>

View File

@ -111,9 +111,10 @@ it matches a literal "u".
\W a "non-word" character
\X a Unicode extended grapheme cluster
</pre>
The application can lock out the use of \C by setting the
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave the
current matching point in the middle of a UTF-8 or UTF-16 character.
\C is dangerous because it may leave the current matching point in the middle
of a UTF-8 or UTF-16 character. The application can lock out the use of \C by
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
with the use of \C permanently disabled.
</P>
<P>
By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode
@ -588,7 +589,7 @@ Cambridge, England.
</P>
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
<P>
Last updated: 17 July 2015
Last updated: 16 October 2015
<br>
Copyright &copy; 1997-2015 University of Cambridge.
<br>

View File

@ -155,6 +155,7 @@ following options output the value and set the exit code as indicated:
The following options output 1 for true or 0 for false, and set the exit code
to the same value:
<pre>
backslash-C \C is supported (not locked out)
ebcdic compiled for an EBCDIC environment
jit just-in-time support is available
pcre2-16 the 16-bit library was built
@ -510,7 +511,7 @@ Setting compilation options
<P>
The following modifiers set options for <b>pcre2_compile()</b>. The most common
ones have single-letter abbreviations. See
<a href="pcreapi.html"><b>pcreapi</b></a>
<a href="pcre2api.html"><b>pcre2api</b></a>
for a description of their effects.
<pre>
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
@ -537,6 +538,7 @@ for a description of their effects.
no_utf_check set PCRE2_NO_UTF_CHECK
ucp set PCRE2_UCP
ungreedy set PCRE2_UNGREEDY
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
utf set PCRE2_UTF
</pre>
As well as turning on the PCRE2_UTF option, the <b>utf</b> modifier causes all
@ -564,6 +566,7 @@ about the pattern:
locale=&#60;name&#62; use this locale
memory show memory used
newline=&#60;type&#62; set newline type
null_context compile with a NULL context
parens_nest_limit=&#60;n&#62; set maximum parentheses depth
posix use the POSIX API
push push compiled pattern onto the stack
@ -642,6 +645,15 @@ is requested. For each callout, either its number or string is given, followed
by the item that follows it in the pattern.
</P>
<br><b>
Passing a NULL context
</b><br>
<P>
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_compile()</b>. If
the <b>null_context</b> modifier is set, however, NULL is passed. This is for
testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
default values).
</P>
<br><b>
Specifying a pattern in hex
</b><br>
<P>
@ -920,9 +932,11 @@ pattern.
/g global global matching
jitstack=&#60;n&#62; set size of JIT stack
mark show mark values
match_limit=&#62;n&#62; set a match limit
match_limit=&#60;n&#62; set a match limit
memory show memory usage
null_context match with a NULL context
offset=&#60;n&#62; set starting offset
offset_limit=&#60;n&#62; set offset limit
ovector=&#60;n&#62; set size of output vector
recursion_limit=&#60;n&#62; set a recursion limit
replace=&#60;string&#62; specify a replacement string
@ -1170,6 +1184,16 @@ The <b>offset</b> modifier sets an offset in the subject string at which
matching starts. Its value is a number of code units, not characters.
</P>
<br><b>
Setting an offset limit
</b><br>
<P>
The <b>offset_limit</b> modifier sets a limit for unanchored matches. If a match
cannot be found starting at or before this offset in the subject, a "no match"
return is given. The data value is a number of code units, not characters. When
this modifier is used, the <b>use_offset_limit</b> modifier must have been set
for the pattern; if not, an error is generated.
</P>
<br><b>
Setting the size of the output vector
</b><br>
<P>
@ -1201,6 +1225,17 @@ this modifier has no effect, as there is no facility for passing a length.)
When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
passing the replacement string as zero-terminated.
</P>
<br><b>
Passing a NULL context
</b><br>
<P>
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
<b>pcre2_dfa_match()</b> or <b>pcre2_jit_match()</b>. If the <b>null_context</b>
modifier is set, however, NULL is passed. This is for testing that the matching
functions behave correctly in this case (they use default values). This
modifier cannot be used with the <b>find_limits</b> modifier or when testing the
substitution function.
</P>
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
<P>
By default, <b>pcre2test</b> uses the standard PCRE2 matching function,
@ -1539,7 +1574,7 @@ Cambridge, England.
</P>
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
<P>
Last updated: 14 September 2015
Last updated: 17 October 2015
<br>
Copyright &copy; 1997-2015 University of Cambridge.
<br>

View File

@ -71,11 +71,12 @@ The escape sequence \C can be used to match a single code unit, in a UTF mode,
but its use can lead to some strange effects because it breaks up multi-unit
characters (see the description of \C in the
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
documentation). The use of \C is not supported in the alternative matching
function <b>pcre2_dfa_match()</b>, nor is it supported in UTF mode by the JIT
optimization. If JIT optimization is requested for a UTF pattern that contains
\C, it will not succeed, and so the matching will be carried out by the normal
interpretive function.
documentation). The use of \C is not supported by the alternative matching
function <b>pcre2_dfa_match()</b> when in UTF mode. Its use provokes a
match-time error. The JIT optimization also does not support \C in UTF mode.
If JIT optimization is requested for a UTF pattern that contains \C, it will
not succeed, and so the matching will be carried out by the normal interpretive
function.
</P>
<P>
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
@ -275,7 +276,7 @@ Cambridge, England.
REVISION
</b><br>
<P>
Last updated: 18 August 2015
Last updated: 16 October 2015
<br>
Copyright &copy; 1997-2015 University of Cambridge.
<br>

View File

@ -1,4 +1,4 @@
.TH PCRE2 3 "13 April 2015" "PCRE2 10.20"
.TH PCRE2 3 "16 October 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH INTRODUCTION
@ -118,8 +118,10 @@ running redundant checks.
.P
The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to
problems, because it may leave the current matching point in the middle of a
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to
lock out the use of \eC, causing a compile-time error if it is encountered.
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
application to lock out the use of \eC, causing a compile-time error if it is
encountered. It is also possible to build PCRE2 with the use of \eC permanently
disabled.
.P
Another way that performance can be hit is by running a pattern that has a very
large search tree against a string that will never match. Nested unlimited
@ -187,6 +189,6 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
.rs
.sp
.nf
Last updated: 13 April 2015
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi

View File

@ -104,8 +104,9 @@ SECURITY CONSIDERATIONS
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead
to problems, because it may leave the current matching point in the
middle of a multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C
option can be used to lock out the use of \C, causing a compile-time
error if it is encountered.
option can be used by an application to lock out the use of \C, causing
a compile-time error if it is encountered. It is also possible to build
PCRE2 with the use of \C permanently disabled.
Another way that performance can be hit is by running a pattern that
has a very large search tree against a string that will never match.
@ -165,7 +166,7 @@ AUTHOR
REVISION
Last updated: 13 April 2015
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
------------------------------------------------------------------------------
@ -1276,7 +1277,9 @@ COMPILING A PATTERN
piled. This escape can cause unpredictable behaviour in UTF-8 or
UTF-16 modes, because it may leave the current matching point in the
middle of a multi-code-unit character. This option may be useful in
applications that process patterns from external sources.
applications that process patterns from external sources. Note that
there is also a build-time option that permanently locks out the use of
\C.
PCRE2_NEVER_UCP
@ -2571,19 +2574,36 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
PCRE2_SIZE length, PCRE2_SIZE startoffset,
uint32_t options, pcre2_match_data *match_data,
pcre2_match_context *mcontext, PCRE2_SPTR replacementzfP,
pcre2_match_context *mcontext, PCRE2_SPTR replacement,
PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
PCRE2_SIZE *outlengthptr);
This function calls pcre2_match() and then makes a copy of the subject
string in outputbuffer, replacing the part that was matched with the
replacement string, whose length is supplied in rlength. This can be
given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
The first seven arguments of pcre2_substitute() are the same as for
pcre2_match(), except that the partial matching options are not permit-
ted, and match_data may be passed as NULL, in which case a match data
block is obtained and freed within this function, using memory manage-
ment functions from the match context, if provided, or else those that
were used to allocate memory for the compiled code.
The outlengthptr argument must point to a variable that contains the
length, in code units, of the output buffer. If the function is suc-
cessful, the value is updated to contain the length of the new string,
excluding the trailing zero that is automatically added. If the func-
tion is not successful, the value is set to PCRE2_UNSET for general
errors (such as output buffer too small). For syntax errors in the
replacement string, the value is set to the offset in the replacement
string where the error was detected.
In the replacement string, which is interpreted as a UTF string in UTF
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
option is set, a dollar character is an escape character that can spec-
ify the insertion of characters from capturing groups or (*MARK) items
in the pattern. The following forms are recognized:
in the pattern. The following forms are always recognized:
$$ insert a dollar character
$<n> or ${<n>} insert the contents of group <n>
@ -2594,8 +2614,7 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
preted as part of the number or name. The number may be zero to include
the entire matched string. For example, if the pattern a(b)c is
matched with "=abc=" and the replacement string "+$1$0$1+", the result
is "=+babcb+=". Group insertion is done by calling pcre2_copy_byname()
or pcre2_copy_bynumber() as appropriate.
is "=+babcb+=".
The facility for inserting a (*MARK) name can be used to perform simple
simultaneous substitutions, as this pcre2test example shows:
@ -2604,32 +2623,80 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
apple lemon
2: pear orange
The first seven arguments of pcre2_substitute() are the same as for
pcre2_match(), except that the partial matching options are not permit-
ted, and match_data may be passed as NULL, in which case a match data
block is obtained and freed within this function, using memory manage-
ment functions from the match context, if provided, or else those that
were used to allocate memory for the compiled code.
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
the function to iterate over the subject string, replacing every match-
ing substring. If this is not set, only the first matching substring is
replaced.
The outlengthptr argument must point to a variable that contains the
length, in code units, of the output buffer. It is updated to contain
the length of the new string, excluding the trailing zero that is auto-
matically added.
A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra
processing to be applied to the replacement string. Without this
option, only the dollar character is special, and only the group inser-
tion forms listed above are valid. When PCRE2_SUBSTITUTE_EXTENDED is
set, two things change:
The function returns the number of replacements that were made. This
may be zero if no matches were found, and is never greater than 1
unless PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a neg-
ative error code is returned. Except for PCRE2_ERROR_NOMATCH (which is
never returned), any errors from pcre2_match() or the substring copying
functions are passed straight back. PCRE2_ERROR_BADREPLACEMENT is
returned for an invalid replacement string (unrecognized sequence fol-
lowing a dollar sign), and PCRE2_ERROR_NOMEMORY is returned if the out-
put buffer is not big enough.
Firstly, backslash in a replacement string is interpreted as an escape
character. The usual forms such as \n or \x{ddd} can be used to specify
particular character codes, and backslash followed by any non-alphanu-
meric character quotes that character. Extended quoting can be coded
using \Q...\E, exactly as in pattern strings.
There are also four escape sequences for forcing the case of inserted
letters. The insertion mechanism has three states: no case forcing,
force upper case, and force lower case. The escape sequences change the
current state: \U and \L change to upper or lower case forcing, respec-
tively, and \E (when not terminating a \Q quoted sequence) reverts to
no case forcing. The sequences \u and \l force the next character (if
it is a letter) to upper or lower case, respectively, and then the
state automatically reverts to no case forcing. Case forcing applies to
all inserted characters, including those from captured groups and let-
ters within \Q...\E quoted sequences.
Note that case forcing sequences such as \U...\E do not nest. For exam-
ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final
\E has no effect.
The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
flexibility to group substitution. The syntax is similar to that used
by Bash:
${<n>:-<string>}
${<n>:+<string1>:<string2>}
As before, <n> may be a group number or a name. The first form speci-
fies a default value. If group <n> is set, its value is inserted; if
not, <string> is expanded and the result inserted. The second form
specifies strings that are expanded and inserted when group <n> is set
or unset, respectively. The first form is just a convenient shorthand
for
${<n>:+${<n>}:<string>}
Backslash can be used to escape colons and closing curly brackets in
the replacement strings. A change of the case forcing state within a
replacement string remains in force afterwards, as shown in this
pcre2test example:
/(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
body
1: hello
somebody
1: HELLO
If successful, the function returns the number of replacements that
were made. This may be zero if no matches were found, and is never
greater than 1 unless PCRE2_SUBSTITUTE_GLOBAL is set.
In the event of an error, a negative error code is returned. Except for
PCRE2_ERROR_NOMATCH (which is never returned), errors from
pcre2_match() are passed straight back. PCRE2_ERROR_NOMEMORY is
returned if the output buffer is not big enough.
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in
the replacement string, with more particular errors being
PCRE2_ERROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REP-
MISSING_BRACE (closing curly bracket not found), and PCRE2_ERROR_BADSUB-
STITUTION (syntax error in extended group substitution). As for all PCRE2
errors, a text message that describes the error can be obtained by
calling pcre2_get_error_message().
DUPLICATE SUBPATTERN NAMES
@ -2845,8 +2912,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
PCRE2_ERROR_DFA_UITEM
This return is given if pcre2_dfa_match() encounters an item in the
pattern that it does not support, for instance, the use of \C or a back
reference.
pattern that it does not support, for instance, the use of \C in a UTF
mode or a back reference.
PCRE2_ERROR_DFA_UCOND
@ -2890,7 +2957,7 @@ AUTHOR
REVISION
Last updated: 22 September 2015
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
------------------------------------------------------------------------------
@ -3010,10 +3077,18 @@ UNICODE AND UTF SUPPORT
PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a
pattern may also request this by starting with (*UCP).
DISABLING THE USE OF \C
The \C escape sequence, which matches a single code unit, even in a UTF
mode, can cause unpredictable behaviour because it may leave the cur-
rent matching point in the middle of a multi-code-unit character. It
can be locked out by setting the PCRE2_NEVER_BACKSLASH_C option.
rent matching point in the middle of a multi-code-unit character. The
application can lock it out by setting the PCRE2_NEVER_BACKSLASH_C
option when calling pcre2_compile(). There is also a build-time option
--enable-never-backslash-C
(note the upper case C) which locks out the use of \C entirely.
JUST-IN-TIME COMPILER SUPPORT
@ -3366,7 +3441,7 @@ AUTHOR
REVISION
Last updated: 24 April 2015
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
------------------------------------------------------------------------------
@ -6028,12 +6103,18 @@ MATCHING A SINGLE CODE UNIT
results, because PCRE2 assumes that it is matching character by charac-
ter in a valid UTF string (by default it checks the subject string's
validity at the start of processing unless the PCRE2_NO_UTF_CHECK
option is used). An application can lock out the use of \C by setting
the PCRE2_NEVER_BACKSLASH_C option.
option is used).
An application can lock out the use of \C by setting the
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also
possible to build PCRE2 with the use of \C permanently disabled.
PCRE2 does not allow \C to appear in lookbehind assertions (described
below) in a UTF mode, because this would make it impossible to calcu-
late the length of the lookbehind.
late the length of the lookbehind. Neither the alternative matching
function pcre2_dfa_match() nor the JIT optimizer support \C in a UTF
mode. The former gives a match-time error; the latter fails to optimize
and so the match is always run using the interpreter.
In general, the \C escape sequence is best avoided. However, one way of
using it that avoids the problem of malformed UTF characters is to use
@ -8036,7 +8117,7 @@ AUTHOR
REVISION
Last updated: 01 September 2015
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
------------------------------------------------------------------------------
@ -8966,10 +9047,10 @@ CHARACTER TYPES
\W a "non-word" character
\X a Unicode extended grapheme cluster
The application can lock out the use of \C by setting the
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave
the current matching point in the middle of a UTF-8 or UTF-16 charac-
ter.
\C is dangerous because it may leave the current matching point in the
middle of a UTF-8 or UTF-16 character. The application can lock out the
use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also
possible to build PCRE2 with the use of \C permanently disabled.
By default, \d, \s, and \w match only ASCII characters, even in UTF-8
mode or in the 16-bit and 32-bit libraries. However, if locale-specific
@ -9325,7 +9406,7 @@ AUTHOR
REVISION
Last updated: 17 July 2015
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
------------------------------------------------------------------------------
@ -9384,11 +9465,12 @@ WIDE CHARACTERS AND UTF MODES
The escape sequence \C can be used to match a single code unit, in a
UTF mode, but its use can lead to some strange effects because it
breaks up multi-unit characters (see the description of \C in the
pcre2pattern documentation). The use of \C is not supported in the
alternative matching function pcre2_dfa_match(), nor is it supported in
UTF mode by the JIT optimization. If JIT optimization is requested for
a UTF pattern that contains \C, it will not succeed, and so the match-
ing will be carried out by the normal interpretive function.
pcre2pattern documentation). The use of \C is not supported by the
alternative matching function pcre2_dfa_match() when in UTF mode. Its
use provokes a match-time error. The JIT optimization also does not
support \C in UTF mode. If JIT optimization is requested for a UTF
pattern that contains \C, it will not succeed, and so the matching will
be carried out by the normal interpretive function.
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
characters of any code value, but, by default, the characters that
@ -9563,7 +9645,7 @@ AUTHOR
REVISION
Last updated: 18 August 2015
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
------------------------------------------------------------------------------

View File

@ -1,4 +1,4 @@
.TH PCRE2API 3 "07 October 2015" "PCRE2 10.21"
.TH PCRE2API 3 "16 October 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.sp
@ -1209,7 +1209,8 @@ This option locks out the use of \eC in the pattern that is being compiled.
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
it may leave the current matching point in the middle of a multi-code-unit
character. This option may be useful in applications that process patterns from
external sources.
external sources. Note that there is also a build-time option that permanently
locks out the use of \eC.
.sp
PCRE2_NEVER_UCP
.sp
@ -3014,8 +3015,8 @@ There are in addition the following errors that are specific to
PCRE2_ERROR_DFA_UITEM
.sp
This return is given if \fBpcre2_dfa_match()\fP encounters an item in the
pattern that it does not support, for instance, the use of \eC or a back
reference.
pattern that it does not support, for instance, the use of \eC in a UTF mode or
a back reference.
.sp
PCRE2_ERROR_DFA_UCOND
.sp
@ -3065,6 +3066,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 07 October 2015
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2BUILD 3 "23 April 2015" "PCRE2 10.20"
.TH PCRE2BUILD 3 "16 October 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.
@ -132,11 +132,20 @@ Pattern escapes such as \ed and \ew do not by default make use of Unicode
properties. The application can request that they do by setting the PCRE2_UCP
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
request this by starting with (*UCP).
.P
.
.
.SH "DISABLING THE USE OF \eC"
.rs
.sp
The \eC escape sequence, which matches a single code unit, even in a UTF mode,
can cause unpredictable behaviour because it may leave the current matching
point in the middle of a multi-code-unit character. It can be locked out by
setting the PCRE2_NEVER_BACKSLASH_C option.
point in the middle of a multi-code-unit character. The application can lock it
out by setting the PCRE2_NEVER_BACKSLASH_C option when calling
\fBpcre2_compile()\fP. There is also a build-time option
.sp
--enable-never-backslash-C
.sp
(note the upper case C) which locks out the use of \eC entirely.
.
.
.SH "JUST-IN-TIME COMPILER SUPPORT"
@ -510,6 +519,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 24 April 2015
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2PATTERN 3 "01 September 2015" "PCRE2 10.21"
.TH PCRE2PATTERN 3 "16 October 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -1233,8 +1233,11 @@ with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start
with a malformed UTF character. This has undefined results, because PCRE2
assumes that it is matching character by character in a valid UTF string (by
default it checks the subject string's validity at the start of processing
unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the
use of \eC by setting the PCRE2_NEVER_BACKSLASH_C option.
unless the PCRE2_NO_UTF_CHECK option is used).
.P
An application can lock out the use of \eC by setting the
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
build PCRE2 with the use of \eC permanently disabled.
.P
PCRE2 does not allow \eC to appear in lookbehind assertions
.\" HTML <a href="#lookbehind">
@ -1242,7 +1245,10 @@ PCRE2 does not allow \eC to appear in lookbehind assertions
(described below)
.\"
in a UTF mode, because this would make it impossible to calculate the length of
the lookbehind.
the lookbehind. Neither the alternative matching function
\fBpcre2_dfa_match()\fP nor the JIT optimizer support \eC in a UTF mode. The
former gives a match-time error; the latter fails to optimize and so the match
is always run using the interpreter.
.P
In general, the \eC escape sequence is best avoided. However, one way of using
it that avoids the problem of malformed UTF characters is to use a lookahead to
@ -3386,6 +3392,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 01 September 2015
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2SYNTAX 3 "17 July 2015" "PCRE2 10.21"
.TH PCRE2SYNTAX 3 "16 October 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
@ -81,9 +81,10 @@ it matches a literal "u".
\eW a "non-word" character
\eX a Unicode extended grapheme cluster
.sp
The application can lock out the use of \eC by setting the
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave the
current matching point in the middle of a UTF-8 or UTF-16 character.
\eC is dangerous because it may leave the current matching point in the middle
of a UTF-8 or UTF-16 character. The application can lock out the use of \eC by
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
with the use of \eC permanently disabled.
.P
By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode
or in the 16-bit and 32-bit libraries. However, if locale-specific matching is
@ -576,6 +577,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 17 July 2015
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2TEST 1 "23 September 2015" "PCRE 10.21"
.TH PCRE2TEST 1 "17 October 2015" "PCRE2 10.21"
.SH NAME
pcre2test - a program for testing Perl-compatible regular expressions.
.SH SYNOPSIS
@ -122,6 +122,7 @@ following options output the value and set the exit code as indicated:
The following options output 1 for true or 0 for false, and set the exit code
to the same value:
.sp
backslash-C \eC is supported (not locked out)
ebcdic compiled for an EBCDIC environment
jit just-in-time support is available
pcre2-16 the 16-bit library was built
@ -1559,6 +1560,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 23 September 2015
Last updated: 17 October 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi

View File

@ -119,6 +119,7 @@ COMMAND LINE OPTIONS
The following options output 1 for true or 0 for false, and
set the exit code to the same value:
backslash-C \C is supported (not locked out)
ebcdic compiled for an EBCDIC environment
jit just-in-time support is available
pcre2-16 the 16-bit library was built
@ -457,7 +458,7 @@ PATTERN MODIFIERS
Setting compilation options
The following modifiers set options for pcre2_compile(). The most com-
mon ones have single-letter abbreviations. See pcreapi for a descrip-
mon ones have single-letter abbreviations. See pcre2api for a descrip-
tion of their effects.
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
@ -484,6 +485,7 @@ PATTERN MODIFIERS
no_utf_check set PCRE2_NO_UTF_CHECK
ucp set PCRE2_UCP
ungreedy set PCRE2_UNGREEDY
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
utf set PCRE2_UTF
As well as turning on the PCRE2_UTF option, the utf modifier causes all
@ -509,6 +511,7 @@ PATTERN MODIFIERS
locale=<name> use this locale
memory show memory used
newline=<type> set newline type
null_context compile with a NULL context
parens_nest_limit=<n> set maximum parentheses depth
posix use the POSIX API
push push compiled pattern onto the stack
@ -579,6 +582,13 @@ PATTERN MODIFIERS
mation that is requested. For each callout, either its number or string
is given, followed by the item that follows it in the pattern.
Passing a NULL context
Normally, pcre2test passes a context block to pcre2_compile(). If the
null_context modifier is set, however, NULL is passed. This is for
testing that pcre2_compile() behaves correctly in this case (it uses
default values).
Specifying a pattern in hex
The hex modifier specifies that the characters of the pattern are to be
@ -823,9 +833,11 @@ SUBJECT MODIFIERS
/g global global matching
jitstack=<n> set size of JIT stack
mark show mark values
match_limit=>n> set a match limit
match_limit=<n> set a match limit
memory show memory usage
null_context match with a NULL context
offset=<n> set starting offset
offset_limit=<n> set offset limit
ovector=<n> set size of output vector
recursion_limit=<n> set a recursion limit
replace=<string> specify a replacement string
@ -1046,6 +1058,14 @@ SUBJECT MODIFIERS
The offset modifier sets an offset in the subject string at which
matching starts. Its value is a number of code units, not characters.
Setting an offset limit
The offset_limit modifier sets a limit for unanchored matches. If a
match cannot be found starting at or before this offset in the subject,
a "no match" return is given. The data value is a number of code units,
not characters. When this modifier is used, the use_offset_limit modi-
fier must have been set for the pattern; if not, an error is generated.
Setting the size of the output vector
The ovector modifier applies only to the subject line in which it
@ -1073,6 +1093,15 @@ SUBJECT MODIFIERS
When testing pcre2_substitute(), this modifier also has the effect of
passing the replacement string as zero-terminated.
Passing a NULL context
Normally, pcre2test passes a context block to pcre2_match(),
pcre2_dfa_match() or pcre2_jit_match(). If the null_context modifier is
set, however, NULL is passed. This is for testing that the matching
functions behave correctly in this case (they use default values). This
modifier cannot be used with the find_limits modifier or when testing
the substitution function.
THE ALTERNATIVE MATCHING FUNCTION
@ -1398,5 +1427,5 @@ AUTHOR
REVISION
Last updated: 14 September 2015
Last updated: 17 October 2015
Copyright (c) 1997-2015 University of Cambridge.

View File

@ -1,4 +1,4 @@
.TH PCRE2UNICODE 3 "18 August 2015" "PCRE2 10.21"
.TH PCRE2UNICODE 3 "16 October 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "UNICODE AND UTF SUPPORT"
@ -63,11 +63,12 @@ characters (see the description of \eC in the
.\" HREF
\fBpcre2pattern\fP
.\"
documentation). The use of \eC is not supported in the alternative matching
function \fBpcre2_dfa_match()\fP, nor is it supported in UTF mode by the JIT
optimization. If JIT optimization is requested for a UTF pattern that contains
\eC, it will not succeed, and so the matching will be carried out by the normal
interpretive function.
documentation). The use of \eC is not supported by the alternative matching
function \fBpcre2_dfa_match()\fP when in UTF mode. Its use provokes a
match-time error. The JIT optimization also does not support \eC in UTF mode.
If JIT optimization is requested for a UTF pattern that contains \eC, it will
not succeed, and so the matching will be carried out by the normal interpretive
function.
.P
The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly test
characters of any code value, but, by default, the characters that PCRE2
@ -262,6 +263,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 18 August 2015
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi

View File

@ -182,6 +182,9 @@ sure both macros are undefined; an emulation function will then be used. */
#define MAX_NAME_SIZE 32
#endif
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
/* #undef NEVER_BACKSLASH_C */
/* The value of NEWLINE_DEFAULT determines the default newline character
sequence. PCRE2 client programs can override this by selecting other values
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5

View File

@ -169,6 +169,9 @@ sure both macros are undefined; an emulation function will then be used. */
overflow caused by enormously large patterns. */
#undef MAX_NAME_SIZE
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
#undef NEVER_BACKSLASH_C
/* The value of NEWLINE_DEFAULT determines the default newline character
sequence. PCRE2 client programs can override this by selecting other values
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5

View File

@ -583,7 +583,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
ERR81, ERR82, ERR83, ERR84 };
ERR81, ERR82, ERR83, ERR84, ERR85 };
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@ -7053,11 +7053,19 @@ for (;; ptr++)
/* The use of \C can be locked out. */
#ifdef NEVER_BACKSLASH_C
else if (escape == ESC_C)
{
*errorcodeptr = ERR85;
goto FAILED;
}
#else
else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0)
{
*errorcodeptr = ERR83;
goto FAILED;
}
#endif
/* For the rest (including \X when Unicode properties are supported), we
can obtain the OP value by negating the escape value in the default

View File

@ -168,6 +168,8 @@ static const char compile_error_texts[] =
"unrecognized string delimiter follows (?C\0"
"using \\C is disabled by the application\0"
"(?| and/or (?J: or (?x: parentheses are too deeply nested\0"
/* 85 */
"using \\C is disabled in this PCRE2 library\0"
;
/* Match-time and UTF error texts are in the same format. */

View File

@ -106,7 +106,7 @@ static const int eint1[] = {
static const int eint2[] = {
30, REG_ECTYPE, /* unknown POSIX class name */
32, REG_INVARG, /* this version of PCRE does not have UTF or UCP support */
32, REG_INVARG, /* this version of PCRE2 does not have Unicode support */
37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */
56, REG_INVARG, /* internal error: unknown newline setting */
};

View File

@ -667,6 +667,12 @@ table itself easier to read. */
#define EBCDIC_NL 0
#endif
#ifdef NEVER_BACKSLASH_C
#define BACKSLASH_C 0
#else
#define BACKSLASH_C 1
#endif
typedef struct coptstruct {
const char *name;
uint32_t type;
@ -681,6 +687,7 @@ enum { CONF_BSR,
};
static coptstruct coptlist[] = {
{ "backslash-C", CONF_FIX, BACKSLASH_C },
{ "bsr", CONF_BSR, PCRE2_CONFIG_BSR },
{ "ebcdic", CONF_FIX, SUPPORT_EBCDIC },
{ "ebcdic-nl", CONF_FIZ, EBCDIC_NL },
@ -6467,6 +6474,7 @@ printf(" -b set default pattern control 'fullbincode'\n");
printf(" -C show PCRE2 compile-time options and exit\n");
printf(" -C arg show a specific compile-time option and exit with its\n");
printf(" value if numeric (else 0). The arg can be:\n");
printf(" backslash-C use of \\C is enabled [0, 1]\n");
printf(" bsr \\R type [ANYCRLF, ANY]\n");
printf(" ebcdic compiled for EBCDIC character code [0,1]\n");
printf(" ebcdic-nl NL code if compiled for EBCDIC\n");
@ -6618,6 +6626,11 @@ print_newline_config(optval, FALSE);
(void)PCRE2_CONFIG(PCRE2_CONFIG_BSR, &optval);
printf(" \\R matches %s\n", optval? "CR, LF, or CRLF only" :
"all Unicode newlines");
#ifdef NEVER_BACKSLASH_C
printf(" \\C is not supported\n");
#else
printf(" \\C is supported\n");
#endif
(void)PCRE2_CONFIG(PCRE2_CONFIG_LINKSIZE, &optval);
printf(" Internal link size = %d\n", optval);
(void)PCRE2_CONFIG(PCRE2_CONFIG_PARENSLIMIT, &optval);

55
testdata/testinput10 vendored
View File

@ -1,46 +1,6 @@
# This set of tests is for UTF-8 support and Unicode property support, with
# relevance only for the 8-bit library.
/X(\C{3})/utf
X\x{1234}
/X(\C{4})/utf
X\x{1234}YZ
/X\C*/utf
XYZabcdce
/X\C*?/utf
XYZabcde
/X\C{3,5}/utf
Xabcdefg
X\x{1234}
X\x{1234}YZ
X\x{1234}\x{512}
X\x{1234}\x{512}YZ
/X\C{3,5}?/utf
Xabcdefg
X\x{1234}
X\x{1234}YZ
X\x{1234}\x{512}
/a\Cb/utf
aXb
a\nb
/a\C\Cb/utf
a\x{100}b
/ab\Cde/utf
abXde
/a\C\Cb/utf
a\x{100}b
\= Expect no match
a\x{12257}b
# The next 3 patterns have UTF-8 errors
/[Ã]/utf
@ -212,21 +172,6 @@
/\x{212ab}/IB,utf
# This one is here not because it's different to Perl, but because the way
# the captured single-byte is displayed. (In Perl it becomes a character, and you
# can't tell the difference.)
/X(\C)(.*)/utf
X\x{1234}
X\nabc
# This one is here because Perl gives out a grumbly error message (quite
# correctly, but that messes up comparisons).
/a\Cb/utf
\= Expect no match
a\x{100}b
/[^ab\xC0-\xF0]/IB,utf
\x{f1}
\x{bf}

View File

@ -6,10 +6,6 @@
#forbid_utf
#newline_default LF ANY ANYCRLF
/a\Cb/
aXb
a\nb
/[^\x{c4}]/IB
/\x{100}/I
@ -344,7 +340,7 @@
# Non-UTF characters
/\C{2,3}/
/.{2,3}/
\x{400000}\x{400001}\x{400002}\x{400003}
/\x{400000}\x{800000}/IBi

59
testdata/testinput12 vendored
View File

@ -7,49 +7,6 @@
/abc/utf
Ã]
/X(\C{3})/utf
X\x{11234}Y
X\x{11234}YZ
/X(\C{4})/utf
X\x{11234}YZ
X\x{11234}YZW
/X\C*/utf
XYZabcdce
/X\C*?/utf
XYZabcde
/X\C{3,5}/utf
Xabcdefg
X\x{11234}Y
X\x{11234}YZ
X\x{11234}\x{512}
X\x{11234}\x{512}YZ
X\x{11234}\x{512}\x{11234}Z
/X\C{3,5}?/utf
Xabcdefg
X\x{11234}Y
X\x{11234}YZ
X\x{11234}\x{512}YZ
\= Expect no match
X\x{11234}
/a\Cb/utf
aXb
a\nb
/a\C\Cb/utf
a\x{12257}b
\= Expect no match
a\x{12257}\x{11234}b
a\x{100}b
/ab\Cde/utf
abXde
# Check maximum character size
/\x{ffff}/IB,utf
@ -90,16 +47,6 @@
/\x{212ab}/IB,utf
# These two \C tests, copied from the UTF-8 input file, do not have any
# problems in 16 or 32 bits.
/X(\C)(.*)/utf
X\x{1234}
X\nabc
/a\Cb/utf
a\x{100}b
/[^ab\xC0-\xF0]/IB,utf
\x{f1}
\x{bf}
@ -336,9 +283,6 @@
/\o{4200000}/utf
/\C/utf
\x{110000}
/\x{100}*A/IB,utf
A
@ -396,4 +340,7 @@
/\x{3a3}B/IBi,utf
/./utf
\x{110000}
# End of testinput12

45
testdata/testinput2 vendored
View File

@ -3739,41 +3739,40 @@
/[bcd]*a/B
# A complete set of tests for auto-possessification of character types.
# A complete set of tests for auto-possessification of character types, but
# omitting \C because it might be disabled (it has its own tests).
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\C \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\C \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\C \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\C \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\C \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\C \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\C \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\C \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\C \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\C \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\C \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\C \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\C a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\C \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
/\D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \C+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
/ \D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
/(?=a+)a(a+)++a/B
@ -4327,8 +4326,6 @@
/((?2){73}(?2))((?1))/info
/ab\Cde/never_backslash_c
/abc/
\= Expect no match
\[9x!xxx(]{9999}
@ -4446,12 +4443,6 @@
/\x0{ab}/
\0{ab}
/ab\Cde/
abXde
/(?<=ab\Cde)X/
abZdeX
/^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/
ababababbbabZXXXX

16
testdata/testinput21 vendored Normal file
View File

@ -0,0 +1,16 @@
# These are tests of \C that do not involve UTF. They are not run when \C is
# disabled by compiling with --enable-never-backslash-C.
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
/\D+\C \d+\C \S+\C \s+\C \W+\C \w+\C .+\C \R+\C \H+\C \h+\C \V+\C \v+\C a+\C \n+\C \C+\C/Bx
/ab\Cde/never_backslash_c
/ab\Cde/
abXde
/(?<=ab\Cde)X/
abZdeX
# End of testinput21

95
testdata/testinput22 vendored Normal file
View File

@ -0,0 +1,95 @@
# Tests of \C when Unicode support is available. Note that \C is not supported
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
# in some widths and not in others.
/ab\Cde/utf
abXde
# This should produce an error diagnostic (\C in UTF lookbehind)
/(?<=ab\Cde)X/utf
# Autopossessification tests
/\C+\X \X+\C/Bx
/\C+\X \X+\C/Bx,utf
/\C\X*TӅ;
{0,6}\v+ F
/utf
\= Expect no match
Ӆ\x0a
/\C(\W?ſ)'?{{/utf
\= Expect no match
\\C(\\W?ſ)'?{{
/X(\C{3})/utf
X\x{1234}
X\x{11234}Y
X\x{11234}YZ
/X(\C{4})/utf
X\x{1234}YZ
X\x{11234}YZ
X\x{11234}YZW
/X\C*/utf
XYZabcdce
/X\C*?/utf
XYZabcde
/X\C{3,5}/utf
Xabcdefg
X\x{1234}
X\x{1234}YZ
X\x{1234}\x{512}
X\x{1234}\x{512}YZ
X\x{11234}Y
X\x{11234}YZ
X\x{11234}\x{512}
X\x{11234}\x{512}YZ
X\x{11234}\x{512}\x{11234}Z
/X\C{3,5}?/utf
Xabcdefg
X\x{1234}
X\x{1234}YZ
X\x{1234}\x{512}
X\x{11234}Y
X\x{11234}YZ
X\x{11234}\x{512}YZ
X\x{11234}
/a\Cb/utf
aXb
a\nb
a\x{100}b
/a\C\Cb/utf
a\x{100}b
a\x{12257}b
a\x{12257}\x{11234}b
/ab\Cde/utf
abXde
# This one is here not because it's different to Perl, but because the way
# the captured single code unit is displayed. (In Perl it becomes a character,
# and you can't tell the difference.)
/X(\C)(.*)/utf
X\x{1234}
X\nabc
# This one is here because Perl gives out a grumbly error message (quite
# correctly, but that messes up comparisons).
/a\Cb/utf
\= Expect no match in 8-bit mode
a\x{100}b
# End of testinput22

7
testdata/testinput23 vendored Normal file
View File

@ -0,0 +1,7 @@
# This test is run when PCRE2 has been built with --enable-never-backslash-C,
# which disables the use of \C. All we can do is check that it gives the
# correct error message.
/a\Cb/
# End of testinput23

19
testdata/testinput5 vendored
View File

@ -111,9 +111,6 @@
/.{3,5}?/IB,utf
\x{212ab}\x{212ab}\x{212ab}\x{861}
/(?<=\C)X/utf
Should produce an error diagnostic
/^[ab]/IB,utf
bar
\= Expect no match
@ -1367,8 +1364,6 @@
\= Expect no match
aAz
/(?<=ab\Cde)X/utf
/\X/
a\=ps
a\=ph
@ -1617,13 +1612,13 @@
/[\p{L}ab]{2,3}+/B,no_auto_possess
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \C+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
/.+\X/Bsx
/\X+$/Bmx
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\C \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
/\d+\s{0,5}=\s*\S?=\w{0,4}\W*/B,utf,ucp
@ -1665,16 +1660,6 @@
"(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
/\C\X*TӅ;
{0,6}\v+ F
/utf
\= Expect no match
Ӆ\x0a
/\C(\W?ſ)'?{{/utf
\= Expect no match
\\C(\\W?ſ)'?{{
/[\pS#moq]/
=

6
testdata/testinput6 vendored
View File

@ -4645,12 +4645,6 @@
aaaa\=ovector=3
aaaa\=ovector=4
/ab\Cde/
abXde
/(?<=ab\Cde)X/
abZdeX
/^\R/
\r\=ps
\r\=ph

5
testdata/testinput7 vendored
View File

@ -671,11 +671,6 @@
the cat\=ps
the cat\=ph
/ab\Cde/utf
abXde
/(?<=ab\Cde)X/utf
/./newline=crlf,utf
\r\=ps
\r\=ph

6
testdata/testinput9 vendored
View File

@ -4,10 +4,8 @@
#forbid_utf
#newline_default lf any anycrlf
/a\Cb/
aXb
a\nb
\= Expect no match and error message (too big char)
/ab/
\= Expect error message (too big char) and no match
A\x{123}B
A\o{443}B

83
testdata/testoutput10 vendored
View File

@ -1,67 +1,6 @@
# This set of tests is for UTF-8 support and Unicode property support, with
# relevance only for the 8-bit library.
/X(\C{3})/utf
X\x{1234}
0: X\x{1234}
1: \x{1234}
/X(\C{4})/utf
X\x{1234}YZ
0: X\x{1234}Y
1: \x{1234}Y
/X\C*/utf
XYZabcdce
0: XYZabcdce
/X\C*?/utf
XYZabcde
0: X
/X\C{3,5}/utf
Xabcdefg
0: Xabcde
X\x{1234}
0: X\x{1234}
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
0: X\x{1234}\x{512}
X\x{1234}\x{512}YZ
0: X\x{1234}\x{512}
/X\C{3,5}?/utf
Xabcdefg
0: Xabc
X\x{1234}
0: X\x{1234}
X\x{1234}YZ
0: X\x{1234}
X\x{1234}\x{512}
0: X\x{1234}
/a\Cb/utf
aXb
0: aXb
a\nb
0: a\x{0a}b
/a\C\Cb/utf
a\x{100}b
0: a\x{100}b
/ab\Cde/utf
abXde
0: abXde
/a\C\Cb/utf
a\x{100}b
0: a\x{100}b
\= Expect no match
a\x{12257}b
No match
# The next 3 patterns have UTF-8 errors
/[Ã]/utf
@ -511,28 +450,6 @@ First code unit = \xf0
Last code unit = \xab
Subject length lower bound = 1
# This one is here not because it's different to Perl, but because the way
# the captured single-byte is displayed. (In Perl it becomes a character, and you
# can't tell the difference.)
/X(\C)(.*)/utf
X\x{1234}
0: X\x{1234}
1: \x{e1}
2: \x{88}\x{b4}
X\nabc
0: X\x{0a}abc
1: \x{0a}
2: abc
# This one is here because Perl gives out a grumbly error message (quite
# correctly, but that messes up comparisons).
/a\Cb/utf
\= Expect no match
a\x{100}b
No match
/[^ab\xC0-\xF0]/IB,utf
------------------------------------------------------------------
Bra

View File

@ -6,12 +6,6 @@
#forbid_utf
#newline_default LF ANY ANYCRLF
/a\Cb/
aXb
0: aXb
a\nb
0: a\x0ab
/[^\x{c4}]/IB
------------------------------------------------------------------
Bra
@ -582,7 +576,7 @@ Failed: error 134 at offset 11: character code point value in \x{} or \o{} is to
# Non-UTF characters
/\C{2,3}/
/.{2,3}/
\x{400000}\x{400001}\x{400002}\x{400003}
** Character \x{400000} is greater than 0xffff and UTF-16 mode is not enabled.
** Truncation will probably give the wrong result.

View File

@ -6,12 +6,6 @@
#forbid_utf
#newline_default LF ANY ANYCRLF
/a\Cb/
aXb
0: aXb
a\nb
0: a\x0ab
/[^\x{c4}]/IB
------------------------------------------------------------------
Bra
@ -583,7 +577,7 @@ Subject length lower bound = 2
# Non-UTF characters
/\C{2,3}/
/.{2,3}/
\x{400000}\x{400001}\x{400002}\x{400003}
0: \x{400000}\x{400001}\x{400002}

View File

@ -9,76 +9,6 @@
Ã]
** Failed: invalid UTF-8 string cannot be used as input in UTF mode
/X(\C{3})/utf
X\x{11234}Y
0: X\x{11234}Y
1: \x{11234}Y
X\x{11234}YZ
0: X\x{11234}Y
1: \x{11234}Y
/X(\C{4})/utf
X\x{11234}YZ
0: X\x{11234}YZ
1: \x{11234}YZ
X\x{11234}YZW
0: X\x{11234}YZ
1: \x{11234}YZ
/X\C*/utf
XYZabcdce
0: XYZabcdce
/X\C*?/utf
XYZabcde
0: X
/X\C{3,5}/utf
Xabcdefg
0: Xabcde
X\x{11234}Y
0: X\x{11234}Y
X\x{11234}YZ
0: X\x{11234}YZ
X\x{11234}\x{512}
0: X\x{11234}\x{512}
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}YZ
X\x{11234}\x{512}\x{11234}Z
0: X\x{11234}\x{512}\x{11234}
/X\C{3,5}?/utf
Xabcdefg
0: Xabc
X\x{11234}Y
0: X\x{11234}Y
X\x{11234}YZ
0: X\x{11234}Y
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}
\= Expect no match
X\x{11234}
No match
/a\Cb/utf
aXb
0: aXb
a\nb
0: a\x{0a}b
/a\C\Cb/utf
a\x{12257}b
0: a\x{12257}b
\= Expect no match
a\x{12257}\x{11234}b
No match
a\x{100}b
No match
/ab\Cde/utf
abXde
0: abXde
# Check maximum character size
/\x{ffff}/IB,utf
@ -308,23 +238,6 @@ First code unit = \x{d844}
Last code unit = \x{deab}
Subject length lower bound = 1
# These two \C tests, copied from the UTF-8 input file, do not have any
# problems in 16 or 32 bits.
/X(\C)(.*)/utf
X\x{1234}
0: X\x{1234}
1: \x{1234}
2:
X\nabc
0: X\x{0a}abc
1: \x{0a}
2: abc
/a\Cb/utf
a\x{100}b
0: a\x{100}b
/[^ab\xC0-\xF0]/IB,utf
------------------------------------------------------------------
Bra
@ -1127,10 +1040,6 @@ Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too
/\o{4200000}/utf
Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
/\C/utf
\x{110000}
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
/\x{100}*A/IB,utf
------------------------------------------------------------------
Bra
@ -1454,4 +1363,8 @@ Starting code units: \xff
Last code unit = 'B' (caseless)
Subject length lower bound = 2
/./utf
\x{110000}
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
# End of testinput12

View File

@ -9,74 +9,6 @@
Ã]
** Failed: invalid UTF-8 string cannot be used as input in UTF mode
/X(\C{3})/utf
X\x{11234}Y
No match
X\x{11234}YZ
0: X\x{11234}YZ
1: \x{11234}YZ
/X(\C{4})/utf
X\x{11234}YZ
No match
X\x{11234}YZW
0: X\x{11234}YZW
1: \x{11234}YZW
/X\C*/utf
XYZabcdce
0: XYZabcdce
/X\C*?/utf
XYZabcde
0: X
/X\C{3,5}/utf
Xabcdefg
0: Xabcde
X\x{11234}Y
No match
X\x{11234}YZ
0: X\x{11234}YZ
X\x{11234}\x{512}
No match
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}YZ
X\x{11234}\x{512}\x{11234}Z
0: X\x{11234}\x{512}\x{11234}Z
/X\C{3,5}?/utf
Xabcdefg
0: Xabc
X\x{11234}Y
No match
X\x{11234}YZ
0: X\x{11234}YZ
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}Y
\= Expect no match
X\x{11234}
No match
/a\Cb/utf
aXb
0: aXb
a\nb
0: a\x{0a}b
/a\C\Cb/utf
a\x{12257}b
No match
\= Expect no match
a\x{12257}\x{11234}b
0: a\x{12257}\x{11234}b
a\x{100}b
No match
/ab\Cde/utf
abXde
0: abXde
# Check maximum character size
/\x{ffff}/IB,utf
@ -301,23 +233,6 @@ Options: utf
First code unit = \x{212ab}
Subject length lower bound = 1
# These two \C tests, copied from the UTF-8 input file, do not have any
# problems in 16 or 32 bits.
/X(\C)(.*)/utf
X\x{1234}
0: X\x{1234}
1: \x{1234}
2:
X\nabc
0: X\x{0a}abc
1: \x{0a}
2: abc
/a\Cb/utf
a\x{100}b
0: a\x{100}b
/[^ab\xC0-\xF0]/IB,utf
------------------------------------------------------------------
Bra
@ -1119,10 +1034,6 @@ Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too
/\o{4200000}/utf
Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
/\C/utf
\x{110000}
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 0
/\x{100}*A/IB,utf
------------------------------------------------------------------
Bra
@ -1446,4 +1357,8 @@ Starting code units: \xff
Last code unit = 'B' (caseless)
Subject length lower bound = 2
/./utf
\x{110000}
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 0
# End of testinput12

693
testdata/testoutput2 vendored
View File

@ -11948,9 +11948,10 @@ Subject length lower bound = 5
End
------------------------------------------------------------------
# A complete set of tests for auto-possessification of character types.
# A complete set of tests for auto-possessification of character types, but
# omitting \C because it might be disabled (it has its own tests).
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\C \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
------------------------------------------------------------------
Bra
\D+
@ -11968,8 +11969,6 @@ Subject length lower bound = 5
\D+
Any
\D+
AllAny
\D+
\R
\D+
\H
@ -11989,7 +11988,7 @@ Subject length lower bound = 5
End
------------------------------------------------------------------
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\C \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
------------------------------------------------------------------
Bra
\d++
@ -12006,8 +12005,6 @@ Subject length lower bound = 5
\w
\d+
Any
\d+
AllAny
\d++
\R
\d+
@ -12028,7 +12025,7 @@ Subject length lower bound = 5
End
------------------------------------------------------------------
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\C \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
------------------------------------------------------------------
Bra
\S+
@ -12045,8 +12042,6 @@ Subject length lower bound = 5
\w
\S+
Any
\S+
AllAny
\S++
\R
\S+
@ -12067,7 +12062,7 @@ Subject length lower bound = 5
End
------------------------------------------------------------------
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\C \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
------------------------------------------------------------------
Bra
\s+
@ -12085,8 +12080,6 @@ Subject length lower bound = 5
\s+
Any
\s+
AllAny
\s+
\R
\s+
\H
@ -12106,7 +12099,7 @@ Subject length lower bound = 5
End
------------------------------------------------------------------
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\C \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
------------------------------------------------------------------
Bra
\W+
@ -12124,8 +12117,6 @@ Subject length lower bound = 5
\W+
Any
\W+
AllAny
\W+
\R
\W+
\H
@ -12145,7 +12136,7 @@ Subject length lower bound = 5
End
------------------------------------------------------------------
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\C \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
------------------------------------------------------------------
Bra
\w+
@ -12162,8 +12153,6 @@ Subject length lower bound = 5
\w
\w+
Any
\w+
AllAny
\w++
\R
\w+
@ -12184,7 +12173,303 @@ Subject length lower bound = 5
End
------------------------------------------------------------------
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\C \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
------------------------------------------------------------------
Bra
\R+
\D
\R++
\d
\R+
\S
\R++
\s
\R+
\W
\R++
\w
\R++
Any
\R+
\R
\R+
\H
\R++
\h
\R+
\V
\R+
\v
\R+
\Z
\R++
\z
\R+
$
Ket
End
------------------------------------------------------------------
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
------------------------------------------------------------------
Bra
\H+
\D
\H+
\d
\H+
\S
\H+
\s
\H+
\W
\H+
\w
\H+
Any
\H+
\R
\H+
\H
\H++
\h
\H+
\V
\H+
\v
\H+
\Z
\H++
\z
\H+
$
Ket
End
------------------------------------------------------------------
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
------------------------------------------------------------------
Bra
\h+
\D
\h++
\d
\h++
\S
\h+
\s
\h+
\W
\h++
\w
\h+
Any
\h++
\R
\h++
\H
\h+
\h
\h+
\V
\h++
\v
\h+
\Z
\h++
\z
\h+
$
Ket
End
------------------------------------------------------------------
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
------------------------------------------------------------------
Bra
\V+
\D
\V+
\d
\V+
\S
\V+
\s
\V+
\W
\V+
\w
\V+
Any
\V++
\R
\V+
\H
\V+
\h
\V+
\V
\V++
\v
\V+
\Z
\V++
\z
\V+
$
Ket
End
------------------------------------------------------------------
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
------------------------------------------------------------------
Bra
\v+
\D
\v++
\d
\v++
\S
\v+
\s
\v+
\W
\v++
\w
\v+
Any
\v+
\R
\v+
\H
\v++
\h
\v++
\V
\v+
\v
\v+
\Z
\v++
\z
\v+
$
Ket
End
------------------------------------------------------------------
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
------------------------------------------------------------------
Bra
a+
\D
a++
\d
a+
\S
a++
\s
a++
\W
a+
\w
a+
Any
a++
\R
a+
\H
a++
\h
a+
\V
a++
\v
a++
\Z
a++
\z
a++
$
Ket
End
------------------------------------------------------------------
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
------------------------------------------------------------------
Bra
\x0a+
\D
\x0a++
\d
\x0a++
\S
\x0a+
\s
\x0a+
\W
\x0a++
\w
\x0a+
Any
\x0a+
\R
\x0a+
\H
\x0a++
\h
\x0a++
\V
\x0a+
\v
\x0a+
\Z
\x0a++
\z
\x0a+
$
Ket
End
------------------------------------------------------------------
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
------------------------------------------------------------------
Bra
Any+
\D
Any+
\d
Any+
\S
Any+
\s
Any+
\W
Any+
\w
Any+
Any
Any++
\R
Any+
\H
Any+
\h
Any+
\V
Any+
\v
Any+
\Z
Any++
\z
Any+
$
Ket
End
------------------------------------------------------------------
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
------------------------------------------------------------------
Bra
AllAny+
@ -12200,8 +12485,6 @@ Subject length lower bound = 5
AllAny+
\w
AllAny+
Any
AllAny+
AllAny
AllAny+
\R
@ -12223,358 +12506,7 @@ Subject length lower bound = 5
End
------------------------------------------------------------------
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\C \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
------------------------------------------------------------------
Bra
\R+
\D
\R++
\d
\R+
\S
\R++
\s
\R+
\W
\R++
\w
\R++
Any
\R+
AllAny
\R+
\R
\R+
\H
\R++
\h
\R+
\V
\R+
\v
\R+
\Z
\R++
\z
\R+
$
Ket
End
------------------------------------------------------------------
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\C \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
------------------------------------------------------------------
Bra
\H+
\D
\H+
\d
\H+
\S
\H+
\s
\H+
\W
\H+
\w
\H+
Any
\H+
AllAny
\H+
\R
\H+
\H
\H++
\h
\H+
\V
\H+
\v
\H+
\Z
\H++
\z
\H+
$
Ket
End
------------------------------------------------------------------
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\C \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
------------------------------------------------------------------
Bra
\h+
\D
\h++
\d
\h++
\S
\h+
\s
\h+
\W
\h++
\w
\h+
Any
\h+
AllAny
\h++
\R
\h++
\H
\h+
\h
\h+
\V
\h++
\v
\h+
\Z
\h++
\z
\h+
$
Ket
End
------------------------------------------------------------------
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\C \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
------------------------------------------------------------------
Bra
\V+
\D
\V+
\d
\V+
\S
\V+
\s
\V+
\W
\V+
\w
\V+
Any
\V+
AllAny
\V++
\R
\V+
\H
\V+
\h
\V+
\V
\V++
\v
\V+
\Z
\V++
\z
\V+
$
Ket
End
------------------------------------------------------------------
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\C \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
------------------------------------------------------------------
Bra
\v+
\D
\v++
\d
\v++
\S
\v+
\s
\v+
\W
\v++
\w
\v+
Any
\v+
AllAny
\v+
\R
\v+
\H
\v++
\h
\v++
\V
\v+
\v
\v+
\Z
\v++
\z
\v+
$
Ket
End
------------------------------------------------------------------
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\C a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
------------------------------------------------------------------
Bra
a+
\D
a++
\d
a+
\S
a++
\s
a++
\W
a+
\w
a+
Any
a+
AllAny
a++
\R
a+
\H
a++
\h
a+
\V
a++
\v
a++
\Z
a++
\z
a++
$
Ket
End
------------------------------------------------------------------
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\C \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
------------------------------------------------------------------
Bra
\x0a+
\D
\x0a++
\d
\x0a++
\S
\x0a+
\s
\x0a+
\W
\x0a++
\w
\x0a+
Any
\x0a+
AllAny
\x0a+
\R
\x0a+
\H
\x0a++
\h
\x0a++
\V
\x0a+
\v
\x0a+
\Z
\x0a++
\z
\x0a+
$
Ket
End
------------------------------------------------------------------
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
------------------------------------------------------------------
Bra
Any+
\D
Any+
\d
Any+
\S
Any+
\s
Any+
\W
Any+
\w
Any+
Any
Any+
AllAny
Any++
\R
Any+
\H
Any+
\h
Any+
\V
Any+
\v
Any+
\Z
Any++
\z
Any+
$
Ket
End
------------------------------------------------------------------
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
------------------------------------------------------------------
Bra
AllAny+
\D
AllAny+
\d
AllAny+
\S
AllAny+
\s
AllAny+
\W
AllAny+
\w
AllAny+
AllAny
AllAny+
AllAny
AllAny+
\R
AllAny+
\H
AllAny+
\h
AllAny+
\V
AllAny+
\v
AllAny+
\Z
AllAny++
\z
AllAny+
$
Ket
End
------------------------------------------------------------------
/\D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \C+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
/ \D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
------------------------------------------------------------------
Bra
\D+
@ -12588,8 +12520,6 @@ Subject length lower bound = 5
\W+
/m $
\w++
/m $
AllAny+
/m $
\R+
/m $
@ -14210,9 +14140,6 @@ Capturing subpattern count = 2
May match empty string
Subject length lower bound = 0
/ab\Cde/never_backslash_c
Failed: error 183 at offset 3: using \C is disabled by the application
/abc/
\= Expect no match
\[9x!xxx(]{9999}
@ -14532,14 +14459,6 @@ Subject length lower bound = 0
\0{ab}
0: \x00{ab}
/ab\Cde/
abXde
0: abXde
/(?<=ab\Cde)X/
abZdeX
0: X
/^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/
ababababbbabZXXXX
0: ababababbbabZ

89
testdata/testoutput21 vendored Normal file
View File

@ -0,0 +1,89 @@
# These are tests of \C that do not involve UTF. They are not run when \C is
# disabled by compiling with --enable-never-backslash-C.
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
------------------------------------------------------------------
Bra
AllAny+
\D
AllAny+
\d
AllAny+
\S
AllAny+
\s
AllAny+
\W
AllAny+
\w
AllAny+
Any
AllAny+
\R
AllAny+
\H
AllAny+
\h
AllAny+
\V
AllAny+
\v
AllAny+
\Z
AllAny++
\z
AllAny+
$
Ket
End
------------------------------------------------------------------
/\D+\C \d+\C \S+\C \s+\C \W+\C \w+\C .+\C \R+\C \H+\C \h+\C \V+\C \v+\C a+\C \n+\C \C+\C/Bx
------------------------------------------------------------------
Bra
\D+
AllAny
\d+
AllAny
\S+
AllAny
\s+
AllAny
\W+
AllAny
\w+
AllAny
Any+
AllAny
\R+
AllAny
\H+
AllAny
\h+
AllAny
\V+
AllAny
\v+
AllAny
a+
AllAny
\x0a+
AllAny
AllAny+
AllAny
Ket
End
------------------------------------------------------------------
/ab\Cde/never_backslash_c
Failed: error 183 at offset 3: using \C is disabled by the application
/ab\Cde/
abXde
0: abXde
/(?<=ab\Cde)X/
abZdeX
0: X
# End of testinput21

161
testdata/testoutput22-16 vendored Normal file
View File

@ -0,0 +1,161 @@
# Tests of \C when Unicode support is available. Note that \C is not supported
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
# in some widths and not in others.
/ab\Cde/utf
abXde
0: abXde
# This should produce an error diagnostic (\C in UTF lookbehind)
/(?<=ab\Cde)X/utf
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
# Autopossessification tests
/\C+\X \X+\C/Bx
------------------------------------------------------------------
Bra
AllAny+
extuni
extuni+
AllAny
Ket
End
------------------------------------------------------------------
/\C+\X \X+\C/Bx,utf
------------------------------------------------------------------
Bra
Anybyte+
extuni
extuni+
Anybyte
Ket
End
------------------------------------------------------------------
/\C\X*TӅ;
{0,6}\v+ F
/utf
\= Expect no match
Ӆ\x0a
No match
/\C(\W?ſ)'?{{/utf
\= Expect no match
\\C(\\W?ſ)'?{{
No match
/X(\C{3})/utf
X\x{1234}
No match
X\x{11234}Y
0: X\x{11234}Y
1: \x{11234}Y
X\x{11234}YZ
0: X\x{11234}Y
1: \x{11234}Y
/X(\C{4})/utf
X\x{1234}YZ
No match
X\x{11234}YZ
0: X\x{11234}YZ
1: \x{11234}YZ
X\x{11234}YZW
0: X\x{11234}YZ
1: \x{11234}YZ
/X\C*/utf
XYZabcdce
0: XYZabcdce
/X\C*?/utf
XYZabcde
0: X
/X\C{3,5}/utf
Xabcdefg
0: Xabcde
X\x{1234}
No match
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
No match
X\x{1234}\x{512}YZ
0: X\x{1234}\x{512}YZ
X\x{11234}Y
0: X\x{11234}Y
X\x{11234}YZ
0: X\x{11234}YZ
X\x{11234}\x{512}
0: X\x{11234}\x{512}
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}YZ
X\x{11234}\x{512}\x{11234}Z
0: X\x{11234}\x{512}\x{11234}
/X\C{3,5}?/utf
Xabcdefg
0: Xabc
X\x{1234}
No match
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
No match
X\x{11234}Y
0: X\x{11234}Y
X\x{11234}YZ
0: X\x{11234}Y
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}
X\x{11234}
No match
/a\Cb/utf
aXb
0: aXb
a\nb
0: a\x{0a}b
a\x{100}b
0: a\x{100}b
/a\C\Cb/utf
a\x{100}b
No match
a\x{12257}b
0: a\x{12257}b
a\x{12257}\x{11234}b
No match
/ab\Cde/utf
abXde
0: abXde
# This one is here not because it's different to Perl, but because the way
# the captured single code unit is displayed. (In Perl it becomes a character,
# and you can't tell the difference.)
/X(\C)(.*)/utf
X\x{1234}
0: X\x{1234}
1: \x{1234}
2:
X\nabc
0: X\x{0a}abc
1: \x{0a}
2: abc
# This one is here because Perl gives out a grumbly error message (quite
# correctly, but that messes up comparisons).
/a\Cb/utf
\= Expect no match in 8-bit mode
a\x{100}b
0: a\x{100}b
# End of testinput22

159
testdata/testoutput22-32 vendored Normal file
View File

@ -0,0 +1,159 @@
# Tests of \C when Unicode support is available. Note that \C is not supported
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
# in some widths and not in others.
/ab\Cde/utf
abXde
0: abXde
# This should produce an error diagnostic (\C in UTF lookbehind)
/(?<=ab\Cde)X/utf
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
# Autopossessification tests
/\C+\X \X+\C/Bx
------------------------------------------------------------------
Bra
AllAny+
extuni
extuni+
AllAny
Ket
End
------------------------------------------------------------------
/\C+\X \X+\C/Bx,utf
------------------------------------------------------------------
Bra
Anybyte+
extuni
extuni+
Anybyte
Ket
End
------------------------------------------------------------------
/\C\X*TӅ;
{0,6}\v+ F
/utf
\= Expect no match
Ӆ\x0a
No match
/\C(\W?ſ)'?{{/utf
\= Expect no match
\\C(\\W?ſ)'?{{
No match
/X(\C{3})/utf
X\x{1234}
No match
X\x{11234}Y
No match
X\x{11234}YZ
0: X\x{11234}YZ
1: \x{11234}YZ
/X(\C{4})/utf
X\x{1234}YZ
No match
X\x{11234}YZ
No match
X\x{11234}YZW
0: X\x{11234}YZW
1: \x{11234}YZW
/X\C*/utf
XYZabcdce
0: XYZabcdce
/X\C*?/utf
XYZabcde
0: X
/X\C{3,5}/utf
Xabcdefg
0: Xabcde
X\x{1234}
No match
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
No match
X\x{1234}\x{512}YZ
0: X\x{1234}\x{512}YZ
X\x{11234}Y
No match
X\x{11234}YZ
0: X\x{11234}YZ
X\x{11234}\x{512}
No match
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}YZ
X\x{11234}\x{512}\x{11234}Z
0: X\x{11234}\x{512}\x{11234}Z
/X\C{3,5}?/utf
Xabcdefg
0: Xabc
X\x{1234}
No match
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
No match
X\x{11234}Y
No match
X\x{11234}YZ
0: X\x{11234}YZ
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}Y
X\x{11234}
No match
/a\Cb/utf
aXb
0: aXb
a\nb
0: a\x{0a}b
a\x{100}b
0: a\x{100}b
/a\C\Cb/utf
a\x{100}b
No match
a\x{12257}b
No match
a\x{12257}\x{11234}b
0: a\x{12257}\x{11234}b
/ab\Cde/utf
abXde
0: abXde
# This one is here not because it's different to Perl, but because the way
# the captured single code unit is displayed. (In Perl it becomes a character,
# and you can't tell the difference.)
/X(\C)(.*)/utf
X\x{1234}
0: X\x{1234}
1: \x{1234}
2:
X\nabc
0: X\x{0a}abc
1: \x{0a}
2: abc
# This one is here because Perl gives out a grumbly error message (quite
# correctly, but that messes up comparisons).
/a\Cb/utf
\= Expect no match in 8-bit mode
a\x{100}b
0: a\x{100}b
# End of testinput22

163
testdata/testoutput22-8 vendored Normal file
View File

@ -0,0 +1,163 @@
# Tests of \C when Unicode support is available. Note that \C is not supported
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
# in some widths and not in others.
/ab\Cde/utf
abXde
0: abXde
# This should produce an error diagnostic (\C in UTF lookbehind)
/(?<=ab\Cde)X/utf
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
# Autopossessification tests
/\C+\X \X+\C/Bx
------------------------------------------------------------------
Bra
AllAny+
extuni
extuni+
AllAny
Ket
End
------------------------------------------------------------------
/\C+\X \X+\C/Bx,utf
------------------------------------------------------------------
Bra
Anybyte+
extuni
extuni+
Anybyte
Ket
End
------------------------------------------------------------------
/\C\X*TӅ;
{0,6}\v+ F
/utf
\= Expect no match
Ӆ\x0a
No match
/\C(\W?ſ)'?{{/utf
\= Expect no match
\\C(\\W?ſ)'?{{
No match
/X(\C{3})/utf
X\x{1234}
0: X\x{1234}
1: \x{1234}
X\x{11234}Y
0: X\x{f0}\x{91}\x{88}
1: \x{f0}\x{91}\x{88}
X\x{11234}YZ
0: X\x{f0}\x{91}\x{88}
1: \x{f0}\x{91}\x{88}
/X(\C{4})/utf
X\x{1234}YZ
0: X\x{1234}Y
1: \x{1234}Y
X\x{11234}YZ
0: X\x{11234}
1: \x{11234}
X\x{11234}YZW
0: X\x{11234}
1: \x{11234}
/X\C*/utf
XYZabcdce
0: XYZabcdce
/X\C*?/utf
XYZabcde
0: X
/X\C{3,5}/utf
Xabcdefg
0: Xabcde
X\x{1234}
0: X\x{1234}
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
0: X\x{1234}\x{512}
X\x{1234}\x{512}YZ
0: X\x{1234}\x{512}
X\x{11234}Y
0: X\x{11234}Y
X\x{11234}YZ
0: X\x{11234}Y
X\x{11234}\x{512}
0: X\x{11234}\x{d4}
X\x{11234}\x{512}YZ
0: X\x{11234}\x{d4}
X\x{11234}\x{512}\x{11234}Z
0: X\x{11234}\x{d4}
/X\C{3,5}?/utf
Xabcdefg
0: Xabc
X\x{1234}
0: X\x{1234}
X\x{1234}YZ
0: X\x{1234}
X\x{1234}\x{512}
0: X\x{1234}
X\x{11234}Y
0: X\x{f0}\x{91}\x{88}
X\x{11234}YZ
0: X\x{f0}\x{91}\x{88}
X\x{11234}\x{512}YZ
0: X\x{f0}\x{91}\x{88}
X\x{11234}
0: X\x{f0}\x{91}\x{88}
/a\Cb/utf
aXb
0: aXb
a\nb
0: a\x{0a}b
a\x{100}b
No match
/a\C\Cb/utf
a\x{100}b
0: a\x{100}b
a\x{12257}b
No match
a\x{12257}\x{11234}b
No match
/ab\Cde/utf
abXde
0: abXde
# This one is here not because it's different to Perl, but because the way
# the captured single code unit is displayed. (In Perl it becomes a character,
# and you can't tell the difference.)
/X(\C)(.*)/utf
X\x{1234}
0: X\x{1234}
1: \x{e1}
2: \x{88}\x{b4}
X\nabc
0: X\x{0a}abc
1: \x{0a}
2: abc
# This one is here because Perl gives out a grumbly error message (quite
# correctly, but that messes up comparisons).
/a\Cb/utf
\= Expect no match in 8-bit mode
a\x{100}b
No match
# End of testinput22

8
testdata/testoutput23 vendored Normal file
View File

@ -0,0 +1,8 @@
# This test is run when PCRE2 has been built with --enable-never-backslash-C,
# which disables the use of \C. All we can do is check that it gives the
# correct error message.
/a\Cb/
Failed: error 185 at offset 2: using \C is disabled in this PCRE2 library
# End of testinput23

27
testdata/testoutput5 vendored
View File

@ -181,10 +181,6 @@ Subject length lower bound = 3
\x{212ab}\x{212ab}\x{212ab}\x{861}
0: \x{212ab}\x{212ab}\x{212ab}
/(?<=\C)X/utf
Failed: error 136 at offset 6: \C is not allowed in a lookbehind assertion
Should produce an error diagnostic
/^[ab]/IB,utf
------------------------------------------------------------------
Bra
@ -2905,9 +2901,6 @@ No match
aAz
No match
/(?<=ab\Cde)X/utf
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
/\X/
a\=ps
0: a
@ -3803,7 +3796,7 @@ No match
End
------------------------------------------------------------------
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \C+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
------------------------------------------------------------------
Bra
\D+
@ -3818,8 +3811,6 @@ No match
extuni
\w+
extuni
AllAny+
extuni
\R+
extuni
\H+
@ -3858,7 +3849,7 @@ No match
End
------------------------------------------------------------------
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\C \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
------------------------------------------------------------------
Bra
extuni+
@ -3876,8 +3867,6 @@ No match
extuni+
Any
extuni+
AllAny
extuni+
\R
extuni+
\H
@ -4010,18 +3999,6 @@ Failed: error 122 at offset 1227: unmatched closing parenthesis
"(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
Failed: error 124 at offset 113: letter or underscore expected after (?< or (?'
/\C\X*TӅ;
{0,6}\v+ F
/utf
\= Expect no match
Ӆ\x0a
No match
/\C(\W?ſ)'?{{/utf
\= Expect no match
\\C(\\W?ſ)'?{{
No match
/[\pS#moq]/
=
0: =

View File

@ -7174,14 +7174,6 @@ Matched, but offsets vector is too small to show all matches
2: aa
3: a
/ab\Cde/
abXde
0: abXde
/(?<=ab\Cde)X/
abZdeX
0: X
/^\R/
\r\=ps
0: \x0d

View File

@ -1141,13 +1141,6 @@ Partial match: abcde
the cat\=ph
Partial match: the cat
/ab\Cde/utf
abXde
Failed: error -42: pattern contains an item that is not supported for DFA matching
/(?<=ab\Cde)X/utf
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
/./newline=crlf,utf
\r\=ps
0: \x{0d}

View File

@ -4,12 +4,8 @@
#forbid_utf
#newline_default lf any anycrlf
/a\Cb/
aXb
0: aXb
a\nb
0: a\x0ab
\= Expect no match and error message (too big char)
/ab/
\= Expect error message (too big char) and no match
A\x{123}B
** Character \x{123} is greater than 255 and UTF-8 mode is not enabled.
** Truncation will probably give the wrong result.