Implement --never-backslash-C
This commit is contained in:
parent
5923caf05e
commit
3263d44b97
|
@ -70,6 +70,7 @@
|
|||
# 2015-04-24 PH added support for PCRE2_DEBUG
|
||||
# 2015-07-16 PH updated for new pcre2_find_bracket source module
|
||||
# 2015-08-24 PH correct C_FLAGS setting (patch from Roy Ivy III)
|
||||
# 2015-10-16 PH added support for never-backslash-C
|
||||
|
||||
PROJECT(PCRE2 C)
|
||||
|
||||
|
@ -162,6 +163,9 @@ SET(PCRE2_SUPPORT_UNICODE ON CACHE BOOL
|
|||
SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
|
||||
"ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks")
|
||||
|
||||
SET(PCRE2_NEVER_BACKSLASH_C OFF CACHE BOOL
|
||||
"If ON, backslash-C (upper case C) is locked out.")
|
||||
|
||||
SET(PCRE2_SUPPORT_VALGRIND OFF CACHE BOOL
|
||||
"Enable Valgrind support.")
|
||||
|
||||
|
@ -252,6 +256,10 @@ IF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
|||
SET(BSR_ANYCRLF 1)
|
||||
ENDIF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
||||
|
||||
IF(PCRE2_NEVER_BACKSLASH_C)
|
||||
SET(NEVER_BACKSLASH_C 1)
|
||||
ENDIF(PCRE2_NEVER_BACKSLASH_C)
|
||||
|
||||
IF(PCRE2_SUPPORT_UNICODE)
|
||||
SET(SUPPORT_UNICODE 1)
|
||||
ENDIF(PCRE2_SUPPORT_UNICODE)
|
||||
|
@ -719,6 +727,7 @@ IF(PCRE2_SHOW_REPORT)
|
|||
MESSAGE(STATUS " Enable Unicode support .......... : ${PCRE2_SUPPORT_UNICODE}")
|
||||
MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE2_NEWLINE}")
|
||||
MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE2_SUPPORT_BSR_ANYCRLF}")
|
||||
MESSAGE(STATUS " \\C is disabled .................. : ${PCRE2_NEVER_BACKSLASH_C}")
|
||||
MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE2_EBCDIC}")
|
||||
MESSAGE(STATUS " EBCDIC coding with NL=0x25 ...... : ${PCRE2_EBCDIC_NL25}")
|
||||
MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE2_REBUILD_CHARTABLES}")
|
||||
|
|
|
@ -201,6 +201,8 @@ escape was being ignored.
|
|||
57. Fixed integer overflow for patterns whose minimum matching length is very,
|
||||
very large.
|
||||
|
||||
58. Implemented --never-backslash-C.
|
||||
|
||||
|
||||
Version 10.20 30-June-2015
|
||||
--------------------------
|
||||
|
|
9
README
9
README
|
@ -220,6 +220,13 @@ library. They are also documented in the pcre2build man page.
|
|||
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
||||
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
||||
|
||||
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
||||
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
||||
characters. You can build PCRE2 with the use of \C permanently locked out by
|
||||
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
||||
command. When \C is allowed by the library, individual applications can lock
|
||||
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
||||
|
||||
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
||||
pattern. This limits the amount of system stack that a pattern uses when it
|
||||
is compiled. The default is 250, but you can change it by setting, for
|
||||
|
@ -833,4 +840,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 16 July 2015
|
||||
Last updated: 16 October 2015
|
||||
|
|
64
RunTest
64
RunTest
|
@ -75,7 +75,10 @@ title17="Test 17: JIT-specific features when JIT is available"
|
|||
title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP"
|
||||
title19="Test 19: Tests of the POSIX interface with UTF/UCP"
|
||||
title20="Test 20: Serialization tests"
|
||||
maxtest=20
|
||||
title21="Test 21: \C tests without UTF (supported for DFA matching)"
|
||||
title22="Test 22: \C tests with UTF (not supported for DFA matching)"
|
||||
title23="Test 23: \C disabled test"
|
||||
maxtest=23
|
||||
|
||||
if [ $# -eq 1 -a "$1" = "list" ]; then
|
||||
echo $title0
|
||||
|
@ -99,6 +102,9 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
|
|||
echo $title18
|
||||
echo $title19
|
||||
echo $title20
|
||||
echo $title21
|
||||
echo $title22
|
||||
echo $title23
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
@ -223,6 +229,9 @@ do17=no
|
|||
do18=no
|
||||
do19=no
|
||||
do20=no
|
||||
do21=no
|
||||
do22=no
|
||||
do23=no
|
||||
|
||||
while [ $# -gt 0 ] ; do
|
||||
case $1 in
|
||||
|
@ -247,6 +256,9 @@ while [ $# -gt 0 ] ; do
|
|||
18) do18=yes;;
|
||||
19) do19=yes;;
|
||||
20) do20=yes;;
|
||||
21) do21=yes;;
|
||||
22) do22=yes;;
|
||||
23) do23=yes;;
|
||||
-8) arg8=yes;;
|
||||
-16) arg16=yes;;
|
||||
-32) arg32=yes;;
|
||||
|
@ -326,6 +338,11 @@ support16=$?
|
|||
$sim ./pcre2test -C pcre2-32 >/dev/null
|
||||
support32=$?
|
||||
|
||||
# \C may be disabled
|
||||
|
||||
$sim ./pcre2test -C backslash-C >/dev/null
|
||||
supportBSC=$?
|
||||
|
||||
# Initialize all bitsizes skipped
|
||||
|
||||
test8=skip
|
||||
|
@ -400,7 +417,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
$do8 = no -a $do9 = no -a $do10 = no -a $do11 = no -a \
|
||||
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
|
||||
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
|
||||
$do20 = no \
|
||||
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no \
|
||||
]; then
|
||||
do0=yes
|
||||
do1=yes
|
||||
|
@ -423,6 +440,9 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
do18=yes
|
||||
do19=yes
|
||||
do20=yes
|
||||
do21=yes
|
||||
do22=yes
|
||||
do23=yes
|
||||
fi
|
||||
|
||||
# Handle any explicit skips at this stage, so that an argument list may consist
|
||||
|
@ -781,6 +801,46 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
checkresult $? 20 ""
|
||||
fi
|
||||
|
||||
# \C tests without UTF - DFA matching is supported
|
||||
|
||||
if [ "$do21" = yes ] ; then
|
||||
echo $title21
|
||||
if [ $supportBSC -eq 0 ] ; then
|
||||
echo " Skipped because \C is disabled"
|
||||
else
|
||||
for opt in "" $jitopt -dfa; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput21 testtry
|
||||
checkresult $? 21 "$opt"
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
# \C tests with UTF - DFA matching is not supported for \C in UTF mode
|
||||
|
||||
if [ "$do22" = yes ] ; then
|
||||
echo $title22
|
||||
if [ $supportBSC -eq 0 ] ; then
|
||||
echo " Skipped because \C is disabled"
|
||||
else
|
||||
for opt in "" $jitopt; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput22 testtry
|
||||
checkresult $? 22-$bits "$opt"
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
# Test when \C is disabled
|
||||
|
||||
if [ "$do23" = yes ] ; then
  echo $title23
  # $supportBSC holds the exit status of "pcre2test -C backslash-C"; a non-zero
  # value means \C is still available, so the "disabled" test cannot be run.
  if [ $supportBSC -ne 0 ] ; then
    echo " Skipped because \C is not disabled"
  else
    # This test is run exactly once with no extra pcre2test option. It is not
    # inside a "for opt in ..." loop, so $opt must not be referenced here: it
    # would carry over whatever value an earlier test's loop left behind and
    # silently run this test with that option (and its valgrind JIT
    # suppressions), while checkresult below reports the run as option-less.
    $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput23 testtry
    checkresult $? 23 ""
  fi
fi
|
||||
|
||||
# End of loop for 8/16/32-bit tests
|
||||
done
|
||||
|
||||
|
|
70
RunTest.bat
70
RunTest.bat
|
@ -13,11 +13,10 @@
|
|||
@rem line. Added argument validation and added error reporting.
|
||||
@rem
|
||||
@rem Sheri Pierce added logic to skip feature dependent tests
|
||||
@rem tests 4 5 9 15 and 18 require utf support
|
||||
@rem tests 6 7 10 16 and 19 require ucp support
|
||||
@rem 11 requires ucp and link size 2
|
||||
@rem 12 requires presence of jit support
|
||||
@rem 13 requires absence of jit support
|
||||
@rem tests 4 5 7 10 12 14 19 and 22 require Unicode support
|
||||
@rem 8 requires Unicode and link size 2
|
||||
@rem 16 requires absence of jit support
|
||||
@rem 17 requires presence of jit support
|
||||
@rem Sheri P also added override tests for study and jit testing
|
||||
@rem Zoltan Herczeg added libpcre16 support
|
||||
@rem Zoltan Herczeg added libpcre32 support
|
||||
|
@ -25,6 +24,7 @@
|
|||
@rem
|
||||
@rem The file was converted for PCRE2 by PH, February 2015.
|
||||
@rem Updated for new test 14 (moving others up a number), August 2015.
|
||||
@rem Tidied and updated for new tests 21, 22, 23 by PH, October 2015.
|
||||
|
||||
|
||||
setlocal enabledelayedexpansion
|
||||
|
@ -65,6 +65,8 @@ set support32=%ERRORLEVEL%
|
|||
set unicode=%ERRORLEVEL%
|
||||
%pcre2test% -C jit >NUL
|
||||
set jit=%ERRORLEVEL%
|
||||
%pcre2test% -C backslash-C >NUL
|
||||
set supportBSC=%ERRORLEVEL%
|
||||
|
||||
if %support8% EQU 1 (
|
||||
if not exist testout8 md testout8
|
||||
|
@ -101,18 +103,21 @@ set do17=no
|
|||
set do18=no
|
||||
set do19=no
|
||||
set do20=no
|
||||
set do21=no
|
||||
set do22=no
|
||||
set do23=no
|
||||
set all=yes
|
||||
|
||||
for %%a in (%*) do (
|
||||
set valid=no
|
||||
for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20) do if %%v == %%a set valid=yes
|
||||
for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23) do if %%v == %%a set valid=yes
|
||||
if "!valid!" == "yes" (
|
||||
set do%%a=yes
|
||||
set all=no
|
||||
) else (
|
||||
echo Invalid test number - %%a!
|
||||
echo Usage %0 [ test_number ] ...
|
||||
echo Where test_number is one or more optional test numbers 1 through 20, default is all tests.
|
||||
echo Where test_number is one or more optional test numbers 1 through 23, default is all tests.
|
||||
exit /b 1
|
||||
)
|
||||
)
|
||||
|
@ -139,6 +144,9 @@ if "%all%" == "yes" (
|
|||
set do18=yes
|
||||
set do19=yes
|
||||
set do20=yes
|
||||
set do21=yes
|
||||
set do22=yes
|
||||
set do23=yes
|
||||
)
|
||||
|
||||
@echo RunTest.bat's pcre2test output is written to newly created subfolders
|
||||
|
@ -187,6 +195,9 @@ if "%do17%" == "yes" call :do17
|
|||
if "%do18%" == "yes" call :do18
|
||||
if "%do19%" == "yes" call :do19
|
||||
if "%do20%" == "yes" call :do20
|
||||
if "%do21%" == "yes" call :do21
|
||||
if "%do22%" == "yes" call :do22
|
||||
if "%do23%" == "yes" call :do23
|
||||
:modeSkip
|
||||
if "%mode%" == "" (
|
||||
set mode=-16
|
||||
|
@ -323,7 +334,7 @@ if %unicode% EQU 0 (
|
|||
goto :eof
|
||||
|
||||
:do6
|
||||
call :runsub 6 testout "DFA matching main non-UTF, non-UCP functionality" -q -dfa
|
||||
call :runsub 6 testout "DFA matching main non-UTF, non-UCP functionality" -q
|
||||
goto :eof
|
||||
|
||||
:do7
|
||||
|
@ -331,7 +342,7 @@ if %unicode% EQU 0 (
|
|||
echo Test 7 Skipped due to absence of Unicode support.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 7 testout "DFA matching with UTF-%bits% and Unicode property support" -q -dfa
|
||||
call :runsub 7 testout "DFA matching with UTF-%bits% and Unicode property support" -q
|
||||
goto :eof
|
||||
|
||||
:do8
|
||||
|
@ -395,10 +406,14 @@ if %bits% EQU 8 (
|
|||
echo Test 13 Skipped when running 8-bit tests.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 13 testout "DFA specials for the basic 16/32-bit library" -q -dfa
|
||||
call :runsub 13 testout "DFA specials for the basic 16/32-bit library" -q
|
||||
goto :eof
|
||||
|
||||
:do14
|
||||
if %unicode% EQU 0 (
|
||||
echo Test 14 Skipped due to absence of Unicode support.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 14 testout "DFA specials for UTF and UCP support" -q
|
||||
goto :eof
|
||||
|
||||
|
@ -442,6 +457,10 @@ if %bits% EQU 16 (
|
|||
if %bits% EQU 32 (
|
||||
echo Test 19 Skipped when running 32-bit tests.
|
||||
goto :eof
|
||||
)
|
||||
if %unicode% EQU 0 (
|
||||
echo Test 19 Skipped due to absence of Unicode support.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 19 testout "POSIX interface with UTF-8 and UCP" -q
|
||||
goto :eof
|
||||
|
@ -450,6 +469,37 @@ goto :eof
|
|||
call :runsub 20 testout "Serialization tests" -q
|
||||
goto :eof
|
||||
|
||||
:do21
|
||||
if %supportBSC% EQU 0 (
|
||||
echo Test 21 Skipped due to absence of backslash-C support.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 21 testout "Backslash-C tests without UTF" -q
|
||||
call :runsub 21 testout "Backslash-C tests without UTF (DFA)" -q -dfa
|
||||
if %jit% EQU 1 call :runsub 21 testoutjit "Test with JIT Override" -q -jit
|
||||
goto :eof
|
||||
|
||||
:do22
|
||||
if %supportBSC% EQU 0 (
|
||||
echo Test 22 Skipped due to absence of backslash-C support.
|
||||
goto :eof
|
||||
)
|
||||
if %unicode% EQU 0 (
|
||||
echo Test 22 Skipped due to absence of Unicode support.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 22 testout "Backslash-C tests with UTF" -q
|
||||
if %jit% EQU 1 call :runsub 22 testoutjit "Test with JIT Override" -q -jit
|
||||
goto :eof
|
||||
|
||||
:do23
|
||||
if %supportBSC% EQU 1 (
|
||||
echo Test 23 Skipped due to presence of backslash-C support.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 23 testout "Backslash-C disabled test" -q
|
||||
goto :eof
|
||||
|
||||
:conferror
|
||||
@echo.
|
||||
@echo Either your build is incomplete or you have a configuration error.
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
#cmakedefine EBCDIC 1
|
||||
#cmakedefine EBCDIC_NL25 1
|
||||
#cmakedefine HEAP_MATCH_RECURSE 1
|
||||
#cmakedefine NEVER_BACKSLASH_C 1
|
||||
|
||||
#define LINK_SIZE @PCRE2_LINK_SIZE@
|
||||
#define MATCH_LIMIT @PCRE2_MATCH_LIMIT@
|
||||
|
|
12
configure.ac
12
configure.ac
|
@ -190,6 +190,12 @@ AC_ARG_ENABLE(bsr-anycrlf,
|
|||
[\R matches only CR, LF, CRLF by default]),
|
||||
, enable_bsr_anycrlf=no)
|
||||
|
||||
# Handle --enable-never-backslash-C
|
||||
AC_ARG_ENABLE(never-backslash-C,
|
||||
AS_HELP_STRING([--enable-never-backslash-C],
|
||||
[use of \C causes an error]),
|
||||
, enable_never_backslash_C=no)
|
||||
|
||||
# Handle --enable-ebcdic
|
||||
AC_ARG_ENABLE(ebcdic,
|
||||
AS_HELP_STRING([--enable-ebcdic],
|
||||
|
@ -604,6 +610,11 @@ if test "$enable_bsr_anycrlf" = "yes"; then
|
|||
The build-time default can be overridden by the user of PCRE2 at runtime.])
|
||||
fi
|
||||
|
||||
if test "$enable_never_backslash_C" = "yes"; then
|
||||
AC_DEFINE([NEVER_BACKSLASH_C], [], [
|
||||
Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns.])
|
||||
fi
|
||||
|
||||
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
|
||||
The value of LINK_SIZE determines the number of bytes used to store
|
||||
links as offsets within the compiled regex. The default is 2, which
|
||||
|
@ -881,6 +892,7 @@ $PACKAGE-$VERSION configuration summary:
|
|||
Enable Unicode support .......... : ${enable_unicode}
|
||||
Newline char/sequence ........... : ${enable_newline}
|
||||
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
|
||||
\C is disabled .................. : ${enable_never_backslash_C}
|
||||
EBCDIC coding ................... : ${enable_ebcdic}
|
||||
EBCDIC code for NL .............. : ${ebcdic_nl_code}
|
||||
Rebuild char tables ............. : ${enable_rebuild_chartables}
|
||||
|
|
|
@ -220,6 +220,13 @@ library. They are also documented in the pcre2build man page.
|
|||
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
||||
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
||||
|
||||
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
||||
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
||||
characters. You can build PCRE2 with the use of \C permanently locked out by
|
||||
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
||||
command. When \C is allowed by the library, individual applications can lock
|
||||
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
||||
|
||||
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
||||
pattern. This limits the amount of system stack that a pattern uses when it
|
||||
is compiled. The default is 250, but you can change it by setting, for
|
||||
|
@ -833,4 +840,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 16 July 2015
|
||||
Last updated: 16 October 2015
|
||||
|
|
|
@ -126,8 +126,10 @@ running redundant checks.
|
|||
<P>
|
||||
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
||||
problems, because it may leave the current matching point in the middle of a
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to
|
||||
lock out the use of \C, causing a compile-time error if it is encountered.
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
||||
application to lock out the use of \C, causing a compile-time error if it is
|
||||
encountered. It is also possible to build PCRE2 with the use of \C permanently
|
||||
disabled.
|
||||
</P>
|
||||
<P>
|
||||
Another way that performance can be hit is by running a pattern that has a very
|
||||
|
@ -187,7 +189,7 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
|||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 13 April 2015
|
||||
Last updated: 16 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -60,19 +60,21 @@ units, not characters, as is the contents of the variable pointed at by
|
|||
The options are:
|
||||
<pre>
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_NOTBOL Subject string is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject string is not the end of a line
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject
|
||||
is not a valid match
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement for
|
||||
UTF validity (only relevant if PCRE2_UTF
|
||||
was set at compile time)
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the
|
||||
subject is not a valid match
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement
|
||||
for UTF validity (only relevant if
|
||||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
</pre>
|
||||
The function returns the number of substitutions, which may be zero if there
|
||||
were no matches. The result can be greater than one only when
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
|
||||
is returned.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
|
|
|
@ -1237,7 +1237,8 @@ This option locks out the use of \C in the pattern that is being compiled.
|
|||
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
||||
it may leave the current matching point in the middle of a multi-code-unit
|
||||
character. This option may be useful in applications that process patterns from
|
||||
external sources.
|
||||
external sources. Note that there is also a build-time option that permanently
|
||||
locks out the use of \C.
|
||||
<pre>
|
||||
PCRE2_NEVER_UCP
|
||||
</pre>
|
||||
|
@ -2613,43 +2614,17 @@ same number causes an error at compile time.
|
|||
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||
<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
|
||||
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR \fIreplacementzfP,</b>
|
||||
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR <i>replacement</i>,</b>
|
||||
<b> PCRE2_SIZE <i>rlength</i>, PCRE2_UCHAR *<i>outputbuffer</i>,</b>
|
||||
<b> PCRE2_SIZE *<i>outlengthptr</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
This function calls <b>pcre2_match()</b> and then makes a copy of the subject
|
||||
string in <i>outputbuffer</i>, replacing the part that was matched with the
|
||||
<i>replacement</i> string, whose length is supplied in <b>rlength</b>. This can
|
||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||
</P>
|
||||
<P>
|
||||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||
dollar character is an escape character that can specify the insertion of
|
||||
characters from capturing groups or (*MARK) items in the pattern. The following
|
||||
forms are recognized:
|
||||
<pre>
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
|
||||
</pre>
|
||||
Either a group number or a group name can be given for <n>. Curly brackets are
|
||||
required only if the following character would be interpreted as part of the
|
||||
number or name. The number may be zero to include the entire matched string.
|
||||
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
|
||||
string "+$1$0$1+", the result is "=+babcb+=". Group insertion is done by
|
||||
calling <b>pcre2_copy_byname()</b> or <b>pcre2_copy_bynumber()</b> as
|
||||
appropriate.
|
||||
</P>
|
||||
<P>
|
||||
The facility for inserting a (*MARK) name can be used to perform simple
|
||||
simultaneous substitutions, as this <b>pcre2test</b> example shows:
|
||||
<pre>
|
||||
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
|
||||
apple lemon
|
||||
2: pear orange
|
||||
</PRE>
|
||||
</P>
|
||||
<P>
|
||||
The first seven arguments of <b>pcre2_substitute()</b> are the same as for
|
||||
<b>pcre2_match()</b>, except that the partial matching options are not
|
||||
permitted, and <i>match_data</i> may be passed as NULL, in which case a match
|
||||
|
@ -2658,25 +2633,112 @@ functions from the match context, if provided, or else those that were used to
|
|||
allocate memory for the compiled code.
|
||||
</P>
|
||||
<P>
|
||||
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the
|
||||
The <i>outlengthptr</i> argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. If the function is successful,
|
||||
the value is updated to contain the length of the new string, excluding the
|
||||
trailing zero that is automatically added. If the function is not successful,
|
||||
the value is set to PCRE2_UNSET for general errors (such as output buffer too
|
||||
small). For syntax errors in the replacement string, the value is set to the
|
||||
offset in the replacement string where the error was detected.
|
||||
</P>
|
||||
<P>
|
||||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||
dollar character is an escape character that can specify the insertion of
|
||||
characters from capturing groups or (*MARK) items in the pattern. The following
|
||||
forms are always recognized:
|
||||
<pre>
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
|
||||
</pre>
|
||||
Either a group number or a group name can be given for <n>. Curly brackets are
|
||||
required only if the following character would be interpreted as part of the
|
||||
number or name. The number may be zero to include the entire matched string.
|
||||
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
|
||||
string "+$1$0$1+", the result is "=+babcb+=".
|
||||
</P>
|
||||
<P>
|
||||
The facility for inserting a (*MARK) name can be used to perform simple
|
||||
simultaneous substitutions, as this <b>pcre2test</b> example shows:
|
||||
<pre>
|
||||
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
|
||||
apple lemon
|
||||
2: pear orange
|
||||
</pre>
|
||||
There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the
|
||||
function to iterate over the subject string, replacing every matching
|
||||
substring. If this is not set, only the first matching substring is replaced.
|
||||
</P>
|
||||
<P>
|
||||
The <i>outlengthptr</i> argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. It is updated to contain the
|
||||
length of the new string, excluding the trailing zero that is automatically
|
||||
added.
|
||||
A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra processing
|
||||
to be applied to the replacement string. Without this option, only the dollar
|
||||
character is special, and only the group insertion forms listed above are
|
||||
valid. When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
|
||||
</P>
|
||||
<P>
|
||||
The function returns the number of replacements that were made. This may be
|
||||
zero if no matches were found, and is never greater than 1 unless
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
|
||||
is returned. Except for PCRE2_ERROR_NOMATCH (which is never returned), any
|
||||
errors from <b>pcre2_match()</b> or the substring copying functions are passed
|
||||
straight back. PCRE2_ERROR_BADREPLACEMENT is returned for an invalid
|
||||
replacement string (unrecognized sequence following a dollar sign), and
|
||||
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough.
|
||||
Firstly, backslash in a replacement string is interpreted as an escape
|
||||
character. The usual forms such as \n or \x{ddd} can be used to specify
|
||||
particular character codes, and backslash followed by any non-alphanumeric
|
||||
character quotes that character. Extended quoting can be coded using \Q...\E,
|
||||
exactly as in pattern strings.
|
||||
</P>
|
||||
<P>
|
||||
There are also four escape sequences for forcing the case of inserted letters.
|
||||
The insertion mechanism has three states: no case forcing, force upper case,
|
||||
and force lower case. The escape sequences change the current state: \U and
|
||||
\L change to upper or lower case forcing, respectively, and \E (when not
|
||||
terminating a \Q quoted sequence) reverts to no case forcing. The sequences
|
||||
\u and \l force the next character (if it is a letter) to upper or lower
|
||||
case, respectively, and then the state automatically reverts to no case
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
captured groups and letters within \Q...\E quoted sequences.
|
||||
</P>
|
||||
<P>
|
||||
Note that case forcing sequences such as \U...\E do not nest. For example,
|
||||
the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no
|
||||
effect.
|
||||
</P>
|
||||
<P>
|
||||
The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
|
||||
flexibility to group substitution. The syntax is similar to that used by Bash:
|
||||
<pre>
|
||||
${<n>:-<string>}
|
||||
${<n>:+<string1>:<string2>}
|
||||
</pre>
|
||||
As before, <n> may be a group number or a name. The first form specifies a
|
||||
default value. If group <n> is set, its value is inserted; if not, <string> is
|
||||
expanded and the result inserted. The second form specifies strings that are
|
||||
expanded and inserted when group <n> is set or unset, respectively. The first
|
||||
form is just a convenient shorthand for
|
||||
<pre>
|
||||
${<n>:+${<n>}:<string>}
|
||||
</pre>
|
||||
Backslash can be used to escape colons and closing curly brackets in the
|
||||
replacement strings. A change of the case forcing state within a replacement
|
||||
string remains in force afterwards, as shown in this <b>pcre2test</b> example:
|
||||
<pre>
|
||||
/(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
|
||||
body
|
||||
1: hello
|
||||
somebody
|
||||
1: HELLO
|
||||
</pre>
|
||||
If successful, the function returns the number of replacements that were made.
|
||||
This may be zero if no matches were found, and is never greater than 1 unless
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||
</P>
|
||||
<P>
|
||||
In the event of an error, a negative error code is returned. Except for
|
||||
PCRE2_ERROR_NOMATCH (which is never returned), errors from <b>pcre2_match()</b>
|
||||
are passed straight back. PCRE2_ERROR_NOMEMORY is returned if the output buffer
|
||||
is not big enough. PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax
|
||||
errors in the replacement string, with more particular errors being
|
||||
PCRE2_ERROR_BADREPESCAPE (invalid escape sequence),
|
||||
PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket not found), and
|
||||
PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group substitution). As for all
|
||||
PCRE2 errors, a text message that describes the error can be obtained by
|
||||
calling <b>pcre2_get_error_message()</b>.
|
||||
</P>
|
||||
<br><a name="SEC35" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||
<P>
|
||||
|
@ -2908,8 +2970,8 @@ There are in addition the following errors that are specific to
|
|||
PCRE2_ERROR_DFA_UITEM
|
||||
</pre>
|
||||
This return is given if <b>pcre2_dfa_match()</b> encounters an item in the
|
||||
pattern that it does not support, for instance, the use of \C or a back
|
||||
reference.
|
||||
pattern that it does not support, for instance, the use of \C in a UTF mode or
|
||||
a back reference.
|
||||
<pre>
|
||||
PCRE2_ERROR_DFA_UCOND
|
||||
</pre>
|
||||
|
@ -2953,7 +3015,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 22 September 2015
|
||||
Last updated: 16 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -18,23 +18,24 @@ please consult the man page, in case the conversion went wrong.
|
|||
<li><a name="TOC3" href="#SEC3">BUILDING 8-BIT, 16-BIT AND 32-BIT LIBRARIES</a>
|
||||
<li><a name="TOC4" href="#SEC4">BUILDING SHARED AND STATIC LIBRARIES</a>
|
||||
<li><a name="TOC5" href="#SEC5">UNICODE AND UTF SUPPORT</a>
|
||||
<li><a name="TOC6" href="#SEC6">JUST-IN-TIME COMPILER SUPPORT</a>
|
||||
<li><a name="TOC7" href="#SEC7">NEWLINE RECOGNITION</a>
|
||||
<li><a name="TOC8" href="#SEC8">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC9" href="#SEC9">HANDLING VERY LARGE PATTERNS</a>
|
||||
<li><a name="TOC10" href="#SEC10">AVOIDING EXCESSIVE STACK USAGE</a>
|
||||
<li><a name="TOC11" href="#SEC11">LIMITING PCRE2 RESOURCE USAGE</a>
|
||||
<li><a name="TOC12" href="#SEC12">CREATING CHARACTER TABLES AT BUILD TIME</a>
|
||||
<li><a name="TOC13" href="#SEC13">USING EBCDIC CODE</a>
|
||||
<li><a name="TOC14" href="#SEC14">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
|
||||
<li><a name="TOC15" href="#SEC15">PCRE2GREP BUFFER SIZE</a>
|
||||
<li><a name="TOC16" href="#SEC16">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a>
|
||||
<li><a name="TOC17" href="#SEC17">INCLUDING DEBUGGING CODE</a>
|
||||
<li><a name="TOC18" href="#SEC18">DEBUGGING WITH VALGRIND SUPPORT</a>
|
||||
<li><a name="TOC19" href="#SEC19">CODE COVERAGE REPORTING</a>
|
||||
<li><a name="TOC20" href="#SEC20">SEE ALSO</a>
|
||||
<li><a name="TOC21" href="#SEC21">AUTHOR</a>
|
||||
<li><a name="TOC22" href="#SEC22">REVISION</a>
|
||||
<li><a name="TOC6" href="#SEC6">DISABLING THE USE OF \C</a>
|
||||
<li><a name="TOC7" href="#SEC7">JUST-IN-TIME COMPILER SUPPORT</a>
|
||||
<li><a name="TOC8" href="#SEC8">NEWLINE RECOGNITION</a>
|
||||
<li><a name="TOC9" href="#SEC9">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC10" href="#SEC10">HANDLING VERY LARGE PATTERNS</a>
|
||||
<li><a name="TOC11" href="#SEC11">AVOIDING EXCESSIVE STACK USAGE</a>
|
||||
<li><a name="TOC12" href="#SEC12">LIMITING PCRE2 RESOURCE USAGE</a>
|
||||
<li><a name="TOC13" href="#SEC13">CREATING CHARACTER TABLES AT BUILD TIME</a>
|
||||
<li><a name="TOC14" href="#SEC14">USING EBCDIC CODE</a>
|
||||
<li><a name="TOC15" href="#SEC15">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
|
||||
<li><a name="TOC16" href="#SEC16">PCRE2GREP BUFFER SIZE</a>
|
||||
<li><a name="TOC17" href="#SEC17">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a>
|
||||
<li><a name="TOC18" href="#SEC18">INCLUDING DEBUGGING CODE</a>
|
||||
<li><a name="TOC19" href="#SEC19">DEBUGGING WITH VALGRIND SUPPORT</a>
|
||||
<li><a name="TOC20" href="#SEC20">CODE COVERAGE REPORTING</a>
|
||||
<li><a name="TOC21" href="#SEC21">SEE ALSO</a>
|
||||
<li><a name="TOC22" href="#SEC22">AUTHOR</a>
|
||||
<li><a name="TOC23" href="#SEC23">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">BUILDING PCRE2</a><br>
|
||||
<P>
|
||||
|
@ -148,13 +149,19 @@ properties. The application can request that they do by setting the PCRE2_UCP
|
|||
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
|
||||
request this by starting with (*UCP).
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">DISABLING THE USE OF \C</a><br>
|
||||
<P>
|
||||
The \C escape sequence, which matches a single code unit, even in a UTF mode,
|
||||
can cause unpredictable behaviour because it may leave the current matching
|
||||
point in the middle of a multi-code-unit character. It can be locked out by
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option.
|
||||
point in the middle of a multi-code-unit character. The application can lock it
|
||||
out by setting the PCRE2_NEVER_BACKSLASH_C option when calling
|
||||
<b>pcre2_compile()</b>. There is also a build-time option
|
||||
<pre>
|
||||
--enable-never-backslash-C
|
||||
</pre>
|
||||
(note the upper case C) which locks out the use of \C entirely.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">JUST-IN-TIME COMPILER SUPPORT</a><br>
|
||||
<br><a name="SEC7" href="#TOC1">JUST-IN-TIME COMPILER SUPPORT</a><br>
|
||||
<P>
|
||||
Just-in-time compiler support is included in the build by specifying
|
||||
<pre>
|
||||
|
@ -171,7 +178,7 @@ pcre2grep automatically makes use of it, unless you add
|
|||
</pre>
|
||||
to the "configure" command.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">NEWLINE RECOGNITION</a><br>
|
||||
<br><a name="SEC8" href="#TOC1">NEWLINE RECOGNITION</a><br>
|
||||
<P>
|
||||
By default, PCRE2 interprets the linefeed (LF) character as indicating the end
|
||||
of a line. This is the normal newline character on Unix-like systems. You can
|
||||
|
@ -208,7 +215,7 @@ Whatever default line ending convention is selected when PCRE2 is built can be
|
|||
overridden by applications that use the library. At build time it is
|
||||
conventional to use the standard for your operating system.
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<br><a name="SEC9" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<P>
|
||||
By default, the sequence \R in a pattern matches any Unicode newline sequence,
|
||||
independently of what has been selected as the line ending sequence. If you
|
||||
|
@ -220,7 +227,7 @@ the default is changed so that \R matches only CR, LF, or CRLF. Whatever is
|
|||
selected when PCRE2 is built can be overridden by applications that use the
|
||||
library.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
|
||||
<br><a name="SEC10" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
|
||||
<P>
|
||||
Within a compiled pattern, offset values are used to point from one part to
|
||||
another (for example, from an opening parenthesis to an alternation
|
||||
|
@ -239,7 +246,7 @@ longer offsets slows down the operation of PCRE2 because it has to load
|
|||
additional data when handling them. For the 32-bit library the value is always
|
||||
4 and cannot be overridden; the value of --with-link-size is ignored.
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
|
||||
<br><a name="SEC11" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
|
||||
<P>
|
||||
When matching with the <b>pcre2_match()</b> function, PCRE2 implements
|
||||
backtracking by making recursive calls to an internal function called
|
||||
|
@ -261,7 +268,7 @@ custom memory management functions can be called instead. PCRE2 runs noticeably
|
|||
more slowly when built in this way. This option affects only the
|
||||
<b>pcre2_match()</b> function; it is not relevant for <b>pcre2_dfa_match()</b>.
|
||||
</P>
|
||||
<br><a name="SEC11" href="#TOC1">LIMITING PCRE2 RESOURCE USAGE</a><br>
|
||||
<br><a name="SEC12" href="#TOC1">LIMITING PCRE2 RESOURCE USAGE</a><br>
|
||||
<P>
|
||||
Internally, PCRE2 has a function called <b>match()</b>, which it calls
|
||||
repeatedly (sometimes recursively) when matching a pattern with the
|
||||
|
@ -290,7 +297,7 @@ constraints. However, you can set a lower limit by adding, for example,
|
|||
</pre>
|
||||
to the <b>configure</b> command. This value can also be overridden at run time.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
|
||||
<br><a name="SEC13" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
|
||||
<P>
|
||||
PCRE2 uses fixed tables for processing characters whose code points are less
|
||||
than 256. By default, PCRE2 is built with a set of tables that are distributed
|
||||
|
@ -307,7 +314,7 @@ compiling, because <b>dftables</b> is run on the local host. If you need to
|
|||
create alternative tables when cross compiling, you will have to do so "by
|
||||
hand".)
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">USING EBCDIC CODE</a><br>
|
||||
<br><a name="SEC14" href="#TOC1">USING EBCDIC CODE</a><br>
|
||||
<P>
|
||||
PCRE2 assumes by default that it will run in an environment where the character
|
||||
code is ASCII or Unicode, which is a superset of ASCII. This is the case for
|
||||
|
@ -342,7 +349,7 @@ The options that select newline behaviour, such as --enable-newline-is-cr,
|
|||
and equivalent run-time options, refer to these character values in an EBCDIC
|
||||
environment.
|
||||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
|
||||
<br><a name="SEC15" href="#TOC1">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
|
||||
<P>
|
||||
By default, <b>pcre2grep</b> reads all files as plain text. You can build it so
|
||||
that it recognizes files whose names end in <b>.gz</b> or <b>.bz2</b>, and reads
|
||||
|
@ -355,7 +362,7 @@ to the <b>configure</b> command. These options naturally require that the
|
|||
relevant libraries are installed on your system. Configuration will fail if
|
||||
they are not.
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">PCRE2GREP BUFFER SIZE</a><br>
|
||||
<br><a name="SEC16" href="#TOC1">PCRE2GREP BUFFER SIZE</a><br>
|
||||
<P>
|
||||
<b>pcre2grep</b> uses an internal buffer to hold a "window" on the file it is
|
||||
scanning, in order to be able to output "before" and "after" lines when it
|
||||
|
@ -370,7 +377,7 @@ parameter value by adding, for example,
|
|||
to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override this
|
||||
value by using --buffer-size on the command line.
|
||||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
|
||||
<br><a name="SEC17" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
|
||||
<P>
|
||||
If you add one of
|
||||
<pre>
|
||||
|
@ -404,7 +411,7 @@ automatically included, you may need to add something like
|
|||
</pre>
|
||||
immediately before the <b>configure</b> command.
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">INCLUDING DEBUGGING CODE</a><br>
|
||||
<br><a name="SEC18" href="#TOC1">INCLUDING DEBUGGING CODE</a><br>
|
||||
<P>
|
||||
If you add
|
||||
<pre>
|
||||
|
@ -413,7 +420,7 @@ If you add
|
|||
to the <b>configure</b> command, additional debugging code is included in the
|
||||
build. This feature is intended for use by the PCRE2 maintainers.
|
||||
</P>
|
||||
<br><a name="SEC18" href="#TOC1">DEBUGGING WITH VALGRIND SUPPORT</a><br>
|
||||
<br><a name="SEC19" href="#TOC1">DEBUGGING WITH VALGRIND SUPPORT</a><br>
|
||||
<P>
|
||||
If you add
|
||||
<pre>
|
||||
|
@ -423,7 +430,7 @@ to the <b>configure</b> command, PCRE2 will use valgrind annotations to mark
|
|||
certain memory regions as unaddressable. This allows it to detect invalid
|
||||
memory accesses, and is mostly useful for debugging PCRE2 itself.
|
||||
</P>
|
||||
<br><a name="SEC19" href="#TOC1">CODE COVERAGE REPORTING</a><br>
|
||||
<br><a name="SEC20" href="#TOC1">CODE COVERAGE REPORTING</a><br>
|
||||
<P>
|
||||
If your C compiler is gcc, you can build a version of PCRE2 that can generate a
|
||||
code coverage report for its test suite. To enable this, you must install
|
||||
|
@ -480,11 +487,11 @@ This cleans all coverage data including the generated coverage report. For more
|
|||
information about code coverage, see the <b>gcov</b> and <b>lcov</b>
|
||||
documentation.
|
||||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">SEE ALSO</a><br>
|
||||
<br><a name="SEC21" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2api</b>(3), <b>pcre2-config</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC22" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
|
@ -493,9 +500,9 @@ University Computing Service
|
|||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC23" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 24 April 2015
|
||||
Last updated: 16 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -1236,14 +1236,21 @@ with \C in UTF-8 or UTF-16 mode means that the rest of the string may start
|
|||
with a malformed UTF character. This has undefined results, because PCRE2
|
||||
assumes that it is matching character by character in a valid UTF string (by
|
||||
default it checks the subject string's validity at the start of processing
|
||||
unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the
|
||||
use of \C by setting the PCRE2_NEVER_BACKSLASH_C option.
|
||||
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||
</P>
|
||||
<P>
|
||||
An application can lock out the use of \C by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
|
||||
build PCRE2 with the use of \C permanently disabled.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2 does not allow \C to appear in lookbehind assertions
|
||||
<a href="#lookbehind">(described below)</a>
|
||||
in a UTF mode, because this would make it impossible to calculate the length of
|
||||
the lookbehind.
|
||||
the lookbehind. Neither the alternative matching function
|
||||
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
|
||||
former gives a match-time error; the latter fails to optimize and so the match
|
||||
is always run using the interpreter.
|
||||
</P>
|
||||
<P>
|
||||
In general, the \C escape sequence is best avoided. However, one way of using
|
||||
|
@ -3351,7 +3358,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 01 September 2015
|
||||
Last updated: 16 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -12,17 +12,21 @@ This page is part of the PCRE2 HTML documentation. It was generated
|
|||
automatically from the original man page. If there is any nonsense in it,
|
||||
please consult the man page, in case the conversion went wrong.
|
||||
<br>
|
||||
<br><b>
|
||||
PCRE2 PERFORMANCE
|
||||
</b><br>
|
||||
<ul>
|
||||
<li><a name="TOC1" href="#SEC1">PCRE2 PERFORMANCE</a>
|
||||
<li><a name="TOC2" href="#SEC2">COMPILED PATTERN MEMORY USAGE</a>
|
||||
<li><a name="TOC3" href="#SEC3">STACK USAGE AT RUN TIME</a>
|
||||
<li><a name="TOC4" href="#SEC4">PROCESSING TIME</a>
|
||||
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
|
||||
<li><a name="TOC6" href="#SEC6">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE2 PERFORMANCE</a><br>
|
||||
<P>
|
||||
Two aspects of performance are discussed below: memory usage and processing
|
||||
time. The way you express your pattern as a regular expression can affect both
|
||||
of them.
|
||||
</P>
|
||||
<br><b>
|
||||
COMPILED PATTERN MEMORY USAGE
|
||||
</b><br>
|
||||
<br><a name="SEC2" href="#TOC1">COMPILED PATTERN MEMORY USAGE</a><br>
|
||||
<P>
|
||||
Patterns are compiled by PCRE2 into a reasonably efficient interpretive code,
|
||||
so that most simple patterns do not use much memory. However, there is one case
|
||||
|
@ -75,9 +79,7 @@ pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
|
|||
speed is acceptable, this kind of rewriting will allow you to process patterns
|
||||
that PCRE2 cannot otherwise handle.
|
||||
</P>
|
||||
<br><b>
|
||||
STACK USAGE AT RUN TIME
|
||||
</b><br>
|
||||
<br><a name="SEC3" href="#TOC1">STACK USAGE AT RUN TIME</a><br>
|
||||
<P>
|
||||
When <b>pcre2_match()</b> is used for matching, certain kinds of pattern can
|
||||
cause it to use large amounts of the process stack. In some environments the
|
||||
|
@ -86,9 +88,7 @@ SIGSEGV. Rewriting your pattern can often help. The
|
|||
<a href="pcre2stack.html"><b>pcre2stack</b></a>
|
||||
documentation discusses this issue in detail.
|
||||
</P>
|
||||
<br><b>
|
||||
PROCESSING TIME
|
||||
</b><br>
|
||||
<br><a name="SEC4" href="#TOC1">PROCESSING TIME</a><br>
|
||||
<P>
|
||||
Certain items in regular expression patterns are processed more efficiently
|
||||
than others. It is more efficient to use a character class like [aeiou] than a
|
||||
|
@ -177,9 +177,7 @@ appreciable time with strings longer than about 20 characters.
|
|||
In many cases, the solution to this kind of performance issue is to use an
|
||||
atomic group or a possessive quantifier.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
|
@ -188,9 +186,7 @@ University Computing Service
|
|||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><b>
|
||||
REVISION
|
||||
</b><br>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 02 January 2015
|
||||
<br>
|
||||
|
|
|
@ -111,9 +111,10 @@ it matches a literal "u".
|
|||
\W a "non-word" character
|
||||
\X a Unicode extended grapheme cluster
|
||||
</pre>
|
||||
The application can lock out the use of \C by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave the
|
||||
current matching point in the middle of a UTF-8 or UTF-16 character.
|
||||
\C is dangerous because it may leave the current matching point in the middle
|
||||
of a UTF-8 or UTF-16 character. The application can lock out the use of \C by
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
||||
with the use of \C permanently disabled.
|
||||
</P>
|
||||
<P>
|
||||
By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode
|
||||
|
@ -588,7 +589,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 17 July 2015
|
||||
Last updated: 16 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -155,6 +155,7 @@ following options output the value and set the exit code as indicated:
|
|||
The following options output 1 for true or 0 for false, and set the exit code
|
||||
to the same value:
|
||||
<pre>
|
||||
backslash-C \C is supported (not locked out)
|
||||
ebcdic compiled for an EBCDIC environment
|
||||
jit just-in-time support is available
|
||||
pcre2-16 the 16-bit library was built
|
||||
|
@ -510,7 +511,7 @@ Setting compilation options
|
|||
<P>
|
||||
The following modifiers set options for <b>pcre2_compile()</b>. The most common
|
||||
ones have single-letter abbreviations. See
|
||||
<a href="pcreapi.html"><b>pcreapi</b></a>
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
for a description of their effects.
|
||||
<pre>
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
|
@ -537,6 +538,7 @@ for a description of their effects.
|
|||
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||
ucp set PCRE2_UCP
|
||||
ungreedy set PCRE2_UNGREEDY
|
||||
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
||||
utf set PCRE2_UTF
|
||||
</pre>
|
||||
As well as turning on the PCRE2_UTF option, the <b>utf</b> modifier causes all
|
||||
|
@ -564,6 +566,7 @@ about the pattern:
|
|||
locale=<name> use this locale
|
||||
memory show memory used
|
||||
newline=<type> set newline type
|
||||
null_context compile with a NULL context
|
||||
parens_nest_limit=<n> set maximum parentheses depth
|
||||
posix use the POSIX API
|
||||
push push compiled pattern onto the stack
|
||||
|
@ -642,6 +645,15 @@ is requested. For each callout, either its number or string is given, followed
|
|||
by the item that follows it in the pattern.
|
||||
</P>
|
||||
<br><b>
|
||||
Passing a NULL context
|
||||
</b><br>
|
||||
<P>
|
||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_compile()</b>. If
|
||||
the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||
testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
||||
default values).
|
||||
</P>
|
||||
<br><b>
|
||||
Specifying a pattern in hex
|
||||
</b><br>
|
||||
<P>
|
||||
|
@ -920,9 +932,11 @@ pattern.
|
|||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
match_limit=>n> set a match limit
|
||||
match_limit=<n> set a match limit
|
||||
memory show memory usage
|
||||
null_context match with a NULL context
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
recursion_limit=<n> set a recursion limit
|
||||
replace=<string> specify a replacement string
|
||||
|
@ -1170,6 +1184,16 @@ The <b>offset</b> modifier sets an offset in the subject string at which
|
|||
matching starts. Its value is a number of code units, not characters.
|
||||
</P>
|
||||
<br><b>
|
||||
Setting an offset limit
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>offset_limit</b> modifier sets a limit for unanchored matches. If a match
|
||||
cannot be found starting at or before this offset in the subject, a "no match"
|
||||
return is given. The data value is a number of code units, not characters. When
|
||||
this modifier is used, the <b>use_offset_limit</b> modifier must have been set
|
||||
for the pattern; if not, an error is generated.
|
||||
</P>
|
||||
<br><b>
|
||||
Setting the size of the output vector
|
||||
</b><br>
|
||||
<P>
|
||||
|
@ -1201,6 +1225,17 @@ this modifier has no effect, as there is no facility for passing a length.)
|
|||
When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
|
||||
passing the replacement string as zero-terminated.
|
||||
</P>
|
||||
<br><b>
|
||||
Passing a NULL context
|
||||
</b><br>
|
||||
<P>
|
||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
||||
<b>pcre2_dfa_match()</b> or <b>pcre2_jit_match()</b>. If the <b>null_context</b>
|
||||
modifier is set, however, NULL is passed. This is for testing that the matching
|
||||
functions behave correctly in this case (they use default values). This
|
||||
modifier cannot be used with the <b>find_limits</b> modifier or when testing the
|
||||
substitution function.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||
<P>
|
||||
By default, <b>pcre2test</b> uses the standard PCRE2 matching function,
|
||||
|
@ -1539,7 +1574,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 14 September 2015
|
||||
Last updated: 17 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -71,11 +71,12 @@ The escape sequence \C can be used to match a single code unit, in a UTF mode,
|
|||
but its use can lead to some strange effects because it breaks up multi-unit
|
||||
characters (see the description of \C in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
documentation). The use of \C is not supported in the alternative matching
|
||||
function <b>pcre2_dfa_match()</b>, nor is it supported in UTF mode by the JIT
|
||||
optimization. If JIT optimization is requested for a UTF pattern that contains
|
||||
\C, it will not succeed, and so the matching will be carried out by the normal
|
||||
interpretive function.
|
||||
documentation). The use of \C is not supported by the alternative matching
|
||||
function <b>pcre2_dfa_match()</b> when in UTF mode. Its use provokes a
|
||||
match-time error. The JIT optimization also does not support \C in UTF mode.
|
||||
If JIT optimization is requested for a UTF pattern that contains \C, it will
|
||||
not succeed, and so the matching will be carried out by the normal interpretive
|
||||
function.
|
||||
</P>
|
||||
<P>
|
||||
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
|
||||
|
@ -275,7 +276,7 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 18 August 2015
|
||||
Last updated: 16 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
10
doc/pcre2.3
10
doc/pcre2.3
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2 3 "13 April 2015" "PCRE2 10.20"
|
||||
.TH PCRE2 3 "16 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH INTRODUCTION
|
||||
|
@ -118,8 +118,10 @@ running redundant checks.
|
|||
.P
|
||||
The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
||||
problems, because it may leave the current matching point in the middle of a
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to
|
||||
lock out the use of \eC, causing a compile-time error if it is encountered.
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
||||
application to lock out the use of \eC, causing a compile-time error if it is
|
||||
encountered. It is also possible to build PCRE2 with the use of \eC permanently
|
||||
disabled.
|
||||
.P
|
||||
Another way that performance can be hit is by running a pattern that has a very
|
||||
large search tree against a string that will never match. Nested unlimited
|
||||
|
@ -187,6 +189,6 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 13 April 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
182
doc/pcre2.txt
182
doc/pcre2.txt
|
@ -104,8 +104,9 @@ SECURITY CONSIDERATIONS
|
|||
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead
|
||||
to problems, because it may leave the current matching point in the
|
||||
middle of a multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C
|
||||
option can be used to lock out the use of \C, causing a compile-time
|
||||
error if it is encountered.
|
||||
option can be used by an application to lock out the use of \C, causing
|
||||
a compile-time error if it is encountered. It is also possible to build
|
||||
PCRE2 with the use of \C permanently disabled.
|
||||
|
||||
Another way that performance can be hit is by running a pattern that
|
||||
has a very large search tree against a string that will never match.
|
||||
|
@ -165,7 +166,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 13 April 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -1276,7 +1277,9 @@ COMPILING A PATTERN
|
|||
piled. This escape can cause unpredictable behaviour in UTF-8 or
|
||||
UTF-16 modes, because it may leave the current matching point in the
|
||||
middle of a multi-code-unit character. This option may be useful in
|
||||
applications that process patterns from external sources.
|
||||
applications that process patterns from external sources. Note that
|
||||
there is also a build-time option that permanently locks out the use of
|
||||
\C.
|
||||
|
||||
PCRE2_NEVER_UCP
|
||||
|
||||
|
@ -2571,19 +2574,36 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
|
||||
PCRE2_SIZE length, PCRE2_SIZE startoffset,
|
||||
uint32_t options, pcre2_match_data *match_data,
|
||||
pcre2_match_context *mcontext, PCRE2_SPTR replacementzfP,
|
||||
pcre2_match_context *mcontext, PCRE2_SPTR replacement,
|
||||
PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
|
||||
PCRE2_SIZE *outlengthptr);
|
||||
|
||||
This function calls pcre2_match() and then makes a copy of the subject
|
||||
string in outputbuffer, replacing the part that was matched with the
|
||||
replacement string, whose length is supplied in rlength. This can be
|
||||
given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||
|
||||
The first seven arguments of pcre2_substitute() are the same as for
|
||||
pcre2_match(), except that the partial matching options are not permit-
|
||||
ted, and match_data may be passed as NULL, in which case a match data
|
||||
block is obtained and freed within this function, using memory manage-
|
||||
ment functions from the match context, if provided, or else those that
|
||||
were used to allocate memory for the compiled code.
|
||||
|
||||
The outlengthptr argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. If the function is suc-
|
||||
cessful, the value is updated to contain the length of the new string,
|
||||
excluding the trailing zero that is automatically added. If the func-
|
||||
tion is not successful, the value is set to PCRE2_UNSET for general
|
||||
errors (such as output buffer too small). For syntax errors in the
|
||||
replacement string, the value is set to the offset in the replacement
|
||||
string where the error was detected.
|
||||
|
||||
In the replacement string, which is interpreted as a UTF string in UTF
|
||||
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
|
||||
option is set, a dollar character is an escape character that can spec-
|
||||
ify the insertion of characters from capturing groups or (*MARK) items
|
||||
in the pattern. The following forms are recognized:
|
||||
in the pattern. The following forms are always recognized:
|
||||
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
|
@ -2594,8 +2614,7 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
preted as part of the number or name. The number may be zero to include
|
||||
the entire matched string. For example, if the pattern a(b)c is
|
||||
matched with "=abc=" and the replacement string "+$1$0$1+", the result
|
||||
is "=+babcb+=". Group insertion is done by calling pcre2_copy_byname()
|
||||
or pcre2_copy_bynumber() as appropriate.
|
||||
is "=+babcb+=".
|
||||
|
||||
The facility for inserting a (*MARK) name can be used to perform simple
|
||||
simultaneous substitutions, as this pcre2test example shows:
|
||||
|
@ -2604,32 +2623,80 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
apple lemon
|
||||
2: pear orange
|
||||
|
||||
The first seven arguments of pcre2_substitute() are the same as for
|
||||
pcre2_match(), except that the partial matching options are not permit-
|
||||
ted, and match_data may be passed as NULL, in which case a match data
|
||||
block is obtained and freed within this function, using memory manage-
|
||||
ment functions from the match context, if provided, or else those that
|
||||
were used to allocate memory for the compiled code.
|
||||
|
||||
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
|
||||
There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
|
||||
the function to iterate over the subject string, replacing every match-
|
||||
ing substring. If this is not set, only the first matching substring is
|
||||
replaced.
|
||||
|
||||
The outlengthptr argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. It is updated to contain
|
||||
the length of the new string, excluding the trailing zero that is auto-
|
||||
matically added.
|
||||
A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra
|
||||
processing to be applied to the replacement string. Without this
|
||||
option, only the dollar character is special, and only the group inser-
|
||||
tion forms listed above are valid. When PCRE2_SUBSTITUTE_EXTENDED is
|
||||
set, two things change:
|
||||
|
||||
The function returns the number of replacements that were made. This
|
||||
may be zero if no matches were found, and is never greater than 1
|
||||
unless PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a neg-
|
||||
ative error code is returned. Except for PCRE2_ERROR_NOMATCH (which is
|
||||
never returned), any errors from pcre2_match() or the substring copying
|
||||
functions are passed straight back. PCRE2_ERROR_BADREPLACEMENT is
|
||||
returned for an invalid replacement string (unrecognized sequence fol-
|
||||
lowing a dollar sign), and PCRE2_ERROR_NOMEMORY is returned if the out-
|
||||
put buffer is not big enough.
|
||||
Firstly, backslash in a replacement string is interpreted as an escape
|
||||
character. The usual forms such as \n or \x{ddd} can be used to specify
|
||||
particular character codes, and backslash followed by any non-alphanu-
|
||||
meric character quotes that character. Extended quoting can be coded
|
||||
using \Q...\E, exactly as in pattern strings.
|
||||
|
||||
There are also four escape sequences for forcing the case of inserted
|
||||
letters. The insertion mechanism has three states: no case forcing,
|
||||
force upper case, and force lower case. The escape sequences change the
|
||||
current state: \U and \L change to upper or lower case forcing, respec-
|
||||
tively, and \E (when not terminating a \Q quoted sequence) reverts to
|
||||
no case forcing. The sequences \u and \l force the next character (if
|
||||
it is a letter) to upper or lower case, respectively, and then the
|
||||
state automatically reverts to no case forcing. Case forcing applies to
|
||||
all inserted characters, including those from captured groups and let-
|
||||
ters within \Q...\E quoted sequences.
|
||||
|
||||
Note that case forcing sequences such as \U...\E do not nest. For exam-
|
||||
ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final
|
||||
\E has no effect.
|
||||
|
||||
The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
|
||||
flexibility to group substitution. The syntax is similar to that used
|
||||
by Bash:
|
||||
|
||||
${<n>:-<string>}
|
||||
${<n>:+<string1>:<string2>}
|
||||
|
||||
As before, <n> may be a group number or a name. The first form speci-
|
||||
fies a default value. If group <n> is set, its value is inserted; if
|
||||
not, <string> is expanded and the result inserted. The second form
|
||||
specifies strings that are expanded and inserted when group <n> is set
|
||||
or unset, respectively. The first form is just a convenient shorthand
|
||||
for
|
||||
|
||||
${<n>:+${<n>}:<string>}
|
||||
|
||||
Backslash can be used to escape colons and closing curly brackets in
|
||||
the replacement strings. A change of the case forcing state within a
|
||||
replacement string remains in force afterwards, as shown in this
|
||||
pcre2test example:
|
||||
|
||||
/(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
|
||||
body
|
||||
1: hello
|
||||
somebody
|
||||
1: HELLO
|
||||
|
||||
If successful, the function returns the number of replacements that
|
||||
were made. This may be zero if no matches were found, and is never
|
||||
greater than 1 unless PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||
|
||||
In the event of an error, a negative error code is returned. Except for
|
||||
PCRE2_ERROR_NOMATCH (which is never returned), errors from
|
||||
pcre2_match() are passed straight back. PCRE2_ERROR_NOMEMORY is
|
||||
returned if the output buffer is not big enough.
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in
|
||||
the replacement string, with more particular errors being
|
||||
PCRE2_ERROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REP-
|
||||
MISSING_BRACE (closing curly bracket not found), and PCRE2_ERROR_BADSUBSTITU-
|
||||
TION (syntax error in extended group substitution). As for all PCRE2
|
||||
errors, a text message that describes the error can be obtained by
|
||||
calling pcre2_get_error_message().
|
||||
|
||||
|
||||
DUPLICATE SUBPATTERN NAMES
|
||||
|
@ -2845,8 +2912,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
PCRE2_ERROR_DFA_UITEM
|
||||
|
||||
This return is given if pcre2_dfa_match() encounters an item in the
|
||||
pattern that it does not support, for instance, the use of \C or a back
|
||||
reference.
|
||||
pattern that it does not support, for instance, the use of \C in a UTF
|
||||
mode or a back reference.
|
||||
|
||||
PCRE2_ERROR_DFA_UCOND
|
||||
|
||||
|
@ -2890,7 +2957,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 22 September 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -3010,10 +3077,18 @@ UNICODE AND UTF SUPPORT
|
|||
PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a
|
||||
pattern may also request this by starting with (*UCP).
|
||||
|
||||
|
||||
DISABLING THE USE OF \C
|
||||
|
||||
The \C escape sequence, which matches a single code unit, even in a UTF
|
||||
mode, can cause unpredictable behaviour because it may leave the cur-
|
||||
rent matching point in the middle of a multi-code-unit character. It
|
||||
can be locked out by setting the PCRE2_NEVER_BACKSLASH_C option.
|
||||
rent matching point in the middle of a multi-code-unit character. The
|
||||
application can lock it out by setting the PCRE2_NEVER_BACKSLASH_C
|
||||
option when calling pcre2_compile(). There is also a build-time option
|
||||
|
||||
--enable-never-backslash-C
|
||||
|
||||
(note the upper case C) which locks out the use of \C entirely.
|
||||
|
||||
|
||||
JUST-IN-TIME COMPILER SUPPORT
|
||||
|
@ -3366,7 +3441,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 24 April 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -6028,12 +6103,18 @@ MATCHING A SINGLE CODE UNIT
|
|||
results, because PCRE2 assumes that it is matching character by charac-
|
||||
ter in a valid UTF string (by default it checks the subject string's
|
||||
validity at the start of processing unless the PCRE2_NO_UTF_CHECK
|
||||
option is used). An application can lock out the use of \C by setting
|
||||
the PCRE2_NEVER_BACKSLASH_C option.
|
||||
option is used).
|
||||
|
||||
An application can lock out the use of \C by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also
|
||||
possible to build PCRE2 with the use of \C permanently disabled.
|
||||
|
||||
PCRE2 does not allow \C to appear in lookbehind assertions (described
|
||||
below) in a UTF mode, because this would make it impossible to calcu-
|
||||
late the length of the lookbehind.
|
||||
late the length of the lookbehind. Neither the alternative matching
|
||||
function pcre2_dfa_match() nor the JIT optimizer support \C in a UTF
|
||||
mode. The former gives a match-time error; the latter fails to optimize
|
||||
and so the match is always run using the interpreter.
|
||||
|
||||
In general, the \C escape sequence is best avoided. However, one way of
|
||||
using it that avoids the problem of malformed UTF characters is to use
|
||||
|
@ -8036,7 +8117,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 01 September 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -8966,10 +9047,10 @@ CHARACTER TYPES
|
|||
\W a "non-word" character
|
||||
\X a Unicode extended grapheme cluster
|
||||
|
||||
The application can lock out the use of \C by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave
|
||||
the current matching point in the middle of a UTF-8 or UTF-16 charac-
|
||||
ter.
|
||||
\C is dangerous because it may leave the current matching point in the
|
||||
middle of a UTF-8 or UTF-16 character. The application can lock out the
|
||||
use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also
|
||||
possible to build PCRE2 with the use of \C permanently disabled.
|
||||
|
||||
By default, \d, \s, and \w match only ASCII characters, even in UTF-8
|
||||
mode or in the 16-bit and 32-bit libraries. However, if locale-specific
|
||||
|
@ -9325,7 +9406,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 17 July 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -9384,11 +9465,12 @@ WIDE CHARACTERS AND UTF MODES
|
|||
The escape sequence \C can be used to match a single code unit, in a
|
||||
UTF mode, but its use can lead to some strange effects because it
|
||||
breaks up multi-unit characters (see the description of \C in the
|
||||
pcre2pattern documentation). The use of \C is not supported in the
|
||||
alternative matching function pcre2_dfa_match(), nor is it supported in
|
||||
UTF mode by the JIT optimization. If JIT optimization is requested for
|
||||
a UTF pattern that contains \C, it will not succeed, and so the match-
|
||||
ing will be carried out by the normal interpretive function.
|
||||
pcre2pattern documentation). The use of \C is not supported by the
|
||||
alternative matching function pcre2_dfa_match() when in UTF mode. Its
|
||||
use provokes a match-time error. The JIT optimization also does not
|
||||
support \C in UTF mode. If JIT optimization is requested for a UTF
|
||||
pattern that contains \C, it will not succeed, and so the matching will
|
||||
be carried out by the normal interpretive function.
|
||||
|
||||
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
|
||||
characters of any code value, but, by default, the characters that
|
||||
|
@ -9563,7 +9645,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 18 August 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "07 October 2015" "PCRE2 10.21"
|
||||
.TH PCRE2API 3 "16 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -1209,7 +1209,8 @@ This option locks out the use of \eC in the pattern that is being compiled.
|
|||
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
||||
it may leave the current matching point in the middle of a multi-code-unit
|
||||
character. This option may be useful in applications that process patterns from
|
||||
external sources.
|
||||
external sources. Note that there is also a build-time option that permanently
|
||||
locks out the use of \eC.
|
||||
.sp
|
||||
PCRE2_NEVER_UCP
|
||||
.sp
|
||||
|
@ -3014,8 +3015,8 @@ There are in addition the following errors that are specific to
|
|||
PCRE2_ERROR_DFA_UITEM
|
||||
.sp
|
||||
This return is given if \fBpcre2_dfa_match()\fP encounters an item in the
|
||||
pattern that it does not support, for instance, the use of \eC or a back
|
||||
reference.
|
||||
pattern that it does not support, for instance, the use of \eC in a UTF mode or
|
||||
a back reference.
|
||||
.sp
|
||||
PCRE2_ERROR_DFA_UCOND
|
||||
.sp
|
||||
|
@ -3065,6 +3066,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 07 October 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2BUILD 3 "23 April 2015" "PCRE2 10.20"
|
||||
.TH PCRE2BUILD 3 "16 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.
|
||||
|
@ -132,11 +132,20 @@ Pattern escapes such as \ed and \ew do not by default make use of Unicode
|
|||
properties. The application can request that they do by setting the PCRE2_UCP
|
||||
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
|
||||
request this by starting with (*UCP).
|
||||
.P
|
||||
.
|
||||
.
|
||||
.SH "DISABLING THE USE OF \eC"
|
||||
.rs
|
||||
.sp
|
||||
The \eC escape sequence, which matches a single code unit, even in a UTF mode,
|
||||
can cause unpredictable behaviour because it may leave the current matching
|
||||
point in the middle of a multi-code-unit character. It can be locked out by
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option.
|
||||
point in the middle of a multi-code-unit character. The application can lock it
|
||||
out by setting the PCRE2_NEVER_BACKSLASH_C option when calling
|
||||
\fBpcre2_compile()\fP. There is also a build-time option
|
||||
.sp
|
||||
--enable-never-backslash-C
|
||||
.sp
|
||||
(note the upper case C) which locks out the use of \eC entirely.
|
||||
.
|
||||
.
|
||||
.SH "JUST-IN-TIME COMPILER SUPPORT"
|
||||
|
@ -510,6 +519,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 24 April 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "01 September 2015" "PCRE2 10.21"
|
||||
.TH PCRE2PATTERN 3 "16 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -1233,8 +1233,11 @@ with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start
|
|||
with a malformed UTF character. This has undefined results, because PCRE2
|
||||
assumes that it is matching character by character in a valid UTF string (by
|
||||
default it checks the subject string's validity at the start of processing
|
||||
unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the
|
||||
use of \eC by setting the PCRE2_NEVER_BACKSLASH_C option.
|
||||
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||
.P
|
||||
An application can lock out the use of \eC by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
|
||||
build PCRE2 with the use of \eC permanently disabled.
|
||||
.P
|
||||
PCRE2 does not allow \eC to appear in lookbehind assertions
|
||||
.\" HTML <a href="#lookbehind">
|
||||
|
@ -1242,7 +1245,10 @@ PCRE2 does not allow \eC to appear in lookbehind assertions
|
|||
(described below)
|
||||
.\"
|
||||
in a UTF mode, because this would make it impossible to calculate the length of
|
||||
the lookbehind.
|
||||
the lookbehind. Neither the alternative matching function
|
||||
\fBpcre2_dfa_match()\fP nor the JIT optimizer support \eC in a UTF mode. The
|
||||
former gives a match-time error; the latter fails to optimize and so the match
|
||||
is always run using the interpreter.
|
||||
.P
|
||||
In general, the \eC escape sequence is best avoided. However, one way of using
|
||||
it that avoids the problem of malformed UTF characters is to use a lookahead to
|
||||
|
@ -3386,6 +3392,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 01 September 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2SYNTAX 3 "17 July 2015" "PCRE2 10.21"
|
||||
.TH PCRE2SYNTAX 3 "16 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||
|
@ -81,9 +81,10 @@ it matches a literal "u".
|
|||
\eW a "non-word" character
|
||||
\eX a Unicode extended grapheme cluster
|
||||
.sp
|
||||
The application can lock out the use of \eC by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave the
|
||||
current matching point in the middle of a UTF-8 or UTF-16 character.
|
||||
\eC is dangerous because it may leave the current matching point in the middle
|
||||
of a UTF-8 or UTF-16 character. The application can lock out the use of \eC by
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
||||
with the use of \eC permanently disabled.
|
||||
.P
|
||||
By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode
|
||||
or in the 16-bit and 32-bit libraries. However, if locale-specific matching is
|
||||
|
@ -576,6 +577,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 17 July 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "23 September 2015" "PCRE 10.21"
|
||||
.TH PCRE2TEST 1 "17 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -122,6 +122,7 @@ following options output the value and set the exit code as indicated:
|
|||
The following options output 1 for true or 0 for false, and set the exit code
|
||||
to the same value:
|
||||
.sp
|
||||
backslash-C \eC is supported (not locked out)
|
||||
ebcdic compiled for an EBCDIC environment
|
||||
jit just-in-time support is available
|
||||
pcre2-16 the 16-bit library was built
|
||||
|
@ -1559,6 +1560,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 September 2015
|
||||
Last updated: 17 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -119,6 +119,7 @@ COMMAND LINE OPTIONS
|
|||
The following options output 1 for true or 0 for false, and
|
||||
set the exit code to the same value:
|
||||
|
||||
backslash-C \C is supported (not locked out)
|
||||
ebcdic compiled for an EBCDIC environment
|
||||
jit just-in-time support is available
|
||||
pcre2-16 the 16-bit library was built
|
||||
|
@ -457,7 +458,7 @@ PATTERN MODIFIERS
|
|||
Setting compilation options
|
||||
|
||||
The following modifiers set options for pcre2_compile(). The most com-
|
||||
mon ones have single-letter abbreviations. See pcreapi for a descrip-
|
||||
mon ones have single-letter abbreviations. See pcre2api for a descrip-
|
||||
tion of their effects.
|
||||
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
|
@ -484,6 +485,7 @@ PATTERN MODIFIERS
|
|||
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||
ucp set PCRE2_UCP
|
||||
ungreedy set PCRE2_UNGREEDY
|
||||
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
||||
utf set PCRE2_UTF
|
||||
|
||||
As well as turning on the PCRE2_UTF option, the utf modifier causes all
|
||||
|
@ -509,6 +511,7 @@ PATTERN MODIFIERS
|
|||
locale=<name> use this locale
|
||||
memory show memory used
|
||||
newline=<type> set newline type
|
||||
null_context compile with a NULL context
|
||||
parens_nest_limit=<n> set maximum parentheses depth
|
||||
posix use the POSIX API
|
||||
push push compiled pattern onto the stack
|
||||
|
@ -579,6 +582,13 @@ PATTERN MODIFIERS
|
|||
mation that is requested. For each callout, either its number or string
|
||||
is given, followed by the item that follows it in the pattern.
|
||||
|
||||
Passing a NULL context
|
||||
|
||||
Normally, pcre2test passes a context block to pcre2_compile(). If the
|
||||
null_context modifier is set, however, NULL is passed. This is for
|
||||
testing that pcre2_compile() behaves correctly in this case (it uses
|
||||
default values).
|
||||
|
||||
Specifying a pattern in hex
|
||||
|
||||
The hex modifier specifies that the characters of the pattern are to be
|
||||
|
@ -823,9 +833,11 @@ SUBJECT MODIFIERS
|
|||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
match_limit=>n> set a match limit
|
||||
match_limit=<n> set a match limit
|
||||
memory show memory usage
|
||||
null_context match with a NULL context
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
recursion_limit=<n> set a recursion limit
|
||||
replace=<string> specify a replacement string
|
||||
|
@ -1046,6 +1058,14 @@ SUBJECT MODIFIERS
|
|||
The offset modifier sets an offset in the subject string at which
|
||||
matching starts. Its value is a number of code units, not characters.
|
||||
|
||||
Setting an offset limit
|
||||
|
||||
The offset_limit modifier sets a limit for unanchored matches. If a
|
||||
match cannot be found starting at or before this offset in the subject,
|
||||
a "no match" return is given. The data value is a number of code units,
|
||||
not characters. When this modifier is used, the use_offset_limit modi-
|
||||
fier must have been set for the pattern; if not, an error is generated.
|
||||
|
||||
Setting the size of the output vector
|
||||
|
||||
The ovector modifier applies only to the subject line in which it
|
||||
|
@ -1073,6 +1093,15 @@ SUBJECT MODIFIERS
|
|||
When testing pcre2_substitute(), this modifier also has the effect of
|
||||
passing the replacement string as zero-terminated.
|
||||
|
||||
Passing a NULL context
|
||||
|
||||
Normally, pcre2test passes a context block to pcre2_match(),
|
||||
pcre2_dfa_match() or pcre2_jit_match(). If the null_context modifier is
|
||||
set, however, NULL is passed. This is for testing that the matching
|
||||
functions behave correctly in this case (they use default values). This
|
||||
modifier cannot be used with the find_limits modifier or when testing
|
||||
the substitution function.
|
||||
|
||||
|
||||
THE ALTERNATIVE MATCHING FUNCTION
|
||||
|
||||
|
@ -1398,5 +1427,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 14 September 2015
|
||||
Last updated: 17 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2UNICODE 3 "18 August 2015" "PCRE2 10.21"
|
||||
.TH PCRE2UNICODE 3 "16 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "UNICODE AND UTF SUPPORT"
|
||||
|
@ -63,11 +63,12 @@ characters (see the description of \eC in the
|
|||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
documentation). The use of \eC is not supported in the alternative matching
|
||||
function \fBpcre2_dfa_match()\fP, nor is it supported in UTF mode by the JIT
|
||||
optimization. If JIT optimization is requested for a UTF pattern that contains
|
||||
\eC, it will not succeed, and so the matching will be carried out by the normal
|
||||
interpretive function.
|
||||
documentation). The use of \eC is not supported by the alternative matching
|
||||
function \fBpcre2_dfa_match()\fP when in UTF mode. Its use provokes a
|
||||
match-time error. The JIT optimization also does not support \eC in UTF mode.
|
||||
If JIT optimization is requested for a UTF pattern that contains \eC, it will
|
||||
not succeed, and so the matching will be carried out by the normal interpretive
|
||||
function.
|
||||
.P
|
||||
The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly test
|
||||
characters of any code value, but, by default, the characters that PCRE2
|
||||
|
@ -262,6 +263,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 18 August 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -182,6 +182,9 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#define MAX_NAME_SIZE 32
|
||||
#endif
|
||||
|
||||
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
|
||||
/* #undef NEVER_BACKSLASH_C */
|
||||
|
||||
/* The value of NEWLINE_DEFAULT determines the default newline character
|
||||
sequence. PCRE2 client programs can override this by selecting other values
|
||||
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5
|
||||
|
|
|
@ -169,6 +169,9 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
overflow caused by enormously large patterns. */
|
||||
#undef MAX_NAME_SIZE
|
||||
|
||||
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
|
||||
#undef NEVER_BACKSLASH_C
|
||||
|
||||
/* The value of NEWLINE_DEFAULT determines the default newline character
|
||||
sequence. PCRE2 client programs can override this by selecting other values
|
||||
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5
|
||||
|
|
|
@ -583,7 +583,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
|
|||
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
|
||||
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
|
||||
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
|
||||
ERR81, ERR82, ERR83, ERR84 };
|
||||
ERR81, ERR82, ERR83, ERR84, ERR85 };
|
||||
|
||||
/* This is a table of start-of-pattern options such as (*UTF) and settings such
|
||||
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
|
||||
|
@ -7053,11 +7053,19 @@ for (;; ptr++)
|
|||
|
||||
/* The use of \C can be locked out. */
|
||||
|
||||
#ifdef NEVER_BACKSLASH_C
|
||||
else if (escape == ESC_C)
|
||||
{
|
||||
*errorcodeptr = ERR85;
|
||||
goto FAILED;
|
||||
}
|
||||
#else
|
||||
else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0)
|
||||
{
|
||||
*errorcodeptr = ERR83;
|
||||
goto FAILED;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* For the rest (including \X when Unicode properties are supported), we
|
||||
can obtain the OP value by negating the escape value in the default
|
||||
|
|
|
@ -168,6 +168,8 @@ static const char compile_error_texts[] =
|
|||
"unrecognized string delimiter follows (?C\0"
|
||||
"using \\C is disabled by the application\0"
|
||||
"(?| and/or (?J: or (?x: parentheses are too deeply nested\0"
|
||||
/* 85 */
|
||||
"using \\C is disabled in this PCRE2 library\0"
|
||||
;
|
||||
|
||||
/* Match-time and UTF error texts are in the same format. */
|
||||
|
|
|
@ -106,7 +106,7 @@ static const int eint1[] = {
|
|||
|
||||
static const int eint2[] = {
|
||||
30, REG_ECTYPE, /* unknown POSIX class name */
|
||||
32, REG_INVARG, /* this version of PCRE does not have UTF or UCP support */
|
||||
32, REG_INVARG, /* this version of PCRE2 does not have Unicode support */
|
||||
37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */
|
||||
56, REG_INVARG, /* internal error: unknown newline setting */
|
||||
};
|
||||
|
|
|
@ -667,6 +667,12 @@ table itself easier to read. */
|
|||
#define EBCDIC_NL 0
|
||||
#endif
|
||||
|
||||
#ifdef NEVER_BACKSLASH_C
|
||||
#define BACKSLASH_C 0
|
||||
#else
|
||||
#define BACKSLASH_C 1
|
||||
#endif
|
||||
|
||||
typedef struct coptstruct {
|
||||
const char *name;
|
||||
uint32_t type;
|
||||
|
@ -681,6 +687,7 @@ enum { CONF_BSR,
|
|||
};
|
||||
|
||||
static coptstruct coptlist[] = {
|
||||
{ "backslash-C", CONF_FIX, BACKSLASH_C },
|
||||
{ "bsr", CONF_BSR, PCRE2_CONFIG_BSR },
|
||||
{ "ebcdic", CONF_FIX, SUPPORT_EBCDIC },
|
||||
{ "ebcdic-nl", CONF_FIZ, EBCDIC_NL },
|
||||
|
@ -6467,6 +6474,7 @@ printf(" -b set default pattern control 'fullbincode'\n");
|
|||
printf(" -C show PCRE2 compile-time options and exit\n");
|
||||
printf(" -C arg show a specific compile-time option and exit with its\n");
|
||||
printf(" value if numeric (else 0). The arg can be:\n");
|
||||
printf(" backslash-C use of \\C is enabled [0, 1]\n");
|
||||
printf(" bsr \\R type [ANYCRLF, ANY]\n");
|
||||
printf(" ebcdic compiled for EBCDIC character code [0,1]\n");
|
||||
printf(" ebcdic-nl NL code if compiled for EBCDIC\n");
|
||||
|
@ -6618,6 +6626,11 @@ print_newline_config(optval, FALSE);
|
|||
(void)PCRE2_CONFIG(PCRE2_CONFIG_BSR, &optval);
|
||||
printf(" \\R matches %s\n", optval? "CR, LF, or CRLF only" :
|
||||
"all Unicode newlines");
|
||||
#ifdef NEVER_BACKSLASH_C
|
||||
printf(" \\C is not supported\n");
|
||||
#else
|
||||
printf(" \\C is supported\n");
|
||||
#endif
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_LINKSIZE, &optval);
|
||||
printf(" Internal link size = %d\n", optval);
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_PARENSLIMIT, &optval);
|
||||
|
|
|
@ -1,46 +1,6 @@
|
|||
# This set of tests is for UTF-8 support and Unicode property support, with
|
||||
# relevance only for the 8-bit library.
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{1234}
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{1234}YZ
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
X\x{1234}
|
||||
X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
X\x{1234}\x{512}YZ
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
X\x{1234}
|
||||
X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
a\nb
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
\= Expect no match
|
||||
a\x{12257}b
|
||||
|
||||
# The next 3 patterns have UTF-8 errors
|
||||
|
||||
/[Ã]/utf
|
||||
|
@ -212,21 +172,6 @@
|
|||
|
||||
/\x{212ab}/IB,utf
|
||||
|
||||
# This one is here not because it's different to Perl, but because the way
|
||||
# the captured single-byte is displayed. (In Perl it becomes a character, and you
|
||||
# can't tell the difference.)
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
X\nabc
|
||||
|
||||
# This one is here because Perl gives out a grumbly error message (quite
|
||||
# correctly, but that messes up comparisons).
|
||||
|
||||
/a\Cb/utf
|
||||
\= Expect no match
|
||||
a\x{100}b
|
||||
|
||||
/[^ab\xC0-\xF0]/IB,utf
|
||||
\x{f1}
|
||||
\x{bf}
|
||||
|
|
|
@ -6,10 +6,6 @@
|
|||
#forbid_utf
|
||||
#newline_default LF ANY ANYCRLF
|
||||
|
||||
/a\Cb/
|
||||
aXb
|
||||
a\nb
|
||||
|
||||
/[^\x{c4}]/IB
|
||||
|
||||
/\x{100}/I
|
||||
|
@ -344,7 +340,7 @@
|
|||
|
||||
# Non-UTF characters
|
||||
|
||||
/\C{2,3}/
|
||||
/.{2,3}/
|
||||
\x{400000}\x{400001}\x{400002}\x{400003}
|
||||
|
||||
/\x{400000}\x{800000}/IBi
|
||||
|
|
|
@ -7,49 +7,6 @@
|
|||
/abc/utf
|
||||
Ã]
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{11234}YZ
|
||||
X\x{11234}YZW
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
X\x{11234}\x{512}
|
||||
X\x{11234}\x{512}YZ
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
X\x{11234}\x{512}YZ
|
||||
\= Expect no match
|
||||
X\x{11234}
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
a\nb
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{12257}b
|
||||
\= Expect no match
|
||||
a\x{12257}\x{11234}b
|
||||
a\x{100}b
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
|
||||
# Check maximum character size
|
||||
|
||||
/\x{ffff}/IB,utf
|
||||
|
@ -90,16 +47,6 @@
|
|||
|
||||
/\x{212ab}/IB,utf
|
||||
|
||||
# These two \C tests, copied from the UTF-8 input file, do not have any
|
||||
# problems in 16 or 32 bits.
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
X\nabc
|
||||
|
||||
/a\Cb/utf
|
||||
a\x{100}b
|
||||
|
||||
/[^ab\xC0-\xF0]/IB,utf
|
||||
\x{f1}
|
||||
\x{bf}
|
||||
|
@ -336,9 +283,6 @@
|
|||
|
||||
/\o{4200000}/utf
|
||||
|
||||
/\C/utf
|
||||
\x{110000}
|
||||
|
||||
/\x{100}*A/IB,utf
|
||||
A
|
||||
|
||||
|
@ -396,4 +340,7 @@
|
|||
|
||||
/\x{3a3}B/IBi,utf
|
||||
|
||||
/./utf
|
||||
\x{110000}
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -3739,41 +3739,40 @@
|
|||
|
||||
/[bcd]*a/B
|
||||
|
||||
# A complete set of tests for auto-possessification of character types.
|
||||
# A complete set of tests for auto-possessification of character types, but
|
||||
# omitting \C because it might be disabled (it has its own tests).
|
||||
|
||||
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\C \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
|
||||
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
|
||||
|
||||
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\C \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
|
||||
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
|
||||
|
||||
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\C \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
|
||||
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
|
||||
|
||||
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\C \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
|
||||
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
|
||||
|
||||
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\C \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
|
||||
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
|
||||
|
||||
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\C \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
|
||||
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
|
||||
|
||||
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\C \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
|
||||
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
|
||||
|
||||
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\C \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
|
||||
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
|
||||
|
||||
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\C \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
|
||||
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
|
||||
|
||||
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\C \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
|
||||
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
|
||||
|
||||
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\C \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
|
||||
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
|
||||
|
||||
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\C \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
|
||||
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
|
||||
|
||||
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\C a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
|
||||
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
|
||||
|
||||
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\C \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
|
||||
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
|
||||
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
|
||||
|
||||
/\D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \C+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
|
||||
/ \D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
|
||||
|
||||
/(?=a+)a(a+)++a/B
|
||||
|
||||
|
@ -4327,8 +4326,6 @@
|
|||
|
||||
/((?2){73}(?2))((?1))/info
|
||||
|
||||
/ab\Cde/never_backslash_c
|
||||
|
||||
/abc/
|
||||
\= Expect no match
|
||||
\[9x!xxx(]{9999}
|
||||
|
@ -4446,12 +4443,6 @@
|
|||
/\x0{ab}/
|
||||
\0{ab}
|
||||
|
||||
/ab\Cde/
|
||||
abXde
|
||||
|
||||
/(?<=ab\Cde)X/
|
||||
abZdeX
|
||||
|
||||
/^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/
|
||||
ababababbbabZXXXX
|
||||
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
# These are tests of \C that do not involve UTF. They are not run when \C is
|
||||
# disabled by compiling with --enable-never-backslash-C.
|
||||
|
||||
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
|
||||
|
||||
/\D+\C \d+\C \S+\C \s+\C \W+\C \w+\C .+\C \R+\C \H+\C \h+\C \V+\C \v+\C a+\C \n+\C \C+\C/Bx
|
||||
|
||||
/ab\Cde/never_backslash_c
|
||||
|
||||
/ab\Cde/
|
||||
abXde
|
||||
|
||||
/(?<=ab\Cde)X/
|
||||
abZdeX
|
||||
|
||||
# End of testinput21
|
|
@ -0,0 +1,95 @@
|
|||
# Tests of \C when Unicode support is available. Note that \C is not supported
|
||||
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
|
||||
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
|
||||
# in some widths and not in others.
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
|
||||
# This should produce an error diagnostic (\C in UTF lookbehind)
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
|
||||
# Autopossessification tests
|
||||
|
||||
/\C+\X \X+\C/Bx
|
||||
|
||||
/\C+\X \X+\C/Bx,utf
|
||||
|
||||
/\C\X*TӅ;
|
||||
{0,6}\v+
F
|
||||
/utf
|
||||
\= Expect no match
|
||||
Ӆ\x0a
|
||||
|
||||
/\C(\W?ſ)'?{{/utf
|
||||
\= Expect no match
|
||||
\\C(\\W?ſ)'?{{
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{1234}
|
||||
X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{1234}YZ
|
||||
X\x{11234}YZ
|
||||
X\x{11234}YZW
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
X\x{1234}
|
||||
X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
X\x{1234}\x{512}YZ
|
||||
X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
X\x{11234}\x{512}
|
||||
X\x{11234}\x{512}YZ
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
X\x{1234}
|
||||
X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
X\x{11234}\x{512}YZ
|
||||
X\x{11234}
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
a\nb
|
||||
a\x{100}b
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
a\x{12257}b
|
||||
a\x{12257}\x{11234}b
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
|
||||
# This one is here not because it's different to Perl, but because the way
|
||||
# the captured single code unit is displayed. (In Perl it becomes a character,
|
||||
# and you can't tell the difference.)
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
X\nabc
|
||||
|
||||
# This one is here because Perl gives out a grumbly error message (quite
|
||||
# correctly, but that messes up comparisons).
|
||||
|
||||
/a\Cb/utf
|
||||
\= Expect no match in 8-bit mode
|
||||
a\x{100}b
|
||||
|
||||
# End of testinput22
|
|
@ -0,0 +1,7 @@
|
|||
# This test is run when PCRE2 has been built with --enable-never-backslash-C,
|
||||
# which disables the use of \C. All we can do is check that it gives the
|
||||
# correct error message.
|
||||
|
||||
/a\Cb/
|
||||
|
||||
# End of testinput23
|
|
@ -111,9 +111,6 @@
|
|||
/.{3,5}?/IB,utf
|
||||
\x{212ab}\x{212ab}\x{212ab}\x{861}
|
||||
|
||||
/(?<=\C)X/utf
|
||||
Should produce an error diagnostic
|
||||
|
||||
/^[ab]/IB,utf
|
||||
bar
|
||||
\= Expect no match
|
||||
|
@ -1367,8 +1364,6 @@
|
|||
\= Expect no match
|
||||
aAz
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
|
||||
/\X/
|
||||
a\=ps
|
||||
a\=ph
|
||||
|
@ -1617,13 +1612,13 @@
|
|||
|
||||
/[\p{L}ab]{2,3}+/B,no_auto_possess
|
||||
|
||||
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \C+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
|
||||
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
|
||||
|
||||
/.+\X/Bsx
|
||||
|
||||
/\X+$/Bmx
|
||||
|
||||
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\C \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
|
||||
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
|
||||
|
||||
/\d+\s{0,5}=\s*\S?=\w{0,4}\W*/B,utf,ucp
|
||||
|
||||
|
@ -1665,16 +1660,6 @@
|
|||
|
||||
"(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
|
||||
|
||||
/\C\X*TӅ;
|
||||
{0,6}\v+
F
|
||||
/utf
|
||||
\= Expect no match
|
||||
Ӆ\x0a
|
||||
|
||||
/\C(\W?ſ)'?{{/utf
|
||||
\= Expect no match
|
||||
\\C(\\W?ſ)'?{{
|
||||
|
||||
/[\pS#moq]/
|
||||
=
|
||||
|
||||
|
|
|
@ -4645,12 +4645,6 @@
|
|||
aaaa\=ovector=3
|
||||
aaaa\=ovector=4
|
||||
|
||||
/ab\Cde/
|
||||
abXde
|
||||
|
||||
/(?<=ab\Cde)X/
|
||||
abZdeX
|
||||
|
||||
/^\R/
|
||||
\r\=ps
|
||||
\r\=ph
|
||||
|
|
|
@ -671,11 +671,6 @@
|
|||
the cat\=ps
|
||||
the cat\=ph
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
|
||||
/./newline=crlf,utf
|
||||
\r\=ps
|
||||
\r\=ph
|
||||
|
|
|
@ -4,10 +4,8 @@
|
|||
#forbid_utf
|
||||
#newline_default lf any anycrlf
|
||||
|
||||
/a\Cb/
|
||||
aXb
|
||||
a\nb
|
||||
\= Expect no match and error message (too big char)
|
||||
/ab/
|
||||
\= Expect error message (too big char) and no match
|
||||
A\x{123}B
|
||||
A\o{443}B
|
||||
|
||||
|
|
|
@ -1,67 +1,6 @@
|
|||
# This set of tests is for UTF-8 support and Unicode property support, with
|
||||
# relevance only for the 8-bit library.
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{1234}
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}Y
|
||||
1: \x{1234}Y
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
0: XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
0: X
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
0: Xabcde
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
0: X\x{1234}\x{512}
|
||||
X\x{1234}\x{512}YZ
|
||||
0: X\x{1234}\x{512}
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
0: Xabc
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}
|
||||
X\x{1234}\x{512}
|
||||
0: X\x{1234}
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x{0a}b
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
\= Expect no match
|
||||
a\x{12257}b
|
||||
No match
|
||||
|
||||
# The next 3 patterns have UTF-8 errors
|
||||
|
||||
/[Ã]/utf
|
||||
|
@ -511,28 +450,6 @@ First code unit = \xf0
|
|||
Last code unit = \xab
|
||||
Subject length lower bound = 1
|
||||
|
||||
# This one is here not because it's different to Perl, but because the way
|
||||
# the captured single-byte is displayed. (In Perl it becomes a character, and you
|
||||
# can't tell the difference.)
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{e1}
|
||||
2: \x{88}\x{b4}
|
||||
X\nabc
|
||||
0: X\x{0a}abc
|
||||
1: \x{0a}
|
||||
2: abc
|
||||
|
||||
# This one is here because Perl gives out a grumbly error message (quite
|
||||
# correctly, but that messes up comparisons).
|
||||
|
||||
/a\Cb/utf
|
||||
\= Expect no match
|
||||
a\x{100}b
|
||||
No match
|
||||
|
||||
/[^ab\xC0-\xF0]/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
|
|
@ -6,12 +6,6 @@
|
|||
#forbid_utf
|
||||
#newline_default LF ANY ANYCRLF
|
||||
|
||||
/a\Cb/
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x0ab
|
||||
|
||||
/[^\x{c4}]/IB
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -582,7 +576,7 @@ Failed: error 134 at offset 11: character code point value in \x{} or \o{} is to
|
|||
|
||||
# Non-UTF characters
|
||||
|
||||
/\C{2,3}/
|
||||
/.{2,3}/
|
||||
\x{400000}\x{400001}\x{400002}\x{400003}
|
||||
** Character \x{400000} is greater than 0xffff and UTF-16 mode is not enabled.
|
||||
** Truncation will probably give the wrong result.
|
||||
|
|
|
@ -6,12 +6,6 @@
|
|||
#forbid_utf
|
||||
#newline_default LF ANY ANYCRLF
|
||||
|
||||
/a\Cb/
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x0ab
|
||||
|
||||
/[^\x{c4}]/IB
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -583,7 +577,7 @@ Subject length lower bound = 2
|
|||
|
||||
# Non-UTF characters
|
||||
|
||||
/\C{2,3}/
|
||||
/.{2,3}/
|
||||
\x{400000}\x{400001}\x{400002}\x{400003}
|
||||
0: \x{400000}\x{400001}\x{400002}
|
||||
|
||||
|
|
|
@ -9,76 +9,6 @@
|
|||
Ã]
|
||||
** Failed: invalid UTF-8 string cannot be used as input in UTF mode
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
1: \x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}Y
|
||||
1: \x{11234}Y
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
1: \x{11234}YZ
|
||||
X\x{11234}YZW
|
||||
0: X\x{11234}YZ
|
||||
1: \x{11234}YZ
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
0: XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
0: X
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
0: Xabcde
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
X\x{11234}\x{512}
|
||||
0: X\x{11234}\x{512}
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}YZ
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
0: X\x{11234}\x{512}\x{11234}
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
0: Xabc
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}
|
||||
\= Expect no match
|
||||
X\x{11234}
|
||||
No match
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x{0a}b
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{12257}b
|
||||
0: a\x{12257}b
|
||||
\= Expect no match
|
||||
a\x{12257}\x{11234}b
|
||||
No match
|
||||
a\x{100}b
|
||||
No match
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# Check maximum character size
|
||||
|
||||
/\x{ffff}/IB,utf
|
||||
|
@ -308,23 +238,6 @@ First code unit = \x{d844}
|
|||
Last code unit = \x{deab}
|
||||
Subject length lower bound = 1
|
||||
|
||||
# These two \C tests, copied from the UTF-8 input file, do not have any
|
||||
# problems in 16 or 32 bits.
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{1234}
|
||||
2:
|
||||
X\nabc
|
||||
0: X\x{0a}abc
|
||||
1: \x{0a}
|
||||
2: abc
|
||||
|
||||
/a\Cb/utf
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
/[^ab\xC0-\xF0]/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -1127,10 +1040,6 @@ Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too
|
|||
/\o{4200000}/utf
|
||||
Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
|
||||
|
||||
/\C/utf
|
||||
\x{110000}
|
||||
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
||||
|
||||
/\x{100}*A/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -1454,4 +1363,8 @@ Starting code units: \xff
|
|||
Last code unit = 'B' (caseless)
|
||||
Subject length lower bound = 2
|
||||
|
||||
/./utf
|
||||
\x{110000}
|
||||
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -9,74 +9,6 @@
|
|||
Ã]
|
||||
** Failed: invalid UTF-8 string cannot be used as input in UTF mode
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{11234}Y
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
1: \x{11234}YZ
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{11234}YZ
|
||||
No match
|
||||
X\x{11234}YZW
|
||||
0: X\x{11234}YZW
|
||||
1: \x{11234}YZW
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
0: XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
0: X
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
0: Xabcde
|
||||
X\x{11234}Y
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
X\x{11234}\x{512}
|
||||
No match
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}YZ
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
0: X\x{11234}\x{512}\x{11234}Z
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
0: Xabc
|
||||
X\x{11234}Y
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}Y
|
||||
\= Expect no match
|
||||
X\x{11234}
|
||||
No match
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x{0a}b
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{12257}b
|
||||
No match
|
||||
\= Expect no match
|
||||
a\x{12257}\x{11234}b
|
||||
0: a\x{12257}\x{11234}b
|
||||
a\x{100}b
|
||||
No match
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# Check maximum character size
|
||||
|
||||
/\x{ffff}/IB,utf
|
||||
|
@ -301,23 +233,6 @@ Options: utf
|
|||
First code unit = \x{212ab}
|
||||
Subject length lower bound = 1
|
||||
|
||||
# These two \C tests, copied from the UTF-8 input file, do not have any
|
||||
# problems in 16 or 32 bits.
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{1234}
|
||||
2:
|
||||
X\nabc
|
||||
0: X\x{0a}abc
|
||||
1: \x{0a}
|
||||
2: abc
|
||||
|
||||
/a\Cb/utf
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
/[^ab\xC0-\xF0]/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -1119,10 +1034,6 @@ Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too
|
|||
/\o{4200000}/utf
|
||||
Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
|
||||
|
||||
/\C/utf
|
||||
\x{110000}
|
||||
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 0
|
||||
|
||||
/\x{100}*A/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -1446,4 +1357,8 @@ Starting code units: \xff
|
|||
Last code unit = 'B' (caseless)
|
||||
Subject length lower bound = 2
|
||||
|
||||
/./utf
|
||||
\x{110000}
|
||||
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 0
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -11948,9 +11948,10 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
# A complete set of tests for auto-possessification of character types.
|
||||
# A complete set of tests for auto-possessification of character types, but
|
||||
# omitting \C because it might be disabled (it has its own tests).
|
||||
|
||||
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\C \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
|
||||
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\D+
|
||||
|
@ -11968,8 +11969,6 @@ Subject length lower bound = 5
|
|||
\D+
|
||||
Any
|
||||
\D+
|
||||
AllAny
|
||||
\D+
|
||||
\R
|
||||
\D+
|
||||
\H
|
||||
|
@ -11989,7 +11988,7 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\C \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
|
||||
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\d++
|
||||
|
@ -12006,8 +12005,6 @@ Subject length lower bound = 5
|
|||
\w
|
||||
\d+
|
||||
Any
|
||||
\d+
|
||||
AllAny
|
||||
\d++
|
||||
\R
|
||||
\d+
|
||||
|
@ -12028,7 +12025,7 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\C \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
|
||||
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\S+
|
||||
|
@ -12045,8 +12042,6 @@ Subject length lower bound = 5
|
|||
\w
|
||||
\S+
|
||||
Any
|
||||
\S+
|
||||
AllAny
|
||||
\S++
|
||||
\R
|
||||
\S+
|
||||
|
@ -12067,7 +12062,7 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\C \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
|
||||
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\s+
|
||||
|
@ -12085,8 +12080,6 @@ Subject length lower bound = 5
|
|||
\s+
|
||||
Any
|
||||
\s+
|
||||
AllAny
|
||||
\s+
|
||||
\R
|
||||
\s+
|
||||
\H
|
||||
|
@ -12106,7 +12099,7 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\C \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
|
||||
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\W+
|
||||
|
@ -12124,8 +12117,6 @@ Subject length lower bound = 5
|
|||
\W+
|
||||
Any
|
||||
\W+
|
||||
AllAny
|
||||
\W+
|
||||
\R
|
||||
\W+
|
||||
\H
|
||||
|
@ -12145,7 +12136,7 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\C \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
|
||||
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\w+
|
||||
|
@ -12162,8 +12153,6 @@ Subject length lower bound = 5
|
|||
\w
|
||||
\w+
|
||||
Any
|
||||
\w+
|
||||
AllAny
|
||||
\w++
|
||||
\R
|
||||
\w+
|
||||
|
@ -12184,7 +12173,303 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\C \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
|
||||
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\R+
|
||||
\D
|
||||
\R++
|
||||
\d
|
||||
\R+
|
||||
\S
|
||||
\R++
|
||||
\s
|
||||
\R+
|
||||
\W
|
||||
\R++
|
||||
\w
|
||||
\R++
|
||||
Any
|
||||
\R+
|
||||
\R
|
||||
\R+
|
||||
\H
|
||||
\R++
|
||||
\h
|
||||
\R+
|
||||
\V
|
||||
\R+
|
||||
\v
|
||||
\R+
|
||||
\Z
|
||||
\R++
|
||||
\z
|
||||
\R+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\H+
|
||||
\D
|
||||
\H+
|
||||
\d
|
||||
\H+
|
||||
\S
|
||||
\H+
|
||||
\s
|
||||
\H+
|
||||
\W
|
||||
\H+
|
||||
\w
|
||||
\H+
|
||||
Any
|
||||
\H+
|
||||
\R
|
||||
\H+
|
||||
\H
|
||||
\H++
|
||||
\h
|
||||
\H+
|
||||
\V
|
||||
\H+
|
||||
\v
|
||||
\H+
|
||||
\Z
|
||||
\H++
|
||||
\z
|
||||
\H+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\h+
|
||||
\D
|
||||
\h++
|
||||
\d
|
||||
\h++
|
||||
\S
|
||||
\h+
|
||||
\s
|
||||
\h+
|
||||
\W
|
||||
\h++
|
||||
\w
|
||||
\h+
|
||||
Any
|
||||
\h++
|
||||
\R
|
||||
\h++
|
||||
\H
|
||||
\h+
|
||||
\h
|
||||
\h+
|
||||
\V
|
||||
\h++
|
||||
\v
|
||||
\h+
|
||||
\Z
|
||||
\h++
|
||||
\z
|
||||
\h+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\V+
|
||||
\D
|
||||
\V+
|
||||
\d
|
||||
\V+
|
||||
\S
|
||||
\V+
|
||||
\s
|
||||
\V+
|
||||
\W
|
||||
\V+
|
||||
\w
|
||||
\V+
|
||||
Any
|
||||
\V++
|
||||
\R
|
||||
\V+
|
||||
\H
|
||||
\V+
|
||||
\h
|
||||
\V+
|
||||
\V
|
||||
\V++
|
||||
\v
|
||||
\V+
|
||||
\Z
|
||||
\V++
|
||||
\z
|
||||
\V+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\v+
|
||||
\D
|
||||
\v++
|
||||
\d
|
||||
\v++
|
||||
\S
|
||||
\v+
|
||||
\s
|
||||
\v+
|
||||
\W
|
||||
\v++
|
||||
\w
|
||||
\v+
|
||||
Any
|
||||
\v+
|
||||
\R
|
||||
\v+
|
||||
\H
|
||||
\v++
|
||||
\h
|
||||
\v++
|
||||
\V
|
||||
\v+
|
||||
\v
|
||||
\v+
|
||||
\Z
|
||||
\v++
|
||||
\z
|
||||
\v+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
a+
|
||||
\D
|
||||
a++
|
||||
\d
|
||||
a+
|
||||
\S
|
||||
a++
|
||||
\s
|
||||
a++
|
||||
\W
|
||||
a+
|
||||
\w
|
||||
a+
|
||||
Any
|
||||
a++
|
||||
\R
|
||||
a+
|
||||
\H
|
||||
a++
|
||||
\h
|
||||
a+
|
||||
\V
|
||||
a++
|
||||
\v
|
||||
a++
|
||||
\Z
|
||||
a++
|
||||
\z
|
||||
a++
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\x0a+
|
||||
\D
|
||||
\x0a++
|
||||
\d
|
||||
\x0a++
|
||||
\S
|
||||
\x0a+
|
||||
\s
|
||||
\x0a+
|
||||
\W
|
||||
\x0a++
|
||||
\w
|
||||
\x0a+
|
||||
Any
|
||||
\x0a+
|
||||
\R
|
||||
\x0a+
|
||||
\H
|
||||
\x0a++
|
||||
\h
|
||||
\x0a++
|
||||
\V
|
||||
\x0a+
|
||||
\v
|
||||
\x0a+
|
||||
\Z
|
||||
\x0a++
|
||||
\z
|
||||
\x0a+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
Any+
|
||||
\D
|
||||
Any+
|
||||
\d
|
||||
Any+
|
||||
\S
|
||||
Any+
|
||||
\s
|
||||
Any+
|
||||
\W
|
||||
Any+
|
||||
\w
|
||||
Any+
|
||||
Any
|
||||
Any++
|
||||
\R
|
||||
Any+
|
||||
\H
|
||||
Any+
|
||||
\h
|
||||
Any+
|
||||
\V
|
||||
Any+
|
||||
\v
|
||||
Any+
|
||||
\Z
|
||||
Any++
|
||||
\z
|
||||
Any+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
AllAny+
|
||||
|
@ -12200,8 +12485,6 @@ Subject length lower bound = 5
|
|||
AllAny+
|
||||
\w
|
||||
AllAny+
|
||||
Any
|
||||
AllAny+
|
||||
AllAny
|
||||
AllAny+
|
||||
\R
|
||||
|
@ -12223,358 +12506,7 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\C \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\R+
|
||||
\D
|
||||
\R++
|
||||
\d
|
||||
\R+
|
||||
\S
|
||||
\R++
|
||||
\s
|
||||
\R+
|
||||
\W
|
||||
\R++
|
||||
\w
|
||||
\R++
|
||||
Any
|
||||
\R+
|
||||
AllAny
|
||||
\R+
|
||||
\R
|
||||
\R+
|
||||
\H
|
||||
\R++
|
||||
\h
|
||||
\R+
|
||||
\V
|
||||
\R+
|
||||
\v
|
||||
\R+
|
||||
\Z
|
||||
\R++
|
||||
\z
|
||||
\R+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\C \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\H+
|
||||
\D
|
||||
\H+
|
||||
\d
|
||||
\H+
|
||||
\S
|
||||
\H+
|
||||
\s
|
||||
\H+
|
||||
\W
|
||||
\H+
|
||||
\w
|
||||
\H+
|
||||
Any
|
||||
\H+
|
||||
AllAny
|
||||
\H+
|
||||
\R
|
||||
\H+
|
||||
\H
|
||||
\H++
|
||||
\h
|
||||
\H+
|
||||
\V
|
||||
\H+
|
||||
\v
|
||||
\H+
|
||||
\Z
|
||||
\H++
|
||||
\z
|
||||
\H+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\C \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\h+
|
||||
\D
|
||||
\h++
|
||||
\d
|
||||
\h++
|
||||
\S
|
||||
\h+
|
||||
\s
|
||||
\h+
|
||||
\W
|
||||
\h++
|
||||
\w
|
||||
\h+
|
||||
Any
|
||||
\h+
|
||||
AllAny
|
||||
\h++
|
||||
\R
|
||||
\h++
|
||||
\H
|
||||
\h+
|
||||
\h
|
||||
\h+
|
||||
\V
|
||||
\h++
|
||||
\v
|
||||
\h+
|
||||
\Z
|
||||
\h++
|
||||
\z
|
||||
\h+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\C \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\V+
|
||||
\D
|
||||
\V+
|
||||
\d
|
||||
\V+
|
||||
\S
|
||||
\V+
|
||||
\s
|
||||
\V+
|
||||
\W
|
||||
\V+
|
||||
\w
|
||||
\V+
|
||||
Any
|
||||
\V+
|
||||
AllAny
|
||||
\V++
|
||||
\R
|
||||
\V+
|
||||
\H
|
||||
\V+
|
||||
\h
|
||||
\V+
|
||||
\V
|
||||
\V++
|
||||
\v
|
||||
\V+
|
||||
\Z
|
||||
\V++
|
||||
\z
|
||||
\V+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\C \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\v+
|
||||
\D
|
||||
\v++
|
||||
\d
|
||||
\v++
|
||||
\S
|
||||
\v+
|
||||
\s
|
||||
\v+
|
||||
\W
|
||||
\v++
|
||||
\w
|
||||
\v+
|
||||
Any
|
||||
\v+
|
||||
AllAny
|
||||
\v+
|
||||
\R
|
||||
\v+
|
||||
\H
|
||||
\v++
|
||||
\h
|
||||
\v++
|
||||
\V
|
||||
\v+
|
||||
\v
|
||||
\v+
|
||||
\Z
|
||||
\v++
|
||||
\z
|
||||
\v+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\C a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
a+
|
||||
\D
|
||||
a++
|
||||
\d
|
||||
a+
|
||||
\S
|
||||
a++
|
||||
\s
|
||||
a++
|
||||
\W
|
||||
a+
|
||||
\w
|
||||
a+
|
||||
Any
|
||||
a+
|
||||
AllAny
|
||||
a++
|
||||
\R
|
||||
a+
|
||||
\H
|
||||
a++
|
||||
\h
|
||||
a+
|
||||
\V
|
||||
a++
|
||||
\v
|
||||
a++
|
||||
\Z
|
||||
a++
|
||||
\z
|
||||
a++
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\C \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\x0a+
|
||||
\D
|
||||
\x0a++
|
||||
\d
|
||||
\x0a++
|
||||
\S
|
||||
\x0a+
|
||||
\s
|
||||
\x0a+
|
||||
\W
|
||||
\x0a++
|
||||
\w
|
||||
\x0a+
|
||||
Any
|
||||
\x0a+
|
||||
AllAny
|
||||
\x0a+
|
||||
\R
|
||||
\x0a+
|
||||
\H
|
||||
\x0a++
|
||||
\h
|
||||
\x0a++
|
||||
\V
|
||||
\x0a+
|
||||
\v
|
||||
\x0a+
|
||||
\Z
|
||||
\x0a++
|
||||
\z
|
||||
\x0a+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
Any+
|
||||
\D
|
||||
Any+
|
||||
\d
|
||||
Any+
|
||||
\S
|
||||
Any+
|
||||
\s
|
||||
Any+
|
||||
\W
|
||||
Any+
|
||||
\w
|
||||
Any+
|
||||
Any
|
||||
Any+
|
||||
AllAny
|
||||
Any++
|
||||
\R
|
||||
Any+
|
||||
\H
|
||||
Any+
|
||||
\h
|
||||
Any+
|
||||
\V
|
||||
Any+
|
||||
\v
|
||||
Any+
|
||||
\Z
|
||||
Any++
|
||||
\z
|
||||
Any+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
AllAny+
|
||||
\D
|
||||
AllAny+
|
||||
\d
|
||||
AllAny+
|
||||
\S
|
||||
AllAny+
|
||||
\s
|
||||
AllAny+
|
||||
\W
|
||||
AllAny+
|
||||
\w
|
||||
AllAny+
|
||||
AllAny
|
||||
AllAny+
|
||||
AllAny
|
||||
AllAny+
|
||||
\R
|
||||
AllAny+
|
||||
\H
|
||||
AllAny+
|
||||
\h
|
||||
AllAny+
|
||||
\V
|
||||
AllAny+
|
||||
\v
|
||||
AllAny+
|
||||
\Z
|
||||
AllAny++
|
||||
\z
|
||||
AllAny+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \C+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
|
||||
/ \D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\D+
|
||||
|
@ -12588,8 +12520,6 @@ Subject length lower bound = 5
|
|||
\W+
|
||||
/m $
|
||||
\w++
|
||||
/m $
|
||||
AllAny+
|
||||
/m $
|
||||
\R+
|
||||
/m $
|
||||
|
@ -14210,9 +14140,6 @@ Capturing subpattern count = 2
|
|||
May match empty string
|
||||
Subject length lower bound = 0
|
||||
|
||||
/ab\Cde/never_backslash_c
|
||||
Failed: error 183 at offset 3: using \C is disabled by the application
|
||||
|
||||
/abc/
|
||||
\= Expect no match
|
||||
\[9x!xxx(]{9999}
|
||||
|
@ -14532,14 +14459,6 @@ Subject length lower bound = 0
|
|||
\0{ab}
|
||||
0: \x00{ab}
|
||||
|
||||
/ab\Cde/
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
/(?<=ab\Cde)X/
|
||||
abZdeX
|
||||
0: X
|
||||
|
||||
/^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/
|
||||
ababababbbabZXXXX
|
||||
0: ababababbbabZ
|
||||
|
|
|
@ -0,0 +1,89 @@
|
|||
# These are tests of \C that do not involve UTF. They are not run when \C is
|
||||
# disabled by compiling with --enable-never-backslash-C.
|
||||
|
||||
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
AllAny+
|
||||
\D
|
||||
AllAny+
|
||||
\d
|
||||
AllAny+
|
||||
\S
|
||||
AllAny+
|
||||
\s
|
||||
AllAny+
|
||||
\W
|
||||
AllAny+
|
||||
\w
|
||||
AllAny+
|
||||
Any
|
||||
AllAny+
|
||||
\R
|
||||
AllAny+
|
||||
\H
|
||||
AllAny+
|
||||
\h
|
||||
AllAny+
|
||||
\V
|
||||
AllAny+
|
||||
\v
|
||||
AllAny+
|
||||
\Z
|
||||
AllAny++
|
||||
\z
|
||||
AllAny+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\D+\C \d+\C \S+\C \s+\C \W+\C \w+\C .+\C \R+\C \H+\C \h+\C \V+\C \v+\C a+\C \n+\C \C+\C/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\D+
|
||||
AllAny
|
||||
\d+
|
||||
AllAny
|
||||
\S+
|
||||
AllAny
|
||||
\s+
|
||||
AllAny
|
||||
\W+
|
||||
AllAny
|
||||
\w+
|
||||
AllAny
|
||||
Any+
|
||||
AllAny
|
||||
\R+
|
||||
AllAny
|
||||
\H+
|
||||
AllAny
|
||||
\h+
|
||||
AllAny
|
||||
\V+
|
||||
AllAny
|
||||
\v+
|
||||
AllAny
|
||||
a+
|
||||
AllAny
|
||||
\x0a+
|
||||
AllAny
|
||||
AllAny+
|
||||
AllAny
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ab\Cde/never_backslash_c
|
||||
Failed: error 183 at offset 3: using \C is disabled by the application
|
||||
|
||||
/ab\Cde/
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
/(?<=ab\Cde)X/
|
||||
abZdeX
|
||||
0: X
|
||||
|
||||
# End of testinput21
|
|
@ -0,0 +1,161 @@
|
|||
# Tests of \C when Unicode support is available. Note that \C is not supported
|
||||
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
|
||||
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
|
||||
# in some widths and not in others.
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# This should produce an error diagnostic (\C in UTF lookbehind)
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||
|
||||
# Autopossessification tests
|
||||
|
||||
/\C+\X \X+\C/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
AllAny+
|
||||
extuni
|
||||
extuni+
|
||||
AllAny
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C+\X \X+\C/Bx,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
Anybyte+
|
||||
extuni
|
||||
extuni+
|
||||
Anybyte
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C\X*TӅ;
|
||||
{0,6}\v+
F
|
||||
/utf
|
||||
\= Expect no match
|
||||
Ӆ\x0a
|
||||
No match
|
||||
|
||||
/\C(\W?ſ)'?{{/utf
|
||||
\= Expect no match
|
||||
\\C(\\W?ſ)'?{{
|
||||
No match
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{1234}
|
||||
No match
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
1: \x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}Y
|
||||
1: \x{11234}Y
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{1234}YZ
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
1: \x{11234}YZ
|
||||
X\x{11234}YZW
|
||||
0: X\x{11234}YZ
|
||||
1: \x{11234}YZ
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
0: XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
0: X
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
0: Xabcde
|
||||
X\x{1234}
|
||||
No match
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
No match
|
||||
X\x{1234}\x{512}YZ
|
||||
0: X\x{1234}\x{512}YZ
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
X\x{11234}\x{512}
|
||||
0: X\x{11234}\x{512}
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}YZ
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
0: X\x{11234}\x{512}\x{11234}
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
0: Xabc
|
||||
X\x{1234}
|
||||
No match
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
No match
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}
|
||||
X\x{11234}
|
||||
No match
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x{0a}b
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
No match
|
||||
a\x{12257}b
|
||||
0: a\x{12257}b
|
||||
a\x{12257}\x{11234}b
|
||||
No match
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# This one is here not because it's different to Perl, but because the way
|
||||
# the captured single code unit is displayed. (In Perl it becomes a character,
|
||||
# and you can't tell the difference.)
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{1234}
|
||||
2:
|
||||
X\nabc
|
||||
0: X\x{0a}abc
|
||||
1: \x{0a}
|
||||
2: abc
|
||||
|
||||
# This one is here because Perl gives out a grumbly error message (quite
|
||||
# correctly, but that messes up comparisons).
|
||||
|
||||
/a\Cb/utf
|
||||
\= Expect no match in 8-bit mode
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
# End of testinput22
|
|
@ -0,0 +1,159 @@
|
|||
# Tests of \C when Unicode support is available. Note that \C is not supported
|
||||
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
|
||||
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
|
||||
# in some widths and not in others.
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# This should produce an error diagnostic (\C in UTF lookbehind)
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||
|
||||
# Autopossessification tests
|
||||
|
||||
/\C+\X \X+\C/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
AllAny+
|
||||
extuni
|
||||
extuni+
|
||||
AllAny
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C+\X \X+\C/Bx,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
Anybyte+
|
||||
extuni
|
||||
extuni+
|
||||
Anybyte
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C\X*TӅ;
|
||||
{0,6}\v+
F
|
||||
/utf
|
||||
\= Expect no match
|
||||
Ӆ\x0a
|
||||
No match
|
||||
|
||||
/\C(\W?ſ)'?{{/utf
|
||||
\= Expect no match
|
||||
\\C(\\W?ſ)'?{{
|
||||
No match
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{1234}
|
||||
No match
|
||||
X\x{11234}Y
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
1: \x{11234}YZ
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{1234}YZ
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
No match
|
||||
X\x{11234}YZW
|
||||
0: X\x{11234}YZW
|
||||
1: \x{11234}YZW
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
0: XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
0: X
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
0: Xabcde
|
||||
X\x{1234}
|
||||
No match
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
No match
|
||||
X\x{1234}\x{512}YZ
|
||||
0: X\x{1234}\x{512}YZ
|
||||
X\x{11234}Y
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
X\x{11234}\x{512}
|
||||
No match
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}YZ
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
0: X\x{11234}\x{512}\x{11234}Z
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
0: Xabc
|
||||
X\x{1234}
|
||||
No match
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
No match
|
||||
X\x{11234}Y
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}Y
|
||||
X\x{11234}
|
||||
No match
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x{0a}b
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
No match
|
||||
a\x{12257}b
|
||||
No match
|
||||
a\x{12257}\x{11234}b
|
||||
0: a\x{12257}\x{11234}b
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# This one is here not because it's different to Perl, but because the way
|
||||
# the captured single code unit is displayed. (In Perl it becomes a character,
|
||||
# and you can't tell the difference.)
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{1234}
|
||||
2:
|
||||
X\nabc
|
||||
0: X\x{0a}abc
|
||||
1: \x{0a}
|
||||
2: abc
|
||||
|
||||
# This one is here because Perl gives out a grumbly error message (quite
|
||||
# correctly, but that messes up comparisons).
|
||||
|
||||
/a\Cb/utf
|
||||
\= Expect no match in 8-bit mode
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
# End of testinput22
|
|
@ -0,0 +1,163 @@
|
|||
# Tests of \C when Unicode support is available. Note that \C is not supported
|
||||
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
|
||||
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
|
||||
# in some widths and not in others.
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# This should produce an error diagnostic (\C in UTF lookbehind)
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||
|
||||
# Autopossessification tests
|
||||
|
||||
/\C+\X \X+\C/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
AllAny+
|
||||
extuni
|
||||
extuni+
|
||||
AllAny
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C+\X \X+\C/Bx,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
Anybyte+
|
||||
extuni
|
||||
extuni+
|
||||
Anybyte
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C\X*TӅ;
|
||||
{0,6}\v+
F
|
||||
/utf
|
||||
\= Expect no match
|
||||
Ӆ\x0a
|
||||
No match
|
||||
|
||||
/\C(\W?ſ)'?{{/utf
|
||||
\= Expect no match
|
||||
\\C(\\W?ſ)'?{{
|
||||
No match
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{1234}
|
||||
X\x{11234}Y
|
||||
0: X\x{f0}\x{91}\x{88}
|
||||
1: \x{f0}\x{91}\x{88}
|
||||
X\x{11234}YZ
|
||||
0: X\x{f0}\x{91}\x{88}
|
||||
1: \x{f0}\x{91}\x{88}
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}Y
|
||||
1: \x{1234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}
|
||||
1: \x{11234}
|
||||
X\x{11234}YZW
|
||||
0: X\x{11234}
|
||||
1: \x{11234}
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
0: XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
0: X
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
0: Xabcde
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
0: X\x{1234}\x{512}
|
||||
X\x{1234}\x{512}YZ
|
||||
0: X\x{1234}\x{512}
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}\x{512}
|
||||
0: X\x{11234}\x{d4}
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{d4}
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
0: X\x{11234}\x{d4}
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
0: Xabc
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}
|
||||
X\x{1234}\x{512}
|
||||
0: X\x{1234}
|
||||
X\x{11234}Y
|
||||
0: X\x{f0}\x{91}\x{88}
|
||||
X\x{11234}YZ
|
||||
0: X\x{f0}\x{91}\x{88}
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{f0}\x{91}\x{88}
|
||||
X\x{11234}
|
||||
0: X\x{f0}\x{91}\x{88}
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x{0a}b
|
||||
a\x{100}b
|
||||
No match
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
a\x{12257}b
|
||||
No match
|
||||
a\x{12257}\x{11234}b
|
||||
No match
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# This one is here not because it's different to Perl, but because the way
|
||||
# the captured single code unit is displayed. (In Perl it becomes a character,
|
||||
# and you can't tell the difference.)
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{e1}
|
||||
2: \x{88}\x{b4}
|
||||
X\nabc
|
||||
0: X\x{0a}abc
|
||||
1: \x{0a}
|
||||
2: abc
|
||||
|
||||
# This one is here because Perl gives out a grumbly error message (quite
|
||||
# correctly, but that messes up comparisons).
|
||||
|
||||
/a\Cb/utf
|
||||
\= Expect no match in 8-bit mode
|
||||
a\x{100}b
|
||||
No match
|
||||
|
||||
# End of testinput22
|
|
@ -0,0 +1,8 @@
|
|||
# This test is run when PCRE2 has been built with --enable-never-backslash-C,
|
||||
# which disables the use of \C. All we can do is check that it gives the
|
||||
# correct error message.
|
||||
|
||||
/a\Cb/
|
||||
Failed: error 185 at offset 2: using \C is disabled in this PCRE2 library
|
||||
|
||||
# End of testinput23
|
|
@ -181,10 +181,6 @@ Subject length lower bound = 3
|
|||
\x{212ab}\x{212ab}\x{212ab}\x{861}
|
||||
0: \x{212ab}\x{212ab}\x{212ab}
|
||||
|
||||
/(?<=\C)X/utf
|
||||
Failed: error 136 at offset 6: \C is not allowed in a lookbehind assertion
|
||||
Should produce an error diagnostic
|
||||
|
||||
/^[ab]/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -2905,9 +2901,6 @@ No match
|
|||
aAz
|
||||
No match
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||
|
||||
/\X/
|
||||
a\=ps
|
||||
0: a
|
||||
|
@ -3803,7 +3796,7 @@ No match
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \C+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
|
||||
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\D+
|
||||
|
@ -3818,8 +3811,6 @@ No match
|
|||
extuni
|
||||
\w+
|
||||
extuni
|
||||
AllAny+
|
||||
extuni
|
||||
\R+
|
||||
extuni
|
||||
\H+
|
||||
|
@ -3858,7 +3849,7 @@ No match
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\C \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
|
||||
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
extuni+
|
||||
|
@ -3876,8 +3867,6 @@ No match
|
|||
extuni+
|
||||
Any
|
||||
extuni+
|
||||
AllAny
|
||||
extuni+
|
||||
\R
|
||||
extuni+
|
||||
\H
|
||||
|
@ -4010,18 +3999,6 @@ Failed: error 122 at offset 1227: unmatched closing parenthesis
|
|||
"(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
|
||||
Failed: error 124 at offset 113: letter or underscore expected after (?< or (?'
|
||||
|
||||
/\C\X*TӅ;
|
||||
{0,6}\v+
F
|
||||
/utf
|
||||
\= Expect no match
|
||||
Ӆ\x0a
|
||||
No match
|
||||
|
||||
/\C(\W?ſ)'?{{/utf
|
||||
\= Expect no match
|
||||
\\C(\\W?ſ)'?{{
|
||||
No match
|
||||
|
||||
/[\pS#moq]/
|
||||
=
|
||||
0: =
|
||||
|
|
|
@ -7174,14 +7174,6 @@ Matched, but offsets vector is too small to show all matches
|
|||
2: aa
|
||||
3: a
|
||||
|
||||
/ab\Cde/
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
/(?<=ab\Cde)X/
|
||||
abZdeX
|
||||
0: X
|
||||
|
||||
/^\R/
|
||||
\r\=ps
|
||||
0: \x0d
|
||||
|
|
|
@ -1141,13 +1141,6 @@ Partial match: abcde
|
|||
the cat\=ph
|
||||
Partial match: the cat
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
Failed: error -42: pattern contains an item that is not supported for DFA matching
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||
|
||||
/./newline=crlf,utf
|
||||
\r\=ps
|
||||
0: \x{0d}
|
||||
|
|
|
@ -4,12 +4,8 @@
|
|||
#forbid_utf
|
||||
#newline_default lf any anycrlf
|
||||
|
||||
/a\Cb/
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x0ab
|
||||
\= Expect no match and error message (too big char)
|
||||
/ab/
|
||||
\= Expect error message (too big char) and no match
|
||||
A\x{123}B
|
||||
** Character \x{123} is greater than 255 and UTF-8 mode is not enabled.
|
||||
** Truncation will probably give the wrong result.
|
||||
|
|
Loading…
Reference in New Issue