Implement --never-backslash-C
This commit is contained in:
parent
5923caf05e
commit
3263d44b97
|
@ -70,6 +70,7 @@
|
|||
# 2015-04-24 PH added support for PCRE2_DEBUG
|
||||
# 2015-07-16 PH updated for new pcre2_find_bracket source module
|
||||
# 2015-08-24 PH correct C_FLAGS setting (patch from Roy Ivy III)
|
||||
# 2015-10-16 PH added support for never-backslash-C
|
||||
|
||||
PROJECT(PCRE2 C)
|
||||
|
||||
|
@ -162,6 +163,9 @@ SET(PCRE2_SUPPORT_UNICODE ON CACHE BOOL
|
|||
SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
|
||||
"ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks")
|
||||
|
||||
SET(PCRE2_NEVER_BACKSLASH_C OFF CACHE BOOL
|
||||
"If ON, backslash-C (upper case C) is locked out.")
|
||||
|
||||
SET(PCRE2_SUPPORT_VALGRIND OFF CACHE BOOL
|
||||
"Enable Valgrind support.")
|
||||
|
||||
|
@ -252,6 +256,10 @@ IF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
|||
SET(BSR_ANYCRLF 1)
|
||||
ENDIF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
||||
|
||||
IF(PCRE2_NEVER_BACKSLASH_C)
|
||||
SET(NEVER_BACKSLASH_C 1)
|
||||
ENDIF(PCRE2_NEVER_BACKSLASH_C)
|
||||
|
||||
IF(PCRE2_SUPPORT_UNICODE)
|
||||
SET(SUPPORT_UNICODE 1)
|
||||
ENDIF(PCRE2_SUPPORT_UNICODE)
|
||||
|
@ -719,6 +727,7 @@ IF(PCRE2_SHOW_REPORT)
|
|||
MESSAGE(STATUS " Enable Unicode support .......... : ${PCRE2_SUPPORT_UNICODE}")
|
||||
MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE2_NEWLINE}")
|
||||
MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE2_SUPPORT_BSR_ANYCRLF}")
|
||||
MESSAGE(STATUS " \\C is disabled .................. : ${PCRE2_NEVER_BACKSLASH_C}")
|
||||
MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE2_EBCDIC}")
|
||||
MESSAGE(STATUS " EBCDIC coding with NL=0x25 ...... : ${PCRE2_EBCDIC_NL25}")
|
||||
MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE2_REBUILD_CHARTABLES}")
|
||||
|
|
|
@ -201,6 +201,8 @@ escape was being ignored.
|
|||
57. Fixed integer overflow for patterns whose minimum matching length is very,
|
||||
very large.
|
||||
|
||||
58. Implemented --never-backslash-C.
|
||||
|
||||
|
||||
Version 10.20 30-June-2015
|
||||
--------------------------
|
||||
|
|
9
README
9
README
|
@ -219,6 +219,13 @@ library. They are also documented in the pcre2build man page.
|
|||
to be the end of a line (see above). However, the caller of PCRE2 can
|
||||
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
||||
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
||||
|
||||
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
||||
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
||||
characters. You can build PCRE2 with the use of \C permanently locked out by
|
||||
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
||||
command. When \C is allowed by the library, individual applications can lock
|
||||
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
||||
|
||||
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
||||
pattern. This limits the amount of system stack that a pattern uses when it
|
||||
|
@ -833,4 +840,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 16 July 2015
|
||||
Last updated: 16 October 2015
|
||||
|
|
64
RunTest
64
RunTest
|
@ -75,7 +75,10 @@ title17="Test 17: JIT-specific features when JIT is available"
|
|||
title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP"
|
||||
title19="Test 19: Tests of the POSIX interface with UTF/UCP"
|
||||
title20="Test 20: Serialization tests"
|
||||
maxtest=20
|
||||
title21="Test 21: \C tests without UTF (supported for DFA matching)"
|
||||
title22="Test 22: \C tests with UTF (not supported for DFA matching)"
|
||||
title23="Test 23: \C disabled test"
|
||||
maxtest=23
|
||||
|
||||
if [ $# -eq 1 -a "$1" = "list" ]; then
|
||||
echo $title0
|
||||
|
@ -99,6 +102,9 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
|
|||
echo $title18
|
||||
echo $title19
|
||||
echo $title20
|
||||
echo $title21
|
||||
echo $title22
|
||||
echo $title23
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
@ -223,6 +229,9 @@ do17=no
|
|||
do18=no
|
||||
do19=no
|
||||
do20=no
|
||||
do21=no
|
||||
do22=no
|
||||
do23=no
|
||||
|
||||
while [ $# -gt 0 ] ; do
|
||||
case $1 in
|
||||
|
@ -247,6 +256,9 @@ while [ $# -gt 0 ] ; do
|
|||
18) do18=yes;;
|
||||
19) do19=yes;;
|
||||
20) do20=yes;;
|
||||
21) do21=yes;;
|
||||
22) do22=yes;;
|
||||
23) do23=yes;;
|
||||
-8) arg8=yes;;
|
||||
-16) arg16=yes;;
|
||||
-32) arg32=yes;;
|
||||
|
@ -326,6 +338,11 @@ support16=$?
|
|||
$sim ./pcre2test -C pcre2-32 >/dev/null
|
||||
support32=$?
|
||||
|
||||
# \C may be disabled
|
||||
|
||||
$sim ./pcre2test -C backslash-C >/dev/null
|
||||
supportBSC=$?
|
||||
|
||||
# Initialize all bitsizes skipped
|
||||
|
||||
test8=skip
|
||||
|
@ -400,7 +417,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
$do8 = no -a $do9 = no -a $do10 = no -a $do11 = no -a \
|
||||
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
|
||||
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
|
||||
$do20 = no \
|
||||
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no \
|
||||
]; then
|
||||
do0=yes
|
||||
do1=yes
|
||||
|
@ -423,6 +440,9 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
do18=yes
|
||||
do19=yes
|
||||
do20=yes
|
||||
do21=yes
|
||||
do22=yes
|
||||
do23=yes
|
||||
fi
|
||||
|
||||
# Handle any explicit skips at this stage, so that an argument list may consist
|
||||
|
@ -780,6 +800,46 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput20 testtry
|
||||
checkresult $? 20 ""
|
||||
fi
|
||||
|
||||
# \C tests without UTF - DFA matching is supported
|
||||
|
||||
if [ "$do21" = yes ] ; then
|
||||
echo $title21
|
||||
if [ $supportBSC -eq 0 ] ; then
|
||||
echo " Skipped because \C is disabled"
|
||||
else
|
||||
for opt in "" $jitopt -dfa; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput21 testtry
|
||||
checkresult $? 21 "$opt"
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
# \C tests with UTF - DFA matching is not supported for \C in UTF mode
|
||||
|
||||
if [ "$do22" = yes ] ; then
|
||||
echo $title22
|
||||
if [ $supportBSC -eq 0 ] ; then
|
||||
echo " Skipped because \C is disabled"
|
||||
else
|
||||
for opt in "" $jitopt; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput22 testtry
|
||||
checkresult $? 22-$bits "$opt"
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
# Test when \C is disabled
|
||||
|
||||
if [ "$do23" = yes ] ; then
|
||||
echo $title23
|
||||
if [ $supportBSC -ne 0 ] ; then
|
||||
echo " Skipped because \C is not disabled"
|
||||
else
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput23 testtry
|
||||
checkresult $? 23 ""
|
||||
fi
|
||||
fi
|
||||
|
||||
# End of loop for 8/16/32-bit tests
|
||||
done
|
||||
|
|
74
RunTest.bat
74
RunTest.bat
|
@ -13,11 +13,10 @@
|
|||
@rem line. Added argument validation and added error reporting.
|
||||
@rem
|
||||
@rem Sheri Pierce added logic to skip feature dependent tests
|
||||
@rem tests 4 5 9 15 and 18 require utf support
|
||||
@rem tests 6 7 10 16 and 19 require ucp support
|
||||
@rem 11 requires ucp and link size 2
|
||||
@rem 12 requires presence of jit support
|
||||
@rem 13 requires absence of jit support
|
||||
@rem tests 4 5 7 10 12 14 19 and 22 require Unicode support
|
||||
@rem 8 requires Unicode and link size 2
|
||||
@rem 16 requires absence of jit support
|
||||
@rem 17 requires presence of jit support
|
||||
@rem Sheri P also added override tests for study and jit testing
|
||||
@rem Zoltan Herczeg added libpcre16 support
|
||||
@rem Zoltan Herczeg added libpcre32 support
|
||||
|
@ -25,6 +24,7 @@
|
|||
@rem
|
||||
@rem The file was converted for PCRE2 by PH, February 2015.
|
||||
@rem Updated for new test 14 (moving others up a number), August 2015.
|
||||
@rem Tidied and updated for new tests 21, 22, 23 by PH, October 2015.
|
||||
|
||||
|
||||
setlocal enabledelayedexpansion
|
||||
|
@ -65,6 +65,8 @@ set support32=%ERRORLEVEL%
|
|||
set unicode=%ERRORLEVEL%
|
||||
%pcre2test% -C jit >NUL
|
||||
set jit=%ERRORLEVEL%
|
||||
%pcre2test% -C backslash-C >NUL
|
||||
set supportBSC=%ERRORLEVEL%
|
||||
|
||||
if %support8% EQU 1 (
|
||||
if not exist testout8 md testout8
|
||||
|
@ -101,18 +103,21 @@ set do17=no
|
|||
set do18=no
|
||||
set do19=no
|
||||
set do20=no
|
||||
set do21=no
|
||||
set do22=no
|
||||
set do23=no
|
||||
set all=yes
|
||||
|
||||
for %%a in (%*) do (
|
||||
set valid=no
|
||||
for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20) do if %%v == %%a set valid=yes
|
||||
for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23) do if %%v == %%a set valid=yes
|
||||
if "!valid!" == "yes" (
|
||||
set do%%a=yes
|
||||
set all=no
|
||||
) else (
|
||||
echo Invalid test number - %%a!
|
||||
echo Usage %0 [ test_number ] ...
|
||||
echo Where test_number is one or more optional test numbers 1 through 20, default is all tests.
|
||||
echo Where test_number is one or more optional test numbers 1 through 23, default is all tests.
|
||||
exit /b 1
|
||||
)
|
||||
)
|
||||
|
@ -139,6 +144,9 @@ if "%all%" == "yes" (
|
|||
set do18=yes
|
||||
set do19=yes
|
||||
set do20=yes
|
||||
set do21=yes
|
||||
set do22=yes
|
||||
set do23=yes
|
||||
)
|
||||
|
||||
@echo RunTest.bat's pcre2test output is written to newly created subfolders
|
||||
|
@ -187,6 +195,9 @@ if "%do17%" == "yes" call :do17
|
|||
if "%do18%" == "yes" call :do18
|
||||
if "%do19%" == "yes" call :do19
|
||||
if "%do20%" == "yes" call :do20
|
||||
if "%do21%" == "yes" call :do21
|
||||
if "%do22%" == "yes" call :do22
|
||||
if "%do23%" == "yes" call :do23
|
||||
:modeSkip
|
||||
if "%mode%" == "" (
|
||||
set mode=-16
|
||||
|
@ -323,7 +334,7 @@ if %unicode% EQU 0 (
|
|||
goto :eof
|
||||
|
||||
:do6
|
||||
call :runsub 6 testout "DFA matching main non-UTF, non-UCP functionality" -q -dfa
|
||||
call :runsub 6 testout "DFA matching main non-UTF, non-UCP functionality" -q
|
||||
goto :eof
|
||||
|
||||
:do7
|
||||
|
@ -331,7 +342,7 @@ if %unicode% EQU 0 (
|
|||
echo Test 7 Skipped due to absence of Unicode support.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 7 testout "DFA matching with UTF-%bits% and Unicode property support" -q -dfa
|
||||
call :runsub 7 testout "DFA matching with UTF-%bits% and Unicode property support" -q
|
||||
goto :eof
|
||||
|
||||
:do8
|
||||
|
@ -395,12 +406,16 @@ if %bits% EQU 8 (
|
|||
echo Test 13 Skipped when running 8-bit tests.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 13 testout "DFA specials for the basic 16/32-bit library" -q -dfa
|
||||
call :runsub 13 testout "DFA specials for the basic 16/32-bit library" -q
|
||||
goto :eof
|
||||
|
||||
:do14
|
||||
call :runsub 14 testout "DFA specials for UTF and UCP support" -q
|
||||
goto :eof
|
||||
if %unicode% EQU 0 (
|
||||
echo Test 14 Skipped due to absence of Unicode support.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 14 testout "DFA specials for UTF and UCP support" -q
|
||||
goto :eof
|
||||
|
||||
:do15
|
||||
call :runsub 15 testout "Non-JIT limits and other non_JIT tests" -q
|
||||
|
@ -442,6 +457,10 @@ if %bits% EQU 16 (
|
|||
if %bits% EQU 32 (
|
||||
echo Test 19 Skipped when running 32-bit tests.
|
||||
goto :eof
|
||||
)
|
||||
if %unicode% EQU 0 (
|
||||
echo Test 19 Skipped due to absence of Unicode support.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 19 testout "POSIX interface with UTF-8 and UCP" -q
|
||||
goto :eof
|
||||
|
@ -450,6 +469,37 @@ goto :eof
|
|||
call :runsub 20 testout "Serialization tests" -q
|
||||
goto :eof
|
||||
|
||||
:do21
|
||||
if %supportBSC% EQU 0 (
|
||||
echo Test 21 Skipped due to absence of backslash-C support.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 21 testout "Backslash-C tests without UTF" -q
|
||||
call :runsub 21 testout "Backslash-C tests without UTF (DFA)" -q -dfa
|
||||
if %jit% EQU 1 call :runsub 21 testoutjit "Test with JIT Override" -q -jit
|
||||
goto :eof
|
||||
|
||||
:do22
|
||||
if %supportBSC% EQU 0 (
|
||||
echo Test 22 Skipped due to absence of backslash-C support.
|
||||
goto :eof
|
||||
)
|
||||
if %unicode% EQU 0 (
|
||||
echo Test 22 Skipped due to absence of Unicode support.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 22 testout "Backslash-C tests with UTF" -q
|
||||
if %jit% EQU 1 call :runsub 22 testoutjit "Test with JIT Override" -q -jit
|
||||
goto :eof
|
||||
|
||||
:do23
|
||||
if %supportBSC% EQU 1 (
|
||||
echo Test 23 Skipped due to presence of backslash-C support.
|
||||
goto :eof
|
||||
)
|
||||
call :runsub 23 testout "Backslash-C disabled test" -q
|
||||
goto :eof
|
||||
|
||||
:conferror
|
||||
@echo.
|
||||
@echo Either your build is incomplete or you have a configuration error.
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
#cmakedefine EBCDIC 1
|
||||
#cmakedefine EBCDIC_NL25 1
|
||||
#cmakedefine HEAP_MATCH_RECURSE 1
|
||||
#cmakedefine NEVER_BACKSLASH_C 1
|
||||
|
||||
#define LINK_SIZE @PCRE2_LINK_SIZE@
|
||||
#define MATCH_LIMIT @PCRE2_MATCH_LIMIT@
|
||||
|
|
12
configure.ac
12
configure.ac
|
@ -189,6 +189,12 @@ AC_ARG_ENABLE(bsr-anycrlf,
|
|||
AS_HELP_STRING([--enable-bsr-anycrlf],
|
||||
[\R matches only CR, LF, CRLF by default]),
|
||||
, enable_bsr_anycrlf=no)
|
||||
|
||||
# Handle --enable-never-backslash-C
|
||||
AC_ARG_ENABLE(never-backslash-C,
|
||||
AS_HELP_STRING([--enable-never-backslash-C],
|
||||
[use of \C causes an error]),
|
||||
, enable_never_backslash_C=no)
|
||||
|
||||
# Handle --enable-ebcdic
|
||||
AC_ARG_ENABLE(ebcdic,
|
||||
|
@ -604,6 +610,11 @@ if test "$enable_bsr_anycrlf" = "yes"; then
|
|||
The build-time default can be overridden by the user of PCRE2 at runtime.])
|
||||
fi
|
||||
|
||||
if test "$enable_never_backslash_C" = "yes"; then
|
||||
AC_DEFINE([NEVER_BACKSLASH_C], [], [
|
||||
Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns.])
|
||||
fi
|
||||
|
||||
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
|
||||
The value of LINK_SIZE determines the number of bytes used to store
|
||||
links as offsets within the compiled regex. The default is 2, which
|
||||
|
@ -881,6 +892,7 @@ $PACKAGE-$VERSION configuration summary:
|
|||
Enable Unicode support .......... : ${enable_unicode}
|
||||
Newline char/sequence ........... : ${enable_newline}
|
||||
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
|
||||
\C is disabled .................. : ${enable_never_backslash_C}
|
||||
EBCDIC coding ................... : ${enable_ebcdic}
|
||||
EBCDIC code for NL .............. : ${ebcdic_nl_code}
|
||||
Rebuild char tables ............. : ${enable_rebuild_chartables}
|
||||
|
|
|
@ -219,6 +219,13 @@ library. They are also documented in the pcre2build man page.
|
|||
to be the end of a line (see above). However, the caller of PCRE2 can
|
||||
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
||||
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
||||
|
||||
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
||||
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
||||
characters. You can build PCRE2 with the use of \C permanently locked out by
|
||||
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
||||
command. When \C is allowed by the library, individual applications can lock
|
||||
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
||||
|
||||
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
||||
pattern. This limits the amount of system stack that a pattern uses when it
|
||||
|
@ -833,4 +840,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 16 July 2015
|
||||
Last updated: 16 October 2015
|
||||
|
|
|
@ -126,8 +126,10 @@ running redundant checks.
|
|||
<P>
|
||||
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
||||
problems, because it may leave the current matching point in the middle of a
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to
|
||||
lock out the use of \C, causing a compile-time error if it is encountered.
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
||||
application to lock out the use of \C, causing a compile-time error if it is
|
||||
encountered. It is also possible to build PCRE2 with the use of \C permanently
|
||||
disabled.
|
||||
</P>
|
||||
<P>
|
||||
Another way that performance can be hit is by running a pattern that has a very
|
||||
|
@ -187,7 +189,7 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
|||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 13 April 2015
|
||||
Last updated: 16 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -59,20 +59,22 @@ units, not characters, as is the contents of the variable pointed at by
|
|||
<i>outlengthptr</i>, which is updated to the actual length of the new string.
|
||||
The options are:
|
||||
<pre>
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_NOTBOL Subject string is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject string is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject
|
||||
is not a valid match
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement for
|
||||
UTF validity (only relevant if PCRE2_UTF
|
||||
was set at compile time)
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the
|
||||
subject is not a valid match
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement
|
||||
for UTF validity (only relevant if
|
||||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
</pre>
|
||||
The function returns the number of substitutions, which may be zero if there
|
||||
were no matches. The result can be greater than one only when
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
|
||||
is returned.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
|
|
|
@ -1197,7 +1197,7 @@ built.
|
|||
</pre>
|
||||
If this option is set, an unanchored pattern is required to match before or at
|
||||
the first newline in the subject string, though the matched text may continue
|
||||
over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a more
|
||||
over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a more
|
||||
general limiting facility.
|
||||
<pre>
|
||||
PCRE2_MATCH_UNSET_BACKREF
|
||||
|
@ -1237,7 +1237,8 @@ This option locks out the use of \C in the pattern that is being compiled.
|
|||
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
||||
it may leave the current matching point in the middle of a multi-code-unit
|
||||
character. This option may be useful in applications that process patterns from
|
||||
external sources.
|
||||
external sources. Note that there is also a build-time option that permanently
|
||||
locks out the use of \C.
|
||||
<pre>
|
||||
PCRE2_NEVER_UCP
|
||||
</pre>
|
||||
|
@ -1383,8 +1384,8 @@ with Perl. It can also be set by a (?U) option setting within the pattern.
|
|||
<pre>
|
||||
PCRE2_USE_OFFSET_LIMIT
|
||||
</pre>
|
||||
This option must be set for <b>pcre2_compile()</b> if
|
||||
<b>pcre2_set_offset_limit()</b> is going to be used to set a non-default offset
|
||||
This option must be set for <b>pcre2_compile()</b> if
|
||||
<b>pcre2_set_offset_limit()</b> is going to be used to set a non-default offset
|
||||
limit in a match context for matches that use this pattern. An error is
|
||||
generated if an offset limit is set without this option. For more details, see
|
||||
the description of <b>pcre2_set_offset_limit()</b> in the
|
||||
|
@ -2613,43 +2614,17 @@ same number causes an error at compile time.
|
|||
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||
<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
|
||||
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR \fIreplacementzfP,</b>
|
||||
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR <i>replacement</i>,</b>
|
||||
<b> PCRE2_SIZE <i>rlength</i>, PCRE2_UCHAR *<i>outputbuffer</i>,</b>
|
||||
<b> PCRE2_SIZE *<i>outlengthptr</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
This function calls <b>pcre2_match()</b> and then makes a copy of the subject
|
||||
string in <i>outputbuffer</i>, replacing the part that was matched with the
|
||||
<i>replacement</i> string, whose length is supplied in <b>rlength</b>. This can
|
||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||
</P>
|
||||
<P>
|
||||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||
dollar character is an escape character that can specify the insertion of
|
||||
characters from capturing groups or (*MARK) items in the pattern. The following
|
||||
forms are recognized:
|
||||
<pre>
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
|
||||
</pre>
|
||||
Either a group number or a group name can be given for <n>. Curly brackets are
|
||||
required only if the following character would be interpreted as part of the
|
||||
number or name. The number may be zero to include the entire matched string.
|
||||
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
|
||||
string "+$1$0$1+", the result is "=+babcb+=". Group insertion is done by
|
||||
calling <b>pcre2_copy_byname()</b> or <b>pcre2_copy_bynumber()</b> as
|
||||
appropriate.
|
||||
</P>
|
||||
<P>
|
||||
The facility for inserting a (*MARK) name can be used to perform simple
|
||||
simultaneous substitutions, as this <b>pcre2test</b> example shows:
|
||||
<pre>
|
||||
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
|
||||
apple lemon
|
||||
2: pear orange
|
||||
</PRE>
|
||||
</P>
|
||||
<P>
|
||||
The first seven arguments of <b>pcre2_substitute()</b> are the same as for
|
||||
<b>pcre2_match()</b>, except that the partial matching options are not
|
||||
permitted, and <i>match_data</i> may be passed as NULL, in which case a match
|
||||
|
@ -2658,25 +2633,112 @@ functions from the match context, if provided, or else those that were used to
|
|||
allocate memory for the compiled code.
|
||||
</P>
|
||||
<P>
|
||||
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the
|
||||
The <i>outlengthptr</i> argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. If the function is successful,
|
||||
the value is updated to contain the length of the new string, excluding the
|
||||
trailing zero that is automatically added. If the function is not successful,
|
||||
the value is set to PCRE2_UNSET for general errors (such as output buffer too
|
||||
small). For syntax errors in the replacement string, the value is set to the
|
||||
offset in the replacement string where the error was detected.
|
||||
</P>
|
||||
<P>
|
||||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||
dollar character is an escape character that can specify the insertion of
|
||||
characters from capturing groups or (*MARK) items in the pattern. The following
|
||||
forms are always recognized:
|
||||
<pre>
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
|
||||
</pre>
|
||||
Either a group number or a group name can be given for <n>. Curly brackets are
|
||||
required only if the following character would be interpreted as part of the
|
||||
number or name. The number may be zero to include the entire matched string.
|
||||
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
|
||||
string "+$1$0$1+", the result is "=+babcb+=".
|
||||
</P>
|
||||
<P>
|
||||
The facility for inserting a (*MARK) name can be used to perform simple
|
||||
simultaneous substitutions, as this <b>pcre2test</b> example shows:
|
||||
<pre>
|
||||
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
|
||||
apple lemon
|
||||
2: pear orange
|
||||
</pre>
|
||||
There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the
|
||||
function to iterate over the subject string, replacing every matching
|
||||
substring. If this is not set, only the first matching substring is replaced.
|
||||
</P>
|
||||
<P>
|
||||
The <i>outlengthptr</i> argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. It is updated to contain the
|
||||
length of the new string, excluding the trailing zero that is automatically
|
||||
added.
|
||||
A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra processing
|
||||
to be applied to the replacement string. Without this option, only the dollar
|
||||
character is special, and only the group insertion forms listed above are
|
||||
valid. When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
|
||||
</P>
|
||||
<P>
|
||||
The function returns the number of replacements that were made. This may be
|
||||
zero if no matches were found, and is never greater than 1 unless
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
|
||||
is returned. Except for PCRE2_ERROR_NOMATCH (which is never returned), any
|
||||
errors from <b>pcre2_match()</b> or the substring copying functions are passed
|
||||
straight back. PCRE2_ERROR_BADREPLACEMENT is returned for an invalid
|
||||
replacement string (unrecognized sequence following a dollar sign), and
|
||||
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough.
|
||||
Firstly, backslash in a replacement string is interpreted as an escape
|
||||
character. The usual forms such as \n or \x{ddd} can be used to specify
|
||||
particular character codes, and backslash followed by any non-alphanumeric
|
||||
character quotes that character. Extended quoting can be coded using \Q...\E,
|
||||
exactly as in pattern strings.
|
||||
</P>
|
||||
<P>
|
||||
There are also four escape sequences for forcing the case of inserted letters.
|
||||
The insertion mechanism has three states: no case forcing, force upper case,
|
||||
and force lower case. The escape sequences change the current state: \U and
|
||||
\L change to upper or lower case forcing, respectively, and \E (when not
|
||||
terminating a \Q quoted sequence) reverts to no case forcing. The sequences
|
||||
\u and \l force the next character (if it is a letter) to upper or lower
|
||||
case, respectively, and then the state automatically reverts to no case
|
||||
forcing. Case forcing applies to all inserted characters, including those from
|
||||
captured groups and letters within \Q...\E quoted sequences.
|
||||
</P>
|
||||
<P>
|
||||
Note that case forcing sequences such as \U...\E do not nest. For example,
|
||||
the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no
|
||||
effect.
|
||||
</P>
|
||||
<P>
|
||||
The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
|
||||
flexibility to group substitution. The syntax is similar to that used by Bash:
|
||||
<pre>
|
||||
${<n>:-<string>}
|
||||
${<n>:+<string1>:<string2>}
|
||||
</pre>
|
||||
As before, <n> may be a group number or a name. The first form specifies a
|
||||
default value. If group <n> is set, its value is inserted; if not, <string> is
|
||||
expanded and the result inserted. The second form specifies strings that are
|
||||
expanded and inserted when group <n> is set or unset, respectively. The first
|
||||
form is just a convenient shorthand for
|
||||
<pre>
|
||||
${<n>:+${<n>}:<string>}
|
||||
</pre>
|
||||
Backslash can be used to escape colons and closing curly brackets in the
|
||||
replacement strings. A change of the case forcing state within a replacement
|
||||
string remains in force afterwards, as shown in this <b>pcre2test</b> example:
|
||||
<pre>
|
||||
/(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
|
||||
body
|
||||
1: hello
|
||||
somebody
|
||||
1: HELLO
|
||||
</pre>
|
||||
If successful, the function returns the number of replacements that were made.
|
||||
This may be zero if no matches were found, and is never greater than 1 unless
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||
</P>
|
||||
<P>
|
||||
In the event of an error, a negative error code is returned. Except for
|
||||
PCRE2_ERROR_NOMATCH (which is never returned), errors from <b>pcre2_match()</b>
|
||||
are passed straight back. PCRE2_ERROR_NOMEMORY is returned if the output buffer
|
||||
is not big enough. PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax
|
||||
errors in the replacement string, with more particular errors being
|
||||
PCRE2_ERROR_BADREPESCAPE (invalid escape sequence),
|
||||
PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket not found), and
|
||||
PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group substitution). As for all
|
||||
PCRE2 errors, a text message that describes the error can be obtained by
|
||||
calling <b>pcre2_get_error_message()</b>.
|
||||
</P>
|
||||
<br><a name="SEC35" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||
<P>
|
||||
|
@ -2908,8 +2970,8 @@ There are in addition the following errors that are specific to
|
|||
PCRE2_ERROR_DFA_UITEM
|
||||
</pre>
|
||||
This return is given if <b>pcre2_dfa_match()</b> encounters an item in the
|
||||
pattern that it does not support, for instance, the use of \C or a back
|
||||
reference.
|
||||
pattern that it does not support, for instance, the use of \C in a UTF mode or
|
||||
a back reference.
|
||||
<pre>
|
||||
PCRE2_ERROR_DFA_UCOND
|
||||
</pre>
|
||||
|
@ -2953,7 +3015,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 22 September 2015
|
||||
Last updated: 16 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -18,23 +18,24 @@ please consult the man page, in case the conversion went wrong.
|
|||
<li><a name="TOC3" href="#SEC3">BUILDING 8-BIT, 16-BIT AND 32-BIT LIBRARIES</a>
|
||||
<li><a name="TOC4" href="#SEC4">BUILDING SHARED AND STATIC LIBRARIES</a>
|
||||
<li><a name="TOC5" href="#SEC5">UNICODE AND UTF SUPPORT</a>
|
||||
<li><a name="TOC6" href="#SEC6">JUST-IN-TIME COMPILER SUPPORT</a>
|
||||
<li><a name="TOC7" href="#SEC7">NEWLINE RECOGNITION</a>
|
||||
<li><a name="TOC8" href="#SEC8">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC9" href="#SEC9">HANDLING VERY LARGE PATTERNS</a>
|
||||
<li><a name="TOC10" href="#SEC10">AVOIDING EXCESSIVE STACK USAGE</a>
|
||||
<li><a name="TOC11" href="#SEC11">LIMITING PCRE2 RESOURCE USAGE</a>
|
||||
<li><a name="TOC12" href="#SEC12">CREATING CHARACTER TABLES AT BUILD TIME</a>
|
||||
<li><a name="TOC13" href="#SEC13">USING EBCDIC CODE</a>
|
||||
<li><a name="TOC14" href="#SEC14">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
|
||||
<li><a name="TOC15" href="#SEC15">PCRE2GREP BUFFER SIZE</a>
|
||||
<li><a name="TOC16" href="#SEC16">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a>
|
||||
<li><a name="TOC17" href="#SEC17">INCLUDING DEBUGGING CODE</a>
|
||||
<li><a name="TOC18" href="#SEC18">DEBUGGING WITH VALGRIND SUPPORT</a>
|
||||
<li><a name="TOC19" href="#SEC19">CODE COVERAGE REPORTING</a>
|
||||
<li><a name="TOC20" href="#SEC20">SEE ALSO</a>
|
||||
<li><a name="TOC21" href="#SEC21">AUTHOR</a>
|
||||
<li><a name="TOC22" href="#SEC22">REVISION</a>
|
||||
<li><a name="TOC6" href="#SEC6">DISABLING THE USE OF \C</a>
|
||||
<li><a name="TOC7" href="#SEC7">JUST-IN-TIME COMPILER SUPPORT</a>
|
||||
<li><a name="TOC8" href="#SEC8">NEWLINE RECOGNITION</a>
|
||||
<li><a name="TOC9" href="#SEC9">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC10" href="#SEC10">HANDLING VERY LARGE PATTERNS</a>
|
||||
<li><a name="TOC11" href="#SEC11">AVOIDING EXCESSIVE STACK USAGE</a>
|
||||
<li><a name="TOC12" href="#SEC12">LIMITING PCRE2 RESOURCE USAGE</a>
|
||||
<li><a name="TOC13" href="#SEC13">CREATING CHARACTER TABLES AT BUILD TIME</a>
|
||||
<li><a name="TOC14" href="#SEC14">USING EBCDIC CODE</a>
|
||||
<li><a name="TOC15" href="#SEC15">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
|
||||
<li><a name="TOC16" href="#SEC16">PCRE2GREP BUFFER SIZE</a>
|
||||
<li><a name="TOC17" href="#SEC17">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a>
|
||||
<li><a name="TOC18" href="#SEC18">INCLUDING DEBUGGING CODE</a>
|
||||
<li><a name="TOC19" href="#SEC19">DEBUGGING WITH VALGRIND SUPPORT</a>
|
||||
<li><a name="TOC20" href="#SEC20">CODE COVERAGE REPORTING</a>
|
||||
<li><a name="TOC21" href="#SEC21">SEE ALSO</a>
|
||||
<li><a name="TOC22" href="#SEC22">AUTHOR</a>
|
||||
<li><a name="TOC23" href="#SEC23">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">BUILDING PCRE2</a><br>
|
||||
<P>
|
||||
|
@ -148,13 +149,19 @@ properties. The application can request that they do by setting the PCRE2_UCP
|
|||
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
|
||||
request this by starting with (*UCP).
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">DISABLING THE USE OF \C</a><br>
|
||||
<P>
|
||||
The \C escape sequence, which matches a single code unit, even in a UTF mode,
|
||||
can cause unpredictable behaviour because it may leave the current matching
|
||||
point in the middle of a multi-code-unit character. It can be locked out by
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option.
|
||||
point in the middle of a multi-code-unit character. The application can lock it
|
||||
out by setting the PCRE2_NEVER_BACKSLASH_C option when calling
|
||||
<b>pcre2_compile()</b>. There is also a build-time option
|
||||
<pre>
|
||||
--enable-never-backslash-C
|
||||
</pre>
|
||||
(note the upper case C) which locks out the use of \C entirely.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">JUST-IN-TIME COMPILER SUPPORT</a><br>
|
||||
<br><a name="SEC7" href="#TOC1">JUST-IN-TIME COMPILER SUPPORT</a><br>
|
||||
<P>
|
||||
Just-in-time compiler support is included in the build by specifying
|
||||
<pre>
|
||||
|
@ -171,7 +178,7 @@ pcre2grep automatically makes use of it, unless you add
|
|||
</pre>
|
||||
to the "configure" command.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">NEWLINE RECOGNITION</a><br>
|
||||
<br><a name="SEC8" href="#TOC1">NEWLINE RECOGNITION</a><br>
|
||||
<P>
|
||||
By default, PCRE2 interprets the linefeed (LF) character as indicating the end
|
||||
of a line. This is the normal newline character on Unix-like systems. You can
|
||||
|
@ -208,7 +215,7 @@ Whatever default line ending convention is selected when PCRE2 is built can be
|
|||
overridden by applications that use the library. At build time it is
|
||||
conventional to use the standard for your operating system.
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<br><a name="SEC9" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<P>
|
||||
By default, the sequence \R in a pattern matches any Unicode newline sequence,
|
||||
independently of what has been selected as the line ending sequence. If you
|
||||
|
@ -220,7 +227,7 @@ the default is changed so that \R matches only CR, LF, or CRLF. Whatever is
|
|||
selected when PCRE2 is built can be overridden by applications that use the
|
||||
library.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
|
||||
<br><a name="SEC10" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
|
||||
<P>
|
||||
Within a compiled pattern, offset values are used to point from one part to
|
||||
another (for example, from an opening parenthesis to an alternation
|
||||
|
@ -239,7 +246,7 @@ longer offsets slows down the operation of PCRE2 because it has to load
|
|||
additional data when handling them. For the 32-bit library the value is always
|
||||
4 and cannot be overridden; the value of --with-link-size is ignored.
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
|
||||
<br><a name="SEC11" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
|
||||
<P>
|
||||
When matching with the <b>pcre2_match()</b> function, PCRE2 implements
|
||||
backtracking by making recursive calls to an internal function called
|
||||
|
@ -261,7 +268,7 @@ custom memory management functions can be called instead. PCRE2 runs noticeably
|
|||
more slowly when built in this way. This option affects only the
|
||||
<b>pcre2_match()</b> function; it is not relevant for <b>pcre2_dfa_match()</b>.
|
||||
</P>
|
||||
<br><a name="SEC11" href="#TOC1">LIMITING PCRE2 RESOURCE USAGE</a><br>
|
||||
<br><a name="SEC12" href="#TOC1">LIMITING PCRE2 RESOURCE USAGE</a><br>
|
||||
<P>
|
||||
Internally, PCRE2 has a function called <b>match()</b>, which it calls
|
||||
repeatedly (sometimes recursively) when matching a pattern with the
|
||||
|
@ -290,7 +297,7 @@ constraints. However, you can set a lower limit by adding, for example,
|
|||
</pre>
|
||||
to the <b>configure</b> command. This value can also be overridden at run time.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
|
||||
<br><a name="SEC13" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
|
||||
<P>
|
||||
PCRE2 uses fixed tables for processing characters whose code points are less
|
||||
than 256. By default, PCRE2 is built with a set of tables that are distributed
|
||||
|
@ -307,7 +314,7 @@ compiling, because <b>dftables</b> is run on the local host. If you need to
|
|||
create alternative tables when cross compiling, you will have to do so "by
|
||||
hand".)
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">USING EBCDIC CODE</a><br>
|
||||
<br><a name="SEC14" href="#TOC1">USING EBCDIC CODE</a><br>
|
||||
<P>
|
||||
PCRE2 assumes by default that it will run in an environment where the character
|
||||
code is ASCII or Unicode, which is a superset of ASCII. This is the case for
|
||||
|
@ -342,7 +349,7 @@ The options that select newline behaviour, such as --enable-newline-is-cr,
|
|||
and equivalent run-time options, refer to these character values in an EBCDIC
|
||||
environment.
|
||||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
|
||||
<br><a name="SEC15" href="#TOC1">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
|
||||
<P>
|
||||
By default, <b>pcre2grep</b> reads all files as plain text. You can build it so
|
||||
that it recognizes files whose names end in <b>.gz</b> or <b>.bz2</b>, and reads
|
||||
|
@ -355,7 +362,7 @@ to the <b>configure</b> command. These options naturally require that the
|
|||
relevant libraries are installed on your system. Configuration will fail if
|
||||
they are not.
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">PCRE2GREP BUFFER SIZE</a><br>
|
||||
<br><a name="SEC16" href="#TOC1">PCRE2GREP BUFFER SIZE</a><br>
|
||||
<P>
|
||||
<b>pcre2grep</b> uses an internal buffer to hold a "window" on the file it is
|
||||
scanning, in order to be able to output "before" and "after" lines when it
|
||||
|
@ -370,7 +377,7 @@ parameter value by adding, for example,
|
|||
to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override this
|
||||
value by using --buffer-size on the command line.
|
||||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
|
||||
<br><a name="SEC17" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
|
||||
<P>
|
||||
If you add one of
|
||||
<pre>
|
||||
|
@ -404,7 +411,7 @@ automatically included, you may need to add something like
|
|||
</pre>
|
||||
immediately before the <b>configure</b> command.
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">INCLUDING DEBUGGING CODE</a><br>
|
||||
<br><a name="SEC18" href="#TOC1">INCLUDING DEBUGGING CODE</a><br>
|
||||
<P>
|
||||
If you add
|
||||
<pre>
|
||||
|
@ -413,7 +420,7 @@ If you add
|
|||
to the <b>configure</b> command, additional debugging code is included in the
|
||||
build. This feature is intended for use by the PCRE2 maintainers.
|
||||
</P>
|
||||
<br><a name="SEC18" href="#TOC1">DEBUGGING WITH VALGRIND SUPPORT</a><br>
|
||||
<br><a name="SEC19" href="#TOC1">DEBUGGING WITH VALGRIND SUPPORT</a><br>
|
||||
<P>
|
||||
If you add
|
||||
<pre>
|
||||
|
@ -423,7 +430,7 @@ to the <b>configure</b> command, PCRE2 will use valgrind annotations to mark
|
|||
certain memory regions as unaddressable. This allows it to detect invalid
|
||||
memory accesses, and is mostly useful for debugging PCRE2 itself.
|
||||
</P>
|
||||
<br><a name="SEC19" href="#TOC1">CODE COVERAGE REPORTING</a><br>
|
||||
<br><a name="SEC20" href="#TOC1">CODE COVERAGE REPORTING</a><br>
|
||||
<P>
|
||||
If your C compiler is gcc, you can build a version of PCRE2 that can generate a
|
||||
code coverage report for its test suite. To enable this, you must install
|
||||
|
@ -480,11 +487,11 @@ This cleans all coverage data including the generated coverage report. For more
|
|||
information about code coverage, see the <b>gcov</b> and <b>lcov</b>
|
||||
documentation.
|
||||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">SEE ALSO</a><br>
|
||||
<br><a name="SEC21" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2api</b>(3), <b>pcre2-config</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC22" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
|
@ -493,9 +500,9 @@ University Computing Service
|
|||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC23" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 24 April 2015
|
||||
Last updated: 16 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -1236,14 +1236,21 @@ with \C in UTF-8 or UTF-16 mode means that the rest of the string may start
|
|||
with a malformed UTF character. This has undefined results, because PCRE2
|
||||
assumes that it is matching character by character in a valid UTF string (by
|
||||
default it checks the subject string's validity at the start of processing
|
||||
unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the
|
||||
use of \C by setting the PCRE2_NEVER_BACKSLASH_C option.
|
||||
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||
</P>
|
||||
<P>
|
||||
An application can lock out the use of \C by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
|
||||
build PCRE2 with the use of \C permanently disabled.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2 does not allow \C to appear in lookbehind assertions
|
||||
<a href="#lookbehind">(described below)</a>
|
||||
in a UTF mode, because this would make it impossible to calculate the length of
|
||||
the lookbehind.
|
||||
the lookbehind. Neither the alternative matching function
|
||||
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
|
||||
former gives a match-time error; the latter fails to optimize and so the match
|
||||
is always run using the interpreter.
|
||||
</P>
|
||||
<P>
|
||||
In general, the \C escape sequence is best avoided. However, one way of using
|
||||
|
@ -3351,7 +3358,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 01 September 2015
|
||||
Last updated: 16 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -12,17 +12,21 @@ This page is part of the PCRE2 HTML documentation. It was generated
|
|||
automatically from the original man page. If there is any nonsense in it,
|
||||
please consult the man page, in case the conversion went wrong.
|
||||
<br>
|
||||
<br><b>
|
||||
PCRE2 PERFORMANCE
|
||||
</b><br>
|
||||
<ul>
|
||||
<li><a name="TOC1" href="#SEC1">PCRE2 PERFORMANCE</a>
|
||||
<li><a name="TOC2" href="#SEC2">COMPILED PATTERN MEMORY USAGE</a>
|
||||
<li><a name="TOC3" href="#SEC3">STACK USAGE AT RUN TIME</a>
|
||||
<li><a name="TOC4" href="#SEC4">PROCESSING TIME</a>
|
||||
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
|
||||
<li><a name="TOC6" href="#SEC6">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE2 PERFORMANCE</a><br>
|
||||
<P>
|
||||
Two aspects of performance are discussed below: memory usage and processing
|
||||
time. The way you express your pattern as a regular expression can affect both
|
||||
of them.
|
||||
</P>
|
||||
<br><b>
|
||||
COMPILED PATTERN MEMORY USAGE
|
||||
</b><br>
|
||||
<br><a name="SEC2" href="#TOC1">COMPILED PATTERN MEMORY USAGE</a><br>
|
||||
<P>
|
||||
Patterns are compiled by PCRE2 into a reasonably efficient interpretive code,
|
||||
so that most simple patterns do not use much memory. However, there is one case
|
||||
|
@ -75,9 +79,7 @@ pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
|
|||
speed is acceptable, this kind of rewriting will allow you to process patterns
|
||||
that PCRE2 cannot otherwise handle.
|
||||
</P>
|
||||
<br><b>
|
||||
STACK USAGE AT RUN TIME
|
||||
</b><br>
|
||||
<br><a name="SEC3" href="#TOC1">STACK USAGE AT RUN TIME</a><br>
|
||||
<P>
|
||||
When <b>pcre2_match()</b> is used for matching, certain kinds of pattern can
|
||||
cause it to use large amounts of the process stack. In some environments the
|
||||
|
@ -86,9 +88,7 @@ SIGSEGV. Rewriting your pattern can often help. The
|
|||
<a href="pcre2stack.html"><b>pcre2stack</b></a>
|
||||
documentation discusses this issue in detail.
|
||||
</P>
|
||||
<br><b>
|
||||
PROCESSING TIME
|
||||
</b><br>
|
||||
<br><a name="SEC4" href="#TOC1">PROCESSING TIME</a><br>
|
||||
<P>
|
||||
Certain items in regular expression patterns are processed more efficiently
|
||||
than others. It is more efficient to use a character class like [aeiou] than a
|
||||
|
@ -177,9 +177,7 @@ appreciable time with strings longer than about 20 characters.
|
|||
In many cases, the solution to this kind of performance issue is to use an
|
||||
atomic group or a possessive quantifier.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
|
@ -188,9 +186,7 @@ University Computing Service
|
|||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><b>
|
||||
REVISION
|
||||
</b><br>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 02 January 2015
|
||||
<br>
|
||||
|
|
|
@ -111,9 +111,10 @@ it matches a literal "u".
|
|||
\W a "non-word" character
|
||||
\X a Unicode extended grapheme cluster
|
||||
</pre>
|
||||
The application can lock out the use of \C by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave the
|
||||
current matching point in the middle of a UTF-8 or UTF-16 character.
|
||||
\C is dangerous because it may leave the current matching point in the middle
|
||||
of a UTF-8 or UTF-16 character. The application can lock out the use of \C by
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
||||
with the use of \C permanently disabled.
|
||||
</P>
|
||||
<P>
|
||||
By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode
|
||||
|
@ -588,7 +589,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 17 July 2015
|
||||
Last updated: 16 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -155,12 +155,13 @@ following options output the value and set the exit code as indicated:
|
|||
The following options output 1 for true or 0 for false, and set the exit code
|
||||
to the same value:
|
||||
<pre>
|
||||
ebcdic compiled for an EBCDIC environment
|
||||
jit just-in-time support is available
|
||||
pcre2-16 the 16-bit library was built
|
||||
pcre2-32 the 32-bit library was built
|
||||
pcre2-8 the 8-bit library was built
|
||||
unicode Unicode support is available
|
||||
backslash-C \C is supported (not locked out)
|
||||
ebcdic compiled for an EBCDIC environment
|
||||
jit just-in-time support is available
|
||||
pcre2-16 the 16-bit library was built
|
||||
pcre2-32 the 32-bit library was built
|
||||
pcre2-8 the 8-bit library was built
|
||||
unicode Unicode support is available
|
||||
</pre>
|
||||
If an unknown option is given, an error message is output; the exit code is 0.
|
||||
</P>
|
||||
|
@ -510,7 +511,7 @@ Setting compilation options
|
|||
<P>
|
||||
The following modifiers set options for <b>pcre2_compile()</b>. The most common
|
||||
ones have single-letter abbreviations. See
|
||||
<a href="pcreapi.html"><b>pcreapi</b></a>
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
for a description of their effects.
|
||||
<pre>
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
|
@ -537,6 +538,7 @@ for a description of their effects.
|
|||
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||
ucp set PCRE2_UCP
|
||||
ungreedy set PCRE2_UNGREEDY
|
||||
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
||||
utf set PCRE2_UTF
|
||||
</pre>
|
||||
As well as turning on the PCRE2_UTF option, the <b>utf</b> modifier causes all
|
||||
|
@ -564,6 +566,7 @@ about the pattern:
|
|||
locale=<name> use this locale
|
||||
memory show memory used
|
||||
newline=<type> set newline type
|
||||
null_context compile with a NULL context
|
||||
parens_nest_limit=<n> set maximum parentheses depth
|
||||
posix use the POSIX API
|
||||
push push compiled pattern onto the stack
|
||||
|
@ -642,6 +645,15 @@ is requested. For each callout, either its number or string is given, followed
|
|||
by the item that follows it in the pattern.
|
||||
</P>
|
||||
<br><b>
|
||||
Passing a NULL context
|
||||
</b><br>
|
||||
<P>
|
||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_compile()</b>. If
|
||||
the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||
testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
||||
default values).
|
||||
</P>
|
||||
<br><b>
|
||||
Specifying a pattern in hex
|
||||
</b><br>
|
||||
<P>
|
||||
|
@ -920,9 +932,11 @@ pattern.
|
|||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
match_limit=>n> set a match limit
|
||||
match_limit=<n> set a match limit
|
||||
memory show memory usage
|
||||
null_context match with a NULL context
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
recursion_limit=<n> set a recursion limit
|
||||
replace=<string> specify a replacement string
|
||||
|
@ -1170,6 +1184,16 @@ The <b>offset</b> modifier sets an offset in the subject string at which
|
|||
matching starts. Its value is a number of code units, not characters.
|
||||
</P>
|
||||
<br><b>
|
||||
Setting an offset limit
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>offset_limit</b> modifier sets a limit for unanchored matches. If a match
|
||||
cannot be found starting at or before this offset in the subject, a "no match"
|
||||
return is given. The data value is a number of code units, not characters. When
|
||||
this modifier is used, the <b>use_offset_limit</b> modifier must have been set
|
||||
for the pattern; if not, an error is generated.
|
||||
</P>
|
||||
<br><b>
|
||||
Setting the size of the output vector
|
||||
</b><br>
|
||||
<P>
|
||||
|
@ -1201,6 +1225,17 @@ this modifier has no effect, as there is no facility for passing a length.)
|
|||
When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
|
||||
passing the replacement string as zero-terminated.
|
||||
</P>
|
||||
<br><b>
|
||||
Passing a NULL context
|
||||
</b><br>
|
||||
<P>
|
||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
||||
<b>pcre2_dfa_match()</b> or <b>pcre2_jit_match()</b>. If the <b>null_context</b>
|
||||
modifier is set, however, NULL is passed. This is for testing that the matching
|
||||
functions behave correctly in this case (they use default values). This
|
||||
modifier cannot be used with the <b>find_limits</b> modifier or when testing the
|
||||
substitution function.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||
<P>
|
||||
By default, <b>pcre2test</b> uses the standard PCRE2 matching function,
|
||||
|
@ -1539,7 +1574,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 14 September 2015
|
||||
Last updated: 17 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -71,11 +71,12 @@ The escape sequence \C can be used to match a single code unit, in a UTF mode,
|
|||
but its use can lead to some strange effects because it breaks up multi-unit
|
||||
characters (see the description of \C in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
documentation). The use of \C is not supported in the alternative matching
|
||||
function <b>pcre2_dfa_match()</b>, nor is it supported in UTF mode by the JIT
|
||||
optimization. If JIT optimization is requested for a UTF pattern that contains
|
||||
\C, it will not succeed, and so the matching will be carried out by the normal
|
||||
interpretive function.
|
||||
documentation). The use of \C is not supported by the alternative matching
|
||||
function <b>pcre2_dfa_match()</b> when in UTF mode. Its use provokes a
|
||||
match-time error. The JIT optimization also does not support \C in UTF mode.
|
||||
If JIT optimization is requested for a UTF pattern that contains \C, it will
|
||||
not succeed, and so the matching will be carried out by the normal interpretive
|
||||
function.
|
||||
</P>
|
||||
<P>
|
||||
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
|
||||
|
@ -275,7 +276,7 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 18 August 2015
|
||||
Last updated: 16 October 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
10
doc/pcre2.3
10
doc/pcre2.3
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2 3 "13 April 2015" "PCRE2 10.20"
|
||||
.TH PCRE2 3 "16 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH INTRODUCTION
|
||||
|
@ -118,8 +118,10 @@ running redundant checks.
|
|||
.P
|
||||
The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
||||
problems, because it may leave the current matching point in the middle of a
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to
|
||||
lock out the use of \eC, causing a compile-time error if it is encountered.
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
||||
application to lock out the use of \eC, causing a compile-time error if it is
|
||||
encountered. It is also possible to build PCRE2 with the use of \eC permanently
|
||||
disabled.
|
||||
.P
|
||||
Another way that performance can be hit is by running a pattern that has a very
|
||||
large search tree against a string that will never match. Nested unlimited
|
||||
|
@ -187,6 +189,6 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 13 April 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
514
doc/pcre2.txt
514
doc/pcre2.txt
|
@ -104,26 +104,27 @@ SECURITY CONSIDERATIONS
|
|||
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead
|
||||
to problems, because it may leave the current matching point in the
|
||||
middle of a multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C
|
||||
option can be used to lock out the use of \C, causing a compile-time
|
||||
error if it is encountered.
|
||||
option can be used by an application to lock out the use of \C, causing
|
||||
a compile-time error if it is encountered. It is also possible to build
|
||||
PCRE2 with the use of \C permanently disabled.
|
||||
|
||||
Another way that performance can be hit is by running a pattern that
|
||||
has a very large search tree against a string that will never match.
|
||||
Nested unlimited repeats in a pattern are a common example. PCRE2 pro-
|
||||
vides some protection against this: see the pcre2_set_match_limit()
|
||||
Another way that performance can be hit is by running a pattern that
|
||||
has a very large search tree against a string that will never match.
|
||||
Nested unlimited repeats in a pattern are a common example. PCRE2 pro-
|
||||
vides some protection against this: see the pcre2_set_match_limit()
|
||||
function in the pcre2api page.
|
||||
|
||||
|
||||
USER DOCUMENTATION
|
||||
|
||||
The user documentation for PCRE2 comprises a number of different sec-
|
||||
tions. In the "man" format, each of these is a separate "man page". In
|
||||
the HTML format, each is a separate page, linked from the index page.
|
||||
In the plain text format, the descriptions of the pcre2grep and
|
||||
The user documentation for PCRE2 comprises a number of different sec-
|
||||
tions. In the "man" format, each of these is a separate "man page". In
|
||||
the HTML format, each is a separate page, linked from the index page.
|
||||
In the plain text format, the descriptions of the pcre2grep and
|
||||
pcre2test programs are in files called pcre2grep.txt and pcre2test.txt,
|
||||
respectively. The remaining sections, except for the pcre2demo section
|
||||
(which is a program listing), and the short pages for individual func-
|
||||
tions, are concatenated in pcre2.txt, for ease of searching. The sec-
|
||||
respectively. The remaining sections, except for the pcre2demo section
|
||||
(which is a program listing), and the short pages for individual func-
|
||||
tions, are concatenated in pcre2.txt, for ease of searching. The sec-
|
||||
tions are as follows:
|
||||
|
||||
pcre2 this document
|
||||
|
@ -148,7 +149,7 @@ USER DOCUMENTATION
|
|||
pcre2test description of the pcre2test command
|
||||
pcre2unicode discussion of Unicode and UTF support
|
||||
|
||||
In the "man" and HTML formats, there is also a short page for each C
|
||||
In the "man" and HTML formats, there is also a short page for each C
|
||||
library function, listing its arguments and results.
|
||||
|
||||
|
||||
|
@ -158,14 +159,14 @@ AUTHOR
|
|||
University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
Putting an actual email address here is a spam magnet. If you want to
|
||||
email me, use my two initials, followed by the two digits 10, at the
|
||||
Putting an actual email address here is a spam magnet. If you want to
|
||||
email me, use my two initials, followed by the two digits 10, at the
|
||||
domain cam.ac.uk.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 13 April 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -1276,7 +1277,9 @@ COMPILING A PATTERN
|
|||
piled. This escape can cause unpredictable behaviour in UTF-8 or
|
||||
UTF-16 modes, because it may leave the current matching point in the
|
||||
middle of a multi-code-unit character. This option may be useful in
|
||||
applications that process patterns from external sources.
|
||||
applications that process patterns from external sources. Note that
|
||||
there is also a build-time option that permanently locks out the use of
|
||||
\C.
|
||||
|
||||
PCRE2_NEVER_UCP
|
||||
|
||||
|
@ -2571,19 +2574,36 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
|
||||
PCRE2_SIZE length, PCRE2_SIZE startoffset,
|
||||
uint32_t options, pcre2_match_data *match_data,
|
||||
pcre2_match_context *mcontext, PCRE2_SPTR replacementzfP,
|
||||
pcre2_match_context *mcontext, PCRE2_SPTR replacement,
|
||||
PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
|
||||
PCRE2_SIZE *outlengthptr);
|
||||
|
||||
This function calls pcre2_match() and then makes a copy of the subject
|
||||
string in outputbuffer, replacing the part that was matched with the
|
||||
replacement string, whose length is supplied in rlength. This can be
|
||||
given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||
|
||||
The first seven arguments of pcre2_substitute() are the same as for
|
||||
pcre2_match(), except that the partial matching options are not permit-
|
||||
ted, and match_data may be passed as NULL, in which case a match data
|
||||
block is obtained and freed within this function, using memory manage-
|
||||
ment functions from the match context, if provided, or else those that
|
||||
were used to allocate memory for the compiled code.
|
||||
|
||||
The outlengthptr argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. If the function is suc-
|
||||
cessful, the value is updated to contain the length of the new string,
|
||||
excluding the trailing zero that is automatically added. If the func-
|
||||
tion is not successful, the value is set to PCRE2_UNSET for general
|
||||
errors (such as output buffer too small). For syntax errors in the
|
||||
replacement string, the value is set to the offset in the replacement
|
||||
string where the error was detected.
|
||||
|
||||
In the replacement string, which is interpreted as a UTF string in UTF
|
||||
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
|
||||
option is set, a dollar character is an escape character that can spec-
|
||||
ify the insertion of characters from capturing groups or (*MARK) items
|
||||
in the pattern. The following forms are recognized:
|
||||
in the pattern. The following forms are always recognized:
|
||||
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
|
@ -2594,8 +2614,7 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
preted as part of the number or name. The number may be zero to include
|
||||
the entire matched string. For example, if the pattern a(b)c is
|
||||
matched with "=abc=" and the replacement string "+$1$0$1+", the result
|
||||
is "=+babcb+=". Group insertion is done by calling pcre2_copy_byname()
|
||||
or pcre2_copy_bynumber() as appropriate.
|
||||
is "=+babcb+=".
|
||||
|
||||
The facility for inserting a (*MARK) name can be used to perform simple
|
||||
simultaneous substitutions, as this pcre2test example shows:
|
||||
|
@ -2604,32 +2623,80 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
apple lemon
|
||||
2: pear orange
|
||||
|
||||
The first seven arguments of pcre2_substitute() are the same as for
|
||||
pcre2_match(), except that the partial matching options are not permit-
|
||||
ted, and match_data may be passed as NULL, in which case a match data
|
||||
block is obtained and freed within this function, using memory manage-
|
||||
ment functions from the match context, if provided, or else those that
|
||||
were used to allocate memory for the compiled code.
|
||||
|
||||
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
|
||||
There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
|
||||
the function to iterate over the subject string, replacing every match-
|
||||
ing substring. If this is not set, only the first matching substring is
|
||||
replaced.
|
||||
|
||||
The outlengthptr argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. It is updated to contain
|
||||
the length of the new string, excluding the trailing zero that is auto-
|
||||
matically added.
|
||||
A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra
|
||||
processing to be applied to the replacement string. Without this
|
||||
option, only the dollar character is special, and only the group inser-
|
||||
tion forms listed above are valid. When PCRE2_SUBSTITUTE_EXTENDED is
|
||||
set, two things change:
|
||||
|
||||
The function returns the number of replacements that were made. This
|
||||
may be zero if no matches were found, and is never greater than 1
|
||||
unless PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a neg-
|
||||
ative error code is returned. Except for PCRE2_ERROR_NOMATCH (which is
|
||||
never returned), any errors from pcre2_match() or the substring copying
|
||||
functions are passed straight back. PCRE2_ERROR_BADREPLACEMENT is
|
||||
returned for an invalid replacement string (unrecognized sequence fol-
|
||||
lowing a dollar sign), and PCRE2_ERROR_NOMEMORY is returned if the out-
|
||||
put buffer is not big enough.
|
||||
Firstly, backslash in a replacement string is interpreted as an escape
|
||||
character. The usual forms such as \n or \x{ddd} can be used to specify
|
||||
particular character codes, and backslash followed by any non-alphanu-
|
||||
meric character quotes that character. Extended quoting can be coded
|
||||
using \Q...\E, exactly as in pattern strings.
|
||||
|
||||
There are also four escape sequences for forcing the case of inserted
|
||||
letters. The insertion mechanism has three states: no case forcing,
|
||||
force upper case, and force lower case. The escape sequences change the
|
||||
current state: \U and \L change to upper or lower case forcing, respec-
|
||||
tively, and \E (when not terminating a \Q quoted sequence) reverts to
|
||||
no case forcing. The sequences \u and \l force the next character (if
|
||||
it is a letter) to upper or lower case, respectively, and then the
|
||||
state automatically reverts to no case forcing. Case forcing applies to
|
||||
all inserted characters, including those from captured groups and let-
|
||||
ters within \Q...\E quoted sequences.
|
||||
|
||||
Note that case forcing sequences such as \U...\E do not nest. For exam-
|
||||
ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final
|
||||
\E has no effect.
|
||||
|
||||
The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
|
||||
flexibility to group substitution. The syntax is similar to that used
|
||||
by Bash:
|
||||
|
||||
${<n>:-<string>}
|
||||
${<n>:+<string1>:<string2>}
|
||||
|
||||
As before, <n> may be a group number or a name. The first form speci-
|
||||
fies a default value. If group <n> is set, its value is inserted; if
|
||||
not, <string> is expanded and the result inserted. The second form
|
||||
specifies strings that are expanded and inserted when group <n> is set
|
||||
or unset, respectively. The first form is just a convenient shorthand
|
||||
for
|
||||
|
||||
${<n>:+${<n>}:<string>}
|
||||
|
||||
Backslash can be used to escape colons and closing curly brackets in
|
||||
the replacement strings. A change of the case forcing state within a
|
||||
replacement string remains in force afterwards, as shown in this
|
||||
pcre2test example:
|
||||
|
||||
/(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
|
||||
body
|
||||
1: hello
|
||||
somebody
|
||||
1: HELLO
|
||||
|
||||
If successful, the function returns the number of replacements that
|
||||
were made. This may be zero if no matches were found, and is never
|
||||
greater than 1 unless PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||
|
||||
In the event of an error, a negative error code is returned. Except for
|
||||
PCRE2_ERROR_NOMATCH (which is never returned), errors from
|
||||
pcre2_match() are passed straight back. PCRE2_ERROR_NOMEMORY is
|
||||
returned if the output buffer is not big enough.
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in
|
||||
the replacement string, with more particular errors being
|
||||
PCRE2_ERROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REP-
|
||||
MISSING_BRACE (closing curly bracket not found), and PCRE2_ERROR_BADSUB-
|
||||
STITUTION (syntax error in extended group substitution). As for all PCRE2
|
||||
errors, a text message that describes the error can be obtained by
|
||||
calling pcre2_get_error_message().
|
||||
|
||||
|
||||
DUPLICATE SUBPATTERN NAMES
|
||||
|
@ -2845,8 +2912,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
PCRE2_ERROR_DFA_UITEM
|
||||
|
||||
This return is given if pcre2_dfa_match() encounters an item in the
|
||||
pattern that it does not support, for instance, the use of \C or a back
|
||||
reference.
|
||||
pattern that it does not support, for instance, the use of \C in a UTF
|
||||
mode or a back reference.
|
||||
|
||||
PCRE2_ERROR_DFA_UCOND
|
||||
|
||||
|
@ -2890,7 +2957,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 22 September 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -3010,10 +3077,18 @@ UNICODE AND UTF SUPPORT
|
|||
PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a
|
||||
pattern may also request this by starting with (*UCP).
|
||||
|
||||
|
||||
DISABLING THE USE OF \C
|
||||
|
||||
The \C escape sequence, which matches a single code unit, even in a UTF
|
||||
mode, can cause unpredictable behaviour because it may leave the cur-
|
||||
rent matching point in the middle of a multi-code-unit character. It
|
||||
can be locked out by setting the PCRE2_NEVER_BACKSLASH_C option.
|
||||
rent matching point in the middle of a multi-code-unit character. The
|
||||
application can lock it out by setting the PCRE2_NEVER_BACKSLASH_C
|
||||
option when calling pcre2_compile(). There is also a build-time option
|
||||
|
||||
--enable-never-backslash-C
|
||||
|
||||
(note the upper case C) which locks out the use of \C entirely.
|
||||
|
||||
|
||||
JUST-IN-TIME COMPILER SUPPORT
|
||||
|
@ -3022,10 +3097,10 @@ JUST-IN-TIME COMPILER SUPPORT
|
|||
|
||||
--enable-jit
|
||||
|
||||
This support is available only for certain hardware architectures. If
|
||||
this option is set for an unsupported architecture, a building error
|
||||
occurs. See the pcre2jit documentation for a discussion of JIT usage.
|
||||
When JIT support is enabled, pcre2grep automatically makes use of it,
|
||||
This support is available only for certain hardware architectures. If
|
||||
this option is set for an unsupported architecture, a building error
|
||||
occurs. See the pcre2jit documentation for a discussion of JIT usage.
|
||||
When JIT support is enabled, pcre2grep automatically makes use of it,
|
||||
unless you add
|
||||
|
||||
--disable-pcre2grep-jit
|
||||
|
@ -3035,14 +3110,14 @@ JUST-IN-TIME COMPILER SUPPORT
|
|||
|
||||
NEWLINE RECOGNITION
|
||||
|
||||
By default, PCRE2 interprets the linefeed (LF) character as indicating
|
||||
the end of a line. This is the normal newline character on Unix-like
|
||||
systems. You can compile PCRE2 to use carriage return (CR) instead, by
|
||||
By default, PCRE2 interprets the linefeed (LF) character as indicating
|
||||
the end of a line. This is the normal newline character on Unix-like
|
||||
systems. You can compile PCRE2 to use carriage return (CR) instead, by
|
||||
adding
|
||||
|
||||
--enable-newline-is-cr
|
||||
|
||||
to the configure command. There is also an --enable-newline-is-lf
|
||||
to the configure command. There is also an --enable-newline-is-lf
|
||||
option, which explicitly specifies linefeed as the newline character.
|
||||
|
||||
Alternatively, you can specify that line endings are to be indicated by
|
||||
|
@ -3055,76 +3130,76 @@ NEWLINE RECOGNITION
|
|||
|
||||
--enable-newline-is-anycrlf
|
||||
|
||||
which causes PCRE2 to recognize any of the three sequences CR, LF, or
|
||||
which causes PCRE2 to recognize any of the three sequences CR, LF, or
|
||||
CRLF as indicating a line ending. Finally, a fifth option, specified by
|
||||
|
||||
--enable-newline-is-any
|
||||
|
||||
causes PCRE2 to recognize any Unicode newline sequence. The Unicode
|
||||
causes PCRE2 to recognize any Unicode newline sequence. The Unicode
|
||||
newline sequences are the three just mentioned, plus the single charac-
|
||||
ters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line,
|
||||
U+0085), LS (line separator, U+2028), and PS (paragraph separator,
|
||||
U+0085), LS (line separator, U+2028), and PS (paragraph separator,
|
||||
U+2029).
|
||||
|
||||
Whatever default line ending convention is selected when PCRE2 is built
|
||||
can be overridden by applications that use the library. At build time
|
||||
can be overridden by applications that use the library. At build time
|
||||
it is conventional to use the standard for your operating system.
|
||||
|
||||
|
||||
WHAT \R MATCHES
|
||||
|
||||
By default, the sequence \R in a pattern matches any Unicode newline
|
||||
sequence, independently of what has been selected as the line ending
|
||||
By default, the sequence \R in a pattern matches any Unicode newline
|
||||
sequence, independently of what has been selected as the line ending
|
||||
sequence. If you specify
|
||||
|
||||
--enable-bsr-anycrlf
|
||||
|
||||
the default is changed so that \R matches only CR, LF, or CRLF. What-
|
||||
ever is selected when PCRE2 is built can be overridden by applications
|
||||
the default is changed so that \R matches only CR, LF, or CRLF. What-
|
||||
ever is selected when PCRE2 is built can be overridden by applications
|
||||
that use the library.
|
||||
|
||||
|
||||
HANDLING VERY LARGE PATTERNS
|
||||
|
||||
Within a compiled pattern, offset values are used to point from one
|
||||
part to another (for example, from an opening parenthesis to an alter-
|
||||
nation metacharacter). By default, in the 8-bit and 16-bit libraries,
|
||||
two-byte values are used for these offsets, leading to a maximum size
|
||||
for a compiled pattern of around 64K code units. This is sufficient to
|
||||
Within a compiled pattern, offset values are used to point from one
|
||||
part to another (for example, from an opening parenthesis to an alter-
|
||||
nation metacharacter). By default, in the 8-bit and 16-bit libraries,
|
||||
two-byte values are used for these offsets, leading to a maximum size
|
||||
for a compiled pattern of around 64K code units. This is sufficient to
|
||||
handle all but the most gigantic patterns. Nevertheless, some people do
|
||||
want to process truly enormous patterns, so it is possible to compile
|
||||
PCRE2 to use three-byte or four-byte offsets by adding a setting such
|
||||
want to process truly enormous patterns, so it is possible to compile
|
||||
PCRE2 to use three-byte or four-byte offsets by adding a setting such
|
||||
as
|
||||
|
||||
--with-link-size=3
|
||||
|
||||
to the configure command. The value given must be 2, 3, or 4. For the
|
||||
16-bit library, a value of 3 is rounded up to 4. In these libraries,
|
||||
using longer offsets slows down the operation of PCRE2 because it has
|
||||
to load additional data when handling them. For the 32-bit library the
|
||||
value is always 4 and cannot be overridden; the value of --with-link-
|
||||
to the configure command. The value given must be 2, 3, or 4. For the
|
||||
16-bit library, a value of 3 is rounded up to 4. In these libraries,
|
||||
using longer offsets slows down the operation of PCRE2 because it has
|
||||
to load additional data when handling them. For the 32-bit library the
|
||||
value is always 4 and cannot be overridden; the value of --with-link-
|
||||
size is ignored.
|
||||
|
||||
|
||||
AVOIDING EXCESSIVE STACK USAGE
|
||||
|
||||
When matching with the pcre2_match() function, PCRE2 implements back-
|
||||
tracking by making recursive calls to an internal function called
|
||||
match(). In environments where the size of the stack is limited, this
|
||||
can severely limit PCRE2's operation. (The Unix environment does not
|
||||
usually suffer from this problem, but it may sometimes be necessary to
|
||||
When matching with the pcre2_match() function, PCRE2 implements back-
|
||||
tracking by making recursive calls to an internal function called
|
||||
match(). In environments where the size of the stack is limited, this
|
||||
can severely limit PCRE2's operation. (The Unix environment does not
|
||||
usually suffer from this problem, but it may sometimes be necessary to
|
||||
increase the maximum stack size. There is a discussion in the
|
||||
pcre2stack documentation.) An alternative approach to recursion that
|
||||
uses memory from the heap to remember data, instead of using recursive
|
||||
function calls, has been implemented to work round the problem of lim-
|
||||
ited stack size. If you want to build a version of PCRE2 that works
|
||||
pcre2stack documentation.) An alternative approach to recursion that
|
||||
uses memory from the heap to remember data, instead of using recursive
|
||||
function calls, has been implemented to work round the problem of lim-
|
||||
ited stack size. If you want to build a version of PCRE2 that works
|
||||
this way, add
|
||||
|
||||
--disable-stack-for-recursion
|
||||
|
||||
to the configure command. By default, the system functions malloc() and
|
||||
free() are called to manage the heap memory that is required, but cus-
|
||||
tom memory management functions can be called instead. PCRE2 runs
|
||||
free() are called to manage the heap memory that is required, but cus-
|
||||
tom memory management functions can be called instead. PCRE2 runs
|
||||
noticeably more slowly when built in this way. This option affects only
|
||||
the pcre2_match() function; it is not relevant for pcre2_dfa_match().
|
||||
|
||||
|
@ -3132,30 +3207,30 @@ AVOIDING EXCESSIVE STACK USAGE
|
|||
LIMITING PCRE2 RESOURCE USAGE
|
||||
|
||||
Internally, PCRE2 has a function called match(), which it calls repeat-
|
||||
edly (sometimes recursively) when matching a pattern with the
|
||||
edly (sometimes recursively) when matching a pattern with the
|
||||
pcre2_match() function. By controlling the maximum number of times this
|
||||
function may be called during a single matching operation, a limit can
|
||||
be placed on the resources used by a single call to pcre2_match(). The
|
||||
function may be called during a single matching operation, a limit can
|
||||
be placed on the resources used by a single call to pcre2_match(). The
|
||||
limit can be changed at run time, as described in the pcre2api documen-
|
||||
tation. The default is 10 million, but this can be changed by adding a
|
||||
tation. The default is 10 million, but this can be changed by adding a
|
||||
setting such as
|
||||
|
||||
--with-match-limit=500000
|
||||
|
||||
to the configure command. This setting has no effect on the
|
||||
to the configure command. This setting has no effect on the
|
||||
pcre2_dfa_match() matching function.
|
||||
|
||||
In some environments it is desirable to limit the depth of recursive
|
||||
In some environments it is desirable to limit the depth of recursive
|
||||
calls of match() more strictly than the total number of calls, in order
|
||||
to restrict the maximum amount of stack (or heap, if --disable-stack-
|
||||
to restrict the maximum amount of stack (or heap, if --disable-stack-
|
||||
for-recursion is specified) that is used. A second limit controls this;
|
||||
it defaults to the value that is set for --with-match-limit, which
|
||||
imposes no additional constraints. However, you can set a lower limit
|
||||
it defaults to the value that is set for --with-match-limit, which
|
||||
imposes no additional constraints. However, you can set a lower limit
|
||||
by adding, for example,
|
||||
|
||||
--with-match-limit-recursion=10000
|
||||
|
||||
to the configure command. This value can also be overridden at run
|
||||
to the configure command. This value can also be overridden at run
|
||||
time.
|
||||
|
||||
|
||||
|
@ -3163,45 +3238,45 @@ CREATING CHARACTER TABLES AT BUILD TIME
|
|||
|
||||
PCRE2 uses fixed tables for processing characters whose code points are
|
||||
less than 256. By default, PCRE2 is built with a set of tables that are
|
||||
distributed in the file src/pcre2_chartables.c.dist. These tables are
|
||||
distributed in the file src/pcre2_chartables.c.dist. These tables are
|
||||
for ASCII codes only. If you add
|
||||
|
||||
--enable-rebuild-chartables
|
||||
|
||||
to the configure command, the distributed tables are no longer used.
|
||||
Instead, a program called dftables is compiled and run. This outputs
|
||||
to the configure command, the distributed tables are no longer used.
|
||||
Instead, a program called dftables is compiled and run. This outputs
|
||||
the source for a new set of tables, created in the default locale of your
|
||||
C run-time system. (This method of replacing the tables does not work
|
||||
if you are cross compiling, because dftables is run on the local host.
|
||||
C run-time system. (This method of replacing the tables does not work
|
||||
if you are cross compiling, because dftables is run on the local host.
|
||||
If you need to create alternative tables when cross compiling, you will
|
||||
have to do so "by hand".)
|
||||
|
||||
|
||||
USING EBCDIC CODE
|
||||
|
||||
PCRE2 assumes by default that it will run in an environment where the
|
||||
character code is ASCII or Unicode, which is a superset of ASCII. This
|
||||
PCRE2 assumes by default that it will run in an environment where the
|
||||
character code is ASCII or Unicode, which is a superset of ASCII. This
|
||||
is the case for most computer operating systems. PCRE2 can, however, be
|
||||
compiled to run in an 8-bit EBCDIC environment by adding
|
||||
|
||||
--enable-ebcdic --disable-unicode
|
||||
|
||||
to the configure command. This setting implies --enable-rebuild-charta-
|
||||
bles. You should only use it if you know that you are in an EBCDIC
|
||||
bles. You should only use it if you know that you are in an EBCDIC
|
||||
environment (for example, an IBM mainframe operating system).
|
||||
|
||||
It is not possible to support both EBCDIC and UTF-8 codes in the same
|
||||
version of the library. Consequently, --enable-unicode and --enable-
|
||||
It is not possible to support both EBCDIC and UTF-8 codes in the same
|
||||
version of the library. Consequently, --enable-unicode and --enable-
|
||||
ebcdic are mutually exclusive.
|
||||
|
||||
The EBCDIC character that corresponds to an ASCII LF is assumed to have
|
||||
the value 0x15 by default. However, in some EBCDIC environments, 0x25
|
||||
the value 0x15 by default. However, in some EBCDIC environments, 0x25
|
||||
is used. In such an environment you should use
|
||||
|
||||
--enable-ebcdic-nl25
|
||||
|
||||
as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR
|
||||
has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and
|
||||
has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and
|
||||
0x25 is not chosen as LF is made to correspond to the Unicode NEL char-
|
||||
acter (which, in Unicode, is 0x85).
|
||||
|
||||
|
@ -3212,31 +3287,31 @@ USING EBCDIC CODE
|
|||
|
||||
PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT
|
||||
|
||||
By default, pcre2grep reads all files as plain text. You can build it
|
||||
so that it recognizes files whose names end in .gz or .bz2, and reads
|
||||
By default, pcre2grep reads all files as plain text. You can build it
|
||||
so that it recognizes files whose names end in .gz or .bz2, and reads
|
||||
them with libz or libbz2, respectively, by adding one or both of
|
||||
|
||||
--enable-pcre2grep-libz
|
||||
--enable-pcre2grep-libbz2
|
||||
|
||||
to the configure command. These options naturally require that the rel-
|
||||
evant libraries are installed on your system. Configuration will fail
|
||||
evant libraries are installed on your system. Configuration will fail
|
||||
if they are not.
|
||||
|
||||
|
||||
PCRE2GREP BUFFER SIZE
|
||||
|
||||
pcre2grep uses an internal buffer to hold a "window" on the file it is
|
||||
pcre2grep uses an internal buffer to hold a "window" on the file it is
|
||||
scanning, in order to be able to output "before" and "after" lines when
|
||||
it finds a match. The size of the buffer is controlled by a parameter
|
||||
it finds a match. The size of the buffer is controlled by a parameter
|
||||
whose default value is 20K. The buffer itself is three times this size,
|
||||
but because of the way it is used for holding "before" lines, the long-
|
||||
est line that is guaranteed to be processable is the parameter size.
|
||||
est line that is guaranteed to be processable is the parameter size.
|
||||
You can change the default parameter value by adding, for example,
|
||||
|
||||
--with-pcre2grep-bufsize=50K
|
||||
|
||||
to the configure command. The caller of pcre2grep can override this
|
||||
to the configure command. The caller of pcre2grep can override this
|
||||
value by using --buffer-size on the command line.
|
||||
|
||||
|
||||
|
@ -3247,26 +3322,26 @@ PCRE2TEST OPTION FOR LIBREADLINE SUPPORT
|
|||
--enable-pcre2test-libreadline
|
||||
--enable-pcre2test-libedit
|
||||
|
||||
to the configure command, pcre2test is linked with the libreadline
|
||||
to the configure command, pcre2test is linked with the libreadline
|
||||
or libedit library, respectively, and when its input is from a terminal,
|
||||
it reads it using the readline() function. This provides line-editing
|
||||
and history facilities. Note that libreadline is GPL-licensed, so if
|
||||
you distribute a binary of pcre2test linked in this way, there may be
|
||||
it reads it using the readline() function. This provides line-editing
|
||||
and history facilities. Note that libreadline is GPL-licensed, so if
|
||||
you distribute a binary of pcre2test linked in this way, there may be
|
||||
licensing issues. These can be avoided by linking instead with libedit,
|
||||
which has a BSD licence.
|
||||
|
||||
Setting --enable-pcre2test-libreadline causes the -lreadline option to
|
||||
be added to the pcre2test build. In many operating environments with a
|
||||
system-installed readline library this is sufficient. However, in some
|
||||
Setting --enable-pcre2test-libreadline causes the -lreadline option to
|
||||
be added to the pcre2test build. In many operating environments with a
|
||||
system-installed readline library this is sufficient. However, in some
|
||||
environments (e.g. if an unmodified distribution version of readline is
|
||||
in use), some extra configuration may be necessary. The INSTALL file
|
||||
in use), some extra configuration may be necessary. The INSTALL file
|
||||
for libreadline says this:
|
||||
|
||||
"Readline uses the termcap functions, but does not link with
|
||||
the termcap or curses library itself, allowing applications
|
||||
which link with readline the to choose an appropriate library."
|
||||
|
||||
If your environment has not been set up so that an appropriate library
|
||||
If your environment has not been set up so that an appropriate library
|
||||
is automatically included, you may need to add something like
|
||||
|
||||
LIBS="-lncurses"
|
||||
|
@ -3280,7 +3355,7 @@ INCLUDING DEBUGGING CODE
|
|||
|
||||
--enable-debug
|
||||
|
||||
to the configure command, additional debugging code is included in the
|
||||
to the configure command, additional debugging code is included in the
|
||||
build. This feature is intended for use by the PCRE2 maintainers.
|
||||
|
||||
|
||||
|
@ -3290,15 +3365,15 @@ DEBUGGING WITH VALGRIND SUPPORT
|
|||
|
||||
--enable-valgrind
|
||||
|
||||
to the configure command, PCRE2 will use valgrind annotations to mark
|
||||
certain memory regions as unaddressable. This allows it to detect
|
||||
invalid memory accesses, and is mostly useful for debugging PCRE2
|
||||
to the configure command, PCRE2 will use valgrind annotations to mark
|
||||
certain memory regions as unaddressable. This allows it to detect
|
||||
invalid memory accesses, and is mostly useful for debugging PCRE2
|
||||
itself.
|
||||
|
||||
|
||||
CODE COVERAGE REPORTING
|
||||
|
||||
If your C compiler is gcc, you can build a version of PCRE2 that can
|
||||
If your C compiler is gcc, you can build a version of PCRE2 that can
|
||||
generate a code coverage report for its test suite. To enable this, you
|
||||
must install lcov version 1.6 or above. Then specify
|
||||
|
||||
|
@ -3307,20 +3382,20 @@ CODE COVERAGE REPORTING
|
|||
to the configure command and build PCRE2 in the usual way.
|
||||
|
||||
Note that using ccache (a caching C compiler) is incompatible with code
|
||||
coverage reporting. If you have configured ccache to run automatically
|
||||
coverage reporting. If you have configured ccache to run automatically
|
||||
on your system, you must set the environment variable
|
||||
|
||||
CCACHE_DISABLE=1
|
||||
|
||||
before running make to build PCRE2, so that ccache is not used.
|
||||
|
||||
When --enable-coverage is used, the following additional targets are
|
||||
When --enable-coverage is used, the following additional targets are
|
||||
added to the Makefile:
|
||||
|
||||
make coverage
|
||||
|
||||
This creates a fresh coverage report for the PCRE2 test suite. It is
|
||||
equivalent to running "make coverage-reset", "make coverage-baseline",
|
||||
This creates a fresh coverage report for the PCRE2 test suite. It is
|
||||
equivalent to running "make coverage-reset", "make coverage-baseline",
|
||||
"make check", and then "make coverage-report".
|
||||
|
||||
make coverage-reset
|
||||
|
@ -3337,18 +3412,18 @@ CODE COVERAGE REPORTING
|
|||
|
||||
make coverage-clean-report
|
||||
|
||||
This removes the generated coverage report without cleaning the cover-
|
||||
This removes the generated coverage report without cleaning the cover-
|
||||
age data itself.
|
||||
|
||||
make coverage-clean-data
|
||||
|
||||
This removes the captured coverage data without removing the coverage
|
||||
This removes the captured coverage data without removing the coverage
|
||||
files created at compile time (*.gcno).
|
||||
|
||||
make coverage-clean
|
||||
|
||||
This cleans all coverage data including the generated coverage report.
|
||||
For more information about code coverage, see the gcov and lcov docu-
|
||||
This cleans all coverage data including the generated coverage report.
|
||||
For more information about code coverage, see the gcov and lcov docu-
|
||||
mentation.
|
||||
|
||||
|
||||
|
@ -3366,7 +3441,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 24 April 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -6028,12 +6103,18 @@ MATCHING A SINGLE CODE UNIT
|
|||
results, because PCRE2 assumes that it is matching character by charac-
|
||||
ter in a valid UTF string (by default it checks the subject string's
|
||||
validity at the start of processing unless the PCRE2_NO_UTF_CHECK
|
||||
option is used). An application can lock out the use of \C by setting
|
||||
the PCRE2_NEVER_BACKSLASH_C option.
|
||||
option is used).
|
||||
|
||||
PCRE2 does not allow \C to appear in lookbehind assertions (described
|
||||
below) in a UTF mode, because this would make it impossible to calcu-
|
||||
late the length of the lookbehind.
|
||||
An application can lock out the use of \C by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also
|
||||
possible to build PCRE2 with the use of \C permanently disabled.
|
||||
|
||||
PCRE2 does not allow \C to appear in lookbehind assertions (described
|
||||
below) in a UTF mode, because this would make it impossible to calcu-
|
||||
late the length of the lookbehind. Neither the alternative matching
|
||||
function pcre2_dfa_match() not the JIT optimizer support \C in a UTF
|
||||
mode. The former gives a match-time error; the latter fails to optimize
|
||||
and so the match is always run using the interpreter.
|
||||
|
||||
In general, the \C escape sequence is best avoided. However, one way of
|
||||
using it that avoids the problem of malformed UTF characters is to use
|
||||
|
@ -8036,7 +8117,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 01 September 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -8966,10 +9047,10 @@ CHARACTER TYPES
|
|||
\W a "non-word" character
|
||||
\X a Unicode extended grapheme cluster
|
||||
|
||||
The application can lock out the use of \C by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave
|
||||
the current matching point in the middle of a UTF-8 or UTF-16 charac-
|
||||
ter.
|
||||
\C is dangerous because it may leave the current matching point in the
|
||||
middle of a UTF-8 or UTF-16 character. The application can lock out the
|
||||
use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also
|
||||
possible to build PCRE2 with the use of \C permanently disabled.
|
||||
|
||||
By default, \d, \s, and \w match only ASCII characters, even in UTF-8
|
||||
mode or in the 16-bit and 32-bit libraries. However, if locale-specific
|
||||
|
@ -9325,7 +9406,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 17 July 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -9384,89 +9465,90 @@ WIDE CHARACTERS AND UTF MODES
|
|||
The escape sequence \C can be used to match a single code unit, in a
|
||||
UTF mode, but its use can lead to some strange effects because it
|
||||
breaks up multi-unit characters (see the description of \C in the
|
||||
pcre2pattern documentation). The use of \C is not supported in the
|
||||
alternative matching function pcre2_dfa_match(), nor is it supported in
|
||||
UTF mode by the JIT optimization. If JIT optimization is requested for
|
||||
a UTF pattern that contains \C, it will not succeed, and so the match-
|
||||
ing will be carried out by the normal interpretive function.
|
||||
pcre2pattern documentation). The use of \C is not supported by the
|
||||
alternative matching function pcre2_dfa_match() when in UTF mode. Its
|
||||
use provokes a match-time error. The JIT optimization also does not
|
||||
support \C in UTF mode. If JIT optimization is requested for a UTF
|
||||
pattern that contains \C, it will not succeed, and so the matching will
|
||||
be carried out by the normal interpretive function.
|
||||
|
||||
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
|
||||
characters of any code value, but, by default, the characters that
|
||||
PCRE2 recognizes as digits, spaces, or word characters remain the same
|
||||
set as in non-UTF mode, all with code points less than 256. This
|
||||
remains true even when PCRE2 is built to include Unicode support,
|
||||
because to do otherwise would slow down matching in many common cases.
|
||||
Note that this also applies to \b and \B, because they are defined in
|
||||
terms of \w and \W. If you want to test for a wider sense of, say,
|
||||
"digit", you can use explicit Unicode property tests such as \p{Nd}.
|
||||
Alternatively, if you set the PCRE2_UCP option, the way that the char-
|
||||
acter escapes work is changed so that Unicode properties are used to
|
||||
characters of any code value, but, by default, the characters that
|
||||
PCRE2 recognizes as digits, spaces, or word characters remain the same
|
||||
set as in non-UTF mode, all with code points less than 256. This
|
||||
remains true even when PCRE2 is built to include Unicode support,
|
||||
because to do otherwise would slow down matching in many common cases.
|
||||
Note that this also applies to \b and \B, because they are defined in
|
||||
terms of \w and \W. If you want to test for a wider sense of, say,
|
||||
"digit", you can use explicit Unicode property tests such as \p{Nd}.
|
||||
Alternatively, if you set the PCRE2_UCP option, the way that the char-
|
||||
acter escapes work is changed so that Unicode properties are used to
|
||||
determine which characters match. There are more details in the section
|
||||
on generic character types in the pcre2pattern documentation.
|
||||
|
||||
Similarly, characters that match the POSIX named character classes are
|
||||
Similarly, characters that match the POSIX named character classes are
|
||||
all low-valued characters, unless the PCRE2_UCP option is set.
|
||||
|
||||
However, the special horizontal and vertical white space matching
|
||||
However, the special horizontal and vertical white space matching
|
||||
escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char-
|
||||
acters, whether or not PCRE2_UCP is set.
|
||||
|
||||
Case-insensitive matching in UTF mode makes use of Unicode properties.
|
||||
A few Unicode characters such as Greek sigma have more than two code-
|
||||
Case-insensitive matching in UTF mode makes use of Unicode properties.
|
||||
A few Unicode characters such as Greek sigma have more than two code-
|
||||
points that are case-equivalent, and these are treated as such.
|
||||
|
||||
|
||||
VALIDITY OF UTF STRINGS
|
||||
|
||||
When the PCRE2_UTF option is set, the strings passed as patterns and
|
||||
When the PCRE2_UTF option is set, the strings passed as patterns and
|
||||
subjects are (by default) checked for validity on entry to the relevant
|
||||
functions. If an invalid UTF string is passed, a negative error code
|
||||
is returned. The code unit offset to the offending character can be
|
||||
extracted from the match data block by calling pcre2_get_startchar(),
|
||||
functions. If an invalid UTF string is passed, a negative error code
|
||||
is returned. The code unit offset to the offending character can be
|
||||
extracted from the match data block by calling pcre2_get_startchar(),
|
||||
which is used for this purpose after a UTF error.
|
||||
|
||||
UTF-16 and UTF-32 strings can indicate their endianness by special code
|
||||
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
||||
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
||||
this, expecting strings to be in host byte order.
|
||||
|
||||
A UTF string is checked before any other processing takes place. In the
|
||||
case of pcre2_match() and pcre2_dfa_match() calls with a non-zero
|
||||
starting offset, the check is applied only to that part of the subject
|
||||
that could be inspected during matching, and there is a check that the
|
||||
starting offset points to the first code unit of a character or to the
|
||||
end of the subject. If there are no lookbehind assertions in the pat-
|
||||
tern, the check starts at the starting offset. Otherwise, it starts at
|
||||
the length of the longest lookbehind before the starting offset, or at
|
||||
the start of the subject if there are not that many characters before
|
||||
the starting offset. Note that the sequences \b and \B are one-charac-
|
||||
case of pcre2_match() and pcre2_dfa_match() calls with a non-zero
|
||||
starting offset, the check is applied only to that part of the subject
|
||||
that could be inspected during matching, and there is a check that the
|
||||
starting offset points to the first code unit of a character or to the
|
||||
end of the subject. If there are no lookbehind assertions in the pat-
|
||||
tern, the check starts at the starting offset. Otherwise, it starts at
|
||||
the length of the longest lookbehind before the starting offset, or at
|
||||
the start of the subject if there are not that many characters before
|
||||
the starting offset. Note that the sequences \b and \B are one-charac-
|
||||
ter lookbehinds.
|
||||
|
||||
In addition to checking the format of the string, there is a check to
|
||||
In addition to checking the format of the string, there is a check to
|
||||
ensure that all code points lie in the range U+0 to U+10FFFF, excluding
|
||||
the surrogate area. The so-called "non-character" code points are not
|
||||
the surrogate area. The so-called "non-character" code points are not
|
||||
excluded because Unicode corrigendum #9 makes it clear that they should
|
||||
not be.
|
||||
|
||||
Characters in the "Surrogate Area" of Unicode are reserved for use by
|
||||
UTF-16, where they are used in pairs to encode code points with values
|
||||
greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
|
||||
are available independently in the UTF-8 and UTF-32 encodings. (In
|
||||
other words, the whole surrogate thing is a fudge for UTF-16 which
|
||||
Characters in the "Surrogate Area" of Unicode are reserved for use by
|
||||
UTF-16, where they are used in pairs to encode code points with values
|
||||
greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
|
||||
are available independently in the UTF-8 and UTF-32 encodings. (In
|
||||
other words, the whole surrogate thing is a fudge for UTF-16 which
|
||||
unfortunately messes up UTF-8 and UTF-32.)
|
||||
|
||||
In some situations, you may already know that your strings are valid,
|
||||
and therefore want to skip these checks in order to improve perfor-
|
||||
mance, for example in the case of a long subject string that is being
|
||||
scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com-
|
||||
pile time or at match time, PCRE2 assumes that the pattern or subject
|
||||
In some situations, you may already know that your strings are valid,
|
||||
and therefore want to skip these checks in order to improve perfor-
|
||||
mance, for example in the case of a long subject string that is being
|
||||
scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com-
|
||||
pile time or at match time, PCRE2 assumes that the pattern or subject
|
||||
it is given (respectively) contains only valid UTF code unit sequences.
|
||||
|
||||
Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check
|
||||
Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check
|
||||
for the pattern; it does not also apply to subject strings. If you want
|
||||
to disable the check for a subject string you must pass this option to
|
||||
to disable the check for a subject string you must pass this option to
|
||||
pcre2_match() or pcre2_dfa_match().
|
||||
|
||||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
|
||||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
|
||||
result is undefined and your program may crash or loop indefinitely.
|
||||
|
||||
Errors in UTF-8 strings
|
||||
|
@ -9479,10 +9561,10 @@ VALIDITY OF UTF STRINGS
|
|||
PCRE2_ERROR_UTF8_ERR4
|
||||
PCRE2_ERROR_UTF8_ERR5
|
||||
|
||||
The string ends with a truncated UTF-8 character; the code specifies
|
||||
how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
|
||||
characters to be no longer than 4 bytes, the encoding scheme (origi-
|
||||
nally defined by RFC 2279) allows for up to 6 bytes, and this is
|
||||
The string ends with a truncated UTF-8 character; the code specifies
|
||||
how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
|
||||
characters to be no longer than 4 bytes, the encoding scheme (origi-
|
||||
nally defined by RFC 2279) allows for up to 6 bytes, and this is
|
||||
checked first; hence the possibility of 4 or 5 missing bytes.
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR6
|
||||
|
@ -9492,24 +9574,24 @@ VALIDITY OF UTF STRINGS
|
|||
PCRE2_ERROR_UTF8_ERR10
|
||||
|
||||
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of
|
||||
the character do not have the binary value 0b10 (that is, either the
|
||||
the character do not have the binary value 0b10 (that is, either the
|
||||
most significant bit is 0, or the next bit is 1).
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR11
|
||||
PCRE2_ERROR_UTF8_ERR12
|
||||
|
||||
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
|
||||
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
|
||||
long; these code points are excluded by RFC 3629.
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR13
|
||||
|
||||
A 4-byte character has a value greater than 0x10ffff; these code points
|
||||
A 4-byte character has a value greater than 0x10ffff; these code points
|
||||
are excluded by RFC 3629.
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR14
|
||||
|
||||
A 3-byte character has a value in the range 0xd800 to 0xdfff; this
|
||||
range of code points is reserved by RFC 3629 for use with UTF-16, and
|
||||
A 3-byte character has a value in the range 0xd800 to 0xdfff; this
|
||||
range of code points is reserved by RFC 3629 for use with UTF-16, and
|
||||
so are excluded from UTF-8.
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR15
|
||||
|
@ -9518,26 +9600,26 @@ VALIDITY OF UTF STRINGS
|
|||
PCRE2_ERROR_UTF8_ERR18
|
||||
PCRE2_ERROR_UTF8_ERR19
|
||||
|
||||
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
|
||||
for a value that can be represented by fewer bytes, which is invalid.
|
||||
For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
|
||||
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
|
||||
for a value that can be represented by fewer bytes, which is invalid.
|
||||
For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
|
||||
rect coding uses just one byte.
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR20
|
||||
|
||||
The two most significant bits of the first byte of a character have the
|
||||
binary value 0b10 (that is, the most significant bit is 1 and the sec-
|
||||
ond is 0). Such a byte can only validly occur as the second or subse-
|
||||
binary value 0b10 (that is, the most significant bit is 1 and the sec-
|
||||
ond is 0). Such a byte can only validly occur as the second or subse-
|
||||
quent byte of a multi-byte character.
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR21
|
||||
|
||||
The first byte of a character has the value 0xfe or 0xff. These values
|
||||
The first byte of a character has the value 0xfe or 0xff. These values
|
||||
can never occur in a valid UTF-8 string.
|
||||
|
||||
Errors in UTF-16 strings
|
||||
|
||||
The following negative error codes are given for invalid UTF-16
|
||||
The following negative error codes are given for invalid UTF-16
|
||||
strings:
|
||||
|
||||
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
|
||||
|
@ -9547,7 +9629,7 @@ VALIDITY OF UTF STRINGS
|
|||
|
||||
Errors in UTF-32 strings
|
||||
|
||||
The following negative error codes are given for invalid UTF-32
|
||||
The following negative error codes are given for invalid UTF-32
|
||||
strings:
|
||||
|
||||
PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
|
||||
|
@ -9563,7 +9645,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 18 August 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "07 October 2015" "PCRE2 10.21"
|
||||
.TH PCRE2API 3 "16 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -1209,7 +1209,8 @@ This option locks out the use of \eC in the pattern that is being compiled.
|
|||
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
||||
it may leave the current matching point in the middle of a multi-code-unit
|
||||
character. This option may be useful in applications that process patterns from
|
||||
external sources.
|
||||
external sources. Note that there is also a build-time option that permanently
|
||||
locks out the use of \eC.
|
||||
.sp
|
||||
PCRE2_NEVER_UCP
|
||||
.sp
|
||||
|
@ -3014,8 +3015,8 @@ There are in addition the following errors that are specific to
|
|||
PCRE2_ERROR_DFA_UITEM
|
||||
.sp
|
||||
This return is given if \fBpcre2_dfa_match()\fP encounters an item in the
|
||||
pattern that it does not support, for instance, the use of \eC or a back
|
||||
reference.
|
||||
pattern that it does not support, for instance, the use of \eC in a UTF mode or
|
||||
a back reference.
|
||||
.sp
|
||||
PCRE2_ERROR_DFA_UCOND
|
||||
.sp
|
||||
|
@ -3065,6 +3066,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 07 October 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2BUILD 3 "23 April 2015" "PCRE2 10.20"
|
||||
.TH PCRE2BUILD 3 "16 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.
|
||||
|
@ -132,11 +132,20 @@ Pattern escapes such as \ed and \ew do not by default make use of Unicode
|
|||
properties. The application can request that they do by setting the PCRE2_UCP
|
||||
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
|
||||
request this by starting with (*UCP).
|
||||
.P
|
||||
.
|
||||
.
|
||||
.SH "DISABLING THE USE OF \eC"
|
||||
.rs
|
||||
.sp
|
||||
The \eC escape sequence, which matches a single code unit, even in a UTF mode,
|
||||
can cause unpredictable behaviour because it may leave the current matching
|
||||
point in the middle of a multi-code-unit character. It can be locked out by
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option.
|
||||
point in the middle of a multi-code-unit character. The application can lock it
|
||||
out by setting the PCRE2_NEVER_BACKSLASH_C option when calling
|
||||
\fBpcre2_compile()\fP. There is also a build-time option
|
||||
.sp
|
||||
--enable-never-backslash-C
|
||||
.sp
|
||||
(note the upper case C) which locks out the use of \eC entirely.
|
||||
.
|
||||
.
|
||||
.SH "JUST-IN-TIME COMPILER SUPPORT"
|
||||
|
@ -510,6 +519,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 24 April 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "01 September 2015" "PCRE2 10.21"
|
||||
.TH PCRE2PATTERN 3 "16 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -1233,8 +1233,11 @@ with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start
|
|||
with a malformed UTF character. This has undefined results, because PCRE2
|
||||
assumes that it is matching character by character in a valid UTF string (by
|
||||
default it checks the subject string's validity at the start of processing
|
||||
unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the
|
||||
use of \eC by setting the PCRE2_NEVER_BACKSLASH_C option.
|
||||
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||
.P
|
||||
An application can lock out the use of \eC by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
|
||||
build PCRE2 with the use of \eC permanently disabled.
|
||||
.P
|
||||
PCRE2 does not allow \eC to appear in lookbehind assertions
|
||||
.\" HTML <a href="#lookbehind">
|
||||
|
@ -1242,7 +1245,10 @@ PCRE2 does not allow \eC to appear in lookbehind assertions
|
|||
(described below)
|
||||
.\"
|
||||
in a UTF mode, because this would make it impossible to calculate the length of
|
||||
the lookbehind.
|
||||
the lookbehind. Neither the alternative matching function
|
||||
\fBpcre2_dfa_match()\fP nor the JIT optimizer support \eC in a UTF mode. The
|
||||
former gives a match-time error; the latter fails to optimize and so the match
|
||||
is always run using the interpreter.
|
||||
.P
|
||||
In general, the \eC escape sequence is best avoided. However, one way of using
|
||||
it that avoids the problem of malformed UTF characters is to use a lookahead to
|
||||
|
@ -3386,6 +3392,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 01 September 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2SYNTAX 3 "17 July 2015" "PCRE2 10.21"
|
||||
.TH PCRE2SYNTAX 3 "16 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||
|
@ -81,9 +81,10 @@ it matches a literal "u".
|
|||
\eW a "non-word" character
|
||||
\eX a Unicode extended grapheme cluster
|
||||
.sp
|
||||
The application can lock out the use of \eC by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave the
|
||||
current matching point in the middle of a UTF-8 or UTF-16 character.
|
||||
\eC is dangerous because it may leave the current matching point in the middle
|
||||
of a UTF-8 or UTF-16 character. The application can lock out the use of \eC by
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
||||
with the use of \eC permanently disabled.
|
||||
.P
|
||||
By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode
|
||||
or in the 16-bit and 32-bit libraries. However, if locale-specific matching is
|
||||
|
@ -576,6 +577,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 17 July 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "23 September 2015" "PCRE 10.21"
|
||||
.TH PCRE2TEST 1 "17 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -122,12 +122,13 @@ following options output the value and set the exit code as indicated:
|
|||
The following options output 1 for true or 0 for false, and set the exit code
|
||||
to the same value:
|
||||
.sp
|
||||
ebcdic compiled for an EBCDIC environment
|
||||
jit just-in-time support is available
|
||||
pcre2-16 the 16-bit library was built
|
||||
pcre2-32 the 32-bit library was built
|
||||
pcre2-8 the 8-bit library was built
|
||||
unicode Unicode support is available
|
||||
backslash-C \eC is supported (not locked out)
|
||||
ebcdic compiled for an EBCDIC environment
|
||||
jit just-in-time support is available
|
||||
pcre2-16 the 16-bit library was built
|
||||
pcre2-32 the 32-bit library was built
|
||||
pcre2-8 the 8-bit library was built
|
||||
unicode Unicode support is available
|
||||
.sp
|
||||
If an unknown option is given, an error message is output; the exit code is 0.
|
||||
.TP 10
|
||||
|
@ -1559,6 +1560,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 September 2015
|
||||
Last updated: 17 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -119,12 +119,13 @@ COMMAND LINE OPTIONS
|
|||
The following options output 1 for true or 0 for false, and
|
||||
set the exit code to the same value:
|
||||
|
||||
ebcdic compiled for an EBCDIC environment
|
||||
jit just-in-time support is available
|
||||
pcre2-16 the 16-bit library was built
|
||||
pcre2-32 the 32-bit library was built
|
||||
pcre2-8 the 8-bit library was built
|
||||
unicode Unicode support is available
|
||||
backslash-C \C is supported (not locked out)
|
||||
ebcdic compiled for an EBCDIC environment
|
||||
jit just-in-time support is available
|
||||
pcre2-16 the 16-bit library was built
|
||||
pcre2-32 the 32-bit library was built
|
||||
pcre2-8 the 8-bit library was built
|
||||
unicode Unicode support is available
|
||||
|
||||
If an unknown option is given, an error message is output;
|
||||
the exit code is 0.
|
||||
|
@ -457,7 +458,7 @@ PATTERN MODIFIERS
|
|||
Setting compilation options
|
||||
|
||||
The following modifiers set options for pcre2_compile(). The most com-
|
||||
mon ones have single-letter abbreviations. See pcreapi for a descrip-
|
||||
mon ones have single-letter abbreviations. See pcre2api for a descrip-
|
||||
tion of their effects.
|
||||
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
|
@ -484,6 +485,7 @@ PATTERN MODIFIERS
|
|||
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||
ucp set PCRE2_UCP
|
||||
ungreedy set PCRE2_UNGREEDY
|
||||
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
||||
utf set PCRE2_UTF
|
||||
|
||||
As well as turning on the PCRE2_UTF option, the utf modifier causes all
|
||||
|
@ -509,6 +511,7 @@ PATTERN MODIFIERS
|
|||
locale=<name> use this locale
|
||||
memory show memory used
|
||||
newline=<type> set newline type
|
||||
null_context compile with a NULL context
|
||||
parens_nest_limit=<n> set maximum parentheses depth
|
||||
posix use the POSIX API
|
||||
push push compiled pattern onto the stack
|
||||
|
@ -579,35 +582,42 @@ PATTERN MODIFIERS
|
|||
mation that is requested. For each callout, either its number or string
|
||||
is given, followed by the item that follows it in the pattern.
|
||||
|
||||
Passing a NULL context
|
||||
|
||||
Normally, pcre2test passes a context block to pcre2_compile(). If the
|
||||
null_context modifier is set, however, NULL is passed. This is for
|
||||
testing that pcre2_compile() behaves correctly in this case (it uses
|
||||
default values).
|
||||
|
||||
Specifying a pattern in hex
|
||||
|
||||
The hex modifier specifies that the characters of the pattern are to be
|
||||
interpreted as pairs of hexadecimal digits. White space is permitted
|
||||
interpreted as pairs of hexadecimal digits. White space is permitted
|
||||
between pairs. For example:
|
||||
|
||||
/ab 32 59/hex
|
||||
|
||||
This feature is provided as a way of creating patterns that contain
|
||||
binary zero and other non-printing characters. By default, pcre2test
|
||||
passes patterns as zero-terminated strings to pcre2_compile(), giving
|
||||
This feature is provided as a way of creating patterns that contain
|
||||
binary zero and other non-printing characters. By default, pcre2test
|
||||
passes patterns as zero-terminated strings to pcre2_compile(), giving
|
||||
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
|
||||
hexadecimal, the actual length of the pattern is passed.
|
||||
|
||||
JIT compilation
|
||||
|
||||
Just-in-time (JIT) compiling is a heavyweight optimization that can
|
||||
greatly speed up pattern matching. See the pcre2jit documentation for
|
||||
details. JIT compiling happens, optionally, after a pattern has been
|
||||
successfully compiled into an internal form. The JIT compiler converts
|
||||
Just-in-time (JIT) compiling is a heavyweight optimization that can
|
||||
greatly speed up pattern matching. See the pcre2jit documentation for
|
||||
details. JIT compiling happens, optionally, after a pattern has been
|
||||
successfully compiled into an internal form. The JIT compiler converts
|
||||
this to optimized machine code. It needs to know whether the match-time
|
||||
options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used,
|
||||
because different code is generated for the different cases. See the
|
||||
partial modifier in "Subject Modifiers" below for details of how these
|
||||
because different code is generated for the different cases. See the
|
||||
partial modifier in "Subject Modifiers" below for details of how these
|
||||
options are specified for each match attempt.
|
||||
|
||||
JIT compilation is requested by the /jit pattern modifier, which may
|
||||
JIT compilation is requested by the /jit pattern modifier, which may
|
||||
optionally be followed by an equals sign and a number in the range 0 to
|
||||
7. The three bits that make up the number specify which of the three
|
||||
7. The three bits that make up the number specify which of the three
|
||||
JIT operating modes are to be compiled:
|
||||
|
||||
1 compile JIT code for non-partial matching
|
||||
|
@ -624,31 +634,31 @@ PATTERN MODIFIERS
|
|||
6 soft and hard partial matching only
|
||||
7 all three modes
|
||||
|
||||
If no number is given, 7 is assumed. The phrase "partial matching"
|
||||
If no number is given, 7 is assumed. The phrase "partial matching"
|
||||
means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the
|
||||
PCRE2_PARTIAL_HARD option set. Note that such a call may return a com-
|
||||
PCRE2_PARTIAL_HARD option set. Note that such a call may return a com-
|
||||
plete match; the options enable the possibility of a partial match, but
|
||||
do not require it. Note also that if you request JIT compilation only
|
||||
for partial matching (for example, /jit=2) but do not set the partial
|
||||
modifier on a subject line, that match will not use JIT code because
|
||||
do not require it. Note also that if you request JIT compilation only
|
||||
for partial matching (for example, /jit=2) but do not set the partial
|
||||
modifier on a subject line, that match will not use JIT code because
|
||||
none was compiled for non-partial matching.
|
||||
|
||||
If JIT compilation is successful, the compiled JIT code will automati-
|
||||
cally be used when an appropriate type of match is run, except when
|
||||
incompatible run-time options are specified. For more details, see the
|
||||
pcre2jit documentation. See also the jitstack modifier below for a way
|
||||
If JIT compilation is successful, the compiled JIT code will automati-
|
||||
cally be used when an appropriate type of match is run, except when
|
||||
incompatible run-time options are specified. For more details, see the
|
||||
pcre2jit documentation. See also the jitstack modifier below for a way
|
||||
of setting the size of the JIT stack.
|
||||
|
||||
If the jitfast modifier is specified, matching is done using the JIT
|
||||
"fast path" interface, pcre2_jit_match(), which skips some of the san-
|
||||
ity checks that are done by pcre2_match(), and of course does not work
|
||||
when JIT is not supported. If jitfast is specified without jit, jit=7
|
||||
If the jitfast modifier is specified, matching is done using the JIT
|
||||
"fast path" interface, pcre2_jit_match(), which skips some of the san-
|
||||
ity checks that are done by pcre2_match(), and of course does not work
|
||||
when JIT is not supported. If jitfast is specified without jit, jit=7
|
||||
is assumed.
|
||||
|
||||
If the jitverify modifier is specified, information about the compiled
|
||||
pattern shows whether JIT compilation was or was not successful. If
|
||||
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
||||
tion is successful when jitverify is set, the text "(JIT)" is added to
|
||||
If the jitverify modifier is specified, information about the compiled
|
||||
pattern shows whether JIT compilation was or was not successful. If
|
||||
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
||||
tion is successful when jitverify is set, the text "(JIT)" is added to
|
||||
the first output line after a match or non match when JIT-compiled code
|
||||
was actually used in the match.
|
||||
|
||||
|
@ -659,18 +669,18 @@ PATTERN MODIFIERS
|
|||
/pattern/locale=fr_FR
|
||||
|
||||
The given locale is set, pcre2_maketables() is called to build a set of
|
||||
character tables for the locale, and this is then passed to pcre2_com-
|
||||
pile() when compiling the regular expression. The same tables are used
|
||||
character tables for the locale, and this is then passed to pcre2_com-
|
||||
pile() when compiling the regular expression. The same tables are used
|
||||
when matching the following subject lines. The /locale modifier applies
|
||||
only to the pattern on which it appears, but can be given in a #pattern
|
||||
command if a default is needed. Setting a locale and alternate charac-
|
||||
command if a default is needed. Setting a locale and alternate charac-
|
||||
ter tables are mutually exclusive.
|
||||
|
||||
Showing pattern memory
|
||||
|
||||
The /memory modifier causes the size in bytes of the memory used to
|
||||
hold the compiled pattern to be output. This does not include the size
|
||||
of the pcre2_code block; it is just the actual compiled data. If the
|
||||
The /memory modifier causes the size in bytes of the memory used to
|
||||
hold the compiled pattern to be output. This does not include the size
|
||||
of the pcre2_code block; it is just the actual compiled data. If the
|
||||
pattern is subsequently passed to the JIT compiler, the size of the JIT
|
||||
compiled code is also output. Here is an example:
|
||||
|
||||
|
@ -681,19 +691,19 @@ PATTERN MODIFIERS
|
|||
|
||||
Limiting nested parentheses
|
||||
|
||||
The parens_nest_limit modifier sets a limit on the depth of nested
|
||||
parentheses in a pattern. Breaching the limit causes a compilation
|
||||
error. The default for the library is set when PCRE2 is built, but
|
||||
pcre2test sets its own default of 220, which is required for running
|
||||
The parens_nest_limit modifier sets a limit on the depth of nested
|
||||
parentheses in a pattern. Breaching the limit causes a compilation
|
||||
error. The default for the library is set when PCRE2 is built, but
|
||||
pcre2test sets its own default of 220, which is required for running
|
||||
the standard test suite.
|
||||
|
||||
Using the POSIX wrapper API
|
||||
|
||||
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
|
||||
per API rather than its native API. This supports only the 8-bit
|
||||
library. Note that it does not imply POSIX matching semantics; for
|
||||
more detail see the pcre2posix documentation. When the POSIX API is
|
||||
being used, the following pattern modifiers set options for the reg-
|
||||
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
|
||||
per API rather than its native API. This supports only the 8-bit
|
||||
library. Note that it does not imply POSIX matching semantics; for
|
||||
more detail see the pcre2posix documentation. When the POSIX API is
|
||||
being used, the following pattern modifiers set options for the reg-
|
||||
comp() function:
|
||||
|
||||
caseless REG_ICASE
|
||||
|
@ -704,24 +714,24 @@ PATTERN MODIFIERS
|
|||
ucp REG_UCP ) the POSIX standard
|
||||
utf REG_UTF8 )
|
||||
|
||||
The aftertext and allaftertext subject modifiers work as described
|
||||
The aftertext and allaftertext subject modifiers work as described
|
||||
below. All other modifiers cause an error.
|
||||
|
||||
Testing the stack guard feature
|
||||
|
||||
The /stackguard modifier is used to test the use of pcre2_set_com-
|
||||
pile_recursion_guard(), a function that is provided to enable stack
|
||||
availability to be checked during compilation (see the pcre2api docu-
|
||||
mentation for details). If the number specified by the modifier is
|
||||
The /stackguard modifier is used to test the use of pcre2_set_com-
|
||||
pile_recursion_guard(), a function that is provided to enable stack
|
||||
availability to be checked during compilation (see the pcre2api docu-
|
||||
mentation for details). If the number specified by the modifier is
|
||||
greater than zero, pcre2_set_compile_recursion_guard() is called to set
|
||||
up callback from pcre2_compile() to a local function. The argument it
|
||||
receives is the current nesting parenthesis depth; if this is greater
|
||||
up a callback from pcre2_compile() to a local function. The argument it
|
||||
receives is the current nesting parenthesis depth; if this is greater
|
||||
than the value given by the modifier, non-zero is returned, causing the
|
||||
compilation to be aborted.
|
||||
|
||||
Using alternative character tables
|
||||
|
||||
The value specified for the /tables modifier must be one of the digits
|
||||
The value specified for the /tables modifier must be one of the digits
|
||||
0, 1, or 2. It causes a specific set of built-in character tables to be
|
||||
passed to pcre2_compile(). This is used in the PCRE2 tests to check be-
|
||||
haviour with different character tables. The digit specifies the tables
|
||||
|
@ -732,15 +742,15 @@ PATTERN MODIFIERS
|
|||
pcre2_chartables.c.dist
|
||||
2 a set of tables defining ISO 8859 characters
|
||||
|
||||
In table 2, some characters whose codes are greater than 128 are iden-
|
||||
tified as letters, digits, spaces, etc. Setting alternate character
|
||||
In table 2, some characters whose codes are greater than 128 are iden-
|
||||
tified as letters, digits, spaces, etc. Setting alternate character
|
||||
tables and a locale are mutually exclusive.
|
||||
|
||||
Setting certain match controls
|
||||
|
||||
The following modifiers are really subject modifiers, and are described
|
||||
below. However, they may be included in a pattern's modifier list, in
|
||||
which case they are applied to every subject line that is processed
|
||||
below. However, they may be included in a pattern's modifier list, in
|
||||
which case they are applied to every subject line that is processed
|
||||
with that pattern. They do not affect the compilation process.
|
||||
|
||||
aftertext show text after match
|
||||
|
@ -752,20 +762,20 @@ PATTERN MODIFIERS
|
|||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
|
||||
These modifiers may not appear in a #pattern command. If you want them
|
||||
These modifiers may not appear in a #pattern command. If you want them
|
||||
as defaults, set them in a #subject command.
|
||||
|
||||
Saving a compiled pattern
|
||||
|
||||
When a pattern with the push modifier is successfully compiled, it is
|
||||
pushed onto a stack of compiled patterns, and pcre2test expects the
|
||||
next line to contain a new pattern (or a command) instead of a subject
|
||||
When a pattern with the push modifier is successfully compiled, it is
|
||||
pushed onto a stack of compiled patterns, and pcre2test expects the
|
||||
next line to contain a new pattern (or a command) instead of a subject
|
||||
line. This facility is used when saving compiled patterns to a file, as
|
||||
described in the section entitled "Saving and restoring compiled pat-
|
||||
described in the section entitled "Saving and restoring compiled pat-
|
||||
terns" below. The push modifier is incompatible with compilation modi-
|
||||
fiers such as global that act at match time. Any that are specified are
|
||||
ignored, with a warning message, except for replace, which causes an
|
||||
error. Note that, jitverify, which is allowed, does not carry through
|
||||
ignored, with a warning message, except for replace, which causes an
|
||||
error. Note that jitverify, which is allowed, does not carry through
|
||||
to any subsequent matching that uses this pattern.
|
||||
|
||||
|
||||
|
@ -776,7 +786,7 @@ SUBJECT MODIFIERS
|
|||
|
||||
Setting match options
|
||||
|
||||
The following modifiers set options for pcre2_match() or
|
||||
The following modifiers set options for pcre2_match() or
|
||||
pcre2_dfa_match(). See pcre2api for a description of their effects.
|
||||
|
||||
anchored set PCRE2_ANCHORED
|
||||
|
@ -790,20 +800,20 @@ SUBJECT MODIFIERS
|
|||
partial_hard (or ph) set PCRE2_PARTIAL_HARD
|
||||
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
|
||||
|
||||
The partial matching modifiers are provided with abbreviations because
|
||||
The partial matching modifiers are provided with abbreviations because
|
||||
they appear frequently in tests.
|
||||
|
||||
If the /posix modifier was present on the pattern, causing the POSIX
|
||||
If the /posix modifier was present on the pattern, causing the POSIX
|
||||
wrapper API to be used, the only option-setting modifiers that have any
|
||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||
Any other modifiers cause an error.
|
||||
|
||||
Setting match controls
|
||||
|
||||
The following modifiers affect the matching process or request addi-
|
||||
tional information. Some of them may also be specified on a pattern
|
||||
line (see above), in which case they apply to every subject line that
|
||||
The following modifiers affect the matching process or request addi-
|
||||
tional information. Some of them may also be specified on a pattern
|
||||
line (see above), in which case they apply to every subject line that
|
||||
is matched against that pattern.
|
||||
|
||||
aftertext show text after match
|
||||
|
@ -823,9 +833,11 @@ SUBJECT MODIFIERS
|
|||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
match_limit=>n> set a match limit
|
||||
match_limit=<n> set a match limit
|
||||
memory show memory usage
|
||||
null_context match with a NULL context
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
recursion_limit=<n> set a recursion limit
|
||||
replace=<string> specify a replacement string
|
||||
|
@ -836,23 +848,23 @@ SUBJECT MODIFIERS
|
|||
|
||||
Showing more text
|
||||
|
||||
The aftertext modifier requests that as well as outputting the part of
|
||||
The aftertext modifier requests that as well as outputting the part of
|
||||
the subject string that matched the entire pattern, pcre2test should in
|
||||
addition output the remainder of the subject string. This is useful for
|
||||
tests where the subject contains multiple copies of the same substring.
|
||||
The allaftertext modifier requests the same action for captured sub-
|
||||
The allaftertext modifier requests the same action for captured sub-
|
||||
strings as well as the main matched substring. In each case the remain-
|
||||
der is output on the following line with a plus character following the
|
||||
capture number.
|
||||
|
||||
The allusedtext modifier requests that all the text that was consulted
|
||||
during a successful pattern match by the interpreter should be shown.
|
||||
This feature is not supported for JIT matching, and if requested with
|
||||
JIT it is ignored (with a warning message). Setting this modifier
|
||||
The allusedtext modifier requests that all the text that was consulted
|
||||
during a successful pattern match by the interpreter should be shown.
|
||||
This feature is not supported for JIT matching, and if requested with
|
||||
JIT it is ignored (with a warning message). Setting this modifier
|
||||
affects the output if there is a lookbehind at the start of a match, or
|
||||
a lookahead at the end, or if \K is used in the pattern. Characters
|
||||
that precede or follow the start and end of the actual match are indi-
|
||||
cated in the output by '<' or '>' characters underneath them. Here is
|
||||
a lookahead at the end, or if \K is used in the pattern. Characters
|
||||
that precede or follow the start and end of the actual match are indi-
|
||||
cated in the output by '<' or '>' characters underneath them. Here is
|
||||
an example:
|
||||
|
||||
re> /(?<=pqr)abc(?=xyz)/
|
||||
|
@ -860,16 +872,16 @@ SUBJECT MODIFIERS
|
|||
0: pqrabcxyz
|
||||
<<< >>>
|
||||
|
||||
This shows that the matched string is "abc", with the preceding and
|
||||
following strings "pqr" and "xyz" having been consulted during the
|
||||
This shows that the matched string is "abc", with the preceding and
|
||||
following strings "pqr" and "xyz" having been consulted during the
|
||||
match (when processing the assertions).
|
||||
|
||||
The startchar modifier requests that the starting character for the
|
||||
match be indicated, if it is different to the start of the matched
|
||||
The startchar modifier requests that the starting character for the
|
||||
match be indicated, if it is different to the start of the matched
|
||||
string. The only time when this occurs is when \K has been processed as
|
||||
part of the match. In this situation, the output for the matched string
|
||||
is displayed from the starting character instead of from the match
|
||||
point, with circumflex characters under the earlier characters. For
|
||||
is displayed from the starting character instead of from the match
|
||||
point, with circumflex characters under the earlier characters. For
|
||||
example:
|
||||
|
||||
re> /abc\Kxyz/
|
||||
|
@ -877,7 +889,7 @@ SUBJECT MODIFIERS
|
|||
0: abcxyz
|
||||
^^^
|
||||
|
||||
Unlike allusedtext, the startchar modifier can be used with JIT. How-
|
||||
Unlike allusedtext, the startchar modifier can be used with JIT. How-
|
||||
ever, these two modifiers are mutually exclusive.
|
||||
|
||||
Showing the value of all capture groups
|
||||
|
@ -885,88 +897,88 @@ SUBJECT MODIFIERS
|
|||
The allcaptures modifier requests that the values of all potential cap-
|
||||
tured parentheses be output after a match. By default, only those up to
|
||||
the highest one actually used in the match are output (corresponding to
|
||||
the return code from pcre2_match()). Groups that did not take part in
|
||||
the return code from pcre2_match()). Groups that did not take part in
|
||||
the match are output as "<unset>".
|
||||
|
||||
Testing callouts
|
||||
|
||||
A callout function is supplied when pcre2test calls the library match-
|
||||
ing functions, unless callout_none is specified. If callout_capture is
|
||||
A callout function is supplied when pcre2test calls the library match-
|
||||
ing functions, unless callout_none is specified. If callout_capture is
|
||||
set, the current captured groups are output when a callout occurs.
|
||||
|
||||
The callout_fail modifier can be given one or two numbers. If there is
|
||||
The callout_fail modifier can be given one or two numbers. If there is
|
||||
only one number, 1 is returned instead of 0 when a callout of that num-
|
||||
ber is reached. If two numbers are given, 1 is returned when callout
|
||||
ber is reached. If two numbers are given, 1 is returned when callout
|
||||
<n> is reached for the <m>th time. Note that callouts with string argu-
|
||||
ments are always given the number zero. See "Callouts" below for a
|
||||
ments are always given the number zero. See "Callouts" below for a
|
||||
description of the output when a callout is taken.
|
||||
|
||||
The callout_data modifier can be given an unsigned or a negative num-
|
||||
ber. This is set as the "user data" that is passed to the matching
|
||||
function, and passed back when the callout function is invoked. Any
|
||||
value other than zero is used as a return from pcre2test's callout
|
||||
The callout_data modifier can be given an unsigned or a negative num-
|
||||
ber. This is set as the "user data" that is passed to the matching
|
||||
function, and passed back when the callout function is invoked. Any
|
||||
value other than zero is used as a return from pcre2test's callout
|
||||
function.
|
||||
|
||||
Finding all matches in a string
|
||||
|
||||
Searching for all possible matches within a subject can be requested by
|
||||
the global or /altglobal modifier. After finding a match, the matching
|
||||
function is called again to search the remainder of the subject. The
|
||||
difference between global and altglobal is that the former uses the
|
||||
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
||||
searching at a new point within the entire string (which is what Perl
|
||||
the global or /altglobal modifier. After finding a match, the matching
|
||||
function is called again to search the remainder of the subject. The
|
||||
difference between global and altglobal is that the former uses the
|
||||
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
||||
searching at a new point within the entire string (which is what Perl
|
||||
does), whereas the latter passes over a shortened subject. This makes a
|
||||
difference to the matching process if the pattern begins with a lookbe-
|
||||
hind assertion (including \b or \B).
|
||||
|
||||
If an empty string is matched, the next match is done with the
|
||||
If an empty string is matched, the next match is done with the
|
||||
PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
|
||||
for another, non-empty, match at the same point in the subject. If this
|
||||
match fails, the start offset is advanced, and the normal match is
|
||||
retried. This imitates the way Perl handles such cases when using the
|
||||
/g modifier or the split() function. Normally, the start offset is
|
||||
advanced by one character, but if the newline convention recognizes
|
||||
CRLF as a newline, and the current character is CR followed by LF, an
|
||||
match fails, the start offset is advanced, and the normal match is
|
||||
retried. This imitates the way Perl handles such cases when using the
|
||||
/g modifier or the split() function. Normally, the start offset is
|
||||
advanced by one character, but if the newline convention recognizes
|
||||
CRLF as a newline, and the current character is CR followed by LF, an
|
||||
advance of two characters occurs.
|
||||
|
||||
Testing substring extraction functions
|
||||
|
||||
The copy and get modifiers can be used to test the pcre2_sub-
|
||||
The copy and get modifiers can be used to test the pcre2_sub-
|
||||
string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be
|
||||
given more than once, and each can specify a group name or number, for
|
||||
given more than once, and each can specify a group name or number, for
|
||||
example:
|
||||
|
||||
abcd\=copy=1,copy=3,get=G1
|
||||
|
||||
If the #subject command is used to set default copy and/or get lists,
|
||||
these can be unset by specifying a negative number to cancel all num-
|
||||
If the #subject command is used to set default copy and/or get lists,
|
||||
these can be unset by specifying a negative number to cancel all num-
|
||||
bered groups and an empty name to cancel all named groups.
|
||||
|
||||
The getall modifier tests pcre2_substring_list_get(), which extracts
|
||||
The getall modifier tests pcre2_substring_list_get(), which extracts
|
||||
all captured substrings.
|
||||
|
||||
If the subject line is successfully matched, the substrings extracted
|
||||
by the convenience functions are output with C, G, or L after the
|
||||
string number instead of a colon. This is in addition to the normal
|
||||
full list. The string length (that is, the return from the extraction
|
||||
If the subject line is successfully matched, the substrings extracted
|
||||
by the convenience functions are output with C, G, or L after the
|
||||
string number instead of a colon. This is in addition to the normal
|
||||
full list. The string length (that is, the return from the extraction
|
||||
function) is given in parentheses after each substring, followed by the
|
||||
name when the extraction was by name.
|
||||
|
||||
Testing the substitution function
|
||||
|
||||
If the replace modifier is set, the pcre2_substitute() function is
|
||||
called instead of one of the matching functions. Unlike subject
|
||||
strings, pcre2test does not process replacement strings for escape
|
||||
If the replace modifier is set, the pcre2_substitute() function is
|
||||
called instead of one of the matching functions. Unlike subject
|
||||
strings, pcre2test does not process replacement strings for escape
|
||||
sequences. In UTF mode, a replacement string is checked to see if it is
|
||||
a valid UTF-8 string. If so, it is correctly converted to a UTF string
|
||||
of the appropriate code unit width. If it is not a valid UTF-8 string,
|
||||
of the appropriate code unit width. If it is not a valid UTF-8 string,
|
||||
the individual code units are copied directly. This provides a means of
|
||||
passing an invalid UTF-8 string for testing purposes.
|
||||
|
||||
If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
|
||||
If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
|
||||
pcre2_substitute(). After a successful substitution, the modified
|
||||
string is output, preceded by the number of replacements. This may be
|
||||
zero if there were no matches. Here is a simple example of a substitu-
|
||||
string is output, preceded by the number of replacements. This may be
|
||||
zero if there were no matches. Here is a simple example of a substitu-
|
||||
tion test:
|
||||
|
||||
/abc/replace=xxx
|
||||
|
@ -975,11 +987,11 @@ SUBJECT MODIFIERS
|
|||
=abc=abc=\=global
|
||||
2: =xxx=xxx=
|
||||
|
||||
Subject and replacement strings should be kept relatively short for
|
||||
substitution tests, as fixed-size buffers are used. To make it easy to
|
||||
test for buffer overflow, if the replacement string starts with a num-
|
||||
ber in square brackets, that number is passed to pcre2_substitute() as
|
||||
the size of the output buffer, with the replacement string starting at
|
||||
Subject and replacement strings should be kept relatively short for
|
||||
substitution tests, as fixed-size buffers are used. To make it easy to
|
||||
test for buffer overflow, if the replacement string starts with a num-
|
||||
ber in square brackets, that number is passed to pcre2_substitute() as
|
||||
the size of the output buffer, with the replacement string starting at
|
||||
the next character. Here is an example that tests the edge case:
|
||||
|
||||
/abc/
|
||||
|
@ -989,90 +1001,107 @@ SUBJECT MODIFIERS
|
|||
Failed: error -47: no more memory
|
||||
|
||||
A replacement string is ignored with POSIX and DFA matching. Specifying
|
||||
partial matching provokes an error return ("bad option value") from
|
||||
partial matching provokes an error return ("bad option value") from
|
||||
pcre2_substitute().
|
||||
|
||||
Setting the JIT stack size
|
||||
|
||||
The jitstack modifier provides a way of setting the maximum stack size
|
||||
that is used by the just-in-time optimization code. It is ignored if
|
||||
The jitstack modifier provides a way of setting the maximum stack size
|
||||
that is used by the just-in-time optimization code. It is ignored if
|
||||
JIT optimization is not being used. The value is a number of kilobytes.
|
||||
Providing a stack that is larger than the default 32K is necessary only
|
||||
for very complicated patterns.
|
||||
|
||||
Setting match and recursion limits
|
||||
|
||||
The match_limit and recursion_limit modifiers set the appropriate lim-
|
||||
The match_limit and recursion_limit modifiers set the appropriate lim-
|
||||
its in the match context. These values are ignored when the find_limits
|
||||
modifier is specified.
|
||||
|
||||
Finding minimum limits
|
||||
|
||||
If the find_limits modifier is present, pcre2test calls pcre2_match()
|
||||
several times, setting different values in the match context via
|
||||
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
|
||||
the minimum values for each parameter that allow pcre2_match() to com-
|
||||
If the find_limits modifier is present, pcre2test calls pcre2_match()
|
||||
several times, setting different values in the match context via
|
||||
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
|
||||
the minimum values for each parameter that allow pcre2_match() to com-
|
||||
plete without error.
|
||||
|
||||
If JIT is being used, only the match limit is relevant. If DFA matching
|
||||
is being used, neither limit is relevant, and this modifier is ignored
|
||||
is being used, neither limit is relevant, and this modifier is ignored
|
||||
(with a warning message).
|
||||
|
||||
The match_limit number is a measure of the amount of backtracking that
|
||||
takes place, and learning the minimum value can be instructive. For
|
||||
most simple matches, the number is quite small, but for patterns with
|
||||
very large numbers of matching possibilities, it can become large very
|
||||
quickly with increasing length of subject string. The
|
||||
match_limit_recursion number is a measure of how much stack (or, if
|
||||
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
|
||||
The match_limit number is a measure of the amount of backtracking that
|
||||
takes place, and learning the minimum value can be instructive. For
|
||||
most simple matches, the number is quite small, but for patterns with
|
||||
very large numbers of matching possibilities, it can become large very
|
||||
quickly with increasing length of subject string. The
|
||||
recursion_limit number is a measure of how much stack (or, if
|
||||
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
|
||||
complete the match attempt.
|
||||
|
||||
Showing MARK names
|
||||
|
||||
|
||||
The mark modifier causes the names from backtracking control verbs that
|
||||
are returned from calls to pcre2_match() to be displayed. If a mark is
|
||||
returned for a match, non-match, or partial match, pcre2test shows it.
|
||||
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
|
||||
are returned from calls to pcre2_match() to be displayed. If a mark is
|
||||
returned for a match, non-match, or partial match, pcre2test shows it.
|
||||
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
|
||||
it is added to the non-match message.
|
||||
|
||||
Showing memory usage
|
||||
|
||||
The memory modifier causes pcre2test to log all memory allocation and
|
||||
The memory modifier causes pcre2test to log all memory allocation and
|
||||
freeing calls that occur during a match operation.
|
||||
|
||||
Setting a starting offset
|
||||
|
||||
The offset modifier sets an offset in the subject string at which
|
||||
The offset modifier sets an offset in the subject string at which
|
||||
matching starts. Its value is a number of code units, not characters.
|
||||
|
||||
Setting an offset limit
|
||||
|
||||
The offset_limit modifier sets a limit for unanchored matches. If a
|
||||
match cannot be found starting at or before this offset in the subject,
|
||||
a "no match" return is given. The data value is a number of code units,
|
||||
not characters. When this modifier is used, the use_offset_limit modi-
|
||||
fier must have been set for the pattern; if not, an error is generated.
|
||||
|
||||
Setting the size of the output vector
|
||||
|
||||
The ovector modifier applies only to the subject line in which it
|
||||
appears, though of course it can also be used to set a default in a
|
||||
#subject command. It specifies the number of pairs of offsets that are
|
||||
The ovector modifier applies only to the subject line in which it
|
||||
appears, though of course it can also be used to set a default in a
|
||||
#subject command. It specifies the number of pairs of offsets that are
|
||||
available for storing matching information. The default is 15.
|
||||
|
||||
A value of zero is useful when testing the POSIX API because it causes
|
||||
A value of zero is useful when testing the POSIX API because it causes
|
||||
regexec() to be called with a NULL capture vector. When not testing the
|
||||
POSIX API, a value of zero is used to cause pcre2_match_data_cre-
|
||||
ate_from_pattern() to be called, in order to create a match block of
|
||||
POSIX API, a value of zero is used to cause pcre2_match_data_cre-
|
||||
ate_from_pattern() to be called, in order to create a match block of
|
||||
exactly the right size for the pattern. (It is not possible to create a
|
||||
match block with a zero-length ovector; there is always at least one
|
||||
match block with a zero-length ovector; there is always at least one
|
||||
pair of offsets.)
|
||||
|
||||
Passing the subject as zero-terminated
|
||||
|
||||
By default, the subject string is passed to a native API matching func-
|
||||
tion with its correct length. In order to test the facility for passing
|
||||
a zero-terminated string, the zero_terminate modifier is provided. It
|
||||
a zero-terminated string, the zero_terminate modifier is provided. It
|
||||
causes the length to be passed as PCRE2_ZERO_TERMINATED. (When matching
|
||||
via the POSIX interface, this modifier has no effect, as there is no
|
||||
via the POSIX interface, this modifier has no effect, as there is no
|
||||
facility for passing a length.)
|
||||
|
||||
When testing pcre2_substitute(), this modifier also has the effect of
|
||||
When testing pcre2_substitute(), this modifier also has the effect of
|
||||
passing the replacement string as zero-terminated.
|
||||
|
||||
Passing a NULL context
|
||||
|
||||
Normally, pcre2test passes a context block to pcre2_match(),
|
||||
pcre2_dfa_match() or pcre2_jit_match(). If the null_context modifier is
|
||||
set, however, NULL is passed. This is for testing that the matching
|
||||
functions behave correctly in this case (they use default values). This
|
||||
modifier cannot be used with the find_limits modifier or when testing
|
||||
the substitution function.
|
||||
|
||||
|
||||
THE ALTERNATIVE MATCHING FUNCTION
|
||||
|
||||
|
@ -1398,5 +1427,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 14 September 2015
|
||||
Last updated: 17 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2UNICODE 3 "18 August 2015" "PCRE2 10.21"
|
||||
.TH PCRE2UNICODE 3 "16 October 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "UNICODE AND UTF SUPPORT"
|
||||
|
@ -63,11 +63,12 @@ characters (see the description of \eC in the
|
|||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
documentation). The use of \eC is not supported in the alternative matching
|
||||
function \fBpcre2_dfa_match()\fP, nor is it supported in UTF mode by the JIT
|
||||
optimization. If JIT optimization is requested for a UTF pattern that contains
|
||||
\eC, it will not succeed, and so the matching will be carried out by the normal
|
||||
interpretive function.
|
||||
documentation). The use of \eC is not supported by the alternative matching
|
||||
function \fBpcre2_dfa_match()\fP when in UTF mode. Its use provokes a
|
||||
match-time error. The JIT optimization also does not support \eC in UTF mode.
|
||||
If JIT optimization is requested for a UTF pattern that contains \eC, it will
|
||||
not succeed, and so the matching will be carried out by the normal interpretive
|
||||
function.
|
||||
.P
|
||||
The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly test
|
||||
characters of any code value, but, by default, the characters that PCRE2
|
||||
|
@ -262,6 +263,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 18 August 2015
|
||||
Last updated: 16 October 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -182,6 +182,9 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#define MAX_NAME_SIZE 32
|
||||
#endif
|
||||
|
||||
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
|
||||
/* #undef NEVER_BACKSLASH_C */
|
||||
|
||||
/* The value of NEWLINE_DEFAULT determines the default newline character
|
||||
sequence. PCRE2 client programs can override this by selecting other values
|
||||
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5
|
||||
|
|
|
@ -169,6 +169,9 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
overflow caused by enormously large patterns. */
|
||||
#undef MAX_NAME_SIZE
|
||||
|
||||
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
|
||||
#undef NEVER_BACKSLASH_C
|
||||
|
||||
/* The value of NEWLINE_DEFAULT determines the default newline character
|
||||
sequence. PCRE2 client programs can override this by selecting other values
|
||||
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5
|
||||
|
|
|
@ -583,7 +583,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
|
|||
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
|
||||
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
|
||||
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
|
||||
ERR81, ERR82, ERR83, ERR84 };
|
||||
ERR81, ERR82, ERR83, ERR84, ERR85 };
|
||||
|
||||
/* This is a table of start-of-pattern options such as (*UTF) and settings such
|
||||
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
|
||||
|
@ -7052,12 +7052,20 @@ for (;; ptr++)
|
|||
#endif
|
||||
|
||||
/* The use of \C can be locked out. */
|
||||
|
||||
|
||||
#ifdef NEVER_BACKSLASH_C
|
||||
else if (escape == ESC_C)
|
||||
{
|
||||
*errorcodeptr = ERR85;
|
||||
goto FAILED;
|
||||
}
|
||||
#else
|
||||
else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0)
|
||||
{
|
||||
*errorcodeptr = ERR83;
|
||||
goto FAILED;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* For the rest (including \X when Unicode properties are supported), we
|
||||
can obtain the OP value by negating the escape value in the default
|
||||
|
|
|
@ -168,6 +168,8 @@ static const char compile_error_texts[] =
|
|||
"unrecognized string delimiter follows (?C\0"
|
||||
"using \\C is disabled by the application\0"
|
||||
"(?| and/or (?J: or (?x: parentheses are too deeply nested\0"
|
||||
/* 85 */
|
||||
"using \\C is disabled in this PCRE2 library\0"
|
||||
;
|
||||
|
||||
/* Match-time and UTF error texts are in the same format. */
|
||||
|
|
|
@ -106,7 +106,7 @@ static const int eint1[] = {
|
|||
|
||||
static const int eint2[] = {
|
||||
30, REG_ECTYPE, /* unknown POSIX class name */
|
||||
32, REG_INVARG, /* this version of PCRE does not have UTF or UCP support */
|
||||
32, REG_INVARG, /* this version of PCRE2 does not have Unicode support */
|
||||
37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */
|
||||
56, REG_INVARG, /* internal error: unknown newline setting */
|
||||
};
|
||||
|
|
|
@ -667,6 +667,12 @@ table itself easier to read. */
|
|||
#define EBCDIC_NL 0
|
||||
#endif
|
||||
|
||||
#ifdef NEVER_BACKSLASH_C
|
||||
#define BACKSLASH_C 0
|
||||
#else
|
||||
#define BACKSLASH_C 1
|
||||
#endif
|
||||
|
||||
typedef struct coptstruct {
|
||||
const char *name;
|
||||
uint32_t type;
|
||||
|
@ -681,16 +687,17 @@ enum { CONF_BSR,
|
|||
};
|
||||
|
||||
static coptstruct coptlist[] = {
|
||||
{ "bsr", CONF_BSR, PCRE2_CONFIG_BSR },
|
||||
{ "ebcdic", CONF_FIX, SUPPORT_EBCDIC },
|
||||
{ "ebcdic-nl", CONF_FIZ, EBCDIC_NL },
|
||||
{ "jit", CONF_INT, PCRE2_CONFIG_JIT },
|
||||
{ "linksize", CONF_INT, PCRE2_CONFIG_LINKSIZE },
|
||||
{ "newline", CONF_NL, PCRE2_CONFIG_NEWLINE },
|
||||
{ "pcre2-16", CONF_FIX, SUPPORT_16 },
|
||||
{ "pcre2-32", CONF_FIX, SUPPORT_32 },
|
||||
{ "pcre2-8", CONF_FIX, SUPPORT_8 },
|
||||
{ "unicode", CONF_INT, PCRE2_CONFIG_UNICODE }
|
||||
{ "backslash-C", CONF_FIX, BACKSLASH_C },
|
||||
{ "bsr", CONF_BSR, PCRE2_CONFIG_BSR },
|
||||
{ "ebcdic", CONF_FIX, SUPPORT_EBCDIC },
|
||||
{ "ebcdic-nl", CONF_FIZ, EBCDIC_NL },
|
||||
{ "jit", CONF_INT, PCRE2_CONFIG_JIT },
|
||||
{ "linksize", CONF_INT, PCRE2_CONFIG_LINKSIZE },
|
||||
{ "newline", CONF_NL, PCRE2_CONFIG_NEWLINE },
|
||||
{ "pcre2-16", CONF_FIX, SUPPORT_16 },
|
||||
{ "pcre2-32", CONF_FIX, SUPPORT_32 },
|
||||
{ "pcre2-8", CONF_FIX, SUPPORT_8 },
|
||||
{ "unicode", CONF_INT, PCRE2_CONFIG_UNICODE }
|
||||
};
|
||||
|
||||
#define COPTLISTCOUNT sizeof(coptlist)/sizeof(coptstruct)
|
||||
|
@ -6467,6 +6474,7 @@ printf(" -b set default pattern control 'fullbincode'\n");
|
|||
printf(" -C show PCRE2 compile-time options and exit\n");
|
||||
printf(" -C arg show a specific compile-time option and exit with its\n");
|
||||
printf(" value if numeric (else 0). The arg can be:\n");
|
||||
printf(" backslash-C use of \\C is enabled [0, 1]\n");
|
||||
printf(" bsr \\R type [ANYCRLF, ANY]\n");
|
||||
printf(" ebcdic compiled for EBCDIC character code [0,1]\n");
|
||||
printf(" ebcdic-nl NL code if compiled for EBCDIC\n");
|
||||
|
@ -6618,6 +6626,11 @@ print_newline_config(optval, FALSE);
|
|||
(void)PCRE2_CONFIG(PCRE2_CONFIG_BSR, &optval);
|
||||
printf(" \\R matches %s\n", optval? "CR, LF, or CRLF only" :
|
||||
"all Unicode newlines");
|
||||
#ifdef NEVER_BACKSLASH_C
|
||||
printf(" \\C is not supported\n");
|
||||
#else
|
||||
printf(" \\C is supported\n");
|
||||
#endif
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_LINKSIZE, &optval);
|
||||
printf(" Internal link size = %d\n", optval);
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_PARENSLIMIT, &optval);
|
||||
|
|
|
@ -1,46 +1,6 @@
|
|||
# This set of tests is for UTF-8 support and Unicode property support, with
|
||||
# relevance only for the 8-bit library.
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{1234}
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{1234}YZ
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
X\x{1234}
|
||||
X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
X\x{1234}\x{512}YZ
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
X\x{1234}
|
||||
X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
a\nb
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
\= Expect no match
|
||||
a\x{12257}b
|
||||
|
||||
# The next 3 patterns have UTF-8 errors
|
||||
|
||||
/[Ã]/utf
|
||||
|
@ -212,21 +172,6 @@
|
|||
|
||||
/\x{212ab}/IB,utf
|
||||
|
||||
# This one is here not because it's different to Perl, but because the way
|
||||
# the captured single-byte is displayed. (In Perl it becomes a character, and you
|
||||
# can't tell the difference.)
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
X\nabc
|
||||
|
||||
# This one is here because Perl gives out a grumbly error message (quite
|
||||
# correctly, but that messes up comparisons).
|
||||
|
||||
/a\Cb/utf
|
||||
\= Expect no match
|
||||
a\x{100}b
|
||||
|
||||
/[^ab\xC0-\xF0]/IB,utf
|
||||
\x{f1}
|
||||
\x{bf}
|
||||
|
|
|
@ -6,10 +6,6 @@
|
|||
#forbid_utf
|
||||
#newline_default LF ANY ANYCRLF
|
||||
|
||||
/a\Cb/
|
||||
aXb
|
||||
a\nb
|
||||
|
||||
/[^\x{c4}]/IB
|
||||
|
||||
/\x{100}/I
|
||||
|
@ -344,7 +340,7 @@
|
|||
|
||||
# Non-UTF characters
|
||||
|
||||
/\C{2,3}/
|
||||
/.{2,3}/
|
||||
\x{400000}\x{400001}\x{400002}\x{400003}
|
||||
|
||||
/\x{400000}\x{800000}/IBi
|
||||
|
|
|
@ -7,49 +7,6 @@
|
|||
/abc/utf
|
||||
Ã]
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{11234}YZ
|
||||
X\x{11234}YZW
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
X\x{11234}\x{512}
|
||||
X\x{11234}\x{512}YZ
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
X\x{11234}\x{512}YZ
|
||||
\= Expect no match
|
||||
X\x{11234}
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
a\nb
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{12257}b
|
||||
\= Expect no match
|
||||
a\x{12257}\x{11234}b
|
||||
a\x{100}b
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
|
||||
# Check maximum character size
|
||||
|
||||
/\x{ffff}/IB,utf
|
||||
|
@ -90,16 +47,6 @@
|
|||
|
||||
/\x{212ab}/IB,utf
|
||||
|
||||
# These two \C tests, copied from the UTF-8 input file, do not have any
|
||||
# problems in 16 or 32 bits.
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
X\nabc
|
||||
|
||||
/a\Cb/utf
|
||||
a\x{100}b
|
||||
|
||||
/[^ab\xC0-\xF0]/IB,utf
|
||||
\x{f1}
|
||||
\x{bf}
|
||||
|
@ -336,9 +283,6 @@
|
|||
|
||||
/\o{4200000}/utf
|
||||
|
||||
/\C/utf
|
||||
\x{110000}
|
||||
|
||||
/\x{100}*A/IB,utf
|
||||
A
|
||||
|
||||
|
@ -396,4 +340,7 @@
|
|||
|
||||
/\x{3a3}B/IBi,utf
|
||||
|
||||
/./utf
|
||||
\x{110000}
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -3739,41 +3739,40 @@
|
|||
|
||||
/[bcd]*a/B
|
||||
|
||||
# A complete set of tests for auto-possessification of character types.
|
||||
# A complete set of tests for auto-possessification of character types, but
|
||||
# omitting \C because it might be disabled (it has its own tests).
|
||||
|
||||
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\C \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
|
||||
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
|
||||
|
||||
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\C \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
|
||||
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
|
||||
|
||||
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\C \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
|
||||
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
|
||||
|
||||
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\C \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
|
||||
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
|
||||
|
||||
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\C \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
|
||||
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
|
||||
|
||||
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\C \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
|
||||
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
|
||||
|
||||
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\C \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
|
||||
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
|
||||
|
||||
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\C \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
|
||||
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
|
||||
|
||||
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\C \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
|
||||
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
|
||||
|
||||
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\C \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
|
||||
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
|
||||
|
||||
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\C \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
|
||||
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
|
||||
|
||||
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\C \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
|
||||
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
|
||||
|
||||
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\C a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
|
||||
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
|
||||
|
||||
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\C \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
|
||||
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
|
||||
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
|
||||
|
||||
/\D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \C+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
|
||||
/ \D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
|
||||
|
||||
/(?=a+)a(a+)++a/B
|
||||
|
||||
|
@ -4327,8 +4326,6 @@
|
|||
|
||||
/((?2){73}(?2))((?1))/info
|
||||
|
||||
/ab\Cde/never_backslash_c
|
||||
|
||||
/abc/
|
||||
\= Expect no match
|
||||
\[9x!xxx(]{9999}
|
||||
|
@ -4446,12 +4443,6 @@
|
|||
/\x0{ab}/
|
||||
\0{ab}
|
||||
|
||||
/ab\Cde/
|
||||
abXde
|
||||
|
||||
/(?<=ab\Cde)X/
|
||||
abZdeX
|
||||
|
||||
/^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/
|
||||
ababababbbabZXXXX
|
||||
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
# These are tests of \C that do not involve UTF. They are not run when \C is
|
||||
# disabled by compiling with --enable-never-backslash-C.
|
||||
|
||||
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
|
||||
|
||||
/\D+\C \d+\C \S+\C \s+\C \W+\C \w+\C .+\C \R+\C \H+\C \h+\C \V+\C \v+\C a+\C \n+\C \C+\C/Bx
|
||||
|
||||
/ab\Cde/never_backslash_c
|
||||
|
||||
/ab\Cde/
|
||||
abXde
|
||||
|
||||
/(?<=ab\Cde)X/
|
||||
abZdeX
|
||||
|
||||
# End of testinput21
|
|
@ -0,0 +1,95 @@
|
|||
# Tests of \C when Unicode support is available. Note that \C is not supported
|
||||
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
|
||||
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
|
||||
# in some widths and not in others.
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
|
||||
# This should produce an error diagnostic (\C in UTF lookbehind)
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
|
||||
# Autopossessification tests
|
||||
|
||||
/\C+\X \X+\C/Bx
|
||||
|
||||
/\C+\X \X+\C/Bx,utf
|
||||
|
||||
/\C\X*TӅ;
|
||||
{0,6}\v+
F
|
||||
/utf
|
||||
\= Expect no match
|
||||
Ӆ\x0a
|
||||
|
||||
/\C(\W?ſ)'?{{/utf
|
||||
\= Expect no match
|
||||
\\C(\\W?ſ)'?{{
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{1234}
|
||||
X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{1234}YZ
|
||||
X\x{11234}YZ
|
||||
X\x{11234}YZW
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
X\x{1234}
|
||||
X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
X\x{1234}\x{512}YZ
|
||||
X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
X\x{11234}\x{512}
|
||||
X\x{11234}\x{512}YZ
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
X\x{1234}
|
||||
X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
X\x{11234}\x{512}YZ
|
||||
X\x{11234}
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
a\nb
|
||||
a\x{100}b
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
a\x{12257}b
|
||||
a\x{12257}\x{11234}b
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
|
||||
# This one is here not because it's different to Perl, but because the way
|
||||
# the captured single code unit is displayed. (In Perl it becomes a character,
|
||||
# and you can't tell the difference.)
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
X\nabc
|
||||
|
||||
# This one is here because Perl gives out a grumbly error message (quite
|
||||
# correctly, but that messes up comparisons).
|
||||
|
||||
/a\Cb/utf
|
||||
\= Expect no match in 8-bit mode
|
||||
a\x{100}b
|
||||
|
||||
# End of testinput22
|
|
@ -0,0 +1,7 @@
|
|||
# This test is run when PCRE2 has been built with --enable-never-backslash-C,
|
||||
# which disables the use of \C. All we can do is check that it gives the
|
||||
# correct error message.
|
||||
|
||||
/a\Cb/
|
||||
|
||||
# End of testinput23
|
|
@ -111,9 +111,6 @@
|
|||
/.{3,5}?/IB,utf
|
||||
\x{212ab}\x{212ab}\x{212ab}\x{861}
|
||||
|
||||
/(?<=\C)X/utf
|
||||
Should produce an error diagnostic
|
||||
|
||||
/^[ab]/IB,utf
|
||||
bar
|
||||
\= Expect no match
|
||||
|
@ -1367,8 +1364,6 @@
|
|||
\= Expect no match
|
||||
aAz
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
|
||||
/\X/
|
||||
a\=ps
|
||||
a\=ph
|
||||
|
@ -1617,13 +1612,13 @@
|
|||
|
||||
/[\p{L}ab]{2,3}+/B,no_auto_possess
|
||||
|
||||
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \C+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
|
||||
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
|
||||
|
||||
/.+\X/Bsx
|
||||
|
||||
/\X+$/Bmx
|
||||
|
||||
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\C \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
|
||||
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
|
||||
|
||||
/\d+\s{0,5}=\s*\S?=\w{0,4}\W*/B,utf,ucp
|
||||
|
||||
|
@ -1665,16 +1660,6 @@
|
|||
|
||||
"(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
|
||||
|
||||
/\C\X*TӅ;
|
||||
{0,6}\v+
F
|
||||
/utf
|
||||
\= Expect no match
|
||||
Ӆ\x0a
|
||||
|
||||
/\C(\W?ſ)'?{{/utf
|
||||
\= Expect no match
|
||||
\\C(\\W?ſ)'?{{
|
||||
|
||||
/[\pS#moq]/
|
||||
=
|
||||
|
||||
|
|
|
@ -4645,12 +4645,6 @@
|
|||
aaaa\=ovector=3
|
||||
aaaa\=ovector=4
|
||||
|
||||
/ab\Cde/
|
||||
abXde
|
||||
|
||||
/(?<=ab\Cde)X/
|
||||
abZdeX
|
||||
|
||||
/^\R/
|
||||
\r\=ps
|
||||
\r\=ph
|
||||
|
|
|
@ -671,11 +671,6 @@
|
|||
the cat\=ps
|
||||
the cat\=ph
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
|
||||
/./newline=crlf,utf
|
||||
\r\=ps
|
||||
\r\=ph
|
||||
|
|
|
@ -4,10 +4,8 @@
|
|||
#forbid_utf
|
||||
#newline_default lf any anycrlf
|
||||
|
||||
/a\Cb/
|
||||
aXb
|
||||
a\nb
|
||||
\= Expect no match and error message (too big char)
|
||||
/ab/
|
||||
\= Expect error message (too big char) and no match
|
||||
A\x{123}B
|
||||
A\o{443}B
|
||||
|
||||
|
|
|
@ -1,67 +1,6 @@
|
|||
# This set of tests is for UTF-8 support and Unicode property support, with
|
||||
# relevance only for the 8-bit library.
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{1234}
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}Y
|
||||
1: \x{1234}Y
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
0: XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
0: X
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
0: Xabcde
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
0: X\x{1234}\x{512}
|
||||
X\x{1234}\x{512}YZ
|
||||
0: X\x{1234}\x{512}
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
0: Xabc
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}
|
||||
X\x{1234}\x{512}
|
||||
0: X\x{1234}
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x{0a}b
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
\= Expect no match
|
||||
a\x{12257}b
|
||||
No match
|
||||
|
||||
# The next 3 patterns have UTF-8 errors
|
||||
|
||||
/[Ã]/utf
|
||||
|
@ -511,28 +450,6 @@ First code unit = \xf0
|
|||
Last code unit = \xab
|
||||
Subject length lower bound = 1
|
||||
|
||||
# This one is here not because it's different to Perl, but because the way
|
||||
# the captured single-byte is displayed. (In Perl it becomes a character, and you
|
||||
# can't tell the difference.)
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{e1}
|
||||
2: \x{88}\x{b4}
|
||||
X\nabc
|
||||
0: X\x{0a}abc
|
||||
1: \x{0a}
|
||||
2: abc
|
||||
|
||||
# This one is here because Perl gives out a grumbly error message (quite
|
||||
# correctly, but that messes up comparisons).
|
||||
|
||||
/a\Cb/utf
|
||||
\= Expect no match
|
||||
a\x{100}b
|
||||
No match
|
||||
|
||||
/[^ab\xC0-\xF0]/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
|
|
@ -6,12 +6,6 @@
|
|||
#forbid_utf
|
||||
#newline_default LF ANY ANYCRLF
|
||||
|
||||
/a\Cb/
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x0ab
|
||||
|
||||
/[^\x{c4}]/IB
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -582,7 +576,7 @@ Failed: error 134 at offset 11: character code point value in \x{} or \o{} is to
|
|||
|
||||
# Non-UTF characters
|
||||
|
||||
/\C{2,3}/
|
||||
/.{2,3}/
|
||||
\x{400000}\x{400001}\x{400002}\x{400003}
|
||||
** Character \x{400000} is greater than 0xffff and UTF-16 mode is not enabled.
|
||||
** Truncation will probably give the wrong result.
|
||||
|
|
|
@ -6,12 +6,6 @@
|
|||
#forbid_utf
|
||||
#newline_default LF ANY ANYCRLF
|
||||
|
||||
/a\Cb/
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x0ab
|
||||
|
||||
/[^\x{c4}]/IB
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -583,7 +577,7 @@ Subject length lower bound = 2
|
|||
|
||||
# Non-UTF characters
|
||||
|
||||
/\C{2,3}/
|
||||
/.{2,3}/
|
||||
\x{400000}\x{400001}\x{400002}\x{400003}
|
||||
0: \x{400000}\x{400001}\x{400002}
|
||||
|
||||
|
|
|
@ -9,76 +9,6 @@
|
|||
Ã]
|
||||
** Failed: invalid UTF-8 string cannot be used as input in UTF mode
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
1: \x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}Y
|
||||
1: \x{11234}Y
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
1: \x{11234}YZ
|
||||
X\x{11234}YZW
|
||||
0: X\x{11234}YZ
|
||||
1: \x{11234}YZ
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
0: XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
0: X
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
0: Xabcde
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
X\x{11234}\x{512}
|
||||
0: X\x{11234}\x{512}
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}YZ
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
0: X\x{11234}\x{512}\x{11234}
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
0: Xabc
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}
|
||||
\= Expect no match
|
||||
X\x{11234}
|
||||
No match
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x{0a}b
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{12257}b
|
||||
0: a\x{12257}b
|
||||
\= Expect no match
|
||||
a\x{12257}\x{11234}b
|
||||
No match
|
||||
a\x{100}b
|
||||
No match
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# Check maximum character size
|
||||
|
||||
/\x{ffff}/IB,utf
|
||||
|
@ -308,23 +238,6 @@ First code unit = \x{d844}
|
|||
Last code unit = \x{deab}
|
||||
Subject length lower bound = 1
|
||||
|
||||
# These two \C tests, copied from the UTF-8 input file, do not have any
|
||||
# problems in 16 or 32 bits.
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{1234}
|
||||
2:
|
||||
X\nabc
|
||||
0: X\x{0a}abc
|
||||
1: \x{0a}
|
||||
2: abc
|
||||
|
||||
/a\Cb/utf
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
/[^ab\xC0-\xF0]/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -1127,10 +1040,6 @@ Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too
|
|||
/\o{4200000}/utf
|
||||
Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
|
||||
|
||||
/\C/utf
|
||||
\x{110000}
|
||||
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
||||
|
||||
/\x{100}*A/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -1454,4 +1363,8 @@ Starting code units: \xff
|
|||
Last code unit = 'B' (caseless)
|
||||
Subject length lower bound = 2
|
||||
|
||||
/./utf
|
||||
\x{110000}
|
||||
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -9,74 +9,6 @@
|
|||
Ã]
|
||||
** Failed: invalid UTF-8 string cannot be used as input in UTF mode
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{11234}Y
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
1: \x{11234}YZ
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{11234}YZ
|
||||
No match
|
||||
X\x{11234}YZW
|
||||
0: X\x{11234}YZW
|
||||
1: \x{11234}YZW
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
0: XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
0: X
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
0: Xabcde
|
||||
X\x{11234}Y
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
X\x{11234}\x{512}
|
||||
No match
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}YZ
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
0: X\x{11234}\x{512}\x{11234}Z
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
0: Xabc
|
||||
X\x{11234}Y
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}Y
|
||||
\= Expect no match
|
||||
X\x{11234}
|
||||
No match
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x{0a}b
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{12257}b
|
||||
No match
|
||||
\= Expect no match
|
||||
a\x{12257}\x{11234}b
|
||||
0: a\x{12257}\x{11234}b
|
||||
a\x{100}b
|
||||
No match
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# Check maximum character size
|
||||
|
||||
/\x{ffff}/IB,utf
|
||||
|
@ -301,23 +233,6 @@ Options: utf
|
|||
First code unit = \x{212ab}
|
||||
Subject length lower bound = 1
|
||||
|
||||
# These two \C tests, copied from the UTF-8 input file, do not have any
|
||||
# problems in 16 or 32 bits.
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{1234}
|
||||
2:
|
||||
X\nabc
|
||||
0: X\x{0a}abc
|
||||
1: \x{0a}
|
||||
2: abc
|
||||
|
||||
/a\Cb/utf
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
/[^ab\xC0-\xF0]/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -1119,10 +1034,6 @@ Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too
|
|||
/\o{4200000}/utf
|
||||
Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
|
||||
|
||||
/\C/utf
|
||||
\x{110000}
|
||||
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 0
|
||||
|
||||
/\x{100}*A/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -1446,4 +1357,8 @@ Starting code units: \xff
|
|||
Last code unit = 'B' (caseless)
|
||||
Subject length lower bound = 2
|
||||
|
||||
/./utf
|
||||
\x{110000}
|
||||
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 0
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -11948,9 +11948,10 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
# A complete set of tests for auto-possessification of character types.
|
||||
# A complete set of tests for auto-possessification of character types, but
|
||||
# omitting \C because it might be disabled (it has its own tests).
|
||||
|
||||
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\C \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
|
||||
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\D+
|
||||
|
@ -11968,8 +11969,6 @@ Subject length lower bound = 5
|
|||
\D+
|
||||
Any
|
||||
\D+
|
||||
AllAny
|
||||
\D+
|
||||
\R
|
||||
\D+
|
||||
\H
|
||||
|
@ -11989,7 +11988,7 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\C \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
|
||||
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\d++
|
||||
|
@ -12006,8 +12005,6 @@ Subject length lower bound = 5
|
|||
\w
|
||||
\d+
|
||||
Any
|
||||
\d+
|
||||
AllAny
|
||||
\d++
|
||||
\R
|
||||
\d+
|
||||
|
@ -12028,7 +12025,7 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\C \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
|
||||
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\S+
|
||||
|
@ -12045,8 +12042,6 @@ Subject length lower bound = 5
|
|||
\w
|
||||
\S+
|
||||
Any
|
||||
\S+
|
||||
AllAny
|
||||
\S++
|
||||
\R
|
||||
\S+
|
||||
|
@ -12067,7 +12062,7 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\C \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
|
||||
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\s+
|
||||
|
@ -12085,8 +12080,6 @@ Subject length lower bound = 5
|
|||
\s+
|
||||
Any
|
||||
\s+
|
||||
AllAny
|
||||
\s+
|
||||
\R
|
||||
\s+
|
||||
\H
|
||||
|
@ -12106,7 +12099,7 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\C \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
|
||||
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\W+
|
||||
|
@ -12124,8 +12117,6 @@ Subject length lower bound = 5
|
|||
\W+
|
||||
Any
|
||||
\W+
|
||||
AllAny
|
||||
\W+
|
||||
\R
|
||||
\W+
|
||||
\H
|
||||
|
@ -12145,7 +12136,7 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\C \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
|
||||
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\w+
|
||||
|
@ -12162,8 +12153,6 @@ Subject length lower bound = 5
|
|||
\w
|
||||
\w+
|
||||
Any
|
||||
\w+
|
||||
AllAny
|
||||
\w++
|
||||
\R
|
||||
\w+
|
||||
|
@ -12184,7 +12173,303 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\C \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
|
||||
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\R+
|
||||
\D
|
||||
\R++
|
||||
\d
|
||||
\R+
|
||||
\S
|
||||
\R++
|
||||
\s
|
||||
\R+
|
||||
\W
|
||||
\R++
|
||||
\w
|
||||
\R++
|
||||
Any
|
||||
\R+
|
||||
\R
|
||||
\R+
|
||||
\H
|
||||
\R++
|
||||
\h
|
||||
\R+
|
||||
\V
|
||||
\R+
|
||||
\v
|
||||
\R+
|
||||
\Z
|
||||
\R++
|
||||
\z
|
||||
\R+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\H+
|
||||
\D
|
||||
\H+
|
||||
\d
|
||||
\H+
|
||||
\S
|
||||
\H+
|
||||
\s
|
||||
\H+
|
||||
\W
|
||||
\H+
|
||||
\w
|
||||
\H+
|
||||
Any
|
||||
\H+
|
||||
\R
|
||||
\H+
|
||||
\H
|
||||
\H++
|
||||
\h
|
||||
\H+
|
||||
\V
|
||||
\H+
|
||||
\v
|
||||
\H+
|
||||
\Z
|
||||
\H++
|
||||
\z
|
||||
\H+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\h+
|
||||
\D
|
||||
\h++
|
||||
\d
|
||||
\h++
|
||||
\S
|
||||
\h+
|
||||
\s
|
||||
\h+
|
||||
\W
|
||||
\h++
|
||||
\w
|
||||
\h+
|
||||
Any
|
||||
\h++
|
||||
\R
|
||||
\h++
|
||||
\H
|
||||
\h+
|
||||
\h
|
||||
\h+
|
||||
\V
|
||||
\h++
|
||||
\v
|
||||
\h+
|
||||
\Z
|
||||
\h++
|
||||
\z
|
||||
\h+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\V+
|
||||
\D
|
||||
\V+
|
||||
\d
|
||||
\V+
|
||||
\S
|
||||
\V+
|
||||
\s
|
||||
\V+
|
||||
\W
|
||||
\V+
|
||||
\w
|
||||
\V+
|
||||
Any
|
||||
\V++
|
||||
\R
|
||||
\V+
|
||||
\H
|
||||
\V+
|
||||
\h
|
||||
\V+
|
||||
\V
|
||||
\V++
|
||||
\v
|
||||
\V+
|
||||
\Z
|
||||
\V++
|
||||
\z
|
||||
\V+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\v+
|
||||
\D
|
||||
\v++
|
||||
\d
|
||||
\v++
|
||||
\S
|
||||
\v+
|
||||
\s
|
||||
\v+
|
||||
\W
|
||||
\v++
|
||||
\w
|
||||
\v+
|
||||
Any
|
||||
\v+
|
||||
\R
|
||||
\v+
|
||||
\H
|
||||
\v++
|
||||
\h
|
||||
\v++
|
||||
\V
|
||||
\v+
|
||||
\v
|
||||
\v+
|
||||
\Z
|
||||
\v++
|
||||
\z
|
||||
\v+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
a+
|
||||
\D
|
||||
a++
|
||||
\d
|
||||
a+
|
||||
\S
|
||||
a++
|
||||
\s
|
||||
a++
|
||||
\W
|
||||
a+
|
||||
\w
|
||||
a+
|
||||
Any
|
||||
a++
|
||||
\R
|
||||
a+
|
||||
\H
|
||||
a++
|
||||
\h
|
||||
a+
|
||||
\V
|
||||
a++
|
||||
\v
|
||||
a++
|
||||
\Z
|
||||
a++
|
||||
\z
|
||||
a++
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\x0a+
|
||||
\D
|
||||
\x0a++
|
||||
\d
|
||||
\x0a++
|
||||
\S
|
||||
\x0a+
|
||||
\s
|
||||
\x0a+
|
||||
\W
|
||||
\x0a++
|
||||
\w
|
||||
\x0a+
|
||||
Any
|
||||
\x0a+
|
||||
\R
|
||||
\x0a+
|
||||
\H
|
||||
\x0a++
|
||||
\h
|
||||
\x0a++
|
||||
\V
|
||||
\x0a+
|
||||
\v
|
||||
\x0a+
|
||||
\Z
|
||||
\x0a++
|
||||
\z
|
||||
\x0a+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
Any+
|
||||
\D
|
||||
Any+
|
||||
\d
|
||||
Any+
|
||||
\S
|
||||
Any+
|
||||
\s
|
||||
Any+
|
||||
\W
|
||||
Any+
|
||||
\w
|
||||
Any+
|
||||
Any
|
||||
Any++
|
||||
\R
|
||||
Any+
|
||||
\H
|
||||
Any+
|
||||
\h
|
||||
Any+
|
||||
\V
|
||||
Any+
|
||||
\v
|
||||
Any+
|
||||
\Z
|
||||
Any++
|
||||
\z
|
||||
Any+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
AllAny+
|
||||
|
@ -12200,8 +12485,6 @@ Subject length lower bound = 5
|
|||
AllAny+
|
||||
\w
|
||||
AllAny+
|
||||
Any
|
||||
AllAny+
|
||||
AllAny
|
||||
AllAny+
|
||||
\R
|
||||
|
@ -12223,358 +12506,7 @@ Subject length lower bound = 5
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\C \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\R+
|
||||
\D
|
||||
\R++
|
||||
\d
|
||||
\R+
|
||||
\S
|
||||
\R++
|
||||
\s
|
||||
\R+
|
||||
\W
|
||||
\R++
|
||||
\w
|
||||
\R++
|
||||
Any
|
||||
\R+
|
||||
AllAny
|
||||
\R+
|
||||
\R
|
||||
\R+
|
||||
\H
|
||||
\R++
|
||||
\h
|
||||
\R+
|
||||
\V
|
||||
\R+
|
||||
\v
|
||||
\R+
|
||||
\Z
|
||||
\R++
|
||||
\z
|
||||
\R+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\C \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\H+
|
||||
\D
|
||||
\H+
|
||||
\d
|
||||
\H+
|
||||
\S
|
||||
\H+
|
||||
\s
|
||||
\H+
|
||||
\W
|
||||
\H+
|
||||
\w
|
||||
\H+
|
||||
Any
|
||||
\H+
|
||||
AllAny
|
||||
\H+
|
||||
\R
|
||||
\H+
|
||||
\H
|
||||
\H++
|
||||
\h
|
||||
\H+
|
||||
\V
|
||||
\H+
|
||||
\v
|
||||
\H+
|
||||
\Z
|
||||
\H++
|
||||
\z
|
||||
\H+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\C \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\h+
|
||||
\D
|
||||
\h++
|
||||
\d
|
||||
\h++
|
||||
\S
|
||||
\h+
|
||||
\s
|
||||
\h+
|
||||
\W
|
||||
\h++
|
||||
\w
|
||||
\h+
|
||||
Any
|
||||
\h+
|
||||
AllAny
|
||||
\h++
|
||||
\R
|
||||
\h++
|
||||
\H
|
||||
\h+
|
||||
\h
|
||||
\h+
|
||||
\V
|
||||
\h++
|
||||
\v
|
||||
\h+
|
||||
\Z
|
||||
\h++
|
||||
\z
|
||||
\h+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\C \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\V+
|
||||
\D
|
||||
\V+
|
||||
\d
|
||||
\V+
|
||||
\S
|
||||
\V+
|
||||
\s
|
||||
\V+
|
||||
\W
|
||||
\V+
|
||||
\w
|
||||
\V+
|
||||
Any
|
||||
\V+
|
||||
AllAny
|
||||
\V++
|
||||
\R
|
||||
\V+
|
||||
\H
|
||||
\V+
|
||||
\h
|
||||
\V+
|
||||
\V
|
||||
\V++
|
||||
\v
|
||||
\V+
|
||||
\Z
|
||||
\V++
|
||||
\z
|
||||
\V+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\C \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\v+
|
||||
\D
|
||||
\v++
|
||||
\d
|
||||
\v++
|
||||
\S
|
||||
\v+
|
||||
\s
|
||||
\v+
|
||||
\W
|
||||
\v++
|
||||
\w
|
||||
\v+
|
||||
Any
|
||||
\v+
|
||||
AllAny
|
||||
\v+
|
||||
\R
|
||||
\v+
|
||||
\H
|
||||
\v++
|
||||
\h
|
||||
\v++
|
||||
\V
|
||||
\v+
|
||||
\v
|
||||
\v+
|
||||
\Z
|
||||
\v++
|
||||
\z
|
||||
\v+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\C a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
a+
|
||||
\D
|
||||
a++
|
||||
\d
|
||||
a+
|
||||
\S
|
||||
a++
|
||||
\s
|
||||
a++
|
||||
\W
|
||||
a+
|
||||
\w
|
||||
a+
|
||||
Any
|
||||
a+
|
||||
AllAny
|
||||
a++
|
||||
\R
|
||||
a+
|
||||
\H
|
||||
a++
|
||||
\h
|
||||
a+
|
||||
\V
|
||||
a++
|
||||
\v
|
||||
a++
|
||||
\Z
|
||||
a++
|
||||
\z
|
||||
a++
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\C \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\x0a+
|
||||
\D
|
||||
\x0a++
|
||||
\d
|
||||
\x0a++
|
||||
\S
|
||||
\x0a+
|
||||
\s
|
||||
\x0a+
|
||||
\W
|
||||
\x0a++
|
||||
\w
|
||||
\x0a+
|
||||
Any
|
||||
\x0a+
|
||||
AllAny
|
||||
\x0a+
|
||||
\R
|
||||
\x0a+
|
||||
\H
|
||||
\x0a++
|
||||
\h
|
||||
\x0a++
|
||||
\V
|
||||
\x0a+
|
||||
\v
|
||||
\x0a+
|
||||
\Z
|
||||
\x0a++
|
||||
\z
|
||||
\x0a+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
Any+
|
||||
\D
|
||||
Any+
|
||||
\d
|
||||
Any+
|
||||
\S
|
||||
Any+
|
||||
\s
|
||||
Any+
|
||||
\W
|
||||
Any+
|
||||
\w
|
||||
Any+
|
||||
Any
|
||||
Any+
|
||||
AllAny
|
||||
Any++
|
||||
\R
|
||||
Any+
|
||||
\H
|
||||
Any+
|
||||
\h
|
||||
Any+
|
||||
\V
|
||||
Any+
|
||||
\v
|
||||
Any+
|
||||
\Z
|
||||
Any++
|
||||
\z
|
||||
Any+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
AllAny+
|
||||
\D
|
||||
AllAny+
|
||||
\d
|
||||
AllAny+
|
||||
\S
|
||||
AllAny+
|
||||
\s
|
||||
AllAny+
|
||||
\W
|
||||
AllAny+
|
||||
\w
|
||||
AllAny+
|
||||
AllAny
|
||||
AllAny+
|
||||
AllAny
|
||||
AllAny+
|
||||
\R
|
||||
AllAny+
|
||||
\H
|
||||
AllAny+
|
||||
\h
|
||||
AllAny+
|
||||
\V
|
||||
AllAny+
|
||||
\v
|
||||
AllAny+
|
||||
\Z
|
||||
AllAny++
|
||||
\z
|
||||
AllAny+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \C+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
|
||||
/ \D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\D+
|
||||
|
@ -12588,8 +12520,6 @@ Subject length lower bound = 5
|
|||
\W+
|
||||
/m $
|
||||
\w++
|
||||
/m $
|
||||
AllAny+
|
||||
/m $
|
||||
\R+
|
||||
/m $
|
||||
|
@ -14210,9 +14140,6 @@ Capturing subpattern count = 2
|
|||
May match empty string
|
||||
Subject length lower bound = 0
|
||||
|
||||
/ab\Cde/never_backslash_c
|
||||
Failed: error 183 at offset 3: using \C is disabled by the application
|
||||
|
||||
/abc/
|
||||
\= Expect no match
|
||||
\[9x!xxx(]{9999}
|
||||
|
@ -14532,14 +14459,6 @@ Subject length lower bound = 0
|
|||
\0{ab}
|
||||
0: \x00{ab}
|
||||
|
||||
/ab\Cde/
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
/(?<=ab\Cde)X/
|
||||
abZdeX
|
||||
0: X
|
||||
|
||||
/^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/
|
||||
ababababbbabZXXXX
|
||||
0: ababababbbabZ
|
||||
|
|
|
@ -0,0 +1,89 @@
|
|||
# These are tests of \C that do not involve UTF. They are not run when \C is
|
||||
# disabled by compiling with --enable-never-backslash-C.
|
||||
|
||||
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
AllAny+
|
||||
\D
|
||||
AllAny+
|
||||
\d
|
||||
AllAny+
|
||||
\S
|
||||
AllAny+
|
||||
\s
|
||||
AllAny+
|
||||
\W
|
||||
AllAny+
|
||||
\w
|
||||
AllAny+
|
||||
Any
|
||||
AllAny+
|
||||
\R
|
||||
AllAny+
|
||||
\H
|
||||
AllAny+
|
||||
\h
|
||||
AllAny+
|
||||
\V
|
||||
AllAny+
|
||||
\v
|
||||
AllAny+
|
||||
\Z
|
||||
AllAny++
|
||||
\z
|
||||
AllAny+
|
||||
$
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\D+\C \d+\C \S+\C \s+\C \W+\C \w+\C .+\C \R+\C \H+\C \h+\C \V+\C \v+\C a+\C \n+\C \C+\C/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\D+
|
||||
AllAny
|
||||
\d+
|
||||
AllAny
|
||||
\S+
|
||||
AllAny
|
||||
\s+
|
||||
AllAny
|
||||
\W+
|
||||
AllAny
|
||||
\w+
|
||||
AllAny
|
||||
Any+
|
||||
AllAny
|
||||
\R+
|
||||
AllAny
|
||||
\H+
|
||||
AllAny
|
||||
\h+
|
||||
AllAny
|
||||
\V+
|
||||
AllAny
|
||||
\v+
|
||||
AllAny
|
||||
a+
|
||||
AllAny
|
||||
\x0a+
|
||||
AllAny
|
||||
AllAny+
|
||||
AllAny
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/ab\Cde/never_backslash_c
|
||||
Failed: error 183 at offset 3: using \C is disabled by the application
|
||||
|
||||
/ab\Cde/
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
/(?<=ab\Cde)X/
|
||||
abZdeX
|
||||
0: X
|
||||
|
||||
# End of testinput21
|
|
@ -0,0 +1,161 @@
|
|||
# Tests of \C when Unicode support is available. Note that \C is not supported
|
||||
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
|
||||
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
|
||||
# in some widths and not in others.
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# This should produce an error diagnostic (\C in UTF lookbehind)
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||
|
||||
# Autopossessification tests
|
||||
|
||||
/\C+\X \X+\C/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
AllAny+
|
||||
extuni
|
||||
extuni+
|
||||
AllAny
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C+\X \X+\C/Bx,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
Anybyte+
|
||||
extuni
|
||||
extuni+
|
||||
Anybyte
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C\X*TӅ;
|
||||
{0,6}\v+
F
|
||||
/utf
|
||||
\= Expect no match
|
||||
Ӆ\x0a
|
||||
No match
|
||||
|
||||
/\C(\W?ſ)'?{{/utf
|
||||
\= Expect no match
|
||||
\\C(\\W?ſ)'?{{
|
||||
No match
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{1234}
|
||||
No match
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
1: \x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}Y
|
||||
1: \x{11234}Y
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{1234}YZ
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
1: \x{11234}YZ
|
||||
X\x{11234}YZW
|
||||
0: X\x{11234}YZ
|
||||
1: \x{11234}YZ
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
0: XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
0: X
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
0: Xabcde
|
||||
X\x{1234}
|
||||
No match
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
No match
|
||||
X\x{1234}\x{512}YZ
|
||||
0: X\x{1234}\x{512}YZ
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
X\x{11234}\x{512}
|
||||
0: X\x{11234}\x{512}
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}YZ
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
0: X\x{11234}\x{512}\x{11234}
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
0: Xabc
|
||||
X\x{1234}
|
||||
No match
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
No match
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}
|
||||
X\x{11234}
|
||||
No match
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x{0a}b
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
No match
|
||||
a\x{12257}b
|
||||
0: a\x{12257}b
|
||||
a\x{12257}\x{11234}b
|
||||
No match
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# This one is here not because it's different to Perl, but because the way
|
||||
# the captured single code unit is displayed. (In Perl it becomes a character,
|
||||
# and you can't tell the difference.)
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{1234}
|
||||
2:
|
||||
X\nabc
|
||||
0: X\x{0a}abc
|
||||
1: \x{0a}
|
||||
2: abc
|
||||
|
||||
# This one is here because Perl gives out a grumbly error message (quite
|
||||
# correctly, but that messes up comparisons).
|
||||
|
||||
/a\Cb/utf
|
||||
\= Expect no match in 8-bit mode
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
# End of testinput22
|
|
@ -0,0 +1,159 @@
|
|||
# Tests of \C when Unicode support is available. Note that \C is not supported
|
||||
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
|
||||
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
|
||||
# in some widths and not in others.
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# This should produce an error diagnostic (\C in UTF lookbehind)
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||
|
||||
# Autopossessification tests
|
||||
|
||||
/\C+\X \X+\C/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
AllAny+
|
||||
extuni
|
||||
extuni+
|
||||
AllAny
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C+\X \X+\C/Bx,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
Anybyte+
|
||||
extuni
|
||||
extuni+
|
||||
Anybyte
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C\X*TӅ;
|
||||
{0,6}\v+
F
|
||||
/utf
|
||||
\= Expect no match
|
||||
Ӆ\x0a
|
||||
No match
|
||||
|
||||
/\C(\W?ſ)'?{{/utf
|
||||
\= Expect no match
|
||||
\\C(\\W?ſ)'?{{
|
||||
No match
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{1234}
|
||||
No match
|
||||
X\x{11234}Y
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
1: \x{11234}YZ
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{1234}YZ
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
No match
|
||||
X\x{11234}YZW
|
||||
0: X\x{11234}YZW
|
||||
1: \x{11234}YZW
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
0: XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
0: X
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
0: Xabcde
|
||||
X\x{1234}
|
||||
No match
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
No match
|
||||
X\x{1234}\x{512}YZ
|
||||
0: X\x{1234}\x{512}YZ
|
||||
X\x{11234}Y
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
X\x{11234}\x{512}
|
||||
No match
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}YZ
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
0: X\x{11234}\x{512}\x{11234}Z
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
0: Xabc
|
||||
X\x{1234}
|
||||
No match
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
No match
|
||||
X\x{11234}Y
|
||||
No match
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}YZ
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{512}Y
|
||||
X\x{11234}
|
||||
No match
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x{0a}b
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
No match
|
||||
a\x{12257}b
|
||||
No match
|
||||
a\x{12257}\x{11234}b
|
||||
0: a\x{12257}\x{11234}b
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# This one is here not because it's different to Perl, but because the way
|
||||
# the captured single code unit is displayed. (In Perl it becomes a character,
|
||||
# and you can't tell the difference.)
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{1234}
|
||||
2:
|
||||
X\nabc
|
||||
0: X\x{0a}abc
|
||||
1: \x{0a}
|
||||
2: abc
|
||||
|
||||
# This one is here because Perl gives out a grumbly error message (quite
|
||||
# correctly, but that messes up comparisons).
|
||||
|
||||
/a\Cb/utf
|
||||
\= Expect no match in 8-bit mode
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
|
||||
# End of testinput22
|
|
@ -0,0 +1,163 @@
|
|||
# Tests of \C when Unicode support is available. Note that \C is not supported
|
||||
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
|
||||
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
|
||||
# in some widths and not in others.
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# This should produce an error diagnostic (\C in UTF lookbehind)
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||
|
||||
# Autopossessification tests
|
||||
|
||||
/\C+\X \X+\C/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
AllAny+
|
||||
extuni
|
||||
extuni+
|
||||
AllAny
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C+\X \X+\C/Bx,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
Anybyte+
|
||||
extuni
|
||||
extuni+
|
||||
Anybyte
|
||||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\C\X*TӅ;
|
||||
{0,6}\v+
F
|
||||
/utf
|
||||
\= Expect no match
|
||||
Ӆ\x0a
|
||||
No match
|
||||
|
||||
/\C(\W?ſ)'?{{/utf
|
||||
\= Expect no match
|
||||
\\C(\\W?ſ)'?{{
|
||||
No match
|
||||
|
||||
/X(\C{3})/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{1234}
|
||||
X\x{11234}Y
|
||||
0: X\x{f0}\x{91}\x{88}
|
||||
1: \x{f0}\x{91}\x{88}
|
||||
X\x{11234}YZ
|
||||
0: X\x{f0}\x{91}\x{88}
|
||||
1: \x{f0}\x{91}\x{88}
|
||||
|
||||
/X(\C{4})/utf
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}Y
|
||||
1: \x{1234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}
|
||||
1: \x{11234}
|
||||
X\x{11234}YZW
|
||||
0: X\x{11234}
|
||||
1: \x{11234}
|
||||
|
||||
/X\C*/utf
|
||||
XYZabcdce
|
||||
0: XYZabcdce
|
||||
|
||||
/X\C*?/utf
|
||||
XYZabcde
|
||||
0: X
|
||||
|
||||
/X\C{3,5}/utf
|
||||
Xabcdefg
|
||||
0: Xabcde
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}YZ
|
||||
X\x{1234}\x{512}
|
||||
0: X\x{1234}\x{512}
|
||||
X\x{1234}\x{512}YZ
|
||||
0: X\x{1234}\x{512}
|
||||
X\x{11234}Y
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}YZ
|
||||
0: X\x{11234}Y
|
||||
X\x{11234}\x{512}
|
||||
0: X\x{11234}\x{d4}
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{11234}\x{d4}
|
||||
X\x{11234}\x{512}\x{11234}Z
|
||||
0: X\x{11234}\x{d4}
|
||||
|
||||
/X\C{3,5}?/utf
|
||||
Xabcdefg
|
||||
0: Xabc
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
X\x{1234}YZ
|
||||
0: X\x{1234}
|
||||
X\x{1234}\x{512}
|
||||
0: X\x{1234}
|
||||
X\x{11234}Y
|
||||
0: X\x{f0}\x{91}\x{88}
|
||||
X\x{11234}YZ
|
||||
0: X\x{f0}\x{91}\x{88}
|
||||
X\x{11234}\x{512}YZ
|
||||
0: X\x{f0}\x{91}\x{88}
|
||||
X\x{11234}
|
||||
0: X\x{f0}\x{91}\x{88}
|
||||
|
||||
/a\Cb/utf
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x{0a}b
|
||||
a\x{100}b
|
||||
No match
|
||||
|
||||
/a\C\Cb/utf
|
||||
a\x{100}b
|
||||
0: a\x{100}b
|
||||
a\x{12257}b
|
||||
No match
|
||||
a\x{12257}\x{11234}b
|
||||
No match
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
# This one is here not because it's different to Perl, but because the way
|
||||
# the captured single code unit is displayed. (In Perl it becomes a character,
|
||||
# and you can't tell the difference.)
|
||||
|
||||
/X(\C)(.*)/utf
|
||||
X\x{1234}
|
||||
0: X\x{1234}
|
||||
1: \x{e1}
|
||||
2: \x{88}\x{b4}
|
||||
X\nabc
|
||||
0: X\x{0a}abc
|
||||
1: \x{0a}
|
||||
2: abc
|
||||
|
||||
# This one is here because Perl gives out a grumbly error message (quite
|
||||
# correctly, but that messes up comparisons).
|
||||
|
||||
/a\Cb/utf
|
||||
\= Expect no match in 8-bit mode
|
||||
a\x{100}b
|
||||
No match
|
||||
|
||||
# End of testinput22
|
|
@ -0,0 +1,8 @@
|
|||
# This test is run when PCRE2 has been built with --enable-never-backslash-C,
|
||||
# which disables the use of \C. All we can do is check that it gives the
|
||||
# correct error message.
|
||||
|
||||
/a\Cb/
|
||||
Failed: error 185 at offset 2: using \C is disabled in this PCRE2 library
|
||||
|
||||
# End of testinput23
|
|
@ -181,10 +181,6 @@ Subject length lower bound = 3
|
|||
\x{212ab}\x{212ab}\x{212ab}\x{861}
|
||||
0: \x{212ab}\x{212ab}\x{212ab}
|
||||
|
||||
/(?<=\C)X/utf
|
||||
Failed: error 136 at offset 6: \C is not allowed in a lookbehind assertion
|
||||
Should produce an error diagnostic
|
||||
|
||||
/^[ab]/IB,utf
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
|
@ -2905,9 +2901,6 @@ No match
|
|||
aAz
|
||||
No match
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||
|
||||
/\X/
|
||||
a\=ps
|
||||
0: a
|
||||
|
@ -3803,7 +3796,7 @@ No match
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \C+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
|
||||
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
\D+
|
||||
|
@ -3818,8 +3811,6 @@ No match
|
|||
extuni
|
||||
\w+
|
||||
extuni
|
||||
AllAny+
|
||||
extuni
|
||||
\R+
|
||||
extuni
|
||||
\H+
|
||||
|
@ -3858,7 +3849,7 @@ No match
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\C \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
|
||||
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
extuni+
|
||||
|
@ -3876,8 +3867,6 @@ No match
|
|||
extuni+
|
||||
Any
|
||||
extuni+
|
||||
AllAny
|
||||
extuni+
|
||||
\R
|
||||
extuni+
|
||||
\H
|
||||
|
@ -4010,18 +3999,6 @@ Failed: error 122 at offset 1227: unmatched closing parenthesis
|
|||
"(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
|
||||
Failed: error 124 at offset 113: letter or underscore expected after (?< or (?'
|
||||
|
||||
/\C\X*TӅ;
|
||||
{0,6}\v+
F
|
||||
/utf
|
||||
\= Expect no match
|
||||
Ӆ\x0a
|
||||
No match
|
||||
|
||||
/\C(\W?ſ)'?{{/utf
|
||||
\= Expect no match
|
||||
\\C(\\W?ſ)'?{{
|
||||
No match
|
||||
|
||||
/[\pS#moq]/
|
||||
=
|
||||
0: =
|
||||
|
|
|
@ -7174,14 +7174,6 @@ Matched, but offsets vector is too small to show all matches
|
|||
2: aa
|
||||
3: a
|
||||
|
||||
/ab\Cde/
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
/(?<=ab\Cde)X/
|
||||
abZdeX
|
||||
0: X
|
||||
|
||||
/^\R/
|
||||
\r\=ps
|
||||
0: \x0d
|
||||
|
|
|
@ -1141,13 +1141,6 @@ Partial match: abcde
|
|||
the cat\=ph
|
||||
Partial match: the cat
|
||||
|
||||
/ab\Cde/utf
|
||||
abXde
|
||||
Failed: error -42: pattern contains an item that is not supported for DFA matching
|
||||
|
||||
/(?<=ab\Cde)X/utf
|
||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||
|
||||
/./newline=crlf,utf
|
||||
\r\=ps
|
||||
0: \x{0d}
|
||||
|
|
|
@ -4,12 +4,8 @@
|
|||
#forbid_utf
|
||||
#newline_default lf any anycrlf
|
||||
|
||||
/a\Cb/
|
||||
aXb
|
||||
0: aXb
|
||||
a\nb
|
||||
0: a\x0ab
|
||||
\= Expect no match and error message (too big char)
|
||||
/ab/
|
||||
\= Expect error message (too big char) and no match
|
||||
A\x{123}B
|
||||
** Character \x{123} is greater than 255 and UTF-8 mode is not enabled.
|
||||
** Truncation will probably give the wrong result.
|
||||
|
|
Loading…
Reference in New Issue