Implement --never-backslash-C

This commit is contained in:
Philip.Hazel 2015-10-17 13:50:56 +00:00
parent 5923caf05e
commit 3263d44b97
58 changed files with 2060 additions and 1479 deletions

View File

@ -70,6 +70,7 @@
# 2015-04-24 PH added support for PCRE2_DEBUG # 2015-04-24 PH added support for PCRE2_DEBUG
# 2015-07-16 PH updated for new pcre2_find_bracket source module # 2015-07-16 PH updated for new pcre2_find_bracket source module
# 2015-08-24 PH correct C_FLAGS setting (patch from Roy Ivy III) # 2015-08-24 PH correct C_FLAGS setting (patch from Roy Ivy III)
# 2015-10-16 PH added support for never-backslash-C
PROJECT(PCRE2 C) PROJECT(PCRE2 C)
@ -162,6 +163,9 @@ SET(PCRE2_SUPPORT_UNICODE ON CACHE BOOL
SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
"ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks") "ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks")
# User-visible build option controlling the \C escape. It defaults to OFF;
# per its help text, turning it ON locks out backslash-C (upper case C).
SET(PCRE2_NEVER_BACKSLASH_C OFF CACHE BOOL
"If ON, backslash-C (upper case C) is locked out.")
SET(PCRE2_SUPPORT_VALGRIND OFF CACHE BOOL SET(PCRE2_SUPPORT_VALGRIND OFF CACHE BOOL
"Enable Valgrind support.") "Enable Valgrind support.")
@ -252,6 +256,10 @@ IF(PCRE2_SUPPORT_BSR_ANYCRLF)
SET(BSR_ANYCRLF 1) SET(BSR_ANYCRLF 1)
ENDIF(PCRE2_SUPPORT_BSR_ANYCRLF) ENDIF(PCRE2_SUPPORT_BSR_ANYCRLF)
# Map the PCRE2_NEVER_BACKSLASH_C cache option onto the plain variable
# NEVER_BACKSLASH_C (set to 1 only when the option is ON).
# NOTE(review): presumably picked up by the "#cmakedefine NEVER_BACKSLASH_C 1"
# line of the config header template - confirm against the configure_file call.
IF(PCRE2_NEVER_BACKSLASH_C)
SET(NEVER_BACKSLASH_C 1)
ENDIF(PCRE2_NEVER_BACKSLASH_C)
IF(PCRE2_SUPPORT_UNICODE) IF(PCRE2_SUPPORT_UNICODE)
SET(SUPPORT_UNICODE 1) SET(SUPPORT_UNICODE 1)
ENDIF(PCRE2_SUPPORT_UNICODE) ENDIF(PCRE2_SUPPORT_UNICODE)
@ -719,6 +727,7 @@ IF(PCRE2_SHOW_REPORT)
MESSAGE(STATUS " Enable Unicode support .......... : ${PCRE2_SUPPORT_UNICODE}") MESSAGE(STATUS " Enable Unicode support .......... : ${PCRE2_SUPPORT_UNICODE}")
MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE2_NEWLINE}") MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE2_NEWLINE}")
MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE2_SUPPORT_BSR_ANYCRLF}") MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE2_SUPPORT_BSR_ANYCRLF}")
MESSAGE(STATUS " \\C is disabled .................. : ${PCRE2_NEVER_BACKSLASH_C}")
MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE2_EBCDIC}") MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE2_EBCDIC}")
MESSAGE(STATUS " EBCDIC coding with NL=0x25 ...... : ${PCRE2_EBCDIC_NL25}") MESSAGE(STATUS " EBCDIC coding with NL=0x25 ...... : ${PCRE2_EBCDIC_NL25}")
MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE2_REBUILD_CHARTABLES}") MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE2_REBUILD_CHARTABLES}")

View File

@ -201,6 +201,8 @@ escape was being ignored.
57. Fixed integer overflow for patterns whose minimum matching length is very, 57. Fixed integer overflow for patterns whose minimum matching length is very,
very large. very large.
58. Implemented --never-backslash-C.
Version 10.20 30-June-2015 Version 10.20 30-June-2015
-------------------------- --------------------------

9
README
View File

@ -219,6 +219,13 @@ library. They are also documented in the pcre2build man page.
to be the end of a line (see above). However, the caller of PCRE2 can to be the end of a line (see above). However, the caller of PCRE2 can
restrict \R to match only CR, LF, or CRLF. You can make this the default by restrict \R to match only CR, LF, or CRLF. You can make this the default by
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R"). adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
. In a pattern, the escape sequence \C matches a single code unit, even in a
UTF mode. This can be dangerous because it breaks up multi-code-unit
characters. You can build PCRE2 with the use of \C permanently locked out by
adding --enable-never-backslash-C (note the upper case C) to the "configure"
command. When \C is allowed by the library, individual applications can lock
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
. PCRE2 has a counter that limits the depth of nesting of parentheses in a . PCRE2 has a counter that limits the depth of nesting of parentheses in a
pattern. This limits the amount of system stack that a pattern uses when it pattern. This limits the amount of system stack that a pattern uses when it
@ -833,4 +840,4 @@ The distribution should contain the files listed below.
Philip Hazel Philip Hazel
Email local part: ph10 Email local part: ph10
Email domain: cam.ac.uk Email domain: cam.ac.uk
Last updated: 16 July 2015 Last updated: 16 October 2015

64
RunTest
View File

@ -75,7 +75,10 @@ title17="Test 17: JIT-specific features when JIT is available"
title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP" title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP"
title19="Test 19: Tests of the POSIX interface with UTF/UCP" title19="Test 19: Tests of the POSIX interface with UTF/UCP"
title20="Test 20: Serialization tests" title20="Test 20: Serialization tests"
maxtest=20 title21="Test 21: \C tests without UTF (supported for DFA matching)"
title22="Test 22: \C tests with UTF (not supported for DFA matching)"
title23="Test 23: \C disabled test"
maxtest=23
if [ $# -eq 1 -a "$1" = "list" ]; then if [ $# -eq 1 -a "$1" = "list" ]; then
echo $title0 echo $title0
@ -99,6 +102,9 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
echo $title18 echo $title18
echo $title19 echo $title19
echo $title20 echo $title20
echo $title21
echo $title22
echo $title23
exit 0 exit 0
fi fi
@ -223,6 +229,9 @@ do17=no
do18=no do18=no
do19=no do19=no
do20=no do20=no
do21=no
do22=no
do23=no
while [ $# -gt 0 ] ; do while [ $# -gt 0 ] ; do
case $1 in case $1 in
@ -247,6 +256,9 @@ while [ $# -gt 0 ] ; do
18) do18=yes;; 18) do18=yes;;
19) do19=yes;; 19) do19=yes;;
20) do20=yes;; 20) do20=yes;;
21) do21=yes;;
22) do22=yes;;
23) do23=yes;;
-8) arg8=yes;; -8) arg8=yes;;
-16) arg16=yes;; -16) arg16=yes;;
-32) arg32=yes;; -32) arg32=yes;;
@ -326,6 +338,11 @@ support16=$?
$sim ./pcre2test -C pcre2-32 >/dev/null $sim ./pcre2test -C pcre2-32 >/dev/null
support32=$? support32=$?
# \C may be disabled
$sim ./pcre2test -C backslash-C >/dev/null
supportBSC=$?
# Initialize all bitsizes skipped # Initialize all bitsizes skipped
test8=skip test8=skip
@ -400,7 +417,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
$do8 = no -a $do9 = no -a $do10 = no -a $do11 = no -a \ $do8 = no -a $do9 = no -a $do10 = no -a $do11 = no -a \
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \ $do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \ $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
$do20 = no \ $do20 = no -a $do21 = no -a $do22 = no -a $do23 = no \
]; then ]; then
do0=yes do0=yes
do1=yes do1=yes
@ -423,6 +440,9 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
do18=yes do18=yes
do19=yes do19=yes
do20=yes do20=yes
do21=yes
do22=yes
do23=yes
fi fi
# Handle any explicit skips at this stage, so that an argument list may consist # Handle any explicit skips at this stage, so that an argument list may consist
@ -780,6 +800,46 @@ for bmode in "$test8" "$test16" "$test32"; do
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput20 testtry $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput20 testtry
checkresult $? 20 "" checkresult $? 20 ""
fi fi
# \C tests without UTF - DFA matching is supported
if [ "$do21" = yes ] ; then
echo $title21
if [ $supportBSC -eq 0 ] ; then
echo " Skipped because \C is disabled"
else
for opt in "" $jitopt -dfa; do
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput21 testtry
checkresult $? 21 "$opt"
done
fi
fi
# \C tests with UTF - DFA matching is not supported for \C in UTF mode
if [ "$do22" = yes ] ; then
echo $title22
if [ $supportBSC -eq 0 ] ; then
echo " Skipped because \C is disabled"
else
for opt in "" $jitopt; do
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput22 testtry
checkresult $? 22-$bits "$opt"
done
fi
fi
# Test when \C is disabled
#
# This test is only meaningful for a build in which \C is locked out, so it
# is skipped when supportBSC is non-zero. It runs pcre2test exactly once, with
# no extra matching option: unlike tests 21 and 22 there is no "for opt in ..."
# loop here, so $opt must not be referenced (it would still hold the value left
# behind by whichever earlier test loop ran last, while checkresult is told
# that the option is empty).
if [ "$do23" = yes ] ; then
  echo $title23
  if [ $supportBSC -ne 0 ] ; then
    echo " Skipped because \C is not disabled"
  else
    $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput23 testtry
    checkresult $? 23 ""
  fi
fi
# End of loop for 8/16/32-bit tests # End of loop for 8/16/32-bit tests
done done

View File

@ -13,11 +13,10 @@
@rem line. Added argument validation and added error reporting. @rem line. Added argument validation and added error reporting.
@rem @rem
@rem Sheri Pierce added logic to skip feature dependent tests @rem Sheri Pierce added logic to skip feature dependent tests
@rem tests 4 5 9 15 and 18 require utf support @rem tests 4 5 7 10 12 14 19 and 22 require Unicode support
@rem tests 6 7 10 16 and 19 require ucp support @rem 8 requires Unicode and link size 2
@rem 11 requires ucp and link size 2 @rem 16 requires absence of jit support
@rem 12 requires presence of jit support @rem 17 requires presence of jit support
@rem 13 requires absence of jit support
@rem Sheri P also added override tests for study and jit testing @rem Sheri P also added override tests for study and jit testing
@rem Zoltan Herczeg added libpcre16 support @rem Zoltan Herczeg added libpcre16 support
@rem Zoltan Herczeg added libpcre32 support @rem Zoltan Herczeg added libpcre32 support
@ -25,6 +24,7 @@
@rem @rem
@rem The file was converted for PCRE2 by PH, February 2015. @rem The file was converted for PCRE2 by PH, February 2015.
@rem Updated for new test 14 (moving others up a number), August 2015. @rem Updated for new test 14 (moving others up a number), August 2015.
@rem Tidied and updated for new tests 21, 22, 23 by PH, October 2015.
setlocal enabledelayedexpansion setlocal enabledelayedexpansion
@ -65,6 +65,8 @@ set support32=%ERRORLEVEL%
set unicode=%ERRORLEVEL% set unicode=%ERRORLEVEL%
%pcre2test% -C jit >NUL %pcre2test% -C jit >NUL
set jit=%ERRORLEVEL% set jit=%ERRORLEVEL%
%pcre2test% -C backslash-C >NUL
set supportBSC=%ERRORLEVEL%
if %support8% EQU 1 ( if %support8% EQU 1 (
if not exist testout8 md testout8 if not exist testout8 md testout8
@ -101,18 +103,21 @@ set do17=no
set do18=no set do18=no
set do19=no set do19=no
set do20=no set do20=no
set do21=no
set do22=no
set do23=no
set all=yes set all=yes
for %%a in (%*) do ( for %%a in (%*) do (
set valid=no set valid=no
for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20) do if %%v == %%a set valid=yes for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23) do if %%v == %%a set valid=yes
if "!valid!" == "yes" ( if "!valid!" == "yes" (
set do%%a=yes set do%%a=yes
set all=no set all=no
) else ( ) else (
echo Invalid test number - %%a! echo Invalid test number - %%a!
echo Usage %0 [ test_number ] ... echo Usage %0 [ test_number ] ...
echo Where test_number is one or more optional test numbers 1 through 20, default is all tests. echo Where test_number is one or more optional test numbers 1 through 23, default is all tests.
exit /b 1 exit /b 1
) )
) )
@ -139,6 +144,9 @@ if "%all%" == "yes" (
set do18=yes set do18=yes
set do19=yes set do19=yes
set do20=yes set do20=yes
set do21=yes
set do22=yes
set do23=yes
) )
@echo RunTest.bat's pcre2test output is written to newly created subfolders @echo RunTest.bat's pcre2test output is written to newly created subfolders
@ -187,6 +195,9 @@ if "%do17%" == "yes" call :do17
if "%do18%" == "yes" call :do18 if "%do18%" == "yes" call :do18
if "%do19%" == "yes" call :do19 if "%do19%" == "yes" call :do19
if "%do20%" == "yes" call :do20 if "%do20%" == "yes" call :do20
if "%do21%" == "yes" call :do21
if "%do22%" == "yes" call :do22
if "%do23%" == "yes" call :do23
:modeSkip :modeSkip
if "%mode%" == "" ( if "%mode%" == "" (
set mode=-16 set mode=-16
@ -323,7 +334,7 @@ if %unicode% EQU 0 (
goto :eof goto :eof
:do6 :do6
call :runsub 6 testout "DFA matching main non-UTF, non-UCP functionality" -q -dfa call :runsub 6 testout "DFA matching main non-UTF, non-UCP functionality" -q
goto :eof goto :eof
:do7 :do7
@ -331,7 +342,7 @@ if %unicode% EQU 0 (
echo Test 7 Skipped due to absence of Unicode support. echo Test 7 Skipped due to absence of Unicode support.
goto :eof goto :eof
) )
call :runsub 7 testout "DFA matching with UTF-%bits% and Unicode property support" -q -dfa call :runsub 7 testout "DFA matching with UTF-%bits% and Unicode property support" -q
goto :eof goto :eof
:do8 :do8
@ -395,12 +406,16 @@ if %bits% EQU 8 (
echo Test 13 Skipped when running 8-bit tests. echo Test 13 Skipped when running 8-bit tests.
goto :eof goto :eof
) )
call :runsub 13 testout "DFA specials for the basic 16/32-bit library" -q -dfa call :runsub 13 testout "DFA specials for the basic 16/32-bit library" -q
goto :eof goto :eof
:do14 :do14
call :runsub 14 testout "DFA specials for UTF and UCP support" -q if %unicode% EQU 0 (
goto :eof echo Test 14 Skipped due to absence of Unicode support.
goto :eof
)
call :runsub 14 testout "DFA specials for UTF and UCP support" -q
goto :eof
:do15 :do15
call :runsub 15 testout "Non-JIT limits and other non_JIT tests" -q call :runsub 15 testout "Non-JIT limits and other non_JIT tests" -q
@ -442,6 +457,10 @@ if %bits% EQU 16 (
if %bits% EQU 32 ( if %bits% EQU 32 (
echo Test 19 Skipped when running 32-bit tests. echo Test 19 Skipped when running 32-bit tests.
goto :eof goto :eof
)
if %unicode% EQU 0 (
echo Test 19 Skipped due to absence of Unicode support.
goto :eof
) )
call :runsub 19 testout "POSIX interface with UTF-8 and UCP" -q call :runsub 19 testout "POSIX interface with UTF-8 and UCP" -q
goto :eof goto :eof
@ -450,6 +469,37 @@ goto :eof
call :runsub 20 testout "Serialization tests" -q call :runsub 20 testout "Serialization tests" -q
goto :eof goto :eof
:do21
if %supportBSC% EQU 0 (
echo Test 21 Skipped due to absence of backslash-C support.
goto :eof
)
call :runsub 21 testout "Backslash-C tests without UTF" -q
call :runsub 21 testout "Backslash-C tests without UTF (DFA)" -q -dfa
if %jit% EQU 1 call :runsub 21 testoutjit "Test with JIT Override" -q -jit
goto :eof
:do22
if %supportBSC% EQU 0 (
echo Test 22 Skipped due to absence of backslash-C support.
goto :eof
)
if %unicode% EQU 0 (
echo Test 22 Skipped due to absence of Unicode support.
goto :eof
)
call :runsub 22 testout "Backslash-C tests with UTF" -q
if %jit% EQU 1 call :runsub 22 testoutjit "Test with JIT Override" -q -jit
goto :eof
:do23
if %supportBSC% EQU 1 (
echo Test 23 Skipped due to presence of backslash-C support.
goto :eof
)
call :runsub 23 testout "Backslash-C disabled test" -q
goto :eof
:conferror :conferror
@echo. @echo.
@echo Either your build is incomplete or you have a configuration error. @echo Either your build is incomplete or you have a configuration error.

View File

@ -33,6 +33,7 @@
#cmakedefine EBCDIC 1 #cmakedefine EBCDIC 1
#cmakedefine EBCDIC_NL25 1 #cmakedefine EBCDIC_NL25 1
#cmakedefine HEAP_MATCH_RECURSE 1 #cmakedefine HEAP_MATCH_RECURSE 1
#cmakedefine NEVER_BACKSLASH_C 1
#define LINK_SIZE @PCRE2_LINK_SIZE@ #define LINK_SIZE @PCRE2_LINK_SIZE@
#define MATCH_LIMIT @PCRE2_MATCH_LIMIT@ #define MATCH_LIMIT @PCRE2_MATCH_LIMIT@

View File

@ -189,6 +189,12 @@ AC_ARG_ENABLE(bsr-anycrlf,
AS_HELP_STRING([--enable-bsr-anycrlf], AS_HELP_STRING([--enable-bsr-anycrlf],
[\R matches only CR, LF, CRLF by default]), [\R matches only CR, LF, CRLF by default]),
, enable_bsr_anycrlf=no) , enable_bsr_anycrlf=no)
# Handle --enable-never-backslash-C
AC_ARG_ENABLE(never-backslash-C,
AS_HELP_STRING([--enable-never-backslash-C],
[use of \C causes an error]),
, enable_never_backslash_C=no)
# Handle --enable-ebcdic # Handle --enable-ebcdic
AC_ARG_ENABLE(ebcdic, AC_ARG_ENABLE(ebcdic,
@ -604,6 +610,11 @@ if test "$enable_bsr_anycrlf" = "yes"; then
The build-time default can be overridden by the user of PCRE2 at runtime.]) The build-time default can be overridden by the user of PCRE2 at runtime.])
fi fi
if test "$enable_never_backslash_C" = "yes"; then
AC_DEFINE([NEVER_BACKSLASH_C], [], [
Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns.])
fi
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [ AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
The value of LINK_SIZE determines the number of bytes used to store The value of LINK_SIZE determines the number of bytes used to store
links as offsets within the compiled regex. The default is 2, which links as offsets within the compiled regex. The default is 2, which
@ -881,6 +892,7 @@ $PACKAGE-$VERSION configuration summary:
Enable Unicode support .......... : ${enable_unicode} Enable Unicode support .......... : ${enable_unicode}
Newline char/sequence ........... : ${enable_newline} Newline char/sequence ........... : ${enable_newline}
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf} \R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
\C is disabled .................. : ${enable_never_backslash_C}
EBCDIC coding ................... : ${enable_ebcdic} EBCDIC coding ................... : ${enable_ebcdic}
EBCDIC code for NL .............. : ${ebcdic_nl_code} EBCDIC code for NL .............. : ${ebcdic_nl_code}
Rebuild char tables ............. : ${enable_rebuild_chartables} Rebuild char tables ............. : ${enable_rebuild_chartables}

View File

@ -219,6 +219,13 @@ library. They are also documented in the pcre2build man page.
to be the end of a line (see above). However, the caller of PCRE2 can to be the end of a line (see above). However, the caller of PCRE2 can
restrict \R to match only CR, LF, or CRLF. You can make this the default by restrict \R to match only CR, LF, or CRLF. You can make this the default by
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R"). adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
. In a pattern, the escape sequence \C matches a single code unit, even in a
UTF mode. This can be dangerous because it breaks up multi-code-unit
characters. You can build PCRE2 with the use of \C permanently locked out by
adding --enable-never-backslash-C (note the upper case C) to the "configure"
command. When \C is allowed by the library, individual applications can lock
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
. PCRE2 has a counter that limits the depth of nesting of parentheses in a . PCRE2 has a counter that limits the depth of nesting of parentheses in a
pattern. This limits the amount of system stack that a pattern uses when it pattern. This limits the amount of system stack that a pattern uses when it
@ -833,4 +840,4 @@ The distribution should contain the files listed below.
Philip Hazel Philip Hazel
Email local part: ph10 Email local part: ph10
Email domain: cam.ac.uk Email domain: cam.ac.uk
Last updated: 16 July 2015 Last updated: 16 October 2015

View File

@ -126,8 +126,10 @@ running redundant checks.
<P> <P>
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to
problems, because it may leave the current matching point in the middle of a problems, because it may leave the current matching point in the middle of a
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
lock out the use of \C, causing a compile-time error if it is encountered. application to lock out the use of \C, causing a compile-time error if it is
encountered. It is also possible to build PCRE2 with the use of \C permanently
disabled.
</P> </P>
<P> <P>
Another way that performance can be hit is by running a pattern that has a very Another way that performance can be hit is by running a pattern that has a very
@ -187,7 +189,7 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
</P> </P>
<br><a name="SEC5" href="#TOC1">REVISION</a><br> <br><a name="SEC5" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 13 April 2015 Last updated: 16 October 2015
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2015 University of Cambridge.
<br> <br>

View File

@ -59,20 +59,22 @@ units, not characters, as is the contents of the variable pointed at by
<i>outlengthptr</i>, which is updated to the actual length of the new string. <i>outlengthptr</i>, which is updated to the actual length of the new string.
The options are: The options are:
<pre> <pre>
PCRE2_ANCHORED Match only at the first position PCRE2_ANCHORED Match only at the first position
PCRE2_NOTBOL Subject string is not the beginning of a line PCRE2_NOTBOL Subject is not the beginning of a line
PCRE2_NOTEOL Subject string is not the end of a line PCRE2_NOTEOL Subject is not the end of a line
PCRE2_NOTEMPTY An empty string is not a valid match PCRE2_NOTEMPTY An empty string is not a valid match
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject PCRE2_NOTEMPTY_ATSTART An empty string at the start of the
is not a valid match subject is not a valid match
PCRE2_NO_UTF_CHECK Do not check the subject or replacement for PCRE2_NO_UTF_CHECK Do not check the subject or replacement
UTF validity (only relevant if PCRE2_UTF for UTF validity (only relevant if
was set at compile time) PCRE2_UTF was set at compile time)
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
</pre> </pre>
The function returns the number of substitutions, which may be zero if there The function returns the number of substitutions, which may be zero if there
were no matches. The result can be greater than one only when were no matches. The result can be greater than one only when
PCRE2_SUBSTITUTE_GLOBAL is set. PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
is returned.
</P> </P>
<P> <P>
There is a complete description of the PCRE2 native API in the There is a complete description of the PCRE2 native API in the

View File

@ -1197,7 +1197,7 @@ built.
</pre> </pre>
If this option is set, an unanchored pattern is required to match before or at If this option is set, an unanchored pattern is required to match before or at
the first newline in the subject string, though the matched text may continue the first newline in the subject string, though the matched text may continue
over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a more over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a more
general limiting facility. general limiting facility.
<pre> <pre>
PCRE2_MATCH_UNSET_BACKREF PCRE2_MATCH_UNSET_BACKREF
@ -1237,7 +1237,8 @@ This option locks out the use of \C in the pattern that is being compiled.
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
it may leave the current matching point in the middle of a multi-code-unit it may leave the current matching point in the middle of a multi-code-unit
character. This option may be useful in applications that process patterns from character. This option may be useful in applications that process patterns from
external sources. external sources. Note that there is also a build-time option that permanently
locks out the use of \C.
<pre> <pre>
PCRE2_NEVER_UCP PCRE2_NEVER_UCP
</pre> </pre>
@ -1383,8 +1384,8 @@ with Perl. It can also be set by a (?U) option setting within the pattern.
<pre> <pre>
PCRE2_USE_OFFSET_LIMIT PCRE2_USE_OFFSET_LIMIT
</pre> </pre>
This option must be set for <b>pcre2_compile()</b> if This option must be set for <b>pcre2_compile()</b> if
<b>pcre2_set_offset_limit()</b> is going to be used to set a non-default offset <b>pcre2_set_offset_limit()</b> is going to be used to set a non-default offset
limit in a match context for matches that use this pattern. An error is limit in a match context for matches that use this pattern. An error is
generated if an offset limit is set without this option. For more details, see generated if an offset limit is set without this option. For more details, see
the description of <b>pcre2_set_offset_limit()</b> in the the description of <b>pcre2_set_offset_limit()</b> in the
@ -2613,43 +2614,17 @@ same number causes an error at compile time.
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b> <b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b> <b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b> <b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR \fIreplacementzfP,</b> <b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR <i>replacement</i>,</b>
<b> PCRE2_SIZE <i>rlength</i>, PCRE2_UCHAR *<i>outputbuffer</i>,</b> <b> PCRE2_SIZE <i>rlength</i>, PCRE2_UCHAR *<i>outputbuffer</i>,</b>
<b> PCRE2_SIZE *<i>outlengthptr</i>);</b> <b> PCRE2_SIZE *<i>outlengthptr</i>);</b>
</P>
<P>
This function calls <b>pcre2_match()</b> and then makes a copy of the subject This function calls <b>pcre2_match()</b> and then makes a copy of the subject
string in <i>outputbuffer</i>, replacing the part that was matched with the string in <i>outputbuffer</i>, replacing the part that was matched with the
<i>replacement</i> string, whose length is supplied in <b>rlength</b>. This can <i>replacement</i> string, whose length is supplied in <b>rlength</b>. This can
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
</P> </P>
<P> <P>
In the replacement string, which is interpreted as a UTF string in UTF mode,
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
dollar character is an escape character that can specify the insertion of
characters from capturing groups or (*MARK) items in the pattern. The following
forms are recognized:
<pre>
$$ insert a dollar character
$&#60;n&#62; or ${&#60;n&#62;} insert the contents of group &#60;n&#62;
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
</pre>
Either a group number or a group name can be given for &#60;n&#62;. Curly brackets are
required only if the following character would be interpreted as part of the
number or name. The number may be zero to include the entire matched string.
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
string "+$1$0$1+", the result is "=+babcb+=". Group insertion is done by
calling <b>pcre2_copy_byname()</b> or <b>pcre2_copy_bynumber()</b> as
appropriate.
</P>
<P>
The facility for inserting a (*MARK) name can be used to perform simple
simultaneous substitutions, as this <b>pcre2test</b> example shows:
<pre>
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
apple lemon
2: pear orange
</PRE>
</P>
<P>
The first seven arguments of <b>pcre2_substitute()</b> are the same as for The first seven arguments of <b>pcre2_substitute()</b> are the same as for
<b>pcre2_match()</b>, except that the partial matching options are not <b>pcre2_match()</b>, except that the partial matching options are not
permitted, and <i>match_data</i> may be passed as NULL, in which case a match permitted, and <i>match_data</i> may be passed as NULL, in which case a match
@ -2658,25 +2633,112 @@ functions from the match context, if provided, or else those that were used to
allocate memory for the compiled code. allocate memory for the compiled code.
</P> </P>
<P> <P>
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the The <i>outlengthptr</i> argument must point to a variable that contains the
length, in code units, of the output buffer. If the function is successful,
the value is updated to contain the length of the new string, excluding the
trailing zero that is automatically added. If the function is not successful,
the value is set to PCRE2_UNSET for general errors (such as output buffer too
small). For syntax errors in the replacement string, the value is set to the
offset in the replacement string where the error was detected.
</P>
<P>
In the replacement string, which is interpreted as a UTF string in UTF mode,
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
dollar character is an escape character that can specify the insertion of
characters from capturing groups or (*MARK) items in the pattern. The following
forms are always recognized:
<pre>
$$ insert a dollar character
$&#60;n&#62; or ${&#60;n&#62;} insert the contents of group &#60;n&#62;
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
</pre>
Either a group number or a group name can be given for &#60;n&#62;. Curly brackets are
required only if the following character would be interpreted as part of the
number or name. The number may be zero to include the entire matched string.
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
string "+$1$0$1+", the result is "=+babcb+=".
</P>
<P>
The facility for inserting a (*MARK) name can be used to perform simple
simultaneous substitutions, as this <b>pcre2test</b> example shows:
<pre>
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
apple lemon
2: pear orange
</pre>
There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the
function to iterate over the subject string, replacing every matching function to iterate over the subject string, replacing every matching
substring. If this is not set, only the first matching substring is replaced. substring. If this is not set, only the first matching substring is replaced.
</P> </P>
<P> <P>
The <i>outlengthptr</i> argument must point to a variable that contains the A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra processing
length, in code units, of the output buffer. It is updated to contain the to be applied to the replacement string. Without this option, only the dollar
length of the new string, excluding the trailing zero that is automatically character is special, and only the group insertion forms listed above are
added. valid. When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
</P> </P>
<P> <P>
The function returns the number of replacements that were made. This may be Firstly, backslash in a replacement string is interpreted as an escape
zero if no matches were found, and is never greater than 1 unless character. The usual forms such as \n or \x{ddd} can be used to specify
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code particular character codes, and backslash followed by any non-alphanumeric
is returned. Except for PCRE2_ERROR_NOMATCH (which is never returned), any character quotes that character. Extended quoting can be coded using \Q...\E,
errors from <b>pcre2_match()</b> or the substring copying functions are passed exactly as in pattern strings.
straight back. PCRE2_ERROR_BADREPLACEMENT is returned for an invalid </P>
replacement string (unrecognized sequence following a dollar sign), and <P>
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. There are also four escape sequences for forcing the case of inserted letters.
The insertion mechanism has three states: no case forcing, force upper case,
and force lower case. The escape sequences change the current state: \U and
\L change to upper or lower case forcing, respectively, and \E (when not
terminating a \Q quoted sequence) reverts to no case forcing. The sequences
\u and \l force the next character (if it is a letter) to upper or lower
case, respectively, and then the state automatically reverts to no case
forcing. Case forcing applies to all inserted characters, including those from
captured groups and letters within \Q...\E quoted sequences.
</P>
<P>
Note that case forcing sequences such as \U...\E do not nest. For example,
the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no
effect.
</P>
<P>
The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
flexibility to group substitution. The syntax is similar to that used by Bash:
<pre>
${&#60;n&#62;:-&#60;string&#62;}
${&#60;n&#62;:+&#60;string1&#62;:&#60;string2&#62;}
</pre>
As before, &#60;n&#62; may be a group number or a name. The first form specifies a
default value. If group &#60;n&#62; is set, its value is inserted; if not, &#60;string&#62; is
expanded and the result inserted. The second form specifies strings that are
expanded and inserted when group &#60;n&#62; is set or unset, respectively. The first
form is just a convenient shorthand for
<pre>
${&#60;n&#62;:+${&#60;n&#62;}:&#60;string&#62;}
</pre>
Backslash can be used to escape colons and closing curly brackets in the
replacement strings. A change of the case forcing state within a replacement
string remains in force afterwards, as shown in this <b>pcre2test</b> example:
<pre>
/(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
body
1: hello
somebody
1: HELLO
</pre>
If successful, the function returns the number of replacements that were made.
This may be zero if no matches were found, and is never greater than 1 unless
PCRE2_SUBSTITUTE_GLOBAL is set.
</P>
<P>
In the event of an error, a negative error code is returned. Except for
PCRE2_ERROR_NOMATCH (which is never returned), errors from <b>pcre2_match()</b>
are passed straight back. PCRE2_ERROR_NOMEMORY is returned if the output buffer
is not big enough. PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax
errors in the replacement string, with more particular errors being
PCRE2_ERROR_BADREPESCAPE (invalid escape sequence),
PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket not found), and
PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group substitution). As for all
PCRE2 errors, a text message that describes the error can be obtained by
calling <b>pcre2_get_error_message()</b>.
</P> </P>
<br><a name="SEC35" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br> <br><a name="SEC35" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
<P> <P>
@ -2908,8 +2970,8 @@ There are in addition the following errors that are specific to
PCRE2_ERROR_DFA_UITEM PCRE2_ERROR_DFA_UITEM
</pre> </pre>
This return is given if <b>pcre2_dfa_match()</b> encounters an item in the This return is given if <b>pcre2_dfa_match()</b> encounters an item in the
pattern that it does not support, for instance, the use of \C or a back pattern that it does not support, for instance, the use of \C in a UTF mode or
reference. a back reference.
<pre> <pre>
PCRE2_ERROR_DFA_UCOND PCRE2_ERROR_DFA_UCOND
</pre> </pre>
@ -2953,7 +3015,7 @@ Cambridge, England.
</P> </P>
<br><a name="SEC40" href="#TOC1">REVISION</a><br> <br><a name="SEC40" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 22 September 2015 Last updated: 16 October 2015
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2015 University of Cambridge.
<br> <br>

View File

@ -18,23 +18,24 @@ please consult the man page, in case the conversion went wrong.
<li><a name="TOC3" href="#SEC3">BUILDING 8-BIT, 16-BIT AND 32-BIT LIBRARIES</a> <li><a name="TOC3" href="#SEC3">BUILDING 8-BIT, 16-BIT AND 32-BIT LIBRARIES</a>
<li><a name="TOC4" href="#SEC4">BUILDING SHARED AND STATIC LIBRARIES</a> <li><a name="TOC4" href="#SEC4">BUILDING SHARED AND STATIC LIBRARIES</a>
<li><a name="TOC5" href="#SEC5">UNICODE AND UTF SUPPORT</a> <li><a name="TOC5" href="#SEC5">UNICODE AND UTF SUPPORT</a>
<li><a name="TOC6" href="#SEC6">JUST-IN-TIME COMPILER SUPPORT</a> <li><a name="TOC6" href="#SEC6">DISABLING THE USE OF \C</a>
<li><a name="TOC7" href="#SEC7">NEWLINE RECOGNITION</a> <li><a name="TOC7" href="#SEC7">JUST-IN-TIME COMPILER SUPPORT</a>
<li><a name="TOC8" href="#SEC8">WHAT \R MATCHES</a> <li><a name="TOC8" href="#SEC8">NEWLINE RECOGNITION</a>
<li><a name="TOC9" href="#SEC9">HANDLING VERY LARGE PATTERNS</a> <li><a name="TOC9" href="#SEC9">WHAT \R MATCHES</a>
<li><a name="TOC10" href="#SEC10">AVOIDING EXCESSIVE STACK USAGE</a> <li><a name="TOC10" href="#SEC10">HANDLING VERY LARGE PATTERNS</a>
<li><a name="TOC11" href="#SEC11">LIMITING PCRE2 RESOURCE USAGE</a> <li><a name="TOC11" href="#SEC11">AVOIDING EXCESSIVE STACK USAGE</a>
<li><a name="TOC12" href="#SEC12">CREATING CHARACTER TABLES AT BUILD TIME</a> <li><a name="TOC12" href="#SEC12">LIMITING PCRE2 RESOURCE USAGE</a>
<li><a name="TOC13" href="#SEC13">USING EBCDIC CODE</a> <li><a name="TOC13" href="#SEC13">CREATING CHARACTER TABLES AT BUILD TIME</a>
<li><a name="TOC14" href="#SEC14">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a> <li><a name="TOC14" href="#SEC14">USING EBCDIC CODE</a>
<li><a name="TOC15" href="#SEC15">PCRE2GREP BUFFER SIZE</a> <li><a name="TOC15" href="#SEC15">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
<li><a name="TOC16" href="#SEC16">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a> <li><a name="TOC16" href="#SEC16">PCRE2GREP BUFFER SIZE</a>
<li><a name="TOC17" href="#SEC17">INCLUDING DEBUGGING CODE</a> <li><a name="TOC17" href="#SEC17">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a>
<li><a name="TOC18" href="#SEC18">DEBUGGING WITH VALGRIND SUPPORT</a> <li><a name="TOC18" href="#SEC18">INCLUDING DEBUGGING CODE</a>
<li><a name="TOC19" href="#SEC19">CODE COVERAGE REPORTING</a> <li><a name="TOC19" href="#SEC19">DEBUGGING WITH VALGRIND SUPPORT</a>
<li><a name="TOC20" href="#SEC20">SEE ALSO</a> <li><a name="TOC20" href="#SEC20">CODE COVERAGE REPORTING</a>
<li><a name="TOC21" href="#SEC21">AUTHOR</a> <li><a name="TOC21" href="#SEC21">SEE ALSO</a>
<li><a name="TOC22" href="#SEC22">REVISION</a> <li><a name="TOC22" href="#SEC22">AUTHOR</a>
<li><a name="TOC23" href="#SEC23">REVISION</a>
</ul> </ul>
<br><a name="SEC1" href="#TOC1">BUILDING PCRE2</a><br> <br><a name="SEC1" href="#TOC1">BUILDING PCRE2</a><br>
<P> <P>
@ -148,13 +149,19 @@ properties. The application can request that they do by setting the PCRE2_UCP
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
request this by starting with (*UCP). request this by starting with (*UCP).
</P> </P>
<br><a name="SEC6" href="#TOC1">DISABLING THE USE OF \C</a><br>
<P> <P>
The \C escape sequence, which matches a single code unit, even in a UTF mode, The \C escape sequence, which matches a single code unit, even in a UTF mode,
can cause unpredictable behaviour because it may leave the current matching can cause unpredictable behaviour because it may leave the current matching
point in the middle of a multi-code-unit character. It can be locked out by point in the middle of a multi-code-unit character. The application can lock it
setting the PCRE2_NEVER_BACKSLASH_C option. out by setting the PCRE2_NEVER_BACKSLASH_C option when calling
<b>pcre2_compile()</b>. There is also a build-time option
<pre>
--enable-never-backslash-C
</pre>
(note the upper case C) which locks out the use of \C entirely.
</P> </P>
<br><a name="SEC6" href="#TOC1">JUST-IN-TIME COMPILER SUPPORT</a><br> <br><a name="SEC7" href="#TOC1">JUST-IN-TIME COMPILER SUPPORT</a><br>
<P> <P>
Just-in-time compiler support is included in the build by specifying Just-in-time compiler support is included in the build by specifying
<pre> <pre>
@ -171,7 +178,7 @@ pcre2grep automatically makes use of it, unless you add
</pre> </pre>
to the "configure" command. to the "configure" command.
</P> </P>
<br><a name="SEC7" href="#TOC1">NEWLINE RECOGNITION</a><br> <br><a name="SEC8" href="#TOC1">NEWLINE RECOGNITION</a><br>
<P> <P>
By default, PCRE2 interprets the linefeed (LF) character as indicating the end By default, PCRE2 interprets the linefeed (LF) character as indicating the end
of a line. This is the normal newline character on Unix-like systems. You can of a line. This is the normal newline character on Unix-like systems. You can
@ -208,7 +215,7 @@ Whatever default line ending convention is selected when PCRE2 is built can be
overridden by applications that use the library. At build time it is overridden by applications that use the library. At build time it is
conventional to use the standard for your operating system. conventional to use the standard for your operating system.
</P> </P>
<br><a name="SEC8" href="#TOC1">WHAT \R MATCHES</a><br> <br><a name="SEC9" href="#TOC1">WHAT \R MATCHES</a><br>
<P> <P>
By default, the sequence \R in a pattern matches any Unicode newline sequence, By default, the sequence \R in a pattern matches any Unicode newline sequence,
independently of what has been selected as the line ending sequence. If you independently of what has been selected as the line ending sequence. If you
@ -220,7 +227,7 @@ the default is changed so that \R matches only CR, LF, or CRLF. Whatever is
selected when PCRE2 is built can be overridden by applications that use the selected when PCRE2 is built can be overridden by applications that use the
called. called.
</P> </P>
<br><a name="SEC9" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br> <br><a name="SEC10" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
<P> <P>
Within a compiled pattern, offset values are used to point from one part to Within a compiled pattern, offset values are used to point from one part to
another (for example, from an opening parenthesis to an alternation another (for example, from an opening parenthesis to an alternation
@ -239,7 +246,7 @@ longer offsets slows down the operation of PCRE2 because it has to load
additional data when handling them. For the 32-bit library the value is always additional data when handling them. For the 32-bit library the value is always
4 and cannot be overridden; the value of --with-link-size is ignored. 4 and cannot be overridden; the value of --with-link-size is ignored.
</P> </P>
<br><a name="SEC10" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br> <br><a name="SEC11" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
<P> <P>
When matching with the <b>pcre2_match()</b> function, PCRE2 implements When matching with the <b>pcre2_match()</b> function, PCRE2 implements
backtracking by making recursive calls to an internal function called backtracking by making recursive calls to an internal function called
@ -261,7 +268,7 @@ custom memory management functions can be called instead. PCRE2 runs noticeably
more slowly when built in this way. This option affects only the more slowly when built in this way. This option affects only the
<b>pcre2_match()</b> function; it is not relevant for <b>pcre2_dfa_match()</b>. <b>pcre2_match()</b> function; it is not relevant for <b>pcre2_dfa_match()</b>.
</P> </P>
<br><a name="SEC11" href="#TOC1">LIMITING PCRE2 RESOURCE USAGE</a><br> <br><a name="SEC12" href="#TOC1">LIMITING PCRE2 RESOURCE USAGE</a><br>
<P> <P>
Internally, PCRE2 has a function called <b>match()</b>, which it calls Internally, PCRE2 has a function called <b>match()</b>, which it calls
repeatedly (sometimes recursively) when matching a pattern with the repeatedly (sometimes recursively) when matching a pattern with the
@ -290,7 +297,7 @@ constraints. However, you can set a lower limit by adding, for example,
</pre> </pre>
to the <b>configure</b> command. This value can also be overridden at run time. to the <b>configure</b> command. This value can also be overridden at run time.
</P> </P>
<br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br> <br><a name="SEC13" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
<P> <P>
PCRE2 uses fixed tables for processing characters whose code points are less PCRE2 uses fixed tables for processing characters whose code points are less
than 256. By default, PCRE2 is built with a set of tables that are distributed than 256. By default, PCRE2 is built with a set of tables that are distributed
@ -307,7 +314,7 @@ compiling, because <b>dftables</b> is run on the local host. If you need to
create alternative tables when cross compiling, you will have to do so "by create alternative tables when cross compiling, you will have to do so "by
hand".) hand".)
</P> </P>
<br><a name="SEC13" href="#TOC1">USING EBCDIC CODE</a><br> <br><a name="SEC14" href="#TOC1">USING EBCDIC CODE</a><br>
<P> <P>
PCRE2 assumes by default that it will run in an environment where the character PCRE2 assumes by default that it will run in an environment where the character
code is ASCII or Unicode, which is a superset of ASCII. This is the case for code is ASCII or Unicode, which is a superset of ASCII. This is the case for
@ -342,7 +349,7 @@ The options that select newline behaviour, such as --enable-newline-is-cr,
and equivalent run-time options, refer to these character values in an EBCDIC and equivalent run-time options, refer to these character values in an EBCDIC
environment. environment.
</P> </P>
<br><a name="SEC14" href="#TOC1">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br> <br><a name="SEC15" href="#TOC1">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
<P> <P>
By default, <b>pcre2grep</b> reads all files as plain text. You can build it so By default, <b>pcre2grep</b> reads all files as plain text. You can build it so
that it recognizes files whose names end in <b>.gz</b> or <b>.bz2</b>, and reads that it recognizes files whose names end in <b>.gz</b> or <b>.bz2</b>, and reads
@ -355,7 +362,7 @@ to the <b>configure</b> command. These options naturally require that the
relevant libraries are installed on your system. Configuration will fail if relevant libraries are installed on your system. Configuration will fail if
they are not. they are not.
</P> </P>
<br><a name="SEC15" href="#TOC1">PCRE2GREP BUFFER SIZE</a><br> <br><a name="SEC16" href="#TOC1">PCRE2GREP BUFFER SIZE</a><br>
<P> <P>
<b>pcre2grep</b> uses an internal buffer to hold a "window" on the file it is <b>pcre2grep</b> uses an internal buffer to hold a "window" on the file it is
scanning, in order to be able to output "before" and "after" lines when it scanning, in order to be able to output "before" and "after" lines when it
@ -370,7 +377,7 @@ parameter value by adding, for example,
to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override this to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override this
value by using --buffer-size on the command line. value by using --buffer-size on the command line.
</P> </P>
<br><a name="SEC16" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br> <br><a name="SEC17" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
<P> <P>
If you add one of If you add one of
<pre> <pre>
@ -404,7 +411,7 @@ automatically included, you may need to add something like
</pre> </pre>
immediately before the <b>configure</b> command. immediately before the <b>configure</b> command.
</P> </P>
<br><a name="SEC17" href="#TOC1">INCLUDING DEBUGGING CODE</a><br> <br><a name="SEC18" href="#TOC1">INCLUDING DEBUGGING CODE</a><br>
<P> <P>
If you add If you add
<pre> <pre>
@ -413,7 +420,7 @@ If you add
to the <b>configure</b> command, additional debugging code is included in the to the <b>configure</b> command, additional debugging code is included in the
build. This feature is intended for use by the PCRE2 maintainers. build. This feature is intended for use by the PCRE2 maintainers.
</P> </P>
<br><a name="SEC18" href="#TOC1">DEBUGGING WITH VALGRIND SUPPORT</a><br> <br><a name="SEC19" href="#TOC1">DEBUGGING WITH VALGRIND SUPPORT</a><br>
<P> <P>
If you add If you add
<pre> <pre>
@ -423,7 +430,7 @@ to the <b>configure</b> command, PCRE2 will use valgrind annotations to mark
certain memory regions as unaddressable. This allows it to detect invalid certain memory regions as unaddressable. This allows it to detect invalid
memory accesses, and is mostly useful for debugging PCRE2 itself. memory accesses, and is mostly useful for debugging PCRE2 itself.
</P> </P>
<br><a name="SEC19" href="#TOC1">CODE COVERAGE REPORTING</a><br> <br><a name="SEC20" href="#TOC1">CODE COVERAGE REPORTING</a><br>
<P> <P>
If your C compiler is gcc, you can build a version of PCRE2 that can generate a If your C compiler is gcc, you can build a version of PCRE2 that can generate a
code coverage report for its test suite. To enable this, you must install code coverage report for its test suite. To enable this, you must install
@ -480,11 +487,11 @@ This cleans all coverage data including the generated coverage report. For more
information about code coverage, see the <b>gcov</b> and <b>lcov</b> information about code coverage, see the <b>gcov</b> and <b>lcov</b>
documentation. documentation.
</P> </P>
<br><a name="SEC20" href="#TOC1">SEE ALSO</a><br> <br><a name="SEC21" href="#TOC1">SEE ALSO</a><br>
<P> <P>
<b>pcre2api</b>(3), <b>pcre2-config</b>(3). <b>pcre2api</b>(3), <b>pcre2-config</b>(3).
</P> </P>
<br><a name="SEC21" href="#TOC1">AUTHOR</a><br> <br><a name="SEC22" href="#TOC1">AUTHOR</a><br>
<P> <P>
Philip Hazel Philip Hazel
<br> <br>
@ -493,9 +500,9 @@ University Computing Service
Cambridge, England. Cambridge, England.
<br> <br>
</P> </P>
<br><a name="SEC22" href="#TOC1">REVISION</a><br> <br><a name="SEC23" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 24 April 2015 Last updated: 16 October 2015
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2015 University of Cambridge.
<br> <br>

View File

@ -1236,14 +1236,21 @@ with \C in UTF-8 or UTF-16 mode means that the rest of the string may start
with a malformed UTF character. This has undefined results, because PCRE2 with a malformed UTF character. This has undefined results, because PCRE2
assumes that it is matching character by character in a valid UTF string (by assumes that it is matching character by character in a valid UTF string (by
default it checks the subject string's validity at the start of processing default it checks the subject string's validity at the start of processing
unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the unless the PCRE2_NO_UTF_CHECK option is used).
use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. </P>
<P>
An application can lock out the use of \C by setting the
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
build PCRE2 with the use of \C permanently disabled.
</P> </P>
<P> <P>
PCRE2 does not allow \C to appear in lookbehind assertions PCRE2 does not allow \C to appear in lookbehind assertions
<a href="#lookbehind">(described below)</a> <a href="#lookbehind">(described below)</a>
in a UTF mode, because this would make it impossible to calculate the length of in a UTF mode, because this would make it impossible to calculate the length of
the lookbehind. the lookbehind. Neither the alternative matching function
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
former gives a match-time error; the latter fails to optimize and so the match
is always run using the interpreter.
</P> </P>
<P> <P>
In general, the \C escape sequence is best avoided. However, one way of using In general, the \C escape sequence is best avoided. However, one way of using
@ -3351,7 +3358,7 @@ Cambridge, England.
</P> </P>
<br><a name="SEC30" href="#TOC1">REVISION</a><br> <br><a name="SEC30" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 01 September 2015 Last updated: 16 October 2015
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2015 University of Cambridge.
<br> <br>

View File

@ -12,17 +12,21 @@ This page is part of the PCRE2 HTML documentation. It was generated
automatically from the original man page. If there is any nonsense in it, automatically from the original man page. If there is any nonsense in it,
please consult the man page, in case the conversion went wrong. please consult the man page, in case the conversion went wrong.
<br> <br>
<br><b> <ul>
PCRE2 PERFORMANCE <li><a name="TOC1" href="#SEC1">PCRE2 PERFORMANCE</a>
</b><br> <li><a name="TOC2" href="#SEC2">COMPILED PATTERN MEMORY USAGE</a>
<li><a name="TOC3" href="#SEC3">STACK USAGE AT RUN TIME</a>
<li><a name="TOC4" href="#SEC4">PROCESSING TIME</a>
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
<li><a name="TOC6" href="#SEC6">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PCRE2 PERFORMANCE</a><br>
<P> <P>
Two aspects of performance are discussed below: memory usage and processing Two aspects of performance are discussed below: memory usage and processing
time. The way you express your pattern as a regular expression can affect both time. The way you express your pattern as a regular expression can affect both
of them. of them.
</P> </P>
<br><b> <br><a name="SEC2" href="#TOC1">COMPILED PATTERN MEMORY USAGE</a><br>
COMPILED PATTERN MEMORY USAGE
</b><br>
<P> <P>
Patterns are compiled by PCRE2 into a reasonably efficient interpretive code, Patterns are compiled by PCRE2 into a reasonably efficient interpretive code,
so that most simple patterns do not use much memory. However, there is one case so that most simple patterns do not use much memory. However, there is one case
@ -75,9 +79,7 @@ pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
speed is acceptable, this kind of rewriting will allow you to process patterns speed is acceptable, this kind of rewriting will allow you to process patterns
that PCRE2 cannot otherwise handle. that PCRE2 cannot otherwise handle.
</P> </P>
<br><b> <br><a name="SEC3" href="#TOC1">STACK USAGE AT RUN TIME</a><br>
STACK USAGE AT RUN TIME
</b><br>
<P> <P>
When <b>pcre2_match()</b> is used for matching, certain kinds of pattern can When <b>pcre2_match()</b> is used for matching, certain kinds of pattern can
cause it to use large amounts of the process stack. In some environments the cause it to use large amounts of the process stack. In some environments the
@ -86,9 +88,7 @@ SIGSEGV. Rewriting your pattern can often help. The
<a href="pcre2stack.html"><b>pcre2stack</b></a> <a href="pcre2stack.html"><b>pcre2stack</b></a>
documentation discusses this issue in detail. documentation discusses this issue in detail.
</P> </P>
<br><b> <br><a name="SEC4" href="#TOC1">PROCESSING TIME</a><br>
PROCESSING TIME
</b><br>
<P> <P>
Certain items in regular expression patterns are processed more efficiently Certain items in regular expression patterns are processed more efficiently
than others. It is more efficient to use a character class like [aeiou] than a than others. It is more efficient to use a character class like [aeiou] than a
@ -177,9 +177,7 @@ appreciable time with strings longer than about 20 characters.
In many cases, the solution to this kind of performance issue is to use an In many cases, the solution to this kind of performance issue is to use an
atomic group or a possessive quantifier. atomic group or a possessive quantifier.
</P> </P>
<br><b> <br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
AUTHOR
</b><br>
<P> <P>
Philip Hazel Philip Hazel
<br> <br>
@ -188,9 +186,7 @@ University Computing Service
Cambridge, England. Cambridge, England.
<br> <br>
</P> </P>
<br><b> <br><a name="SEC6" href="#TOC1">REVISION</a><br>
REVISION
</b><br>
<P> <P>
Last updated: 02 January 2015 Last updated: 02 January 2015
<br> <br>

View File

@ -111,9 +111,10 @@ it matches a literal "u".
\W a "non-word" character \W a "non-word" character
\X a Unicode extended grapheme cluster \X a Unicode extended grapheme cluster
</pre> </pre>
The application can lock out the use of \C by setting the \C is dangerous because it may leave the current matching point in the middle
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave the of a UTF-8 or UTF-16 character. The application can lock out the use of \C by
current matching point in the middle of a UTF-8 or UTF-16 character. setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
with the use of \C permanently disabled.
</P> </P>
<P> <P>
By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode
@ -588,7 +589,7 @@ Cambridge, England.
</P> </P>
<br><a name="SEC27" href="#TOC1">REVISION</a><br> <br><a name="SEC27" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 17 July 2015 Last updated: 16 October 2015
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2015 University of Cambridge.
<br> <br>

View File

@ -155,12 +155,13 @@ following options output the value and set the exit code as indicated:
The following options output 1 for true or 0 for false, and set the exit code The following options output 1 for true or 0 for false, and set the exit code
to the same value: to the same value:
<pre> <pre>
ebcdic compiled for an EBCDIC environment backslash-C \C is supported (not locked out)
jit just-in-time support is available ebcdic compiled for an EBCDIC environment
pcre2-16 the 16-bit library was built jit just-in-time support is available
pcre2-32 the 32-bit library was built pcre2-16 the 16-bit library was built
pcre2-8 the 8-bit library was built pcre2-32 the 32-bit library was built
unicode Unicode support is available pcre2-8 the 8-bit library was built
unicode Unicode support is available
</pre> </pre>
If an unknown option is given, an error message is output; the exit code is 0. If an unknown option is given, an error message is output; the exit code is 0.
</P> </P>
@ -510,7 +511,7 @@ Setting compilation options
<P> <P>
The following modifiers set options for <b>pcre2_compile()</b>. The most common The following modifiers set options for <b>pcre2_compile()</b>. The most common
ones have single-letter abbreviations. See ones have single-letter abbreviations. See
<a href="pcreapi.html"><b>pcreapi</b></a> <a href="pcre2api.html"><b>pcre2api</b></a>
for a description of their effects. for a description of their effects.
<pre> <pre>
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
@ -537,6 +538,7 @@ for a description of their effects.
no_utf_check set PCRE2_NO_UTF_CHECK no_utf_check set PCRE2_NO_UTF_CHECK
ucp set PCRE2_UCP ucp set PCRE2_UCP
ungreedy set PCRE2_UNGREEDY ungreedy set PCRE2_UNGREEDY
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
utf set PCRE2_UTF utf set PCRE2_UTF
</pre> </pre>
As well as turning on the PCRE2_UTF option, the <b>utf</b> modifier causes all As well as turning on the PCRE2_UTF option, the <b>utf</b> modifier causes all
@ -564,6 +566,7 @@ about the pattern:
locale=&#60;name&#62; use this locale locale=&#60;name&#62; use this locale
memory show memory used memory show memory used
newline=&#60;type&#62; set newline type newline=&#60;type&#62; set newline type
null_context compile with a NULL context
parens_nest_limit=&#60;n&#62; set maximum parentheses depth parens_nest_limit=&#60;n&#62; set maximum parentheses depth
posix use the POSIX API posix use the POSIX API
push push compiled pattern onto the stack push push compiled pattern onto the stack
@ -642,6 +645,15 @@ is requested. For each callout, either its number or string is given, followed
by the item that follows it in the pattern. by the item that follows it in the pattern.
</P> </P>
<br><b> <br><b>
Passing a NULL context
</b><br>
<P>
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_compile()</b>. If
the <b>null_context</b> modifier is set, however, NULL is passed. This is for
testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
default values).
</P>
<br><b>
Specifying a pattern in hex Specifying a pattern in hex
</b><br> </b><br>
<P> <P>
@ -920,9 +932,11 @@ pattern.
/g global global matching /g global global matching
jitstack=&#60;n&#62; set size of JIT stack jitstack=&#60;n&#62; set size of JIT stack
mark show mark values mark show mark values
match_limit=&#62;n&#62; set a match limit match_limit=&#60;n&#62; set a match limit
memory show memory usage memory show memory usage
null_context match with a NULL context
offset=&#60;n&#62; set starting offset offset=&#60;n&#62; set starting offset
offset_limit=&#60;n&#62; set offset limit
ovector=&#60;n&#62; set size of output vector ovector=&#60;n&#62; set size of output vector
recursion_limit=&#60;n&#62; set a recursion limit recursion_limit=&#60;n&#62; set a recursion limit
replace=&#60;string&#62; specify a replacement string replace=&#60;string&#62; specify a replacement string
@ -1170,6 +1184,16 @@ The <b>offset</b> modifier sets an offset in the subject string at which
matching starts. Its value is a number of code units, not characters. matching starts. Its value is a number of code units, not characters.
</P> </P>
<br><b> <br><b>
Setting an offset limit
</b><br>
<P>
The <b>offset_limit</b> modifier sets a limit for unanchored matches. If a match
cannot be found starting at or before this offset in the subject, a "no match"
return is given. The data value is a number of code units, not characters. When
this modifier is used, the <b>use_offset_limit</b> modifier must have been set
for the pattern; if not, an error is generated.
</P>
<br><b>
Setting the size of the output vector Setting the size of the output vector
</b><br> </b><br>
<P> <P>
@ -1201,6 +1225,17 @@ this modifier has no effect, as there is no facility for passing a length.)
When testing <b>pcre2_substitute()</b>, this modifier also has the effect of When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
passing the replacement string as zero-terminated. passing the replacement string as zero-terminated.
</P> </P>
<br><b>
Passing a NULL context
</b><br>
<P>
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
<b>pcre2_dfa_match()</b> or <b>pcre2_jit_match()</b>. If the <b>null_context</b>
modifier is set, however, NULL is passed. This is for testing that the matching
functions behave correctly in this case (they use default values). This
modifier cannot be used with the <b>find_limits</b> modifier or when testing the
substitution function.
</P>
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br> <br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
<P> <P>
By default, <b>pcre2test</b> uses the standard PCRE2 matching function, By default, <b>pcre2test</b> uses the standard PCRE2 matching function,
@ -1539,7 +1574,7 @@ Cambridge, England.
</P> </P>
<br><a name="SEC21" href="#TOC1">REVISION</a><br> <br><a name="SEC21" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 14 September 2015 Last updated: 17 October 2015
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2015 University of Cambridge.
<br> <br>

View File

@ -71,11 +71,12 @@ The escape sequence \C can be used to match a single code unit, in a UTF mode,
but its use can lead to some strange effects because it breaks up multi-unit but its use can lead to some strange effects because it breaks up multi-unit
characters (see the description of \C in the characters (see the description of \C in the
<a href="pcre2pattern.html"><b>pcre2pattern</b></a> <a href="pcre2pattern.html"><b>pcre2pattern</b></a>
documentation). The use of \C is not supported in the alternative matching documentation). The use of \C is not supported by the alternative matching
function <b>pcre2_dfa_match()</b>, nor is it supported in UTF mode by the JIT function <b>pcre2_dfa_match()</b> when in UTF mode. Its use provokes a
optimization. If JIT optimization is requested for a UTF pattern that contains match-time error. The JIT optimization also does not support \C in UTF mode.
\C, it will not succeed, and so the matching will be carried out by the normal If JIT optimization is requested for a UTF pattern that contains \C, it will
interpretive function. not succeed, and so the matching will be carried out by the normal interpretive
function.
</P> </P>
<P> <P>
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
@ -275,7 +276,7 @@ Cambridge, England.
REVISION REVISION
</b><br> </b><br>
<P> <P>
Last updated: 18 August 2015 Last updated: 16 October 2015
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2015 University of Cambridge.
<br> <br>

View File

@ -1,4 +1,4 @@
.TH PCRE2 3 "13 April 2015" "PCRE2 10.20" .TH PCRE2 3 "16 October 2015" "PCRE2 10.21"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
.SH INTRODUCTION .SH INTRODUCTION
@ -118,8 +118,10 @@ running redundant checks.
.P .P
The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to
problems, because it may leave the current matching point in the middle of a problems, because it may leave the current matching point in the middle of a
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
lock out the use of \eC, causing a compile-time error if it is encountered. application to lock out the use of \eC, causing a compile-time error if it is
encountered. It is also possible to build PCRE2 with the use of \eC permanently
disabled.
.P .P
Another way that performance can be hit is by running a pattern that has a very Another way that performance can be hit is by running a pattern that has a very
large search tree against a string that will never match. Nested unlimited large search tree against a string that will never match. Nested unlimited
@ -187,6 +189,6 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
.rs .rs
.sp .sp
.nf .nf
Last updated: 13 April 2015 Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
.fi .fi

View File

@ -104,26 +104,27 @@ SECURITY CONSIDERATIONS
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead
to problems, because it may leave the current matching point in the to problems, because it may leave the current matching point in the
middle of a multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C middle of a multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C
option can be used to lock out the use of \C, causing a compile-time option can be used by an application to lock out the use of \C, causing
error if it is encountered. a compile-time error if it is encountered. It is also possible to build
PCRE2 with the use of \C permanently disabled.
Another way that performance can be hit is by running a pattern that Another way that performance can be hit is by running a pattern that
has a very large search tree against a string that will never match. has a very large search tree against a string that will never match.
Nested unlimited repeats in a pattern are a common example. PCRE2 pro- Nested unlimited repeats in a pattern are a common example. PCRE2 pro-
vides some protection against this: see the pcre2_set_match_limit() vides some protection against this: see the pcre2_set_match_limit()
function in the pcre2api page. function in the pcre2api page.
USER DOCUMENTATION USER DOCUMENTATION
The user documentation for PCRE2 comprises a number of different sec- The user documentation for PCRE2 comprises a number of different sec-
tions. In the "man" format, each of these is a separate "man page". In tions. In the "man" format, each of these is a separate "man page". In
the HTML format, each is a separate page, linked from the index page. the HTML format, each is a separate page, linked from the index page.
In the plain text format, the descriptions of the pcre2grep and In the plain text format, the descriptions of the pcre2grep and
pcre2test programs are in files called pcre2grep.txt and pcre2test.txt, pcre2test programs are in files called pcre2grep.txt and pcre2test.txt,
respectively. The remaining sections, except for the pcre2demo section respectively. The remaining sections, except for the pcre2demo section
(which is a program listing), and the short pages for individual func- (which is a program listing), and the short pages for individual func-
tions, are concatenated in pcre2.txt, for ease of searching. The sec- tions, are concatenated in pcre2.txt, for ease of searching. The sec-
tions are as follows: tions are as follows:
pcre2 this document pcre2 this document
@ -148,7 +149,7 @@ USER DOCUMENTATION
pcre2test description of the pcre2test command pcre2test description of the pcre2test command
pcre2unicode discussion of Unicode and UTF support pcre2unicode discussion of Unicode and UTF support
In the "man" and HTML formats, there is also a short page for each C In the "man" and HTML formats, there is also a short page for each C
library function, listing its arguments and results. library function, listing its arguments and results.
@ -158,14 +159,14 @@ AUTHOR
University Computing Service University Computing Service
Cambridge, England. Cambridge, England.
Putting an actual email address here is a spam magnet. If you want to Putting an actual email address here is a spam magnet. If you want to
email me, use my two initials, followed by the two digits 10, at the email me, use my two initials, followed by the two digits 10, at the
domain cam.ac.uk. domain cam.ac.uk.
REVISION REVISION
Last updated: 13 April 2015 Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
@ -1276,7 +1277,9 @@ COMPILING A PATTERN
piled. This escape can cause unpredictable behaviour in UTF-8 or piled. This escape can cause unpredictable behaviour in UTF-8 or
UTF-16 modes, because it may leave the current matching point in the UTF-16 modes, because it may leave the current matching point in the
middle of a multi-code-unit character. This option may be useful in middle of a multi-code-unit character. This option may be useful in
applications that process patterns from external sources. applications that process patterns from external sources. Note that
there is also a build-time option that permanently locks out the use of
\C.
PCRE2_NEVER_UCP PCRE2_NEVER_UCP
@ -2571,19 +2574,36 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
PCRE2_SIZE length, PCRE2_SIZE startoffset, PCRE2_SIZE length, PCRE2_SIZE startoffset,
uint32_t options, pcre2_match_data *match_data, uint32_t options, pcre2_match_data *match_data,
pcre2_match_context *mcontext, PCRE2_SPTR replacementzfP, pcre2_match_context *mcontext, PCRE2_SPTR replacement,
PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer, PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
PCRE2_SIZE *outlengthptr); PCRE2_SIZE *outlengthptr);
This function calls pcre2_match() and then makes a copy of the subject This function calls pcre2_match() and then makes a copy of the subject
string in outputbuffer, replacing the part that was matched with the string in outputbuffer, replacing the part that was matched with the
replacement string, whose length is supplied in rlength. This can be replacement string, whose length is supplied in rlength. This can be
given as PCRE2_ZERO_TERMINATED for a zero-terminated string. given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
The first seven arguments of pcre2_substitute() are the same as for
pcre2_match(), except that the partial matching options are not permit-
ted, and match_data may be passed as NULL, in which case a match data
block is obtained and freed within this function, using memory manage-
ment functions from the match context, if provided, or else those that
were used to allocate memory for the compiled code.
The outlengthptr argument must point to a variable that contains the
length, in code units, of the output buffer. If the function is suc-
cessful, the value is updated to contain the length of the new string,
excluding the trailing zero that is automatically added. If the func-
tion is not successful, the value is set to PCRE2_UNSET for general
errors (such as output buffer too small). For syntax errors in the
replacement string, the value is set to the offset in the replacement
string where the error was detected.
In the replacement string, which is interpreted as a UTF string in UTF In the replacement string, which is interpreted as a UTF string in UTF
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
option is set, a dollar character is an escape character that can spec- option is set, a dollar character is an escape character that can spec-
ify the insertion of characters from capturing groups or (*MARK) items ify the insertion of characters from capturing groups or (*MARK) items
in the pattern. The following forms are recognized: in the pattern. The following forms are always recognized:
$$ insert a dollar character $$ insert a dollar character
$<n> or ${<n>} insert the contents of group <n> $<n> or ${<n>} insert the contents of group <n>
@ -2594,8 +2614,7 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
preted as part of the number or name. The number may be zero to include preted as part of the number or name. The number may be zero to include
the entire matched string. For example, if the pattern a(b)c is the entire matched string. For example, if the pattern a(b)c is
matched with "=abc=" and the replacement string "+$1$0$1+", the result matched with "=abc=" and the replacement string "+$1$0$1+", the result
is "=+babcb+=". Group insertion is done by calling pcre2_copy_byname() is "=+babcb+=".
or pcre2_copy_bynumber() as appropriate.
The facility for inserting a (*MARK) name can be used to perform simple The facility for inserting a (*MARK) name can be used to perform simple
simultaneous substitutions, as this pcre2test example shows: simultaneous substitutions, as this pcre2test example shows:
@ -2604,32 +2623,80 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
apple lemon apple lemon
2: pear orange 2: pear orange
The first seven arguments of pcre2_substitute() are the same as for There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
pcre2_match(), except that the partial matching options are not permit-
ted, and match_data may be passed as NULL, in which case a match data
block is obtained and freed within this function, using memory manage-
ment functions from the match context, if provided, or else those that
were used to allocate memory for the compiled code.
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
the function to iterate over the subject string, replacing every match- the function to iterate over the subject string, replacing every match-
ing substring. If this is not set, only the first matching substring is ing substring. If this is not set, only the first matching substring is
replaced. replaced.
The outlengthptr argument must point to a variable that contains the A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra
length, in code units, of the output buffer. It is updated to contain processing to be applied to the replacement string. Without this
the length of the new string, excluding the trailing zero that is auto- option, only the dollar character is special, and only the group inser-
matically added. tion forms listed above are valid. When PCRE2_SUBSTITUTE_EXTENDED is
set, two things change:
The function returns the number of replacements that were made. This Firstly, backslash in a replacement string is interpreted as an escape
may be zero if no matches were found, and is never greater than 1 character. The usual forms such as \n or \x{ddd} can be used to specify
unless PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a neg- particular character codes, and backslash followed by any non-alphanu-
ative error code is returned. Except for PCRE2_ERROR_NOMATCH (which is meric character quotes that character. Extended quoting can be coded
never returned), any errors from pcre2_match() or the substring copying using \Q...\E, exactly as in pattern strings.
functions are passed straight back. PCRE2_ERROR_BADREPLACEMENT is
returned for an invalid replacement string (unrecognized sequence fol- There are also four escape sequences for forcing the case of inserted
lowing a dollar sign), and PCRE2_ERROR_NOMEMORY is returned if the out- letters. The insertion mechanism has three states: no case forcing,
put buffer is not big enough. force upper case, and force lower case. The escape sequences change the
current state: \U and \L change to upper or lower case forcing, respec-
tively, and \E (when not terminating a \Q quoted sequence) reverts to
no case forcing. The sequences \u and \l force the next character (if
it is a letter) to upper or lower case, respectively, and then the
state automatically reverts to no case forcing. Case forcing applies to
all inserted characters, including those from captured groups and let-
ters within \Q...\E quoted sequences.
Note that case forcing sequences such as \U...\E do not nest. For exam-
ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final
\E has no effect.
The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
flexibility to group substitution. The syntax is similar to that used
by Bash:
${<n>:-<string>}
${<n>:+<string1>:<string2>}
As before, <n> may be a group number or a name. The first form speci-
fies a default value. If group <n> is set, its value is inserted; if
not, <string> is expanded and the result inserted. The second form
specifies strings that are expanded and inserted when group <n> is set
or unset, respectively. The first form is just a convenient shorthand
for
${<n>:+${<n>}:<string>}
Backslash can be used to escape colons and closing curly brackets in
the replacement strings. A change of the case forcing state within a
replacement string remains in force afterwards, as shown in this
pcre2test example:
/(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
body
1: hello
somebody
1: HELLO
If successful, the function returns the number of replacements that
were made. This may be zero if no matches were found, and is never
greater than 1 unless PCRE2_SUBSTITUTE_GLOBAL is set.
In the event of an error, a negative error code is returned. Except for
PCRE2_ERROR_NOMATCH (which is never returned), errors from
pcre2_match() are passed straight back. PCRE2_ERROR_NOMEMORY is
returned if the output buffer is not big enough.
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in
the replacement string, with more particular errors being
PCRE2_ERROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REP-
MISSING_BRACE (closing curly bracket not found), and PCRE2_ERROR_BADSUB-
STITUTION (syntax error in extended group substitution). As for all PCRE2
errors, a text message that describes the error can be obtained by
calling pcre2_get_error_message().
DUPLICATE SUBPATTERN NAMES DUPLICATE SUBPATTERN NAMES
@ -2845,8 +2912,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
PCRE2_ERROR_DFA_UITEM PCRE2_ERROR_DFA_UITEM
This return is given if pcre2_dfa_match() encounters an item in the This return is given if pcre2_dfa_match() encounters an item in the
pattern that it does not support, for instance, the use of \C or a back pattern that it does not support, for instance, the use of \C in a UTF
reference. mode or a back reference.
PCRE2_ERROR_DFA_UCOND PCRE2_ERROR_DFA_UCOND
@ -2890,7 +2957,7 @@ AUTHOR
REVISION REVISION
Last updated: 22 September 2015 Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
@ -3010,10 +3077,18 @@ UNICODE AND UTF SUPPORT
PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a
pattern may also request this by starting with (*UCP). pattern may also request this by starting with (*UCP).
DISABLING THE USE OF \C
The \C escape sequence, which matches a single code unit, even in a UTF The \C escape sequence, which matches a single code unit, even in a UTF
mode, can cause unpredictable behaviour because it may leave the cur- mode, can cause unpredictable behaviour because it may leave the cur-
rent matching point in the middle of a multi-code-unit character. It rent matching point in the middle of a multi-code-unit character. The
can be locked out by setting the PCRE2_NEVER_BACKSLASH_C option. application can lock it out by setting the PCRE2_NEVER_BACKSLASH_C
option when calling pcre2_compile(). There is also a build-time option
--enable-never-backslash-C
(note the upper case C) which locks out the use of \C entirely.
JUST-IN-TIME COMPILER SUPPORT JUST-IN-TIME COMPILER SUPPORT
@ -3022,10 +3097,10 @@ JUST-IN-TIME COMPILER SUPPORT
--enable-jit --enable-jit
This support is available only for certain hardware architectures. If This support is available only for certain hardware architectures. If
this option is set for an unsupported architecture, a building error this option is set for an unsupported architecture, a building error
occurs. See the pcre2jit documentation for a discussion of JIT usage. occurs. See the pcre2jit documentation for a discussion of JIT usage.
When JIT support is enabled, pcre2grep automatically makes use of it, When JIT support is enabled, pcre2grep automatically makes use of it,
unless you add unless you add
--disable-pcre2grep-jit --disable-pcre2grep-jit
@ -3035,14 +3110,14 @@ JUST-IN-TIME COMPILER SUPPORT
NEWLINE RECOGNITION NEWLINE RECOGNITION
By default, PCRE2 interprets the linefeed (LF) character as indicating By default, PCRE2 interprets the linefeed (LF) character as indicating
the end of a line. This is the normal newline character on Unix-like the end of a line. This is the normal newline character on Unix-like
systems. You can compile PCRE2 to use carriage return (CR) instead, by systems. You can compile PCRE2 to use carriage return (CR) instead, by
adding adding
--enable-newline-is-cr --enable-newline-is-cr
to the configure command. There is also an --enable-newline-is-lf to the configure command. There is also an --enable-newline-is-lf
option, which explicitly specifies linefeed as the newline character. option, which explicitly specifies linefeed as the newline character.
Alternatively, you can specify that line endings are to be indicated by Alternatively, you can specify that line endings are to be indicated by
@ -3055,76 +3130,76 @@ NEWLINE RECOGNITION
--enable-newline-is-anycrlf --enable-newline-is-anycrlf
which causes PCRE2 to recognize any of the three sequences CR, LF, or which causes PCRE2 to recognize any of the three sequences CR, LF, or
CRLF as indicating a line ending. Finally, a fifth option, specified by CRLF as indicating a line ending. Finally, a fifth option, specified by
--enable-newline-is-any --enable-newline-is-any
causes PCRE2 to recognize any Unicode newline sequence. The Unicode causes PCRE2 to recognize any Unicode newline sequence. The Unicode
newline sequences are the three just mentioned, plus the single charac- newline sequences are the three just mentioned, plus the single charac-
ters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line, ters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line,
U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+0085), LS (line separator, U+2028), and PS (paragraph separator,
U+2029). U+2029).
Whatever default line ending convention is selected when PCRE2 is built Whatever default line ending convention is selected when PCRE2 is built
can be overridden by applications that use the library. At build time can be overridden by applications that use the library. At build time
it is conventional to use the standard for your operating system. it is conventional to use the standard for your operating system.
WHAT \R MATCHES WHAT \R MATCHES
By default, the sequence \R in a pattern matches any Unicode newline By default, the sequence \R in a pattern matches any Unicode newline
sequence, independently of what has been selected as the line ending sequence, independently of what has been selected as the line ending
sequence. If you specify sequence. If you specify
--enable-bsr-anycrlf --enable-bsr-anycrlf
the default is changed so that \R matches only CR, LF, or CRLF. What- the default is changed so that \R matches only CR, LF, or CRLF. What-
ever is selected when PCRE2 is built can be overridden by applications ever is selected when PCRE2 is built can be overridden by applications
that use the library. that use the library.
HANDLING VERY LARGE PATTERNS HANDLING VERY LARGE PATTERNS
Within a compiled pattern, offset values are used to point from one Within a compiled pattern, offset values are used to point from one
part to another (for example, from an opening parenthesis to an alter- part to another (for example, from an opening parenthesis to an alter-
nation metacharacter). By default, in the 8-bit and 16-bit libraries, nation metacharacter). By default, in the 8-bit and 16-bit libraries,
two-byte values are used for these offsets, leading to a maximum size two-byte values are used for these offsets, leading to a maximum size
for a compiled pattern of around 64K code units. This is sufficient to for a compiled pattern of around 64K code units. This is sufficient to
handle all but the most gigantic patterns. Nevertheless, some people do handle all but the most gigantic patterns. Nevertheless, some people do
want to process truly enormous patterns, so it is possible to compile want to process truly enormous patterns, so it is possible to compile
PCRE2 to use three-byte or four-byte offsets by adding a setting such PCRE2 to use three-byte or four-byte offsets by adding a setting such
as as
--with-link-size=3 --with-link-size=3
to the configure command. The value given must be 2, 3, or 4. For the to the configure command. The value given must be 2, 3, or 4. For the
16-bit library, a value of 3 is rounded up to 4. In these libraries, 16-bit library, a value of 3 is rounded up to 4. In these libraries,
using longer offsets slows down the operation of PCRE2 because it has using longer offsets slows down the operation of PCRE2 because it has
to load additional data when handling them. For the 32-bit library the to load additional data when handling them. For the 32-bit library the
value is always 4 and cannot be overridden; the value of --with-link- value is always 4 and cannot be overridden; the value of --with-link-
size is ignored. size is ignored.
AVOIDING EXCESSIVE STACK USAGE AVOIDING EXCESSIVE STACK USAGE
When matching with the pcre2_match() function, PCRE2 implements back- When matching with the pcre2_match() function, PCRE2 implements back-
tracking by making recursive calls to an internal function called tracking by making recursive calls to an internal function called
match(). In environments where the size of the stack is limited, this match(). In environments where the size of the stack is limited, this
can severely limit PCRE2's operation. (The Unix environment does not can severely limit PCRE2's operation. (The Unix environment does not
usually suffer from this problem, but it may sometimes be necessary to usually suffer from this problem, but it may sometimes be necessary to
increase the maximum stack size. There is a discussion in the increase the maximum stack size. There is a discussion in the
pcre2stack documentation.) An alternative approach to recursion that pcre2stack documentation.) An alternative approach to recursion that
uses memory from the heap to remember data, instead of using recursive uses memory from the heap to remember data, instead of using recursive
function calls, has been implemented to work round the problem of lim- function calls, has been implemented to work round the problem of lim-
ited stack size. If you want to build a version of PCRE2 that works ited stack size. If you want to build a version of PCRE2 that works
this way, add this way, add
--disable-stack-for-recursion --disable-stack-for-recursion
to the configure command. By default, the system functions malloc() and to the configure command. By default, the system functions malloc() and
free() are called to manage the heap memory that is required, but cus- free() are called to manage the heap memory that is required, but cus-
tom memory management functions can be called instead. PCRE2 runs tom memory management functions can be called instead. PCRE2 runs
noticeably more slowly when built in this way. This option affects only noticeably more slowly when built in this way. This option affects only
the pcre2_match() function; it is not relevant for pcre2_dfa_match(). the pcre2_match() function; it is not relevant for pcre2_dfa_match().
@ -3132,30 +3207,30 @@ AVOIDING EXCESSIVE STACK USAGE
LIMITING PCRE2 RESOURCE USAGE LIMITING PCRE2 RESOURCE USAGE
Internally, PCRE2 has a function called match(), which it calls repeat- Internally, PCRE2 has a function called match(), which it calls repeat-
edly (sometimes recursively) when matching a pattern with the edly (sometimes recursively) when matching a pattern with the
pcre2_match() function. By controlling the maximum number of times this pcre2_match() function. By controlling the maximum number of times this
function may be called during a single matching operation, a limit can function may be called during a single matching operation, a limit can
be placed on the resources used by a single call to pcre2_match(). The be placed on the resources used by a single call to pcre2_match(). The
limit can be changed at run time, as described in the pcre2api documen- limit can be changed at run time, as described in the pcre2api documen-
tation. The default is 10 million, but this can be changed by adding a tation. The default is 10 million, but this can be changed by adding a
setting such as setting such as
--with-match-limit=500000 --with-match-limit=500000
to the configure command. This setting has no effect on the to the configure command. This setting has no effect on the
pcre2_dfa_match() matching function. pcre2_dfa_match() matching function.
In some environments it is desirable to limit the depth of recursive In some environments it is desirable to limit the depth of recursive
calls of match() more strictly than the total number of calls, in order calls of match() more strictly than the total number of calls, in order
to restrict the maximum amount of stack (or heap, if --disable-stack- to restrict the maximum amount of stack (or heap, if --disable-stack-
for-recursion is specified) that is used. A second limit controls this; for-recursion is specified) that is used. A second limit controls this;
it defaults to the value that is set for --with-match-limit, which it defaults to the value that is set for --with-match-limit, which
imposes no additional constraints. However, you can set a lower limit imposes no additional constraints. However, you can set a lower limit
by adding, for example, by adding, for example,
--with-match-limit-recursion=10000 --with-match-limit-recursion=10000
to the configure command. This value can also be overridden at run to the configure command. This value can also be overridden at run
time. time.
@ -3163,45 +3238,45 @@ CREATING CHARACTER TABLES AT BUILD TIME
PCRE2 uses fixed tables for processing characters whose code points are PCRE2 uses fixed tables for processing characters whose code points are
less than 256. By default, PCRE2 is built with a set of tables that are less than 256. By default, PCRE2 is built with a set of tables that are
distributed in the file src/pcre2_chartables.c.dist. These tables are distributed in the file src/pcre2_chartables.c.dist. These tables are
for ASCII codes only. If you add for ASCII codes only. If you add
--enable-rebuild-chartables --enable-rebuild-chartables
to the configure command, the distributed tables are no longer used. to the configure command, the distributed tables are no longer used.
Instead, a program called dftables is compiled and run. This outputs Instead, a program called dftables is compiled and run. This outputs
the source for a new set of tables, created in the default locale of your the source for a new set of tables, created in the default locale of your
C run-time system. (This method of replacing the tables does not work C run-time system. (This method of replacing the tables does not work
if you are cross compiling, because dftables is run on the local host. if you are cross compiling, because dftables is run on the local host.
If you need to create alternative tables when cross compiling, you will If you need to create alternative tables when cross compiling, you will
have to do so "by hand".) have to do so "by hand".)
USING EBCDIC CODE USING EBCDIC CODE
PCRE2 assumes by default that it will run in an environment where the PCRE2 assumes by default that it will run in an environment where the
character code is ASCII or Unicode, which is a superset of ASCII. This character code is ASCII or Unicode, which is a superset of ASCII. This
is the case for most computer operating systems. PCRE2 can, however, be is the case for most computer operating systems. PCRE2 can, however, be
compiled to run in an 8-bit EBCDIC environment by adding compiled to run in an 8-bit EBCDIC environment by adding
--enable-ebcdic --disable-unicode --enable-ebcdic --disable-unicode
to the configure command. This setting implies --enable-rebuild-charta- to the configure command. This setting implies --enable-rebuild-charta-
bles. You should only use it if you know that you are in an EBCDIC bles. You should only use it if you know that you are in an EBCDIC
environment (for example, an IBM mainframe operating system). environment (for example, an IBM mainframe operating system).
It is not possible to support both EBCDIC and UTF-8 codes in the same It is not possible to support both EBCDIC and UTF-8 codes in the same
version of the library. Consequently, --enable-unicode and --enable- version of the library. Consequently, --enable-unicode and --enable-
ebcdic are mutually exclusive. ebcdic are mutually exclusive.
The EBCDIC character that corresponds to an ASCII LF is assumed to have The EBCDIC character that corresponds to an ASCII LF is assumed to have
the value 0x15 by default. However, in some EBCDIC environments, 0x25 the value 0x15 by default. However, in some EBCDIC environments, 0x25
is used. In such an environment you should use is used. In such an environment you should use
--enable-ebcdic-nl25 --enable-ebcdic-nl25
as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR
has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and
0x25 is not chosen as LF is made to correspond to the Unicode NEL char- 0x25 is not chosen as LF is made to correspond to the Unicode NEL char-
acter (which, in Unicode, is 0x85). acter (which, in Unicode, is 0x85).
@ -3212,31 +3287,31 @@ USING EBCDIC CODE
PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT
By default, pcre2grep reads all files as plain text. You can build it By default, pcre2grep reads all files as plain text. You can build it
so that it recognizes files whose names end in .gz or .bz2, and reads so that it recognizes files whose names end in .gz or .bz2, and reads
them with libz or libbz2, respectively, by adding one or both of them with libz or libbz2, respectively, by adding one or both of
--enable-pcre2grep-libz --enable-pcre2grep-libz
--enable-pcre2grep-libbz2 --enable-pcre2grep-libbz2
to the configure command. These options naturally require that the rel- to the configure command. These options naturally require that the rel-
evant libraries are installed on your system. Configuration will fail evant libraries are installed on your system. Configuration will fail
if they are not. if they are not.
PCRE2GREP BUFFER SIZE PCRE2GREP BUFFER SIZE
pcre2grep uses an internal buffer to hold a "window" on the file it is pcre2grep uses an internal buffer to hold a "window" on the file it is
scanning, in order to be able to output "before" and "after" lines when scanning, in order to be able to output "before" and "after" lines when
it finds a match. The size of the buffer is controlled by a parameter it finds a match. The size of the buffer is controlled by a parameter
whose default value is 20K. The buffer itself is three times this size, whose default value is 20K. The buffer itself is three times this size,
but because of the way it is used for holding "before" lines, the long- but because of the way it is used for holding "before" lines, the long-
est line that is guaranteed to be processable is the parameter size. est line that is guaranteed to be processable is the parameter size.
You can change the default parameter value by adding, for example, You can change the default parameter value by adding, for example,
--with-pcre2grep-bufsize=50K --with-pcre2grep-bufsize=50K
to the configure command. The caller of pcre2grep can override this to the configure command. The caller of pcre2grep can override this
value by using --buffer-size on the command line. value by using --buffer-size on the command line.
@ -3247,26 +3322,26 @@ PCRE2TEST OPTION FOR LIBREADLINE SUPPORT
--enable-pcre2test-libreadline --enable-pcre2test-libreadline
--enable-pcre2test-libedit --enable-pcre2test-libedit
to the configure command, pcre2test is linked with the libreadline to the configure command, pcre2test is linked with the libreadline
or libedit library, respectively, and when its input is from a terminal, or libedit library, respectively, and when its input is from a terminal,
it reads it using the readline() function. This provides line-editing it reads it using the readline() function. This provides line-editing
and history facilities. Note that libreadline is GPL-licensed, so if and history facilities. Note that libreadline is GPL-licensed, so if
you distribute a binary of pcre2test linked in this way, there may be you distribute a binary of pcre2test linked in this way, there may be
licensing issues. These can be avoided by linking instead with libedit, licensing issues. These can be avoided by linking instead with libedit,
which has a BSD licence. which has a BSD licence.
Setting --enable-pcre2test-libreadline causes the -lreadline option to Setting --enable-pcre2test-libreadline causes the -lreadline option to
be added to the pcre2test build. In many operating environments with a be added to the pcre2test build. In many operating environments with a
system-installed readline library this is sufficient. However, in some system-installed readline library this is sufficient. However, in some
environments (e.g. if an unmodified distribution version of readline is environments (e.g. if an unmodified distribution version of readline is
in use), some extra configuration may be necessary. The INSTALL file in use), some extra configuration may be necessary. The INSTALL file
for libreadline says this: for libreadline says this:
"Readline uses the termcap functions, but does not link with "Readline uses the termcap functions, but does not link with
the termcap or curses library itself, allowing applications the termcap or curses library itself, allowing applications
which link with readline the to choose an appropriate library." which link with readline the to choose an appropriate library."
If your environment has not been set up so that an appropriate library If your environment has not been set up so that an appropriate library
is automatically included, you may need to add something like is automatically included, you may need to add something like
LIBS="-lncurses" LIBS="-lncurses"
@ -3280,7 +3355,7 @@ INCLUDING DEBUGGING CODE
--enable-debug --enable-debug
to the configure command, additional debugging code is included in the to the configure command, additional debugging code is included in the
build. This feature is intended for use by the PCRE2 maintainers. build. This feature is intended for use by the PCRE2 maintainers.
@ -3290,15 +3365,15 @@ DEBUGGING WITH VALGRIND SUPPORT
--enable-valgrind --enable-valgrind
to the configure command, PCRE2 will use valgrind annotations to mark to the configure command, PCRE2 will use valgrind annotations to mark
certain memory regions as unaddressable. This allows it to detect certain memory regions as unaddressable. This allows it to detect
invalid memory accesses, and is mostly useful for debugging PCRE2 invalid memory accesses, and is mostly useful for debugging PCRE2
itself. itself.
CODE COVERAGE REPORTING CODE COVERAGE REPORTING
If your C compiler is gcc, you can build a version of PCRE2 that can If your C compiler is gcc, you can build a version of PCRE2 that can
generate a code coverage report for its test suite. To enable this, you generate a code coverage report for its test suite. To enable this, you
must install lcov version 1.6 or above. Then specify must install lcov version 1.6 or above. Then specify
@ -3307,20 +3382,20 @@ CODE COVERAGE REPORTING
to the configure command and build PCRE2 in the usual way. to the configure command and build PCRE2 in the usual way.
Note that using ccache (a caching C compiler) is incompatible with code Note that using ccache (a caching C compiler) is incompatible with code
coverage reporting. If you have configured ccache to run automatically coverage reporting. If you have configured ccache to run automatically
on your system, you must set the environment variable on your system, you must set the environment variable
CCACHE_DISABLE=1 CCACHE_DISABLE=1
before running make to build PCRE2, so that ccache is not used. before running make to build PCRE2, so that ccache is not used.
When --enable-coverage is used, the following additional targets are When --enable-coverage is used, the following additional targets are
added to the Makefile: added to the Makefile:
make coverage make coverage
This creates a fresh coverage report for the PCRE2 test suite. It is This creates a fresh coverage report for the PCRE2 test suite. It is
equivalent to running "make coverage-reset", "make coverage-baseline", equivalent to running "make coverage-reset", "make coverage-baseline",
"make check", and then "make coverage-report". "make check", and then "make coverage-report".
make coverage-reset make coverage-reset
@ -3337,18 +3412,18 @@ CODE COVERAGE REPORTING
make coverage-clean-report make coverage-clean-report
This removes the generated coverage report without cleaning the cover- This removes the generated coverage report without cleaning the cover-
age data itself. age data itself.
make coverage-clean-data make coverage-clean-data
This removes the captured coverage data without removing the coverage This removes the captured coverage data without removing the coverage
files created at compile time (*.gcno). files created at compile time (*.gcno).
make coverage-clean make coverage-clean
This cleans all coverage data including the generated coverage report. This cleans all coverage data including the generated coverage report.
For more information about code coverage, see the gcov and lcov docu- For more information about code coverage, see the gcov and lcov docu-
mentation. mentation.
@ -3366,7 +3441,7 @@ AUTHOR
REVISION REVISION
Last updated: 24 April 2015 Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
@ -6028,12 +6103,18 @@ MATCHING A SINGLE CODE UNIT
results, because PCRE2 assumes that it is matching character by charac- results, because PCRE2 assumes that it is matching character by charac-
ter in a valid UTF string (by default it checks the subject string's ter in a valid UTF string (by default it checks the subject string's
validity at the start of processing unless the PCRE2_NO_UTF_CHECK validity at the start of processing unless the PCRE2_NO_UTF_CHECK
option is used). An application can lock out the use of \C by setting option is used).
the PCRE2_NEVER_BACKSLASH_C option.
PCRE2 does not allow \C to appear in lookbehind assertions (described An application can lock out the use of \C by setting the
below) in a UTF mode, because this would make it impossible to calcu- PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also
late the length of the lookbehind. possible to build PCRE2 with the use of \C permanently disabled.
PCRE2 does not allow \C to appear in lookbehind assertions (described
below) in a UTF mode, because this would make it impossible to calcu-
late the length of the lookbehind. Neither the alternative matching
function pcre2_dfa_match() nor the JIT optimizer support \C in a UTF
mode. The former gives a match-time error; the latter fails to optimize
and so the match is always run using the interpreter.
In general, the \C escape sequence is best avoided. However, one way of In general, the \C escape sequence is best avoided. However, one way of
using it that avoids the problem of malformed UTF characters is to use using it that avoids the problem of malformed UTF characters is to use
@ -8036,7 +8117,7 @@ AUTHOR
REVISION REVISION
Last updated: 01 September 2015 Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
@ -8966,10 +9047,10 @@ CHARACTER TYPES
\W a "non-word" character \W a "non-word" character
\X a Unicode extended grapheme cluster \X a Unicode extended grapheme cluster
The application can lock out the use of \C by setting the \C is dangerous because it may leave the current matching point in the
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave middle of a UTF-8 or UTF-16 character. The application can lock out the
the current matching point in the middle of a UTF-8 or UTF-16 charac- use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also
ter. possible to build PCRE2 with the use of \C permanently disabled.
By default, \d, \s, and \w match only ASCII characters, even in UTF-8 By default, \d, \s, and \w match only ASCII characters, even in UTF-8
mode or in the 16-bit and 32-bit libraries. However, if locale-specific mode or in the 16-bit and 32-bit libraries. However, if locale-specific
@ -9325,7 +9406,7 @@ AUTHOR
REVISION REVISION
Last updated: 17 July 2015 Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
@ -9384,89 +9465,90 @@ WIDE CHARACTERS AND UTF MODES
The escape sequence \C can be used to match a single code unit, in a The escape sequence \C can be used to match a single code unit, in a
UTF mode, but its use can lead to some strange effects because it UTF mode, but its use can lead to some strange effects because it
breaks up multi-unit characters (see the description of \C in the breaks up multi-unit characters (see the description of \C in the
pcre2pattern documentation). The use of \C is not supported in the pcre2pattern documentation). The use of \C is not supported by the
alternative matching function pcre2_dfa_match(), nor is it supported in alternative matching function pcre2_dfa_match() when in UTF mode. Its
UTF mode by the JIT optimization. If JIT optimization is requested for use provokes a match-time error. The JIT optimization also does not
a UTF pattern that contains \C, it will not succeed, and so the match- support \C in UTF mode. If JIT optimization is requested for a UTF
ing will be carried out by the normal interpretive function. pattern that contains \C, it will not succeed, and so the matching will
be carried out by the normal interpretive function.
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
characters of any code value, but, by default, the characters that characters of any code value, but, by default, the characters that
PCRE2 recognizes as digits, spaces, or word characters remain the same PCRE2 recognizes as digits, spaces, or word characters remain the same
set as in non-UTF mode, all with code points less than 256. This set as in non-UTF mode, all with code points less than 256. This
remains true even when PCRE2 is built to include Unicode support, remains true even when PCRE2 is built to include Unicode support,
because to do otherwise would slow down matching in many common cases. because to do otherwise would slow down matching in many common cases.
Note that this also applies to \b and \B, because they are defined in Note that this also applies to \b and \B, because they are defined in
terms of \w and \W. If you want to test for a wider sense of, say, terms of \w and \W. If you want to test for a wider sense of, say,
"digit", you can use explicit Unicode property tests such as \p{Nd}. "digit", you can use explicit Unicode property tests such as \p{Nd}.
Alternatively, if you set the PCRE2_UCP option, the way that the char- Alternatively, if you set the PCRE2_UCP option, the way that the char-
acter escapes work is changed so that Unicode properties are used to acter escapes work is changed so that Unicode properties are used to
determine which characters match. There are more details in the section determine which characters match. There are more details in the section
on generic character types in the pcre2pattern documentation. on generic character types in the pcre2pattern documentation.
Similarly, characters that match the POSIX named character classes are Similarly, characters that match the POSIX named character classes are
all low-valued characters, unless the PCRE2_UCP option is set. all low-valued characters, unless the PCRE2_UCP option is set.
However, the special horizontal and vertical white space matching However, the special horizontal and vertical white space matching
escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char- escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char-
acters, whether or not PCRE2_UCP is set. acters, whether or not PCRE2_UCP is set.
Case-insensitive matching in UTF mode makes use of Unicode properties. Case-insensitive matching in UTF mode makes use of Unicode properties.
A few Unicode characters such as Greek sigma have more than two code- A few Unicode characters such as Greek sigma have more than two code-
points that are case-equivalent, and these are treated as such. points that are case-equivalent, and these are treated as such.
VALIDITY OF UTF STRINGS VALIDITY OF UTF STRINGS
When the PCRE2_UTF option is set, the strings passed as patterns and When the PCRE2_UTF option is set, the strings passed as patterns and
subjects are (by default) checked for validity on entry to the relevant subjects are (by default) checked for validity on entry to the relevant
functions. If an invalid UTF string is passed, a negative error code functions. If an invalid UTF string is passed, a negative error code
is returned. The code unit offset to the offending character can be is returned. The code unit offset to the offending character can be
extracted from the match data block by calling pcre2_get_startchar(), extracted from the match data block by calling pcre2_get_startchar(),
which is used for this purpose after a UTF error. which is used for this purpose after a UTF error.
UTF-16 and UTF-32 strings can indicate their endianness by special code UTF-16 and UTF-32 strings can indicate their endianness by special code
known as a byte-order mark (BOM). The PCRE2 functions do not handle known as a byte-order mark (BOM). The PCRE2 functions do not handle
this, expecting strings to be in host byte order. this, expecting strings to be in host byte order.
A UTF string is checked before any other processing takes place. In the A UTF string is checked before any other processing takes place. In the
case of pcre2_match() and pcre2_dfa_match() calls with a non-zero case of pcre2_match() and pcre2_dfa_match() calls with a non-zero
starting offset, the check is applied only to that part of the subject starting offset, the check is applied only to that part of the subject
that could be inspected during matching, and there is a check that the that could be inspected during matching, and there is a check that the
starting offset points to the first code unit of a character or to the starting offset points to the first code unit of a character or to the
end of the subject. If there are no lookbehind assertions in the pat- end of the subject. If there are no lookbehind assertions in the pat-
tern, the check starts at the starting offset. Otherwise, it starts at tern, the check starts at the starting offset. Otherwise, it starts at
the length of the longest lookbehind before the starting offset, or at the length of the longest lookbehind before the starting offset, or at
the start of the subject if there are not that many characters before the start of the subject if there are not that many characters before
the starting offset. Note that the sequences \b and \B are one-charac- the starting offset. Note that the sequences \b and \B are one-charac-
ter lookbehinds. ter lookbehinds.
In addition to checking the format of the string, there is a check to In addition to checking the format of the string, there is a check to
ensure that all code points lie in the range U+0 to U+10FFFF, excluding ensure that all code points lie in the range U+0 to U+10FFFF, excluding
the surrogate area. The so-called "non-character" code points are not the surrogate area. The so-called "non-character" code points are not
excluded because Unicode corrigendum #9 makes it clear that they should excluded because Unicode corrigendum #9 makes it clear that they should
not be. not be.
Characters in the "Surrogate Area" of Unicode are reserved for use by Characters in the "Surrogate Area" of Unicode are reserved for use by
UTF-16, where they are used in pairs to encode code points with values UTF-16, where they are used in pairs to encode code points with values
greater than 0xFFFF. The code points that are encoded by UTF-16 pairs greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
are available independently in the UTF-8 and UTF-32 encodings. (In are available independently in the UTF-8 and UTF-32 encodings. (In
other words, the whole surrogate thing is a fudge for UTF-16 which other words, the whole surrogate thing is a fudge for UTF-16 which
unfortunately messes up UTF-8 and UTF-32.) unfortunately messes up UTF-8 and UTF-32.)
In some situations, you may already know that your strings are valid, In some situations, you may already know that your strings are valid,
and therefore want to skip these checks in order to improve perfor- and therefore want to skip these checks in order to improve perfor-
mance, for example in the case of a long subject string that is being mance, for example in the case of a long subject string that is being
scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com- scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com-
pile time or at match time, PCRE2 assumes that the pattern or subject pile time or at match time, PCRE2 assumes that the pattern or subject
it is given (respectively) contains only valid UTF code unit sequences. it is given (respectively) contains only valid UTF code unit sequences.
Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check
for the pattern; it does not also apply to subject strings. If you want for the pattern; it does not also apply to subject strings. If you want
to disable the check for a subject string you must pass this option to to disable the check for a subject string you must pass this option to
pcre2_match() or pcre2_dfa_match(). pcre2_match() or pcre2_dfa_match().
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
result is undefined and your program may crash or loop indefinitely. result is undefined and your program may crash or loop indefinitely.
Errors in UTF-8 strings Errors in UTF-8 strings
@ -9479,10 +9561,10 @@ VALIDITY OF UTF STRINGS
PCRE2_ERROR_UTF8_ERR4 PCRE2_ERROR_UTF8_ERR4
PCRE2_ERROR_UTF8_ERR5 PCRE2_ERROR_UTF8_ERR5
The string ends with a truncated UTF-8 character; the code specifies The string ends with a truncated UTF-8 character; the code specifies
how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
characters to be no longer than 4 bytes, the encoding scheme (origi- characters to be no longer than 4 bytes, the encoding scheme (origi-
nally defined by RFC 2279) allows for up to 6 bytes, and this is nally defined by RFC 2279) allows for up to 6 bytes, and this is
checked first; hence the possibility of 4 or 5 missing bytes. checked first; hence the possibility of 4 or 5 missing bytes.
PCRE2_ERROR_UTF8_ERR6 PCRE2_ERROR_UTF8_ERR6
@ -9492,24 +9574,24 @@ VALIDITY OF UTF STRINGS
PCRE2_ERROR_UTF8_ERR10 PCRE2_ERROR_UTF8_ERR10
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of
the character do not have the binary value 0b10 (that is, either the the character do not have the binary value 0b10 (that is, either the
most significant bit is 0, or the next bit is 1). most significant bit is 0, or the next bit is 1).
PCRE2_ERROR_UTF8_ERR11 PCRE2_ERROR_UTF8_ERR11
PCRE2_ERROR_UTF8_ERR12 PCRE2_ERROR_UTF8_ERR12
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
long; these code points are excluded by RFC 3629. long; these code points are excluded by RFC 3629.
PCRE2_ERROR_UTF8_ERR13 PCRE2_ERROR_UTF8_ERR13
A 4-byte character has a value greater than 0x10ffff; these code points A 4-byte character has a value greater than 0x10ffff; these code points
are excluded by RFC 3629. are excluded by RFC 3629.
PCRE2_ERROR_UTF8_ERR14 PCRE2_ERROR_UTF8_ERR14
A 3-byte character has a value in the range 0xd800 to 0xdfff; this A 3-byte character has a value in the range 0xd800 to 0xdfff; this
range of code points is reserved by RFC 3629 for use with UTF-16, and range of code points is reserved by RFC 3629 for use with UTF-16, and
so are excluded from UTF-8. so are excluded from UTF-8.
PCRE2_ERROR_UTF8_ERR15 PCRE2_ERROR_UTF8_ERR15
@ -9518,26 +9600,26 @@ VALIDITY OF UTF STRINGS
PCRE2_ERROR_UTF8_ERR18 PCRE2_ERROR_UTF8_ERR18
PCRE2_ERROR_UTF8_ERR19 PCRE2_ERROR_UTF8_ERR19
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
for a value that can be represented by fewer bytes, which is invalid. for a value that can be represented by fewer bytes, which is invalid.
For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor- For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
rect coding uses just one byte. rect coding uses just one byte.
PCRE2_ERROR_UTF8_ERR20 PCRE2_ERROR_UTF8_ERR20
The two most significant bits of the first byte of a character have the The two most significant bits of the first byte of a character have the
binary value 0b10 (that is, the most significant bit is 1 and the sec- binary value 0b10 (that is, the most significant bit is 1 and the sec-
ond is 0). Such a byte can only validly occur as the second or subse- ond is 0). Such a byte can only validly occur as the second or subse-
quent byte of a multi-byte character. quent byte of a multi-byte character.
PCRE2_ERROR_UTF8_ERR21 PCRE2_ERROR_UTF8_ERR21
The first byte of a character has the value 0xfe or 0xff. These values The first byte of a character has the value 0xfe or 0xff. These values
can never occur in a valid UTF-8 string. can never occur in a valid UTF-8 string.
Errors in UTF-16 strings Errors in UTF-16 strings
The following negative error codes are given for invalid UTF-16 The following negative error codes are given for invalid UTF-16
strings: strings:
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
@ -9547,7 +9629,7 @@ VALIDITY OF UTF STRINGS
Errors in UTF-32 strings Errors in UTF-32 strings
The following negative error codes are given for invalid UTF-32 The following negative error codes are given for invalid UTF-32
strings: strings:
PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff) PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
@ -9563,7 +9645,7 @@ AUTHOR
REVISION REVISION
Last updated: 18 August 2015 Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------

View File

@ -1,4 +1,4 @@
.TH PCRE2API 3 "07 October 2015" "PCRE2 10.21" .TH PCRE2API 3 "16 October 2015" "PCRE2 10.21"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
.sp .sp
@ -1209,7 +1209,8 @@ This option locks out the use of \eC in the pattern that is being compiled.
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
it may leave the current matching point in the middle of a multi-code-unit it may leave the current matching point in the middle of a multi-code-unit
character. This option may be useful in applications that process patterns from character. This option may be useful in applications that process patterns from
external sources. external sources. Note that there is also a build-time option that permanently
locks out the use of \eC.
.sp .sp
PCRE2_NEVER_UCP PCRE2_NEVER_UCP
.sp .sp
@ -3014,8 +3015,8 @@ There are in addition the following errors that are specific to
PCRE2_ERROR_DFA_UITEM PCRE2_ERROR_DFA_UITEM
.sp .sp
This return is given if \fBpcre2_dfa_match()\fP encounters an item in the This return is given if \fBpcre2_dfa_match()\fP encounters an item in the
pattern that it does not support, for instance, the use of \eC or a back pattern that it does not support, for instance, the use of \eC in a UTF mode or
reference. a back reference.
.sp .sp
PCRE2_ERROR_DFA_UCOND PCRE2_ERROR_DFA_UCOND
.sp .sp
@ -3065,6 +3066,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 07 October 2015 Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
.fi .fi

View File

@ -1,4 +1,4 @@
.TH PCRE2BUILD 3 "23 April 2015" "PCRE2 10.20" .TH PCRE2BUILD 3 "16 October 2015" "PCRE2 10.21"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
. .
@ -132,11 +132,20 @@ Pattern escapes such as \ed and \ew do not by default make use of Unicode
properties. The application can request that they do by setting the PCRE2_UCP properties. The application can request that they do by setting the PCRE2_UCP
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
request this by starting with (*UCP). request this by starting with (*UCP).
.P .
.
.SH "DISABLING THE USE OF \eC"
.rs
.sp
The \eC escape sequence, which matches a single code unit, even in a UTF mode, The \eC escape sequence, which matches a single code unit, even in a UTF mode,
can cause unpredictable behaviour because it may leave the current matching can cause unpredictable behaviour because it may leave the current matching
point in the middle of a multi-code-unit character. It can be locked out by point in the middle of a multi-code-unit character. The application can lock it
setting the PCRE2_NEVER_BACKSLASH_C option. out by setting the PCRE2_NEVER_BACKSLASH_C option when calling
\fBpcre2_compile()\fP. There is also a build-time option
.sp
--enable-never-backslash-C
.sp
(note the upper case C) which locks out the use of \eC entirely.
. .
. .
.SH "JUST-IN-TIME COMPILER SUPPORT" .SH "JUST-IN-TIME COMPILER SUPPORT"
@ -510,6 +519,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 24 April 2015 Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
.fi .fi

View File

@ -1,4 +1,4 @@
.TH PCRE2PATTERN 3 "01 September 2015" "PCRE2 10.21" .TH PCRE2PATTERN 3 "16 October 2015" "PCRE2 10.21"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION DETAILS" .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -1233,8 +1233,11 @@ with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start
with a malformed UTF character. This has undefined results, because PCRE2 with a malformed UTF character. This has undefined results, because PCRE2
assumes that it is matching character by character in a valid UTF string (by assumes that it is matching character by character in a valid UTF string (by
default it checks the subject string's validity at the start of processing default it checks the subject string's validity at the start of processing
unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the unless the PCRE2_NO_UTF_CHECK option is used).
use of \eC by setting the PCRE2_NEVER_BACKSLASH_C option. .P
An application can lock out the use of \eC by setting the
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
build PCRE2 with the use of \eC permanently disabled.
.P .P
PCRE2 does not allow \eC to appear in lookbehind assertions PCRE2 does not allow \eC to appear in lookbehind assertions
.\" HTML <a href="#lookbehind"> .\" HTML <a href="#lookbehind">
@ -1242,7 +1245,10 @@ PCRE2 does not allow \eC to appear in lookbehind assertions
(described below) (described below)
.\" .\"
in a UTF mode, because this would make it impossible to calculate the length of in a UTF mode, because this would make it impossible to calculate the length of
the lookbehind. the lookbehind. Neither the alternative matching function
\fBpcre2_dfa_match()\fP nor the JIT optimizer support \eC in a UTF mode. The
former gives a match-time error; the latter fails to optimize and so the match
is always run using the interpreter.
.P .P
In general, the \eC escape sequence is best avoided. However, one way of using In general, the \eC escape sequence is best avoided. However, one way of using
it that avoids the problem of malformed UTF characters is to use a lookahead to it that avoids the problem of malformed UTF characters is to use a lookahead to
@ -3386,6 +3392,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 01 September 2015 Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
.fi .fi

View File

@ -1,4 +1,4 @@
.TH PCRE2SYNTAX 3 "17 July 2015" "PCRE2 10.21" .TH PCRE2SYNTAX 3 "16 October 2015" "PCRE2 10.21"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY" .SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
@ -81,9 +81,10 @@ it matches a literal "u".
\eW a "non-word" character \eW a "non-word" character
\eX a Unicode extended grapheme cluster \eX a Unicode extended grapheme cluster
.sp .sp
The application can lock out the use of \eC by setting the \eC is dangerous because it may leave the current matching point in the middle
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave the of a UTF-8 or UTF-16 character. The application can lock out the use of \eC by
current matching point in the middle of a UTF-8 or UTF-16 character. setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
with the use of \eC permanently disabled.
.P .P
By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode
or in the 16-bit and 32-bit libraries. However, if locale-specific matching is or in the 16-bit and 32-bit libraries. However, if locale-specific matching is
@ -576,6 +577,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 17 July 2015 Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
.fi .fi

View File

@ -1,4 +1,4 @@
.TH PCRE2TEST 1 "23 September 2015" "PCRE 10.21" .TH PCRE2TEST 1 "17 October 2015" "PCRE 10.21"
.SH NAME .SH NAME
pcre2test - a program for testing Perl-compatible regular expressions. pcre2test - a program for testing Perl-compatible regular expressions.
.SH SYNOPSIS .SH SYNOPSIS
@ -122,12 +122,13 @@ following options output the value and set the exit code as indicated:
The following options output 1 for true or 0 for false, and set the exit code The following options output 1 for true or 0 for false, and set the exit code
to the same value: to the same value:
.sp .sp
ebcdic compiled for an EBCDIC environment backslash-C \eC is supported (not locked out)
jit just-in-time support is available ebcdic compiled for an EBCDIC environment
pcre2-16 the 16-bit library was built jit just-in-time support is available
pcre2-32 the 32-bit library was built pcre2-16 the 16-bit library was built
pcre2-8 the 8-bit library was built pcre2-32 the 32-bit library was built
unicode Unicode support is available pcre2-8 the 8-bit library was built
unicode Unicode support is available
.sp .sp
If an unknown option is given, an error message is output; the exit code is 0. If an unknown option is given, an error message is output; the exit code is 0.
.TP 10 .TP 10
@ -1559,6 +1560,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 23 September 2015 Last updated: 17 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
.fi .fi

View File

@ -119,12 +119,13 @@ COMMAND LINE OPTIONS
The following options output 1 for true or 0 for false, and The following options output 1 for true or 0 for false, and
set the exit code to the same value: set the exit code to the same value:
ebcdic compiled for an EBCDIC environment backslash-C \C is supported (not locked out)
jit just-in-time support is available ebcdic compiled for an EBCDIC environment
pcre2-16 the 16-bit library was built jit just-in-time support is available
pcre2-32 the 32-bit library was built pcre2-16 the 16-bit library was built
pcre2-8 the 8-bit library was built pcre2-32 the 32-bit library was built
unicode Unicode support is available pcre2-8 the 8-bit library was built
unicode Unicode support is available
If an unknown option is given, an error message is output; If an unknown option is given, an error message is output;
the exit code is 0. the exit code is 0.
@ -457,7 +458,7 @@ PATTERN MODIFIERS
Setting compilation options Setting compilation options
The following modifiers set options for pcre2_compile(). The most com- The following modifiers set options for pcre2_compile(). The most com-
mon ones have single-letter abbreviations. See pcreapi for a descrip- mon ones have single-letter abbreviations. See pcre2api for a descrip-
tion of their effects. tion of their effects.
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
@ -484,6 +485,7 @@ PATTERN MODIFIERS
no_utf_check set PCRE2_NO_UTF_CHECK no_utf_check set PCRE2_NO_UTF_CHECK
ucp set PCRE2_UCP ucp set PCRE2_UCP
ungreedy set PCRE2_UNGREEDY ungreedy set PCRE2_UNGREEDY
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
utf set PCRE2_UTF utf set PCRE2_UTF
As well as turning on the PCRE2_UTF option, the utf modifier causes all As well as turning on the PCRE2_UTF option, the utf modifier causes all
@ -509,6 +511,7 @@ PATTERN MODIFIERS
locale=<name> use this locale locale=<name> use this locale
memory show memory used memory show memory used
newline=<type> set newline type newline=<type> set newline type
null_context compile with a NULL context
parens_nest_limit=<n> set maximum parentheses depth parens_nest_limit=<n> set maximum parentheses depth
posix use the POSIX API posix use the POSIX API
push push compiled pattern onto the stack push push compiled pattern onto the stack
@ -579,35 +582,42 @@ PATTERN MODIFIERS
mation that is requested. For each callout, either its number or string mation that is requested. For each callout, either its number or string
is given, followed by the item that follows it in the pattern. is given, followed by the item that follows it in the pattern.
Passing a NULL context
Normally, pcre2test passes a context block to pcre2_compile(). If the
null_context modifier is set, however, NULL is passed. This is for
testing that pcre2_compile() behaves correctly in this case (it uses
default values).
Specifying a pattern in hex Specifying a pattern in hex
The hex modifier specifies that the characters of the pattern are to be The hex modifier specifies that the characters of the pattern are to be
interpreted as pairs of hexadecimal digits. White space is permitted interpreted as pairs of hexadecimal digits. White space is permitted
between pairs. For example: between pairs. For example:
/ab 32 59/hex /ab 32 59/hex
This feature is provided as a way of creating patterns that contain This feature is provided as a way of creating patterns that contain
binary zero and other non-printing characters. By default, pcre2test binary zero and other non-printing characters. By default, pcre2test
passes patterns as zero-terminated strings to pcre2_compile(), giving passes patterns as zero-terminated strings to pcre2_compile(), giving
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
hexadecimal, the actual length of the pattern is passed. hexadecimal, the actual length of the pattern is passed.
JIT compilation JIT compilation
Just-in-time (JIT) compiling is a heavyweight optimization that can Just-in-time (JIT) compiling is a heavyweight optimization that can
greatly speed up pattern matching. See the pcre2jit documentation for greatly speed up pattern matching. See the pcre2jit documentation for
details. JIT compiling happens, optionally, after a pattern has been details. JIT compiling happens, optionally, after a pattern has been
successfully compiled into an internal form. The JIT compiler converts successfully compiled into an internal form. The JIT compiler converts
this to optimized machine code. It needs to know whether the match-time this to optimized machine code. It needs to know whether the match-time
options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used, options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used,
because different code is generated for the different cases. See the because different code is generated for the different cases. See the
partial modifier in "Subject Modifiers" below for details of how these partial modifier in "Subject Modifiers" below for details of how these
options are specified for each match attempt. options are specified for each match attempt.
JIT compilation is requested by the /jit pattern modifier, which may JIT compilation is requested by the /jit pattern modifier, which may
optionally be followed by an equals sign and a number in the range 0 to optionally be followed by an equals sign and a number in the range 0 to
7. The three bits that make up the number specify which of the three 7. The three bits that make up the number specify which of the three
JIT operating modes are to be compiled: JIT operating modes are to be compiled:
1 compile JIT code for non-partial matching 1 compile JIT code for non-partial matching
@ -624,31 +634,31 @@ PATTERN MODIFIERS
6 soft and hard partial matching only 6 soft and hard partial matching only
7 all three modes 7 all three modes
If no number is given, 7 is assumed. The phrase "partial matching" If no number is given, 7 is assumed. The phrase "partial matching"
means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the
PCRE2_PARTIAL_HARD option set. Note that such a call may return a com- PCRE2_PARTIAL_HARD option set. Note that such a call may return a com-
plete match; the options enable the possibility of a partial match, but plete match; the options enable the possibility of a partial match, but
do not require it. Note also that if you request JIT compilation only do not require it. Note also that if you request JIT compilation only
for partial matching (for example, /jit=2) but do not set the partial for partial matching (for example, /jit=2) but do not set the partial
modifier on a subject line, that match will not use JIT code because modifier on a subject line, that match will not use JIT code because
none was compiled for non-partial matching. none was compiled for non-partial matching.
If JIT compilation is successful, the compiled JIT code will automati- If JIT compilation is successful, the compiled JIT code will automati-
cally be used when an appropriate type of match is run, except when cally be used when an appropriate type of match is run, except when
incompatible run-time options are specified. For more details, see the incompatible run-time options are specified. For more details, see the
pcre2jit documentation. See also the jitstack modifier below for a way pcre2jit documentation. See also the jitstack modifier below for a way
of setting the size of the JIT stack. of setting the size of the JIT stack.
If the jitfast modifier is specified, matching is done using the JIT If the jitfast modifier is specified, matching is done using the JIT
"fast path" interface, pcre2_jit_match(), which skips some of the san- "fast path" interface, pcre2_jit_match(), which skips some of the san-
ity checks that are done by pcre2_match(), and of course does not work ity checks that are done by pcre2_match(), and of course does not work
when JIT is not supported. If jitfast is specified without jit, jit=7 when JIT is not supported. If jitfast is specified without jit, jit=7
is assumed. is assumed.
If the jitverify modifier is specified, information about the compiled If the jitverify modifier is specified, information about the compiled
pattern shows whether JIT compilation was or was not successful. If pattern shows whether JIT compilation was or was not successful. If
jitverify is specified without jit, jit=7 is assumed. If JIT compila- jitverify is specified without jit, jit=7 is assumed. If JIT compila-
tion is successful when jitverify is set, the text "(JIT)" is added to tion is successful when jitverify is set, the text "(JIT)" is added to
the first output line after a match or non match when JIT-compiled code the first output line after a match or non match when JIT-compiled code
was actually used in the match. was actually used in the match.
@ -659,18 +669,18 @@ PATTERN MODIFIERS
/pattern/locale=fr_FR /pattern/locale=fr_FR
The given locale is set, pcre2_maketables() is called to build a set of The given locale is set, pcre2_maketables() is called to build a set of
character tables for the locale, and this is then passed to pcre2_com- character tables for the locale, and this is then passed to pcre2_com-
pile() when compiling the regular expression. The same tables are used pile() when compiling the regular expression. The same tables are used
when matching the following subject lines. The /locale modifier applies when matching the following subject lines. The /locale modifier applies
only to the pattern on which it appears, but can be given in a #pattern only to the pattern on which it appears, but can be given in a #pattern
command if a default is needed. Setting a locale and alternate charac- command if a default is needed. Setting a locale and alternate charac-
ter tables are mutually exclusive. ter tables are mutually exclusive.
Showing pattern memory Showing pattern memory
The /memory modifier causes the size in bytes of the memory used to The /memory modifier causes the size in bytes of the memory used to
hold the compiled pattern to be output. This does not include the size hold the compiled pattern to be output. This does not include the size
of the pcre2_code block; it is just the actual compiled data. If the of the pcre2_code block; it is just the actual compiled data. If the
pattern is subsequently passed to the JIT compiler, the size of the JIT pattern is subsequently passed to the JIT compiler, the size of the JIT
compiled code is also output. Here is an example: compiled code is also output. Here is an example:
@ -681,19 +691,19 @@ PATTERN MODIFIERS
Limiting nested parentheses Limiting nested parentheses
The parens_nest_limit modifier sets a limit on the depth of nested The parens_nest_limit modifier sets a limit on the depth of nested
parentheses in a pattern. Breaching the limit causes a compilation parentheses in a pattern. Breaching the limit causes a compilation
error. The default for the library is set when PCRE2 is built, but error. The default for the library is set when PCRE2 is built, but
pcre2test sets its own default of 220, which is required for running pcre2test sets its own default of 220, which is required for running
the standard test suite. the standard test suite.
Using the POSIX wrapper API Using the POSIX wrapper API
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap- The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
per API rather than its native API. This supports only the 8-bit per API rather than its native API. This supports only the 8-bit
library. Note that it does not imply POSIX matching semantics; for library. Note that it does not imply POSIX matching semantics; for
more detail see the pcre2posix documentation. When the POSIX API is more detail see the pcre2posix documentation. When the POSIX API is
being used, the following pattern modifiers set options for the reg- being used, the following pattern modifiers set options for the reg-
comp() function: comp() function:
caseless REG_ICASE caseless REG_ICASE
@ -704,24 +714,24 @@ PATTERN MODIFIERS
ucp REG_UCP ) the POSIX standard ucp REG_UCP ) the POSIX standard
utf REG_UTF8 ) utf REG_UTF8 )
The aftertext and allaftertext subject modifiers work as described The aftertext and allaftertext subject modifiers work as described
below. All other modifiers cause an error. below. All other modifiers cause an error.
Testing the stack guard feature Testing the stack guard feature
The /stackguard modifier is used to test the use of pcre2_set_com- The /stackguard modifier is used to test the use of pcre2_set_com-
pile_recursion_guard(), a function that is provided to enable stack pile_recursion_guard(), a function that is provided to enable stack
availability to be checked during compilation (see the pcre2api docu- availability to be checked during compilation (see the pcre2api docu-
mentation for details). If the number specified by the modifier is mentation for details). If the number specified by the modifier is
greater than zero, pcre2_set_compile_recursion_guard() is called to set greater than zero, pcre2_set_compile_recursion_guard() is called to set
up a callback from pcre2_compile() to a local function. The argument it up a callback from pcre2_compile() to a local function. The argument it
receives is the current nesting parenthesis depth; if this is greater receives is the current nesting parenthesis depth; if this is greater
than the value given by the modifier, non-zero is returned, causing the than the value given by the modifier, non-zero is returned, causing the
compilation to be aborted. compilation to be aborted.
Using alternative character tables Using alternative character tables
The value specified for the /tables modifier must be one of the digits The value specified for the /tables modifier must be one of the digits
0, 1, or 2. It causes a specific set of built-in character tables to be 0, 1, or 2. It causes a specific set of built-in character tables to be
passed to pcre2_compile(). This is used in the PCRE2 tests to check be- passed to pcre2_compile(). This is used in the PCRE2 tests to check be-
haviour with different character tables. The digit specifies the tables haviour with different character tables. The digit specifies the tables
@ -732,15 +742,15 @@ PATTERN MODIFIERS
pcre2_chartables.c.dist pcre2_chartables.c.dist
2 a set of tables defining ISO 8859 characters 2 a set of tables defining ISO 8859 characters
In table 2, some characters whose codes are greater than 128 are iden- In table 2, some characters whose codes are greater than 128 are iden-
tified as letters, digits, spaces, etc. Setting alternate character tified as letters, digits, spaces, etc. Setting alternate character
tables and a locale are mutually exclusive. tables and a locale are mutually exclusive.
Setting certain match controls Setting certain match controls
The following modifiers are really subject modifiers, and are described The following modifiers are really subject modifiers, and are described
below. However, they may be included in a pattern's modifier list, in below. However, they may be included in a pattern's modifier list, in
which case they are applied to every subject line that is processed which case they are applied to every subject line that is processed
with that pattern. They do not affect the compilation process. with that pattern. They do not affect the compilation process.
aftertext show text after match aftertext show text after match
@ -752,20 +762,20 @@ PATTERN MODIFIERS
replace=<string> specify a replacement string replace=<string> specify a replacement string
startchar show starting character when relevant startchar show starting character when relevant
These modifiers may not appear in a #pattern command. If you want them These modifiers may not appear in a #pattern command. If you want them
as defaults, set them in a #subject command. as defaults, set them in a #subject command.
Saving a compiled pattern Saving a compiled pattern
When a pattern with the push modifier is successfully compiled, it is When a pattern with the push modifier is successfully compiled, it is
pushed onto a stack of compiled patterns, and pcre2test expects the pushed onto a stack of compiled patterns, and pcre2test expects the
next line to contain a new pattern (or a command) instead of a subject next line to contain a new pattern (or a command) instead of a subject
line. This facility is used when saving compiled patterns to a file, as line. This facility is used when saving compiled patterns to a file, as
described in the section entitled "Saving and restoring compiled pat- described in the section entitled "Saving and restoring compiled pat-
terns" below. The push modifier is incompatible with compilation modi- terns" below. The push modifier is incompatible with compilation modi-
fiers such as global that act at match time. Any that are specified are fiers such as global that act at match time. Any that are specified are
ignored, with a warning message, except for replace, which causes an ignored, with a warning message, except for replace, which causes an
error. Note that jitverify, which is allowed, does not carry through error. Note that jitverify, which is allowed, does not carry through
to any subsequent matching that uses this pattern. to any subsequent matching that uses this pattern.
@ -776,7 +786,7 @@ SUBJECT MODIFIERS
Setting match options Setting match options
The following modifiers set options for pcre2_match() or The following modifiers set options for pcre2_match() or
pcre2_dfa_match(). See pcre2api for a description of their effects. pcre2_dfa_match(). See pcre2api for a description of their effects.
anchored set PCRE2_ANCHORED anchored set PCRE2_ANCHORED
@ -790,20 +800,20 @@ SUBJECT MODIFIERS
partial_hard (or ph) set PCRE2_PARTIAL_HARD partial_hard (or ph) set PCRE2_PARTIAL_HARD
partial_soft (or ps) set PCRE2_PARTIAL_SOFT partial_soft (or ps) set PCRE2_PARTIAL_SOFT
The partial matching modifiers are provided with abbreviations because The partial matching modifiers are provided with abbreviations because
they appear frequently in tests. they appear frequently in tests.
If the /posix modifier was present on the pattern, causing the POSIX If the /posix modifier was present on the pattern, causing the POSIX
wrapper API to be used, the only option-setting modifiers that have any wrapper API to be used, the only option-setting modifiers that have any
effect are notbol, notempty, and noteol, causing REG_NOTBOL, effect are notbol, notempty, and noteol, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec(). REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
Any other modifiers cause an error. Any other modifiers cause an error.
Setting match controls Setting match controls
The following modifiers affect the matching process or request addi- The following modifiers affect the matching process or request addi-
tional information. Some of them may also be specified on a pattern tional information. Some of them may also be specified on a pattern
line (see above), in which case they apply to every subject line that line (see above), in which case they apply to every subject line that
is matched against that pattern. is matched against that pattern.
aftertext show text after match aftertext show text after match
@ -823,9 +833,11 @@ SUBJECT MODIFIERS
/g global global matching /g global global matching
jitstack=<n> set size of JIT stack jitstack=<n> set size of JIT stack
mark show mark values mark show mark values
match_limit=>n> set a match limit match_limit=<n> set a match limit
memory show memory usage memory show memory usage
null_context match with a NULL context
offset=<n> set starting offset offset=<n> set starting offset
offset_limit=<n> set offset limit
ovector=<n> set size of output vector ovector=<n> set size of output vector
recursion_limit=<n> set a recursion limit recursion_limit=<n> set a recursion limit
replace=<string> specify a replacement string replace=<string> specify a replacement string
@ -836,23 +848,23 @@ SUBJECT MODIFIERS
Showing more text Showing more text
The aftertext modifier requests that as well as outputting the part of The aftertext modifier requests that as well as outputting the part of
the subject string that matched the entire pattern, pcre2test should in the subject string that matched the entire pattern, pcre2test should in
addition output the remainder of the subject string. This is useful for addition output the remainder of the subject string. This is useful for
tests where the subject contains multiple copies of the same substring. tests where the subject contains multiple copies of the same substring.
The allaftertext modifier requests the same action for captured sub- The allaftertext modifier requests the same action for captured sub-
strings as well as the main matched substring. In each case the remain- strings as well as the main matched substring. In each case the remain-
der is output on the following line with a plus character following the der is output on the following line with a plus character following the
capture number. capture number.
The allusedtext modifier requests that all the text that was consulted The allusedtext modifier requests that all the text that was consulted
during a successful pattern match by the interpreter should be shown. during a successful pattern match by the interpreter should be shown.
This feature is not supported for JIT matching, and if requested with This feature is not supported for JIT matching, and if requested with
JIT it is ignored (with a warning message). Setting this modifier JIT it is ignored (with a warning message). Setting this modifier
affects the output if there is a lookbehind at the start of a match, or affects the output if there is a lookbehind at the start of a match, or
a lookahead at the end, or if \K is used in the pattern. Characters a lookahead at the end, or if \K is used in the pattern. Characters
that precede or follow the start and end of the actual match are indi- that precede or follow the start and end of the actual match are indi-
cated in the output by '<' or '>' characters underneath them. Here is cated in the output by '<' or '>' characters underneath them. Here is
an example: an example:
re> /(?<=pqr)abc(?=xyz)/ re> /(?<=pqr)abc(?=xyz)/
@ -860,16 +872,16 @@ SUBJECT MODIFIERS
0: pqrabcxyz 0: pqrabcxyz
<<< >>> <<< >>>
This shows that the matched string is "abc", with the preceding and This shows that the matched string is "abc", with the preceding and
following strings "pqr" and "xyz" having been consulted during the following strings "pqr" and "xyz" having been consulted during the
match (when processing the assertions). match (when processing the assertions).
The startchar modifier requests that the starting character for the The startchar modifier requests that the starting character for the
match be indicated, if it is different to the start of the matched match be indicated, if it is different to the start of the matched
string. The only time when this occurs is when \K has been processed as string. The only time when this occurs is when \K has been processed as
part of the match. In this situation, the output for the matched string part of the match. In this situation, the output for the matched string
is displayed from the starting character instead of from the match is displayed from the starting character instead of from the match
point, with circumflex characters under the earlier characters. For point, with circumflex characters under the earlier characters. For
example: example:
re> /abc\Kxyz/ re> /abc\Kxyz/
@ -877,7 +889,7 @@ SUBJECT MODIFIERS
0: abcxyz 0: abcxyz
^^^ ^^^
Unlike allusedtext, the startchar modifier can be used with JIT. How- Unlike allusedtext, the startchar modifier can be used with JIT. How-
ever, these two modifiers are mutually exclusive. ever, these two modifiers are mutually exclusive.
Showing the value of all capture groups Showing the value of all capture groups
@ -885,88 +897,88 @@ SUBJECT MODIFIERS
The allcaptures modifier requests that the values of all potential cap- The allcaptures modifier requests that the values of all potential cap-
tured parentheses be output after a match. By default, only those up to tured parentheses be output after a match. By default, only those up to
the highest one actually used in the match are output (corresponding to the highest one actually used in the match are output (corresponding to
the return code from pcre2_match()). Groups that did not take part in the return code from pcre2_match()). Groups that did not take part in
the match are output as "<unset>". the match are output as "<unset>".
Testing callouts Testing callouts
A callout function is supplied when pcre2test calls the library match- A callout function is supplied when pcre2test calls the library match-
ing functions, unless callout_none is specified. If callout_capture is ing functions, unless callout_none is specified. If callout_capture is
set, the current captured groups are output when a callout occurs. set, the current captured groups are output when a callout occurs.
The callout_fail modifier can be given one or two numbers. If there is The callout_fail modifier can be given one or two numbers. If there is
only one number, 1 is returned instead of 0 when a callout of that num- only one number, 1 is returned instead of 0 when a callout of that num-
ber is reached. If two numbers are given, 1 is returned when callout ber is reached. If two numbers are given, 1 is returned when callout
<n> is reached for the <m>th time. Note that callouts with string argu- <n> is reached for the <m>th time. Note that callouts with string argu-
ments are always given the number zero. See "Callouts" below for a ments are always given the number zero. See "Callouts" below for a
description of the output when a callout is taken. description of the output when a callout is taken.
The callout_data modifier can be given an unsigned or a negative num- The callout_data modifier can be given an unsigned or a negative num-
ber. This is set as the "user data" that is passed to the matching ber. This is set as the "user data" that is passed to the matching
function, and passed back when the callout function is invoked. Any function, and passed back when the callout function is invoked. Any
value other than zero is used as a return from pcre2test's callout value other than zero is used as a return from pcre2test's callout
function. function.
Finding all matches in a string Finding all matches in a string
Searching for all possible matches within a subject can be requested by Searching for all possible matches within a subject can be requested by
the global or altglobal modifier. After finding a match, the matching the global or altglobal modifier. After finding a match, the matching
function is called again to search the remainder of the subject. The function is called again to search the remainder of the subject. The
difference between global and altglobal is that the former uses the difference between global and altglobal is that the former uses the
start_offset argument to pcre2_match() or pcre2_dfa_match() to start start_offset argument to pcre2_match() or pcre2_dfa_match() to start
searching at a new point within the entire string (which is what Perl searching at a new point within the entire string (which is what Perl
does), whereas the latter passes over a shortened subject. This makes a does), whereas the latter passes over a shortened subject. This makes a
difference to the matching process if the pattern begins with a lookbe- difference to the matching process if the pattern begins with a lookbe-
hind assertion (including \b or \B). hind assertion (including \b or \B).
If an empty string is matched, the next match is done with the If an empty string is matched, the next match is done with the
PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
for another, non-empty, match at the same point in the subject. If this for another, non-empty, match at the same point in the subject. If this
match fails, the start offset is advanced, and the normal match is match fails, the start offset is advanced, and the normal match is
retried. This imitates the way Perl handles such cases when using the retried. This imitates the way Perl handles such cases when using the
/g modifier or the split() function. Normally, the start offset is /g modifier or the split() function. Normally, the start offset is
advanced by one character, but if the newline convention recognizes advanced by one character, but if the newline convention recognizes
CRLF as a newline, and the current character is CR followed by LF, an CRLF as a newline, and the current character is CR followed by LF, an
advance of two characters occurs. advance of two characters occurs.
Testing substring extraction functions Testing substring extraction functions
The copy and get modifiers can be used to test the pcre2_sub- The copy and get modifiers can be used to test the pcre2_sub-
string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be
given more than once, and each can specify a group name or number, for given more than once, and each can specify a group name or number, for
example: example:
abcd\=copy=1,copy=3,get=G1 abcd\=copy=1,copy=3,get=G1
If the #subject command is used to set default copy and/or get lists, If the #subject command is used to set default copy and/or get lists,
these can be unset by specifying a negative number to cancel all num- these can be unset by specifying a negative number to cancel all num-
bered groups and an empty name to cancel all named groups. bered groups and an empty name to cancel all named groups.
The getall modifier tests pcre2_substring_list_get(), which extracts The getall modifier tests pcre2_substring_list_get(), which extracts
all captured substrings. all captured substrings.
If the subject line is successfully matched, the substrings extracted If the subject line is successfully matched, the substrings extracted
by the convenience functions are output with C, G, or L after the by the convenience functions are output with C, G, or L after the
string number instead of a colon. This is in addition to the normal string number instead of a colon. This is in addition to the normal
full list. The string length (that is, the return from the extraction full list. The string length (that is, the return from the extraction
function) is given in parentheses after each substring, followed by the function) is given in parentheses after each substring, followed by the
name when the extraction was by name. name when the extraction was by name.
Testing the substitution function Testing the substitution function
If the replace modifier is set, the pcre2_substitute() function is If the replace modifier is set, the pcre2_substitute() function is
called instead of one of the matching functions. Unlike subject called instead of one of the matching functions. Unlike subject
strings, pcre2test does not process replacement strings for escape strings, pcre2test does not process replacement strings for escape
sequences. In UTF mode, a replacement string is checked to see if it is sequences. In UTF mode, a replacement string is checked to see if it is
a valid UTF-8 string. If so, it is correctly converted to a UTF string a valid UTF-8 string. If so, it is correctly converted to a UTF string
of the appropriate code unit width. If it is not a valid UTF-8 string, of the appropriate code unit width. If it is not a valid UTF-8 string,
the individual code units are copied directly. This provides a means of the individual code units are copied directly. This provides a means of
passing an invalid UTF-8 string for testing purposes. passing an invalid UTF-8 string for testing purposes.
If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
pcre2_substitute(). After a successful substitution, the modified pcre2_substitute(). After a successful substitution, the modified
string is output, preceded by the number of replacements. This may be string is output, preceded by the number of replacements. This may be
zero if there were no matches. Here is a simple example of a substitu- zero if there were no matches. Here is a simple example of a substitu-
tion test: tion test:
/abc/replace=xxx /abc/replace=xxx
@ -975,11 +987,11 @@ SUBJECT MODIFIERS
=abc=abc=\=global =abc=abc=\=global
2: =xxx=xxx= 2: =xxx=xxx=
Subject and replacement strings should be kept relatively short for Subject and replacement strings should be kept relatively short for
substitution tests, as fixed-size buffers are used. To make it easy to substitution tests, as fixed-size buffers are used. To make it easy to
test for buffer overflow, if the replacement string starts with a num- test for buffer overflow, if the replacement string starts with a num-
ber in square brackets, that number is passed to pcre2_substitute() as ber in square brackets, that number is passed to pcre2_substitute() as
the size of the output buffer, with the replacement string starting at the size of the output buffer, with the replacement string starting at
the next character. Here is an example that tests the edge case: the next character. Here is an example that tests the edge case:
/abc/ /abc/
@ -989,90 +1001,107 @@ SUBJECT MODIFIERS
Failed: error -47: no more memory Failed: error -47: no more memory
A replacement string is ignored with POSIX and DFA matching. Specifying A replacement string is ignored with POSIX and DFA matching. Specifying
partial matching provokes an error return ("bad option value") from partial matching provokes an error return ("bad option value") from
pcre2_substitute(). pcre2_substitute().
Setting the JIT stack size Setting the JIT stack size
The jitstack modifier provides a way of setting the maximum stack size The jitstack modifier provides a way of setting the maximum stack size
that is used by the just-in-time optimization code. It is ignored if that is used by the just-in-time optimization code. It is ignored if
JIT optimization is not being used. The value is a number of kilobytes. JIT optimization is not being used. The value is a number of kilobytes.
Providing a stack that is larger than the default 32K is necessary only Providing a stack that is larger than the default 32K is necessary only
for very complicated patterns. for very complicated patterns.
Setting match and recursion limits Setting match and recursion limits
The match_limit and recursion_limit modifiers set the appropriate lim- The match_limit and recursion_limit modifiers set the appropriate lim-
its in the match context. These values are ignored when the find_limits its in the match context. These values are ignored when the find_limits
modifier is specified. modifier is specified.
Finding minimum limits Finding minimum limits
If the find_limits modifier is present, pcre2test calls pcre2_match() If the find_limits modifier is present, pcre2test calls pcre2_match()
several times, setting different values in the match context via several times, setting different values in the match context via
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
the minimum values for each parameter that allow pcre2_match() to com- the minimum values for each parameter that allow pcre2_match() to com-
plete without error. plete without error.
If JIT is being used, only the match limit is relevant. If DFA matching If JIT is being used, only the match limit is relevant. If DFA matching
is being used, neither limit is relevant, and this modifier is ignored is being used, neither limit is relevant, and this modifier is ignored
(with a warning message). (with a warning message).
The match_limit number is a measure of the amount of backtracking that The match_limit number is a measure of the amount of backtracking that
takes place, and learning the minimum value can be instructive. For takes place, and learning the minimum value can be instructive. For
most simple matches, the number is quite small, but for patterns with most simple matches, the number is quite small, but for patterns with
very large numbers of matching possibilities, it can become large very very large numbers of matching possibilities, it can become large very
quickly with increasing length of subject string. The quickly with increasing length of subject string. The
recursion_limit number is a measure of how much stack (or, if recursion_limit number is a measure of how much stack (or, if
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
complete the match attempt. complete the match attempt.
Showing MARK names Showing MARK names
The mark modifier causes the names from backtracking control verbs that The mark modifier causes the names from backtracking control verbs that
are returned from calls to pcre2_match() to be displayed. If a mark is are returned from calls to pcre2_match() to be displayed. If a mark is
returned for a match, non-match, or partial match, pcre2test shows it. returned for a match, non-match, or partial match, pcre2test shows it.
For a match, it is on a line by itself, tagged with "MK:". Otherwise, For a match, it is on a line by itself, tagged with "MK:". Otherwise,
it is added to the non-match message. it is added to the non-match message.
Showing memory usage Showing memory usage
The memory modifier causes pcre2test to log all memory allocation and The memory modifier causes pcre2test to log all memory allocation and
freeing calls that occur during a match operation. freeing calls that occur during a match operation.
Setting a starting offset Setting a starting offset
The offset modifier sets an offset in the subject string at which The offset modifier sets an offset in the subject string at which
matching starts. Its value is a number of code units, not characters. matching starts. Its value is a number of code units, not characters.
Setting an offset limit
The offset_limit modifier sets a limit for unanchored matches. If a
match cannot be found starting at or before this offset in the subject,
a "no match" return is given. The data value is a number of code units,
not characters. When this modifier is used, the use_offset_limit modi-
fier must have been set for the pattern; if not, an error is generated.
Setting the size of the output vector Setting the size of the output vector
The ovector modifier applies only to the subject line in which it The ovector modifier applies only to the subject line in which it
appears, though of course it can also be used to set a default in a appears, though of course it can also be used to set a default in a
#subject command. It specifies the number of pairs of offsets that are #subject command. It specifies the number of pairs of offsets that are
available for storing matching information. The default is 15. available for storing matching information. The default is 15.
A value of zero is useful when testing the POSIX API because it causes A value of zero is useful when testing the POSIX API because it causes
regexec() to be called with a NULL capture vector. When not testing the regexec() to be called with a NULL capture vector. When not testing the
POSIX API, a value of zero is used to cause pcre2_match_data_cre- POSIX API, a value of zero is used to cause pcre2_match_data_cre-
ate_from_pattern() to be called, in order to create a match block of ate_from_pattern() to be called, in order to create a match block of
exactly the right size for the pattern. (It is not possible to create a exactly the right size for the pattern. (It is not possible to create a
match block with a zero-length ovector; there is always at least one match block with a zero-length ovector; there is always at least one
pair of offsets.) pair of offsets.)
Passing the subject as zero-terminated Passing the subject as zero-terminated
By default, the subject string is passed to a native API matching func- By default, the subject string is passed to a native API matching func-
tion with its correct length. In order to test the facility for passing tion with its correct length. In order to test the facility for passing
a zero-terminated string, the zero_terminate modifier is provided. It a zero-terminated string, the zero_terminate modifier is provided. It
causes the length to be passed as PCRE2_ZERO_TERMINATED. (When matching causes the length to be passed as PCRE2_ZERO_TERMINATED. (When matching
via the POSIX interface, this modifier has no effect, as there is no via the POSIX interface, this modifier has no effect, as there is no
facility for passing a length.) facility for passing a length.)
When testing pcre2_substitute(), this modifier also has the effect of When testing pcre2_substitute(), this modifier also has the effect of
passing the replacement string as zero-terminated. passing the replacement string as zero-terminated.
Passing a NULL context
Normally, pcre2test passes a context block to pcre2_match(),
pcre2_dfa_match() or pcre2_jit_match(). If the null_context modifier is
set, however, NULL is passed. This is for testing that the matching
functions behave correctly in this case (they use default values). This
modifier cannot be used with the find_limits modifier or when testing
the substitution function.
THE ALTERNATIVE MATCHING FUNCTION THE ALTERNATIVE MATCHING FUNCTION
@ -1398,5 +1427,5 @@ AUTHOR
REVISION REVISION
Last updated: 14 September 2015 Last updated: 17 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.

View File

@ -1,4 +1,4 @@
.TH PCRE2UNICODE 3 "18 August 2015" "PCRE2 10.21" .TH PCRE2UNICODE 3 "16 October 2015" "PCRE2 10.21"
.SH NAME .SH NAME
PCRE - Perl-compatible regular expressions (revised API) PCRE - Perl-compatible regular expressions (revised API)
.SH "UNICODE AND UTF SUPPORT" .SH "UNICODE AND UTF SUPPORT"
@ -63,11 +63,12 @@ characters (see the description of \eC in the
.\" HREF .\" HREF
\fBpcre2pattern\fP \fBpcre2pattern\fP
.\" .\"
documentation). The use of \eC is not supported in the alternative matching documentation). The use of \eC is not supported by the alternative matching
function \fBpcre2_dfa_match()\fP, nor is it supported in UTF mode by the JIT function \fBpcre2_dfa_match()\fP when in UTF mode. Its use provokes a
optimization. If JIT optimization is requested for a UTF pattern that contains match-time error. The JIT optimization also does not support \eC in UTF mode.
\eC, it will not succeed, and so the matching will be carried out by the normal If JIT optimization is requested for a UTF pattern that contains \eC, it will
interpretive function. not succeed, and so the matching will be carried out by the normal interpretive
function.
.P .P
The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly test The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly test
characters of any code value, but, by default, the characters that PCRE2 characters of any code value, but, by default, the characters that PCRE2
@ -262,6 +263,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 18 August 2015 Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
.fi .fi

View File

@ -182,6 +182,9 @@ sure both macros are undefined; an emulation function will then be used. */
#define MAX_NAME_SIZE 32 #define MAX_NAME_SIZE 32
#endif #endif
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
/* #undef NEVER_BACKSLASH_C */
/* The value of NEWLINE_DEFAULT determines the default newline character /* The value of NEWLINE_DEFAULT determines the default newline character
sequence. PCRE2 client programs can override this by selecting other values sequence. PCRE2 client programs can override this by selecting other values
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5 at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5

View File

@ -169,6 +169,9 @@ sure both macros are undefined; an emulation function will then be used. */
overflow caused by enormously large patterns. */ overflow caused by enormously large patterns. */
#undef MAX_NAME_SIZE #undef MAX_NAME_SIZE
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
#undef NEVER_BACKSLASH_C
/* The value of NEWLINE_DEFAULT determines the default newline character /* The value of NEWLINE_DEFAULT determines the default newline character
sequence. PCRE2 client programs can override this by selecting other values sequence. PCRE2 client programs can override this by selecting other values
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5 at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5

View File

@ -583,7 +583,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
ERR81, ERR82, ERR83, ERR84 }; ERR81, ERR82, ERR83, ERR84, ERR85 };
/* This is a table of start-of-pattern options such as (*UTF) and settings such /* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@ -7052,12 +7052,20 @@ for (;; ptr++)
#endif #endif
/* The use of \C can be locked out. */ /* The use of \C can be locked out. */
#ifdef NEVER_BACKSLASH_C
else if (escape == ESC_C)
{
*errorcodeptr = ERR85;
goto FAILED;
}
#else
else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0) else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0)
{ {
*errorcodeptr = ERR83; *errorcodeptr = ERR83;
goto FAILED; goto FAILED;
} }
#endif
/* For the rest (including \X when Unicode properties are supported), we /* For the rest (including \X when Unicode properties are supported), we
can obtain the OP value by negating the escape value in the default can obtain the OP value by negating the escape value in the default

View File

@ -168,6 +168,8 @@ static const char compile_error_texts[] =
"unrecognized string delimiter follows (?C\0" "unrecognized string delimiter follows (?C\0"
"using \\C is disabled by the application\0" "using \\C is disabled by the application\0"
"(?| and/or (?J: or (?x: parentheses are too deeply nested\0" "(?| and/or (?J: or (?x: parentheses are too deeply nested\0"
/* 85 */
"using \\C is disabled in this PCRE2 library\0"
; ;
/* Match-time and UTF error texts are in the same format. */ /* Match-time and UTF error texts are in the same format. */

View File

@ -106,7 +106,7 @@ static const int eint1[] = {
static const int eint2[] = { static const int eint2[] = {
30, REG_ECTYPE, /* unknown POSIX class name */ 30, REG_ECTYPE, /* unknown POSIX class name */
32, REG_INVARG, /* this version of PCRE does not have UTF or UCP support */ 32, REG_INVARG, /* this version of PCRE2 does not have Unicode support */
37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */ 37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */
56, REG_INVARG, /* internal error: unknown newline setting */ 56, REG_INVARG, /* internal error: unknown newline setting */
}; };

View File

@ -667,6 +667,12 @@ table itself easier to read. */
#define EBCDIC_NL 0 #define EBCDIC_NL 0
#endif #endif
#ifdef NEVER_BACKSLASH_C
#define BACKSLASH_C 0
#else
#define BACKSLASH_C 1
#endif
typedef struct coptstruct { typedef struct coptstruct {
const char *name; const char *name;
uint32_t type; uint32_t type;
@ -681,16 +687,17 @@ enum { CONF_BSR,
}; };
static coptstruct coptlist[] = { static coptstruct coptlist[] = {
{ "bsr", CONF_BSR, PCRE2_CONFIG_BSR }, { "backslash-C", CONF_FIX, BACKSLASH_C },
{ "ebcdic", CONF_FIX, SUPPORT_EBCDIC }, { "bsr", CONF_BSR, PCRE2_CONFIG_BSR },
{ "ebcdic-nl", CONF_FIZ, EBCDIC_NL }, { "ebcdic", CONF_FIX, SUPPORT_EBCDIC },
{ "jit", CONF_INT, PCRE2_CONFIG_JIT }, { "ebcdic-nl", CONF_FIZ, EBCDIC_NL },
{ "linksize", CONF_INT, PCRE2_CONFIG_LINKSIZE }, { "jit", CONF_INT, PCRE2_CONFIG_JIT },
{ "newline", CONF_NL, PCRE2_CONFIG_NEWLINE }, { "linksize", CONF_INT, PCRE2_CONFIG_LINKSIZE },
{ "pcre2-16", CONF_FIX, SUPPORT_16 }, { "newline", CONF_NL, PCRE2_CONFIG_NEWLINE },
{ "pcre2-32", CONF_FIX, SUPPORT_32 }, { "pcre2-16", CONF_FIX, SUPPORT_16 },
{ "pcre2-8", CONF_FIX, SUPPORT_8 }, { "pcre2-32", CONF_FIX, SUPPORT_32 },
{ "unicode", CONF_INT, PCRE2_CONFIG_UNICODE } { "pcre2-8", CONF_FIX, SUPPORT_8 },
{ "unicode", CONF_INT, PCRE2_CONFIG_UNICODE }
}; };
#define COPTLISTCOUNT sizeof(coptlist)/sizeof(coptstruct) #define COPTLISTCOUNT sizeof(coptlist)/sizeof(coptstruct)
@ -6467,6 +6474,7 @@ printf(" -b set default pattern control 'fullbincode'\n");
printf(" -C show PCRE2 compile-time options and exit\n"); printf(" -C show PCRE2 compile-time options and exit\n");
printf(" -C arg show a specific compile-time option and exit with its\n"); printf(" -C arg show a specific compile-time option and exit with its\n");
printf(" value if numeric (else 0). The arg can be:\n"); printf(" value if numeric (else 0). The arg can be:\n");
printf(" backslash-C use of \\C is enabled [0, 1]\n");
printf(" bsr \\R type [ANYCRLF, ANY]\n"); printf(" bsr \\R type [ANYCRLF, ANY]\n");
printf(" ebcdic compiled for EBCDIC character code [0,1]\n"); printf(" ebcdic compiled for EBCDIC character code [0,1]\n");
printf(" ebcdic-nl NL code if compiled for EBCDIC\n"); printf(" ebcdic-nl NL code if compiled for EBCDIC\n");
@ -6618,6 +6626,11 @@ print_newline_config(optval, FALSE);
(void)PCRE2_CONFIG(PCRE2_CONFIG_BSR, &optval); (void)PCRE2_CONFIG(PCRE2_CONFIG_BSR, &optval);
printf(" \\R matches %s\n", optval? "CR, LF, or CRLF only" : printf(" \\R matches %s\n", optval? "CR, LF, or CRLF only" :
"all Unicode newlines"); "all Unicode newlines");
#ifdef NEVER_BACKSLASH_C
printf(" \\C is not supported\n");
#else
printf(" \\C is supported\n");
#endif
(void)PCRE2_CONFIG(PCRE2_CONFIG_LINKSIZE, &optval); (void)PCRE2_CONFIG(PCRE2_CONFIG_LINKSIZE, &optval);
printf(" Internal link size = %d\n", optval); printf(" Internal link size = %d\n", optval);
(void)PCRE2_CONFIG(PCRE2_CONFIG_PARENSLIMIT, &optval); (void)PCRE2_CONFIG(PCRE2_CONFIG_PARENSLIMIT, &optval);

55
testdata/testinput10 vendored
View File

@ -1,46 +1,6 @@
# This set of tests is for UTF-8 support and Unicode property support, with # This set of tests is for UTF-8 support and Unicode property support, with
# relevance only for the 8-bit library. # relevance only for the 8-bit library.
/X(\C{3})/utf
X\x{1234}
/X(\C{4})/utf
X\x{1234}YZ
/X\C*/utf
XYZabcdce
/X\C*?/utf
XYZabcde
/X\C{3,5}/utf
Xabcdefg
X\x{1234}
X\x{1234}YZ
X\x{1234}\x{512}
X\x{1234}\x{512}YZ
/X\C{3,5}?/utf
Xabcdefg
X\x{1234}
X\x{1234}YZ
X\x{1234}\x{512}
/a\Cb/utf
aXb
a\nb
/a\C\Cb/utf
a\x{100}b
/ab\Cde/utf
abXde
/a\C\Cb/utf
a\x{100}b
\= Expect no match
a\x{12257}b
# The next 3 patterns have UTF-8 errors # The next 3 patterns have UTF-8 errors
/[Ã]/utf /[Ã]/utf
@ -212,21 +172,6 @@
/\x{212ab}/IB,utf /\x{212ab}/IB,utf
# This one is here not because it's different to Perl, but because the way
# the captured single-byte is displayed. (In Perl it becomes a character, and you
# can't tell the difference.)
/X(\C)(.*)/utf
X\x{1234}
X\nabc
# This one is here because Perl gives out a grumbly error message (quite
# correctly, but that messes up comparisons).
/a\Cb/utf
\= Expect no match
a\x{100}b
/[^ab\xC0-\xF0]/IB,utf /[^ab\xC0-\xF0]/IB,utf
\x{f1} \x{f1}
\x{bf} \x{bf}

View File

@ -6,10 +6,6 @@
#forbid_utf #forbid_utf
#newline_default LF ANY ANYCRLF #newline_default LF ANY ANYCRLF
/a\Cb/
aXb
a\nb
/[^\x{c4}]/IB /[^\x{c4}]/IB
/\x{100}/I /\x{100}/I
@ -344,7 +340,7 @@
# Non-UTF characters # Non-UTF characters
/\C{2,3}/ /.{2,3}/
\x{400000}\x{400001}\x{400002}\x{400003} \x{400000}\x{400001}\x{400002}\x{400003}
/\x{400000}\x{800000}/IBi /\x{400000}\x{800000}/IBi

59
testdata/testinput12 vendored
View File

@ -7,49 +7,6 @@
/abc/utf /abc/utf
Ã] Ã]
/X(\C{3})/utf
X\x{11234}Y
X\x{11234}YZ
/X(\C{4})/utf
X\x{11234}YZ
X\x{11234}YZW
/X\C*/utf
XYZabcdce
/X\C*?/utf
XYZabcde
/X\C{3,5}/utf
Xabcdefg
X\x{11234}Y
X\x{11234}YZ
X\x{11234}\x{512}
X\x{11234}\x{512}YZ
X\x{11234}\x{512}\x{11234}Z
/X\C{3,5}?/utf
Xabcdefg
X\x{11234}Y
X\x{11234}YZ
X\x{11234}\x{512}YZ
\= Expect no match
X\x{11234}
/a\Cb/utf
aXb
a\nb
/a\C\Cb/utf
a\x{12257}b
\= Expect no match
a\x{12257}\x{11234}b
a\x{100}b
/ab\Cde/utf
abXde
# Check maximum character size # Check maximum character size
/\x{ffff}/IB,utf /\x{ffff}/IB,utf
@ -90,16 +47,6 @@
/\x{212ab}/IB,utf /\x{212ab}/IB,utf
# These two \C tests, copied from the UTF-8 input file, do not have any
# problems in 16 or 32 bits.
/X(\C)(.*)/utf
X\x{1234}
X\nabc
/a\Cb/utf
a\x{100}b
/[^ab\xC0-\xF0]/IB,utf /[^ab\xC0-\xF0]/IB,utf
\x{f1} \x{f1}
\x{bf} \x{bf}
@ -336,9 +283,6 @@
/\o{4200000}/utf /\o{4200000}/utf
/\C/utf
\x{110000}
/\x{100}*A/IB,utf /\x{100}*A/IB,utf
A A
@ -396,4 +340,7 @@
/\x{3a3}B/IBi,utf /\x{3a3}B/IBi,utf
/./utf
\x{110000}
# End of testinput12 # End of testinput12

45
testdata/testinput2 vendored
View File

@ -3739,41 +3739,40 @@
/[bcd]*a/B /[bcd]*a/B
# A complete set of tests for auto-possessification of character types. # A complete set of tests for auto-possessification of character types, but
# omitting \C because it might be disabled (it has its own tests).
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\C \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx /\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\C \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx /\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\C \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx /\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\C \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx /\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\C \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx /\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\C \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx /\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\C \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx /\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\C \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx /\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\C \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx /\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\C \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx /\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\C \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx /\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\C \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx / a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\C a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx /\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\C \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx / .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx / .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx / \D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
/\D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \C+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
/(?=a+)a(a+)++a/B /(?=a+)a(a+)++a/B
@ -4327,8 +4326,6 @@
/((?2){73}(?2))((?1))/info /((?2){73}(?2))((?1))/info
/ab\Cde/never_backslash_c
/abc/ /abc/
\= Expect no match \= Expect no match
\[9x!xxx(]{9999} \[9x!xxx(]{9999}
@ -4446,12 +4443,6 @@
/\x0{ab}/ /\x0{ab}/
\0{ab} \0{ab}
/ab\Cde/
abXde
/(?<=ab\Cde)X/
abZdeX
/^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/ /^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/
ababababbbabZXXXX ababababbbabZXXXX

16
testdata/testinput21 vendored Normal file
View File

@ -0,0 +1,16 @@
# These are tests of \C that do not involve UTF. They are not run when \C is
# disabled by compiling with --enable-never-backslash-C.
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
/\D+\C \d+\C \S+\C \s+\C \W+\C \w+\C .+\C \R+\C \H+\C \h+\C \V+\C \v+\C a+\C \n+\C \C+\C/Bx
/ab\Cde/never_backslash_c
/ab\Cde/
abXde
/(?<=ab\Cde)X/
abZdeX
# End of testinput21

95
testdata/testinput22 vendored Normal file
View File

@ -0,0 +1,95 @@
# Tests of \C when Unicode support is available. Note that \C is not supported
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
# in some widths and not in others.
/ab\Cde/utf
abXde
# This should produce an error diagnostic (\C in UTF lookbehind)
/(?<=ab\Cde)X/utf
# Autopossessification tests
/\C+\X \X+\C/Bx
/\C+\X \X+\C/Bx,utf
/\C\X*TӅ;
{0,6}\v+ F
/utf
\= Expect no match
Ӆ\x0a
/\C(\W?ſ)'?{{/utf
\= Expect no match
\\C(\\W?ſ)'?{{
/X(\C{3})/utf
X\x{1234}
X\x{11234}Y
X\x{11234}YZ
/X(\C{4})/utf
X\x{1234}YZ
X\x{11234}YZ
X\x{11234}YZW
/X\C*/utf
XYZabcdce
/X\C*?/utf
XYZabcde
/X\C{3,5}/utf
Xabcdefg
X\x{1234}
X\x{1234}YZ
X\x{1234}\x{512}
X\x{1234}\x{512}YZ
X\x{11234}Y
X\x{11234}YZ
X\x{11234}\x{512}
X\x{11234}\x{512}YZ
X\x{11234}\x{512}\x{11234}Z
/X\C{3,5}?/utf
Xabcdefg
X\x{1234}
X\x{1234}YZ
X\x{1234}\x{512}
X\x{11234}Y
X\x{11234}YZ
X\x{11234}\x{512}YZ
X\x{11234}
/a\Cb/utf
aXb
a\nb
a\x{100}b
/a\C\Cb/utf
a\x{100}b
a\x{12257}b
a\x{12257}\x{11234}b
/ab\Cde/utf
abXde
# This one is here not because it's different to Perl, but because the way
# the captured single code unit is displayed. (In Perl it becomes a character,
# and you can't tell the difference.)
/X(\C)(.*)/utf
X\x{1234}
X\nabc
# This one is here because Perl gives out a grumbly error message (quite
# correctly, but that messes up comparisons).
/a\Cb/utf
\= Expect no match in 8-bit mode
a\x{100}b
# End of testinput22

7
testdata/testinput23 vendored Normal file
View File

@ -0,0 +1,7 @@
# This test is run when PCRE2 has been built with --enable-never-backslash-C,
# which disables the use of \C. All we can do is check that it gives the
# correct error message.
/a\Cb/
# End of testinput23

19
testdata/testinput5 vendored
View File

@ -111,9 +111,6 @@
/.{3,5}?/IB,utf /.{3,5}?/IB,utf
\x{212ab}\x{212ab}\x{212ab}\x{861} \x{212ab}\x{212ab}\x{212ab}\x{861}
/(?<=\C)X/utf
Should produce an error diagnostic
/^[ab]/IB,utf /^[ab]/IB,utf
bar bar
\= Expect no match \= Expect no match
@ -1367,8 +1364,6 @@
\= Expect no match \= Expect no match
aAz aAz
/(?<=ab\Cde)X/utf
/\X/ /\X/
a\=ps a\=ps
a\=ph a\=ph
@ -1617,13 +1612,13 @@
/[\p{L}ab]{2,3}+/B,no_auto_possess /[\p{L}ab]{2,3}+/B,no_auto_possess
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \C+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx /\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
/.+\X/Bsx /.+\X/Bsx
/\X+$/Bmx /\X+$/Bmx
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\C \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx /\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
/\d+\s{0,5}=\s*\S?=\w{0,4}\W*/B,utf,ucp /\d+\s{0,5}=\s*\S?=\w{0,4}\W*/B,utf,ucp
@ -1665,16 +1660,6 @@
"(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'" "(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
/\C\X*TӅ;
{0,6}\v+ F
/utf
\= Expect no match
Ӆ\x0a
/\C(\W?ſ)'?{{/utf
\= Expect no match
\\C(\\W?ſ)'?{{
/[\pS#moq]/ /[\pS#moq]/
= =

6
testdata/testinput6 vendored
View File

@ -4645,12 +4645,6 @@
aaaa\=ovector=3 aaaa\=ovector=3
aaaa\=ovector=4 aaaa\=ovector=4
/ab\Cde/
abXde
/(?<=ab\Cde)X/
abZdeX
/^\R/ /^\R/
\r\=ps \r\=ps
\r\=ph \r\=ph

5
testdata/testinput7 vendored
View File

@ -671,11 +671,6 @@
the cat\=ps the cat\=ps
the cat\=ph the cat\=ph
/ab\Cde/utf
abXde
/(?<=ab\Cde)X/utf
/./newline=crlf,utf /./newline=crlf,utf
\r\=ps \r\=ps
\r\=ph \r\=ph

6
testdata/testinput9 vendored
View File

@ -4,10 +4,8 @@
#forbid_utf #forbid_utf
#newline_default lf any anycrlf #newline_default lf any anycrlf
/a\Cb/ /ab/
aXb \= Expect error message (too big char) and no match
a\nb
\= Expect no match and error message (too big char)
A\x{123}B A\x{123}B
A\o{443}B A\o{443}B

83
testdata/testoutput10 vendored
View File

@ -1,67 +1,6 @@
# This set of tests is for UTF-8 support and Unicode property support, with # This set of tests is for UTF-8 support and Unicode property support, with
# relevance only for the 8-bit library. # relevance only for the 8-bit library.
/X(\C{3})/utf
X\x{1234}
0: X\x{1234}
1: \x{1234}
/X(\C{4})/utf
X\x{1234}YZ
0: X\x{1234}Y
1: \x{1234}Y
/X\C*/utf
XYZabcdce
0: XYZabcdce
/X\C*?/utf
XYZabcde
0: X
/X\C{3,5}/utf
Xabcdefg
0: Xabcde
X\x{1234}
0: X\x{1234}
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
0: X\x{1234}\x{512}
X\x{1234}\x{512}YZ
0: X\x{1234}\x{512}
/X\C{3,5}?/utf
Xabcdefg
0: Xabc
X\x{1234}
0: X\x{1234}
X\x{1234}YZ
0: X\x{1234}
X\x{1234}\x{512}
0: X\x{1234}
/a\Cb/utf
aXb
0: aXb
a\nb
0: a\x{0a}b
/a\C\Cb/utf
a\x{100}b
0: a\x{100}b
/ab\Cde/utf
abXde
0: abXde
/a\C\Cb/utf
a\x{100}b
0: a\x{100}b
\= Expect no match
a\x{12257}b
No match
# The next 3 patterns have UTF-8 errors # The next 3 patterns have UTF-8 errors
/[Ã]/utf /[Ã]/utf
@ -511,28 +450,6 @@ First code unit = \xf0
Last code unit = \xab Last code unit = \xab
Subject length lower bound = 1 Subject length lower bound = 1
# This one is here not because it's different to Perl, but because the way
# the captured single-byte is displayed. (In Perl it becomes a character, and you
# can't tell the difference.)
/X(\C)(.*)/utf
X\x{1234}
0: X\x{1234}
1: \x{e1}
2: \x{88}\x{b4}
X\nabc
0: X\x{0a}abc
1: \x{0a}
2: abc
# This one is here because Perl gives out a grumbly error message (quite
# correctly, but that messes up comparisons).
/a\Cb/utf
\= Expect no match
a\x{100}b
No match
/[^ab\xC0-\xF0]/IB,utf /[^ab\xC0-\xF0]/IB,utf
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra

View File

@ -6,12 +6,6 @@
#forbid_utf #forbid_utf
#newline_default LF ANY ANYCRLF #newline_default LF ANY ANYCRLF
/a\Cb/
aXb
0: aXb
a\nb
0: a\x0ab
/[^\x{c4}]/IB /[^\x{c4}]/IB
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
@ -582,7 +576,7 @@ Failed: error 134 at offset 11: character code point value in \x{} or \o{} is to
# Non-UTF characters # Non-UTF characters
/\C{2,3}/ /.{2,3}/
\x{400000}\x{400001}\x{400002}\x{400003} \x{400000}\x{400001}\x{400002}\x{400003}
** Character \x{400000} is greater than 0xffff and UTF-16 mode is not enabled. ** Character \x{400000} is greater than 0xffff and UTF-16 mode is not enabled.
** Truncation will probably give the wrong result. ** Truncation will probably give the wrong result.

View File

@ -6,12 +6,6 @@
#forbid_utf #forbid_utf
#newline_default LF ANY ANYCRLF #newline_default LF ANY ANYCRLF
/a\Cb/
aXb
0: aXb
a\nb
0: a\x0ab
/[^\x{c4}]/IB /[^\x{c4}]/IB
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
@ -583,7 +577,7 @@ Subject length lower bound = 2
# Non-UTF characters # Non-UTF characters
/\C{2,3}/ /.{2,3}/
\x{400000}\x{400001}\x{400002}\x{400003} \x{400000}\x{400001}\x{400002}\x{400003}
0: \x{400000}\x{400001}\x{400002} 0: \x{400000}\x{400001}\x{400002}

View File

@ -9,76 +9,6 @@
Ã] Ã]
** Failed: invalid UTF-8 string cannot be used as input in UTF mode ** Failed: invalid UTF-8 string cannot be used as input in UTF mode
/X(\C{3})/utf
X\x{11234}Y
0: X\x{11234}Y
1: \x{11234}Y
X\x{11234}YZ
0: X\x{11234}Y
1: \x{11234}Y
/X(\C{4})/utf
X\x{11234}YZ
0: X\x{11234}YZ
1: \x{11234}YZ
X\x{11234}YZW
0: X\x{11234}YZ
1: \x{11234}YZ
/X\C*/utf
XYZabcdce
0: XYZabcdce
/X\C*?/utf
XYZabcde
0: X
/X\C{3,5}/utf
Xabcdefg
0: Xabcde
X\x{11234}Y
0: X\x{11234}Y
X\x{11234}YZ
0: X\x{11234}YZ
X\x{11234}\x{512}
0: X\x{11234}\x{512}
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}YZ
X\x{11234}\x{512}\x{11234}Z
0: X\x{11234}\x{512}\x{11234}
/X\C{3,5}?/utf
Xabcdefg
0: Xabc
X\x{11234}Y
0: X\x{11234}Y
X\x{11234}YZ
0: X\x{11234}Y
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}
\= Expect no match
X\x{11234}
No match
/a\Cb/utf
aXb
0: aXb
a\nb
0: a\x{0a}b
/a\C\Cb/utf
a\x{12257}b
0: a\x{12257}b
\= Expect no match
a\x{12257}\x{11234}b
No match
a\x{100}b
No match
/ab\Cde/utf
abXde
0: abXde
# Check maximum character size # Check maximum character size
/\x{ffff}/IB,utf /\x{ffff}/IB,utf
@ -308,23 +238,6 @@ First code unit = \x{d844}
Last code unit = \x{deab} Last code unit = \x{deab}
Subject length lower bound = 1 Subject length lower bound = 1
# These two \C tests, copied from the UTF-8 input file, do not have any
# problems in 16 or 32 bits.
/X(\C)(.*)/utf
X\x{1234}
0: X\x{1234}
1: \x{1234}
2:
X\nabc
0: X\x{0a}abc
1: \x{0a}
2: abc
/a\Cb/utf
a\x{100}b
0: a\x{100}b
/[^ab\xC0-\xF0]/IB,utf /[^ab\xC0-\xF0]/IB,utf
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
@ -1127,10 +1040,6 @@ Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too
/\o{4200000}/utf /\o{4200000}/utf
Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
/\C/utf
\x{110000}
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
/\x{100}*A/IB,utf /\x{100}*A/IB,utf
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
@ -1454,4 +1363,8 @@ Starting code units: \xff
Last code unit = 'B' (caseless) Last code unit = 'B' (caseless)
Subject length lower bound = 2 Subject length lower bound = 2
/./utf
\x{110000}
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
# End of testinput12 # End of testinput12

View File

@ -9,74 +9,6 @@
Ã] Ã]
** Failed: invalid UTF-8 string cannot be used as input in UTF mode ** Failed: invalid UTF-8 string cannot be used as input in UTF mode
/X(\C{3})/utf
X\x{11234}Y
No match
X\x{11234}YZ
0: X\x{11234}YZ
1: \x{11234}YZ
/X(\C{4})/utf
X\x{11234}YZ
No match
X\x{11234}YZW
0: X\x{11234}YZW
1: \x{11234}YZW
/X\C*/utf
XYZabcdce
0: XYZabcdce
/X\C*?/utf
XYZabcde
0: X
/X\C{3,5}/utf
Xabcdefg
0: Xabcde
X\x{11234}Y
No match
X\x{11234}YZ
0: X\x{11234}YZ
X\x{11234}\x{512}
No match
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}YZ
X\x{11234}\x{512}\x{11234}Z
0: X\x{11234}\x{512}\x{11234}Z
/X\C{3,5}?/utf
Xabcdefg
0: Xabc
X\x{11234}Y
No match
X\x{11234}YZ
0: X\x{11234}YZ
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}Y
\= Expect no match
X\x{11234}
No match
/a\Cb/utf
aXb
0: aXb
a\nb
0: a\x{0a}b
/a\C\Cb/utf
a\x{12257}b
No match
\= Expect no match
a\x{12257}\x{11234}b
0: a\x{12257}\x{11234}b
a\x{100}b
No match
/ab\Cde/utf
abXde
0: abXde
# Check maximum character size # Check maximum character size
/\x{ffff}/IB,utf /\x{ffff}/IB,utf
@ -301,23 +233,6 @@ Options: utf
First code unit = \x{212ab} First code unit = \x{212ab}
Subject length lower bound = 1 Subject length lower bound = 1
# These two \C tests, copied from the UTF-8 input file, do not have any
# problems in 16 or 32 bits.
/X(\C)(.*)/utf
X\x{1234}
0: X\x{1234}
1: \x{1234}
2:
X\nabc
0: X\x{0a}abc
1: \x{0a}
2: abc
/a\Cb/utf
a\x{100}b
0: a\x{100}b
/[^ab\xC0-\xF0]/IB,utf /[^ab\xC0-\xF0]/IB,utf
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
@ -1119,10 +1034,6 @@ Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too
/\o{4200000}/utf /\o{4200000}/utf
Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
/\C/utf
\x{110000}
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 0
/\x{100}*A/IB,utf /\x{100}*A/IB,utf
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
@ -1446,4 +1357,8 @@ Starting code units: \xff
Last code unit = 'B' (caseless) Last code unit = 'B' (caseless)
Subject length lower bound = 2 Subject length lower bound = 2
/./utf
\x{110000}
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 0
# End of testinput12 # End of testinput12

693
testdata/testoutput2 vendored
View File

@ -11948,9 +11948,10 @@ Subject length lower bound = 5
End End
------------------------------------------------------------------ ------------------------------------------------------------------
# A complete set of tests for auto-possessification of character types. # A complete set of tests for auto-possessification of character types, but
# omitting \C because it might be disabled (it has its own tests).
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\C \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx /\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
\D+ \D+
@ -11968,8 +11969,6 @@ Subject length lower bound = 5
\D+ \D+
Any Any
\D+ \D+
AllAny
\D+
\R \R
\D+ \D+
\H \H
@ -11989,7 +11988,7 @@ Subject length lower bound = 5
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\C \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx /\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
\d++ \d++
@ -12006,8 +12005,6 @@ Subject length lower bound = 5
\w \w
\d+ \d+
Any Any
\d+
AllAny
\d++ \d++
\R \R
\d+ \d+
@ -12028,7 +12025,7 @@ Subject length lower bound = 5
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\C \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx /\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
\S+ \S+
@ -12045,8 +12042,6 @@ Subject length lower bound = 5
\w \w
\S+ \S+
Any Any
\S+
AllAny
\S++ \S++
\R \R
\S+ \S+
@ -12067,7 +12062,7 @@ Subject length lower bound = 5
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\C \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx /\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
\s+ \s+
@ -12085,8 +12080,6 @@ Subject length lower bound = 5
\s+ \s+
Any Any
\s+ \s+
AllAny
\s+
\R \R
\s+ \s+
\H \H
@ -12106,7 +12099,7 @@ Subject length lower bound = 5
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\C \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx /\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
\W+ \W+
@ -12124,8 +12117,6 @@ Subject length lower bound = 5
\W+ \W+
Any Any
\W+ \W+
AllAny
\W+
\R \R
\W+ \W+
\H \H
@ -12145,7 +12136,7 @@ Subject length lower bound = 5
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\C \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx /\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
\w+ \w+
@ -12162,8 +12153,6 @@ Subject length lower bound = 5
\w \w
\w+ \w+
Any Any
\w+
AllAny
\w++ \w++
\R \R
\w+ \w+
@ -12184,7 +12173,303 @@ Subject length lower bound = 5
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\C \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx /\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
------------------------------------------------------------------
Bra
\R+
\D
\R++
\d
\R+
\S
\R++
\s
\R+
\W
\R++
\w
\R++
Any
\R+
\R
\R+
\H
\R++
\h
\R+
\V
\R+
\v
\R+
\Z
\R++
\z
\R+
$
Ket
End
------------------------------------------------------------------
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
------------------------------------------------------------------
Bra
\H+
\D
\H+
\d
\H+
\S
\H+
\s
\H+
\W
\H+
\w
\H+
Any
\H+
\R
\H+
\H
\H++
\h
\H+
\V
\H+
\v
\H+
\Z
\H++
\z
\H+
$
Ket
End
------------------------------------------------------------------
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
------------------------------------------------------------------
Bra
\h+
\D
\h++
\d
\h++
\S
\h+
\s
\h+
\W
\h++
\w
\h+
Any
\h++
\R
\h++
\H
\h+
\h
\h+
\V
\h++
\v
\h+
\Z
\h++
\z
\h+
$
Ket
End
------------------------------------------------------------------
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
------------------------------------------------------------------
Bra
\V+
\D
\V+
\d
\V+
\S
\V+
\s
\V+
\W
\V+
\w
\V+
Any
\V++
\R
\V+
\H
\V+
\h
\V+
\V
\V++
\v
\V+
\Z
\V++
\z
\V+
$
Ket
End
------------------------------------------------------------------
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
------------------------------------------------------------------
Bra
\v+
\D
\v++
\d
\v++
\S
\v+
\s
\v+
\W
\v++
\w
\v+
Any
\v+
\R
\v+
\H
\v++
\h
\v++
\V
\v+
\v
\v+
\Z
\v++
\z
\v+
$
Ket
End
------------------------------------------------------------------
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
------------------------------------------------------------------
Bra
a+
\D
a++
\d
a+
\S
a++
\s
a++
\W
a+
\w
a+
Any
a++
\R
a+
\H
a++
\h
a+
\V
a++
\v
a++
\Z
a++
\z
a++
$
Ket
End
------------------------------------------------------------------
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
------------------------------------------------------------------
Bra
\x0a+
\D
\x0a++
\d
\x0a++
\S
\x0a+
\s
\x0a+
\W
\x0a++
\w
\x0a+
Any
\x0a+
\R
\x0a+
\H
\x0a++
\h
\x0a++
\V
\x0a+
\v
\x0a+
\Z
\x0a++
\z
\x0a+
$
Ket
End
------------------------------------------------------------------
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
------------------------------------------------------------------
Bra
Any+
\D
Any+
\d
Any+
\S
Any+
\s
Any+
\W
Any+
\w
Any+
Any
Any++
\R
Any+
\H
Any+
\h
Any+
\V
Any+
\v
Any+
\Z
Any++
\z
Any+
$
Ket
End
------------------------------------------------------------------
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
AllAny+ AllAny+
@ -12200,8 +12485,6 @@ Subject length lower bound = 5
AllAny+ AllAny+
\w \w
AllAny+ AllAny+
Any
AllAny+
AllAny AllAny
AllAny+ AllAny+
\R \R
@ -12223,358 +12506,7 @@ Subject length lower bound = 5
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\C \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx / \D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
------------------------------------------------------------------
Bra
\R+
\D
\R++
\d
\R+
\S
\R++
\s
\R+
\W
\R++
\w
\R++
Any
\R+
AllAny
\R+
\R
\R+
\H
\R++
\h
\R+
\V
\R+
\v
\R+
\Z
\R++
\z
\R+
$
Ket
End
------------------------------------------------------------------
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\C \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
------------------------------------------------------------------
Bra
\H+
\D
\H+
\d
\H+
\S
\H+
\s
\H+
\W
\H+
\w
\H+
Any
\H+
AllAny
\H+
\R
\H+
\H
\H++
\h
\H+
\V
\H+
\v
\H+
\Z
\H++
\z
\H+
$
Ket
End
------------------------------------------------------------------
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\C \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
------------------------------------------------------------------
Bra
\h+
\D
\h++
\d
\h++
\S
\h+
\s
\h+
\W
\h++
\w
\h+
Any
\h+
AllAny
\h++
\R
\h++
\H
\h+
\h
\h+
\V
\h++
\v
\h+
\Z
\h++
\z
\h+
$
Ket
End
------------------------------------------------------------------
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\C \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
------------------------------------------------------------------
Bra
\V+
\D
\V+
\d
\V+
\S
\V+
\s
\V+
\W
\V+
\w
\V+
Any
\V+
AllAny
\V++
\R
\V+
\H
\V+
\h
\V+
\V
\V++
\v
\V+
\Z
\V++
\z
\V+
$
Ket
End
------------------------------------------------------------------
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\C \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
------------------------------------------------------------------
Bra
\v+
\D
\v++
\d
\v++
\S
\v+
\s
\v+
\W
\v++
\w
\v+
Any
\v+
AllAny
\v+
\R
\v+
\H
\v++
\h
\v++
\V
\v+
\v
\v+
\Z
\v++
\z
\v+
$
Ket
End
------------------------------------------------------------------
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\C a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
------------------------------------------------------------------
Bra
a+
\D
a++
\d
a+
\S
a++
\s
a++
\W
a+
\w
a+
Any
a+
AllAny
a++
\R
a+
\H
a++
\h
a+
\V
a++
\v
a++
\Z
a++
\z
a++
$
Ket
End
------------------------------------------------------------------
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\C \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
------------------------------------------------------------------
Bra
\x0a+
\D
\x0a++
\d
\x0a++
\S
\x0a+
\s
\x0a+
\W
\x0a++
\w
\x0a+
Any
\x0a+
AllAny
\x0a+
\R
\x0a+
\H
\x0a++
\h
\x0a++
\V
\x0a+
\v
\x0a+
\Z
\x0a++
\z
\x0a+
$
Ket
End
------------------------------------------------------------------
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
------------------------------------------------------------------
Bra
Any+
\D
Any+
\d
Any+
\S
Any+
\s
Any+
\W
Any+
\w
Any+
Any
Any+
AllAny
Any++
\R
Any+
\H
Any+
\h
Any+
\V
Any+
\v
Any+
\Z
Any++
\z
Any+
$
Ket
End
------------------------------------------------------------------
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
------------------------------------------------------------------
Bra
AllAny+
\D
AllAny+
\d
AllAny+
\S
AllAny+
\s
AllAny+
\W
AllAny+
\w
AllAny+
AllAny
AllAny+
AllAny
AllAny+
\R
AllAny+
\H
AllAny+
\h
AllAny+
\V
AllAny+
\v
AllAny+
\Z
AllAny++
\z
AllAny+
$
Ket
End
------------------------------------------------------------------
/\D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \C+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
\D+ \D+
@ -12588,8 +12520,6 @@ Subject length lower bound = 5
\W+ \W+
/m $ /m $
\w++ \w++
/m $
AllAny+
/m $ /m $
\R+ \R+
/m $ /m $
@ -14210,9 +14140,6 @@ Capturing subpattern count = 2
May match empty string May match empty string
Subject length lower bound = 0 Subject length lower bound = 0
/ab\Cde/never_backslash_c
Failed: error 183 at offset 3: using \C is disabled by the application
/abc/ /abc/
\= Expect no match \= Expect no match
\[9x!xxx(]{9999} \[9x!xxx(]{9999}
@ -14532,14 +14459,6 @@ Subject length lower bound = 0
\0{ab} \0{ab}
0: \x00{ab} 0: \x00{ab}
/ab\Cde/
abXde
0: abXde
/(?<=ab\Cde)X/
abZdeX
0: X
/^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/ /^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/
ababababbbabZXXXX ababababbbabZXXXX
0: ababababbbabZ 0: ababababbbabZ

89
testdata/testoutput21 vendored Normal file
View File

@ -0,0 +1,89 @@
# These are tests of \C that do not involve UTF. They are not run when \C is
# disabled by compiling with --enable-never-backslash-C.
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
------------------------------------------------------------------
Bra
AllAny+
\D
AllAny+
\d
AllAny+
\S
AllAny+
\s
AllAny+
\W
AllAny+
\w
AllAny+
Any
AllAny+
\R
AllAny+
\H
AllAny+
\h
AllAny+
\V
AllAny+
\v
AllAny+
\Z
AllAny++
\z
AllAny+
$
Ket
End
------------------------------------------------------------------
/\D+\C \d+\C \S+\C \s+\C \W+\C \w+\C .+\C \R+\C \H+\C \h+\C \V+\C \v+\C a+\C \n+\C \C+\C/Bx
------------------------------------------------------------------
Bra
\D+
AllAny
\d+
AllAny
\S+
AllAny
\s+
AllAny
\W+
AllAny
\w+
AllAny
Any+
AllAny
\R+
AllAny
\H+
AllAny
\h+
AllAny
\V+
AllAny
\v+
AllAny
a+
AllAny
\x0a+
AllAny
AllAny+
AllAny
Ket
End
------------------------------------------------------------------
/ab\Cde/never_backslash_c
Failed: error 183 at offset 3: using \C is disabled by the application
/ab\Cde/
abXde
0: abXde
/(?<=ab\Cde)X/
abZdeX
0: X
# End of testinput21

161
testdata/testoutput22-16 vendored Normal file
View File

@ -0,0 +1,161 @@
# Tests of \C when Unicode support is available. Note that \C is not supported
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
# in some widths and not in others.
/ab\Cde/utf
abXde
0: abXde
# This should produce an error diagnostic (\C in UTF lookbehind)
/(?<=ab\Cde)X/utf
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
# Autopossessification tests
/\C+\X \X+\C/Bx
------------------------------------------------------------------
Bra
AllAny+
extuni
extuni+
AllAny
Ket
End
------------------------------------------------------------------
/\C+\X \X+\C/Bx,utf
------------------------------------------------------------------
Bra
Anybyte+
extuni
extuni+
Anybyte
Ket
End
------------------------------------------------------------------
/\C\X*TӅ;
{0,6}\v+ F
/utf
\= Expect no match
Ӆ\x0a
No match
/\C(\W?ſ)'?{{/utf
\= Expect no match
\\C(\\W?ſ)'?{{
No match
/X(\C{3})/utf
X\x{1234}
No match
X\x{11234}Y
0: X\x{11234}Y
1: \x{11234}Y
X\x{11234}YZ
0: X\x{11234}Y
1: \x{11234}Y
/X(\C{4})/utf
X\x{1234}YZ
No match
X\x{11234}YZ
0: X\x{11234}YZ
1: \x{11234}YZ
X\x{11234}YZW
0: X\x{11234}YZ
1: \x{11234}YZ
/X\C*/utf
XYZabcdce
0: XYZabcdce
/X\C*?/utf
XYZabcde
0: X
/X\C{3,5}/utf
Xabcdefg
0: Xabcde
X\x{1234}
No match
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
No match
X\x{1234}\x{512}YZ
0: X\x{1234}\x{512}YZ
X\x{11234}Y
0: X\x{11234}Y
X\x{11234}YZ
0: X\x{11234}YZ
X\x{11234}\x{512}
0: X\x{11234}\x{512}
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}YZ
X\x{11234}\x{512}\x{11234}Z
0: X\x{11234}\x{512}\x{11234}
/X\C{3,5}?/utf
Xabcdefg
0: Xabc
X\x{1234}
No match
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
No match
X\x{11234}Y
0: X\x{11234}Y
X\x{11234}YZ
0: X\x{11234}Y
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}
X\x{11234}
No match
/a\Cb/utf
aXb
0: aXb
a\nb
0: a\x{0a}b
a\x{100}b
0: a\x{100}b
/a\C\Cb/utf
a\x{100}b
No match
a\x{12257}b
0: a\x{12257}b
a\x{12257}\x{11234}b
No match
/ab\Cde/utf
abXde
0: abXde
# This one is here not because it's different to Perl, but because the way
# the captured single code unit is displayed. (In Perl it becomes a character,
# and you can't tell the difference.)
/X(\C)(.*)/utf
X\x{1234}
0: X\x{1234}
1: \x{1234}
2:
X\nabc
0: X\x{0a}abc
1: \x{0a}
2: abc
# This one is here because Perl gives out a grumbly error message (quite
# correctly, but that messes up comparisons).
/a\Cb/utf
\= Expect no match in 8-bit mode
a\x{100}b
0: a\x{100}b
# End of testinput22

159
testdata/testoutput22-32 vendored Normal file
View File

@ -0,0 +1,159 @@
# Tests of \C when Unicode support is available. Note that \C is not supported
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
# in some widths and not in others.
/ab\Cde/utf
abXde
0: abXde
# This should produce an error diagnostic (\C in UTF lookbehind)
/(?<=ab\Cde)X/utf
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
# Autopossessification tests
/\C+\X \X+\C/Bx
------------------------------------------------------------------
Bra
AllAny+
extuni
extuni+
AllAny
Ket
End
------------------------------------------------------------------
/\C+\X \X+\C/Bx,utf
------------------------------------------------------------------
Bra
Anybyte+
extuni
extuni+
Anybyte
Ket
End
------------------------------------------------------------------
/\C\X*TӅ;
{0,6}\v+ F
/utf
\= Expect no match
Ӆ\x0a
No match
/\C(\W?ſ)'?{{/utf
\= Expect no match
\\C(\\W?ſ)'?{{
No match
/X(\C{3})/utf
X\x{1234}
No match
X\x{11234}Y
No match
X\x{11234}YZ
0: X\x{11234}YZ
1: \x{11234}YZ
/X(\C{4})/utf
X\x{1234}YZ
No match
X\x{11234}YZ
No match
X\x{11234}YZW
0: X\x{11234}YZW
1: \x{11234}YZW
/X\C*/utf
XYZabcdce
0: XYZabcdce
/X\C*?/utf
XYZabcde
0: X
/X\C{3,5}/utf
Xabcdefg
0: Xabcde
X\x{1234}
No match
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
No match
X\x{1234}\x{512}YZ
0: X\x{1234}\x{512}YZ
X\x{11234}Y
No match
X\x{11234}YZ
0: X\x{11234}YZ
X\x{11234}\x{512}
No match
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}YZ
X\x{11234}\x{512}\x{11234}Z
0: X\x{11234}\x{512}\x{11234}Z
/X\C{3,5}?/utf
Xabcdefg
0: Xabc
X\x{1234}
No match
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
No match
X\x{11234}Y
No match
X\x{11234}YZ
0: X\x{11234}YZ
X\x{11234}\x{512}YZ
0: X\x{11234}\x{512}Y
X\x{11234}
No match
/a\Cb/utf
aXb
0: aXb
a\nb
0: a\x{0a}b
a\x{100}b
0: a\x{100}b
/a\C\Cb/utf
a\x{100}b
No match
a\x{12257}b
No match
a\x{12257}\x{11234}b
0: a\x{12257}\x{11234}b
/ab\Cde/utf
abXde
0: abXde
# This one is here not because it's different to Perl, but because the way
# the captured single code unit is displayed. (In Perl it becomes a character,
# and you can't tell the difference.)
/X(\C)(.*)/utf
X\x{1234}
0: X\x{1234}
1: \x{1234}
2:
X\nabc
0: X\x{0a}abc
1: \x{0a}
2: abc
# This one is here because Perl gives out a grumbly error message (quite
# correctly, but that messes up comparisons).
/a\Cb/utf
\= Expect no match in 8-bit mode
a\x{100}b
0: a\x{100}b
# End of testinput22

163
testdata/testoutput22-8 vendored Normal file
View File

@ -0,0 +1,163 @@
# Tests of \C when Unicode support is available. Note that \C is not supported
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
# in some widths and not in others.
/ab\Cde/utf
abXde
0: abXde
# This should produce an error diagnostic (\C in UTF lookbehind)
/(?<=ab\Cde)X/utf
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
# Autopossessification tests
/\C+\X \X+\C/Bx
------------------------------------------------------------------
Bra
AllAny+
extuni
extuni+
AllAny
Ket
End
------------------------------------------------------------------
/\C+\X \X+\C/Bx,utf
------------------------------------------------------------------
Bra
Anybyte+
extuni
extuni+
Anybyte
Ket
End
------------------------------------------------------------------
/\C\X*TӅ;
{0,6}\v+ F
/utf
\= Expect no match
Ӆ\x0a
No match
/\C(\W?ſ)'?{{/utf
\= Expect no match
\\C(\\W?ſ)'?{{
No match
/X(\C{3})/utf
X\x{1234}
0: X\x{1234}
1: \x{1234}
X\x{11234}Y
0: X\x{f0}\x{91}\x{88}
1: \x{f0}\x{91}\x{88}
X\x{11234}YZ
0: X\x{f0}\x{91}\x{88}
1: \x{f0}\x{91}\x{88}
/X(\C{4})/utf
X\x{1234}YZ
0: X\x{1234}Y
1: \x{1234}Y
X\x{11234}YZ
0: X\x{11234}
1: \x{11234}
X\x{11234}YZW
0: X\x{11234}
1: \x{11234}
/X\C*/utf
XYZabcdce
0: XYZabcdce
/X\C*?/utf
XYZabcde
0: X
/X\C{3,5}/utf
Xabcdefg
0: Xabcde
X\x{1234}
0: X\x{1234}
X\x{1234}YZ
0: X\x{1234}YZ
X\x{1234}\x{512}
0: X\x{1234}\x{512}
X\x{1234}\x{512}YZ
0: X\x{1234}\x{512}
X\x{11234}Y
0: X\x{11234}Y
X\x{11234}YZ
0: X\x{11234}Y
X\x{11234}\x{512}
0: X\x{11234}\x{d4}
X\x{11234}\x{512}YZ
0: X\x{11234}\x{d4}
X\x{11234}\x{512}\x{11234}Z
0: X\x{11234}\x{d4}
/X\C{3,5}?/utf
Xabcdefg
0: Xabc
X\x{1234}
0: X\x{1234}
X\x{1234}YZ
0: X\x{1234}
X\x{1234}\x{512}
0: X\x{1234}
X\x{11234}Y
0: X\x{f0}\x{91}\x{88}
X\x{11234}YZ
0: X\x{f0}\x{91}\x{88}
X\x{11234}\x{512}YZ
0: X\x{f0}\x{91}\x{88}
X\x{11234}
0: X\x{f0}\x{91}\x{88}
/a\Cb/utf
aXb
0: aXb
a\nb
0: a\x{0a}b
a\x{100}b
No match
/a\C\Cb/utf
a\x{100}b
0: a\x{100}b
a\x{12257}b
No match
a\x{12257}\x{11234}b
No match
/ab\Cde/utf
abXde
0: abXde
# This one is here not because it's different to Perl, but because the way
# the captured single code unit is displayed. (In Perl it becomes a character,
# and you can't tell the difference.)
/X(\C)(.*)/utf
X\x{1234}
0: X\x{1234}
1: \x{e1}
2: \x{88}\x{b4}
X\nabc
0: X\x{0a}abc
1: \x{0a}
2: abc
# This one is here because Perl gives out a grumbly error message (quite
# correctly, but that messes up comparisons).
/a\Cb/utf
\= Expect no match in 8-bit mode
a\x{100}b
No match
# End of testinput22

8
testdata/testoutput23 vendored Normal file
View File

@ -0,0 +1,8 @@
# This test is run when PCRE2 has been built with --enable-never-backslash-C,
# which disables the use of \C. All we can do is check that it gives the
# correct error message.
/a\Cb/
Failed: error 185 at offset 2: using \C is disabled in this PCRE2 library
# End of testinput23

27
testdata/testoutput5 vendored
View File

@ -181,10 +181,6 @@ Subject length lower bound = 3
\x{212ab}\x{212ab}\x{212ab}\x{861} \x{212ab}\x{212ab}\x{212ab}\x{861}
0: \x{212ab}\x{212ab}\x{212ab} 0: \x{212ab}\x{212ab}\x{212ab}
/(?<=\C)X/utf
Failed: error 136 at offset 6: \C is not allowed in a lookbehind assertion
Should produce an error diagnostic
/^[ab]/IB,utf /^[ab]/IB,utf
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
@ -2905,9 +2901,6 @@ No match
aAz aAz
No match No match
/(?<=ab\Cde)X/utf
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
/\X/ /\X/
a\=ps a\=ps
0: a 0: a
@ -3803,7 +3796,7 @@ No match
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \C+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx /\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
\D+ \D+
@ -3818,8 +3811,6 @@ No match
extuni extuni
\w+ \w+
extuni extuni
AllAny+
extuni
\R+ \R+
extuni extuni
\H+ \H+
@ -3858,7 +3849,7 @@ No match
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\C \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx /\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
------------------------------------------------------------------ ------------------------------------------------------------------
Bra Bra
extuni+ extuni+
@ -3876,8 +3867,6 @@ No match
extuni+ extuni+
Any Any
extuni+ extuni+
AllAny
extuni+
\R \R
extuni+ extuni+
\H \H
@ -4010,18 +3999,6 @@ Failed: error 122 at offset 1227: unmatched closing parenthesis
"(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'" "(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
Failed: error 124 at offset 113: letter or underscore expected after (?< or (?' Failed: error 124 at offset 113: letter or underscore expected after (?< or (?'
/\C\X*TӅ;
{0,6}\v+ F
/utf
\= Expect no match
Ӆ\x0a
No match
/\C(\W?ſ)'?{{/utf
\= Expect no match
\\C(\\W?ſ)'?{{
No match
/[\pS#moq]/ /[\pS#moq]/
= =
0: = 0: =

View File

@ -7174,14 +7174,6 @@ Matched, but offsets vector is too small to show all matches
2: aa 2: aa
3: a 3: a
/ab\Cde/
abXde
0: abXde
/(?<=ab\Cde)X/
abZdeX
0: X
/^\R/ /^\R/
\r\=ps \r\=ps
0: \x0d 0: \x0d

View File

@ -1141,13 +1141,6 @@ Partial match: abcde
the cat\=ph the cat\=ph
Partial match: the cat Partial match: the cat
/ab\Cde/utf
abXde
Failed: error -42: pattern contains an item that is not supported for DFA matching
/(?<=ab\Cde)X/utf
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
/./newline=crlf,utf /./newline=crlf,utf
\r\=ps \r\=ps
0: \x{0d} 0: \x{0d}

View File

@ -4,12 +4,8 @@
#forbid_utf #forbid_utf
#newline_default lf any anycrlf #newline_default lf any anycrlf
/a\Cb/ /ab/
aXb \= Expect error message (too big char) and no match
0: aXb
a\nb
0: a\x0ab
\= Expect no match and error message (too big char)
A\x{123}B A\x{123}B
** Character \x{123} is greater than 255 and UTF-8 mode is not enabled. ** Character \x{123} is greater than 255 and UTF-8 mode is not enabled.
** Truncation will probably give the wrong result. ** Truncation will probably give the wrong result.