Implement --never-backslash-C
This commit is contained in:
parent
5923caf05e
commit
3263d44b97
|
@ -70,6 +70,7 @@
|
||||||
# 2015-04-24 PH added support for PCRE2_DEBUG
|
# 2015-04-24 PH added support for PCRE2_DEBUG
|
||||||
# 2015-07-16 PH updated for new pcre2_find_bracket source module
|
# 2015-07-16 PH updated for new pcre2_find_bracket source module
|
||||||
# 2015-08-24 PH correct C_FLAGS setting (patch from Roy Ivy III)
|
# 2015-08-24 PH correct C_FLAGS setting (patch from Roy Ivy III)
|
||||||
|
# 2015-10-16 PH added support for never-backslash-C
|
||||||
|
|
||||||
PROJECT(PCRE2 C)
|
PROJECT(PCRE2 C)
|
||||||
|
|
||||||
|
@ -162,6 +163,9 @@ SET(PCRE2_SUPPORT_UNICODE ON CACHE BOOL
|
||||||
SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
|
SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
|
||||||
"ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks")
|
"ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks")
|
||||||
|
|
||||||
|
SET(PCRE2_NEVER_BACKSLASH_C OFF CACHE BOOL
|
||||||
|
"If ON, backslash-C (upper case C) is locked out.")
|
||||||
|
|
||||||
SET(PCRE2_SUPPORT_VALGRIND OFF CACHE BOOL
|
SET(PCRE2_SUPPORT_VALGRIND OFF CACHE BOOL
|
||||||
"Enable Valgrind support.")
|
"Enable Valgrind support.")
|
||||||
|
|
||||||
|
@ -252,6 +256,10 @@ IF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
||||||
SET(BSR_ANYCRLF 1)
|
SET(BSR_ANYCRLF 1)
|
||||||
ENDIF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
ENDIF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
||||||
|
|
||||||
|
IF(PCRE2_NEVER_BACKSLASH_C)
|
||||||
|
SET(NEVER_BACKSLASH_C 1)
|
||||||
|
ENDIF(PCRE2_NEVER_BACKSLASH_C)
|
||||||
|
|
||||||
IF(PCRE2_SUPPORT_UNICODE)
|
IF(PCRE2_SUPPORT_UNICODE)
|
||||||
SET(SUPPORT_UNICODE 1)
|
SET(SUPPORT_UNICODE 1)
|
||||||
ENDIF(PCRE2_SUPPORT_UNICODE)
|
ENDIF(PCRE2_SUPPORT_UNICODE)
|
||||||
|
@ -719,6 +727,7 @@ IF(PCRE2_SHOW_REPORT)
|
||||||
MESSAGE(STATUS " Enable Unicode support .......... : ${PCRE2_SUPPORT_UNICODE}")
|
MESSAGE(STATUS " Enable Unicode support .......... : ${PCRE2_SUPPORT_UNICODE}")
|
||||||
MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE2_NEWLINE}")
|
MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE2_NEWLINE}")
|
||||||
MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE2_SUPPORT_BSR_ANYCRLF}")
|
MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE2_SUPPORT_BSR_ANYCRLF}")
|
||||||
|
MESSAGE(STATUS " \\C is disabled .................. : ${PCRE2_NEVER_BACKSLASH_C}")
|
||||||
MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE2_EBCDIC}")
|
MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE2_EBCDIC}")
|
||||||
MESSAGE(STATUS " EBCDIC coding with NL=0x25 ...... : ${PCRE2_EBCDIC_NL25}")
|
MESSAGE(STATUS " EBCDIC coding with NL=0x25 ...... : ${PCRE2_EBCDIC_NL25}")
|
||||||
MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE2_REBUILD_CHARTABLES}")
|
MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE2_REBUILD_CHARTABLES}")
|
||||||
|
|
|
@ -201,6 +201,8 @@ escape was being ignored.
|
||||||
57. Fixed integer overflow for patterns whose minimum matching length is very,
|
57. Fixed integer overflow for patterns whose minimum matching length is very,
|
||||||
very large.
|
very large.
|
||||||
|
|
||||||
|
58. Implemented --never-backslash-C.
|
||||||
|
|
||||||
|
|
||||||
Version 10.20 30-June-2015
|
Version 10.20 30-June-2015
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
9
README
9
README
|
@ -220,6 +220,13 @@ library. They are also documented in the pcre2build man page.
|
||||||
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
||||||
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
||||||
|
|
||||||
|
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
||||||
|
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
||||||
|
characters. You can build PCRE2 with the use of \C permanently locked out by
|
||||||
|
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
||||||
|
command. When \C is allowed by the library, individual applications can lock
|
||||||
|
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
||||||
|
|
||||||
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
||||||
pattern. This limits the amount of system stack that a pattern uses when it
|
pattern. This limits the amount of system stack that a pattern uses when it
|
||||||
is compiled. The default is 250, but you can change it by setting, for
|
is compiled. The default is 250, but you can change it by setting, for
|
||||||
|
@ -833,4 +840,4 @@ The distribution should contain the files listed below.
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: ph10
|
||||||
Email domain: cam.ac.uk
|
Email domain: cam.ac.uk
|
||||||
Last updated: 16 July 2015
|
Last updated: 16 October 2015
|
||||||
|
|
64
RunTest
64
RunTest
|
@ -75,7 +75,10 @@ title17="Test 17: JIT-specific features when JIT is available"
|
||||||
title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP"
|
title18="Test 18: Tests of the POSIX interface, excluding UTF/UCP"
|
||||||
title19="Test 19: Tests of the POSIX interface with UTF/UCP"
|
title19="Test 19: Tests of the POSIX interface with UTF/UCP"
|
||||||
title20="Test 20: Serialization tests"
|
title20="Test 20: Serialization tests"
|
||||||
maxtest=20
|
title21="Test 21: \C tests without UTF (supported for DFA matching)"
|
||||||
|
title22="Test 22: \C tests with UTF (not supported for DFA matching)"
|
||||||
|
title23="Test 23: \C disabled test"
|
||||||
|
maxtest=23
|
||||||
|
|
||||||
if [ $# -eq 1 -a "$1" = "list" ]; then
|
if [ $# -eq 1 -a "$1" = "list" ]; then
|
||||||
echo $title0
|
echo $title0
|
||||||
|
@ -99,6 +102,9 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
|
||||||
echo $title18
|
echo $title18
|
||||||
echo $title19
|
echo $title19
|
||||||
echo $title20
|
echo $title20
|
||||||
|
echo $title21
|
||||||
|
echo $title22
|
||||||
|
echo $title23
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -223,6 +229,9 @@ do17=no
|
||||||
do18=no
|
do18=no
|
||||||
do19=no
|
do19=no
|
||||||
do20=no
|
do20=no
|
||||||
|
do21=no
|
||||||
|
do22=no
|
||||||
|
do23=no
|
||||||
|
|
||||||
while [ $# -gt 0 ] ; do
|
while [ $# -gt 0 ] ; do
|
||||||
case $1 in
|
case $1 in
|
||||||
|
@ -247,6 +256,9 @@ while [ $# -gt 0 ] ; do
|
||||||
18) do18=yes;;
|
18) do18=yes;;
|
||||||
19) do19=yes;;
|
19) do19=yes;;
|
||||||
20) do20=yes;;
|
20) do20=yes;;
|
||||||
|
21) do21=yes;;
|
||||||
|
22) do22=yes;;
|
||||||
|
23) do23=yes;;
|
||||||
-8) arg8=yes;;
|
-8) arg8=yes;;
|
||||||
-16) arg16=yes;;
|
-16) arg16=yes;;
|
||||||
-32) arg32=yes;;
|
-32) arg32=yes;;
|
||||||
|
@ -326,6 +338,11 @@ support16=$?
|
||||||
$sim ./pcre2test -C pcre2-32 >/dev/null
|
$sim ./pcre2test -C pcre2-32 >/dev/null
|
||||||
support32=$?
|
support32=$?
|
||||||
|
|
||||||
|
# \C may be disabled
|
||||||
|
|
||||||
|
$sim ./pcre2test -C backslash-C >/dev/null
|
||||||
|
supportBSC=$?
|
||||||
|
|
||||||
# Initialize all bitsizes skipped
|
# Initialize all bitsizes skipped
|
||||||
|
|
||||||
test8=skip
|
test8=skip
|
||||||
|
@ -400,7 +417,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
||||||
$do8 = no -a $do9 = no -a $do10 = no -a $do11 = no -a \
|
$do8 = no -a $do9 = no -a $do10 = no -a $do11 = no -a \
|
||||||
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
|
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
|
||||||
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
|
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
|
||||||
$do20 = no \
|
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no \
|
||||||
]; then
|
]; then
|
||||||
do0=yes
|
do0=yes
|
||||||
do1=yes
|
do1=yes
|
||||||
|
@ -423,6 +440,9 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
||||||
do18=yes
|
do18=yes
|
||||||
do19=yes
|
do19=yes
|
||||||
do20=yes
|
do20=yes
|
||||||
|
do21=yes
|
||||||
|
do22=yes
|
||||||
|
do23=yes
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Handle any explicit skips at this stage, so that an argument list may consist
|
# Handle any explicit skips at this stage, so that an argument list may consist
|
||||||
|
@ -781,6 +801,46 @@ for bmode in "$test8" "$test16" "$test32"; do
|
||||||
checkresult $? 20 ""
|
checkresult $? 20 ""
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# \C tests without UTF - DFA matching is supported
|
||||||
|
|
||||||
|
if [ "$do21" = yes ] ; then
|
||||||
|
echo $title21
|
||||||
|
if [ $supportBSC -eq 0 ] ; then
|
||||||
|
echo " Skipped because \C is disabled"
|
||||||
|
else
|
||||||
|
for opt in "" $jitopt -dfa; do
|
||||||
|
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput21 testtry
|
||||||
|
checkresult $? 21 "$opt"
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# \C tests with UTF - DFA matching is not supported for \C in UTF mode
|
||||||
|
|
||||||
|
if [ "$do22" = yes ] ; then
|
||||||
|
echo $title22
|
||||||
|
if [ $supportBSC -eq 0 ] ; then
|
||||||
|
echo " Skipped because \C is disabled"
|
||||||
|
else
|
||||||
|
for opt in "" $jitopt; do
|
||||||
|
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput22 testtry
|
||||||
|
checkresult $? 22-$bits "$opt"
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test when \C is disabled
|
||||||
|
|
||||||
|
if [ "$do23" = yes ] ; then
|
||||||
|
echo $title23
|
||||||
|
if [ $supportBSC -ne 0 ] ; then
|
||||||
|
echo " Skipped because \C is not disabled"
|
||||||
|
else
|
||||||
|
# Test 23 is run exactly once with no extra option. $opt is only meaningful inside a "for opt in ..." loop; using it here would pick up a stale value left over from an earlier test's loop while checkresult below reports "".
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput23 testtry
|
||||||
|
checkresult $? 23 ""
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
# End of loop for 8/16/32-bit tests
|
# End of loop for 8/16/32-bit tests
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
74
RunTest.bat
74
RunTest.bat
|
@ -13,11 +13,10 @@
|
||||||
@rem line. Added argument validation and added error reporting.
|
@rem line. Added argument validation and added error reporting.
|
||||||
@rem
|
@rem
|
||||||
@rem Sheri Pierce added logic to skip feature dependent tests
|
@rem Sheri Pierce added logic to skip feature dependent tests
|
||||||
@rem tests 4 5 9 15 and 18 require utf support
|
@rem tests 4 5 7 10 12 14 19 and 22 require Unicode support
|
||||||
@rem tests 6 7 10 16 and 19 require ucp support
|
@rem 8 requires Unicode and link size 2
|
||||||
@rem 11 requires ucp and link size 2
|
@rem 16 requires absence of jit support
|
||||||
@rem 12 requires presence of jit support
|
@rem 17 requires presence of jit support
|
||||||
@rem 13 requires absence of jit support
|
|
||||||
@rem Sheri P also added override tests for study and jit testing
|
@rem Sheri P also added override tests for study and jit testing
|
||||||
@rem Zoltan Herczeg added libpcre16 support
|
@rem Zoltan Herczeg added libpcre16 support
|
||||||
@rem Zoltan Herczeg added libpcre32 support
|
@rem Zoltan Herczeg added libpcre32 support
|
||||||
|
@ -25,6 +24,7 @@
|
||||||
@rem
|
@rem
|
||||||
@rem The file was converted for PCRE2 by PH, February 2015.
|
@rem The file was converted for PCRE2 by PH, February 2015.
|
||||||
@rem Updated for new test 14 (moving others up a number), August 2015.
|
@rem Updated for new test 14 (moving others up a number), August 2015.
|
||||||
|
@rem Tidied and updated for new tests 21, 22, 23 by PH, October 2015.
|
||||||
|
|
||||||
|
|
||||||
setlocal enabledelayedexpansion
|
setlocal enabledelayedexpansion
|
||||||
|
@ -65,6 +65,8 @@ set support32=%ERRORLEVEL%
|
||||||
set unicode=%ERRORLEVEL%
|
set unicode=%ERRORLEVEL%
|
||||||
%pcre2test% -C jit >NUL
|
%pcre2test% -C jit >NUL
|
||||||
set jit=%ERRORLEVEL%
|
set jit=%ERRORLEVEL%
|
||||||
|
%pcre2test% -C backslash-C >NUL
|
||||||
|
set supportBSC=%ERRORLEVEL%
|
||||||
|
|
||||||
if %support8% EQU 1 (
|
if %support8% EQU 1 (
|
||||||
if not exist testout8 md testout8
|
if not exist testout8 md testout8
|
||||||
|
@ -101,18 +103,21 @@ set do17=no
|
||||||
set do18=no
|
set do18=no
|
||||||
set do19=no
|
set do19=no
|
||||||
set do20=no
|
set do20=no
|
||||||
|
set do21=no
|
||||||
|
set do22=no
|
||||||
|
set do23=no
|
||||||
set all=yes
|
set all=yes
|
||||||
|
|
||||||
for %%a in (%*) do (
|
for %%a in (%*) do (
|
||||||
set valid=no
|
set valid=no
|
||||||
for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20) do if %%v == %%a set valid=yes
|
for %%v in (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23) do if %%v == %%a set valid=yes
|
||||||
if "!valid!" == "yes" (
|
if "!valid!" == "yes" (
|
||||||
set do%%a=yes
|
set do%%a=yes
|
||||||
set all=no
|
set all=no
|
||||||
) else (
|
) else (
|
||||||
echo Invalid test number - %%a!
|
echo Invalid test number - %%a!
|
||||||
echo Usage %0 [ test_number ] ...
|
echo Usage %0 [ test_number ] ...
|
||||||
echo Where test_number is one or more optional test numbers 1 through 20, default is all tests.
|
echo Where test_number is one or more optional test numbers 1 through 23, default is all tests.
|
||||||
exit /b 1
|
exit /b 1
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -139,6 +144,9 @@ if "%all%" == "yes" (
|
||||||
set do18=yes
|
set do18=yes
|
||||||
set do19=yes
|
set do19=yes
|
||||||
set do20=yes
|
set do20=yes
|
||||||
|
set do21=yes
|
||||||
|
set do22=yes
|
||||||
|
set do23=yes
|
||||||
)
|
)
|
||||||
|
|
||||||
@echo RunTest.bat's pcre2test output is written to newly created subfolders
|
@echo RunTest.bat's pcre2test output is written to newly created subfolders
|
||||||
|
@ -187,6 +195,9 @@ if "%do17%" == "yes" call :do17
|
||||||
if "%do18%" == "yes" call :do18
|
if "%do18%" == "yes" call :do18
|
||||||
if "%do19%" == "yes" call :do19
|
if "%do19%" == "yes" call :do19
|
||||||
if "%do20%" == "yes" call :do20
|
if "%do20%" == "yes" call :do20
|
||||||
|
if "%do21%" == "yes" call :do21
|
||||||
|
if "%do22%" == "yes" call :do22
|
||||||
|
if "%do23%" == "yes" call :do23
|
||||||
:modeSkip
|
:modeSkip
|
||||||
if "%mode%" == "" (
|
if "%mode%" == "" (
|
||||||
set mode=-16
|
set mode=-16
|
||||||
|
@ -323,7 +334,7 @@ if %unicode% EQU 0 (
|
||||||
goto :eof
|
goto :eof
|
||||||
|
|
||||||
:do6
|
:do6
|
||||||
call :runsub 6 testout "DFA matching main non-UTF, non-UCP functionality" -q -dfa
|
call :runsub 6 testout "DFA matching main non-UTF, non-UCP functionality" -q
|
||||||
goto :eof
|
goto :eof
|
||||||
|
|
||||||
:do7
|
:do7
|
||||||
|
@ -331,7 +342,7 @@ if %unicode% EQU 0 (
|
||||||
echo Test 7 Skipped due to absence of Unicode support.
|
echo Test 7 Skipped due to absence of Unicode support.
|
||||||
goto :eof
|
goto :eof
|
||||||
)
|
)
|
||||||
call :runsub 7 testout "DFA matching with UTF-%bits% and Unicode property support" -q -dfa
|
call :runsub 7 testout "DFA matching with UTF-%bits% and Unicode property support" -q
|
||||||
goto :eof
|
goto :eof
|
||||||
|
|
||||||
:do8
|
:do8
|
||||||
|
@ -395,12 +406,16 @@ if %bits% EQU 8 (
|
||||||
echo Test 13 Skipped when running 8-bit tests.
|
echo Test 13 Skipped when running 8-bit tests.
|
||||||
goto :eof
|
goto :eof
|
||||||
)
|
)
|
||||||
call :runsub 13 testout "DFA specials for the basic 16/32-bit library" -q -dfa
|
call :runsub 13 testout "DFA specials for the basic 16/32-bit library" -q
|
||||||
goto :eof
|
goto :eof
|
||||||
|
|
||||||
:do14
|
:do14
|
||||||
call :runsub 14 testout "DFA specials for UTF and UCP support" -q
|
if %unicode% EQU 0 (
|
||||||
goto :eof
|
echo Test 14 Skipped due to absence of Unicode support.
|
||||||
|
goto :eof
|
||||||
|
)
|
||||||
|
call :runsub 14 testout "DFA specials for UTF and UCP support" -q
|
||||||
|
goto :eof
|
||||||
|
|
||||||
:do15
|
:do15
|
||||||
call :runsub 15 testout "Non-JIT limits and other non_JIT tests" -q
|
call :runsub 15 testout "Non-JIT limits and other non_JIT tests" -q
|
||||||
|
@ -442,6 +457,10 @@ if %bits% EQU 16 (
|
||||||
if %bits% EQU 32 (
|
if %bits% EQU 32 (
|
||||||
echo Test 19 Skipped when running 32-bit tests.
|
echo Test 19 Skipped when running 32-bit tests.
|
||||||
goto :eof
|
goto :eof
|
||||||
|
)
|
||||||
|
if %unicode% EQU 0 (
|
||||||
|
echo Test 19 Skipped due to absence of Unicode support.
|
||||||
|
goto :eof
|
||||||
)
|
)
|
||||||
call :runsub 19 testout "POSIX interface with UTF-8 and UCP" -q
|
call :runsub 19 testout "POSIX interface with UTF-8 and UCP" -q
|
||||||
goto :eof
|
goto :eof
|
||||||
|
@ -450,6 +469,37 @@ goto :eof
|
||||||
call :runsub 20 testout "Serialization tests" -q
|
call :runsub 20 testout "Serialization tests" -q
|
||||||
goto :eof
|
goto :eof
|
||||||
|
|
||||||
|
:do21
|
||||||
|
if %supportBSC% EQU 0 (
|
||||||
|
echo Test 21 Skipped due to absence of backslash-C support.
|
||||||
|
goto :eof
|
||||||
|
)
|
||||||
|
call :runsub 21 testout "Backslash-C tests without UTF" -q
|
||||||
|
call :runsub 21 testout "Backslash-C tests without UTF (DFA)" -q -dfa
|
||||||
|
if %jit% EQU 1 call :runsub 21 testoutjit "Test with JIT Override" -q -jit
|
||||||
|
goto :eof
|
||||||
|
|
||||||
|
:do22
|
||||||
|
if %supportBSC% EQU 0 (
|
||||||
|
echo Test 22 Skipped due to absence of backslash-C support.
|
||||||
|
goto :eof
|
||||||
|
)
|
||||||
|
if %unicode% EQU 0 (
|
||||||
|
echo Test 22 Skipped due to absence of Unicode support.
|
||||||
|
goto :eof
|
||||||
|
)
|
||||||
|
call :runsub 22 testout "Backslash-C tests with UTF" -q
|
||||||
|
if %jit% EQU 1 call :runsub 22 testoutjit "Test with JIT Override" -q -jit
|
||||||
|
goto :eof
|
||||||
|
|
||||||
|
:do23
|
||||||
|
if %supportBSC% EQU 1 (
|
||||||
|
echo Test 23 Skipped due to presence of backslash-C support.
|
||||||
|
goto :eof
|
||||||
|
)
|
||||||
|
call :runsub 23 testout "Backslash-C disabled test" -q
|
||||||
|
goto :eof
|
||||||
|
|
||||||
:conferror
|
:conferror
|
||||||
@echo.
|
@echo.
|
||||||
@echo Either your build is incomplete or you have a configuration error.
|
@echo Either your build is incomplete or you have a configuration error.
|
||||||
|
|
|
@ -33,6 +33,7 @@
|
||||||
#cmakedefine EBCDIC 1
|
#cmakedefine EBCDIC 1
|
||||||
#cmakedefine EBCDIC_NL25 1
|
#cmakedefine EBCDIC_NL25 1
|
||||||
#cmakedefine HEAP_MATCH_RECURSE 1
|
#cmakedefine HEAP_MATCH_RECURSE 1
|
||||||
|
#cmakedefine NEVER_BACKSLASH_C 1
|
||||||
|
|
||||||
#define LINK_SIZE @PCRE2_LINK_SIZE@
|
#define LINK_SIZE @PCRE2_LINK_SIZE@
|
||||||
#define MATCH_LIMIT @PCRE2_MATCH_LIMIT@
|
#define MATCH_LIMIT @PCRE2_MATCH_LIMIT@
|
||||||
|
|
12
configure.ac
12
configure.ac
|
@ -190,6 +190,12 @@ AC_ARG_ENABLE(bsr-anycrlf,
|
||||||
[\R matches only CR, LF, CRLF by default]),
|
[\R matches only CR, LF, CRLF by default]),
|
||||||
, enable_bsr_anycrlf=no)
|
, enable_bsr_anycrlf=no)
|
||||||
|
|
||||||
|
# Handle --enable-never-backslash-C
|
||||||
|
AC_ARG_ENABLE(never-backslash-C,
|
||||||
|
AS_HELP_STRING([--enable-never-backslash-C],
|
||||||
|
[use of \C causes an error]),
|
||||||
|
, enable_never_backslash_C=no)
|
||||||
|
|
||||||
# Handle --enable-ebcdic
|
# Handle --enable-ebcdic
|
||||||
AC_ARG_ENABLE(ebcdic,
|
AC_ARG_ENABLE(ebcdic,
|
||||||
AS_HELP_STRING([--enable-ebcdic],
|
AS_HELP_STRING([--enable-ebcdic],
|
||||||
|
@ -604,6 +610,11 @@ if test "$enable_bsr_anycrlf" = "yes"; then
|
||||||
The build-time default can be overridden by the user of PCRE2 at runtime.])
|
The build-time default can be overridden by the user of PCRE2 at runtime.])
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if test "$enable_never_backslash_C" = "yes"; then
|
||||||
|
AC_DEFINE([NEVER_BACKSLASH_C], [], [
|
||||||
|
Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns.])
|
||||||
|
fi
|
||||||
|
|
||||||
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
|
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
|
||||||
The value of LINK_SIZE determines the number of bytes used to store
|
The value of LINK_SIZE determines the number of bytes used to store
|
||||||
links as offsets within the compiled regex. The default is 2, which
|
links as offsets within the compiled regex. The default is 2, which
|
||||||
|
@ -881,6 +892,7 @@ $PACKAGE-$VERSION configuration summary:
|
||||||
Enable Unicode support .......... : ${enable_unicode}
|
Enable Unicode support .......... : ${enable_unicode}
|
||||||
Newline char/sequence ........... : ${enable_newline}
|
Newline char/sequence ........... : ${enable_newline}
|
||||||
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
|
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
|
||||||
|
\C is disabled .................. : ${enable_never_backslash_C}
|
||||||
EBCDIC coding ................... : ${enable_ebcdic}
|
EBCDIC coding ................... : ${enable_ebcdic}
|
||||||
EBCDIC code for NL .............. : ${ebcdic_nl_code}
|
EBCDIC code for NL .............. : ${ebcdic_nl_code}
|
||||||
Rebuild char tables ............. : ${enable_rebuild_chartables}
|
Rebuild char tables ............. : ${enable_rebuild_chartables}
|
||||||
|
|
|
@ -220,6 +220,13 @@ library. They are also documented in the pcre2build man page.
|
||||||
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
||||||
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
||||||
|
|
||||||
|
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
||||||
|
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
||||||
|
characters. You can build PCRE2 with the use of \C permanently locked out by
|
||||||
|
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
||||||
|
command. When \C is allowed by the library, individual applications can lock
|
||||||
|
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
||||||
|
|
||||||
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
||||||
pattern. This limits the amount of system stack that a pattern uses when it
|
pattern. This limits the amount of system stack that a pattern uses when it
|
||||||
is compiled. The default is 250, but you can change it by setting, for
|
is compiled. The default is 250, but you can change it by setting, for
|
||||||
|
@ -833,4 +840,4 @@ The distribution should contain the files listed below.
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: ph10
|
||||||
Email domain: cam.ac.uk
|
Email domain: cam.ac.uk
|
||||||
Last updated: 16 July 2015
|
Last updated: 16 October 2015
|
||||||
|
|
|
@ -126,8 +126,10 @@ running redundant checks.
|
||||||
<P>
|
<P>
|
||||||
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
||||||
problems, because it may leave the current matching point in the middle of a
|
problems, because it may leave the current matching point in the middle of a
|
||||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to
|
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
||||||
lock out the use of \C, causing a compile-time error if it is encountered.
|
application to lock out the use of \C, causing a compile-time error if it is
|
||||||
|
encountered. It is also possible to build PCRE2 with the use of \C permanently
|
||||||
|
disabled.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Another way that performance can be hit is by running a pattern that has a very
|
Another way that performance can be hit is by running a pattern that has a very
|
||||||
|
@ -187,7 +189,7 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 13 April 2015
|
Last updated: 16 October 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -59,20 +59,22 @@ units, not characters, as is the contents of the variable pointed at by
|
||||||
<i>outlengthptr</i>, which is updated to the actual length of the new string.
|
<i>outlengthptr</i>, which is updated to the actual length of the new string.
|
||||||
The options are:
|
The options are:
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_ANCHORED Match only at the first position
|
PCRE2_ANCHORED Match only at the first position
|
||||||
PCRE2_NOTBOL Subject string is not the beginning of a line
|
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||||
PCRE2_NOTEOL Subject string is not the end of a line
|
PCRE2_NOTEOL Subject is not the end of a line
|
||||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject
|
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the
|
||||||
is not a valid match
|
subject is not a valid match
|
||||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement for
|
PCRE2_NO_UTF_CHECK Do not check the subject or replacement
|
||||||
UTF validity (only relevant if PCRE2_UTF
|
for UTF validity (only relevant if
|
||||||
was set at compile time)
|
PCRE2_UTF was set at compile time)
|
||||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||||
|
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||||
</pre>
|
</pre>
|
||||||
The function returns the number of substitutions, which may be zero if there
|
The function returns the number of substitutions, which may be zero if there
|
||||||
were no matches. The result can be greater than one only when
|
were no matches. The result can be greater than one only when
|
||||||
PCRE2_SUBSTITUTE_GLOBAL is set.
|
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
|
||||||
|
is returned.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
There is a complete description of the PCRE2 native API in the
|
There is a complete description of the PCRE2 native API in the
|
||||||
|
|
|
@ -1237,7 +1237,8 @@ This option locks out the use of \C in the pattern that is being compiled.
|
||||||
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
||||||
it may leave the current matching point in the middle of a multi-code-unit
|
it may leave the current matching point in the middle of a multi-code-unit
|
||||||
character. This option may be useful in applications that process patterns from
|
character. This option may be useful in applications that process patterns from
|
||||||
external sources.
|
external sources. Note that there is also a build-time option that permanently
|
||||||
|
locks out the use of \C.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_NEVER_UCP
|
PCRE2_NEVER_UCP
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -2613,43 +2614,17 @@ same number causes an error at compile time.
|
||||||
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||||
<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
|
<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
|
||||||
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR \fIreplacementzfP,</b>
|
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR <i>replacement</i>,</b>
|
||||||
<b> PCRE2_SIZE <i>rlength</i>, PCRE2_UCHAR *<i>outputbuffer</i>,</b>
|
<b> PCRE2_SIZE <i>rlength</i>, PCRE2_UCHAR *<i>outputbuffer</i>,</b>
|
||||||
<b> PCRE2_SIZE *<i>outlengthptr</i>);</b>
|
<b> PCRE2_SIZE *<i>outlengthptr</i>);</b>
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
This function calls <b>pcre2_match()</b> and then makes a copy of the subject
|
This function calls <b>pcre2_match()</b> and then makes a copy of the subject
|
||||||
string in <i>outputbuffer</i>, replacing the part that was matched with the
|
string in <i>outputbuffer</i>, replacing the part that was matched with the
|
||||||
<i>replacement</i> string, whose length is supplied in <b>rlength</b>. This can
|
<i>replacement</i> string, whose length is supplied in <b>rlength</b>. This can
|
||||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
|
||||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
|
||||||
dollar character is an escape character that can specify the insertion of
|
|
||||||
characters from capturing groups or (*MARK) items in the pattern. The following
|
|
||||||
forms are recognized:
|
|
||||||
<pre>
|
|
||||||
$$ insert a dollar character
|
|
||||||
$<n> or ${<n>} insert the contents of group <n>
|
|
||||||
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
|
|
||||||
</pre>
|
|
||||||
Either a group number or a group name can be given for <n>. Curly brackets are
|
|
||||||
required only if the following character would be interpreted as part of the
|
|
||||||
number or name. The number may be zero to include the entire matched string.
|
|
||||||
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
|
|
||||||
string "+$1$0$1+", the result is "=+babcb+=". Group insertion is done by
|
|
||||||
calling <b>pcre2_copy_byname()</b> or <b>pcre2_copy_bynumber()</b> as
|
|
||||||
appropriate.
|
|
||||||
</P>
|
|
||||||
<P>
|
|
||||||
The facility for inserting a (*MARK) name can be used to perform simple
|
|
||||||
simultaneous substitutions, as this <b>pcre2test</b> example shows:
|
|
||||||
<pre>
|
|
||||||
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
|
|
||||||
apple lemon
|
|
||||||
2: pear orange
|
|
||||||
</PRE>
|
|
||||||
</P>
|
|
||||||
<P>
|
|
||||||
The first seven arguments of <b>pcre2_substitute()</b> are the same as for
|
The first seven arguments of <b>pcre2_substitute()</b> are the same as for
|
||||||
<b>pcre2_match()</b>, except that the partial matching options are not
|
<b>pcre2_match()</b>, except that the partial matching options are not
|
||||||
permitted, and <i>match_data</i> may be passed as NULL, in which case a match
|
permitted, and <i>match_data</i> may be passed as NULL, in which case a match
|
||||||
|
@ -2658,25 +2633,112 @@ functions from the match context, if provided, or else those that were used to
|
||||||
allocate memory for the compiled code.
|
allocate memory for the compiled code.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the
|
The <i>outlengthptr</i> argument must point to a variable that contains the
|
||||||
|
length, in code units, of the output buffer. If the function is successful,
|
||||||
|
the value is updated to contain the length of the new string, excluding the
|
||||||
|
trailing zero that is automatically added. If the function is not successful,
|
||||||
|
the value is set to PCRE2_UNSET for general errors (such as output buffer too
|
||||||
|
small). For syntax errors in the replacement string, the value is set to the
|
||||||
|
offset in the replacement string where the error was detected.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||||
|
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||||
|
dollar character is an escape character that can specify the insertion of
|
||||||
|
characters from capturing groups or (*MARK) items in the pattern. The following
|
||||||
|
forms are always recognized:
|
||||||
|
<pre>
|
||||||
|
$$ insert a dollar character
|
||||||
|
$<n> or ${<n>} insert the contents of group <n>
|
||||||
|
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
|
||||||
|
</pre>
|
||||||
|
Either a group number or a group name can be given for <n>. Curly brackets are
|
||||||
|
required only if the following character would be interpreted as part of the
|
||||||
|
number or name. The number may be zero to include the entire matched string.
|
||||||
|
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
|
||||||
|
string "+$1$0$1+", the result is "=+babcb+=".
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The facility for inserting a (*MARK) name can be used to perform simple
|
||||||
|
simultaneous substitutions, as this <b>pcre2test</b> example shows:
|
||||||
|
<pre>
|
||||||
|
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
|
||||||
|
apple lemon
|
||||||
|
2: pear orange
|
||||||
|
</pre>
|
||||||
|
There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the
|
||||||
function to iterate over the subject string, replacing every matching
|
function to iterate over the subject string, replacing every matching
|
||||||
substring. If this is not set, only the first matching substring is replaced.
|
substring. If this is not set, only the first matching substring is replaced.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <i>outlengthptr</i> argument must point to a variable that contains the
|
A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra processing
|
||||||
length, in code units, of the output buffer. It is updated to contain the
|
to be applied to the replacement string. Without this option, only the dollar
|
||||||
length of the new string, excluding the trailing zero that is automatically
|
character is special, and only the group insertion forms listed above are
|
||||||
added.
|
valid. When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The function returns the number of replacements that were made. This may be
|
Firstly, backslash in a replacement string is interpreted as an escape
|
||||||
zero if no matches were found, and is never greater than 1 unless
|
character. The usual forms such as \n or \x{ddd} can be used to specify
|
||||||
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
|
particular character codes, and backslash followed by any non-alphanumeric
|
||||||
is returned. Except for PCRE2_ERROR_NOMATCH (which is never returned), any
|
character quotes that character. Extended quoting can be coded using \Q...\E,
|
||||||
errors from <b>pcre2_match()</b> or the substring copying functions are passed
|
exactly as in pattern strings.
|
||||||
straight back. PCRE2_ERROR_BADREPLACEMENT is returned for an invalid
|
</P>
|
||||||
replacement string (unrecognized sequence following a dollar sign), and
|
<P>
|
||||||
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough.
|
There are also four escape sequences for forcing the case of inserted letters.
|
||||||
|
The insertion mechanism has three states: no case forcing, force upper case,
|
||||||
|
and force lower case. The escape sequences change the current state: \U and
|
||||||
|
\L change to upper or lower case forcing, respectively, and \E (when not
|
||||||
|
terminating a \Q quoted sequence) reverts to no case forcing. The sequences
|
||||||
|
\u and \l force the next character (if it is a letter) to upper or lower
|
||||||
|
case, respectively, and then the state automatically reverts to no case
|
||||||
|
forcing. Case forcing applies to all inserted characters, including those from
|
||||||
|
captured groups and letters within \Q...\E quoted sequences.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Note that case forcing sequences such as \U...\E do not nest. For example,
|
||||||
|
the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no
|
||||||
|
effect.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
|
||||||
|
flexibility to group substitution. The syntax is similar to that used by Bash:
|
||||||
|
<pre>
|
||||||
|
${<n>:-<string>}
|
||||||
|
${<n>:+<string1>:<string2>}
|
||||||
|
</pre>
|
||||||
|
As before, <n> may be a group number or a name. The first form specifies a
|
||||||
|
default value. If group <n> is set, its value is inserted; if not, <string> is
|
||||||
|
expanded and the result inserted. The second form specifies strings that are
|
||||||
|
expanded and inserted when group <n> is set or unset, respectively. The first
|
||||||
|
form is just a convenient shorthand for
|
||||||
|
<pre>
|
||||||
|
${<n>:+${<n>}:<string>}
|
||||||
|
</pre>
|
||||||
|
Backslash can be used to escape colons and closing curly brackets in the
|
||||||
|
replacement strings. A change of the case forcing state within a replacement
|
||||||
|
string remains in force afterwards, as shown in this <b>pcre2test</b> example:
|
||||||
|
<pre>
|
||||||
|
/(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
|
||||||
|
body
|
||||||
|
1: hello
|
||||||
|
somebody
|
||||||
|
1: HELLO
|
||||||
|
</pre>
|
||||||
|
If successful, the function returns the number of replacements that were made.
|
||||||
|
This may be zero if no matches were found, and is never greater than 1 unless
|
||||||
|
PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
In the event of an error, a negative error code is returned. Except for
|
||||||
|
PCRE2_ERROR_NOMATCH (which is never returned), errors from <b>pcre2_match()</b>
|
||||||
|
are passed straight back. PCRE2_ERROR_NOMEMORY is returned if the output buffer
|
||||||
|
is not big enough. PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax
|
||||||
|
errors in the replacement string, with more particular errors being
|
||||||
|
PCRE2_ERROR_BADREPESCAPE (invalid escape sequence),
|
||||||
|
PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket not found), and
|
||||||
|
PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group substitution). As for all
|
||||||
|
PCRE2 errors, a text message that describes the error can be obtained by
|
||||||
|
calling <b>pcre2_get_error_message()</b>.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC35" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
<br><a name="SEC35" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -2908,8 +2970,8 @@ There are in addition the following errors that are specific to
|
||||||
PCRE2_ERROR_DFA_UITEM
|
PCRE2_ERROR_DFA_UITEM
|
||||||
</pre>
|
</pre>
|
||||||
This return is given if <b>pcre2_dfa_match()</b> encounters an item in the
|
This return is given if <b>pcre2_dfa_match()</b> encounters an item in the
|
||||||
pattern that it does not support, for instance, the use of \C or a back
|
pattern that it does not support, for instance, the use of \C in a UTF mode or
|
||||||
reference.
|
a back reference.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_ERROR_DFA_UCOND
|
PCRE2_ERROR_DFA_UCOND
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -2953,7 +3015,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 22 September 2015
|
Last updated: 16 October 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -18,23 +18,24 @@ please consult the man page, in case the conversion went wrong.
|
||||||
<li><a name="TOC3" href="#SEC3">BUILDING 8-BIT, 16-BIT AND 32-BIT LIBRARIES</a>
|
<li><a name="TOC3" href="#SEC3">BUILDING 8-BIT, 16-BIT AND 32-BIT LIBRARIES</a>
|
||||||
<li><a name="TOC4" href="#SEC4">BUILDING SHARED AND STATIC LIBRARIES</a>
|
<li><a name="TOC4" href="#SEC4">BUILDING SHARED AND STATIC LIBRARIES</a>
|
||||||
<li><a name="TOC5" href="#SEC5">UNICODE AND UTF SUPPORT</a>
|
<li><a name="TOC5" href="#SEC5">UNICODE AND UTF SUPPORT</a>
|
||||||
<li><a name="TOC6" href="#SEC6">JUST-IN-TIME COMPILER SUPPORT</a>
|
<li><a name="TOC6" href="#SEC6">DISABLING THE USE OF \C</a>
|
||||||
<li><a name="TOC7" href="#SEC7">NEWLINE RECOGNITION</a>
|
<li><a name="TOC7" href="#SEC7">JUST-IN-TIME COMPILER SUPPORT</a>
|
||||||
<li><a name="TOC8" href="#SEC8">WHAT \R MATCHES</a>
|
<li><a name="TOC8" href="#SEC8">NEWLINE RECOGNITION</a>
|
||||||
<li><a name="TOC9" href="#SEC9">HANDLING VERY LARGE PATTERNS</a>
|
<li><a name="TOC9" href="#SEC9">WHAT \R MATCHES</a>
|
||||||
<li><a name="TOC10" href="#SEC10">AVOIDING EXCESSIVE STACK USAGE</a>
|
<li><a name="TOC10" href="#SEC10">HANDLING VERY LARGE PATTERNS</a>
|
||||||
<li><a name="TOC11" href="#SEC11">LIMITING PCRE2 RESOURCE USAGE</a>
|
<li><a name="TOC11" href="#SEC11">AVOIDING EXCESSIVE STACK USAGE</a>
|
||||||
<li><a name="TOC12" href="#SEC12">CREATING CHARACTER TABLES AT BUILD TIME</a>
|
<li><a name="TOC12" href="#SEC12">LIMITING PCRE2 RESOURCE USAGE</a>
|
||||||
<li><a name="TOC13" href="#SEC13">USING EBCDIC CODE</a>
|
<li><a name="TOC13" href="#SEC13">CREATING CHARACTER TABLES AT BUILD TIME</a>
|
||||||
<li><a name="TOC14" href="#SEC14">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
|
<li><a name="TOC14" href="#SEC14">USING EBCDIC CODE</a>
|
||||||
<li><a name="TOC15" href="#SEC15">PCRE2GREP BUFFER SIZE</a>
|
<li><a name="TOC15" href="#SEC15">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a>
|
||||||
<li><a name="TOC16" href="#SEC16">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a>
|
<li><a name="TOC16" href="#SEC16">PCRE2GREP BUFFER SIZE</a>
|
||||||
<li><a name="TOC17" href="#SEC17">INCLUDING DEBUGGING CODE</a>
|
<li><a name="TOC17" href="#SEC17">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a>
|
||||||
<li><a name="TOC18" href="#SEC18">DEBUGGING WITH VALGRIND SUPPORT</a>
|
<li><a name="TOC18" href="#SEC18">INCLUDING DEBUGGING CODE</a>
|
||||||
<li><a name="TOC19" href="#SEC19">CODE COVERAGE REPORTING</a>
|
<li><a name="TOC19" href="#SEC19">DEBUGGING WITH VALGRIND SUPPORT</a>
|
||||||
<li><a name="TOC20" href="#SEC20">SEE ALSO</a>
|
<li><a name="TOC20" href="#SEC20">CODE COVERAGE REPORTING</a>
|
||||||
<li><a name="TOC21" href="#SEC21">AUTHOR</a>
|
<li><a name="TOC21" href="#SEC21">SEE ALSO</a>
|
||||||
<li><a name="TOC22" href="#SEC22">REVISION</a>
|
<li><a name="TOC22" href="#SEC22">AUTHOR</a>
|
||||||
|
<li><a name="TOC23" href="#SEC23">REVISION</a>
|
||||||
</ul>
|
</ul>
|
||||||
<br><a name="SEC1" href="#TOC1">BUILDING PCRE2</a><br>
|
<br><a name="SEC1" href="#TOC1">BUILDING PCRE2</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -148,13 +149,19 @@ properties. The application can request that they do by setting the PCRE2_UCP
|
||||||
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
|
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
|
||||||
request this by starting with (*UCP).
|
request this by starting with (*UCP).
|
||||||
</P>
|
</P>
|
||||||
|
<br><a name="SEC6" href="#TOC1">DISABLING THE USE OF \C</a><br>
|
||||||
<P>
|
<P>
|
||||||
The \C escape sequence, which matches a single code unit, even in a UTF mode,
|
The \C escape sequence, which matches a single code unit, even in a UTF mode,
|
||||||
can cause unpredictable behaviour because it may leave the current matching
|
can cause unpredictable behaviour because it may leave the current matching
|
||||||
point in the middle of a multi-code-unit character. It can be locked out by
|
point in the middle of a multi-code-unit character. The application can lock it
|
||||||
setting the PCRE2_NEVER_BACKSLASH_C option.
|
out by setting the PCRE2_NEVER_BACKSLASH_C option when calling
|
||||||
|
<b>pcre2_compile()</b>. There is also a build-time option
|
||||||
|
<pre>
|
||||||
|
--enable-never-backslash-C
|
||||||
|
</pre>
|
||||||
|
(note the upper case C) which locks out the use of \C entirely.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC6" href="#TOC1">JUST-IN-TIME COMPILER SUPPORT</a><br>
|
<br><a name="SEC7" href="#TOC1">JUST-IN-TIME COMPILER SUPPORT</a><br>
|
||||||
<P>
|
<P>
|
||||||
Just-in-time compiler support is included in the build by specifying
|
Just-in-time compiler support is included in the build by specifying
|
||||||
<pre>
|
<pre>
|
||||||
|
@ -171,7 +178,7 @@ pcre2grep automatically makes use of it, unless you add
|
||||||
</pre>
|
</pre>
|
||||||
to the "configure" command.
|
to the "configure" command.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC7" href="#TOC1">NEWLINE RECOGNITION</a><br>
|
<br><a name="SEC8" href="#TOC1">NEWLINE RECOGNITION</a><br>
|
||||||
<P>
|
<P>
|
||||||
By default, PCRE2 interprets the linefeed (LF) character as indicating the end
|
By default, PCRE2 interprets the linefeed (LF) character as indicating the end
|
||||||
of a line. This is the normal newline character on Unix-like systems. You can
|
of a line. This is the normal newline character on Unix-like systems. You can
|
||||||
|
@ -208,7 +215,7 @@ Whatever default line ending convention is selected when PCRE2 is built can be
|
||||||
overridden by applications that use the library. At build time it is
|
overridden by applications that use the library. At build time it is
|
||||||
conventional to use the standard for your operating system.
|
conventional to use the standard for your operating system.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC8" href="#TOC1">WHAT \R MATCHES</a><br>
|
<br><a name="SEC9" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||||
<P>
|
<P>
|
||||||
By default, the sequence \R in a pattern matches any Unicode newline sequence,
|
By default, the sequence \R in a pattern matches any Unicode newline sequence,
|
||||||
independently of what has been selected as the line ending sequence. If you
|
independently of what has been selected as the line ending sequence. If you
|
||||||
|
@ -220,7 +227,7 @@ the default is changed so that \R matches only CR, LF, or CRLF. Whatever is
|
||||||
selected when PCRE2 is built can be overridden by applications that use the
|
selected when PCRE2 is built can be overridden by applications that use the
|
||||||
library.
|
library.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC9" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
|
<br><a name="SEC10" href="#TOC1">HANDLING VERY LARGE PATTERNS</a><br>
|
||||||
<P>
|
<P>
|
||||||
Within a compiled pattern, offset values are used to point from one part to
|
Within a compiled pattern, offset values are used to point from one part to
|
||||||
another (for example, from an opening parenthesis to an alternation
|
another (for example, from an opening parenthesis to an alternation
|
||||||
|
@ -239,7 +246,7 @@ longer offsets slows down the operation of PCRE2 because it has to load
|
||||||
additional data when handling them. For the 32-bit library the value is always
|
additional data when handling them. For the 32-bit library the value is always
|
||||||
4 and cannot be overridden; the value of --with-link-size is ignored.
|
4 and cannot be overridden; the value of --with-link-size is ignored.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC10" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
|
<br><a name="SEC11" href="#TOC1">AVOIDING EXCESSIVE STACK USAGE</a><br>
|
||||||
<P>
|
<P>
|
||||||
When matching with the <b>pcre2_match()</b> function, PCRE2 implements
|
When matching with the <b>pcre2_match()</b> function, PCRE2 implements
|
||||||
backtracking by making recursive calls to an internal function called
|
backtracking by making recursive calls to an internal function called
|
||||||
|
@ -261,7 +268,7 @@ custom memory management functions can be called instead. PCRE2 runs noticeably
|
||||||
more slowly when built in this way. This option affects only the
|
more slowly when built in this way. This option affects only the
|
||||||
<b>pcre2_match()</b> function; it is not relevant for <b>pcre2_dfa_match()</b>.
|
<b>pcre2_match()</b> function; it is not relevant for <b>pcre2_dfa_match()</b>.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC11" href="#TOC1">LIMITING PCRE2 RESOURCE USAGE</a><br>
|
<br><a name="SEC12" href="#TOC1">LIMITING PCRE2 RESOURCE USAGE</a><br>
|
||||||
<P>
|
<P>
|
||||||
Internally, PCRE2 has a function called <b>match()</b>, which it calls
|
Internally, PCRE2 has a function called <b>match()</b>, which it calls
|
||||||
repeatedly (sometimes recursively) when matching a pattern with the
|
repeatedly (sometimes recursively) when matching a pattern with the
|
||||||
|
@ -290,7 +297,7 @@ constraints. However, you can set a lower limit by adding, for example,
|
||||||
</pre>
|
</pre>
|
||||||
to the <b>configure</b> command. This value can also be overridden at run time.
|
to the <b>configure</b> command. This value can also be overridden at run time.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
|
<br><a name="SEC13" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
|
||||||
<P>
|
<P>
|
||||||
PCRE2 uses fixed tables for processing characters whose code points are less
|
PCRE2 uses fixed tables for processing characters whose code points are less
|
||||||
than 256. By default, PCRE2 is built with a set of tables that are distributed
|
than 256. By default, PCRE2 is built with a set of tables that are distributed
|
||||||
|
@ -307,7 +314,7 @@ compiling, because <b>dftables</b> is run on the local host. If you need to
|
||||||
create alternative tables when cross compiling, you will have to do so "by
|
create alternative tables when cross compiling, you will have to do so "by
|
||||||
hand".)
|
hand".)
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC13" href="#TOC1">USING EBCDIC CODE</a><br>
|
<br><a name="SEC14" href="#TOC1">USING EBCDIC CODE</a><br>
|
||||||
<P>
|
<P>
|
||||||
PCRE2 assumes by default that it will run in an environment where the character
|
PCRE2 assumes by default that it will run in an environment where the character
|
||||||
code is ASCII or Unicode, which is a superset of ASCII. This is the case for
|
code is ASCII or Unicode, which is a superset of ASCII. This is the case for
|
||||||
|
@ -342,7 +349,7 @@ The options that select newline behaviour, such as --enable-newline-is-cr,
|
||||||
and equivalent run-time options, refer to these character values in an EBCDIC
|
and equivalent run-time options, refer to these character values in an EBCDIC
|
||||||
environment.
|
environment.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC14" href="#TOC1">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
|
<br><a name="SEC15" href="#TOC1">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
|
||||||
<P>
|
<P>
|
||||||
By default, <b>pcre2grep</b> reads all files as plain text. You can build it so
|
By default, <b>pcre2grep</b> reads all files as plain text. You can build it so
|
||||||
that it recognizes files whose names end in <b>.gz</b> or <b>.bz2</b>, and reads
|
that it recognizes files whose names end in <b>.gz</b> or <b>.bz2</b>, and reads
|
||||||
|
@ -355,7 +362,7 @@ to the <b>configure</b> command. These options naturally require that the
|
||||||
relevant libraries are installed on your system. Configuration will fail if
|
relevant libraries are installed on your system. Configuration will fail if
|
||||||
they are not.
|
they are not.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC15" href="#TOC1">PCRE2GREP BUFFER SIZE</a><br>
|
<br><a name="SEC16" href="#TOC1">PCRE2GREP BUFFER SIZE</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>pcre2grep</b> uses an internal buffer to hold a "window" on the file it is
|
<b>pcre2grep</b> uses an internal buffer to hold a "window" on the file it is
|
||||||
scanning, in order to be able to output "before" and "after" lines when it
|
scanning, in order to be able to output "before" and "after" lines when it
|
||||||
|
@ -370,7 +377,7 @@ parameter value by adding, for example,
|
||||||
to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override this
|
to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override this
|
||||||
value by using --buffer-size on the command line.
|
value by using --buffer-size on the command line.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC16" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
|
<br><a name="SEC17" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
|
||||||
<P>
|
<P>
|
||||||
If you add one of
|
If you add one of
|
||||||
<pre>
|
<pre>
|
||||||
|
@ -404,7 +411,7 @@ automatically included, you may need to add something like
|
||||||
</pre>
|
</pre>
|
||||||
immediately before the <b>configure</b> command.
|
immediately before the <b>configure</b> command.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC17" href="#TOC1">INCLUDING DEBUGGING CODE</a><br>
|
<br><a name="SEC18" href="#TOC1">INCLUDING DEBUGGING CODE</a><br>
|
||||||
<P>
|
<P>
|
||||||
If you add
|
If you add
|
||||||
<pre>
|
<pre>
|
||||||
|
@ -413,7 +420,7 @@ If you add
|
||||||
to the <b>configure</b> command, additional debugging code is included in the
|
to the <b>configure</b> command, additional debugging code is included in the
|
||||||
build. This feature is intended for use by the PCRE2 maintainers.
|
build. This feature is intended for use by the PCRE2 maintainers.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC18" href="#TOC1">DEBUGGING WITH VALGRIND SUPPORT</a><br>
|
<br><a name="SEC19" href="#TOC1">DEBUGGING WITH VALGRIND SUPPORT</a><br>
|
||||||
<P>
|
<P>
|
||||||
If you add
|
If you add
|
||||||
<pre>
|
<pre>
|
||||||
|
@ -423,7 +430,7 @@ to the <b>configure</b> command, PCRE2 will use valgrind annotations to mark
|
||||||
certain memory regions as unaddressable. This allows it to detect invalid
|
certain memory regions as unaddressable. This allows it to detect invalid
|
||||||
memory accesses, and is mostly useful for debugging PCRE2 itself.
|
memory accesses, and is mostly useful for debugging PCRE2 itself.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC19" href="#TOC1">CODE COVERAGE REPORTING</a><br>
|
<br><a name="SEC20" href="#TOC1">CODE COVERAGE REPORTING</a><br>
|
||||||
<P>
|
<P>
|
||||||
If your C compiler is gcc, you can build a version of PCRE2 that can generate a
|
If your C compiler is gcc, you can build a version of PCRE2 that can generate a
|
||||||
code coverage report for its test suite. To enable this, you must install
|
code coverage report for its test suite. To enable this, you must install
|
||||||
|
@ -480,11 +487,11 @@ This cleans all coverage data including the generated coverage report. For more
|
||||||
information about code coverage, see the <b>gcov</b> and <b>lcov</b>
|
information about code coverage, see the <b>gcov</b> and <b>lcov</b>
|
||||||
documentation.
|
documentation.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC20" href="#TOC1">SEE ALSO</a><br>
|
<br><a name="SEC21" href="#TOC1">SEE ALSO</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>pcre2api</b>(3), <b>pcre2-config</b>(3).
|
<b>pcre2api</b>(3), <b>pcre2-config</b>(3).
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">AUTHOR</a><br>
|
<br><a name="SEC22" href="#TOC1">AUTHOR</a><br>
|
||||||
<P>
|
<P>
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
<br>
|
<br>
|
||||||
|
@ -493,9 +500,9 @@ University Computing Service
|
||||||
Cambridge, England.
|
Cambridge, England.
|
||||||
<br>
|
<br>
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC22" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC23" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 24 April 2015
|
Last updated: 16 October 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -1236,14 +1236,21 @@ with \C in UTF-8 or UTF-16 mode means that the rest of the string may start
|
||||||
with a malformed UTF character. This has undefined results, because PCRE2
|
with a malformed UTF character. This has undefined results, because PCRE2
|
||||||
assumes that it is matching character by character in a valid UTF string (by
|
assumes that it is matching character by character in a valid UTF string (by
|
||||||
default it checks the subject string's validity at the start of processing
|
default it checks the subject string's validity at the start of processing
|
||||||
unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the
|
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||||
use of \C by setting the PCRE2_NEVER_BACKSLASH_C option.
|
</P>
|
||||||
|
<P>
|
||||||
|
An application can lock out the use of \C by setting the
|
||||||
|
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
|
||||||
|
build PCRE2 with the use of \C permanently disabled.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
PCRE2 does not allow \C to appear in lookbehind assertions
|
PCRE2 does not allow \C to appear in lookbehind assertions
|
||||||
<a href="#lookbehind">(described below)</a>
|
<a href="#lookbehind">(described below)</a>
|
||||||
in a UTF mode, because this would make it impossible to calculate the length of
|
in a UTF mode, because this would make it impossible to calculate the length of
|
||||||
the lookbehind.
|
the lookbehind. Neither the alternative matching function
|
||||||
|
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
|
||||||
|
former gives a match-time error; the latter fails to optimize and so the match
|
||||||
|
is always run using the interpreter.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
In general, the \C escape sequence is best avoided. However, one way of using
|
In general, the \C escape sequence is best avoided. However, one way of using
|
||||||
|
@ -3351,7 +3358,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 01 September 2015
|
Last updated: 16 October 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -12,17 +12,21 @@ This page is part of the PCRE2 HTML documentation. It was generated
|
||||||
automatically from the original man page. If there is any nonsense in it,
|
automatically from the original man page. If there is any nonsense in it,
|
||||||
please consult the man page, in case the conversion went wrong.
|
please consult the man page, in case the conversion went wrong.
|
||||||
<br>
|
<br>
|
||||||
<br><b>
|
<ul>
|
||||||
PCRE2 PERFORMANCE
|
<li><a name="TOC1" href="#SEC1">PCRE2 PERFORMANCE</a>
|
||||||
</b><br>
|
<li><a name="TOC2" href="#SEC2">COMPILED PATTERN MEMORY USAGE</a>
|
||||||
|
<li><a name="TOC3" href="#SEC3">STACK USAGE AT RUN TIME</a>
|
||||||
|
<li><a name="TOC4" href="#SEC4">PROCESSING TIME</a>
|
||||||
|
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
|
||||||
|
<li><a name="TOC6" href="#SEC6">REVISION</a>
|
||||||
|
</ul>
|
||||||
|
<br><a name="SEC1" href="#TOC1">PCRE2 PERFORMANCE</a><br>
|
||||||
<P>
|
<P>
|
||||||
Two aspects of performance are discussed below: memory usage and processing
|
Two aspects of performance are discussed below: memory usage and processing
|
||||||
time. The way you express your pattern as a regular expression can affect both
|
time. The way you express your pattern as a regular expression can affect both
|
||||||
of them.
|
of them.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><a name="SEC2" href="#TOC1">COMPILED PATTERN MEMORY USAGE</a><br>
|
||||||
COMPILED PATTERN MEMORY USAGE
|
|
||||||
</b><br>
|
|
||||||
<P>
|
<P>
|
||||||
Patterns are compiled by PCRE2 into a reasonably efficient interpretive code,
|
Patterns are compiled by PCRE2 into a reasonably efficient interpretive code,
|
||||||
so that most simple patterns do not use much memory. However, there is one case
|
so that most simple patterns do not use much memory. However, there is one case
|
||||||
|
@ -75,9 +79,7 @@ pattern. Nevertheless, if the atomic grouping is not a problem and the loss of
|
||||||
speed is acceptable, this kind of rewriting will allow you to process patterns
|
speed is acceptable, this kind of rewriting will allow you to process patterns
|
||||||
that PCRE2 cannot otherwise handle.
|
that PCRE2 cannot otherwise handle.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><a name="SEC3" href="#TOC1">STACK USAGE AT RUN TIME</a><br>
|
||||||
STACK USAGE AT RUN TIME
|
|
||||||
</b><br>
|
|
||||||
<P>
|
<P>
|
||||||
When <b>pcre2_match()</b> is used for matching, certain kinds of pattern can
|
When <b>pcre2_match()</b> is used for matching, certain kinds of pattern can
|
||||||
cause it to use large amounts of the process stack. In some environments the
|
cause it to use large amounts of the process stack. In some environments the
|
||||||
|
@ -86,9 +88,7 @@ SIGSEGV. Rewriting your pattern can often help. The
|
||||||
<a href="pcre2stack.html"><b>pcre2stack</b></a>
|
<a href="pcre2stack.html"><b>pcre2stack</b></a>
|
||||||
documentation discusses this issue in detail.
|
documentation discusses this issue in detail.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><a name="SEC4" href="#TOC1">PROCESSING TIME</a><br>
|
||||||
PROCESSING TIME
|
|
||||||
</b><br>
|
|
||||||
<P>
|
<P>
|
||||||
Certain items in regular expression patterns are processed more efficiently
|
Certain items in regular expression patterns are processed more efficiently
|
||||||
than others. It is more efficient to use a character class like [aeiou] than a
|
than others. It is more efficient to use a character class like [aeiou] than a
|
||||||
|
@ -177,9 +177,7 @@ appreciable time with strings longer than about 20 characters.
|
||||||
In many cases, the solution to this kind of performance issue is to use an
|
In many cases, the solution to this kind of performance issue is to use an
|
||||||
atomic group or a possessive quantifier.
|
atomic group or a possessive quantifier.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
|
||||||
AUTHOR
|
|
||||||
</b><br>
|
|
||||||
<P>
|
<P>
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
<br>
|
<br>
|
||||||
|
@ -188,9 +186,7 @@ University Computing Service
|
||||||
Cambridge, England.
|
Cambridge, England.
|
||||||
<br>
|
<br>
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||||
REVISION
|
|
||||||
</b><br>
|
|
||||||
<P>
|
<P>
|
||||||
Last updated: 02 January 2015
|
Last updated: 02 January 2015
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -111,9 +111,10 @@ it matches a literal "u".
|
||||||
\W a "non-word" character
|
\W a "non-word" character
|
||||||
\X a Unicode extended grapheme cluster
|
\X a Unicode extended grapheme cluster
|
||||||
</pre>
|
</pre>
|
||||||
The application can lock out the use of \C by setting the
|
\C is dangerous because it may leave the current matching point in the middle
|
||||||
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave the
|
of a UTF-8 or UTF-16 character. The application can lock out the use of \C by
|
||||||
current matching point in the middle of a UTF-8 or UTF-16 character.
|
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
||||||
|
with the use of \C permanently disabled.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode
|
By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode
|
||||||
|
@ -588,7 +589,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 17 July 2015
|
Last updated: 16 October 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -155,12 +155,13 @@ following options output the value and set the exit code as indicated:
|
||||||
The following options output 1 for true or 0 for false, and set the exit code
|
The following options output 1 for true or 0 for false, and set the exit code
|
||||||
to the same value:
|
to the same value:
|
||||||
<pre>
|
<pre>
|
||||||
ebcdic compiled for an EBCDIC environment
|
backslash-C \C is supported (not locked out)
|
||||||
jit just-in-time support is available
|
ebcdic compiled for an EBCDIC environment
|
||||||
pcre2-16 the 16-bit library was built
|
jit just-in-time support is available
|
||||||
pcre2-32 the 32-bit library was built
|
pcre2-16 the 16-bit library was built
|
||||||
pcre2-8 the 8-bit library was built
|
pcre2-32 the 32-bit library was built
|
||||||
unicode Unicode support is available
|
pcre2-8 the 8-bit library was built
|
||||||
|
unicode Unicode support is available
|
||||||
</pre>
|
</pre>
|
||||||
If an unknown option is given, an error message is output; the exit code is 0.
|
If an unknown option is given, an error message is output; the exit code is 0.
|
||||||
</P>
|
</P>
|
||||||
|
@ -510,7 +511,7 @@ Setting compilation options
|
||||||
<P>
|
<P>
|
||||||
The following modifiers set options for <b>pcre2_compile()</b>. The most common
|
The following modifiers set options for <b>pcre2_compile()</b>. The most common
|
||||||
ones have single-letter abbreviations. See
|
ones have single-letter abbreviations. See
|
||||||
<a href="pcreapi.html"><b>pcreapi</b></a>
|
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||||
for a description of their effects.
|
for a description of their effects.
|
||||||
<pre>
|
<pre>
|
||||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||||
|
@ -537,6 +538,7 @@ for a description of their effects.
|
||||||
no_utf_check set PCRE2_NO_UTF_CHECK
|
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||||
ucp set PCRE2_UCP
|
ucp set PCRE2_UCP
|
||||||
ungreedy set PCRE2_UNGREEDY
|
ungreedy set PCRE2_UNGREEDY
|
||||||
|
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
||||||
utf set PCRE2_UTF
|
utf set PCRE2_UTF
|
||||||
</pre>
|
</pre>
|
||||||
As well as turning on the PCRE2_UTF option, the <b>utf</b> modifier causes all
|
As well as turning on the PCRE2_UTF option, the <b>utf</b> modifier causes all
|
||||||
|
@ -564,6 +566,7 @@ about the pattern:
|
||||||
locale=<name> use this locale
|
locale=<name> use this locale
|
||||||
memory show memory used
|
memory show memory used
|
||||||
newline=<type> set newline type
|
newline=<type> set newline type
|
||||||
|
null_context compile with a NULL context
|
||||||
parens_nest_limit=<n> set maximum parentheses depth
|
parens_nest_limit=<n> set maximum parentheses depth
|
||||||
posix use the POSIX API
|
posix use the POSIX API
|
||||||
push push compiled pattern onto the stack
|
push push compiled pattern onto the stack
|
||||||
|
@ -642,6 +645,15 @@ is requested. For each callout, either its number or string is given, followed
|
||||||
by the item that follows it in the pattern.
|
by the item that follows it in the pattern.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
|
Passing a NULL context
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_compile()</b>. If
|
||||||
|
the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||||
|
testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
||||||
|
default values).
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
Specifying a pattern in hex
|
Specifying a pattern in hex
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -920,9 +932,11 @@ pattern.
|
||||||
/g global global matching
|
/g global global matching
|
||||||
jitstack=<n> set size of JIT stack
|
jitstack=<n> set size of JIT stack
|
||||||
mark show mark values
|
mark show mark values
|
||||||
match_limit=>n> set a match limit
|
match_limit=<n> set a match limit
|
||||||
memory show memory usage
|
memory show memory usage
|
||||||
|
null_context match with a NULL context
|
||||||
offset=<n> set starting offset
|
offset=<n> set starting offset
|
||||||
|
offset_limit=<n> set offset limit
|
||||||
ovector=<n> set size of output vector
|
ovector=<n> set size of output vector
|
||||||
recursion_limit=<n> set a recursion limit
|
recursion_limit=<n> set a recursion limit
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
|
@ -1170,6 +1184,16 @@ The <b>offset</b> modifier sets an offset in the subject string at which
|
||||||
matching starts. Its value is a number of code units, not characters.
|
matching starts. Its value is a number of code units, not characters.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
|
Setting an offset limit
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
The <b>offset_limit</b> modifier sets a limit for unanchored matches. If a match
|
||||||
|
cannot be found starting at or before this offset in the subject, a "no match"
|
||||||
|
return is given. The data value is a number of code units, not characters. When
|
||||||
|
this modifier is used, the <b>use_offset_limit</b> modifier must have been set
|
||||||
|
for the pattern; if not, an error is generated.
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
Setting the size of the output vector
|
Setting the size of the output vector
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -1201,6 +1225,17 @@ this modifier has no effect, as there is no facility for passing a length.)
|
||||||
When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
|
When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
|
||||||
passing the replacement string as zero-terminated.
|
passing the replacement string as zero-terminated.
|
||||||
</P>
|
</P>
|
||||||
|
<br><b>
|
||||||
|
Passing a NULL context
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
||||||
|
<b>pcre2_dfa_match()</b> or <b>pcre2_jit_match()</b>. If the <b>null_context</b>
|
||||||
|
modifier is set, however, NULL is passed. This is for testing that the matching
|
||||||
|
functions behave correctly in this case (they use default values). This
|
||||||
|
modifier cannot be used with the <b>find_limits</b> modifier or when testing the
|
||||||
|
substitution function.
|
||||||
|
</P>
|
||||||
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||||
<P>
|
<P>
|
||||||
By default, <b>pcre2test</b> uses the standard PCRE2 matching function,
|
By default, <b>pcre2test</b> uses the standard PCRE2 matching function,
|
||||||
|
@ -1539,7 +1574,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 14 September 2015
|
Last updated: 17 October 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -71,11 +71,12 @@ The escape sequence \C can be used to match a single code unit, in a UTF mode,
|
||||||
but its use can lead to some strange effects because it breaks up multi-unit
|
but its use can lead to some strange effects because it breaks up multi-unit
|
||||||
characters (see the description of \C in the
|
characters (see the description of \C in the
|
||||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||||
documentation). The use of \C is not supported in the alternative matching
|
documentation). The use of \C is not supported by the alternative matching
|
||||||
function <b>pcre2_dfa_match()</b>, nor is it supported in UTF mode by the JIT
|
function <b>pcre2_dfa_match()</b> when in UTF mode. Its use provokes a
|
||||||
optimization. If JIT optimization is requested for a UTF pattern that contains
|
match-time error. The JIT optimization also does not support \C in UTF mode.
|
||||||
\C, it will not succeed, and so the matching will be carried out by the normal
|
If JIT optimization is requested for a UTF pattern that contains \C, it will
|
||||||
interpretive function.
|
not succeed, and so the matching will be carried out by the normal interpretive
|
||||||
|
function.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
|
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
|
||||||
|
@ -275,7 +276,7 @@ Cambridge, England.
|
||||||
REVISION
|
REVISION
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 18 August 2015
|
Last updated: 16 October 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
10
doc/pcre2.3
10
doc/pcre2.3
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2 3 "13 April 2015" "PCRE2 10.20"
|
.TH PCRE2 3 "16 October 2015" "PCRE2 10.21"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH INTRODUCTION
|
.SH INTRODUCTION
|
||||||
|
@ -118,8 +118,10 @@ running redundant checks.
|
||||||
.P
|
.P
|
||||||
The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
||||||
problems, because it may leave the current matching point in the middle of a
|
problems, because it may leave the current matching point in the middle of a
|
||||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used to
|
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
||||||
lock out the use of \eC, causing a compile-time error if it is encountered.
|
application to lock out the use of \eC, causing a compile-time error if it is
|
||||||
|
encountered. It is also possible to build PCRE2 with the use of \eC permanently
|
||||||
|
disabled.
|
||||||
.P
|
.P
|
||||||
Another way that performance can be hit is by running a pattern that has a very
|
Another way that performance can be hit is by running a pattern that has a very
|
||||||
large search tree against a string that will never match. Nested unlimited
|
large search tree against a string that will never match. Nested unlimited
|
||||||
|
@ -187,6 +189,6 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 13 April 2015
|
Last updated: 16 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
514
doc/pcre2.txt
514
doc/pcre2.txt
|
@ -104,26 +104,27 @@ SECURITY CONSIDERATIONS
|
||||||
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead
|
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead
|
||||||
to problems, because it may leave the current matching point in the
|
to problems, because it may leave the current matching point in the
|
||||||
middle of a multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C
|
middle of a multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C
|
||||||
option can be used to lock out the use of \C, causing a compile-time
|
option can be used by an application to lock out the use of \C, causing
|
||||||
error if it is encountered.
|
a compile-time error if it is encountered. It is also possible to build
|
||||||
|
PCRE2 with the use of \C permanently disabled.
|
||||||
|
|
||||||
Another way that performance can be hit is by running a pattern that
|
Another way that performance can be hit is by running a pattern that
|
||||||
has a very large search tree against a string that will never match.
|
has a very large search tree against a string that will never match.
|
||||||
Nested unlimited repeats in a pattern are a common example. PCRE2 pro-
|
Nested unlimited repeats in a pattern are a common example. PCRE2 pro-
|
||||||
vides some protection against this: see the pcre2_set_match_limit()
|
vides some protection against this: see the pcre2_set_match_limit()
|
||||||
function in the pcre2api page.
|
function in the pcre2api page.
|
||||||
|
|
||||||
|
|
||||||
USER DOCUMENTATION
|
USER DOCUMENTATION
|
||||||
|
|
||||||
The user documentation for PCRE2 comprises a number of different sec-
|
The user documentation for PCRE2 comprises a number of different sec-
|
||||||
tions. In the "man" format, each of these is a separate "man page". In
|
tions. In the "man" format, each of these is a separate "man page". In
|
||||||
the HTML format, each is a separate page, linked from the index page.
|
the HTML format, each is a separate page, linked from the index page.
|
||||||
In the plain text format, the descriptions of the pcre2grep and
|
In the plain text format, the descriptions of the pcre2grep and
|
||||||
pcre2test programs are in files called pcre2grep.txt and pcre2test.txt,
|
pcre2test programs are in files called pcre2grep.txt and pcre2test.txt,
|
||||||
respectively. The remaining sections, except for the pcre2demo section
|
respectively. The remaining sections, except for the pcre2demo section
|
||||||
(which is a program listing), and the short pages for individual func-
|
(which is a program listing), and the short pages for individual func-
|
||||||
tions, are concatenated in pcre2.txt, for ease of searching. The sec-
|
tions, are concatenated in pcre2.txt, for ease of searching. The sec-
|
||||||
tions are as follows:
|
tions are as follows:
|
||||||
|
|
||||||
pcre2 this document
|
pcre2 this document
|
||||||
|
@ -148,7 +149,7 @@ USER DOCUMENTATION
|
||||||
pcre2test description of the pcre2test command
|
pcre2test description of the pcre2test command
|
||||||
pcre2unicode discussion of Unicode and UTF support
|
pcre2unicode discussion of Unicode and UTF support
|
||||||
|
|
||||||
In the "man" and HTML formats, there is also a short page for each C
|
In the "man" and HTML formats, there is also a short page for each C
|
||||||
library function, listing its arguments and results.
|
library function, listing its arguments and results.
|
||||||
|
|
||||||
|
|
||||||
|
@ -158,14 +159,14 @@ AUTHOR
|
||||||
University Computing Service
|
University Computing Service
|
||||||
Cambridge, England.
|
Cambridge, England.
|
||||||
|
|
||||||
Putting an actual email address here is a spam magnet. If you want to
|
Putting an actual email address here is a spam magnet. If you want to
|
||||||
email me, use my two initials, followed by the two digits 10, at the
|
email me, use my two initials, followed by the two digits 10, at the
|
||||||
domain cam.ac.uk.
|
domain cam.ac.uk.
|
||||||
|
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 13 April 2015
|
Last updated: 16 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -1276,7 +1277,9 @@ COMPILING A PATTERN
|
||||||
piled. This escape can cause unpredictable behaviour in UTF-8 or
|
piled. This escape can cause unpredictable behaviour in UTF-8 or
|
||||||
UTF-16 modes, because it may leave the current matching point in the
|
UTF-16 modes, because it may leave the current matching point in the
|
||||||
middle of a multi-code-unit character. This option may be useful in
|
middle of a multi-code-unit character. This option may be useful in
|
||||||
applications that process patterns from external sources.
|
applications that process patterns from external sources. Note that
|
||||||
|
there is also a build-time option that permanently locks out the use of
|
||||||
|
\C.
|
||||||
|
|
||||||
PCRE2_NEVER_UCP
|
PCRE2_NEVER_UCP
|
||||||
|
|
||||||
|
@ -2571,19 +2574,36 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
||||||
int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
|
int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject,
|
||||||
PCRE2_SIZE length, PCRE2_SIZE startoffset,
|
PCRE2_SIZE length, PCRE2_SIZE startoffset,
|
||||||
uint32_t options, pcre2_match_data *match_data,
|
uint32_t options, pcre2_match_data *match_data,
|
||||||
pcre2_match_context *mcontext, PCRE2_SPTR replacementzfP,
|
pcre2_match_context *mcontext, PCRE2_SPTR replacement,
|
||||||
PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
|
PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
|
||||||
PCRE2_SIZE *outlengthptr);
|
PCRE2_SIZE *outlengthptr);
|
||||||
|
|
||||||
This function calls pcre2_match() and then makes a copy of the subject
|
This function calls pcre2_match() and then makes a copy of the subject
|
||||||
string in outputbuffer, replacing the part that was matched with the
|
string in outputbuffer, replacing the part that was matched with the
|
||||||
replacement string, whose length is supplied in rlength. This can be
|
replacement string, whose length is supplied in rlength. This can be
|
||||||
given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||||
|
|
||||||
|
The first seven arguments of pcre2_substitute() are the same as for
|
||||||
|
pcre2_match(), except that the partial matching options are not permit-
|
||||||
|
ted, and match_data may be passed as NULL, in which case a match data
|
||||||
|
block is obtained and freed within this function, using memory manage-
|
||||||
|
ment functions from the match context, if provided, or else those that
|
||||||
|
were used to allocate memory for the compiled code.
|
||||||
|
|
||||||
|
The outlengthptr argument must point to a variable that contains the
|
||||||
|
length, in code units, of the output buffer. If the function is suc-
|
||||||
|
cessful, the value is updated to contain the length of the new string,
|
||||||
|
excluding the trailing zero that is automatically added. If the func-
|
||||||
|
tion is not successful, the value is set to PCRE2_UNSET for general
|
||||||
|
errors (such as output buffer too small). For syntax errors in the
|
||||||
|
replacement string, the value is set to the offset in the replacement
|
||||||
|
string where the error was detected.
|
||||||
|
|
||||||
In the replacement string, which is interpreted as a UTF string in UTF
|
In the replacement string, which is interpreted as a UTF string in UTF
|
||||||
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
|
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
|
||||||
option is set, a dollar character is an escape character that can spec-
|
option is set, a dollar character is an escape character that can spec-
|
||||||
ify the insertion of characters from capturing groups or (*MARK) items
|
ify the insertion of characters from capturing groups or (*MARK) items
|
||||||
in the pattern. The following forms are recognized:
|
in the pattern. The following forms are always recognized:
|
||||||
|
|
||||||
$$ insert a dollar character
|
$$ insert a dollar character
|
||||||
$<n> or ${<n>} insert the contents of group <n>
|
$<n> or ${<n>} insert the contents of group <n>
|
||||||
|
@ -2594,8 +2614,7 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
||||||
preted as part of the number or name. The number may be zero to include
|
preted as part of the number or name. The number may be zero to include
|
||||||
the entire matched string. For example, if the pattern a(b)c is
|
the entire matched string. For example, if the pattern a(b)c is
|
||||||
matched with "=abc=" and the replacement string "+$1$0$1+", the result
|
matched with "=abc=" and the replacement string "+$1$0$1+", the result
|
||||||
is "=+babcb+=". Group insertion is done by calling pcre2_copy_byname()
|
is "=+babcb+=".
|
||||||
or pcre2_copy_bynumber() as appropriate.
|
|
||||||
|
|
||||||
The facility for inserting a (*MARK) name can be used to perform simple
|
The facility for inserting a (*MARK) name can be used to perform simple
|
||||||
simultaneous substitutions, as this pcre2test example shows:
|
simultaneous substitutions, as this pcre2test example shows:
|
||||||
|
@ -2604,32 +2623,80 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
||||||
apple lemon
|
apple lemon
|
||||||
2: pear orange
|
2: pear orange
|
||||||
|
|
||||||
The first seven arguments of pcre2_substitute() are the same as for
|
There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
|
||||||
pcre2_match(), except that the partial matching options are not permit-
|
|
||||||
ted, and match_data may be passed as NULL, in which case a match data
|
|
||||||
block is obtained and freed within this function, using memory manage-
|
|
||||||
ment functions from the match context, if provided, or else those that
|
|
||||||
were used to allocate memory for the compiled code.
|
|
||||||
|
|
||||||
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
|
|
||||||
the function to iterate over the subject string, replacing every match-
|
the function to iterate over the subject string, replacing every match-
|
||||||
ing substring. If this is not set, only the first matching substring is
|
ing substring. If this is not set, only the first matching substring is
|
||||||
replaced.
|
replaced.
|
||||||
|
|
||||||
The outlengthptr argument must point to a variable that contains the
|
A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra
|
||||||
length, in code units, of the output buffer. It is updated to contain
|
processing to be applied to the replacement string. Without this
|
||||||
the length of the new string, excluding the trailing zero that is auto-
|
option, only the dollar character is special, and only the group inser-
|
||||||
matically added.
|
tion forms listed above are valid. When PCRE2_SUBSTITUTE_EXTENDED is
|
||||||
|
set, two things change:
|
||||||
|
|
||||||
The function returns the number of replacements that were made. This
|
Firstly, backslash in a replacement string is interpreted as an escape
|
||||||
may be zero if no matches were found, and is never greater than 1
|
character. The usual forms such as \n or \x{ddd} can be used to specify
|
||||||
unless PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a neg-
|
particular character codes, and backslash followed by any non-alphanu-
|
||||||
ative error code is returned. Except for PCRE2_ERROR_NOMATCH (which is
|
meric character quotes that character. Extended quoting can be coded
|
||||||
never returned), any errors from pcre2_match() or the substring copying
|
using \Q...\E, exactly as in pattern strings.
|
||||||
functions are passed straight back. PCRE2_ERROR_BADREPLACEMENT is
|
|
||||||
returned for an invalid replacement string (unrecognized sequence fol-
|
There are also four escape sequences for forcing the case of inserted
|
||||||
lowing a dollar sign), and PCRE2_ERROR_NOMEMORY is returned if the out-
|
letters. The insertion mechanism has three states: no case forcing,
|
||||||
put buffer is not big enough.
|
force upper case, and force lower case. The escape sequences change the
|
||||||
|
current state: \U and \L change to upper or lower case forcing, respec-
|
||||||
|
tively, and \E (when not terminating a \Q quoted sequence) reverts to
|
||||||
|
no case forcing. The sequences \u and \l force the next character (if
|
||||||
|
it is a letter) to upper or lower case, respectively, and then the
|
||||||
|
state automatically reverts to no case forcing. Case forcing applies to
|
||||||
|
all inserted characters, including those from captured groups and let-
|
||||||
|
ters within \Q...\E quoted sequences.
|
||||||
|
|
||||||
|
Note that case forcing sequences such as \U...\E do not nest. For exam-
|
||||||
|
ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final
|
||||||
|
\E has no effect.
|
||||||
|
|
||||||
|
The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
|
||||||
|
flexibility to group substitution. The syntax is similar to that used
|
||||||
|
by Bash:
|
||||||
|
|
||||||
|
${<n>:-<string>}
|
||||||
|
${<n>:+<string1>:<string2>}
|
||||||
|
|
||||||
|
As before, <n> may be a group number or a name. The first form speci-
|
||||||
|
fies a default value. If group <n> is set, its value is inserted; if
|
||||||
|
not, <string> is expanded and the result inserted. The second form
|
||||||
|
specifies strings that are expanded and inserted when group <n> is set
|
||||||
|
or unset, respectively. The first form is just a convenient shorthand
|
||||||
|
for
|
||||||
|
|
||||||
|
${<n>:+${<n>}:<string>}
|
||||||
|
|
||||||
|
Backslash can be used to escape colons and closing curly brackets in
|
||||||
|
the replacement strings. A change of the case forcing state within a
|
||||||
|
replacement string remains in force afterwards, as shown in this
|
||||||
|
pcre2test example:
|
||||||
|
|
||||||
|
/(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
|
||||||
|
body
|
||||||
|
1: hello
|
||||||
|
somebody
|
||||||
|
1: HELLO
|
||||||
|
|
||||||
|
If successful, the function returns the number of replacements that
|
||||||
|
were made. This may be zero if no matches were found, and is never
|
||||||
|
greater than 1 unless PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||||
|
|
||||||
|
In the event of an error, a negative error code is returned. Except for
|
||||||
|
PCRE2_ERROR_NOMATCH (which is never returned), errors from
|
||||||
|
pcre2_match() are passed straight back. PCRE2_ERROR_NOMEMORY is
|
||||||
|
returned if the output buffer is not big enough.
|
||||||
|
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in
|
||||||
|
the replacement string, with more particular errors being
|
||||||
|
PCRE2_ERROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REP-
|
||||||
|
MISSING_BRACE (closing curly bracket not found), and PCRE2_ERROR_BADSUB-
|
||||||
|
STITUTION (syntax error in extended group substitution). As for all PCRE2
|
||||||
|
errors, a text message that describes the error can be obtained by
|
||||||
|
calling pcre2_get_error_message().
|
||||||
|
|
||||||
|
|
||||||
DUPLICATE SUBPATTERN NAMES
|
DUPLICATE SUBPATTERN NAMES
|
||||||
|
@ -2845,8 +2912,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
||||||
PCRE2_ERROR_DFA_UITEM
|
PCRE2_ERROR_DFA_UITEM
|
||||||
|
|
||||||
This return is given if pcre2_dfa_match() encounters an item in the
|
This return is given if pcre2_dfa_match() encounters an item in the
|
||||||
pattern that it does not support, for instance, the use of \C or a back
|
pattern that it does not support, for instance, the use of \C in a UTF
|
||||||
reference.
|
mode or a back reference.
|
||||||
|
|
||||||
PCRE2_ERROR_DFA_UCOND
|
PCRE2_ERROR_DFA_UCOND
|
||||||
|
|
||||||
|
@ -2890,7 +2957,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 22 September 2015
|
Last updated: 16 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -3010,10 +3077,18 @@ UNICODE AND UTF SUPPORT
|
||||||
PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a
|
PCRE2_UCP option. Unless the application has set PCRE2_NEVER_UCP, a
|
||||||
pattern may also request this by starting with (*UCP).
|
pattern may also request this by starting with (*UCP).
|
||||||
|
|
||||||
|
|
||||||
|
DISABLING THE USE OF \C
|
||||||
|
|
||||||
The \C escape sequence, which matches a single code unit, even in a UTF
|
The \C escape sequence, which matches a single code unit, even in a UTF
|
||||||
mode, can cause unpredictable behaviour because it may leave the cur-
|
mode, can cause unpredictable behaviour because it may leave the cur-
|
||||||
rent matching point in the middle of a multi-code-unit character. It
|
rent matching point in the middle of a multi-code-unit character. The
|
||||||
can be locked out by setting the PCRE2_NEVER_BACKSLASH_C option.
|
application can lock it out by setting the PCRE2_NEVER_BACKSLASH_C
|
||||||
|
option when calling pcre2_compile(). There is also a build-time option
|
||||||
|
|
||||||
|
--enable-never-backslash-C
|
||||||
|
|
||||||
|
(note the upper case C) which locks out the use of \C entirely.
|
||||||
|
|
||||||
|
|
||||||
JUST-IN-TIME COMPILER SUPPORT
|
JUST-IN-TIME COMPILER SUPPORT
|
||||||
|
@ -3022,10 +3097,10 @@ JUST-IN-TIME COMPILER SUPPORT
|
||||||
|
|
||||||
--enable-jit
|
--enable-jit
|
||||||
|
|
||||||
This support is available only for certain hardware architectures. If
|
This support is available only for certain hardware architectures. If
|
||||||
this option is set for an unsupported architecture, a build error
|
this option is set for an unsupported architecture, a build error
|
||||||
occurs. See the pcre2jit documentation for a discussion of JIT usage.
|
occurs. See the pcre2jit documentation for a discussion of JIT usage.
|
||||||
When JIT support is enabled, pcre2grep automatically makes use of it,
|
When JIT support is enabled, pcre2grep automatically makes use of it,
|
||||||
unless you add
|
unless you add
|
||||||
|
|
||||||
--disable-pcre2grep-jit
|
--disable-pcre2grep-jit
|
||||||
|
@ -3035,14 +3110,14 @@ JUST-IN-TIME COMPILER SUPPORT
|
||||||
|
|
||||||
NEWLINE RECOGNITION
|
NEWLINE RECOGNITION
|
||||||
|
|
||||||
By default, PCRE2 interprets the linefeed (LF) character as indicating
|
By default, PCRE2 interprets the linefeed (LF) character as indicating
|
||||||
the end of a line. This is the normal newline character on Unix-like
|
the end of a line. This is the normal newline character on Unix-like
|
||||||
systems. You can compile PCRE2 to use carriage return (CR) instead, by
|
systems. You can compile PCRE2 to use carriage return (CR) instead, by
|
||||||
adding
|
adding
|
||||||
|
|
||||||
--enable-newline-is-cr
|
--enable-newline-is-cr
|
||||||
|
|
||||||
to the configure command. There is also an --enable-newline-is-lf
|
to the configure command. There is also an --enable-newline-is-lf
|
||||||
option, which explicitly specifies linefeed as the newline character.
|
option, which explicitly specifies linefeed as the newline character.
|
||||||
|
|
||||||
Alternatively, you can specify that line endings are to be indicated by
|
Alternatively, you can specify that line endings are to be indicated by
|
||||||
|
@ -3055,76 +3130,76 @@ NEWLINE RECOGNITION
|
||||||
|
|
||||||
--enable-newline-is-anycrlf
|
--enable-newline-is-anycrlf
|
||||||
|
|
||||||
which causes PCRE2 to recognize any of the three sequences CR, LF, or
|
which causes PCRE2 to recognize any of the three sequences CR, LF, or
|
||||||
CRLF as indicating a line ending. Finally, a fifth option, specified by
|
CRLF as indicating a line ending. Finally, a fifth option, specified by
|
||||||
|
|
||||||
--enable-newline-is-any
|
--enable-newline-is-any
|
||||||
|
|
||||||
causes PCRE2 to recognize any Unicode newline sequence. The Unicode
|
causes PCRE2 to recognize any Unicode newline sequence. The Unicode
|
||||||
newline sequences are the three just mentioned, plus the single charac-
|
newline sequences are the three just mentioned, plus the single charac-
|
||||||
ters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line,
|
ters VT (vertical tab, U+000B), FF (form feed, U+000C), NEL (next line,
|
||||||
U+0085), LS (line separator, U+2028), and PS (paragraph separator,
|
U+0085), LS (line separator, U+2028), and PS (paragraph separator,
|
||||||
U+2029).
|
U+2029).
|
||||||
|
|
||||||
Whatever default line ending convention is selected when PCRE2 is built
|
Whatever default line ending convention is selected when PCRE2 is built
|
||||||
can be overridden by applications that use the library. At build time
|
can be overridden by applications that use the library. At build time
|
||||||
it is conventional to use the standard for your operating system.
|
it is conventional to use the standard for your operating system.
|
||||||
|
|
||||||
|
|
||||||
WHAT \R MATCHES
|
WHAT \R MATCHES
|
||||||
|
|
||||||
By default, the sequence \R in a pattern matches any Unicode newline
|
By default, the sequence \R in a pattern matches any Unicode newline
|
||||||
sequence, independently of what has been selected as the line ending
|
sequence, independently of what has been selected as the line ending
|
||||||
sequence. If you specify
|
sequence. If you specify
|
||||||
|
|
||||||
--enable-bsr-anycrlf
|
--enable-bsr-anycrlf
|
||||||
|
|
||||||
the default is changed so that \R matches only CR, LF, or CRLF. What-
|
the default is changed so that \R matches only CR, LF, or CRLF. What-
|
||||||
ever is selected when PCRE2 is built can be overridden by applications
|
ever is selected when PCRE2 is built can be overridden by applications
|
||||||
that use the library.
|
that use the library.
|
||||||
|
|
||||||
|
|
||||||
HANDLING VERY LARGE PATTERNS
|
HANDLING VERY LARGE PATTERNS
|
||||||
|
|
||||||
Within a compiled pattern, offset values are used to point from one
|
Within a compiled pattern, offset values are used to point from one
|
||||||
part to another (for example, from an opening parenthesis to an alter-
|
part to another (for example, from an opening parenthesis to an alter-
|
||||||
nation metacharacter). By default, in the 8-bit and 16-bit libraries,
|
nation metacharacter). By default, in the 8-bit and 16-bit libraries,
|
||||||
two-byte values are used for these offsets, leading to a maximum size
|
two-byte values are used for these offsets, leading to a maximum size
|
||||||
for a compiled pattern of around 64K code units. This is sufficient to
|
for a compiled pattern of around 64K code units. This is sufficient to
|
||||||
handle all but the most gigantic patterns. Nevertheless, some people do
|
handle all but the most gigantic patterns. Nevertheless, some people do
|
||||||
want to process truly enormous patterns, so it is possible to compile
|
want to process truly enormous patterns, so it is possible to compile
|
||||||
PCRE2 to use three-byte or four-byte offsets by adding a setting such
|
PCRE2 to use three-byte or four-byte offsets by adding a setting such
|
||||||
as
|
as
|
||||||
|
|
||||||
--with-link-size=3
|
--with-link-size=3
|
||||||
|
|
||||||
to the configure command. The value given must be 2, 3, or 4. For the
|
to the configure command. The value given must be 2, 3, or 4. For the
|
||||||
16-bit library, a value of 3 is rounded up to 4. In these libraries,
|
16-bit library, a value of 3 is rounded up to 4. In these libraries,
|
||||||
using longer offsets slows down the operation of PCRE2 because it has
|
using longer offsets slows down the operation of PCRE2 because it has
|
||||||
to load additional data when handling them. For the 32-bit library the
|
to load additional data when handling them. For the 32-bit library the
|
||||||
value is always 4 and cannot be overridden; the value of --with-link-
|
value is always 4 and cannot be overridden; the value of --with-link-
|
||||||
size is ignored.
|
size is ignored.
|
||||||
|
|
||||||
|
|
||||||
AVOIDING EXCESSIVE STACK USAGE
|
AVOIDING EXCESSIVE STACK USAGE
|
||||||
|
|
||||||
When matching with the pcre2_match() function, PCRE2 implements back-
|
When matching with the pcre2_match() function, PCRE2 implements back-
|
||||||
tracking by making recursive calls to an internal function called
|
tracking by making recursive calls to an internal function called
|
||||||
match(). In environments where the size of the stack is limited, this
|
match(). In environments where the size of the stack is limited, this
|
||||||
can severely limit PCRE2's operation. (The Unix environment does not
|
can severely limit PCRE2's operation. (The Unix environment does not
|
||||||
usually suffer from this problem, but it may sometimes be necessary to
|
usually suffer from this problem, but it may sometimes be necessary to
|
||||||
increase the maximum stack size. There is a discussion in the
|
increase the maximum stack size. There is a discussion in the
|
||||||
pcre2stack documentation.) An alternative approach to recursion that
|
pcre2stack documentation.) An alternative approach to recursion that
|
||||||
uses memory from the heap to remember data, instead of using recursive
|
uses memory from the heap to remember data, instead of using recursive
|
||||||
function calls, has been implemented to work round the problem of lim-
|
function calls, has been implemented to work round the problem of lim-
|
||||||
ited stack size. If you want to build a version of PCRE2 that works
|
ited stack size. If you want to build a version of PCRE2 that works
|
||||||
this way, add
|
this way, add
|
||||||
|
|
||||||
--disable-stack-for-recursion
|
--disable-stack-for-recursion
|
||||||
|
|
||||||
to the configure command. By default, the system functions malloc() and
|
to the configure command. By default, the system functions malloc() and
|
||||||
free() are called to manage the heap memory that is required, but cus-
|
free() are called to manage the heap memory that is required, but cus-
|
||||||
tom memory management functions can be called instead. PCRE2 runs
|
tom memory management functions can be called instead. PCRE2 runs
|
||||||
noticeably more slowly when built in this way. This option affects only
|
noticeably more slowly when built in this way. This option affects only
|
||||||
the pcre2_match() function; it is not relevant for pcre2_dfa_match().
|
the pcre2_match() function; it is not relevant for pcre2_dfa_match().
|
||||||
|
|
||||||
|
@ -3132,30 +3207,30 @@ AVOIDING EXCESSIVE STACK USAGE
|
||||||
LIMITING PCRE2 RESOURCE USAGE
|
LIMITING PCRE2 RESOURCE USAGE
|
||||||
|
|
||||||
Internally, PCRE2 has a function called match(), which it calls repeat-
|
Internally, PCRE2 has a function called match(), which it calls repeat-
|
||||||
edly (sometimes recursively) when matching a pattern with the
|
edly (sometimes recursively) when matching a pattern with the
|
||||||
pcre2_match() function. By controlling the maximum number of times this
|
pcre2_match() function. By controlling the maximum number of times this
|
||||||
function may be called during a single matching operation, a limit can
|
function may be called during a single matching operation, a limit can
|
||||||
be placed on the resources used by a single call to pcre2_match(). The
|
be placed on the resources used by a single call to pcre2_match(). The
|
||||||
limit can be changed at run time, as described in the pcre2api documen-
|
limit can be changed at run time, as described in the pcre2api documen-
|
||||||
tation. The default is 10 million, but this can be changed by adding a
|
tation. The default is 10 million, but this can be changed by adding a
|
||||||
setting such as
|
setting such as
|
||||||
|
|
||||||
--with-match-limit=500000
|
--with-match-limit=500000
|
||||||
|
|
||||||
to the configure command. This setting has no effect on the
|
to the configure command. This setting has no effect on the
|
||||||
pcre2_dfa_match() matching function.
|
pcre2_dfa_match() matching function.
|
||||||
|
|
||||||
In some environments it is desirable to limit the depth of recursive
|
In some environments it is desirable to limit the depth of recursive
|
||||||
calls of match() more strictly than the total number of calls, in order
|
calls of match() more strictly than the total number of calls, in order
|
||||||
to restrict the maximum amount of stack (or heap, if --disable-stack-
|
to restrict the maximum amount of stack (or heap, if --disable-stack-
|
||||||
for-recursion is specified) that is used. A second limit controls this;
|
for-recursion is specified) that is used. A second limit controls this;
|
||||||
it defaults to the value that is set for --with-match-limit, which
|
it defaults to the value that is set for --with-match-limit, which
|
||||||
imposes no additional constraints. However, you can set a lower limit
|
imposes no additional constraints. However, you can set a lower limit
|
||||||
by adding, for example,
|
by adding, for example,
|
||||||
|
|
||||||
--with-match-limit-recursion=10000
|
--with-match-limit-recursion=10000
|
||||||
|
|
||||||
to the configure command. This value can also be overridden at run
|
to the configure command. This value can also be overridden at run
|
||||||
time.
|
time.
|
||||||
|
|
||||||
|
|
||||||
|
@ -3163,45 +3238,45 @@ CREATING CHARACTER TABLES AT BUILD TIME
|
||||||
|
|
||||||
PCRE2 uses fixed tables for processing characters whose code points are
|
PCRE2 uses fixed tables for processing characters whose code points are
|
||||||
less than 256. By default, PCRE2 is built with a set of tables that are
|
less than 256. By default, PCRE2 is built with a set of tables that are
|
||||||
distributed in the file src/pcre2_chartables.c.dist. These tables are
|
distributed in the file src/pcre2_chartables.c.dist. These tables are
|
||||||
for ASCII codes only. If you add
|
for ASCII codes only. If you add
|
||||||
|
|
||||||
--enable-rebuild-chartables
|
--enable-rebuild-chartables
|
||||||
|
|
||||||
to the configure command, the distributed tables are no longer used.
|
to the configure command, the distributed tables are no longer used.
|
||||||
Instead, a program called dftables is compiled and run. This outputs
|
Instead, a program called dftables is compiled and run. This outputs
|
||||||
the source for a new set of tables, created in the default locale of your
|
the source for a new set of tables, created in the default locale of your
|
||||||
C run-time system. (This method of replacing the tables does not work
|
C run-time system. (This method of replacing the tables does not work
|
||||||
if you are cross compiling, because dftables is run on the local host.
|
if you are cross compiling, because dftables is run on the local host.
|
||||||
If you need to create alternative tables when cross compiling, you will
|
If you need to create alternative tables when cross compiling, you will
|
||||||
have to do so "by hand".)
|
have to do so "by hand".)
|
||||||
|
|
||||||
|
|
||||||
USING EBCDIC CODE
|
USING EBCDIC CODE
|
||||||
|
|
||||||
PCRE2 assumes by default that it will run in an environment where the
|
PCRE2 assumes by default that it will run in an environment where the
|
||||||
character code is ASCII or Unicode, which is a superset of ASCII. This
|
character code is ASCII or Unicode, which is a superset of ASCII. This
|
||||||
is the case for most computer operating systems. PCRE2 can, however, be
|
is the case for most computer operating systems. PCRE2 can, however, be
|
||||||
compiled to run in an 8-bit EBCDIC environment by adding
|
compiled to run in an 8-bit EBCDIC environment by adding
|
||||||
|
|
||||||
--enable-ebcdic --disable-unicode
|
--enable-ebcdic --disable-unicode
|
||||||
|
|
||||||
to the configure command. This setting implies --enable-rebuild-charta-
|
to the configure command. This setting implies --enable-rebuild-charta-
|
||||||
bles. You should only use it if you know that you are in an EBCDIC
|
bles. You should only use it if you know that you are in an EBCDIC
|
||||||
environment (for example, an IBM mainframe operating system).
|
environment (for example, an IBM mainframe operating system).
|
||||||
|
|
||||||
It is not possible to support both EBCDIC and UTF-8 codes in the same
|
It is not possible to support both EBCDIC and UTF-8 codes in the same
|
||||||
version of the library. Consequently, --enable-unicode and --enable-
|
version of the library. Consequently, --enable-unicode and --enable-
|
||||||
ebcdic are mutually exclusive.
|
ebcdic are mutually exclusive.
|
||||||
|
|
||||||
The EBCDIC character that corresponds to an ASCII LF is assumed to have
|
The EBCDIC character that corresponds to an ASCII LF is assumed to have
|
||||||
the value 0x15 by default. However, in some EBCDIC environments, 0x25
|
the value 0x15 by default. However, in some EBCDIC environments, 0x25
|
||||||
is used. In such an environment you should use
|
is used. In such an environment you should use
|
||||||
|
|
||||||
--enable-ebcdic-nl25
|
--enable-ebcdic-nl25
|
||||||
|
|
||||||
as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR
|
as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR
|
||||||
has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and
|
has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and
|
||||||
0x25 is not chosen as LF is made to correspond to the Unicode NEL char-
|
0x25 is not chosen as LF is made to correspond to the Unicode NEL char-
|
||||||
acter (which, in Unicode, is 0x85).
|
acter (which, in Unicode, is 0x85).
|
||||||
|
|
||||||
|
@ -3212,31 +3287,31 @@ USING EBCDIC CODE
|
||||||
|
|
||||||
PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT
|
PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT
|
||||||
|
|
||||||
By default, pcre2grep reads all files as plain text. You can build it
|
By default, pcre2grep reads all files as plain text. You can build it
|
||||||
so that it recognizes files whose names end in .gz or .bz2, and reads
|
so that it recognizes files whose names end in .gz or .bz2, and reads
|
||||||
them with libz or libbz2, respectively, by adding one or both of
|
them with libz or libbz2, respectively, by adding one or both of
|
||||||
|
|
||||||
--enable-pcre2grep-libz
|
--enable-pcre2grep-libz
|
||||||
--enable-pcre2grep-libbz2
|
--enable-pcre2grep-libbz2
|
||||||
|
|
||||||
to the configure command. These options naturally require that the rel-
|
to the configure command. These options naturally require that the rel-
|
||||||
evant libraries are installed on your system. Configuration will fail
|
evant libraries are installed on your system. Configuration will fail
|
||||||
if they are not.
|
if they are not.
|
||||||
|
|
||||||
|
|
||||||
PCRE2GREP BUFFER SIZE
|
PCRE2GREP BUFFER SIZE
|
||||||
|
|
||||||
pcre2grep uses an internal buffer to hold a "window" on the file it is
|
pcre2grep uses an internal buffer to hold a "window" on the file it is
|
||||||
scanning, in order to be able to output "before" and "after" lines when
|
scanning, in order to be able to output "before" and "after" lines when
|
||||||
it finds a match. The size of the buffer is controlled by a parameter
|
it finds a match. The size of the buffer is controlled by a parameter
|
||||||
whose default value is 20K. The buffer itself is three times this size,
|
whose default value is 20K. The buffer itself is three times this size,
|
||||||
but because of the way it is used for holding "before" lines, the long-
|
but because of the way it is used for holding "before" lines, the long-
|
||||||
est line that is guaranteed to be processable is the parameter size.
|
est line that is guaranteed to be processable is the parameter size.
|
||||||
You can change the default parameter value by adding, for example,
|
You can change the default parameter value by adding, for example,
|
||||||
|
|
||||||
--with-pcre2grep-bufsize=50K
|
--with-pcre2grep-bufsize=50K
|
||||||
|
|
||||||
to the configure command. The caller of pcre2grep can override this
|
to the configure command. The caller of pcre2grep can override this
|
||||||
value by using --buffer-size on the command line.
|
value by using --buffer-size on the command line.
|
||||||
|
|
||||||
|
|
||||||
|
@ -3247,26 +3322,26 @@ PCRE2TEST OPTION FOR LIBREADLINE SUPPORT
|
||||||
--enable-pcre2test-libreadline
|
--enable-pcre2test-libreadline
|
||||||
--enable-pcre2test-libedit
|
--enable-pcre2test-libedit
|
||||||
|
|
||||||
to the configure command, pcre2test is linked with the libreadline
|
to the configure command, pcre2test is linked with the libreadline
|
||||||
or libedit library, respectively, and when its input is from a terminal,
|
or libedit library, respectively, and when its input is from a terminal,
|
||||||
it reads it using the readline() function. This provides line-editing
|
it reads it using the readline() function. This provides line-editing
|
||||||
and history facilities. Note that libreadline is GPL-licensed, so if
|
and history facilities. Note that libreadline is GPL-licensed, so if
|
||||||
you distribute a binary of pcre2test linked in this way, there may be
|
you distribute a binary of pcre2test linked in this way, there may be
|
||||||
licensing issues. These can be avoided by linking instead with libedit,
|
licensing issues. These can be avoided by linking instead with libedit,
|
||||||
which has a BSD licence.
|
which has a BSD licence.
|
||||||
|
|
||||||
Setting --enable-pcre2test-libreadline causes the -lreadline option to
|
Setting --enable-pcre2test-libreadline causes the -lreadline option to
|
||||||
be added to the pcre2test build. In many operating environments with a
|
be added to the pcre2test build. In many operating environments with a
|
||||||
system-installed readline library this is sufficient. However, in some
|
system-installed readline library this is sufficient. However, in some
|
||||||
environments (e.g. if an unmodified distribution version of readline is
|
environments (e.g. if an unmodified distribution version of readline is
|
||||||
in use), some extra configuration may be necessary. The INSTALL file
|
in use), some extra configuration may be necessary. The INSTALL file
|
||||||
for libreadline says this:
|
for libreadline says this:
|
||||||
|
|
||||||
"Readline uses the termcap functions, but does not link with
|
"Readline uses the termcap functions, but does not link with
|
||||||
the termcap or curses library itself, allowing applications
|
the termcap or curses library itself, allowing applications
|
||||||
which link with readline the to choose an appropriate library."
|
which link with readline the to choose an appropriate library."
|
||||||
|
|
||||||
If your environment has not been set up so that an appropriate library
|
If your environment has not been set up so that an appropriate library
|
||||||
is automatically included, you may need to add something like
|
is automatically included, you may need to add something like
|
||||||
|
|
||||||
LIBS="-lncurses"
|
LIBS="-lncurses"
|
||||||
|
@ -3280,7 +3355,7 @@ INCLUDING DEBUGGING CODE
|
||||||
|
|
||||||
--enable-debug
|
--enable-debug
|
||||||
|
|
||||||
to the configure command, additional debugging code is included in the
|
to the configure command, additional debugging code is included in the
|
||||||
build. This feature is intended for use by the PCRE2 maintainers.
|
build. This feature is intended for use by the PCRE2 maintainers.
|
||||||
|
|
||||||
|
|
||||||
|
@ -3290,15 +3365,15 @@ DEBUGGING WITH VALGRIND SUPPORT
|
||||||
|
|
||||||
--enable-valgrind
|
--enable-valgrind
|
||||||
|
|
||||||
to the configure command, PCRE2 will use valgrind annotations to mark
|
to the configure command, PCRE2 will use valgrind annotations to mark
|
||||||
certain memory regions as unaddressable. This allows it to detect
|
certain memory regions as unaddressable. This allows it to detect
|
||||||
invalid memory accesses, and is mostly useful for debugging PCRE2
|
invalid memory accesses, and is mostly useful for debugging PCRE2
|
||||||
itself.
|
itself.
|
||||||
|
|
||||||
|
|
||||||
CODE COVERAGE REPORTING
|
CODE COVERAGE REPORTING
|
||||||
|
|
||||||
If your C compiler is gcc, you can build a version of PCRE2 that can
|
If your C compiler is gcc, you can build a version of PCRE2 that can
|
||||||
generate a code coverage report for its test suite. To enable this, you
|
generate a code coverage report for its test suite. To enable this, you
|
||||||
must install lcov version 1.6 or above. Then specify
|
must install lcov version 1.6 or above. Then specify
|
||||||
|
|
||||||
|
@ -3307,20 +3382,20 @@ CODE COVERAGE REPORTING
|
||||||
to the configure command and build PCRE2 in the usual way.
|
to the configure command and build PCRE2 in the usual way.
|
||||||
|
|
||||||
Note that using ccache (a caching C compiler) is incompatible with code
|
Note that using ccache (a caching C compiler) is incompatible with code
|
||||||
coverage reporting. If you have configured ccache to run automatically
|
coverage reporting. If you have configured ccache to run automatically
|
||||||
on your system, you must set the environment variable
|
on your system, you must set the environment variable
|
||||||
|
|
||||||
CCACHE_DISABLE=1
|
CCACHE_DISABLE=1
|
||||||
|
|
||||||
before running make to build PCRE2, so that ccache is not used.
|
before running make to build PCRE2, so that ccache is not used.
|
||||||
|
|
||||||
When --enable-coverage is used, the following additional targets are
|
When --enable-coverage is used, the following additional targets are
|
||||||
added to the Makefile:
|
added to the Makefile:
|
||||||
|
|
||||||
make coverage
|
make coverage
|
||||||
|
|
||||||
This creates a fresh coverage report for the PCRE2 test suite. It is
|
This creates a fresh coverage report for the PCRE2 test suite. It is
|
||||||
equivalent to running "make coverage-reset", "make coverage-baseline",
|
equivalent to running "make coverage-reset", "make coverage-baseline",
|
||||||
"make check", and then "make coverage-report".
|
"make check", and then "make coverage-report".
|
||||||
|
|
||||||
make coverage-reset
|
make coverage-reset
|
||||||
|
@ -3337,18 +3412,18 @@ CODE COVERAGE REPORTING
|
||||||
|
|
||||||
make coverage-clean-report
|
make coverage-clean-report
|
||||||
|
|
||||||
This removes the generated coverage report without cleaning the cover-
|
This removes the generated coverage report without cleaning the cover-
|
||||||
age data itself.
|
age data itself.
|
||||||
|
|
||||||
make coverage-clean-data
|
make coverage-clean-data
|
||||||
|
|
||||||
This removes the captured coverage data without removing the coverage
|
This removes the captured coverage data without removing the coverage
|
||||||
files created at compile time (*.gcno).
|
files created at compile time (*.gcno).
|
||||||
|
|
||||||
make coverage-clean
|
make coverage-clean
|
||||||
|
|
||||||
This cleans all coverage data including the generated coverage report.
|
This cleans all coverage data including the generated coverage report.
|
||||||
For more information about code coverage, see the gcov and lcov docu-
|
For more information about code coverage, see the gcov and lcov docu-
|
||||||
mentation.
|
mentation.
|
||||||
|
|
||||||
|
|
||||||
|
@ -3366,7 +3441,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 24 April 2015
|
Last updated: 16 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -6028,12 +6103,18 @@ MATCHING A SINGLE CODE UNIT
|
||||||
results, because PCRE2 assumes that it is matching character by charac-
|
results, because PCRE2 assumes that it is matching character by charac-
|
||||||
ter in a valid UTF string (by default it checks the subject string's
|
ter in a valid UTF string (by default it checks the subject string's
|
||||||
validity at the start of processing unless the PCRE2_NO_UTF_CHECK
|
validity at the start of processing unless the PCRE2_NO_UTF_CHECK
|
||||||
option is used). An application can lock out the use of \C by setting
|
option is used).
|
||||||
the PCRE2_NEVER_BACKSLASH_C option.
|
|
||||||
|
|
||||||
PCRE2 does not allow \C to appear in lookbehind assertions (described
|
An application can lock out the use of \C by setting the
|
||||||
below) in a UTF mode, because this would make it impossible to calcu-
|
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also
|
||||||
late the length of the lookbehind.
|
possible to build PCRE2 with the use of \C permanently disabled.
|
||||||
|
|
||||||
|
PCRE2 does not allow \C to appear in lookbehind assertions (described
|
||||||
|
below) in a UTF mode, because this would make it impossible to calcu-
|
||||||
|
late the length of the lookbehind. Neither the alternative matching
|
||||||
|
function pcre2_dfa_match() nor the JIT optimizer support \C in a UTF
|
||||||
|
mode. The former gives a match-time error; the latter fails to optimize
|
||||||
|
and so the match is always run using the interpreter.
|
||||||
|
|
||||||
In general, the \C escape sequence is best avoided. However, one way of
|
In general, the \C escape sequence is best avoided. However, one way of
|
||||||
using it that avoids the problem of malformed UTF characters is to use
|
using it that avoids the problem of malformed UTF characters is to use
|
||||||
|
@ -8036,7 +8117,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 01 September 2015
|
Last updated: 16 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -8966,10 +9047,10 @@ CHARACTER TYPES
|
||||||
\W a "non-word" character
|
\W a "non-word" character
|
||||||
\X a Unicode extended grapheme cluster
|
\X a Unicode extended grapheme cluster
|
||||||
|
|
||||||
The application can lock out the use of \C by setting the
|
\C is dangerous because it may leave the current matching point in the
|
||||||
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave
|
middle of a UTF-8 or UTF-16 character. The application can lock out the
|
||||||
the current matching point in the middle of a UTF-8 or UTF-16 charac-
|
use of \C by setting the PCRE2_NEVER_BACKSLASH_C option. It is also
|
||||||
ter.
|
possible to build PCRE2 with the use of \C permanently disabled.
|
||||||
|
|
||||||
By default, \d, \s, and \w match only ASCII characters, even in UTF-8
|
By default, \d, \s, and \w match only ASCII characters, even in UTF-8
|
||||||
mode or in the 16-bit and 32-bit libraries. However, if locale-specific
|
mode or in the 16-bit and 32-bit libraries. However, if locale-specific
|
||||||
|
@ -9325,7 +9406,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 17 July 2015
|
Last updated: 16 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -9384,89 +9465,90 @@ WIDE CHARACTERS AND UTF MODES
|
||||||
The escape sequence \C can be used to match a single code unit, in a
|
The escape sequence \C can be used to match a single code unit, in a
|
||||||
UTF mode, but its use can lead to some strange effects because it
|
UTF mode, but its use can lead to some strange effects because it
|
||||||
breaks up multi-unit characters (see the description of \C in the
|
breaks up multi-unit characters (see the description of \C in the
|
||||||
pcre2pattern documentation). The use of \C is not supported in the
|
pcre2pattern documentation). The use of \C is not supported by the
|
||||||
alternative matching function pcre2_dfa_match(), nor is it supported in
|
alternative matching function pcre2_dfa_match() when in UTF mode. Its
|
||||||
UTF mode by the JIT optimization. If JIT optimization is requested for
|
use provokes a match-time error. The JIT optimization also does not
|
||||||
a UTF pattern that contains \C, it will not succeed, and so the match-
|
support \C in UTF mode. If JIT optimization is requested for a UTF
|
||||||
ing will be carried out by the normal interpretive function.
|
pattern that contains \C, it will not succeed, and so the matching will
|
||||||
|
be carried out by the normal interpretive function.
|
||||||
|
|
||||||
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
|
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
|
||||||
characters of any code value, but, by default, the characters that
|
characters of any code value, but, by default, the characters that
|
||||||
PCRE2 recognizes as digits, spaces, or word characters remain the same
|
PCRE2 recognizes as digits, spaces, or word characters remain the same
|
||||||
set as in non-UTF mode, all with code points less than 256. This
|
set as in non-UTF mode, all with code points less than 256. This
|
||||||
remains true even when PCRE2 is built to include Unicode support,
|
remains true even when PCRE2 is built to include Unicode support,
|
||||||
because to do otherwise would slow down matching in many common cases.
|
because to do otherwise would slow down matching in many common cases.
|
||||||
Note that this also applies to \b and \B, because they are defined in
|
Note that this also applies to \b and \B, because they are defined in
|
||||||
terms of \w and \W. If you want to test for a wider sense of, say,
|
terms of \w and \W. If you want to test for a wider sense of, say,
|
||||||
"digit", you can use explicit Unicode property tests such as \p{Nd}.
|
"digit", you can use explicit Unicode property tests such as \p{Nd}.
|
||||||
Alternatively, if you set the PCRE2_UCP option, the way that the char-
|
Alternatively, if you set the PCRE2_UCP option, the way that the char-
|
||||||
acter escapes work is changed so that Unicode properties are used to
|
acter escapes work is changed so that Unicode properties are used to
|
||||||
determine which characters match. There are more details in the section
|
determine which characters match. There are more details in the section
|
||||||
on generic character types in the pcre2pattern documentation.
|
on generic character types in the pcre2pattern documentation.
|
||||||
|
|
||||||
Similarly, characters that match the POSIX named character classes are
|
Similarly, characters that match the POSIX named character classes are
|
||||||
all low-valued characters, unless the PCRE2_UCP option is set.
|
all low-valued characters, unless the PCRE2_UCP option is set.
|
||||||
|
|
||||||
However, the special horizontal and vertical white space matching
|
However, the special horizontal and vertical white space matching
|
||||||
escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char-
|
escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char-
|
||||||
acters, whether or not PCRE2_UCP is set.
|
acters, whether or not PCRE2_UCP is set.
|
||||||
|
|
||||||
Case-insensitive matching in UTF mode makes use of Unicode properties.
|
Case-insensitive matching in UTF mode makes use of Unicode properties.
|
||||||
A few Unicode characters such as Greek sigma have more than two code-
|
A few Unicode characters such as Greek sigma have more than two code-
|
||||||
points that are case-equivalent, and these are treated as such.
|
points that are case-equivalent, and these are treated as such.
|
||||||
|
|
||||||
|
|
||||||
VALIDITY OF UTF STRINGS
|
VALIDITY OF UTF STRINGS
|
||||||
|
|
||||||
When the PCRE2_UTF option is set, the strings passed as patterns and
|
When the PCRE2_UTF option is set, the strings passed as patterns and
|
||||||
subjects are (by default) checked for validity on entry to the relevant
|
subjects are (by default) checked for validity on entry to the relevant
|
||||||
functions. If an invalid UTF string is passed, a negative error code
|
functions. If an invalid UTF string is passed, a negative error code
|
||||||
is returned. The code unit offset to the offending character can be
|
is returned. The code unit offset to the offending character can be
|
||||||
extracted from the match data block by calling pcre2_get_startchar(),
|
extracted from the match data block by calling pcre2_get_startchar(),
|
||||||
which is used for this purpose after a UTF error.
|
which is used for this purpose after a UTF error.
|
||||||
|
|
||||||
UTF-16 and UTF-32 strings can indicate their endianness by special code
|
UTF-16 and UTF-32 strings can indicate their endianness by special code
|
||||||
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
||||||
this, expecting strings to be in host byte order.
|
this, expecting strings to be in host byte order.
|
||||||
|
|
||||||
A UTF string is checked before any other processing takes place. In the
|
A UTF string is checked before any other processing takes place. In the
|
||||||
case of pcre2_match() and pcre2_dfa_match() calls with a non-zero
|
case of pcre2_match() and pcre2_dfa_match() calls with a non-zero
|
||||||
starting offset, the check is applied only to that part of the subject
|
starting offset, the check is applied only to that part of the subject
|
||||||
that could be inspected during matching, and there is a check that the
|
that could be inspected during matching, and there is a check that the
|
||||||
starting offset points to the first code unit of a character or to the
|
starting offset points to the first code unit of a character or to the
|
||||||
end of the subject. If there are no lookbehind assertions in the pat-
|
end of the subject. If there are no lookbehind assertions in the pat-
|
||||||
tern, the check starts at the starting offset. Otherwise, it starts at
|
tern, the check starts at the starting offset. Otherwise, it starts at
|
||||||
the length of the longest lookbehind before the starting offset, or at
|
the length of the longest lookbehind before the starting offset, or at
|
||||||
the start of the subject if there are not that many characters before
|
the start of the subject if there are not that many characters before
|
||||||
the starting offset. Note that the sequences \b and \B are one-charac-
|
the starting offset. Note that the sequences \b and \B are one-charac-
|
||||||
ter lookbehinds.
|
ter lookbehinds.
|
||||||
|
|
||||||
In addition to checking the format of the string, there is a check to
|
In addition to checking the format of the string, there is a check to
|
||||||
ensure that all code points lie in the range U+0 to U+10FFFF, excluding
|
ensure that all code points lie in the range U+0 to U+10FFFF, excluding
|
||||||
the surrogate area. The so-called "non-character" code points are not
|
the surrogate area. The so-called "non-character" code points are not
|
||||||
excluded because Unicode corrigendum #9 makes it clear that they should
|
excluded because Unicode corrigendum #9 makes it clear that they should
|
||||||
not be.
|
not be.
|
||||||
|
|
||||||
Characters in the "Surrogate Area" of Unicode are reserved for use by
|
Characters in the "Surrogate Area" of Unicode are reserved for use by
|
||||||
UTF-16, where they are used in pairs to encode code points with values
|
UTF-16, where they are used in pairs to encode code points with values
|
||||||
greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
|
greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
|
||||||
are available independently in the UTF-8 and UTF-32 encodings. (In
|
are available independently in the UTF-8 and UTF-32 encodings. (In
|
||||||
other words, the whole surrogate thing is a fudge for UTF-16 which
|
other words, the whole surrogate thing is a fudge for UTF-16 which
|
||||||
unfortunately messes up UTF-8 and UTF-32.)
|
unfortunately messes up UTF-8 and UTF-32.)
|
||||||
|
|
||||||
In some situations, you may already know that your strings are valid,
|
In some situations, you may already know that your strings are valid,
|
||||||
and therefore want to skip these checks in order to improve perfor-
|
and therefore want to skip these checks in order to improve perfor-
|
||||||
mance, for example in the case of a long subject string that is being
|
mance, for example in the case of a long subject string that is being
|
||||||
scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com-
|
scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com-
|
||||||
pile time or at match time, PCRE2 assumes that the pattern or subject
|
pile time or at match time, PCRE2 assumes that the pattern or subject
|
||||||
it is given (respectively) contains only valid UTF code unit sequences.
|
it is given (respectively) contains only valid UTF code unit sequences.
|
||||||
|
|
||||||
Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check
|
Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check
|
||||||
for the pattern; it does not also apply to subject strings. If you want
|
for the pattern; it does not also apply to subject strings. If you want
|
||||||
to disable the check for a subject string you must pass this option to
|
to disable the check for a subject string you must pass this option to
|
||||||
pcre2_match() or pcre2_dfa_match().
|
pcre2_match() or pcre2_dfa_match().
|
||||||
|
|
||||||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
|
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
|
||||||
result is undefined and your program may crash or loop indefinitely.
|
result is undefined and your program may crash or loop indefinitely.
|
||||||
|
|
||||||
Errors in UTF-8 strings
|
Errors in UTF-8 strings
|
||||||
|
@ -9479,10 +9561,10 @@ VALIDITY OF UTF STRINGS
|
||||||
PCRE2_ERROR_UTF8_ERR4
|
PCRE2_ERROR_UTF8_ERR4
|
||||||
PCRE2_ERROR_UTF8_ERR5
|
PCRE2_ERROR_UTF8_ERR5
|
||||||
|
|
||||||
The string ends with a truncated UTF-8 character; the code specifies
|
The string ends with a truncated UTF-8 character; the code specifies
|
||||||
how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
|
how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
|
||||||
characters to be no longer than 4 bytes, the encoding scheme (origi-
|
characters to be no longer than 4 bytes, the encoding scheme (origi-
|
||||||
nally defined by RFC 2279) allows for up to 6 bytes, and this is
|
nally defined by RFC 2279) allows for up to 6 bytes, and this is
|
||||||
checked first; hence the possibility of 4 or 5 missing bytes.
|
checked first; hence the possibility of 4 or 5 missing bytes.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR6
|
PCRE2_ERROR_UTF8_ERR6
|
||||||
|
@ -9492,24 +9574,24 @@ VALIDITY OF UTF STRINGS
|
||||||
PCRE2_ERROR_UTF8_ERR10
|
PCRE2_ERROR_UTF8_ERR10
|
||||||
|
|
||||||
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of
|
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of
|
||||||
the character do not have the binary value 0b10 (that is, either the
|
the character do not have the binary value 0b10 (that is, either the
|
||||||
most significant bit is 0, or the next bit is 1).
|
most significant bit is 0, or the next bit is 1).
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR11
|
PCRE2_ERROR_UTF8_ERR11
|
||||||
PCRE2_ERROR_UTF8_ERR12
|
PCRE2_ERROR_UTF8_ERR12
|
||||||
|
|
||||||
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
|
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
|
||||||
long; these code points are excluded by RFC 3629.
|
long; these code points are excluded by RFC 3629.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR13
|
PCRE2_ERROR_UTF8_ERR13
|
||||||
|
|
||||||
A 4-byte character has a value greater than 0x10ffff; these code points
|
A 4-byte character has a value greater than 0x10ffff; these code points
|
||||||
are excluded by RFC 3629.
|
are excluded by RFC 3629.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR14
|
PCRE2_ERROR_UTF8_ERR14
|
||||||
|
|
||||||
A 3-byte character has a value in the range 0xd800 to 0xdfff; this
|
A 3-byte character has a value in the range 0xd800 to 0xdfff; this
|
||||||
range of code points is reserved by RFC 3629 for use with UTF-16, and
|
range of code points is reserved by RFC 3629 for use with UTF-16, and
|
||||||
so are excluded from UTF-8.
|
so are excluded from UTF-8.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR15
|
PCRE2_ERROR_UTF8_ERR15
|
||||||
|
@ -9518,26 +9600,26 @@ VALIDITY OF UTF STRINGS
|
||||||
PCRE2_ERROR_UTF8_ERR18
|
PCRE2_ERROR_UTF8_ERR18
|
||||||
PCRE2_ERROR_UTF8_ERR19
|
PCRE2_ERROR_UTF8_ERR19
|
||||||
|
|
||||||
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
|
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
|
||||||
for a value that can be represented by fewer bytes, which is invalid.
|
for a value that can be represented by fewer bytes, which is invalid.
|
||||||
For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
|
For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
|
||||||
rect coding uses just one byte.
|
rect coding uses just one byte.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR20
|
PCRE2_ERROR_UTF8_ERR20
|
||||||
|
|
||||||
The two most significant bits of the first byte of a character have the
|
The two most significant bits of the first byte of a character have the
|
||||||
binary value 0b10 (that is, the most significant bit is 1 and the sec-
|
binary value 0b10 (that is, the most significant bit is 1 and the sec-
|
||||||
ond is 0). Such a byte can only validly occur as the second or subse-
|
ond is 0). Such a byte can only validly occur as the second or subse-
|
||||||
quent byte of a multi-byte character.
|
quent byte of a multi-byte character.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR21
|
PCRE2_ERROR_UTF8_ERR21
|
||||||
|
|
||||||
The first byte of a character has the value 0xfe or 0xff. These values
|
The first byte of a character has the value 0xfe or 0xff. These values
|
||||||
can never occur in a valid UTF-8 string.
|
can never occur in a valid UTF-8 string.
|
||||||
|
|
||||||
Errors in UTF-16 strings
|
Errors in UTF-16 strings
|
||||||
|
|
||||||
The following negative error codes are given for invalid UTF-16
|
The following negative error codes are given for invalid UTF-16
|
||||||
strings:
|
strings:
|
||||||
|
|
||||||
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
|
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
|
||||||
|
@ -9547,7 +9629,7 @@ VALIDITY OF UTF STRINGS
|
||||||
|
|
||||||
Errors in UTF-32 strings
|
Errors in UTF-32 strings
|
||||||
|
|
||||||
The following negative error codes are given for invalid UTF-32
|
The following negative error codes are given for invalid UTF-32
|
||||||
strings:
|
strings:
|
||||||
|
|
||||||
PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
|
PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
|
||||||
|
@ -9563,7 +9645,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 18 August 2015
|
Last updated: 16 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2API 3 "07 October 2015" "PCRE2 10.21"
|
.TH PCRE2API 3 "16 October 2015" "PCRE2 10.21"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.sp
|
.sp
|
||||||
|
@ -1209,7 +1209,8 @@ This option locks out the use of \eC in the pattern that is being compiled.
|
||||||
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
||||||
it may leave the current matching point in the middle of a multi-code-unit
|
it may leave the current matching point in the middle of a multi-code-unit
|
||||||
character. This option may be useful in applications that process patterns from
|
character. This option may be useful in applications that process patterns from
|
||||||
external sources.
|
external sources. Note that there is also a build-time option that permanently
|
||||||
|
locks out the use of \eC.
|
||||||
.sp
|
.sp
|
||||||
PCRE2_NEVER_UCP
|
PCRE2_NEVER_UCP
|
||||||
.sp
|
.sp
|
||||||
|
@ -3014,8 +3015,8 @@ There are in addition the following errors that are specific to
|
||||||
PCRE2_ERROR_DFA_UITEM
|
PCRE2_ERROR_DFA_UITEM
|
||||||
.sp
|
.sp
|
||||||
This return is given if \fBpcre2_dfa_match()\fP encounters an item in the
|
This return is given if \fBpcre2_dfa_match()\fP encounters an item in the
|
||||||
pattern that it does not support, for instance, the use of \eC or a back
|
pattern that it does not support, for instance, the use of \eC in a UTF mode or
|
||||||
reference.
|
a back reference.
|
||||||
.sp
|
.sp
|
||||||
PCRE2_ERROR_DFA_UCOND
|
PCRE2_ERROR_DFA_UCOND
|
||||||
.sp
|
.sp
|
||||||
|
@ -3065,6 +3066,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 07 October 2015
|
Last updated: 16 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2BUILD 3 "23 April 2015" "PCRE2 10.20"
|
.TH PCRE2BUILD 3 "16 October 2015" "PCRE2 10.21"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.
|
.
|
||||||
|
@ -132,11 +132,20 @@ Pattern escapes such as \ed and \ew do not by default make use of Unicode
|
||||||
properties. The application can request that they do by setting the PCRE2_UCP
|
properties. The application can request that they do by setting the PCRE2_UCP
|
||||||
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
|
option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also
|
||||||
request this by starting with (*UCP).
|
request this by starting with (*UCP).
|
||||||
.P
|
.
|
||||||
|
.
|
||||||
|
.SH "DISABLING THE USE OF \eC"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
The \eC escape sequence, which matches a single code unit, even in a UTF mode,
|
The \eC escape sequence, which matches a single code unit, even in a UTF mode,
|
||||||
can cause unpredictable behaviour because it may leave the current matching
|
can cause unpredictable behaviour because it may leave the current matching
|
||||||
point in the middle of a multi-code-unit character. It can be locked out by
|
point in the middle of a multi-code-unit character. The application can lock it
|
||||||
setting the PCRE2_NEVER_BACKSLASH_C option.
|
out by setting the PCRE2_NEVER_BACKSLASH_C option when calling
|
||||||
|
\fBpcre2_compile()\fP. There is also a build-time option
|
||||||
|
.sp
|
||||||
|
--enable-never-backslash-C
|
||||||
|
.sp
|
||||||
|
(note the upper case C) which locks out the use of \eC entirely.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SH "JUST-IN-TIME COMPILER SUPPORT"
|
.SH "JUST-IN-TIME COMPILER SUPPORT"
|
||||||
|
@ -510,6 +519,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 24 April 2015
|
Last updated: 16 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2PATTERN 3 "01 September 2015" "PCRE2 10.21"
|
.TH PCRE2PATTERN 3 "16 October 2015" "PCRE2 10.21"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||||
|
@ -1233,8 +1233,11 @@ with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start
|
||||||
with a malformed UTF character. This has undefined results, because PCRE2
|
with a malformed UTF character. This has undefined results, because PCRE2
|
||||||
assumes that it is matching character by character in a valid UTF string (by
|
assumes that it is matching character by character in a valid UTF string (by
|
||||||
default it checks the subject string's validity at the start of processing
|
default it checks the subject string's validity at the start of processing
|
||||||
unless the PCRE2_NO_UTF_CHECK option is used). An application can lock out the
|
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||||
use of \eC by setting the PCRE2_NEVER_BACKSLASH_C option.
|
.P
|
||||||
|
An application can lock out the use of \eC by setting the
|
||||||
|
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
|
||||||
|
build PCRE2 with the use of \eC permanently disabled.
|
||||||
.P
|
.P
|
||||||
PCRE2 does not allow \eC to appear in lookbehind assertions
|
PCRE2 does not allow \eC to appear in lookbehind assertions
|
||||||
.\" HTML <a href="#lookbehind">
|
.\" HTML <a href="#lookbehind">
|
||||||
|
@ -1242,7 +1245,10 @@ PCRE2 does not allow \eC to appear in lookbehind assertions
|
||||||
(described below)
|
(described below)
|
||||||
.\"
|
.\"
|
||||||
in a UTF mode, because this would make it impossible to calculate the length of
|
in a UTF mode, because this would make it impossible to calculate the length of
|
||||||
the lookbehind.
|
the lookbehind. Neither the alternative matching function
|
||||||
|
\fBpcre2_dfa_match()\fP nor the JIT optimizer support \eC in a UTF mode. The
|
||||||
|
former gives a match-time error; the latter fails to optimize and so the match
|
||||||
|
is always run using the interpreter.
|
||||||
.P
|
.P
|
||||||
In general, the \eC escape sequence is best avoided. However, one way of using
|
In general, the \eC escape sequence is best avoided. However, one way of using
|
||||||
it that avoids the problem of malformed UTF characters is to use a lookahead to
|
it that avoids the problem of malformed UTF characters is to use a lookahead to
|
||||||
|
@ -3386,6 +3392,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 01 September 2015
|
Last updated: 16 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2SYNTAX 3 "17 July 2015" "PCRE2 10.21"
|
.TH PCRE2SYNTAX 3 "16 October 2015" "PCRE2 10.21"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||||
|
@ -81,9 +81,10 @@ it matches a literal "u".
|
||||||
\eW a "non-word" character
|
\eW a "non-word" character
|
||||||
\eX a Unicode extended grapheme cluster
|
\eX a Unicode extended grapheme cluster
|
||||||
.sp
|
.sp
|
||||||
The application can lock out the use of \eC by setting the
|
\eC is dangerous because it may leave the current matching point in the middle
|
||||||
PCRE2_NEVER_BACKSLASH_C option. It is dangerous because it may leave the
|
of a UTF-8 or UTF-16 character. The application can lock out the use of \eC by
|
||||||
current matching point in the middle of a UTF-8 or UTF-16 character.
|
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
||||||
|
with the use of \eC permanently disabled.
|
||||||
.P
|
.P
|
||||||
By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode
|
By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode
|
||||||
or in the 16-bit and 32-bit libraries. However, if locale-specific matching is
|
or in the 16-bit and 32-bit libraries. However, if locale-specific matching is
|
||||||
|
@ -576,6 +577,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 17 July 2015
|
Last updated: 16 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2TEST 1 "23 September 2015" "PCRE 10.21"
|
.TH PCRE2TEST 1 "17 October 2015" "PCRE 10.21"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
|
@ -122,12 +122,13 @@ following options output the value and set the exit code as indicated:
|
||||||
The following options output 1 for true or 0 for false, and set the exit code
|
The following options output 1 for true or 0 for false, and set the exit code
|
||||||
to the same value:
|
to the same value:
|
||||||
.sp
|
.sp
|
||||||
ebcdic compiled for an EBCDIC environment
|
backslash-C \eC is supported (not locked out)
|
||||||
jit just-in-time support is available
|
ebcdic compiled for an EBCDIC environment
|
||||||
pcre2-16 the 16-bit library was built
|
jit just-in-time support is available
|
||||||
pcre2-32 the 32-bit library was built
|
pcre2-16 the 16-bit library was built
|
||||||
pcre2-8 the 8-bit library was built
|
pcre2-32 the 32-bit library was built
|
||||||
unicode Unicode support is available
|
pcre2-8 the 8-bit library was built
|
||||||
|
unicode Unicode support is available
|
||||||
.sp
|
.sp
|
||||||
If an unknown option is given, an error message is output; the exit code is 0.
|
If an unknown option is given, an error message is output; the exit code is 0.
|
||||||
.TP 10
|
.TP 10
|
||||||
|
@ -1559,6 +1560,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 23 September 2015
|
Last updated: 17 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -119,12 +119,13 @@ COMMAND LINE OPTIONS
|
||||||
The following options output 1 for true or 0 for false, and
|
The following options output 1 for true or 0 for false, and
|
||||||
set the exit code to the same value:
|
set the exit code to the same value:
|
||||||
|
|
||||||
ebcdic compiled for an EBCDIC environment
|
backslash-C \C is supported (not locked out)
|
||||||
jit just-in-time support is available
|
ebcdic compiled for an EBCDIC environment
|
||||||
pcre2-16 the 16-bit library was built
|
jit just-in-time support is available
|
||||||
pcre2-32 the 32-bit library was built
|
pcre2-16 the 16-bit library was built
|
||||||
pcre2-8 the 8-bit library was built
|
pcre2-32 the 32-bit library was built
|
||||||
unicode Unicode support is available
|
pcre2-8 the 8-bit library was built
|
||||||
|
unicode Unicode support is available
|
||||||
|
|
||||||
If an unknown option is given, an error message is output;
|
If an unknown option is given, an error message is output;
|
||||||
the exit code is 0.
|
the exit code is 0.
|
||||||
|
@ -457,7 +458,7 @@ PATTERN MODIFIERS
|
||||||
Setting compilation options
|
Setting compilation options
|
||||||
|
|
||||||
The following modifiers set options for pcre2_compile(). The most com-
|
The following modifiers set options for pcre2_compile(). The most com-
|
||||||
mon ones have single-letter abbreviations. See pcreapi for a descrip-
|
mon ones have single-letter abbreviations. See pcre2api for a descrip-
|
||||||
tion of their effects.
|
tion of their effects.
|
||||||
|
|
||||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||||
|
@ -484,6 +485,7 @@ PATTERN MODIFIERS
|
||||||
no_utf_check set PCRE2_NO_UTF_CHECK
|
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||||
ucp set PCRE2_UCP
|
ucp set PCRE2_UCP
|
||||||
ungreedy set PCRE2_UNGREEDY
|
ungreedy set PCRE2_UNGREEDY
|
||||||
|
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
||||||
utf set PCRE2_UTF
|
utf set PCRE2_UTF
|
||||||
|
|
||||||
As well as turning on the PCRE2_UTF option, the utf modifier causes all
|
As well as turning on the PCRE2_UTF option, the utf modifier causes all
|
||||||
|
@ -509,6 +511,7 @@ PATTERN MODIFIERS
|
||||||
locale=<name> use this locale
|
locale=<name> use this locale
|
||||||
memory show memory used
|
memory show memory used
|
||||||
newline=<type> set newline type
|
newline=<type> set newline type
|
||||||
|
null_context compile with a NULL context
|
||||||
parens_nest_limit=<n> set maximum parentheses depth
|
parens_nest_limit=<n> set maximum parentheses depth
|
||||||
posix use the POSIX API
|
posix use the POSIX API
|
||||||
push push compiled pattern onto the stack
|
push push compiled pattern onto the stack
|
||||||
|
@ -579,35 +582,42 @@ PATTERN MODIFIERS
|
||||||
mation that is requested. For each callout, either its number or string
|
mation that is requested. For each callout, either its number or string
|
||||||
is given, followed by the item that follows it in the pattern.
|
is given, followed by the item that follows it in the pattern.
|
||||||
|
|
||||||
|
Passing a NULL context
|
||||||
|
|
||||||
|
Normally, pcre2test passes a context block to pcre2_compile(). If the
|
||||||
|
null_context modifier is set, however, NULL is passed. This is for
|
||||||
|
testing that pcre2_compile() behaves correctly in this case (it uses
|
||||||
|
default values).
|
||||||
|
|
||||||
Specifying a pattern in hex
|
Specifying a pattern in hex
|
||||||
|
|
||||||
The hex modifier specifies that the characters of the pattern are to be
|
The hex modifier specifies that the characters of the pattern are to be
|
||||||
interpreted as pairs of hexadecimal digits. White space is permitted
|
interpreted as pairs of hexadecimal digits. White space is permitted
|
||||||
between pairs. For example:
|
between pairs. For example:
|
||||||
|
|
||||||
/ab 32 59/hex
|
/ab 32 59/hex
|
||||||
|
|
||||||
This feature is provided as a way of creating patterns that contain
|
This feature is provided as a way of creating patterns that contain
|
||||||
binary zero and other non-printing characters. By default, pcre2test
|
binary zero and other non-printing characters. By default, pcre2test
|
||||||
passes patterns as zero-terminated strings to pcre2_compile(), giving
|
passes patterns as zero-terminated strings to pcre2_compile(), giving
|
||||||
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
|
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
|
||||||
hexadecimal, the actual length of the pattern is passed.
|
hexadecimal, the actual length of the pattern is passed.
|
||||||
|
|
||||||
JIT compilation
|
JIT compilation
|
||||||
|
|
||||||
Just-in-time (JIT) compiling is a heavyweight optimization that can
|
Just-in-time (JIT) compiling is a heavyweight optimization that can
|
||||||
greatly speed up pattern matching. See the pcre2jit documentation for
|
greatly speed up pattern matching. See the pcre2jit documentation for
|
||||||
details. JIT compiling happens, optionally, after a pattern has been
|
details. JIT compiling happens, optionally, after a pattern has been
|
||||||
successfully compiled into an internal form. The JIT compiler converts
|
successfully compiled into an internal form. The JIT compiler converts
|
||||||
this to optimized machine code. It needs to know whether the match-time
|
this to optimized machine code. It needs to know whether the match-time
|
||||||
options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used,
|
options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used,
|
||||||
because different code is generated for the different cases. See the
|
because different code is generated for the different cases. See the
|
||||||
partial modifier in "Subject Modifiers" below for details of how these
|
partial modifier in "Subject Modifiers" below for details of how these
|
||||||
options are specified for each match attempt.
|
options are specified for each match attempt.
|
||||||
|
|
||||||
JIT compilation is requested by the /jit pattern modifier, which may
|
JIT compilation is requested by the /jit pattern modifier, which may
|
||||||
optionally be followed by an equals sign and a number in the range 0 to
|
optionally be followed by an equals sign and a number in the range 0 to
|
||||||
7. The three bits that make up the number specify which of the three
|
7. The three bits that make up the number specify which of the three
|
||||||
JIT operating modes are to be compiled:
|
JIT operating modes are to be compiled:
|
||||||
|
|
||||||
1 compile JIT code for non-partial matching
|
1 compile JIT code for non-partial matching
|
||||||
|
@ -624,31 +634,31 @@ PATTERN MODIFIERS
|
||||||
6 soft and hard partial matching only
|
6 soft and hard partial matching only
|
||||||
7 all three modes
|
7 all three modes
|
||||||
|
|
||||||
If no number is given, 7 is assumed. The phrase "partial matching"
|
If no number is given, 7 is assumed. The phrase "partial matching"
|
||||||
means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the
|
means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the
|
||||||
PCRE2_PARTIAL_HARD option set. Note that such a call may return a com-
|
PCRE2_PARTIAL_HARD option set. Note that such a call may return a com-
|
||||||
plete match; the options enable the possibility of a partial match, but
|
plete match; the options enable the possibility of a partial match, but
|
||||||
do not require it. Note also that if you request JIT compilation only
|
do not require it. Note also that if you request JIT compilation only
|
||||||
for partial matching (for example, /jit=2) but do not set the partial
|
for partial matching (for example, /jit=2) but do not set the partial
|
||||||
modifier on a subject line, that match will not use JIT code because
|
modifier on a subject line, that match will not use JIT code because
|
||||||
none was compiled for non-partial matching.
|
none was compiled for non-partial matching.
|
||||||
|
|
||||||
If JIT compilation is successful, the compiled JIT code will automati-
|
If JIT compilation is successful, the compiled JIT code will automati-
|
||||||
cally be used when an appropriate type of match is run, except when
|
cally be used when an appropriate type of match is run, except when
|
||||||
incompatible run-time options are specified. For more details, see the
|
incompatible run-time options are specified. For more details, see the
|
||||||
pcre2jit documentation. See also the jitstack modifier below for a way
|
pcre2jit documentation. See also the jitstack modifier below for a way
|
||||||
of setting the size of the JIT stack.
|
of setting the size of the JIT stack.
|
||||||
|
|
||||||
If the jitfast modifier is specified, matching is done using the JIT
|
If the jitfast modifier is specified, matching is done using the JIT
|
||||||
"fast path" interface, pcre2_jit_match(), which skips some of the san-
|
"fast path" interface, pcre2_jit_match(), which skips some of the san-
|
||||||
ity checks that are done by pcre2_match(), and of course does not work
|
ity checks that are done by pcre2_match(), and of course does not work
|
||||||
when JIT is not supported. If jitfast is specified without jit, jit=7
|
when JIT is not supported. If jitfast is specified without jit, jit=7
|
||||||
is assumed.
|
is assumed.
|
||||||
|
|
||||||
If the jitverify modifier is specified, information about the compiled
|
If the jitverify modifier is specified, information about the compiled
|
||||||
pattern shows whether JIT compilation was or was not successful. If
|
pattern shows whether JIT compilation was or was not successful. If
|
||||||
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
||||||
tion is successful when jitverify is set, the text "(JIT)" is added to
|
tion is successful when jitverify is set, the text "(JIT)" is added to
|
||||||
the first output line after a match or non match when JIT-compiled code
|
the first output line after a match or non match when JIT-compiled code
|
||||||
was actually used in the match.
|
was actually used in the match.
|
||||||
|
|
||||||
|
@ -659,18 +669,18 @@ PATTERN MODIFIERS
|
||||||
/pattern/locale=fr_FR
|
/pattern/locale=fr_FR
|
||||||
|
|
||||||
The given locale is set, pcre2_maketables() is called to build a set of
|
The given locale is set, pcre2_maketables() is called to build a set of
|
||||||
character tables for the locale, and this is then passed to pcre2_com-
|
character tables for the locale, and this is then passed to pcre2_com-
|
||||||
pile() when compiling the regular expression. The same tables are used
|
pile() when compiling the regular expression. The same tables are used
|
||||||
when matching the following subject lines. The /locale modifier applies
|
when matching the following subject lines. The /locale modifier applies
|
||||||
only to the pattern on which it appears, but can be given in a #pattern
|
only to the pattern on which it appears, but can be given in a #pattern
|
||||||
command if a default is needed. Setting a locale and alternate charac-
|
command if a default is needed. Setting a locale and alternate charac-
|
||||||
ter tables are mutually exclusive.
|
ter tables are mutually exclusive.
|
||||||
|
|
||||||
Showing pattern memory
|
Showing pattern memory
|
||||||
|
|
||||||
The /memory modifier causes the size in bytes of the memory used to
|
The /memory modifier causes the size in bytes of the memory used to
|
||||||
hold the compiled pattern to be output. This does not include the size
|
hold the compiled pattern to be output. This does not include the size
|
||||||
of the pcre2_code block; it is just the actual compiled data. If the
|
of the pcre2_code block; it is just the actual compiled data. If the
|
||||||
pattern is subsequently passed to the JIT compiler, the size of the JIT
|
pattern is subsequently passed to the JIT compiler, the size of the JIT
|
||||||
compiled code is also output. Here is an example:
|
compiled code is also output. Here is an example:
|
||||||
|
|
||||||
|
@ -681,19 +691,19 @@ PATTERN MODIFIERS
|
||||||
|
|
||||||
Limiting nested parentheses
|
Limiting nested parentheses
|
||||||
|
|
||||||
The parens_nest_limit modifier sets a limit on the depth of nested
|
The parens_nest_limit modifier sets a limit on the depth of nested
|
||||||
parentheses in a pattern. Breaching the limit causes a compilation
|
parentheses in a pattern. Breaching the limit causes a compilation
|
||||||
error. The default for the library is set when PCRE2 is built, but
|
error. The default for the library is set when PCRE2 is built, but
|
||||||
pcre2test sets its own default of 220, which is required for running
|
pcre2test sets its own default of 220, which is required for running
|
||||||
the standard test suite.
|
the standard test suite.
|
||||||
|
|
||||||
Using the POSIX wrapper API
|
Using the POSIX wrapper API
|
||||||
|
|
||||||
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
|
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
|
||||||
per API rather than its native API. This supports only the 8-bit
|
per API rather than its native API. This supports only the 8-bit
|
||||||
library. Note that it does not imply POSIX matching semantics; for
|
library. Note that it does not imply POSIX matching semantics; for
|
||||||
more detail see the pcre2posix documentation. When the POSIX API is
|
more detail see the pcre2posix documentation. When the POSIX API is
|
||||||
being used, the following pattern modifiers set options for the reg-
|
being used, the following pattern modifiers set options for the reg-
|
||||||
comp() function:
|
comp() function:
|
||||||
|
|
||||||
caseless REG_ICASE
|
caseless REG_ICASE
|
||||||
|
@ -704,24 +714,24 @@ PATTERN MODIFIERS
|
||||||
ucp REG_UCP ) the POSIX standard
|
ucp REG_UCP ) the POSIX standard
|
||||||
utf REG_UTF8 )
|
utf REG_UTF8 )
|
||||||
|
|
||||||
The aftertext and allaftertext subject modifiers work as described
|
The aftertext and allaftertext subject modifiers work as described
|
||||||
below. All other modifiers cause an error.
|
below. All other modifiers cause an error.
|
||||||
|
|
||||||
Testing the stack guard feature
|
Testing the stack guard feature
|
||||||
|
|
||||||
The /stackguard modifier is used to test the use of pcre2_set_com-
|
The /stackguard modifier is used to test the use of pcre2_set_com-
|
||||||
pile_recursion_guard(), a function that is provided to enable stack
|
pile_recursion_guard(), a function that is provided to enable stack
|
||||||
availability to be checked during compilation (see the pcre2api docu-
|
availability to be checked during compilation (see the pcre2api docu-
|
||||||
mentation for details). If the number specified by the modifier is
|
mentation for details). If the number specified by the modifier is
|
||||||
greater than zero, pcre2_set_compile_recursion_guard() is called to set
|
greater than zero, pcre2_set_compile_recursion_guard() is called to set
|
||||||
up a callback from pcre2_compile() to a local function. The argument it
|
up a callback from pcre2_compile() to a local function. The argument it
|
||||||
receives is the current nesting parenthesis depth; if this is greater
|
receives is the current nesting parenthesis depth; if this is greater
|
||||||
than the value given by the modifier, non-zero is returned, causing the
|
than the value given by the modifier, non-zero is returned, causing the
|
||||||
compilation to be aborted.
|
compilation to be aborted.
|
||||||
|
|
||||||
Using alternative character tables
|
Using alternative character tables
|
||||||
|
|
||||||
The value specified for the /tables modifier must be one of the digits
|
The value specified for the /tables modifier must be one of the digits
|
||||||
0, 1, or 2. It causes a specific set of built-in character tables to be
|
0, 1, or 2. It causes a specific set of built-in character tables to be
|
||||||
passed to pcre2_compile(). This is used in the PCRE2 tests to check be-
|
passed to pcre2_compile(). This is used in the PCRE2 tests to check be-
|
||||||
haviour with different character tables. The digit specifies the tables
|
haviour with different character tables. The digit specifies the tables
|
||||||
|
@ -732,15 +742,15 @@ PATTERN MODIFIERS
|
||||||
pcre2_chartables.c.dist
|
pcre2_chartables.c.dist
|
||||||
2 a set of tables defining ISO 8859 characters
|
2 a set of tables defining ISO 8859 characters
|
||||||
|
|
||||||
In table 2, some characters whose codes are greater than 128 are iden-
|
In table 2, some characters whose codes are greater than 128 are iden-
|
||||||
tified as letters, digits, spaces, etc. Setting alternate character
|
tified as letters, digits, spaces, etc. Setting alternate character
|
||||||
tables and a locale are mutually exclusive.
|
tables and a locale are mutually exclusive.
|
||||||
|
|
||||||
Setting certain match controls
|
Setting certain match controls
|
||||||
|
|
||||||
The following modifiers are really subject modifiers, and are described
|
The following modifiers are really subject modifiers, and are described
|
||||||
below. However, they may be included in a pattern's modifier list, in
|
below. However, they may be included in a pattern's modifier list, in
|
||||||
which case they are applied to every subject line that is processed
|
which case they are applied to every subject line that is processed
|
||||||
with that pattern. They do not affect the compilation process.
|
with that pattern. They do not affect the compilation process.
|
||||||
|
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
|
@ -752,20 +762,20 @@ PATTERN MODIFIERS
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show starting character when relevant
|
startchar show starting character when relevant
|
||||||
|
|
||||||
These modifiers may not appear in a #pattern command. If you want them
|
These modifiers may not appear in a #pattern command. If you want them
|
||||||
as defaults, set them in a #subject command.
|
as defaults, set them in a #subject command.
|
||||||
|
|
||||||
Saving a compiled pattern
|
Saving a compiled pattern
|
||||||
|
|
||||||
When a pattern with the push modifier is successfully compiled, it is
|
When a pattern with the push modifier is successfully compiled, it is
|
||||||
pushed onto a stack of compiled patterns, and pcre2test expects the
|
pushed onto a stack of compiled patterns, and pcre2test expects the
|
||||||
next line to contain a new pattern (or a command) instead of a subject
|
next line to contain a new pattern (or a command) instead of a subject
|
||||||
line. This facility is used when saving compiled patterns to a file, as
|
line. This facility is used when saving compiled patterns to a file, as
|
||||||
described in the section entitled "Saving and restoring compiled pat-
|
described in the section entitled "Saving and restoring compiled pat-
|
||||||
terns" below. The push modifier is incompatible with compilation modi-
|
terns" below. The push modifier is incompatible with compilation modi-
|
||||||
fiers such as global that act at match time. Any that are specified are
|
fiers such as global that act at match time. Any that are specified are
|
||||||
ignored, with a warning message, except for replace, which causes an
|
ignored, with a warning message, except for replace, which causes an
|
||||||
error. Note that jitverify, which is allowed, does not carry through
|
error. Note that jitverify, which is allowed, does not carry through
|
||||||
to any subsequent matching that uses this pattern.
|
to any subsequent matching that uses this pattern.
|
||||||
|
|
||||||
|
|
||||||
|
@ -776,7 +786,7 @@ SUBJECT MODIFIERS
|
||||||
|
|
||||||
Setting match options
|
Setting match options
|
||||||
|
|
||||||
The following modifiers set options for pcre2_match() or
|
The following modifiers set options for pcre2_match() or
|
||||||
pcre2_dfa_match(). See pcre2api for a description of their effects.
|
pcre2_dfa_match(). See pcre2api for a description of their effects.
|
||||||
|
|
||||||
anchored set PCRE2_ANCHORED
|
anchored set PCRE2_ANCHORED
|
||||||
|
@ -790,20 +800,20 @@ SUBJECT MODIFIERS
|
||||||
partial_hard (or ph) set PCRE2_PARTIAL_HARD
|
partial_hard (or ph) set PCRE2_PARTIAL_HARD
|
||||||
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
|
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
|
||||||
|
|
||||||
The partial matching modifiers are provided with abbreviations because
|
The partial matching modifiers are provided with abbreviations because
|
||||||
they appear frequently in tests.
|
they appear frequently in tests.
|
||||||
|
|
||||||
If the /posix modifier was present on the pattern, causing the POSIX
|
If the /posix modifier was present on the pattern, causing the POSIX
|
||||||
wrapper API to be used, the only option-setting modifiers that have any
|
wrapper API to be used, the only option-setting modifiers that have any
|
||||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||||
Any other modifiers cause an error.
|
Any other modifiers cause an error.
|
||||||
|
|
||||||
Setting match controls
|
Setting match controls
|
||||||
|
|
||||||
The following modifiers affect the matching process or request addi-
|
The following modifiers affect the matching process or request addi-
|
||||||
tional information. Some of them may also be specified on a pattern
|
tional information. Some of them may also be specified on a pattern
|
||||||
line (see above), in which case they apply to every subject line that
|
line (see above), in which case they apply to every subject line that
|
||||||
is matched against that pattern.
|
is matched against that pattern.
|
||||||
|
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
|
@ -823,9 +833,11 @@ SUBJECT MODIFIERS
|
||||||
/g global global matching
|
/g global global matching
|
||||||
jitstack=<n> set size of JIT stack
|
jitstack=<n> set size of JIT stack
|
||||||
mark show mark values
|
mark show mark values
|
||||||
match_limit=>n> set a match limit
|
match_limit=<n> set a match limit
|
||||||
memory show memory usage
|
memory show memory usage
|
||||||
|
null_context match with a NULL context
|
||||||
offset=<n> set starting offset
|
offset=<n> set starting offset
|
||||||
|
offset_limit=<n> set offset limit
|
||||||
ovector=<n> set size of output vector
|
ovector=<n> set size of output vector
|
||||||
recursion_limit=<n> set a recursion limit
|
recursion_limit=<n> set a recursion limit
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
|
@ -836,23 +848,23 @@ SUBJECT MODIFIERS
|
||||||
|
|
||||||
Showing more text
|
Showing more text
|
||||||
|
|
||||||
The aftertext modifier requests that as well as outputting the part of
|
The aftertext modifier requests that as well as outputting the part of
|
||||||
the subject string that matched the entire pattern, pcre2test should in
|
the subject string that matched the entire pattern, pcre2test should in
|
||||||
addition output the remainder of the subject string. This is useful for
|
addition output the remainder of the subject string. This is useful for
|
||||||
tests where the subject contains multiple copies of the same substring.
|
tests where the subject contains multiple copies of the same substring.
|
||||||
The allaftertext modifier requests the same action for captured sub-
|
The allaftertext modifier requests the same action for captured sub-
|
||||||
strings as well as the main matched substring. In each case the remain-
|
strings as well as the main matched substring. In each case the remain-
|
||||||
der is output on the following line with a plus character following the
|
der is output on the following line with a plus character following the
|
||||||
capture number.
|
capture number.
|
||||||
|
|
||||||
The allusedtext modifier requests that all the text that was consulted
|
The allusedtext modifier requests that all the text that was consulted
|
||||||
during a successful pattern match by the interpreter should be shown.
|
during a successful pattern match by the interpreter should be shown.
|
||||||
This feature is not supported for JIT matching, and if requested with
|
This feature is not supported for JIT matching, and if requested with
|
||||||
JIT it is ignored (with a warning message). Setting this modifier
|
JIT it is ignored (with a warning message). Setting this modifier
|
||||||
affects the output if there is a lookbehind at the start of a match, or
|
affects the output if there is a lookbehind at the start of a match, or
|
||||||
a lookahead at the end, or if \K is used in the pattern. Characters
|
a lookahead at the end, or if \K is used in the pattern. Characters
|
||||||
that precede or follow the start and end of the actual match are indi-
|
that precede or follow the start and end of the actual match are indi-
|
||||||
cated in the output by '<' or '>' characters underneath them. Here is
|
cated in the output by '<' or '>' characters underneath them. Here is
|
||||||
an example:
|
an example:
|
||||||
|
|
||||||
re> /(?<=pqr)abc(?=xyz)/
|
re> /(?<=pqr)abc(?=xyz)/
|
||||||
|
@ -860,16 +872,16 @@ SUBJECT MODIFIERS
|
||||||
0: pqrabcxyz
|
0: pqrabcxyz
|
||||||
<<< >>>
|
<<< >>>
|
||||||
|
|
||||||
This shows that the matched string is "abc", with the preceding and
|
This shows that the matched string is "abc", with the preceding and
|
||||||
following strings "pqr" and "xyz" having been consulted during the
|
following strings "pqr" and "xyz" having been consulted during the
|
||||||
match (when processing the assertions).
|
match (when processing the assertions).
|
||||||
|
|
||||||
The startchar modifier requests that the starting character for the
|
The startchar modifier requests that the starting character for the
|
||||||
match be indicated, if it is different to the start of the matched
|
match be indicated, if it is different to the start of the matched
|
||||||
string. The only time when this occurs is when \K has been processed as
|
string. The only time when this occurs is when \K has been processed as
|
||||||
part of the match. In this situation, the output for the matched string
|
part of the match. In this situation, the output for the matched string
|
||||||
is displayed from the starting character instead of from the match
|
is displayed from the starting character instead of from the match
|
||||||
point, with circumflex characters under the earlier characters. For
|
point, with circumflex characters under the earlier characters. For
|
||||||
example:
|
example:
|
||||||
|
|
||||||
re> /abc\Kxyz/
|
re> /abc\Kxyz/
|
||||||
|
@ -877,7 +889,7 @@ SUBJECT MODIFIERS
|
||||||
0: abcxyz
|
0: abcxyz
|
||||||
^^^
|
^^^
|
||||||
|
|
||||||
Unlike allusedtext, the startchar modifier can be used with JIT. How-
|
Unlike allusedtext, the startchar modifier can be used with JIT. How-
|
||||||
ever, these two modifiers are mutually exclusive.
|
ever, these two modifiers are mutually exclusive.
|
||||||
|
|
||||||
Showing the value of all capture groups
|
Showing the value of all capture groups
|
||||||
|
@ -885,88 +897,88 @@ SUBJECT MODIFIERS
|
||||||
The allcaptures modifier requests that the values of all potential cap-
|
The allcaptures modifier requests that the values of all potential cap-
|
||||||
tured parentheses be output after a match. By default, only those up to
|
tured parentheses be output after a match. By default, only those up to
|
||||||
the highest one actually used in the match are output (corresponding to
|
the highest one actually used in the match are output (corresponding to
|
||||||
the return code from pcre2_match()). Groups that did not take part in
|
the return code from pcre2_match()). Groups that did not take part in
|
||||||
the match are output as "<unset>".
|
the match are output as "<unset>".
|
||||||
|
|
||||||
Testing callouts
|
Testing callouts
|
||||||
|
|
||||||
A callout function is supplied when pcre2test calls the library match-
|
A callout function is supplied when pcre2test calls the library match-
|
||||||
ing functions, unless callout_none is specified. If callout_capture is
|
ing functions, unless callout_none is specified. If callout_capture is
|
||||||
set, the current captured groups are output when a callout occurs.
|
set, the current captured groups are output when a callout occurs.
|
||||||
|
|
||||||
The callout_fail modifier can be given one or two numbers. If there is
|
The callout_fail modifier can be given one or two numbers. If there is
|
||||||
only one number, 1 is returned instead of 0 when a callout of that num-
|
only one number, 1 is returned instead of 0 when a callout of that num-
|
||||||
ber is reached. If two numbers are given, 1 is returned when callout
|
ber is reached. If two numbers are given, 1 is returned when callout
|
||||||
<n> is reached for the <m>th time. Note that callouts with string argu-
|
<n> is reached for the <m>th time. Note that callouts with string argu-
|
||||||
ments are always given the number zero. See "Callouts" below for a
|
ments are always given the number zero. See "Callouts" below for a
|
||||||
description of the output when a callout is taken.
|
description of the output when a callout is taken.
|
||||||
|
|
||||||
The callout_data modifier can be given an unsigned or a negative num-
|
The callout_data modifier can be given an unsigned or a negative num-
|
||||||
ber. This is set as the "user data" that is passed to the matching
|
ber. This is set as the "user data" that is passed to the matching
|
||||||
function, and passed back when the callout function is invoked. Any
|
function, and passed back when the callout function is invoked. Any
|
||||||
value other than zero is used as a return from pcre2test's callout
|
value other than zero is used as a return from pcre2test's callout
|
||||||
function.
|
function.
|
||||||
|
|
||||||
Finding all matches in a string
|
Finding all matches in a string
|
||||||
|
|
||||||
Searching for all possible matches within a subject can be requested by
|
Searching for all possible matches within a subject can be requested by
|
||||||
the global or /altglobal modifier. After finding a match, the matching
|
the global or /altglobal modifier. After finding a match, the matching
|
||||||
function is called again to search the remainder of the subject. The
|
function is called again to search the remainder of the subject. The
|
||||||
difference between global and altglobal is that the former uses the
|
difference between global and altglobal is that the former uses the
|
||||||
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
||||||
searching at a new point within the entire string (which is what Perl
|
searching at a new point within the entire string (which is what Perl
|
||||||
does), whereas the latter passes over a shortened subject. This makes a
|
does), whereas the latter passes over a shortened subject. This makes a
|
||||||
difference to the matching process if the pattern begins with a lookbe-
|
difference to the matching process if the pattern begins with a lookbe-
|
||||||
hind assertion (including \b or \B).
|
hind assertion (including \b or \B).
|
||||||
|
|
||||||
If an empty string is matched, the next match is done with the
|
If an empty string is matched, the next match is done with the
|
||||||
PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
|
PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
|
||||||
for another, non-empty, match at the same point in the subject. If this
|
for another, non-empty, match at the same point in the subject. If this
|
||||||
match fails, the start offset is advanced, and the normal match is
|
match fails, the start offset is advanced, and the normal match is
|
||||||
retried. This imitates the way Perl handles such cases when using the
|
retried. This imitates the way Perl handles such cases when using the
|
||||||
/g modifier or the split() function. Normally, the start offset is
|
/g modifier or the split() function. Normally, the start offset is
|
||||||
advanced by one character, but if the newline convention recognizes
|
advanced by one character, but if the newline convention recognizes
|
||||||
CRLF as a newline, and the current character is CR followed by LF, an
|
CRLF as a newline, and the current character is CR followed by LF, an
|
||||||
advance of two characters occurs.
|
advance of two characters occurs.
|
||||||
|
|
||||||
Testing substring extraction functions
|
Testing substring extraction functions
|
||||||
|
|
||||||
The copy and get modifiers can be used to test the pcre2_sub-
|
The copy and get modifiers can be used to test the pcre2_sub-
|
||||||
string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be
|
string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be
|
||||||
given more than once, and each can specify a group name or number, for
|
given more than once, and each can specify a group name or number, for
|
||||||
example:
|
example:
|
||||||
|
|
||||||
abcd\=copy=1,copy=3,get=G1
|
abcd\=copy=1,copy=3,get=G1
|
||||||
|
|
||||||
If the #subject command is used to set default copy and/or get lists,
|
If the #subject command is used to set default copy and/or get lists,
|
||||||
these can be unset by specifying a negative number to cancel all num-
|
these can be unset by specifying a negative number to cancel all num-
|
||||||
bered groups and an empty name to cancel all named groups.
|
bered groups and an empty name to cancel all named groups.
|
||||||
|
|
||||||
The getall modifier tests pcre2_substring_list_get(), which extracts
|
The getall modifier tests pcre2_substring_list_get(), which extracts
|
||||||
all captured substrings.
|
all captured substrings.
|
||||||
|
|
||||||
If the subject line is successfully matched, the substrings extracted
|
If the subject line is successfully matched, the substrings extracted
|
||||||
by the convenience functions are output with C, G, or L after the
|
by the convenience functions are output with C, G, or L after the
|
||||||
string number instead of a colon. This is in addition to the normal
|
string number instead of a colon. This is in addition to the normal
|
||||||
full list. The string length (that is, the return from the extraction
|
full list. The string length (that is, the return from the extraction
|
||||||
function) is given in parentheses after each substring, followed by the
|
function) is given in parentheses after each substring, followed by the
|
||||||
name when the extraction was by name.
|
name when the extraction was by name.
|
||||||
|
|
||||||
Testing the substitution function
|
Testing the substitution function
|
||||||
|
|
||||||
If the replace modifier is set, the pcre2_substitute() function is
|
If the replace modifier is set, the pcre2_substitute() function is
|
||||||
called instead of one of the matching functions. Unlike subject
|
called instead of one of the matching functions. Unlike subject
|
||||||
strings, pcre2test does not process replacement strings for escape
|
strings, pcre2test does not process replacement strings for escape
|
||||||
sequences. In UTF mode, a replacement string is checked to see if it is
|
sequences. In UTF mode, a replacement string is checked to see if it is
|
||||||
a valid UTF-8 string. If so, it is correctly converted to a UTF string
|
a valid UTF-8 string. If so, it is correctly converted to a UTF string
|
||||||
of the appropriate code unit width. If it is not a valid UTF-8 string,
|
of the appropriate code unit width. If it is not a valid UTF-8 string,
|
||||||
the individual code units are copied directly. This provides a means of
|
the individual code units are copied directly. This provides a means of
|
||||||
passing an invalid UTF-8 string for testing purposes.
|
passing an invalid UTF-8 string for testing purposes.
|
||||||
|
|
||||||
If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
|
If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
|
||||||
pcre2_substitute(). After a successful substitution, the modified
|
pcre2_substitute(). After a successful substitution, the modified
|
||||||
string is output, preceded by the number of replacements. This may be
|
string is output, preceded by the number of replacements. This may be
|
||||||
zero if there were no matches. Here is a simple example of a substitu-
|
zero if there were no matches. Here is a simple example of a substitu-
|
||||||
tion test:
|
tion test:
|
||||||
|
|
||||||
/abc/replace=xxx
|
/abc/replace=xxx
|
||||||
|
@ -975,11 +987,11 @@ SUBJECT MODIFIERS
|
||||||
=abc=abc=\=global
|
=abc=abc=\=global
|
||||||
2: =xxx=xxx=
|
2: =xxx=xxx=
|
||||||
|
|
||||||
Subject and replacement strings should be kept relatively short for
|
Subject and replacement strings should be kept relatively short for
|
||||||
substitution tests, as fixed-size buffers are used. To make it easy to
|
substitution tests, as fixed-size buffers are used. To make it easy to
|
||||||
test for buffer overflow, if the replacement string starts with a num-
|
test for buffer overflow, if the replacement string starts with a num-
|
||||||
ber in square brackets, that number is passed to pcre2_substitute() as
|
ber in square brackets, that number is passed to pcre2_substitute() as
|
||||||
the size of the output buffer, with the replacement string starting at
|
the size of the output buffer, with the replacement string starting at
|
||||||
the next character. Here is an example that tests the edge case:
|
the next character. Here is an example that tests the edge case:
|
||||||
|
|
||||||
/abc/
|
/abc/
|
||||||
|
@ -989,90 +1001,107 @@ SUBJECT MODIFIERS
|
||||||
Failed: error -47: no more memory
|
Failed: error -47: no more memory
|
||||||
|
|
||||||
A replacement string is ignored with POSIX and DFA matching. Specifying
|
A replacement string is ignored with POSIX and DFA matching. Specifying
|
||||||
partial matching provokes an error return ("bad option value") from
|
partial matching provokes an error return ("bad option value") from
|
||||||
pcre2_substitute().
|
pcre2_substitute().
|
||||||
|
|
||||||
Setting the JIT stack size
|
Setting the JIT stack size
|
||||||
|
|
||||||
The jitstack modifier provides a way of setting the maximum stack size
|
The jitstack modifier provides a way of setting the maximum stack size
|
||||||
that is used by the just-in-time optimization code. It is ignored if
|
that is used by the just-in-time optimization code. It is ignored if
|
||||||
JIT optimization is not being used. The value is a number of kilobytes.
|
JIT optimization is not being used. The value is a number of kilobytes.
|
||||||
Providing a stack that is larger than the default 32K is necessary only
|
Providing a stack that is larger than the default 32K is necessary only
|
||||||
for very complicated patterns.
|
for very complicated patterns.
|
||||||
|
|
||||||
Setting match and recursion limits
|
Setting match and recursion limits
|
||||||
|
|
||||||
The match_limit and recursion_limit modifiers set the appropriate lim-
|
The match_limit and recursion_limit modifiers set the appropriate lim-
|
||||||
its in the match context. These values are ignored when the find_limits
|
its in the match context. These values are ignored when the find_limits
|
||||||
modifier is specified.
|
modifier is specified.
|
||||||
|
|
||||||
Finding minimum limits
|
Finding minimum limits
|
||||||
|
|
||||||
If the find_limits modifier is present, pcre2test calls pcre2_match()
|
If the find_limits modifier is present, pcre2test calls pcre2_match()
|
||||||
several times, setting different values in the match context via
|
several times, setting different values in the match context via
|
||||||
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
|
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
|
||||||
the minimum values for each parameter that allow pcre2_match() to com-
|
the minimum values for each parameter that allow pcre2_match() to com-
|
||||||
plete without error.
|
plete without error.
|
||||||
|
|
||||||
If JIT is being used, only the match limit is relevant. If DFA matching
|
If JIT is being used, only the match limit is relevant. If DFA matching
|
||||||
is being used, neither limit is relevant, and this modifier is ignored
|
is being used, neither limit is relevant, and this modifier is ignored
|
||||||
(with a warning message).
|
(with a warning message).
|
||||||
|
|
||||||
The match_limit number is a measure of the amount of backtracking that
|
The match_limit number is a measure of the amount of backtracking that
|
||||||
takes place, and learning the minimum value can be instructive. For
|
takes place, and learning the minimum value can be instructive. For
|
||||||
most simple matches, the number is quite small, but for patterns with
|
most simple matches, the number is quite small, but for patterns with
|
||||||
very large numbers of matching possibilities, it can become large very
|
very large numbers of matching possibilities, it can become large very
|
||||||
quickly with increasing length of subject string. The
|
quickly with increasing length of subject string. The
|
||||||
recursion_limit number is a measure of how much stack (or, if
|
recursion_limit number is a measure of how much stack (or, if
|
||||||
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
|
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
|
||||||
complete the match attempt.
|
complete the match attempt.
|
||||||
|
|
||||||
Showing MARK names
|
Showing MARK names
|
||||||
|
|
||||||
|
|
||||||
The mark modifier causes the names from backtracking control verbs that
|
The mark modifier causes the names from backtracking control verbs that
|
||||||
are returned from calls to pcre2_match() to be displayed. If a mark is
|
are returned from calls to pcre2_match() to be displayed. If a mark is
|
||||||
returned for a match, non-match, or partial match, pcre2test shows it.
|
returned for a match, non-match, or partial match, pcre2test shows it.
|
||||||
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
|
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
|
||||||
it is added to the non-match message.
|
it is added to the non-match message.
|
||||||
|
|
||||||
Showing memory usage
|
Showing memory usage
|
||||||
|
|
||||||
The memory modifier causes pcre2test to log all memory allocation and
|
The memory modifier causes pcre2test to log all memory allocation and
|
||||||
freeing calls that occur during a match operation.
|
freeing calls that occur during a match operation.
|
||||||
|
|
||||||
Setting a starting offset
|
Setting a starting offset
|
||||||
|
|
||||||
The offset modifier sets an offset in the subject string at which
|
The offset modifier sets an offset in the subject string at which
|
||||||
matching starts. Its value is a number of code units, not characters.
|
matching starts. Its value is a number of code units, not characters.
|
||||||
|
|
||||||
|
Setting an offset limit
|
||||||
|
|
||||||
|
The offset_limit modifier sets a limit for unanchored matches. If a
|
||||||
|
match cannot be found starting at or before this offset in the subject,
|
||||||
|
a "no match" return is given. The data value is a number of code units,
|
||||||
|
not characters. When this modifier is used, the use_offset_limit modi-
|
||||||
|
fier must have been set for the pattern; if not, an error is generated.
|
||||||
|
|
||||||
Setting the size of the output vector
|
Setting the size of the output vector
|
||||||
|
|
||||||
The ovector modifier applies only to the subject line in which it
|
The ovector modifier applies only to the subject line in which it
|
||||||
appears, though of course it can also be used to set a default in a
|
appears, though of course it can also be used to set a default in a
|
||||||
#subject command. It specifies the number of pairs of offsets that are
|
#subject command. It specifies the number of pairs of offsets that are
|
||||||
available for storing matching information. The default is 15.
|
available for storing matching information. The default is 15.
|
||||||
|
|
||||||
A value of zero is useful when testing the POSIX API because it causes
|
A value of zero is useful when testing the POSIX API because it causes
|
||||||
regexec() to be called with a NULL capture vector. When not testing the
|
regexec() to be called with a NULL capture vector. When not testing the
|
||||||
POSIX API, a value of zero is used to cause pcre2_match_data_cre-
|
POSIX API, a value of zero is used to cause pcre2_match_data_cre-
|
||||||
ate_from_pattern() to be called, in order to create a match block of
|
ate_from_pattern() to be called, in order to create a match block of
|
||||||
exactly the right size for the pattern. (It is not possible to create a
|
exactly the right size for the pattern. (It is not possible to create a
|
||||||
match block with a zero-length ovector; there is always at least one
|
match block with a zero-length ovector; there is always at least one
|
||||||
pair of offsets.)
|
pair of offsets.)
|
||||||
|
|
||||||
Passing the subject as zero-terminated
|
Passing the subject as zero-terminated
|
||||||
|
|
||||||
By default, the subject string is passed to a native API matching func-
|
By default, the subject string is passed to a native API matching func-
|
||||||
tion with its correct length. In order to test the facility for passing
|
tion with its correct length. In order to test the facility for passing
|
||||||
a zero-terminated string, the zero_terminate modifier is provided. It
|
a zero-terminated string, the zero_terminate modifier is provided. It
|
||||||
causes the length to be passed as PCRE2_ZERO_TERMINATED. (When matching
|
causes the length to be passed as PCRE2_ZERO_TERMINATED. (When matching
|
||||||
via the POSIX interface, this modifier has no effect, as there is no
|
via the POSIX interface, this modifier has no effect, as there is no
|
||||||
facility for passing a length.)
|
facility for passing a length.)
|
||||||
|
|
||||||
When testing pcre2_substitute(), this modifier also has the effect of
|
When testing pcre2_substitute(), this modifier also has the effect of
|
||||||
passing the replacement string as zero-terminated.
|
passing the replacement string as zero-terminated.
|
||||||
|
|
||||||
|
Passing a NULL context
|
||||||
|
|
||||||
|
Normally, pcre2test passes a context block to pcre2_match(),
|
||||||
|
pcre2_dfa_match() or pcre2_jit_match(). If the null_context modifier is
|
||||||
|
set, however, NULL is passed. This is for testing that the matching
|
||||||
|
functions behave correctly in this case (they use default values). This
|
||||||
|
modifier cannot be used with the find_limits modifier or when testing
|
||||||
|
the substitution function.
|
||||||
|
|
||||||
|
|
||||||
THE ALTERNATIVE MATCHING FUNCTION
|
THE ALTERNATIVE MATCHING FUNCTION
|
||||||
|
|
||||||
|
@ -1398,5 +1427,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 14 September 2015
|
Last updated: 17 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2UNICODE 3 "18 August 2015" "PCRE2 10.21"
|
.TH PCRE2UNICODE 3 "16 October 2015" "PCRE2 10.21"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE - Perl-compatible regular expressions (revised API)
|
PCRE - Perl-compatible regular expressions (revised API)
|
||||||
.SH "UNICODE AND UTF SUPPORT"
|
.SH "UNICODE AND UTF SUPPORT"
|
||||||
|
@ -63,11 +63,12 @@ characters (see the description of \eC in the
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2pattern\fP
|
\fBpcre2pattern\fP
|
||||||
.\"
|
.\"
|
||||||
documentation). The use of \eC is not supported in the alternative matching
|
documentation). The use of \eC is not supported by the alternative matching
|
||||||
function \fBpcre2_dfa_match()\fP, nor is it supported in UTF mode by the JIT
|
function \fBpcre2_dfa_match()\fP when in UTF mode. Its use provokes a
|
||||||
optimization. If JIT optimization is requested for a UTF pattern that contains
|
match-time error. The JIT optimization also does not support \eC in UTF mode.
|
||||||
\eC, it will not succeed, and so the matching will be carried out by the normal
|
If JIT optimization is requested for a UTF pattern that contains \eC, it will
|
||||||
interpretive function.
|
not succeed, and so the matching will be carried out by the normal interpretive
|
||||||
|
function.
|
||||||
.P
|
.P
|
||||||
The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly test
|
The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly test
|
||||||
characters of any code value, but, by default, the characters that PCRE2
|
characters of any code value, but, by default, the characters that PCRE2
|
||||||
|
@ -262,6 +263,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 18 August 2015
|
Last updated: 16 October 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -182,6 +182,9 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||||
#define MAX_NAME_SIZE 32
|
#define MAX_NAME_SIZE 32
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
|
||||||
|
/* #undef NEVER_BACKSLASH_C */
|
||||||
|
|
||||||
/* The value of NEWLINE_DEFAULT determines the default newline character
|
/* The value of NEWLINE_DEFAULT determines the default newline character
|
||||||
sequence. PCRE2 client programs can override this by selecting other values
|
sequence. PCRE2 client programs can override this by selecting other values
|
||||||
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5
|
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5
|
||||||
|
|
|
@ -169,6 +169,9 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||||
overflow caused by enormously large patterns. */
|
overflow caused by enormously large patterns. */
|
||||||
#undef MAX_NAME_SIZE
|
#undef MAX_NAME_SIZE
|
||||||
|
|
||||||
|
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
|
||||||
|
#undef NEVER_BACKSLASH_C
|
||||||
|
|
||||||
/* The value of NEWLINE_DEFAULT determines the default newline character
|
/* The value of NEWLINE_DEFAULT determines the default newline character
|
||||||
sequence. PCRE2 client programs can override this by selecting other values
|
sequence. PCRE2 client programs can override this by selecting other values
|
||||||
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5
|
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5
|
||||||
|
|
|
@ -583,7 +583,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
|
||||||
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
|
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
|
||||||
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
|
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
|
||||||
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
|
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
|
||||||
ERR81, ERR82, ERR83, ERR84 };
|
ERR81, ERR82, ERR83, ERR84, ERR85 };
|
||||||
|
|
||||||
/* This is a table of start-of-pattern options such as (*UTF) and settings such
|
/* This is a table of start-of-pattern options such as (*UTF) and settings such
|
||||||
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
|
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
|
||||||
|
@ -7053,11 +7053,19 @@ for (;; ptr++)
|
||||||
|
|
||||||
/* The use of \C can be locked out. */
|
/* The use of \C can be locked out. */
|
||||||
|
|
||||||
|
#ifdef NEVER_BACKSLASH_C
|
||||||
|
else if (escape == ESC_C)
|
||||||
|
{
|
||||||
|
*errorcodeptr = ERR85;
|
||||||
|
goto FAILED;
|
||||||
|
}
|
||||||
|
#else
|
||||||
else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0)
|
else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0)
|
||||||
{
|
{
|
||||||
*errorcodeptr = ERR83;
|
*errorcodeptr = ERR83;
|
||||||
goto FAILED;
|
goto FAILED;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/* For the rest (including \X when Unicode properties are supported), we
|
/* For the rest (including \X when Unicode properties are supported), we
|
||||||
can obtain the OP value by negating the escape value in the default
|
can obtain the OP value by negating the escape value in the default
|
||||||
|
|
|
@ -168,6 +168,8 @@ static const char compile_error_texts[] =
|
||||||
"unrecognized string delimiter follows (?C\0"
|
"unrecognized string delimiter follows (?C\0"
|
||||||
"using \\C is disabled by the application\0"
|
"using \\C is disabled by the application\0"
|
||||||
"(?| and/or (?J: or (?x: parentheses are too deeply nested\0"
|
"(?| and/or (?J: or (?x: parentheses are too deeply nested\0"
|
||||||
|
/* 85 */
|
||||||
|
"using \\C is disabled in this PCRE2 library\0"
|
||||||
;
|
;
|
||||||
|
|
||||||
/* Match-time and UTF error texts are in the same format. */
|
/* Match-time and UTF error texts are in the same format. */
|
||||||
|
|
|
@ -106,7 +106,7 @@ static const int eint1[] = {
|
||||||
|
|
||||||
static const int eint2[] = {
|
static const int eint2[] = {
|
||||||
30, REG_ECTYPE, /* unknown POSIX class name */
|
30, REG_ECTYPE, /* unknown POSIX class name */
|
||||||
32, REG_INVARG, /* this version of PCRE does not have UTF or UCP support */
|
32, REG_INVARG, /* this version of PCRE2 does not have Unicode support */
|
||||||
37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */
|
37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */
|
||||||
56, REG_INVARG, /* internal error: unknown newline setting */
|
56, REG_INVARG, /* internal error: unknown newline setting */
|
||||||
};
|
};
|
||||||
|
|
|
@ -667,6 +667,12 @@ table itself easier to read. */
|
||||||
#define EBCDIC_NL 0
|
#define EBCDIC_NL 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef NEVER_BACKSLASH_C
|
||||||
|
#define BACKSLASH_C 0
|
||||||
|
#else
|
||||||
|
#define BACKSLASH_C 1
|
||||||
|
#endif
|
||||||
|
|
||||||
typedef struct coptstruct {
|
typedef struct coptstruct {
|
||||||
const char *name;
|
const char *name;
|
||||||
uint32_t type;
|
uint32_t type;
|
||||||
|
@ -681,16 +687,17 @@ enum { CONF_BSR,
|
||||||
};
|
};
|
||||||
|
|
||||||
static coptstruct coptlist[] = {
|
static coptstruct coptlist[] = {
|
||||||
{ "bsr", CONF_BSR, PCRE2_CONFIG_BSR },
|
{ "backslash-C", CONF_FIX, BACKSLASH_C },
|
||||||
{ "ebcdic", CONF_FIX, SUPPORT_EBCDIC },
|
{ "bsr", CONF_BSR, PCRE2_CONFIG_BSR },
|
||||||
{ "ebcdic-nl", CONF_FIZ, EBCDIC_NL },
|
{ "ebcdic", CONF_FIX, SUPPORT_EBCDIC },
|
||||||
{ "jit", CONF_INT, PCRE2_CONFIG_JIT },
|
{ "ebcdic-nl", CONF_FIZ, EBCDIC_NL },
|
||||||
{ "linksize", CONF_INT, PCRE2_CONFIG_LINKSIZE },
|
{ "jit", CONF_INT, PCRE2_CONFIG_JIT },
|
||||||
{ "newline", CONF_NL, PCRE2_CONFIG_NEWLINE },
|
{ "linksize", CONF_INT, PCRE2_CONFIG_LINKSIZE },
|
||||||
{ "pcre2-16", CONF_FIX, SUPPORT_16 },
|
{ "newline", CONF_NL, PCRE2_CONFIG_NEWLINE },
|
||||||
{ "pcre2-32", CONF_FIX, SUPPORT_32 },
|
{ "pcre2-16", CONF_FIX, SUPPORT_16 },
|
||||||
{ "pcre2-8", CONF_FIX, SUPPORT_8 },
|
{ "pcre2-32", CONF_FIX, SUPPORT_32 },
|
||||||
{ "unicode", CONF_INT, PCRE2_CONFIG_UNICODE }
|
{ "pcre2-8", CONF_FIX, SUPPORT_8 },
|
||||||
|
{ "unicode", CONF_INT, PCRE2_CONFIG_UNICODE }
|
||||||
};
|
};
|
||||||
|
|
||||||
#define COPTLISTCOUNT sizeof(coptlist)/sizeof(coptstruct)
|
#define COPTLISTCOUNT sizeof(coptlist)/sizeof(coptstruct)
|
||||||
|
@ -6467,6 +6474,7 @@ printf(" -b set default pattern control 'fullbincode'\n");
|
||||||
printf(" -C show PCRE2 compile-time options and exit\n");
|
printf(" -C show PCRE2 compile-time options and exit\n");
|
||||||
printf(" -C arg show a specific compile-time option and exit with its\n");
|
printf(" -C arg show a specific compile-time option and exit with its\n");
|
||||||
printf(" value if numeric (else 0). The arg can be:\n");
|
printf(" value if numeric (else 0). The arg can be:\n");
|
||||||
|
printf(" backslash-C use of \\C is enabled [0, 1]\n");
|
||||||
printf(" bsr \\R type [ANYCRLF, ANY]\n");
|
printf(" bsr \\R type [ANYCRLF, ANY]\n");
|
||||||
printf(" ebcdic compiled for EBCDIC character code [0,1]\n");
|
printf(" ebcdic compiled for EBCDIC character code [0,1]\n");
|
||||||
printf(" ebcdic-nl NL code if compiled for EBCDIC\n");
|
printf(" ebcdic-nl NL code if compiled for EBCDIC\n");
|
||||||
|
@ -6618,6 +6626,11 @@ print_newline_config(optval, FALSE);
|
||||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_BSR, &optval);
|
(void)PCRE2_CONFIG(PCRE2_CONFIG_BSR, &optval);
|
||||||
printf(" \\R matches %s\n", optval? "CR, LF, or CRLF only" :
|
printf(" \\R matches %s\n", optval? "CR, LF, or CRLF only" :
|
||||||
"all Unicode newlines");
|
"all Unicode newlines");
|
||||||
|
#ifdef NEVER_BACKSLASH_C
|
||||||
|
printf(" \\C is not supported\n");
|
||||||
|
#else
|
||||||
|
printf(" \\C is supported\n");
|
||||||
|
#endif
|
||||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_LINKSIZE, &optval);
|
(void)PCRE2_CONFIG(PCRE2_CONFIG_LINKSIZE, &optval);
|
||||||
printf(" Internal link size = %d\n", optval);
|
printf(" Internal link size = %d\n", optval);
|
||||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_PARENSLIMIT, &optval);
|
(void)PCRE2_CONFIG(PCRE2_CONFIG_PARENSLIMIT, &optval);
|
||||||
|
|
|
@ -1,46 +1,6 @@
|
||||||
# This set of tests is for UTF-8 support and Unicode property support, with
|
# This set of tests is for UTF-8 support and Unicode property support, with
|
||||||
# relevance only for the 8-bit library.
|
# relevance only for the 8-bit library.
|
||||||
|
|
||||||
/X(\C{3})/utf
|
|
||||||
X\x{1234}
|
|
||||||
|
|
||||||
/X(\C{4})/utf
|
|
||||||
X\x{1234}YZ
|
|
||||||
|
|
||||||
/X\C*/utf
|
|
||||||
XYZabcdce
|
|
||||||
|
|
||||||
/X\C*?/utf
|
|
||||||
XYZabcde
|
|
||||||
|
|
||||||
/X\C{3,5}/utf
|
|
||||||
Xabcdefg
|
|
||||||
X\x{1234}
|
|
||||||
X\x{1234}YZ
|
|
||||||
X\x{1234}\x{512}
|
|
||||||
X\x{1234}\x{512}YZ
|
|
||||||
|
|
||||||
/X\C{3,5}?/utf
|
|
||||||
Xabcdefg
|
|
||||||
X\x{1234}
|
|
||||||
X\x{1234}YZ
|
|
||||||
X\x{1234}\x{512}
|
|
||||||
|
|
||||||
/a\Cb/utf
|
|
||||||
aXb
|
|
||||||
a\nb
|
|
||||||
|
|
||||||
/a\C\Cb/utf
|
|
||||||
a\x{100}b
|
|
||||||
|
|
||||||
/ab\Cde/utf
|
|
||||||
abXde
|
|
||||||
|
|
||||||
/a\C\Cb/utf
|
|
||||||
a\x{100}b
|
|
||||||
\= Expect no match
|
|
||||||
a\x{12257}b
|
|
||||||
|
|
||||||
# The next 3 patterns have UTF-8 errors
|
# The next 3 patterns have UTF-8 errors
|
||||||
|
|
||||||
/[Ã]/utf
|
/[Ã]/utf
|
||||||
|
@ -212,21 +172,6 @@
|
||||||
|
|
||||||
/\x{212ab}/IB,utf
|
/\x{212ab}/IB,utf
|
||||||
|
|
||||||
# This one is here not because it's different to Perl, but because the way
|
|
||||||
# the captured single-byte is displayed. (In Perl it becomes a character, and you
|
|
||||||
# can't tell the difference.)
|
|
||||||
|
|
||||||
/X(\C)(.*)/utf
|
|
||||||
X\x{1234}
|
|
||||||
X\nabc
|
|
||||||
|
|
||||||
# This one is here because Perl gives out a grumbly error message (quite
|
|
||||||
# correctly, but that messes up comparisons).
|
|
||||||
|
|
||||||
/a\Cb/utf
|
|
||||||
\= Expect no match
|
|
||||||
a\x{100}b
|
|
||||||
|
|
||||||
/[^ab\xC0-\xF0]/IB,utf
|
/[^ab\xC0-\xF0]/IB,utf
|
||||||
\x{f1}
|
\x{f1}
|
||||||
\x{bf}
|
\x{bf}
|
||||||
|
|
|
@ -6,10 +6,6 @@
|
||||||
#forbid_utf
|
#forbid_utf
|
||||||
#newline_default LF ANY ANYCRLF
|
#newline_default LF ANY ANYCRLF
|
||||||
|
|
||||||
/a\Cb/
|
|
||||||
aXb
|
|
||||||
a\nb
|
|
||||||
|
|
||||||
/[^\x{c4}]/IB
|
/[^\x{c4}]/IB
|
||||||
|
|
||||||
/\x{100}/I
|
/\x{100}/I
|
||||||
|
@ -344,7 +340,7 @@
|
||||||
|
|
||||||
# Non-UTF characters
|
# Non-UTF characters
|
||||||
|
|
||||||
/\C{2,3}/
|
/.{2,3}/
|
||||||
\x{400000}\x{400001}\x{400002}\x{400003}
|
\x{400000}\x{400001}\x{400002}\x{400003}
|
||||||
|
|
||||||
/\x{400000}\x{800000}/IBi
|
/\x{400000}\x{800000}/IBi
|
||||||
|
|
|
@ -7,49 +7,6 @@
|
||||||
/abc/utf
|
/abc/utf
|
||||||
Ã]
|
Ã]
|
||||||
|
|
||||||
/X(\C{3})/utf
|
|
||||||
X\x{11234}Y
|
|
||||||
X\x{11234}YZ
|
|
||||||
|
|
||||||
/X(\C{4})/utf
|
|
||||||
X\x{11234}YZ
|
|
||||||
X\x{11234}YZW
|
|
||||||
|
|
||||||
/X\C*/utf
|
|
||||||
XYZabcdce
|
|
||||||
|
|
||||||
/X\C*?/utf
|
|
||||||
XYZabcde
|
|
||||||
|
|
||||||
/X\C{3,5}/utf
|
|
||||||
Xabcdefg
|
|
||||||
X\x{11234}Y
|
|
||||||
X\x{11234}YZ
|
|
||||||
X\x{11234}\x{512}
|
|
||||||
X\x{11234}\x{512}YZ
|
|
||||||
X\x{11234}\x{512}\x{11234}Z
|
|
||||||
|
|
||||||
/X\C{3,5}?/utf
|
|
||||||
Xabcdefg
|
|
||||||
X\x{11234}Y
|
|
||||||
X\x{11234}YZ
|
|
||||||
X\x{11234}\x{512}YZ
|
|
||||||
\= Expect no match
|
|
||||||
X\x{11234}
|
|
||||||
|
|
||||||
/a\Cb/utf
|
|
||||||
aXb
|
|
||||||
a\nb
|
|
||||||
|
|
||||||
/a\C\Cb/utf
|
|
||||||
a\x{12257}b
|
|
||||||
\= Expect no match
|
|
||||||
a\x{12257}\x{11234}b
|
|
||||||
a\x{100}b
|
|
||||||
|
|
||||||
/ab\Cde/utf
|
|
||||||
abXde
|
|
||||||
|
|
||||||
# Check maximum character size
|
# Check maximum character size
|
||||||
|
|
||||||
/\x{ffff}/IB,utf
|
/\x{ffff}/IB,utf
|
||||||
|
@ -90,16 +47,6 @@
|
||||||
|
|
||||||
/\x{212ab}/IB,utf
|
/\x{212ab}/IB,utf
|
||||||
|
|
||||||
# These two \C tests, copied from the UTF-8 input file, do not have any
|
|
||||||
# problems in 16 or 32 bits.
|
|
||||||
|
|
||||||
/X(\C)(.*)/utf
|
|
||||||
X\x{1234}
|
|
||||||
X\nabc
|
|
||||||
|
|
||||||
/a\Cb/utf
|
|
||||||
a\x{100}b
|
|
||||||
|
|
||||||
/[^ab\xC0-\xF0]/IB,utf
|
/[^ab\xC0-\xF0]/IB,utf
|
||||||
\x{f1}
|
\x{f1}
|
||||||
\x{bf}
|
\x{bf}
|
||||||
|
@ -336,9 +283,6 @@
|
||||||
|
|
||||||
/\o{4200000}/utf
|
/\o{4200000}/utf
|
||||||
|
|
||||||
/\C/utf
|
|
||||||
\x{110000}
|
|
||||||
|
|
||||||
/\x{100}*A/IB,utf
|
/\x{100}*A/IB,utf
|
||||||
A
|
A
|
||||||
|
|
||||||
|
@ -396,4 +340,7 @@
|
||||||
|
|
||||||
/\x{3a3}B/IBi,utf
|
/\x{3a3}B/IBi,utf
|
||||||
|
|
||||||
|
/./utf
|
||||||
|
\x{110000}
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -3739,41 +3739,40 @@
|
||||||
|
|
||||||
/[bcd]*a/B
|
/[bcd]*a/B
|
||||||
|
|
||||||
# A complete set of tests for auto-possessification of character types.
|
# A complete set of tests for auto-possessification of character types, but
|
||||||
|
# omitting \C because it might be disabled (it has its own tests).
|
||||||
|
|
||||||
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\C \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
|
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
|
||||||
|
|
||||||
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\C \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
|
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
|
||||||
|
|
||||||
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\C \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
|
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
|
||||||
|
|
||||||
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\C \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
|
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
|
||||||
|
|
||||||
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\C \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
|
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
|
||||||
|
|
||||||
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\C \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
|
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
|
||||||
|
|
||||||
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\C \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
|
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
|
||||||
|
|
||||||
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\C \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
|
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
|
||||||
|
|
||||||
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\C \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
|
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
|
||||||
|
|
||||||
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\C \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
|
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
|
||||||
|
|
||||||
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\C \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
|
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
|
||||||
|
|
||||||
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\C \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
|
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
|
||||||
|
|
||||||
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\C a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
|
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
|
||||||
|
|
||||||
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\C \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
|
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
|
||||||
|
|
||||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
|
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
|
||||||
|
|
||||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
|
/ \D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
|
||||||
|
|
||||||
/\D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \C+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
|
|
||||||
|
|
||||||
/(?=a+)a(a+)++a/B
|
/(?=a+)a(a+)++a/B
|
||||||
|
|
||||||
|
@ -4327,8 +4326,6 @@
|
||||||
|
|
||||||
/((?2){73}(?2))((?1))/info
|
/((?2){73}(?2))((?1))/info
|
||||||
|
|
||||||
/ab\Cde/never_backslash_c
|
|
||||||
|
|
||||||
/abc/
|
/abc/
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
\[9x!xxx(]{9999}
|
\[9x!xxx(]{9999}
|
||||||
|
@ -4446,12 +4443,6 @@
|
||||||
/\x0{ab}/
|
/\x0{ab}/
|
||||||
\0{ab}
|
\0{ab}
|
||||||
|
|
||||||
/ab\Cde/
|
|
||||||
abXde
|
|
||||||
|
|
||||||
/(?<=ab\Cde)X/
|
|
||||||
abZdeX
|
|
||||||
|
|
||||||
/^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/
|
/^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/
|
||||||
ababababbbabZXXXX
|
ababababbbabZXXXX
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
# These are tests of \C that do not involve UTF. They are not run when \C is
|
||||||
|
# disabled by compiling with --enable-never-backslash-C.
|
||||||
|
|
||||||
|
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
|
||||||
|
|
||||||
|
/\D+\C \d+\C \S+\C \s+\C \W+\C \w+\C .+\C \R+\C \H+\C \h+\C \V+\C \v+\C a+\C \n+\C \C+\C/Bx
|
||||||
|
|
||||||
|
/ab\Cde/never_backslash_c
|
||||||
|
|
||||||
|
/ab\Cde/
|
||||||
|
abXde
|
||||||
|
|
||||||
|
/(?<=ab\Cde)X/
|
||||||
|
abZdeX
|
||||||
|
|
||||||
|
# End of testinput21
|
|
@ -0,0 +1,95 @@
|
||||||
|
# Tests of \C when Unicode support is available. Note that \C is not supported
|
||||||
|
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
|
||||||
|
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
|
||||||
|
# in some widths and not in others.
|
||||||
|
|
||||||
|
/ab\Cde/utf
|
||||||
|
abXde
|
||||||
|
|
||||||
|
# This should produce an error diagnostic (\C in UTF lookbehind)
|
||||||
|
|
||||||
|
/(?<=ab\Cde)X/utf
|
||||||
|
|
||||||
|
# Autopossessification tests
|
||||||
|
|
||||||
|
/\C+\X \X+\C/Bx
|
||||||
|
|
||||||
|
/\C+\X \X+\C/Bx,utf
|
||||||
|
|
||||||
|
/\C\X*TӅ;
|
||||||
|
{0,6}\v+
F
|
||||||
|
/utf
|
||||||
|
\= Expect no match
|
||||||
|
Ӆ\x0a
|
||||||
|
|
||||||
|
/\C(\W?ſ)'?{{/utf
|
||||||
|
\= Expect no match
|
||||||
|
\\C(\\W?ſ)'?{{
|
||||||
|
|
||||||
|
/X(\C{3})/utf
|
||||||
|
X\x{1234}
|
||||||
|
X\x{11234}Y
|
||||||
|
X\x{11234}YZ
|
||||||
|
|
||||||
|
/X(\C{4})/utf
|
||||||
|
X\x{1234}YZ
|
||||||
|
X\x{11234}YZ
|
||||||
|
X\x{11234}YZW
|
||||||
|
|
||||||
|
/X\C*/utf
|
||||||
|
XYZabcdce
|
||||||
|
|
||||||
|
/X\C*?/utf
|
||||||
|
XYZabcde
|
||||||
|
|
||||||
|
/X\C{3,5}/utf
|
||||||
|
Xabcdefg
|
||||||
|
X\x{1234}
|
||||||
|
X\x{1234}YZ
|
||||||
|
X\x{1234}\x{512}
|
||||||
|
X\x{1234}\x{512}YZ
|
||||||
|
X\x{11234}Y
|
||||||
|
X\x{11234}YZ
|
||||||
|
X\x{11234}\x{512}
|
||||||
|
X\x{11234}\x{512}YZ
|
||||||
|
X\x{11234}\x{512}\x{11234}Z
|
||||||
|
|
||||||
|
/X\C{3,5}?/utf
|
||||||
|
Xabcdefg
|
||||||
|
X\x{1234}
|
||||||
|
X\x{1234}YZ
|
||||||
|
X\x{1234}\x{512}
|
||||||
|
X\x{11234}Y
|
||||||
|
X\x{11234}YZ
|
||||||
|
X\x{11234}\x{512}YZ
|
||||||
|
X\x{11234}
|
||||||
|
|
||||||
|
/a\Cb/utf
|
||||||
|
aXb
|
||||||
|
a\nb
|
||||||
|
a\x{100}b
|
||||||
|
|
||||||
|
/a\C\Cb/utf
|
||||||
|
a\x{100}b
|
||||||
|
a\x{12257}b
|
||||||
|
a\x{12257}\x{11234}b
|
||||||
|
|
||||||
|
/ab\Cde/utf
|
||||||
|
abXde
|
||||||
|
|
||||||
|
# This one is here not because it's different to Perl, but because of the way
|
||||||
|
# the captured single code unit is displayed. (In Perl it becomes a character,
|
||||||
|
# and you can't tell the difference.)
|
||||||
|
|
||||||
|
/X(\C)(.*)/utf
|
||||||
|
X\x{1234}
|
||||||
|
X\nabc
|
||||||
|
|
||||||
|
# This one is here because Perl gives out a grumbly error message (quite
|
||||||
|
# correctly, but that messes up comparisons).
|
||||||
|
|
||||||
|
/a\Cb/utf
|
||||||
|
\= Expect no match in 8-bit mode
|
||||||
|
a\x{100}b
|
||||||
|
|
||||||
|
# End of testinput22
|
|
@ -0,0 +1,7 @@
|
||||||
|
# This test is run when PCRE2 has been built with --enable-never-backslash-C,
|
||||||
|
# which disables the use of \C. All we can do is check that it gives the
|
||||||
|
# correct error message.
|
||||||
|
|
||||||
|
/a\Cb/
|
||||||
|
|
||||||
|
# End of testinput23
|
|
@ -111,9 +111,6 @@
|
||||||
/.{3,5}?/IB,utf
|
/.{3,5}?/IB,utf
|
||||||
\x{212ab}\x{212ab}\x{212ab}\x{861}
|
\x{212ab}\x{212ab}\x{212ab}\x{861}
|
||||||
|
|
||||||
/(?<=\C)X/utf
|
|
||||||
Should produce an error diagnostic
|
|
||||||
|
|
||||||
/^[ab]/IB,utf
|
/^[ab]/IB,utf
|
||||||
bar
|
bar
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
|
@ -1367,8 +1364,6 @@
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
aAz
|
aAz
|
||||||
|
|
||||||
/(?<=ab\Cde)X/utf
|
|
||||||
|
|
||||||
/\X/
|
/\X/
|
||||||
a\=ps
|
a\=ps
|
||||||
a\=ph
|
a\=ph
|
||||||
|
@ -1617,13 +1612,13 @@
|
||||||
|
|
||||||
/[\p{L}ab]{2,3}+/B,no_auto_possess
|
/[\p{L}ab]{2,3}+/B,no_auto_possess
|
||||||
|
|
||||||
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \C+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
|
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
|
||||||
|
|
||||||
/.+\X/Bsx
|
/.+\X/Bsx
|
||||||
|
|
||||||
/\X+$/Bmx
|
/\X+$/Bmx
|
||||||
|
|
||||||
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\C \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
|
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
|
||||||
|
|
||||||
/\d+\s{0,5}=\s*\S?=\w{0,4}\W*/B,utf,ucp
|
/\d+\s{0,5}=\s*\S?=\w{0,4}\W*/B,utf,ucp
|
||||||
|
|
||||||
|
@ -1665,16 +1660,6 @@
|
||||||
|
|
||||||
"(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
|
"(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
|
||||||
|
|
||||||
/\C\X*TӅ;
|
|
||||||
{0,6}\v+
F
|
|
||||||
/utf
|
|
||||||
\= Expect no match
|
|
||||||
Ӆ\x0a
|
|
||||||
|
|
||||||
/\C(\W?ſ)'?{{/utf
|
|
||||||
\= Expect no match
|
|
||||||
\\C(\\W?ſ)'?{{
|
|
||||||
|
|
||||||
/[\pS#moq]/
|
/[\pS#moq]/
|
||||||
=
|
=
|
||||||
|
|
||||||
|
|
|
@ -4645,12 +4645,6 @@
|
||||||
aaaa\=ovector=3
|
aaaa\=ovector=3
|
||||||
aaaa\=ovector=4
|
aaaa\=ovector=4
|
||||||
|
|
||||||
/ab\Cde/
|
|
||||||
abXde
|
|
||||||
|
|
||||||
/(?<=ab\Cde)X/
|
|
||||||
abZdeX
|
|
||||||
|
|
||||||
/^\R/
|
/^\R/
|
||||||
\r\=ps
|
\r\=ps
|
||||||
\r\=ph
|
\r\=ph
|
||||||
|
|
|
@ -671,11 +671,6 @@
|
||||||
the cat\=ps
|
the cat\=ps
|
||||||
the cat\=ph
|
the cat\=ph
|
||||||
|
|
||||||
/ab\Cde/utf
|
|
||||||
abXde
|
|
||||||
|
|
||||||
/(?<=ab\Cde)X/utf
|
|
||||||
|
|
||||||
/./newline=crlf,utf
|
/./newline=crlf,utf
|
||||||
\r\=ps
|
\r\=ps
|
||||||
\r\=ph
|
\r\=ph
|
||||||
|
|
|
@ -4,10 +4,8 @@
|
||||||
#forbid_utf
|
#forbid_utf
|
||||||
#newline_default lf any anycrlf
|
#newline_default lf any anycrlf
|
||||||
|
|
||||||
/a\Cb/
|
/ab/
|
||||||
aXb
|
\= Expect error message (too big char) and no match
|
||||||
a\nb
|
|
||||||
\= Expect no match and error message (too big char)
|
|
||||||
A\x{123}B
|
A\x{123}B
|
||||||
A\o{443}B
|
A\o{443}B
|
||||||
|
|
||||||
|
|
|
@ -1,67 +1,6 @@
|
||||||
# This set of tests is for UTF-8 support and Unicode property support, with
|
# This set of tests is for UTF-8 support and Unicode property support, with
|
||||||
# relevance only for the 8-bit library.
|
# relevance only for the 8-bit library.
|
||||||
|
|
||||||
/X(\C{3})/utf
|
|
||||||
X\x{1234}
|
|
||||||
0: X\x{1234}
|
|
||||||
1: \x{1234}
|
|
||||||
|
|
||||||
/X(\C{4})/utf
|
|
||||||
X\x{1234}YZ
|
|
||||||
0: X\x{1234}Y
|
|
||||||
1: \x{1234}Y
|
|
||||||
|
|
||||||
/X\C*/utf
|
|
||||||
XYZabcdce
|
|
||||||
0: XYZabcdce
|
|
||||||
|
|
||||||
/X\C*?/utf
|
|
||||||
XYZabcde
|
|
||||||
0: X
|
|
||||||
|
|
||||||
/X\C{3,5}/utf
|
|
||||||
Xabcdefg
|
|
||||||
0: Xabcde
|
|
||||||
X\x{1234}
|
|
||||||
0: X\x{1234}
|
|
||||||
X\x{1234}YZ
|
|
||||||
0: X\x{1234}YZ
|
|
||||||
X\x{1234}\x{512}
|
|
||||||
0: X\x{1234}\x{512}
|
|
||||||
X\x{1234}\x{512}YZ
|
|
||||||
0: X\x{1234}\x{512}
|
|
||||||
|
|
||||||
/X\C{3,5}?/utf
|
|
||||||
Xabcdefg
|
|
||||||
0: Xabc
|
|
||||||
X\x{1234}
|
|
||||||
0: X\x{1234}
|
|
||||||
X\x{1234}YZ
|
|
||||||
0: X\x{1234}
|
|
||||||
X\x{1234}\x{512}
|
|
||||||
0: X\x{1234}
|
|
||||||
|
|
||||||
/a\Cb/utf
|
|
||||||
aXb
|
|
||||||
0: aXb
|
|
||||||
a\nb
|
|
||||||
0: a\x{0a}b
|
|
||||||
|
|
||||||
/a\C\Cb/utf
|
|
||||||
a\x{100}b
|
|
||||||
0: a\x{100}b
|
|
||||||
|
|
||||||
/ab\Cde/utf
|
|
||||||
abXde
|
|
||||||
0: abXde
|
|
||||||
|
|
||||||
/a\C\Cb/utf
|
|
||||||
a\x{100}b
|
|
||||||
0: a\x{100}b
|
|
||||||
\= Expect no match
|
|
||||||
a\x{12257}b
|
|
||||||
No match
|
|
||||||
|
|
||||||
# The next 3 patterns have UTF-8 errors
|
# The next 3 patterns have UTF-8 errors
|
||||||
|
|
||||||
/[Ã]/utf
|
/[Ã]/utf
|
||||||
|
@ -511,28 +450,6 @@ First code unit = \xf0
|
||||||
Last code unit = \xab
|
Last code unit = \xab
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
# This one is here not because it's different to Perl, but because the way
|
|
||||||
# the captured single-byte is displayed. (In Perl it becomes a character, and you
|
|
||||||
# can't tell the difference.)
|
|
||||||
|
|
||||||
/X(\C)(.*)/utf
|
|
||||||
X\x{1234}
|
|
||||||
0: X\x{1234}
|
|
||||||
1: \x{e1}
|
|
||||||
2: \x{88}\x{b4}
|
|
||||||
X\nabc
|
|
||||||
0: X\x{0a}abc
|
|
||||||
1: \x{0a}
|
|
||||||
2: abc
|
|
||||||
|
|
||||||
# This one is here because Perl gives out a grumbly error message (quite
|
|
||||||
# correctly, but that messes up comparisons).
|
|
||||||
|
|
||||||
/a\Cb/utf
|
|
||||||
\= Expect no match
|
|
||||||
a\x{100}b
|
|
||||||
No match
|
|
||||||
|
|
||||||
/[^ab\xC0-\xF0]/IB,utf
|
/[^ab\xC0-\xF0]/IB,utf
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
|
|
|
@ -6,12 +6,6 @@
|
||||||
#forbid_utf
|
#forbid_utf
|
||||||
#newline_default LF ANY ANYCRLF
|
#newline_default LF ANY ANYCRLF
|
||||||
|
|
||||||
/a\Cb/
|
|
||||||
aXb
|
|
||||||
0: aXb
|
|
||||||
a\nb
|
|
||||||
0: a\x0ab
|
|
||||||
|
|
||||||
/[^\x{c4}]/IB
|
/[^\x{c4}]/IB
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
|
@ -582,7 +576,7 @@ Failed: error 134 at offset 11: character code point value in \x{} or \o{} is to
|
||||||
|
|
||||||
# Non-UTF characters
|
# Non-UTF characters
|
||||||
|
|
||||||
/\C{2,3}/
|
/.{2,3}/
|
||||||
\x{400000}\x{400001}\x{400002}\x{400003}
|
\x{400000}\x{400001}\x{400002}\x{400003}
|
||||||
** Character \x{400000} is greater than 0xffff and UTF-16 mode is not enabled.
|
** Character \x{400000} is greater than 0xffff and UTF-16 mode is not enabled.
|
||||||
** Truncation will probably give the wrong result.
|
** Truncation will probably give the wrong result.
|
||||||
|
|
|
@ -6,12 +6,6 @@
|
||||||
#forbid_utf
|
#forbid_utf
|
||||||
#newline_default LF ANY ANYCRLF
|
#newline_default LF ANY ANYCRLF
|
||||||
|
|
||||||
/a\Cb/
|
|
||||||
aXb
|
|
||||||
0: aXb
|
|
||||||
a\nb
|
|
||||||
0: a\x0ab
|
|
||||||
|
|
||||||
/[^\x{c4}]/IB
|
/[^\x{c4}]/IB
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
|
@ -583,7 +577,7 @@ Subject length lower bound = 2
|
||||||
|
|
||||||
# Non-UTF characters
|
# Non-UTF characters
|
||||||
|
|
||||||
/\C{2,3}/
|
/.{2,3}/
|
||||||
\x{400000}\x{400001}\x{400002}\x{400003}
|
\x{400000}\x{400001}\x{400002}\x{400003}
|
||||||
0: \x{400000}\x{400001}\x{400002}
|
0: \x{400000}\x{400001}\x{400002}
|
||||||
|
|
||||||
|
|
|
@ -9,76 +9,6 @@
|
||||||
Ã]
|
Ã]
|
||||||
** Failed: invalid UTF-8 string cannot be used as input in UTF mode
|
** Failed: invalid UTF-8 string cannot be used as input in UTF mode
|
||||||
|
|
||||||
/X(\C{3})/utf
|
|
||||||
X\x{11234}Y
|
|
||||||
0: X\x{11234}Y
|
|
||||||
1: \x{11234}Y
|
|
||||||
X\x{11234}YZ
|
|
||||||
0: X\x{11234}Y
|
|
||||||
1: \x{11234}Y
|
|
||||||
|
|
||||||
/X(\C{4})/utf
|
|
||||||
X\x{11234}YZ
|
|
||||||
0: X\x{11234}YZ
|
|
||||||
1: \x{11234}YZ
|
|
||||||
X\x{11234}YZW
|
|
||||||
0: X\x{11234}YZ
|
|
||||||
1: \x{11234}YZ
|
|
||||||
|
|
||||||
/X\C*/utf
|
|
||||||
XYZabcdce
|
|
||||||
0: XYZabcdce
|
|
||||||
|
|
||||||
/X\C*?/utf
|
|
||||||
XYZabcde
|
|
||||||
0: X
|
|
||||||
|
|
||||||
/X\C{3,5}/utf
|
|
||||||
Xabcdefg
|
|
||||||
0: Xabcde
|
|
||||||
X\x{11234}Y
|
|
||||||
0: X\x{11234}Y
|
|
||||||
X\x{11234}YZ
|
|
||||||
0: X\x{11234}YZ
|
|
||||||
X\x{11234}\x{512}
|
|
||||||
0: X\x{11234}\x{512}
|
|
||||||
X\x{11234}\x{512}YZ
|
|
||||||
0: X\x{11234}\x{512}YZ
|
|
||||||
X\x{11234}\x{512}\x{11234}Z
|
|
||||||
0: X\x{11234}\x{512}\x{11234}
|
|
||||||
|
|
||||||
/X\C{3,5}?/utf
|
|
||||||
Xabcdefg
|
|
||||||
0: Xabc
|
|
||||||
X\x{11234}Y
|
|
||||||
0: X\x{11234}Y
|
|
||||||
X\x{11234}YZ
|
|
||||||
0: X\x{11234}Y
|
|
||||||
X\x{11234}\x{512}YZ
|
|
||||||
0: X\x{11234}\x{512}
|
|
||||||
\= Expect no match
|
|
||||||
X\x{11234}
|
|
||||||
No match
|
|
||||||
|
|
||||||
/a\Cb/utf
|
|
||||||
aXb
|
|
||||||
0: aXb
|
|
||||||
a\nb
|
|
||||||
0: a\x{0a}b
|
|
||||||
|
|
||||||
/a\C\Cb/utf
|
|
||||||
a\x{12257}b
|
|
||||||
0: a\x{12257}b
|
|
||||||
\= Expect no match
|
|
||||||
a\x{12257}\x{11234}b
|
|
||||||
No match
|
|
||||||
a\x{100}b
|
|
||||||
No match
|
|
||||||
|
|
||||||
/ab\Cde/utf
|
|
||||||
abXde
|
|
||||||
0: abXde
|
|
||||||
|
|
||||||
# Check maximum character size
|
# Check maximum character size
|
||||||
|
|
||||||
/\x{ffff}/IB,utf
|
/\x{ffff}/IB,utf
|
||||||
|
@ -308,23 +238,6 @@ First code unit = \x{d844}
|
||||||
Last code unit = \x{deab}
|
Last code unit = \x{deab}
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
# These two \C tests, copied from the UTF-8 input file, do not have any
|
|
||||||
# problems in 16 or 32 bits.
|
|
||||||
|
|
||||||
/X(\C)(.*)/utf
|
|
||||||
X\x{1234}
|
|
||||||
0: X\x{1234}
|
|
||||||
1: \x{1234}
|
|
||||||
2:
|
|
||||||
X\nabc
|
|
||||||
0: X\x{0a}abc
|
|
||||||
1: \x{0a}
|
|
||||||
2: abc
|
|
||||||
|
|
||||||
/a\Cb/utf
|
|
||||||
a\x{100}b
|
|
||||||
0: a\x{100}b
|
|
||||||
|
|
||||||
/[^ab\xC0-\xF0]/IB,utf
|
/[^ab\xC0-\xF0]/IB,utf
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
|
@ -1127,10 +1040,6 @@ Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too
|
||||||
/\o{4200000}/utf
|
/\o{4200000}/utf
|
||||||
Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
|
Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
|
||||||
|
|
||||||
/\C/utf
|
|
||||||
\x{110000}
|
|
||||||
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
|
||||||
|
|
||||||
/\x{100}*A/IB,utf
|
/\x{100}*A/IB,utf
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
|
@ -1454,4 +1363,8 @@ Starting code units: \xff
|
||||||
Last code unit = 'B' (caseless)
|
Last code unit = 'B' (caseless)
|
||||||
Subject length lower bound = 2
|
Subject length lower bound = 2
|
||||||
|
|
||||||
|
/./utf
|
||||||
|
\x{110000}
|
||||||
|
** Failed: character \x{110000} is greater than 0x10ffff and so cannot be converted to UTF-16
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -9,74 +9,6 @@
|
||||||
Ã]
|
Ã]
|
||||||
** Failed: invalid UTF-8 string cannot be used as input in UTF mode
|
** Failed: invalid UTF-8 string cannot be used as input in UTF mode
|
||||||
|
|
||||||
/X(\C{3})/utf
|
|
||||||
X\x{11234}Y
|
|
||||||
No match
|
|
||||||
X\x{11234}YZ
|
|
||||||
0: X\x{11234}YZ
|
|
||||||
1: \x{11234}YZ
|
|
||||||
|
|
||||||
/X(\C{4})/utf
|
|
||||||
X\x{11234}YZ
|
|
||||||
No match
|
|
||||||
X\x{11234}YZW
|
|
||||||
0: X\x{11234}YZW
|
|
||||||
1: \x{11234}YZW
|
|
||||||
|
|
||||||
/X\C*/utf
|
|
||||||
XYZabcdce
|
|
||||||
0: XYZabcdce
|
|
||||||
|
|
||||||
/X\C*?/utf
|
|
||||||
XYZabcde
|
|
||||||
0: X
|
|
||||||
|
|
||||||
/X\C{3,5}/utf
|
|
||||||
Xabcdefg
|
|
||||||
0: Xabcde
|
|
||||||
X\x{11234}Y
|
|
||||||
No match
|
|
||||||
X\x{11234}YZ
|
|
||||||
0: X\x{11234}YZ
|
|
||||||
X\x{11234}\x{512}
|
|
||||||
No match
|
|
||||||
X\x{11234}\x{512}YZ
|
|
||||||
0: X\x{11234}\x{512}YZ
|
|
||||||
X\x{11234}\x{512}\x{11234}Z
|
|
||||||
0: X\x{11234}\x{512}\x{11234}Z
|
|
||||||
|
|
||||||
/X\C{3,5}?/utf
|
|
||||||
Xabcdefg
|
|
||||||
0: Xabc
|
|
||||||
X\x{11234}Y
|
|
||||||
No match
|
|
||||||
X\x{11234}YZ
|
|
||||||
0: X\x{11234}YZ
|
|
||||||
X\x{11234}\x{512}YZ
|
|
||||||
0: X\x{11234}\x{512}Y
|
|
||||||
\= Expect no match
|
|
||||||
X\x{11234}
|
|
||||||
No match
|
|
||||||
|
|
||||||
/a\Cb/utf
|
|
||||||
aXb
|
|
||||||
0: aXb
|
|
||||||
a\nb
|
|
||||||
0: a\x{0a}b
|
|
||||||
|
|
||||||
/a\C\Cb/utf
|
|
||||||
a\x{12257}b
|
|
||||||
No match
|
|
||||||
\= Expect no match
|
|
||||||
a\x{12257}\x{11234}b
|
|
||||||
0: a\x{12257}\x{11234}b
|
|
||||||
a\x{100}b
|
|
||||||
No match
|
|
||||||
|
|
||||||
/ab\Cde/utf
|
|
||||||
abXde
|
|
||||||
0: abXde
|
|
||||||
|
|
||||||
# Check maximum character size
|
# Check maximum character size
|
||||||
|
|
||||||
/\x{ffff}/IB,utf
|
/\x{ffff}/IB,utf
|
||||||
|
@ -301,23 +233,6 @@ Options: utf
|
||||||
First code unit = \x{212ab}
|
First code unit = \x{212ab}
|
||||||
Subject length lower bound = 1
|
Subject length lower bound = 1
|
||||||
|
|
||||||
# These two \C tests, copied from the UTF-8 input file, do not have any
|
|
||||||
# problems in 16 or 32 bits.
|
|
||||||
|
|
||||||
/X(\C)(.*)/utf
|
|
||||||
X\x{1234}
|
|
||||||
0: X\x{1234}
|
|
||||||
1: \x{1234}
|
|
||||||
2:
|
|
||||||
X\nabc
|
|
||||||
0: X\x{0a}abc
|
|
||||||
1: \x{0a}
|
|
||||||
2: abc
|
|
||||||
|
|
||||||
/a\Cb/utf
|
|
||||||
a\x{100}b
|
|
||||||
0: a\x{100}b
|
|
||||||
|
|
||||||
/[^ab\xC0-\xF0]/IB,utf
|
/[^ab\xC0-\xF0]/IB,utf
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
|
@ -1119,10 +1034,6 @@ Failed: error 134 at offset 9: character code point value in \x{} or \o{} is too
|
||||||
/\o{4200000}/utf
|
/\o{4200000}/utf
|
||||||
Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
|
Failed: error 134 at offset 10: character code point value in \x{} or \o{} is too large
|
||||||
|
|
||||||
/\C/utf
|
|
||||||
\x{110000}
|
|
||||||
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 0
|
|
||||||
|
|
||||||
/\x{100}*A/IB,utf
|
/\x{100}*A/IB,utf
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
|
@ -1446,4 +1357,8 @@ Starting code units: \xff
|
||||||
Last code unit = 'B' (caseless)
|
Last code unit = 'B' (caseless)
|
||||||
Subject length lower bound = 2
|
Subject length lower bound = 2
|
||||||
|
|
||||||
|
/./utf
|
||||||
|
\x{110000}
|
||||||
|
Failed: error -28: UTF-32 error: code points greater than 0x10ffff are not defined at offset 0
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -11948,9 +11948,10 @@ Subject length lower bound = 5
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
# A complete set of tests for auto-possessification of character types.
|
# A complete set of tests for auto-possessification of character types, but
|
||||||
|
# omitting \C because it might be disabled (it has its own tests).
|
||||||
|
|
||||||
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\C \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
|
/\D+\D \D+\d \D+\S \D+\s \D+\W \D+\w \D+. \D+\R \D+\H \D+\h \D+\V \D+\v \D+\Z \D+\z \D+$/Bx
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
\D+
|
\D+
|
||||||
|
@ -11968,8 +11969,6 @@ Subject length lower bound = 5
|
||||||
\D+
|
\D+
|
||||||
Any
|
Any
|
||||||
\D+
|
\D+
|
||||||
AllAny
|
|
||||||
\D+
|
|
||||||
\R
|
\R
|
||||||
\D+
|
\D+
|
||||||
\H
|
\H
|
||||||
|
@ -11989,7 +11988,7 @@ Subject length lower bound = 5
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\C \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
|
/\d+\D \d+\d \d+\S \d+\s \d+\W \d+\w \d+. \d+\R \d+\H \d+\h \d+\V \d+\v \d+\Z \d+\z \d+$/Bx
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
\d++
|
\d++
|
||||||
|
@ -12006,8 +12005,6 @@ Subject length lower bound = 5
|
||||||
\w
|
\w
|
||||||
\d+
|
\d+
|
||||||
Any
|
Any
|
||||||
\d+
|
|
||||||
AllAny
|
|
||||||
\d++
|
\d++
|
||||||
\R
|
\R
|
||||||
\d+
|
\d+
|
||||||
|
@ -12028,7 +12025,7 @@ Subject length lower bound = 5
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\C \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
|
/\S+\D \S+\d \S+\S \S+\s \S+\W \S+\w \S+. \S+\R \S+\H \S+\h \S+\V \S+\v \S+\Z \S+\z \S+$/Bx
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
\S+
|
\S+
|
||||||
|
@ -12045,8 +12042,6 @@ Subject length lower bound = 5
|
||||||
\w
|
\w
|
||||||
\S+
|
\S+
|
||||||
Any
|
Any
|
||||||
\S+
|
|
||||||
AllAny
|
|
||||||
\S++
|
\S++
|
||||||
\R
|
\R
|
||||||
\S+
|
\S+
|
||||||
|
@ -12067,7 +12062,7 @@ Subject length lower bound = 5
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\C \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
|
/\s+\D \s+\d \s+\S \s+\s \s+\W \s+\w \s+. \s+\R \s+\H \s+\h \s+\V \s+\v \s+\Z \s+\z \s+$/Bx
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
\s+
|
\s+
|
||||||
|
@ -12085,8 +12080,6 @@ Subject length lower bound = 5
|
||||||
\s+
|
\s+
|
||||||
Any
|
Any
|
||||||
\s+
|
\s+
|
||||||
AllAny
|
|
||||||
\s+
|
|
||||||
\R
|
\R
|
||||||
\s+
|
\s+
|
||||||
\H
|
\H
|
||||||
|
@ -12106,7 +12099,7 @@ Subject length lower bound = 5
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\C \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
|
/\W+\D \W+\d \W+\S \W+\s \W+\W \W+\w \W+. \W+\R \W+\H \W+\h \W+\V \W+\v \W+\Z \W+\z \W+$/Bx
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
\W+
|
\W+
|
||||||
|
@ -12124,8 +12117,6 @@ Subject length lower bound = 5
|
||||||
\W+
|
\W+
|
||||||
Any
|
Any
|
||||||
\W+
|
\W+
|
||||||
AllAny
|
|
||||||
\W+
|
|
||||||
\R
|
\R
|
||||||
\W+
|
\W+
|
||||||
\H
|
\H
|
||||||
|
@ -12145,7 +12136,7 @@ Subject length lower bound = 5
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\C \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
|
/\w+\D \w+\d \w+\S \w+\s \w+\W \w+\w \w+. \w+\R \w+\H \w+\h \w+\V \w+\v \w+\Z \w+\z \w+$/Bx
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
\w+
|
\w+
|
||||||
|
@ -12162,8 +12153,6 @@ Subject length lower bound = 5
|
||||||
\w
|
\w
|
||||||
\w+
|
\w+
|
||||||
Any
|
Any
|
||||||
\w+
|
|
||||||
AllAny
|
|
||||||
\w++
|
\w++
|
||||||
\R
|
\R
|
||||||
\w+
|
\w+
|
||||||
|
@ -12184,7 +12173,303 @@ Subject length lower bound = 5
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\C \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
|
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
\R+
|
||||||
|
\D
|
||||||
|
\R++
|
||||||
|
\d
|
||||||
|
\R+
|
||||||
|
\S
|
||||||
|
\R++
|
||||||
|
\s
|
||||||
|
\R+
|
||||||
|
\W
|
||||||
|
\R++
|
||||||
|
\w
|
||||||
|
\R++
|
||||||
|
Any
|
||||||
|
\R+
|
||||||
|
\R
|
||||||
|
\R+
|
||||||
|
\H
|
||||||
|
\R++
|
||||||
|
\h
|
||||||
|
\R+
|
||||||
|
\V
|
||||||
|
\R+
|
||||||
|
\v
|
||||||
|
\R+
|
||||||
|
\Z
|
||||||
|
\R++
|
||||||
|
\z
|
||||||
|
\R+
|
||||||
|
$
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
\H+
|
||||||
|
\D
|
||||||
|
\H+
|
||||||
|
\d
|
||||||
|
\H+
|
||||||
|
\S
|
||||||
|
\H+
|
||||||
|
\s
|
||||||
|
\H+
|
||||||
|
\W
|
||||||
|
\H+
|
||||||
|
\w
|
||||||
|
\H+
|
||||||
|
Any
|
||||||
|
\H+
|
||||||
|
\R
|
||||||
|
\H+
|
||||||
|
\H
|
||||||
|
\H++
|
||||||
|
\h
|
||||||
|
\H+
|
||||||
|
\V
|
||||||
|
\H+
|
||||||
|
\v
|
||||||
|
\H+
|
||||||
|
\Z
|
||||||
|
\H++
|
||||||
|
\z
|
||||||
|
\H+
|
||||||
|
$
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
\h+
|
||||||
|
\D
|
||||||
|
\h++
|
||||||
|
\d
|
||||||
|
\h++
|
||||||
|
\S
|
||||||
|
\h+
|
||||||
|
\s
|
||||||
|
\h+
|
||||||
|
\W
|
||||||
|
\h++
|
||||||
|
\w
|
||||||
|
\h+
|
||||||
|
Any
|
||||||
|
\h++
|
||||||
|
\R
|
||||||
|
\h++
|
||||||
|
\H
|
||||||
|
\h+
|
||||||
|
\h
|
||||||
|
\h+
|
||||||
|
\V
|
||||||
|
\h++
|
||||||
|
\v
|
||||||
|
\h+
|
||||||
|
\Z
|
||||||
|
\h++
|
||||||
|
\z
|
||||||
|
\h+
|
||||||
|
$
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
\V+
|
||||||
|
\D
|
||||||
|
\V+
|
||||||
|
\d
|
||||||
|
\V+
|
||||||
|
\S
|
||||||
|
\V+
|
||||||
|
\s
|
||||||
|
\V+
|
||||||
|
\W
|
||||||
|
\V+
|
||||||
|
\w
|
||||||
|
\V+
|
||||||
|
Any
|
||||||
|
\V++
|
||||||
|
\R
|
||||||
|
\V+
|
||||||
|
\H
|
||||||
|
\V+
|
||||||
|
\h
|
||||||
|
\V+
|
||||||
|
\V
|
||||||
|
\V++
|
||||||
|
\v
|
||||||
|
\V+
|
||||||
|
\Z
|
||||||
|
\V++
|
||||||
|
\z
|
||||||
|
\V+
|
||||||
|
$
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
\v+
|
||||||
|
\D
|
||||||
|
\v++
|
||||||
|
\d
|
||||||
|
\v++
|
||||||
|
\S
|
||||||
|
\v+
|
||||||
|
\s
|
||||||
|
\v+
|
||||||
|
\W
|
||||||
|
\v++
|
||||||
|
\w
|
||||||
|
\v+
|
||||||
|
Any
|
||||||
|
\v+
|
||||||
|
\R
|
||||||
|
\v+
|
||||||
|
\H
|
||||||
|
\v++
|
||||||
|
\h
|
||||||
|
\v++
|
||||||
|
\V
|
||||||
|
\v+
|
||||||
|
\v
|
||||||
|
\v+
|
||||||
|
\Z
|
||||||
|
\v++
|
||||||
|
\z
|
||||||
|
\v+
|
||||||
|
$
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
a+
|
||||||
|
\D
|
||||||
|
a++
|
||||||
|
\d
|
||||||
|
a+
|
||||||
|
\S
|
||||||
|
a++
|
||||||
|
\s
|
||||||
|
a++
|
||||||
|
\W
|
||||||
|
a+
|
||||||
|
\w
|
||||||
|
a+
|
||||||
|
Any
|
||||||
|
a++
|
||||||
|
\R
|
||||||
|
a+
|
||||||
|
\H
|
||||||
|
a++
|
||||||
|
\h
|
||||||
|
a+
|
||||||
|
\V
|
||||||
|
a++
|
||||||
|
\v
|
||||||
|
a++
|
||||||
|
\Z
|
||||||
|
a++
|
||||||
|
\z
|
||||||
|
a++
|
||||||
|
$
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
\x0a+
|
||||||
|
\D
|
||||||
|
\x0a++
|
||||||
|
\d
|
||||||
|
\x0a++
|
||||||
|
\S
|
||||||
|
\x0a+
|
||||||
|
\s
|
||||||
|
\x0a+
|
||||||
|
\W
|
||||||
|
\x0a++
|
||||||
|
\w
|
||||||
|
\x0a+
|
||||||
|
Any
|
||||||
|
\x0a+
|
||||||
|
\R
|
||||||
|
\x0a+
|
||||||
|
\H
|
||||||
|
\x0a++
|
||||||
|
\h
|
||||||
|
\x0a++
|
||||||
|
\V
|
||||||
|
\x0a+
|
||||||
|
\v
|
||||||
|
\x0a+
|
||||||
|
\Z
|
||||||
|
\x0a++
|
||||||
|
\z
|
||||||
|
\x0a+
|
||||||
|
$
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
Any+
|
||||||
|
\D
|
||||||
|
Any+
|
||||||
|
\d
|
||||||
|
Any+
|
||||||
|
\S
|
||||||
|
Any+
|
||||||
|
\s
|
||||||
|
Any+
|
||||||
|
\W
|
||||||
|
Any+
|
||||||
|
\w
|
||||||
|
Any+
|
||||||
|
Any
|
||||||
|
Any++
|
||||||
|
\R
|
||||||
|
Any+
|
||||||
|
\H
|
||||||
|
Any+
|
||||||
|
\h
|
||||||
|
Any+
|
||||||
|
\V
|
||||||
|
Any+
|
||||||
|
\v
|
||||||
|
Any+
|
||||||
|
\Z
|
||||||
|
Any++
|
||||||
|
\z
|
||||||
|
Any+
|
||||||
|
$
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
AllAny+
|
AllAny+
|
||||||
|
@ -12200,8 +12485,6 @@ Subject length lower bound = 5
|
||||||
AllAny+
|
AllAny+
|
||||||
\w
|
\w
|
||||||
AllAny+
|
AllAny+
|
||||||
Any
|
|
||||||
AllAny+
|
|
||||||
AllAny
|
AllAny
|
||||||
AllAny+
|
AllAny+
|
||||||
\R
|
\R
|
||||||
|
@ -12223,358 +12506,7 @@ Subject length lower bound = 5
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/\R+\D \R+\d \R+\S \R+\s \R+\W \R+\w \R+. \R+\C \R+\R \R+\H \R+\h \R+\V \R+\v \R+\Z \R+\z \R+$/Bx
|
/ \D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
|
||||||
------------------------------------------------------------------
|
|
||||||
Bra
|
|
||||||
\R+
|
|
||||||
\D
|
|
||||||
\R++
|
|
||||||
\d
|
|
||||||
\R+
|
|
||||||
\S
|
|
||||||
\R++
|
|
||||||
\s
|
|
||||||
\R+
|
|
||||||
\W
|
|
||||||
\R++
|
|
||||||
\w
|
|
||||||
\R++
|
|
||||||
Any
|
|
||||||
\R+
|
|
||||||
AllAny
|
|
||||||
\R+
|
|
||||||
\R
|
|
||||||
\R+
|
|
||||||
\H
|
|
||||||
\R++
|
|
||||||
\h
|
|
||||||
\R+
|
|
||||||
\V
|
|
||||||
\R+
|
|
||||||
\v
|
|
||||||
\R+
|
|
||||||
\Z
|
|
||||||
\R++
|
|
||||||
\z
|
|
||||||
\R+
|
|
||||||
$
|
|
||||||
Ket
|
|
||||||
End
|
|
||||||
------------------------------------------------------------------
|
|
||||||
|
|
||||||
/\H+\D \H+\d \H+\S \H+\s \H+\W \H+\w \H+. \H+\C \H+\R \H+\H \H+\h \H+\V \H+\v \H+\Z \H+\z \H+$/Bx
|
|
||||||
------------------------------------------------------------------
|
|
||||||
Bra
|
|
||||||
\H+
|
|
||||||
\D
|
|
||||||
\H+
|
|
||||||
\d
|
|
||||||
\H+
|
|
||||||
\S
|
|
||||||
\H+
|
|
||||||
\s
|
|
||||||
\H+
|
|
||||||
\W
|
|
||||||
\H+
|
|
||||||
\w
|
|
||||||
\H+
|
|
||||||
Any
|
|
||||||
\H+
|
|
||||||
AllAny
|
|
||||||
\H+
|
|
||||||
\R
|
|
||||||
\H+
|
|
||||||
\H
|
|
||||||
\H++
|
|
||||||
\h
|
|
||||||
\H+
|
|
||||||
\V
|
|
||||||
\H+
|
|
||||||
\v
|
|
||||||
\H+
|
|
||||||
\Z
|
|
||||||
\H++
|
|
||||||
\z
|
|
||||||
\H+
|
|
||||||
$
|
|
||||||
Ket
|
|
||||||
End
|
|
||||||
------------------------------------------------------------------
|
|
||||||
|
|
||||||
/\h+\D \h+\d \h+\S \h+\s \h+\W \h+\w \h+. \h+\C \h+\R \h+\H \h+\h \h+\V \h+\v \h+\Z \h+\z \h+$/Bx
|
|
||||||
------------------------------------------------------------------
|
|
||||||
Bra
|
|
||||||
\h+
|
|
||||||
\D
|
|
||||||
\h++
|
|
||||||
\d
|
|
||||||
\h++
|
|
||||||
\S
|
|
||||||
\h+
|
|
||||||
\s
|
|
||||||
\h+
|
|
||||||
\W
|
|
||||||
\h++
|
|
||||||
\w
|
|
||||||
\h+
|
|
||||||
Any
|
|
||||||
\h+
|
|
||||||
AllAny
|
|
||||||
\h++
|
|
||||||
\R
|
|
||||||
\h++
|
|
||||||
\H
|
|
||||||
\h+
|
|
||||||
\h
|
|
||||||
\h+
|
|
||||||
\V
|
|
||||||
\h++
|
|
||||||
\v
|
|
||||||
\h+
|
|
||||||
\Z
|
|
||||||
\h++
|
|
||||||
\z
|
|
||||||
\h+
|
|
||||||
$
|
|
||||||
Ket
|
|
||||||
End
|
|
||||||
------------------------------------------------------------------
|
|
||||||
|
|
||||||
/\V+\D \V+\d \V+\S \V+\s \V+\W \V+\w \V+. \V+\C \V+\R \V+\H \V+\h \V+\V \V+\v \V+\Z \V+\z \V+$/Bx
|
|
||||||
------------------------------------------------------------------
|
|
||||||
Bra
|
|
||||||
\V+
|
|
||||||
\D
|
|
||||||
\V+
|
|
||||||
\d
|
|
||||||
\V+
|
|
||||||
\S
|
|
||||||
\V+
|
|
||||||
\s
|
|
||||||
\V+
|
|
||||||
\W
|
|
||||||
\V+
|
|
||||||
\w
|
|
||||||
\V+
|
|
||||||
Any
|
|
||||||
\V+
|
|
||||||
AllAny
|
|
||||||
\V++
|
|
||||||
\R
|
|
||||||
\V+
|
|
||||||
\H
|
|
||||||
\V+
|
|
||||||
\h
|
|
||||||
\V+
|
|
||||||
\V
|
|
||||||
\V++
|
|
||||||
\v
|
|
||||||
\V+
|
|
||||||
\Z
|
|
||||||
\V++
|
|
||||||
\z
|
|
||||||
\V+
|
|
||||||
$
|
|
||||||
Ket
|
|
||||||
End
|
|
||||||
------------------------------------------------------------------
|
|
||||||
|
|
||||||
/\v+\D \v+\d \v+\S \v+\s \v+\W \v+\w \v+. \v+\C \v+\R \v+\H \v+\h \v+\V \v+\v \v+\Z \v+\z \v+$/Bx
|
|
||||||
------------------------------------------------------------------
|
|
||||||
Bra
|
|
||||||
\v+
|
|
||||||
\D
|
|
||||||
\v++
|
|
||||||
\d
|
|
||||||
\v++
|
|
||||||
\S
|
|
||||||
\v+
|
|
||||||
\s
|
|
||||||
\v+
|
|
||||||
\W
|
|
||||||
\v++
|
|
||||||
\w
|
|
||||||
\v+
|
|
||||||
Any
|
|
||||||
\v+
|
|
||||||
AllAny
|
|
||||||
\v+
|
|
||||||
\R
|
|
||||||
\v+
|
|
||||||
\H
|
|
||||||
\v++
|
|
||||||
\h
|
|
||||||
\v++
|
|
||||||
\V
|
|
||||||
\v+
|
|
||||||
\v
|
|
||||||
\v+
|
|
||||||
\Z
|
|
||||||
\v++
|
|
||||||
\z
|
|
||||||
\v+
|
|
||||||
$
|
|
||||||
Ket
|
|
||||||
End
|
|
||||||
------------------------------------------------------------------
|
|
||||||
|
|
||||||
/ a+\D a+\d a+\S a+\s a+\W a+\w a+. a+\C a+\R a+\H a+\h a+\V a+\v a+\Z a+\z a+$/Bx
|
|
||||||
------------------------------------------------------------------
|
|
||||||
Bra
|
|
||||||
a+
|
|
||||||
\D
|
|
||||||
a++
|
|
||||||
\d
|
|
||||||
a+
|
|
||||||
\S
|
|
||||||
a++
|
|
||||||
\s
|
|
||||||
a++
|
|
||||||
\W
|
|
||||||
a+
|
|
||||||
\w
|
|
||||||
a+
|
|
||||||
Any
|
|
||||||
a+
|
|
||||||
AllAny
|
|
||||||
a++
|
|
||||||
\R
|
|
||||||
a+
|
|
||||||
\H
|
|
||||||
a++
|
|
||||||
\h
|
|
||||||
a+
|
|
||||||
\V
|
|
||||||
a++
|
|
||||||
\v
|
|
||||||
a++
|
|
||||||
\Z
|
|
||||||
a++
|
|
||||||
\z
|
|
||||||
a++
|
|
||||||
$
|
|
||||||
Ket
|
|
||||||
End
|
|
||||||
------------------------------------------------------------------
|
|
||||||
|
|
||||||
/\n+\D \n+\d \n+\S \n+\s \n+\W \n+\w \n+. \n+\C \n+\R \n+\H \n+\h \n+\V \n+\v \n+\Z \n+\z \n+$/Bx
|
|
||||||
------------------------------------------------------------------
|
|
||||||
Bra
|
|
||||||
\x0a+
|
|
||||||
\D
|
|
||||||
\x0a++
|
|
||||||
\d
|
|
||||||
\x0a++
|
|
||||||
\S
|
|
||||||
\x0a+
|
|
||||||
\s
|
|
||||||
\x0a+
|
|
||||||
\W
|
|
||||||
\x0a++
|
|
||||||
\w
|
|
||||||
\x0a+
|
|
||||||
Any
|
|
||||||
\x0a+
|
|
||||||
AllAny
|
|
||||||
\x0a+
|
|
||||||
\R
|
|
||||||
\x0a+
|
|
||||||
\H
|
|
||||||
\x0a++
|
|
||||||
\h
|
|
||||||
\x0a++
|
|
||||||
\V
|
|
||||||
\x0a+
|
|
||||||
\v
|
|
||||||
\x0a+
|
|
||||||
\Z
|
|
||||||
\x0a++
|
|
||||||
\z
|
|
||||||
\x0a+
|
|
||||||
$
|
|
||||||
Ket
|
|
||||||
End
|
|
||||||
------------------------------------------------------------------
|
|
||||||
|
|
||||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bx
|
|
||||||
------------------------------------------------------------------
|
|
||||||
Bra
|
|
||||||
Any+
|
|
||||||
\D
|
|
||||||
Any+
|
|
||||||
\d
|
|
||||||
Any+
|
|
||||||
\S
|
|
||||||
Any+
|
|
||||||
\s
|
|
||||||
Any+
|
|
||||||
\W
|
|
||||||
Any+
|
|
||||||
\w
|
|
||||||
Any+
|
|
||||||
Any
|
|
||||||
Any+
|
|
||||||
AllAny
|
|
||||||
Any++
|
|
||||||
\R
|
|
||||||
Any+
|
|
||||||
\H
|
|
||||||
Any+
|
|
||||||
\h
|
|
||||||
Any+
|
|
||||||
\V
|
|
||||||
Any+
|
|
||||||
\v
|
|
||||||
Any+
|
|
||||||
\Z
|
|
||||||
Any++
|
|
||||||
\z
|
|
||||||
Any+
|
|
||||||
$
|
|
||||||
Ket
|
|
||||||
End
|
|
||||||
------------------------------------------------------------------
|
|
||||||
|
|
||||||
/ .+\D .+\d .+\S .+\s .+\W .+\w .+. .+\C .+\R .+\H .+\h .+\V .+\v .+\Z .+\z .+$/Bsx
|
|
||||||
------------------------------------------------------------------
|
|
||||||
Bra
|
|
||||||
AllAny+
|
|
||||||
\D
|
|
||||||
AllAny+
|
|
||||||
\d
|
|
||||||
AllAny+
|
|
||||||
\S
|
|
||||||
AllAny+
|
|
||||||
\s
|
|
||||||
AllAny+
|
|
||||||
\W
|
|
||||||
AllAny+
|
|
||||||
\w
|
|
||||||
AllAny+
|
|
||||||
AllAny
|
|
||||||
AllAny+
|
|
||||||
AllAny
|
|
||||||
AllAny+
|
|
||||||
\R
|
|
||||||
AllAny+
|
|
||||||
\H
|
|
||||||
AllAny+
|
|
||||||
\h
|
|
||||||
AllAny+
|
|
||||||
\V
|
|
||||||
AllAny+
|
|
||||||
\v
|
|
||||||
AllAny+
|
|
||||||
\Z
|
|
||||||
AllAny++
|
|
||||||
\z
|
|
||||||
AllAny+
|
|
||||||
$
|
|
||||||
Ket
|
|
||||||
End
|
|
||||||
------------------------------------------------------------------
|
|
||||||
|
|
||||||
/\D+$ \d+$ \S+$ \s+$ \W+$ \w+$ \C+$ \R+$ \H+$ \h+$ \V+$ \v+$ a+$ \n+$ .+$ .+$/Bmx
|
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
\D+
|
\D+
|
||||||
|
@ -12588,8 +12520,6 @@ Subject length lower bound = 5
|
||||||
\W+
|
\W+
|
||||||
/m $
|
/m $
|
||||||
\w++
|
\w++
|
||||||
/m $
|
|
||||||
AllAny+
|
|
||||||
/m $
|
/m $
|
||||||
\R+
|
\R+
|
||||||
/m $
|
/m $
|
||||||
|
@ -14210,9 +14140,6 @@ Capturing subpattern count = 2
|
||||||
May match empty string
|
May match empty string
|
||||||
Subject length lower bound = 0
|
Subject length lower bound = 0
|
||||||
|
|
||||||
/ab\Cde/never_backslash_c
|
|
||||||
Failed: error 183 at offset 3: using \C is disabled by the application
|
|
||||||
|
|
||||||
/abc/
|
/abc/
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
\[9x!xxx(]{9999}
|
\[9x!xxx(]{9999}
|
||||||
|
@ -14532,14 +14459,6 @@ Subject length lower bound = 0
|
||||||
\0{ab}
|
\0{ab}
|
||||||
0: \x00{ab}
|
0: \x00{ab}
|
||||||
|
|
||||||
/ab\Cde/
|
|
||||||
abXde
|
|
||||||
0: abXde
|
|
||||||
|
|
||||||
/(?<=ab\Cde)X/
|
|
||||||
abZdeX
|
|
||||||
0: X
|
|
||||||
|
|
||||||
/^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/
|
/^(a(b))\1\g1\g{1}\g-1\g{-1}\g{-02}Z/
|
||||||
ababababbbabZXXXX
|
ababababbbabZXXXX
|
||||||
0: ababababbbabZ
|
0: ababababbbabZ
|
||||||
|
|
|
@ -0,0 +1,89 @@
|
||||||
|
# These are tests of \C that do not involve UTF. They are not run when \C is
|
||||||
|
# disabled by compiling with --enable-never-backslash-C.
|
||||||
|
|
||||||
|
/\C+\D \C+\d \C+\S \C+\s \C+\W \C+\w \C+. \C+\R \C+\H \C+\h \C+\V \C+\v \C+\Z \C+\z \C+$/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
AllAny+
|
||||||
|
\D
|
||||||
|
AllAny+
|
||||||
|
\d
|
||||||
|
AllAny+
|
||||||
|
\S
|
||||||
|
AllAny+
|
||||||
|
\s
|
||||||
|
AllAny+
|
||||||
|
\W
|
||||||
|
AllAny+
|
||||||
|
\w
|
||||||
|
AllAny+
|
||||||
|
Any
|
||||||
|
AllAny+
|
||||||
|
\R
|
||||||
|
AllAny+
|
||||||
|
\H
|
||||||
|
AllAny+
|
||||||
|
\h
|
||||||
|
AllAny+
|
||||||
|
\V
|
||||||
|
AllAny+
|
||||||
|
\v
|
||||||
|
AllAny+
|
||||||
|
\Z
|
||||||
|
AllAny++
|
||||||
|
\z
|
||||||
|
AllAny+
|
||||||
|
$
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/\D+\C \d+\C \S+\C \s+\C \W+\C \w+\C .+\C \R+\C \H+\C \h+\C \V+\C \v+\C a+\C \n+\C \C+\C/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
\D+
|
||||||
|
AllAny
|
||||||
|
\d+
|
||||||
|
AllAny
|
||||||
|
\S+
|
||||||
|
AllAny
|
||||||
|
\s+
|
||||||
|
AllAny
|
||||||
|
\W+
|
||||||
|
AllAny
|
||||||
|
\w+
|
||||||
|
AllAny
|
||||||
|
Any+
|
||||||
|
AllAny
|
||||||
|
\R+
|
||||||
|
AllAny
|
||||||
|
\H+
|
||||||
|
AllAny
|
||||||
|
\h+
|
||||||
|
AllAny
|
||||||
|
\V+
|
||||||
|
AllAny
|
||||||
|
\v+
|
||||||
|
AllAny
|
||||||
|
a+
|
||||||
|
AllAny
|
||||||
|
\x0a+
|
||||||
|
AllAny
|
||||||
|
AllAny+
|
||||||
|
AllAny
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/ab\Cde/never_backslash_c
|
||||||
|
Failed: error 183 at offset 3: using \C is disabled by the application
|
||||||
|
|
||||||
|
/ab\Cde/
|
||||||
|
abXde
|
||||||
|
0: abXde
|
||||||
|
|
||||||
|
/(?<=ab\Cde)X/
|
||||||
|
abZdeX
|
||||||
|
0: X
|
||||||
|
|
||||||
|
# End of testinput21
|
|
@ -0,0 +1,161 @@
|
||||||
|
# Tests of \C when Unicode support is available. Note that \C is not supported
|
||||||
|
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
|
||||||
|
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
|
||||||
|
# in some widths and not in others.
|
||||||
|
|
||||||
|
/ab\Cde/utf
|
||||||
|
abXde
|
||||||
|
0: abXde
|
||||||
|
|
||||||
|
# This should produce an error diagnostic (\C in UTF lookbehind)
|
||||||
|
|
||||||
|
/(?<=ab\Cde)X/utf
|
||||||
|
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||||
|
|
||||||
|
# Autopossessification tests
|
||||||
|
|
||||||
|
/\C+\X \X+\C/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
AllAny+
|
||||||
|
extuni
|
||||||
|
extuni+
|
||||||
|
AllAny
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/\C+\X \X+\C/Bx,utf
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
Anybyte+
|
||||||
|
extuni
|
||||||
|
extuni+
|
||||||
|
Anybyte
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/\C\X*TӅ;
|
||||||
|
{0,6}\v+
F
|
||||||
|
/utf
|
||||||
|
\= Expect no match
|
||||||
|
Ӆ\x0a
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\C(\W?ſ)'?{{/utf
|
||||||
|
\= Expect no match
|
||||||
|
\\C(\\W?ſ)'?{{
|
||||||
|
No match
|
||||||
|
|
||||||
|
/X(\C{3})/utf
|
||||||
|
X\x{1234}
|
||||||
|
No match
|
||||||
|
X\x{11234}Y
|
||||||
|
0: X\x{11234}Y
|
||||||
|
1: \x{11234}Y
|
||||||
|
X\x{11234}YZ
|
||||||
|
0: X\x{11234}Y
|
||||||
|
1: \x{11234}Y
|
||||||
|
|
||||||
|
/X(\C{4})/utf
|
||||||
|
X\x{1234}YZ
|
||||||
|
No match
|
||||||
|
X\x{11234}YZ
|
||||||
|
0: X\x{11234}YZ
|
||||||
|
1: \x{11234}YZ
|
||||||
|
X\x{11234}YZW
|
||||||
|
0: X\x{11234}YZ
|
||||||
|
1: \x{11234}YZ
|
||||||
|
|
||||||
|
/X\C*/utf
|
||||||
|
XYZabcdce
|
||||||
|
0: XYZabcdce
|
||||||
|
|
||||||
|
/X\C*?/utf
|
||||||
|
XYZabcde
|
||||||
|
0: X
|
||||||
|
|
||||||
|
/X\C{3,5}/utf
|
||||||
|
Xabcdefg
|
||||||
|
0: Xabcde
|
||||||
|
X\x{1234}
|
||||||
|
No match
|
||||||
|
X\x{1234}YZ
|
||||||
|
0: X\x{1234}YZ
|
||||||
|
X\x{1234}\x{512}
|
||||||
|
No match
|
||||||
|
X\x{1234}\x{512}YZ
|
||||||
|
0: X\x{1234}\x{512}YZ
|
||||||
|
X\x{11234}Y
|
||||||
|
0: X\x{11234}Y
|
||||||
|
X\x{11234}YZ
|
||||||
|
0: X\x{11234}YZ
|
||||||
|
X\x{11234}\x{512}
|
||||||
|
0: X\x{11234}\x{512}
|
||||||
|
X\x{11234}\x{512}YZ
|
||||||
|
0: X\x{11234}\x{512}YZ
|
||||||
|
X\x{11234}\x{512}\x{11234}Z
|
||||||
|
0: X\x{11234}\x{512}\x{11234}
|
||||||
|
|
||||||
|
/X\C{3,5}?/utf
|
||||||
|
Xabcdefg
|
||||||
|
0: Xabc
|
||||||
|
X\x{1234}
|
||||||
|
No match
|
||||||
|
X\x{1234}YZ
|
||||||
|
0: X\x{1234}YZ
|
||||||
|
X\x{1234}\x{512}
|
||||||
|
No match
|
||||||
|
X\x{11234}Y
|
||||||
|
0: X\x{11234}Y
|
||||||
|
X\x{11234}YZ
|
||||||
|
0: X\x{11234}Y
|
||||||
|
X\x{11234}\x{512}YZ
|
||||||
|
0: X\x{11234}\x{512}
|
||||||
|
X\x{11234}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/a\Cb/utf
|
||||||
|
aXb
|
||||||
|
0: aXb
|
||||||
|
a\nb
|
||||||
|
0: a\x{0a}b
|
||||||
|
a\x{100}b
|
||||||
|
0: a\x{100}b
|
||||||
|
|
||||||
|
/a\C\Cb/utf
|
||||||
|
a\x{100}b
|
||||||
|
No match
|
||||||
|
a\x{12257}b
|
||||||
|
0: a\x{12257}b
|
||||||
|
a\x{12257}\x{11234}b
|
||||||
|
No match
|
||||||
|
|
||||||
|
/ab\Cde/utf
|
||||||
|
abXde
|
||||||
|
0: abXde
|
||||||
|
|
||||||
|
# This one is here not because it's different to Perl, but because the way
|
||||||
|
# the captured single code unit is displayed. (In Perl it becomes a character,
|
||||||
|
# and you can't tell the difference.)
|
||||||
|
|
||||||
|
/X(\C)(.*)/utf
|
||||||
|
X\x{1234}
|
||||||
|
0: X\x{1234}
|
||||||
|
1: \x{1234}
|
||||||
|
2:
|
||||||
|
X\nabc
|
||||||
|
0: X\x{0a}abc
|
||||||
|
1: \x{0a}
|
||||||
|
2: abc
|
||||||
|
|
||||||
|
# This one is here because Perl gives out a grumbly error message (quite
|
||||||
|
# correctly, but that messes up comparisons).
|
||||||
|
|
||||||
|
/a\Cb/utf
|
||||||
|
\= Expect no match in 8-bit mode
|
||||||
|
a\x{100}b
|
||||||
|
0: a\x{100}b
|
||||||
|
|
||||||
|
# End of testinput22
|
|
@ -0,0 +1,159 @@
|
||||||
|
# Tests of \C when Unicode support is available. Note that \C is not supported
|
||||||
|
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
|
||||||
|
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
|
||||||
|
# in some widths and not in others.
|
||||||
|
|
||||||
|
/ab\Cde/utf
|
||||||
|
abXde
|
||||||
|
0: abXde
|
||||||
|
|
||||||
|
# This should produce an error diagnostic (\C in UTF lookbehind)
|
||||||
|
|
||||||
|
/(?<=ab\Cde)X/utf
|
||||||
|
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||||
|
|
||||||
|
# Autopossessification tests
|
||||||
|
|
||||||
|
/\C+\X \X+\C/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
AllAny+
|
||||||
|
extuni
|
||||||
|
extuni+
|
||||||
|
AllAny
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/\C+\X \X+\C/Bx,utf
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
Anybyte+
|
||||||
|
extuni
|
||||||
|
extuni+
|
||||||
|
Anybyte
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/\C\X*TӅ;
|
||||||
|
{0,6}\v+
F
|
||||||
|
/utf
|
||||||
|
\= Expect no match
|
||||||
|
Ӆ\x0a
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\C(\W?ſ)'?{{/utf
|
||||||
|
\= Expect no match
|
||||||
|
\\C(\\W?ſ)'?{{
|
||||||
|
No match
|
||||||
|
|
||||||
|
/X(\C{3})/utf
|
||||||
|
X\x{1234}
|
||||||
|
No match
|
||||||
|
X\x{11234}Y
|
||||||
|
No match
|
||||||
|
X\x{11234}YZ
|
||||||
|
0: X\x{11234}YZ
|
||||||
|
1: \x{11234}YZ
|
||||||
|
|
||||||
|
/X(\C{4})/utf
|
||||||
|
X\x{1234}YZ
|
||||||
|
No match
|
||||||
|
X\x{11234}YZ
|
||||||
|
No match
|
||||||
|
X\x{11234}YZW
|
||||||
|
0: X\x{11234}YZW
|
||||||
|
1: \x{11234}YZW
|
||||||
|
|
||||||
|
/X\C*/utf
|
||||||
|
XYZabcdce
|
||||||
|
0: XYZabcdce
|
||||||
|
|
||||||
|
/X\C*?/utf
|
||||||
|
XYZabcde
|
||||||
|
0: X
|
||||||
|
|
||||||
|
/X\C{3,5}/utf
|
||||||
|
Xabcdefg
|
||||||
|
0: Xabcde
|
||||||
|
X\x{1234}
|
||||||
|
No match
|
||||||
|
X\x{1234}YZ
|
||||||
|
0: X\x{1234}YZ
|
||||||
|
X\x{1234}\x{512}
|
||||||
|
No match
|
||||||
|
X\x{1234}\x{512}YZ
|
||||||
|
0: X\x{1234}\x{512}YZ
|
||||||
|
X\x{11234}Y
|
||||||
|
No match
|
||||||
|
X\x{11234}YZ
|
||||||
|
0: X\x{11234}YZ
|
||||||
|
X\x{11234}\x{512}
|
||||||
|
No match
|
||||||
|
X\x{11234}\x{512}YZ
|
||||||
|
0: X\x{11234}\x{512}YZ
|
||||||
|
X\x{11234}\x{512}\x{11234}Z
|
||||||
|
0: X\x{11234}\x{512}\x{11234}Z
|
||||||
|
|
||||||
|
/X\C{3,5}?/utf
|
||||||
|
Xabcdefg
|
||||||
|
0: Xabc
|
||||||
|
X\x{1234}
|
||||||
|
No match
|
||||||
|
X\x{1234}YZ
|
||||||
|
0: X\x{1234}YZ
|
||||||
|
X\x{1234}\x{512}
|
||||||
|
No match
|
||||||
|
X\x{11234}Y
|
||||||
|
No match
|
||||||
|
X\x{11234}YZ
|
||||||
|
0: X\x{11234}YZ
|
||||||
|
X\x{11234}\x{512}YZ
|
||||||
|
0: X\x{11234}\x{512}Y
|
||||||
|
X\x{11234}
|
||||||
|
No match
|
||||||
|
|
||||||
|
/a\Cb/utf
|
||||||
|
aXb
|
||||||
|
0: aXb
|
||||||
|
a\nb
|
||||||
|
0: a\x{0a}b
|
||||||
|
a\x{100}b
|
||||||
|
0: a\x{100}b
|
||||||
|
|
||||||
|
/a\C\Cb/utf
|
||||||
|
a\x{100}b
|
||||||
|
No match
|
||||||
|
a\x{12257}b
|
||||||
|
No match
|
||||||
|
a\x{12257}\x{11234}b
|
||||||
|
0: a\x{12257}\x{11234}b
|
||||||
|
|
||||||
|
/ab\Cde/utf
|
||||||
|
abXde
|
||||||
|
0: abXde
|
||||||
|
|
||||||
|
# This one is here not because it's different to Perl, but because the way
|
||||||
|
# the captured single code unit is displayed. (In Perl it becomes a character,
|
||||||
|
# and you can't tell the difference.)
|
||||||
|
|
||||||
|
/X(\C)(.*)/utf
|
||||||
|
X\x{1234}
|
||||||
|
0: X\x{1234}
|
||||||
|
1: \x{1234}
|
||||||
|
2:
|
||||||
|
X\nabc
|
||||||
|
0: X\x{0a}abc
|
||||||
|
1: \x{0a}
|
||||||
|
2: abc
|
||||||
|
|
||||||
|
# This one is here because Perl gives out a grumbly error message (quite
|
||||||
|
# correctly, but that messes up comparisons).
|
||||||
|
|
||||||
|
/a\Cb/utf
|
||||||
|
\= Expect no match in 8-bit mode
|
||||||
|
a\x{100}b
|
||||||
|
0: a\x{100}b
|
||||||
|
|
||||||
|
# End of testinput22
|
|
@ -0,0 +1,163 @@
|
||||||
|
# Tests of \C when Unicode support is available. Note that \C is not supported
|
||||||
|
# for DFA matching in UTF mode, so this test is not run with -dfa. The output
|
||||||
|
# of this test is different in 8-, 16-, and 32-bit modes. Some tests may match
|
||||||
|
# in some widths and not in others.
|
||||||
|
|
||||||
|
/ab\Cde/utf
|
||||||
|
abXde
|
||||||
|
0: abXde
|
||||||
|
|
||||||
|
# This should produce an error diagnostic (\C in UTF lookbehind)
|
||||||
|
|
||||||
|
/(?<=ab\Cde)X/utf
|
||||||
|
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
||||||
|
|
||||||
|
# Autopossessification tests
|
||||||
|
|
||||||
|
/\C+\X \X+\C/Bx
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
AllAny+
|
||||||
|
extuni
|
||||||
|
extuni+
|
||||||
|
AllAny
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/\C+\X \X+\C/Bx,utf
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
Anybyte+
|
||||||
|
extuni
|
||||||
|
extuni+
|
||||||
|
Anybyte
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/\C\X*TӅ;
|
||||||
|
{0,6}\v+
F
|
||||||
|
/utf
|
||||||
|
\= Expect no match
|
||||||
|
Ӆ\x0a
|
||||||
|
No match
|
||||||
|
|
||||||
|
/\C(\W?ſ)'?{{/utf
|
||||||
|
\= Expect no match
|
||||||
|
\\C(\\W?ſ)'?{{
|
||||||
|
No match
|
||||||
|
|
||||||
|
/X(\C{3})/utf
|
||||||
|
X\x{1234}
|
||||||
|
0: X\x{1234}
|
||||||
|
1: \x{1234}
|
||||||
|
X\x{11234}Y
|
||||||
|
0: X\x{f0}\x{91}\x{88}
|
||||||
|
1: \x{f0}\x{91}\x{88}
|
||||||
|
X\x{11234}YZ
|
||||||
|
0: X\x{f0}\x{91}\x{88}
|
||||||
|
1: \x{f0}\x{91}\x{88}
|
||||||
|
|
||||||
|
/X(\C{4})/utf
|
||||||
|
X\x{1234}YZ
|
||||||
|
0: X\x{1234}Y
|
||||||
|
1: \x{1234}Y
|
||||||
|
X\x{11234}YZ
|
||||||
|
0: X\x{11234}
|
||||||
|
1: \x{11234}
|
||||||
|
X\x{11234}YZW
|
||||||
|
0: X\x{11234}
|
||||||
|
1: \x{11234}
|
||||||
|
|
||||||
|
/X\C*/utf
|
||||||
|
XYZabcdce
|
||||||
|
0: XYZabcdce
|
||||||
|
|
||||||
|
/X\C*?/utf
|
||||||
|
XYZabcde
|
||||||
|
0: X
|
||||||
|
|
||||||
|
/X\C{3,5}/utf
|
||||||
|
Xabcdefg
|
||||||
|
0: Xabcde
|
||||||
|
X\x{1234}
|
||||||
|
0: X\x{1234}
|
||||||
|
X\x{1234}YZ
|
||||||
|
0: X\x{1234}YZ
|
||||||
|
X\x{1234}\x{512}
|
||||||
|
0: X\x{1234}\x{512}
|
||||||
|
X\x{1234}\x{512}YZ
|
||||||
|
0: X\x{1234}\x{512}
|
||||||
|
X\x{11234}Y
|
||||||
|
0: X\x{11234}Y
|
||||||
|
X\x{11234}YZ
|
||||||
|
0: X\x{11234}Y
|
||||||
|
X\x{11234}\x{512}
|
||||||
|
0: X\x{11234}\x{d4}
|
||||||
|
X\x{11234}\x{512}YZ
|
||||||
|
0: X\x{11234}\x{d4}
|
||||||
|
X\x{11234}\x{512}\x{11234}Z
|
||||||
|
0: X\x{11234}\x{d4}
|
||||||
|
|
||||||
|
/X\C{3,5}?/utf
|
||||||
|
Xabcdefg
|
||||||
|
0: Xabc
|
||||||
|
X\x{1234}
|
||||||
|
0: X\x{1234}
|
||||||
|
X\x{1234}YZ
|
||||||
|
0: X\x{1234}
|
||||||
|
X\x{1234}\x{512}
|
||||||
|
0: X\x{1234}
|
||||||
|
X\x{11234}Y
|
||||||
|
0: X\x{f0}\x{91}\x{88}
|
||||||
|
X\x{11234}YZ
|
||||||
|
0: X\x{f0}\x{91}\x{88}
|
||||||
|
X\x{11234}\x{512}YZ
|
||||||
|
0: X\x{f0}\x{91}\x{88}
|
||||||
|
X\x{11234}
|
||||||
|
0: X\x{f0}\x{91}\x{88}
|
||||||
|
|
||||||
|
/a\Cb/utf
|
||||||
|
aXb
|
||||||
|
0: aXb
|
||||||
|
a\nb
|
||||||
|
0: a\x{0a}b
|
||||||
|
a\x{100}b
|
||||||
|
No match
|
||||||
|
|
||||||
|
/a\C\Cb/utf
|
||||||
|
a\x{100}b
|
||||||
|
0: a\x{100}b
|
||||||
|
a\x{12257}b
|
||||||
|
No match
|
||||||
|
a\x{12257}\x{11234}b
|
||||||
|
No match
|
||||||
|
|
||||||
|
/ab\Cde/utf
|
||||||
|
abXde
|
||||||
|
0: abXde
|
||||||
|
|
||||||
|
# This one is here not because it's different to Perl, but because the way
|
||||||
|
# the captured single code unit is displayed. (In Perl it becomes a character,
|
||||||
|
# and you can't tell the difference.)
|
||||||
|
|
||||||
|
/X(\C)(.*)/utf
|
||||||
|
X\x{1234}
|
||||||
|
0: X\x{1234}
|
||||||
|
1: \x{e1}
|
||||||
|
2: \x{88}\x{b4}
|
||||||
|
X\nabc
|
||||||
|
0: X\x{0a}abc
|
||||||
|
1: \x{0a}
|
||||||
|
2: abc
|
||||||
|
|
||||||
|
# This one is here because Perl gives out a grumbly error message (quite
|
||||||
|
# correctly, but that messes up comparisons).
|
||||||
|
|
||||||
|
/a\Cb/utf
|
||||||
|
\= Expect no match in 8-bit mode
|
||||||
|
a\x{100}b
|
||||||
|
No match
|
||||||
|
|
||||||
|
# End of testinput22
|
|
@ -0,0 +1,8 @@
|
||||||
|
# This test is run when PCRE2 has been built with --enable-never-backslash-C,
|
||||||
|
# which disables the use of \C. All we can do is check that it gives the
|
||||||
|
# correct error message.
|
||||||
|
|
||||||
|
/a\Cb/
|
||||||
|
Failed: error 185 at offset 2: using \C is disabled in this PCRE2 library
|
||||||
|
|
||||||
|
# End of testinput23
|
|
@ -181,10 +181,6 @@ Subject length lower bound = 3
|
||||||
\x{212ab}\x{212ab}\x{212ab}\x{861}
|
\x{212ab}\x{212ab}\x{212ab}\x{861}
|
||||||
0: \x{212ab}\x{212ab}\x{212ab}
|
0: \x{212ab}\x{212ab}\x{212ab}
|
||||||
|
|
||||||
/(?<=\C)X/utf
|
|
||||||
Failed: error 136 at offset 6: \C is not allowed in a lookbehind assertion
|
|
||||||
Should produce an error diagnostic
|
|
||||||
|
|
||||||
/^[ab]/IB,utf
|
/^[ab]/IB,utf
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
|
@ -2905,9 +2901,6 @@ No match
|
||||||
aAz
|
aAz
|
||||||
No match
|
No match
|
||||||
|
|
||||||
/(?<=ab\Cde)X/utf
|
|
||||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
|
||||||
|
|
||||||
/\X/
|
/\X/
|
||||||
a\=ps
|
a\=ps
|
||||||
0: a
|
0: a
|
||||||
|
@ -3803,7 +3796,7 @@ No match
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \C+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
|
/\D+\X \d+\X \S+\X \s+\X \W+\X \w+\X \R+\X \H+\X \h+\X \V+\X \v+\X a+\X \n+\X .+\X/Bx
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
\D+
|
\D+
|
||||||
|
@ -3818,8 +3811,6 @@ No match
|
||||||
extuni
|
extuni
|
||||||
\w+
|
\w+
|
||||||
extuni
|
extuni
|
||||||
AllAny+
|
|
||||||
extuni
|
|
||||||
\R+
|
\R+
|
||||||
extuni
|
extuni
|
||||||
\H+
|
\H+
|
||||||
|
@ -3858,7 +3849,7 @@ No match
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\C \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
|
/\X+\D \X+\d \X+\S \X+\s \X+\W \X+\w \X+. \X+\R \X+\H \X+\h \X+\V \X+\v \X+\X \X+\Z \X+\z \X+$/Bx
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
extuni+
|
extuni+
|
||||||
|
@ -3876,8 +3867,6 @@ No match
|
||||||
extuni+
|
extuni+
|
||||||
Any
|
Any
|
||||||
extuni+
|
extuni+
|
||||||
AllAny
|
|
||||||
extuni+
|
|
||||||
\R
|
\R
|
||||||
extuni+
|
extuni+
|
||||||
\H
|
\H
|
||||||
|
@ -4010,18 +3999,6 @@ Failed: error 122 at offset 1227: unmatched closing parenthesis
|
||||||
"(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
|
"(*UTF)(*UCP)(.UTF).+X(\V+;\^(\D|)!999}(?(?C{7(?C')\H*\S*/^\x5\xa\\xd3\x85n?(;\D*(?m).[^mH+((*UCP)(*U:F)})(?!^)(?'"
|
||||||
Failed: error 124 at offset 113: letter or underscore expected after (?< or (?'
|
Failed: error 124 at offset 113: letter or underscore expected after (?< or (?'
|
||||||
|
|
||||||
/\C\X*TӅ;
|
|
||||||
{0,6}\v+
F
|
|
||||||
/utf
|
|
||||||
\= Expect no match
|
|
||||||
Ӆ\x0a
|
|
||||||
No match
|
|
||||||
|
|
||||||
/\C(\W?ſ)'?{{/utf
|
|
||||||
\= Expect no match
|
|
||||||
\\C(\\W?ſ)'?{{
|
|
||||||
No match
|
|
||||||
|
|
||||||
/[\pS#moq]/
|
/[\pS#moq]/
|
||||||
=
|
=
|
||||||
0: =
|
0: =
|
||||||
|
|
|
@ -7174,14 +7174,6 @@ Matched, but offsets vector is too small to show all matches
|
||||||
2: aa
|
2: aa
|
||||||
3: a
|
3: a
|
||||||
|
|
||||||
/ab\Cde/
|
|
||||||
abXde
|
|
||||||
0: abXde
|
|
||||||
|
|
||||||
/(?<=ab\Cde)X/
|
|
||||||
abZdeX
|
|
||||||
0: X
|
|
||||||
|
|
||||||
/^\R/
|
/^\R/
|
||||||
\r\=ps
|
\r\=ps
|
||||||
0: \x0d
|
0: \x0d
|
||||||
|
|
|
@ -1141,13 +1141,6 @@ Partial match: abcde
|
||||||
the cat\=ph
|
the cat\=ph
|
||||||
Partial match: the cat
|
Partial match: the cat
|
||||||
|
|
||||||
/ab\Cde/utf
|
|
||||||
abXde
|
|
||||||
Failed: error -42: pattern contains an item that is not supported for DFA matching
|
|
||||||
|
|
||||||
/(?<=ab\Cde)X/utf
|
|
||||||
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion
|
|
||||||
|
|
||||||
/./newline=crlf,utf
|
/./newline=crlf,utf
|
||||||
\r\=ps
|
\r\=ps
|
||||||
0: \x{0d}
|
0: \x{0d}
|
||||||
|
|
|
@ -4,12 +4,8 @@
|
||||||
#forbid_utf
|
#forbid_utf
|
||||||
#newline_default lf any anycrlf
|
#newline_default lf any anycrlf
|
||||||
|
|
||||||
/a\Cb/
|
/ab/
|
||||||
aXb
|
\= Expect error message (too big char) and no match
|
||||||
0: aXb
|
|
||||||
a\nb
|
|
||||||
0: a\x0ab
|
|
||||||
\= Expect no match and error message (too big char)
|
|
||||||
A\x{123}B
|
A\x{123}B
|
||||||
** Character \x{123} is greater than 255 and UTF-8 mode is not enabled.
|
** Character \x{123} is greater than 255 and UTF-8 mode is not enabled.
|
||||||
** Truncation will probably give the wrong result.
|
** Truncation will probably give the wrong result.
|
||||||
|
|
Loading…
Reference in New Issue