Add callout support to pcre2grep

This commit is contained in:
Philip.Hazel 2016-04-01 15:52:08 +00:00
parent ddcedf0338
commit c332eaf4f2
12 changed files with 441 additions and 24 deletions

View File

@ -158,6 +158,9 @@ SET(PCRE2_SUPPORT_JIT OFF CACHE BOOL
SET(PCRE2_SUPPORT_PCRE2GREP_JIT ON CACHE BOOL
"Enable use of Just-in-time compiling in pcre2grep.")
SET(PCRE2_SUPPORT_PCRE2GREP_CALLOUT ON CACHE BOOL
"Enable callout string support in pcre2grep.")
SET(PCRE2_SUPPORT_UNICODE ON CACHE BOOL
"Enable support for Unicode and UTF-8/UTF-16/UTF-32 encoding.")
@ -273,6 +276,10 @@ IF(PCRE2_SUPPORT_PCRE2GREP_JIT)
SET(SUPPORT_PCRE2GREP_JIT 1)
ENDIF(PCRE2_SUPPORT_PCRE2GREP_JIT)
# Callout script support in pcre2grep is implemented with fork() and
# <sys/wait.h>, which are not available on Windows, so never enable it there
# even though the cache option defaults to ON. The autotools build applies the
# same restriction.
IF(PCRE2_SUPPORT_PCRE2GREP_CALLOUT AND NOT WIN32)
  SET(SUPPORT_PCRE2GREP_CALLOUT 1)
ENDIF(PCRE2_SUPPORT_PCRE2GREP_CALLOUT AND NOT WIN32)
IF(PCRE2_SUPPORT_VALGRIND)
SET(SUPPORT_VALGRIND 1)
ENDIF(PCRE2_SUPPORT_VALGRIND)
@ -753,6 +760,7 @@ IF(PCRE2_SHOW_REPORT)
MESSAGE(STATUS " Build static libs ............... : ${BUILD_STATIC_LIBS}")
MESSAGE(STATUS " Build pcre2grep ................. : ${PCRE2_BUILD_PCRE2GREP}")
MESSAGE(STATUS " Enable JIT in pcre2grep ......... : ${PCRE2_SUPPORT_PCRE2GREP_JIT}")
MESSAGE(STATUS " Enable callouts in pcre2grep .... : ${PCRE2_SUPPORT_PCRE2GREP_CALLOUT}")
MESSAGE(STATUS " Buffer size for pcre2grep ....... : ${PCRE2GREP_BUFSIZE}")
MESSAGE(STATUS " Build tests (implies pcre2test . : ${PCRE2_BUILD_TESTS}")
MESSAGE(STATUS " and pcre2grep)")

View File

@ -97,6 +97,9 @@ appropriate line terminator: \r\n for Windows, \n otherwise.
21. When a line is too long for pcre2grep's internal buffer, show the maximum
length in the error message.
22. Added support for string callouts to pcre2grep (Zoltan's patch with PH
additions).
Version 10.21 12-January-2016
-----------------------------

View File

@ -570,6 +570,7 @@ EXTRA_DIST += \
testdata/greplist \
testdata/grepoutput \
testdata/grepoutput8 \
testdata/grepoutputC \
testdata/grepoutputN \
testdata/greppatN4 \
testdata/testinput1 \

19
README
View File

@ -168,15 +168,12 @@ library. They are also documented in the pcre2build man page.
built. If you want only the 16-bit or 32-bit library, use --disable-pcre2-8
to disable building the 8-bit library.
. If you want to include support for just-in-time compiling, which can give
large performance improvements on certain platforms, add --enable-jit to the
"configure" command. This support is available only for certain hardware
. If you want to include support for just-in-time (JIT) compiling, which can
give large performance improvements on certain platforms, add --enable-jit to
the "configure" command. This support is available only for certain hardware
architectures. If you try to enable it on an unsupported architecture, there
will be a compile time error.
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
you add --disable-pcre2grep-jit to the "configure" command.
. If you do not want to make use of the support for UTF-8 Unicode character
strings in the 8-bit library, UTF-16 Unicode character strings in the 16-bit
library, or UTF-32 Unicode character strings in the 32-bit library, you can
@ -324,6 +321,14 @@ library. They are also documented in the pcre2build man page.
running "make" to build PCRE2. There is more information about coverage
reporting in the "pcre2build" documentation.
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
you add --disable-pcre2grep-jit to the "configure" command.
. On non-Windows systems there is support for calling external scripts during
matching in the pcre2grep command via PCRE2's callout facility with string
arguments. This support can be disabled by adding --disable-pcre2grep-callout
to the "configure" command.
. The pcre2grep program currently supports only 8-bit data files, and so
requires the 8-bit PCRE2 library. It is possible to compile pcre2grep to use
libz and/or libbz2, in order to read .gz and .bz2 files (respectively), by
@ -840,4 +845,4 @@ The distribution should contain the files listed below.
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
Last updated: 16 October 2015
Last updated: 01 April 2016

View File

@ -614,6 +614,17 @@ $valgrind $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>
$cf $srcdir/testdata/grepoutputN testtrygrep
if [ $? != 0 ] ; then exit 1; fi
# If pcre2grep supports script callouts, run some tests on them. Support is
# detected by looking for the corresponding message in the --help output.
if $valgrind $pcre2grep --help | $valgrind $pcre2grep -q 'Callout scripts in patterns are supported'; then
echo "Testing pcre2grep script callouts"
# First pattern: substitution of $1 to $3, the $| escape for a literal pipe,
# the ${1} form, and references that expand to nothing ($4 is not yet set when
# the callout runs, $14 does not exist). $0 is not a capture reference, so the
# 0 is passed through literally.
$valgrind $pcre2grep '(T)(..(.))(?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4) ($14) ($0)")()' $srcdir/testdata/grepinputv >testtrygrep
# Second pattern: a two-digit capture number in both the $11 and ${11} forms.
$valgrind $pcre2grep '(T)(..(.))()()()()()()()(..)(?C"/bin/echo|Arg1: [$11] [${11}]")' $srcdir/testdata/grepinputv >>testtrygrep
# Compare the combined output of both runs with the expected output.
$cf $srcdir/testdata/grepoutputC testtrygrep
if [ $? != 0 ] ; then exit 1; fi
else
echo "Script callouts are not supported"
fi
# Finally, some tests to exercise code that is not tested above, just to be
# sure that it runs OK. Doing this improves the coverage statistics. The output

View File

@ -148,6 +148,17 @@ AC_ARG_ENABLE(pcre2grep-jit,
[disable JIT support in pcre2grep]),
, enable_pcre2grep_jit=yes)
# Handle --disable-pcre2grep-callout (enabled by default). No test for Windows
# is made here: HAVE_WINDOWS_H is not set until the header checks run later in
# this script, so testing it at this point can never detect Windows. The
# Windows case is dealt with further down, at the point where
# SUPPORT_PCRE2GREP_CALLOUT is defined.
AC_ARG_ENABLE(pcre2grep-callout,
              AS_HELP_STRING([--disable-pcre2grep-callout],
                             [disable callout script support in pcre2grep]),
              , enable_pcre2grep_callout=yes)
# Handle --enable-rebuild-chartables
AC_ARG_ENABLE(rebuild-chartables,
AS_HELP_STRING([--enable-rebuild-chartables],
@ -392,6 +403,7 @@ sure both macros are undefined; an emulation function will then be used. */])
AC_HEADER_STDC
AC_CHECK_HEADERS(limits.h sys/types.h sys/stat.h dirent.h)
AC_CHECK_HEADERS([windows.h], [HAVE_WINDOWS_H=1])
AC_CHECK_HEADERS([sys/wait.h], [HAVE_SYS_WAIT_H=1])
# Conditional compilation
AM_CONDITIONAL(WITH_PCRE2_8, test "x$enable_pcre2_8" = "xyes")
@ -546,6 +558,21 @@ if test "$enable_pcre2grep_jit" = "yes"; then
Define to any value to enable JIT support in pcre2grep.])
fi
# Currently pcre2grep callout string is not supported under Windows. When the
# feature is enabled, define SUPPORT_PCRE2GREP_CALLOUT only if windows.h was
# not found; otherwise warn and record the feature as disabled, so that the
# configuration summary printed at the end reports the real state.
if test "$enable_pcre2grep_callout" = "yes"; then
if test "$HAVE_WINDOWS_H" != "1"; then
# pcre2grep.c includes <sys/wait.h> when callout support is compiled in, so a
# missing header is a hard error rather than a silent downgrade.
if test "$HAVE_SYS_WAIT_H" != "1"; then
AC_MSG_ERROR([Callout script support needs sys/wait.h.])
fi
AC_DEFINE([SUPPORT_PCRE2GREP_CALLOUT], [], [
Define to any value to enable callout script support in pcre2grep.])
else
AC_MSG_WARN([Callout script support is not available for Windows: disabled])
enable_pcre2grep_callout=no
fi
fi
if test "$enable_unicode" = "yes"; then
AC_DEFINE([SUPPORT_UNICODE], [], [
Define to any value to enable support for Unicode and UTF encoding.
@ -908,6 +935,7 @@ $PACKAGE-$VERSION configuration summary:
Build shared libs ............... : ${enable_shared}
Build static libs ............... : ${enable_static}
Use JIT in pcre2grep ............ : ${enable_pcre2grep_jit}
Enable callouts in pcre2grep .... : ${enable_pcre2grep_callout}
Buffer size for pcre2grep ....... : ${with_pcre2grep_bufsize}
Link pcre2grep with libz ........ : ${enable_pcre2grep_libz}
Link pcre2grep with libbz2 ...... : ${enable_pcre2grep_libbz2}

View File

@ -1,4 +1,4 @@
.TH PCRE2BUILD 3 "16 October 2015" "PCRE2 10.21"
.TH PCRE2BUILD 3 "01 April 2016" "PCRE2 10.22"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.
@ -352,6 +352,19 @@ and equivalent run-time options, refer to these character values in an EBCDIC
environment.
.
.
.SH "PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS"
.rs
.sp
By default, on non-Windows systems, \fBpcre2grep\fP supports the use of
callouts with string arguments within the patterns it is matching, in order to
run external scripts. For details, see the
.\" HREF
\fBpcre2grep\fP
.\"
documentation. This support can be disabled by adding
--disable-pcre2grep-callout to the \fBconfigure\fP command.
.
.
.SH "PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT"
.rs
.sp
@ -381,7 +394,7 @@ parameter value by adding, for example,
--with-pcre2grep-bufsize=50K
.sp
to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can override this
value by using --buffer-size on the command line..
value by using --buffer-size on the command line.
.
.
.SH "PCRE2TEST OPTION FOR LIBREADLINE SUPPORT"
@ -519,6 +532,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 16 October 2015
Copyright (c) 1997-2015 University of Cambridge.
Last updated: 01 April 2016
Copyright (c) 1997-2016 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2GREP 1 "03 January 2015" "PCRE2 10.00"
.TH PCRE2GREP 1 "01 April 2016" "PCRE2 10.22"
.SH NAME
pcre2grep - a grep with Perl-compatible regular expressions.
.SH SYNOPSIS
@ -653,6 +653,54 @@ options does have data, it must be given in the first form, using an equals
character. Otherwise \fBpcre2grep\fP will assume that it has no data.
.
.
.SH "CALLING EXTERNAL SCRIPTS"
.rs
.sp
On non-Windows systems, \fBpcre2grep\fP has, by default, support for calling
external programs or scripts during matching by making use of PCRE2's callout
facility. However, this support can be disabled when \fBpcre2grep\fP is built.
You can find out whether your binary has support for callouts by running it
with the \fB--help\fP option. If the support is not enabled, all callouts in
patterns are ignored by \fBpcre2grep\fP.
.P
A callout in a PCRE2 pattern is of the form (?C<arg>) where the argument is
either a number or a quoted string (see the
.\" HREF
\fBpcre2callout\fP
.\"
documentation for details). Numbered callouts are ignored by \fBpcre2grep\fP.
String arguments are parsed as a list of substrings separated by pipe (vertical
bar) characters. The first substring must be an executable name, with the
following substrings specifying arguments:
.sp
executable_name|arg1|arg2|...
.sp
Any substring (including the executable name) may contain escape sequences
started by a dollar character: $<digits> or ${<digits>} is replaced by the
captured substring of the given decimal number, which must be greater than
zero. If the number is greater than the number of capturing substrings, or if
the capture is unset, the replacement is empty.
.P
Any other character is substituted by itself. In particular, $$ is replaced by
a single dollar and $| is replaced by a pipe character. Here is an example:
.sp
echo -e "abcde\en12345" | pcre2grep \e
'(?x)(.)(..(.))
(?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4)")()' -
Output:
Arg1: [a] [bcd] [d] Arg2: |a| ()
abcde
Arg1: [1] [234] [4] Arg2: |1| ()
12345
.sp
Any syntax errors in the string (for example, a dollar not followed by another
character) cause the callout to be ignored. If running the program fails for
any reason (including the non-existence of the executable), a local matching
failure occurs and the matcher backtracks in the normal way.
.
.
.SH "MATCHING ERRORS"
.rs
.sp
@ -683,7 +731,7 @@ affect the return code.
.SH "SEE ALSO"
.rs
.sp
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3).
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3).
.
.
.SH AUTHOR
@ -700,6 +748,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 03 January 2015
Copyright (c) 1997-2015 University of Cambridge.
Last updated: 01 April 2016
Copyright (c) 1997-2016 University of Cambridge.
.fi

View File

@ -326,7 +326,7 @@ if [ $usemain -ne 0 ]; then
"--disable-shared" \
"--disable-unicode --disable-stack-for-recursion --disable-shared" \
"--disable-stack-for-recursion --disable-shared --enable-never-backslash-C" \
"--with-link-size=3 --disable-shared" \
"--with-link-size=3 --disable-shared --disable-pcre2grep-callout" \
"--disable-unicode --enable-rebuild-chartables --disable-shared" \
"--disable-unicode --enable-newline-is-any --disable-shared" \
"--disable-unicode --enable-newline-is-cr --disable-shared" \

View File

@ -111,6 +111,9 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
/* Define to 1 if you have the <sys/wait.h> header file. */
#undef HAVE_SYS_WAIT_H
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
@ -262,6 +265,9 @@ sure both macros are undefined; an emulation function will then be used. */
is able to handle .gz files. */
#undef SUPPORT_LIBZ
/* Define to any value to enable callout script support in pcre2grep. */
#undef SUPPORT_PCRE2GREP_CALLOUT
/* Define to any value to enable JIT support in pcre2grep. */
#undef SUPPORT_PCRE2GREP_JIT

View File

@ -58,6 +58,10 @@ POSSIBILITY OF SUCH DAMAGE.
#include <sys/types.h>
#include <sys/stat.h>
#ifdef SUPPORT_PCRE2GREP_CALLOUT
#include <sys/wait.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
@ -121,9 +125,9 @@ apply to fprintf(). */
#define FWRITE(a,b,c,d) if (fwrite(a,b,c,d)) {}
/* Under Windows, we have to set stdout to be binary, so that it does not
convert \r\n at the ends of output lines to \r\r\n. However, that means that
any messages written to stdout must have \r\n as their line terminator. This is
/* Under Windows, we have to set stdout to be binary, so that it does not
convert \r\n at the ends of output lines to \r\r\n. However, that means that
any messages written to stdout must have \r\n as their line terminator. This is
handled by using STDOUT_NL as the newline string. */
#if defined(_WIN32) || defined(WIN32)
@ -899,6 +903,13 @@ option_item *op;
printf("Usage: pcre2grep [OPTION]... [PATTERN] [FILE1 FILE2 ...]" STDOUT_NL);
printf("Search for PATTERN in each FILE or standard input." STDOUT_NL);
printf("PATTERN must be present if neither -e nor -f is used." STDOUT_NL);
#ifdef SUPPORT_PCRE2GREP_CALLOUT
printf("Callout scripts in patterns are supported." STDOUT_NL);
#else
printf("Callout scripts are not supported in this pcre2grep." STDOUT_NL);
#endif
printf("\"-\" can be used as a file name to mean STDIN." STDOUT_NL);
#ifdef SUPPORT_LIBZ
@ -1484,6 +1495,274 @@ return FALSE; /* No match, no errors */
}
#ifdef SUPPORT_PCRE2GREP_CALLOUT
/*************************************************
*        Parse and execute callout scripts       *
*************************************************/

/* This function parses a callout string block and executes the program
specified by the string. The string is a list of substrings separated by pipe
characters. The first substring represents the executable name, and the
following substrings specify the arguments:

  program_name|param1|param2|...

Any substring (including the program name) can contain escape sequences
started by the dollar character. The escape sequences are substituted as
follows:

  $<digits> or ${<digits>} is replaced by the captured substring of the given
  decimal number, which must be greater than zero. If the number is greater
  than the number of capturing substrings, or if the capture is unset, the
  replacement is empty.

  Any other character is substituted by itself. E.g: $$ is replaced by a single
  dollar or $| replaced by a pipe character.

Example:

  echo -e "abcde\n12345" | pcre2grep \
    '(.)(..(.))(?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4)")()' -

  Output:

    Arg1: [a] [bcd] [d] Arg2: |a| ()
    abcde
    Arg1: [1] [234] [4] Arg2: |1| ()
    12345

The string is processed in two passes. The first pass checks the syntax and
computes how much memory the expanded arguments need; the second pass builds
the argument strings and the argv-style vector that is passed to execv(). The
program is run in a forked child process and this function waits for it.

Arguments:
  calloutptr   the callout block
  unused       the callout data pointer, which is not used

Returns:       0 (matching continues) if the callout is numerical, has a syntax
                 error, has too many arguments, memory cannot be obtained, the
                 child cannot be forked, or the status reported by waitpid()
                 is zero
               1 (local match failure) if the status reported by waitpid() is
                 non-zero, which includes failure to execute the program
*/

static int
pcre2grep_callout(pcre2_callout_block *calloutptr, void *unused)
{
PCRE2_SIZE length = calloutptr->callout_string_length;
PCRE2_SPTR string = calloutptr->callout_string;
PCRE2_SPTR subject = calloutptr->subject;
PCRE2_SIZE *ovector = calloutptr->offset_vector;
PCRE2_SIZE capture_top = calloutptr->capture_top;
PCRE2_SIZE argsvectorlen = 2;   /* Program name plus the terminating NULL */
PCRE2_SIZE argslen = 1;         /* Terminating zero of the last argument */
char *args;
char *argsptr;
char **argsvector;
char **argsvectorptr;
pid_t pid;
int result = 0;

(void)unused;   /* Avoid compiler warning */

/* Only callouts with strings are supported. */

if (string == NULL || length == 0) return 0;

/* First pass: check the syntax and compute the number of arguments and the
total size of the expanded argument text. Callout strings are ignored in case
of a syntax error. */

while (length > 0)
  {
  if (*string == '|')
    {
    argsvectorlen++;

    /* Maximum 10000 arguments allowed. */
    if (argsvectorlen > 10000) return 0;
    }
  else if (*string == '$')
    {
    PCRE2_SIZE capture_id = 0;

    string++;
    length--;

    /* Syntax error: a character must be present after $. */
    if (length == 0) return 0;

    if (*string >= '1' && *string <= '9')
      {
      do
        {
        /* Stop accumulating once the value exceeds 65535, so that it cannot
        overflow however many digits follow. */
        if (capture_id <= 65535)
          capture_id = capture_id * 10 + (*string - '0');

        string++;
        length--;
        }
      while (length > 0 && *string >= '0' && *string <= '9');

      /* To negate the effect of string++ below. */
      string--;
      length++;
      }
    else if (*string == '{')
      {
      /* Must be a decimal number in braces, e.g: {5} or {38} */
      string++;
      length--;

      /* Syntax error: a decimal number required. */
      if (length == 0) return 0;
      if (*string < '1' || *string > '9') return 0;

      do
        {
        /* Stop accumulating once the value exceeds 65535 (see above). */
        if (capture_id <= 65535)
          capture_id = capture_id * 10 + (*string - '0');

        string++;
        length--;

        /* Syntax error: no more characters */
        if (length == 0) return 0;
        }
      while (*string >= '0' && *string <= '9');

      /* Syntax error: closing brace is missing. */
      if (*string != '}') return 0;
      }

    /* A capture reference contributes the length of the captured substring
    (nothing if the number is too large); any other escaped character falls
    through and is counted as one character below. */

    if (capture_id > 0)
      {
      if (capture_id < capture_top)
        {
        capture_id *= 2;
        argslen += ovector[capture_id + 1] - ovector[capture_id];
        }

      /* To negate the effect of argslen++ below. */
      argslen--;
      }
    }

  string++;
  length--;
  argslen++;
  }

/* Get memory for the argument text and for the vector of pointers into it. */

args = (char*)malloc(argslen);
if (args == NULL) return 0;

argsvector = (char**)malloc(argsvectorlen * sizeof(char*));
if (argsvector == NULL)
  {
  free(args);
  return 0;
  }

/* Second pass: build the zero-terminated arguments. The syntax has already
been checked, so no error tests are needed here. */

argsptr = args;
argsvectorptr = argsvector;

*argsvectorptr++ = argsptr;

length = calloutptr->callout_string_length;
string = calloutptr->callout_string;

while (length > 0)
  {
  if (*string == '|')
    {
    /* End the current argument and start the next one. */
    *argsptr++ = '\0';
    *argsvectorptr++ = argsptr;
    }
  else if (*string == '$')
    {
    string++;
    length--;

    if ((*string >= '1' && *string <= '9') || *string == '{')
      {
      PCRE2_SIZE capture_id = 0;

      if (*string != '{')
        {
        do
          {
          /* Stop accumulating once the value exceeds 65535 (see above). */
          if (capture_id <= 65535)
            capture_id = capture_id * 10 + (*string - '0');

          string++;
          length--;
          }
        while (length > 0 && *string >= '0' && *string <= '9');

        /* To negate the effect of string++ below. */
        string--;
        length++;
        }
      else
        {
        string++;
        length--;

        do
          {
          /* Stop accumulating once the value exceeds 65535 (see above). */
          if (capture_id <= 65535)
            capture_id = capture_id * 10 + (*string - '0');

          string++;
          length--;
          }
        while (*string != '}');
        }

      /* Copy the captured substring; a number that is too large inserts
      nothing. */

      if (capture_id < capture_top)
        {
        PCRE2_SIZE capturesize;
        capture_id *= 2;

        capturesize = ovector[capture_id + 1] - ovector[capture_id];
        memcpy(argsptr, subject + ovector[capture_id], capturesize);
        argsptr += capturesize;
        }
      }
    else
      {
      /* Any other character after $ stands for itself. */
      *argsptr++ = *string;
      }
    }
  else
    {
    *argsptr++ = *string;
    }

  string++;
  length--;
  }

*argsptr++ = '\0';
*argsvectorptr = NULL;

/* Run the program in a child process and wait for it to finish. If fork()
fails, result remains zero and matching continues. */

pid = fork();

if (pid == 0)
  {
  (void)execv(argsvector[0], argsvector);

  /* Control gets here if there is an error, e.g. a non-existent program. Use
  _exit() rather than exit(): the child has a copy of the parent's stdio
  buffers, and exit() would flush them, writing any pending pcre2grep output a
  second time. */

  _exit(1);
  }
else if (pid > 0)
  (void)waitpid(pid, &result, 0);

free(args);
free(argsvector);

/* Currently negative return values are not supported, only zero (match
continues) or non-zero (match fails). */

return result != 0;
}
#endif
/*************************************************
* Grep an individual file *
@ -1786,7 +2065,7 @@ while (ptr < endptr)
}
}
if (printed || printname != NULL || number)
if (printed || printname != NULL || number)
fprintf(stdout, STDOUT_NL);
}
@ -2637,10 +2916,10 @@ const char *locale_from = "--locale";
pcre2_jit_stack *jit_stack = NULL;
#endif
/* In Windows, stdout is set up as a text stream, which means that \n is
converted to \r\n. This causes output lines that are copied from the input to
change from ....\r\n to ....\r\r\n, which is not right. We therefore ensure
that stdout is a binary stream. Note that this means all other output to stdout
/* In Windows, stdout is set up as a text stream, which means that \n is
converted to \r\n. This causes output lines that are copied from the input to
change from ....\r\n to ....\r\r\n, which is not right. We therefore ensure
that stdout is a binary stream. Note that this means all other output to stdout
must use STDOUT_NL to terminate lines. */
#if defined(_WIN32) || defined(WIN32)
@ -2654,6 +2933,13 @@ match_context = pcre2_match_context_create(NULL);
match_data = pcre2_match_data_create(OFFSET_SIZE, NULL);
offsets = pcre2_get_ovector_pointer(match_data);
/* If string (script) callouts are supported, set up the callout processing
function. */
#ifdef SUPPORT_PCRE2GREP_CALLOUT
pcre2_set_callout(match_context, pcre2grep_callout, NULL);
#endif
/* Process the options */
for (i = 1; i < argc; i++)

8
testdata/grepoutputC vendored Normal file
View File

@ -0,0 +1,8 @@
Arg1: [T] [he ] [ ] Arg2: |T| () () (0)
Arg1: [T] [his] [s] Arg2: |T| () () (0)
The quick brown
This time it jumps and jumps and jumps.
Arg1: [qu] [qu]
Arg1: [ t] [ t]
The quick brown
This time it jumps and jumps and jumps.