Implement buffer expansion in pcre2grep.

This commit is contained in:
Philip.Hazel 2016-10-11 16:40:09 +00:00
parent b451e9f3b5
commit bf6f0bb335
12 changed files with 287 additions and 168 deletions

View File

@ -76,6 +76,7 @@
# a new option instead of being unconditional. # a new option instead of being unconditional.
# 2016-10-05 PH fixed a typo (PCRE should be PCRE2) in above patch # 2016-10-05 PH fixed a typo (PCRE should be PCRE2) in above patch
# fix by David Gaussmann # fix by David Gaussmann
# 2016-10-07 PH added PCRE2GREP_MAX_BUFSIZE
PROJECT(PCRE2 C) PROJECT(PCRE2 C)
@ -148,7 +149,10 @@ SET(PCRE2_MATCH_LIMIT_RECURSION "MATCH_LIMIT" CACHE STRING
"Default limit on internal recursion. See MATCH_LIMIT_RECURSION in config.h.in for details.") "Default limit on internal recursion. See MATCH_LIMIT_RECURSION in config.h.in for details.")
SET(PCRE2GREP_BUFSIZE "20480" CACHE STRING SET(PCRE2GREP_BUFSIZE "20480" CACHE STRING
"Buffer size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.") "Buffer starting size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.")
SET(PCRE2GREP_MAX_BUFSIZE "1048576" CACHE STRING
"Buffer maximum size parameter for pcre2grep. See PCRE2GREP_MAX_BUFSIZE in config.h.in for details.")
SET(PCRE2_NEWLINE "LF" CACHE STRING SET(PCRE2_NEWLINE "LF" CACHE STRING
"What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF).") "What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF).")

View File

@ -61,6 +61,10 @@ escape sequence for a character whose code point was greater than \x{ff}.
9. Change 19 for 10.22 had a typo (PCRE_STATIC_RUNTIME should be 9. Change 19 for 10.22 had a typo (PCRE_STATIC_RUNTIME should be
PCRE2_STATIC_RUNTIME). Fix from David Gaussmann. PCRE2_STATIC_RUNTIME). Fix from David Gaussmann.
10. Added --max-buffer-size to pcre2grep, to allow for automatic buffer
expansion when long lines are encountered. Original patch by Dmitry
Cherniachenko.
Version 10.22 29-July-2016 Version 10.22 29-July-2016
-------------------------- --------------------------

18
README
View File

@ -339,12 +339,22 @@ library. They are also documented in the pcre2build man page.
Of course, the relevant libraries must be installed on your system. Of course, the relevant libraries must be installed on your system.
. The default size (in bytes) of the internal buffer used by pcre2grep can be . The default starting size (in bytes) of the internal buffer used by pcre2grep
set by, for example: can be set by, for example:
--with-pcre2grep-bufsize=51200 --with-pcre2grep-bufsize=51200
The value must be a plain integer. The default is 20480. The value must be a plain integer. The default is 20480. The amount of memory
used by pcre2grep is actually three times this number, to allow for "before"
and "after" lines.
. The default maximum size of pcre2grep's internal buffer can be set by, for
example:
--with-pcre2grep-max-bufsize=2097152
The default is either 1048576 or the value of --with-pcre2grep-bufsize,
whichever is the larger.
. It is possible to compile pcre2test so that it links with the libreadline . It is possible to compile pcre2test so that it links with the libreadline
or libedit libraries, by specifying, respectively, or libedit libraries, by specifying, respectively,
@ -845,4 +855,4 @@ The distribution should contain the files listed below.
Philip Hazel Philip Hazel
Email local part: ph10 Email local part: ph10
Email domain: cam.ac.uk Email domain: cam.ac.uk
Last updated: 01 April 2016 Last updated: 07 October 2016

View File

@ -440,7 +440,7 @@ echo "---------------------------- Test 82 -----------------------------" >>test
echo "RC=$?" >>testtrygrep echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 83 -----------------------------" >>testtrygrep echo "---------------------------- Test 83 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=100 "^a" ./testdata/grepinput3) >>testtrygrep 2>&1 (cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=10 --max-buffer-size=100 "^a" ./testdata/grepinput3) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 84 -----------------------------" >>testtrygrep echo "---------------------------- Test 84 -----------------------------" >>testtrygrep

View File

@ -41,6 +41,7 @@
#define NEWLINE_DEFAULT @NEWLINE_DEFAULT@ #define NEWLINE_DEFAULT @NEWLINE_DEFAULT@
#define PARENS_NEST_LIMIT @PCRE2_PARENS_NEST_LIMIT@ #define PARENS_NEST_LIMIT @PCRE2_PARENS_NEST_LIMIT@
#define PCRE2GREP_BUFSIZE @PCRE2GREP_BUFSIZE@ #define PCRE2GREP_BUFSIZE @PCRE2GREP_BUFSIZE@
#define PCRE2GREP_MAX_BUFSIZE @PCRE2GREP_MAX_BUFSIZE@
#define MAX_NAME_SIZE 32 #define MAX_NAME_SIZE 32
#define MAX_NAME_COUNT 10000 #define MAX_NAME_COUNT 10000

View File

@ -240,9 +240,15 @@ AC_ARG_ENABLE(pcre2grep-libbz2,
# Handle --with-pcre2grep-bufsize=N # Handle --with-pcre2grep-bufsize=N
AC_ARG_WITH(pcre2grep-bufsize, AC_ARG_WITH(pcre2grep-bufsize,
AS_HELP_STRING([--with-pcre2grep-bufsize=N], AS_HELP_STRING([--with-pcre2grep-bufsize=N],
[pcre2grep buffer size (default=20480, minimum=8192)]), [pcre2grep initial buffer size (default=20480, minimum=8192)]),
, with_pcre2grep_bufsize=20480) , with_pcre2grep_bufsize=20480)
# Handle --with-pcre2grep-max-bufsize=N
AC_ARG_WITH(pcre2grep-max-bufsize,
AS_HELP_STRING([--with-pcre2grep-max-bufsize=N],
[pcre2grep maximum buffer size (default=1048576, minimum=8192)]),
, with_pcre2grep_max_bufsize=1048576)
# Handle --enable-pcre2test-libedit # Handle --enable-pcre2test-libedit
AC_ARG_ENABLE(pcre2test-libedit, AC_ARG_ENABLE(pcre2test-libedit,
AS_HELP_STRING([--enable-pcre2test-libedit], AS_HELP_STRING([--enable-pcre2test-libedit],
@ -608,15 +614,30 @@ if test $with_pcre2grep_bufsize -lt 8192 ; then
with_pcre2grep_bufsize="8192" with_pcre2grep_bufsize="8192"
else else
if test $? -gt 1 ; then if test $? -gt 1 ; then
AC_MSG_ERROR([Bad value for --with-pcre2grep-bufsize]) AC_MSG_ERROR([Bad value for --with-pcre2grep-bufsize])
fi
fi
if test $with_pcre2grep_max_bufsize -lt $with_pcre2grep_bufsize ; then
with_pcre2grep_max_bufsize="$with_pcre2grep_bufsize"
else
if test $? -gt 1 ; then
AC_MSG_ERROR([Bad value for --with-pcre2grep-max-bufsize])
fi fi
fi fi
AC_DEFINE_UNQUOTED([PCRE2GREP_BUFSIZE], [$with_pcre2grep_bufsize], [ AC_DEFINE_UNQUOTED([PCRE2GREP_BUFSIZE], [$with_pcre2grep_bufsize], [
The value of PCRE2GREP_BUFSIZE determines the size of buffer used by pcre2grep The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
to hold parts of the file it is searching. This is also the minimum value. pcre2grep to hold parts of the file it is searching. The buffer will be
The actual amount of memory used by pcre2grep is three times this number, expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing very
because it allows for the buffering of "before" and "after" lines.]) long lines. The actual amount of memory used by pcre2grep is three times this
number, because it allows for the buffering of "before" and "after" lines.])
AC_DEFINE_UNQUOTED([PCRE2GREP_MAX_BUFSIZE], [$with_pcre2grep_max_bufsize], [
The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
used by pcre2grep to hold parts of the file it is searching. The actual
amount of memory used by pcre2grep is three times this number, because it
allows for the buffering of "before" and "after" lines.])
if test "$enable_pcre2test_libedit" = "yes"; then if test "$enable_pcre2test_libedit" = "yes"; then
AC_DEFINE([SUPPORT_LIBEDIT], [], [ AC_DEFINE([SUPPORT_LIBEDIT], [], [
@ -906,43 +927,44 @@ cat <<EOF
$PACKAGE-$VERSION configuration summary: $PACKAGE-$VERSION configuration summary:
Install prefix .................. : ${prefix} Install prefix ..................... : ${prefix}
C preprocessor .................. : ${CPP} C preprocessor ..................... : ${CPP}
C compiler ...................... : ${CC} C compiler ......................... : ${CC}
Linker .......................... : ${LD} Linker ............................. : ${LD}
C preprocessor flags ............ : ${CPPFLAGS} C preprocessor flags ............... : ${CPPFLAGS}
C compiler flags ................ : ${CFLAGS} ${VISIBILITY_CFLAGS} C compiler flags ................... : ${CFLAGS} ${VISIBILITY_CFLAGS}
Linker flags .................... : ${LDFLAGS} Linker flags ....................... : ${LDFLAGS}
Extra libraries ................. : ${LIBS} Extra libraries .................... : ${LIBS}
Build 8-bit pcre2 library ....... : ${enable_pcre2_8} Build 8-bit pcre2 library .......... : ${enable_pcre2_8}
Build 16-bit pcre2 library ...... : ${enable_pcre2_16} Build 16-bit pcre2 library ......... : ${enable_pcre2_16}
Build 32-bit pcre2 library ...... : ${enable_pcre2_32} Build 32-bit pcre2 library ......... : ${enable_pcre2_32}
Include debugging code .......... : ${enable_debug} Include debugging code ............. : ${enable_debug}
Enable JIT compiling support .... : ${enable_jit} Enable JIT compiling support ....... : ${enable_jit}
Enable Unicode support .......... : ${enable_unicode} Enable Unicode support ............. : ${enable_unicode}
Newline char/sequence ........... : ${enable_newline} Newline char/sequence .............. : ${enable_newline}
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf} \R matches only ANYCRLF ............ : ${enable_bsr_anycrlf}
\C is disabled .................. : ${enable_never_backslash_C} \C is disabled ..................... : ${enable_never_backslash_C}
EBCDIC coding ................... : ${enable_ebcdic} EBCDIC coding ...................... : ${enable_ebcdic}
EBCDIC code for NL .............. : ${ebcdic_nl_code} EBCDIC code for NL ................. : ${ebcdic_nl_code}
Rebuild char tables ............. : ${enable_rebuild_chartables} Rebuild char tables ................ : ${enable_rebuild_chartables}
Use stack recursion ............. : ${enable_stack_for_recursion} Use stack recursion ................ : ${enable_stack_for_recursion}
Internal link size .............. : ${with_link_size} Internal link size ................. : ${with_link_size}
Nested parentheses limit ........ : ${with_parens_nest_limit} Nested parentheses limit ........... : ${with_parens_nest_limit}
Match limit ..................... : ${with_match_limit} Match limit ........................ : ${with_match_limit}
Match limit recursion ........... : ${with_match_limit_recursion} Match limit recursion .............. : ${with_match_limit_recursion}
Build shared libs ............... : ${enable_shared} Build shared libs .................. : ${enable_shared}
Build static libs ............... : ${enable_static} Build static libs .................. : ${enable_static}
Use JIT in pcre2grep ............ : ${enable_pcre2grep_jit} Use JIT in pcre2grep ............... : ${enable_pcre2grep_jit}
Enable callouts in pcre2grep .... : ${enable_pcre2grep_callout} Enable callouts in pcre2grep ....... : ${enable_pcre2grep_callout}
Buffer size for pcre2grep ....... : ${with_pcre2grep_bufsize} Initial buffer size for pcre2grep .. : ${with_pcre2grep_bufsize}
Link pcre2grep with libz ........ : ${enable_pcre2grep_libz} Maximum buffer size for pcre2grep .. : ${with_pcre2grep_max_bufsize}
Link pcre2grep with libbz2 ...... : ${enable_pcre2grep_libbz2} Link pcre2grep with libz ........... : ${enable_pcre2grep_libz}
Link pcre2test with libedit ..... : ${enable_pcre2test_libedit} Link pcre2grep with libbz2 ......... : ${enable_pcre2grep_libbz2}
Link pcre2test with libreadline . : ${enable_pcre2test_libreadline} Link pcre2test with libedit ........ : ${enable_pcre2test_libedit}
Valgrind support ................ : ${enable_valgrind} Link pcre2test with libreadline .... : ${enable_pcre2test_libreadline}
Code coverage ................... : ${enable_coverage} Valgrind support ................... : ${enable_valgrind}
Code coverage ...................... : ${enable_coverage}
EOF EOF

View File

@ -1,4 +1,4 @@
.TH PCRE2BUILD 3 "01 April 2016" "PCRE2 10.22" .TH PCRE2BUILD 3 "07 October 2016" "PCRE2 10.23"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
. .
@ -385,16 +385,19 @@ they are not.
.sp .sp
\fBpcre2grep\fP uses an internal buffer to hold a "window" on the file it is \fBpcre2grep\fP uses an internal buffer to hold a "window" on the file it is
scanning, in order to be able to output "before" and "after" lines when it scanning, in order to be able to output "before" and "after" lines when it
finds a match. The size of the buffer is controlled by a parameter whose finds a match. The starting size of the buffer is controlled by a parameter
default value is 20K. The buffer itself is three times this size, but because whose default value is 20K. The buffer itself is three times this size, but
of the way it is used for holding "before" lines, the longest line that is because of the way it is used for holding "before" lines, the longest line that
guaranteed to be processable is the parameter size. You can change the default is guaranteed to be processable is the parameter size. If a longer line is
parameter value by adding, for example, encountered, \fBpcre2grep\fP automatically expands the buffer, up to a
specified maximum size, whose default is 1M or the starting size, whichever is
the larger. You can change the default parameter values by adding, for example,
.sp .sp
--with-pcre2grep-bufsize=50K --with-pcre2grep-bufsize=51200
--with-pcre2grep-max-bufsize=2097152
.sp .sp
to the \fBconfigure\fP command. The caller of \fPpcre2grep\fP can override this to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can override
value by using --buffer-size on the command line. these values by using --buffer-size and --max-buffer-size on the command line.
. .
. .
.SH "PCRE2TEST OPTION FOR LIBREADLINE SUPPORT" .SH "PCRE2TEST OPTION FOR LIBREADLINE SUPPORT"
@ -532,6 +535,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 01 April 2016 Last updated: 07 October 2016
Copyright (c) 1997-2016 University of Cambridge. Copyright (c) 1997-2016 University of Cambridge.
.fi .fi

View File

@ -1,4 +1,4 @@
.TH PCRE2GREP 1 "19 June 2016" "PCRE2 10.22" .TH PCRE2GREP 1 "11 October 2016" "PCRE2 10.23"
.SH NAME .SH NAME
pcre2grep - a grep with Perl-compatible regular expressions. pcre2grep - a grep with Perl-compatible regular expressions.
.SH SYNOPSIS .SH SYNOPSIS
@ -52,11 +52,18 @@ span line boundaries. What defines a line boundary is controlled by the
\fB-N\fP (\fB--newline\fP) option. \fB-N\fP (\fB--newline\fP) option.
.P .P
The amount of memory used for buffering files that are being scanned is The amount of memory used for buffering files that are being scanned is
controlled by a parameter that can be set by the \fB--buffer-size\fP option. controlled by parameters that can be set by the \fB--buffer-size\fP and
The default value for this parameter is specified when \fBpcre2grep\fP is \fB--max-buffer-size\fP options. The first of these sets the size of buffer
built, with the default default being 20K. A block of memory three times this that is obtained at the start of processing. If an input file contains very
size is used (to allow for buffering "before" and "after" lines). An error long lines, a larger buffer may be needed; this is handled by automatically
occurs if a line overflows the buffer. extending the buffer, up to the limit specified by \fB--max-buffer-size\fP. The
default values for these parameters are specified when \fBpcre2grep\fP is
built, with the default defaults being 20K and 1M respectively. An error occurs
if a line is too long and the buffer can no longer be expanded.
.P
The block of memory that is actually used is three times the "buffer size", to
allow for buffering "before" and "after" lines. If the buffer size is too
small, fewer than requested "before" and "after" lines may be output.
.P .P
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater. Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater.
BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern
@ -126,24 +133,29 @@ command line starts with a hyphen but is not an option. This allows for the
processing of patterns and file names that start with hyphens. processing of patterns and file names that start with hyphens.
.TP .TP
\fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP \fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP
Output \fInumber\fP lines of context after each matching line. If file names Output up to \fInumber\fP lines of context after each matching line. Fewer
and/or line numbers are being output, a hyphen separator is used instead of a lines are output if the next match or the end of the file is reached, or if the
colon for the context lines. A line containing "--" is output between each processing buffer size has been set too small. If file names and/or line
group of lines, unless they are in fact contiguous in the input file. The value numbers are being output, a hyphen separator is used instead of a colon for the
of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP context lines. A line containing "--" is output between each group of lines,
guarantees to have up to 8K of following text available for context output. unless they are in fact contiguous in the input file. The value of \fInumber\fP
is expected to be relatively small. However, \fBpcre2grep\fP guarantees to have
up to 8K of following text available for context output.
.TP .TP
\fB-a\fP, \fB--text\fP \fB-a\fP, \fB--text\fP
Treat binary files as text. This is equivalent to Treat binary files as text. This is equivalent to
\fB--binary-files\fP=\fItext\fP. \fB--binary-files\fP=\fItext\fP.
.TP .TP
\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP \fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
Output \fInumber\fP lines of context before each matching line. If file names Output up to \fInumber\fP lines of context before each matching line. Fewer
and/or line numbers are being output, a hyphen separator is used instead of a lines are output if the previous match or the start of the file is within
colon for the context lines. A line containing "--" is output between each \fInumber\fP lines, or if the processing buffer size has been set too small. If
group of lines, unless they are in fact contiguous in the input file. The value file names and/or line numbers are being output, a hyphen separator is used
of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP instead of a colon for the context lines. A line containing "--" is output
guarantees to have up to 8K of preceding text available for context output. between each group of lines, unless they are in fact contiguous in the input
file. The value of \fInumber\fP is expected to be relatively small. However,
\fBpcre2grep\fP guarantees to have up to 8K of preceding text available for
context output.
.TP .TP
\fB--binary-files=\fP\fIword\fP \fB--binary-files=\fP\fIword\fP
Specify how binary files are to be processed. If the word is "binary" (the Specify how binary files are to be processed. If the word is "binary" (the
@ -158,8 +170,9 @@ be of interest and are skipped without causing any output or affecting the
return code. return code.
.TP .TP
\fB--buffer-size=\fP\fInumber\fP \fB--buffer-size=\fP\fInumber\fP
Set the parameter that controls how much memory is used for buffering files Set the parameter that controls how much memory is obtained at the start of
that are being scanned. processing for buffering files that are being scanned. See also
\fB--max-buffer-size\fP below.
.TP .TP
\fB-C\fP \fInumber\fP, \fB--context=\fP\fInumber\fP \fB-C\fP \fInumber\fP, \fB--context=\fP\fInumber\fP
Output \fInumber\fP lines of context both before and after each matching line. Output \fInumber\fP lines of context both before and after each matching line.
@ -432,6 +445,11 @@ of use only if it is set smaller than \fB--match-limit\fP.
There are no short forms for these options. The default settings are specified There are no short forms for these options. The default settings are specified
when the PCRE2 library is compiled, with the default default being 10 million. when the PCRE2 library is compiled, with the default default being 10 million.
.TP .TP
\fB--max-buffer-size=\fP\fInumber\fP
This limits the expansion of the processing buffer, whose initial size can be
set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
smaller than the starting buffer size.
.TP
\fB-M\fP, \fB--multiline\fP \fB-M\fP, \fB--multiline\fP
Allow patterns to match more than one line. When this option is given, patterns Allow patterns to match more than one line. When this option is given, patterns
may usefully contain literal newline characters and internal occurrences of ^ may usefully contain literal newline characters and internal occurrences of ^
@ -757,6 +775,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 19 June 2016 Last updated: 11 October 2016
Copyright (c) 1997-2016 University of Cambridge. Copyright (c) 1997-2016 University of Cambridge.
.fi .fi

View File

@ -206,7 +206,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_NAME "PCRE2" #define PACKAGE_NAME "PCRE2"
/* Define to the full name and version of this package. */ /* Define to the full name and version of this package. */
#define PACKAGE_STRING "PCRE2 10.22" #define PACKAGE_STRING "PCRE2 10.23-RC1"
/* Define to the one symbol short name of this package. */ /* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "pcre2" #define PACKAGE_TARNAME "pcre2"
@ -215,7 +215,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_URL "" #define PACKAGE_URL ""
/* Define to the version of this package. */ /* Define to the version of this package. */
#define PACKAGE_VERSION "10.22" #define PACKAGE_VERSION "10.23-RC1"
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested /* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
parentheses (of any kind) in a pattern. This limits the amount of system parentheses (of any kind) in a pattern. This limits the amount of system
@ -224,15 +224,24 @@ sure both macros are undefined; an emulation function will then be used. */
#define PARENS_NEST_LIMIT 250 #define PARENS_NEST_LIMIT 250
#endif #endif
/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by /* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
pcre2grep to hold parts of the file it is searching. This is also the pcre2grep to hold parts of the file it is searching. The buffer will be
minimum value. The actual amount of memory used by pcre2grep is three times expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing
this number, because it allows for the buffering of "before" and "after" very long lines. The actual amount of memory used by pcre2grep is three
lines. */ times this number, because it allows for the buffering of "before" and
"after" lines. */
#ifndef PCRE2GREP_BUFSIZE #ifndef PCRE2GREP_BUFSIZE
#define PCRE2GREP_BUFSIZE 20480 #define PCRE2GREP_BUFSIZE 20480
#endif #endif
/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
used by pcre2grep to hold parts of the file it is searching. The actual
amount of memory used by pcre2grep is three times this number, because it
allows for the buffering of "before" and "after" lines. */
#ifndef PCRE2GREP_MAX_BUFSIZE
#define PCRE2GREP_MAX_BUFSIZE 1048576
#endif
/* Define to any value to include debugging code. */ /* Define to any value to include debugging code. */
/* #undef PCRE2_DEBUG */ /* #undef PCRE2_DEBUG */
@ -299,7 +308,7 @@ sure both macros are undefined; an emulation function will then be used. */
/* #undef SUPPORT_VALGRIND */ /* #undef SUPPORT_VALGRIND */
/* Version number of package */ /* Version number of package */
#define VERSION "10.22" #define VERSION "10.23-RC1"
/* Define to empty if `const' does not conform to ANSI C. */ /* Define to empty if `const' does not conform to ANSI C. */
/* #undef const */ /* #undef const */

View File

@ -207,13 +207,20 @@ sure both macros are undefined; an emulation function will then be used. */
stack that is used while compiling a pattern. */ stack that is used while compiling a pattern. */
#undef PARENS_NEST_LIMIT #undef PARENS_NEST_LIMIT
/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by /* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
pcre2grep to hold parts of the file it is searching. This is also the pcre2grep to hold parts of the file it is searching. The buffer will be
minimum value. The actual amount of memory used by pcre2grep is three times expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing
this number, because it allows for the buffering of "before" and "after" very long lines. The actual amount of memory used by pcre2grep is three
lines. */ times this number, because it allows for the buffering of "before" and
"after" lines. */
#undef PCRE2GREP_BUFSIZE #undef PCRE2GREP_BUFSIZE
/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
used by pcre2grep to hold parts of the file it is searching. The actual
amount of memory used by pcre2grep is three times this number, because it
allows for the buffering of "before" and "after" lines. */
#undef PCRE2GREP_MAX_BUFSIZE
/* to make a symbol visible */ /* to make a symbol visible */
#undef PCRE2POSIX_EXP_DECL #undef PCRE2POSIX_EXP_DECL

View File

@ -173,6 +173,7 @@ static int before_context = 0;
static int binary_files = BIN_BINARY; static int binary_files = BIN_BINARY;
static int both_context = 0; static int both_context = 0;
static int bufthird = PCRE2GREP_BUFSIZE; static int bufthird = PCRE2GREP_BUFSIZE;
static int max_bufthird = PCRE2GREP_MAX_BUFSIZE;
static int bufsize = 3*PCRE2GREP_BUFSIZE; static int bufsize = 3*PCRE2GREP_BUFSIZE;
static int endlinetype; static int endlinetype;
@ -344,6 +345,7 @@ used to identify them. */
#define N_EXCLUDE_FROM (-19) #define N_EXCLUDE_FROM (-19)
#define N_INCLUDE_FROM (-20) #define N_INCLUDE_FROM (-20)
#define N_OM_SEPARATOR (-21) #define N_OM_SEPARATOR (-21)
#define N_MAX_BUFSIZE (-22)
static option_item optionlist[] = { static option_item optionlist[] = {
{ OP_NODATA, N_NULL, NULL, "", "terminate options" }, { OP_NODATA, N_NULL, NULL, "", "terminate options" },
@ -352,7 +354,8 @@ static option_item optionlist[] = {
{ OP_NODATA, 'a', NULL, "text", "treat binary files as text" }, { OP_NODATA, 'a', NULL, "text", "treat binary files as text" },
{ OP_NUMBER, 'B', &before_context, "before-context=number", "set number of prior context lines" }, { OP_NUMBER, 'B', &before_context, "before-context=number", "set number of prior context lines" },
{ OP_BINFILES, N_BINARY_FILES, NULL, "binary-files=word", "set treatment of binary files" }, { OP_BINFILES, N_BINARY_FILES, NULL, "binary-files=word", "set treatment of binary files" },
{ OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer size parameter" }, { OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer starting size" },
{ OP_NUMBER, N_MAX_BUFSIZE,&max_bufthird, "max-buffer-size=number", "set processing buffer maximum size" },
{ OP_OP_STRING, N_COLOUR, &colour_option, "color=option", "matched text color option" }, { OP_OP_STRING, N_COLOUR, &colour_option, "color=option", "matched text color option" },
{ OP_OP_STRING, N_COLOUR, &colour_option, "colour=option", "matched text colour option" }, { OP_OP_STRING, N_COLOUR, &colour_option, "colour=option", "matched text colour option" },
{ OP_NUMBER, 'C', &both_context, "context=number", "set number of context lines, before & after" }, { OP_NUMBER, 'C', &both_context, "context=number", "set number of context lines, before & after" },
@ -952,8 +955,9 @@ for (op = optionlist; op->one_char != 0; op++)
printf("%.*s%s" STDOUT_NL, n, " ", op->help_text); printf("%.*s%s" STDOUT_NL, n, " ", op->help_text);
} }
printf(STDOUT_NL "Numbers may be followed by K or M, e.g. --buffer-size=100K." STDOUT_NL); printf(STDOUT_NL "Numbers may be followed by K or M, e.g. --max-buffer-size=100K." STDOUT_NL);
printf("The default value for --buffer-size is %d." STDOUT_NL, PCRE2GREP_BUFSIZE); printf("The default value for --buffer-size is %d." STDOUT_NL, PCRE2GREP_BUFSIZE);
printf("The default value for --max-buffer-size is %d." STDOUT_NL, PCRE2GREP_MAX_BUFSIZE);
printf("When reading patterns or file names from a file, trailing white" STDOUT_NL); printf("When reading patterns or file names from a file, trailing white" STDOUT_NL);
printf("space is removed and blank lines are ignored." STDOUT_NL); printf("space is removed and blank lines are ignored." STDOUT_NL);
printf("The maximum size of any pattern is %d bytes." STDOUT_NL, MAXPATLEN); printf("The maximum size of any pattern is %d bytes." STDOUT_NL, MAXPATLEN);
@ -1100,12 +1104,12 @@ return om;
* Read one line of input * * Read one line of input *
*************************************************/ *************************************************/
/* Normally, input is read using fread() into a large buffer, so many lines may /* Normally, input is read using fread() (or gzread, or BZ2_read) into a large
be read at once. However, doing this for tty input means that no output appears buffer, so many lines may be read at once. However, doing this for tty input
until a lot of input has been typed. Instead, tty input is handled line by means that no output appears until a lot of input has been typed. Instead, tty
line. We cannot use fgets() for this, because it does not stop at a binary input is handled line by line. We cannot use fgets() for this, because it does
zero, and therefore there is no way of telling how many characters it has read, not stop at a binary zero, and therefore there is no way of telling how many
because there may be binary zeros embedded in the data. characters it has read, because there may be binary zeros embedded in the data.
Arguments: Arguments:
buffer the buffer to read into buffer the buffer to read into
@ -1424,17 +1428,18 @@ do_after_lines(int lastmatchnumber, char *lastmatchrestart, char *endptr,
if (after_context > 0 && lastmatchnumber > 0) if (after_context > 0 && lastmatchnumber > 0)
{ {
int count = 0; int count = 0;
while (lastmatchrestart < endptr && count++ < after_context) while (lastmatchrestart < endptr && count < after_context)
{ {
int ellength; int ellength;
char *pp = lastmatchrestart; char *pp = end_of_line(lastmatchrestart, endptr, &ellength);
if (ellength == 0 && pp == main_buffer + bufsize) break;
if (printname != NULL) fprintf(stdout, "%s-", printname); if (printname != NULL) fprintf(stdout, "%s-", printname);
if (number) fprintf(stdout, "%d-", lastmatchnumber++); if (number) fprintf(stdout, "%d-", lastmatchnumber++);
pp = end_of_line(pp, endptr, &ellength);
FWRITE(lastmatchrestart, 1, pp - lastmatchrestart, stdout); FWRITE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
lastmatchrestart = pp; lastmatchrestart = pp;
count++;
} }
hyphenpending = TRUE; if (count > 0) hyphenpending = TRUE;
} }
} }
@ -1769,6 +1774,33 @@ return result != 0;
/*************************************************
* Read a portion of the file into buffer *
*************************************************/
/* Read the next portion of an input into a buffer, using whichever reader
matches the kind of handle. With SUPPORT_LIBZ, FR_LIBZ handles are read with
gzread(); with SUPPORT_LIBBZ2, FR_LIBBZ2 handles are read with BZ2_bzread().
Everything else is treated as a stdio stream and is read either one line at a
time with read_one_line() or in bulk with fread().

Arguments:
  handle               the open input; cast to gzFile, BZFILE *, or FILE *
                         according to frtype
  frtype               FR_LIBZ or FR_LIBBZ2 to select a decompressing reader
                         (when compiled in); any other value means stdio
  buffer               where to put the data
  length               the maximum number of bytes to read
  input_line_buffered  TRUE to read a stdio stream via read_one_line()
                         instead of fread(); ignored for the other readers

Returns:               the result of the underlying read call, as an int
*/

static int
fill_buffer(void *handle, int frtype, char *buffer, int length,
  BOOL input_line_buffered)
{
(void)frtype;  /* Avoid unused-parameter warning when neither library is supported */

#ifdef SUPPORT_LIBZ
if (frtype == FR_LIBZ)
  return gzread((gzFile)handle, buffer, length);
else
#endif

#ifdef SUPPORT_LIBBZ2
if (frtype == FR_LIBBZ2)
  return BZ2_bzread((BZFILE *)handle, buffer, length);
else
#endif

/* The result cannot exceed "length", so narrowing it to int is safe; the cast
just makes the conversion from fread()'s size_t explicit. */

return (int)(input_line_buffered ?
  read_one_line(buffer, length, (FILE *)handle) :
  fread(buffer, 1, length, (FILE *)handle));
}
/************************************************* /*************************************************
* Grep an individual file * * Grep an individual file *
*************************************************/ *************************************************/
@ -1813,49 +1845,24 @@ BOOL endhyphenpending = FALSE;
BOOL input_line_buffered = line_buffered; BOOL input_line_buffered = line_buffered;
FILE *in = NULL; /* Ensure initialized */ FILE *in = NULL; /* Ensure initialized */
#ifdef SUPPORT_LIBZ
gzFile ingz = NULL;
#endif
#ifdef SUPPORT_LIBBZ2
BZFILE *inbz2 = NULL;
#endif
/* Do the first read into the start of the buffer and set up the pointer to end /* Do the first read into the start of the buffer and set up the pointer to end
of what we have. In the case of libz, a non-zipped .gz file will be read as a of what we have. In the case of libz, a non-zipped .gz file will be read as a
plain file. However, if a .bz2 file isn't actually bzipped, the first read will plain file. However, if a .bz2 file isn't actually bzipped, the first read will
fail. */ fail. */
(void)frtype; if (frtype != FR_LIBZ && frtype != FR_LIBBZ2)
#ifdef SUPPORT_LIBZ
if (frtype == FR_LIBZ)
{
ingz = (gzFile)handle;
bufflength = gzread (ingz, main_buffer, bufsize);
}
else
#endif
#ifdef SUPPORT_LIBBZ2
if (frtype == FR_LIBBZ2)
{
inbz2 = (BZFILE *)handle;
bufflength = BZ2_bzread(inbz2, main_buffer, bufsize);
if ((int)bufflength < 0) return 2; /* Gotcha: bufflength is size_t; */
} /* without the cast it is unsigned. */
else
#endif
{ {
in = (FILE *)handle; in = (FILE *)handle;
if (is_file_tty(in)) input_line_buffered = TRUE; if (is_file_tty(in)) input_line_buffered = TRUE;
bufflength = input_line_buffered?
read_one_line(main_buffer, bufsize, in) :
fread(main_buffer, 1, bufsize, in);
} }
bufflength = fill_buffer(handle, frtype, main_buffer, bufsize,
input_line_buffered);
#ifdef SUPPORT_LIBBZ2
if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2; /* Gotcha: bufflength is size_t; */
#endif
endptr = main_buffer + bufflength; endptr = main_buffer + bufflength;
/* Unless binary-files=text, see if we have a binary file. This uses the same /* Unless binary-files=text, see if we have a binary file. This uses the same
@ -1899,18 +1906,61 @@ while (ptr < endptr)
/* Check to see if the line we are looking at extends right to the very end /* Check to see if the line we are looking at extends right to the very end
of the buffer without a line terminator. This means the line is too long to of the buffer without a line terminator. This means the line is too long to
handle. */ handle at the current buffer size. Until the buffer reaches its maximum size,
try doubling it and reading more data. */
if (endlinelength == 0 && t == main_buffer + bufsize) if (endlinelength == 0 && t == main_buffer + bufsize)
{ {
fprintf(stderr, "pcre2grep: line %d%s%s is too long for the internal buffer\n" if (bufthird < max_bufthird)
"pcre2grep: the buffer size is %d\n" {
"pcre2grep: use the --buffer-size option to change it\n", char *new_buffer;
linenumber, int new_bufthird = 2*bufthird;
(filename == NULL)? "" : " of file ",
(filename == NULL)? "" : filename, if (new_bufthird > max_bufthird) new_bufthird = max_bufthird;
bufthird); new_buffer = (char *)malloc(3*new_bufthird);
return 2;
if (new_buffer == NULL)
{
fprintf(stderr,
"pcre2grep: line %d%s%s is too long for the internal buffer\n"
"pcre2grep: not enough memory to increase the buffer size to %d\n",
linenumber,
(filename == NULL)? "" : " of file ",
(filename == NULL)? "" : filename,
new_bufthird);
return 2;
}
/* Copy the data and adjust pointers to the new buffer location. */
memcpy(new_buffer, main_buffer, bufsize);
bufthird = new_bufthird;
bufsize = 3*bufthird;
ptr = new_buffer + (ptr - main_buffer);
lastmatchrestart = new_buffer + (lastmatchrestart - main_buffer);
free(main_buffer);
main_buffer = new_buffer;
/* Read more data into the buffer and then try to find the line ending
again. */
bufflength += fill_buffer(handle, frtype, main_buffer + bufflength,
bufsize - bufflength, input_line_buffered);
endptr = main_buffer + bufflength;
continue;
}
else
{
fprintf(stderr,
"pcre2grep: line %d%s%s is too long for the internal buffer\n"
"pcre2grep: the maximum buffer size is %d\n"
"pcre2grep: use the --max-buffer-size option to change it\n",
linenumber,
(filename == NULL)? "" : " of file ",
(filename == NULL)? "" : filename,
bufthird);
return 2;
}
} }
/* Extra processing for Jeffrey Friedl's debugging. */ /* Extra processing for Jeffrey Friedl's debugging. */
@ -2320,8 +2370,9 @@ while (ptr < endptr)
lastmatchnumber > 0 && lastmatchnumber > 0 &&
lastmatchrestart < main_buffer + bufthird) lastmatchrestart < main_buffer + bufthird)
{ {
do_after_lines(lastmatchnumber, lastmatchrestart, endptr, printname); do_after_lines(lastmatchnumber, lastmatchrestart, endptr, printname);
lastmatchnumber = 0; lastmatchnumber = 0; /* Indicates no after lines pending */
} }
/* Now do the shuffle */ /* Now do the shuffle */
@ -2329,24 +2380,8 @@ while (ptr < endptr)
memmove(main_buffer, main_buffer + bufthird, 2*bufthird); memmove(main_buffer, main_buffer + bufthird, 2*bufthird);
ptr -= bufthird; ptr -= bufthird;
#ifdef SUPPORT_LIBZ bufflength = 2*bufthird + fill_buffer(handle, frtype,
if (frtype == FR_LIBZ) main_buffer + 2*bufthird, bufthird, input_line_buffered);
bufflength = 2*bufthird +
gzread (ingz, main_buffer + 2*bufthird, bufthird);
else
#endif
#ifdef SUPPORT_LIBBZ2
if (frtype == FR_LIBBZ2)
bufflength = 2*bufthird +
BZ2_bzread(inbz2, main_buffer + 2*bufthird, bufthird);
else
#endif
bufflength = 2*bufthird +
(input_line_buffered?
read_one_line(main_buffer + 2*bufthird, bufthird, in) :
fread(main_buffer + 2*bufthird, 1, bufthird, in));
endptr = main_buffer + bufflength; endptr = main_buffer + bufflength;
/* Adjust any last match point */ /* Adjust any last match point */
@ -3427,6 +3462,12 @@ if (jfriedl_XT != 0 || jfriedl_XR != 0)
/* Get memory for the main buffer. */ /* Get memory for the main buffer. */
if (bufthird <= 0)
{
fprintf(stderr, "pcre2grep: --buffer-size must be greater than zero\n");
goto EXIT2;
}
bufsize = 3*bufthird; bufsize = 3*bufthird;
main_buffer = (char *)malloc(bufsize); main_buffer = (char *)malloc(bufsize);

4
testdata/grepoutput vendored
View File

@ -637,8 +637,8 @@ RC=0
RC=0 RC=0
---------------------------- Test 83 ----------------------------- ---------------------------- Test 83 -----------------------------
pcre2grep: line 4 of file ./testdata/grepinput3 is too long for the internal buffer pcre2grep: line 4 of file ./testdata/grepinput3 is too long for the internal buffer
pcre2grep: the buffer size is 100 pcre2grep: the maximum buffer size is 100
pcre2grep: use the --buffer-size option to change it pcre2grep: use the --max-buffer-size option to change it
RC=2 RC=2
---------------------------- Test 84 ----------------------------- ---------------------------- Test 84 -----------------------------
testdata/grepinputv:fox jumps testdata/grepinputv:fox jumps