Implement buffer expansion in pcre2grep.

This commit is contained in:
Philip.Hazel 2016-10-11 16:40:09 +00:00
parent b451e9f3b5
commit bf6f0bb335
12 changed files with 287 additions and 168 deletions

View File

@ -76,6 +76,7 @@
# a new option instead of being unconditional.
# 2016-10-05 PH fixed a typo (PCRE should be PCRE2) in above patch
# fix by David Gaussmann
# 2016-10-07 PH added PCRE2GREP_MAX_BUFSIZE
PROJECT(PCRE2 C)
@ -148,7 +149,10 @@ SET(PCRE2_MATCH_LIMIT_RECURSION "MATCH_LIMIT" CACHE STRING
"Default limit on internal recursion. See MATCH_LIMIT_RECURSION in config.h.in for details.")
SET(PCRE2GREP_BUFSIZE "20480" CACHE STRING
"Buffer size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.")
"Buffer starting size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.")
# Cache entry giving the default maximum size to which pcre2grep may expand its
# buffer (default 1048576).
# NOTE(review): configure.ac raises the maximum to the starting size when the
# starting size is the larger of the two; no equivalent adjustment is visible
# here - confirm whether one is wanted for CMake builds.
SET(PCRE2GREP_MAX_BUFSIZE "1048576" CACHE STRING
"Buffer maximum size parameter for pcre2grep. See PCRE2GREP_MAX_BUFSIZE in config.h.in for details.")
SET(PCRE2_NEWLINE "LF" CACHE STRING
"What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF).")

View File

@ -61,6 +61,10 @@ escape sequence for a character whose code point was greater than \x{ff}.
9. Change 19 for 10.22 had a typo (PCRE_STATIC_RUNTIME should be
PCRE2_STATIC_RUNTIME). Fix from David Gaussmann.
10. Added --max-buffer-size to pcre2grep, to allow for automatic buffer
expansion when long lines are encountered. Original patch by Dmitry
Cherniachenko.
Version 10.22 29-July-2016
--------------------------

18
README
View File

@ -339,12 +339,22 @@ library. They are also documented in the pcre2build man page.
Of course, the relevant libraries must be installed on your system.
. The default size (in bytes) of the internal buffer used by pcre2grep can be
set by, for example:
. The default starting size (in bytes) of the internal buffer used by pcre2grep
can be set by, for example:
--with-pcre2grep-bufsize=51200
The value must be a plain integer. The default is 20480.
The value must be a plain integer. The default is 20480. The amount of memory
used by pcre2grep is actually three times this number, to allow for "before"
and "after" lines.
. The default maximum size of pcre2grep's internal buffer can be set by, for
example:
--with-pcre2grep-max-bufsize=2097152
The default is either 1048576 or the value of --with-pcre2grep-bufsize,
whichever is the larger.
. It is possible to compile pcre2test so that it links with the libreadline
or libedit libraries, by specifying, respectively,
@ -845,4 +855,4 @@ The distribution should contain the files listed below.
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
Last updated: 01 April 2016
Last updated: 07 October 2016

View File

@ -440,7 +440,7 @@ echo "---------------------------- Test 82 -----------------------------" >>test
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 83 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=100 "^a" ./testdata/grepinput3) >>testtrygrep 2>&1
(cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=10 --max-buffer-size=100 "^a" ./testdata/grepinput3) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 84 -----------------------------" >>testtrygrep

View File

@ -41,6 +41,7 @@
#define NEWLINE_DEFAULT @NEWLINE_DEFAULT@
#define PARENS_NEST_LIMIT @PCRE2_PARENS_NEST_LIMIT@
#define PCRE2GREP_BUFSIZE @PCRE2GREP_BUFSIZE@
#define PCRE2GREP_MAX_BUFSIZE @PCRE2GREP_MAX_BUFSIZE@
#define MAX_NAME_SIZE 32
#define MAX_NAME_COUNT 10000

View File

@ -240,9 +240,15 @@ AC_ARG_ENABLE(pcre2grep-libbz2,
# Handle --with-pcre2grep-bufsize=N
AC_ARG_WITH(pcre2grep-bufsize,
AS_HELP_STRING([--with-pcre2grep-bufsize=N],
[pcre2grep buffer size (default=20480, minimum=8192)]),
[pcre2grep initial buffer size (default=20480, minimum=8192)]),
, with_pcre2grep_bufsize=20480)
# Handle --with-pcre2grep-max-bufsize=N
AC_ARG_WITH(pcre2grep-max-bufsize,
AS_HELP_STRING([--with-pcre2grep-max-bufsize=N],
[pcre2grep maximum buffer size (default=1048576, minimum=8192)]),
, with_pcre2grep_max_bufsize=1048576)
# Handle --enable-pcre2test-libedit
AC_ARG_ENABLE(pcre2test-libedit,
AS_HELP_STRING([--enable-pcre2test-libedit],
@ -612,11 +618,26 @@ else
fi
fi
# Make sure the maximum buffer size is no smaller than the starting size. When
# the -lt comparison does not succeed, $? in the else branch is the exit status
# of that test command: 1 means the comparison was simply false, whereas a
# status greater than 1 means that test itself failed (for example, because a
# value is not an integer), in which case the setting is rejected.
if test $with_pcre2grep_max_bufsize -lt $with_pcre2grep_bufsize ; then
with_pcre2grep_max_bufsize="$with_pcre2grep_bufsize"
else
if test $? -gt 1 ; then
AC_MSG_ERROR([Bad value for --with-pcre2grep-max-bufsize])
fi
fi
AC_DEFINE_UNQUOTED([PCRE2GREP_BUFSIZE], [$with_pcre2grep_bufsize], [
The value of PCRE2GREP_BUFSIZE determines the size of buffer used by pcre2grep
to hold parts of the file it is searching. This is also the minimum value.
The actual amount of memory used by pcre2grep is three times this number,
because it allows for the buffering of "before" and "after" lines.])
The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
pcre2grep to hold parts of the file it is searching. The buffer will be
expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing very
long lines. The actual amount of memory used by pcre2grep is three times this
number, because it allows for the buffering of "before" and "after" lines.])
AC_DEFINE_UNQUOTED([PCRE2GREP_MAX_BUFSIZE], [$with_pcre2grep_max_bufsize], [
The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
used by pcre2grep to hold parts of the file it is searching. The actual
amount of memory used by pcre2grep is three times this number, because it
allows for the buffering of "before" and "after" lines.])
if test "$enable_pcre2test_libedit" = "yes"; then
AC_DEFINE([SUPPORT_LIBEDIT], [], [
@ -906,43 +927,44 @@ cat <<EOF
$PACKAGE-$VERSION configuration summary:
Install prefix .................. : ${prefix}
C preprocessor .................. : ${CPP}
C compiler ...................... : ${CC}
Linker .......................... : ${LD}
C preprocessor flags ............ : ${CPPFLAGS}
C compiler flags ................ : ${CFLAGS} ${VISIBILITY_CFLAGS}
Linker flags .................... : ${LDFLAGS}
Extra libraries ................. : ${LIBS}
Install prefix ..................... : ${prefix}
C preprocessor ..................... : ${CPP}
C compiler ......................... : ${CC}
Linker ............................. : ${LD}
C preprocessor flags ............... : ${CPPFLAGS}
C compiler flags ................... : ${CFLAGS} ${VISIBILITY_CFLAGS}
Linker flags ....................... : ${LDFLAGS}
Extra libraries .................... : ${LIBS}
Build 8-bit pcre2 library ....... : ${enable_pcre2_8}
Build 16-bit pcre2 library ...... : ${enable_pcre2_16}
Build 32-bit pcre2 library ...... : ${enable_pcre2_32}
Include debugging code .......... : ${enable_debug}
Enable JIT compiling support .... : ${enable_jit}
Enable Unicode support .......... : ${enable_unicode}
Newline char/sequence ........... : ${enable_newline}
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
\C is disabled .................. : ${enable_never_backslash_C}
EBCDIC coding ................... : ${enable_ebcdic}
EBCDIC code for NL .............. : ${ebcdic_nl_code}
Rebuild char tables ............. : ${enable_rebuild_chartables}
Use stack recursion ............. : ${enable_stack_for_recursion}
Internal link size .............. : ${with_link_size}
Nested parentheses limit ........ : ${with_parens_nest_limit}
Match limit ..................... : ${with_match_limit}
Match limit recursion ........... : ${with_match_limit_recursion}
Build shared libs ............... : ${enable_shared}
Build static libs ............... : ${enable_static}
Use JIT in pcre2grep ............ : ${enable_pcre2grep_jit}
Enable callouts in pcre2grep .... : ${enable_pcre2grep_callout}
Buffer size for pcre2grep ....... : ${with_pcre2grep_bufsize}
Link pcre2grep with libz ........ : ${enable_pcre2grep_libz}
Link pcre2grep with libbz2 ...... : ${enable_pcre2grep_libbz2}
Link pcre2test with libedit ..... : ${enable_pcre2test_libedit}
Link pcre2test with libreadline . : ${enable_pcre2test_libreadline}
Valgrind support ................ : ${enable_valgrind}
Code coverage ................... : ${enable_coverage}
Build 8-bit pcre2 library .......... : ${enable_pcre2_8}
Build 16-bit pcre2 library ......... : ${enable_pcre2_16}
Build 32-bit pcre2 library ......... : ${enable_pcre2_32}
Include debugging code ............. : ${enable_debug}
Enable JIT compiling support ....... : ${enable_jit}
Enable Unicode support ............. : ${enable_unicode}
Newline char/sequence .............. : ${enable_newline}
\R matches only ANYCRLF ............ : ${enable_bsr_anycrlf}
\C is disabled ..................... : ${enable_never_backslash_C}
EBCDIC coding ...................... : ${enable_ebcdic}
EBCDIC code for NL ................. : ${ebcdic_nl_code}
Rebuild char tables ................ : ${enable_rebuild_chartables}
Use stack recursion ................ : ${enable_stack_for_recursion}
Internal link size ................. : ${with_link_size}
Nested parentheses limit ........... : ${with_parens_nest_limit}
Match limit ........................ : ${with_match_limit}
Match limit recursion .............. : ${with_match_limit_recursion}
Build shared libs .................. : ${enable_shared}
Build static libs .................. : ${enable_static}
Use JIT in pcre2grep ............... : ${enable_pcre2grep_jit}
Enable callouts in pcre2grep ....... : ${enable_pcre2grep_callout}
Initial buffer size for pcre2grep .. : ${with_pcre2grep_bufsize}
Maximum buffer size for pcre2grep .. : ${with_pcre2grep_max_bufsize}
Link pcre2grep with libz ........... : ${enable_pcre2grep_libz}
Link pcre2grep with libbz2 ......... : ${enable_pcre2grep_libbz2}
Link pcre2test with libedit ........ : ${enable_pcre2test_libedit}
Link pcre2test with libreadline .... : ${enable_pcre2test_libreadline}
Valgrind support ................... : ${enable_valgrind}
Code coverage ...................... : ${enable_coverage}
EOF

View File

@ -1,4 +1,4 @@
.TH PCRE2BUILD 3 "01 April 2016" "PCRE2 10.22"
.TH PCRE2BUILD 3 "07 October 2016" "PCRE2 10.23"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.
@ -385,16 +385,19 @@ they are not.
.sp
\fBpcre2grep\fP uses an internal buffer to hold a "window" on the file it is
scanning, in order to be able to output "before" and "after" lines when it
finds a match. The size of the buffer is controlled by a parameter whose
default value is 20K. The buffer itself is three times this size, but because
of the way it is used for holding "before" lines, the longest line that is
guaranteed to be processable is the parameter size. You can change the default
parameter value by adding, for example,
finds a match. The starting size of the buffer is controlled by a parameter
whose default value is 20K. The buffer itself is three times this size, but
because of the way it is used for holding "before" lines, the longest line that
is guaranteed to be processable is the parameter size. If a longer line is
encountered, \fBpcre2grep\fP automatically expands the buffer, up to a
specified maximum size, whose default is 1M or the starting size, whichever is
the larger. You can change the default parameter values by adding, for example,
.sp
--with-pcre2grep-bufsize=50K
--with-pcre2grep-bufsize=51200
--with-pcre2grep-max-bufsize=2097152
.sp
to the \fBconfigure\fP command. The caller of \fPpcre2grep\fP can override this
value by using --buffer-size on the command line.
to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can override
these values by using --buffer-size and --max-buffer-size on the command line.
.
.
.SH "PCRE2TEST OPTION FOR LIBREADLINE SUPPORT"
@ -532,6 +535,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 01 April 2016
Last updated: 07 October 2016
Copyright (c) 1997-2016 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2GREP 1 "19 June 2016" "PCRE2 10.22"
.TH PCRE2GREP 1 "11 October 2016" "PCRE2 10.23"
.SH NAME
pcre2grep - a grep with Perl-compatible regular expressions.
.SH SYNOPSIS
@ -52,11 +52,18 @@ span line boundaries. What defines a line boundary is controlled by the
\fB-N\fP (\fB--newline\fP) option.
.P
The amount of memory used for buffering files that are being scanned is
controlled by a parameter that can be set by the \fB--buffer-size\fP option.
The default value for this parameter is specified when \fBpcre2grep\fP is
built, with the default default being 20K. A block of memory three times this
size is used (to allow for buffering "before" and "after" lines). An error
occurs if a line overflows the buffer.
controlled by parameters that can be set by the \fB--buffer-size\fP and
\fB--max-buffer-size\fP options. The first of these sets the size of buffer
that is obtained at the start of processing. If an input file contains very
long lines, a larger buffer may be needed; this is handled by automatically
extending the buffer, up to the limit specified by \fB--max-buffer-size\fP. The
default values for these parameters are specified when \fBpcre2grep\fP is
built, with the default defaults being 20K and 1M respectively. An error occurs
if a line is too long and the buffer can no longer be expanded.
.P
The block of memory that is actually used is three times the "buffer size", to
allow for buffering "before" and "after" lines. If the buffer size is too
small, fewer than requested "before" and "after" lines may be output.
.P
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater.
BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern
@ -126,24 +133,29 @@ command line starts with a hyphen but is not an option. This allows for the
processing of patterns and file names that start with hyphens.
.TP
\fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP
Output \fInumber\fP lines of context after each matching line. If file names
and/or line numbers are being output, a hyphen separator is used instead of a
colon for the context lines. A line containing "--" is output between each
group of lines, unless they are in fact contiguous in the input file. The value
of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP
guarantees to have up to 8K of following text available for context output.
Output up to \fInumber\fP lines of context after each matching line. Fewer
lines are output if the next match or the end of the file is reached, or if the
processing buffer size has been set too small. If file names and/or line
numbers are being output, a hyphen separator is used instead of a colon for the
context lines. A line containing "--" is output between each group of lines,
unless they are in fact contiguous in the input file. The value of \fInumber\fP
is expected to be relatively small. However, \fBpcre2grep\fP guarantees to have
up to 8K of following text available for context output.
.TP
\fB-a\fP, \fB--text\fP
Treat binary files as text. This is equivalent to
\fB--binary-files\fP=\fItext\fP.
.TP
\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
Output \fInumber\fP lines of context before each matching line. If file names
and/or line numbers are being output, a hyphen separator is used instead of a
colon for the context lines. A line containing "--" is output between each
group of lines, unless they are in fact contiguous in the input file. The value
of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP
guarantees to have up to 8K of preceding text available for context output.
Output up to \fInumber\fP lines of context before each matching line. Fewer
lines are output if the previous match or the start of the file is within
\fInumber\fP lines, or if the processing buffer size has been set too small. If
file names and/or line numbers are being output, a hyphen separator is used
instead of a colon for the context lines. A line containing "--" is output
between each group of lines, unless they are in fact contiguous in the input
file. The value of \fInumber\fP is expected to be relatively small. However,
\fBpcre2grep\fP guarantees to have up to 8K of preceding text available for
context output.
.TP
\fB--binary-files=\fP\fIword\fP
Specify how binary files are to be processed. If the word is "binary" (the
@ -158,8 +170,9 @@ be of interest and are skipped without causing any output or affecting the
return code.
.TP
\fB--buffer-size=\fP\fInumber\fP
Set the parameter that controls how much memory is used for buffering files
that are being scanned.
Set the parameter that controls how much memory is obtained at the start of
processing for buffering files that are being scanned. See also
\fB--max-buffer-size\fP below.
.TP
\fB-C\fP \fInumber\fP, \fB--context=\fP\fInumber\fP
Output \fInumber\fP lines of context both before and after each matching line.
@ -432,6 +445,11 @@ of use only if it is set smaller than \fB--match-limit\fP.
There are no short forms for these options. The default settings are specified
when the PCRE2 library is compiled, with the default default being 10 million.
.TP
\fB--max-buffer-size=\fP\fInumber\fP
This limits the expansion of the processing buffer, whose initial size can be
set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
smaller than the starting buffer size.
.TP
\fB-M\fP, \fB--multiline\fP
Allow patterns to match more than one line. When this option is given, patterns
may usefully contain literal newline characters and internal occurrences of ^
@ -757,6 +775,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 19 June 2016
Last updated: 11 October 2016
Copyright (c) 1997-2016 University of Cambridge.
.fi

View File

@ -206,7 +206,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_NAME "PCRE2"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "PCRE2 10.22"
#define PACKAGE_STRING "PCRE2 10.23-RC1"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "pcre2"
@ -215,7 +215,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "10.22"
#define PACKAGE_VERSION "10.23-RC1"
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
parentheses (of any kind) in a pattern. This limits the amount of system
@ -224,15 +224,24 @@ sure both macros are undefined; an emulation function will then be used. */
#define PARENS_NEST_LIMIT 250
#endif
/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by
pcre2grep to hold parts of the file it is searching. This is also the
minimum value. The actual amount of memory used by pcre2grep is three times
this number, because it allows for the buffering of "before" and "after"
lines. */
/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
pcre2grep to hold parts of the file it is searching. The buffer will be
expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing
very long lines. The actual amount of memory used by pcre2grep is three
times this number, because it allows for the buffering of "before" and
"after" lines. */
#ifndef PCRE2GREP_BUFSIZE
#define PCRE2GREP_BUFSIZE 20480
#endif
/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
used by pcre2grep to hold parts of the file it is searching. The actual
amount of memory used by pcre2grep is three times this number, because it
allows for the buffering of "before" and "after" lines. */
#ifndef PCRE2GREP_MAX_BUFSIZE
#define PCRE2GREP_MAX_BUFSIZE 1048576
#endif
/* Define to any value to include debugging code. */
/* #undef PCRE2_DEBUG */
@ -299,7 +308,7 @@ sure both macros are undefined; an emulation function will then be used. */
/* #undef SUPPORT_VALGRIND */
/* Version number of package */
#define VERSION "10.22"
#define VERSION "10.23-RC1"
/* Define to empty if `const' does not conform to ANSI C. */
/* #undef const */

View File

@ -207,13 +207,20 @@ sure both macros are undefined; an emulation function will then be used. */
stack that is used while compiling a pattern. */
#undef PARENS_NEST_LIMIT
/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by
pcre2grep to hold parts of the file it is searching. This is also the
minimum value. The actual amount of memory used by pcre2grep is three times
this number, because it allows for the buffering of "before" and "after"
lines. */
/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
pcre2grep to hold parts of the file it is searching. The buffer will be
expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing
very long lines. The actual amount of memory used by pcre2grep is three
times this number, because it allows for the buffering of "before" and
"after" lines. */
#undef PCRE2GREP_BUFSIZE
/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
used by pcre2grep to hold parts of the file it is searching. The actual
amount of memory used by pcre2grep is three times this number, because it
allows for the buffering of "before" and "after" lines. */
#undef PCRE2GREP_MAX_BUFSIZE
/* to make a symbol visible */
#undef PCRE2POSIX_EXP_DECL

View File

@ -173,6 +173,7 @@ static int before_context = 0;
static int binary_files = BIN_BINARY;
static int both_context = 0;
static int bufthird = PCRE2GREP_BUFSIZE;
static int max_bufthird = PCRE2GREP_MAX_BUFSIZE;
static int bufsize = 3*PCRE2GREP_BUFSIZE;
static int endlinetype;
@ -344,6 +345,7 @@ used to identify them. */
#define N_EXCLUDE_FROM (-19)
#define N_INCLUDE_FROM (-20)
#define N_OM_SEPARATOR (-21)
#define N_MAX_BUFSIZE (-22)
static option_item optionlist[] = {
{ OP_NODATA, N_NULL, NULL, "", "terminate options" },
@ -352,7 +354,8 @@ static option_item optionlist[] = {
{ OP_NODATA, 'a', NULL, "text", "treat binary files as text" },
{ OP_NUMBER, 'B', &before_context, "before-context=number", "set number of prior context lines" },
{ OP_BINFILES, N_BINARY_FILES, NULL, "binary-files=word", "set treatment of binary files" },
{ OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer size parameter" },
{ OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer starting size" },
{ OP_NUMBER, N_MAX_BUFSIZE,&max_bufthird, "max-buffer-size=number", "set processing buffer maximum size" },
{ OP_OP_STRING, N_COLOUR, &colour_option, "color=option", "matched text color option" },
{ OP_OP_STRING, N_COLOUR, &colour_option, "colour=option", "matched text colour option" },
{ OP_NUMBER, 'C', &both_context, "context=number", "set number of context lines, before & after" },
@ -952,8 +955,9 @@ for (op = optionlist; op->one_char != 0; op++)
printf("%.*s%s" STDOUT_NL, n, " ", op->help_text);
}
printf(STDOUT_NL "Numbers may be followed by K or M, e.g. --buffer-size=100K." STDOUT_NL);
printf(STDOUT_NL "Numbers may be followed by K or M, e.g. --max-buffer-size=100K." STDOUT_NL);
printf("The default value for --buffer-size is %d." STDOUT_NL, PCRE2GREP_BUFSIZE);
printf("The default value for --max-buffer-size is %d." STDOUT_NL, PCRE2GREP_MAX_BUFSIZE);
printf("When reading patterns or file names from a file, trailing white" STDOUT_NL);
printf("space is removed and blank lines are ignored." STDOUT_NL);
printf("The maximum size of any pattern is %d bytes." STDOUT_NL, MAXPATLEN);
@ -1100,12 +1104,12 @@ return om;
* Read one line of input *
*************************************************/
/* Normally, input is read using fread() into a large buffer, so many lines may
be read at once. However, doing this for tty input means that no output appears
until a lot of input has been typed. Instead, tty input is handled line by
line. We cannot use fgets() for this, because it does not stop at a binary
zero, and therefore there is no way of telling how many characters it has read,
because there may be binary zeros embedded in the data.
/* Normally, input is read using fread() (or gzread, or BZ2_read) into a large
buffer, so many lines may be read at once. However, doing this for tty input
means that no output appears until a lot of input has been typed. Instead, tty
input is handled line by line. We cannot use fgets() for this, because it does
not stop at a binary zero, and therefore there is no way of telling how many
characters it has read, because there may be binary zeros embedded in the data.
Arguments:
buffer the buffer to read into
@ -1424,17 +1428,18 @@ do_after_lines(int lastmatchnumber, char *lastmatchrestart, char *endptr,
if (after_context > 0 && lastmatchnumber > 0)
{
int count = 0;
while (lastmatchrestart < endptr && count++ < after_context)
while (lastmatchrestart < endptr && count < after_context)
{
int ellength;
char *pp = lastmatchrestart;
char *pp = end_of_line(lastmatchrestart, endptr, &ellength);
if (ellength == 0 && pp == main_buffer + bufsize) break;
if (printname != NULL) fprintf(stdout, "%s-", printname);
if (number) fprintf(stdout, "%d-", lastmatchnumber++);
pp = end_of_line(pp, endptr, &ellength);
FWRITE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
lastmatchrestart = pp;
count++;
}
hyphenpending = TRUE;
if (count > 0) hyphenpending = TRUE;
}
}
@ -1769,6 +1774,33 @@ return result != 0;
/*************************************************
* Read a portion of the file into buffer *
*************************************************/
/* Reads up to "length" bytes from an open input into "buffer", using whichever
reader matches the way the input was opened. This keeps the libz/libbz2/stdio
selection in one place so that every caller reads data in the same way.

Arguments:
  handle               the open input: treated as a gzFile when frtype is
                         FR_LIBZ, as a BZFILE * when frtype is FR_LIBBZ2, and
                         as a FILE * otherwise
  frtype               FR_LIBZ or FR_LIBBZ2 (honoured only when the
                         corresponding SUPPORT_xxx macro is defined); any other
                         value selects stdio input
  buffer               where to put the data
  length               the maximum number of bytes to read
  input_line_buffered  TRUE to read stdio input with read_one_line() instead
                         of fread(); ignored for libz and libbz2 input

Returns:               the result of the underlying read call, converted to
                         int. This is normally the number of bytes read, but it
                         may be negative when a libz or libbz2 read fails, so
                         callers should check it before using it as a length.
*/

static int
fill_buffer(void *handle, int frtype, char *buffer, int length,
  BOOL input_line_buffered)
{
(void)frtype;  /* Avoid an unused-parameter warning when neither SUPPORT_LIBZ
                  nor SUPPORT_LIBBZ2 is defined */

#ifdef SUPPORT_LIBZ
if (frtype == FR_LIBZ)
  return gzread((gzFile)handle, buffer, length);
else
#endif

#ifdef SUPPORT_LIBBZ2
if (frtype == FR_LIBBZ2)
  return BZ2_bzread((BZFILE *)handle, buffer, length);
else
#endif

/* The dangling "else" lines above attach to this statement, so it is reached
only for input that is not handled by libz or libbz2. */

return (input_line_buffered ?
  read_one_line(buffer, length, (FILE *)handle) :
  fread(buffer, 1, length, (FILE *)handle));
}
/*************************************************
* Grep an individual file *
*************************************************/
@ -1813,49 +1845,24 @@ BOOL endhyphenpending = FALSE;
BOOL input_line_buffered = line_buffered;
FILE *in = NULL; /* Ensure initialized */
#ifdef SUPPORT_LIBZ
gzFile ingz = NULL;
#endif
#ifdef SUPPORT_LIBBZ2
BZFILE *inbz2 = NULL;
#endif
/* Do the first read into the start of the buffer and set up the pointer to end
of what we have. In the case of libz, a non-zipped .gz file will be read as a
plain file. However, if a .bz2 file isn't actually bzipped, the first read will
fail. */
(void)frtype;
#ifdef SUPPORT_LIBZ
if (frtype == FR_LIBZ)
{
ingz = (gzFile)handle;
bufflength = gzread (ingz, main_buffer, bufsize);
}
else
#endif
#ifdef SUPPORT_LIBBZ2
if (frtype == FR_LIBBZ2)
{
inbz2 = (BZFILE *)handle;
bufflength = BZ2_bzread(inbz2, main_buffer, bufsize);
if ((int)bufflength < 0) return 2; /* Gotcha: bufflength is size_t; */
} /* without the cast it is unsigned. */
else
#endif
if (frtype != FR_LIBZ && frtype != FR_LIBBZ2)
{
in = (FILE *)handle;
if (is_file_tty(in)) input_line_buffered = TRUE;
bufflength = input_line_buffered?
read_one_line(main_buffer, bufsize, in) :
fread(main_buffer, 1, bufsize, in);
}
bufflength = fill_buffer(handle, frtype, main_buffer, bufsize,
input_line_buffered);
#ifdef SUPPORT_LIBBZ2
if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2; /* Gotcha: bufflength is size_t; */
#endif
endptr = main_buffer + bufflength;
/* Unless binary-files=text, see if we have a binary file. This uses the same
@ -1899,19 +1906,62 @@ while (ptr < endptr)
/* Check to see if the line we are looking at extends right to the very end
of the buffer without a line terminator. This means the line is too long to
handle. */
handle at the current buffer size. Until the buffer reaches its maximum size,
try doubling it and reading more data. */
if (endlinelength == 0 && t == main_buffer + bufsize)
{
fprintf(stderr, "pcre2grep: line %d%s%s is too long for the internal buffer\n"
"pcre2grep: the buffer size is %d\n"
"pcre2grep: use the --buffer-size option to change it\n",
if (bufthird < max_bufthird)
{
char *new_buffer;
int new_bufthird = 2*bufthird;
if (new_bufthird > max_bufthird) new_bufthird = max_bufthird;
new_buffer = (char *)malloc(3*new_bufthird);
if (new_buffer == NULL)
{
fprintf(stderr,
"pcre2grep: line %d%s%s is too long for the internal buffer\n"
"pcre2grep: not enough memory to increase the buffer size to %d\n",
linenumber,
(filename == NULL)? "" : " of file ",
(filename == NULL)? "" : filename,
new_bufthird);
return 2;
}
/* Copy the data and adjust pointers to the new buffer location. */
memcpy(new_buffer, main_buffer, bufsize);
bufthird = new_bufthird;
bufsize = 3*bufthird;
ptr = new_buffer + (ptr - main_buffer);
lastmatchrestart = new_buffer + (lastmatchrestart - main_buffer);
free(main_buffer);
main_buffer = new_buffer;
/* Read more data into the buffer and then try to find the line ending
again. */
bufflength += fill_buffer(handle, frtype, main_buffer + bufflength,
bufsize - bufflength, input_line_buffered);
endptr = main_buffer + bufflength;
continue;
}
else
{
fprintf(stderr,
"pcre2grep: line %d%s%s is too long for the internal buffer\n"
"pcre2grep: the maximum buffer size is %d\n"
"pcre2grep: use the --max-buffer-size option to change it\n",
linenumber,
(filename == NULL)? "" : " of file ",
(filename == NULL)? "" : filename,
bufthird);
return 2;
}
}
/* Extra processing for Jeffrey Friedl's debugging. */
@ -2320,8 +2370,9 @@ while (ptr < endptr)
lastmatchnumber > 0 &&
lastmatchrestart < main_buffer + bufthird)
{
do_after_lines(lastmatchnumber, lastmatchrestart, endptr, printname);
lastmatchnumber = 0;
lastmatchnumber = 0; /* Indicates no after lines pending */
}
/* Now do the shuffle */
@ -2329,24 +2380,8 @@ while (ptr < endptr)
memmove(main_buffer, main_buffer + bufthird, 2*bufthird);
ptr -= bufthird;
#ifdef SUPPORT_LIBZ
if (frtype == FR_LIBZ)
bufflength = 2*bufthird +
gzread (ingz, main_buffer + 2*bufthird, bufthird);
else
#endif
#ifdef SUPPORT_LIBBZ2
if (frtype == FR_LIBBZ2)
bufflength = 2*bufthird +
BZ2_bzread(inbz2, main_buffer + 2*bufthird, bufthird);
else
#endif
bufflength = 2*bufthird +
(input_line_buffered?
read_one_line(main_buffer + 2*bufthird, bufthird, in) :
fread(main_buffer + 2*bufthird, 1, bufthird, in));
bufflength = 2*bufthird + fill_buffer(handle, frtype,
main_buffer + 2*bufthird, bufthird, input_line_buffered);
endptr = main_buffer + bufflength;
/* Adjust any last match point */
@ -3427,6 +3462,12 @@ if (jfriedl_XT != 0 || jfriedl_XR != 0)
/* Get memory for the main buffer. */
if (bufthird <= 0)
{
fprintf(stderr, "pcre2grep: --buffer-size must be greater than zero\n");
goto EXIT2;
}
bufsize = 3*bufthird;
main_buffer = (char *)malloc(bufsize);

4
testdata/grepoutput vendored
View File

@ -637,8 +637,8 @@ RC=0
RC=0
---------------------------- Test 83 -----------------------------
pcre2grep: line 4 of file ./testdata/grepinput3 is too long for the internal buffer
pcre2grep: the buffer size is 100
pcre2grep: use the --buffer-size option to change it
pcre2grep: the maximum buffer size is 100
pcre2grep: use the --max-buffer-size option to change it
RC=2
---------------------------- Test 84 -----------------------------
testdata/grepinputv:fox jumps