Implement buffer expansion in pcre2grep.
This commit is contained in:
parent
b451e9f3b5
commit
bf6f0bb335
|
@ -76,6 +76,7 @@
|
|||
# a new option instead of being unconditional.
|
||||
# 2016-10-05 PH fixed a typo (PCRE should be PCRE2) in above patch
|
||||
# fix by David Gaussmann
|
||||
# 2016-10-07 PH added PCREGREP_MAX_BUFSIZE
|
||||
|
||||
PROJECT(PCRE2 C)
|
||||
|
||||
|
@ -148,7 +149,10 @@ SET(PCRE2_MATCH_LIMIT_RECURSION "MATCH_LIMIT" CACHE STRING
|
|||
"Default limit on internal recursion. See MATCH_LIMIT_RECURSION in config.h.in for details.")
|
||||
|
||||
SET(PCRE2GREP_BUFSIZE "20480" CACHE STRING
|
||||
"Buffer size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.")
|
||||
"Buffer starting size parameter for pcre2grep. See PCRE2GREP_BUFSIZE in config.h.in for details.")
|
||||
|
||||
SET(PCRE2GREP_MAX_BUFSIZE "1048576" CACHE STRING
|
||||
"Buffer maximum size parameter for pcre2grep. See PCRE2GREP_MAX_BUFSIZE in config.h.in for details.")
|
||||
|
||||
SET(PCRE2_NEWLINE "LF" CACHE STRING
|
||||
"What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF).")
|
||||
|
|
|
@ -61,6 +61,10 @@ escape sequence for a character whose code point was greater than \x{ff}.
|
|||
9. Change 19 for 10.22 had a typo (PCRE_STATIC_RUNTIME should be
|
||||
PCRE2_STATIC_RUNTIME). Fix from David Gaussmann.
|
||||
|
||||
10. Added --max-buffer-size to pcre2grep, to allow for automatic buffer
|
||||
expansion when long lines are encountered. Original patch by Dmitry
|
||||
Cherniachenko.
|
||||
|
||||
|
||||
Version 10.22 29-July-2016
|
||||
--------------------------
|
||||
|
|
18
README
18
README
|
@ -339,12 +339,22 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
Of course, the relevant libraries must be installed on your system.
|
||||
|
||||
. The default size (in bytes) of the internal buffer used by pcre2grep can be
|
||||
set by, for example:
|
||||
. The default starting size (in bytes) of the internal buffer used by pcre2grep
|
||||
can be set by, for example:
|
||||
|
||||
--with-pcre2grep-bufsize=51200
|
||||
|
||||
The value must be a plain integer. The default is 20480.
|
||||
The value must be a plain integer. The default is 20480. The amount of memory
|
||||
used by pcre2grep is actually three times this number, to allow for "before"
|
||||
and "after" lines.
|
||||
|
||||
. The default maximum size of pcre2grep's internal buffer can be set by, for
|
||||
example:
|
||||
|
||||
--with-pcre2grep-max-bufsize=2097152
|
||||
|
||||
The default is either 1048576 or the value of --with-pcre2grep-bufsize,
|
||||
whichever is the larger.
|
||||
|
||||
. It is possible to compile pcre2test so that it links with the libreadline
|
||||
or libedit libraries, by specifying, respectively,
|
||||
|
@ -845,4 +855,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 01 April 2016
|
||||
Last updated: 07 October 2016
|
||||
|
|
|
@ -440,7 +440,7 @@ echo "---------------------------- Test 82 -----------------------------" >>test
|
|||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 83 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=100 "^a" ./testdata/grepinput3) >>testtrygrep 2>&1
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --buffer-size=10 --max-buffer-size=100 "^a" ./testdata/grepinput3) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 84 -----------------------------" >>testtrygrep
|
||||
|
|
|
@ -41,6 +41,7 @@
|
|||
#define NEWLINE_DEFAULT @NEWLINE_DEFAULT@
|
||||
#define PARENS_NEST_LIMIT @PCRE2_PARENS_NEST_LIMIT@
|
||||
#define PCRE2GREP_BUFSIZE @PCRE2GREP_BUFSIZE@
|
||||
#define PCRE2GREP_MAX_BUFSIZE @PCRE2GREP_MAX_BUFSIZE@
|
||||
|
||||
#define MAX_NAME_SIZE 32
|
||||
#define MAX_NAME_COUNT 10000
|
||||
|
|
104
configure.ac
104
configure.ac
|
@ -240,9 +240,15 @@ AC_ARG_ENABLE(pcre2grep-libbz2,
|
|||
# Handle --with-pcre2grep-bufsize=N
|
||||
AC_ARG_WITH(pcre2grep-bufsize,
|
||||
AS_HELP_STRING([--with-pcre2grep-bufsize=N],
|
||||
[pcre2grep buffer size (default=20480, minimum=8192)]),
|
||||
[pcre2grep initial buffer size (default=20480, minimum=8192)]),
|
||||
, with_pcre2grep_bufsize=20480)
|
||||
|
||||
# Handle --with-pcre2grep-max-bufsize=N
|
||||
AC_ARG_WITH(pcre2grep-max-bufsize,
|
||||
AS_HELP_STRING([--with-pcre2grep-max-bufsize=N],
|
||||
[pcre2grep maximum buffer size (default=1048576, minimum=8192)]),
|
||||
, with_pcre2grep_max_bufsize=1048576)
|
||||
|
||||
# Handle --enable-pcre2test-libedit
|
||||
AC_ARG_ENABLE(pcre2test-libedit,
|
||||
AS_HELP_STRING([--enable-pcre2test-libedit],
|
||||
|
@ -612,11 +618,26 @@ else
|
|||
fi
|
||||
fi
|
||||
|
||||
if test $with_pcre2grep_max_bufsize -lt $with_pcre2grep_bufsize ; then
|
||||
with_pcre2grep_max_bufsize="$with_pcre2grep_bufsize"
|
||||
else
|
||||
if test $? -gt 1 ; then
|
||||
AC_MSG_ERROR([Bad value for --with-pcre2grep-max-bufsize])
|
||||
fi
|
||||
fi
|
||||
|
||||
AC_DEFINE_UNQUOTED([PCRE2GREP_BUFSIZE], [$with_pcre2grep_bufsize], [
|
||||
The value of PCRE2GREP_BUFSIZE determines the size of buffer used by pcre2grep
|
||||
to hold parts of the file it is searching. This is also the minimum value.
|
||||
The actual amount of memory used by pcre2grep is three times this number,
|
||||
because it allows for the buffering of "before" and "after" lines.])
|
||||
The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
|
||||
pcre2grep to hold parts of the file it is searching. The buffer will be
|
||||
expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing very
|
||||
long lines. The actual amount of memory used by pcre2grep is three times this
|
||||
number, because it allows for the buffering of "before" and "after" lines.])
|
||||
|
||||
AC_DEFINE_UNQUOTED([PCRE2GREP_MAX_BUFSIZE], [$with_pcre2grep_max_bufsize], [
|
||||
The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
|
||||
used by pcre2grep to hold parts of the file it is searching. The actual
|
||||
amount of memory used by pcre2grep is three times this number, because it
|
||||
allows for the buffering of "before" and "after" lines.])
|
||||
|
||||
if test "$enable_pcre2test_libedit" = "yes"; then
|
||||
AC_DEFINE([SUPPORT_LIBEDIT], [], [
|
||||
|
@ -906,43 +927,44 @@ cat <<EOF
|
|||
|
||||
$PACKAGE-$VERSION configuration summary:
|
||||
|
||||
Install prefix .................. : ${prefix}
|
||||
C preprocessor .................. : ${CPP}
|
||||
C compiler ...................... : ${CC}
|
||||
Linker .......................... : ${LD}
|
||||
C preprocessor flags ............ : ${CPPFLAGS}
|
||||
C compiler flags ................ : ${CFLAGS} ${VISIBILITY_CFLAGS}
|
||||
Linker flags .................... : ${LDFLAGS}
|
||||
Extra libraries ................. : ${LIBS}
|
||||
Install prefix ..................... : ${prefix}
|
||||
C preprocessor ..................... : ${CPP}
|
||||
C compiler ......................... : ${CC}
|
||||
Linker ............................. : ${LD}
|
||||
C preprocessor flags ............... : ${CPPFLAGS}
|
||||
C compiler flags ................... : ${CFLAGS} ${VISIBILITY_CFLAGS}
|
||||
Linker flags ....................... : ${LDFLAGS}
|
||||
Extra libraries .................... : ${LIBS}
|
||||
|
||||
Build 8-bit pcre2 library ....... : ${enable_pcre2_8}
|
||||
Build 16-bit pcre2 library ...... : ${enable_pcre2_16}
|
||||
Build 32-bit pcre2 library ...... : ${enable_pcre2_32}
|
||||
Include debugging code .......... : ${enable_debug}
|
||||
Enable JIT compiling support .... : ${enable_jit}
|
||||
Enable Unicode support .......... : ${enable_unicode}
|
||||
Newline char/sequence ........... : ${enable_newline}
|
||||
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
|
||||
\C is disabled .................. : ${enable_never_backslash_C}
|
||||
EBCDIC coding ................... : ${enable_ebcdic}
|
||||
EBCDIC code for NL .............. : ${ebcdic_nl_code}
|
||||
Rebuild char tables ............. : ${enable_rebuild_chartables}
|
||||
Use stack recursion ............. : ${enable_stack_for_recursion}
|
||||
Internal link size .............. : ${with_link_size}
|
||||
Nested parentheses limit ........ : ${with_parens_nest_limit}
|
||||
Match limit ..................... : ${with_match_limit}
|
||||
Match limit recursion ........... : ${with_match_limit_recursion}
|
||||
Build shared libs ............... : ${enable_shared}
|
||||
Build static libs ............... : ${enable_static}
|
||||
Use JIT in pcre2grep ............ : ${enable_pcre2grep_jit}
|
||||
Enable callouts in pcre2grep .... : ${enable_pcre2grep_callout}
|
||||
Buffer size for pcre2grep ....... : ${with_pcre2grep_bufsize}
|
||||
Link pcre2grep with libz ........ : ${enable_pcre2grep_libz}
|
||||
Link pcre2grep with libbz2 ...... : ${enable_pcre2grep_libbz2}
|
||||
Link pcre2test with libedit ..... : ${enable_pcre2test_libedit}
|
||||
Link pcre2test with libreadline . : ${enable_pcre2test_libreadline}
|
||||
Valgrind support ................ : ${enable_valgrind}
|
||||
Code coverage ................... : ${enable_coverage}
|
||||
Build 8-bit pcre2 library .......... : ${enable_pcre2_8}
|
||||
Build 16-bit pcre2 library ......... : ${enable_pcre2_16}
|
||||
Build 32-bit pcre2 library ......... : ${enable_pcre2_32}
|
||||
Include debugging code ............. : ${enable_debug}
|
||||
Enable JIT compiling support ....... : ${enable_jit}
|
||||
Enable Unicode support ............. : ${enable_unicode}
|
||||
Newline char/sequence .............. : ${enable_newline}
|
||||
\R matches only ANYCRLF ............ : ${enable_bsr_anycrlf}
|
||||
\C is disabled ..................... : ${enable_never_backslash_C}
|
||||
EBCDIC coding ...................... : ${enable_ebcdic}
|
||||
EBCDIC code for NL ................. : ${ebcdic_nl_code}
|
||||
Rebuild char tables ................ : ${enable_rebuild_chartables}
|
||||
Use stack recursion ................ : ${enable_stack_for_recursion}
|
||||
Internal link size ................. : ${with_link_size}
|
||||
Nested parentheses limit ........... : ${with_parens_nest_limit}
|
||||
Match limit ........................ : ${with_match_limit}
|
||||
Match limit recursion .............. : ${with_match_limit_recursion}
|
||||
Build shared libs .................. : ${enable_shared}
|
||||
Build static libs .................. : ${enable_static}
|
||||
Use JIT in pcre2grep ............... : ${enable_pcre2grep_jit}
|
||||
Enable callouts in pcre2grep ....... : ${enable_pcre2grep_callout}
|
||||
Initial buffer size for pcre2grep .. : ${with_pcre2grep_bufsize}
|
||||
Maximum buffer size for pcre2grep .. : ${with_pcre2grep_max_bufsize}
|
||||
Link pcre2grep with libz ........... : ${enable_pcre2grep_libz}
|
||||
Link pcre2grep with libbz2 ......... : ${enable_pcre2grep_libbz2}
|
||||
Link pcre2test with libedit ........ : ${enable_pcre2test_libedit}
|
||||
Link pcre2test with libreadline .... : ${enable_pcre2test_libreadline}
|
||||
Valgrind support ................... : ${enable_valgrind}
|
||||
Code coverage ...................... : ${enable_coverage}
|
||||
|
||||
EOF
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2BUILD 3 "01 April 2016" "PCRE2 10.22"
|
||||
.TH PCRE2BUILD 3 "07 October 2016" "PCRE2 10.23"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.
|
||||
|
@ -385,16 +385,19 @@ they are not.
|
|||
.sp
|
||||
\fBpcre2grep\fP uses an internal buffer to hold a "window" on the file it is
|
||||
scanning, in order to be able to output "before" and "after" lines when it
|
||||
finds a match. The size of the buffer is controlled by a parameter whose
|
||||
default value is 20K. The buffer itself is three times this size, but because
|
||||
of the way it is used for holding "before" lines, the longest line that is
|
||||
guaranteed to be processable is the parameter size. You can change the default
|
||||
parameter value by adding, for example,
|
||||
finds a match. The starting size of the buffer is controlled by a parameter
|
||||
whose default value is 20K. The buffer itself is three times this size, but
|
||||
because of the way it is used for holding "before" lines, the longest line that
|
||||
is guaranteed to be processable is the parameter size. If a longer line is
|
||||
encountered, \fBpcre2grep\fP automatically expands the buffer, up to a
|
||||
specified maximum size, whose default is 1M or the starting size, whichever is
|
||||
the larger. You can change the default parameter values by adding, for example,
|
||||
.sp
|
||||
--with-pcre2grep-bufsize=50K
|
||||
--with-pcre2grep-bufsize=51200
|
||||
--with-pcre2grep-max-bufsize=2097152
|
||||
.sp
|
||||
to the \fBconfigure\fP command. The caller of \fPpcre2grep\fP can override this
|
||||
value by using --buffer-size on the command line.
|
||||
to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can override
|
||||
these values by using --buffer-size and --max-buffer-size on the command line.
|
||||
.
|
||||
.
|
||||
.SH "PCRE2TEST OPTION FOR LIBREADLINE SUPPORT"
|
||||
|
@ -532,6 +535,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 01 April 2016
|
||||
Last updated: 07 October 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2GREP 1 "19 June 2016" "PCRE2 10.22"
|
||||
.TH PCRE2GREP 1 "11 October 2016" "PCRE2 10.23"
|
||||
.SH NAME
|
||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -52,11 +52,18 @@ span line boundaries. What defines a line boundary is controlled by the
|
|||
\fB-N\fP (\fB--newline\fP) option.
|
||||
.P
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by a parameter that can be set by the \fB--buffer-size\fP option.
|
||||
The default value for this parameter is specified when \fBpcre2grep\fP is
|
||||
built, with the default default being 20K. A block of memory three times this
|
||||
size is used (to allow for buffering "before" and "after" lines). An error
|
||||
occurs if a line overflows the buffer.
|
||||
controlled by parameters that can be set by the \fB--buffer-size\fP and
|
||||
\fB--max-buffer-size\fP options. The first of these sets the size of buffer
|
||||
that is obtained at the start of processing. If an input file contains very
|
||||
long lines, a larger buffer may be needed; this is handled by automatically
|
||||
extending the buffer, up to the limit specified by \fB--max-buffer-size\fP. The
|
||||
default values for these parameters are specified when \fBpcre2grep\fP is
|
||||
built, with the default defaults being 20K and 1M respectively. An error occurs
|
||||
if a line is too long and the buffer can no longer be expanded.
|
||||
.P
|
||||
The block of memory that is actually used is three times the "buffer size", to
|
||||
allow for buffering "before" and "after" lines. If the buffer size is too
|
||||
small, fewer than requested "before" and "after" lines may be output.
|
||||
.P
|
||||
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater.
|
||||
BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern
|
||||
|
@ -126,24 +133,29 @@ command line starts with a hyphen but is not an option. This allows for the
|
|||
processing of patterns and file names that start with hyphens.
|
||||
.TP
|
||||
\fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP
|
||||
Output \fInumber\fP lines of context after each matching line. If file names
|
||||
and/or line numbers are being output, a hyphen separator is used instead of a
|
||||
colon for the context lines. A line containing "--" is output between each
|
||||
group of lines, unless they are in fact contiguous in the input file. The value
|
||||
of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP
|
||||
guarantees to have up to 8K of following text available for context output.
|
||||
Output up to \fInumber\fP lines of context after each matching line. Fewer
|
||||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of \fInumber\fP
|
||||
is expected to be relatively small. However, \fBpcre2grep\fP guarantees to have
|
||||
up to 8K of following text available for context output.
|
||||
.TP
|
||||
\fB-a\fP, \fB--text\fP
|
||||
Treat binary files as text. This is equivalent to
|
||||
\fB--binary-files\fP=\fItext\fP.
|
||||
.TP
|
||||
\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
|
||||
Output \fInumber\fP lines of context before each matching line. If file names
|
||||
and/or line numbers are being output, a hyphen separator is used instead of a
|
||||
colon for the context lines. A line containing "--" is output between each
|
||||
group of lines, unless they are in fact contiguous in the input file. The value
|
||||
of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP
|
||||
guarantees to have up to 8K of preceding text available for context output.
|
||||
Output up to \fInumber\fP lines of context before each matching line. Fewer
|
||||
lines are output if the previous match or the start of the file is within
|
||||
\fInumber\fP lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of \fInumber\fP is expected to be relatively small. However,
|
||||
\fBpcre2grep\fP guarantees to have up to 8K of preceding text available for
|
||||
context output.
|
||||
.TP
|
||||
\fB--binary-files=\fP\fIword\fP
|
||||
Specify how binary files are to be processed. If the word is "binary" (the
|
||||
|
@ -158,8 +170,9 @@ be of interest and are skipped without causing any output or affecting the
|
|||
return code.
|
||||
.TP
|
||||
\fB--buffer-size=\fP\fInumber\fP
|
||||
Set the parameter that controls how much memory is used for buffering files
|
||||
that are being scanned.
|
||||
Set the parameter that controls how much memory is obtained at the start of
|
||||
processing for buffering files that are being scanned. See also
|
||||
\fB--max-buffer-size\fP below.
|
||||
.TP
|
||||
\fB-C\fP \fInumber\fP, \fB--context=\fP\fInumber\fP
|
||||
Output \fInumber\fP lines of context both before and after each matching line.
|
||||
|
@ -432,6 +445,11 @@ of use only if it is set smaller than \fB--match-limit\fP.
|
|||
There are no short forms for these options. The default settings are specified
|
||||
when the PCRE2 library is compiled, with the default default being 10 million.
|
||||
.TP
|
||||
\fB--max-buffer-size=\fP\fInumber\fP
|
||||
This limits the expansion of the processing buffer, whose initial size can be
|
||||
set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
|
||||
smaller than the starting buffer size.
|
||||
.TP
|
||||
\fB-M\fP, \fB--multiline\fP
|
||||
Allow patterns to match more than one line. When this option is given, patterns
|
||||
may usefully contain literal newline characters and internal occurrences of ^
|
||||
|
@ -757,6 +775,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 19 June 2016
|
||||
Last updated: 11 October 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -206,7 +206,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#define PACKAGE_NAME "PCRE2"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "PCRE2 10.22"
|
||||
#define PACKAGE_STRING "PCRE2 10.23-RC1"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "pcre2"
|
||||
|
@ -215,7 +215,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "10.22"
|
||||
#define PACKAGE_VERSION "10.23-RC1"
|
||||
|
||||
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
|
||||
parentheses (of any kind) in a pattern. This limits the amount of system
|
||||
|
@ -224,15 +224,24 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#define PARENS_NEST_LIMIT 250
|
||||
#endif
|
||||
|
||||
/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by
|
||||
pcre2grep to hold parts of the file it is searching. This is also the
|
||||
minimum value. The actual amount of memory used by pcre2grep is three times
|
||||
this number, because it allows for the buffering of "before" and "after"
|
||||
lines. */
|
||||
/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
|
||||
pcre2grep to hold parts of the file it is searching. The buffer will be
|
||||
expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing
|
||||
very long lines. The actual amount of memory used by pcre2grep is three
|
||||
times this number, because it allows for the buffering of "before" and
|
||||
"after" lines. */
|
||||
#ifndef PCRE2GREP_BUFSIZE
|
||||
#define PCRE2GREP_BUFSIZE 20480
|
||||
#endif
|
||||
|
||||
/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
|
||||
used by pcre2grep to hold parts of the file it is searching. The actual
|
||||
amount of memory used by pcre2grep is three times this number, because it
|
||||
allows for the buffering of "before" and "after" lines. */
|
||||
#ifndef PCRE2GREP_MAX_BUFSIZE
|
||||
#define PCRE2GREP_MAX_BUFSIZE 1048576
|
||||
#endif
|
||||
|
||||
/* Define to any value to include debugging code. */
|
||||
/* #undef PCRE2_DEBUG */
|
||||
|
||||
|
@ -299,7 +308,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
/* #undef SUPPORT_VALGRIND */
|
||||
|
||||
/* Version number of package */
|
||||
#define VERSION "10.22"
|
||||
#define VERSION "10.23-RC1"
|
||||
|
||||
/* Define to empty if `const' does not conform to ANSI C. */
|
||||
/* #undef const */
|
||||
|
|
|
@ -207,13 +207,20 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
stack that is used while compiling a pattern. */
|
||||
#undef PARENS_NEST_LIMIT
|
||||
|
||||
/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by
|
||||
pcre2grep to hold parts of the file it is searching. This is also the
|
||||
minimum value. The actual amount of memory used by pcre2grep is three times
|
||||
this number, because it allows for the buffering of "before" and "after"
|
||||
lines. */
|
||||
/* The value of PCRE2GREP_BUFSIZE is the starting size of the buffer used by
|
||||
pcre2grep to hold parts of the file it is searching. The buffer will be
|
||||
expanded up to PCRE2GREP_MAX_BUFSIZE if necessary, for files containing
|
||||
very long lines. The actual amount of memory used by pcre2grep is three
|
||||
times this number, because it allows for the buffering of "before" and
|
||||
"after" lines. */
|
||||
#undef PCRE2GREP_BUFSIZE
|
||||
|
||||
/* The value of PCRE2GREP_MAX_BUFSIZE specifies the maximum size of the buffer
|
||||
used by pcre2grep to hold parts of the file it is searching. The actual
|
||||
amount of memory used by pcre2grep is three times this number, because it
|
||||
allows for the buffering of "before" and "after" lines. */
|
||||
#undef PCRE2GREP_MAX_BUFSIZE
|
||||
|
||||
/* to make a symbol visible */
|
||||
#undef PCRE2POSIX_EXP_DECL
|
||||
|
||||
|
|
177
src/pcre2grep.c
177
src/pcre2grep.c
|
@ -173,6 +173,7 @@ static int before_context = 0;
|
|||
static int binary_files = BIN_BINARY;
|
||||
static int both_context = 0;
|
||||
static int bufthird = PCRE2GREP_BUFSIZE;
|
||||
static int max_bufthird = PCRE2GREP_MAX_BUFSIZE;
|
||||
static int bufsize = 3*PCRE2GREP_BUFSIZE;
|
||||
static int endlinetype;
|
||||
|
||||
|
@ -344,6 +345,7 @@ used to identify them. */
|
|||
#define N_EXCLUDE_FROM (-19)
|
||||
#define N_INCLUDE_FROM (-20)
|
||||
#define N_OM_SEPARATOR (-21)
|
||||
#define N_MAX_BUFSIZE (-22)
|
||||
|
||||
static option_item optionlist[] = {
|
||||
{ OP_NODATA, N_NULL, NULL, "", "terminate options" },
|
||||
|
@ -352,7 +354,8 @@ static option_item optionlist[] = {
|
|||
{ OP_NODATA, 'a', NULL, "text", "treat binary files as text" },
|
||||
{ OP_NUMBER, 'B', &before_context, "before-context=number", "set number of prior context lines" },
|
||||
{ OP_BINFILES, N_BINARY_FILES, NULL, "binary-files=word", "set treatment of binary files" },
|
||||
{ OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer size parameter" },
|
||||
{ OP_NUMBER, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer starting size" },
|
||||
{ OP_NUMBER, N_MAX_BUFSIZE,&max_bufthird, "max-buffer-size=number", "set processing buffer maximum size" },
|
||||
{ OP_OP_STRING, N_COLOUR, &colour_option, "color=option", "matched text color option" },
|
||||
{ OP_OP_STRING, N_COLOUR, &colour_option, "colour=option", "matched text colour option" },
|
||||
{ OP_NUMBER, 'C', &both_context, "context=number", "set number of context lines, before & after" },
|
||||
|
@ -952,8 +955,9 @@ for (op = optionlist; op->one_char != 0; op++)
|
|||
printf("%.*s%s" STDOUT_NL, n, " ", op->help_text);
|
||||
}
|
||||
|
||||
printf(STDOUT_NL "Numbers may be followed by K or M, e.g. --buffer-size=100K." STDOUT_NL);
|
||||
printf(STDOUT_NL "Numbers may be followed by K or M, e.g. --max-buffer-size=100K." STDOUT_NL);
|
||||
printf("The default value for --buffer-size is %d." STDOUT_NL, PCRE2GREP_BUFSIZE);
|
||||
printf("The default value for --max-buffer-size is %d." STDOUT_NL, PCRE2GREP_MAX_BUFSIZE);
|
||||
printf("When reading patterns or file names from a file, trailing white" STDOUT_NL);
|
||||
printf("space is removed and blank lines are ignored." STDOUT_NL);
|
||||
printf("The maximum size of any pattern is %d bytes." STDOUT_NL, MAXPATLEN);
|
||||
|
@ -1100,12 +1104,12 @@ return om;
|
|||
* Read one line of input *
|
||||
*************************************************/
|
||||
|
||||
/* Normally, input is read using fread() into a large buffer, so many lines may
|
||||
be read at once. However, doing this for tty input means that no output appears
|
||||
until a lot of input has been typed. Instead, tty input is handled line by
|
||||
line. We cannot use fgets() for this, because it does not stop at a binary
|
||||
zero, and therefore there is no way of telling how many characters it has read,
|
||||
because there may be binary zeros embedded in the data.
|
||||
/* Normally, input is read using fread() (or gzread, or BZ2_bzread) into a large
|
||||
buffer, so many lines may be read at once. However, doing this for tty input
|
||||
means that no output appears until a lot of input has been typed. Instead, tty
|
||||
input is handled line by line. We cannot use fgets() for this, because it does
|
||||
not stop at a binary zero, and therefore there is no way of telling how many
|
||||
characters it has read, because there may be binary zeros embedded in the data.
|
||||
|
||||
Arguments:
|
||||
buffer the buffer to read into
|
||||
|
@ -1424,17 +1428,18 @@ do_after_lines(int lastmatchnumber, char *lastmatchrestart, char *endptr,
|
|||
if (after_context > 0 && lastmatchnumber > 0)
|
||||
{
|
||||
int count = 0;
|
||||
while (lastmatchrestart < endptr && count++ < after_context)
|
||||
while (lastmatchrestart < endptr && count < after_context)
|
||||
{
|
||||
int ellength;
|
||||
char *pp = lastmatchrestart;
|
||||
char *pp = end_of_line(lastmatchrestart, endptr, &ellength);
|
||||
if (ellength == 0 && pp == main_buffer + bufsize) break;
|
||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
||||
if (number) fprintf(stdout, "%d-", lastmatchnumber++);
|
||||
pp = end_of_line(pp, endptr, &ellength);
|
||||
FWRITE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
||||
lastmatchrestart = pp;
|
||||
count++;
|
||||
}
|
||||
hyphenpending = TRUE;
|
||||
if (count > 0) hyphenpending = TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1769,6 +1774,33 @@ return result != 0;
|
|||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Read a portion of the file into buffer *
|
||||
*************************************************/
|
||||
|
||||
/* Read the next chunk of input using whichever reader matches frtype.

Arguments:
  handle               the input handle; it is cast to gzFile, BZFILE * or
                         FILE * according to frtype
  frtype               FR_LIBZ selects gzread() and FR_LIBBZ2 selects
                         BZ2_bzread(), each only when the corresponding
                         SUPPORT_ macro is defined; anything else is read
                         as a FILE
  buffer               where to put the data
  length               the length limit passed to the underlying read call
  input_line_buffered  for FILE input, TRUE selects read_one_line() and
                         FALSE selects fread()

Returns:               the value returned by the underlying read call,
                         converted to int
*/
static int
|
||||
fill_buffer(void *handle, int frtype, char *buffer, int length,
|
||||
BOOL input_line_buffered)
|
||||
{
|
||||
#ifdef SUPPORT_LIBZ
|
||||
if (frtype == FR_LIBZ)
|
||||
return gzread((gzFile)handle, buffer, length);
|
||||
/* This else (when compiled in) attaches to whatever follows the #endif. */
else
|
||||
#endif
|
||||
|
||||
#ifdef SUPPORT_LIBBZ2
|
||||
if (frtype == FR_LIBBZ2)
|
||||
return BZ2_bzread((BZFILE *)handle, buffer, length);
|
||||
/* As above: this else falls through to the plain FILE read below. */
else
|
||||
#endif
|
||||
|
||||
/* Plain FILE input: read_one_line() when line buffering is requested,
otherwise a single fread() of up to length bytes. */
return (input_line_buffered ?
|
||||
read_one_line(buffer, length, (FILE *)handle) :
|
||||
fread(buffer, 1, length, (FILE *)handle));
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Grep an individual file *
|
||||
*************************************************/
|
||||
|
@ -1813,49 +1845,24 @@ BOOL endhyphenpending = FALSE;
|
|||
BOOL input_line_buffered = line_buffered;
|
||||
FILE *in = NULL; /* Ensure initialized */
|
||||
|
||||
#ifdef SUPPORT_LIBZ
|
||||
gzFile ingz = NULL;
|
||||
#endif
|
||||
|
||||
#ifdef SUPPORT_LIBBZ2
|
||||
BZFILE *inbz2 = NULL;
|
||||
#endif
|
||||
|
||||
|
||||
/* Do the first read into the start of the buffer and set up the pointer to end
|
||||
of what we have. In the case of libz, a non-zipped .gz file will be read as a
|
||||
plain file. However, if a .bz2 file isn't actually bzipped, the first read will
|
||||
fail. */
|
||||
|
||||
(void)frtype;
|
||||
|
||||
#ifdef SUPPORT_LIBZ
|
||||
if (frtype == FR_LIBZ)
|
||||
{
|
||||
ingz = (gzFile)handle;
|
||||
bufflength = gzread (ingz, main_buffer, bufsize);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
|
||||
#ifdef SUPPORT_LIBBZ2
|
||||
if (frtype == FR_LIBBZ2)
|
||||
{
|
||||
inbz2 = (BZFILE *)handle;
|
||||
bufflength = BZ2_bzread(inbz2, main_buffer, bufsize);
|
||||
if ((int)bufflength < 0) return 2; /* Gotcha: bufflength is size_t; */
|
||||
} /* without the cast it is unsigned. */
|
||||
else
|
||||
#endif
|
||||
|
||||
if (frtype != FR_LIBZ && frtype != FR_LIBBZ2)
|
||||
{
|
||||
in = (FILE *)handle;
|
||||
if (is_file_tty(in)) input_line_buffered = TRUE;
|
||||
bufflength = input_line_buffered?
|
||||
read_one_line(main_buffer, bufsize, in) :
|
||||
fread(main_buffer, 1, bufsize, in);
|
||||
}
|
||||
|
||||
bufflength = fill_buffer(handle, frtype, main_buffer, bufsize,
|
||||
input_line_buffered);
|
||||
|
||||
#ifdef SUPPORT_LIBBZ2
|
||||
if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2; /* Gotcha: bufflength is size_t; */
|
||||
#endif
|
||||
|
||||
endptr = main_buffer + bufflength;
|
||||
|
||||
/* Unless binary-files=text, see if we have a binary file. This uses the same
|
||||
|
@ -1899,19 +1906,62 @@ while (ptr < endptr)
|
|||
|
||||
/* Check to see if the line we are looking at extends right to the very end
|
||||
of the buffer without a line terminator. This means the line is too long to
|
||||
handle. */
|
||||
handle at the current buffer size. Until the buffer reaches its maximum size,
|
||||
try doubling it and reading more data. */
|
||||
|
||||
if (endlinelength == 0 && t == main_buffer + bufsize)
|
||||
{
|
||||
fprintf(stderr, "pcre2grep: line %d%s%s is too long for the internal buffer\n"
|
||||
"pcre2grep: the buffer size is %d\n"
|
||||
"pcre2grep: use the --buffer-size option to change it\n",
|
||||
if (bufthird < max_bufthird)
|
||||
{
|
||||
char *new_buffer;
|
||||
int new_bufthird = 2*bufthird;
|
||||
|
||||
if (new_bufthird > max_bufthird) new_bufthird = max_bufthird;
|
||||
new_buffer = (char *)malloc(3*new_bufthird);
|
||||
|
||||
if (new_buffer == NULL)
|
||||
{
|
||||
fprintf(stderr,
|
||||
"pcre2grep: line %d%s%s is too long for the internal buffer\n"
|
||||
"pcre2grep: not enough memory to increase the buffer size to %d\n",
|
||||
linenumber,
|
||||
(filename == NULL)? "" : " of file ",
|
||||
(filename == NULL)? "" : filename,
|
||||
new_bufthird);
|
||||
return 2;
|
||||
}
|
||||
|
||||
/* Copy the data and adjust pointers to the new buffer location. */
|
||||
|
||||
memcpy(new_buffer, main_buffer, bufsize);
|
||||
bufthird = new_bufthird;
|
||||
bufsize = 3*bufthird;
|
||||
ptr = new_buffer + (ptr - main_buffer);
|
||||
lastmatchrestart = new_buffer + (lastmatchrestart - main_buffer);
|
||||
free(main_buffer);
|
||||
main_buffer = new_buffer;
|
||||
|
||||
/* Read more data into the buffer and then try to find the line ending
|
||||
again. */
|
||||
|
||||
bufflength += fill_buffer(handle, frtype, main_buffer + bufflength,
|
||||
bufsize - bufflength, input_line_buffered);
|
||||
endptr = main_buffer + bufflength;
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr,
|
||||
"pcre2grep: line %d%s%s is too long for the internal buffer\n"
|
||||
"pcre2grep: the maximum buffer size is %d\n"
|
||||
"pcre2grep: use the --max-buffer-size option to change it\n",
|
||||
linenumber,
|
||||
(filename == NULL)? "" : " of file ",
|
||||
(filename == NULL)? "" : filename,
|
||||
bufthird);
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* Extra processing for Jeffrey Friedl's debugging. */
|
||||
|
||||
|
@ -2320,8 +2370,9 @@ while (ptr < endptr)
|
|||
lastmatchnumber > 0 &&
|
||||
lastmatchrestart < main_buffer + bufthird)
|
||||
{
|
||||
|
||||
do_after_lines(lastmatchnumber, lastmatchrestart, endptr, printname);
|
||||
lastmatchnumber = 0;
|
||||
lastmatchnumber = 0; /* Indicates no after lines pending */
|
||||
}
|
||||
|
||||
/* Now do the shuffle */
|
||||
|
@ -2329,24 +2380,8 @@ while (ptr < endptr)
|
|||
memmove(main_buffer, main_buffer + bufthird, 2*bufthird);
|
||||
ptr -= bufthird;
|
||||
|
||||
#ifdef SUPPORT_LIBZ
|
||||
if (frtype == FR_LIBZ)
|
||||
bufflength = 2*bufthird +
|
||||
gzread (ingz, main_buffer + 2*bufthird, bufthird);
|
||||
else
|
||||
#endif
|
||||
|
||||
#ifdef SUPPORT_LIBBZ2
|
||||
if (frtype == FR_LIBBZ2)
|
||||
bufflength = 2*bufthird +
|
||||
BZ2_bzread(inbz2, main_buffer + 2*bufthird, bufthird);
|
||||
else
|
||||
#endif
|
||||
|
||||
bufflength = 2*bufthird +
|
||||
(input_line_buffered?
|
||||
read_one_line(main_buffer + 2*bufthird, bufthird, in) :
|
||||
fread(main_buffer + 2*bufthird, 1, bufthird, in));
|
||||
bufflength = 2*bufthird + fill_buffer(handle, frtype,
|
||||
main_buffer + 2*bufthird, bufthird, input_line_buffered);
|
||||
endptr = main_buffer + bufflength;
|
||||
|
||||
/* Adjust any last match point */
|
||||
|
@ -3427,6 +3462,12 @@ if (jfriedl_XT != 0 || jfriedl_XR != 0)
|
|||
|
||||
/* Get memory for the main buffer. */
|
||||
|
||||
if (bufthird <= 0)
|
||||
{
|
||||
fprintf(stderr, "pcre2grep: --buffer-size must be greater than zero\n");
|
||||
goto EXIT2;
|
||||
}
|
||||
|
||||
bufsize = 3*bufthird;
|
||||
main_buffer = (char *)malloc(bufsize);
|
||||
|
||||
|
|
|
@ -637,8 +637,8 @@ RC=0
|
|||
RC=0
|
||||
---------------------------- Test 83 -----------------------------
|
||||
pcre2grep: line 4 of file ./testdata/grepinput3 is too long for the internal buffer
|
||||
pcre2grep: the buffer size is 100
|
||||
pcre2grep: use the --buffer-size option to change it
|
||||
pcre2grep: the maximum buffer size is 100
|
||||
pcre2grep: use the --max-buffer-size option to change it
|
||||
RC=2
|
||||
---------------------------- Test 84 -----------------------------
|
||||
testdata/grepinputv:fox jumps
|
||||
|
|
Loading…
Reference in New Issue