Fix pcre2grep -o bug when ovector overflows; add option to adjust the limit;
raise the default limit; give an error if -o requests a group above the limit.
This commit is contained in:
parent
300bf6e2d6
commit
0d1ab8515f
|
@ -39,6 +39,12 @@ minimum is potentially useful.
|
|||
10. A (*MARK) value inside a successful condition was not being returned by the
|
||||
interpretive matcher (it was returned by JIT). This bug has been mended.
|
||||
|
||||
11. A bug in pcre2grep meant that -o without an argument (or -o0) didn't work
|
||||
if the pattern had more than 32 capturing parentheses. This is fixed. In
|
||||
addition (a) the default limit for groups requested by -o<n> has been raised to
|
||||
50, (b) the new --om-capture option changes the limit, (c) an error is raised
|
||||
if -o asks for a group that is above the limit.
|
||||
|
||||
|
||||
Version 10.33 16-April-2019
|
||||
---------------------------
|
||||
|
|
|
@ -653,6 +653,13 @@ printf 'ABC\0XYZ\nABCDEF\nDEFABC\n' >testtemp2grep
|
|||
$valgrind $vjs $pcre2grep -a -f testtemp1grep testtemp2grep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 127 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -o --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 128 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
# Now compare the results.
|
||||
|
||||
|
|
|
@ -2266,12 +2266,12 @@ segment.
|
|||
PCRE2_INFO_MINLENGTH
|
||||
</pre>
|
||||
If a minimum length for matching subject strings was computed, its value is
|
||||
returned. Otherwise the returned value is 0. The value is a number of
|
||||
characters, which in UTF mode may be different from the number of code units.
|
||||
The third argument should point to a <b>uint32_t</b> variable. The value is a
|
||||
lower bound to the length of any matching string. There may not be any strings
|
||||
of that length that do actually match, but every string that does match is at
|
||||
least that long.
|
||||
returned. Otherwise the returned value is 0. This value is not computed when
|
||||
PCRE2_NO_START_OPTIMIZE is set. The value is a number of characters, which in
|
||||
UTF mode may be different from the number of code units. The third argument
|
||||
should point to a <b>uint32_t</b> variable. The value is a lower bound to the
|
||||
length of any matching string. There may not be any strings of that length that
|
||||
do actually match, but every string that does match is at least that long.
|
||||
<pre>
|
||||
PCRE2_INFO_NAMECOUNT
|
||||
PCRE2_INFO_NAMEENTRYSIZE
|
||||
|
@ -3836,7 +3836,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 May 2019
|
||||
Last updated: 11 June 2019
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -685,20 +685,32 @@ otherwise empty line. This option is mutually exclusive with <b>--output</b>,
|
|||
<P>
|
||||
<b>-o</b><i>number</i>, <b>--only-matching</b>=<i>number</i>
|
||||
Show only the part of the line that matched the capturing parentheses of the
|
||||
given number. Up to 32 capturing parentheses are supported, and -o0 is
|
||||
equivalent to <b>-o</b> without a number. Because these options can be given
|
||||
without an argument (see above), if an argument is present, it must be given in
|
||||
the same shell item, for example, -o3 or --only-matching=2. The comments given
|
||||
for the non-argument case above also apply to this option. If the specified
|
||||
capturing parentheses do not exist in the pattern, or were not set in the
|
||||
match, nothing is output unless the file name or line number are being output.
|
||||
given number. Up to 50 capturing parentheses are supported by default. This
|
||||
limit can be changed via the <b>--om-capture</b> option. A pattern may contain
|
||||
any number of capturing parentheses, but only those whose number is within the
|
||||
limit can be accessed by <b>-o</b>. An error occurs if the number specified by
|
||||
<b>-o</b> is greater than the limit.
|
||||
<br>
|
||||
<br>
|
||||
-o0 is the same as <b>-o</b> without a number. Because these options can be
|
||||
given without an argument (see above), if an argument is present, it must be
|
||||
given in the same shell item, for example, -o3 or --only-matching=2. The
|
||||
comments given for the non-argument case above also apply to this option. If
|
||||
the specified capturing parentheses do not exist in the pattern, or were not
|
||||
set in the match, nothing is output unless the file name or line number are
|
||||
being output.
|
||||
<br>
|
||||
<br>
|
||||
If this option is given multiple times, multiple substrings are output for each
|
||||
match, in the order the options are given, and all on one line. For example,
|
||||
-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator (but see the next
|
||||
option).
|
||||
but one option).
|
||||
</P>
|
||||
<P>
|
||||
<b>--om-capture</b>=<i>number</i>
|
||||
Set the number of capturing parentheses that can be accessed by <b>-o</b>. The
|
||||
default is 50.
|
||||
</P>
|
||||
<P>
|
||||
<b>--om-separator</b>=<i>text</i>
|
||||
|
@ -980,7 +992,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 28 May 2019
|
||||
Last updated: 15 June 2019
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -739,7 +739,9 @@ options, the line is omitted. "First code unit" is where any match must start;
|
|||
if there is more than one they are listed as "starting code units". "Last code
|
||||
unit" is the last literal code unit that must be present in any match. This is
|
||||
not necessarily the last character. These lines are omitted if no starting or
|
||||
ending code units are recorded.
|
||||
ending code units are recorded. The subject length line is omitted when
|
||||
<b>no_start_optimize</b> is set because the minimum length is not calculated
|
||||
when it can never be used.
|
||||
</P>
|
||||
<P>
|
||||
The <b>framesize</b> modifier shows the size, in bytes, of the storage frames
|
||||
|
@ -2079,7 +2081,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 23 May 2019
|
||||
Last updated: 11 June 2019
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -2239,12 +2239,13 @@ INFORMATION ABOUT A COMPILED PATTERN
|
|||
PCRE2_INFO_MINLENGTH
|
||||
|
||||
If a minimum length for matching subject strings was computed, its
|
||||
value is returned. Otherwise the returned value is 0. The value is a
|
||||
number of characters, which in UTF mode may be different from the num-
|
||||
ber of code units. The third argument should point to a uint32_t
|
||||
variable. The value is a lower bound to the length of any matching
|
||||
string. There may not be any strings of that length that do actually
|
||||
match, but every string that does match is at least that long.
|
||||
value is returned. Otherwise the returned value is 0. This value is not
|
||||
computed when PCRE2_NO_START_OPTIMIZE is set. The value is a number of
|
||||
characters, which in UTF mode may be different from the number of code
|
||||
units. The third argument should point to a uint32_t variable. The
|
||||
value is a lower bound to the length of any matching string. There may
|
||||
not be any strings of that length that do actually match, but every
|
||||
string that does match is at least that long.
|
||||
|
||||
PCRE2_INFO_NAMECOUNT
|
||||
PCRE2_INFO_NAMEENTRYSIZE
|
||||
|
@ -3703,7 +3704,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 30 May 2019
|
||||
Last updated: 11 June 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2GREP 1 "28 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2GREP 1 "15 June 2019" "PCRE2 10.34"
|
||||
.SH NAME
|
||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -596,19 +596,29 @@ otherwise empty line. This option is mutually exclusive with \fB--output\fP,
|
|||
.TP
|
||||
\fB-o\fP\fInumber\fP, \fB--only-matching\fP=\fInumber\fP
|
||||
Show only the part of the line that matched the capturing parentheses of the
|
||||
given number. Up to 32 capturing parentheses are supported, and -o0 is
|
||||
equivalent to \fB-o\fP without a number. Because these options can be given
|
||||
without an argument (see above), if an argument is present, it must be given in
|
||||
the same shell item, for example, -o3 or --only-matching=2. The comments given
|
||||
for the non-argument case above also apply to this option. If the specified
|
||||
capturing parentheses do not exist in the pattern, or were not set in the
|
||||
match, nothing is output unless the file name or line number are being output.
|
||||
given number. Up to 50 capturing parentheses are supported by default. This
|
||||
limit can be changed via the \fB--om-capture\fP option. A pattern may contain
|
||||
any number of capturing parentheses, but only those whose number is within the
|
||||
limit can be accessed by \fB-o\fP. An error occurs if the number specified by
|
||||
\fB-o\fP is greater than the limit.
|
||||
.sp
|
||||
-o0 is the same as \fB-o\fP without a number. Because these options can be
|
||||
given without an argument (see above), if an argument is present, it must be
|
||||
given in the same shell item, for example, -o3 or --only-matching=2. The
|
||||
comments given for the non-argument case above also apply to this option. If
|
||||
the specified capturing parentheses do not exist in the pattern, or were not
|
||||
set in the match, nothing is output unless the file name or line number are
|
||||
being output.
|
||||
.sp
|
||||
If this option is given multiple times, multiple substrings are output for each
|
||||
match, in the order the options are given, and all on one line. For example,
|
||||
-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator (but see the next
|
||||
option).
|
||||
but one option).
|
||||
.TP
|
||||
\fB--om-capture\fP=\fInumber\fP
|
||||
Set the number of capturing parentheses that can be accessed by \fB-o\fP. The
|
||||
default is 50.
|
||||
.TP
|
||||
\fB--om-separator\fP=\fItext\fP
|
||||
Specify a separating string for multiple occurrences of \fB-o\fP. The default
|
||||
|
@ -894,6 +904,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 28 May 2019
|
||||
Last updated: 15 June 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -662,23 +662,32 @@ OPTIONS
|
|||
|
||||
-onumber, --only-matching=number
|
||||
Show only the part of the line that matched the capturing
|
||||
parentheses of the given number. Up to 32 capturing parenthe-
|
||||
ses are supported, and -o0 is equivalent to -o without a num-
|
||||
ber. Because these options can be given without an argument
|
||||
(see above), if an argument is present, it must be given in
|
||||
the same shell item, for example, -o3 or --only-matching=2.
|
||||
The comments given for the non-argument case above also apply
|
||||
to this option. If the specified capturing parentheses do not
|
||||
exist in the pattern, or were not set in the match, nothing
|
||||
is output unless the file name or line number are being out-
|
||||
put.
|
||||
parentheses of the given number. Up to 50 capturing parenthe-
|
||||
ses are supported by default. This limit can be changed via
|
||||
the --om-capture option. A pattern may contain any number of
|
||||
capturing parentheses, but only those whose number is within
|
||||
the limit can be accessed by -o. An error occurs if the num-
|
||||
ber specified by -o is greater than the limit.
|
||||
|
||||
-o0 is the same as -o without a number. Because these options
|
||||
can be given without an argument (see above), if an argument
|
||||
is present, it must be given in the same shell item, for
|
||||
example, -o3 or --only-matching=2. The comments given for the
|
||||
non-argument case above also apply to this option. If the
|
||||
specified capturing parentheses do not exist in the pattern,
|
||||
or were not set in the match, nothing is output unless the
|
||||
file name or line number are being output.
|
||||
|
||||
If this option is given multiple times, multiple substrings
|
||||
are output for each match, in the order the options are
|
||||
given, and all on one line. For example, -o3 -o1 -o3 causes
|
||||
the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator
|
||||
(but see the next option).
|
||||
(but see the next but one option).
|
||||
|
||||
--om-capture=number
|
||||
Set the number of capturing parentheses that can be accessed
|
||||
by -o. The default is 50.
|
||||
|
||||
--om-separator=text
|
||||
Specify a separating string for multiple occurrences of -o.
|
||||
|
@ -955,5 +964,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 28 May 2019
|
||||
Last updated: 15 June 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
|
|
|
@ -670,7 +670,9 @@ PATTERN MODIFIERS
|
|||
as "starting code units". "Last code unit" is the last literal code
|
||||
unit that must be present in any match. This is not necessarily the
|
||||
last character. These lines are omitted if no starting or ending code
|
||||
units are recorded.
|
||||
units are recorded. The subject length line is omitted when
|
||||
no_start_optimize is set because the minimum length is not calculated
|
||||
when it can never be used.
|
||||
|
||||
The framesize modifier shows the size, in bytes, of the storage frames
|
||||
used by pcre2_match() for handling backtracking. The size depends on
|
||||
|
@ -1891,5 +1893,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 23 May 2019
|
||||
Last updated: 11 June 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
|
|
|
@ -128,7 +128,7 @@ be C99 don't support it (hence DISABLE_PERCENT_ZT). */
|
|||
|
||||
typedef int BOOL;
|
||||
|
||||
#define OFFSET_SIZE 33
|
||||
#define DEFAULT_CAPTURE_MAX 50
|
||||
|
||||
#if BUFSIZ > 8192
|
||||
#define MAXPATLEN BUFSIZ
|
||||
|
@ -255,6 +255,8 @@ static pcre2_compile_context *compile_context;
|
|||
static pcre2_match_context *match_context;
|
||||
static pcre2_match_data *match_data;
|
||||
static PCRE2_SIZE *offsets;
|
||||
static uint32_t offset_size;
|
||||
static uint32_t capture_max = DEFAULT_CAPTURE_MAX;
|
||||
|
||||
static BOOL count_only = FALSE;
|
||||
static BOOL do_colour = FALSE;
|
||||
|
@ -404,6 +406,7 @@ used to identify them. */
|
|||
#define N_INCLUDE_FROM (-21)
|
||||
#define N_OM_SEPARATOR (-22)
|
||||
#define N_MAX_BUFSIZE (-23)
|
||||
#define N_OM_CAPTURE (-24)
|
||||
|
||||
static option_item optionlist[] = {
|
||||
{ OP_NODATA, N_NULL, NULL, "", "terminate options" },
|
||||
|
@ -450,6 +453,7 @@ static option_item optionlist[] = {
|
|||
{ OP_STRING, 'O', &output_text, "output=text", "show only this text (possibly expanded)" },
|
||||
{ OP_OP_NUMBERS, 'o', &only_matching_data, "only-matching=n", "show only the part of the line that matched" },
|
||||
{ OP_STRING, N_OM_SEPARATOR, &om_separator, "om-separator=text", "set separator for multiple -o output" },
|
||||
{ OP_U32NUMBER, N_OM_CAPTURE, &capture_max, "om-capture=n", "set capture count for --only-matching" },
|
||||
{ OP_NODATA, 'q', NULL, "quiet", "suppress output, just set return code" },
|
||||
{ OP_NODATA, 'r', NULL, "recursive", "recursively scan sub-directories" },
|
||||
{ OP_PATLIST, N_EXCLUDE,&exclude_patdata, "exclude=pattern","exclude matching files when recursing" },
|
||||
|
@ -2591,7 +2595,7 @@ while (ptr < endptr)
|
|||
|
||||
for (i = 0; i < jfriedl_XR; i++)
|
||||
match = (pcre_exec(patterns->compiled, patterns->hint, ptr, length, 0,
|
||||
PCRE2_NOTEMPTY, offsets, OFFSET_SIZE) >= 0);
|
||||
PCRE2_NOTEMPTY, offsets, offset_size) >= 0);
|
||||
|
||||
if (gettimeofday(&end_time, &dummy) != 0)
|
||||
perror("bad gettimeofday");
|
||||
|
@ -2711,7 +2715,7 @@ while (ptr < endptr)
|
|||
for (om = only_matching; om != NULL; om = om->next)
|
||||
{
|
||||
int n = om->groupnum;
|
||||
if (n < mrc)
|
||||
if (n == 0 || n < mrc)
|
||||
{
|
||||
int plen = offsets[2*n + 1] - offsets[2*n];
|
||||
if (plen > 0)
|
||||
|
@ -3663,6 +3667,7 @@ int rc = 1;
|
|||
BOOL only_one_at_top;
|
||||
patstr *cp;
|
||||
fnstr *fn;
|
||||
omstr *om;
|
||||
const char *locale_from = "--locale";
|
||||
|
||||
#ifdef SUPPORT_PCRE2GREP_JIT
|
||||
|
@ -3679,20 +3684,6 @@ must use STDOUT_NL to terminate lines. */
|
|||
_setmode(_fileno(stdout), _O_BINARY);
|
||||
#endif
|
||||
|
||||
/* Set up a default compile and match contexts and a match data block. */
|
||||
|
||||
compile_context = pcre2_compile_context_create(NULL);
|
||||
match_context = pcre2_match_context_create(NULL);
|
||||
match_data = pcre2_match_data_create(OFFSET_SIZE, NULL);
|
||||
offsets = pcre2_get_ovector_pointer(match_data);
|
||||
|
||||
/* If string (script) callouts are supported, set up the callout processing
|
||||
function. */
|
||||
|
||||
#ifdef SUPPORT_PCRE2GREP_CALLOUT
|
||||
pcre2_set_callout(match_context, pcre2grep_callout, NULL);
|
||||
#endif
|
||||
|
||||
/* Process the options */
|
||||
|
||||
for (i = 1; i < argc; i++)
|
||||
|
@ -4039,12 +4030,40 @@ if (only_matching_count > 1)
|
|||
pcre2grep_exit(usage(2));
|
||||
}
|
||||
|
||||
/* Check that there is a big enough ovector for all -o settings. */
|
||||
|
||||
for (om = only_matching; om != NULL; om = om->next)
|
||||
{
|
||||
int n = om->groupnum;
|
||||
if (n > (int)capture_max)
|
||||
{
|
||||
fprintf(stderr, "pcre2grep: Requested group %d cannot be captured.\n", n);
|
||||
fprintf(stderr, "pcre2grep: Use --om-capture to increase the size of the capture vector.\n");
|
||||
goto EXIT2;
|
||||
}
|
||||
}
|
||||
|
||||
/* Check the text supplied to --output for errors. */
|
||||
|
||||
if (output_text != NULL &&
|
||||
!syntax_check_output_text((PCRE2_SPTR)output_text, FALSE))
|
||||
goto EXIT2;
|
||||
|
||||
/* Set up default compile and match contexts and a match data block. */
|
||||
|
||||
offset_size = capture_max + 1;
|
||||
compile_context = pcre2_compile_context_create(NULL);
|
||||
match_context = pcre2_match_context_create(NULL);
|
||||
match_data = pcre2_match_data_create(offset_size, NULL);
|
||||
offsets = pcre2_get_ovector_pointer(match_data);
|
||||
|
||||
/* If string (script) callouts are supported, set up the callout processing
|
||||
function. */
|
||||
|
||||
#ifdef SUPPORT_PCRE2GREP_CALLOUT
|
||||
pcre2_set_callout(match_context, pcre2grep_callout, NULL);
|
||||
#endif
|
||||
|
||||
/* Put limits into the match context. */
|
||||
|
||||
if (heap_limit != PCRE2_UNSET) pcre2_set_heap_limit(match_context, heap_limit);
|
||||
|
|
|
@ -949,3 +949,10 @@ RC=0
|
|||
---------------------------- Test 126 -----------------------------
|
||||
ABC |