Implement PCRE2_EXTRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD for the benefit
of pcre2grep.
This commit is contained in:
Philip.Hazel 2017-06-16 17:51:13 +00:00
parent 6679349203
commit a083420cac
9 changed files with 170 additions and 44 deletions

View File

@ -189,6 +189,9 @@ pattern lines.
41. Implement PCRE2_LITERAL and use it to support REG_NOSPEC.
42. Implement PCRE2_EXTRA_MATCH_LINE and PCRE2_EXTRA_MATCH_WORD for the benefit
of pcre2grep.
Version 10.23 14-February-2017
------------------------------

View File

@ -1,4 +1,4 @@
.TH PCRE2API 3 "15 June 2017" "PCRE2 10.30"
.TH PCRE2API 3 "16 June 2017" "PCRE2 10.30"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.sp
@ -1400,10 +1400,12 @@ If this option is set, all meta-characters in the pattern are disabled, and it
is treated as a literal string. Matching literal strings with a regular
expression engine is not the most efficient way of doing it. If you are doing a
lot of literal matching and are worried about efficiency, you should consider
using other approaches. The only other options that are allowed with
using other approaches. The only other main options that are allowed with
PCRE2_LITERAL are: PCRE2_ANCHORED, PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT,
PCRE2_CASELESS, PCRE2_FIRSTLINE, PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK,
PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. Any other options cause an error.
PCRE2_UTF, and PCRE2_USE_OFFSET_LIMIT. The extra options PCRE2_EXTRA_MATCH_LINE
and PCRE2_EXTRA_MATCH_WORD are also supported. Any other options cause an
error.
.sp
PCRE2_MATCH_UNSET_BACKREF
.sp
@ -1689,6 +1691,24 @@ treated as single-character escapes. For example, \ej is a literal "j" and
\ex{2z} is treated as the literal string "x{2z}". Setting this option means
that typos in patterns may go undetected and have unexpected results. This is a
dangerous option. Use with care.
.sp
PCRE2_EXTRA_MATCH_LINE
.sp
This option is provided for use by the \fB-x\fP option of \fBpcre2grep\fP. It
causes the pattern only to match complete lines. This is achieved by
automatically inserting the code for "^(?:" at the start of the compiled
pattern and ")$" at the end. Thus, when PCRE2_MULTILINE is set, the matched
line may be in the middle of the subject string. This option can be used with
PCRE2_LITERAL.
.sp
PCRE2_EXTRA_MATCH_WORD
.sp
This option is provided for use by the \fB-w\fP option of \fBpcre2grep\fP. It
causes the pattern only to match strings that have a word boundary at the start
and the end. This is achieved by automatically inserting the code for "\eb(?:"
at the start of the compiled pattern and ")\eb" at the end. The option may be
used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is
also set.
.
.
.SH "COMPILATION ERROR CODES"
@ -3519,6 +3539,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 15 June 2017
Last updated: 16 June 2017
Copyright (c) 1997-2017 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2TEST 1 "15 June 2017" "PCRE 10.30"
.TH PCRE2TEST 1 "16 June 2017" "PCRE2 10.30"
.SH NAME
pcre2test - a program for testing Perl-compatible regular expressions.
.SH SYNOPSIS
@ -556,7 +556,9 @@ for a description of the effects of these options.
/xx extended_more set PCRE2_EXTENDED_MORE
firstline set PCRE2_FIRSTLINE
literal set PCRE2_LITERAL
match_line set PCRE2_EXTRA_MATCH_LINE
match_unset_backref set PCRE2_MATCH_UNSET_BACKREF
match_word set PCRE2_EXTRA_MATCH_WORD
/m multiline set PCRE2_MULTILINE
never_backslash_c set PCRE2_NEVER_BACKSLASH_C
never_ucp set PCRE2_NEVER_UCP
@ -1835,6 +1837,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 15 June 2017
Last updated: 16 June 2017
Copyright (c) 1997-2017 University of Cambridge.
.fi

View File

@ -142,8 +142,10 @@ D is inspected during pcre2_dfa_match() execution
/* An additional compile options word is available in the compile context. */
#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */
#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */
#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */
#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */
#define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */
#define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */
/* These are for pcre2_jit_compile(). */

View File

@ -142,8 +142,10 @@ D is inspected during pcre2_dfa_match() execution
/* An additional compile options word is available in the compile context. */
#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */
#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */
#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */
#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */
#define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */
#define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */
/* These are for pcre2_jit_compile(). */

View File

@ -690,24 +690,30 @@ static int posix_substitutes[] = {
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / (2*sizeof(uint32_t)))
#endif /* SUPPORT_UNICODE */
/* Masks for checking option settings. */
#define PUBLIC_COMPILE_OPTIONS \
(PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \
PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_ENDANCHORED|PCRE2_EXTENDED| \
PCRE2_EXTENDED_MORE|PCRE2_FIRSTLINE|PCRE2_LITERAL| \
PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \
PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \
PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \
PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_USE_OFFSET_LIMIT| \
PCRE2_UTF)
/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
are allowed. */
#define PUBLIC_LITERAL_COMPILE_OPTIONS \
(PCRE2_ANCHORED|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_ENDANCHORED| \
PCRE2_FIRSTLINE|PCRE2_LITERAL|PCRE2_NO_START_OPTIMIZE| \
PCRE2_NO_UTF_CHECK|PCRE2_USE_OFFSET_LIMIT|PCRE2_UTF)
#define PUBLIC_COMPILE_OPTIONS \
(PUBLIC_LITERAL_COMPILE_OPTIONS| \
PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
PCRE2_ALT_VERBNAMES|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)
#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
(PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD)
#define PUBLIC_COMPILE_EXTRA_OPTIONS \
(PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL)
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c may need to be updated, and a new error text must be
@ -2304,6 +2310,20 @@ PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
named_group *ng;
nest_save *top_nest, *end_nests;
/* Insert leading items for word and line matching (features provided for the
benefit of pcre2grep). */
if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
{
*parsed_pattern++ = META_CIRCUMFLEX;
*parsed_pattern++ = META_NOCAPTURE;
}
else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
{
*parsed_pattern++ = META_ESCAPE + ESC_b;
*parsed_pattern++ = META_NOCAPTURE;
}
/* If the pattern is actually a literal string, process it separately to avoid
cluttering up the main loop. */
@ -2323,8 +2343,7 @@ if ((options & PCRE2_LITERAL) != 0)
auto_callout, parsed_pattern, cb);
PARSED_LITERAL(c, parsed_pattern);
}
*parsed_pattern = META_END;
return 0;
goto PARSED_END;
}
/* Process a real regex which may contain meta-characters. */
@ -4166,9 +4185,24 @@ if (inverbname && ptr >= ptrend)
/* Manage callout for the final item */
PARSED_END:
parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
parsed_pattern, cb);
/* Insert trailing items for word and line matching (features provided for the
benefit of pcre2grep). */
if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_LINE) != 0)
{
*parsed_pattern++ = META_KET;
*parsed_pattern++ = META_DOLLAR;
}
else if ((cb->cx->extra_options & PCRE2_EXTRA_MATCH_WORD) != 0)
{
*parsed_pattern++ = META_KET;
*parsed_pattern++ = META_ESCAPE + ESC_b;
}
/* Terminate the parsed pattern, then return success if all groups are closed.
Otherwise we have unclosed parentheses. */
@ -4177,6 +4211,7 @@ if (parsed_pattern >= parsed_pattern_end)
errorcode = ERR63; /* Internal error (parsed pattern overflow) */
goto FAILED;
}
*parsed_pattern = META_END;
if (nest_depth == 0) return 0;
@ -8984,26 +9019,28 @@ if (pattern == NULL)
return NULL;
}
/* A NULL compile context means "use a default context" */
if (ccontext == NULL)
ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
/* Check that all undefined public option bits are zero. */
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
(ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
{
*errorptr = ERR17;
return NULL;
}
if ((options & PCRE2_LITERAL) != 0 &&
(options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0)
((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
(ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
{
*errorptr = ERR92;
return NULL;
}
/* A NULL compile context means "use a default context" */
if (ccontext == NULL)
ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
/* A zero-terminated pattern is indicated by the special length value
PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
@ -9262,10 +9299,10 @@ and comments removed (amongst other things).
In all but one case, when PCRE2_AUTO_CALLOUT is not set, the number of unsigned
32-bit ints in the parsed pattern is bounded by the length of the pattern plus
one (for the terminator). The exceptional case is when running in 32-bit,
non-UTF mode, when literal characters greater than META_END (0x80000000) have
to be coded as two units. In this case, therefore, we scan the pattern to check
for such values. */
one (for the terminator) plus four if PCRE2_EXTRA_MATCH_WORD or
PCRE2_EXTRA_MATCH_LINE is set. The exceptional case is when running in 32-bit,
non-UTF mode, when literal characters greater than META_END (0x80000000) have
to be coded as two units. In this case, therefore, we scan the pattern to check
for such values. */
#if PCRE2_CODE_UNIT_WIDTH == 32
if (!utf)
@ -9282,6 +9319,11 @@ many smaller patterns the vector on the stack (which was set up above) can be
used. */
parsed_size_needed = patlen - skipatstart + big32count;
if ((ccontext->extra_options &
(PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
parsed_size_needed += 4;
if ((options & PCRE2_AUTO_CALLOUT) != 0)
parsed_size_needed = (parsed_size_needed + 1) * 5;

View File

@ -638,7 +638,9 @@ static modstruct modlist[] = {
{ "locale", MOD_PAT, MOD_STR, LOCALESIZE, PO(locale) },
{ "mark", MOD_PNDP, MOD_CTL, CTL_MARK, PO(control) },
{ "match_limit", MOD_CTM, MOD_INT, 0, MO(match_limit) },
{ "match_line", MOD_CTC, MOD_OPT, PCRE2_EXTRA_MATCH_LINE, CO(extra_options) },
{ "match_unset_backref", MOD_PAT, MOD_OPT, PCRE2_MATCH_UNSET_BACKREF, PO(options) },
{ "match_word", MOD_CTC, MOD_OPT, PCRE2_EXTRA_MATCH_WORD, CO(extra_options) },
{ "max_pattern_length", MOD_CTC, MOD_SIZ, 0, CO(max_pattern_length) },
{ "memory", MOD_PD, MOD_CTL, CTL_MEMORY, PD(control) },
{ "multiline", MOD_PATP, MOD_OPT, PCRE2_MULTILINE, PO(options) },

22
testdata/testinput2 vendored
View File

@ -5305,7 +5305,7 @@ a)"xI
X\na\\b(c
/a\b?c/literal,use_offset_limit
XXXXa\\b?c\=offset_limit=5
XXXXa\\b?c\=offset_limit=4
\= Expect no match
XXXXa\\b?c\=offset_limit=3
@ -5327,4 +5327,24 @@ a)"xI
/(*CR)abc/literal
(*CR)abc
/cat|dog/match_word
the cat sat
\= Expect no match
caterpillar
snowcat
syndicate
/(cat)|dog/match_line,literal
(cat)|dog
\= Expect no match
the cat sat
caterpillar
snowcat
syndicate
/a whole line/match_line,multiline
Rhubarb \na whole line\n custard
\= Expect no match
Not a whole line
# End of testinput2

39
testdata/testoutput2 vendored
View File

@ -16033,7 +16033,7 @@ Failed: error 108 at offset 4: range out of order in character class
No match
/a\b?c/literal,use_offset_limit
XXXXa\\b?c\=offset_limit=5
XXXXa\\b?c\=offset_limit=4
0: a\b?c
\= Expect no match
XXXXa\\b?c\=offset_limit=3
@ -16064,7 +16064,8 @@ Failed: error 192 at offset 0: invalid option bits with PCRE2_LITERAL
+1 ^^ \
+2 ^ ^ b
+3 ^ ^ (
+4 ^ ^
+4 ^ ^ c
+5 ^ ^
0: a\b(c
/a\b(c/literal,auto_callout
@ -16074,13 +16075,45 @@ Failed: error 192 at offset 0: invalid option bits with PCRE2_LITERAL
+1 ^^ \
+2 ^ ^ b
+3 ^ ^ (
+4 ^ ^
+4 ^ ^ c
+5 ^ ^
0: a\b(c
/(*CR)abc/literal
(*CR)abc
0: (*CR)abc
/cat|dog/match_word
the cat sat
0: cat
\= Expect no match
caterpillar
No match
snowcat
No match
syndicate
No match
/(cat)|dog/match_line,literal
(cat)|dog
0: (cat)|dog
\= Expect no match
the cat sat
No match
caterpillar
No match
snowcat
No match
syndicate
No match
/a whole line/match_line,multiline
Rhubarb \na whole line\n custard
0: a whole line
\= Expect no match
Not a whole line
No match
# End of testinput2
Error -65: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data