teach PCRE's '.' to ignore '\n' like perl

When '.' is used in a regexp, it matches every character except
those defined as line delimiters. In PCRE the line delimiters are a
configurable set, and 2 of the available sets do not include '\n'.

perl allows for a configurable line delimiter string (not a set),
and therefore treats '\n' specially: '.' never matches it,
regardless of what the delimiter contains. As a result, when PCRE
uses one of those sets without '\n', the matches will differ:

 $ printf 'a\nb' | perl -n0le '/a.b/ or exit 1'; echo $?
 1
 $ printf 'a\nb' | pcre2grep -q -NNUL 'a.b'; echo $?
 0

Since the current behaviour for '.' is historical, a new compile
option (PCRE2_NO_DOTNL) has been added so that PCRE can match
perl's behaviour as an opt-in alternative.

Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
This commit is contained in:
Carlo Marcelo Arenas Belón 2021-11-12 18:29:54 -08:00
parent 4085cca917
commit b9ce2ab6e2
7 changed files with 47 additions and 7 deletions

View File

@ -389,6 +389,7 @@ These items are all just one unit long:
OP_END end of pattern
OP_ANY match any one character other than newline
OP_ANY_NOTNL match any one character other than '\n' or newline
OP_ALLANY match any one character, including newline
OP_ANYBYTE match any single code unit, even in UTF-8/16 mode
OP_SOD match start of data: \A
@ -427,6 +428,9 @@ OP_ASSERT_ACCEPT is used when (*ACCEPT) is encountered within an assertion.
This ends the assertion, not the entire pattern match. The assertion (?!) is
always optimized to OP_FAIL.
OP_ANY_NOTNL is used for '.' when PCRE2_NO_DOTNL is set, unless
PCRE2_DOTALL is also set, in which case OP_ALLANY is used as explained below.
OP_ALLANY is used for '.' when PCRE2_DOTALL is set. It is also used for \C in
non-UTF modes and in UTF-32 mode (since one code unit still equals one
character). Another use is for [^] when empty classes are permitted

View File

@ -143,6 +143,7 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_EXTENDED_MORE 0x01000000u /* C */
#define PCRE2_LITERAL 0x02000000u /* C */
#define PCRE2_MATCH_INVALID_UTF 0x04000000u /* J M D */
#define PCRE2_NO_DOTNL 0x08000000u /* C */
/* An additional compile options word is available in the compile context. */

View File

@ -774,7 +774,7 @@ are allowed. */
PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_NO_DOTNL)
#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
(PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD)
@ -5518,7 +5518,8 @@ for (;; pptr++)
zerofirstcuflags = firstcuflags;
zeroreqcu = reqcu;
zeroreqcuflags = reqcuflags;
*code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
*code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY :
(options & PCRE2_NO_DOTNL)? OP_ANY_NOTNL : OP_ANY;
break;
@ -7404,7 +7405,8 @@ for (;; pptr++)
here because it just makes it horribly messy. */
default:
if (op_previous >= OP_EODN) /* Not a character type - internal error */
/* FIXME: instead of this exception OP_ANY_NOTNL should be renumbered */
if (op_previous >= OP_EODN && op_previous != OP_ANY_NOTNL) /* Not a character type - internal error */
{
*errorcodeptr = ERR10;
return 0;

View File

@ -187,7 +187,8 @@ static const uint8_t coptable[] = {
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
0, 0, /* COMMIT, COMMIT_ARG */
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
0 /* . */
};
/* This table identifies those opcodes that inspect a character. It is used to
@ -264,7 +265,8 @@ static const uint8_t poptable[] = {
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
0, 0, /* COMMIT, COMMIT_ARG */
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
0 /* . */
};
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,

View File

@ -1596,6 +1596,10 @@ enum {
OP_DEFINE, /* 167 */
/* This is like OP_ANY but also rejects '\n' for compatibility with perl's . */
OP_ANY_NOTNL, /* 168 */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
some in the past. */
@ -1655,7 +1659,8 @@ some cases doesn't actually use these names at all). */
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
"*THEN", "*THEN", "*COMMIT", "*COMMIT", "*FAIL", \
"*ACCEPT", "*ASSERT_ACCEPT", \
"Close", "Skip zero", "Define"
"Close", "Skip zero", "Define", \
"."
/* This macro defines the length of fixed length operations in the compiled
@ -1751,7 +1756,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1, 3, /* COMMIT, COMMIT_ARG */ \
1, 1, 1, /* FAIL, ACCEPT, ASSERT_ACCEPT */ \
1+IMM2_SIZE, 1, /* CLOSE, SKIPZERO */ \
1 /* DEFINE */
1, /* DEFINE */ \
1 /* . */
/* A magic value for OP_RREF to indicate the "any recursion" condition. */

View File

@ -875,6 +875,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
/*===================================================================== */
/* Match any single character type except '\n'; falls through to OP_ANY */
case OP_ANY_NOTNL:
if (*Feptr == CHAR_LF) RRETURN(MATCH_NOMATCH);
/* Fall through */
/* Match any single character type except newline; have to take care with
CRLF newlines and partial matching. */
@ -2840,6 +2846,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
#ifdef SUPPORT_UNICODE
if (utf) switch(Lctype)
{
case OP_ANY_NOTNL:
case OP_ANY:
for (i = 1; i <= Lmin; i++)
{
@ -2848,6 +2855,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
if ((Lctype == OP_ANY_NOTNL) && (*Feptr == CHAR_LF))
RRETURN(MATCH_NOMATCH);
if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
if (mb->partial != 0 &&
Feptr + 1 >= mb->end_subject &&
@ -3093,6 +3102,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
switch(Lctype)
{
case OP_ANY_NOTNL:
case OP_ANY:
for (i = 1; i <= Lmin; i++)
{
@ -3101,6 +3111,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
if ((Lctype == OP_ANY_NOTNL) && (*Feptr == CHAR_LF))
RRETURN(MATCH_NOMATCH);
if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
if (mb->partial != 0 &&
Feptr + 1 >= mb->end_subject &&
@ -3610,6 +3622,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
if ((Lctype == OP_ANY_NOTNL) && (*Feptr == CHAR_LF))
RRETURN(MATCH_NOMATCH);
if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
GETCHARINC(fc, Feptr);
switch(Lctype)
@ -3737,6 +3751,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
if ((Lctype == OP_ANY_NOTNL) && (*Feptr == CHAR_LF))
RRETURN(MATCH_NOMATCH);
if (Lctype == OP_ANY && IS_NEWLINE(Feptr))
RRETURN(MATCH_NOMATCH);
fc = *Feptr++;
@ -4174,6 +4190,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
{
switch(Lctype)
{
case OP_ANY_NOTNL:
case OP_ANY:
for (i = Lmin; i < Lmax; i++)
{
@ -4182,6 +4199,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
SCHECK_PARTIAL();
break;
}
if ((Lctype == OP_ANY_NOTNL) && (*Feptr == CHAR_LF))
RRETURN(MATCH_NOMATCH);
if (IS_NEWLINE(Feptr)) break;
if (mb->partial != 0 && /* Take care with CRLF partial */
Feptr + 1 >= mb->end_subject &&
@ -4423,6 +4442,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
{
switch(Lctype)
{
case OP_ANY_NOTNL:
case OP_ANY:
for (i = Lmin; i < Lmax; i++)
{
@ -4431,6 +4451,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
SCHECK_PARTIAL();
break;
}
if ((Lctype == OP_ANY_NOTNL) && (*Feptr == CHAR_LF))
RRETURN(MATCH_NOMATCH);
if (IS_NEWLINE(Feptr)) break;
if (mb->partial != 0 && /* Take care with CRLF partial */
Feptr + 1 >= mb->end_subject &&

View File

@ -358,6 +358,7 @@ for (;;)
case OP_WORDCHAR:
case OP_ANY:
case OP_ALLANY:
case OP_ANY_NOTNL:
case OP_EXTUNI:
case OP_HSPACE:
case OP_NOT_HSPACE:
@ -996,6 +997,7 @@ do
case OP_ASSERT_ACCEPT:
case OP_ALLANY:
case OP_ANY:
case OP_ANY_NOTNL:
case OP_ANYBYTE:
case OP_CIRCM:
case OP_CLOSE:
@ -1392,6 +1394,7 @@ do
default:
case OP_ANY:
case OP_ALLANY:
case OP_ANY_NOTNL:
return SSB_FAIL;
case OP_HSPACE: