teach PCRE's '.' to ignore '\n' like perl
When '.' is used in a regexp, it matches every character except those defined as line delimiters. In PCRE the line delimiters are a configurable set, and 2 of those sets do not include '\n'. Perl allows a configurable line delimiter string (not a set) and therefore treats '\n' specially, preventing it from matching regardless of what the delimiter contains. As a result, when PCRE uses one of the sets without '\n', the matches differ: $ printf 'a\nb' | perl -n0le '/a.b/ or exit 1'; echo $? 1 $ printf 'a\nb' | pcre2grep -q -NNUL 'a.b'; echo $? 0 Since the current behaviour of '.' is historical, a new compile option has been added that lets PCRE match Perl's behaviour as an alternative. Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
This commit is contained in:
parent
4085cca917
commit
b9ce2ab6e2
4
HACKING
4
HACKING
|
@ -389,6 +389,7 @@ These items are all just one unit long:
|
|||
|
||||
OP_END end of pattern
|
||||
OP_ANY match any one character other than newline
|
||||
OP_ANY_NOTNL match any one character other than '\n' or newline
|
||||
OP_ALLANY match any one character, including newline
|
||||
OP_ANYBYTE match any single code unit, even in UTF-8/16 mode
|
||||
OP_SOD match start of data: \A
|
||||
|
@ -427,6 +428,9 @@ OP_ASSERT_ACCEPT is used when (*ACCEPT) is encountered within an assertion.
|
|||
This ends the assertion, not the entire pattern match. The assertion (?!) is
|
||||
always optimized to OP_FAIL.
|
||||
|
||||
OP_ANY_NOTNL is used for '.' when PCRE2_NO_DOTNL is set, unless
|
||||
PCRE2_DOTALL is also set as explained below.
|
||||
|
||||
OP_ALLANY is used for '.' when PCRE2_DOTALL is set. It is also used for \C in
|
||||
non-UTF modes and in UTF-32 mode (since one code unit still equals one
|
||||
character). Another use is for [^] when empty classes are permitted
|
||||
|
|
|
@ -143,6 +143,7 @@ D is inspected during pcre2_dfa_match() execution
|
|||
#define PCRE2_EXTENDED_MORE 0x01000000u /* C */
|
||||
#define PCRE2_LITERAL 0x02000000u /* C */
|
||||
#define PCRE2_MATCH_INVALID_UTF 0x04000000u /* J M D */
|
||||
#define PCRE2_NO_DOTNL 0x08000000u /* C */
|
||||
|
||||
/* An additional compile options word is available in the compile context. */
|
||||
|
||||
|
|
|
@ -774,7 +774,7 @@ are allowed. */
|
|||
PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MATCH_UNSET_BACKREF| \
|
||||
PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
|
||||
PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
|
||||
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY)
|
||||
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_NO_DOTNL)
|
||||
|
||||
#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
|
||||
(PCRE2_EXTRA_MATCH_LINE|PCRE2_EXTRA_MATCH_WORD)
|
||||
|
@ -5518,7 +5518,8 @@ for (;; pptr++)
|
|||
zerofirstcuflags = firstcuflags;
|
||||
zeroreqcu = reqcu;
|
||||
zeroreqcuflags = reqcuflags;
|
||||
*code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
|
||||
*code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY :
|
||||
(options & PCRE2_NO_DOTNL)? OP_ANY_NOTNL : OP_ANY;
|
||||
break;
|
||||
|
||||
|
||||
|
@ -7404,7 +7405,8 @@ for (;; pptr++)
|
|||
here because it just makes it horribly messy. */
|
||||
|
||||
default:
|
||||
if (op_previous >= OP_EODN) /* Not a character type - internal error */
|
||||
/* FIXME: instead of this exception OP_ANY_NOTNL should be renumbered */
|
||||
if (op_previous >= OP_EODN && op_previous != OP_ANY_NOTNL) /* Not a character type - internal error */
|
||||
{
|
||||
*errorcodeptr = ERR10;
|
||||
return 0;
|
||||
|
|
|
@ -187,7 +187,8 @@ static const uint8_t coptable[] = {
|
|||
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
|
||||
0, 0, /* COMMIT, COMMIT_ARG */
|
||||
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
|
||||
0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
|
||||
0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
|
||||
0 /* . */
|
||||
};
|
||||
|
||||
/* This table identifies those opcodes that inspect a character. It is used to
|
||||
|
@ -264,7 +265,8 @@ static const uint8_t poptable[] = {
|
|||
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
|
||||
0, 0, /* COMMIT, COMMIT_ARG */
|
||||
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
|
||||
0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
|
||||
0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
|
||||
0 /* . */
|
||||
};
|
||||
|
||||
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
|
||||
|
|
|
@ -1596,6 +1596,10 @@ enum {
|
|||
|
||||
OP_DEFINE, /* 167 */
|
||||
|
||||
/* This is like OP_ANY but also rejects '\n', for compatibility with Perl's '.' */
|
||||
|
||||
OP_ANY_NOTNL, /* 168 */
|
||||
|
||||
/* This is not an opcode, but is used to check that tables indexed by opcode
|
||||
are the correct length, in order to catch updating errors - there have been
|
||||
some in the past. */
|
||||
|
@ -1655,7 +1659,8 @@ some cases doesn't actually use these names at all). */
|
|||
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
|
||||
"*THEN", "*THEN", "*COMMIT", "*COMMIT", "*FAIL", \
|
||||
"*ACCEPT", "*ASSERT_ACCEPT", \
|
||||
"Close", "Skip zero", "Define"
|
||||
"Close", "Skip zero", "Define", \
|
||||
"."
|
||||
|
||||
|
||||
/* This macro defines the length of fixed length operations in the compiled
|
||||
|
@ -1751,7 +1756,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
|||
1, 3, /* COMMIT, COMMIT_ARG */ \
|
||||
1, 1, 1, /* FAIL, ACCEPT, ASSERT_ACCEPT */ \
|
||||
1+IMM2_SIZE, 1, /* CLOSE, SKIPZERO */ \
|
||||
1 /* DEFINE */
|
||||
1, /* DEFINE */ \
|
||||
1 /* . */
|
||||
|
||||
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
|
||||
|
||||
|
|
|
@ -875,6 +875,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
|
||||
|
||||
/*===================================================================== */
|
||||
/* Match any single character type except '\n'; falls through to OP_ANY */
|
||||
|
||||
case OP_ANY_NOTNL:
|
||||
if (*Feptr == CHAR_LF) RRETURN(MATCH_NOMATCH);
|
||||
/* Fall through */
|
||||
|
||||
/* Match any single character type except newline; have to take care with
|
||||
CRLF newlines and partial matching. */
|
||||
|
||||
|
@ -2840,6 +2846,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
#ifdef SUPPORT_UNICODE
|
||||
if (utf) switch(Lctype)
|
||||
{
|
||||
case OP_ANY_NOTNL:
|
||||
case OP_ANY:
|
||||
for (i = 1; i <= Lmin; i++)
|
||||
{
|
||||
|
@ -2848,6 +2855,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
SCHECK_PARTIAL();
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
if ((Lctype == OP_ANY_NOTNL) && (*Feptr == CHAR_LF))
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
|
||||
if (mb->partial != 0 &&
|
||||
Feptr + 1 >= mb->end_subject &&
|
||||
|
@ -3093,6 +3102,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
|
||||
switch(Lctype)
|
||||
{
|
||||
case OP_ANY_NOTNL:
|
||||
case OP_ANY:
|
||||
for (i = 1; i <= Lmin; i++)
|
||||
{
|
||||
|
@ -3101,6 +3111,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
SCHECK_PARTIAL();
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
if ((Lctype == OP_ANY_NOTNL) && (*Feptr == CHAR_LF))
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
if (IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
|
||||
if (mb->partial != 0 &&
|
||||
Feptr + 1 >= mb->end_subject &&
|
||||
|
@ -3610,6 +3622,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
SCHECK_PARTIAL();
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
if ((Lctype == OP_ANY_NOTNL) && (*Feptr == CHAR_LF))
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
if (Lctype == OP_ANY && IS_NEWLINE(Feptr)) RRETURN(MATCH_NOMATCH);
|
||||
GETCHARINC(fc, Feptr);
|
||||
switch(Lctype)
|
||||
|
@ -3737,6 +3751,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
SCHECK_PARTIAL();
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
if ((Lctype == OP_ANY_NOTNL) && (*Feptr == CHAR_LF))
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
if (Lctype == OP_ANY && IS_NEWLINE(Feptr))
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
fc = *Feptr++;
|
||||
|
@ -4174,6 +4190,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
{
|
||||
switch(Lctype)
|
||||
{
|
||||
case OP_ANY_NOTNL:
|
||||
case OP_ANY:
|
||||
for (i = Lmin; i < Lmax; i++)
|
||||
{
|
||||
|
@ -4182,6 +4199,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
SCHECK_PARTIAL();
|
||||
break;
|
||||
}
|
||||
if ((Lctype == OP_ANY_NOTNL) && (*Feptr == CHAR_LF))
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
if (IS_NEWLINE(Feptr)) break;
|
||||
if (mb->partial != 0 && /* Take care with CRLF partial */
|
||||
Feptr + 1 >= mb->end_subject &&
|
||||
|
@ -4423,6 +4442,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
{
|
||||
switch(Lctype)
|
||||
{
|
||||
case OP_ANY_NOTNL:
|
||||
case OP_ANY:
|
||||
for (i = Lmin; i < Lmax; i++)
|
||||
{
|
||||
|
@ -4431,6 +4451,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
|||
SCHECK_PARTIAL();
|
||||
break;
|
||||
}
|
||||
if ((Lctype == OP_ANY_NOTNL) && (*Feptr == CHAR_LF))
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
if (IS_NEWLINE(Feptr)) break;
|
||||
if (mb->partial != 0 && /* Take care with CRLF partial */
|
||||
Feptr + 1 >= mb->end_subject &&
|
||||
|
|
|
@ -358,6 +358,7 @@ for (;;)
|
|||
case OP_WORDCHAR:
|
||||
case OP_ANY:
|
||||
case OP_ALLANY:
|
||||
case OP_ANY_NOTNL:
|
||||
case OP_EXTUNI:
|
||||
case OP_HSPACE:
|
||||
case OP_NOT_HSPACE:
|
||||
|
@ -996,6 +997,7 @@ do
|
|||
case OP_ASSERT_ACCEPT:
|
||||
case OP_ALLANY:
|
||||
case OP_ANY:
|
||||
case OP_ANY_NOTNL:
|
||||
case OP_ANYBYTE:
|
||||
case OP_CIRCM:
|
||||
case OP_CLOSE:
|
||||
|
@ -1392,6 +1394,7 @@ do
|
|||
default:
|
||||
case OP_ANY:
|
||||
case OP_ALLANY:
|
||||
case OP_ANY_NOTNL:
|
||||
return SSB_FAIL;
|
||||
|
||||
case OP_HSPACE:
|
||||
|
|
Loading…
Reference in New Issue