Fix auto-possessification bug at the end of a capturing group that is called

recursively.
2018-01-31 17:53:56 +00:00 · 2018-01-31 17:53:56 +00:00 · 53a588431c
parent 7a5b962509
commit 53a588431c
6 changed files with 271 additions and 14 deletions
--- a/9
+++ b/9
@ -134,6 +134,15 @@ groups, making the ovector larger than this. The number has been increased to
 131072, which allows for the maximum number of captures (65535) plus the 
 overall match. This fixes oss-fuzz issue 5415.

+31. Auto-possessification at the end of a capturing group was dependent on what 
+follows the group (e.g. /(a+)b/ would auto-possessify the a+) but this caused 
+incorrect behaviour when the group was called recursively from elsewhere in the 
+pattern where something different might follow. This bug is an unforseen
+consequence of change #1 for 10.30 - the implementation of backtracking into
+recursions. Iterators at the ends of capturing groups are no longer considered
+for auto-possessification if the pattern contains any recursions. Fixes 
+Bugzilla #2232.
+

 Version 10.30 14-August-2017
 ----------------------------
--- a/src/pcre2_auto_possess.c
+++ b/src/pcre2_auto_possess.c
@ -558,47 +558,73 @@ for(;;)
    continue;
    }

+  /* At the end of a branch, skip to the end of the group. */
+
  if (c == OP_ALT)
    {
    do code += GET(code, 1); while (*code == OP_ALT);
    c = *code;
    }

+  /* Inspect the next opcode. */
+
  switch(c)
    {
-    case OP_END:
-    case OP_KETRPOS:
-    /* TRUE only in greedy case. The non-greedy case could be replaced by
-    an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
-    uses more memory, which we cannot get at this stage.) */
+    /* We can always possessify a greedy iterator at the end of the pattern,
+    which is reached after skipping over the final OP_KET. A non-greedy
+    iterator must never be possessified. */

+    case OP_END:
    return base_list[1] != 0;

+    /* When an iterator is at the end of certain kinds of group we can inspect
+    what follows the group by skipping over the closing ket. Note that this
+    does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
+    iteration is variable (could be another iteration or could be the next
+    item). As these two opcodes are not listed in the next switch, they will
+    end up as the next code to inspect, and return FALSE by virtue of being
+    unsupported. */
+
    case OP_KET:
-    /* If the bracket is capturing, and referenced by an OP_RECURSE, or
-    it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
-    cannot be converted to a possessive form. */
+    case OP_KETRPOS:
+    /* The non-greedy case cannot be converted to a possessive form. */

    if (base_list[1] == 0) return FALSE;

+    /* If the bracket is capturing it might be referenced by an OP_RECURSE
+    so its last iterator can never be possessified if the pattern contains
+    recursions. (This could be improved by keeping a list of group numbers that
+    are called by recursion.) */
+
    switch(*(code - GET(code, 1)))
      {
+      case OP_CBRA:
+      case OP_SCBRA:
+      case OP_CBRAPOS:
+      case OP_SCBRAPOS:
+      if (cb->had_recurse) return FALSE;
+      break;
+
+      /* Atomic sub-patterns and assertions can always auto-possessify their
+      last iterator. However, if the group was entered as a result of checking
+      a previous iterator, this is not possible. */
+
      case OP_ASSERT:
      case OP_ASSERT_NOT:
      case OP_ASSERTBACK:
      case OP_ASSERTBACK_NOT:
      case OP_ONCE:

-      /* Atomic sub-patterns and assertions can always auto-possessify their
-      last iterator. However, if the group was entered as a result of checking
-      a previous iterator, this is not possible. */
-
      return !entered_a_group;
      }

+    /* Skip over the bracket and inspect what comes next. */
+
    code += PRIV(OP_lengths)[c];
    continue;

+    /* Handle cases where the next item is a group. */
+
    case OP_ONCE:
    case OP_BRA:
    case OP_CBRA:
@ -637,11 +663,15 @@ for(;;)
    code += PRIV(OP_lengths)[c];
    continue;

+    /* The next opcode does not need special handling; fall through and use it
+    to see if the base can be possessified. */
+
    default:
    break;
    }

-  /* Check for a supported opcode, and load its properties. */
+  /* We now have the next appropriate opcode to compare with the base. Check
+  for a supported opcode, and load its properties. */

  code = get_chr_property_list(code, utf, cb->fcc, list);
  if (code == NULL) return FALSE;    /* Unsupported */
--- a/testdata/testinput1
+++ b/testdata/testinput1
@ -6159,4 +6159,34 @@ ef) x/x,mark
 /((?<=((*ACCEPT))X)\1?Y(*ACCEPT))\1/
    XYYZ

+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
+    aa
+    a
+
+/^(a?)b(?1)a/
+    abaa
+    aba 
+    baa
+    ba  
+
+/^(a?)+b(?1)a/
+    abaa
+    aba 
+    baa
+    ba  
+
+/^(a?)++b(?1)a/
+    abaa
+    aba 
+    baa
+    ba  
+
+/^(a?)+b/
+    b
+    ab
+    aaab 
+
+/(?=a+)a(a+)++b/
+    aab
+
 # End of testinput1 
--- a/testdata/testinput2
+++ b/testdata/testinput2
@ -5412,4 +5412,21 @@ a)"xI
 \= Expect no match
    \na
    
+# These tests are matched in test 1 as they are Perl compatible. Here we are
+# looking at what does and does not get auto-possessified. 
+
+/(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
+
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
+    
+/^(a?)b(?1)a/B
+
+/^(a?)+b(?1)a/B
+
+/^(a?)++b(?1)a/B
+
+/^(a?)+b/B
+
+/(?=a+)a(a+)++b/B
+
 # End of testinput2
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@ -9758,4 +9758,68 @@ No match
 1: Y
 2: 

+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
+    aa
+ 0: aa
+    a
+ 0: a
+
+/^(a?)b(?1)a/
+    abaa
+ 0: abaa
+ 1: a
+    aba 
+ 0: aba
+ 1: a
+    baa
+ 0: baa
+ 1: 
+    ba  
+ 0: ba
+ 1: 
+
+/^(a?)+b(?1)a/
+    abaa
+ 0: abaa
+ 1: 
+    aba 
+ 0: aba
+ 1: 
+    baa
+ 0: baa
+ 1: 
+    ba  
+ 0: ba
+ 1: 
+
+/^(a?)++b(?1)a/
+    abaa
+ 0: abaa
+ 1: 
+    aba 
+ 0: aba
+ 1: 
+    baa
+ 0: baa
+ 1: 
+    ba  
+ 0: ba
+ 1: 
+
+/^(a?)+b/
+    b
+ 0: b
+ 1: 
+    ab
+ 0: ab
+ 1: 
+    aaab 
+ 0: aaab
+ 1: 
+
+/(?=a+)a(a+)++b/
+    aab
+ 0: aab
+ 1: a
+
 # End of testinput1 
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@ -12701,7 +12701,7 @@ Subject length lower bound = 5
        Ket
        a
        CBraPos 1
-        a++
+        a+
        KetRpos
        a
        Ket
@ -16468,6 +16468,113 @@ No match
    \na
 No match
    
+# These tests are matched in test 1 as they are Perl compatible. Here we are
+# looking at what does and does not get auto-possessified. 
+
+/(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
+------------------------------------------------------------------
+        Bra
+        Cond
+        Cond false
+        CBra 1
+        a?
+        Ket
+        Ket
+        ^
+        Recurse
+        a
+        $
+        Ket
+        End
+------------------------------------------------------------------
+
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
+------------------------------------------------------------------
+        Bra
+        Cond
+        Cond false
+        CBra 1
+        a?
+        Ket
+        X
+        Ket
+        ^
+        Recurse
+        a
+        $
+        Ket
+        End
+------------------------------------------------------------------
+    
+/^(a?)b(?1)a/B
+------------------------------------------------------------------
+        Bra
+        ^
+        CBra 1
+        a?
+        Ket
+        b
+        Recurse
+        a
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(a?)+b(?1)a/B
+------------------------------------------------------------------
+        Bra
+        ^
+        SCBra 1
+        a?
+        KetRmax
+        b
+        Recurse
+        a
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(a?)++b(?1)a/B
+------------------------------------------------------------------
+        Bra
+        ^
+        SCBraPos 1
+        a?
+        KetRpos
+        b
+        Recurse
+        a
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(a?)+b/B
+------------------------------------------------------------------
+        Bra
+        ^
+        SCBra 1
+        a?
+        KetRmax
+        b
+        Ket
+        End
+------------------------------------------------------------------
+
+/(?=a+)a(a+)++b/B
+------------------------------------------------------------------
+        Bra
+        Assert
+        a++
+        Ket
+        a
+        CBraPos 1
+        a++
+        KetRpos
+        b
+        Ket
+        End
+------------------------------------------------------------------
+
 # End of testinput2
 Error -65: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data