Fix auto-possessification bug at the end of a capturing group that is called recursively.
2018-01-31 17:53:56 +00:00 · 2018-01-31 17:53:56 +00:00 · 53a588431c
parent 7a5b962509
commit 53a588431c
6 changed files with 271 additions and 14 deletions
--- a/9
+++ b/9
@ -134,6 +134,15 @@ groups, making the ovector larger than this. The number has been increased to
 131072, which allows for the maximum number of captures (65535) plus the 
 overall match. This fixes oss-fuzz issue 5415.
 31. Auto-possessification at the end of a capturing group was dependent on what 
 follows the group (e.g. /(a+)b/ would auto-possessify the a+) but this caused 
 incorrect behaviour when the group was called recursively from elsewhere in the 
 pattern where something different might follow. This bug is an unforeseen
 consequence of change #1 for 10.30 - the implementation of backtracking into
 recursions. Iterators at the ends of capturing groups are no longer considered
 for auto-possessification if the pattern contains any recursions. Fixes 
 Bugzilla #2232.
 Version 10.30 14-August-2017
 ----------------------------
--- a/src/pcre2_auto_possess.c
+++ b/src/pcre2_auto_possess.c
@ -558,47 +558,73 @@ for(;;)
    continue;
    }
  /* At the end of a branch, skip to the end of the group. */
  if (c == OP_ALT)
    {
    do code += GET(code, 1); while (*code == OP_ALT);
    c = *code;
    }
  /* Inspect the next opcode. */
  switch(c)
    {
-    case OP_END:
+    /* We can always possessify a greedy iterator at the end of the pattern,
-    case OP_KETRPOS:
+    which is reached after skipping over the final OP_KET. A non-greedy
-    /* TRUE only in greedy case. The non-greedy case could be replaced by
+    iterator must never be possessified. */
    an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
    uses more memory, which we cannot get at this stage.) */
    case OP_END:
    return base_list[1] != 0;
    /* When an iterator is at the end of certain kinds of group we can inspect
    what follows the group by skipping over the closing ket. Note that this
    does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
    iteration is variable (could be another iteration or could be the next
    item). As these two opcodes are not listed in the next switch, they will
    end up as the next code to inspect, and return FALSE by virtue of being
    unsupported. */
    case OP_KET:
-    /* If the bracket is capturing, and referenced by an OP_RECURSE, or
+    case OP_KETRPOS:
-    it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
+    /* The non-greedy case cannot be converted to a possessive form. */
    cannot be converted to a possessive form. */
    if (base_list[1] == 0) return FALSE;
    /* If the bracket is capturing it might be referenced by an OP_RECURSE
    so its last iterator can never be possessified if the pattern contains
    recursions. (This could be improved by keeping a list of group numbers that
    are called by recursion.) */
    switch(*(code - GET(code, 1)))
      {
      case OP_CBRA:
      case OP_SCBRA:
      case OP_CBRAPOS:
      case OP_SCBRAPOS:
      if (cb->had_recurse) return FALSE;
      break;
      /* Atomic sub-patterns and assertions can always auto-possessify their
      last iterator. However, if the group was entered as a result of checking
      a previous iterator, this is not possible. */
      case OP_ASSERT:
      case OP_ASSERT_NOT:
      case OP_ASSERTBACK:
      case OP_ASSERTBACK_NOT:
      case OP_ONCE:
      /* Atomic sub-patterns and assertions can always auto-possessify their
      last iterator. However, if the group was entered as a result of checking
      a previous iterator, this is not possible. */
      return !entered_a_group;
      }
    /* Skip over the bracket and inspect what comes next. */
    code += PRIV(OP_lengths)[c];
    continue;
    /* Handle cases where the next item is a group. */
    case OP_ONCE:
    case OP_BRA:
    case OP_CBRA:
@ -637,11 +663,15 @@ for(;;)
    code += PRIV(OP_lengths)[c];
    continue;
    /* The next opcode does not need special handling; fall through and use it
    to see if the base can be possessified. */
    default:
    break;
    }
-  /* Check for a supported opcode, and load its properties. */
+  /* We now have the next appropriate opcode to compare with the base. Check
  for a supported opcode, and load its properties. */
  code = get_chr_property_list(code, utf, cb->fcc, list);
  if (code == NULL) return FALSE;    /* Unsupported */
--- a/testdata/testinput1
+++ b/testdata/testinput1
@ -6159,4 +6159,34 @@ ef) x/x,mark
 /((?<=((*ACCEPT))X)\1?Y(*ACCEPT))\1/
    XYYZ
 /(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
    aa
    a
 /^(a?)b(?1)a/
    abaa
    aba 
    baa
    ba  
 /^(a?)+b(?1)a/
    abaa
    aba 
    baa
    ba  
 /^(a?)++b(?1)a/
    abaa
    aba 
    baa
    ba  
 /^(a?)+b/
    b
    ab
    aaab 
 /(?=a+)a(a+)++b/
    aab
 # End of testinput1 
--- a/testdata/testinput2
+++ b/testdata/testinput2
@ -5412,4 +5412,21 @@ a)"xI
 \= Expect no match
    \na
 # These tests are matched in test 1 as they are Perl compatible. Here we are
 # looking at what does and does not get auto-possessified. 
 /(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
 /(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
 /^(a?)b(?1)a/B
 /^(a?)+b(?1)a/B
 /^(a?)++b(?1)a/B
 /^(a?)+b/B
 /(?=a+)a(a+)++b/B
 # End of testinput2
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@ -9758,4 +9758,68 @@ No match
 1: Y
 2: 
 /(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
    aa
 0: aa
    a
 0: a
 /^(a?)b(?1)a/
    abaa
 0: abaa
 1: a
    aba 
 0: aba
 1: a
    baa
 0: baa
 1: 
    ba  
 0: ba
 1: 
 /^(a?)+b(?1)a/
    abaa
 0: abaa
 1: 
    aba 
 0: aba
 1: 
    baa
 0: baa
 1: 
    ba  
 0: ba
 1: 
 /^(a?)++b(?1)a/
    abaa
 0: abaa
 1: 
    aba 
 0: aba
 1: 
    baa
 0: baa
 1: 
    ba  
 0: ba
 1: 
 /^(a?)+b/
    b
 0: b
 1: 
    ab
 0: ab
 1: 
    aaab 
 0: aaab
 1: 
 /(?=a+)a(a+)++b/
    aab
 0: aab
 1: a
 # End of testinput1 
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@ -12701,7 +12701,7 @@ Subject length lower bound = 5
        Ket
        a
        CBraPos 1
-        a++
+        a+
        KetRpos
        a
        Ket
@ -16468,6 +16468,113 @@ No match
    \na
 No match
 # These tests are matched in test 1 as they are Perl compatible. Here we are
 # looking at what does and does not get auto-possessified. 
 /(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
 ------------------------------------------------------------------
        Bra
        Cond
        Cond false
        CBra 1
        a?
        Ket
        Ket
        ^
        Recurse
        a
        $
        Ket
        End
 ------------------------------------------------------------------
 /(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
 ------------------------------------------------------------------
        Bra
        Cond
        Cond false
        CBra 1
        a?
        Ket
        X
        Ket
        ^
        Recurse
        a
        $
        Ket
        End
 ------------------------------------------------------------------
 /^(a?)b(?1)a/B
 ------------------------------------------------------------------
        Bra
        ^
        CBra 1
        a?
        Ket
        b
        Recurse
        a
        Ket
        End
 ------------------------------------------------------------------
 /^(a?)+b(?1)a/B
 ------------------------------------------------------------------
        Bra
        ^
        SCBra 1
        a?
        KetRmax
        b
        Recurse
        a
        Ket
        End
 ------------------------------------------------------------------
 /^(a?)++b(?1)a/B
 ------------------------------------------------------------------
        Bra
        ^
        SCBraPos 1
        a?
        KetRpos
        b
        Recurse
        a
        Ket
        End
 ------------------------------------------------------------------
 /^(a?)+b/B
 ------------------------------------------------------------------
        Bra
        ^
        SCBra 1
        a?
        KetRmax
        b
        Ket
        End
 ------------------------------------------------------------------
 /(?=a+)a(a+)++b/B
 ------------------------------------------------------------------
        Bra
        Assert
        a++
        Ket
        a
        CBraPos 1
        a++
        KetRpos
        b
        Ket
        End
 ------------------------------------------------------------------
 # End of testinput2
 Error -65: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data