Fix backtracking bug for \C\X* in UTF mode.

2015-04-08 16:33:58 +00:00 · 2015-04-08 16:33:58 +00:00 · aa8ee3ded5
parent 7105d249f6
commit aa8ee3ded5
4 changed files with 29 additions and 7 deletions
--- a/6
+++ b/6
@ -73,6 +73,12 @@ lookbehind assertion. This bug was discovered by the LLVM fuzzer.
 18. There was a similar problem to 17 in pcre2test for global matches, though
 the code there did catch the loop.
 19. If a greedy quantified \X was preceded by \C in UTF mode (e.g. \C\X*),
 and a subsequent item in the pattern caused a non-match, backtracking over the
 repeated \X did not stop at the start of the run, but carried on past the
 start of the subject, causing references to random memory and/or a segfault.
 This bug was discovered by the LLVM fuzzer.
 Version 10.10 06-March-2015
 ---------------------------
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@ -1333,14 +1333,14 @@ for (;;)
        if (*ecode == OP_CALLOUT)
          {
          cb.callout_number = ecode[1 + 2*LINK_SIZE];
-          cb.callout_string_offset = 0; 
+          cb.callout_string_offset = 0;
          cb.callout_string = NULL;
          cb.callout_string_length = 0;
          }
        else
          {
          cb.callout_number = 0;
-          cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE); 
+          cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE);
          cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1;
          cb.callout_string_length =
            callout_length - (1 + 4*LINK_SIZE) - 2;
@ -1408,7 +1408,7 @@ for (;;)
      break;
      case OP_FALSE:
-      case OP_FAIL:   /* The assertion (?!) becomes OP_FAIL */ 
+      case OP_FAIL:   /* The assertion (?!) becomes OP_FAIL */
      break;
      case OP_TRUE:
@ -1760,14 +1760,14 @@ for (;;)
        if (*ecode == OP_CALLOUT)
          {
          cb.callout_number = ecode[1 + 2*LINK_SIZE];
-          cb.callout_string_offset = 0; 
+          cb.callout_string_offset = 0;
          cb.callout_string = NULL;
          cb.callout_string_length = 0;
          }
        else
          {
          cb.callout_number = 0;
-          cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE); 
+          cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE);
          cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1;
          cb.callout_string_length =
            callout_length - (1 + 4*LINK_SIZE) - 2;
@ -5723,12 +5723,17 @@ for (;;)
        if (possessive) continue;    /* No backtracking */
        /* We use <= pp rather than == pp to detect the start of the run while
        backtracking because the use of \C in UTF mode can cause BACKCHAR to
        move back past pp. This is just palliative; the use of \C in UTF mode
        is fraught with danger. */
        for(;;)
          {
          int lgb, rgb;
          PCRE2_SPTR fptr;
-          if (eptr == pp) goto TAIL_RECURSE;   /* At start of char run */
+          if (eptr <= pp) goto TAIL_RECURSE;   /* At start of char run */
          RMATCH(eptr, ecode, offset_top, mb, eptrb, RM45);
          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@ -5746,7 +5751,7 @@ for (;;)
          for (;;)
            {
-            if (eptr == pp) goto TAIL_RECURSE;   /* At start of char run */
+            if (eptr <= pp) goto TAIL_RECURSE;   /* At start of char run */
            fptr = eptr - 1;
            if (!utf) c = *fptr; else
              {
--- a/testdata/testinput4
+++ b/testdata/testinput4
@ -2221,4 +2221,9 @@
 "[\S\V\H]"utf
 /\C\X*TӅ;
 {0,6}\v+
F
 /utf
    Ӆ\x0a
 # End of testinput4
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@ -3741,4 +3741,10 @@ No match
 "[\S\V\H]"utf
 /\C\X*TӅ;
 {0,6}\v+
F
 /utf
    Ӆ\x0a
 No match
 # End of testinput4