Fix \C bug with repeated character classes in UTF-8 mode.

This commit is contained in:
Philip.Hazel 2018-02-19 17:26:33 +00:00
parent 553bf8a1dc
commit b26aa366ba
6 changed files with 32 additions and 4 deletions

View File

@ -20,6 +20,11 @@ Unicode newlines" in the default case when --enable-bsr-anycrlf has not been
specified. Similarly, running "pcre2test -C bsr" never produced the result specified. Similarly, running "pcre2test -C bsr" never produced the result
ANY. ANY.
4. Matching the pattern /(*UTF)\C[^\v]+\x80/ against an 8-bit string containing
multi-code-unit characters caused bad behaviour and possibly a crash. This
issue was fixed for other kinds of repeat in release 10.20 by change 19, but
repeating character classes were overlooked.
Version 10.31 12-February-2018 Version 10.31 12-February-2018
------------------------------ ------------------------------

View File

@ -1962,11 +1962,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (reptype == REPTYPE_POS) continue; /* No backtracking */ if (reptype == REPTYPE_POS) continue; /* No backtracking */
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
go too far. */
for (;;) for (;;)
{ {
RMATCH(Fecode, RM201); RMATCH(Fecode, RM201);
if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (Feptr-- == Lstart_eptr) break; /* Tried at original position */ if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
BACKCHAR(Feptr); BACKCHAR(Feptr);
} }
} }
@ -2126,11 +2130,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (reptype == REPTYPE_POS) continue; /* No backtracking */ if (reptype == REPTYPE_POS) continue; /* No backtracking */
/* After \C in UTF mode, Lstart_eptr might be in the middle of a
Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
go too far. */
for(;;) for(;;)
{ {
RMATCH(Fecode, RM101); RMATCH(Fecode, RM101);
if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (Feptr-- == Lstart_eptr) break; /* Tried at original position */ if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) BACKCHAR(Feptr); if (utf) BACKCHAR(Feptr);
#endif #endif
@ -4002,8 +4010,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (reptype == REPTYPE_POS) continue; /* No backtracking */ if (reptype == REPTYPE_POS) continue; /* No backtracking */
/* After \C in UTF mode, Lstart_eptr might be in the middle of a /* After \C in UTF mode, Lstart_eptr might be in the middle of a
Unicode character. Use <= pp to ensure backtracking doesn't go too far. Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't
*/ go too far. */
for(;;) for(;;)
{ {

View File

@ -98,4 +98,7 @@
\= Expect no match - tests \C at end of subject \= Expect no match - tests \C at end of subject
ab ab
/\C[^\v]+\x80/utf
[AΏBŀC]
# End of testinput22 # End of testinput22

View File

@ -171,4 +171,8 @@ No match
ab ab
No match No match
/\C[^\v]+\x80/utf
[AΏBŀC]
No match
# End of testinput22 # End of testinput22

View File

@ -169,4 +169,8 @@ No match
ab ab
No match No match
/\C[^\v]+\x80/utf
[AΏBŀC]
No match
# End of testinput22 # End of testinput22

View File

@ -173,4 +173,8 @@ No match
ab ab
No match No match
/\C[^\v]+\x80/utf
[AΏBŀC]
No match
# End of testinput22 # End of testinput22