Improve class handling for \H and \V by ignoring caseless.

This commit is contained in:
Philip.Hazel 2017-01-03 18:17:31 +00:00
parent 15c30bf55a
commit 255bc030d9
5 changed files with 140 additions and 26 deletions

View File

@ -309,6 +309,11 @@ strings.
47. Detecting patterns that are too large inside the length-measuring loop
saves processing ridiculously long patterns to their end.
48. Ignore PCRE2_CASELESS when processing \h, \H, \v, and \V in classes as it
just wastes time. In the UTF case it can also produce redundant entries in
XCLASS lists caused by characters with multiple other cases and pairs of
characters in the same "not-x" sublists.
Version 10.22 29-July-2016
--------------------------

View File

@ -117,7 +117,7 @@ them will be able to (i.e. assume a 64-bit world). */
/* Function definitions to allow mutual recursion */
static unsigned int
add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
const uint32_t *, unsigned int);
static int
@ -4219,13 +4219,15 @@ return 0;
/*************************************************
* Add a character or range to a class *
* Add a character or range to a class (internal) *
*************************************************/
/* This function packages up the logic of adding a character or range of
characters to a class. The character values in the arguments will be within the
valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
mutually recursive with the function immediately below.
valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
called only from within the "add to class" group of functions, some of which
are recursive and mutually recursive. The external entry point is
add_to_class().
Arguments:
classbits the bit map for characters < 256
@ -4240,8 +4242,8 @@ Returns: the number of < 256 characters added
*/
static unsigned int
add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
compile_block *cb, uint32_t start, uint32_t end)
add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
{
uint32_t c;
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
@ -4267,12 +4269,12 @@ if ((options & PCRE2_CASELESS) != 0)
{
/* Handle a single character that has more than one other case. */
if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cb,
if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
PRIV(ucd_caseless_sets) + rc, oc);
/* Do nothing if the other case range is within the original range. */
else if (oc >= start && od <= end) continue;
else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
/* Extend the original range if there is overlap, noting that if oc < c, we
can't have od > end because a subrange is always shorter than the basic
@ -4284,7 +4286,7 @@ if ((options & PCRE2_CASELESS) != 0)
end = od; /* Extend upwards */
if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
}
else n8 += add_to_class(classbits, uchardptr, options, cb, oc, od);
else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
}
}
else
@ -4299,12 +4301,14 @@ if ((options & PCRE2_CASELESS) != 0)
}
}
/* Now handle the original range. Adjust the final value according to the bit
length - this means that the same lists of (e.g.) horizontal spaces can be used
in all cases. */
/* Now handle the originally supplied range. Adjust the final value according
to the bit length - this means that the same lists of (e.g.) horizontal spaces
can be used in all cases. */
if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
end = MAX_NON_UTF_CHAR;
if (start > cb->class_range_start && end < cb->class_range_end) return n8;
/* Use the bitmap for characters < 256. Otherwise use extra data.*/
@ -4357,10 +4361,10 @@ if (end >= start)
*uchardata++ = XCL_SINGLE;
*uchardata++ = start;
}
#endif
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
*uchardptr = uchardata; /* Update extra data pointer */
}
#else
#else /* SUPPORT_WIDE_CHARS */
(void)uchardptr; /* Avoid compiler warning */
#endif /* SUPPORT_WIDE_CHARS */
@ -4370,14 +4374,85 @@ return n8; /* Number of 8-bit characters */
/*************************************************
* Add a list of characters to a class *
* Add a list of characters to a class (internal) *
*************************************************/
/* This function is used for adding a list of case-equivalent characters to a
class, and also for adding a list of horizontal or vertical whitespace. If the
list is in order (which it should be), ranges of characters are detected and
handled appropriately. This function is mutually recursive with the function
above.
handled appropriately. This function is called (sometimes recursively) only
from within the "add to class" set of functions. The external entry point is
add_list_to_class().
Arguments:
classbits the bit map for characters < 256
uchardptr points to the pointer for extra data
options the options word
cb contains pointers to tables etc.
p points to row of 32-bit values, terminated by NOTACHAR
except character to omit; this is used when adding lists of
case-equivalent characters to avoid including the one we
already know about
Returns: the number of < 256 characters added
the pointer to extra data is updated
*/
static unsigned int
add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
  uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
{
unsigned int added = 0;   /* Running total returned by add_to_class_internal() */

/* Walk the NOTACHAR-terminated list one run at a time. */

for (; *p < NOTACHAR; p++)
  {
  const uint32_t *last = p;

  /* The excluded value is stepped over on its own, without looking for a run
  that might start at it. */

  if (*p == except) continue;

  /* Advance to the final member of a run of consecutive values so that the
  whole run can be handed over as a single range. */

  while (last[1] == last[0] + 1) last++;
  added += add_to_class_internal(classbits, uchardptr, options, cb, *p, *last);

  /* Resume after the run; the loop increment moves on to the next entry. */

  p = last;
  }

return added;
}
/*************************************************
* External entry point for add range to class *
*************************************************/
/* This function sets the overall range so that the internal functions can try
to avoid duplication when handling case-independence.
Arguments:
classbits the bit map for characters < 256
uchardptr points to the pointer for extra data
options the options word
cb compile data
start start of range character
end end of range character
Returns: the number of < 256 characters added
the pointer to extra data is updated
*/
static unsigned int
add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
  compile_block *cb, uint32_t start, uint32_t end)
{
unsigned int n8;

/* Remember the limits of the range as supplied by the caller. The internal
function compares against these saved values when it looks at other-case
ranges and when it decides whether a range still needs to be added. */

cb->class_range_end = end;
cb->class_range_start = start;

n8 = add_to_class_internal(classbits, uchardptr, options, cb, start, end);
return n8;
}
/*************************************************
* External entry point for add list to class *
*************************************************/
/* This function sets the overall range so that the internal functions can try
to avoid duplication when handling case-independence.
Arguments:
classbits the bit map for characters < 256
@ -4404,7 +4479,9 @@ while (p[0] < NOTACHAR)
if (p[0] != except)
{
while(p[n+1] == p[0] + n + 1) n++;
n8 += add_to_class(classbits, uchardptr, options, cb, p[0], p[n]);
cb->class_range_start = p[0];
cb->class_range_end = p[n];
n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
}
p += n + 1;
}
@ -5071,25 +5148,31 @@ for (;; pptr++)
should_flip_negation = TRUE;
for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
break;
/* When adding the horizontal or vertical space lists to a class, or
their complements, disable PCRE2_CASELESS, because it just wastes
time, and in the "not-x" UTF cases can create unwanted duplicates in
the XCLASS list (provoked by characters that have more than one other
case and by both cases being in the same "not-x" sublist). */
case ESC_h:
(void)add_list_to_class(classbits, &class_uchardata, options, cb,
PRIV(hspace_list), NOTACHAR);
(void)add_list_to_class(classbits, &class_uchardata,
options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
break;
case ESC_H:
(void)add_not_list_to_class(classbits, &class_uchardata, options,
cb, PRIV(hspace_list));
(void)add_not_list_to_class(classbits, &class_uchardata,
options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
break;
case ESC_v:
(void)add_list_to_class(classbits, &class_uchardata, options, cb,
PRIV(vspace_list), NOTACHAR);
(void)add_list_to_class(classbits, &class_uchardata,
options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
break;
case ESC_V:
(void)add_not_list_to_class(classbits, &class_uchardata, options,
cb, PRIV(vspace_list));
(void)add_not_list_to_class(classbits, &class_uchardata,
options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
break;
case ESC_p:

View File

@ -719,6 +719,8 @@ typedef struct compile_block {
uint32_t backref_map; /* Bitmap of low back refs */
uint32_t nltype; /* Newline type */
uint32_t nllen; /* Newline string length */
uint32_t class_range_start; /* Overall class range start */
uint32_t class_range_end; /* Overall class range end */
PCRE2_UCHAR nl[4]; /* Newline string when fixed length */
int max_lookbehind; /* Maximum lookbehind (characters) */
int parens_depth; /* Depth of nested parentheses */

6
testdata/testinput5 vendored
View File

@ -1757,4 +1757,10 @@
/^(?<!(?=􃡜))/B,utf
# Horizontal and vertical space lists ignore caseless
/[\HH]/Bi,utf
/[^\HH]/Bi,utf
# End of testinput5

18
testdata/testoutput5 vendored
View File

@ -4214,4 +4214,22 @@ Failed: error 125 at offset 2: lookbehind assertion is not fixed length
End
------------------------------------------------------------------
# Horizontal and vertical space lists ignore caseless
/[\HH]/Bi,utf
------------------------------------------------------------------
Bra
[\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
Ket
End
------------------------------------------------------------------
/[^\HH]/Bi,utf
------------------------------------------------------------------
Bra
[^\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
Ket
End
------------------------------------------------------------------
# End of testinput5