Improve class handling for \H and \V by ignoring caseless.
This commit is contained in:
parent
15c30bf55a
commit
255bc030d9
|
@ -309,6 +309,11 @@ strings.
|
||||||
47. Detecting patterns that are too large inside the length-measuring loop
|
47. Detecting patterns that are too large inside the length-measuring loop
|
||||||
saves processing ridiculously long patterns to their end.
|
saves processing ridiculously long patterns to their end.
|
||||||
|
|
||||||
|
48. Ignore PCRE2_CASELESS when processing \h, \H, \v, and \V in classes as it
|
||||||
|
just wastes time. In the UTF case it can also produce redundant entries in
|
||||||
|
XCLASS lists caused by characters with multiple other cases and pairs of
|
||||||
|
characters in the same "not-x" sublists.
|
||||||
|
|
||||||
|
|
||||||
Version 10.22 29-July-2016
|
Version 10.22 29-July-2016
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
|
@ -117,7 +117,7 @@ them will be able to (i.e. assume a 64-bit world). */
|
||||||
/* Function definitions to allow mutual recursion */
|
/* Function definitions to allow mutual recursion */
|
||||||
|
|
||||||
static unsigned int
|
static unsigned int
|
||||||
add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
|
add_list_to_class_internal(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
|
||||||
const uint32_t *, unsigned int);
|
const uint32_t *, unsigned int);
|
||||||
|
|
||||||
static int
|
static int
|
||||||
|
@ -4219,13 +4219,15 @@ return 0;
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Add a character or range to a class *
|
* Add a character or range to a class (internal) *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* This function packages up the logic of adding a character or range of
|
/* This function packages up the logic of adding a character or range of
|
||||||
characters to a class. The character values in the arguments will be within the
|
characters to a class. The character values in the arguments will be within the
|
||||||
valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
|
valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
|
||||||
mutually recursive with the function immediately below.
|
called only from within the "add to class" group of functions, some of which
|
||||||
|
are recursive and mutually recursive. The external entry point is
|
||||||
|
add_to_class().
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
classbits the bit map for characters < 256
|
classbits the bit map for characters < 256
|
||||||
|
@ -4240,8 +4242,8 @@ Returns: the number of < 256 characters added
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static unsigned int
|
static unsigned int
|
||||||
add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
|
add_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
|
||||||
compile_block *cb, uint32_t start, uint32_t end)
|
uint32_t options, compile_block *cb, uint32_t start, uint32_t end)
|
||||||
{
|
{
|
||||||
uint32_t c;
|
uint32_t c;
|
||||||
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
|
uint32_t classbits_end = (end <= 0xff ? end : 0xff);
|
||||||
|
@ -4267,12 +4269,12 @@ if ((options & PCRE2_CASELESS) != 0)
|
||||||
{
|
{
|
||||||
/* Handle a single character that has more than one other case. */
|
/* Handle a single character that has more than one other case. */
|
||||||
|
|
||||||
if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cb,
|
if (rc > 0) n8 += add_list_to_class_internal(classbits, uchardptr, options, cb,
|
||||||
PRIV(ucd_caseless_sets) + rc, oc);
|
PRIV(ucd_caseless_sets) + rc, oc);
|
||||||
|
|
||||||
/* Do nothing if the other case range is within the original range. */
|
/* Do nothing if the other case range is within the original range. */
|
||||||
|
|
||||||
else if (oc >= start && od <= end) continue;
|
else if (oc >= cb->class_range_start && od <= cb->class_range_end) continue;
|
||||||
|
|
||||||
/* Extend the original range if there is overlap, noting that if oc < c, we
|
/* Extend the original range if there is overlap, noting that if oc < c, we
|
||||||
can't have od > end because a subrange is always shorter than the basic
|
can't have od > end because a subrange is always shorter than the basic
|
||||||
|
@ -4284,7 +4286,7 @@ if ((options & PCRE2_CASELESS) != 0)
|
||||||
end = od; /* Extend upwards */
|
end = od; /* Extend upwards */
|
||||||
if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
|
if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
|
||||||
}
|
}
|
||||||
else n8 += add_to_class(classbits, uchardptr, options, cb, oc, od);
|
else n8 += add_to_class_internal(classbits, uchardptr, options, cb, oc, od);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -4299,13 +4301,15 @@ if ((options & PCRE2_CASELESS) != 0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Now handle the original range. Adjust the final value according to the bit
|
/* Now handle the originally supplied range. Adjust the final value according
|
||||||
length - this means that the same lists of (e.g.) horizontal spaces can be used
|
to the bit length - this means that the same lists of (e.g.) horizontal spaces
|
||||||
in all cases. */
|
can be used in all cases. */
|
||||||
|
|
||||||
if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
|
if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
|
||||||
end = MAX_NON_UTF_CHAR;
|
end = MAX_NON_UTF_CHAR;
|
||||||
|
|
||||||
|
if (start > cb->class_range_start && end < cb->class_range_end) return n8;
|
||||||
|
|
||||||
/* Use the bitmap for characters < 256. Otherwise use extra data.*/
|
/* Use the bitmap for characters < 256. Otherwise use extra data.*/
|
||||||
|
|
||||||
for (c = start; c <= classbits_end; c++)
|
for (c = start; c <= classbits_end; c++)
|
||||||
|
@ -4357,10 +4361,10 @@ if (end >= start)
|
||||||
*uchardata++ = XCL_SINGLE;
|
*uchardata++ = XCL_SINGLE;
|
||||||
*uchardata++ = start;
|
*uchardata++ = start;
|
||||||
}
|
}
|
||||||
#endif
|
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
|
||||||
*uchardptr = uchardata; /* Update extra data pointer */
|
*uchardptr = uchardata; /* Update extra data pointer */
|
||||||
}
|
}
|
||||||
#else
|
#else /* SUPPORT_WIDE_CHARS */
|
||||||
(void)uchardptr; /* Avoid compiler warning */
|
(void)uchardptr; /* Avoid compiler warning */
|
||||||
#endif /* SUPPORT_WIDE_CHARS */
|
#endif /* SUPPORT_WIDE_CHARS */
|
||||||
|
|
||||||
|
@ -4370,14 +4374,85 @@ return n8; /* Number of 8-bit characters */
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Add a list of characters to a class *
|
* Add a list of characters to a class (internal) *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* This function is used for adding a list of case-equivalent characters to a
|
/* This function is used for adding a list of case-equivalent characters to a
|
||||||
class, and also for adding a list of horizontal or vertical whitespace. If the
|
class, and also for adding a list of horizontal or vertical whitespace. If the
|
||||||
list is in order (which it should be), ranges of characters are detected and
|
list is in order (which it should be), ranges of characters are detected and
|
||||||
handled appropriately. This function is mutually recursive with the function
|
handled appropriately. This function is called (sometimes recursively) only
|
||||||
above.
|
from within the "add to class" set of functions. The external entry point is
|
||||||
|
add_list_to_class().
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
classbits the bit map for characters < 256
|
||||||
|
uchardptr points to the pointer for extra data
|
||||||
|
options the options word
|
||||||
|
cb contains pointers to tables etc.
|
||||||
|
p points to row of 32-bit values, terminated by NOTACHAR
|
||||||
|
except character to omit; this is used when adding lists of
|
||||||
|
case-equivalent characters to avoid including the one we
|
||||||
|
already know about
|
||||||
|
|
||||||
|
Returns: the number of < 256 characters added
|
||||||
|
the pointer to extra data is updated
|
||||||
|
*/
|
||||||
|
|
||||||
|
static unsigned int
|
||||||
|
add_list_to_class_internal(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
|
||||||
|
uint32_t options, compile_block *cb, const uint32_t *p, unsigned int except)
|
||||||
|
{
|
||||||
|
unsigned int n8 = 0;
|
||||||
|
while (p[0] < NOTACHAR)
|
||||||
|
{
|
||||||
|
unsigned int n = 0;
|
||||||
|
if (p[0] != except)
|
||||||
|
{
|
||||||
|
while(p[n+1] == p[0] + n + 1) n++;
|
||||||
|
n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
|
||||||
|
}
|
||||||
|
p += n + 1;
|
||||||
|
}
|
||||||
|
return n8;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************************
|
||||||
|
* External entry point for add range to class *
|
||||||
|
*************************************************/
|
||||||
|
|
||||||
|
/* This function sets the overall range so that the internal functions can try
|
||||||
|
to avoid duplication when handling case-independence.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
classbits the bit map for characters < 256
|
||||||
|
uchardptr points to the pointer for extra data
|
||||||
|
options the options word
|
||||||
|
cb compile data
|
||||||
|
start start of range character
|
||||||
|
end end of range character
|
||||||
|
|
||||||
|
Returns: the number of < 256 characters added
|
||||||
|
the pointer to extra data is updated
|
||||||
|
*/
|
||||||
|
|
||||||
|
static unsigned int
|
||||||
|
add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
|
||||||
|
compile_block *cb, uint32_t start, uint32_t end)
|
||||||
|
{
|
||||||
|
cb->class_range_start = start;
|
||||||
|
cb->class_range_end = end;
|
||||||
|
return add_to_class_internal(classbits, uchardptr, options, cb, start, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************************
|
||||||
|
* External entry point for add list to class *
|
||||||
|
*************************************************/
|
||||||
|
|
||||||
|
/* This function sets the overall range so that the internal functions can try
|
||||||
|
to avoid duplication when handling case-independence.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
classbits the bit map for characters < 256
|
classbits the bit map for characters < 256
|
||||||
|
@ -4404,7 +4479,9 @@ while (p[0] < NOTACHAR)
|
||||||
if (p[0] != except)
|
if (p[0] != except)
|
||||||
{
|
{
|
||||||
while(p[n+1] == p[0] + n + 1) n++;
|
while(p[n+1] == p[0] + n + 1) n++;
|
||||||
n8 += add_to_class(classbits, uchardptr, options, cb, p[0], p[n]);
|
cb->class_range_start = p[0];
|
||||||
|
cb->class_range_end = p[n];
|
||||||
|
n8 += add_to_class_internal(classbits, uchardptr, options, cb, p[0], p[n]);
|
||||||
}
|
}
|
||||||
p += n + 1;
|
p += n + 1;
|
||||||
}
|
}
|
||||||
|
@ -5072,24 +5149,30 @@ for (;; pptr++)
|
||||||
for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
|
for (i = 0; i < 32; i++) classbits[i] |= ~cbits[i+cbit_space];
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
/* When adding the horizontal or vertical space lists to a class, or
|
||||||
|
their complements, disable PCRE2_CASELESS, because it just wastes
|
||||||
|
time, and in the "not-x" UTF cases can create unwanted duplicates in
|
||||||
|
the XCLASS list (provoked by characters that have more than one other
|
||||||
|
case and by both cases being in the same "not-x" sublist). */
|
||||||
|
|
||||||
case ESC_h:
|
case ESC_h:
|
||||||
(void)add_list_to_class(classbits, &class_uchardata, options, cb,
|
(void)add_list_to_class(classbits, &class_uchardata,
|
||||||
PRIV(hspace_list), NOTACHAR);
|
options & ~PCRE2_CASELESS, cb, PRIV(hspace_list), NOTACHAR);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ESC_H:
|
case ESC_H:
|
||||||
(void)add_not_list_to_class(classbits, &class_uchardata, options,
|
(void)add_not_list_to_class(classbits, &class_uchardata,
|
||||||
cb, PRIV(hspace_list));
|
options & ~PCRE2_CASELESS, cb, PRIV(hspace_list));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ESC_v:
|
case ESC_v:
|
||||||
(void)add_list_to_class(classbits, &class_uchardata, options, cb,
|
(void)add_list_to_class(classbits, &class_uchardata,
|
||||||
PRIV(vspace_list), NOTACHAR);
|
options & ~PCRE2_CASELESS, cb, PRIV(vspace_list), NOTACHAR);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ESC_V:
|
case ESC_V:
|
||||||
(void)add_not_list_to_class(classbits, &class_uchardata, options,
|
(void)add_not_list_to_class(classbits, &class_uchardata,
|
||||||
cb, PRIV(vspace_list));
|
options & ~PCRE2_CASELESS, cb, PRIV(vspace_list));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case ESC_p:
|
case ESC_p:
|
||||||
|
|
|
@ -719,6 +719,8 @@ typedef struct compile_block {
|
||||||
uint32_t backref_map; /* Bitmap of low back refs */
|
uint32_t backref_map; /* Bitmap of low back refs */
|
||||||
uint32_t nltype; /* Newline type */
|
uint32_t nltype; /* Newline type */
|
||||||
uint32_t nllen; /* Newline string length */
|
uint32_t nllen; /* Newline string length */
|
||||||
|
uint32_t class_range_start; /* Overall class range start */
|
||||||
|
uint32_t class_range_end; /* Overall class range end */
|
||||||
PCRE2_UCHAR nl[4]; /* Newline string when fixed length */
|
PCRE2_UCHAR nl[4]; /* Newline string when fixed length */
|
||||||
int max_lookbehind; /* Maximum lookbehind (characters) */
|
int max_lookbehind; /* Maximum lookbehind (characters) */
|
||||||
int parens_depth; /* Depth of nested parentheses */
|
int parens_depth; /* Depth of nested parentheses */
|
||||||
|
|
|
@ -1757,4 +1757,10 @@
|
||||||
|
|
||||||
/^(?<!(?=))/B,utf
|
/^(?<!(?=))/B,utf
|
||||||
|
|
||||||
|
# Horizontal and vertical space lists ignore caseless
|
||||||
|
|
||||||
|
/[\HH]/Bi,utf
|
||||||
|
|
||||||
|
/[^\HH]/Bi,utf
|
||||||
|
|
||||||
# End of testinput5
|
# End of testinput5
|
||||||
|
|
|
@ -4214,4 +4214,22 @@ Failed: error 125 at offset 2: lookbehind assertion is not fixed length
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Horizontal and vertical space lists ignore caseless
|
||||||
|
|
||||||
|
/[\HH]/Bi,utf
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
[\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/[^\HH]/Bi,utf
|
||||||
|
------------------------------------------------------------------
|
||||||
|
Bra
|
||||||
|
[^\x00-\x08\x0a-\x1f!-\x9f\xa1-\xff\x{100}-\x{167f}\x{1681}-\x{180d}\x{180f}-\x{1fff}\x{200b}-\x{202e}\x{2030}-\x{205e}\x{2060}-\x{2fff}\x{3001}-\x{10ffff}]
|
||||||
|
Ket
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
# End of testinput5
|
# End of testinput5
|
||||||
|
|
Loading…
Reference in New Issue