Auto-possession and studying added; compile-time errors refactored.

This commit is contained in:
Philip.Hazel 2014-06-21 12:39:48 +00:00
parent bf2bc83ed8
commit 5008860489
9 changed files with 3175 additions and 151 deletions

View File

@ -261,6 +261,7 @@ NODIST_SOURCES = src/pcre2_chartables.c
## 8-, 16-, or 32-bit libraries are configured.
COMMON_SOURCES = \
src/pcre2_auto_possess.c \
src/pcre2_byte_order.c \
src/pcre2_compile.c \
src/pcre2_config.c \
@ -279,17 +280,14 @@ COMMON_SOURCES = \
src/pcre2_ord2utf.c \
src/pcre2_pattern_info.c \
src/pcre2_string_utils.c \
src/pcre2_study.c \
src/pcre2_substring.c \
src/pcre2_tables.c \
src/pcre2_ucd.c \
src/pcre2_ucp.h \
src/pcre2_valid_utf.c \
src/pcre2_version.c
# src/pcre2_refcount.c \
# src/pcre2_study.c \
# src/pcre2_xclass.c
src/pcre2_version.c \
src/pcre2_xclass.c
if WITH_PCRE8
lib_LTLIBRARIES += libpcre2-8.la

1322
src/pcre2_auto_possess.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -55,7 +55,6 @@ by defining macros in order to minimize #if usage. */
#if PCRE2_CODE_UNIT_WIDTH == 8
#define STRING_UTFn_RIGHTPAR STRING_UTF8_RIGHTPAR, 5
#define BAD_UTF_ERROR ERR44
#define XDIGIT(c) xdigitab[c]
#else /* Either 16-bit or 32-bit */
@ -63,11 +62,9 @@ by defining macros in order to minimize #if usage. */
#if PCRE2_CODE_UNIT_WIDTH == 16
#define STRING_UTFn_RIGHTPAR STRING_UTF16_RIGHTPAR, 6
#define BAD_UTF_ERROR ERR74
#else
#define STRING_UTFn_RIGHTPAR STRING_UTF32_RIGHTPAR, 6
#define BAD_UTF_ERROR ERR77
#endif
#endif
@ -150,8 +147,8 @@ have to check them every time. */
#define REQ_CASELESS (1 << 0) /* Indicates caselessness */
#define REQ_VARY (1 << 1) /* reqcu followed non-literal item */
/* Negative values for the firstcu and reqcu flags */
#define REQ_UNSET (-2)
#define REQ_NONE (-1)
#define REQ_UNSET (-2) /* Not yet found anything */
#define REQ_NONE (-1) /* Found not fixed char */
/* This bit (which is greater than any UTF value) is used to indicate that a
variable contains a number of code units instead of an actual code point. */
@ -553,7 +550,8 @@ static PCRE2_SPTR posix_substitutes[] = {
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c must be updated. */
eint2 in pcre2posix.c must be updated, and a new error text must be added to
compile_error_texts in pcre2_error.c. */
enum { ERR0 = COMPILE_ERROR_BASE,
ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
@ -563,8 +561,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86 };
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77 };
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@ -1782,8 +1779,7 @@ else
}
if (overflow) /* Integer overflow */
{
while (IS_DIGIT(ptr[1]))
ptr++;
while (IS_DIGIT(ptr[1])) ptr++;
*errorcodeptr = ERR61;
break;
}
@ -1849,8 +1845,7 @@ else
}
if (overflow) /* Integer overflow */
{
while (IS_DIGIT(ptr[1]))
ptr++;
while (IS_DIGIT(ptr[1])) ptr++;
*errorcodeptr = ERR61;
break;
}
@ -1890,8 +1885,8 @@ else
specifying character codes in octal. The only supported form is \o{ddd}. */
case CHAR_o:
if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else
if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR77; else
{
ptr += 2;
c = 0;
@ -1921,7 +1916,7 @@ else
{
if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
}
else *errorcodeptr = ERR80;
else *errorcodeptr = ERR64;
}
break;
@ -1952,7 +1947,7 @@ else
ptr += 2;
if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
{
*errorcodeptr = ERR86;
*errorcodeptr = ERR77;
break;
}
c = 0;
@ -1988,7 +1983,7 @@ else
\x handling, but nowadays Perl gives an error, which seems much more
sensible, so we do too. */
else *errorcodeptr = ERR79;
else *errorcodeptr = ERR67;
} /* End of \x{} processing */
/* Read a single-byte hex-defined char (up to two hex digits after \x) */
@ -2013,7 +2008,7 @@ else
case CHAR_c:
c = *(++ptr);
if (c == CHAR_NULL)
if (c == CHAR_NULL && ptr >= cd->end_pattern)
{
*errorcodeptr = ERR2;
break;
@ -3309,7 +3304,8 @@ for (;; ptr++)
previous = NULL;
if ((options & PCRE2_MULTILINE) != 0)
{
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
if (firstcuflags == REQ_UNSET)
zerofirstcuflags = firstcuflags = REQ_NONE;
*code++ = OP_CIRCM;
}
else *code++ = OP_CIRC;
@ -3384,7 +3380,7 @@ for (;; ptr++)
ptr[1] == CHAR_EQUALS_SIGN) &&
check_posix_syntax(ptr, &tempptr))
{
*errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
*errorcodeptr = (ptr[1] == CHAR_COLON)? ERR12 : ERR13;
goto FAILED;
}
@ -3525,7 +3521,7 @@ for (;; ptr++)
if (ptr[1] != CHAR_COLON)
{
*errorcodeptr = ERR31;
*errorcodeptr = ERR13;
goto FAILED;
}
@ -3870,7 +3866,7 @@ for (;; ptr++)
{
if (descape == ESC_b) d = CHAR_BS; else
{
*errorcodeptr = ERR83;
*errorcodeptr = ERR50;
goto FAILED;
}
}
@ -3883,7 +3879,7 @@ for (;; ptr++)
ptr[1] == CHAR_EQUALS_SIGN) &&
check_posix_syntax(ptr, &tempptr))
{
*errorcodeptr = ERR83;
*errorcodeptr = ERR50;
goto FAILED;
}
}
@ -3932,7 +3928,7 @@ for (;; ptr++)
whatever repeat count may follow. In the case of reqcu, save the
previous value for reinstating. */
if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
if (!inescq && class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
{
ptr++;
zeroreqcu = reqcu;
@ -4833,7 +4829,7 @@ for (;; ptr++)
else
{
*errorcodeptr = ERR11;
*errorcodeptr = ERR10;
goto FAILED;
}
@ -5095,8 +5091,8 @@ for (;; ptr++)
{
case CHAR_NUMBER_SIGN: /* Comment; skip to ket */
ptr++;
while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
if (*ptr == CHAR_NULL)
while (ptr < cd->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
if (*ptr != CHAR_RIGHT_PARENTHESIS)
{
*errorcodeptr = ERR18;
goto FAILED;
@ -5216,7 +5212,7 @@ for (;; ptr++)
{
if (IS_DIGIT(*ptr))
{
*errorcodeptr = ERR84;
*errorcodeptr = ERR44; /* Group name must start with non-digit */
goto FAILED;
}
if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
@ -5477,7 +5473,7 @@ for (;; ptr++)
name = ++ptr;
if (IS_DIGIT(*ptr))
{
*errorcodeptr = ERR84; /* Group name must start with non-digit */
*errorcodeptr = ERR44; /* Group name must start with non-digit */
goto FAILED;
}
while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
@ -5554,7 +5550,6 @@ for (;; ptr++)
named_group *newspace =
cd->cx->memctl.malloc(newsize * sizeof(named_group),
cd->cx->memctl.memory_data);
if (newspace == NULL)
{
*errorcodeptr = ERR21;
@ -5597,7 +5592,7 @@ for (;; ptr++)
name = ++ptr;
if (IS_DIGIT(*ptr))
{
*errorcodeptr = ERR84; /* Group name must start with non-digit */
*errorcodeptr = ERR44; /* Group name must start with non-digit */
goto FAILED;
}
while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
@ -5613,7 +5608,6 @@ for (;; ptr++)
if (lengthptr != NULL)
{
named_group *ng;
if (namelen == 0)
{
*errorcodeptr = ERR62;
@ -5915,7 +5909,7 @@ for (;; ptr++)
case CHAR_x: *optset |= PCRE2_EXTENDED; break;
case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
default: *errorcodeptr = ERR12;
default: *errorcodeptr = ERR11;
ptr--; /* Correct the offset */
goto FAILED;
}
@ -6002,7 +5996,7 @@ for (;; ptr++)
if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
{
*errorcodeptr = ERR82;
*errorcodeptr = ERR19;
goto FAILED;
}
@ -6580,7 +6574,7 @@ branch_chain bc;
if (ccontext->stack_guard != NULL && ccontext->stack_guard(0))
{
*errorcodeptr= ERR85;
*errorcodeptr= ERR33;
return FALSE;
}
#endif
@ -7265,38 +7259,29 @@ PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_compile(PCRE2_SPTR pattern, int patlen, uint32_t options,
int *errorptr, size_t *erroroffset, pcre2_compile_context *ccontext)
{
BOOL utf; /* Set TRUE for UTF mode */
pcre2_real_code *re = NULL; /* What we will return */
pcre2_compile_context default_context; /* For use if no context given */
compile_data cd; /* "Static" compile-time data */
const uint8_t *tables; /* Char tables base pointer */
PCRE2_UCHAR *code; /* Current pointer in compiled code */
PCRE2_SPTR codestart; /* Start of compiled code */
PCRE2_SPTR ptr; /* Current pointer in pattern */
size_t length = 1; /* Allow or final END opcode */
size_t re_blocksize; /* Size of memory block */
int32_t firstcuflags, reqcuflags; /* Type of first/req code unit */
uint32_t firstcu, reqcu; /* Value of first/req code unit */
uint32_t skipatstart; /* When checking (*UTF) etc */
uint32_t limit_match = MATCH_LIMIT; /* Default match limits */
uint32_t limit_recursion = MATCH_LIMIT_RECURSION;
compile_data cd; /* "Static" compile-time data */
PCRE2_SPTR codestart; /* Start of compiled code */
PCRE2_SPTR ptr; /* Current pointer in pattern */
PCRE2_UCHAR *code; /* Current pointer in compiled code */
uint32_t skipatstart; /* When checking (*UTF) etc */
const uint8_t *tables; /* Char tables base pointer */
BOOL utf;
int newline = 0; /* Unset; can be set by the pattern */
int bsr = 0; /* Unset; can be set by the pattern */
int errorcode = 0; /* Initialize */
size_t re_blocksize;
int errorcode = 0; /* Initialize to avoid compiler warn */
/* Comments at the head of this file explain about these variables. */
@ -7494,7 +7479,7 @@ if (utf)
{
if ((options & PCRE2_NEVER_UTF) != 0)
{
errorcode = ERR78;
errorcode = ERR74;
goto HAD_ERROR;
}
if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
@ -7593,7 +7578,7 @@ if (re == NULL)
re->memctl = ccontext->memctl;
re->tables = tables;
re->executable_jit = NULL;
memset(re->start_bitmap, 32, 0);
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
re->blocksize = re_blocksize;
re->magic_number = MAGIC_NUMBER;
re->compile_options = options;
@ -7748,10 +7733,8 @@ if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
/* Unless disabled, check whether single character iterators can be
auto-possessified. The function overwrites the appropriate opcode values. */
#ifdef FIXME
if ((options & PCRE2_NO_AUTO_POSSESS) == 0)
auto_possessify((PCRE2_UCHAR *)codestart, utf, cd);
#endif
PRIV(auto_possessify)((PCRE2_UCHAR *)codestart, utf, &cd);
/* If there were any lookbehind assertions that contained OP_RECURSE
(recursions or subroutine calls), a flag is set for them to be checked here,
@ -7901,6 +7884,15 @@ do
}
while (*codestart == OP_ALT);
/* Finally, study the compiled pattern to set up information such as a bitmap
of starting code units and a minimum matching length. */
if (PRIV(study)(re) != 0)
{
errorcode = ERR31;
goto HAD_ERROR;
}
/* Control ends up here in all cases. If memory was obtained for a
zero-terminated copy of the pattern, remember to free it before returning. */

View File

@ -54,15 +54,13 @@ POSSIBILITY OF SUCH DAMAGE.
/* The texts of compile-time error messages. Compile-time error numbers start
at COMPILE_ERROR_BASE (100).
Do not ever re-use any error
number, because they are documented. Always add a new error instead. Messages
marked DEAD below are no longer used. This used to be a table of strings, but
in order to reduce the number of relocations needed when a shared library is
loaded dynamically, it is now one long string. We cannot use a table of
offsets, because the lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not
known. Instead, pcre2_get_error_message() counts through to the one it wants -
this isn't a performance issue because these strings are used only when there
is an error.
Do not ever re-use any error number, because they are documented. Always add a
new error instead. This used to be a table of strings, but in order to reduce
the number of relocations needed when a shared library is loaded dynamically,
it is now one long string. We cannot use a table of offsets, because the
lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead,
pcre2_get_error_message() counts through to the one it wants - this isn't a
performance issue because these strings are used only when there is an error.
Each substring ends with \0 to insert a null character. This includes the final
substring, so that the whole string ends with \0\0, which can be detected when
@ -81,21 +79,21 @@ static const char compile_error_texts[] =
"range out of order in character class\0"
"nothing to repeat\0"
/* 10 */
"operand of unlimited repeat could match the empty string\0" /** DEAD **/
"internal error: unexpected repeat\0"
"unrecognized character after (? or (?-\0"
"POSIX named classes are supported only within a class\0"
"missing )\0"
"POSIX collating elements are not supported\0"
"missing closing parenthesis\0"
/* 15 */
"reference to non-existent subpattern\0"
"pattern or erroffset passed as NULL\0"
"unknown option bit(s) set\0"
"missing ) after comment\0"
"parentheses nested too deeply\0" /** DEAD **/
"pattern passed as NULL\0"
"unknown compile-time option bit(s)\0"
"missing ) after (?# comment\0"
"parentheses are too deeply nested\0"
/* 20 */
"regular expression is too large\0"
"failed to get memory\0"
"unmatched parentheses\0"
"failed to allocate heap memory\0"
"unmatched closing parenthesis\0"
"internal error: code overflow\0"
"unrecognized character after (?<\0"
/* 25 */
@ -106,36 +104,36 @@ static const char compile_error_texts[] =
"(?R or (?[+-]digits must be followed by )\0"
/* 30 */
"unknown POSIX class name\0"
"POSIX collating elements are not supported\0"
"internal error in pcre2_study(): should not occur\0"
"this version of PCRE does not have UTF or Unicode property support\0"
"spare error\0" /** DEAD **/
"character value in \\x{} or \\o{} is too large\0"
"parentheses are too deeply nested (stack check)\0"
"character code point value in \\x{} or \\o{} is too large\0"
/* 35 */
"invalid condition (?(0)\0"
"\\C not allowed in lookbehind assertion\0"
"\\C is not allowed in a lookbehind assertion\0"
"PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
"number after (?C is > 255\0"
"closing ) for (?C expected\0"
"number after (?C is greater than 255\0"
"closing parenthesis for (?C expected\0"
/* 40 */
"recursive call could loop indefinitely\0"
"recursion could loop indefinitely\0"
"unrecognized character after (?P\0"
"syntax error in subpattern name (missing terminator)\0"
"two named subpatterns have the same name\0"
"invalid UTF-8 string\0"
"two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0"
"group name must start with a non-digit\0"
/* 45 */
"support for \\P, \\p, and \\X has not been compiled\0"
"this version of PCRE does not have support for \\P, \\p, or \\X\0"
"malformed \\P or \\p sequence\0"
"unknown property name after \\P or \\p\0"
"subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
"too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
/* 50 */
"repeated subpattern is too long\0" /** DEAD **/
"invalid range in character class\0"
"octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
"internal error: overran compiling workspace\0"
"internal error: previously-checked referenced subpattern not found\0"
"DEFINE group contains more than one branch\0"
/* 55 */
"repeating a DEFINE group is not allowed\0" /** DEAD **/
"missing opening brace after \\o\0"
"internal error: unknown newline setting\0"
"\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
"a numbered reference must not be zero\0"
@ -145,11 +143,11 @@ static const char compile_error_texts[] =
"number is too big\0"
"subpattern name expected\0"
"digit expected after (?+\0"
"] is an invalid data character in JavaScript compatibility mode\0"
"non-octal character in \\o{} (closing brace missing?)\0"
/* 65 */
"different names for subpatterns of the same number are not allowed\0"
"(*MARK) must have an argument\0"
"this version of PCRE is not compiled with Unicode property support\0"
"non-hex character in \\x{} (closing brace missing?)\0"
"\\c must be followed by an ASCII character\0"
"\\k is not followed by a braced, angle-bracketed, or quoted name\0"
/* 70 */
@ -157,21 +155,11 @@ static const char compile_error_texts[] =
"\\N is not supported in a class\0"
"too many forward references\0"
"disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
"invalid UTF-16 string\0"
"using (*UTF) is disabled by the application\0"
/* 75 */
"name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
"character value in \\u.... sequence is too large\0"
"invalid UTF-32 string\0"
"setting UTF is disabled by the application\0"
"non-hex character in \\x{} (closing brace missing?)\0"
/* 80 */
"non-octal character in \\o{} (closing brace missing?)\0"
"missing opening brace after \\o\0"
"parentheses are too deeply nested\0"
"invalid range in character class\0"
"group name must start with a non-digit\0"
/* 85 */
"parentheses are too deeply nested (stack check)\0"
"character code point value in \\u.... sequence is too large\0"
"digits missing in \\x{} or \\o{}\0"
;
/* Match-time and UTF error texts are in the same format. */

View File

@ -225,10 +225,11 @@ else
#endif /* not HAVE_MEMMOVE */
#endif /* not VPCOMPAT */
/* External (in the C sense) functions and macros that are private to the
/* External (in the C sense) functions and tables that are private to the
libraries are always referenced using the PRIV macro. This makes it possible
for pcre2test.c to include some of the source files from the libraries using a
different PRIV definition to avoid name clashes. */
different PRIV definition to avoid name clashes. It also makes it clear in the
code that a non-static object is being referenced. */
#ifndef PRIV
#define PRIV(name) _pcre2_##name
@ -1724,7 +1725,7 @@ typedef struct {
uint16_t value;
} ucp_type_table;
/* Unicode character database (UCD) */
/* Unicode character database (UCD) record format */
typedef struct {
uint8_t script; /* ucp_Arabic, etc. */
@ -1734,16 +1735,6 @@ typedef struct {
int32_t other_case; /* offset to other case, or zero if none */
} ucd_record;
extern const uint32_t PRIV(ucd_caseless_sets)[];
extern const ucd_record PRIV(ucd_records)[];
extern const uint8_t PRIV(ucd_stage1)[];
extern const uint16_t PRIV(ucd_stage2)[];
extern const uint32_t PRIV(ucp_gentype)[];
extern const uint32_t PRIV(ucp_gbtable)[];
#ifdef SUPPORT_JIT
extern const int PRIV(ucp_typerange)[];
#endif
/* UCD access macros */
#define UCD_BLOCK_SIZE 128
@ -1774,11 +1765,10 @@ defined, so the following items are omitted. */
/* Internal shared data tables. These are tables that are used by more than one
of the exported public functions. They have to be "external" in the C sense,
but are not part of the PCRE2 public API. The data for these tables is in the
pcre2_tables.c module. Even though some of them are identical in each library,
they must have different names so that more than one library can be linked with
an application. UTF-8 tables are needed only when compiling the 8-bit library.
*/
but are not part of the PCRE2 public API. Although the data for some of the
tables is identical in all libraries, they must have different names so that
multiple libraries can be simultaneously linked to a single application.
However, UTF-8 tables are needed only when compiling the 8-bit library. */
#if PCRE2_CODE_UNIT_WIDTH == 8
extern const int PRIV(utf8_table1)[];
@ -1788,17 +1778,38 @@ extern const int PRIV(utf8_table3)[];
extern const uint8_t PRIV(utf8_table4)[];
#endif
extern const uint8_t PRIV(default_tables)[];
extern const uint8_t PRIV(OP_lengths)[];
#define _pcre2_OP_lengths PCRE2_SUFFIX(_pcre2_OP_lengths_)
#define _pcre2_default_tables PCRE2_SUFFIX(_pcre2_default_tables_)
#define _pcre2_hspace_list PCRE2_SUFFIX(_pcre2_hspace_list_)
#define _pcre2_vspace_list PCRE2_SUFFIX(_pcre2_vspace_list_)
#define _pcre2_ucd_caseless_sets PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
#define _pcre2_ucd_records PCRE2_SUFFIX(_pcre2_ucd_records_)
#define _pcre2_ucd_stage1 PCRE2_SUFFIX(_pcre2_ucd_stage1_)
#define _pcre2_ucd_stage2 PCRE2_SUFFIX(_pcre2_ucd_stage2_)
#define _pcre2_ucp_gbtable PCRE2_SUFFIX(_pcre2_ucp_gbtable_)
#define _pcre2_ucp_gentype PCRE2_SUFFIX(_pcre2_ucp_gentype_)
#define _pcre2_ucp_typerange PCRE2_SUFFIX(_pcre2_ucp_typerange_)
#define _pcre2_utt PCRE2_SUFFIX(_pcre2_utt_)
#define _pcre2_utt_names PCRE2_SUFFIX(_pcre2_utt_names_)
#define _pcre2_utt_size PCRE2_SUFFIX(_pcre2_utt_size_)
extern const uint8_t PRIV(OP_lengths)[];
extern const uint8_t PRIV(default_tables)[];
extern const uint32_t PRIV(hspace_list)[];
extern const uint32_t PRIV(vspace_list)[];
extern const uint32_t PRIV(ucd_caseless_sets)[];
extern const ucd_record PRIV(ucd_records)[];
extern const uint8_t PRIV(ucd_stage1)[];
extern const uint16_t PRIV(ucd_stage2)[];
extern const uint32_t PRIV(ucp_gbtable)[];
extern const uint32_t PRIV(ucp_gentype)[];
#ifdef SUPPORT_JIT
extern const int PRIV(ucp_typerange)[];
#endif
extern const ucp_type_table PRIV(utt)[];
extern const char PRIV(utt_names)[];
extern const size_t PRIV(utt_size);
/* Mode-dependent macros and hidden and private structures are defined in a
separate file so that pcre2test can include them at all supported widths. When
compiling the library, PCRE2_CODE_UNIT_WIDTH will be defined, and we can
@ -1811,12 +1822,13 @@ private structures. */
#include "pcre2_intmodedep.h"
/* Internal shared functions. These are functions that are used by more than
one of the library's exported public functions. They have to be "external" in
the C sense, but are not part of the PCRE public API. They are not referenced
from pcre2test, and must not be defined when no code unit width is available.
*/
/* Private "external" functions. These are internal functions that are called
from modules other than the one in which they are defined. They have to be
"external" in the C sense, but are not part of the PCRE public API. They are
not referenced from pcre2test, and must not be defined when no code unit width
is available. */
#define _pcre2_auto_possessify PCRE2_SUFFIX(_pcre2_auto_possessify_)
#define _pcre2_compile_context_init PCRE2_SUFFIX(_pcre2_compile_context_init_)
#define _pcre2_find_bracket PCRE2_SUFFIX(_pcre2_find_bracket_)
#define _pcre2_is_newline PCRE2_SUFFIX(_pcre2_is_newline_)
@ -1828,9 +1840,12 @@ from pcre2test, and must not be defined when no code unit width is available.
#define _pcre2_strlen PCRE2_SUFFIX(_pcre_strlen_)
#define _pcre2_strncmp PCRE2_SUFFIX(_pcre_strncmp_)
#define _pcre2_strncmp_c8 PCRE2_SUFFIX(_pcre_strncmp_c8_)
#define _pcre2_study PCRE2_SUFFIX(_pcre_study_)
#define _pcre2_valid_utf PCRE2_SUFFIX(_pcre_valid_utf_)
#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_)
#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_)
extern void _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, const compile_data *);
extern void _pcre2_compile_context_init(pcre2_compile_context *, BOOL);
extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int);
extern BOOL _pcre2_is_newline(PCRE2_SPTR, int, PCRE2_SPTR, int *, BOOL);
@ -1842,8 +1857,10 @@ extern int _pcre2_strcmp_c8(PCRE2_SPTR, const char *);
extern int _pcre2_strlen(PCRE2_SPTR);
extern int _pcre2_strncmp(PCRE2_SPTR, PCRE2_SPTR, size_t);
extern int _pcre2_strncmp_c8(PCRE2_SPTR, const char *, size_t);
extern int _pcre2_study(pcre2_real_code *);
extern int _pcre2_valid_utf(PCRE2_SPTR, int, size_t *);
extern BOOL _pcre2_was_newline(PCRE2_SPTR, int, PCRE2_SPTR, int *, BOOL);
extern BOOL _pcre2_xclass(uint32_t, PCRE2_SPTR, BOOL);
#endif /* PCRE2_CODE_UNIT_WIDTH */
/* End of pcre2_internal.h */

1439
src/pcre2_study.c Normal file

File diff suppressed because it is too large Load Diff

269
src/pcre2_xclass.c Normal file
View File

@ -0,0 +1,269 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains an internal function that is used to match an extended
class. It is used by pcre2_auto_possessify() and by both pcre2_match() and
pcre2_dfa_match(). */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_internal.h"
/*************************************************
*       Match character against an XCLASS        *
*************************************************/

/* This function is called to match a character against an extended class that
might contain codepoints above 255 and/or Unicode properties.

The class data begins with a flag code unit whose XCL_NOT, XCL_MAP and
XCL_HASPROP bits are tested here. When XCL_MAP is set, a 32-byte bitmap for
code points below 256 follows the flag unit. After that comes a list of items,
each introduced by XCL_SINGLE (one code point), XCL_RANGE (two code points), or
XCL_PROP/XCL_NOTPROP (a property type and a value, two code units), and the
list is terminated by XCL_END.

Arguments:
  c           the character
  data        points to the flag code unit of the XCLASS data
  utf         TRUE if in UTF mode

Returns:      TRUE if character matches, else FALSE
*/

BOOL
PRIV(xclass)(uint32_t c, PCRE2_SPTR data, BOOL utf)
{
PCRE2_UCHAR t;
BOOL negated = (*data & XCL_NOT) != 0;

#if PCRE2_CODE_UNIT_WIDTH == 8
/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
utf = TRUE;
#endif

/* Code points < 256 are matched against a bitmap, if one is present. If not,
we still carry on, because there may be ranges that start below 256 in the
additional data. */

if (c < 256)
  {
  /* The bitmap, when present, immediately follows the flag code unit. It is
  only read, so keep the const qualification of the class data. The pointer is
  dereferenced only when XCL_MAP is set. */

  const uint8_t *map = (const uint8_t *)(data + 1);
  BOOL has_map = (*data & XCL_MAP) != 0;

  /* With no properties in the class, the bitmap alone decides the result for
  a code point below 256. Its bit is returned as-is, with no use of negated. */

  if ((*data & XCL_HASPROP) == 0)
    {
    if (!has_map) return negated;
    return (map[c/8] & (1 << (c&7))) != 0;
    }

  /* With properties present, a set bit is a definite hit, but a clear bit is
  not a definite miss: the item list must still be scanned. */

  if (has_map && (map[c/8] & (1 << (c&7))) != 0)
    return !negated; /* char found */
  }

/* First skip the bit map if present. Then match against the list of Unicode
properties or large chars or ranges that end with a large char. We won't ever
encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. */

if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(PCRE2_UCHAR);

while ((t = *data++) != XCL_END)
  {
  uint32_t x, y;
  if (t == XCL_SINGLE)
    {
#ifdef SUPPORT_UTF
    if (utf)
      {
      GETCHARINC(x, data); /* macro generates multiple statements */
      }
    else
#endif
    x = *data++;
    if (c == x) return !negated;
    }
  else if (t == XCL_RANGE)
    {
#ifdef SUPPORT_UTF
    if (utf)
      {
      GETCHARINC(x, data); /* macro generates multiple statements */
      GETCHARINC(y, data); /* macro generates multiple statements */
      }
    else
#endif
      {
      x = *data++;
      y = *data++;
      }
    if (c >= x && c <= y) return !negated;
    }

#ifdef SUPPORT_UTF
  else  /* XCL_PROP & XCL_NOTPROP */
    {
    const ucd_record *prop = GET_UCD(c);
    BOOL isprop = t == XCL_PROP;

    /* data[0] is the property type and data[1] its value (where one is
    needed). Each test below compares "character has the property" with
    isprop, so that XCL_NOTPROP matches when the character lacks it. */

    switch(*data)
      {
      case PT_ANY:
      if (isprop) return !negated;
      break;

      case PT_LAMP:
      if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
           prop->chartype == ucp_Lt) == isprop) return !negated;
      break;

      case PT_GC:
      if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
        return !negated;
      break;

      case PT_PC:
      if ((data[1] == prop->chartype) == isprop) return !negated;
      break;

      case PT_SC:
      if ((data[1] == prop->script) == isprop) return !negated;
      break;

      case PT_ALNUM:
      if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
        return !negated;
      break;

      /* Perl space used to exclude VT, but from Perl 5.18 it is included,
      which means that Perl space and POSIX space are now identical. PCRE
      was changed at release 8.34. */

      case PT_SPACE:    /* Perl space */
      case PT_PXSPACE:  /* POSIX space */
      switch(c)
        {
        HSPACE_CASES:
        VSPACE_CASES:
        if (isprop) return !negated;
        break;

        default:
        if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
          return !negated;
        break;
        }
      break;

      case PT_WORD:
      if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
           PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
             == isprop)
        return !negated;
      break;

      case PT_UCNC:
      if (c < 0xa0)
        {
        if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
             c == CHAR_GRAVE_ACCENT) == isprop)
          return !negated;
        }
      else
        {
        if ((c < 0xd800 || c > 0xdfff) == isprop)
          return !negated;
        }
      break;

      /* The following three properties can occur only in an XCLASS, as there
      is no \p or \P coding for them. */

      /* Graphic character. Implement this as not Z (space or separator) and
      not C (other), except for Cf (format) with a few exceptions. This seems
      to be what Perl does. The exceptional characters are:

      U+061C           Arabic Letter Mark
      U+180E           Mongolian Vowel Separator
      U+2066 - U+2069  Various "isolate"s
      */

      case PT_PXGRAPH:
      if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
            (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
              (prop->chartype == ucp_Cf &&
                c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
            )) == isprop)
        return !negated;
      break;

      /* Printable character: same as graphic, with the addition of Zs, i.e.
      not Zl and not Zp, and U+180E. */

      case PT_PXPRINT:
      if ((prop->chartype != ucp_Zl &&
           prop->chartype != ucp_Zp &&
            (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
              (prop->chartype == ucp_Cf &&
                c != 0x061c && (c < 0x2066 || c > 0x2069))
            )) == isprop)
        return !negated;
      break;

      /* Punctuation: all Unicode punctuation, plus ASCII characters that
      Unicode treats as symbols rather than punctuation, for Perl
      compatibility (these are $+<=>^`|~). The symbol allowance is restricted
      to the ASCII range (c < 128); symbols in the range 128-255 are not
      punctuation. */

      case PT_PXPUNCT:
      if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
            (c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
        return !negated;
      break;

      /* This should never occur, but compilers may mutter if there is no
      default. */

      default:
      return FALSE;
      }

    data += 2;   /* Step over the property type and value */
    }
#endif  /* SUPPORT_UTF */
  }

return negated;   /* char did not match */
}
/* End of pcre2_xclass.c */

View File

@ -80,36 +80,35 @@ static const int eint1[] = {
REG_EESCAPE, /* unrecognized character follows \ */
REG_BADBR, /* numbers out of order in {} quantifier */
/* 5 */
5, REG_BADBR, /* number too big in {} quantifier */
REG_BADBR, /* number too big in {} quantifier */
REG_EBRACK, /* missing terminating ] for character class */
REG_ECTYPE, /* invalid escape sequence in character class */
REG_ERANGE, /* range out of order in character class */
REG_BADRPT, /* nothing to repeat */
/* 10 */
REG_BADRPT, /* operand of unlimited repeat could match the empty string */
REG_ASSERT, /* internal error: unexpected repeat */
REG_BADPAT, /* unrecognized character after (? */
REG_BADPAT, /* unrecognized character after (? or (?- */
REG_BADPAT, /* POSIX named classes are supported only within a class */
REG_BADPAT, /* POSIX collating elements are not supported */
REG_EPAREN, /* missing ) */
/* 15 */
REG_ESUBREG, /* reference to non-existent subpattern */
REG_INVARG, /* erroffset passed as NULL */
REG_INVARG, /* unknown option bit(s) set */
REG_EPAREN, /* missing ) after comment */
REG_INVARG, /* pattern passed as NULL */
REG_INVARG, /* unknown compile-time option bit(s) */
REG_EPAREN, /* missing ) after (?# comment */
REG_ESIZE, /* parentheses nested too deeply */
/* 20 */
REG_ESIZE, /* regular expression too large */
REG_ESPACE, /* failed to get memory */
REG_EPAREN, /* unmatched parentheses */
REG_EPAREN, /* unmatched closing parenthesis */
REG_ASSERT /* internal error: code overflow */
};
static const int eint2[] = {
30, REG_ECTYPE, /* unknown POSIX class name */
32, REG_INVARG, /* this version of PCRE2 is not compiled with PCRE2_UTF8 support */
37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N, \U, or \u */
32, REG_INVARG, /* this version of PCRE does not have UTF or UCP support */
37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */
56, REG_INVARG, /* internal error: unknown newline setting */
67, REG_INVARG, /* this version of PCRE2 is not compiled with PCRE2_UCP support */
};
/* Table of texts corresponding to POSIX error codes */

View File

@ -422,8 +422,8 @@ static modstruct modlist[] = {
{ "altglobal", MOD_PND, MOD_CTL, CTL_ALTGLOBAL, PO(control) },
{ "anchored", MOD_PD, MOD_OPT, PCRE2_ANCHORED, PD(options) },
{ "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) },
{ "bsr", MOD_CTC, MOD_BSR, 0, CO(bsr_convention) },
{ "bincode", MOD_PAT, MOD_CTL, CTL_BINCODE, PO(control) },
{ "bsr", MOD_CTC, MOD_BSR, 0, CO(bsr_convention) },
{ "callout_capture", MOD_DAT, MOD_CTL, CTL_CALLOUT_CAPTURE, DO(control) },
{ "callout_fail", MOD_DAT, MOD_IN2, 0, DO(cfail) },
{ "callout_none", MOD_DAT, MOD_CTL, CTL_CALLOUT_NONE, DO(control) },
@ -4899,8 +4899,8 @@ _setmode( _fileno( stdout ), _O_BINARY );
/* Initialization that does not depend on the running mode. */
memset(&def_patctl, sizeof(patctl), 0);
memset(&def_datctl, sizeof(datctl), 0);
memset(&def_patctl, 0, sizeof(patctl));
memset(&def_datctl, 0, sizeof(datctl));
def_datctl.oveccount = DEFAULT_OVECCOUNT;
def_datctl.copy_numbers[0] = -1;
def_datctl.get_numbers[0] = -1;