Auto-possession and studying added; compile-time errors refactored.

2014-06-21 12:39:48 +00:00 · 2014-06-21 12:39:48 +00:00 · 5008860489
parent bf2bc83ed8
commit 5008860489
9 changed files with 3175 additions and 151 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -261,6 +261,7 @@ NODIST_SOURCES = src/pcre2_chartables.c
 ## 8-, 16-, or 32-bit libraries are configured.

 COMMON_SOURCES = \
+  src/pcre2_auto_possess.c \
  src/pcre2_byte_order.c \
  src/pcre2_compile.c \
  src/pcre2_config.c \
@ -279,17 +280,14 @@ COMMON_SOURCES = \
  src/pcre2_ord2utf.c \
  src/pcre2_pattern_info.c \
  src/pcre2_string_utils.c \
+  src/pcre2_study.c \
  src/pcre2_substring.c \
  src/pcre2_tables.c \
  src/pcre2_ucd.c \
  src/pcre2_ucp.h \
  src/pcre2_valid_utf.c \
-  src/pcre2_version.c
-
-#  src/pcre2_refcount.c \
-#  src/pcre2_study.c \
-#  src/pcre2_xclass.c
-
+  src/pcre2_version.c \
+  src/pcre2_xclass.c

 if WITH_PCRE8
 lib_LTLIBRARIES += libpcre2-8.la
--- a/src/pcre2_auto_possess.c
+++ b/src/pcre2_auto_possess.c
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -55,7 +55,6 @@ by defining macros in order to minimize #if usage. */

 #if PCRE2_CODE_UNIT_WIDTH == 8
 #define STRING_UTFn_RIGHTPAR        STRING_UTF8_RIGHTPAR, 5
-#define BAD_UTF_ERROR               ERR44
 #define XDIGIT(c)                   xdigitab[c]

 #else  /* Either 16-bit or 32-bit */
@ -63,11 +62,9 @@ by defining macros in order to minimize #if usage. */

 #if PCRE2_CODE_UNIT_WIDTH == 16
 #define STRING_UTFn_RIGHTPAR        STRING_UTF16_RIGHTPAR, 6
-#define BAD_UTF_ERROR               ERR74

 #else
 #define STRING_UTFn_RIGHTPAR        STRING_UTF32_RIGHTPAR, 6
-#define BAD_UTF_ERROR               ERR77
 #endif
 #endif

@ -150,8 +147,8 @@ have to check them every time. */
 #define REQ_CASELESS    (1 << 0)        /* Indicates caselessness */
 #define REQ_VARY        (1 << 1)        /* reqcu followed non-literal item */
 /* Negative values for the firstcu and reqcu flags */
-#define REQ_UNSET       (-2)
-#define REQ_NONE        (-1)
+#define REQ_UNSET       (-2)            /* Nothing found yet */
+#define REQ_NONE        (-1)            /* Found a non-fixed char */

 /* This bit (which is greater than any UTF value) is used to indicate that a 
 variable contains a number of code units instead of an actual code point. */
@ -553,7 +550,8 @@ static PCRE2_SPTR posix_substitutes[] = {

 /* Compile time error code numbers. They are given names so that they can more
 easily be tracked. When a new number is added, the tables called eint1 and 
-eint2 in pcre2posix.c must be updated. */
+eint2 in pcre2posix.c must be updated, and a new error text must be added to 
+compile_error_texts in pcre2_error.c. */

 enum { ERR0 = COMPILE_ERROR_BASE,  
       ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10, 
@ -563,8 +561,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
       ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, 
       ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, 
       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, 
-       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80, 
-       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86 };
+       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77 }; 

 /* This is a table of start-of-pattern options such as (*UTF) and settings such
 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@ -1782,8 +1779,7 @@ else
      }
    if (overflow) /* Integer overflow */
      {
-      while (IS_DIGIT(ptr[1]))
-        ptr++;
+      while (IS_DIGIT(ptr[1])) ptr++;
      *errorcodeptr = ERR61;
      break;
      }
@ -1849,8 +1845,7 @@ else
        }
      if (overflow) /* Integer overflow */
        {
-        while (IS_DIGIT(ptr[1]))
-          ptr++;
+        while (IS_DIGIT(ptr[1])) ptr++;
        *errorcodeptr = ERR61;
        break;
        }
@ -1890,8 +1885,8 @@ else
    specifying character codes in octal. The only supported form is \o{ddd}. */

    case CHAR_o:
-    if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
-    if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else 
+    if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else
+    if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR77; else 
      {
      ptr += 2;
      c = 0;
@ -1921,7 +1916,7 @@ else
        {
        if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
        }
-      else *errorcodeptr = ERR80;
+      else *errorcodeptr = ERR64;
      }
    break;

@ -1952,7 +1947,7 @@ else
        ptr += 2;
        if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
          {
-          *errorcodeptr = ERR86;
+          *errorcodeptr = ERR77;
          break;
          }    
        c = 0;
@ -1988,7 +1983,7 @@ else
        \x handling, but nowadays Perl gives an error, which seems much more
        sensible, so we do too. */

-        else *errorcodeptr = ERR79;
+        else *errorcodeptr = ERR67;
        }   /* End of \x{} processing */

      /* Read a single-byte hex-defined char (up to two hex digits after \x) */
@ -2013,7 +2008,7 @@ else

    case CHAR_c:
    c = *(++ptr);
-    if (c == CHAR_NULL)
+    if (c == CHAR_NULL && ptr >= cd->end_pattern)
      {
      *errorcodeptr = ERR2;
      break;
@ -3309,7 +3304,8 @@ for (;; ptr++)
    previous = NULL;
    if ((options & PCRE2_MULTILINE) != 0)
      {
-      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
+      if (firstcuflags == REQ_UNSET) 
+        zerofirstcuflags = firstcuflags = REQ_NONE;
      *code++ = OP_CIRCM;
      }
    else *code++ = OP_CIRC;
@ -3384,7 +3380,7 @@ for (;; ptr++)
         ptr[1] == CHAR_EQUALS_SIGN) &&
        check_posix_syntax(ptr, &tempptr))
      {
-      *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
+      *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR12 : ERR13;
      goto FAILED;
      }

@ -3525,7 +3521,7 @@ for (;; ptr++)

        if (ptr[1] != CHAR_COLON)
          {
-          *errorcodeptr = ERR31;
+          *errorcodeptr = ERR13;
          goto FAILED;
          }

@ -3870,7 +3866,7 @@ for (;; ptr++)
              {
              if (descape == ESC_b) d = CHAR_BS; else
                {
-                *errorcodeptr = ERR83;
+                *errorcodeptr = ERR50;
                goto FAILED;
                }
              }
@ -3883,7 +3879,7 @@ for (;; ptr++)
                    ptr[1] == CHAR_EQUALS_SIGN) &&
                   check_posix_syntax(ptr, &tempptr))
            {
-            *errorcodeptr = ERR83;
+            *errorcodeptr = ERR50;
            goto FAILED;
            }
          }
@ -3932,7 +3928,7 @@ for (;; ptr++)
      whatever repeat count may follow. In the case of reqcu, save the
      previous value for reinstating. */

-      if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
+      if (!inescq && class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
        {
        ptr++;
        zeroreqcu = reqcu;
@ -4833,7 +4829,7 @@ for (;; ptr++)

    else
      {
-      *errorcodeptr = ERR11;
+      *errorcodeptr = ERR10;
      goto FAILED;
      }

@ -5095,8 +5091,8 @@ for (;; ptr++)
        {
        case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
        ptr++;
-        while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
-        if (*ptr == CHAR_NULL)
+        while (ptr < cd->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
+        if (*ptr != CHAR_RIGHT_PARENTHESIS)
          {
          *errorcodeptr = ERR18;
          goto FAILED;
@ -5216,7 +5212,7 @@ for (;; ptr++)
          {
          if (IS_DIGIT(*ptr))
            {
-            *errorcodeptr = ERR84;
+            *errorcodeptr = ERR44;  /* Group name must start with non-digit */
            goto FAILED;
            }
          if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
@ -5477,7 +5473,7 @@ for (;; ptr++)
        name = ++ptr;
        if (IS_DIGIT(*ptr))
          {
-          *errorcodeptr = ERR84;   /* Group name must start with non-digit */
+          *errorcodeptr = ERR44;   /* Group name must start with non-digit */
          goto FAILED;
          }
        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
@ -5554,7 +5550,6 @@ for (;; ptr++)
              named_group *newspace = 
                cd->cx->memctl.malloc(newsize * sizeof(named_group),
                cd->cx->memctl.memory_data);
-
              if (newspace == NULL)
                {
                *errorcodeptr = ERR21;
@ -5597,7 +5592,7 @@ for (;; ptr++)
        name = ++ptr;
        if (IS_DIGIT(*ptr))
          {
-          *errorcodeptr = ERR84;   /* Group name must start with non-digit */
+          *errorcodeptr = ERR44;   /* Group name must start with non-digit */
          goto FAILED;
          }
        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
@ -5613,7 +5608,6 @@ for (;; ptr++)
        if (lengthptr != NULL)
          {
          named_group *ng;
-
          if (namelen == 0)
            {
            *errorcodeptr = ERR62;
@ -5915,7 +5909,7 @@ for (;; ptr++)
            case CHAR_x: *optset |= PCRE2_EXTENDED; break;
            case CHAR_U: *optset |= PCRE2_UNGREEDY; break;

-            default:  *errorcodeptr = ERR12;
+            default:  *errorcodeptr = ERR11;
                      ptr--;    /* Correct the offset */
                      goto FAILED;
            }
@ -6002,7 +5996,7 @@ for (;; ptr++)

    if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
      {
-      *errorcodeptr = ERR82;
+      *errorcodeptr = ERR19;
      goto FAILED;
      }

@ -6580,7 +6574,7 @@ branch_chain bc;

 if (ccontext->stack_guard != NULL && ccontext->stack_guard(0))
  {
-  *errorcodeptr= ERR85;
+  *errorcodeptr= ERR33;
  return FALSE;
  }
 #endif   
@ -7265,38 +7259,29 @@ PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
 pcre2_compile(PCRE2_SPTR pattern, int patlen, uint32_t options,
   int *errorptr, size_t *erroroffset, pcre2_compile_context *ccontext)
 {
+BOOL utf;                               /* Set TRUE for UTF mode */
 pcre2_real_code *re = NULL;             /* What we will return */
 pcre2_compile_context default_context;  /* For use if no context given */
+compile_data cd;                        /* "Static" compile-time data */
+const uint8_t *tables;                  /* Char tables base pointer */
+
+PCRE2_UCHAR *code;                      /* Current pointer in compiled code */
+PCRE2_SPTR codestart;                   /* Start of compiled code */
+PCRE2_SPTR ptr;                         /* Current pointer in pattern */

 size_t length = 1;                      /* Allow or final END opcode */
+size_t re_blocksize;                    /* Size of memory block */

 int32_t firstcuflags, reqcuflags;       /* Type of first/req code unit */
 uint32_t firstcu, reqcu;                /* Value of first/req code unit */

+uint32_t skipatstart;                   /* When checking (*UTF) etc */
 uint32_t limit_match = MATCH_LIMIT;     /* Default match limits */
 uint32_t limit_recursion = MATCH_LIMIT_RECURSION;

-compile_data cd;                        /* "Static" compile-time data */
-
-PCRE2_SPTR codestart;                   /* Start of compiled code */
-PCRE2_SPTR ptr;                         /* Current pointer in pattern */
-PCRE2_UCHAR *code;                      /* Current pointer in compiled code */
-
-uint32_t skipatstart;                   /* When checking (*UTF) etc */
-
-const uint8_t *tables;                  /* Char tables base pointer */
-
-
-BOOL utf;
-
 int newline = 0;                        /* Unset; can be set by the pattern */
 int bsr = 0;                            /* Unset; can be set by the pattern */
-
-int errorcode = 0;                      /* Initialize  */
-
-size_t re_blocksize;
-
-
+int errorcode = 0;                      /* Initialize to avoid compiler warn */

 /* Comments at the head of this file explain about these variables. */

@ -7494,7 +7479,7 @@ if (utf)
  {
  if ((options & PCRE2_NEVER_UTF) != 0)
    {
-    errorcode = ERR78;
+    errorcode = ERR74;
    goto HAD_ERROR;
    }
  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
@ -7593,7 +7578,7 @@ if (re == NULL)
 re->memctl = ccontext->memctl;
 re->tables = tables;
 re->executable_jit = NULL;
-memset(re->start_bitmap, 32, 0);
+memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
 re->blocksize = re_blocksize;
 re->magic_number = MAGIC_NUMBER;
 re->compile_options = options;
@ -7748,10 +7733,8 @@ if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
 /* Unless disabled, check whether single character iterators can be
 auto-possessified. The function overwrites the appropriate opcode values. */

-#ifdef FIXME
 if ((options & PCRE2_NO_AUTO_POSSESS) == 0)
-  auto_possessify((PCRE2_UCHAR *)codestart, utf, cd);
-#endif   
+  PRIV(auto_possessify)((PCRE2_UCHAR *)codestart, utf, &cd);

 /* If there were any lookbehind assertions that contained OP_RECURSE
 (recursions or subroutine calls), a flag is set for them to be checked here,
@ -7901,6 +7884,15 @@ do
  }
 while (*codestart == OP_ALT);

+/* Finally, study the compiled pattern to set up information such as a bitmap 
+of starting code units and a minimum matching length. */
+
+if (PRIV(study)(re) != 0)
+  {
+  errorcode = ERR31;
+  goto HAD_ERROR;  
+  } 
+
 /* Control ends up here in all cases. If memory was obtained for a
 zero-terminated copy of the pattern, remember to free it before returning. */

--- a/src/pcre2_error.c
+++ b/src/pcre2_error.c
@ -54,15 +54,13 @@ POSSIBILITY OF SUCH DAMAGE.
 /* The texts of compile-time error messages. Compile-time error numbers start 
 at COMPILE_ERROR_BASE (100).

-Do not ever re-use any error
-number, because they are documented. Always add a new error instead. Messages
-marked DEAD below are no longer used. This used to be a table of strings, but
-in order to reduce the number of relocations needed when a shared library is
-loaded dynamically, it is now one long string. We cannot use a table of
-offsets, because the lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not
-known. Instead, pcre2_get_error_message() counts through to the one it wants -
-this isn't a performance issue because these strings are used only when there
-is an error.
+Do not ever re-use any error number, because they are documented. Always add a
+new error instead. This used to be a table of strings, but in order to reduce
+the number of relocations needed when a shared library is loaded dynamically,
+it is now one long string. We cannot use a table of offsets, because the
+lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead,
+pcre2_get_error_message() counts through to the one it wants - this isn't a
+performance issue because these strings are used only when there is an error.

 Each substring ends with \0 to insert a null character. This includes the final
 substring, so that the whole string ends with \0\0, which can be detected when
@ -81,21 +79,21 @@ static const char compile_error_texts[] =
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
-  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
-  "missing )\0"
+  "POSIX collating elements are not supported\0"
+  "missing closing parenthesis\0"
  /* 15 */
  "reference to non-existent subpattern\0"
-  "pattern or erroffset passed as NULL\0"
-  "unknown option bit(s) set\0"
-  "missing ) after comment\0"
-  "parentheses nested too deeply\0"  /** DEAD **/
+  "pattern passed as NULL\0"
+  "unknown compile-time option bit(s)\0"
+  "missing ) after (?# comment\0"
+  "parentheses are too deeply nested\0"
  /* 20 */
  "regular expression is too large\0"
-  "failed to get memory\0"
-  "unmatched parentheses\0"
+  "failed to allocate heap memory\0"
+  "unmatched closing parenthesis\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
@ -106,36 +104,36 @@ static const char compile_error_texts[] =
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
-  "POSIX collating elements are not supported\0"
+  "internal error in pcre2_study(): should not occur\0" 
  "this version of PCRE does not have UTF or Unicode property support\0"
-  "spare error\0"  /** DEAD **/
-  "character value in \\x{} or \\o{} is too large\0"
+  "parentheses are too deeply nested (stack check)\0"
+  "character code point value in \\x{} or \\o{} is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
-  "\\C not allowed in lookbehind assertion\0"
+  "\\C is not allowed in a lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
-  "number after (?C is > 255\0"
-  "closing ) for (?C expected\0"
+  "number after (?C is greater than 255\0"
+  "closing parenthesis for (?C expected\0"
  /* 40 */
-  "recursive call could loop indefinitely\0"
+  "recursion could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
-  "two named subpatterns have the same name\0"
-  "invalid UTF-8 string\0"
+  "two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0"
+  "group name must start with a non-digit\0"
  /* 45 */
-  "support for \\P, \\p, and \\X has not been compiled\0"
+  "this version of PCRE does not have support for \\P, \\p, or \\X\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
-  "repeated subpattern is too long\0"    /** DEAD **/
+  "invalid range in character class\0"
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
-  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
+  "missing opening brace after \\o\0"
  "internal error: unknown newline setting\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
@ -145,11 +143,11 @@ static const char compile_error_texts[] =
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
-  "] is an invalid data character in JavaScript compatibility mode\0"
+  "non-octal character in \\o{} (closing brace missing?)\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
-  "this version of PCRE is not compiled with Unicode property support\0"
+  "non-hex character in \\x{} (closing brace missing?)\0"
  "\\c must be followed by an ASCII character\0"
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
@ -157,21 +155,11 @@ static const char compile_error_texts[] =
  "\\N is not supported in a class\0"
  "too many forward references\0"
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
-  "invalid UTF-16 string\0"
+  "using (*UTF) is disabled by the application\0"
  /* 75 */
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
-  "character value in \\u.... sequence is too large\0"
-  "invalid UTF-32 string\0"
-  "setting UTF is disabled by the application\0"
-  "non-hex character in \\x{} (closing brace missing?)\0"
-  /* 80 */
-  "non-octal character in \\o{} (closing brace missing?)\0"
-  "missing opening brace after \\o\0"
-  "parentheses are too deeply nested\0"
-  "invalid range in character class\0"
-  "group name must start with a non-digit\0"
-  /* 85 */
-  "parentheses are too deeply nested (stack check)\0"
+  "character code point value in \\u.... sequence is too large\0"
+  "digits missing in \\x{} or \\o{}\0" 
  ;

 /* Match-time and UTF error texts are in the same format. */
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@ -225,10 +225,11 @@ else
 #endif   /* not HAVE_MEMMOVE */
 #endif   /* not VPCOMPAT */

-/* External (in the C sense) functions and macros that are private to the 
+/* External (in the C sense) functions and tables that are private to the 
 libraries are always referenced using the PRIV macro. This makes it possible
 for pcre2test.c to include some of the source files from the libraries using a
-different PRIV definition to avoid name clashes. */
+different PRIV definition to avoid name clashes. It also makes it clear in the 
+code that a non-static object is being referenced. */

 #ifndef PRIV
 #define PRIV(name) _pcre2_##name
@ -1686,7 +1687,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
  1, 3,                          /* THEN, THEN_ARG                         */ \
  1, 1, 1, 1,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */ \
  1+IMM2_SIZE, 1                 /* CLOSE, SKIPZERO                        */
-
+  
 /* A magic value for OP_RREF to indicate the "any recursion" condition. */

 #define RREF_ANY  0xffff
@ -1724,7 +1725,7 @@ typedef struct {
  uint16_t value;
 } ucp_type_table;

-/* Unicode character database (UCD) */
+/* Unicode character database (UCD) record format */

 typedef struct {
  uint8_t script;     /* ucp_Arabic, etc. */
@ -1734,16 +1735,6 @@ typedef struct {
  int32_t other_case; /* offset to other case, or zero if none */
 } ucd_record;

-extern const uint32_t    PRIV(ucd_caseless_sets)[];
-extern const ucd_record  PRIV(ucd_records)[];
-extern const uint8_t     PRIV(ucd_stage1)[];
-extern const uint16_t    PRIV(ucd_stage2)[];
-extern const uint32_t    PRIV(ucp_gentype)[];
-extern const uint32_t    PRIV(ucp_gbtable)[];
-#ifdef SUPPORT_JIT
-extern const int         PRIV(ucp_typerange)[];
-#endif
-
 /* UCD access macros */

 #define UCD_BLOCK_SIZE 128
@ -1774,11 +1765,10 @@ defined, so the following items are omitted. */

 /* Internal shared data tables. These are tables that are used by more than one
 of the exported public functions. They have to be "external" in the C sense,
-but are not part of the PCRE2 public API. The data for these tables is in the
-pcre2_tables.c module. Even though some of them are identical in each library, 
-they must have different names so that more than one library can be linked with 
-an application. UTF-8 tables are needed only when compiling the 8-bit library.
-*/
+but are not part of the PCRE2 public API. Although the data for some of the
+tables is identical in all libraries, they must have different names so that
+multiple libraries can be simultaneously linked to a single application.
+However, UTF-8 tables are needed only when compiling the 8-bit library. */

 #if PCRE2_CODE_UNIT_WIDTH == 8
 extern const int              PRIV(utf8_table1)[];
@ -1787,18 +1777,39 @@ extern const int              PRIV(utf8_table2)[];
 extern const int              PRIV(utf8_table3)[];
 extern const uint8_t          PRIV(utf8_table4)[];       
 #endif                        
-                              
-extern const uint8_t          PRIV(default_tables)[];
-extern const uint8_t          PRIV(OP_lengths)[];

+#define _pcre2_OP_lengths         PCRE2_SUFFIX(_pcre2_OP_lengths_)
+#define _pcre2_default_tables     PCRE2_SUFFIX(_pcre2_default_tables_)
+#define _pcre2_hspace_list        PCRE2_SUFFIX(_pcre2_hspace_list_)
+#define _pcre2_vspace_list        PCRE2_SUFFIX(_pcre2_vspace_list_)
+#define _pcre2_ucd_caseless_sets  PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
+#define _pcre2_ucd_records        PCRE2_SUFFIX(_pcre2_ucd_records_)
+#define _pcre2_ucd_stage1         PCRE2_SUFFIX(_pcre2_ucd_stage1_)
+#define _pcre2_ucd_stage2         PCRE2_SUFFIX(_pcre2_ucd_stage2_)
+#define _pcre2_ucp_gbtable        PCRE2_SUFFIX(_pcre2_ucp_gbtable_)
+#define _pcre2_ucp_gentype        PCRE2_SUFFIX(_pcre2_ucp_gentype_)
+#define _pcre2_ucp_typerange      PCRE2_SUFFIX(_pcre2_ucp_typerange_)
+#define _pcre2_utt                PCRE2_SUFFIX(_pcre2_utt_)
+#define _pcre2_utt_names          PCRE2_SUFFIX(_pcre2_utt_names_)
+#define _pcre2_utt_size           PCRE2_SUFFIX(_pcre2_utt_size_)
+
+extern const uint8_t          PRIV(OP_lengths)[];
+extern const uint8_t          PRIV(default_tables)[];
 extern const uint32_t         PRIV(hspace_list)[];
 extern const uint32_t         PRIV(vspace_list)[];
-                              
+extern const uint32_t         PRIV(ucd_caseless_sets)[];
+extern const ucd_record       PRIV(ucd_records)[];
+extern const uint8_t          PRIV(ucd_stage1)[];
+extern const uint16_t         PRIV(ucd_stage2)[];
+extern const uint32_t         PRIV(ucp_gbtable)[];
+extern const uint32_t         PRIV(ucp_gentype)[];
+#ifdef SUPPORT_JIT
+extern const int              PRIV(ucp_typerange)[];
+#endif
 extern const ucp_type_table   PRIV(utt)[];
 extern const char             PRIV(utt_names)[];
 extern const size_t           PRIV(utt_size);

-
 /* Mode-dependent macros and hidden and private structures are defined in a
 separate file so that pcre2test can include them at all supported widths. When
 compiling the library, PCRE2_CODE_UNIT_WIDTH will be defined, and we can
@ -1811,12 +1822,13 @@ private structures. */

 #include "pcre2_intmodedep.h"

-/* Internal shared functions. These are functions that are used by more than
-one of the library's exported public functions. They have to be "external" in
-the C sense, but are not part of the PCRE public API. They are not referenced
-from pcre2test, and must not be defined when no code unit width is available.
-*/
+/* Private "external" functions. These are internal functions that are called
+from modules other than the one in which they are defined. They have to be
+"external" in the C sense, but are not part of the PCRE public API. They are
+not referenced from pcre2test, and must not be defined when no code unit width
+is available. */

+#define _pcre2_auto_possessify       PCRE2_SUFFIX(_pcre2_auto_possessify_)
 #define _pcre2_compile_context_init  PCRE2_SUFFIX(_pcre2_compile_context_init_)
 #define _pcre2_find_bracket          PCRE2_SUFFIX(_pcre2_find_bracket_)
 #define _pcre2_is_newline            PCRE2_SUFFIX(_pcre2_is_newline_)
@ -1828,9 +1840,12 @@ from pcre2test, and must not be defined when no code unit width is available.
 #define _pcre2_strlen                PCRE2_SUFFIX(_pcre_strlen_)
 #define _pcre2_strncmp               PCRE2_SUFFIX(_pcre_strncmp_)
 #define _pcre2_strncmp_c8            PCRE2_SUFFIX(_pcre_strncmp_c8_)
+#define _pcre2_study                 PCRE2_SUFFIX(_pcre2_study_)
 #define _pcre2_valid_utf             PCRE2_SUFFIX(_pcre_valid_utf_)
 #define _pcre2_was_newline           PCRE2_SUFFIX(_pcre2_was_newline_)
+#define _pcre2_xclass                PCRE2_SUFFIX(_pcre2_xclass_)

+extern void  _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, const compile_data *);
 extern void  _pcre2_compile_context_init(pcre2_compile_context *, BOOL);
 extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int);
 extern BOOL  _pcre2_is_newline(PCRE2_SPTR, int, PCRE2_SPTR, int *, BOOL);
@ -1842,8 +1857,10 @@ extern int   _pcre2_strcmp_c8(PCRE2_SPTR, const char *);
 extern int   _pcre2_strlen(PCRE2_SPTR);
 extern int   _pcre2_strncmp(PCRE2_SPTR, PCRE2_SPTR, size_t);
 extern int   _pcre2_strncmp_c8(PCRE2_SPTR, const char *, size_t);
+extern int   _pcre2_study(pcre2_real_code *);
 extern int   _pcre2_valid_utf(PCRE2_SPTR, int, size_t *);
 extern BOOL  _pcre2_was_newline(PCRE2_SPTR, int, PCRE2_SPTR, int *, BOOL);
+extern BOOL  _pcre2_xclass(uint32_t, PCRE2_SPTR, BOOL);
 #endif  /* PCRE2_CODE_UNIT_WIDTH */

 /* End of pcre2_internal.h */
--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
--- a/src/pcre2_xclass.c
+++ b/src/pcre2_xclass.c
@ -0,0 +1,269 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+     Original API code Copyright (c) 1997-2012 University of Cambridge
+         New API code Copyright (c) 2014 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* This module contains an internal function that is used to match an extended
+class. It is used by pcre2_auto_possessify() and by both pcre2_match() and
+pcre2_dfa_match(). */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "pcre2_internal.h"
+
+/*************************************************
+*       Match character against an XCLASS        *
+*************************************************/
+
+/* Decide whether a single character belongs to an extended class, that is, a
+class that may hold code points above 255 and/or Unicode property tests.
+
+Arguments:
+  c           the character to test
+  data        points to the flag code unit at the start of the XCLASS data
+  utf         TRUE when running in UTF mode
+
+Returns:      TRUE if the character is matched by the class, else FALSE
+*/
+
+BOOL
+PRIV(xclass)(uint32_t c, PCRE2_SPTR data, BOOL utf)
+{
+PCRE2_UCHAR type;
+BOOL has_map = (*data & XCL_MAP) != 0;
+BOOL negated = (*data & XCL_NOT) != 0;
+
+#if PCRE2_CODE_UNIT_WIDTH == 8
+/* In the 8-bit library utf is always TRUE at this point; assigning it here
+tells the compiler so. */
+utf = TRUE;
+#endif
+
+/* A code point below 256 is looked up in the bitmap when one is present. If
+the class has no property items the answer is final: the bitmap bit is
+returned as it stands, or just the negation flag when there is no bitmap. If
+there are property items, a set bit is a match, and otherwise we go on to scan
+the additional data. */
+
+if (c < 256)
+  {
+  BOOL in_map = has_map &&
+    (((uint8_t *)(data + 1))[c/8] & (1 << (c&7))) != 0;
+
+  if ((*data & XCL_HASPROP) == 0) return has_map? in_map : negated;
+  if (in_map) return !negated;
+  }
+
+/* Step over the flag code unit and the 32-byte bitmap, if there is one. What
+follows is a list of single characters, ranges, and property tests, ended by
+XCL_END. Property items (XCL_PROP and XCL_NOTPROP) are never encountered when
+UTF support is not compiled. */
+
+data++;
+if (has_map) data += 32 / sizeof(PCRE2_UCHAR);
+
+for (type = *data++; type != XCL_END; type = *data++)
+  {
+  uint32_t first, last;
+
+  if (type == XCL_SINGLE)
+    {
+#ifdef SUPPORT_UTF
+    if (utf)
+      {
+      GETCHARINC(first, data); /* braces needed: expands to several statements */
+      }
+    else
+#endif
+      first = *data++;
+    if (first == c) return !negated;
+    }
+  else if (type == XCL_RANGE)
+    {
+#ifdef SUPPORT_UTF
+    if (utf)
+      {
+      GETCHARINC(first, data); /* braces needed: expands to several statements */
+      GETCHARINC(last, data);
+      }
+    else
+#endif
+      {
+      first = *data++;
+      last = *data++;
+      }
+    if (first <= c && c <= last) return !negated;
+    }
+
+#ifdef SUPPORT_UTF
+  else  /* XCL_PROP or XCL_NOTPROP */
+    {
+    const ucd_record *ucd = GET_UCD(c);
+    uint32_t gentype = PRIV(ucp_gentype)[ucd->chartype];
+    BOOL want = type == XCL_PROP;   /* TRUE for \p-style, FALSE for \P-style */
+    BOOL matched = FALSE;           /* does c have the property? */
+
+    switch(*data)
+      {
+      case PT_ANY:
+      matched = TRUE;
+      break;
+
+      case PT_LAMP:
+      matched = ucd->chartype == ucp_Lu || ucd->chartype == ucp_Ll ||
+                ucd->chartype == ucp_Lt;
+      break;
+
+      case PT_GC:
+      matched = data[1] == gentype;
+      break;
+
+      case PT_PC:
+      matched = data[1] == ucd->chartype;
+      break;
+
+      case PT_SC:
+      matched = data[1] == ucd->script;
+      break;
+
+      case PT_ALNUM:
+      matched = gentype == ucp_L || gentype == ucp_N;
+      break;
+
+      /* Since Perl 5.18 included VT in Perl space, Perl space and POSIX space
+      are the same set; PCRE followed suit at release 8.34. */
+
+      case PT_SPACE:    /* Perl space */
+      case PT_PXSPACE:  /* POSIX space */
+      switch(c)
+        {
+        HSPACE_CASES:
+        VSPACE_CASES:
+        matched = TRUE;
+        break;
+
+        default:
+        matched = gentype == ucp_Z;
+        break;
+        }
+      break;
+
+      case PT_WORD:
+      matched = gentype == ucp_L || gentype == ucp_N || c == CHAR_UNDERSCORE;
+      break;
+
+      case PT_UCNC:
+      if (c < 0xa0)
+        {
+        matched = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
+                  c == CHAR_GRAVE_ACCENT;
+        }
+      else
+        {
+        matched = c < 0xd800 || c > 0xdfff;
+        }
+      break;
+
+      /* The three properties below have no \p or \P coding, so they can be
+      found only inside an XCLASS. */
+
+      /* Graphic: anything that is neither Z (space or separator) nor C
+      (other), but allowing Cf (format) apart from these exceptions, which
+      seems to be what Perl does:
+
+      U+061C           Arabic Letter Mark
+      U+180E           Mongolian Vowel Separator
+      U+2066 - U+2069  Various "isolate"s
+      */
+
+      case PT_PXGRAPH:
+      matched = gentype != ucp_Z &&
+                (gentype != ucp_C ||
+                  (ucd->chartype == ucp_Cf &&
+                    c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069)));
+      break;
+
+      /* Printable: as graphic, but Zs is also accepted (only Zl and Zp are
+      excluded), and so is U+180E. */
+
+      case PT_PXPRINT:
+      matched = ucd->chartype != ucp_Zl &&
+                ucd->chartype != ucp_Zp &&
+                (gentype != ucp_C ||
+                  (ucd->chartype == ucp_Cf &&
+                    c != 0x061c && (c < 0x2066 || c > 0x2069)));
+      break;
+
+      /* Punctuation: every Unicode P character, plus characters below 256
+      that Unicode classes as S (symbol); for Perl compatibility this brings
+      in $+<=>^`|~. */
+
+      case PT_PXPUNCT:
+      matched = gentype == ucp_P || (c < 256 && gentype == ucp_S);
+      break;
+
+      /* Not expected to happen; present because compilers may complain when
+      a switch has no default. */
+
+      default:
+      return FALSE;
+      }
+
+    if (matched == want) return !negated;
+    data += 2;   /* move past the property type and its value */
+    }
+#endif  /* SUPPORT_UTF */
+  }
+
+return negated;   /* nothing in the class matched the character */
+}
+
+/* End of pcre2_xclass.c */
--- a/src/pcre2posix.c
+++ b/src/pcre2posix.c
@ -80,36 +80,35 @@ static const int eint1[] = {
  REG_EESCAPE, /* unrecognized character follows \ */
  REG_BADBR,   /* numbers out of order in {} quantifier */
  /* 5 */
-  5, REG_BADBR,   /* number too big in {} quantifier */
+  REG_BADBR,   /* number too big in {} quantifier */
  REG_EBRACK,  /* missing terminating ] for character class */
  REG_ECTYPE,  /* invalid escape sequence in character class */
  REG_ERANGE,  /* range out of order in character class */
  REG_BADRPT,  /* nothing to repeat */
  /* 10 */
-  REG_BADRPT,  /* operand of unlimited repeat could match the empty string */
  REG_ASSERT,  /* internal error: unexpected repeat */
-  REG_BADPAT,  /* unrecognized character after (? */
+  REG_BADPAT,  /* unrecognized character after (? or (?- */
  REG_BADPAT,  /* POSIX named classes are supported only within a class */
+  REG_BADPAT,  /* POSIX collating elements are not supported */ 
  REG_EPAREN,  /* missing ) */
  /* 15 */
  REG_ESUBREG, /* reference to non-existent subpattern */
-  REG_INVARG,  /* erroffset passed as NULL */
-  REG_INVARG,  /* unknown option bit(s) set */
-  REG_EPAREN,  /* missing ) after comment */
+  REG_INVARG,  /* pattern passed as NULL */
+  REG_INVARG,  /* unknown compile-time option bit(s) */
+  REG_EPAREN,  /* missing ) after (?# comment */
  REG_ESIZE,   /* parentheses nested too deeply */
  /* 20 */
  REG_ESIZE,   /* regular expression too large */
  REG_ESPACE,  /* failed to get memory */
-  REG_EPAREN,  /* unmatched parentheses */
+  REG_EPAREN,  /* unmatched closing parenthesis */
  REG_ASSERT   /* internal error: code overflow */
  };
  
+/* NOTE(review): the entries look like (error number, POSIX code) pairs, with
+the text of each error in the trailing comment; confirm against the code that
+reads this table. */
 static const int eint2[] = {
   30, REG_ECTYPE,  /* unknown POSIX class name */
-  32, REG_INVARG,  /* this version of PCRE2 is not compiled with PCRE2_UTF8 support */
-  37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N, \U, or \u */
+  32, REG_INVARG,  /* this version of PCRE does not have UTF or UCP support */
+  37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */
   56, REG_INVARG,  /* internal error: unknown newline setting */
-  67, REG_INVARG,  /* this version of PCRE2 is not compiled with PCRE2_UCP support */
 };

 /* Table of texts corresponding to POSIX error codes */
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@ -422,8 +422,8 @@ static modstruct modlist[] = {
  { "altglobal",           MOD_PND,  MOD_CTL, CTL_ALTGLOBAL,             PO(control) },
  { "anchored",            MOD_PD,   MOD_OPT, PCRE2_ANCHORED,            PD(options) },
  { "auto_callout",        MOD_PAT,  MOD_OPT, PCRE2_AUTO_CALLOUT,        PO(options) },
-  { "bsr",                 MOD_CTC,  MOD_BSR, 0,                         CO(bsr_convention) },
  { "bincode",             MOD_PAT,  MOD_CTL, CTL_BINCODE,               PO(control) },
+  { "bsr",                 MOD_CTC,  MOD_BSR, 0,                         CO(bsr_convention) },
  { "callout_capture",     MOD_DAT,  MOD_CTL, CTL_CALLOUT_CAPTURE,       DO(control) },
  { "callout_fail",        MOD_DAT,  MOD_IN2, 0,                         DO(cfail) },
  { "callout_none",        MOD_DAT,  MOD_CTL, CTL_CALLOUT_NONE,          DO(control) },
@ -4899,8 +4899,8 @@ _setmode( _fileno( stdout ), _O_BINARY );

 /* Initialization that does not depend on the running mode. */

-memset(&def_patctl, sizeof(patctl), 0);
-memset(&def_datctl, sizeof(datctl), 0);
+memset(&def_patctl, 0, sizeof(patctl));
+memset(&def_datctl, 0, sizeof(datctl));
 def_datctl.oveccount = DEFAULT_OVECCOUNT;
 def_datctl.copy_numbers[0] = -1;
 def_datctl.get_numbers[0] = -1;