From 53bf29d689358908a10767774bc2c310ba056b88 Mon Sep 17 00:00:00 2001
From: "Philip.Hazel" <Philip.Hazel@gmail.com>
Date: Fri, 24 Feb 2017 18:25:32 +0000
Subject: [PATCH] Fix 32-bit non-UTF property test crash.

---
 ChangeLog                |  4 ++++
 maint/MultiStage2.py     | 17 ++++++++++++++++-
 src/pcre2_internal.h     | 15 ++++++++++++++-
 src/pcre2_ucd.c          | 14 ++++++++++++++
 testdata/testinput12     |  3 +++
 testdata/testoutput12-16 |  6 ++++++
 testdata/testoutput12-32 |  4 ++++
 7 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index ef29f76..9560c54 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -9,6 +9,10 @@ Version 10.24 14-February-2017
 
    (a) Check for malloc failures when getting memory for the ovector (POSIX) or 
        the match data block (non-POSIX). 
+       
+2. In the 32-bit library in non-UTF mode, an attempt to find a Unicode property
+for a character with a code point greater than 0x10ffff (the Unicode maximum)
+caused a crash.
 
 
 Version 10.23 14-February-2017
diff --git a/maint/MultiStage2.py b/maint/MultiStage2.py
index eac2f16..0649332 100755
--- a/maint/MultiStage2.py
+++ b/maint/MultiStage2.py
@@ -236,7 +236,8 @@ def print_table(table, table_name, block_size = None):
                 fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
                 mult = MAX_UNICODE / len(table)
                 for i in range(0, len(table), ELEMS_PER_LINE):
-                        print(fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,)))
+                        print(fmt % (table[i:i+ELEMS_PER_LINE] + 
+                          (int(i * mult),)))
         else:
                 if block_size > ELEMS_PER_LINE:
                         el = ELEMS_PER_LINE
@@ -485,6 +486,20 @@ print("#else")
 print()
 print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version))
 print()
+print("/* If the 32-bit library is run in non-UTF mode, character values")
+print("greater than 0x10ffff may be encountered. For these we set up a")
+print("special record. */")
+print()
+print("#if PCRE2_CODE_UNIT_WIDTH == 32")
+print("const ucd_record PRIV(dummy_ucd_record)[] = {{")
+print("  ucp_Common,    /* script */")
+print("  ucp_Cn,        /* type unassigned */")
+print("  ucp_gbOther,   /* grapheme break property */")
+print("  0,             /* case set */")
+print("  0,             /* other case */")
+print("  }};")
+print("#endif")
+print()
 print(record_struct)
 
 # --- Added by PH: output the table of caseless character sets ---
diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h
index 6a8774c..720bbc9 100644
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@@ -1774,10 +1774,17 @@ typedef struct {
 /* UCD access macros */
 
 #define UCD_BLOCK_SIZE 128
-#define GET_UCD(ch) (PRIV(ucd_records) + \
+#define REAL_GET_UCD(ch) (PRIV(ucd_records) + \
         PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \
         UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE])
 
+#if PCRE2_CODE_UNIT_WIDTH == 32
+#define GET_UCD(ch) (((ch) > MAX_UTF_CODE_POINT)? \
+  PRIV(dummy_ucd_record) : REAL_GET_UCD(ch))
+#else
+#define GET_UCD(ch) REAL_GET_UCD(ch)
+#endif
+
 #define UCD_CHARTYPE(ch)    GET_UCD(ch)->chartype
 #define UCD_SCRIPT(ch)      GET_UCD(ch)->script
 #define UCD_CATEGORY(ch)    PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
@@ -1834,6 +1841,9 @@ extern const uint8_t          PRIV(utf8_table4)[];
 #define _pcre2_default_compile_context PCRE2_SUFFIX(_pcre2_default_compile_context_)
 #define _pcre2_default_match_context   PCRE2_SUFFIX(_pcre2_default_match_context_)
 #define _pcre2_default_tables          PCRE2_SUFFIX(_pcre2_default_tables_)
+#if PCRE2_CODE_UNIT_WIDTH == 32
+#define _pcre2_dummy_ucd_record        PCRE2_SUFFIX(_pcre2_dummy_ucd_record_)
+#endif
 #define _pcre2_hspace_list             PCRE2_SUFFIX(_pcre2_hspace_list_)
 #define _pcre2_vspace_list             PCRE2_SUFFIX(_pcre2_vspace_list_)
 #define _pcre2_ucd_caseless_sets       PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
@@ -1858,6 +1868,9 @@ extern const uint32_t                  PRIV(hspace_list)[];
 extern const uint32_t                  PRIV(vspace_list)[];
 extern const uint32_t                  PRIV(ucd_caseless_sets)[];
 extern const ucd_record                PRIV(ucd_records)[];
+#if PCRE2_CODE_UNIT_WIDTH == 32
+extern const ucd_record                PRIV(dummy_ucd_record)[];
+#endif
 extern const uint8_t                   PRIV(ucd_stage1)[];
 extern const uint16_t                  PRIV(ucd_stage2)[];
 extern const uint32_t                  PRIV(ucp_gbtable)[];
diff --git a/src/pcre2_ucd.c b/src/pcre2_ucd.c
index 116f537..56aa29d 100644
--- a/src/pcre2_ucd.c
+++ b/src/pcre2_ucd.c
@@ -41,6 +41,20 @@ const uint32_t PRIV(ucd_caseless_sets)[] = {0};
 
 const char *PRIV(unicode_version) = "8.0.0";
 
+/* If the 32-bit library is run in non-UTF mode, character values
+greater than 0x10ffff may be encountered. For these we set up a
+special record. */
+
+#if PCRE2_CODE_UNIT_WIDTH == 32
+const ucd_record PRIV(dummy_ucd_record)[] = {{
+  ucp_Common,    /* script */
+  ucp_Cn,        /* type unassigned */
+  ucp_gbOther,   /* grapheme break property */
+  0,             /* case set */
+  0,             /* other case */
+  }};
+#endif
+
 /* When recompiling tables with a new Unicode version, please check the
 types in this structure definition from pcre2_internal.h (the actual
 field names will be different):
diff --git a/testdata/testinput12 b/testdata/testinput12
index c3b2bfc..decfe82 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -360,4 +360,7 @@
 
 /[\s[:^ascii:]]/B,ucp
 
+/\pP/ucp
+    \x{7fffffff}\=no_jit
+
 # End of testinput12
diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16
index 3b5a0cd..41e0a48 100644
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@@ -1415,4 +1415,10 @@ No match
         End
 ------------------------------------------------------------------
 
+/\pP/ucp
+    \x{7fffffff}\=no_jit
+** Character \x{7fffffff} is greater than 0xffff and UTF-16 mode is not enabled.
+** Truncation will probably give the wrong result.
+No match
+
 # End of testinput12
diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32
index 1496159..e9130b9 100644
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@@ -1409,4 +1409,8 @@ No match
         End
 ------------------------------------------------------------------
 
+/\pP/ucp
+    \x{7fffffff}\=no_jit
+No match
+
 # End of testinput12