From 53bf29d689358908a10767774bc2c310ba056b88 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Fri, 24 Feb 2017 18:25:32 +0000 Subject: [PATCH] Fix 32-bit non-UTF property test crash. --- ChangeLog | 4 ++++ maint/MultiStage2.py | 17 ++++++++++++++++- src/pcre2_internal.h | 15 ++++++++++++++- src/pcre2_ucd.c | 14 ++++++++++++++ testdata/testinput12 | 3 +++ testdata/testoutput12-16 | 6 ++++++ testdata/testoutput12-32 | 4 ++++ 7 files changed, 61 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index ef29f76..9560c54 100644 --- a/ChangeLog +++ b/ChangeLog @@ -9,6 +9,10 @@ Version 10.24 14-February-2017 (a) Check for malloc failures when getting memory for the ovector (POSIX) or the match data block (non-POSIX). + +2. In the 32-bit library in non-UTF mode, an attempt to find a Unicode property +for a character with a code point greater than 0x10ffff (the Unicode maximum) +caused a crash. Version 10.23 14-February-2017 diff --git a/maint/MultiStage2.py b/maint/MultiStage2.py index eac2f16..0649332 100755 --- a/maint/MultiStage2.py +++ b/maint/MultiStage2.py @@ -236,7 +236,8 @@ def print_table(table, table_name, block_size = None): fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */" mult = MAX_UNICODE / len(table) for i in range(0, len(table), ELEMS_PER_LINE): - print(fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))) + print(fmt % (table[i:i+ELEMS_PER_LINE] + + (int(i * mult),))) else: if block_size > ELEMS_PER_LINE: el = ELEMS_PER_LINE @@ -485,6 +486,20 @@ print("#else") print() print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version)) print() +print("/* If the 32-bit library is run in non-32-bit mode, character values") +print("greater than 0x10ffff may be encountered. For these we set up a") +print("special record. 
*/") +print() +print("#if PCRE2_CODE_UNIT_WIDTH == 32") +print("const ucd_record PRIV(dummy_ucd_record)[] = {{") +print(" ucp_Common, /* script */") +print(" ucp_Cn, /* type unassigned */") +print(" ucp_gbOther, /* grapheme break property */") +print(" 0, /* case set */") +print(" 0, /* other case */") +print(" }};") +print("#endif") +print() print(record_struct) # --- Added by PH: output the table of caseless character sets --- diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 6a8774c..720bbc9 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -1774,10 +1774,17 @@ typedef struct { /* UCD access macros */ #define UCD_BLOCK_SIZE 128 -#define GET_UCD(ch) (PRIV(ucd_records) + \ +#define REAL_GET_UCD(ch) (PRIV(ucd_records) + \ PRIV(ucd_stage2)[PRIV(ucd_stage1)[(int)(ch) / UCD_BLOCK_SIZE] * \ UCD_BLOCK_SIZE + (int)(ch) % UCD_BLOCK_SIZE]) +#if PCRE2_CODE_UNIT_WIDTH == 32 +#define GET_UCD(ch) ((ch > MAX_UTF_CODE_POINT)? \ + PRIV(dummy_ucd_record) : REAL_GET_UCD(ch)) +#else +#define GET_UCD(ch) REAL_GET_UCD(ch) +#endif + #define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype #define UCD_SCRIPT(ch) GET_UCD(ch)->script #define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)] @@ -1834,6 +1841,9 @@ extern const uint8_t PRIV(utf8_table4)[]; #define _pcre2_default_compile_context PCRE2_SUFFIX(_pcre2_default_compile_context_) #define _pcre2_default_match_context PCRE2_SUFFIX(_pcre2_default_match_context_) #define _pcre2_default_tables PCRE2_SUFFIX(_pcre2_default_tables_) +#if PCRE2_CODE_UNIT_WIDTH == 32 +#define _pcre2_dummy_ucd_record PCRE2_SUFFIX(_pcre2_dummy_ucd_record_) +#endif #define _pcre2_hspace_list PCRE2_SUFFIX(_pcre2_hspace_list_) #define _pcre2_vspace_list PCRE2_SUFFIX(_pcre2_vspace_list_) #define _pcre2_ucd_caseless_sets PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_) @@ -1858,6 +1868,9 @@ extern const uint32_t PRIV(hspace_list)[]; extern const uint32_t PRIV(vspace_list)[]; extern const uint32_t PRIV(ucd_caseless_sets)[]; extern const ucd_record 
PRIV(ucd_records)[]; +#if PCRE2_CODE_UNIT_WIDTH == 32 +extern const ucd_record PRIV(dummy_ucd_record)[]; +#endif extern const uint8_t PRIV(ucd_stage1)[]; extern const uint16_t PRIV(ucd_stage2)[]; extern const uint32_t PRIV(ucp_gbtable)[]; diff --git a/src/pcre2_ucd.c b/src/pcre2_ucd.c index 116f537..56aa29d 100644 --- a/src/pcre2_ucd.c +++ b/src/pcre2_ucd.c @@ -41,6 +41,20 @@ const uint32_t PRIV(ucd_caseless_sets)[] = {0}; const char *PRIV(unicode_version) = "8.0.0"; +/* If the 32-bit library is run in non-UTF mode, character values +greater than 0x10ffff may be encountered. For these we set up a +special record. */ + +#if PCRE2_CODE_UNIT_WIDTH == 32 +const ucd_record PRIV(dummy_ucd_record)[] = {{ + ucp_Common, /* script */ + ucp_Cn, /* type unassigned */ + ucp_gbOther, /* grapheme break property */ + 0, /* case set */ + 0, /* other case */ + }}; +#endif + /* When recompiling tables with a new Unicode version, please check the types in this structure definition from pcre2_internal.h (the actual field names will be different): diff --git a/testdata/testinput12 b/testdata/testinput12 index c3b2bfc..decfe82 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -360,4 +360,7 @@ /[\s[:^ascii:]]/B,ucp +/\pP/ucp + \x{7fffffff}\=no_jit + # End of testinput12 diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index 3b5a0cd..41e0a48 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -1415,4 +1415,10 @@ No match End ------------------------------------------------------------------ +/\pP/ucp + \x{7fffffff}\=no_jit +** Character \x{7fffffff} is greater than 0xffff and UTF-16 mode is not enabled. +** Truncation will probably give the wrong result. 
+No match + # End of testinput12 diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index 1496159..e9130b9 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1409,4 +1409,8 @@ No match End ------------------------------------------------------------------ +/\pP/ucp + \x{7fffffff}\=no_jit +No match + # End of testinput12