diff --git a/maint/MultiStage2.py b/maint/MultiStage2.py index 5289d67..19998a6 100755 --- a/maint/MultiStage2.py +++ b/maint/MultiStage2.py @@ -120,6 +120,7 @@ # 13-May-2014: Updated for PCRE2 # 03-June-2014: Updated for Python 3 # 20-June-2014: Updated for Unicode 7.0.0 +# 12-August-2014: Updated to put Unicode version into the file ############################################################################## @@ -130,6 +131,7 @@ import sys MAX_UNICODE = 0x110000 NOTACHAR = 0xffffffff + # Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt def make_get_names(enum): return lambda chardata: enum.index(chardata[1]) @@ -141,9 +143,21 @@ def get_other_case(chardata): return 0 -# Read the whole table in memory +# Read the whole table in memory, setting/checking the Unicode version def read_table(file_name, get_value, default_value): + global unicode_version + + f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name) + file_base = f.group(1) + version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$" file = open(file_name, 'r', encoding='utf-8') + f = re.match(version_pat, file.readline()) + version = f.group(1) + if unicode_version == "": + unicode_version = version + elif unicode_version != version: + print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr) + table = [default_value] * MAX_UNICODE for line in file: line = re.sub(r'#.*', '', line) @@ -327,6 +341,7 @@ break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend', 'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other' ] test_record_size() +unicode_version = "" script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common')) category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn')) @@ -464,6 +479,8 @@ print("const uint16_t PRIV(ucd_stage2)[] = {0};") print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};") 
print("#else") print() +print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version)) +print() print(record_struct) # --- Added by PH: output the table of caseless character sets --- diff --git a/src/pcre2.h.in b/src/pcre2.h.in index 1f65db6..b59111f 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -257,8 +257,9 @@ must all be greater than zero. */ #define PCRE2_CONFIG_PARENSLIMIT 7 #define PCRE2_CONFIG_RECURSIONLIMIT 5 #define PCRE2_CONFIG_STACKRECURSE 8 -#define PCRE2_CONFIG_UTF 9 -#define PCRE2_CONFIG_VERSION 10 +#define PCRE2_CONFIG_UNICODE_VERSION 9 +#define PCRE2_CONFIG_UTF 10 +#define PCRE2_CONFIG_VERSION 11 /* Types for code units in patterns and subject strings. */ diff --git a/src/pcre2_config.c b/src/pcre2_config.c index adf8937..f468087 100644 --- a/src/pcre2_config.c +++ b/src/pcre2_config.c @@ -142,6 +142,21 @@ switch (what) *((int *)where) = 1; #endif break; + + case PCRE2_CONFIG_UNICODE_VERSION: + { +#if defined SUPPORT_UTF + const char *v = PRIV(unicode_version); +#else + const char *v = "Unicode not supported"; +#endif + PCRE2_UCHAR *t = (PCRE2_UCHAR *)where; + if (strlen(v) >= BYTES2CU(length) - 1) return PCRE2_ERROR_BADLENGTH; + while (*v != 0) *t++ = *v++; + *t = 0; + return t - (PCRE2_UCHAR *)where; + } + break; case PCRE2_CONFIG_UTF: #if defined SUPPORT_UTF diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index d1d8e49..fd1ee41 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -1795,6 +1795,7 @@ extern const uint8_t PRIV(utf8_table4)[]; #define _pcre2_ucp_gbtable PCRE2_SUFFIX(_pcre2_ucp_gbtable_) #define _pcre2_ucp_gentype PCRE2_SUFFIX(_pcre2_ucp_gentype_) #define _pcre2_ucp_typerange PCRE2_SUFFIX(_pcre2_ucp_typerange_) +#define _pcre2_unicode_version PCRE2_SUFFIX(_pcre2_unicode_version_) #define _pcre2_utt PCRE2_SUFFIX(_pcre2_utt_) #define _pcre2_utt_names PCRE2_SUFFIX(_pcre2_utt_names_) #define _pcre2_utt_size PCRE2_SUFFIX(_pcre2_utt_size_) @@ -1812,6 +1813,7 @@ extern const uint32_t 
PRIV(ucp_gentype)[]; #ifdef SUPPORT_JIT extern const int PRIV(ucp_typerange)[]; #endif +extern const char *PRIV(unicode_version); extern const ucp_type_table PRIV(utt)[]; extern const char PRIV(utt_names)[]; extern const size_t PRIV(utt_size); diff --git a/src/pcre2_ucd.c b/src/pcre2_ucd.c index 6e22afa..11b6fb5 100644 --- a/src/pcre2_ucd.c +++ b/src/pcre2_ucd.c @@ -39,6 +39,8 @@ const uint16_t PRIV(ucd_stage2)[] = {0}; const uint32_t PRIV(ucd_caseless_sets)[] = {0}; #else +const char *PRIV(unicode_version) = "7.0.0"; + /* When recompiling tables with a new Unicode version, please check the types in this structure definition from pcre2_internal.h (the actual field names will be different): diff --git a/src/pcre2test.c b/src/pcre2test.c index d4225c1..f90591d 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -164,8 +164,8 @@ void vms_setsymbol( char *, char *, int ); #define CFAIL_UNSET UINT32_MAX /* Unset value for cfail fields */ #define DFA_WS_DIMENSION 1000 /* Size of DFA workspace */ #define DEFAULT_OVECCOUNT 15 /* Default ovector count */ -#define LOOPREPEAT 500000 /* Default loop count for timing. */ -#define VERSION_SIZE 64 /* Size of buffer for the version string. */ +#define LOOPREPEAT 500000 /* Default loop count for timing */ +#define VERSION_SIZE 64 /* Size of buffer for the version strings */ /* Execution modes */ @@ -615,6 +615,7 @@ static uint32_t max_oveccount; static uint32_t callout_count; static VERSION_TYPE version[VERSION_SIZE]; +static VERSION_TYPE uversion[VERSION_SIZE]; static patctl def_patctl; static patctl pat_patctl; @@ -5220,7 +5221,10 @@ printf(" 32-bit support\n"); #endif (void)PCRE2_CONFIG(PCRE2_CONFIG_UTF, &rc, sizeof(rc)); -printf (" %sUTF support\n", rc ? 
"" : "No "); +if (rc != 0) + printf(" UTF support (Unicode version %s)\n", uversion); +else + printf(" No UTF support\n"); (void)PCRE2_CONFIG(PCRE2_CONFIG_JIT, &rc, sizeof(rc)); if (rc != 0) { @@ -5289,9 +5293,11 @@ if (PO(options) != DO(options) || PO(control) != DO(control)) return 1; } -/* Get the PCRE version number information. */ +/* Get the PCRE2 and Unicode version number information. */ PCRE2_CONFIG(PCRE2_CONFIG_VERSION, version, sizeof(VERSION_TYPE)*VERSION_SIZE); +PCRE2_CONFIG(PCRE2_CONFIG_UNICODE_VERSION, uversion, + sizeof(VERSION_TYPE)*VERSION_SIZE); /* Get buffers from malloc() so that valgrind will check their misuse when debugging. They grow automatically when very long lines are read. The 16-