Add PCRE2_CONFIG_UNICODE_VERSION to give the Unicode version string.

2014-08-12 09:48:56 +00:00 · 2014-08-12 09:48:56 +00:00 · 803c38f004
commit 803c38f004
parent 059a8ebfe4
6 changed files with 50 additions and 7 deletions
--- a/maint/MultiStage2.py
+++ b/maint/MultiStage2.py
@ -120,6 +120,7 @@
 # 13-May-2014:       Updated for PCRE2
 # 03-June-2014:      Updated for Python 3
 # 20-June-2014:      Updated for Unicode 7.0.0
+# 12-August-2014:    Updated to put Unicode version into the file
 ##############################################################################


@ -130,6 +131,7 @@ import sys
 MAX_UNICODE = 0x110000
 NOTACHAR = 0xffffffff

+
 # Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
 def make_get_names(enum):
        return lambda chardata: enum.index(chardata[1])
@ -141,9 +143,21 @@ def get_other_case(chardata):
        return 0


-# Read the whole table in memory
+# Read the whole table in memory, setting/checking the Unicode version
 def read_table(file_name, get_value, default_value):
+        global unicode_version
+         
+        f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
+        file_base = f.group(1)
+        version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
        file = open(file_name, 'r', encoding='utf-8')
+        f = re.match(version_pat, file.readline())
+        version = f.group(1)
+        if unicode_version == "":
+                unicode_version = version
+        elif unicode_version != version:
+                print("WARNING: Unicode version differs in %s", file_name, file=sys.stderr)
+ 
        table = [default_value] * MAX_UNICODE
        for line in file:
                line = re.sub(r'#.*', '', line)
@ -327,6 +341,7 @@ break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
  'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other' ]

 test_record_size()
+unicode_version = ""

 script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
 category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
@ -464,6 +479,8 @@ print("const uint16_t PRIV(ucd_stage2)[] = {0};")
 print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
 print("#else")
 print()
+print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version))
+print()
 print(record_struct)

 # --- Added by PH: output the table of caseless character sets ---
--- a/src/pcre2.h.in
+++ b/src/pcre2.h.in
@ -257,8 +257,9 @@ must all be greater than zero. */
 #define PCRE2_CONFIG_PARENSLIMIT             7
 #define PCRE2_CONFIG_RECURSIONLIMIT          5
 #define PCRE2_CONFIG_STACKRECURSE            8
-#define PCRE2_CONFIG_UTF                     9
-#define PCRE2_CONFIG_VERSION                10
+#define PCRE2_CONFIG_UNICODE_VERSION         9
+#define PCRE2_CONFIG_UTF                    10
+#define PCRE2_CONFIG_VERSION                11

 /* Types for code units in patterns and subject strings. */

--- a/src/pcre2_config.c
+++ b/src/pcre2_config.c
@ -142,6 +142,21 @@ switch (what)
  *((int *)where) = 1;
 #endif
  break;
+  
+  case PCRE2_CONFIG_UNICODE_VERSION:
+    { 
+#if defined SUPPORT_UTF
+    const char *v = PRIV(unicode_version);
+#else
+    const char *v = "Unicode not supported";
+#endif
+    PCRE2_UCHAR *t = (PCRE2_UCHAR *)where;
+    if (strlen(v) >= BYTES2CU(length) - 1) return PCRE2_ERROR_BADLENGTH; 
+    while (*v != 0) *t++ = *v++;
+    *t = 0;
+    return t - (PCRE2_UCHAR *)where;
+    }
+  break;

  case PCRE2_CONFIG_UTF:
 #if defined SUPPORT_UTF
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@ -1795,6 +1795,7 @@ extern const uint8_t          PRIV(utf8_table4)[];
 #define _pcre2_ucp_gbtable        PCRE2_SUFFIX(_pcre2_ucp_gbtable_)
 #define _pcre2_ucp_gentype        PCRE2_SUFFIX(_pcre2_ucp_gentype_)
 #define _pcre2_ucp_typerange      PCRE2_SUFFIX(_pcre2_ucp_typerange_)
+#define _pcre2_unicode_version    PCRE2_SUFFIX(_pcre2_unicode_version_)
 #define _pcre2_utt                PCRE2_SUFFIX(_pcre2_utt_)
 #define _pcre2_utt_names          PCRE2_SUFFIX(_pcre2_utt_names_)
 #define _pcre2_utt_size           PCRE2_SUFFIX(_pcre2_utt_size_)
@ -1812,6 +1813,7 @@ extern const uint32_t         PRIV(ucp_gentype)[];
 #ifdef SUPPORT_JIT
 extern const int              PRIV(ucp_typerange)[];
 #endif
+extern const char            *PRIV(unicode_version);
 extern const ucp_type_table   PRIV(utt)[];
 extern const char             PRIV(utt_names)[];
 extern const size_t           PRIV(utt_size);
--- a/src/pcre2_ucd.c
+++ b/src/pcre2_ucd.c
@ -39,6 +39,8 @@ const uint16_t PRIV(ucd_stage2)[] = {0};
 const uint32_t PRIV(ucd_caseless_sets)[] = {0};
 #else

+const char *PRIV(unicode_version) = "7.0.0";
+
 /* When recompiling tables with a new Unicode version, please check the
 types in this structure definition from pcre2_internal.h (the actual
 field names will be different):
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@ -164,8 +164,8 @@ void vms_setsymbol( char *, char *, int );
 #define CFAIL_UNSET UINT32_MAX  /* Unset value for cfail fields */
 #define DFA_WS_DIMENSION 1000   /* Size of DFA workspace */
 #define DEFAULT_OVECCOUNT 15    /* Default ovector count */
-#define LOOPREPEAT 500000       /* Default loop count for timing. */
-#define VERSION_SIZE 64         /* Size of buffer for the version string. */
+#define LOOPREPEAT 500000       /* Default loop count for timing */
+#define VERSION_SIZE 64         /* Size of buffer for the version strings */

 /* Execution modes */

@ -615,6 +615,7 @@ static uint32_t max_oveccount;
 static uint32_t callout_count;

 static VERSION_TYPE version[VERSION_SIZE];
+static VERSION_TYPE uversion[VERSION_SIZE];

 static patctl def_patctl;
 static patctl pat_patctl;
@ -5220,7 +5221,10 @@ printf("  32-bit support\n");
 #endif

 (void)PCRE2_CONFIG(PCRE2_CONFIG_UTF, &rc, sizeof(rc));
-printf ("  %sUTF support\n", rc ? "" : "No ");
+if (rc != 0)
+  printf("  UTF support (Unicode version %s)\n", uversion);
+else 
+  printf("  No UTF support\n");
 (void)PCRE2_CONFIG(PCRE2_CONFIG_JIT, &rc, sizeof(rc));
 if (rc != 0)
  {
@ -5289,9 +5293,11 @@ if (PO(options) != DO(options) || PO(control) != DO(control))
  return 1;
  }

-/* Get the PCRE version number information. */
+/* Get the PCRE2 and Unicode version number information. */

 PCRE2_CONFIG(PCRE2_CONFIG_VERSION, version, sizeof(VERSION_TYPE)*VERSION_SIZE);
+PCRE2_CONFIG(PCRE2_CONFIG_UNICODE_VERSION, uversion, 
+  sizeof(VERSION_TYPE)*VERSION_SIZE);

 /* Get buffers from malloc() so that valgrind will check their misuse when
 debugging. They grow automatically when very long lines are read. The 16-