Add PCRE2_CONFIG_UNICODE_VERSION to give the Unicode version string.

This commit is contained in:
Philip.Hazel 2014-08-12 09:48:56 +00:00
parent 059a8ebfe4
commit 803c38f004
6 changed files with 50 additions and 7 deletions

View File

@ -120,6 +120,7 @@
# 13-May-2014: Updated for PCRE2
# 03-June-2014: Updated for Python 3
# 20-June-2014: Updated for Unicode 7.0.0
# 12-August-2014: Updated to put Unicode version into the file
##############################################################################
@ -130,6 +131,7 @@ import sys
MAX_UNICODE = 0x110000
NOTACHAR = 0xffffffff
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
# Build the get_value callback that is passed to read_table(). The returned
# function takes the split fields of one data line (chardata) and returns the
# position of its second field, chardata[1], within the name list 'enum'.
def make_get_names(enum):
return lambda chardata: enum.index(chardata[1])
@ -141,9 +143,21 @@ def get_other_case(chardata):
return 0
# Read the whole table in memory
# Read the whole table in memory, setting/checking the Unicode version
def read_table(file_name, get_value, default_value):
    global unicode_version

    # The base name of the data file (no directory, no ".txt") is expected to
    # reappear in the file's first line as "# <base>-<x.y.z>.txt". Both
    # re.match() calls return None when the text does not have that shape, in
    # which case the following .group() call fails.
    f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
    file_base = f.group(1)
    version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
    file = open(file_name, 'r', encoding='utf-8')
    f = re.match(version_pat, file.readline())
    version = f.group(1)

    # The first table read fixes the global Unicode version; a later table
    # with a different version only produces a warning on stderr.
    if unicode_version == "":
        unicode_version = version
    elif unicode_version != version:
        # Use the % operator: passing file_name as a separate print() argument
        # would output a literal "%s" followed by the name.
        print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr)

    # One entry per code point, pre-filled with the default value.
    table = [default_value] * MAX_UNICODE
    for line in file:
        # Drop trailing comments from each data line.
        line = re.sub(r'#.*', '', line)
@ -327,6 +341,7 @@ break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other' ]
test_record_size()
unicode_version = ""
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
@ -464,6 +479,8 @@ print("const uint16_t PRIV(ucd_stage2)[] = {0};")
print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
print("#else")
print()
print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version))
print()
print(record_struct)
# --- Added by PH: output the table of caseless character sets ---

View File

@ -257,8 +257,9 @@ must all be greater than zero. */
#define PCRE2_CONFIG_PARENSLIMIT 7
#define PCRE2_CONFIG_RECURSIONLIMIT 5
#define PCRE2_CONFIG_STACKRECURSE 8
#define PCRE2_CONFIG_UTF 9
#define PCRE2_CONFIG_VERSION 10
#define PCRE2_CONFIG_UNICODE_VERSION 9
#define PCRE2_CONFIG_UTF 10
#define PCRE2_CONFIG_VERSION 11
/* Types for code units in patterns and subject strings. */

View File

@ -143,6 +143,21 @@ switch (what)
#endif
break;
case PCRE2_CONFIG_UNICODE_VERSION:
  {
  /* Copy the Unicode version string (or a fixed message when UTF support is
  not compiled in) into the caller's buffer as zero-terminated code units.
  The yield is the number of code units copied, excluding the terminator, or
  PCRE2_ERROR_BADLENGTH if the buffer is too small. */
#if defined SUPPORT_UTF
  const char *v = PRIV(unicode_version);
#else
  const char *v = "Unicode not supported";
#endif
  PCRE2_UCHAR *t = (PCRE2_UCHAR *)where;

  /* A buffer of fewer than two code units must be rejected explicitly:
  with a capacity of zero, "BYTES2CU(length) - 1" wraps round to a huge
  value in the unsigned comparison below, the test passes, and the copy
  would overrun the buffer. For all other sizes the original margin is
  retained. */
  if (BYTES2CU(length) < 2 || strlen(v) >= BYTES2CU(length) - 1)
    return PCRE2_ERROR_BADLENGTH;

  while (*v != 0) *t++ = *v++;
  *t = 0;
  return t - (PCRE2_UCHAR *)where;
  }
break;
case PCRE2_CONFIG_UTF:
#if defined SUPPORT_UTF
*((int *)where) = 1;

View File

@ -1795,6 +1795,7 @@ extern const uint8_t PRIV(utf8_table4)[];
#define _pcre2_ucp_gbtable PCRE2_SUFFIX(_pcre2_ucp_gbtable_)
#define _pcre2_ucp_gentype PCRE2_SUFFIX(_pcre2_ucp_gentype_)
#define _pcre2_ucp_typerange PCRE2_SUFFIX(_pcre2_ucp_typerange_)
#define _pcre2_unicode_version PCRE2_SUFFIX(_pcre2_unicode_version_)
#define _pcre2_utt PCRE2_SUFFIX(_pcre2_utt_)
#define _pcre2_utt_names PCRE2_SUFFIX(_pcre2_utt_names_)
#define _pcre2_utt_size PCRE2_SUFFIX(_pcre2_utt_size_)
@ -1812,6 +1813,7 @@ extern const uint32_t PRIV(ucp_gentype)[];
#ifdef SUPPORT_JIT
extern const int PRIV(ucp_typerange)[];
#endif
extern const char *PRIV(unicode_version);
extern const ucp_type_table PRIV(utt)[];
extern const char PRIV(utt_names)[];
extern const size_t PRIV(utt_size);

View File

@ -39,6 +39,8 @@ const uint16_t PRIV(ucd_stage2)[] = {0};
const uint32_t PRIV(ucd_caseless_sets)[] = {0};
#else
const char *PRIV(unicode_version) = "7.0.0";
/* When recompiling tables with a new Unicode version, please check the
types in this structure definition from pcre2_internal.h (the actual
field names will be different):

View File

@ -164,8 +164,8 @@ void vms_setsymbol( char *, char *, int );
#define CFAIL_UNSET UINT32_MAX /* Unset value for cfail fields */
#define DFA_WS_DIMENSION 1000 /* Size of DFA workspace */
#define DEFAULT_OVECCOUNT 15 /* Default ovector count */
#define LOOPREPEAT 500000 /* Default loop count for timing. */
#define VERSION_SIZE 64 /* Size of buffer for the version string. */
#define LOOPREPEAT 500000 /* Default loop count for timing */
#define VERSION_SIZE 64 /* Size of buffer for the version strings */
/* Execution modes */
@ -615,6 +615,7 @@ static uint32_t max_oveccount;
static uint32_t callout_count;
static VERSION_TYPE version[VERSION_SIZE];
static VERSION_TYPE uversion[VERSION_SIZE];
static patctl def_patctl;
static patctl pat_patctl;
@ -5220,7 +5221,10 @@ printf(" 32-bit support\n");
#endif
(void)PCRE2_CONFIG(PCRE2_CONFIG_UTF, &rc, sizeof(rc));
printf (" %sUTF support\n", rc ? "" : "No ");
if (rc != 0)
printf(" UTF support (Unicode version %s)\n", uversion);
else
printf(" No UTF support\n");
(void)PCRE2_CONFIG(PCRE2_CONFIG_JIT, &rc, sizeof(rc));
if (rc != 0)
{
@ -5289,9 +5293,11 @@ if (PO(options) != DO(options) || PO(control) != DO(control))
return 1;
}
/* Get the PCRE version number information. */
/* Get the PCRE2 and Unicode version number information. */
PCRE2_CONFIG(PCRE2_CONFIG_VERSION, version, sizeof(VERSION_TYPE)*VERSION_SIZE);
PCRE2_CONFIG(PCRE2_CONFIG_UNICODE_VERSION, uversion,
sizeof(VERSION_TYPE)*VERSION_SIZE);
/* Get buffers from malloc() so that valgrind will check their misuse when
debugging. They grow automatically when very long lines are read. The 16-