From bf35c0518ca0d62ddb647758f777e95b5c6aec22 Mon Sep 17 00:00:00 2001 From: Philip Hazel Date: Wed, 12 Jan 2022 15:01:14 +0000 Subject: [PATCH] Add -LP and -LS (list properties, list scripts) features to pcre2test. --- ChangeLog | 5 +- doc/pcre2test.1 | 18 +++- src/pcre2test.c | 214 ++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 214 insertions(+), 23 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8892962..118f586 100644 --- a/ChangeLog +++ b/ChangeLog @@ -39,7 +39,8 @@ pcre2_substitute(), and the replacement argument of the latter, if the pointer is NULL and the length is zero, treat as an empty string. Apparently a number of applications treat NULL/0 in this way. -14. Added support for Bidi_Class and Bidi_Control Unicode properties. +14. Added support for Bidi_Class and a number of binary Unicode properties, +including Bidi_Control. 15. Fix some minor issues raised by clang sanitize. @@ -76,6 +77,8 @@ misaligned the frame that follows, resulting in an alignment fault when storing a pointer to Fecode at the start of match. Patch to fix this issue by Jessica Clarke PR#72. +20. Added -LP and -LS listing options to pcre2test. + Version 10.39 29-October-2021 ----------------------------- diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index 9707f88..1835193 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "28 November 2021" "PCRE 10.40" +.TH PCRE2TEST 1 "12 January 2022" "PCRE 10.40" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -211,7 +211,17 @@ available, and the use of JIT for matching is verified. \fB-LM\fP List modifiers: write a list of available pattern and subject modifiers to the standard output, then exit with zero exit code. All other options are ignored. -If both -C and -LM are present, whichever is first is recognized. +If both -C and any -Lx options are present, whichever is first is recognized. 
+.TP 10 +\fB-LP\fP +List properties: write a list of recognized Unicode properties to the standard +output, then exit with zero exit code. All other options are ignored. If both +-C and any -Lx options are present, whichever is first is recognized. +.TP 10 +\fB-LS\fP +List scripts: write a list of recognized Unicode script names to the standard +output, then exit with zero exit code. All other options are ignored. If both +-C and any -Lx options are present, whichever is first is recognized. .TP 10 \fB-pattern\fP \fImodifier-list\fP Behave as if each pattern line contains the given modifiers. @@ -2109,6 +2119,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 28 November 2021 -Copyright (c) 1997-2021 University of Cambridge. +Last updated: 12 January 2022 +Copyright (c) 1997-2022 University of Cambridge. .fi diff --git a/src/pcre2test.c b/src/pcre2test.c index 0a19c20..8e712c3 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -5490,7 +5490,7 @@ if ((pat_patctl.control & CTL_POSIX) != 0) if ((pat_patctl.options & ~POSIX_SUPPORTED_COMPILE_OPTIONS) != 0) { show_compile_options( - pat_patctl.options & (uint32_t)(~POSIX_SUPPORTED_COMPILE_OPTIONS), + pat_patctl.options & (uint32_t)(~POSIX_SUPPORTED_COMPILE_OPTIONS), msg, ""); msg = ""; } @@ -5499,7 +5499,7 @@ if ((pat_patctl.control & CTL_POSIX) != 0) (uint32_t)(~POSIX_SUPPORTED_COMPILE_EXTRA_OPTIONS)) != 0) { show_compile_extra_options( - FLD(pat_context, extra_options) & + FLD(pat_context, extra_options) & (uint32_t)(~POSIX_SUPPORTED_COMPILE_EXTRA_OPTIONS), msg, ""); msg = ""; } @@ -5509,7 +5509,7 @@ if ((pat_patctl.control & CTL_POSIX) != 0) { show_controls( pat_patctl.control & (uint32_t)(~POSIX_SUPPORTED_COMPILE_CONTROLS), - pat_patctl.control2 & (uint32_t)(~POSIX_SUPPORTED_COMPILE_CONTROLS2), + pat_patctl.control2 & (uint32_t)(~POSIX_SUPPORTED_COMPILE_CONTROLS2), msg); msg = ""; } @@ -7310,7 +7310,7 @@ if (dat_datctl.replacement[0] != 0) uint8_t *pr; uint8_t rbuffer[REPLACE_BUFFSIZE]; uint8_t 
nbuffer[REPLACE_BUFFSIZE]; - uint8_t *rbptr; + uint8_t *rbptr; uint32_t xoptions; uint32_t emoption; /* External match option */ PCRE2_SIZE j, rlen, nsize, erroroffset; @@ -7460,10 +7460,10 @@ if (dat_datctl.replacement[0] != 0) { PCRE2_SET_SUBSTITUTE_CALLOUT(dat_context, NULL, NULL); /* No callout */ } - + /* There is a special option to set the replacement to NULL in order to test that case. */ - + rbptr = ((dat_datctl.control2 & CTL2_NULL_REPLACEMENT) == 0)? rbuffer : NULL; PCRE2_SUBSTITUTE(rc, compiled_code, pp, arg_ulen, dat_datctl.offset, @@ -7655,15 +7655,15 @@ for (gmatched = 0;; gmatched++) } /* The result of the match is now in capcount. First handle a successful - match. If pp was forced to be NULL (to test NULL handling) it will have been - treated as an empty string if the length was zero. So re-create that for + match. If pp was forced to be NULL (to test NULL handling) it will have been + treated as an empty string if the length was zero. So re-create that for outputting. 
*/ if (capcount >= 0) { int i; - - if (pp == NULL) pp = (uint8_t *)""; + + if (pp == NULL) pp = (uint8_t *)""; if (capcount > (int)oveccount) /* Check for lunatic return value */ { @@ -8251,6 +8251,8 @@ printf(" -jit set default pattern modifier 'jit'\n"); printf(" -jitfast set default pattern modifier 'jitfast'\n"); printf(" -jitverify set default pattern modifier 'jitverify'\n"); printf(" -LM list pattern and subject modifiers, then exit\n"); +printf(" -LP list non-script properties, then exit\n"); +printf(" -LS list supported scripts, then exit\n"); printf(" -q quiet: do not output PCRE2 version number at start\n"); printf(" -pattern set default pattern modifier fields\n"); printf(" -subject set default subject modifier fields\n"); @@ -8431,6 +8433,166 @@ return 0; } +/************************************************* +* Format one property/script list item * +*************************************************/ + +#ifdef SUPPORT_UNICODE +static void +format_list_item(int16_t *ff, char *buff, BOOL isscript) +{ +int count; +int maxi = 0; +const char *maxs = ""; +size_t max = 0; + +for (count = 0; ff[count] >= 0; count++) {} + +/* Find the name to put first. For scripts, any 3-character name is chosen. +For non-scripts, or if there is no 3-character name, take the longest. 
*/ + +for (int i = 0; ff[i] >= 0; i++) + { + const char *s = PRIV(utt_names) + ff[i]; + size_t len = strlen(s); + if (isscript && len == 3) + { + maxi = i; + max = len; + maxs = s; + break; + } + else if (len > max) + { + max = len; + maxi = i; + maxs = s; + } + } + +strcpy(buff, maxs); +buff += max; + +if (count > 1) + { + const char *sep = " ("; + for (int i = 0; i < count; i++) + { + if (i == maxi) continue; + buff += sprintf(buff, "%s%s", sep, PRIV(utt_names) + ff[i]); + sep = ", "; + } + (void)sprintf(buff, ")"); + } +} +#endif /* SUPPORT_UNICODE */ + + + +/************************************************* +* Display scripts or properties * +*************************************************/ + +#define MAX_SYNONYMS 5 + +static void +display_properties(BOOL wantscripts) +{ +#ifndef SUPPORT_UNICODE +printf("** This version of PCRE2 was compiled without Unicode support.\n"); +#else + +const char *typename; +uint16_t seentypes[1024]; +uint16_t seenvalues[1024]; +int seencount = 0; +int16_t found[256][MAX_SYNONYMS + 1]; +int fc = 0; +int colwidth = 40; +int n; + +if (wantscripts) + { + n = ucp_Script_Count; + typename = "SCRIPTS"; + } +else + { + n = ucp_Bprop_Count; + typename = "PROPERTIES"; + } + +for (size_t i = 0; i < PRIV(utt_size); i++) + { + int k; + int m = 0; + int16_t *fv; + const ucp_type_table *t = PRIV(utt) + i; + unsigned int value = t->value; + + if (wantscripts) + { + if (t->type != PT_SC && t->type != PT_SCX) continue; + } + else + { + if (t->type != PT_BOOL) continue; + } + + for (k = 0; k < seencount; k++) + { + if (t->type == seentypes[k] && t->value == seenvalues[k]) break; + } + if (k < seencount) continue; + + seentypes[seencount] = t->type; + seenvalues[seencount++] = t->value; + + fv = found[fc++]; + fv[m++] = t->name_offset; + + for (size_t j = i + 1; j < PRIV(utt_size); j++) + { + const ucp_type_table *tt = PRIV(utt) + j; + if (tt->type != t->type || tt->value != value) continue; + if (m >= MAX_SYNONYMS) + printf("** Too many synonyms: 
%s ignored\n", + PRIV(utt_names) + tt->name_offset); + else fv[m++] = tt->name_offset; + } + + fv[m] = -1; + } + +printf("-------------------------- SUPPORTED %s --------------------------\n\n", + typename); + +if (!wantscripts) printf( +"This release of PCRE2 supports Unicode's general category properties such\n" +"as Lu (upper case letter), bi-directional properties such as Bidi_Class,\n" +"and the following binary (yes/no) properties:\n\n"); + + +for (int k = 0; k < (n+1)/2; k++) + { + int x; + char buff1[128]; + char buff2[128]; + + format_list_item(found[k], buff1, wantscripts); + x = k + (n+1)/2; + if (x < n) format_list_item(found[x], buff2, wantscripts); + else buff2[0] = 0; + + x = printf("%s", buff1); + while (x++ < colwidth) printf(" "); + printf("%s\n", buff2); + } + +#endif /* SUPPORT_UNICODE */ +} + + /************************************************* * Display one modifier * @@ -8445,8 +8607,8 @@ printf("%c%s", c, m->name); for (size_t i = 0; i < C1MODLISTCOUNT; i++) { if (strcmp(m->name, c1modlist[i].fullname) == 0) - printf(" (%c)", c1modlist[i].onechar); - } + printf(" (%c)", c1modlist[i].onechar); + } } @@ -8505,19 +8667,19 @@ for (i = 0; i < MODLISTCOUNT; i++) break; } - if (for_pattern == is_pattern) + if (for_pattern == is_pattern) { - extra[n] = 0; + extra[n] = 0; for (size_t k = 0; k < C1MODLISTCOUNT; k++) { if (strcmp(m->name, c1modlist[k].fullname) == 0) { - extra[n] += 4; + extra[n] += 4; break; - } - } + } + } list[n++] = i; - } + } } /* Now print from the list in two columns. */ @@ -8672,6 +8834,22 @@ while (argc > 1 && argv[op][0] == '-' && argv[op][1] != 0) goto EXIT; } + /* List properties and exit */ + + if (strcmp(arg, "-LP") == 0) + { + display_properties(FALSE); + goto EXIT; + } + + /* List scripts and exit */ + + if (strcmp(arg, "-LS") == 0) + { + display_properties(TRUE); + goto EXIT; + } + /* Display and/or set return code for configuration options. */ if (strcmp(arg, "-C") == 0)