pcre2/maint/ucptest.c

/***************************************************
* A program for testing the Unicode property table *
***************************************************/

/* Copyright (c) University of Cambridge 2008-2021 */

/* Compile thus:

   gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
     ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c

   Add -lreadline or -ledit if PCRE2 was configured with readline or libedit
   support in pcre2test.
*/

/* This is a hacked-up program for testing the Unicode properties tables of
PCRE2. It can also be used for finding characters with certain properties.
I wrote it to help with debugging PCRE, and have added things that I found
useful, in a rather haphazard way. The code has never been seriously tidied or
checked for robustness, but it shouldn't now give compiler warnings.

There is only one option: "-s". If given, it applies only to the "findprop"
command. It causes the UTF-8 sequence of bytes that encode the character to be
output between angle brackets at the end of the line. On a UTF-8 terminal, this
will show the appropriate graphic for the code point.

If the command has arguments, they are concatenated into a buffer, separated by
spaces. If the first argument starts "U+" or consists entirely of hexadecimal
digits, "findprop" is inserted at the start. The buffer is then processed as a
single line file, after which the program exits. If there are no arguments, the
program reads commands line by line on stdin and writes output to stdout. The
return code is always zero.

There are three commands:

"findprop" must be followed by a space-separated list of Unicode code points as
hex numbers, either without any prefix or starting with "U+", or as individual
UTF-8 characters preceded by '+'. For example:

  findprop U+1234 5Abc +?

The output is one line per character, giving its Unicode properties followed by
its other case or cases if one or more exist, followed by its Script Extension
list if it is not just the same as the base script. This list is in square
brackets. The properties are:

Bidi control        shown as '*' if true
Bidi class          e.g. NSM (most common is L)
General type        e.g. Letter
Specific type       e.g. Upper case letter
Script              e.g. Medefaidrin
Grapheme break type e.g. Extend (most common is Other)

The scripts names are all in lower case, with underscores removed, because
that's how they are stored for "loose" matching.

"find" must be followed by a list of property names and their values. The
values are case-sensitive, except for bidi class. This finds characters that
have those properties. If multiple properties are listed, they must all be
matched. Currently supported:

  script <name>    The character must have this script property. Only one
                     such script may be given.
  scriptx <name>   This script must be in the character's Script Extension
                     property list. If this is used many times, all the given
                     scripts must be present.
  type <abbrev>    The character's specific type (e.g. Lu or Nd) must match.
  gbreak <name>    The grapheme break property must match.
  bidi <class>     The character's bidi class must match.
  bidi_control     The character must be a bidi control character

If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
Script Extensions, there may be a mixture of positive and negative
requirements. All must be satisfied.

Sequences of two or more characters are shown as ranges, for example
U+0041..U+004A. No more than 100 lines are are output. If there are more
characters, the list ends with ...

"list" must be followed by one of property names script, type, gbreak or bidi.
The defined values for that property are listed. */


#ifdef HAVE_CONFIG_H
#include "../src/config.h"
#endif

#ifndef SUPPORT_UNICODE
#define SUPPORT_UNICODE
#endif

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../src/pcre2_internal.h"
#include "../src/pcre2_ucp.h"

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
#if defined(SUPPORT_LIBREADLINE)
#include <readline/readline.h>
#include <readline/history.h>
#else
#if defined(HAVE_EDITLINE_READLINE_H)
#include <editline/readline.h>
#else
#include <readline/readline.h>
#endif
#endif
#endif


/* -------------------------------------------------------------------*/

#define CS   (char *)
#define CCS  (const char *)
#define CSS  (char **)
#define US   (unsigned char *)
#define CUS  (const unsigned char *)
#define USS  (unsigned char **)

/* -------------------------------------------------------------------*/

static BOOL show_character = FALSE;

static const unsigned char *type_names[] = {
  US"Cc", US"Control",
  US"Cf", US"Format",
  US"Cn", US"Unassigned",
  US"Co", US"Private use",
  US"Cs", US"Surrogate",
  US"Ll", US"Lower case letter",
  US"Lm", US"Modifier letter",
  US"Lo", US"Other letter",
  US"Lt", US"Title case letter",
  US"Lu", US"Upper case letter",
  US"Mc", US"Spacing mark",
  US"Me", US"Enclosing mark",
  US"Mn", US"Non-spacing mark",
  US"Nd", US"Decimal number",
  US"Nl", US"Letter number",
  US"No", US"Other number",
  US"Pc", US"Connector punctuation",
  US"Pd", US"Dash punctuation",
  US"Pe", US"Close punctuation",
  US"Pf", US"Final punctuation",
  US"Pi", US"Initial punctuation",
  US"Po", US"Other punctuation",
  US"Ps", US"Open punctuation",
  US"Sc", US"Currency symbol",
  US"Sk", US"Modifier symbol",
  US"Sm", US"Mathematical symbol",
  US"So", US"Other symbol",
  US"Zl", US"Line separator",
  US"Zp", US"Paragraph separator",
  US"Zs", US"Space separator"
};

static const unsigned char *gb_names[] = {
  US"CR",                    US"carriage return",
  US"LF",                    US"linefeed",
  US"Control",               US"",
  US"Extend",                US"",
  US"Prepend",               US"",
  US"SpacingMark",           US"",
  US"L",                     US"Hangul syllable type L",
  US"V",                     US"Hangul syllable type V",
  US"T",                     US"Hangul syllable type T",
  US"LV",                    US"Hangul syllable type LV",
  US"LVT",                   US"Hangul syllable type LVT",
  US"Regional_Indicator",    US"",
  US"Other",                 US"",
  US"ZWJ",                   US"zero width joiner",
  US"Extended_Pictographic", US""
};

static const unsigned char *bd_names[] = {
  US"AL",   US"Arabic letter",
  US"AN",   US"Arabid number",
  US"B",    US"Paragraph separator",
  US"BN",   US"Boundary neutral",
  US"CS",   US"Common separator",
  US"EN",   US"European number",
  US"ES",   US"European separator",
  US"ET",   US"European terminator",
  US"FSI",  US"First string isolate",
  US"L",    US"Left-to-right",
  US"LRE",  US"Left-to-right embedding",
  US"LRI",  US"Left-to-right isolate",
  US"LRO",  US"Left-to-right override",
  US"NSM",  US"Non-spacing mark",
  US"ON",   US"Other neutral",
  US"PDF",  US"Pop directional format",
  US"PDI",  US"Pop directional isolate",
  US"R",    US"Right-to-left",
  US"RLE",  US"Right-to-left embedding",
  US"RLI",  US"Right-to-left isolate",
  US"RLO",  US"Right-to-left override",
  US"S",    US"Segment separator",
  US"WS",   US"White space"
};

static const unsigned int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};

static const int utf8_table2[] = {
  0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

/* Macro to pick up the remaining bytes of a UTF-8 character, advancing
the pointer. */

#define GETUTF8INC(c, eptr) \
    { \
    if ((c & 0x20u) == 0) \
      c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \
    else if ((c & 0x10u) == 0) \
      { \
      c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \
      eptr += 2; \
      } \
    else if ((c & 0x08u) == 0) \
      { \
      c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \
          ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
      eptr += 3; \
      } \
    else if ((c & 0x04u) == 0) \
      { \
      c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \
          ((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \
          (eptr[3] & 0x3fu); \
      eptr += 4; \
      } \
    else \
      { \
      c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \
          ((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \
          ((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \
      eptr += 5; \
      } \
    }


/*************************************************
*       Convert character value to UTF-8         *
*************************************************/

/* This function takes an unsigned long integer value in the range 0 -
0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes.

Arguments:
  cvalue     the character value
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of bytes placed in the buffer
             0 if input code point is too big
*/

static size_t
ord2utf8(unsigned int cvalue, unsigned char *buffer)
{
size_t i, j;
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
  if (cvalue <= utf8_table1[i]) break;
if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
buffer += i;
for (j = i; j > 0; j--)
 {
 *buffer-- = 0x80 | (cvalue & 0x3f);
 cvalue >>= 6;
 }
*buffer = utf8_table2[i] | cvalue;
return i + 1;
}


/*************************************************
*             Test for interaction               *
*************************************************/

static BOOL
is_stdin_tty(void)
{
#if defined WIN32
return _isatty(_fileno(stdin));
#else
return isatty(fileno(stdin));
#endif
}


/*************************************************
*      Get script name from ucp ident            *
*************************************************/

/* The utt table contains both the full script names and the 4-letter
abbreviations. So search for both and use the longer if two are found, unless
the first one is only 3 characters (some scripts have 3-character names). If
this were not just a test program it might be worth making some kind of reverse
index. */

static const char *
get_scriptname(int script)
{
size_t i, j, len;
size_t foundlist[2];
const char *yield;

j = 0;
for (i = 0; i < PRIV(utt_size); i++)
  {
  const ucp_type_table *u = PRIV(utt) + i;
  if (u->type == PT_SCX && u->value == script)
    {
    foundlist[j++] = i;
    if (j >= 2) break;
    }
  }

if (j == 0) return "??";

yield = NULL;
len = 0;

for (i = 0; i < j; i++)
  {
  const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
  size_t sl = strlen(s);
  if (sl > len)
    {
    yield = s;
    if (sl == 3) break;
    len = sl;
    }
  }

return yield;
}


/*************************************************
*      Print Unicode property info for a char    *
*************************************************/

static void
print_prop(unsigned int c, BOOL is_just_one)
{
int type = UCD_CATEGORY(c);
int fulltype = UCD_CHARTYPE(c);
int script = UCD_SCRIPT(c);
int scriptx = UCD_SCRIPTX(c);
int gbprop = UCD_GRAPHBREAK(c);
int bidi = UCD_BIDICLASS(c);
int bidicontrol = UCD_BIDICONTROL(c);
unsigned int othercase = UCD_OTHERCASE(c);
int caseset = UCD_CASESET(c);

const unsigned char *fulltypename = US"??";
const unsigned char *typename = US"??";
const unsigned char *graphbreak = US"??";
const unsigned char *bidiclass = US"??";
const unsigned char *scriptname = CUS get_scriptname(script);

switch (type)
  {
  case ucp_C: typename = US"Control"; break;
  case ucp_L: typename = US"Letter"; break;
  case ucp_M: typename = US"Mark"; break;
  case ucp_N: typename = US"Number"; break;
  case ucp_P: typename = US"Punctuation"; break;
  case ucp_S: typename = US"Symbol"; break;
  case ucp_Z: typename = US"Separator"; break;
  }

switch (fulltype)
  {
  case ucp_Cc: fulltypename = US"Control"; break;
  case ucp_Cf: fulltypename = US"Format"; break;
  case ucp_Cn: fulltypename = US"Unassigned"; break;
  case ucp_Co: fulltypename = US"Private use"; break;
  case ucp_Cs: fulltypename = US"Surrogate"; break;
  case ucp_Ll: fulltypename = US"Lower case letter"; break;
  case ucp_Lm: fulltypename = US"Modifier letter"; break;
  case ucp_Lo: fulltypename = US"Other letter"; break;
  case ucp_Lt: fulltypename = US"Title case letter"; break;
  case ucp_Lu: fulltypename = US"Upper case letter"; break;
  case ucp_Mc: fulltypename = US"Spacing mark"; break;
  case ucp_Me: fulltypename = US"Enclosing mark"; break;
  case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
  case ucp_Nd: fulltypename = US"Decimal number"; break;
  case ucp_Nl: fulltypename = US"Letter number"; break;
  case ucp_No: fulltypename = US"Other number"; break;
  case ucp_Pc: fulltypename = US"Connector punctuation"; break;
  case ucp_Pd: fulltypename = US"Dash punctuation"; break;
  case ucp_Pe: fulltypename = US"Close punctuation"; break;
  case ucp_Pf: fulltypename = US"Final punctuation"; break;
  case ucp_Pi: fulltypename = US"Initial punctuation"; break;
  case ucp_Po: fulltypename = US"Other punctuation"; break;
  case ucp_Ps: fulltypename = US"Open punctuation"; break;
  case ucp_Sc: fulltypename = US"Currency symbol"; break;
  case ucp_Sk: fulltypename = US"Modifier symbol"; break;
  case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
  case ucp_So: fulltypename = US"Other symbol"; break;
  case ucp_Zl: fulltypename = US"Line separator"; break;
  case ucp_Zp: fulltypename = US"Paragraph separator"; break;
  case ucp_Zs: fulltypename = US"Space separator"; break;
  }

switch(gbprop)
  {
  case ucp_gbCR:           graphbreak = US"CR"; break;
  case ucp_gbLF:           graphbreak = US"LF"; break;
  case ucp_gbControl:      graphbreak = US"Control"; break;
  case ucp_gbExtend:       graphbreak = US"Extend"; break;
  case ucp_gbPrepend:      graphbreak = US"Prepend"; break;
  case ucp_gbSpacingMark:  graphbreak = US"SpacingMark"; break;
  case ucp_gbL:            graphbreak = US"Hangul syllable type L"; break;
  case ucp_gbV:            graphbreak = US"Hangul syllable type V"; break;
  case ucp_gbT:            graphbreak = US"Hangul syllable type T"; break;
  case ucp_gbLV:           graphbreak = US"Hangul syllable type LV"; break;
  case ucp_gbLVT:          graphbreak = US"Hangul syllable type LVT"; break;
  case ucp_gbRegional_Indicator:
                           graphbreak = US"Regional Indicator"; break;
  case ucp_gbOther:        graphbreak = US"Other"; break;
  case ucp_gbZWJ:          graphbreak = US"Zero Width Joiner"; break;
  case ucp_gbExtended_Pictographic:
                           graphbreak = US"Extended Pictographic"; break;
  default:                 graphbreak = US"Unknown"; break;
  }

switch(bidi)
  {
  case ucp_bidiAL:   bidiclass = US"AL "; break;
  case ucp_bidiFSI:  bidiclass = US"FSI"; break;
  case ucp_bidiL:    bidiclass = US"L  "; break;
  case ucp_bidiLRE:  bidiclass = US"LRE"; break;
  case ucp_bidiLRI:  bidiclass = US"LRI"; break;
  case ucp_bidiLRO:  bidiclass = US"LRO"; break;
  case ucp_bidiPDF:  bidiclass = US"PDF"; break;
  case ucp_bidiPDI:  bidiclass = US"PDI"; break;
  case ucp_bidiR:    bidiclass = US"R  "; break;
  case ucp_bidiRLE:  bidiclass = US"RLE"; break;
  case ucp_bidiRLI:  bidiclass = US"RLI"; break;
  case ucp_bidiRLO:  bidiclass = US"RLO"; break;
  case ucp_bidiAN:   bidiclass = US"AN "; break;
  case ucp_bidiB:    bidiclass = US"B  "; break;
  case ucp_bidiBN:   bidiclass = US"BN "; break;
  case ucp_bidiCS:   bidiclass = US"CS "; break;
  case ucp_bidiEN:   bidiclass = US"EN "; break;
  case ucp_bidiES:   bidiclass = US"ES "; break;
  case ucp_bidiET:   bidiclass = US"ET "; break;
  case ucp_bidiNSM:  bidiclass = US"NSM"; break;
  case ucp_bidiON:   bidiclass = US"ON "; break;
  case ucp_bidiS:    bidiclass = US"S  "; break;
  case ucp_bidiWS:   bidiclass = US"WS "; break;
  default:           bidiclass = US"???"; break;
  }

printf("U+%04X %c%s %s: %s, %s, %s", c, bidicontrol? '*':' ', bidiclass,
  typename, fulltypename, scriptname, graphbreak);

if (is_just_one && othercase != c)
  {
  printf(", U+%04X", othercase);
  if (caseset != 0)
    {
    const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
    while (*(++p) < NOTACHAR)
      {
      unsigned int d = *p;
      if (d != othercase && d != c) printf(", U+%04X", d);
      }
    }
  }

if (scriptx != script)
  {
  printf(", [");
  if (scriptx >= 0)
    printf("%s", get_scriptname(scriptx));
  else
    {
    const char *sep = "";


/*
    const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
    while (*p != 0)
      {
      printf("%s%s", sep, get_scriptname(*p++));
      sep = ", ";
      }
*/

    const uint32_t *p = PRIV(ucd_script_sets) - scriptx;
    for (int i = 0; i < ucp_Script_Count; i++)
      {
      int x = i/32;
      int y = i%32;

      if ((p[x] & (1u<<y)) != 0)
        {
        printf("%s%s", sep, get_scriptname(i));
        sep = ", ";
        }
      }

    }
  printf("]");
  }

if (show_character && is_just_one)
  {
  unsigned char buffer[8];
  size_t len = ord2utf8(c, buffer);
  printf(", >%.*s<", (int)len, buffer);
  }

printf("\n");
}


/*************************************************
*   Find character(s) with given property/ies    *
*************************************************/

static void
find_chars(unsigned char *s)
{
unsigned char name[24];
unsigned char value[24];
unsigned char *t;
unsigned int count= 0;
int scriptx_list[24];
unsigned int scriptx_count = 0;
uint32_t i, c;
int script = -1;
int type = -1;
int gbreak = -1;
int bidiclass = -1;
BOOL bidicontrol = FALSE;
BOOL script_not = FALSE;
BOOL type_not = FALSE;
BOOL gbreak_not = FALSE;
BOOL bidiclass_not = FALSE;
BOOL hadrange = FALSE;
const ucd_record *ucd, *next_ucd;
const char *pad = "        ";

while (*s != 0)
  {
  unsigned int offset = 0;
  BOOL scriptx_not = FALSE;
  char *value_start;

  for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
  *t = 0;
  while (isspace(*s)) s++;
  value_start = s;

  for (t = value; *s != 0 && !isspace(*s); s++) *t++ = *s;
  *t = 0;
  while (isspace(*s)) s++;

  if (strcmp(CS name, "script") == 0 ||
      strcmp(CS name, "scriptx") == 0)
    {
    if (value[0] == '!')
      {
      if (name[6] == 'x') scriptx_not = TRUE;
        else script_not = TRUE;
      offset = 1;
      }

    for (i = 0; i < PRIV(utt_size); i++)
      {
      const ucp_type_table *u = PRIV(utt) + i;
      if (u->type == PT_SCX && strcmp(CS(value + offset),
            PRIV(utt_names) + u->name_offset) == 0)
        {
        c = u->value;
        if (name[6] == 'x')
          {
          scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
          }
        else
          {
          if (script < 0) script = c; else
            {
            printf("** Only 1 script value allowed\n");
            return;
            }
          }
        break;
        }
      }

    if (i >= PRIV(utt_size))
      {
      printf("** Unrecognized script name \"%s\"\n", value);
      return;
      }
    }

  else if (strcmp(CS name, "type") == 0)
    {
    if (type >= 0)
      {
      printf("** Only 1 type value allowed\n");
      return;
      }
    else
      {
      if (value[0] == '!')
        {
        type_not = TRUE;
        offset = 1;
        }

      for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
        {
        if (strcmp(CS (value + offset), CS type_names[i]) == 0)
          {
          type = i/2;
          break;
          }
        }
      if (i >= sizeof(type_names)/sizeof(char *))
        {
        printf("** Unrecognized type name \"%s\"\n", value);
        return;
        }
      }
    }

  else if (strcmp(CS name, "gbreak") == 0)
    {
    if (gbreak >= 0)
      {
      printf("** Only 1 grapheme break value allowed\n");
      return;
      }
    else
      {
      if (value[0] == '!')
        {
        gbreak_not = TRUE;
        offset = 1;
        }

      for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
        {
        if (strcmp(CS (value + offset), CS gb_names[i]) == 0)
          {
          gbreak = i/2;
          break;
          }
        }
      if (i >= sizeof(gb_names)/sizeof(char *))
        {
        printf("** Unrecognized gbreak name \"%s\"\n", value);
        return;
        }
      }
    }

  else if (strcmp(CS name, "bidi") == 0 ||
           strcmp(CS name, "bidiclass") == 0 ||
           strcmp(CS name, "bidi_class") == 0 )
    {
    if (bidiclass >= 0)
      {
      printf("** Only 1 bidi class value allowed\n");
      return;
      }
    else
      {
      if (value[0] == '!')
        {
        bidiclass_not = TRUE;
        offset = 1;
        }
      for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
        {
        if (strcasecmp(CS (value + offset), CS bd_names[i]) == 0)
          {
          bidiclass = i/2;
          break;
          }
        }
      if (i >= sizeof(bd_names)/sizeof(char *))
        {
        printf("** Unrecognized bidi class name \"%s\"\n", value);
        return;
        }
      }
    }

  else if (strcmp(CS name, "bidi_control") == 0 ||
           strcmp(CS name, "bidicontrol") == 0)
    {
    bidicontrol = TRUE;
    s = value_start;     /* No data */
    }

  else
    {
    printf("** Unrecognized property name \"%s\"\n", name);
    return;
    }
  }

if (script < 0 && scriptx_count == 0 && type < 0 && gbreak < 0 &&
    bidiclass < 0 && !bidicontrol)
  {
  printf("** No properties specified\n");
  return;
  }

for (c = 0; c <= 0x10ffff; c++)
  {
  if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;

  if (scriptx_count > 0)
    {
    const uint32_t *bits_scriptx = NULL;
    unsigned int found = 0;
    int scriptx = UCD_SCRIPTX(c);

    if (scriptx < 0) bits_scriptx = PRIV(ucd_script_sets) - scriptx;

    for (i = 0; i < scriptx_count; i++)
      {
      /* Positive requirment */
      if (scriptx_list[i] >= 0)
        {
        if (scriptx >= 0)
          {
          if (scriptx == scriptx_list[i]) found++;
          }

        else
          {
          int x = scriptx_list[i]/32;
          int y = scriptx_list[i]%32;
          if ((bits_scriptx[x] & (1u<<y)) != 0) found++;
          }
        }
      /* Negative requirement */
      else
        {
        if (scriptx >= 0)
          {
          if (scriptx != -scriptx_list[i]) found++;
          }
        else
          {
          int x = scriptx_list[i]/32;
          int y = scriptx_list[i]%32;
          if ((bits_scriptx[x] & (1u<<y)) == 0) found++;
          }
        }
      }

    if (found != scriptx_count) continue;
    }

  if (type >= 0)
    {
    if (type_not)
      {
      if (type == UCD_CHARTYPE(c)) continue;
      }
    else
      {
      if (type != UCD_CHARTYPE(c)) continue;
      }
    }

  if (gbreak >= 0)
    {
    if (gbreak_not)
      {
      if (gbreak == UCD_GRAPHBREAK(c)) continue;
      }
    else
      {
      if (gbreak != UCD_GRAPHBREAK(c)) continue;
      }
    }

  if (bidiclass >= 0)
    {
    if (bidiclass_not)
      {
      if (bidiclass == UCD_BIDICLASS(c)) continue;
      }
    else
      {
      if (bidiclass != UCD_BIDICLASS(c)) continue;
      }
    }

  if (bidicontrol && UCD_BIDICONTROL(c) == 0) continue;

  /* All conditions are met. Look for runs. */

  ucd = GET_UCD(c);

  for (i = c + 1; i < 0x10ffff; i++)
    {
    next_ucd = GET_UCD(i);
    if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
    }

  if (--i > c)
    {
    printf("U+%04X..", c);
    c = i;
    hadrange = TRUE;
    }
  else if (hadrange) printf("%s", pad);

  print_prop(c, FALSE);
  if (c >= 0x100000) pad = "        ";
    else if (c >= 0x10000) pad = "       ";
  count++;
  if (count >= 100)
    {
    printf("...\n");
    break;
    }
  }

if (count == 0) printf("No characters found\n");
}


/*************************************************
*        Process command line                    *
*************************************************/

static void
process_command_line(unsigned char *buffer)
{
unsigned char *s, *t;
unsigned char name[24];

s = buffer;
while (isspace(*s)) s++;
if (*s == 0) return;

for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
*t = 0;
while (isspace(*s)) s++;

if (strcmp(CS name, "findprop") == 0)
  {
  while (*s != 0)
    {
    unsigned int c;
    unsigned char *endptr;
    t = s;

    if (*t == '+')
      {
      c = *(++t);
      if (c > 0x7fu)
        {
        GETCHARINC(c, t);
        }
      endptr = t+1;
      }
    else
      {
      if (strncmp(CS t, "U+", 2) == 0) t += 2;
      c = strtoul(CS t, CSS(&endptr), 16);
      }

    if (*endptr != 0 && !isspace(*endptr))
      {
      while (*endptr != 0 && !isspace(*endptr)) endptr++;
      printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
      }
    else
      {
      if (c > 0x10ffff)
        printf("** U+%x is too big for a Unicode code point\n", c);
      else
        print_prop(c, TRUE);
      }
    s = endptr;
    while (isspace(*s)) s++;
    }
  }

else if (strcmp(CS name, "find") == 0)
  {
  find_chars(s);
  }

else if (strcmp(CS name, "list") == 0)
  {
  while (*s != 0)
    {
    size_t i;
    for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
    *t = 0;
    while (isspace(*s)) s++;

    if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
      {
      for (i = 0; i < PRIV(utt_size); i++)
        if (PRIV(utt)[i].type == PT_SCX)
          printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
      }

    else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
      {
      for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
        printf("%s %s\n", type_names[i], type_names[i+1]);
      }

    else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
      {
      for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
        {
        if (gb_names[i+1][0] != 0)
          printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
        else
          printf("%s\n", gb_names[i]);
        }
      }

    else if (strcmp(CS name, "bidi") == 0 ||
             strcmp(CS name, "bidiclasses") == 0)
      {
      for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
        printf("%3s %s\n", bd_names[i], bd_names[i+1]);
      }

    else
      {
      printf("** Unknown property \"%s\"\n", name);
      break;
      }
    }
  }

else printf("** Unknown test command \"%s\"\n", name);
}


/*************************************************
*               Main program                     *
*************************************************/

int
main(int argc, char **argv)
{
BOOL interactive;
int first_arg = 1;
unsigned char buffer[1024];

if (argc > 1 && strcmp(argv[1], "-s") == 0)
  {
  show_character = TRUE;
  first_arg++;
  }

if (argc > first_arg)
  {
  int i;
  BOOL datafirst = TRUE;
  char *arg = argv[first_arg];
  unsigned char *s = buffer;

  if (*arg != '+' && strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
    {
    while (*arg != 0)
      {
      if (!isxdigit(*arg++)) { datafirst = FALSE; break; }
      }
    }

  if (datafirst)
    {
    strcpy(CS s, "findprop ");
    s += 9;
    }

  for (i = first_arg; i < argc; i++)
    {
    s += sprintf(CS s, "%s ", argv[i]);
    }

  process_command_line(buffer);
  return 0;
  }

interactive = is_stdin_tty();

#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
if (interactive) using_history();
#endif

for(;;)
  {
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
  if (interactive)
    {
    size_t len;
    unsigned char *s = US readline("> ");
    if (s == NULL) break;
    len = strlen(CS s);
    if (len > 0) add_history(CS s);
    memcpy(buffer, s, len);
    buffer[len] = '\n';
    buffer[len+1] = 0;
    free(s);
    }
  else
#endif

    {
    if (interactive) printf("> ");
    if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
    if (!interactive) printf("%s", buffer);
    }

  process_command_line(buffer);
  }

if (interactive) printf("\n");

#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
if (interactive) clear_history();
#endif

return 0;
}

/* End */