Basic "script run" implementation. Not yet complete, and not yet documented.

2018-10-02 15:25:58 +00:00 · 2018-10-02 15:25:58 +00:00 · 866750fd53
parent f26b0b0bae
commit 866750fd53
30 changed files with 1787 additions and 1012 deletions
--- a/4
+++ b/4
@ -30,6 +30,10 @@ new "is lower case letter" bit. At the same time, the now unused "is
 hexadecimal digit" bit was removed. The default tables in
 src/pcre2_chartables.c.dist are updated.

+8. Implement the new Perl "script run" features (*script_run:...) and 
+(*atomic_script_run:...) aka (*sr:...) and (*asr:...). At present, this is 
+incomplete and not yet documented.
+

 Version 10.32 10-September-2018
 -------------------------------
--- a/Makefile.am
+++ b/Makefile.am
@ -364,6 +364,7 @@ COMMON_SOURCES = \
  src/pcre2_newline.c \
  src/pcre2_ord2utf.c \
  src/pcre2_pattern_info.c \
+  src/pcre2_script_run.c \
  src/pcre2_serialize.c \
  src/pcre2_string_utils.c \
  src/pcre2_study.c \
--- a/1
+++ b/1
@ -104,6 +104,7 @@ can skip ahead to the CMake section.
       pcre2_newline.c
       pcre2_ord2utf.c
       pcre2_pattern_info.c
+       pcre2_script_run.c 
       pcre2_serialize.c
       pcre2_string_utils.c
       pcre2_study.c
--- a/1
+++ b/1
@ -788,6 +788,7 @@ The distribution should contain the files listed below.
  src/pcre2_newline.c      )
  src/pcre2_ord2utf.c      )
  src/pcre2_pattern_info.c )
+  src/pcre2_script_run.c   ) 
  src/pcre2_serialize.c    )
  src/pcre2_string_utils.c )
  src/pcre2_study.c        )
--- a/doc/html/NON-AUTOTOOLS-BUILD.txt
+++ b/doc/html/NON-AUTOTOOLS-BUILD.txt
@ -104,6 +104,7 @@ can skip ahead to the CMake section.
       pcre2_newline.c
       pcre2_ord2utf.c
       pcre2_pattern_info.c
+       pcre2_script_run.c 
       pcre2_serialize.c
       pcre2_string_utils.c
       pcre2_study.c
--- a/doc/html/README.txt
+++ b/doc/html/README.txt
@ -788,6 +788,7 @@ The distribution should contain the files listed below.
  src/pcre2_newline.c      )
  src/pcre2_ord2utf.c      )
  src/pcre2_pattern_info.c )
+  src/pcre2_script_run.c   ) 
  src/pcre2_serialize.c    )
  src/pcre2_string_utils.c )
  src/pcre2_study.c        )
--- a/maint/GenerateUtt.py
+++ b/maint/GenerateUtt.py
@ -25,8 +25,9 @@
 # Added script names for Unicode 8.0.0, 19-June-2015.
 # Added script names for Unicode 10.0.0, 02-July-2017.
 # Added script names for Unicode 11.0.0, 03-July-2018.
+# Added 'Unknown' script, 01-October-2018.

-script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
+script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
--- a/maint/MultiStage2.py
+++ b/maint/MultiStage2.py
@ -143,6 +143,7 @@
 # 03-July-2018:      Updated for Unicode 11.0.0
 # 07-July-2018:      Added code to scan emoji-data.txt for the Extended
 #                      Pictographic property.
+# 01-October-2018:   Added the 'Unknown' script name
 ##############################################################################


@ -300,7 +301,7 @@ def get_record_size_struct(records):
        slice_type, slice_size = get_type_size(record_slice)
        size = (size + slice_size - 1) & -slice_size

-        structure += '} ucd_record;\n*/\n\n'
+        structure += '} ucd_record;\n*/\n'
        return size, structure

 def test_record_size():
@ -329,7 +330,7 @@ def print_records(records, record_size):
                print(('  {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
        print('};\n')

-script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
+script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
@ -380,7 +381,7 @@ break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
 test_record_size()
 unicode_version = ""

-script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
+script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
 category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
 break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
 other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
@ -553,11 +554,11 @@ print("special record. */")
 print()
 print("#if PCRE2_CODE_UNIT_WIDTH == 32")
 print("const ucd_record PRIV(dummy_ucd_record)[] = {{")
-print("  ucp_Common,    /* script */")
-print("  ucp_Cn,        /* type unassigned */")
-print("  ucp_gbOther,   /* grapheme break property */")
-print("  0,             /* case set */")
-print("  0,             /* other case */")
+print("  ucp_Unknown,    /* script */")
+print("  ucp_Cn,         /* type unassigned */")
+print("  ucp_gbOther,    /* grapheme break property */")
+print("  0,              /* case set */")
+print("  0,              /* other case */")
 print("  }};")
 print("#endif")
 print()
@ -565,6 +566,9 @@ print(record_struct)

 # --- Added by PH: output the table of caseless character sets ---

+print("/* This table contains lists of characters that are caseless sets of")
+print("more than one character. Each list is terminated by NOTACHAR. */\n")
+
 print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
 print("  NOTACHAR,")
 for s in sets:
@ -577,10 +581,53 @@ print()

 # ------

-print("/* When #included in pcre2test, we don't need this large table. */")
+print("/* When #included in pcre2test, we don't need the table of digit")
+print("sets, nor the the large main UCD tables. */")
 print()
 print("#ifndef PCRE2_PCRE2TEST")
 print()
+
+# --- Added by PH: read Scripts.txt again for the sets of 10 digits. ---
+
+digitsets = []
+file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')
+
+for line in file:
+  m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
+  if m is None:
+    continue
+  first = int(m.group(1),16)   
+  last  = int(m.group(2),16)   
+  if ((last - first + 1) % 10) != 0:
+    print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
+      file=sys.stderr) 
+  while first < last:
+    digitsets.append(first + 9)
+    first += 10
+file.close()
+digitsets.sort()
+
+print("/* This table lists the code points for the '9' characters in each")
+print("set of decimal digits. It is used to ensure that all the digits in")
+print("a script run come from the same set. */")
+print()
+print("const uint32_t PRIV(ucd_digit_sets)[] = {")
+
+print("  %d,  /* Number of subsequent values */" % len(digitsets), end='')
+count = 8
+for d in digitsets:
+  if count == 8:
+    print("\n ", end='')
+    count = 0
+  print(" 0x%05x," % d, end='')
+  count += 1
+print("\n};")
+print()
+
+# Output the main UCD tables.
+
+print("/* These are the main two-stage UCD tables. */\n")
+
 print_records(records, record_size)
 print_table(min_stage1, 'PRIV(ucd_stage1)')
 print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
@ -591,6 +638,10 @@ print("#endif  /* SUPPORT_UNICODE */")
 print()
 print("#endif  /* PCRE2_PCRE2TEST */")

+
+# This code was part of the original contribution, but is commented out as it
+# was never used. A two-stage table has sufficed.
+
 """

 # Three-stage tables:
--- a/maint/ucptest.c
+++ b/maint/ucptest.c
@ -134,6 +134,7 @@ switch(gbprop)

 switch(script)
  {
+  case ucp_Unknown:     scriptname = US"Unknown"; break; 
  case ucp_Arabic:      scriptname = US"Arabic"; break;
  case ucp_Armenian:    scriptname = US"Armenian"; break;
  case ucp_Balinese:    scriptname = US"Balinese"; break;
--- a/perltest.sh
+++ b/perltest.sh
@ -1,8 +1,10 @@
 #! /bin/sh

 # Script for testing regular expressions with perl to check that PCRE2 handles
-# them the same. If the first argument to this script is "-w", Perl is also
-# called with "-w", which turns on its warning mode.
+# them the same. For testing with different versions of Perl, if the first
+# argument is -perl then the second is taken as the Perl command to use, and
+# both are then removed. If the next argument is "-w", Perl is called with
+# "-w", which turns on its warning mode.
 #
 # The Perl code has to have "use utf8" and "require Encode" at the start when
 # running UTF-8 tests, but *not* for non-utf8 tests. (The "require" would
@ -10,8 +12,8 @@
 # the script will always run for these tests.)
 #
 # The desired effect is achieved by making this a shell script that passes the
-# Perl script to Perl through a pipe. If the first argument (possibly after
-# removing "-w") is "-utf8", a suitable prefix is set up.
+# Perl script to Perl through a pipe. If the next argument is "-utf8", a
+# suitable prefix is set up.
 #
 # The remaining arguments, if any, are passed to Perl. They are an input file
 # and an output file. If there is one argument, the output is written to
@ -23,6 +25,12 @@ perl=perl
 perlarg=''
 prefix=''

+if [ $# -gt 1 -a "$1" = "-perl" ] ; then
+  shift
+  perl=$1
+  shift
+fi     
+
 if [ $# -gt 0 -a "$1" = "-w" ] ; then
  perlarg="-w"
  shift
@ -78,6 +86,7 @@ fi
 # The alpha assertions currently give warnings even when -w is not specified.

 no warnings "experimental::alpha_assertions";
+no warnings "experimental::script_run";

 # Function for turning a string into a string of printing chars.

--- a/src/pcre2.h.in
+++ b/src/pcre2.h.in
@ -321,6 +321,7 @@ pcre2_pattern_convert(). */
 #define PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE      193
 #define PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS      194
 #define PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN        195
+#define PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE       196


 /* "Expected" matching error codes: no match and partial match. */
--- a/src/pcre2_auto_possess.c
+++ b/src/pcre2_auto_possess.c
@ -605,6 +605,15 @@ for(;;)
      if (cb->had_recurse) return FALSE;
      break;
      
+      /* A script run might have to backtrack if the iterated item can match
+      characters from more than one script. So give up unless repeating an 
+      explicit character. */
+      
+      case OP_SCRIPT_RUN:
+      if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI)
+        return FALSE; 
+      break;   
+
      /* Atomic sub-patterns and assertions can always auto-possessify their
      last iterator. However, if the group was entered as a result of checking
      a previous iterator, this is not possible. */
@ -614,7 +623,6 @@ for(;;)
      case OP_ASSERTBACK:
      case OP_ASSERTBACK_NOT:
      case OP_ONCE:
-
      return !entered_a_group;
      }

--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -240,49 +240,57 @@ code (meta_extra_lengths, just below) must be updated to remain in step. */
 #define META_RANGE_LITERAL    0x801f0000u  /* range defined literally */
 #define META_RECURSE          0x80200000u  /* Recursion */
 #define META_RECURSE_BYNAME   0x80210000u  /* (?&name) */
+#define META_SCRIPT_RUN       0x80220000u  /* (*script_run:...) */

 /* These must be kept together to make it easy to check that an assertion
 is present where expected in a conditional group. */

-#define META_LOOKAHEAD        0x80220000u  /* (?= */
-#define META_LOOKAHEADNOT     0x80230000u  /* (?! */
-#define META_LOOKBEHIND       0x80240000u  /* (?<= */
-#define META_LOOKBEHINDNOT    0x80250000u  /* (?<! */
+#define META_LOOKAHEAD        0x80230000u  /* (?= */
+#define META_LOOKAHEADNOT     0x80240000u  /* (?! */
+#define META_LOOKBEHIND       0x80250000u  /* (?<= */
+#define META_LOOKBEHINDNOT    0x80260000u  /* (?<! */

 /* These must be kept in this order, with consecutive values, and the _ARG
 versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
 versions. */

-#define META_MARK             0x80260000u  /* (*MARK) */
-#define META_ACCEPT           0x80270000u  /* (*ACCEPT) */
-#define META_FAIL             0x80280000u  /* (*FAIL) */
-#define META_COMMIT           0x80290000u  /* These               */
-#define META_COMMIT_ARG       0x802a0000u  /*   pairs             */
-#define META_PRUNE            0x802b0000u  /*     must            */
-#define META_PRUNE_ARG        0x802c0000u  /*       be            */
-#define META_SKIP             0x802d0000u  /*         kept        */
-#define META_SKIP_ARG         0x802e0000u  /*           in        */
-#define META_THEN             0x802f0000u  /*             this    */
-#define META_THEN_ARG         0x80300000u  /*               order */
+#define META_MARK             0x80270000u  /* (*MARK) */
+#define META_ACCEPT           0x80280000u  /* (*ACCEPT) */
+#define META_FAIL             0x80290000u  /* (*FAIL) */
+#define META_COMMIT           0x802a0000u  /* These               */
+#define META_COMMIT_ARG       0x802b0000u  /*   pairs             */
+#define META_PRUNE            0x802c0000u  /*     must            */
+#define META_PRUNE_ARG        0x802d0000u  /*       be            */
+#define META_SKIP             0x802e0000u  /*         kept        */
+#define META_SKIP_ARG         0x802f0000u  /*           in        */
+#define META_THEN             0x80300000u  /*             this    */
+#define META_THEN_ARG         0x80310000u  /*               order */

 /* These must be kept in groups of adjacent 3 values, and all together. */

-#define META_ASTERISK         0x80310000u  /* *  */
-#define META_ASTERISK_PLUS    0x80320000u  /* *+ */
-#define META_ASTERISK_QUERY   0x80330000u  /* *? */
-#define META_PLUS             0x80340000u  /* +  */
-#define META_PLUS_PLUS        0x80350000u  /* ++ */
-#define META_PLUS_QUERY       0x80360000u  /* +? */
-#define META_QUERY            0x80370000u  /* ?  */
-#define META_QUERY_PLUS       0x80380000u  /* ?+ */
-#define META_QUERY_QUERY      0x80390000u  /* ?? */
-#define META_MINMAX           0x803a0000u  /* {n,m}  repeat */
-#define META_MINMAX_PLUS      0x803b0000u  /* {n,m}+ repeat */
-#define META_MINMAX_QUERY     0x803c0000u  /* {n,m}? repeat */
+#define META_ASTERISK         0x80320000u  /* *  */
+#define META_ASTERISK_PLUS    0x80330000u  /* *+ */
+#define META_ASTERISK_QUERY   0x80340000u  /* *? */
+#define META_PLUS             0x80350000u  /* +  */
+#define META_PLUS_PLUS        0x80360000u  /* ++ */
+#define META_PLUS_QUERY       0x80370000u  /* +? */
+#define META_QUERY            0x80380000u  /* ?  */
+#define META_QUERY_PLUS       0x80390000u  /* ?+ */
+#define META_QUERY_QUERY      0x803a0000u  /* ?? */
+#define META_MINMAX           0x803b0000u  /* {n,m}  repeat */
+#define META_MINMAX_PLUS      0x803c0000u  /* {n,m}+ repeat */
+#define META_MINMAX_QUERY     0x803d0000u  /* {n,m}? repeat */

 #define META_FIRST_QUANTIFIER META_ASTERISK
 #define META_LAST_QUANTIFIER  META_MINMAX_QUERY

+/* This is a special "meta code" that is used only to distinguish (*asr: from
+(*sr in the table of alphabetic assertions. It is never stored in the parsed
+pattern because (*asr: is turned into (*sr:(*atomic: at that stage. There is
+therefore no need for it to have a length entry, so use a high value. */
+
+#define META_ATOMIC_SCRIPT_RUN 0x8fff0000u
+
 /* Table of extra lengths for each of the meta codes. Must be kept in step with
 the definitions above. For some items these values are a basic length to which
 a variable amount has to be added. */
@ -322,6 +330,7 @@ static unsigned char meta_extra_lengths[] = {
  0,             /* META_RANGE_LITERAL */
  SIZEOFFSET,    /* META_RECURSE */
  1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
+  0,             /* META_SCRIPT_RUN */
  0,             /* META_LOOKAHEAD */
  0,             /* META_LOOKAHEADNOT */
  SIZEOFFSET,    /* META_LOOKBEHIND */
@ -638,19 +647,19 @@ static const char alasnames[] =
  STRING_atomic_script_run;

 static const alasitem alasmeta[] = {
-  {  3, META_LOOKAHEAD     },
-  {  3, META_LOOKBEHIND    },
-  {  3, META_LOOKAHEADNOT  },
-  {  3, META_LOOKBEHINDNOT },
-  { 18, META_LOOKAHEAD     },
-  { 19, META_LOOKBEHIND    },
-  { 18, META_LOOKAHEADNOT  },
-  { 19, META_LOOKBEHINDNOT },
-  {  6, META_ATOMIC        },
-  {  2, 0                  }, /* sr = script run */
-  {  3, 0                  }, /* asr = atomic script run */
-  { 10, 0                  }, /* script run */
-  { 17, 0                  }  /* atomic script run */
+  {  3, META_LOOKAHEAD         },
+  {  3, META_LOOKBEHIND        },
+  {  3, META_LOOKAHEADNOT      },
+  {  3, META_LOOKBEHINDNOT     },
+  { 18, META_LOOKAHEAD         },
+  { 19, META_LOOKBEHIND        },
+  { 18, META_LOOKAHEADNOT      },
+  { 19, META_LOOKBEHINDNOT     },
+  {  6, META_ATOMIC            },
+  {  2, META_SCRIPT_RUN        }, /* sr = script run */
+  {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
+  { 10, META_SCRIPT_RUN        }, /* script run */
+  { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
 };

 static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
@ -772,7 +781,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
-       ERR91, ERR92, ERR93, ERR94, ERR95 };
+       ERR91, ERR92, ERR93, ERR94, ERR95, ERR96 };

 /* This is a table of start-of-pattern options such as (*UTF) and settings such
 as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@ -1003,6 +1012,7 @@ for (;;)
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
+    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;

@ -2345,6 +2355,7 @@ typedef struct nest_save {

 #define NSF_RESET          0x0001u
 #define NSF_CONDASSERT     0x0002u
+#define NSF_ATOMICSR       0x0004u

 /* Options that are changeable within the pattern must be tracked during
 parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
@ -3576,8 +3587,15 @@ while (ptr < ptrend)
          goto FAILED;
          }

+        /* The lookaround alphabetic synonyms can be almost entirely handled by
+        jumping to the code that handles the traditional symbolic forms. */
+
        switch(meta)
          {
+          default:
+          errorcode = ERR89;  /* Unknown code; should never occur because */
+          goto FAILED;        /* the meta values come from a table above. */
+
          case META_ATOMIC:
          goto ATOMIC_GROUP;

@ -3591,14 +3609,37 @@ while (ptr < ptrend)
          case META_LOOKBEHINDNOT:
          *parsed_pattern++ = meta;
          ptr--;
-          goto LOOKBEHIND;  
-          
-          /* FIXME: Script Run stuff ... */ 
-            
-          
- 
+          goto POST_LOOKBEHIND;

+          /* The script run facilities are handled here. Unicode support is
+          required (give an error if not, as this is a security issue). Always
+          record a META_SCRIPT_RUN item. Then, for the atomic version, insert
+          META_ATOMIC and remember that we need two META_KETs at the end. */

+          case META_SCRIPT_RUN:
+          case META_ATOMIC_SCRIPT_RUN:
+#ifdef SUPPORT_UNICODE
+          *parsed_pattern++ = META_SCRIPT_RUN;
+          nest_depth++;
+          ptr++;
+          if (meta == META_ATOMIC_SCRIPT_RUN)
+            {
+            *parsed_pattern++ = META_ATOMIC;
+            if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
+            else if (++top_nest >= end_nests)
+              {
+              errorcode = ERR84;
+              goto FAILED;
+              }
+            top_nest->nest_depth = nest_depth;
+            top_nest->flags = NSF_ATOMICSR;
+            top_nest->options = options & PARSE_TRACKED_OPTIONS;
+            }
+          break;
+#else  /* SUPPORT_UNICODE */
+          errorcode = ERR96;
+          goto FAILED;
+#endif
          }
        }

@ -4263,7 +4304,7 @@ while (ptr < ptrend)
      *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
        META_LOOKBEHIND : META_LOOKBEHINDNOT;

-      LOOKBEHIND:                /* Come from (*plb: and (*nlb: */
+      POST_LOOKBEHIND:              /* Come from (*plb: and (*nlb: */
      *has_lookbehind = TRUE;
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
      PUTOFFSET(offset, parsed_pattern);
@ -4425,6 +4466,14 @@ while (ptr < ptrend)
        cb->bracount = top_nest->max_group;
      if ((top_nest->flags & NSF_CONDASSERT) != 0)
        okquantifier = FALSE;
+
+      if ((top_nest->flags & NSF_ATOMICSR) != 0)
+        {
+        *parsed_pattern++ = META_KET;
+        }
+
+
+
      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
        else top_nest--;
      }
@ -6142,6 +6191,10 @@ for (;; pptr++)
    bravalue = OP_ONCE;
    goto GROUP_PROCESS_NOTE_EMPTY;

+    case META_SCRIPT_RUN:
+    bravalue = OP_SCRIPT_RUN;
+    goto GROUP_PROCESS_NOTE_EMPTY;
+
    case META_NOCAPTURE:
    bravalue = OP_BRA;
    /* Fall through */
@ -6777,6 +6830,7 @@ for (;; pptr++)
      case OP_ASSERTBACK:
      case OP_ASSERTBACK_NOT:
      case OP_ONCE:
+      case OP_SCRIPT_RUN:
      case OP_BRA:
      case OP_CBRA:
      case OP_COND:
@ -6989,16 +7043,16 @@ for (;; pptr++)
          }

        /* If the maximum is unlimited, set a repeater in the final copy. For
-        ONCE brackets, that's all we need to do. However, possessively repeated
-        ONCE brackets can be converted into non-capturing brackets, as the
-        behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
-        deal with possessive ONCEs specially.
+        SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
+        possessively repeated ONCE brackets can be converted into non-capturing
+        brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
+        saves having to deal with possessive ONCEs specially.

        Otherwise, when we are doing the actual compile phase, check to see
        whether this group is one that could match an empty string. If so,
        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
        that runtime checking can be done. [This check is also applied to ONCE
-        groups at runtime, but in a different way.]
+        and SCRIPT_RUN groups at runtime, but in a different way.]

        Then, if the quantifier was possessive and the bracket is not a
        conditional, we convert the BRA code to the POS form, and the KET code to
@ -7022,13 +7076,14 @@ for (;; pptr++)

          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;

-          /* For non-possessive ONCE brackets, all we need to do is to
-          set the KET. */
+          /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
+          to do is to set the KET. */

-          if (*bracode == OP_ONCE) *ketcode = OP_KETRMAX + repeat_type;
+          if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
+            *ketcode = OP_KETRMAX + repeat_type;

-          /* Handle non-ONCE brackets and possessive ONCEs (which have been
-          converted to non-capturing above). */
+          /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
+          (which have been converted to non-capturing above). */

          else
            {
@ -8385,6 +8440,7 @@ do {
     case OP_SCBRAPOS:
     case OP_ASSERT:
     case OP_ONCE:
+     case OP_SCRIPT_RUN:
     d = find_firstassertedcu(scode, &dflags, inassert + ((op==OP_ASSERT)?1:0));
     if (dflags < 0)
       return 0;
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@ -173,6 +173,7 @@ static const uint8_t coptable[] = {
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* ONCE                                   */
+  0,                             /* SCRIPT_RUN                             */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
@ -247,6 +248,7 @@ static const uint8_t poptable[] = {
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* ONCE                                   */
+  0,                             /* SCRIPT_RUN                             */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
--- a/src/pcre2_error.c
+++ b/src/pcre2_error.c
@ -183,6 +183,7 @@ static const unsigned char compile_error_texts[] =
  "invalid hyphen in option setting\0"
  /* 95 */
  "(*alpha_assertion) not recognized\0"  
+  "script runs require Unicode support, which this version of PCRE2 does not have\0" 
  ;

 /* Match-time and UTF error texts are in the same format. */
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@ -1513,70 +1513,71 @@ enum {
  OP_ASSERTBACK,     /* 128 Positive lookbehind */
  OP_ASSERTBACK_NOT, /* 129 Negative lookbehind */

-  /* ONCE, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately after the
-  assertions, with ONCE first, as there's a test for >= ONCE for a subpattern
-  that isn't an assertion. The POS versions must immediately follow the non-POS
-  versions in each case. */
+  /* ONCE, SCRIPT_RUN, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come
+  immediately after the assertions, with ONCE first, as there's a test for >=
+  ONCE for a subpattern that isn't an assertion. The POS versions must
+  immediately follow the non-POS versions in each case. */

  OP_ONCE,           /* 130 Atomic group, contains captures */
-  OP_BRA,            /* 131 Start of non-capturing bracket */
-  OP_BRAPOS,         /* 132 Ditto, with unlimited, possessive repeat */
-  OP_CBRA,           /* 133 Start of capturing bracket */
-  OP_CBRAPOS,        /* 134 Ditto, with unlimited, possessive repeat */
-  OP_COND,           /* 135 Conditional group */
+  OP_SCRIPT_RUN,     /* 131 Non-capture, but check characters' scripts */
+  OP_BRA,            /* 132 Start of non-capturing bracket */
+  OP_BRAPOS,         /* 133 Ditto, with unlimited, possessive repeat */
+  OP_CBRA,           /* 134 Start of capturing bracket */
+  OP_CBRAPOS,        /* 135 Ditto, with unlimited, possessive repeat */
+  OP_COND,           /* 136 Conditional group */

  /* These five must follow the previous five, in the same order. There's a
  check for >= SBRA to distinguish the two sets. */

-  OP_SBRA,           /* 136 Start of non-capturing bracket, check empty  */
-  OP_SBRAPOS,        /* 137 Ditto, with unlimited, possessive repeat */
-  OP_SCBRA,          /* 138 Start of capturing bracket, check empty */
-  OP_SCBRAPOS,       /* 139 Ditto, with unlimited, possessive repeat */
-  OP_SCOND,          /* 140 Conditional group, check empty */
+  OP_SBRA,           /* 137 Start of non-capturing bracket, check empty  */
+  OP_SBRAPOS,        /* 138 Ditto, with unlimited, possessive repeat */
+  OP_SCBRA,          /* 139 Start of capturing bracket, check empty */
+  OP_SCBRAPOS,       /* 140 Ditto, with unlimited, possessive repeat */
+  OP_SCOND,          /* 141 Conditional group, check empty */

  /* The next two pairs must (respectively) be kept together. */

-  OP_CREF,           /* 141 Used to hold a capture number as condition */
-  OP_DNCREF,         /* 142 Used to point to duplicate names as a condition */
-  OP_RREF,           /* 143 Used to hold a recursion number as condition */
-  OP_DNRREF,         /* 144 Used to point to duplicate names as a condition */
-  OP_FALSE,          /* 145 Always false (used by DEFINE and VERSION) */
-  OP_TRUE,           /* 146 Always true (used by VERSION) */
+  OP_CREF,           /* 142 Used to hold a capture number as condition */
+  OP_DNCREF,         /* 143 Used to point to duplicate names as a condition */
+  OP_RREF,           /* 144 Used to hold a recursion number as condition */
+  OP_DNRREF,         /* 145 Used to point to duplicate names as a condition */
+  OP_FALSE,          /* 146 Always false (used by DEFINE and VERSION) */
+  OP_TRUE,           /* 147 Always true (used by VERSION) */

-  OP_BRAZERO,        /* 147 These two must remain together and in this */
-  OP_BRAMINZERO,     /* 148 order. */
-  OP_BRAPOSZERO,     /* 149 */
+  OP_BRAZERO,        /* 148 These two must remain together and in this */
+  OP_BRAMINZERO,     /* 149 order. */
+  OP_BRAPOSZERO,     /* 150 */

  /* These are backtracking control verbs */

-  OP_MARK,           /* 150 always has an argument */
-  OP_PRUNE,          /* 151 */
-  OP_PRUNE_ARG,      /* 152 same, but with argument */
-  OP_SKIP,           /* 153 */
-  OP_SKIP_ARG,       /* 154 same, but with argument */
-  OP_THEN,           /* 155 */
-  OP_THEN_ARG,       /* 156 same, but with argument */
-  OP_COMMIT,         /* 157 */
-  OP_COMMIT_ARG,     /* 158 same, but with argument */
+  OP_MARK,           /* 151 always has an argument */
+  OP_PRUNE,          /* 152 */
+  OP_PRUNE_ARG,      /* 153 same, but with argument */
+  OP_SKIP,           /* 154 */
+  OP_SKIP_ARG,       /* 155 same, but with argument */
+  OP_THEN,           /* 156 */
+  OP_THEN_ARG,       /* 157 same, but with argument */
+  OP_COMMIT,         /* 158 */
+  OP_COMMIT_ARG,     /* 159 same, but with argument */

  /* These are forced failure and success verbs. FAIL and ACCEPT do accept an
  argument, but these cases can be compiled as, for example, (*MARK:X)(*FAIL)
  without the need for a special opcode. */

-  OP_FAIL,           /* 159 */
-  OP_ACCEPT,         /* 160 */
-  OP_ASSERT_ACCEPT,  /* 161 Used inside assertions */
-  OP_CLOSE,          /* 162 Used before OP_ACCEPT to close open captures */
+  OP_FAIL,           /* 160 */
+  OP_ACCEPT,         /* 161 */
+  OP_ASSERT_ACCEPT,  /* 162 Used inside assertions */
+  OP_CLOSE,          /* 163 Used before OP_ACCEPT to close open captures */

  /* This is used to skip a subpattern with a {0} quantifier */

-  OP_SKIPZERO,       /* 163 */
+  OP_SKIPZERO,       /* 164 */

  /* This is used to identify a DEFINE group during compilation so that it can
  be checked for having only one branch. It is changed to OP_FALSE before
  compilation finishes. */

-  OP_DEFINE,         /* 164 */
+  OP_DEFINE,         /* 165 */

  /* This is not an opcode, but is used to check that tables indexed by opcode
  are the correct length, in order to catch updating errors - there have been
@ -1624,6 +1625,7 @@ some cases doesn't actually use these names at all). */
  "Alt", "Ket", "KetRmax", "KetRmin", "KetRpos",                  \
  "Reverse", "Assert", "Assert not", "AssertB", "AssertB not",    \
  "Once",                                                         \
+  "Script run",                                                   \
  "Bra", "BraPos", "CBra", "CBraPos",                             \
  "Cond",                                                         \
  "SBra", "SBraPos", "SCBra", "SCBraPos",                         \
@ -1707,6 +1709,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
  1+LINK_SIZE,                   /* Assert behind                          */ \
  1+LINK_SIZE,                   /* Assert behind not                      */ \
  1+LINK_SIZE,                   /* ONCE                                   */ \
+  1+LINK_SIZE,                   /* SCRIPT_RUN                             */ \
  1+LINK_SIZE,                   /* BRA                                    */ \
  1+LINK_SIZE,                   /* BRAPOS                                 */ \
  1+LINK_SIZE+IMM2_SIZE,         /* CBRA                                   */ \
@ -1854,6 +1857,7 @@ extern const uint8_t          PRIV(utf8_table4)[];
 #define _pcre2_hspace_list             PCRE2_SUFFIX(_pcre2_hspace_list_)
 #define _pcre2_vspace_list             PCRE2_SUFFIX(_pcre2_vspace_list_)
 #define _pcre2_ucd_caseless_sets       PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
+#define _pcre2_ucd_digit_sets          PCRE2_SUFFIX(_pcre2_ucd_digit_sets_)
 #define _pcre2_ucd_records             PCRE2_SUFFIX(_pcre2_ucd_records_)
 #define _pcre2_ucd_stage1              PCRE2_SUFFIX(_pcre2_ucd_stage1_)
 #define _pcre2_ucd_stage2              PCRE2_SUFFIX(_pcre2_ucd_stage2_)
@ -1875,6 +1879,7 @@ extern const uint8_t                   PRIV(default_tables)[];
 extern const uint32_t                  PRIV(hspace_list)[];
 extern const uint32_t                  PRIV(vspace_list)[];
 extern const uint32_t                  PRIV(ucd_caseless_sets)[];
+extern const uint32_t                  PRIV(ucd_digit_sets)[];
 extern const ucd_record                PRIV(ucd_records)[];
 #if PCRE2_CODE_UNIT_WIDTH == 32
 extern const ucd_record                PRIV(dummy_ucd_record)[];
@ -1922,6 +1927,7 @@ is available. */
 #define _pcre2_jit_get_target        PCRE2_SUFFIX(_pcre2_jit_get_target_)
 #define _pcre2_memctl_malloc         PCRE2_SUFFIX(_pcre2_memctl_malloc_)
 #define _pcre2_ord2utf               PCRE2_SUFFIX(_pcre2_ord2utf_)
+#define _pcre2_script_run            PCRE2_SUFFIX(_pcre2_script_run_)
 #define _pcre2_strcmp                PCRE2_SUFFIX(_pcre2_strcmp_)
 #define _pcre2_strcmp_c8             PCRE2_SUFFIX(_pcre2_strcmp_c8_)
 #define _pcre2_strcpy_c8             PCRE2_SUFFIX(_pcre2_strcpy_c8_)
@ -1948,6 +1954,7 @@ extern size_t       _pcre2_jit_get_size(void *);
 const char *        _pcre2_jit_get_target(void);
 extern void *       _pcre2_memctl_malloc(size_t, pcre2_memctl *);
 extern unsigned int _pcre2_ord2utf(uint32_t, PCRE2_UCHAR *);
+extern BOOL         _pcre2_script_run(PCRE2_SPTR, PCRE2_SPTR, BOOL);
 extern int          _pcre2_strcmp(PCRE2_SPTR, PCRE2_SPTR);
 extern int          _pcre2_strcmp_c8(PCRE2_SPTR, const char *);
 extern PCRE2_SIZE   _pcre2_strcpy_c8(PCRE2_UCHAR *, const char *);
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@ -5014,6 +5014,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
    must record a backtracking point and also set up a chained frame. */

    case OP_ONCE:
+    case OP_SCRIPT_RUN: 
    case OP_SBRA:
    Lframe_type = GF_NOCAPTURE | Fop;

@ -5526,6 +5527,14 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
      case OP_ASSERTBACK_NOT:
      RRETURN(MATCH_MATCH);
      
+      /* At the end of a script run, apply the script-checking rules. This code
+      will never be exercised if Unicode support is not compiled, because in
+      that environment script runs cause an error at compile time. */
+      
+      case OP_SCRIPT_RUN:
+      if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
+      break;  
+
      /* Whole-pattern recursion is coded as a recurse into group 0, so it
      won't be picked up here. Instead, we catch it when the OP_END is reached.
      Other recursion is handled here. */
--- a/src/pcre2_printint.c
+++ b/src/pcre2_printint.c
@ -393,6 +393,7 @@ for(;;)
    case OP_ASSERTBACK:
    case OP_ASSERTBACK_NOT:
    case OP_ONCE:
+    case OP_SCRIPT_RUN: 
    case OP_COND:
    case OP_SCOND:
    case OP_REVERSE:
--- a/src/pcre2_script_run.c
+++ b/src/pcre2_script_run.c
@ -0,0 +1,228 @@
+/*************************************************
+*      Perl-Compatible Regular Expressions       *
+*************************************************/
+
+/* PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+                       Written by Philip Hazel
+     Original API code Copyright (c) 1997-2012 University of Cambridge
+          New API code Copyright (c) 2016-2018 University of Cambridge
+
+-----------------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of the University of Cambridge nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-----------------------------------------------------------------------------
+*/
+
+/* This module contains the function for checking a script run. */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pcre2_internal.h"
+
+
+/*************************************************
+*                Check script run                *
+*************************************************/
+
+/* A script run is conceptually a sequence of characters all in the same
+Unicode script. However, it isn't quite that simple. There are special rules
+for scripts that are commonly used together, and also special rules for digits.
+This function implements the appropriate checks, which is possible only when
+PCRE2 is compiled with Unicode support. The function returns TRUE if there is
+no Unicode support; however, it should never be called in that circumstance
+because an error is given by pcre2_compile() if a script run is called for in a
+version of PCRE2 compiled without Unicode support.
+
+Arguments:
+  ptr       points to the first character
+  endptr    points after the last character
+  utf       TRUE if in UTF mode
+
+Returns:    TRUE if this is a valid script run
+*/
+
+#define SCRIPT_UNSET        (-1)   /* No script has been fixed yet */
+#define SCRIPT_HANPENDING   (-2)   /* Han seen; companion script undecided */
+#define SCRIPT_HANHIRAKATA  (-3)   /* Han, Hiragana and Katakana accepted */
+#define SCRIPT_HANBOPOMOFO  (-4)   /* Han and Bopomofo accepted */
+#define SCRIPT_HANHANGUL    (-5)   /* Han and Hangul accepted */
+
+BOOL    /* TRUE when the characters from ptr up to endptr are a script run */
+PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
+{
+#ifdef SUPPORT_UNICODE
+int require_script = SCRIPT_UNSET;   /* A ucp_ script value or a SCRIPT_ code */
+uint32_t require_digitset = 0;       /* Digit set offset in use; 0 = none yet */
+uint32_t c;                          /* Code point currently being checked */
+
+#if PCRE2_CODE_UNIT_WIDTH == 32
+(void)utf;    /* Avoid compiler warning */
+#endif
+
+/* Any string containing fewer than 2 characters is a valid script run. */
+
+if (ptr >= endptr) return TRUE;      /* Empty string */
+GETCHARINCTEST(c, ptr);
+if (ptr >= endptr) return TRUE;      /* Exactly one character */
+
+/* Scan strings of two or more characters, checking the Unicode characteristics
+of each code point. */
+
+for (;;)
+  {
+  const ucd_record *ucd = GET_UCD(c);
+  uint32_t script = ucd->script;
+
+  /* If the script is Unknown, the string is not a valid script run. Such
+  characters can only form script runs of length one. */
+  
+  if (script == ucp_Unknown) return FALSE; 
+
+  /* A character whose script is Inherited is always accepted, and plays no
+  further part. A character whose script is Common is always accepted, but must
+  still be tested for a digit below. Otherwise, the character must match the
+  script of the first non-Inherited, non-Common character encountered. For most
+  scripts, the test is for the same script. However, the Han Chinese script may
+  be used in conjunction with four other scripts in these combinations:
+
+  . Han with Hiragana and Katakana is allowed (for Japanese).
+
+  . Han with Bopomofo is allowed (for Taiwanese Mandarin).
+
+  . Han with Hangul is allowed (for Korean).
+
+  If the first significant character's script is one of the four, the required
+  script type is immediately known. However, if the first significant
+  character's script is Han, we have to keep checking for a non-Han character.
+  Hence the SCRIPT_HANPENDING state. */
+ 
+  if (script != ucp_Inherited)
+    { 
+    if (script != ucp_Common) switch(require_script)
+      {
+      default:                       /* An ordinary script is already fixed */
+      if (script != (unsigned int)require_script) return FALSE;
+      break;
+    
+      case SCRIPT_UNSET:
+      case SCRIPT_HANPENDING:
+      switch(script)
+        {
+        case ucp_Han:                /* Stay undecided until a partner appears */
+        require_script = SCRIPT_HANPENDING;
+        break;
+    
+        case ucp_Hiragana:
+        case ucp_Katakana:
+        require_script = SCRIPT_HANHIRAKATA;
+        break;
+    
+        case ucp_Bopomofo:
+        require_script = SCRIPT_HANBOPOMOFO;
+        break;
+    
+        case ucp_Hangul:
+        require_script = SCRIPT_HANHANGUL;
+        break;
+    
+        default:                     /* Any script that cannot partner Han */
+        if (require_script == SCRIPT_HANPENDING) return FALSE;
+        require_script = script;
+        break;
+        }
+      break;
+    
+      case SCRIPT_HANHIRAKATA:
+      if (script != ucp_Han && script != ucp_Hiragana && script != ucp_Katakana)
+        return FALSE;
+      break;
+    
+      case SCRIPT_HANBOPOMOFO:
+      if (script != ucp_Han && script != ucp_Bopomofo) return FALSE;
+      break;
+    
+      case SCRIPT_HANHANGUL:
+      if (script != ucp_Han && script != ucp_Hangul) return FALSE;
+      break;
+      }
+    
+    /* The character is in an acceptable script. We must now ensure that all
+    decimal digits in the string come from the same set. Some scripts (e.g.
+    Common, Arabic) have more than one set of decimal digits. This code does
+    not allow mixing sets, even within the same script. The vector called
+    PRIV(ucd_digit_sets)[] contains, in its first element, the number of
+    following elements, and then, in ascending order, the code points of the
+    '9' characters in every set of 10 digits. Each set is identified by the
+    offset in the vector of its '9' character. An initial check of the first
+    value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
+    
+    if (ucd->chartype == ucp_Nd)
+      {
+      uint32_t digitset;
+        
+      if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
+        {
+        int mid;
+        int bot = 1;                         /* Entry at bot is known to be < c */
+        int top = PRIV(ucd_digit_sets)[0];   /* Offset of the last entry */
+        for (;;)
+          {
+          if (top <= bot + 1)    /* <= rather than == is paranoia */
+            {
+            digitset = top;      /* Lowest offset whose '9' is >= c */
+            break;
+            }
+          mid = (top + bot) / 2;
+          if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
+          }
+        }
+    
+      /* A required value of 0 means "unset". */
+    
+      if (require_digitset == 0) require_digitset = digitset;
+        else if (digitset != require_digitset) return FALSE;
+      }   /* End digit handling */
+    }     /* End checking non-Inherited character */
+
+  /* If we haven't yet got to the end, pick up the next character. */
+
+  if (ptr >= endptr) return TRUE;    /* Every character passed the checks */
+  GETCHARINCTEST(c, ptr);
+  }  /* End checking loop */
+
+#else   /* NOT SUPPORT_UNICODE */
+(void)ptr;       /* The arguments are unused in this configuration */
+(void)endptr;
+(void)utf;
+return TRUE;
+#endif  /* SUPPORT_UNICODE */
+}
+
+/* End of pcre2_script_run.c */
--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
@ -171,6 +171,7 @@ for (;;)
    /* Fall through */

    case OP_ONCE:
+    case OP_SCRIPT_RUN: 
    case OP_SBRA:
    case OP_BRAPOS:
    case OP_SBRAPOS:
@ -1075,6 +1076,7 @@ do
      case OP_CBRAPOS:
      case OP_SCBRAPOS:
      case OP_ONCE:
+      case OP_SCRIPT_RUN: 
      case OP_ASSERT:
      rc = set_start_bits(re, tcode, utf);
      if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
--- a/src/pcre2_tables.c
+++ b/src/pcre2_tables.c
@ -417,6 +417,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
 #define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
 #define STRING_Tirhuta0 STR_T STR_i STR_r STR_h STR_u STR_t STR_a "\0"
 #define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
+#define STRING_Unknown0 STR_U STR_n STR_k STR_n STR_o STR_w STR_n "\0"
 #define STRING_Vai0 STR_V STR_a STR_i "\0"
 #define STRING_Warang_Citi0 STR_W STR_a STR_r STR_a STR_n STR_g STR_UNDERSCORE STR_C STR_i STR_t STR_i "\0"
 #define STRING_Xan0 STR_X STR_a STR_n "\0"
@ -611,6 +612,7 @@ const char PRIV(utt_names)[] =
  STRING_Tifinagh0
  STRING_Tirhuta0
  STRING_Ugaritic0
+  STRING_Unknown0
  STRING_Vai0
  STRING_Warang_Citi0
  STRING_Xan0
@ -805,19 +807,20 @@ const ucp_type_table PRIV(utt)[] = {
  { 1424, PT_SC, ucp_Tifinagh },
  { 1433, PT_SC, ucp_Tirhuta },
  { 1441, PT_SC, ucp_Ugaritic },
-  { 1450, PT_SC, ucp_Vai },
-  { 1454, PT_SC, ucp_Warang_Citi },
-  { 1466, PT_ALNUM, 0 },
-  { 1470, PT_PXSPACE, 0 },
-  { 1474, PT_SPACE, 0 },
-  { 1478, PT_UCNC, 0 },
-  { 1482, PT_WORD, 0 },
-  { 1486, PT_SC, ucp_Yi },
-  { 1489, PT_GC, ucp_Z },
-  { 1491, PT_SC, ucp_Zanabazar_Square },
-  { 1508, PT_PC, ucp_Zl },
-  { 1511, PT_PC, ucp_Zp },
-  { 1514, PT_PC, ucp_Zs }
+  { 1450, PT_SC, ucp_Unknown },
+  { 1458, PT_SC, ucp_Vai },
+  { 1462, PT_SC, ucp_Warang_Citi },
+  { 1474, PT_ALNUM, 0 },
+  { 1478, PT_PXSPACE, 0 },
+  { 1482, PT_SPACE, 0 },
+  { 1486, PT_UCNC, 0 },
+  { 1490, PT_WORD, 0 },
+  { 1494, PT_SC, ucp_Yi },
+  { 1497, PT_GC, ucp_Z },
+  { 1499, PT_SC, ucp_Zanabazar_Square },
+  { 1516, PT_PC, ucp_Zl },
+  { 1519, PT_PC, ucp_Zp },
+  { 1522, PT_PC, ucp_Zs }
 };

 const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
--- a/src/pcre2_ucd.c
+++ b/src/pcre2_ucd.c
--- a/src/pcre2_ucp.h
+++ b/src/pcre2_ucp.h
@ -124,6 +124,7 @@ enum {
 /* These are the script identifications. */

 enum {
+  ucp_Unknown,
  ucp_Arabic,
  ucp_Armenian,
  ucp_Bengali,
--- a/testdata/testinput12
+++ b/testdata/testinput12
@ -387,4 +387,15 @@
 /(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
    123abcáyzabcdef789abcሴqr
    
+# A few script run tests in non-UTF mode (but they need Unicode support)
+
+/^(*script_run:.{4})/
+    \x{3041}\x{30a1}\x{3007}\x{3007}   Hiragana Katakana Han Han
+    \x{30a1}\x{3041}\x{3007}\x{3007}   Katakana Hiragana Han Han
+    \x{1100}\x{2e80}\x{2e80}\x{1101}   Hangul Han Han Hangul
+ 
+/^(*sr:.*)/utf,allow_surrogate_escapes
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+    \x{d800}\x{dfff}                   Surrogates (Unknown) \=no_utf_check
+
 # End of testinput12
--- a/testdata/testinput4
+++ b/testdata/testinput4
@ -2318,4 +2318,95 @@
 /[^\x{100}-\x{ffff}]*[\x80-\xff]/i,utf
    \x{99}\x{99}\x{99}
    
+# Script run tests
+
+/^(*script_run:.{4})/utf
+    abcd                               Latin x4
+    \x{2e80}\x{2fa1d}\x{3041}\x{30a1}  Han Han Hiragana Katakana
+    \x{3041}\x{30a1}\x{3007}\x{3007}   Hiragana Katakana Han Han
+    \x{30a1}\x{3041}\x{3007}\x{3007}   Katakana Hiragana Han Han
+    \x{1100}\x{2e80}\x{2e80}\x{1101}   Hangul Han Han Hangul
+    \x{2e80}\x{3105}\x{2e80}\x{3105}   Han Bopomofo Han Bopomofo
+    \x{02ea}\x{2e80}\x{2e80}\x{3105}   Bopomofo-Sk Han Han Bopomofo
+    \x{3105}\x{2e80}\x{2e80}\x{3105}   Bopomofo Han Han Bopomofo
+    \x{0300}cd!                        Inherited Latin Latin Common
+    \x{0391}12\x{03a9}                 Greek Common-digits Greek 
+    \x{0400}12\x{fe2f}                 Cyrillic Common-digits Cyrillic
+    \x{0531}12\x{fb17}                 Armenian Common-digits Armenian
+    \x{0591}12\x{fb4f}                 Hebrew Common-digits Hebrew
+    \x{0600}12\x{1eef1}                Arabic Common-digits Arabic
+    \x{0600}\x{0660}\x{0669}\x{1eef1}  Arabic Arabic-digits Arabic
+    \x{0700}12\x{086a}                 Syriac Common-digits Syriac
+    \x{1200}12\x{ab2e}                 Ethiopic Common-digits Ethiopic
+    \x{1680}12\x{169c}                 Ogham Common-digits Ogham
+    \x{3041}12\x{3041}                 Hiragana Common-digits Hiragana
+    \x{0980}\x{09e6}\x{09e7}\x{0993}   Bengali Bengali-digits Bengali
+    !cde                               Common Latin Latin Latin
+    A..B                               Latin Common Common Latin 
+    0abc                               Ascii-digit Latin Latin Latin
+    1\x{0700}\x{0700}\x{0700}          Ascii-digit Syriac x 3
+    \x{1A80}\x{1A80}\x{1a40}\x{1a41}   Tai Tham Hora digits, letters
+\= Expect no match
+    a\x{370}bcd                        Latin Greek Latin Latin
+    \x{1100}\x{02ea}\x{02ea}\x{02ea}   Hangul Bopomofo x3
+    \x{02ea}\x{02ea}\x{02ea}\x{1100}   Bopomofo x3 Hangul
+    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+    \x{0391}\x{09e6}\x{09e7}\x{03a9}   Greek Bengali digits Greek 
+    \x{0600}7\x{0669}\x{1eef1}         Arabic ascii-digit Arabic-digit Arabic
+    \x{0600}\x{0669}7\x{1eef1}         Arabic Arabic-digit ascii-digit Arabic
+    A5\x{ff19}B                        Latin Common-ascii/notascii-digits Latin 
+    \x{0300}cd\x{0391}                 Inherited Latin Latin Greek
+    !cd\x{0391}                        Common Latin Latin Greek
+    \x{1A80}\x{1A90}\x{1a40}\x{1a41}   Tai Tham Hora digit, Tham digit, letters
+    A\x{1d7ce}\x{1d7ff}B               Common fancy-common-2-sets-digits Common
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+    
+/^(*sr:.{4}|..)/utf
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+
+/^(*atomic_script_run:.{4}|..)/utf
+\= Expect no match
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+
+/^(*asr:.*)/utf
+\= Expect no match
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+
+/^(?>(*sr:.*))/utf
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+
+/^(*sr:.*)/utf
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+    \x{10fffd}\x{10fffd}\x{10fffd}     Private use (Unknown)
+
+/^(*sr:\x{2e80}*)/utf
+    \x{2e80}\x{2e80}\x{3105}           Han Han Bopomofo
+
+/^(*sr:\x{2e80}*)\x{2e80}/utf
+    \x{2e80}\x{2e80}\x{3105}           Han Han Bopomofo
+    
+/^(*sr:.*)Test/utf
+    Test script run on an empty string
+
+/^(*sr:(.{2})){2}/utf
+    \x{0600}7\x{0669}\x{1eef1}         Arabic ascii-digit Arabic-digit Arabic
+    \x{1A80}\x{1A80}\x{1a40}\x{1a41}   Tai Tham Hora digits, letters
+    \x{1A80}\x{1a40}\x{1A90}\x{1a41}   Tai Tham Hora digit, letter, Tham digit, letter
+\= Expect no match
+    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+
+# Test loop breaking for empty string match
+
+/^(*sr:A|)*BCD/utf
+    AABCD
+    ABCD
+    BCD 
+    
+# The use of (*ACCEPT) breaks script run checking 
+
+/^(*sr:.*(*ACCEPT)ZZ)/utf
+    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+
+# ------- 
+
 # End of testinput4
--- a/testdata/testinput5
+++ b/testdata/testinput5
@ -2107,4 +2107,27 @@
 /(*: A‎B C)abc/x,utf,mark,alt_verbnames
    abc
    
+# Script run tests: auto-possessification
+
+/^(*sr:.*)/B,utf 
+    paypаl.com   A classic example of why script run checks are a good thing
+
+/^(*sr:\x{2e80}*)/B,utf
+
+/^(*sr:\x{2e80}*)\x{2e80}/B,utf
+
+# Some script run patterns are broken in Perl 5.28.0. These can be moved into
+# test 4 when a mended version of Perl is released.
+
+/^(*sr:.{4})/utf
+    \x{0980}12\x{0993}     Bengali Common-digits Bengali
+    \x{0780}12\x{07b1}     Thaana Common-digits Thaana
+    \x{0e01}12\x{0e5b}     Thai Common-digits Thai
+    \x{1780}12\x{19ff}     Khmer Common-digits Khmer
+    \x{0904}12\x{0939}     Devanagari Common-digits Devanagari
+    A\x{ff10}\x{ff19}B     Latin Common-notascii-digits Latin 
+    A\x{1d7ce}\x{1d7cf}B   Latin fancy-common-digits Latin
+
+# ------- 
+
 # End of testinput5
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@ -1481,4 +1481,19 @@ Old 12 15  New 16 21
 Old 21 21  New 27 29
 4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
    
+# A few script run tests in non-UTF mode (but they need Unicode support)
+
+/^(*script_run:.{4})/
+    \x{3041}\x{30a1}\x{3007}\x{3007}   Hiragana Katakana Han Han
+ 0: \x{3041}\x{30a1}\x{3007}\x{3007}
+    \x{30a1}\x{3041}\x{3007}\x{3007}   Katakana Hiragana Han Han
+ 0: \x{30a1}\x{3041}\x{3007}\x{3007}
+    \x{1100}\x{2e80}\x{2e80}\x{1101}   Hangul Han Han Hangul
+ 0: \x{1100}\x{2e80}\x{2e80}\x{1101}
+ 
+/^(*sr:.*)/utf,allow_surrogate_escapes
+Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+    \x{d800}\x{dfff}                   Surrogates (Unknown) \=no_utf_check
+
 # End of testinput12
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@ -1478,4 +1478,20 @@ Old 12 15  New 16 21
 Old 21 21  New 27 29
 4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
    
+# A few script run tests in non-UTF mode (but they need Unicode support)
+
+/^(*script_run:.{4})/
+    \x{3041}\x{30a1}\x{3007}\x{3007}   Hiragana Katakana Han Han
+ 0: \x{3041}\x{30a1}\x{3007}\x{3007}
+    \x{30a1}\x{3041}\x{3007}\x{3007}   Katakana Hiragana Han Han
+ 0: \x{30a1}\x{3041}\x{3007}\x{3007}
+    \x{1100}\x{2e80}\x{2e80}\x{1101}   Hangul Han Han Hangul
+ 0: \x{1100}\x{2e80}\x{2e80}\x{1101}
+ 
+/^(*sr:.*)/utf,allow_surrogate_escapes
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+ 0: \x{2e80}\x{3105}\x{2e80}
+    \x{d800}\x{dfff}                   Surrogates (Unknown) \=no_utf_check
+ 0: \x{d800}
+
 # End of testinput12
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@ -3742,4 +3742,153 @@ No match
    \x{99}\x{99}\x{99}
 0: \x{99}\x{99}\x{99}
    
+# Script run tests
+
+/^(*script_run:.{4})/utf
+    abcd                               Latin x4
+ 0: abcd
+    \x{2e80}\x{2fa1d}\x{3041}\x{30a1}  Han Han Hiragana Katakana
+ 0: \x{2e80}\x{2fa1d}\x{3041}\x{30a1}
+    \x{3041}\x{30a1}\x{3007}\x{3007}   Hiragana Katakana Han Han
+ 0: \x{3041}\x{30a1}\x{3007}\x{3007}
+    \x{30a1}\x{3041}\x{3007}\x{3007}   Katakana Hiragana Han Han
+ 0: \x{30a1}\x{3041}\x{3007}\x{3007}
+    \x{1100}\x{2e80}\x{2e80}\x{1101}   Hangul Han Han Hangul
+ 0: \x{1100}\x{2e80}\x{2e80}\x{1101}
+    \x{2e80}\x{3105}\x{2e80}\x{3105}   Han Bopomofo Han Bopomofo
+ 0: \x{2e80}\x{3105}\x{2e80}\x{3105}
+    \x{02ea}\x{2e80}\x{2e80}\x{3105}   Bopomofo-Sk Han Han Bopomofo
+ 0: \x{2ea}\x{2e80}\x{2e80}\x{3105}
+    \x{3105}\x{2e80}\x{2e80}\x{3105}   Bopomofo Han Han Bopomofo
+ 0: \x{3105}\x{2e80}\x{2e80}\x{3105}
+    \x{0300}cd!                        Inherited Latin Latin Common
+ 0: \x{300}cd!
+    \x{0391}12\x{03a9}                 Greek Common-digits Greek 
+ 0: \x{391}12\x{3a9}
+    \x{0400}12\x{fe2f}                 Cyrillic Common-digits Cyrillic
+ 0: \x{400}12\x{fe2f}
+    \x{0531}12\x{fb17}                 Armenian Common-digits Armenian
+ 0: \x{531}12\x{fb17}
+    \x{0591}12\x{fb4f}                 Hebrew Common-digits Hebrew
+ 0: \x{591}12\x{fb4f}
+    \x{0600}12\x{1eef1}                Arabic Common-digits Arabic
+ 0: \x{600}12\x{1eef1}
+    \x{0600}\x{0660}\x{0669}\x{1eef1}  Arabic Arabic-digits Arabic
+ 0: \x{600}\x{660}\x{669}\x{1eef1}
+    \x{0700}12\x{086a}                 Syriac Common-digits Syriac
+ 0: \x{700}12\x{86a}
+    \x{1200}12\x{ab2e}                 Ethiopic Common-digits Ethiopic
+ 0: \x{1200}12\x{ab2e}
+    \x{1680}12\x{169c}                 Ogham Common-digits Ogham
+ 0: \x{1680}12\x{169c}
+    \x{3041}12\x{3041}                 Hiragana Common-digits Hiragana
+ 0: \x{3041}12\x{3041}
+    \x{0980}\x{09e6}\x{09e7}\x{0993}   Bengali Bengali-digits Bengali
+ 0: \x{980}\x{9e6}\x{9e7}\x{993}
+    !cde                               Common Latin Latin Latin
+ 0: !cde
+    A..B                               Latin Common Common Latin 
+ 0: A..B
+    0abc                               Ascii-digit Latin Latin Latin
+ 0: 0abc
+    1\x{0700}\x{0700}\x{0700}          Ascii-digit Syriac x 3
+ 0: 1\x{700}\x{700}\x{700}
+    \x{1A80}\x{1A80}\x{1a40}\x{1a41}   Tai Tham Hora digits, letters
+ 0: \x{1a80}\x{1a80}\x{1a40}\x{1a41}
+\= Expect no match
+    a\x{370}bcd                        Latin Greek Latin Latin
+No match
+    \x{1100}\x{02ea}\x{02ea}\x{02ea}   Hangul Bopomofo x3
+No match
+    \x{02ea}\x{02ea}\x{02ea}\x{1100}   Bopomofo x3 Hangul
+No match
+    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+No match
+    \x{0391}\x{09e6}\x{09e7}\x{03a9}   Greek Bengali digits Greek 
+No match
+    \x{0600}7\x{0669}\x{1eef1}         Arabic ascii-digit Arabic-digit Arabic
+No match
+    \x{0600}\x{0669}7\x{1eef1}         Arabic Arabic-digit ascii-digit Arabic
+No match
+    A5\x{ff19}B                        Latin Common-ascii/notascii-digits Latin 
+No match
+    \x{0300}cd\x{0391}                 Inherited Latin Latin Greek
+No match
+    !cd\x{0391}                        Common Latin Latin Greek
+No match
+    \x{1A80}\x{1A90}\x{1a40}\x{1a41}   Tai Tham Hora digit, Tham digit, letters
+No match
+    A\x{1d7ce}\x{1d7ff}B               Common fancy-common-2-sets-digits Common
+No match
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+No match
+    
+/^(*sr:.{4}|..)/utf
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+ 0: \x{2e80}\x{3105}
+
+/^(*atomic_script_run:.{4}|..)/utf
+\= Expect no match
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+No match
+
+/^(*asr:.*)/utf
+\= Expect no match
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+No match
+
+/^(?>(*sr:.*))/utf
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+ 0: \x{2e80}\x{3105}\x{2e80}
+
+/^(*sr:.*)/utf
+    \x{2e80}\x{3105}\x{2e80}\x{30a1}   Han Bopomofo Han Katakana
+ 0: \x{2e80}\x{3105}\x{2e80}
+    \x{10fffd}\x{10fffd}\x{10fffd}     Private use (Unknown)
+ 0: \x{10fffd}
+
+/^(*sr:\x{2e80}*)/utf
+    \x{2e80}\x{2e80}\x{3105}           Han Han Bopomofo
+ 0: \x{2e80}\x{2e80}
+
+/^(*sr:\x{2e80}*)\x{2e80}/utf
+    \x{2e80}\x{2e80}\x{3105}           Han Han Bopomofo
+ 0: \x{2e80}\x{2e80}
+    
+/^(*sr:.*)Test/utf
+    Test script run on an empty string
+ 0: Test
+
+/^(*sr:(.{2})){2}/utf
+    \x{0600}7\x{0669}\x{1eef1}         Arabic ascii-digit Arabic-digit Arabic
+ 0: \x{600}7\x{669}\x{1eef1}
+ 1: \x{669}\x{1eef1}
+    \x{1A80}\x{1A80}\x{1a40}\x{1a41}   Tai Tham Hora digits, letters
+ 0: \x{1a80}\x{1a80}\x{1a40}\x{1a41}
+ 1: \x{1a40}\x{1a41}
+    \x{1A80}\x{1a40}\x{1A90}\x{1a41}   Tai Tham Hora digit, letter, Tham digit, letter
+ 0: \x{1a80}\x{1a40}\x{1a90}\x{1a41}
+ 1: \x{1a90}\x{1a41}
+\= Expect no match
+    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+No match
+
+# Test loop breaking for empty string match
+
+/^(*sr:A|)*BCD/utf
+    AABCD
+ 0: AABCD
+    ABCD
+ 0: ABCD
+    BCD 
+ 0: BCD
+    
+# The use of (*ACCEPT) breaks script run checking 
+
+/^(*sr:.*(*ACCEPT)ZZ)/utf
+    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+ 0: \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
+
+# ------- 
+
 # End of testinput4
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@ -4776,4 +4776,63 @@ Failed: error 137 at offset 2: PCRE2 does not support \F, \L, \l, \N{name}, \U,
 0: abc
 MK: ABC
    
+# Script run tests: auto-possessification
+
+/^(*sr:.*)/B,utf 
+------------------------------------------------------------------
+        Bra
+        ^
+        Script run
+        Any*
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+    paypаl.com   A classic example of why script run checks are a good thing
+ 0: payp
+
+/^(*sr:\x{2e80}*)/B,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        Script run
+        \x{2e80}*+
+        Ket
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(*sr:\x{2e80}*)\x{2e80}/B,utf
+------------------------------------------------------------------
+        Bra
+        ^
+        Script run
+        \x{2e80}*
+        Ket
+        \x{2e80}
+        Ket
+        End
+------------------------------------------------------------------
+
+# Some script run patterns are broken in Perl 5.28.0. These can be moved into
+# test 4 when a mended version of Perl is released.
+
+/^(*sr:.{4})/utf
+    \x{0980}12\x{0993}     Bengali Common-digits Bengali
+ 0: \x{980}12\x{993}
+    \x{0780}12\x{07b1}     Thaana Common-digits Thaana
+ 0: \x{780}12\x{7b1}
+    \x{0e01}12\x{0e5b}     Thai Common-digits Thai
+ 0: \x{e01}12\x{e5b}
+    \x{1780}12\x{19ff}     Khmer Common-digits Khmer
+ 0: \x{1780}12\x{19ff}
+    \x{0904}12\x{0939}     Devanagari Common-digits Devanagari
+ 0: \x{904}12\x{939}
+    A\x{ff10}\x{ff19}B     Latin Common-notascii-digits Latin 
+ 0: A\x{ff10}\x{ff19}B
+    A\x{1d7ce}\x{1d7cf}B   Latin fancy-common-digits Latin
+ 0: A\x{1d7ce}\x{1d7cf}B
+
+# ------- 
+
 # End of testinput5