Update Script Run code to use the Script Extension property instead of the

Script property.
2018-10-09 16:42:21 +00:00 · 2018-10-09 16:42:21 +00:00 · 4e7a204d18
parent 83726c359d
commit 4e7a204d18
6 changed files with 398 additions and 80 deletions
--- a/2
+++ b/2
@ -32,7 +32,7 @@ src/pcre2_chartables.c.dist are updated.
 8. Implement the new Perl "script run" features (*script_run:...) and 
 (*atomic_script_run:...) aka (*sr:...) and (*asr:...). At present, this is 
-incomplete and not yet documented.
+not yet documented.
 Version 10.32 10-September-2018
--- a/src/pcre2_script_run.c
+++ b/src/pcre2_script_run.c
@ -68,17 +68,26 @@ Arguments:
 Returns:    TRUE if this is a valid script run
 */
-#define SCRIPT_UNSET        (-1)
+/* These dummy values must be less than the negation of the largest offset in
-#define SCRIPT_HANPENDING   (-2)
+the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD
-#define SCRIPT_HANHIRAKATA  (-3)
+records (and is only likely to be a few hundred). */
-#define SCRIPT_HANBOPOMOFO  (-4)
+
-#define SCRIPT_HANHANGUL    (-5)
+#define SCRIPT_UNSET        (-99999)
 #define SCRIPT_HANPENDING   (-99998)
 #define SCRIPT_HANHIRAKATA  (-99997)
 #define SCRIPT_HANBOPOMOFO  (-99996)
 #define SCRIPT_HANHANGUL    (-99995)
 #define SCRIPT_LIST         (-99994)
 #define INTERSECTION_LIST_SIZE 50
 BOOL
 PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
 {
 #ifdef SUPPORT_UNICODE
 int require_script = SCRIPT_UNSET;
 uint8_t intersection_list[INTERSECTION_LIST_SIZE];
 const uint8_t *require_list = NULL;
 uint32_t require_digitset = 0;
 uint32_t c;
@ -93,86 +102,290 @@ GETCHARINCTEST(c, ptr);
 if (ptr >= endptr) return TRUE;
 /* Scan strings of two or more characters, checking the Unicode characteristics
-of each code point. */
+of each code point. We make use of the Script Extensions property. There is
 special code for scripts that can be combined with characters from the Han
 Chinese script. This may be used in conjunction with four other scripts in
 these combinations:
 . Han with Hiragana and Katakana is allowed (for Japanese).
 . Han with Bopomofo is allowed (for Taiwanese Mandarin).
 . Han with Hangul is allowed (for Korean).
 If the first significant character's script is one of the four, the required
 script type is immediately known. However, if the first significant
 character's script is Han, we have to keep checking for a non-Han character.
 Hence the SCRIPT_HANPENDING state. */
 for (;;)
  {
  const ucd_record *ucd = GET_UCD(c);
-  uint32_t script = ucd->script;
+  int32_t scriptx = ucd->scriptx;
-  /* If the script is Unknown, the string is not a valid script run. Such
+  /* If the script extension is Unknown, the string is not a valid script run.
-  characters can only form script runs of length one. */
+  Such characters can only form script runs of length one. */
  if (script == ucp_Unknown) return FALSE; 
-  /* A character whose script is Inherited is always accepted, and plays no
+  if (scriptx == ucp_Unknown) return FALSE;
  further part. A character whose script is Common is always accepted, but must
  still be tested for a digit below. Otherwise, the character must match the
  script of the first non-Inherited, non-Common character encountered. For most
  scripts, the test is for the same script. However, the Han Chinese script may
  be used in conjunction with four other scripts in these combinations:
-  . Han with Hiragana and Katakana is allowed (for Japanese).
+  /* A character whose script extension is Inherited is always accepted with
  any script, and plays no further part in this testing. A character whose
  script is Common is always accepted, but must still be tested for a digit
  below. The scriptx value at this point is non-zero, because zero is
  ucp_Unknown, tested for above. */
-  . Han with Bopomofo is allowed (for Taiwanese Mandarin).
+  if (scriptx != ucp_Inherited)
-
+    {
-  . Han with Hangul is allowed (for Korean).
+    if (scriptx != ucp_Common)
  If the first significant character's script is one of the four, the required
  script type is immediately known. However, if the first significant
  character's script is Han, we have to keep checking for a non-Han character.
  Hence the SCRIPT_HANPENDING state. */
  if (script != ucp_Inherited)
    { 
    if (script != ucp_Common) switch(require_script)
      {
-      default:
+      /* If the script extension value is positive, the character is not a mark
-      if (script != (unsigned int)require_script) return FALSE;
+      that can be used with many scripts. In the simple case we either set or
-      break;
+      compare with the required script. However, handling the scripts that can
-    
+      combine with Han is more complicated, as is the case when the previous
-      case SCRIPT_UNSET:
+      characters have been many-script marks. */
-      case SCRIPT_HANPENDING:
+
-      switch(script)
+      if (scriptx > 0)
        {
-        case ucp_Han:
+        switch(require_script)
-        require_script = SCRIPT_HANPENDING;
+          {
-        break;
+          /* Either the first significant character (require_script unset) or
-    
+          after only Han characters. */
-        case ucp_Hiragana:
+
-        case ucp_Katakana:
+          case SCRIPT_UNSET:
-        require_script = SCRIPT_HANHIRAKATA;
+          case SCRIPT_HANPENDING:
-        break;
+          switch(scriptx)
-    
+            {
-        case ucp_Bopomofo:
+            case ucp_Han:
-        require_script = SCRIPT_HANBOPOMOFO;
+            require_script = SCRIPT_HANPENDING;
-        break;
+            break;
-    
+
-        case ucp_Hangul:
+            case ucp_Hiragana:
-        require_script = SCRIPT_HANHANGUL;
+            case ucp_Katakana:
-        break;
+            require_script = SCRIPT_HANHIRAKATA;
-    
+            break;
-        default:
+
-        if (require_script == SCRIPT_HANPENDING) return FALSE;
+            case ucp_Bopomofo:
-        require_script = script;
+            require_script = SCRIPT_HANBOPOMOFO;
-        break;
+            break;
-        }
+
-      break;
+            case ucp_Hangul:
-    
+            require_script = SCRIPT_HANHANGUL;
-      case SCRIPT_HANHIRAKATA:
+            break;
-      if (script != ucp_Han && script != ucp_Hiragana && script != ucp_Katakana)
+
-        return FALSE;
+            /* Not a Han-related script. If expecting one, fail. Otherwise set
-      break;
+            the requirement to this script. */
-    
+
-      case SCRIPT_HANBOPOMOFO:
+            default:
-      if (script != ucp_Han && script != ucp_Bopomofo) return FALSE;
+            if (require_script == SCRIPT_HANPENDING) return FALSE;
-      break;
+            require_script = scriptx;
-    
+            break;
-      case SCRIPT_HANHANGUL:
+            }
-      if (script != ucp_Han && script != ucp_Hangul) return FALSE;
+          break;
-      break;
+
-      }
+          /* Previously encountered one of the "with Han" scripts. Check that
-    
+          this character is appropriate. */
          case SCRIPT_HANHIRAKATA:
          if (scriptx != ucp_Han && scriptx != ucp_Hiragana && 
              scriptx != ucp_Katakana)
            return FALSE;
          break;
          case SCRIPT_HANBOPOMOFO:
          if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
          break;
          case SCRIPT_HANHANGUL:
          if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
          break;
          /* We have a list of scripts to check that is derived from one or
          more previous characters. This is either one of the lists in
          ucd_script_sets[] (for one previous character) or the intersection of
          several lists for multiple characters. */
          case SCRIPT_LIST:
            {
            const uint8_t *list;
            for (list = require_list; *list != 0; list++)
              {
              if (*list == scriptx) break;
              }
            if (*list == 0) return FALSE;
            }
          /* The rest of the string must be in this script, but we have to 
          allow for the Han complications. */
          switch(scriptx)
            {
            case ucp_Han:
            require_script = SCRIPT_HANPENDING;
            break;
            case ucp_Hiragana:
            case ucp_Katakana:
            require_script = SCRIPT_HANHIRAKATA;
            break;
            case ucp_Bopomofo:
            require_script = SCRIPT_HANBOPOMOFO;
            break;
            case ucp_Hangul:
            require_script = SCRIPT_HANHANGUL;
            break;
            default:
            require_script = scriptx;
            break;
            }  
          break;
          /* This is the easy case when a single script is required. */
          default:
          if (scriptx != require_script) return FALSE;
          break;
          }
         }  /* End of handling positive scriptx */
      /* If scriptx is negative, this character is a mark-type character that
      has a list of permitted scripts. */
      else
        {
        uint32_t chspecial;
        const uint8_t *clist, *rlist;
        const uint8_t *list = PRIV(ucd_script_sets) - scriptx;
        switch(require_script)
          {
          case SCRIPT_UNSET:
          require_list = PRIV(ucd_script_sets) - scriptx;
          require_script = SCRIPT_LIST;
          break;
          /* An inspection of the Unicode 11.0.0 files shows that there are the
          following types of Script Extension list that involve the Han,
          Bopomofo, Hiragana, Katakana, and Hangul scripts:
          . Bopomofo + Han
          . Han + Hiragana + Katakana
          . Hiragana + Katakana
           . Bopomofo + Hangul + Han + Hiragana + Katakana
          The following code tries to make sense of this. */
 #define FOUND_BOPOMOFO 1
 #define FOUND_HIRAGANA 2
 #define FOUND_KATAKANA 4
 #define FOUND_HANGUL   8
          case SCRIPT_HANPENDING:
          chspecial = 0;
          for (; *list != 0; list++)
            {
            switch (*list)
              {
              case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break;
              case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break;
              case ucp_Katakana: chspecial |= FOUND_KATAKANA; break;
              case ucp_Hangul:   chspecial |= FOUND_HANGUL; break;
              default: break;
              }
            }
           if (chspecial == 0) return FALSE;
           if (chspecial == FOUND_BOPOMOFO)
             {
             require_script = SCRIPT_HANBOPOMOFO;
             }
           else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
             {
             require_script = SCRIPT_HANHIRAKATA;
             }
          /* Otherwise it must be allowed with all of them, so remain in
          the pending state. */
          break;
          case SCRIPT_HANHIRAKATA:
          for (; *list != 0; list++)
            {
            if (*list == ucp_Hiragana || *list == ucp_Katakana) break;
            }
          if (*list == 0) return FALSE;
          break;
          case SCRIPT_HANBOPOMOFO:
          for (; *list != 0; list++)
            {
            if (*list == ucp_Bopomofo) break;
            }
          if (*list == 0) return FALSE;
          break;
          case SCRIPT_HANHANGUL:
          for (; *list != 0; list++)
            {
            if (*list == ucp_Hangul) break;
            }
          if (*list == 0) return FALSE;
          break;
          /* Previously encountered one or more characters that are allowed
          with a list of scripts. Build the intersection of the required list
          with this character's list in intersection_list[]. This code is
          written so that it still works OK if the required list is already in
          that vector. */
          case SCRIPT_LIST:
            {
            int i = 0;
            for (rlist = require_list; *rlist != 0; rlist++)
              {
              for (clist = list; *clist != 0; clist++)
                {
                if (*rlist == *clist)
                  {
                  intersection_list[i++] = *rlist;
                  break;
                  }
                }
              }
            if (i == 0) return FALSE;  /* No scripts in common */
            /* If there's just one script in common, we can set it as the
            unique required script. Otherwise, terminate the intersection list
            and make it the required list. */
            if (i == 1)
              {
              require_script = intersection_list[0];
              }
            else
              {
              intersection_list[i] = 0;
              require_list = intersection_list;
              }
            }
          break;
          /* The previously set required script is a single script, not
          Han-related. Check that it is in this character's list. */
          default:
          for (; *list != 0; list++)
            {
            if (*list == require_script) break;
            }
          if (*list == 0) return FALSE;
          break;
          }
        }  /* End of handling negative scriptx */
      }    /* End of checking non-Common character */
    /* The character is in an acceptable script. We must now ensure that all
    decimal digits in the string come from the same set. Some scripts (e.g.
    Common, Arabic) have more than one set of decimal digits. This code does
@ -182,11 +395,11 @@ for (;;)
    '9' characters in every set of 10 digits. Each set is identified by the
    offset in the vector of its '9' character. An initial check of the first
    value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
-    
+
    if (ucd->chartype == ucp_Nd)
      {
      uint32_t digitset;
-        
+
      if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
        {
        int mid;
@ -203,9 +416,9 @@ for (;;)
          if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
          }
        }
-    
+
      /* A required value of 0 means "unset". */
-    
+
      if (require_digitset == 0) require_digitset = digitset;
        else if (digitset != require_digitset) return FALSE;
      }   /* End digit handling */
--- a/testdata/testinput4
+++ b/testdata/testinput4
@ -2394,6 +2394,22 @@
    \x{1A80}\x{1a40}\x{1A90}\x{1a41}   Tai Tham Hora digit, letter, Tham digit, letter
 \= Expect no match
    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
 /^(*sr:\S*)/utf
    \x{1cf4}\x{20f0}\x{900}\x{11305}   [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Gran
    \x{1cf4}\x{20f0}\x{11305}\x{900}   [Dev,Gran,Kan] [Dev,Gran,Lat] Gran Dev
    \x{1cf4}\x{20f0}\x{900}ABC         [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Lat
    \x{1cf4}\x{20f0}ABC                [Dev,Gran,Kan] [Dev,Gran,Lat] Lat
    \x{20f0}ABC                        [Dev,Gran,Lat] Lat
    XYZ\x{20f0}ABC                     Lat [Dev,Gran,Lat] Lat
    \x{a36}\x{a33}\x{900}              [Dev,...] [Dev,...] Dev  
    \x{3001}\x{2e80}\x{3041}\x{30a1}   [Bopo, Han, etc] Han Hira Kata
    \x{3001}\x{30a1}\x{2e80}\x{3041}   [Bopo, Han, etc] Kata Han Hira
    \x{3001}\x{3105}\x{2e80}\x{1101}   [Bopo, Han, etc] Bopomofo Han Hangul
    \x{3105}\x{3001}\x{2e80}\x{1101}   Bopomofo [Bopo, Han, etc] Han Hangul
    \x{3031}\x{3041}\x{30a1}\x{2e80}   [Hira Kata] Hira Kata Han
    \x{060c}\x{06d4}\x{0600}\x{10d00}\x{0700}  [Arab Rohg Syrc Thaa] [Arab Rohg] Arab Rohg Syrc
    \x{060c}\x{06d4}\x{0700}\x{0600}\x{10d00}  [Arab Rohg Syrc Thaa] [Arab Rohg] Syrc Arab Rohg
 /(?<!)(*sr:)/
@ -2405,6 +2421,17 @@
 /(?<=abc(?=X(*sr:BXY)CCC)XBXYCCC)./
   abcXBXYCCC!
 /^(*sr:\S*)/utf
    \x{10d00}\x{10d00}\x{06d4}     Rohingya Rohingya Arabic-full-stop
    \x{06d4}\x{10d00}\x{10d00}     Arabic-full-stop Rohingya Rohingya
    \x{10d00}\x{10d00}\x{0363}     Rohingya Rohingya Inherited-extend-Latin
    \x{0363}\x{10d00}\x{10d00}     Inherited-extend-Latin Rohingya Rohingya
    AB\x{0363}                     Latin Latin Inherited-extend-Latin
    \x{0363}AB                     Inherited-extend-Latin Latin Latin
    AB\x{1cf7}                     Latin Latin Common-extended-Beng
    \x{1cf7}AB                     Common-extend-Beng Latin Latin
    \x{1cf7}\x{0993}               Common-extend-Beng Bengali
 # Test loop breaking for empty string match
 /^(*sr:A|)*BCD/utf
--- a/testdata/testinput5
+++ b/testdata/testinput5
@ -2132,6 +2132,17 @@
    \x{0904}12\x{0939}     Devanagari Common-digits Devanagari
    A\x{ff10}\x{ff19}B     Latin Common-notascii-digits Latin 
    A\x{1d7ce}\x{1d7cf}B   Latin fancy-common-digits Latin
 # These ones involve non-ASCII but nevertheless Common digits. As of October
 # 2018 even blead Perl wasn't handling all of these - but is going to. 
 /^(*sr:.{4})/utf
    A\x{ff10}\x{ff19}B     Latin Common-notascii-digits Latin 
    \x{ff10}\x{ff19}..     Common-notascii-digits Common Common
    A\x{ff10}BC            Latin Common-notascii-digit Latin Latin
    A\x{1d7ce}\x{1d7cf}B   Latin fancy-common-digits Latin
    \x{1d7ce}\x{1d7cf},,   fancy-common-digits Common Common
    A\x{1d7ce}BC           Latin fancy-common-digit Latin Latin
 # ------- 
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@ -3872,6 +3872,36 @@ No match
 \= Expect no match
    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
 No match
 /^(*sr:\S*)/utf
    \x{1cf4}\x{20f0}\x{900}\x{11305}   [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Gran
 0: \x{1cf4}\x{20f0}\x{900}
    \x{1cf4}\x{20f0}\x{11305}\x{900}   [Dev,Gran,Kan] [Dev,Gran,Lat] Gran Dev
 0: \x{1cf4}\x{20f0}\x{11305}
    \x{1cf4}\x{20f0}\x{900}ABC         [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Lat
 0: \x{1cf4}\x{20f0}\x{900}
    \x{1cf4}\x{20f0}ABC                [Dev,Gran,Kan] [Dev,Gran,Lat] Lat
 0: \x{1cf4}\x{20f0}
    \x{20f0}ABC                        [Dev,Gran,Lat] Lat
 0: \x{20f0}ABC
    XYZ\x{20f0}ABC                     Lat [Dev,Gran,Lat] Lat
 0: XYZ\x{20f0}ABC
    \x{a36}\x{a33}\x{900}              [Dev,...] [Dev,...] Dev  
 0: \x{a36}\x{a33}
    \x{3001}\x{2e80}\x{3041}\x{30a1}   [Bopo, Han, etc] Han Hira Kata
 0: \x{3001}\x{2e80}\x{3041}\x{30a1}
    \x{3001}\x{30a1}\x{2e80}\x{3041}   [Bopo, Han, etc] Kata Han Hira
 0: \x{3001}\x{30a1}\x{2e80}\x{3041}
    \x{3001}\x{3105}\x{2e80}\x{1101}   [Bopo, Han, etc] Bopomofo Han Hangul
 0: \x{3001}\x{3105}\x{2e80}
    \x{3105}\x{3001}\x{2e80}\x{1101}   Bopomofo [Bopo, Han, etc] Han Hangul
 0: \x{3105}\x{3001}\x{2e80}
    \x{3031}\x{3041}\x{30a1}\x{2e80}   [Hira Kata] Hira Kata Han
 0: \x{3031}\x{3041}\x{30a1}\x{2e80}
    \x{060c}\x{06d4}\x{0600}\x{10d00}\x{0700}  [Arab Rohg Syrc Thaa] [Arab Rohg] Arab Rohg Syrc
 0: \x{60c}\x{6d4}\x{600}
    \x{060c}\x{06d4}\x{0700}\x{0600}\x{10d00}  [Arab Rohg Syrc Thaa] [Arab Rohg] Syrc Arab Rohg
 0: \x{60c}\x{6d4}
 /(?<!)(*sr:)/
@ -3885,6 +3915,26 @@ No match
   abcXBXYCCC!
 0: !
 /^(*sr:\S*)/utf
    \x{10d00}\x{10d00}\x{06d4}     Rohingya Rohingya Arabic-full-stop
 0: \x{10d00}\x{10d00}\x{6d4}
    \x{06d4}\x{10d00}\x{10d00}     Arabic-full-stop Rohingya Rohingya
 0: \x{6d4}\x{10d00}\x{10d00}
    \x{10d00}\x{10d00}\x{0363}     Rohingya Rohingya Inherited-extend-Latin
 0: \x{10d00}\x{10d00}
    \x{0363}\x{10d00}\x{10d00}     Inherited-extend-Latin Rohingya Rohingya
 0: \x{363}
    AB\x{0363}                     Latin Latin Inherited-extend-Latin
 0: AB\x{363}
    \x{0363}AB                     Inherited-extend-Latin Latin Latin
 0: \x{363}AB
    AB\x{1cf7}                     Latin Latin Common-extended-Beng
 0: AB
    \x{1cf7}AB                     Common-extend-Beng Latin Latin
 0: \x{1cf7}
    \x{1cf7}\x{0993}               Common-extend-Beng Bengali
 0: \x{1cf7}\x{993}
 # Test loop breaking for empty string match
 /^(*sr:A|)*BCD/utf
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@ -4865,6 +4865,23 @@ MK: ABC
 0: A\x{ff10}\x{ff19}B
    A\x{1d7ce}\x{1d7cf}B   Latin fancy-common-digits Latin
 0: A\x{1d7ce}\x{1d7cf}B
 # These ones involve non-ASCII but nevertheless Common digits. As of October
 # 2018 even blead Perl wasn't handling all of these - but is going to. 
 /^(*sr:.{4})/utf
    A\x{ff10}\x{ff19}B     Latin Common-notascii-digits Latin 
 0: A\x{ff10}\x{ff19}B
    \x{ff10}\x{ff19}..     Common-notascii-digits Common Common
 0: \x{ff10}\x{ff19}..
    A\x{ff10}BC            Latin Common-notascii-digit Latin Latin
 0: A\x{ff10}BC
    A\x{1d7ce}\x{1d7cf}B   Latin fancy-common-digits Latin
 0: A\x{1d7ce}\x{1d7cf}B
    \x{1d7ce}\x{1d7cf},,   fancy-common-digits Common Common
 0: \x{1d7ce}\x{1d7cf},,
    A\x{1d7ce}BC           Latin fancy-common-digit Latin Latin
 0: A\x{1d7ce}BC
 # -------