Update Script Run code to use the Script Extension property instead of the
Script property.
2018-10-09 16:42:21 +00:00 · 2018-10-09 16:42:21 +00:00 · 4e7a204d18
parent 83726c359d
commit 4e7a204d18
6 changed files with 398 additions and 80 deletions
--- a/2
+++ b/2
@ -32,7 +32,7 @@ src/pcre2_chartables.c.dist are updated.

 8. Implement the new Perl "script run" features (*script_run:...) and 
 (*atomic_script_run:...) aka (*sr:...) and (*asr:...). At present, this is 
-incomplete and not yet documented.
+not yet documented.


 Version 10.32 10-September-2018
--- a/src/pcre2_script_run.c
+++ b/src/pcre2_script_run.c
@ -68,17 +68,26 @@ Arguments:
 Returns:    TRUE if this is a valid script run
 */

-#define SCRIPT_UNSET        (-1)
-#define SCRIPT_HANPENDING   (-2)
-#define SCRIPT_HANHIRAKATA  (-3)
-#define SCRIPT_HANBOPOMOFO  (-4)
-#define SCRIPT_HANHANGUL    (-5)
+/* These dummy values must be less than the negation of the largest offset in
+the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD
+records (and is only likely to be a few hundred). */
+
+#define SCRIPT_UNSET        (-99999)
+#define SCRIPT_HANPENDING   (-99998)
+#define SCRIPT_HANHIRAKATA  (-99997)
+#define SCRIPT_HANBOPOMOFO  (-99996)
+#define SCRIPT_HANHANGUL    (-99995)
+#define SCRIPT_LIST         (-99994)
+
+#define INTERSECTION_LIST_SIZE 50

 BOOL
 PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
 {
 #ifdef SUPPORT_UNICODE
 int require_script = SCRIPT_UNSET;
+uint8_t intersection_list[INTERSECTION_LIST_SIZE];
+const uint8_t *require_list = NULL;
 uint32_t require_digitset = 0;
 uint32_t c;

@ -93,85 +102,289 @@ GETCHARINCTEST(c, ptr);
 if (ptr >= endptr) return TRUE;

 /* Scan strings of two or more characters, checking the Unicode characteristics
-of each code point. */
+of each code point. We make use of the Script Extensions property. There is
+special code for scripts that can be combined with characters from the Han
+Chinese script. This may be used in conjunction with four other scripts in
+these combinations:
+
+. Han with Hiragana and Katakana is allowed (for Japanese).
+. Han with Bopomofo is allowed (for Taiwanese Mandarin).
+. Han with Hangul is allowed (for Korean).
+
+If the first significant character's script is one of the four, the required
+script type is immediately known. However, if the first significant
+character's script is Han, we have to keep checking for a non-Han character.
+Hence the SCRIPT_HANPENDING state. */

 for (;;)
  {
  const ucd_record *ucd = GET_UCD(c);
-  uint32_t script = ucd->script;
+  int32_t scriptx = ucd->scriptx;

-  /* If the script is Unknown, the string is not a valid script run. Such
-  characters can only form script runs of length one. */
+  /* If the script extension is Unknown, the string is not a valid script run.
+  Such characters can only form script runs of length one. */

-  if (script == ucp_Unknown) return FALSE; 
+  if (scriptx == ucp_Unknown) return FALSE;

-  /* A character whose script is Inherited is always accepted, and plays no
-  further part. A character whose script is Common is always accepted, but must
-  still be tested for a digit below. Otherwise, the character must match the
-  script of the first non-Inherited, non-Common character encountered. For most
-  scripts, the test is for the same script. However, the Han Chinese script may
-  be used in conjunction with four other scripts in these combinations:
+  /* A character whose script extension is Inherited is always accepted with
+  any script, and plays no further part in this testing. A character whose
+  script is Common is always accepted, but must still be tested for a digit
+  below. The scriptx value at this point is non-zero, because zero is
+  ucp_Unknown, tested for above. */

-  . Han with Hiragana and Katakana is allowed (for Japanese).
-
-  . Han with Bopomofo is allowed (for Taiwanese Mandarin).
-
-  . Han with Hangul is allowed (for Korean).
-
-  If the first significant character's script is one of the four, the required
-  script type is immediately known. However, if the first significant
-  character's script is Han, we have to keep checking for a non-Han character.
-  Hence the SCRIPT_HANPENDING state. */
- 
-  if (script != ucp_Inherited)
+  if (scriptx != ucp_Inherited)
    {
-    if (script != ucp_Common) switch(require_script)
+    if (scriptx != ucp_Common)
      {
-      default:
-      if (script != (unsigned int)require_script) return FALSE;
-      break;
+      /* If the script extension value is positive, the character is not a mark
+      that can be used with many scripts. In the simple case we either set or
+      compare with the required script. However, handling the scripts that can
+      combine with Han is more complicated, as is the case when the previous
+      characters have been many-script marks. */

-      case SCRIPT_UNSET:
-      case SCRIPT_HANPENDING:
-      switch(script)
+      if (scriptx > 0)
        {
-        case ucp_Han:
-        require_script = SCRIPT_HANPENDING;
-        break;
+        switch(require_script)
+          {
+          /* Either the first significant character (require_script unset) or
+          after only Han characters. */

-        case ucp_Hiragana:
-        case ucp_Katakana:
-        require_script = SCRIPT_HANHIRAKATA;
-        break;
+          case SCRIPT_UNSET:
+          case SCRIPT_HANPENDING:
+          switch(scriptx)
+            {
+            case ucp_Han:
+            require_script = SCRIPT_HANPENDING;
+            break;

-        case ucp_Bopomofo:
-        require_script = SCRIPT_HANBOPOMOFO;
-        break;
+            case ucp_Hiragana:
+            case ucp_Katakana:
+            require_script = SCRIPT_HANHIRAKATA;
+            break;

-        case ucp_Hangul:
-        require_script = SCRIPT_HANHANGUL;
-        break;
+            case ucp_Bopomofo:
+            require_script = SCRIPT_HANBOPOMOFO;
+            break;

-        default:
-        if (require_script == SCRIPT_HANPENDING) return FALSE;
-        require_script = script;
-        break;
-        }
-      break;
+            case ucp_Hangul:
+            require_script = SCRIPT_HANHANGUL;
+            break;

-      case SCRIPT_HANHIRAKATA:
-      if (script != ucp_Han && script != ucp_Hiragana && script != ucp_Katakana)
-        return FALSE;
-      break;
+            /* Not a Han-related script. If expecting one, fail. Otherwise set
+            the requirement to this script. */

-      case SCRIPT_HANBOPOMOFO:
-      if (script != ucp_Han && script != ucp_Bopomofo) return FALSE;
-      break;
+            default:
+            if (require_script == SCRIPT_HANPENDING) return FALSE;
+            require_script = scriptx;
+            break;
+            }
+          break;

-      case SCRIPT_HANHANGUL:
-      if (script != ucp_Han && script != ucp_Hangul) return FALSE;
-      break;
-      }
+          /* Previously encountered one of the "with Han" scripts. Check that
+          this character is appropriate. */
+
+          case SCRIPT_HANHIRAKATA:
+          if (scriptx != ucp_Han && scriptx != ucp_Hiragana && 
+              scriptx != ucp_Katakana)
+            return FALSE;
+          break;
+
+          case SCRIPT_HANBOPOMOFO:
+          if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
+          break;
+
+          case SCRIPT_HANHANGUL:
+          if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
+          break;
+
+          /* We have a list of scripts to check that is derived from one or
+          more previous characters. This is either one of the lists in
+          ucd_script_sets[] (for one previous character) or the intersection of
+          several lists for multiple characters. */
+
+          case SCRIPT_LIST:
+            {
+            const uint8_t *list;
+            for (list = require_list; *list != 0; list++)
+              {
+              if (*list == scriptx) break;
+              }
+            if (*list == 0) return FALSE;
+            }
+
+          /* The rest of the string must be in this script, but we have to 
+          allow for the Han complications. */
+          
+          switch(scriptx)
+            {
+            case ucp_Han:
+            require_script = SCRIPT_HANPENDING;
+            break;
+
+            case ucp_Hiragana:
+            case ucp_Katakana:
+            require_script = SCRIPT_HANHIRAKATA;
+            break;
+
+            case ucp_Bopomofo:
+            require_script = SCRIPT_HANBOPOMOFO;
+            break;
+
+            case ucp_Hangul:
+            require_script = SCRIPT_HANHANGUL;
+            break;
+
+            default:
+            require_script = scriptx;
+            break;
+            }  
+          break;
+
+          /* This is the easy case when a single script is required. */
+
+          default:
+          if (scriptx != require_script) return FALSE;
+          break;
+          }
+        }  /* End of handling positive scriptx */
+
+      /* If scriptx is negative, this character is a mark-type character that
+      has a list of permitted scripts. */
+
+      else
+        {
+        uint32_t chspecial;
+        const uint8_t *clist, *rlist;
+        const uint8_t *list = PRIV(ucd_script_sets) - scriptx;
+        
+        switch(require_script)
+          {
+          case SCRIPT_UNSET:
+          require_list = PRIV(ucd_script_sets) - scriptx;
+          require_script = SCRIPT_LIST;
+          break;
+
+          /* An inspection of the Unicode 11.0.0 files shows that there are the
+          following types of Script Extension list that involve the Han,
+          Bopomofo, Hiragana, Katakana, and Hangul scripts:
+
+          . Bopomofo + Han
+          . Han + Hiragana + Katakana
+          . Hiragana + Katakana
+          . Bopomofo + Hangul + Han + Hiragana + Katakana
+
+          The following code tries to make sense of this. */
+
+#define FOUND_BOPOMOFO 1
+#define FOUND_HIRAGANA 2
+#define FOUND_KATAKANA 4
+#define FOUND_HANGUL   8
+
+          case SCRIPT_HANPENDING:
+          chspecial = 0;
+          for (; *list != 0; list++)
+            {
+            switch (*list)
+              {
+              case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break;
+              case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break;
+              case ucp_Katakana: chspecial |= FOUND_KATAKANA; break;
+              case ucp_Hangul:   chspecial |= FOUND_HANGUL; break;
+              default: break;
+              }
+            }
+
+           if (chspecial == 0) return FALSE;
+
+           if (chspecial == FOUND_BOPOMOFO)
+             {
+             require_script = SCRIPT_HANBOPOMOFO;
+             }
+           else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
+             {
+             require_script = SCRIPT_HANHIRAKATA;
+             }
+
+          /* Otherwise it must be allowed with all of them, so remain in
+          the pending state. */
+
+          break;
+
+          case SCRIPT_HANHIRAKATA:
+          for (; *list != 0; list++)
+            {
+            if (*list == ucp_Hiragana || *list == ucp_Katakana) break;
+            }
+          if (*list == 0) return FALSE;
+          break;
+
+          case SCRIPT_HANBOPOMOFO:
+          for (; *list != 0; list++)
+            {
+            if (*list == ucp_Bopomofo) break;
+            }
+          if (*list == 0) return FALSE;
+          break;
+
+          case SCRIPT_HANHANGUL:
+          for (; *list != 0; list++)
+            {
+            if (*list == ucp_Hangul) break;
+            }
+          if (*list == 0) return FALSE;
+          break;
+
+          /* Previously encountered one or more characters that are allowed
+          with a list of scripts. Build the intersection of the required list
+          with this character's list in intersection_list[]. This code is
+          written so that it still works OK if the required list is already in
+          that vector. */
+
+          case SCRIPT_LIST:
+            {
+            int i = 0;
+            for (rlist = require_list; *rlist != 0; rlist++)
+              {
+              for (clist = list; *clist != 0; clist++)
+                {
+                if (*rlist == *clist)
+                  {
+                  intersection_list[i++] = *rlist;
+                  break;
+                  }
+                }
+              }
+            if (i == 0) return FALSE;  /* No scripts in common */
+
+            /* If there's just one script in common, we can set it as the
+            unique required script. Otherwise, terminate the intersection list
+            and make it the required list. */
+
+            if (i == 1)
+              {
+              require_script = intersection_list[0];
+              }
+            else
+              {
+              intersection_list[i] = 0;
+              require_list = intersection_list;
+              }
+            }
+          break;
+
+          /* The previously set required script is a single script, not
+          Han-related. Check that it is in this character's list. */
+
+          default:
+          for (; *list != 0; list++)
+            {
+            if (*list == require_script) break;
+            }
+          if (*list == 0) return FALSE;
+          break;
+          }
+        }  /* End of handling negative scriptx */
+      }    /* End of checking non-Common character */

    /* The character is in an acceptable script. We must now ensure that all
    decimal digits in the string come from the same set. Some scripts (e.g.
--- a/testdata/testinput4
+++ b/testdata/testinput4
@ -2395,6 +2395,22 @@
 \= Expect no match
    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
    
+/^(*sr:\S*)/utf
+    \x{1cf4}\x{20f0}\x{900}\x{11305}   [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Gran
+    \x{1cf4}\x{20f0}\x{11305}\x{900}   [Dev,Gran,Kan] [Dev,Gran,Lat] Gran Dev
+    \x{1cf4}\x{20f0}\x{900}ABC         [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Lat
+    \x{1cf4}\x{20f0}ABC                [Dev,Gran,Kan] [Dev,Gran,Lat] Lat
+    \x{20f0}ABC                        [Dev,Gran,Lat] Lat
+    XYZ\x{20f0}ABC                     Lat [Dev,Gran,Lat] Lat
+    \x{a36}\x{a33}\x{900}              [Dev,...] [Dev,...] Dev  
+    \x{3001}\x{2e80}\x{3041}\x{30a1}   [Bopo, Han, etc] Han Hira Kata
+    \x{3001}\x{30a1}\x{2e80}\x{3041}   [Bopo, Han, etc] Kata Han Hira
+    \x{3001}\x{3105}\x{2e80}\x{1101}   [Bopo, Han, etc] Bopomofo Han Hangul
+    \x{3105}\x{3001}\x{2e80}\x{1101}   Bopomofo [Bopo, Han, etc] Han Hangul
+    \x{3031}\x{3041}\x{30a1}\x{2e80}   [Hira Kata] Hira Kata Han
+    \x{060c}\x{06d4}\x{0600}\x{10d00}\x{0700}  [Arab Rohg Syrc Thaa] [Arab Rohg] Arab Rohg Syrc
+    \x{060c}\x{06d4}\x{0700}\x{0600}\x{10d00}  [Arab Rohg Syrc Thaa] [Arab Rohg] Syrc Arab Rohg
+
 /(?<!)(*sr:)/

 /(?<!X(*sr:B)C)/
@ -2405,6 +2421,17 @@
 /(?<=abc(?=X(*sr:BXY)CCC)XBXYCCC)./
   abcXBXYCCC!

+/^(*sr:\S*)/utf
+    \x{10d00}\x{10d00}\x{06d4}     Rohingya Rohingya Arabic-full-stop
+    \x{06d4}\x{10d00}\x{10d00}     Arabic-full-stop Rohingya Rohingya
+    \x{10d00}\x{10d00}\x{0363}     Rohingya Rohingya Inherited-extend-Latin
+    \x{0363}\x{10d00}\x{10d00}     Inherited-extend-Latin Rohingya Rohingya
+    AB\x{0363}                     Latin Latin Inherited-extend-Latin
+    \x{0363}AB                     Inherited-extend-Latin Latin Latin
+    AB\x{1cf7}                     Latin Latin Common-extended-Beng
+    \x{1cf7}AB                     Common-extend-Beng Latin Latin
+    \x{1cf7}\x{0993}               Common-extend-Beng Bengali
+    
 # Test loop breaking for empty string match

 /^(*sr:A|)*BCD/utf
--- a/testdata/testinput5
+++ b/testdata/testinput5
@ -2133,6 +2133,17 @@
    A\x{ff10}\x{ff19}B     Latin Common-notascii-digits Latin 
    A\x{1d7ce}\x{1d7cf}B   Latin fancy-common-digits Latin
    
+# These ones involve non-ASCII but nevertheless Common digits. As of October
+# 2018 even blead Perl wasn't handling all of these - but is going to. 
+
+/^(*sr:.{4})/utf
+    A\x{ff10}\x{ff19}B     Latin Common-notascii-digits Latin 
+    \x{ff10}\x{ff19}..     Common-notascii-digits Common Common
+    A\x{ff10}BC            Latin Common-notascii-digit Latin Latin
+    A\x{1d7ce}\x{1d7cf}B   Latin fancy-common-digits Latin
+    \x{1d7ce}\x{1d7cf},,   fancy-common-digits Common Common
+    A\x{1d7ce}BC           Latin fancy-common-digit Latin Latin
+
 # ------- 

 # End of testinput5
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@ -3873,6 +3873,36 @@ No match
    \x{1100}\x{2e80}\x{3041}\x{1101}   Hangul Han Hiragana Hangul
 No match
    
+/^(*sr:\S*)/utf
+    \x{1cf4}\x{20f0}\x{900}\x{11305}   [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Gran
+ 0: \x{1cf4}\x{20f0}\x{900}
+    \x{1cf4}\x{20f0}\x{11305}\x{900}   [Dev,Gran,Kan] [Dev,Gran,Lat] Gran Dev
+ 0: \x{1cf4}\x{20f0}\x{11305}
+    \x{1cf4}\x{20f0}\x{900}ABC         [Dev,Gran,Kan] [Dev,Gran,Lat] Dev Lat
+ 0: \x{1cf4}\x{20f0}\x{900}
+    \x{1cf4}\x{20f0}ABC                [Dev,Gran,Kan] [Dev,Gran,Lat] Lat
+ 0: \x{1cf4}\x{20f0}
+    \x{20f0}ABC                        [Dev,Gran,Lat] Lat
+ 0: \x{20f0}ABC
+    XYZ\x{20f0}ABC                     Lat [Dev,Gran,Lat] Lat
+ 0: XYZ\x{20f0}ABC
+    \x{a36}\x{a33}\x{900}              [Dev,...] [Dev,...] Dev  
+ 0: \x{a36}\x{a33}
+    \x{3001}\x{2e80}\x{3041}\x{30a1}   [Bopo, Han, etc] Han Hira Kata
+ 0: \x{3001}\x{2e80}\x{3041}\x{30a1}
+    \x{3001}\x{30a1}\x{2e80}\x{3041}   [Bopo, Han, etc] Kata Han Hira
+ 0: \x{3001}\x{30a1}\x{2e80}\x{3041}
+    \x{3001}\x{3105}\x{2e80}\x{1101}   [Bopo, Han, etc] Bopomofo Han Hangul
+ 0: \x{3001}\x{3105}\x{2e80}
+    \x{3105}\x{3001}\x{2e80}\x{1101}   Bopomofo [Bopo, Han, etc] Han Hangul
+ 0: \x{3105}\x{3001}\x{2e80}
+    \x{3031}\x{3041}\x{30a1}\x{2e80}   [Hira Kata] Hira Kata Han
+ 0: \x{3031}\x{3041}\x{30a1}\x{2e80}
+    \x{060c}\x{06d4}\x{0600}\x{10d00}\x{0700}  [Arab Rohg Syrc Thaa] [Arab Rohg] Arab Rohg Syrc
+ 0: \x{60c}\x{6d4}\x{600}
+    \x{060c}\x{06d4}\x{0700}\x{0600}\x{10d00}  [Arab Rohg Syrc Thaa] [Arab Rohg] Syrc Arab Rohg
+ 0: \x{60c}\x{6d4}
+
 /(?<!)(*sr:)/

 /(?<!X(*sr:B)C)/
@ -3885,6 +3915,26 @@ No match
   abcXBXYCCC!
 0: !

+/^(*sr:\S*)/utf
+    \x{10d00}\x{10d00}\x{06d4}     Rohingya Rohingya Arabic-full-stop
+ 0: \x{10d00}\x{10d00}\x{6d4}
+    \x{06d4}\x{10d00}\x{10d00}     Arabic-full-stop Rohingya Rohingya
+ 0: \x{6d4}\x{10d00}\x{10d00}
+    \x{10d00}\x{10d00}\x{0363}     Rohingya Rohingya Inherited-extend-Latin
+ 0: \x{10d00}\x{10d00}
+    \x{0363}\x{10d00}\x{10d00}     Inherited-extend-Latin Rohingya Rohingya
+ 0: \x{363}
+    AB\x{0363}                     Latin Latin Inherited-extend-Latin
+ 0: AB\x{363}
+    \x{0363}AB                     Inherited-extend-Latin Latin Latin
+ 0: \x{363}AB
+    AB\x{1cf7}                     Latin Latin Common-extended-Beng
+ 0: AB
+    \x{1cf7}AB                     Common-extend-Beng Latin Latin
+ 0: \x{1cf7}
+    \x{1cf7}\x{0993}               Common-extend-Beng Bengali
+ 0: \x{1cf7}\x{993}
+    
 # Test loop breaking for empty string match

 /^(*sr:A|)*BCD/utf
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@ -4866,6 +4866,23 @@ MK: ABC
    A\x{1d7ce}\x{1d7cf}B   Latin fancy-common-digits Latin
 0: A\x{1d7ce}\x{1d7cf}B
    
+# These ones involve non-ASCII but nevertheless Common digits. As of October
+# 2018 even blead Perl wasn't handling all of these - but is going to. 
+
+/^(*sr:.{4})/utf
+    A\x{ff10}\x{ff19}B     Latin Common-notascii-digits Latin 
+ 0: A\x{ff10}\x{ff19}B
+    \x{ff10}\x{ff19}..     Common-notascii-digits Common Common
+ 0: \x{ff10}\x{ff19}..
+    A\x{ff10}BC            Latin Common-notascii-digit Latin Latin
+ 0: A\x{ff10}BC
+    A\x{1d7ce}\x{1d7cf}B   Latin fancy-common-digits Latin
+ 0: A\x{1d7ce}\x{1d7cf}B
+    \x{1d7ce}\x{1d7cf},,   fancy-common-digits Common Common
+ 0: \x{1d7ce}\x{1d7cf},,
+    A\x{1d7ce}BC           Latin fancy-common-digit Latin Latin
+ 0: A\x{1d7ce}BC
+
 # ------- 

 # End of testinput5