pcre2/src/pcre2_script_run.c

/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2018 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/

/* This module contains the function for checking a script run. */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "pcre2_internal.h"


/*************************************************
*                Check script run                *
*************************************************/

/* A script run is conceptually a sequence of characters all in the same
Unicode script. However, it isn't quite that simple. There are special rules
for scripts that are commonly used together, and also special rules for digits.
This function implements the appropriate checks, which is possible only when
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
no Unicode support; however, it should never be called in that circumstance
because an error is given by pcre2_compile() if a script run is called for in a
version of PCRE2 compiled without Unicode support.

Arguments:
  ptr       points to the first character
  endptr    points after the last character
  utf       TRUE if in UTF mode

Returns:    TRUE if this is a valid script run
*/

/* Special states for the "required script" variable in the script run
checker. That variable otherwise holds the script value of the first
significant character. These values are negative, presumably so that they
cannot collide with a real script value (confirm against the ucp_ script
enumeration). */

/* No non-Inherited, non-Common character has been seen yet. */
#define SCRIPT_UNSET        (-1)
/* Only Han has been seen so far; a later character may still pick one of the
permitted Han combinations below. */
#define SCRIPT_HANPENDING   (-2)
/* Han, Hiragana, and Katakana are acceptable. */
#define SCRIPT_HANHIRAKATA  (-3)
/* Han and Bopomofo are acceptable. */
#define SCRIPT_HANBOPOMOFO  (-4)
/* Han and Hangul are acceptable. */
#define SCRIPT_HANHANGUL    (-5)

BOOL
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
{
#ifdef SUPPORT_UNICODE
int wanted_script = SCRIPT_UNSET;   /* Script state; nothing significant yet */
uint32_t wanted_digits = 0;         /* Digit set in use; 0 means none yet */
uint32_t ch;

#if PCRE2_CODE_UNIT_WIDTH == 32
(void)utf;    /* Not referenced in this configuration */
#endif

/* Strings of zero or one character are always valid script runs, so no
property lookups are needed for them. */

if (ptr >= endptr) return TRUE;
GETCHARINCTEST(ch, ptr);
if (ptr >= endptr) return TRUE;

/* There are at least two characters. Examine each in turn, leaving the loop
only when the subject is exhausted or a character breaks the run. */

for (;;)
  {
  const ucd_record *rec = GET_UCD(ch);
  uint32_t script = rec->script;

  /* A character of Unknown script can never be part of a longer run. */

  if (script == ucp_Unknown) return FALSE;

  /* Inherited characters are accepted unconditionally and are not examined
  any further. Everything else is subject to the digit check, and everything
  other than Common is also subject to the script check. */

  if (script != ucp_Inherited)
    {
    if (script != ucp_Common)
      {
      switch(wanted_script)
        {
        /* Nothing decided yet, or only Han seen so far. Han keeps the
        decision pending; Hiragana, Katakana, Bopomofo, and Hangul each select
        the matching Han combination. Any other script is fixed as the
        required one, unless Han has already been seen, in which case the run
        is invalid. */

        case SCRIPT_UNSET:
        case SCRIPT_HANPENDING:
        if (script == ucp_Han)
          {
          wanted_script = SCRIPT_HANPENDING;
          }
        else if (script == ucp_Hiragana || script == ucp_Katakana)
          {
          wanted_script = SCRIPT_HANHIRAKATA;
          }
        else if (script == ucp_Bopomofo)
          {
          wanted_script = SCRIPT_HANBOPOMOFO;
          }
        else if (script == ucp_Hangul)
          {
          wanted_script = SCRIPT_HANHANGUL;
          }
        else
          {
          if (wanted_script == SCRIPT_HANPENDING) return FALSE;
          wanted_script = script;
          }
        break;

        /* Japanese: Han, Hiragana, and Katakana may be mixed. */

        case SCRIPT_HANHIRAKATA:
        if (script != ucp_Han && script != ucp_Hiragana &&
            script != ucp_Katakana)
          return FALSE;
        break;

        /* Han may be mixed with Bopomofo. */

        case SCRIPT_HANBOPOMOFO:
        if (script != ucp_Han && script != ucp_Bopomofo) return FALSE;
        break;

        /* Han may be mixed with Hangul. */

        case SCRIPT_HANHANGUL:
        if (script != ucp_Han && script != ucp_Hangul) return FALSE;
        break;

        /* An ordinary script has been fixed; the character must match it. */

        default:
        if (script != (unsigned int)wanted_script) return FALSE;
        break;
        }
      }

    /* The script is acceptable. Now make sure that every decimal digit in
    the string belongs to one and the same set of ten digits. The first
    element of PRIV(ucd_digit_sets)[] is the number of entries that follow;
    those entries are the code points of the '9' of each digit set, in
    ascending order, and a set is identified by the index of its '9'. ASCII
    digits are caught by a quick test against entry 1; for anything else the
    vector is searched by binary chop. */

    if (rec->chartype == ucp_Nd)
      {
      uint32_t set = 1;

      if (ch > PRIV(ucd_digit_sets)[1])
        {
        int lo = 1;
        int hi = PRIV(ucd_digit_sets)[0];

        /* Narrow (lo, hi] until the two ends are adjacent (or closer, which
        should not happen); hi then indexes the set containing ch. */

        while (hi > lo + 1)
          {
          int mid = (hi + lo) / 2;
          if (ch <= PRIV(ucd_digit_sets)[mid]) hi = mid; else lo = mid;
          }

        set = hi;
        }

      /* Reject a digit from a different set from the one already recorded;
      otherwise record (or re-record) this set. */

      if (wanted_digits != 0 && wanted_digits != set) return FALSE;
      wanted_digits = set;
      }
    }

  /* Stop when the subject is used up; otherwise fetch the next character. */

  if (ptr >= endptr) break;
  GETCHARINCTEST(ch, ptr);
  }

#else   /* NOT SUPPORT_UNICODE */
(void)ptr;
(void)endptr;
(void)utf;
#endif  /* SUPPORT_UNICODE */

/* Reached when every character has been accepted, or when there is no
Unicode support. */

return TRUE;
}

/* End of pcre2_script_run.c */
Basic "script run" implementation. Not yet complete, and not yet documented. 2018-10-02 17:25:58 +02:00			`/*************************************************`
			`* Perl-Compatible Regular Expressions *`
			`*************************************************/`

			`/* PCRE is a library of functions to support regular expressions whose syntax`
			`and semantics are as close as possible to those of the Perl 5 language.`

			`Written by Philip Hazel`
			`Original API code Copyright (c) 1997-2012 University of Cambridge`
			`New API code Copyright (c) 2016-2018 University of Cambridge`

			`-----------------------------------------------------------------------------`
			`Redistribution and use in source and binary forms, with or without`
			`modification, are permitted provided that the following conditions are met:`

			`* Redistributions of source code must retain the above copyright notice,`
			`this list of conditions and the following disclaimer.`

			`* Redistributions in binary form must reproduce the above copyright`
			`notice, this list of conditions and the following disclaimer in the`
			`documentation and/or other materials provided with the distribution.`

			`* Neither the name of the University of Cambridge nor the names of its`
			`contributors may be used to endorse or promote products derived from`
			`this software without specific prior written permission.`

			`THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE`
			`LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`POSSIBILITY OF SUCH DAMAGE.`
			`-----------------------------------------------------------------------------`
			`*/`

			`/* This module contains the function for checking a script run. */`

			`#ifdef HAVE_CONFIG_H`
			`#include "config.h"`
			`#endif`

			`#include "pcre2_internal.h"`


			`/*************************************************`
			`* Check script run *`
			`*************************************************/`

			`/* A script run is conceptually a sequence of characters all in the same`
			`Unicode script. However, it isn't quite that simple. There are special rules`
			`for scripts that are commonly used together, and also special rules for digits.`
			`This function implements the appropriate checks, which is possible only when`
			`PCRE2 is compiled with Unicode support. The function returns TRUE if there is`
			`no Unicode support; however, it should never be called in that circumstance`
			`because an error is given by pcre2_compile() if a script run is called for in a`
			`version of PCRE2 compiled without Unicode support.`

			`Arguments:`
			`pgr point to the first character`
			`endptr point after the last character`
			`utf TRUE if in UTF mode`

			`Returns: TRUE if this is a valid script run`
			`*/`

			`#define SCRIPT_UNSET (-1)`
			`#define SCRIPT_HANPENDING (-2)`
			`#define SCRIPT_HANHIRAKATA (-3)`
			`#define SCRIPT_HANBOPOMOFO (-4)`
			`#define SCRIPT_HANHANGUL (-5)`

			`BOOL`
			`PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)`
			`{`
			`#ifdef SUPPORT_UNICODE`
			`int require_script = SCRIPT_UNSET;`
			`uint32_t require_digitset = 0;`
			`uint32_t c;`

			`#if PCRE2_CODE_UNIT_WIDTH == 32`
			`(void)utf; /* Avoid compiler warning */`
			`#endif`

			`/* Any string containing fewer than 2 characters is a valid script run. */`

			`if (ptr >= endptr) return TRUE;`
			`GETCHARINCTEST(c, ptr);`
			`if (ptr >= endptr) return TRUE;`

			`/* Scan strings of two or more characters, checking the Unicode characteristics`
			`of each code point. */`

			`for (;;)`
			`{`
			`const ucd_record *ucd = GET_UCD(c);`
			`uint32_t script = ucd->script;`

			`/* If the script is Unknown, the string is not a valid script run. Such`
			`characters can only form script runs of length one. */`

			`if (script == ucp_Unknown) return FALSE;`

			`/* A character whose script is Inherited is always accepted, and plays no`
			`further part. A character whose script is Common is always accepted, but must`
			`still be tested for a digit below. Otherwise, the character must match the`
			`script of the first non-Inherited, non-Common character encountered. For most`
			`scripts, the test is for the same script. However, the Han Chinese script may`
			`be used in conjunction with four other scripts in these combinations:`

			`. Han with Hiragana and Katakana is allowed (for Japanese).`

			`. Han with Bopomofo is allowed (for Taiwanese Mandarin).`

			`. Han with Hangul is allowed (for Korean).`

			`If the first significant character's script is one of the four, the required`
			`script type is immediately known. However, if the first significant`
			`character's script is Han, we have to keep checking for a non-Han character.`
			`Hence the SCRIPT_HANPENDING state. */`

			`if (script != ucp_Inherited)`
			`{`
			`if (script != ucp_Common) switch(require_script)`
			`{`
			`default:`
			`if (script != (unsigned int)require_script) return FALSE;`
			`break;`

			`case SCRIPT_UNSET:`
			`case SCRIPT_HANPENDING:`
			`switch(script)`
			`{`
			`case ucp_Han:`
			`require_script = SCRIPT_HANPENDING;`
			`break;`

			`case ucp_Hiragana:`
			`case ucp_Katakana:`
			`require_script = SCRIPT_HANHIRAKATA;`
			`break;`

			`case ucp_Bopomofo:`
			`require_script = SCRIPT_HANBOPOMOFO;`
			`break;`

			`case ucp_Hangul:`
			`require_script = SCRIPT_HANHANGUL;`
			`break;`

			`default:`
			`if (require_script == SCRIPT_HANPENDING) return FALSE;`
			`require_script = script;`
			`break;`
			`}`
			`break;`

			`case SCRIPT_HANHIRAKATA:`
			`if (script != ucp_Han && script != ucp_Hiragana && script != ucp_Katakana)`
			`return FALSE;`
			`break;`

			`case SCRIPT_HANBOPOMOFO:`
			`if (script != ucp_Han && script != ucp_Bopomofo) return FALSE;`
			`break;`

			`case SCRIPT_HANHANGUL:`
			`if (script != ucp_Han && script != ucp_Hangul) return FALSE;`
			`break;`
			`}`

			`/* The character is in an acceptable script. We must now ensure that all`
			`decimal digits in the string come from the same set. Some scripts (e.g.`
			`Common, Arabic) have more than one set of decimal digits. This code does`
			`not allow mixing sets, even within the same script. The vector called`
			`PRIV(ucd_digit_sets)[] contains, in its first element, the number of`
			`following elements, and then, in ascending order, the code points of the`
			`'9' characters in every set of 10 digits. Each set is identified by the`
			`offset in the vector of its '9' character. An initial check of the first`
			`value picks up ASCII digits quickly. Otherwise, a binary chop is used. */`

			`if (ucd->chartype == ucp_Nd)`
			`{`
			`uint32_t digitset;`

			`if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else`
			`{`
			`int mid;`
			`int bot = 1;`
			`int top = PRIV(ucd_digit_sets)[0];`
			`for (;;)`
			`{`
			`if (top <= bot + 1) /* <= rather than == is paranoia */`
			`{`
			`digitset = top;`
			`break;`
			`}`
			`mid = (top + bot) / 2;`
			`if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;`
			`}`
			`}`

			`/* A required value of 0 means "unset". */`

			`if (require_digitset == 0) require_digitset = digitset;`
			`else if (digitset != require_digitset) return FALSE;`
			`} /* End digit handling */`
			`} /* End checking non-Inherited character */`

			`/* If we haven't yet got to the end, pick up the next character. */`

			`if (ptr >= endptr) return TRUE;`
			`GETCHARINCTEST(c, ptr);`
			`} /* End checking loop */`

			`#else /* NOT SUPPORT_UNICODE */`
			`(void)ptr;`
			`(void)endptr;`
			`(void)utf;`
			`return TRUE;`
			`#endif /* SUPPORT_UNICODE */`
			`}`

			`/* End of pcre2_script_run.c */`