Auto generate unicode property tests. (#67)

Co-authored-by: Zoltan Herczeg <hzmester@freemail.hu>
2021-12-31 17:47:37 +01:00 · 2021-12-31 17:47:37 +01:00 · e7457003cd
parent d888d36013
commit e7457003cd
4 changed files with 6420 additions and 2 deletions
--- a/23
+++ b/23
@ -80,7 +80,8 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
 title23="Test 23: \C disabled test"
 title24="Test 24: Non-UTF pattern conversion tests"
 title25="Test 25: UTF pattern conversion tests"
-maxtest=25
+title26="Test 26: Auto-generated unicode property tests"
 maxtest=26
 if [ $# -eq 1 -a "$1" = "list" ]; then
  echo $title0
@ -109,6 +110,7 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
  echo $title23
  echo $title24
  echo $title25
  echo $title26
  exit 0
 fi
@ -238,6 +240,7 @@ do22=no
 do23=no
 do24=no
 do25=no
 do26=no
 while [ $# -gt 0 ] ; do
  case $1 in
@ -267,6 +270,7 @@ while [ $# -gt 0 ] ; do
   23) do23=yes;;
   24) do24=yes;;
   25) do25=yes;;
   26) do26=yes;;
   -8) arg8=yes;;
  -16) arg16=yes;;
  -32) arg32=yes;;
@ -417,7 +421,7 @@ if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
     $do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
     $do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
     $do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
-     $do24 = no -a $do25 = no \
+     $do24 = no -a $do25 = no -a $do26 = no \
   ]; then
  do0=yes
  do1=yes
@ -445,6 +449,7 @@ if [ $do0  = no -a $do1  = no -a $do2  = no -a $do3  = no -a \
  do23=yes
  do24=yes
  do25=yes
  do26=yes
 fi
 # Handle any explicit skips at this stage, so that an argument list may consist
@ -863,6 +868,20 @@ for bmode in "$test8" "$test16" "$test32"; do
    fi
  fi
  # Auto-generated unicode property tests
  if [ $do26 = yes ] ; then
    echo $title26
    if [ $utf -eq 0 ] ; then
      echo "  Skipped because UTF-$bits support is not available"
    else
      for opt in "" $jitopt; do
        $sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
        checkresult $? 26 "$opt"
      done
    fi
  fi
 # End of loop for 8/16/32-bit tests
 done
--- a/maint/GenerateTest26.py
+++ b/maint/GenerateTest26.py
@ -0,0 +1,188 @@
 #! /usr/bin/python
 #                   PCRE2 UNICODE PROPERTY SUPPORT
 #                   ------------------------------
 #
 # This file auto-generates Unicode property tests and their expected output.
 # It is recommended to re-run this generator after the Unicode data files are
 # updated. The generated files are named `testinput26` and `testoutput26`.
 import re
 import sys
 from GenerateCommon import \
  script_names, \
  script_abbrevs
 def write_both(text):
  """Append *text* to the generated test input file and to the expected
  output file, in that order.

  Uses the module-level ``input_file`` and ``output_file`` handles.
  """
  for handle in (input_file, output_file):
    handle.write(text)
 def to_string_char(ch_idx):
  """Return the textual form of code point *ch_idx* used in the test files.

  Printable ASCII (0x20-0x7e) is returned as the literal character.
  Everything else is returned as a ``\\x{...}`` hexadecimal escape; values
  below 0x10 are padded to two hex digits (``\\x{0f}``).

  DEL (0x7f) is a control character just like 0x00-0x1f, so it is escaped
  as well rather than being written to the test files as a raw byte.
  """
  if 32 <= ch_idx < 127:
    return chr(ch_idx)
  if ch_idx < 16:
    # Single hex digit: pad with a leading zero.
    return "\\x{0%x}" % ch_idx
  return "\\x{%x}" % ch_idx
 # Command line: at most one argument, naming the directory that receives the
 # generated files. Without it the files are created in the current directory.
 output_directory = ""
 argument_count = len(sys.argv) - 1
 if argument_count > 1:
  print('** Too many arguments: just give a directory name')
  sys.exit(1)
 if argument_count == 1:
  output_directory = sys.argv[1]
  # Make sure the directory ends with a separator so that the file names
  # can simply be appended to it.
  if output_directory[-1:] != "/":
    output_directory = output_directory + "/"
 # Both files are (re)created from scratch; write_both() and the generator
 # functions use these module-level handles.
 try:
  input_file = open(output_directory + "testinput26", "w")
  output_file = open(output_directory + "testoutput26", "w")
 except IOError:
  print("** Couldn't open output files")
  sys.exit(1)
 # Header shared by the input and the expected-output file.
 write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")
 # ---------------------------------------------------------------------------
 #                      UNICODE SCRIPT EXTENSION TESTS
 # ---------------------------------------------------------------------------
 write_both("# Unicode Script Extension tests.\n\n")
 def gen_script_tests():
  """Generate the script (sc) and script extension (scx) property tests.

  Reads ``Unicode.tables/Scripts.txt`` and
  ``Unicode.tables/ScriptExtensions.txt`` (paths relative to the current
  directory). For every entry of ``script_names`` except "Unknown" it writes
  pattern/subject pairs to the test input file and the same text plus the
  expected result to the expected-output file.

  A message is printed for each script that has script extension data but
  for which no extension-only character was found.
  """
  # One record per script, indexed like script_names:
  #   [0] low end of the first Scripts.txt range seen for the script
  #   [1] high end of the last Scripts.txt range seen for the script
  #   [2] lowest code point that lists the script in ScriptExtensions.txt
  #   [3] highest code point that lists the script in ScriptExtensions.txt
  #   [4] first ScriptExtensions.txt code point found for the script whose
  #       base script (per Scripts.txt) is a different one
  script_data = [None] * len(script_names)
  # Base script name of every code point; None when Scripts.txt has no entry.
  char_data = [None] * 0x110000
  # Matches "XXXX ; value #" and "XXXX..YYYY ; value #" data lines.
  property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
  prev_name = ""
  script_idx = -1
  with open("Unicode.tables/Scripts.txt") as f:
    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue
      name = match_obj.group(3)
      # Only repeat the list search when the script name changes.
      if name != prev_name:
        script_idx = script_names.index(name)
        prev_name = name
      low = int(match_obj.group(1), 16)
      high = low
      char_data[low] = name
      if match_obj.group(2) is not None:
        high = int(match_obj.group(2), 16)
        for code_point in range(low + 1, high + 1):
          char_data[code_point] = name
      if script_data[script_idx] is None:
        script_data[script_idx] = [low, None, None, None, None]
      script_data[script_idx][1] = high
  # Maps a script abbreviation already seen in ScriptExtensions.txt to its
  # index in script_abbrevs / script_data.
  extended_script_indices = {}
  with open("Unicode.tables/ScriptExtensions.txt") as f:
    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue
      low = int(match_obj.group(1), 16)
      high = low
      if match_obj.group(2) is not None:
        high = int(match_obj.group(2), 16)
      # The value field is a space separated list of script abbreviations.
      for abbrev in match_obj.group(3).split(" "):
        if abbrev not in extended_script_indices:
          idx = script_abbrevs.index(abbrev)
          extended_script_indices[abbrev] = idx
          rec = script_data[idx]
          rec[2] = low
          rec[3] = high
        else:
          idx = extended_script_indices[abbrev]
          rec = script_data[idx]
          if rec[2] > low:
            rec[2] = low
          if rec[3] < high:
            rec[3] = high
        if rec[4] is None:
          # Look for a character that has this script only as an extension.
          name = script_names[idx]
          for code_point in range(low, high + 1):
            if char_data[code_point] != name:
              rec[4] = code_point
              break

  def write_test(prop, ch_idx, expect_match):
    # Emit one test: the pattern and the subject go to both files, the
    # expected result only to the output file, then a blank separator line.
    subject = to_string_char(ch_idx)
    write_both("/^\\p{%s}/utf\n" % prop)
    write_both("  %s\n" % subject)
    if expect_match:
      output_file.write(" 0: %s\n" % subject)
    else:
      output_file.write("No match\n")
    write_both("\n")

  # Alternates between the short and the long spelling of the script
  # extension property for successive scripts that have extension data.
  long_property_name = False
  for idx, rec in enumerate(script_data):
    script_name = script_names[idx]
    if script_name == "Unknown":
      continue
    script_abbrev = script_abbrevs[idx]
    write_both("# Base script check\n")
    write_test("sc=%s" % script_name, rec[0], True)
    write_test("Script=%s" % script_abbrev, rec[1], True)
    if rec[2] is not None:
      property_name = "scx"
      if long_property_name:
        property_name = "Script_Extensions"
      write_both("# Script extension check\n")
      write_test(script_name, rec[2], True)
      write_test("%s=%s" % (property_name, script_abbrev), rec[3], True)
      long_property_name = not long_property_name
      if rec[4] is not None:
        write_both("# Script extension only character\n")
        # Matches through the extension list, but not as the base script.
        write_test(script_name, rec[4], True)
        write_test("sc=%s" % script_name, rec[4], False)
      else:
        print("External character has not found for %s" % script_name)
    # The code point just past the highest one recorded for the script is
    # used as the non-matching subject.
    high = rec[1]
    if rec[3] is not None and rec[3] > rec[1]:
      high = rec[3]
    write_both("# Character not in script\n")
    write_test(script_name, high + 1, False)
 # Emit all script / script extension test cases.
 gen_script_tests()
 # Trailer line; the same text is written to both generated files.
 write_both("# End of testinput26\n")
 # Close explicitly so that buffered data is flushed without relying on
 # interpreter shutdown.
 input_file.close()
 output_file.close()
--- a/testdata/testinput26
+++ b/testdata/testinput26
--- a/testdata/testoutput26
+++ b/testdata/testoutput26