Auto generate unicode property tests. (#67)
Co-authored-by: Zoltan Herczeg <hzmester@freemail.hu>
This commit is contained in:
parent
d888d36013
commit
e7457003cd
23
RunTest
23
RunTest
|
@ -80,7 +80,8 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
|
||||||
title23="Test 23: \C disabled test"
|
title23="Test 23: \C disabled test"
|
||||||
title24="Test 24: Non-UTF pattern conversion tests"
|
title24="Test 24: Non-UTF pattern conversion tests"
|
||||||
title25="Test 25: UTF pattern conversion tests"
|
title25="Test 25: UTF pattern conversion tests"
|
||||||
maxtest=25
|
title26="Test 26: Auto-generated unicode property tests"
|
||||||
|
maxtest=26
|
||||||
|
|
||||||
if [ $# -eq 1 -a "$1" = "list" ]; then
|
if [ $# -eq 1 -a "$1" = "list" ]; then
|
||||||
echo $title0
|
echo $title0
|
||||||
|
@ -109,6 +110,7 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
|
||||||
echo $title23
|
echo $title23
|
||||||
echo $title24
|
echo $title24
|
||||||
echo $title25
|
echo $title25
|
||||||
|
echo $title26
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -238,6 +240,7 @@ do22=no
|
||||||
do23=no
|
do23=no
|
||||||
do24=no
|
do24=no
|
||||||
do25=no
|
do25=no
|
||||||
|
do26=no
|
||||||
|
|
||||||
while [ $# -gt 0 ] ; do
|
while [ $# -gt 0 ] ; do
|
||||||
case $1 in
|
case $1 in
|
||||||
|
@ -267,6 +270,7 @@ while [ $# -gt 0 ] ; do
|
||||||
23) do23=yes;;
|
23) do23=yes;;
|
||||||
24) do24=yes;;
|
24) do24=yes;;
|
||||||
25) do25=yes;;
|
25) do25=yes;;
|
||||||
|
26) do26=yes;;
|
||||||
-8) arg8=yes;;
|
-8) arg8=yes;;
|
||||||
-16) arg16=yes;;
|
-16) arg16=yes;;
|
||||||
-32) arg32=yes;;
|
-32) arg32=yes;;
|
||||||
|
@ -417,7 +421,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
||||||
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
|
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
|
||||||
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
|
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
|
||||||
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
|
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
|
||||||
$do24 = no -a $do25 = no \
|
$do24 = no -a $do25 = no -a $do26 = no \
|
||||||
]; then
|
]; then
|
||||||
do0=yes
|
do0=yes
|
||||||
do1=yes
|
do1=yes
|
||||||
|
@ -445,6 +449,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
||||||
do23=yes
|
do23=yes
|
||||||
do24=yes
|
do24=yes
|
||||||
do25=yes
|
do25=yes
|
||||||
|
do26=yes
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Handle any explicit skips at this stage, so that an argument list may consist
|
# Handle any explicit skips at this stage, so that an argument list may consist
|
||||||
|
@ -863,6 +868,20 @@ for bmode in "$test8" "$test16" "$test32"; do
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Auto-generated unicode property tests
|
||||||
|
|
||||||
|
if [ $do26 = yes ] ; then
|
||||||
|
echo $title26
|
||||||
|
if [ $utf -eq 0 ] ; then
|
||||||
|
echo " Skipped because UTF-$bits support is not available"
|
||||||
|
else
|
||||||
|
for opt in "" $jitopt; do
|
||||||
|
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
|
||||||
|
checkresult $? 26 "$opt"
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
# End of loop for 8/16/32-bit tests
|
# End of loop for 8/16/32-bit tests
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,188 @@
|
||||||
|
#! /usr/bin/python
|
||||||
|
|
||||||
|
# PCRE2 UNICODE PROPERTY SUPPORT
|
||||||
|
# ------------------------------
|
||||||
|
#
|
||||||
|
# This file auto-generates unicode property tests and their expected output.
|
||||||
|
# It is recommended to re-run this generator after the unicode files are
|
||||||
|
# updated. The names of the generated files are `testinput26` and `testoutput26`.
|
||||||
|
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from GenerateCommon import \
|
||||||
|
script_names, \
|
||||||
|
script_abbrevs
|
||||||
|
|
||||||
|
def write_both(text):
    """Append *text* to the generated test input and expected-output files.

    The module-level ``input_file`` is written first, then ``output_file``.
    """
    for stream in (input_file, output_file):
        stream.write(text)
|
||||||
|
|
||||||
|
def to_string_char(ch_idx):
    """Return the text used for code point *ch_idx* in the test files.

    Code points 32..127 are returned as the character itself. Everything
    else is returned as a ``\\x{...}`` hexadecimal escape, with values
    below 16 padded to two digits.
    """
    if 32 <= ch_idx < 128:
        return chr(ch_idx)
    if ch_idx < 16:
        # A single hex digit: add the leading zero by hand.
        return "\\x{0%x}" % ch_idx
    return "\\x{%x}" % ch_idx
|
||||||
|
|
||||||
|
# Command line: at most one argument, the directory that receives the
# generated files. Without it the files go to the current directory.
if len(sys.argv) > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)

output_directory = sys.argv[1] if len(sys.argv) == 2 else ""
if len(sys.argv) == 2 and not output_directory.endswith("/"):
    # The file names are appended by plain concatenation below.
    output_directory += "/"

try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print("** Couldn't open output files")
    sys.exit(1)

# Header line shared by both generated files.
write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# UNICODE SCRIPT EXTENSION TESTS
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
write_both("# Unicode Script Extension tests.\n\n")
|
||||||
|
|
||||||
|
def _write_case(pattern, ch_idx, matches):
    """Emit one ``\\p{...}`` test case.

    The pattern line, the subject line and the trailing blank line go to both
    generated files; the expected result (`` 0: <char>`` when *matches* is
    true, ``No match`` otherwise) goes to the output file only.
    """
    subject = to_string_char(ch_idx)
    write_both("/^\\p{%s}/utf\n" % pattern)
    write_both(" %s\n" % subject)
    if matches:
        output_file.write(" 0: %s\n" % subject)
    else:
        output_file.write("No match\n")
    write_both("\n")


def gen_script_tests():
    """Generate the Unicode script and script-extension property tests.

    Reads ``Unicode.tables/Scripts.txt`` and
    ``Unicode.tables/ScriptExtensions.txt`` (paths relative to the current
    directory) and writes, for every script except "Unknown", a group of
    test cases to the generated input and output files.

    Raises:
        ValueError: if a data file names a script or abbreviation that is
            missing from ``script_names`` / ``script_abbrevs``.
    """
    # One record per script, indexed like script_names:
    #   [0] start of the first range read for the script from Scripts.txt
    #   [1] end of the last range read for the script from Scripts.txt
    #   [2] lowest code point listing the script in ScriptExtensions.txt
    #   [3] highest code point listing the script in ScriptExtensions.txt
    #   [4] first code point found in an extension range whose own script
    #       (per Scripts.txt) is a different one
    script_data = [None] * len(script_names)
    # Script name assigned to each code point by Scripts.txt.
    char_data = [None] * 0x110000

    property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            name = match_obj.group(3)
            if name != prev_name:
                # Only look the index up again when the script changes.
                script_idx = script_names.index(name)
                prev_name = name

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            for code_point in range(low, high + 1):
                char_data[code_point] = name

            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            script_data[script_idx][1] = high

    # Abbreviation -> script index, for abbreviations already encountered.
    extended_script_indicies = {}

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            for abbrev in match_obj.group(3).split(" "):
                if abbrev not in extended_script_indicies:
                    idx = script_abbrevs.index(abbrev)
                    extended_script_indicies[abbrev] = idx
                    rec = script_data[idx]
                    rec[2] = low
                    rec[3] = high
                else:
                    idx = extended_script_indicies[abbrev]
                    rec = script_data[idx]
                    # Widen the recorded extension span to cover this range.
                    rec[2] = min(rec[2], low)
                    rec[3] = max(rec[3], high)

                if rec[4] is None:
                    # Look for a character that carries this script only as
                    # an extension, i.e. whose base script differs.
                    name = script_names[idx]
                    for code_point in range(low, high + 1):
                        if char_data[code_point] != name:
                            rec[4] = code_point
                            break

    # Alternates between the short and long property spelling so that both
    # appear in the generated tests.
    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]
        if script_name == "Unknown":
            continue

        script_abbrev = script_abbrevs[idx]

        write_both("# Base script check\n")
        _write_case("sc=%s" % script_name, rec[0], True)
        _write_case("Script=%s" % script_abbrev, rec[1], True)

        if rec[2] is not None:
            property_name = "Script_Extensions" if long_property_name else "scx"

            write_both("# Script extension check\n")
            _write_case(script_name, rec[2], True)
            _write_case("%s=%s" % (property_name, script_abbrev), rec[3], True)

            long_property_name = not long_property_name

            # NOTE(review): this branch is assumed to sit inside the
            # extension check above (the nesting was reconstructed from a
            # flattened listing) - confirm against the repository copy.
            if rec[4] is not None:
                write_both("# Script extension only character\n")
                _write_case(script_name, rec[4], True)
                _write_case("sc=%s" % script_name, rec[4], False)
            else:
                print("External character has not found for %s" % script_name)

        # The code point just past everything recorded for the script.
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]
        write_both("# Character not in script\n")
        _write_case(script_name, high + 1, False)
|
||||||
|
|
||||||
|
|
||||||
|
gen_script_tests()

write_both("# End of testinput26\n")

# Close both files explicitly so their contents are flushed to disk here
# rather than whenever the interpreter happens to finalize the objects.
input_file.close()
output_file.close()
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue