pcre2/maint/GenerateTest26.py

189 lines
5.4 KiB
Python
Executable File

#! /usr/bin/python
# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
#
# This script auto-generates Unicode property tests and their expected output.
# It is recommended to re-run this generator whenever the Unicode data files
# are updated. The generated files are named `testinput26` and `testoutput26`.
import re
import sys
from GenerateCommon import \
script_names, \
script_abbrevs
def write_both(text):
    """Write *text* to the test input file and to the expected output file.

    Both targets are the module-level file objects ``input_file`` and
    ``output_file``; the input file is written first.
    """
    for stream in (input_file, output_file):
        stream.write(text)
def to_string_char(ch_idx):
    """Return the text that represents code point *ch_idx* in the test files.

    Printable ASCII characters (0x20 to 0x7e) are returned as themselves.
    Every other code point is returned as a ``\\x{...}`` escape holding its
    lower-case hexadecimal value, zero-padded to at least two digits
    (for example ``\\x{0a}``, ``\\x{1f}``, ``\\x{10ffff}``).

    The same text is written both as the subject line and as the expected
    match, so it has to be a form that is printable in a text file.
    """
    # DEL (0x7f) is a control character, not a printable one, so the upper
    # bound is 127 rather than 128: it is escaped like every other
    # non-printing code point instead of being written out raw.
    if 32 <= ch_idx < 127:
        return chr(ch_idx)
    # "%02x" reproduces the leading zero for values below 16 and leaves
    # longer values untouched.
    return "\\x{%02x}" % ch_idx
# The only accepted command-line argument is an optional directory in which
# the generated files are created; without it the current directory is used.
if len(sys.argv) > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)

output_directory = sys.argv[1] if len(sys.argv) == 2 else ""
# The file names are appended by plain string concatenation below, so a
# directory given on the command line must end with a slash.
if len(sys.argv) == 2 and not output_directory.endswith("/"):
    output_directory += "/"

# Both files stay open for the rest of the script; write_both() and the
# generator functions write to them through these module-level names.
try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print("** Couldn't open output files")
    sys.exit(1)

write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")
def gen_script_tests():
    """Generate the script and script-extension tests for every known script.

    Reads ``Unicode.tables/Scripts.txt`` and
    ``Unicode.tables/ScriptExtensions.txt`` (paths relative to the current
    directory), selects a few representative code points for each entry of
    ``script_names`` and writes the resulting patterns, subject lines and
    expected results through ``write_both`` and ``output_file``.

    The script named "Unknown" produces no tests. A script that has script
    extensions but no character belonging to it through its extensions only
    is reported on standard output.

    Raises:
        ValueError: if a data file names a script or abbreviation that is
            missing from ``script_names`` / ``script_abbrevs``.
    """
    # One record per entry of script_names (None until the script is seen):
    #   [0] first code point listed for the script in Scripts.txt
    #   [1] end of the last range listed for the script in Scripts.txt
    #   [2] lowest code point listing the script in ScriptExtensions.txt
    #   [3] highest code point listing the script in ScriptExtensions.txt
    #   [4] first code point found that lists the script in
    #       ScriptExtensions.txt while its Scripts.txt entry differs
    script_data = [None] * len(script_names)
    # Script name from Scripts.txt for every code point (None if unlisted).
    char_data = [None] * 0x110000

    # Matches "XXXX ; value #" and "XXXX..YYYY ; value #" data lines; the
    # value may consist of several space separated words.
    property_re = re.compile(r"^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")

    def parse_range(match_obj):
        """Return the (low, high) code points of a matched data line."""
        low = int(match_obj.group(1), 16)
        if match_obj.group(2) is None:
            return low, low
        return low, int(match_obj.group(2), 16)

    def write_test(prop, ch_idx, expect_match):
        """Emit one test: pattern, subject, expected result, blank line."""
        subject = to_string_char(ch_idx)
        write_both("/^\\p{%s}/utf\n" % prop)
        write_both(" %s\n" % subject)
        # The expected result is only part of the output file.
        if expect_match:
            output_file.write(" 0: %s\n" % subject)
        else:
            output_file.write("No match\n")
        write_both("\n")

    # Consecutive lines usually belong to the same script, so the index
    # lookup is only repeated when the name changes.
    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            name = match_obj.group(3)
            if name != prev_name:
                script_idx = script_names.index(name)
                prev_name = name

            low, high = parse_range(match_obj)
            for code_point in range(low, high + 1):
                char_data[code_point] = name

            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            # Overwritten on every line, so the last range of the script wins.
            script_data[script_idx][1] = high

    # Maps each abbreviation already seen in ScriptExtensions.txt to its index.
    extended_script_indices = {}

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            low, high = parse_range(match_obj)

            for abbrev in match_obj.group(3).split(" "):
                if abbrev not in extended_script_indices:
                    script_idx = script_abbrevs.index(abbrev)
                    extended_script_indices[abbrev] = script_idx
                    rec = script_data[script_idx]
                    rec[2] = low
                    rec[3] = high
                else:
                    script_idx = extended_script_indices[abbrev]
                    rec = script_data[script_idx]
                    if rec[2] > low:
                        rec[2] = low
                    if rec[3] < high:
                        rec[3] = high

                if rec[4] is None:
                    # Look for a code point of this range whose Scripts.txt
                    # entry is some other script.
                    name = script_names[script_idx]
                    for code_point in range(low, high + 1):
                        if char_data[code_point] != name:
                            rec[4] = code_point
                            break

    # Alternates between the short and the long spelling of the script
    # extension property so that both get exercised.
    long_property_name = False

    for script_idx, rec in enumerate(script_data):
        script_name = script_names[script_idx]
        if script_name == "Unknown":
            continue

        script_abbrev = script_abbrevs[script_idx]

        write_both("# Base script check\n")
        write_test("sc=%s" % script_name, rec[0], True)
        write_test("Script=%s" % script_abbrev, rec[1], True)

        if rec[2] is not None:
            property_name = "scx"
            if long_property_name:
                property_name = "Script_Extensions"

            write_both("# Script extension check\n")
            write_test(script_name, rec[2], True)
            write_test("%s=%s" % (property_name, script_abbrev), rec[3], True)

            long_property_name = not long_property_name

            if rec[4] is not None:
                # Expected to match the bare script name but not "sc=".
                write_both("# Script extension only character\n")
                write_test(script_name, rec[4], True)
                write_test("sc=%s" % script_name, rec[4], False)
            else:
                print("External character has not found for %s" % script_name)

        # The code point just past everything recorded for the script.
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]
        write_both("# Character not in script\n")
        write_test(script_name, high + 1, False)
gen_script_tests()

write_both("# End of testinput26\n")

# Close both generated files explicitly so that buffered data is flushed
# here instead of relying on interpreter shutdown to do it.
input_file.close()
output_file.close()