fontconfig/fc-case/fc-case.py

#!/usr/bin/env python3
#
# fontconfig/fc-case/fc-case.py
#
# Copyright © 2004 Keith Packard
# Copyright © 2019 Tim-Philipp Müller
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation, and that the name of the author(s) not be used in
# advertising or publicity pertaining to distribution of the software without
# specific, written prior permission.  The authors make no
# representations about the suitability of this software for any purpose.  It
# is provided "as is" without express or implied warranty.
#
# THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
# EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.

from enum import Enum
import argparse
import string
import sys

class CaseFoldClass(Enum):
    """Status of a mapping read from the case folding data file.

    Each member corresponds to one of the single-letter status codes
    translated by ``caseFoldClassMap`` ('C', 'F', 'S', 'T').  Only COMMON
    and FULL entries are turned into fold table rows by the script; SIMPLE
    and TURKIC entries are parsed but otherwise skipped.
    """
    COMMON = 1
    FULL = 2
    SIMPLE = 3
    TURKIC = 4

class CaseFoldMethod(Enum):
    """How one row of the generated fold table maps characters.

    RANGE:    ``count`` consecutive code points starting at ``upper`` all
              fold by adding the same ``offset``.
    EVEN_ODD: alternating code points starting at ``upper`` each fold to
              the next code point (offset of exactly 1).
    FULL:     ``upper`` folds to several characters; ``offset`` indexes the
              UTF-8 bytes stored in the fold-chars table and ``count`` is
              the number of bytes.

    NOTE(review): the numeric values presumably mirror the FC_CASE_FOLD_*
    constants on the C side -- confirm before renumbering.
    """
    RANGE = 0
    EVEN_ODD = 1
    FULL = 2

# Maps the status column of a case folding data line (second ';'-separated
# field) to its CaseFoldClass member.
caseFoldClassMap = {
  'C' : CaseFoldClass.COMMON,
  'F' : CaseFoldClass.FULL,
  'S' : CaseFoldClass.SIMPLE,
  'T' : CaseFoldClass.TURKIC
}

# Fold table rows accumulated by the main script.  Each row is a dict with
# the keys 'upper', 'method', 'count' and 'offset'.
folds = []

def ucs4_to_utf8(ucs4):
    """Encode a single code point as a list of UTF-8 byte values.

    The historical 1- to 6-byte UTF-8 scheme is used, so every value below
    0x80000000 can be encoded.  Values of 0x80000000 and above cannot be
    represented and yield an empty list.

    ucs4 -- integer code point to encode.

    Returns a list of integers, one per encoded byte, lead byte first.
    """
    # Single-byte (ASCII) form: the value is its own encoding.
    if ucs4 < 0x80:
        return [ucs4]

    # One row per multi-byte form:
    #   (exclusive upper bound, lead-byte marker, lead-byte payload mask,
    #    number of continuation bytes)
    layouts = (
        (0x800,      0xC0, 0x1F, 1),
        (0x10000,    0xE0, 0x0F, 2),
        (0x200000,   0xF0, 0x07, 3),
        (0x4000000,  0xF8, 0x03, 4),
        (0x80000000, 0xFC, 0x01, 5),
    )

    for limit, marker, payload_mask, trailing in layouts:
        if ucs4 >= limit:
            continue
        # The lead byte carries the bits above all continuation payloads.
        lead = ((ucs4 >> (6 * trailing)) & payload_mask) | marker
        # Each continuation byte carries six bits, most significant first.
        shifts = range(6 * (trailing - 1), -1, -6)
        return [lead] + [((ucs4 >> shift) & 0x3F) | 0x80 for shift in shifts]

    return []

def utf8_size(ucs4):
    """Return how many bytes ucs4_to_utf8() produces for code point ucs4.

    The result is 0 when the value cannot be encoded, because
    ucs4_to_utf8() returns an empty list in that case.
    """
    encoded = ucs4_to_utf8(ucs4)
    return len(encoded)

# C identifier emitted for each fold method in the generated table.  The
# trailing comma is part of the string on purpose: the row formatter pads
# this field to a fixed width ('{:22s}'), so the comma must come before the
# padding rather than after it.
case_fold_method_name_map = {
    CaseFoldMethod.RANGE: 'FC_CASE_FOLD_RANGE,',
    CaseFoldMethod.EVEN_ODD: 'FC_CASE_FOLD_EVEN_ODD,',
    CaseFoldMethod.FULL: 'FC_CASE_FOLD_FULL,',
}

def offset_to_short(offset):
    """Wrap an integer offset into the signed 16-bit range.

    The offset column of the generated fold table is written as a signed
    16-bit quantity (NOTE(review): presumably a C ``short`` in FcCaseFold --
    confirm against the C header).  Values outside -32768..32767 are reduced
    modulo 65536, exactly as storing them in a two's-complement 16-bit
    integer would; values already in range are returned unchanged.
    """
    return ((offset + 0x8000) & 0xFFFF) - 0x8000


# Script entry point: parse the case folding data file, build the fold
# tables, and write them into the template at its '@@@' marker line.
if __name__=='__main__':
    # Local import: only the script entry point needs it.
    import contextlib

    parser = argparse.ArgumentParser()
    parser.add_argument('case_folding_file')
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)

    args = parser.parse_args()

    minFoldChar = None   # first code point seen in the data file
    maxFoldChar = None   # last code point seen in the data file
    fold = None          # most recently created fold row (may be extended)

    foldChars = []       # UTF-8 bytes of all multi-character foldings
    maxFoldChars = 0     # longest multi-character folding, in bytes

    maxExpand = 0        # largest growth in UTF-8 bytes caused by a folding

    # Read the standard Unicode CaseFolding.txt file
    with open(args.case_folding_file, 'r', encoding='utf-8') as casefile:
        # Count lines from 1 so the diagnostic matches what an editor shows.
        for cnt, line in enumerate(casefile, 1):
            # Data lines start with a hex code point; skip comments/blanks.
            if not line or line[0] not in string.hexdigits:
                continue

            tokens = line.split('; ')

            if len(tokens) < 3:
                print('Not enough tokens in line {}'.format(cnt), file=sys.stderr)
                sys.exit(1)

            # Fields: upper case value; status class; folded character(s)
            upper = int(tokens[0], 16)
            cfclass = caseFoldClassMap[tokens[1]]
            lower = [int(s, 16) for s in tokens[2].split()]

            # Compare against None: a plain truthiness test would treat a
            # code point of 0 as "not set yet".
            if minFoldChar is None:
                minFoldChar = upper

            maxFoldChar = upper

            # Only common and full foldings go into the tables.
            if cfclass not in (CaseFoldClass.COMMON, CaseFoldClass.FULL):
                continue

            if len(lower) == 1:
                delta = lower[0] - upper

                # Can this mapping be merged into the previous row?
                if fold is not None and fold['method'] == CaseFoldMethod.RANGE:
                    # Same offset and directly adjacent to the range's end.
                    foldExtends = (delta == fold['offset']
                                   and upper == fold['upper'] + fold['count'])
                elif fold is not None and fold['method'] == CaseFoldMethod.EVEN_ODD:
                    # Next upper/lower pair, i.e. two code points further on.
                    foldExtends = (delta == 1
                                   and upper == fold['upper'] + fold['count'] + 1)
                else:
                    foldExtends = False

                if foldExtends:
                    # 'fold' is the same dict object as the last entry of
                    # 'folds', so this updates the stored row in place.
                    fold['count'] = upper - fold['upper'] + 1
                else:
                    fold = {
                        'upper': upper,
                        'offset': delta,
                        'method': (CaseFoldMethod.EVEN_ODD if delta == 1
                                   else CaseFoldMethod.RANGE),
                        'count': 1,
                    }
                    folds.append(fold)

                expand = utf8_size(lower[0]) - utf8_size(upper)
            else:
                # Multi-character folding: store the UTF-8 bytes out of line
                # and record where they start and how many there are.
                fold = {
                    'upper': upper,
                    'method': CaseFoldMethod.FULL,
                    'offset': len(foldChars),
                }

                for c in lower:
                    foldChars.extend(ucs4_to_utf8(c))

                fold['count'] = len(foldChars) - fold['offset']
                folds.append(fold)

                if fold['count'] > maxFoldChars:
                    maxFoldChars = fold['count']

                expand = fold['count'] - utf8_size(upper)

            # Track the worst-case growth for BOTH kinds of folding; a
            # single-character folding can also be longer in UTF-8 than its
            # source character.
            if expand > maxExpand:
                maxExpand = expand

    if minFoldChar is None:
        # Without any entry the min/max defines below cannot be formatted.
        print('No case folding entries found in {}'.format(args.case_folding_file),
              file=sys.stderr)
        sys.exit(1)

    # The ExitStack closes whichever files were opened here, even on error;
    # sys.stdin / sys.stdout are used as-is and left open.
    with contextlib.ExitStack() as stack:
        # Open output file
        if args.output_file:
            out = stack.enter_context(
                open(args.output_file, 'w', encoding='utf-8'))
        else:
            out = sys.stdout

        # Read the template file
        if args.template_file:
            tmpl_file = stack.enter_context(
                open(args.template_file, 'r', encoding='utf-8'))
        else:
            tmpl_file = sys.stdin

        # Copy the template through until the marker is found
        # FIXME: this is a bit silly really, might just as well hardcode
        #        the license header in the script and drop the template
        for line in tmpl_file:
            if line.strip() == '@@@':
                break
            out.write(line)

        # Dump these tables
        print('#define FC_NUM_CASE_FOLD\t{}'.format(len(folds)), file=out)
        print('#define FC_NUM_CASE_FOLD_CHARS\t{}'.format(len(foldChars)), file=out)
        print('#define FC_MAX_CASE_FOLD_CHARS\t{}'.format(maxFoldChars), file=out)
        print('#define FC_MAX_CASE_FOLD_EXPAND\t{}'.format(maxExpand), file=out)
        print('#define FC_MIN_FOLD_CHAR\t0x{:08x}'.format(minFoldChar), file=out)
        print('#define FC_MAX_FOLD_CHAR\t0x{:08x}'.format(maxFoldChar), file=out)
        print('', file=out)

        # Dump out ranges
        print('static const FcCaseFold    fcCaseFold[FC_NUM_CASE_FOLD] = {', file=out)
        for f in folds:
            print('    {{ 0x{:08x}, {:22s} 0x{:04x}, {:6d} }},'.format(
                  f['upper'], case_fold_method_name_map[f['method']],
                  f['count'], offset_to_short(f['offset'])), file=out)
        print('};\n', file=out)

        # Dump out "other" values, 16 per line, no comma after the last one
        print('static const FcChar8\tfcCaseFoldChars[FC_NUM_CASE_FOLD_CHARS] = {', file=out)
        last = len(foldChars) - 1
        for n, c in enumerate(foldChars):
            if n == last:
                end = ''
            elif n % 16 == 15:
                end = ',\n'
            else:
                end = ','
            print('0x{:02x}'.format(c), end=end, file=out)
        print('\n};', file=out)

        # And flush out the rest of the template (iteration resumes right
        # after the marker line)
        for line in tmpl_file:
            out.write(line)

        out.flush()
Add Meson build system See https://mesonbuild.com 2020-07-31 09:26:11 +02:00			`#!/usr/bin/env python3`
			`#`
			`# fontconfig/fc-case/fc-case.py`
			`#`
			`# Copyright © 2004 Keith Packard`
			`# Copyright © 2019 Tim-Philipp Müller`
			`#`
			`# Permission to use, copy, modify, distribute, and sell this software and its`
			`# documentation for any purpose is hereby granted without fee, provided that`
			`# the above copyright notice appear in all copies and that both that`
			`# copyright notice and this permission notice appear in supporting`
			`# documentation, and that the name of the author(s) not be used in`
			`# advertising or publicity pertaining to distribution of the software without`
			`# specific, written prior permission. The authors make no`
			`# representations about the suitability of this software for any purpose. It`
			`# is provided "as is" without express or implied warranty.`
			`#`
			`# THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,`
			`# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO`
			`# EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR`
			`# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,`
			`# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER`
			`# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR`
			`# PERFORMANCE OF THIS SOFTWARE.`

			`from enum import Enum`
			`import argparse`
			`import string`
			`import sys`

			`class CaseFoldClass(Enum):`
			`COMMON = 1`
			`FULL = 2`
			`SIMPLE = 3`
			`TURKIC = 4`

			`class CaseFoldMethod(Enum):`
			`RANGE = 0`
			`EVEN_ODD = 1`
			`FULL = 2`

			`caseFoldClassMap = {`
			`'C' : CaseFoldClass.COMMON,`
			`'F' : CaseFoldClass.FULL,`
			`'S' : CaseFoldClass.SIMPLE,`
			`'T' : CaseFoldClass.TURKIC`
			`}`

			`folds = []`

			`def ucs4_to_utf8(ucs4):`
			`utf8_rep = []`

			`if ucs4 < 0x80:`
			`utf8_rep.append(ucs4)`
			`bits = -6`
			`elif ucs4 < 0x800:`
			`utf8_rep.append(((ucs4 >> 6) & 0x1F) \| 0xC0)`
			`bits = 0`
			`elif ucs4 < 0x10000:`
			`utf8_rep.append(((ucs4 >> 12) & 0x0F) \| 0xE0)`
			`bits = 6`
			`elif ucs4 < 0x200000:`
			`utf8_rep.append(((ucs4 >> 18) & 0x07) \| 0xF0)`
			`bits = 12`
			`elif ucs4 < 0x4000000:`
			`utf8_rep.append(((ucs4 >> 24) & 0x03) \| 0xF8)`
			`bits = 18`
			`elif ucs4 < 0x80000000:`
			`utf8_rep.append(((ucs4 >> 30) & 0x01) \| 0xFC)`
			`bits = 24`
			`else:`
			`return [];`

			`while bits >= 0:`
			`utf8_rep.append(((ucs4 >> bits) & 0x3F) \| 0x80)`
			`bits-= 6`

			`return utf8_rep`

			`def utf8_size(ucs4):`
			`return len(ucs4_to_utf8(ucs4))`

			`case_fold_method_name_map = {`
			`CaseFoldMethod.RANGE: 'FC_CASE_FOLD_RANGE,',`
			`CaseFoldMethod.EVEN_ODD: 'FC_CASE_FOLD_EVEN_ODD,',`
			`CaseFoldMethod.FULL: 'FC_CASE_FOLD_FULL,',`
			`}`

			`if __name__=='__main__':`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('case_folding_file')`
			`parser.add_argument('--template', dest='template_file', default=None)`
			`parser.add_argument('--output', dest='output_file', default=None)`

			`args = parser.parse_args()`

			`minFoldChar = None`
			`maxFoldChar = None`
			`fold = None`

			`foldChars = []`
			`maxFoldChars = 0`

			`maxExpand = 0`

			`# Read the standard Unicode CaseFolding.txt file`
			`with open(args.case_folding_file, 'r', encoding='utf-8') as casefile:`
			`for cnt, line in enumerate(casefile):`
			`if not line or not line[0] in string.hexdigits:`
			`continue`

			`# print('Line {}: {}'.format(cnt, line.strip()))`

			`tokens = line.split('; ')`

			`if len(tokens) < 3:`
			`print('Not enough tokens in line {}'.format(cnt), file=sys.stderr)`
			`sys.exit(1)`

			`# Get upper case value`
			`upper = int(tokens.pop(0), 16)`

			`# Get class`
			`cfclass = caseFoldClassMap[tokens.pop(0)]`

			`# Get list of result characters`
			`lower = list(map(lambda s: int(s,16), tokens.pop(0).split()))`

			`# print('\t----> {:04X} {} {}'.format(upper, cfclass, lower))`

			`if not minFoldChar:`
			`minFoldChar = upper`

			`maxFoldChar = upper;`

			`if cfclass in [CaseFoldClass.COMMON, CaseFoldClass.FULL]:`
			`if len(lower) == 1:`
			`# foldExtends`
			`if fold and fold['method'] == CaseFoldMethod.RANGE:`
			`foldExtends = (lower[0] - upper) == fold['offset'] and upper == fold['upper'] + fold['count']`
			`elif fold and fold['method'] == CaseFoldMethod.EVEN_ODD:`
			`foldExtends = (lower[0] - upper) == 1 and upper == (fold['upper'] + fold['count'] + 1)`
			`else:`
			`foldExtends = False`

			`if foldExtends:`
			`# This modifies the last fold item in the array too`
			`fold['count'] = upper - fold['upper'] + 1;`
			`else:`
			`fold = {}`
			`fold['upper'] = upper`
			`fold['offset'] = lower[0] - upper;`
			`if fold['offset'] == 1:`
			`fold['method'] = CaseFoldMethod.EVEN_ODD`
			`else:`
			`fold['method'] = CaseFoldMethod.RANGE`
			`fold['count'] = 1`
			`folds.append(fold)`
			`expand = utf8_size (lower[0]) - utf8_size(upper)`
			`else:`
			`fold = {}`
			`fold['upper'] = upper`
			`fold['method'] = CaseFoldMethod.FULL`
			`fold['offset'] = len(foldChars)`

			`# add chars`
			`for c in lower:`
			`utf8_rep = ucs4_to_utf8(c)`
			`# print('{} -> {}'.format(c,utf8_rep))`
			`for utf8_char in utf8_rep:`
			`foldChars.append(utf8_char)`

			`fold['count'] = len(foldChars) - fold['offset']`
			`folds.append(fold)`

			`if fold['count'] > maxFoldChars:`
			`maxFoldChars = fold['count']`

			`expand = fold['count'] - utf8_size(upper)`
			`if expand > maxExpand:`
			`maxExpand = expand`

			`# Open output file`
			`if args.output_file:`
			`sys.stdout = open(args.output_file, 'w', encoding='utf-8')`

			`# Read the template file`
			`if args.template_file:`
			`tmpl_file = open(args.template_file, 'r', encoding='utf-8')`
			`else:`
			`tmpl_file = sys.stdin`

			`# Scan the input until the marker is found`
			`# FIXME: this is a bit silly really, might just as well harcode`
			`# the license header in the script and drop the template`
			`for line in tmpl_file:`
			`if line.strip() == '@@@':`
			`break`
			`print(line, end='')`

			`# Dump these tables`
			`print('#define FC_NUM_CASE_FOLD\t{}'.format(len(folds)))`
			`print('#define FC_NUM_CASE_FOLD_CHARS\t{}'.format(len(foldChars)))`
			`print('#define FC_MAX_CASE_FOLD_CHARS\t{}'.format(maxFoldChars))`
			`print('#define FC_MAX_CASE_FOLD_EXPAND\t{}'.format(maxExpand))`
			`print('#define FC_MIN_FOLD_CHAR\t0x{:08x}'.format(minFoldChar))`
			`print('#define FC_MAX_FOLD_CHAR\t0x{:08x}'.format(maxFoldChar))`
			`print('')`

			`# Dump out ranges`
			`print('static const FcCaseFold fcCaseFold[FC_NUM_CASE_FOLD] = {')`
			`for f in folds:`
			`short_offset = f['offset']`
			`if short_offset < -32367:`
			`short_offset += 65536`
			`if short_offset > 32368:`
			`short_offset -= 65536`
			`print(' {} 0x{:08x}, {:22s} 0x{:04x}, {:6d} {},'.format('{',`
			`f['upper'], case_fold_method_name_map[f['method']],`
			`f['count'], short_offset, '}'))`
			`print('};\n')`

			`# Dump out "other" values`
			`print('static const FcChar8\tfcCaseFoldChars[FC_NUM_CASE_FOLD_CHARS] = {')`
			`for n, c in enumerate(foldChars):`
			`if n == len(foldChars) - 1:`
			`end = ''`
			`elif n % 16 == 15:`
			`end = ',\n'`
			`else:`
			`end = ','`
			`print('0x{:02x}'.format(c), end=end)`
			`print('\n};')`

			`# And flush out the rest of the input file`
			`for line in tmpl_file:`
			`print(line, end='')`

			`sys.stdout.flush()`