duktape/tools/extract_caseconv.py

#!/usr/bin/env python2
#
#  Extract rules for Unicode case conversion, specifically the behavior
#  required by Ecmascript E5 in Sections 15.5.4.16 to 15.5.4.19.  The
#  bitstream encoded rules are used for the slow path at run time, so
#  compactness is favored over speed.
#
#  There is no support for context or locale sensitive rules, as they
#  are handled directly in C code before consulting tables generated
#  here.  Ecmascript requires case conversion both with and without
#  locale/language specific rules (e.g. String.prototype.toLowerCase()
#  and String.prototype.toLocaleLowerCase()), so they are best handled
#  in C anyway.
#
#  Case conversion rules for ASCII are also excluded as they are handled
#  by the C fast path.  Rules for non-BMP characters (codepoints above
#  U+FFFF) are omitted as they're not required for standard Ecmascript.
#

import os
import sys
import math
import optparse

import dukutil

class UnicodeData:
    "Read UnicodeData.txt into an internal representation."

    def __init__(self, filename):
        self.data = self.read_unicode_data(filename)
        print 'read %d unicode data entries' % len(self.data)

    def read_unicode_data(self, filename):
        res = []
        f = open(filename, 'rb')
        for line in f:
            if line.startswith('#'):
                continue
            line = line.strip()
            if line == '':
                continue
            parts = line.split(';')
            if len(parts) != 15:
                raise Exception('invalid unicode data line')
            res.append(parts)
        f.close()

        # Sort based on Unicode codepoint
        def mycmp(a,b):
            return cmp(long(a[0], 16), long(b[0], 16))

        res.sort(cmp=mycmp)
        return res

class SpecialCasing:
    "Read SpecialCasing.txt into an internal representation."

    def __init__(self, filename):
        self.data = self.read_special_casing_data(filename)
        print 'read %d special casing entries' % len(self.data)

    def read_special_casing_data(self, filename):
        res = []
        f = open(filename, 'rb')
        for line in f:
            try:
                idx = line.index('#')
                line = line[:idx]
            except ValueError:
                pass
            line = line.strip()
            if line == '':
                continue
            parts = line.split(';')
            parts = [i.strip() for i in parts]
            while len(parts) < 6:
                parts.append('')
            res.append(parts)
        f.close()
        return res

def parse_unicode_sequence(x):
    res = ''
    for i in x.split(' '):
        i = i.strip()
        if i == '':
            continue
        res += unichr(long(i, 16))
    return res

def get_base_conversion_maps(unicode_data):
    "Create case conversion tables without handling special casing yet."

    uc = {}        # codepoint (number) -> string
    lc = {}
    tc = {}         # titlecase

    for x in unicode_data.data:
        c1 = long(x[0], 16)

        # just 16-bit support needed
        if c1 >= 0x10000:
            continue

        if x[12] != '':
            # field 12: simple uppercase mapping
            c2 = parse_unicode_sequence(x[12])
            uc[c1] = c2
            tc[c1] = c2    # titlecase default == uppercase, overridden below if necessary
        if x[13] != '':
            # field 13: simple lowercase mapping
            c2 = parse_unicode_sequence(x[13])
            lc[c1] = c2
        if x[14] != '':
            # field 14: simple titlecase mapping
            c2 = parse_unicode_sequence(x[14])
            tc[c1] = c2

    return uc, lc, tc

def update_special_casings(uc, lc, tc, special_casing):
    "Update case conversion tables with special case conversion rules."

    for x in special_casing.data:
        c1 = long(x[0], 16)

        if x[4] != '':
            # conditions
            continue

        lower = parse_unicode_sequence(x[1])
        title = parse_unicode_sequence(x[2])
        upper = parse_unicode_sequence(x[3])

        if len(lower) > 1:
            lc[c1] = lower
        if len(upper) > 1:
            uc[c1] = upper
        if len(title) > 1:
            tc[c1] = title

        print 'special case: %d %d %d' % (len(lower), len(upper), len(title))

def remove_ascii_part(convmap):
    "Remove ASCII case conversion parts (handled by C fast path)."

    for i in xrange(128):
        if convmap.has_key(i):
            del convmap[i]

def scan_range_with_skip(convmap, start_idx, skip):
    "Scan for a range of continuous case conversion with a certain 'skip'."

    conv_i = start_idx
    if not convmap.has_key(conv_i):
        return None, None, None
    elif len(convmap[conv_i]) > 1:
        return None, None, None
    else:
        conv_o = ord(convmap[conv_i])

    start_i = conv_i
    start_o = conv_o

    while True:
        new_i = conv_i + skip
        new_o = conv_o + skip

        if not convmap.has_key(new_i):
            break
        if len(convmap[new_i]) > 1:
            break
        if ord(convmap[new_i]) != new_o:
            break

        conv_i = new_i
        conv_o = new_o

    # [start_i,conv_i] maps to [start_o,conv_o], ignore ranges of 1 char
    count = (conv_i - start_i) / skip + 1
    if count <= 1:
        return None, None, None

    # we have an acceptable range, remove them from the convmap here
    for i in xrange(start_i, conv_i + skip, skip):
        del convmap[i]

    return start_i, start_o, count

def find_first_range_with_skip(convmap, skip):
    "Find first range with a certain 'skip' value."

    for i in xrange(65536):
        start_i, start_o, count = scan_range_with_skip(convmap, i, skip)
        if start_i is None:
            continue
        return start_i, start_o, count

    return None, None, None

def generate_tables(convmap):
    "Generate bit-packed case conversion table for a given conversion map."

    # The bitstream encoding is based on manual inspection for whatever
    # regularity the Unicode case conversion rules have.
    #
    # Start with a full description of case conversions which does not
    # cover all codepoints; unmapped codepoints convert to themselves.
    # Scan for range-to-range mappings with a range of skips starting from 1.
    # Whenever a valid range is found, remove it from the map.  Finally,
    # output the remaining case conversions (1:1 and 1:n) on a per codepoint
    # basis.
    #
    # This is very slow because we always scan from scratch, but its the
    # most reliable and simple way to scan

    ranges = []        # range mappings (2 or more consecutive mappings with a certain skip)
    singles = []       # 1:1 character mappings
    multis = []        # 1:n character mappings

    # Ranges with skips

    for skip in xrange(1,6+1):    # skips 1...6 are useful
        while True:
            start_i, start_o, count = find_first_range_with_skip(convmap, skip)
            if start_i is None:
                break
            print 'skip %d: %d %d %d' % (skip, start_i, start_o, count)
            ranges.append([start_i, start_o, count, skip])

    # 1:1 conversions

    k = convmap.keys()
    k.sort()
    for i in k:
        if len(convmap[i]) > 1:
            continue
        singles.append([i, ord(convmap[i])])    # codepoint, codepoint
        del convmap[i]

    # There are many mappings to 2-char sequences with latter char being U+0399.
    # These could be handled as a special case, but we don't do that right now.
    #
    # [8064L, u'\u1f08\u0399']
    # [8065L, u'\u1f09\u0399']
    # [8066L, u'\u1f0a\u0399']
    # [8067L, u'\u1f0b\u0399']
    # [8068L, u'\u1f0c\u0399']
    # [8069L, u'\u1f0d\u0399']
    # [8070L, u'\u1f0e\u0399']
    # [8071L, u'\u1f0f\u0399']
    # ...
    #
    # tmp = {}
    # k = convmap.keys()
    # k.sort()
    # for i in k:
    #    if len(convmap[i]) == 2 and convmap[i][1] == u'\u0399':
    #        tmp[i] = convmap[i][0]
    #        del convmap[i]
    # print repr(tmp)
    #
    # skip = 1
    # while True:
    #    start_i, start_o, count = find_first_range_with_skip(tmp, skip)
    #    if start_i is None:
    #        break
    #    print 'special399, skip %d: %d %d %d' % (skip, start_i, start_o, count)
    # print len(tmp.keys())
    # print repr(tmp)
    # XXX: need to put 12 remaining mappings back to convmap

    # 1:n conversions

    k = convmap.keys()
    k.sort()
    for i in k:
        multis.append([i, convmap[i]])        # codepoint, string
        del convmap[i]

    for t in singles:
        print repr(t)

    for t in multis:
        print repr(t)

    print 'range mappings: %d' % len(ranges)
    print 'single character mappings: %d' % len(singles)
    print 'complex mappings (1:n): %d' % len(multis)
    print 'remaining (should be zero): %d' % len(convmap.keys())

    # XXX: opportunities for diff encoding skip=3 ranges?
    prev = None
    for t in ranges:
        # range: [start_i, start_o, count, skip]
        if t[3] != 3:
            continue
        if prev is not None:
            print '%d %d' % (t[0] - prev[0], t[1] - prev[1])
        else:
            print 'start: %d %d' % (t[0], t[1])
        prev = t

    # bit packed encoding

    be = dukutil.BitEncoder()

    for curr_skip in xrange(1, 7):    # 1...6
        count = 0
        for r in ranges:
            start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
            if skip != curr_skip:
                continue
            count += 1
        be.bits(count, 6)
        print 'encode: skip=%d, count=%d' % (curr_skip, count)

        for r in ranges:
            start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
            if skip != curr_skip:
                continue
            be.bits(start_i, 16)
            be.bits(start_o, 16)
            be.bits(r_count, 7)
    be.bits(0x3f, 6)    # maximum count value = end of skips

    count = len(singles)
    be.bits(count, 7)
    for t in singles:
        cp_i, cp_o = t[0], t[1]
        be.bits(cp_i, 16)
        be.bits(cp_o, 16)

    count = len(multis)
    be.bits(count, 7)
    for t in multis:
        cp_i, str_o = t[0], t[1]
        be.bits(cp_i, 16)
        be.bits(len(str_o), 2)
        for i in xrange(len(str_o)):
            be.bits(ord(str_o[i]), 16)

    return be.getBytes(), be.getNumBits()

def generate_regexp_canonicalize_lookup(convmap):
    res = []

    highest_nonid = -1

    for cp in xrange(65536):
        res_cp = cp  # default to as is
        if convmap.has_key(cp):
            tmp = convmap[cp]
            if len(tmp) == 1:
                # Multiple codepoints from input, ignore
                res_cp = ord(tmp[0])
        if cp >= 0x80 and res_cp < 0x80:
            res_cp = cp  # non-ASCII mapped to ASCII, ignore

        if cp != res_cp:
            highest_nonid = cp

        res.append(res_cp)

    # At the moment this is 65370, which means there's very little
    # gain in assuming 1:1 mapping above a certain BMP codepoint.
    print('HIGHEST NON-ID MAPPING: %d' % highest_nonid)
    return res

def clonedict(x):
    "Shallow clone of input dict."
    res = {}
    for k in x.keys():
        res[k] = x[k]
    return res

def main():
    parser = optparse.OptionParser()
    parser.add_option('--command', dest='command', default='caseconv_bitpacked')
    parser.add_option('--unicode-data', dest='unicode_data')
    parser.add_option('--special-casing', dest='special_casing')
    parser.add_option('--out-source', dest='out_source')
    parser.add_option('--out-header', dest='out_header')
    parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc')
    parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc')
    parser.add_option('--table-name-re-canon-lookup', dest='table_name_re_canon_lookup', default='caseconv_re_canon_lookup')
    (opts, args) = parser.parse_args()

    unicode_data = UnicodeData(opts.unicode_data)
    special_casing = SpecialCasing(opts.special_casing)

    uc, lc, tc = get_base_conversion_maps(unicode_data)
    update_special_casings(uc, lc, tc, special_casing)

    if opts.command == 'caseconv_bitpacked':
        # XXX: ASCII and non-BMP filtering could be an option but is now hardcoded

        # ascii is handled with 'fast path' so not needed here
        t = clonedict(uc)
        remove_ascii_part(t)
        uc_bytes, uc_nbits = generate_tables(t)

        t = clonedict(lc)
        remove_ascii_part(t)
        lc_bytes, lc_nbits = generate_tables(t)

        # Generate C source and header files
        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
        genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
        f = open(opts.out_source, 'wb')
        f.write(genc.getString())
        f.close()

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes)))
        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes)))
        f = open(opts.out_header, 'wb')
        f.write(genc.getString())
        f.close()
    elif opts.command == 're_canon_lookup':
        # direct canonicalization lookup for case insensitive regexps, includes ascii part
        t = clonedict(uc)
        re_canon_lookup = generate_regexp_canonicalize_lookup(t)

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitArray(re_canon_lookup, opts.table_name_re_canon_lookup, size=len(re_canon_lookup), typename='duk_uint16_t', intvalues=True, const=True)
        f = open(opts.out_source, 'wb')
        f.write(genc.getString())
        f.close()

        genc = dukutil.GenerateC()
        genc.emitHeader('extract_caseconv.py')
        genc.emitLine('extern const duk_uint16_t %s[%d];' % (opts.table_name_re_canon_lookup, len(re_canon_lookup)))
        f = open(opts.out_header, 'wb')
        f.write(genc.getString())
        f.close()
    else:
        raise Exception('invalid command: %r' % opts.command)

if __name__ == '__main__':
    main()
Split dist util, reorg tools, Python PEP8 8 years ago			`#!/usr/bin/env python2`
			`#`
			`# Extract rules for Unicode case conversion, specifically the behavior`
			`# required by Ecmascript E5 in Sections 15.5.4.16 to 15.5.4.19. The`
			`# bitstream encoded rules are used for the slow path at run time, so`
			`# compactness is favored over speed.`
			`#`
			`# There is no support for context or locale sensitive rules, as they`
			`# are handled directly in C code before consulting tables generated`
			`# here. Ecmascript requires case conversion both with and without`
			`# locale/language specific rules (e.g. String.prototype.toLowerCase()`
			`# and String.prototype.toLocaleLowerCase()), so they are best handled`
			`# in C anyway.`
			`#`
Python tool cleanups and improvements Various fixes, verbosity reduction, etc. 8 years ago			`# Case conversion rules for ASCII are also excluded as they are handled`
			`# by the C fast path. Rules for non-BMP characters (codepoints above`
			`# U+FFFF) are omitted as they're not required for standard Ecmascript.`
Split dist util, reorg tools, Python PEP8 8 years ago			`#`

Python tool cleanups and improvements Various fixes, verbosity reduction, etc. 8 years ago			`import os`
			`import sys`
			`import math`
Split dist util, reorg tools, Python PEP8 8 years ago			`import optparse`
Python tool cleanups and improvements Various fixes, verbosity reduction, etc. 8 years ago
Split dist util, reorg tools, Python PEP8 8 years ago			`import dukutil`

			`class UnicodeData:`
			`"Read UnicodeData.txt into an internal representation."`

			`def __init__(self, filename):`
			`self.data = self.read_unicode_data(filename)`
			`print 'read %d unicode data entries' % len(self.data)`

			`def read_unicode_data(self, filename):`
			`res = []`
			`f = open(filename, 'rb')`
			`for line in f:`
			`if line.startswith('#'):`
			`continue`
			`line = line.strip()`
			`if line == '':`
			`continue`
			`parts = line.split(';')`
			`if len(parts) != 15:`
			`raise Exception('invalid unicode data line')`
			`res.append(parts)`
			`f.close()`

			`# Sort based on Unicode codepoint`
			`def mycmp(a,b):`
			`return cmp(long(a[0], 16), long(b[0], 16))`

			`res.sort(cmp=mycmp)`
			`return res`

			`class SpecialCasing:`
			`"Read SpecialCasing.txt into an internal representation."`

			`def __init__(self, filename):`
			`self.data = self.read_special_casing_data(filename)`
			`print 'read %d special casing entries' % len(self.data)`

			`def read_special_casing_data(self, filename):`
			`res = []`
			`f = open(filename, 'rb')`
			`for line in f:`
			`try:`
			`idx = line.index('#')`
			`line = line[:idx]`
			`except ValueError:`
			`pass`
			`line = line.strip()`
			`if line == '':`
			`continue`
			`parts = line.split(';')`
			`parts = [i.strip() for i in parts]`
			`while len(parts) < 6:`
			`parts.append('')`
			`res.append(parts)`
			`f.close()`
			`return res`

			`def parse_unicode_sequence(x):`
			`res = ''`
			`for i in x.split(' '):`
			`i = i.strip()`
			`if i == '':`
			`continue`
			`res += unichr(long(i, 16))`
			`return res`

			`def get_base_conversion_maps(unicode_data):`
			`"Create case conversion tables without handling special casing yet."`

			`uc = {} # codepoint (number) -> string`
			`lc = {}`
			`tc = {} # titlecase`

			`for x in unicode_data.data:`
			`c1 = long(x[0], 16)`

			`# just 16-bit support needed`
			`if c1 >= 0x10000:`
			`continue`

			`if x[12] != '':`
			`# field 12: simple uppercase mapping`
			`c2 = parse_unicode_sequence(x[12])`
			`uc[c1] = c2`
			`tc[c1] = c2 # titlecase default == uppercase, overridden below if necessary`
			`if x[13] != '':`
			`# field 13: simple lowercase mapping`
			`c2 = parse_unicode_sequence(x[13])`
			`lc[c1] = c2`
			`if x[14] != '':`
			`# field 14: simple titlecase mapping`
			`c2 = parse_unicode_sequence(x[14])`
			`tc[c1] = c2`

			`return uc, lc, tc`

			`def update_special_casings(uc, lc, tc, special_casing):`
			`"Update case conversion tables with special case conversion rules."`

			`for x in special_casing.data:`
			`c1 = long(x[0], 16)`

			`if x[4] != '':`
			`# conditions`
			`continue`

			`lower = parse_unicode_sequence(x[1])`
			`title = parse_unicode_sequence(x[2])`
			`upper = parse_unicode_sequence(x[3])`

			`if len(lower) > 1:`
			`lc[c1] = lower`
			`if len(upper) > 1:`
			`uc[c1] = upper`
			`if len(title) > 1:`
			`tc[c1] = title`

			`print 'special case: %d %d %d' % (len(lower), len(upper), len(title))`

			`def remove_ascii_part(convmap):`
			`"Remove ASCII case conversion parts (handled by C fast path)."`

			`for i in xrange(128):`
			`if convmap.has_key(i):`
			`del convmap[i]`

			`def scan_range_with_skip(convmap, start_idx, skip):`
			`"Scan for a range of continuous case conversion with a certain 'skip'."`

			`conv_i = start_idx`
			`if not convmap.has_key(conv_i):`
			`return None, None, None`
			`elif len(convmap[conv_i]) > 1:`
			`return None, None, None`
			`else:`
			`conv_o = ord(convmap[conv_i])`

			`start_i = conv_i`
			`start_o = conv_o`

			`while True:`
			`new_i = conv_i + skip`
			`new_o = conv_o + skip`

			`if not convmap.has_key(new_i):`
			`break`
			`if len(convmap[new_i]) > 1:`
			`break`
			`if ord(convmap[new_i]) != new_o:`
			`break`

			`conv_i = new_i`
			`conv_o = new_o`

			`# [start_i,conv_i] maps to [start_o,conv_o], ignore ranges of 1 char`
			`count = (conv_i - start_i) / skip + 1`
			`if count <= 1:`
			`return None, None, None`

			`# we have an acceptable range, remove them from the convmap here`
			`for i in xrange(start_i, conv_i + skip, skip):`
			`del convmap[i]`

			`return start_i, start_o, count`

			`def find_first_range_with_skip(convmap, skip):`
			`"Find first range with a certain 'skip' value."`

			`for i in xrange(65536):`
			`start_i, start_o, count = scan_range_with_skip(convmap, i, skip)`
			`if start_i is None:`
			`continue`
			`return start_i, start_o, count`

			`return None, None, None`

			`def generate_tables(convmap):`
			`"Generate bit-packed case conversion table for a given conversion map."`

			`# The bitstream encoding is based on manual inspection for whatever`
			`# regularity the Unicode case conversion rules have.`
			`#`
			`# Start with a full description of case conversions which does not`
			`# cover all codepoints; unmapped codepoints convert to themselves.`
			`# Scan for range-to-range mappings with a range of skips starting from 1.`
			`# Whenever a valid range is found, remove it from the map. Finally,`
			`# output the remaining case conversions (1:1 and 1:n) on a per codepoint`
			`# basis.`
			`#`
			`# This is very slow because we always scan from scratch, but its the`
			`# most reliable and simple way to scan`

			`ranges = [] # range mappings (2 or more consecutive mappings with a certain skip)`
Python tool cleanups and improvements Various fixes, verbosity reduction, etc. 8 years ago			`singles = [] # 1:1 character mappings`
			`multis = [] # 1:n character mappings`
Split dist util, reorg tools, Python PEP8 8 years ago
			`# Ranges with skips`

			`for skip in xrange(1,6+1): # skips 1...6 are useful`
			`while True:`
			`start_i, start_o, count = find_first_range_with_skip(convmap, skip)`
			`if start_i is None:`
			`break`
			`print 'skip %d: %d %d %d' % (skip, start_i, start_o, count)`
			`ranges.append([start_i, start_o, count, skip])`

			`# 1:1 conversions`

			`k = convmap.keys()`
			`k.sort()`
			`for i in k:`
			`if len(convmap[i]) > 1:`
			`continue`
			`singles.append([i, ord(convmap[i])]) # codepoint, codepoint`
			`del convmap[i]`

			`# There are many mappings to 2-char sequences with latter char being U+0399.`
			`# These could be handled as a special case, but we don't do that right now.`
			`#`
			`# [8064L, u'\u1f08\u0399']`
			`# [8065L, u'\u1f09\u0399']`
			`# [8066L, u'\u1f0a\u0399']`
			`# [8067L, u'\u1f0b\u0399']`
			`# [8068L, u'\u1f0c\u0399']`
			`# [8069L, u'\u1f0d\u0399']`
			`# [8070L, u'\u1f0e\u0399']`
			`# [8071L, u'\u1f0f\u0399']`
			`# ...`
			`#`
			`# tmp = {}`
			`# k = convmap.keys()`
			`# k.sort()`
			`# for i in k:`
			`# if len(convmap[i]) == 2 and convmap[i][1] == u'\u0399':`
			`# tmp[i] = convmap[i][0]`
			`# del convmap[i]`
			`# print repr(tmp)`
			`#`
			`# skip = 1`
			`# while True:`
			`# start_i, start_o, count = find_first_range_with_skip(tmp, skip)`
			`# if start_i is None:`
			`# break`
			`# print 'special399, skip %d: %d %d %d' % (skip, start_i, start_o, count)`
			`# print len(tmp.keys())`
			`# print repr(tmp)`
Python tool cleanups and improvements Various fixes, verbosity reduction, etc. 8 years ago			`# XXX: need to put 12 remaining mappings back to convmap`
Split dist util, reorg tools, Python PEP8 8 years ago
			`# 1:n conversions`

			`k = convmap.keys()`
			`k.sort()`
			`for i in k:`
Python tool cleanups and improvements Various fixes, verbosity reduction, etc. 8 years ago			`multis.append([i, convmap[i]]) # codepoint, string`
Split dist util, reorg tools, Python PEP8 8 years ago			`del convmap[i]`

			`for t in singles:`
			`print repr(t)`

Python tool cleanups and improvements Various fixes, verbosity reduction, etc. 8 years ago			`for t in multis:`
Split dist util, reorg tools, Python PEP8 8 years ago			`print repr(t)`

			`print 'range mappings: %d' % len(ranges)`
			`print 'single character mappings: %d' % len(singles)`
Python tool cleanups and improvements Various fixes, verbosity reduction, etc. 8 years ago			`print 'complex mappings (1:n): %d' % len(multis)`
Split dist util, reorg tools, Python PEP8 8 years ago			`print 'remaining (should be zero): %d' % len(convmap.keys())`

			`# XXX: opportunities for diff encoding skip=3 ranges?`
			`prev = None`
			`for t in ranges:`
			`# range: [start_i, start_o, count, skip]`
			`if t[3] != 3:`
			`continue`
			`if prev is not None:`
			`print '%d %d' % (t[0] - prev[0], t[1] - prev[1])`
			`else:`
			`print 'start: %d %d' % (t[0], t[1])`
			`prev = t`

			`# bit packed encoding`

			`be = dukutil.BitEncoder()`

			`for curr_skip in xrange(1, 7): # 1...6`
			`count = 0`
			`for r in ranges:`
			`start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]`
			`if skip != curr_skip:`
			`continue`
			`count += 1`
			`be.bits(count, 6)`
			`print 'encode: skip=%d, count=%d' % (curr_skip, count)`

			`for r in ranges:`
			`start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]`
			`if skip != curr_skip:`
			`continue`
			`be.bits(start_i, 16)`
			`be.bits(start_o, 16)`
			`be.bits(r_count, 7)`
			`be.bits(0x3f, 6) # maximum count value = end of skips`

			`count = len(singles)`
Update caseconv singles bit count Needed for newer UnicodeData.txt. 8 years ago			`be.bits(count, 7)`
Split dist util, reorg tools, Python PEP8 8 years ago			`for t in singles:`
			`cp_i, cp_o = t[0], t[1]`
			`be.bits(cp_i, 16)`
			`be.bits(cp_o, 16)`

Python tool cleanups and improvements Various fixes, verbosity reduction, etc. 8 years ago			`count = len(multis)`
Split dist util, reorg tools, Python PEP8 8 years ago			`be.bits(count, 7)`
Python tool cleanups and improvements Various fixes, verbosity reduction, etc. 8 years ago			`for t in multis:`
Split dist util, reorg tools, Python PEP8 8 years ago			`cp_i, str_o = t[0], t[1]`
			`be.bits(cp_i, 16)`
			`be.bits(len(str_o), 2)`
			`for i in xrange(len(str_o)):`
			`be.bits(ord(str_o[i]), 16)`

			`return be.getBytes(), be.getNumBits()`

			`def generate_regexp_canonicalize_lookup(convmap):`
			`res = []`

			`highest_nonid = -1`

			`for cp in xrange(65536):`
			`res_cp = cp # default to as is`
			`if convmap.has_key(cp):`
			`tmp = convmap[cp]`
			`if len(tmp) == 1:`
			`# Multiple codepoints from input, ignore`
			`res_cp = ord(tmp[0])`
			`if cp >= 0x80 and res_cp < 0x80:`
			`res_cp = cp # non-ASCII mapped to ASCII, ignore`

			`if cp != res_cp:`
			`highest_nonid = cp`

			`res.append(res_cp)`

			`# At the moment this is 65370, which means there's very little`
			`# gain in assuming 1:1 mapping above a certain BMP codepoint.`
			`print('HIGHEST NON-ID MAPPING: %d' % highest_nonid)`
			`return res`

			`def clonedict(x):`
			`"Shallow clone of input dict."`
			`res = {}`
			`for k in x.keys():`
			`res[k] = x[k]`
			`return res`

			`def main():`
			`parser = optparse.OptionParser()`
			`parser.add_option('--command', dest='command', default='caseconv_bitpacked')`
			`parser.add_option('--unicode-data', dest='unicode_data')`
			`parser.add_option('--special-casing', dest='special_casing')`
			`parser.add_option('--out-source', dest='out_source')`
			`parser.add_option('--out-header', dest='out_header')`
			`parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc')`
			`parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc')`
			`parser.add_option('--table-name-re-canon-lookup', dest='table_name_re_canon_lookup', default='caseconv_re_canon_lookup')`
			`(opts, args) = parser.parse_args()`

			`unicode_data = UnicodeData(opts.unicode_data)`
			`special_casing = SpecialCasing(opts.special_casing)`

			`uc, lc, tc = get_base_conversion_maps(unicode_data)`
			`update_special_casings(uc, lc, tc, special_casing)`

			`if opts.command == 'caseconv_bitpacked':`
			`# XXX: ASCII and non-BMP filtering could be an option but is now hardcoded`

			`# ascii is handled with 'fast path' so not needed here`
			`t = clonedict(uc)`
			`remove_ascii_part(t)`
			`uc_bytes, uc_nbits = generate_tables(t)`

			`t = clonedict(lc)`
			`remove_ascii_part(t)`
			`lc_bytes, lc_nbits = generate_tables(t)`

			`# Generate C source and header files`
			`genc = dukutil.GenerateC()`
			`genc.emitHeader('extract_caseconv.py')`
			`genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True)`
			`genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True)`
			`f = open(opts.out_source, 'wb')`
			`f.write(genc.getString())`
			`f.close()`

			`genc = dukutil.GenerateC()`
			`genc.emitHeader('extract_caseconv.py')`
			`genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes)))`
			`genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes)))`
			`f = open(opts.out_header, 'wb')`
			`f.write(genc.getString())`
			`f.close()`
			`elif opts.command == 're_canon_lookup':`
			`# direct canonicalization lookup for case insensitive regexps, includes ascii part`
			`t = clonedict(uc)`
			`re_canon_lookup = generate_regexp_canonicalize_lookup(t)`

			`genc = dukutil.GenerateC()`
			`genc.emitHeader('extract_caseconv.py')`
			`genc.emitArray(re_canon_lookup, opts.table_name_re_canon_lookup, size=len(re_canon_lookup), typename='duk_uint16_t', intvalues=True, const=True)`
			`f = open(opts.out_source, 'wb')`
			`f.write(genc.getString())`
			`f.close()`

			`genc = dukutil.GenerateC()`
			`genc.emitHeader('extract_caseconv.py')`
			`genc.emitLine('extern const duk_uint16_t %s[%d];' % (opts.table_name_re_canon_lookup, len(re_canon_lookup)))`
			`f = open(opts.out_header, 'wb')`
			`f.write(genc.getString())`
			`f.close()`
			`else:`
			`raise Exception('invalid command: %r' % opts.command)`

			`if __name__ == '__main__':`
			`main()`