duktape/tools/extract_caseconv.py


								#!/usr/bin/env python2

								#

								#  Extract rules for Unicode case conversion, specifically the behavior

								#  required by ECMAScript E5 in Sections 15.5.4.16 to 15.5.4.19.  The

								#  bitstream encoded rules are used for the slow path at run time, so

								#  compactness is favored over speed.

								#

								#  There is no support for context or locale sensitive rules, as they

								#  are handled directly in C code before consulting tables generated

								#  here.  ECMAScript requires case conversion both with and without

								#  locale/language specific rules (e.g. String.prototype.toLowerCase()

								#  and String.prototype.toLocaleLowerCase()), so they are best handled

								#  in C anyway.

								#

								#  Case conversion rules for ASCII are also excluded as they are handled

								#  by the C fast path.  Rules for non-BMP characters (codepoints above

								#  U+FFFF) are omitted as they're not required for standard ECMAScript.

								#


								import os

								import sys

								import re

								import math

								import optparse


								import dukutil


								class UnicodeData:
								    """Read UnicodeData.txt into an internal representation.

								    After construction, 'data' holds one list of 15 string fields per data
								    line of the input file, sorted by codepoint (field 0, hexadecimal).
								    """

								    def __init__(self, filename):
								        self.data = self.read_unicode_data(filename)
								        print('read %d unicode data entries' % len(self.data))

								    def read_unicode_data(self, filename):
								        """Parse 'filename' and return its entries sorted by codepoint.

								        Lines starting with '#' and blank lines are skipped.  Raises
								        Exception if a data line does not have exactly 15 ';' separated
								        fields.
								        """
								        # Local import: io.open() decodes to text on both Python 2 and 3,
								        # so the string comparisons below work on either interpreter.
								        import io

								        res = []
								        # The 'with' block closes the file even if a malformed line raises.
								        with io.open(filename, 'r', encoding='utf-8') as f:
								            for lineno, line in enumerate(f, 1):
								                if line.startswith('#'):
								                    continue
								                line = line.strip()
								                if line == '':
								                    continue
								                parts = line.split(';')
								                if len(parts) != 15:
								                    raise Exception('invalid unicode data line %d: %r' % (lineno, line))
								                res.append(parts)

								        # Sort based on Unicode codepoint.  A key function is used because
								        # sort(cmp=...) only exists on Python 2; ordering is the same.
								        res.sort(key=lambda parts: int(parts[0], 16))
								        return res


								class SpecialCasing:
								    """Read SpecialCasing.txt into an internal representation.

								    After construction, 'data' holds one list of whitespace-stripped string
								    fields per data line, in file order.  Each list has at least 6 entries
								    (short lines are padded with empty strings).
								    """

								    def __init__(self, filename):
								        self.data = self.read_special_casing_data(filename)
								        print('read %d special casing entries' % len(self.data))

								    def read_special_casing_data(self, filename):
								        """Parse 'filename' and return a list of field lists.

								        Everything from the first '#' on a line is treated as a comment and
								        dropped; lines that are empty after that are skipped.
								        """
								        # Local import: io.open() decodes to text on both Python 2 and 3,
								        # so the string operations below work on either interpreter.
								        import io

								        res = []
								        # The 'with' block closes the file even if reading fails midway.
								        with io.open(filename, 'r', encoding='utf-8') as f:
								            for line in f:
								                # Keep only the part before a '#' comment, if any.
								                line = line.split('#', 1)[0].strip()
								                if line == '':
								                    continue
								                parts = [i.strip() for i in line.split(';')]
								                # Pad so that fields 0...5 can always be indexed.
								                while len(parts) < 6:
								                    parts.append('')
								                res.append(parts)
								        return res


								def parse_unicode_sequence(x):
								    """Parse a Unicode sequence like ABCD 1234 into a unicode string.

								    'x' is a string of whitespace separated hexadecimal codepoints; the
								    result has one character per codepoint, in order.  An empty or all
								    whitespace input yields an empty string.  Raises ValueError if a
								    token is not valid hexadecimal.
								    """
								    # unichr() only exists on Python 2; on Python 3 chr() does the same job.
								    try:
								        to_char = unichr
								    except NameError:
								        to_char = chr

								    # split() without arguments drops empty tokens, so repeated or
								    # leading/trailing separators need no special handling.
								    return ''.join([to_char(int(i, 16)) for i in x.split()])


								def get_base_conversion_maps(unicode_data):
								    """Build simple case conversion maps; special casing is applied elsewhere.

								    Reads 'unicode_data.data' and returns three dicts (uppercase, lowercase,
								    titlecase), each mapping a codepoint number to its replacement string.
								    Only codepoints below 0x10000 are included.
								    """

								    upper_map = {}
								    lower_map = {}
								    title_map = {}

								    for fields in unicode_data.data:
								        codepoint = long(fields[0], 16)

								        # Only 16-bit codepoints are needed.
								        if not (codepoint < 0x10000):
								            continue

								        # Field 12: simple uppercase mapping.  It is also the default
								        # titlecase mapping, which field 14 may override below.
								        if fields[12] != '':
								            mapped = parse_unicode_sequence(fields[12])
								            upper_map[codepoint] = mapped
								            title_map[codepoint] = mapped

								        # Field 13: simple lowercase mapping.
								        if fields[13] != '':
								            lower_map[codepoint] = parse_unicode_sequence(fields[13])

								        # Field 14: simple titlecase mapping.
								        if fields[14] != '':
								            title_map[codepoint] = parse_unicode_sequence(fields[14])

								    return upper_map, lower_map, title_map


								def update_special_casings(uc, lc, tc, special_casing):
								    """Apply unconditional special casing rules to the conversion maps.

								    'uc', 'lc' and 'tc' are updated in place from 'special_casing.data'.
								    Entries with a non-empty condition field (index 4) are skipped, and a
								    mapping is only overridden when the replacement is longer than one
								    character.
								    """

								    for entry in special_casing.data:
								        codepoint = long(entry[0], 16)

								        # Conditional rules are not handled here.
								        if entry[4] != '':
								            continue

								        lower = parse_unicode_sequence(entry[1])
								        title = parse_unicode_sequence(entry[2])
								        upper = parse_unicode_sequence(entry[3])

								        # Same update order as before: lowercase, uppercase, titlecase.
								        for table, replacement in ((lc, lower), (uc, upper), (tc, title)):
								            if len(replacement) > 1:
								                table[codepoint] = replacement

								        print('- special case: %d %d %d' % (len(lower), len(upper), len(title)))


								def remove_ascii_part(convmap):
								    """Remove ASCII case conversion parts (handled by C fast path).

								    Deletes keys 0...127 from 'convmap' in place; missing keys are ignored.
								    """

								    for i in range(128):
								        # pop() with a default replaces the deprecated has_key() + del
								        # pair and needs only one lookup.
								        convmap.pop(i, None)


								def scan_range_with_skip(convmap, start_idx, skip):
								    """Scan for a range of continuous case conversion with a certain 'skip'.

								    Starting at codepoint 'start_idx', follows 1:1 mappings where both the
								    input and the output codepoint advance by 'skip' on every step.

								    Returns (start_i, start_o, count) for a range of at least two mappings
								    and removes those mappings from 'convmap'.  Returns (None, None, None)
								    and leaves 'convmap' untouched when no such range starts at
								    'start_idx'.  Raises ValueError for skip == 0.
								    """

								    if skip == 0:
								        # A zero skip would re-test the same codepoint forever below.
								        raise ValueError('skip must be non-zero')

								    # The start codepoint must have a 1:1 mapping to begin a range.
								    if start_idx not in convmap or len(convmap[start_idx]) > 1:
								        return None, None, None

								    start_i = conv_i = start_idx
								    start_o = conv_o = ord(convmap[start_idx])

								    while True:
								        new_i = conv_i + skip
								        new_o = conv_o + skip

								        if new_i not in convmap:
								            break
								        if len(convmap[new_i]) > 1:
								            break
								        if ord(convmap[new_i]) != new_o:
								            break

								        conv_i, conv_o = new_i, new_o

								    # [start_i,conv_i] maps to [start_o,conv_o], ignore ranges of 1 char.
								    # Floor division keeps 'count' an integer on every Python version.
								    count = (conv_i - start_i) // skip + 1
								    if count <= 1:
								        return None, None, None

								    # We have an acceptable range, remove them from the convmap here.
								    for i in range(start_i, conv_i + skip, skip):
								        del convmap[i]

								    return start_i, start_o, count


								def find_first_range_with_skip(convmap, skip):
								    """Find first range with a certain 'skip' value.

								    Returns the (start_i, start_o, count) result of scan_range_with_skip()
								    for the lowest BMP codepoint that starts a range, or (None, None, None)
								    if there is no such range.  A found range is removed from 'convmap' by
								    scan_range_with_skip().
								    """

								    # Only codepoints present in convmap can start a range, so probing the
								    # sorted keys gives the same result as probing every BMP codepoint while
								    # doing far fewer lookups.  sorted() returns a copy, so it is safe for
								    # scan_range_with_skip() to delete entries from convmap.
								    for i in sorted(convmap):
								        if i < 0 or i > 0xffff:
								            continue    # candidates are limited to [0,0xffff]
								        start_i, start_o, count = scan_range_with_skip(convmap, i, skip)
								        if start_i is not None:
								            return start_i, start_o, count

								    return None, None, None


								def generate_caseconv_tables(convmap):
								    """Generate bit-packed case conversion table for a given conversion map.

								    'convmap' maps codepoint -> replacement string.  It is consumed: every
								    entry is removed as it gets classified as part of a range, a 1:1
								    mapping or a 1:n mapping.

								    Returns the pair (be.getBytes(), be.getNumBits()) of the
								    dukutil.BitEncoder used for the encoding.  Raises Exception if the
								    number of ranges for one skip value would collide with the end marker.
								    """

								    # The bitstream encoding is based on manual inspection for whatever
								    # regularity the Unicode case conversion rules have.
								    #
								    # Start with a full description of case conversions which does not
								    # cover all codepoints; unmapped codepoints convert to themselves.
								    # Scan for range-to-range mappings with a range of skips starting from 1.
								    # Whenever a valid range is found, remove it from the map.  Finally,
								    # output the remaining case conversions (1:1 and 1:n) on a per codepoint
								    # basis.
								    #
								    # This is very slow because we always scan from scratch, but it's the
								    # most reliable and simple way to scan

								    print('generate caseconv tables')

								    ranges = []        # range mappings (2 or more consecutive mappings with a certain skip)
								    singles = []       # 1:1 character mappings
								    multis = []        # 1:n character mappings

								    # Ranges with skips

								    for skip in range(1, 6 + 1):    # skips 1...6 are useful
								        while True:
								            start_i, start_o, count = find_first_range_with_skip(convmap, skip)
								            if start_i is None:
								                break
								            print('- skip %d: %d %d %d' % (skip, start_i, start_o, count))
								            ranges.append([start_i, start_o, count, skip])

								    # 1:1 conversions

								    # sorted() returns a list copy, so deleting from convmap while looping
								    # is safe.
								    for i in sorted(convmap.keys()):
								        if len(convmap[i]) > 1:
								            continue
								        singles.append([i, ord(convmap[i])])    # codepoint, codepoint
								        del convmap[i]

								    # There are many mappings to 2-char sequences with latter char being U+0399.
								    # These could be handled as a special case, but we don't do that right now.
								    #
								    # [8064L, u'\u1f08\u0399']
								    # [8065L, u'\u1f09\u0399']
								    # [8066L, u'\u1f0a\u0399']
								    # [8067L, u'\u1f0b\u0399']
								    # [8068L, u'\u1f0c\u0399']
								    # [8069L, u'\u1f0d\u0399']
								    # [8070L, u'\u1f0e\u0399']
								    # [8071L, u'\u1f0f\u0399']
								    # ...
								    #
								    # tmp = {}
								    # k = convmap.keys()
								    # k.sort()
								    # for i in k:
								    #    if len(convmap[i]) == 2 and convmap[i][1] == u'\u0399':
								    #        tmp[i] = convmap[i][0]
								    #        del convmap[i]
								    # print(repr(tmp))
								    #
								    # skip = 1
								    # while True:
								    #    start_i, start_o, count = find_first_range_with_skip(tmp, skip)
								    #    if start_i is None:
								    #        break
								    #    print('- special399, skip %d: %d %d %d' % (skip, start_i, start_o, count))
								    # print(len(tmp.keys()))
								    # print(repr(tmp))
								    # XXX: need to put 12 remaining mappings back to convmap

								    # 1:n conversions

								    for i in sorted(convmap.keys()):
								        multis.append([i, convmap[i]])        # codepoint, string
								        del convmap[i]

								    # Diagnostics.  The parenthesized single-argument print form matches
								    # the rest of this file and prints the same text.
								    for t in singles:
								        print('- singles: ' + repr(t))

								    for t in multis:
								        print('- multis: ' + repr(t))

								    print('- range mappings: %d' % len(ranges))
								    print('- single character mappings: %d' % len(singles))
								    print('- complex mappings (1:n): %d' % len(multis))
								    print('- remaining (should be zero): %d' % len(convmap.keys()))

								    # XXX: opportunities for diff encoding skip=3 ranges?
								    prev = None
								    for t in ranges:
								        # range: [start_i, start_o, count, skip]
								        if t[3] != 3:
								            continue
								        if prev is not None:
								            print('- %d %d' % (t[0] - prev[0], t[1] - prev[1]))
								        else:
								            print('- start: %d %d' % (t[0], t[1]))
								        prev = t

								    # Bit packed encoding.

								    be = dukutil.BitEncoder()

								    for curr_skip in range(1, 7):    # 1...6
								        skip_ranges = [r for r in ranges if r[3] == curr_skip]
								        count = len(skip_ranges)
								        if count >= 0x3f:
								            # 0x3f is emitted below as the end-of-skips marker in the same
								            # 6-bit field, so a real count of 0x3f could not be told apart
								            # from it.
								            raise Exception('too many ranges for skip %d: %d' % (curr_skip, count))
								        be.bits(count, 6)
								        print('- encode: skip=%d, count=%d' % (curr_skip, count))

								        for r in skip_ranges:
								            start_i, start_o, r_count = r[0], r[1], r[2]
								            be.bits(start_i, 16)
								            be.bits(start_o, 16)
								            be.bits(r_count, 7)
								    be.bits(0x3f, 6)    # maximum count value = end of skips

								    count = len(singles)
								    be.bits(count, 7)
								    for t in singles:
								        cp_i, cp_o = t[0], t[1]
								        be.bits(cp_i, 16)
								        be.bits(cp_o, 16)

								    count = len(multis)
								    be.bits(count, 7)
								    for t in multis:
								        cp_i, str_o = t[0], t[1]
								        be.bits(cp_i, 16)
								        be.bits(len(str_o), 2)
								        for ch in str_o:
								            be.bits(ord(ch), 16)

								    return be.getBytes(), be.getNumBits()


								def generate_regexp_canonicalize_tables(convmap):
								    """Generate tables for case insensitive RegExp normalization.

								    'convmap' maps codepoint -> replacement string and is only read.

								    Returns (canontab, canon_bitmap):
								    - canontab is a list of 65536 integers, canontab[cp] being the
								      canonicalized codepoint for BMP codepoint cp.
								    - canon_bitmap is a dict with keys 'data' (list of byte values, one bit
								      per codepoint block), 'block_size', 'block_shift' and 'block_mask'.

								    Several intermediate results are computed only for the statistics they
								    print (marked "Currently unused" below).
								    """

								    # Generate a direct codepoint lookup for canonicalizing BMP range.

								    def generate_canontab():
								        res = []
								        highest_nonid = -1

								        for cp in range(65536):
								            res_cp = cp  # default to as is
								            if cp in convmap:
								                tmp = convmap[cp]
								                if len(tmp) == 1:
								                    # Only single codepoint results are used; a result
								                    # with multiple codepoints is ignored.
								                    res_cp = ord(tmp[0])
								            if cp >= 0x80 and res_cp < 0x80:
								                res_cp = cp  # If non-ASCII mapped to ASCII, ignore.
								            if cp != res_cp:
								                highest_nonid = cp
								            res.append(res_cp)

								        # At the moment this is 65370, which means there's very little
								        # gain in assuming 1:1 mapping above a certain BMP codepoint
								        # (though we do assume 1:1 mapping for above BMP codepoints).
								        print('- highest non-identity mapping: %d' % highest_nonid)

								        return res

								    print('generate canontab')
								    canontab = generate_canontab()

								    # Figure out which BMP values are never the result of canonicalization.
								    # Such codepoints are "don't care" in the sense that they are never
								    # matched against at runtime: ranges are canonicalized at compile time,
								    # and codepoint being matched is also canonicalized at run time.
								    # (Currently unused.)

								    def generate_dontcare():
								        res = [ True ] * 65536
								        for cp in canontab:
								            res[cp] = False
								        res_count = 0
								        for x in res:
								            if x:
								                res_count += 1
								        print('- %d dontcare codepoints' % res_count)
								        return res

								    print('generate canon dontcare')
								    dontcare = generate_dontcare()

								    # Generate maximal continuous ranges for canonicalization.  A continuous
								    # range is a sequence with N codepoints where IN+i canonicalizes to OUT+i
								    # for fixed IN, OUT, and i in 0...N-1.  There are unfortunately >1000
								    # of these ranges, mostly because there are a lot of individual exceptions.
								    # (Currently unused.)

								    canon_ranges = []
								    for cp in range(65536):
								        canon_ranges.append([ cp, canontab[cp], 1 ])  # 1 codepoint ranges at first

								    def merge_compatible_nogap(rng1, rng2):
								        # Merge adjacent ranges if continuity allows.
								        if rng1[0] + rng1[2] == rng2[0] and \
								           rng1[1] + rng1[2] == rng2[1]:
								            return [ rng1[0], rng1[1], rng1[2] + rng2[2] ]
								        return None

								    def merge_check_nogap():
								        # One merge pass over canon_ranges.  Returns the shortened list if
								        # anything was merged, None otherwise.
								        len_start = len(canon_ranges)
								        for i in range(len(canon_ranges) - 1):
								            j = i + 1
								            rng1 = canon_ranges[i]
								            rng2 = canon_ranges[j]
								            if rng1 is None or rng2 is None: continue
								            merged = merge_compatible_nogap(rng1, rng2)
								            if merged is not None:
								                canon_ranges[j] = None
								                canon_ranges[i] = merged
								        filtered = []
								        for x in canon_ranges:
								            if x is not None:
								                filtered.append(x)
								        len_end = len(filtered)
								        if len_end < len_start:
								            return filtered
								        return None

								    print('generate canon_ranges')
								    while True:
								        # Starting from individual ranges of 1 codepoint, merge adjacent
								        # ranges until no more ranges can be merged.
								        t = merge_check_nogap()
								        if t is None:
								            break
								        canon_ranges = t
								    print('- %d ranges' % len(canon_ranges))
								    #for rng in canon_ranges:
								    #    print('canon_ranges:')
								    #    print(repr(rng))

								    # Generate true/false ranges for BMP codepoints where:
								    # - A codepoint is flagged true if continuity is broken at that point, so
								    #   an explicit codepoint canonicalization is needed at runtime.
								    # - A codepoint is flagged false if case conversion is continuous from the
								    #   previous codepoint, i.e. out_curr = out_prev + 1.
								    #
								    # The result is a lot of small ranges due to a lot of small 'false' ranges.
								    # Reduce the range set by checking if adjacent 'true' ranges have at most
								    # false_limit 'false' entries between them.  If so, force the 'false'
								    # entries to 'true' (safe but results in an unnecessary runtime codepoint
								    # lookup) and merge the three ranges into a larger 'true' range.
								    #
								    # (Currently unused.)

								    def generate_needcheck_straight():
								        res = [ True ] * 65536
								        assert(canontab[0] == 0)  # can start from in == out == 0
								        prev_in = -1
								        prev_out = -1
								        for i in range(65536):
								            # First create a straight true/false bitmap for BMP.
								            curr_in = i
								            curr_out = canontab[i]
								            if prev_in + 1 == curr_in and prev_out + 1 == curr_out:
								                res[i] = False
								            prev_in = curr_in
								            prev_out = curr_out
								        return res

								    def generate_needcheck_ranges(data):
								        # Generate maximal accurate ranges.
								        prev = None
								        count = 0
								        ranges = []
								        for i in data:
								            if prev is None or prev != i:
								                if prev is not None:
								                    ranges.append([ prev, count ])
								                prev = i
								                count = 1
								            else:
								                count += 1
								        if prev is not None:
								            ranges.append([ prev, count ])
								        return ranges

								    def fillin_needcheck_ranges(data, false_limit):
								        # Fill in TRUE-FALSE*N-TRUE gaps into TRUE-TRUE*N-TRUE which is
								        # safe (leads to an unnecessary runtime check) but reduces
								        # range data size considerably.
								        res = []
								        for r in data:
								            res.append([ r[0], r[1] ])
								        while True:
								            found = False
								            for i in range(len(res) - 2):
								                r1 = res[i]
								                r2 = res[i + 1]
								                r3 = res[i + 2]
								                if r1[0] == True and r2[0] == False and r3[0] == True and \
								                   r2[1] <= false_limit:
								                    #print('fillin %d falses' % r2[1])
								                    res.pop(i + 2)
								                    res.pop(i + 1)
								                    res[i] = [ True, r1[1] + r2[1] + r3[1] ]
								                    found = True
								                    break
								            if not found:
								                break
								        return res

								    print('generate needcheck straight')
								    needcheck = generate_needcheck_straight()

								    print('generate needcheck without false fillins')
								    needcheck_ranges1 = generate_needcheck_ranges(needcheck)
								    print('- %d ranges' % len(needcheck_ranges1))
								    #print(needcheck_ranges1)

								    print('generate needcheck with false fillins')
								    needcheck_ranges2 = fillin_needcheck_ranges(needcheck_ranges1, 11)
								    print('- %d ranges' % len(needcheck_ranges2))
								    #print(needcheck_ranges2)

								    # Generate a bitmap for BMP, divided into N-codepoint blocks, with each
								    # bit indicating: "entire codepoint block canonicalizes continuously, and
								    # the block is continuous with the previous and next block".  A 'true'
								    # entry allows runtime code to just skip the block, advancing 'in' and
								    # 'out' by the block size, with no codepoint conversion.  The block size
								    # should be large enough to produce a relatively small lookup table, but
								    # small enough to reduce codepoint conversions to a manageable number
								    # because the conversions are (currently) quite slow.  This matters
								    # especially for case-insensitive RegExps; without any optimization,
								    # /[\u0000-\uffff]/i requires 65536 case conversions for runtime
								    # normalization.

								    block_shift = 5
								    block_size = 1 << block_shift
								    block_mask = block_size - 1
								    # Floor division: num_blocks is used as a list length and a range
								    # bound, so it must stay an integer on every Python version.
								    num_blocks = 65536 // block_size

								    def generate_block_bits(check_continuity):
								        res = [ True ] * num_blocks
								        for i in range(num_blocks):
								            base_in = i * block_size
								            base_out = canontab[base_in]
								            if check_continuity:
								                lower = -1   # [-1,block_size]
								                upper = block_size + 1
								            else:
								                lower = 0    # [0,block_size-1]
								                upper = block_size
								            for j in range(lower, upper):
								                cp = base_in + j
								                # Neighbours outside the BMP are not checked.
								                if cp >= 0x0000 and cp <= 0xffff and canontab[cp] != base_out + j:
								                    res[i] = False
								                    break
								        return res

								    def dump_block_bitmap(bits):
								        tmp = ''.join([ ({ True: 'x', False: '.' })[b] for b in bits])
								        tmp = re.sub(r'.{64}', lambda x: x.group(0) + '\n', tmp)
								        blocks_true = tmp.count('x')
								        blocks_false = tmp.count('.')
								        print('%d codepoint blocks are continuous, %d blocks are not' % (blocks_true, blocks_false))
								        sys.stdout.write(tmp)
								        #print(bits)

								    def dump_test_lookup(bits):
								        sys.stdout.write('duk_uint8_t test = {')
								        for b in bits:
								            if b:
								                sys.stdout.write('1,')
								            else:
								                sys.stdout.write('0,')
								        sys.stdout.write('};\n')

								    def convert_to_bitmap(bits):
								        # C code looks up bits as:
								        #   index = codepoint >> N
								        #   bitnum = codepoint & mask
								        #   bitmask = 1 << bitnum
								        # So block 0 is mask 0x01 of first byte, block 1 is mask 0x02 of
								        # first byte, etc.
								        res = []
								        curr = 0
								        mask = 0x01
								        for b in bits:
								            if b:
								                curr += mask
								            mask = mask * 2
								            if mask == 0x100:
								                res.append(curr)
								                curr = 0
								                mask = 0x01
								        assert(mask == 0x01)  # no leftover
								        return res

								    print('generate canon block bitmap without continuity')
								    block_bits1 = generate_block_bits(False)
								    dump_block_bitmap(block_bits1)
								    dump_test_lookup(block_bits1)

								    print('generate canon block bitmap with continuity')
								    block_bits2 = generate_block_bits(True)
								    dump_block_bitmap(block_bits2)
								    dump_test_lookup(block_bits2)

								    print('generate final canon bitmap')
								    block_bitmap = convert_to_bitmap(block_bits2)
								    print('- %d bytes' % len(block_bitmap))
								    print('- ' + repr(block_bitmap))
								    canon_bitmap = {
								        'data': block_bitmap,
								        'block_size': block_size,
								        'block_shift': block_shift,
								        'block_mask': block_mask
								    }

								    # This is useful to figure out corner case test cases.
								    print('canon blocks which are different with and without continuity check')
								    for i in range(num_blocks):
								        if block_bits1[i] != block_bits2[i]:
								            print('- block %d ([%d,%d]) differs' % (i, i * block_size, i * block_size + block_size - 1))

								    return canontab, canon_bitmap


								def clonedict(x):
								    """Return a new dict with the same keys and values as 'x'.

								    The copy is shallow: the values themselves are shared, not copied.
								    """
								    return dict((key, x[key]) for key in x.keys())


								def main():
								    """Command line entry point.

								    Reads the files given by --unicode-data and --special-casing, builds
								    the case conversion maps and, depending on --command, writes a C source
								    file (--out-source) and a C header file (--out-header):

								    - caseconv_bitpacked: bit-packed uppercase and lowercase tables
								    - re_canon_lookup: direct RegExp canonicalization lookup table
								    - re_canon_bitmap: codepoint block bitmap for RegExp canonicalization

								    A missing required option ends the program through parser.error().
								    Raises Exception for an unknown --command.
								    """
								    parser = optparse.OptionParser()
								    parser.add_option('--command', dest='command', default='caseconv_bitpacked')
								    parser.add_option('--unicode-data', dest='unicode_data')
								    parser.add_option('--special-casing', dest='special_casing')
								    parser.add_option('--out-source', dest='out_source')
								    parser.add_option('--out-header', dest='out_header')
								    parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc')
								    parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc')
								    parser.add_option('--table-name-re-canon-lookup', dest='table_name_re_canon_lookup', default='caseconv_re_canon_lookup')
								    parser.add_option('--table-name-re-canon-bitmap', dest='table_name_re_canon_bitmap', default='caseconv_re_canon_bitmap')
								    (opts, args) = parser.parse_args()

								    # Every command reads both inputs and writes both outputs, so report
								    # a missing option up front instead of failing later in open(None).
								    for optname in ('unicode_data', 'special_casing', 'out_source', 'out_header'):
								        if getattr(opts, optname) is None:
								            parser.error('missing required option --%s' % optname.replace('_', '-'))

								    def new_genc():
								        # Fresh C generator with the standard file header emitted.
								        genc = dukutil.GenerateC()
								        genc.emitHeader('extract_caseconv.py')
								        return genc

								    def write_output(filename, genc):
								        # Write the generated text; 'with' closes the file even when
								        # the write fails.
								        with open(filename, 'wb') as f:
								            f.write(genc.getString())

								    unicode_data = UnicodeData(opts.unicode_data)
								    special_casing = SpecialCasing(opts.special_casing)

								    uc, lc, tc = get_base_conversion_maps(unicode_data)
								    update_special_casings(uc, lc, tc, special_casing)

								    if opts.command == 'caseconv_bitpacked':
								        # XXX: ASCII and non-BMP filtering could be an option but is now hardcoded

								        # ASCII is handled with 'fast path' so not needed here.
								        # Work on clones because table generation consumes its input map.
								        t = clonedict(uc)
								        remove_ascii_part(t)
								        uc_bytes, uc_nbits = generate_caseconv_tables(t)

								        t = clonedict(lc)
								        remove_ascii_part(t)
								        lc_bytes, lc_nbits = generate_caseconv_tables(t)

								        # Generate C source and header files.
								        genc = new_genc()
								        genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
								        genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
								        write_output(opts.out_source, genc)

								        genc = new_genc()
								        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes)))
								        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes)))
								        write_output(opts.out_header, genc)
								    elif opts.command == 're_canon_lookup':
								        # Direct canonicalization lookup for case insensitive regexps, includes ascii part.
								        t = clonedict(uc)
								        re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)

								        genc = new_genc()
								        genc.emitArray(re_canon_lookup, opts.table_name_re_canon_lookup, size=len(re_canon_lookup), typename='duk_uint16_t', intvalues=True, const=True)
								        write_output(opts.out_source, genc)

								        genc = new_genc()
								        genc.emitLine('extern const duk_uint16_t %s[%d];' % (opts.table_name_re_canon_lookup, len(re_canon_lookup)))
								        write_output(opts.out_header, genc)
								    elif opts.command == 're_canon_bitmap':
								        # N-codepoint block bitmap for skipping continuous codepoint blocks
								        # quickly.
								        t = clonedict(uc)
								        re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)

								        genc = new_genc()
								        genc.emitArray(re_canon_bitmap['data'], opts.table_name_re_canon_bitmap, size=len(re_canon_bitmap['data']), typename='duk_uint8_t', intvalues=True, const=True)
								        write_output(opts.out_source, genc)

								        genc = new_genc()
								        genc.emitDefine('DUK_CANON_BITMAP_BLKSIZE', re_canon_bitmap['block_size'])
								        genc.emitDefine('DUK_CANON_BITMAP_BLKSHIFT', re_canon_bitmap['block_shift'])
								        genc.emitDefine('DUK_CANON_BITMAP_BLKMASK', re_canon_bitmap['block_mask'])
								        genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_re_canon_bitmap, len(re_canon_bitmap['data'])))
								        write_output(opts.out_header, genc)
								    else:
								        raise Exception('invalid command: %r' % opts.command)


								# Run the table generator only when this file is executed as a script,
								# not when it is imported as a module.
								if __name__ == '__main__':
								    main()