duktape/tools/extract_chars.py


								#!/usr/bin/env python2

								#

								#  Select a set of Unicode characters (based on included/excluded categories

								#  etc) and write out a compact bitstream for matching a character against

								#  the set at runtime.  This is for the slow path, where we're especially

								#  concerned with compactness.  A C source file with the table is written,

								#  together with a matching C header.

								#

								#  Unicode categories (such as 'Z') can be used.  Two pseudo-categories

								#  are also available for exclusion only: ASCII and NONBMP.  "ASCII"

								#  category excludes ASCII codepoints which is useful because C code

								#  typically contains an ASCII fast path so ASCII characters don't need

								#  to be considered in the Unicode tables.  "NONBMP" excludes codepoints

								#  above U+FFFF which is useful because such codepoints don't need to be

								#  supported in standard ECMAScript.

								#


								import os

								import sys

								import math

								import optparse


								import dukutil


								def read_unicode_data(unidata, catsinc, catsexc, filterfunc):
								    """Read UnicodeData.txt and return the lines selected by the filters.

								    A line is kept when its category (third ';'-separated field) matches
								    an entry of catsinc, unless the line is rejected by filterfunc or
								    matches an entry of catsexc.  An entry matches when it equals the
								    category, is a prefix of the category (e.g. 'Z' matches 'Zs'), or
								    equals the codepoint field of the line verbatim.

								    Arguments:
								      unidata    -- path of the UnicodeData.txt file
								      catsinc    -- list of entries to include
								      catsexc    -- list of entries to exclude (exclusion wins)
								      filterfunc -- callable taking the integer codepoint and returning
								                    a false value to drop the line; None accepts all

								    Returns the selected lines unmodified (line terminators included),
								    sorted by ascending codepoint.
								    """

								    def filter_none(cp):
								        return True
								    if filterfunc is None:
								        filterfunc = filter_none

								    # The Unicode parsing is slow enough to warrant some speedups: exact
								    # category hits are resolved with a set lookup before falling back
								    # to the linear prefix/codepoint scan.
								    exclude_cat_exact = set(catsexc)
								    include_cat_exact = set(catsinc)

								    def matches(entries, category, codepoint):
								        # Slow path shared by the include and exclude lists.
								        return any(category.startswith(cat) or codepoint == cat
								                   for cat in entries)

								    res = []

								    # UnicodeData.txt is expected to be plain ASCII.  Text mode keeps the
								    # fields as 'str' on both Python 2 and Python 3, and the 'with' block
								    # closes the file even if a malformed line raises.
								    with open(unidata, 'r') as f:
								        for line in f:
								            parts = line.split(';')

								            codepoint = parts[0]
								            if not filterfunc(int(codepoint, 16)):
								                continue

								            category = parts[2]
								            if category in exclude_cat_exact:
								                continue  # quick reject
								            if matches(catsexc, category, codepoint):
								                continue

								            if category in include_cat_exact or matches(catsinc, category, codepoint):
								                res.append(line)

								    # Sort based on Unicode codepoint; the key is parsed once per line
								    # and the sort is stable, so equal codepoints keep file order.
								    res.sort(key=lambda line: int(line.split(';', 1)[0], 16))

								    return res


								def scan_ranges(lines):
								    """Scan continuous ranges from (filtered) UnicodeData.txt lines.

								    Each line must start with a hexadecimal codepoint followed by ';'.
								    Consecutive lines whose codepoints differ by exactly one are merged
								    into a single range; the lines are processed in the order given, so
								    the caller is expected to pass them sorted by codepoint.

								    Returns a list of (start, end) tuples with inclusive ends; an empty
								    input yields an empty list.
								    """
								    ranges = []
								    range_start = None
								    prev = None

								    for line in lines:
								        # int() instead of the Python-2-only long(): same parse, and
								        # Python 2 promotes to long automatically when needed.
								        n = int(line.split(';', 1)[0], 16)
								        if range_start is None:
								            range_start = n
								        elif n != prev + 1:
								            # Gap (or repeated codepoint): close the current range.
								            ranges.append((range_start, prev))
								            range_start = n
								        prev = n

								    # Flush the range that was still open at the end of input.
								    if range_start is not None:
								        ranges.append((range_start, prev))

								    return ranges


								def generate_png(lines, fname):
								    """Generate an illustrative PNG of the character set.

								    The image is 256 pixels wide with one pixel per codepoint in
								    U+0000..U+10FFFF, laid out row by row.  Codepoints present in lines
								    (';'-separated UnicodeData.txt lines, hexadecimal codepoint first)
								    are drawn black, all others white.  The image is saved to fname.

								    Requires PIL, which is imported lazily so the rest of the tool works
								    without it.
								    """
								    from PIL import Image

								    codepoints = 0x10ffff + 1
								    width = 256
								    height = int(math.ceil(float(codepoints) / float(width)))
								    black = (0, 0, 0)
								    white = (255, 255, 255)

								    # Start from an all-white image and paint only the member
								    # codepoints instead of touching every one of the ~1.1M pixels.
								    im = Image.new('RGB', (width, height), white)
								    for line in lines:
								        cp = int(line.split(';', 1)[0], 16)
								        if 0 <= cp < codepoints:
								            # divmod keeps the row index an integer on Python 3 too.
								            y, x = divmod(cp, width)
								            im.putpixel((x, y), black)

								    im.save(fname)


								def generate_match_table1(ranges):
								    """Unused match table format.

								    This is an earlier match table format which is no longer used.
								    IdentifierStart-UnicodeLetter has 445 ranges and generates a
								    match table of 2289 bytes.

								    Layout of the returned list of byte values:
								      - 2 bytes: start of the first range, high byte first
								      - 2 bytes: number of data entries, high byte first
								      - data: alternating range length / skip length entries; a run
								        longer than 255 is split into 255-sized chunks separated by a
								        zero-length entry of the other kind

								    ranges is a non-empty list of (start, end) tuples with inclusive
								    ends, in ascending order.  Raises Exception if a range ends before
								    it starts or if consecutive ranges touch or overlap; an empty list
								    raises IndexError.
								    """

								    data = []

								    def emit_run(first, last):
								        # Append the length of first..last (inclusive), chunked so that
								        # every emitted value fits in one byte.
								        while True:
								            now = last - first + 1
								            if now <= 255:
								                data.append(now)
								                return
								            data.append(255)    # full chunk of the current kind
								            data.append(0)      # zero-length entry of the other kind
								            first += 255

								    def genrange(rs, re):
								        if rs > re:
								            raise Exception('assumption failed: rs=%d re=%d' % (rs, re))
								        emit_run(rs, re)

								    def genskip(ss, se):
								        if ss > se:
								            raise Exception('assumption failed: ss=%d se=%s' % (ss, se))
								        emit_run(ss, se)

								    prev_end = None
								    for start, end in ranges:
								        if prev_end is not None:
								            genskip(prev_end + 1, start - 1)
								        genrange(start, end)
								        prev_end = end

								    num_entries = len(data)

								    # header: start of first range
								    #         num entries
								    # Both header bytes come from the START of the first range; the low
								    # byte was previously taken from the range end by mistake.
								    first_start = ranges[0][0]
								    hdr = [
								        first_start >> 8,    # XXX: check that not 0x10000 or over
								        first_start & 0xff,
								        num_entries >> 8,
								        num_entries & 0xff,
								    ]

								    return hdr + data


								def generate_match_table2(ranges):
								    """Unused match table format.

								    Another attempt at a match table which is also unused.  Total
								    tables for all current classes is now 1472 bytes.

								    For every (start, end) range two values are emitted: the distance
								    from the previous range end (initially 0) to the start, and the
								    range length minus one.  Each value is stored as a sequence of
								    7-bit groups, least significant group first, with bit 0x80 set on
								    every byte except the last.  A zero value terminates the table.

								    Returns the table as a list of byte values.
								    """

								    data = []

								    def enc(x):
								        # Emit continuation bytes while more than 7 bits remain.
								        while x >= 0x80:
								            data.append(0x80 + (x & 0x7f))
								            x >>= 7
								        data.append(x)

								    last_end = 0
								    for first, last in ranges:
								        enc(first - last_end)   # 1 or above (no unjoined ranges)
								        enc(last - first)       # 0 or above
								        last_end = last

								    # end marker
								    enc(0)

								    return data


								def generate_match_table3(ranges):
								    """Current match table format.

								    Yet another attempt, similar to generate_match_table2 except in
								    packing format.  Total match size now (at time of writing): 1194
								    bytes.  This is the current encoding format used in duk_lexer.c.

								    For every (start, end) range two values are emitted: the distance
								    from the previous range end (initially 0) to the start, and the
								    range length minus one.  A final zero is the end marker.  Each value
								    is written to a dukutil.BitEncoder using the shortest of:
								      - 4 bits:   the value itself, 0x0..0xe
								      - 12 bits:  0xf, then 8 bits holding value - 15 (0x00..0xfd)
								      - 24 bits:  0xf, 0xfe, then 12 bits holding value - 269
								      - 36 bits:  0xf, 0xff, then 24 bits holding value - 4365

								    Returns (data, freq): data is whatever BitEncoder.getBytes() returns
								    and freq is a list indexed by value counting how often each value
								    was encoded (informative only).  Raises Exception('cannot encode')
								    for a value outside 0..0x10ffff.
								    """

								    be = dukutil.BitEncoder()

								    freq = [0] * (0x10ffff + 1)  # informative

								    def enc(x):
								        # Reject values the format cannot represent up front; a negative
								        # value would otherwise silently index freq from the end.
								        if x < 0 or x > 0x10ffff:
								            raise Exception('cannot encode')

								        freq[x] += 1

								        if x <= 0x0e:
								            # 4-bit encoding
								            be.bits(x, 4)
								            return
								        x -= 0x0e + 1
								        if x <= 0xfd:
								            # 12-bit encoding
								            be.bits(0x0f, 4)
								            be.bits(x, 8)
								            return
								        x -= 0xfd + 1
								        if x <= 0xfff:
								            # 24-bit encoding
								            be.bits(0x0f, 4)
								            be.bits(0xfe, 8)
								            be.bits(x, 12)
								            return
								        x -= 0xfff + 1

								        # 36-bit encoding; the range check above guarantees that the
								        # remainder fits into 24 bits.
								        be.bits(0x0f, 4)
								        be.bits(0xff, 8)
								        be.bits(x, 24)

								    prev_re = 0

								    for rs, re in ranges:
								        r1 = rs - prev_re    # 1 or above (no unjoined ranges)
								        r2 = re - rs         # 0 or above
								        enc(r1)
								        enc(r2)
								        prev_re = re

								    enc(0)    # end marker

								    return be.getBytes(), freq


								def main():
								    """Command line entry point: build the match table and write outputs.

								    Reads the file given with --unicode-data, selects characters using
								    --include-categories and --exclude-categories (comma separated; the
								    pseudo-categories ASCII and NONBMP in the exclude list filter by
								    codepoint), encodes the resulting ranges with
								    generate_match_table3() and writes the C source (--out-source), the
								    C header (--out-header) and an illustrative PNG (--out-png) for
								    whichever of those options are given.  Progress and encoding
								    statistics are printed to stdout.
								    """
								    parser = optparse.OptionParser()
								    parser.add_option('--unicode-data', dest='unicode_data')      # UnicodeData.txt
								    parser.add_option('--special-casing', dest='special_casing')  # SpecialCasing.txt
								    parser.add_option('--include-categories', dest='include_categories')
								    parser.add_option('--exclude-categories', dest='exclude_categories', default='NONE')
								    parser.add_option('--out-source', dest='out_source')
								    parser.add_option('--out-header', dest='out_header')
								    parser.add_option('--out-png', dest='out_png')
								    parser.add_option('--table-name', dest='table_name', default='match_table')
								    (opts, args) = parser.parse_args()

								    unidata = opts.unicode_data
								    if unidata is None:
								        # Fail with a usage message instead of a TypeError from open().
								        parser.error('--unicode-data is required')

								    # A missing --include-categories is treated like an empty one;
								    # previously None slipped past the != '' test and crashed on split().
								    catsinc = []
								    if opts.include_categories:
								        catsinc = opts.include_categories.split(',')
								    catsexc = []
								    if opts.exclude_categories != 'NONE':
								        catsexc = opts.exclude_categories.split(',')

								    print('CATSEXC: %s' % repr(catsexc))
								    print('CATSINC: %s' % repr(catsinc))

								    # pseudocategories
								    filter_ascii = ('ASCII' in catsexc)
								    filter_nonbmp = ('NONBMP' in catsexc)

								    # Read raw result
								    def filter1(x):
								        if filter_ascii and x <= 0x7f:
								            # exclude ascii
								            return False
								        if filter_nonbmp and x >= 0x10000:
								            # exclude non-bmp
								            return False
								        return True

								    print('read unicode data')
								    uni_filtered = read_unicode_data(unidata, catsinc, catsexc, filter1)
								    print('done reading unicode data')

								    # Scan ranges
								    ranges = scan_ranges(uni_filtered)
								    print('%d ranges total' % len(ranges))

								    # Generate match table
								    matchtable3, freq = generate_match_table3(ranges)
								    print('match table length: %d bytes' % len(matchtable3))
								    print('encoding freq:')
								    for i, count in enumerate(freq):
								        if count == 0:
								            continue
								        print('  %6d: %d' % (i, count))

								    print('')
								    print('MATCH C TABLE -> file %s' % repr(opts.out_header))

								    # Create C source and header files
								    genc = dukutil.GenerateC()
								    genc.emitHeader('extract_chars.py')
								    genc.emitArray(matchtable3, opts.table_name, size=len(matchtable3), typename='duk_uint8_t', intvalues=True, const=True)
								    if opts.out_source is not None:
								        with open(opts.out_source, 'wb') as f:
								            f.write(genc.getString())

								    genc = dukutil.GenerateC()
								    genc.emitHeader('extract_chars.py')
								    genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name, len(matchtable3)))
								    if opts.out_header is not None:
								        with open(opts.out_header, 'wb') as f:
								            f.write(genc.getString())

								    # Image (for illustrative purposes only)
								    if opts.out_png is not None:
								        generate_png(uni_filtered, opts.out_png)

								# Run the command line tool only when executed as a script, not when the
								# module is imported.
								if __name__ == '__main__':

								    main()