You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

447 lines
14 KiB

#!/usr/bin/env python2
#
# Extract rules for Unicode case conversion, specifically the behavior
# required by Ecmascript E5 in Sections 15.5.4.16 to 15.5.4.19. The
# bitstream encoded rules are used for the slow path at run time, so
# compactness is favored over speed.
#
# There is no support for context or locale sensitive rules, as they
# are handled directly in C code before consulting tables generated
# here. Ecmascript requires case conversion both with and without
# locale/language specific rules (e.g. String.prototype.toLowerCase()
# and String.prototype.toLocaleLowerCase()), so they are best handled
# in C anyway.
#
# Case conversion rules for ASCII are also excluded as they are handled
# by the C fast path. Rules for non-BMP characters (codepoints above
# U+FFFF) are omitted as they're not required for standard Ecmascript.
#
import os
import sys
import math
import optparse
import dukutil
class UnicodeData:
"Read UnicodeData.txt into an internal representation."
def __init__(self, filename):
self.data = self.read_unicode_data(filename)
print 'read %d unicode data entries' % len(self.data)
def read_unicode_data(self, filename):
res = []
f = open(filename, 'rb')
for line in f:
if line.startswith('#'):
continue
line = line.strip()
if line == '':
continue
parts = line.split(';')
if len(parts) != 15:
raise Exception('invalid unicode data line')
res.append(parts)
f.close()
# Sort based on Unicode codepoint
def mycmp(a,b):
return cmp(long(a[0], 16), long(b[0], 16))
res.sort(cmp=mycmp)
return res
class SpecialCasing:
"Read SpecialCasing.txt into an internal representation."
def __init__(self, filename):
self.data = self.read_special_casing_data(filename)
print 'read %d special casing entries' % len(self.data)
def read_special_casing_data(self, filename):
res = []
f = open(filename, 'rb')
for line in f:
try:
idx = line.index('#')
line = line[:idx]
except ValueError:
pass
line = line.strip()
if line == '':
continue
parts = line.split(';')
parts = [i.strip() for i in parts]
while len(parts) < 6:
parts.append('')
res.append(parts)
f.close()
return res
def parse_unicode_sequence(x):
res = ''
for i in x.split(' '):
i = i.strip()
if i == '':
continue
res += unichr(long(i, 16))
return res
def get_base_conversion_maps(unicode_data):
"Create case conversion tables without handling special casing yet."
uc = {} # codepoint (number) -> string
lc = {}
tc = {} # titlecase
for x in unicode_data.data:
c1 = long(x[0], 16)
# just 16-bit support needed
if c1 >= 0x10000:
continue
if x[12] != '':
# field 12: simple uppercase mapping
c2 = parse_unicode_sequence(x[12])
uc[c1] = c2
tc[c1] = c2 # titlecase default == uppercase, overridden below if necessary
if x[13] != '':
# field 13: simple lowercase mapping
c2 = parse_unicode_sequence(x[13])
lc[c1] = c2
if x[14] != '':
# field 14: simple titlecase mapping
c2 = parse_unicode_sequence(x[14])
tc[c1] = c2
return uc, lc, tc
def update_special_casings(uc, lc, tc, special_casing):
"Update case conversion tables with special case conversion rules."
for x in special_casing.data:
c1 = long(x[0], 16)
if x[4] != '':
# conditions
continue
lower = parse_unicode_sequence(x[1])
title = parse_unicode_sequence(x[2])
upper = parse_unicode_sequence(x[3])
if len(lower) > 1:
lc[c1] = lower
if len(upper) > 1:
uc[c1] = upper
if len(title) > 1:
tc[c1] = title
print 'special case: %d %d %d' % (len(lower), len(upper), len(title))
def remove_ascii_part(convmap):
"Remove ASCII case conversion parts (handled by C fast path)."
for i in xrange(128):
if convmap.has_key(i):
del convmap[i]
def scan_range_with_skip(convmap, start_idx, skip):
"Scan for a range of continuous case conversion with a certain 'skip'."
conv_i = start_idx
if not convmap.has_key(conv_i):
return None, None, None
elif len(convmap[conv_i]) > 1:
return None, None, None
else:
conv_o = ord(convmap[conv_i])
start_i = conv_i
start_o = conv_o
while True:
new_i = conv_i + skip
new_o = conv_o + skip
if not convmap.has_key(new_i):
break
if len(convmap[new_i]) > 1:
break
if ord(convmap[new_i]) != new_o:
break
conv_i = new_i
conv_o = new_o
# [start_i,conv_i] maps to [start_o,conv_o], ignore ranges of 1 char
count = (conv_i - start_i) / skip + 1
if count <= 1:
return None, None, None
# we have an acceptable range, remove them from the convmap here
for i in xrange(start_i, conv_i + skip, skip):
del convmap[i]
return start_i, start_o, count
def find_first_range_with_skip(convmap, skip):
"Find first range with a certain 'skip' value."
for i in xrange(65536):
start_i, start_o, count = scan_range_with_skip(convmap, i, skip)
if start_i is None:
continue
return start_i, start_o, count
return None, None, None
def generate_tables(convmap):
"Generate bit-packed case conversion table for a given conversion map."
# The bitstream encoding is based on manual inspection for whatever
# regularity the Unicode case conversion rules have.
#
# Start with a full description of case conversions which does not
# cover all codepoints; unmapped codepoints convert to themselves.
# Scan for range-to-range mappings with a range of skips starting from 1.
# Whenever a valid range is found, remove it from the map. Finally,
# output the remaining case conversions (1:1 and 1:n) on a per codepoint
# basis.
#
# This is very slow because we always scan from scratch, but its the
# most reliable and simple way to scan
ranges = [] # range mappings (2 or more consecutive mappings with a certain skip)
singles = [] # 1:1 character mappings
multis = [] # 1:n character mappings
# Ranges with skips
for skip in xrange(1,6+1): # skips 1...6 are useful
while True:
start_i, start_o, count = find_first_range_with_skip(convmap, skip)
if start_i is None:
break
print 'skip %d: %d %d %d' % (skip, start_i, start_o, count)
ranges.append([start_i, start_o, count, skip])
# 1:1 conversions
k = convmap.keys()
k.sort()
for i in k:
if len(convmap[i]) > 1:
continue
singles.append([i, ord(convmap[i])]) # codepoint, codepoint
del convmap[i]
# There are many mappings to 2-char sequences with latter char being U+0399.
# These could be handled as a special case, but we don't do that right now.
#
# [8064L, u'\u1f08\u0399']
# [8065L, u'\u1f09\u0399']
# [8066L, u'\u1f0a\u0399']
# [8067L, u'\u1f0b\u0399']
# [8068L, u'\u1f0c\u0399']
# [8069L, u'\u1f0d\u0399']
# [8070L, u'\u1f0e\u0399']
# [8071L, u'\u1f0f\u0399']
# ...
#
# tmp = {}
# k = convmap.keys()
# k.sort()
# for i in k:
# if len(convmap[i]) == 2 and convmap[i][1] == u'\u0399':
# tmp[i] = convmap[i][0]
# del convmap[i]
# print repr(tmp)
#
# skip = 1
# while True:
# start_i, start_o, count = find_first_range_with_skip(tmp, skip)
# if start_i is None:
# break
# print 'special399, skip %d: %d %d %d' % (skip, start_i, start_o, count)
# print len(tmp.keys())
# print repr(tmp)
# XXX: need to put 12 remaining mappings back to convmap
# 1:n conversions
k = convmap.keys()
k.sort()
for i in k:
multis.append([i, convmap[i]]) # codepoint, string
del convmap[i]
for t in singles:
print repr(t)
for t in multis:
print repr(t)
print 'range mappings: %d' % len(ranges)
print 'single character mappings: %d' % len(singles)
print 'complex mappings (1:n): %d' % len(multis)
print 'remaining (should be zero): %d' % len(convmap.keys())
# XXX: opportunities for diff encoding skip=3 ranges?
prev = None
for t in ranges:
# range: [start_i, start_o, count, skip]
if t[3] != 3:
continue
if prev is not None:
print '%d %d' % (t[0] - prev[0], t[1] - prev[1])
else:
print 'start: %d %d' % (t[0], t[1])
prev = t
# bit packed encoding
be = dukutil.BitEncoder()
for curr_skip in xrange(1, 7): # 1...6
count = 0
for r in ranges:
start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
if skip != curr_skip:
continue
count += 1
be.bits(count, 6)
print 'encode: skip=%d, count=%d' % (curr_skip, count)
for r in ranges:
start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
if skip != curr_skip:
continue
be.bits(start_i, 16)
be.bits(start_o, 16)
be.bits(r_count, 7)
be.bits(0x3f, 6) # maximum count value = end of skips
count = len(singles)
be.bits(count, 7)
for t in singles:
cp_i, cp_o = t[0], t[1]
be.bits(cp_i, 16)
be.bits(cp_o, 16)
count = len(multis)
be.bits(count, 7)
for t in multis:
cp_i, str_o = t[0], t[1]
be.bits(cp_i, 16)
be.bits(len(str_o), 2)
for i in xrange(len(str_o)):
be.bits(ord(str_o[i]), 16)
return be.getBytes(), be.getNumBits()
def generate_regexp_canonicalize_lookup(convmap):
res = []
highest_nonid = -1
for cp in xrange(65536):
res_cp = cp # default to as is
if convmap.has_key(cp):
tmp = convmap[cp]
if len(tmp) == 1:
# Multiple codepoints from input, ignore
res_cp = ord(tmp[0])
if cp >= 0x80 and res_cp < 0x80:
res_cp = cp # non-ASCII mapped to ASCII, ignore
if cp != res_cp:
highest_nonid = cp
res.append(res_cp)
# At the moment this is 65370, which means there's very little
# gain in assuming 1:1 mapping above a certain BMP codepoint.
print('HIGHEST NON-ID MAPPING: %d' % highest_nonid)
return res
def clonedict(x):
"Shallow clone of input dict."
res = {}
for k in x.keys():
res[k] = x[k]
return res
def main():
parser = optparse.OptionParser()
parser.add_option('--command', dest='command', default='caseconv_bitpacked')
parser.add_option('--unicode-data', dest='unicode_data')
parser.add_option('--special-casing', dest='special_casing')
parser.add_option('--out-source', dest='out_source')
parser.add_option('--out-header', dest='out_header')
parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc')
parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc')
parser.add_option('--table-name-re-canon-lookup', dest='table_name_re_canon_lookup', default='caseconv_re_canon_lookup')
(opts, args) = parser.parse_args()
unicode_data = UnicodeData(opts.unicode_data)
special_casing = SpecialCasing(opts.special_casing)
uc, lc, tc = get_base_conversion_maps(unicode_data)
update_special_casings(uc, lc, tc, special_casing)
if opts.command == 'caseconv_bitpacked':
# XXX: ASCII and non-BMP filtering could be an option but is now hardcoded
# ascii is handled with 'fast path' so not needed here
t = clonedict(uc)
remove_ascii_part(t)
uc_bytes, uc_nbits = generate_tables(t)
t = clonedict(lc)
remove_ascii_part(t)
lc_bytes, lc_nbits = generate_tables(t)
# Generate C source and header files
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
f = open(opts.out_source, 'wb')
f.write(genc.getString())
f.close()
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes)))
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes)))
f = open(opts.out_header, 'wb')
f.write(genc.getString())
f.close()
elif opts.command == 're_canon_lookup':
# direct canonicalization lookup for case insensitive regexps, includes ascii part
t = clonedict(uc)
re_canon_lookup = generate_regexp_canonicalize_lookup(t)
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitArray(re_canon_lookup, opts.table_name_re_canon_lookup, size=len(re_canon_lookup), typename='duk_uint16_t', intvalues=True, const=True)
f = open(opts.out_source, 'wb')
f.write(genc.getString())
f.close()
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitLine('extern const duk_uint16_t %s[%d];' % (opts.table_name_re_canon_lookup, len(re_canon_lookup)))
f = open(opts.out_header, 'wb')
f.write(genc.getString())
f.close()
else:
raise Exception('invalid command: %r' % opts.command)
if __name__ == '__main__':
main()