Browse Source

rework genstrings encoding, now more compact and flexible; add some json built-in strings

pull/1/head
Sami Vaarala 12 years ago
parent
commit
a440d8761f
  1. 51
      src/duk_heap_alloc.c
  2. 146
      src/genstrings.py

51
src/duk_heap_alloc.c

@ -4,6 +4,14 @@
#include "duk_internal.h"
/* constants for unpacking built-in string data */
#define BITPACK_LETTER_LIMIT 26
#define BITPACK_UNDERSCORE 26
#define BITPACK_FF 27
#define BITPACK_SWITCH1 29
#define BITPACK_SWITCH 30
#define BITPACK_SEVENBIT 31
/*
* Free a heap object.
*
@ -192,36 +200,45 @@ void duk_heap_free(duk_heap *heap) {
static int init_heap_strings(duk_heap *heap) {
duk_bitdecoder_ctx bd_ctx;
duk_bitdecoder_ctx *bd = &bd_ctx; /* convenience */
duk_u8 lookup[DUK_STRDATA_LOOKUP_LENGTH];
int i, j;
memset(&bd_ctx, 0, sizeof(bd_ctx));
bd->data = (duk_u8 *) duk_strings_data;
bd->length = DUK_STRDATA_DATA_LENGTH;
for (i = 0; i < DUK_STRDATA_LOOKUP_LENGTH; i++) {
lookup[i] = duk_bd_decode(bd, 7);
}
for (i = 0; i < DUK_HEAP_NUM_STRINGS; i++) {
int len;
duk_u8 tmp[DUK_STRDATA_MAX_STRLEN];
duk_hstring *h;
int len;
int mode;
int t;
len = duk_bd_decode(bd, 5);
mode = 32; /* 0 = uppercase, 32 = lowercase (= 'a' - 'A') */
for (j = 0; j < len; j++) {
duk_u8 ch = lookup[duk_bd_decode(bd, 6)];
/*
* Internal keys are prefixed with 0xFF in the stringtable
* (which makes them invalid UTF-8 on purpose). The internal
* marker in init data is 0x00 for technical reasons.
*/
if (ch == 0x00) {
/* 0xFF can never occur in valid UTF-8 */
ch = 0xff;
t = duk_bd_decode(bd, 5);
if (t < BITPACK_LETTER_LIMIT) {
t = t + 'A' + mode;
} else if (t == BITPACK_UNDERSCORE) {
t = (int) '_';
} else if (t == BITPACK_FF) {
/* Internal keys are prefixed with 0xFF in the stringtable
* (which makes them invalid UTF-8 on purpose).
*/
t = (int) 0xff;
} else if (t == BITPACK_SWITCH1) {
t = duk_bd_decode(bd, 5);
DUK_ASSERT(t >= 0 && t <= 25);
t = t + 'A' + (mode ^ 32);
} else if (t == BITPACK_SWITCH) {
mode = mode ^ 32;
t = duk_bd_decode(bd, 5);
DUK_ASSERT(t >= 0 && t <= 25);
t = t + 'A' + mode;
} else if (t == BITPACK_SEVENBIT) {
t = duk_bd_decode(bd, 7);
}
tmp[j] = ch;
tmp[j] = (duk_u8) t;
}
DUK_DDDPRINT("intern built-in string %d", i);

146
src/genstrings.py

@ -511,6 +511,12 @@ duk_string_list = [
mkstr("dec", custom=True),
mkstr("hex", custom=True), # enc/dec alg
mkstr("base64", custom=True), # enc/dec alg
# special literals for custom compatible json encoding
mkstr('{"_undefined":true}'),
mkstr('{"_nan":true}'),
mkstr('{"_inf":true}'),
mkstr('{"_ninf":true}'),
]
# Standard reserved words (non-strict mode + strict mode)
@ -646,6 +652,11 @@ special_define_names = {
'': 'EMPTY_STRING',
',': 'COMMA',
' ': 'SPACE',
'{"_undefined":true}': 'JSON_EXT_UNDEFINED',
'{"_nan":true}': 'JSON_EXT_NAN',
'{"_inf":true}': 'JSON_EXT_POSINF',
'{"_ninf":true}': 'JSON_EXT_NEGINF',
}
#
@ -680,57 +691,105 @@ def get_define_name(x):
def gen_strings_data_bitpacked(strlist):
# Bit-pack the strings in strlist into one byte string using
# dukutil.BitEncoder.  Every strlist entry is unpacked as a pair (s, d);
# only s (the string) is read in this function.
#
# NOTE(review): indentation is missing in this listing, and it appears to
# contain lines from two revisions of the encoder: a lookup-table scheme
# (6-bit index per character) and a mode-switching scheme (5-bit symbols).
# There are two length writes, two print statements and two return
# statements below.  Confirm which lines are live before relying on the
# comments added here.
be = dukutil.BitEncoder()
# Occurrence count per byte value; used further down to build 'lookup'.
freq = [0] * 256
# Strings are encoded as follows: a string begins in lowercase
# mode and recognizes the following 5-bit symbols:
#
# 0-25 'a' ... 'z'
# 26 '_'
# 27 0x00 (actually decoded to 0xff, internal marker)
# 28 reserved
# 29 switch to uppercase for one character
# (next 5-bit symbol must be in range 0-25)
# 30 switch to uppercase
# 31 read a 7-bit character verbatim
#
# Uppercase mode is the same except codes 29 and 30 switch to
# lowercase.
# Numeric values of the special 5-bit symbols listed above.
UNDERSCORE = 26
ZERO = 27
SWITCH1 = 29
SWITCH = 30
SEVENBIT = 31
# Statistics: longest string, highest character code, and how many
# characters went through each encoding path (reported by print below).
maxlen = 0
maxval = 0
n_optimal = 0
n_switch1 = 0
n_switch = 0
n_sevenbit = 0
for s, d in strlist:
for c in s:
freq[ord(c)] += 1
# String length is written as a 5-bit field.
be.bits(len(s), 5)
if len(s) > maxlen:
maxlen = len(s)
for c in s:
if ord(c) > maxval:
maxval = ord(c)
# lookup: byte values that occur at least once, in ascending order.
# invlookup: byte value -> index of that value in lookup.
lookup = []
invlookup = [0] * 256
for i in xrange(256):
if freq[i] != 0:
lookup.append(i)
for i in xrange(len(lookup)):
x = lookup[i]
invlookup[x] = i
uniq = len(lookup)
# Guard the field widths used below: a 6-bit lookup index (at most 64
# entries), a 5-bit length (at most 31) and 7-bit character values
# (at most 127).
if uniq > 64:
raise Exception('too many unique characters for current assumptions')
if maxlen > 31:
raise Exception('string too long for current assumptions')
if maxval > 127:
raise Exception('string maxval too high for current assumptions')
# NOTE: databits is not referenced again anywhere in this function.
databits = []
# lookup table for chars (6 bits -> 7 bit value)
# XXX: can halve by encoding first value and then 3-bit skips,
# but net benefit maybe 20 bytes.
for i in xrange(uniq):
be.bits(lookup[i], 7)
# strings: 5-bit length, N*6-bit characters
for s, d in strlist:
be.bits(len(s), 5)
for c in s:
be.bits(invlookup[ord(c)], 6)
# 5-bit character, mode specific
# Every string starts out in lowercase mode.
mode = 'lowercase'
for idx, c in enumerate(s):
# FIXME: this is not an optimal encoder but good enough
islower = (ord(c) >= ord('a') and ord(c) <= ord('z'))
isupper = (ord(c) >= ord('A') and ord(c) <= ord('Z'))
islast = (idx == len(s) - 1)
# One-character lookahead: the case of the next character decides
# between a lasting mode switch (SWITCH) and a one-character
# shift (SWITCH1).
isnextlower = False
isnextupper = False
if not islast:
c2 = s[idx+1]
isnextlower = (ord(c2) >= ord('a') and ord(c2) <= ord('z'))
isnextupper = (ord(c2) >= ord('A') and ord(c2) <= ord('Z'))
# '_' and '\x00' have their own symbols and are emitted the same way
# in either mode.
if c == '_':
be.bits(UNDERSCORE, 5)
n_optimal += 1
elif c == '\x00':
be.bits(ZERO, 5)
n_optimal += 1
# Letter whose case matches the current mode: one 5-bit symbol.
elif islower and mode == 'lowercase':
be.bits(ord(c) - ord('a'), 5)
n_optimal += 1
elif isupper and mode == 'uppercase':
be.bits(ord(c) - ord('A'), 5)
n_optimal += 1
# Lowercase letter while in uppercase mode: switch mode for good if the
# next character is lowercase too, otherwise shift this character only.
elif islower and mode == 'uppercase':
if isnextlower:
be.bits(SWITCH, 5)
be.bits(ord(c) - ord('a'), 5)
mode = 'lowercase'
n_switch += 1
else:
be.bits(SWITCH1, 5)
be.bits(ord(c) - ord('a'), 5)
n_switch1 += 1
# Uppercase letter while in lowercase mode: mirror image of the above.
elif isupper and mode == 'lowercase':
if isnextupper:
be.bits(SWITCH, 5)
be.bits(ord(c) - ord('A'), 5)
mode = 'uppercase'
n_switch += 1
else:
be.bits(SWITCH1, 5)
be.bits(ord(c) - ord('A'), 5)
n_switch1 += 1
# Any other character: SEVENBIT escape followed by the raw 7-bit value.
else:
assert(ord(c) >= 0 and ord(c) <= 127)
be.bits(SEVENBIT, 5)
be.bits(ord(c), 7)
n_sevenbit += 1
#print 'sevenbit for: %r' % c
# end marker not necessary, C code knows length from define
res = be.getByteString()
# Summary output (two variants are present in this listing, see the note
# at the top of the function).
print '%d strings, %d bytes of string init data, %d unique bytes in strings, %d maximum string length, %d maximum code point value' % \
(len(strlist), len(res), uniq, maxlen, maxval)
print ('%d strings, %d bytes of string init data, %d maximum string length, ' + \
'encoding: optimal=%d,switch1=%d,switch=%d,sevenbit=%d') % \
(len(strlist), len(res), maxlen, \
n_optimal, n_switch1, n_switch, n_sevenbit)
# Two return statements with different tuple shapes follow; only the
# first one reached can take effect.
return res, maxlen
return res, uniq, maxlen, maxval
if __name__ == '__main__':
parser = optparse.OptionParser()
@ -801,7 +860,7 @@ if __name__ == '__main__':
idx_start_reserved = len(strlist) - num_all_reserved
idx_start_strict_reserved = len(strlist) - num_strict_reserved
strdata, lookuplen, maxlen, maxval = gen_strings_data_bitpacked(strlist)
strdata, maxlen = gen_strings_data_bitpacked(strlist)
# write raw data file
f = open(opts.out_bin, 'wb')
@ -829,7 +888,6 @@ if __name__ == '__main__':
genc.emitLine('extern char duk_strings_data[];') # FIXME: unsigned char?
genc.emitLine('')
genc.emitDefine('DUK_STRDATA_DATA_LENGTH', len(strdata))
genc.emitDefine('DUK_STRDATA_LOOKUP_LENGTH', lookuplen)
genc.emitDefine('DUK_STRDATA_MAX_STRLEN', maxlen)
genc.emitLine('')
idx = 0

Loading…
Cancel
Save