Browse Source

Rework genstrings encoding to be more compact and flexible; add some JSON built-in strings

pull/1/head
Sami Vaarala 12 years ago
parent
commit
a440d8761f
  1. 51
      src/duk_heap_alloc.c
  2. 146
      src/genstrings.py

51
src/duk_heap_alloc.c

@ -4,6 +4,14 @@
#include "duk_internal.h" #include "duk_internal.h"
/* constants for built-in string data depacking */
#define BITPACK_LETTER_LIMIT 26
#define BITPACK_UNDERSCORE 26
#define BITPACK_FF 27
#define BITPACK_SWITCH1 29
#define BITPACK_SWITCH 30
#define BITPACK_SEVENBIT 31
/* /*
* Free a heap object. * Free a heap object.
* *
@ -192,36 +200,45 @@ void duk_heap_free(duk_heap *heap) {
static int init_heap_strings(duk_heap *heap) { static int init_heap_strings(duk_heap *heap) {
duk_bitdecoder_ctx bd_ctx; duk_bitdecoder_ctx bd_ctx;
duk_bitdecoder_ctx *bd = &bd_ctx; /* convenience */ duk_bitdecoder_ctx *bd = &bd_ctx; /* convenience */
duk_u8 lookup[DUK_STRDATA_LOOKUP_LENGTH];
int i, j; int i, j;
memset(&bd_ctx, 0, sizeof(bd_ctx)); memset(&bd_ctx, 0, sizeof(bd_ctx));
bd->data = (duk_u8 *) duk_strings_data; bd->data = (duk_u8 *) duk_strings_data;
bd->length = DUK_STRDATA_DATA_LENGTH; bd->length = DUK_STRDATA_DATA_LENGTH;
for (i = 0; i < DUK_STRDATA_LOOKUP_LENGTH; i++) {
lookup[i] = duk_bd_decode(bd, 7);
}
for (i = 0; i < DUK_HEAP_NUM_STRINGS; i++) { for (i = 0; i < DUK_HEAP_NUM_STRINGS; i++) {
int len;
duk_u8 tmp[DUK_STRDATA_MAX_STRLEN]; duk_u8 tmp[DUK_STRDATA_MAX_STRLEN];
duk_hstring *h; duk_hstring *h;
int len;
int mode;
int t;
len = duk_bd_decode(bd, 5); len = duk_bd_decode(bd, 5);
mode = 32; /* 0 = uppercase, 32 = lowercase (= 'a' - 'A') */
for (j = 0; j < len; j++) { for (j = 0; j < len; j++) {
duk_u8 ch = lookup[duk_bd_decode(bd, 6)]; t = duk_bd_decode(bd, 5);
if (t < BITPACK_LETTER_LIMIT) {
/* t = t + 'A' + mode;
* Internal keys are prefixed with 0xFF in the stringtable } else if (t == BITPACK_UNDERSCORE) {
* (which makes them invalid UTF-8 on purpose). The internal t = (int) '_';
* marker in init data is 0x00 for technical reasons. } else if (t == BITPACK_FF) {
*/ /* Internal keys are prefixed with 0xFF in the stringtable
if (ch == 0x00) { * (which makes them invalid UTF-8 on purpose).
/* 0xFF can never occur in valid UTF-8 */ */
ch = 0xff; t = (int) 0xff;
} else if (t == BITPACK_SWITCH1) {
t = duk_bd_decode(bd, 5);
DUK_ASSERT(t >= 0 && t <= 25);
t = t + 'A' + (mode ^ 32);
} else if (t == BITPACK_SWITCH) {
mode = mode ^ 32;
t = duk_bd_decode(bd, 5);
DUK_ASSERT(t >= 0 && t <= 25);
t = t + 'A' + mode;
} else if (t == BITPACK_SEVENBIT) {
t = duk_bd_decode(bd, 7);
} }
tmp[j] = ch; tmp[j] = (duk_u8) t;
} }
DUK_DDDPRINT("intern built-in string %d", i); DUK_DDDPRINT("intern built-in string %d", i);

146
src/genstrings.py

@ -511,6 +511,12 @@ duk_string_list = [
mkstr("dec", custom=True), mkstr("dec", custom=True),
mkstr("hex", custom=True), # enc/dec alg mkstr("hex", custom=True), # enc/dec alg
mkstr("base64", custom=True), # enc/dec alg mkstr("base64", custom=True), # enc/dec alg
# special literals for custom compatible json encoding
mkstr('{"_undefined":true}'),
mkstr('{"_nan":true}'),
mkstr('{"_inf":true}'),
mkstr('{"_ninf":true}'),
] ]
# Standard reserved words (non-strict mode + strict mode) # Standard reserved words (non-strict mode + strict mode)
@ -646,6 +652,11 @@ special_define_names = {
'': 'EMPTY_STRING', '': 'EMPTY_STRING',
',': 'COMMA', ',': 'COMMA',
' ': 'SPACE', ' ': 'SPACE',
'{"_undefined":true}': 'JSON_EXT_UNDEFINED',
'{"_nan":true}': 'JSON_EXT_NAN',
'{"_inf":true}': 'JSON_EXT_POSINF',
'{"_ninf":true}': 'JSON_EXT_NEGINF',
} }
# #
@ -680,57 +691,105 @@ def get_define_name(x):
def gen_strings_data_bitpacked(strlist): def gen_strings_data_bitpacked(strlist):
be = dukutil.BitEncoder() be = dukutil.BitEncoder()
freq = [0] * 256 # Strings are encoded as follows: a string begins in lowercase
# mode and recognizes the following 5-bit symbols:
#
# 0-25 'a' ... 'z'
# 26 '_'
# 27 0x00 (actually decoded to 0xff, internal marker)
# 28 reserved
# 29 switch to uppercase for one character
# (next 5-bit symbol must be in range 0-25)
# 30 switch to uppercase for this and all following characters
# (next 5-bit symbol must be in range 0-25)
# 31 read a 7-bit character verbatim
#
# Uppercase mode is the same except codes 29 and 30 switch to
# lowercase.
UNDERSCORE = 26
ZERO = 27
SWITCH1 = 29
SWITCH = 30
SEVENBIT = 31
maxlen = 0 maxlen = 0
maxval = 0 n_optimal = 0
n_switch1 = 0
n_switch = 0
n_sevenbit = 0
for s, d in strlist: for s, d in strlist:
for c in s: be.bits(len(s), 5)
freq[ord(c)] += 1
if len(s) > maxlen: if len(s) > maxlen:
maxlen = len(s) maxlen = len(s)
for c in s:
if ord(c) > maxval: # 5-bit character, mode specific
maxval = ord(c) mode = 'lowercase'
lookup = [] for idx, c in enumerate(s):
invlookup = [0] * 256 # FIXME: this is not an optimal encoder but good enough
for i in xrange(256):
if freq[i] != 0: islower = (ord(c) >= ord('a') and ord(c) <= ord('z'))
lookup.append(i) isupper = (ord(c) >= ord('A') and ord(c) <= ord('Z'))
for i in xrange(len(lookup)): islast = (idx == len(s) - 1)
x = lookup[i] isnextlower = False
invlookup[x] = i isnextupper = False
if not islast:
uniq = len(lookup) c2 = s[idx+1]
isnextlower = (ord(c2) >= ord('a') and ord(c2) <= ord('z'))
if uniq > 64: isnextupper = (ord(c2) >= ord('A') and ord(c2) <= ord('Z'))
raise Exception('too many unique characters for current assumptions')
if maxlen > 31: if c == '_':
raise Exception('string too long for current assumptions') be.bits(UNDERSCORE, 5)
if maxval > 127: n_optimal += 1
raise Exception('string maxval too high for current assumptions') elif c == '\x00':
be.bits(ZERO, 5)
databits = [] n_optimal += 1
elif islower and mode == 'lowercase':
# lookup table for chars (6 bits -> 7 bit value) be.bits(ord(c) - ord('a'), 5)
# XXX: can halve by encoding first value and then 3-bit skips, n_optimal += 1
# but net benefit maybe 20 bytes. elif isupper and mode == 'uppercase':
for i in xrange(uniq): be.bits(ord(c) - ord('A'), 5)
be.bits(lookup[i], 7) n_optimal += 1
elif islower and mode == 'uppercase':
# strings: 5-bit length, N*6-bit characters if isnextlower:
for s, d in strlist: be.bits(SWITCH, 5)
be.bits(len(s), 5) be.bits(ord(c) - ord('a'), 5)
for c in s: mode = 'lowercase'
be.bits(invlookup[ord(c)], 6) n_switch += 1
else:
be.bits(SWITCH1, 5)
be.bits(ord(c) - ord('a'), 5)
n_switch1 += 1
elif isupper and mode == 'lowercase':
if isnextupper:
be.bits(SWITCH, 5)
be.bits(ord(c) - ord('A'), 5)
mode = 'uppercase'
n_switch += 1
else:
be.bits(SWITCH1, 5)
be.bits(ord(c) - ord('A'), 5)
n_switch1 += 1
else:
assert(ord(c) >= 0 and ord(c) <= 127)
be.bits(SEVENBIT, 5)
be.bits(ord(c), 7)
n_sevenbit += 1
#print 'sevenbit for: %r' % c
# end marker not necessary, C code knows length from define # end marker not necessary, C code knows length from define
res = be.getByteString() res = be.getByteString()
print '%d strings, %d bytes of string init data, %d unique bytes in strings, %d maximum string length, %d maximum code point value' % \ print ('%d strings, %d bytes of string init data, %d maximum string length, ' + \
(len(strlist), len(res), uniq, maxlen, maxval) 'encoding: optimal=%d,switch1=%d,switch=%d,sevenbit=%d') % \
(len(strlist), len(res), maxlen, \
n_optimal, n_switch1, n_switch, n_sevenbit)
return res, maxlen
return res, uniq, maxlen, maxval
if __name__ == '__main__': if __name__ == '__main__':
parser = optparse.OptionParser() parser = optparse.OptionParser()
@ -801,7 +860,7 @@ if __name__ == '__main__':
idx_start_reserved = len(strlist) - num_all_reserved idx_start_reserved = len(strlist) - num_all_reserved
idx_start_strict_reserved = len(strlist) - num_strict_reserved idx_start_strict_reserved = len(strlist) - num_strict_reserved
strdata, lookuplen, maxlen, maxval = gen_strings_data_bitpacked(strlist) strdata, maxlen = gen_strings_data_bitpacked(strlist)
# write raw data file # write raw data file
f = open(opts.out_bin, 'wb') f = open(opts.out_bin, 'wb')
@ -829,7 +888,6 @@ if __name__ == '__main__':
genc.emitLine('extern char duk_strings_data[];') # FIXME: unsigned char? genc.emitLine('extern char duk_strings_data[];') # FIXME: unsigned char?
genc.emitLine('') genc.emitLine('')
genc.emitDefine('DUK_STRDATA_DATA_LENGTH', len(strdata)) genc.emitDefine('DUK_STRDATA_DATA_LENGTH', len(strdata))
genc.emitDefine('DUK_STRDATA_LOOKUP_LENGTH', lookuplen)
genc.emitDefine('DUK_STRDATA_MAX_STRLEN', maxlen) genc.emitDefine('DUK_STRDATA_MAX_STRLEN', maxlen)
genc.emitLine('') genc.emitLine('')
idx = 0 idx = 0

Loading…
Cancel
Save