rework genstrings encoding, now more compact and flexible; add some json built-in strings

12 years ago · a440d8761f
2 changed files with 136 additions and 61 deletions
--- a/src/duk_heap_alloc.c
+++ b/src/duk_heap_alloc.c
@ -4,6 +4,14 @@
 #include "duk_internal.h"
 /* constants for built-in string data depacking */
 #define  BITPACK_LETTER_LIMIT  26
 #define  BITPACK_UNDERSCORE    26
 #define  BITPACK_FF            27
 #define  BITPACK_SWITCH1       29
 #define  BITPACK_SWITCH        30
 #define  BITPACK_SEVENBIT      31
 /*
 *  Free a heap object.
 *
@ -192,36 +200,45 @@ void duk_heap_free(duk_heap *heap) {
 static int init_heap_strings(duk_heap *heap) {
 	duk_bitdecoder_ctx bd_ctx;
 	duk_bitdecoder_ctx *bd = &bd_ctx;  /* convenience */
 	duk_u8 lookup[DUK_STRDATA_LOOKUP_LENGTH];
 	int i, j;
 	memset(&bd_ctx, 0, sizeof(bd_ctx));
 	bd->data = (duk_u8 *) duk_strings_data;
 	bd->length = DUK_STRDATA_DATA_LENGTH;
 	for (i = 0; i < DUK_STRDATA_LOOKUP_LENGTH; i++) {
 		lookup[i] = duk_bd_decode(bd, 7);
 	}
 	for (i = 0; i < DUK_HEAP_NUM_STRINGS; i++) {
 		int len;
 		duk_u8 tmp[DUK_STRDATA_MAX_STRLEN];
 		duk_hstring *h;
 		int len;
 		int mode;
 		int t;
 		len = duk_bd_decode(bd, 5);
 		mode = 32;		/* 0 = uppercase, 32 = lowercase (= 'a' - 'A') */
 		for (j = 0; j < len; j++) {
-			duk_u8 ch = lookup[duk_bd_decode(bd, 6)];
+			t = duk_bd_decode(bd, 5);
-
+			if (t < BITPACK_LETTER_LIMIT) {
-			/*
+				t = t + 'A' + mode;
-			 *  Internal keys are prefixed with 0xFF in the stringtable
+			} else if (t == BITPACK_UNDERSCORE) {
-			 *  (which makes them invalid UTF-8 on purpose).  The internal
+				t = (int) '_';
-			 *  marker in init data is 0x00 for technical reasons.
+			} else if (t == BITPACK_FF) {
-			 */
+				/* Internal keys are prefixed with 0xFF in the stringtable
-			if (ch == 0x00) {
+				 * (which makes them invalid UTF-8 on purpose).
-				/* 0xFF can never occur in valid UTF-8 */
+				 */
-				ch = 0xff;
+				t = (int) 0xff;
 			} else if (t == BITPACK_SWITCH1) {
 				t = duk_bd_decode(bd, 5);
 				DUK_ASSERT(t >= 0 && t <= 25);
 				t = t + 'A' + (mode ^ 32);
 			} else if (t == BITPACK_SWITCH) {
 				mode = mode ^ 32;
 				t = duk_bd_decode(bd, 5);
 				DUK_ASSERT(t >= 0 && t <= 25);
 				t = t + 'A' + mode;
 			} else if (t == BITPACK_SEVENBIT) {
 				t = duk_bd_decode(bd, 7);
 			}
-			tmp[j] = ch;
+			tmp[j] = (duk_u8) t;
 		}
 		DUK_DDDPRINT("intern built-in string %d", i);
--- a/src/genstrings.py
+++ b/src/genstrings.py
@ -511,6 +511,12 @@ duk_string_list = [
 	mkstr("dec", custom=True),
 	mkstr("hex", custom=True),      # enc/dec alg
 	mkstr("base64", custom=True),   # enc/dec alg
 	# special literals for custom compatible json encoding
 	mkstr('{"_undefined":true}'),
 	mkstr('{"_nan":true}'),
 	mkstr('{"_inf":true}'),
 	mkstr('{"_ninf":true}'),
 ]
 # Standard reserved words (non-strict mode + strict mode)
@ -646,6 +652,11 @@ special_define_names = {
 	'': 'EMPTY_STRING',
 	',': 'COMMA',
 	' ': 'SPACE',
 	'{"_undefined":true}': 'JSON_EXT_UNDEFINED',
 	'{"_nan":true}': 'JSON_EXT_NAN',
 	'{"_inf":true}': 'JSON_EXT_POSINF',
 	'{"_ninf":true}': 'JSON_EXT_NEGINF',
 }
 #
@ -680,57 +691,105 @@ def get_define_name(x):
 def gen_strings_data_bitpacked(strlist):
 	be = dukutil.BitEncoder()
-	freq = [0] * 256
+	# Strings are encoded as follows: a string begins in lowercase
 	# mode and recognizes the following 5-bit symbols:
 	#
 	#    0-25    'a' ... 'z'
 	#    26	     '_'
 	#    27      0x00 (actually decoded to 0xff, internal marker)
 	#    28	     reserved
 	#    29      switch to uppercase for one character
 	#            (next 5-bit symbol must be in range 0-25)
 	#    30      switch to uppercase (stays until switched back)
 	#            (next 5-bit symbol must be in range 0-25)
 	#    31      read a 7-bit character verbatim
 	#
 	# Uppercase mode is the same except that codes 0-25 map to
 	# 'A' ... 'Z' and codes 29 and 30 switch to lowercase.
 	UNDERSCORE = 26
 	ZERO = 27
 	SWITCH1 = 29
 	SWITCH = 30
 	SEVENBIT = 31
 	maxlen = 0
-	maxval = 0
+	n_optimal = 0
 	n_switch1 = 0
 	n_switch = 0
 	n_sevenbit = 0
 	for s, d in strlist:
-		for c in s:
+		be.bits(len(s), 5)
-			freq[ord(c)] += 1
+
 		if len(s) > maxlen:
 			maxlen = len(s)
-		for c in s:
+
-			if ord(c) > maxval:
+		# 5-bit character, mode specific
-				maxval = ord(c)
+		mode = 'lowercase'
-
+
-	lookup = []
+		for idx, c in enumerate(s):
-	invlookup = [0] * 256
+			# FIXME: this is not an optimal encoder but good enough
-	for i in xrange(256):
+
-		if freq[i] != 0:
+			islower = (ord(c) >= ord('a') and ord(c) <= ord('z'))
-			lookup.append(i)
+			isupper = (ord(c) >= ord('A') and ord(c) <= ord('Z'))
-	for i in xrange(len(lookup)):
+			islast = (idx == len(s) - 1)
-		x = lookup[i]
+			isnextlower = False
-		invlookup[x] = i
+			isnextupper = False
-
+			if not islast:
-	uniq = len(lookup)
+				c2 = s[idx+1]
-
+				isnextlower = (ord(c2) >= ord('a') and ord(c2) <= ord('z'))
-	if uniq > 64:
+				isnextupper = (ord(c2) >= ord('A') and ord(c2) <= ord('Z'))
-		raise Exception('too many unique characters for current assumptions')
+
-	if maxlen > 31:
+			if c == '_':
-		raise Exception('string too long for current assumptions')
+				be.bits(UNDERSCORE, 5)
-	if maxval > 127:
+				n_optimal += 1
-		raise Exception('string maxval too high for current assumptions')
+			elif c == '\x00':
-
+				be.bits(ZERO, 5)
-        databits = []
+				n_optimal += 1
-
+			elif islower and mode == 'lowercase':
-	# lookup table for chars (6 bits -> 7 bit value)
+				be.bits(ord(c) - ord('a'), 5)
-	# XXX: can halve by encoding first value and then 3-bit skips,
+				n_optimal += 1
-	# but net benefit maybe 20 bytes.
+			elif isupper and mode == 'uppercase':
-	for i in xrange(uniq):
+				be.bits(ord(c) - ord('A'), 5)
-		be.bits(lookup[i], 7)
+				n_optimal += 1
-
+			elif islower and mode == 'uppercase':
-	# strings: 5-bit length, N*6-bit characters
+				if isnextlower:
-	for s, d in strlist:
+					be.bits(SWITCH, 5)
-		be.bits(len(s), 5)
+					be.bits(ord(c) - ord('a'), 5)
-		for c in s:
+					mode = 'lowercase'
-			be.bits(invlookup[ord(c)], 6)
+					n_switch += 1
 				else:
 					be.bits(SWITCH1, 5)
 					be.bits(ord(c) - ord('a'), 5)
 					n_switch1 += 1
 			elif isupper and mode == 'lowercase':
 				if isnextupper:
 					be.bits(SWITCH, 5)
 					be.bits(ord(c) - ord('A'), 5)
 					mode = 'uppercase'
 					n_switch += 1
 				else:
 					be.bits(SWITCH1, 5)
 					be.bits(ord(c) - ord('A'), 5)
 					n_switch1 += 1
 			else:
 				assert(ord(c) >= 0 and ord(c) <= 127)
 				be.bits(SEVENBIT, 5)
 				be.bits(ord(c), 7)
 				n_sevenbit += 1
 				#print 'sevenbit for: %r' % c
 	# end marker not necessary, C code knows length from define
 	res = be.getByteString()
-	print '%d strings, %d bytes of string init data, %d unique bytes in strings, %d maximum string length, %d maximum code point value' % \
+	print ('%d strings, %d bytes of string init data, %d maximum string length, ' + \
-		(len(strlist), len(res), uniq, maxlen, maxval)
+	       'encoding: optimal=%d,switch1=%d,switch=%d,sevenbit=%d') % \
 		(len(strlist), len(res), maxlen, \
 	         n_optimal, n_switch1, n_switch, n_sevenbit)
 	return res, maxlen
 	return res, uniq, maxlen, maxval
 if __name__ == '__main__':
 	parser = optparse.OptionParser()
@ -801,7 +860,7 @@ if __name__ == '__main__':
 	idx_start_reserved = len(strlist) - num_all_reserved
 	idx_start_strict_reserved = len(strlist) - num_strict_reserved
-	strdata, lookuplen, maxlen, maxval = gen_strings_data_bitpacked(strlist)
+	strdata, maxlen = gen_strings_data_bitpacked(strlist)
 	# write raw data file
 	f = open(opts.out_bin, 'wb')
@ -829,7 +888,6 @@ if __name__ == '__main__':
 	genc.emitLine('extern char duk_strings_data[];')  # FIXME: unsigned char?
 	genc.emitLine('')
 	genc.emitDefine('DUK_STRDATA_DATA_LENGTH', len(strdata))
 	genc.emitDefine('DUK_STRDATA_LOOKUP_LENGTH', lookuplen)
 	genc.emitDefine('DUK_STRDATA_MAX_STRLEN', maxlen)
 	genc.emitLine('')
 	idx = 0