rework genstrings encoding, now more compact and flexible; add some json built-in strings

12 years ago · a440d8761f
2 changed files with 136 additions and 61 deletions
--- a/src/duk_heap_alloc.c
+++ b/src/duk_heap_alloc.c
@ -4,6 +4,14 @@

 #include "duk_internal.h"

+/* constants for built-in string data depacking */
+#define  BITPACK_LETTER_LIMIT  26
+#define  BITPACK_UNDERSCORE    26
+#define  BITPACK_FF            27
+#define  BITPACK_SWITCH1       29
+#define  BITPACK_SWITCH        30
+#define  BITPACK_SEVENBIT      31
+
 /*
 *  Free a heap object.
 *
@ -192,36 +200,45 @@ void duk_heap_free(duk_heap *heap) {
 static int init_heap_strings(duk_heap *heap) {
 	duk_bitdecoder_ctx bd_ctx;
 	duk_bitdecoder_ctx *bd = &bd_ctx;  /* convenience */
-	duk_u8 lookup[DUK_STRDATA_LOOKUP_LENGTH];
 	int i, j;

 	memset(&bd_ctx, 0, sizeof(bd_ctx));
 	bd->data = (duk_u8 *) duk_strings_data;
 	bd->length = DUK_STRDATA_DATA_LENGTH;

-	for (i = 0; i < DUK_STRDATA_LOOKUP_LENGTH; i++) {
-		lookup[i] = duk_bd_decode(bd, 7);
-	}
-
 	for (i = 0; i < DUK_HEAP_NUM_STRINGS; i++) {
-		int len;
 		duk_u8 tmp[DUK_STRDATA_MAX_STRLEN];
 		duk_hstring *h;
+		int len;
+		int mode;
+		int t;

 		len = duk_bd_decode(bd, 5);
+		mode = 32;		/* 0 = uppercase, 32 = lowercase (= 'a' - 'A') */
 		for (j = 0; j < len; j++) {
-			duk_u8 ch = lookup[duk_bd_decode(bd, 6)];
-
-			/*
-			 *  Internal keys are prefixed with 0xFF in the stringtable
-			 *  (which makes them invalid UTF-8 on purpose).  The internal
-			 *  marker in init data is 0x00 for technical reasons.
-			 */
-			if (ch == 0x00) {
-				/* 0xFF can never occur in valid UTF-8 */
-				ch = 0xff;
+			t = duk_bd_decode(bd, 5);
+			if (t < BITPACK_LETTER_LIMIT) {
+				t = t + 'A' + mode;
+			} else if (t == BITPACK_UNDERSCORE) {
+				t = (int) '_';
+			} else if (t == BITPACK_FF) {
+				/* Internal keys are prefixed with 0xFF in the stringtable
+				 * (which makes them invalid UTF-8 on purpose).
+				 */
+				t = (int) 0xff;
+			} else if (t == BITPACK_SWITCH1) {
+				t = duk_bd_decode(bd, 5);
+				DUK_ASSERT(t >= 0 && t <= 25);
+				t = t + 'A' + (mode ^ 32);
+			} else if (t == BITPACK_SWITCH) {
+				mode = mode ^ 32;
+				t = duk_bd_decode(bd, 5);
+				DUK_ASSERT(t >= 0 && t <= 25);
+				t = t + 'A' + mode;
+			} else if (t == BITPACK_SEVENBIT) {
+				t = duk_bd_decode(bd, 7);
 			}
-			tmp[j] = ch;
+			tmp[j] = (duk_u8) t;
 		}

 		DUK_DDDPRINT("intern built-in string %d", i);
--- a/src/genstrings.py
+++ b/src/genstrings.py
@ -511,6 +511,12 @@ duk_string_list = [
 	mkstr("dec", custom=True),
 	mkstr("hex", custom=True),      # enc/dec alg
 	mkstr("base64", custom=True),   # enc/dec alg
+
+	# special literals for custom compatible json encoding
+	mkstr('{"_undefined":true}'),
+	mkstr('{"_nan":true}'),
+	mkstr('{"_inf":true}'),
+	mkstr('{"_ninf":true}'),
 ]

 # Standard reserved words (non-strict mode + strict mode)
@ -646,6 +652,11 @@ special_define_names = {
 	'': 'EMPTY_STRING',
 	',': 'COMMA',
 	' ': 'SPACE',
+
+	'{"_undefined":true}': 'JSON_EXT_UNDEFINED',
+	'{"_nan":true}': 'JSON_EXT_NAN',
+	'{"_inf":true}': 'JSON_EXT_POSINF',
+	'{"_ninf":true}': 'JSON_EXT_NEGINF',
 }

 #
@ -680,57 +691,105 @@ def get_define_name(x):
 def gen_strings_data_bitpacked(strlist):
 	be = dukutil.BitEncoder()

-	freq = [0] * 256
+	# Strings are encoded as follows: a string begins in lowercase
+	# mode and recognizes the following 5-bit symbols:
+	#
+	#    0-25    'a' ... 'z'
+	#    26	     '_'
+	#    27      0x00 (actually decoded to 0xff, internal marker)
+	#    28	     reserved
+	#    29      switch to uppercase for one character
+	#            (next 5-bit symbol must be in range 0-25)
+	#    30      switch to uppercase
+	#    31      read a 7-bit character verbatim
+	#
+	# Uppercase mode is the same except codes 29 and 30 switch to
+	# lowercase.
+
+	UNDERSCORE = 26
+	ZERO = 27
+	SWITCH1 = 29
+	SWITCH = 30
+	SEVENBIT = 31
+
 	maxlen = 0
-	maxval = 0
+	n_optimal = 0
+	n_switch1 = 0
+	n_switch = 0
+	n_sevenbit = 0
+
 	for s, d in strlist:
-		for c in s:
-			freq[ord(c)] += 1
+		be.bits(len(s), 5)
+
 		if len(s) > maxlen:
 			maxlen = len(s)
-		for c in s:
-			if ord(c) > maxval:
-				maxval = ord(c)
-
-	lookup = []
-	invlookup = [0] * 256
-	for i in xrange(256):
-		if freq[i] != 0:
-			lookup.append(i)
-	for i in xrange(len(lookup)):
-		x = lookup[i]
-		invlookup[x] = i
-
-	uniq = len(lookup)
-
-	if uniq > 64:
-		raise Exception('too many unique characters for current assumptions')
-	if maxlen > 31:
-		raise Exception('string too long for current assumptions')
-	if maxval > 127:
-		raise Exception('string maxval too high for current assumptions')
-
-        databits = []
-
-	# lookup table for chars (6 bits -> 7 bit value)
-	# XXX: can halve by encoding first value and then 3-bit skips,
-	# but net benefit maybe 20 bytes.
-	for i in xrange(uniq):
-		be.bits(lookup[i], 7)
-
-	# strings: 5-bit length, N*6-bit characters
-	for s, d in strlist:
-		be.bits(len(s), 5)
-		for c in s:
-			be.bits(invlookup[ord(c)], 6)
+
+		# 5-bit character, mode specific
+		mode = 'lowercase'
+
+		for idx, c in enumerate(s):
+			# FIXME: this is not an optimal encoder but good enough
+
+			islower = (ord(c) >= ord('a') and ord(c) <= ord('z'))
+			isupper = (ord(c) >= ord('A') and ord(c) <= ord('Z'))
+			islast = (idx == len(s) - 1)
+			isnextlower = False
+			isnextupper = False
+			if not islast:
+				c2 = s[idx+1]
+				isnextlower = (ord(c2) >= ord('a') and ord(c2) <= ord('z'))
+				isnextupper = (ord(c2) >= ord('A') and ord(c2) <= ord('Z'))
+
+			if c == '_':
+				be.bits(UNDERSCORE, 5)
+				n_optimal += 1
+			elif c == '\x00':
+				be.bits(ZERO, 5)
+				n_optimal += 1
+			elif islower and mode == 'lowercase':
+				be.bits(ord(c) - ord('a'), 5)
+				n_optimal += 1
+			elif isupper and mode == 'uppercase':
+				be.bits(ord(c) - ord('A'), 5)
+				n_optimal += 1
+			elif islower and mode == 'uppercase':
+				if isnextlower:
+					be.bits(SWITCH, 5)
+					be.bits(ord(c) - ord('a'), 5)
+					mode = 'lowercase'
+					n_switch += 1
+				else:
+					be.bits(SWITCH1, 5)
+					be.bits(ord(c) - ord('a'), 5)
+					n_switch1 += 1
+			elif isupper and mode == 'lowercase':
+				if isnextupper:
+					be.bits(SWITCH, 5)
+					be.bits(ord(c) - ord('A'), 5)
+					mode = 'uppercase'
+					n_switch += 1
+				else:
+					be.bits(SWITCH1, 5)
+					be.bits(ord(c) - ord('A'), 5)
+					n_switch1 += 1
+			else:
+				assert(ord(c) >= 0 and ord(c) <= 127)
+				be.bits(SEVENBIT, 5)
+				be.bits(ord(c), 7)
+				n_sevenbit += 1
+				#print 'sevenbit for: %r' % c
+
 	# end marker not necessary, C code knows length from define

 	res = be.getByteString()

-	print '%d strings, %d bytes of string init data, %d unique bytes in strings, %d maximum string length, %d maximum code point value' % \
-		(len(strlist), len(res), uniq, maxlen, maxval)
+	print ('%d strings, %d bytes of string init data, %d maximum string length, ' + \
+	       'encoding: optimal=%d,switch1=%d,switch=%d,sevenbit=%d') % \
+		(len(strlist), len(res), maxlen, \
+	         n_optimal, n_switch1, n_switch, n_sevenbit)
+
+	return res, maxlen

-	return res, uniq, maxlen, maxval

 if __name__ == '__main__':
 	parser = optparse.OptionParser()
@ -801,7 +860,7 @@ if __name__ == '__main__':
 	idx_start_reserved = len(strlist) - num_all_reserved
 	idx_start_strict_reserved = len(strlist) - num_strict_reserved

-	strdata, lookuplen, maxlen, maxval = gen_strings_data_bitpacked(strlist)
+	strdata, maxlen = gen_strings_data_bitpacked(strlist)

 	# write raw data file
 	f = open(opts.out_bin, 'wb')
@ -829,7 +888,6 @@ if __name__ == '__main__':
 	genc.emitLine('extern char duk_strings_data[];')  # FIXME: unsigned char?
 	genc.emitLine('')
 	genc.emitDefine('DUK_STRDATA_DATA_LENGTH', len(strdata))
-	genc.emitDefine('DUK_STRDATA_LOOKUP_LENGTH', lookuplen)
 	genc.emitDefine('DUK_STRDATA_MAX_STRLEN', maxlen)
 	genc.emitLine('')
 	idx = 0