duktape/src-input/duk_heap_hashstring.c

/*
 *  String hash computation (interning).
 *
 *  String hashing is performance critical because a string hash is computed
 *  for all new strings which are candidates to be added to the string table.
 *  However, strings actually added to the string table go through a codepoint
 *  length calculation which dominates performance because it goes through
 *  every byte of the input string (but only for strings added).
 *
 *  The string hash algorithm should be fast, but on the other hand provide
 *  good enough hashes to ensure both string table and object property table
 *  hash tables work reasonably well (i.e., there aren't too many collisions
 *  with real world inputs).  Unless the hash is cryptographic, it's always
 *  possible to craft inputs with maximal hash collisions.
 *
 *  NOTE: The hash algorithms must match tools/dukutil.py:duk_heap_hashstring()
 *  for ROM string support!
 */

#include "duk_internal.h"

#if defined(DUK_USE_STRHASH_DENSE)
/* Constants for the dense variant of duk_heap_hashstring().  All values
 * are byte counts.
 */

/* Strings of at most this many bytes are hashed in full.  For longer
 * strings this is also the length of the initial prefix that is always
 * hashed before sampling starts.
 */
#define DUK__STRHASH_SHORTSTRING   4096L

/* Strings longer than DUK__STRHASH_SHORTSTRING but at most this long are
 * sampled with the smaller skip (17 blocks); longer strings use the larger
 * skip (257 blocks).
 */
#define DUK__STRHASH_MEDIUMSTRING  (256L * 1024L)

/* Maximum number of bytes hashed per sampled chunk, and the unit in which
 * the skip distance between chunks is expressed.
 */
#define DUK__STRHASH_BLOCKSIZE     256L

DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t seed;
	duk_uint32_t result;

	/* Dense string hash: compute a 32-bit hash (16-bit when
	 * DUK_USE_STRHASH16 is defined) for the 'len' bytes at 'str', seeded
	 * from heap->hash_seed.
	 *
	 * Strings up to DUK__STRHASH_SHORTSTRING bytes are passed to
	 * duk_util_hashbytes() as is (Murmurhash2 according to the original
	 * notes for this function).  Longer strings are hashed using "block
	 * skipping": the first DUK__STRHASH_SHORTSTRING bytes are hashed,
	 * and the remainder is sampled in chunks of at most
	 * DUK__STRHASH_BLOCKSIZE bytes placed 'stride' bytes apart; chunk
	 * hashes are XORed into the result.  The first sampling offset is
	 * derived from the prefix hash so that (usually) different long
	 * strings don't all leave the same offset ranges unsampled.
	 *
	 * The stride depends on the length to bound the hashing work.  With
	 * the current constants (in addition to the 4096 byte prefix):
	 *
	 *   1M string => at most 16 * 256 = 4096 bytes sampled
	 *   1G string => at most 16321 * 256 = 4178176 bytes (3.98M) sampled
	 *
	 * XXX: It would be better to compute the stride more "smoothly"
	 * instead of having a few boundary values.
	 */

	/* Mixing the length into the seed improves hashing when skipping. */
	seed = heap->hash_seed ^ ((duk_uint32_t) len);

	if (len > DUK__STRHASH_SHORTSTRING) {
		duk_size_t stride;
		duk_size_t pos;

		/* Default to the long string stride, shorten it for medium
		 * length strings.
		 */
		stride = (duk_size_t) (256 * DUK__STRHASH_BLOCKSIZE + DUK__STRHASH_BLOCKSIZE);
		if (len <= DUK__STRHASH_MEDIUMSTRING) {
			stride = (duk_size_t) (16 * DUK__STRHASH_BLOCKSIZE + DUK__STRHASH_BLOCKSIZE);
		}

		/* The prefix is always hashed in full. */
		result = duk_util_hashbytes(str, (duk_size_t) DUK__STRHASH_SHORTSTRING, seed);

		/* The low 8 bits of the prefix hash choose a starting offset
		 * within the first stride after the prefix.
		 *
		 * XXX: inefficient loop
		 */
		for (pos = DUK__STRHASH_SHORTSTRING + (stride * (result % 256)) / 256; pos < len; pos += stride) {
			duk_size_t chunk = len - pos;

			/* Last chunk may be shorter than a full block. */
			if (chunk > DUK__STRHASH_BLOCKSIZE) {
				chunk = (duk_size_t) DUK__STRHASH_BLOCKSIZE;
			}
			result ^= duk_util_hashbytes(str + pos, chunk, seed);
		}
	} else {
		result = duk_util_hashbytes(str, len, seed);
	}

#if defined(DUK_USE_STRHASH16)
	/* Keep only the low 16 bits so that the computed value can be
	 * compared directly against a hash stored in a 16-bit field.
	 */
	result &= 0x0000ffffUL;
#endif
	return result;
}
#else  /* DUK_USE_STRHASH_DENSE */
DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t acc;
	duk_size_t stride;
	duk_size_t pos;

	/* Sparse string hash: compute a 32-bit hash (16-bit when
	 * DUK_USE_STRHASH16 is defined) for the 'len' bytes at 'str', seeded
	 * from heap->hash_seed.
	 *
	 * This is a slightly modified "Bernstein hash", see:
	 *
	 *     http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
	 *
	 * Differences to the basic algorithm: bytes are sampled with a
	 * length dependent stride and walked from the end of the string
	 * towards the start (similar to Lua 5.1.5), and the initial value is
	 * the heap seed mixed with the length (a plain Bernstein hash
	 * normally starts from 5381).
	 *
	 * Walking backwards ensures the last byte is always included in the
	 * hash, which is a good default because the changing parts of a
	 * string are more often in the suffix than in the prefix.
	 */

	acc = heap->hash_seed ^ ((duk_uint32_t) len);

	/* Stride is always >= 1, so the loop below makes progress and an
	 * empty string simply skips the loop.
	 */
	stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;

	pos = len;
	while (pos >= stride) {
		DUK_ASSERT(pos >= 1);  /* pos >= stride, and stride >= 1 */
		acc = (acc * 33) + str[pos - 1];
		pos -= stride;
	}

#if defined(DUK_USE_STRHASH16)
	/* Keep only the low 16 bits so that the computed value can be
	 * compared directly against a hash stored in a 16-bit field.
	 */
	acc &= 0x0000ffffUL;
#endif
	return acc;
}
#endif  /* DUK_USE_STRHASH_DENSE */
heap related code 12 years ago			`/*`
			`* String hash computation (interning).`
Clean up string hash algorithms * Remove string hash algorithms which weren't chosen * Clean up comments 9 years ago			`*`
			`* String hashing is performance critical because a string hash is computed`
			`* for all new strings which are candidates to be added to the string table.`
			`* However, strings actually added to the string table go through a codepoint`
			`* length calculation which dominates performance because it goes through`
			`* every byte of the input string (but only for strings added).`
			`*`
			`* The string hash algorithm should be fast, but on the other hand provide`
			`* good enough hashes to ensure both string table and object property table`
			`* hash tables work reasonably well (i.e., there aren't too many collisions`
			`* with real world inputs). Unless the hash is cryptographic, it's always`
			`* possible to craft inputs with maximal hash collisions.`
Changes for ROM string/object support 9 years ago			`*`
Fix some source and doc refs to relocated tools 8 years ago			`* NOTE: The hash algorithms must match tools/dukutil.py:duk_heap_hashstring()`
Changes for ROM string/object support 9 years ago			`* for ROM string support!`
heap related code 12 years ago			`*/`

			`#include "duk_internal.h"`

Add string hash replacement candidates Use a string skipping approach similar to Lua 5.1, and try a few byte based string hash algorithms to see what works best in practice. 10 years ago			`#if defined(DUK_USE_STRHASH_DENSE)`
Changes for ROM string/object support 9 years ago			`/* Constants for duk_hashstring(). */`
A round of internal typing fixes (midcommit) 11 years ago			`#define DUK__STRHASH_SHORTSTRING 4096L`
			`#define DUK__STRHASH_MEDIUMSTRING (256L * 1024L)`
			`#define DUK__STRHASH_BLOCKSIZE 256L`
heap related code 12 years ago
Add some missing "const" specifiers for strings 10 years ago			`DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap heap, const duk_uint8_t str, duk_size_t len) {`
16-bit fields and heap pointer compression work Memory optimization work for very low memory devices (96 to 256kB system RAM). Overall changes are: - 16-bit fields for various internal structures to reduce their size - Heap pointer compression to reduce pointer size to 16 bits When DUK_OPT_LIGHTFUNC_BUILTINS and the new low memory options are enabled, Duktape initial heap memory usage is about 23kB (compared to baseline of about 45kB) on x86. Unless low memory feature options are enabled, there should be no visible changes to Duktape behavior. More detailed changes: - 16-bit changes for duk_heaphdr: pointer compression, refcount - 16-bit changes for duk_hstring: hash, blen, and clen can all be 16 bits, use 0xFFFF as string byte length limit (call sites ensure this limit is never exceeded) - 16-bit changes for duk_hbuffer, use 0xFFFF as buffer length limit - 16-bit fields for hobject size (entry part, array part), drop hash part since it's not usually needed for extremely low memory environments - 16-bit changes for duk_hcompiledfunction - Heap pointer packing for stringtable - Heap pointer packing for 'strs' built-in strings list (saves around 600 to 700 bytes but may not be a good tradeoff because call site size will increase) Other changes: - Heaphdr NULL init fix. The original macros were broken: the double/single linked macro variants were the wrong way around. Now sets through macro to work properly with compressed pointers. - Rename duk_hbuffer CURR_DATA_PTR -> DATA_PTR to reduce macro length (previous name was tediously long) - Rename buffer "usable_size" to "alloc_size" throughout as they have been the same for a while now (they used to differ when buffer had an extra NUL). 
- Add memory optimization markers to Duktape.env (pointer compression and individual 16-bit field options) - Rename a few internal fields for clarity: duk_hobject 'p' to 'props', heap->st to heap->strtable - Add a safety check for buffer alloc size (should not be triggered but prevents wrapping if call sites don't properly check for sizes) - Other minor cleanups 10 years ago			`duk_uint32_t hash;`

Clean up string hash algorithms * Remove string hash algorithms which weren't chosen * Clean up comments 9 years ago			`/* Use Murmurhash2 directly for short strings, and use "block skipping"`
			`* for long strings: hash an initial part and then sample the rest of`
			`* the string with reasonably sized chunks. An initial offset for the`
			`* sampling is computed based on a hash of the initial part of the string;`
			`* this is done to (usually) avoid the case where all long strings have`
			`* certain offset ranges which are never sampled.`
heap related code 12 years ago			`*`
Clean up string hash algorithms * Remove string hash algorithms which weren't chosen * Clean up comments 9 years ago			`* Skip should depend on length and bound the total time to roughly`
			`* logarithmic. With current values:`
heap related code 12 years ago			`*`
Clean up string hash algorithms * Remove string hash algorithms which weren't chosen * Clean up comments 9 years ago			`* 1M string => 256 * 241 = 61696 bytes (0.06M) of hashing`
			`* 1G string => 256 * 16321 = 4178176 bytes (3.98M) of hashing`
heap related code 12 years ago			`*`
Clean up string hash algorithms * Remove string hash algorithms which weren't chosen * Clean up comments 9 years ago			`* XXX: It would be better to compute the skip offset more "smoothly"`
			`* instead of having a few boundary values.`
heap related code 12 years ago			`*/`
Code policy issue fix trivia 10 years ago
heap related code 12 years ago			`/* note: mixing len into seed improves hashing when skipping */`
Add a few casts to fix GH-177 10 years ago			`duk_uint32_t str_seed = heap->hash_seed ^ ((duk_uint32_t) len);`
heap related code 12 years ago
internal macro renames, XXX -> DUK__XXX 11 years ago			`if (len <= DUK__STRHASH_SHORTSTRING) {`
16-bit fields and heap pointer compression work Memory optimization work for very low memory devices (96 to 256kB system RAM). Overall changes are: - 16-bit fields for various internal structures to reduce their size - Heap pointer compression to reduce pointer size to 16 bits When DUK_OPT_LIGHTFUNC_BUILTINS and the new low memory options are enabled, Duktape initial heap memory usage is about 23kB (compared to baseline of about 45kB) on x86. Unless low memory feature options are enabled, there should be no visible changes to Duktape behavior. More detailed changes: - 16-bit changes for duk_heaphdr: pointer compression, refcount - 16-bit changes for duk_hstring: hash, blen, and clen can all be 16 bits, use 0xFFFF as string byte length limit (call sites ensure this limit is never exceeded) - 16-bit changes for duk_hbuffer, use 0xFFFF as buffer length limit - 16-bit fields for hobject size (entry part, array part), drop hash part since it's not usually needed for extremely low memory environments - 16-bit changes for duk_hcompiledfunction - Heap pointer packing for stringtable - Heap pointer packing for 'strs' built-in strings list (saves around 600 to 700 bytes but may not be a good tradeoff because call site size will increase) Other changes: - Heaphdr NULL init fix. The original macros were broken: the double/single linked macro variants were the wrong way around. Now sets through macro to work properly with compressed pointers. - Rename duk_hbuffer CURR_DATA_PTR -> DATA_PTR to reduce macro length (previous name was tediously long) - Rename buffer "usable_size" to "alloc_size" throughout as they have been the same for a while now (they used to differ when buffer had an extra NUL). 
- Add memory optimization markers to Duktape.env (pointer compression and individual 16-bit field options) - Rename a few internal fields for clarity: duk_hobject 'p' to 'props', heap->st to heap->strtable - Add a safety check for buffer alloc size (should not be triggered but prevents wrapping if call sites don't properly check for sizes) - Other minor cleanups 10 years ago			`hash = duk_util_hashbytes(str, len, str_seed);`
heap related code 12 years ago			`} else {`
C typing fixes 11 years ago			`duk_size_t off;`
			`duk_size_t skip;`
heap related code 12 years ago
internal macro renames, XXX -> DUK__XXX 11 years ago			`if (len <= DUK__STRHASH_MEDIUMSTRING) {`
			`skip = (duk_size_t) (16 * DUK__STRHASH_BLOCKSIZE + DUK__STRHASH_BLOCKSIZE);`
heap related code 12 years ago			`} else {`
internal macro renames, XXX -> DUK__XXX 11 years ago			`skip = (duk_size_t) (256 * DUK__STRHASH_BLOCKSIZE + DUK__STRHASH_BLOCKSIZE);`
heap related code 12 years ago			`}`

internal macro renames, XXX -> DUK__XXX 11 years ago			`hash = duk_util_hashbytes(str, (duk_size_t) DUK__STRHASH_SHORTSTRING, str_seed);`
			`off = DUK__STRHASH_SHORTSTRING + (skip * (hash % 256)) / 256;`
heap related code 12 years ago
fixme cleanups 11 years ago			`/* XXX: inefficient loop */`
heap related code 12 years ago			`while (off < len) {`
C typing fixes 11 years ago			`duk_size_t left = len - off;`
internal macro renames, XXX -> DUK__XXX 11 years ago			`duk_size_t now = (duk_size_t) (left > DUK__STRHASH_BLOCKSIZE ? DUK__STRHASH_BLOCKSIZE : left);`
heap related code 12 years ago			`hash ^= duk_util_hashbytes(str + off, now, str_seed);`
			`off += skip;`
			`}`
			`}`
16-bit fields and heap pointer compression work Memory optimization work for very low memory devices (96 to 256kB system RAM). Overall changes are: - 16-bit fields for various internal structures to reduce their size - Heap pointer compression to reduce pointer size to 16 bits When DUK_OPT_LIGHTFUNC_BUILTINS and the new low memory options are enabled, Duktape initial heap memory usage is about 23kB (compared to baseline of about 45kB) on x86. Unless low memory feature options are enabled, there should be no visible changes to Duktape behavior. More detailed changes: - 16-bit changes for duk_heaphdr: pointer compression, refcount - 16-bit changes for duk_hstring: hash, blen, and clen can all be 16 bits, use 0xFFFF as string byte length limit (call sites ensure this limit is never exceeded) - 16-bit changes for duk_hbuffer, use 0xFFFF as buffer length limit - 16-bit fields for hobject size (entry part, array part), drop hash part since it's not usually needed for extremely low memory environments - 16-bit changes for duk_hcompiledfunction - Heap pointer packing for stringtable - Heap pointer packing for 'strs' built-in strings list (saves around 600 to 700 bytes but may not be a good tradeoff because call site size will increase) Other changes: - Heaphdr NULL init fix. The original macros were broken: the double/single linked macro variants were the wrong way around. Now sets through macro to work properly with compressed pointers. - Rename duk_hbuffer CURR_DATA_PTR -> DATA_PTR to reduce macro length (previous name was tediously long) - Rename buffer "usable_size" to "alloc_size" throughout as they have been the same for a while now (they used to differ when buffer had an extra NUL). 
- Add memory optimization markers to Duktape.env (pointer compression and individual 16-bit field options) - Rename a few internal fields for clarity: duk_hobject 'p' to 'props', heap->st to heap->strtable - Add a safety check for buffer alloc size (should not be triggered but prevents wrapping if call sites don't properly check for sizes) - Other minor cleanups 10 years ago
			`#if defined(DUK_USE_STRHASH16)`
			`/* Truncate to 16 bits here, so that a computed hash can be compared`
			`* against a hash stored in a 16-bit field.`
			`*/`
			`hash &= 0x0000ffffUL;`
			`#endif`
			`return hash;`
heap related code 12 years ago			`}`
Add string hash replacement candidates Use a string skipping approach similar to Lua 5.1, and try a few byte based string hash algorithms to see what works best in practice. 10 years ago			`#else /* DUK_USE_STRHASH_DENSE */`
Clean up string hash algorithms * Remove string hash algorithms which weren't chosen * Clean up comments 9 years ago			`DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap heap, const duk_uint8_t str, duk_size_t len) {`
Add string hash replacement candidates Use a string skipping approach similar to Lua 5.1, and try a few byte based string hash algorithms to see what works best in practice. 10 years ago			`duk_uint32_t hash;`
			`duk_size_t step;`
			`duk_size_t off;`

Clean up string hash algorithms * Remove string hash algorithms which weren't chosen * Clean up comments 9 years ago			`/* Slightly modified "Bernstein hash" from:`
Add string hash replacement candidates Use a string skipping approach similar to Lua 5.1, and try a few byte based string hash algorithms to see what works best in practice. 10 years ago			`*`
Clean up string hash algorithms * Remove string hash algorithms which weren't chosen * Clean up comments 9 years ago			`* http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx`
			`*`
			`* Modifications: string skipping and reverse direction similar to`
			`* Lua 5.1.5, and different hash initializer.`
			`*`
			`* The reverse direction ensures last byte it always included in the`
			`* hash which is a good default as changing parts of the string are`
			`* more often in the suffix than in the prefix.`
Add string hash replacement candidates Use a string skipping approach similar to Lua 5.1, and try a few byte based string hash algorithms to see what works best in practice. 10 years ago			`*/`

Clean up string hash algorithms * Remove string hash algorithms which weren't chosen * Clean up comments 9 years ago			`hash = heap->hash_seed ^ ((duk_uint32_t) len); /* Bernstein hash init value is normally 5381 */`
Add string hash replacement candidates Use a string skipping approach similar to Lua 5.1, and try a few byte based string hash algorithms to see what works best in practice. 10 years ago			`step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;`
			`for (off = len; off >= step; off -= step) {`
			`DUK_ASSERT(off >= 1); /* off >= step, and step >= 1 */`
			`hash = (hash * 33) + str[off - 1];`
			`}`

			`#if defined(DUK_USE_STRHASH16)`
			`/* Truncate to 16 bits here, so that a computed hash can be compared`
			`* against a hash stored in a 16-bit field.`
			`*/`
			`hash &= 0x0000ffffUL;`
			`#endif`
			`return hash;`
			`}`
			`#endif /* DUK_USE_STRHASH_DENSE */`