Clean up string hash algorithms

* Remove string hash algorithms which weren't chosen
* Clean up comments
9 years ago · 27ebe604b3
2 changed files with 42 additions and 288 deletions
--- a/src/duk_heap_alloc.c
+++ b/src/duk_heap_alloc.c
@ -771,6 +771,9 @@ duk_heap *duk_heap_alloc(duk_alloc_function alloc_func,
 	 */
 	res->hash_seed = (duk_uint32_t) (duk_intptr_t) res;
 	res->rnd_state = (duk_uint32_t) (duk_intptr_t) res;
 #if !defined(DUK_USE_STRHASH_DENSE)
 	res->hash_seed ^= 5381;  /* Bernstein hash init value is normally 5381; XOR it in in case pointer low bits are 0 */
 #endif
 #ifdef DUK_USE_EXPLICIT_NULL_INIT
 	res->lj.jmpbuf_ptr = NULL;
--- a/src/duk_heap_hashstring.c
+++ b/src/duk_heap_hashstring.c
@ -1,11 +1,22 @@
 /*
 *  String hash computation (interning).
 *
 *  String hashing is performance critical because a string hash is computed
 *  for all new strings which are candidates to be added to the string table.
 *  However, strings actually added to the string table go through a codepoint
 *  length calculation which dominates performance because it goes through
 *  every byte of the input string (but only for strings added).
 *
 *  The string hash algorithm should be fast, but on the other hand provide
 *  good enough hashes to ensure both string table and object property table
 *  hash tables work reasonably well (i.e., there aren't too many collisions
 *  with real world inputs).  Unless the hash is cryptographic, it's always
 *  possible to craft inputs with maximal hash collisions.
 */
 #include "duk_internal.h"
 #if defined(DUK_USE_STRHASH_DENSE)
 /* constants for duk_hashstring() */
 #define DUK__STRHASH_SHORTSTRING   4096L
 #define DUK__STRHASH_MEDIUMSTRING  (256L * 1024L)
 #define DUK__STRHASH_BLOCKSIZE     256L
@ -13,25 +24,21 @@
 DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	duk_uint32_t hash;
-	/*
+	/* Use Murmurhash2 directly for short strings, and use "block skipping"
-	 *  Sampling long strings by byte skipping (like Lua does) is potentially
+	 * for long strings: hash an initial part and then sample the rest of
-	 *  a cache problem.  Here we do 'block skipping' instead for long strings:
+	 * the string with reasonably sized chunks.  An initial offset for the
-	 *  hash an initial part, and then sample the rest of the string with
+	 * sampling is computed based on a hash of the initial part of the string;
-	 *  reasonably sized chunks.
+	 * this is done to (usually) avoid the case where all long strings have
 	 * certain offset ranges which are never sampled.
 	 *
 	 * Skip should depend on length and bound the total time to roughly
-	 *  logarithmic.
+	 * logarithmic.  With current values:
 	 *
 	 *  With current values:
 	 *
 	 *   1M string => 256 * 241 = 61696 bytes (0.06M) of hashing
 	 *   1G string => 256 * 16321 = 4178176 bytes (3.98M) of hashing
 	 *
-	 *  After an initial part has been hashed, an offset is applied before
+	 * XXX: It would be better to compute the skip offset more "smoothly"
-	 *  starting the sampling.  The initial offset is computed from the
+	 * instead of having a few boundary values.
 	 *  hash of the initial part of the string.  The idea is to avoid the
 	 *  case that all long strings have certain offset ranges that are never
 	 *  sampled.
 	 */
 	/* note: mixing len into seed improves hashing when skipping */
@ -69,291 +76,35 @@ DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t
 #endif
 	return hash;
 }
 #undef DUK__STRHASH_SHORTSTRING
 #undef DUK__STRHASH_MEDIUMSTRING
 #undef DUK__STRHASH_BLOCKSIZE
 #else  /* DUK_USE_STRHASH_DENSE */
-#if 1
+DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 DUK_LOCAL duk_uint32_t duk__hashstring_lua1(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	duk_uint32_t hash;
 	duk_size_t step;
 	duk_size_t off;
-	/* String algorithm based on Lua 5.1.5 with small modifications.
+	/* Slightly modified "Bernstein hash" from:
 	 * See lstring.c:luaS_newlstr().
 	 *
 	 * This is basically "Shift-add-XOR hash" with skipping and reverse
 	 * direction:
 	 *     http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 	 *
 	 * Modifications: string skipping and reverse direction similar to
 	 * Lua 5.1.5, and different hash initializer.
 	 *
 	 * The reverse direction ensures the last byte is always included in the
 	 * hash which is a good default as changing parts of the string are
 	 * more often in the suffix than in the prefix.
 	 */
-	hash = heap->hash_seed ^ ((duk_uint32_t) len);
+	hash = heap->hash_seed ^ ((duk_uint32_t) len);  /* Bernstein hash init value is normally 5381 */
 	step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
 	for (off = len; off >= step; off -= step) {
 		DUK_ASSERT(off >= 1);  /* off >= step, and step >= 1 */
 		hash = hash ^ ((hash << 5) + (hash >> 2) + str[off - 1]);
 	}
 	return hash;
 }
 #endif
 #if 1
 DUK_LOCAL duk_uint32_t duk__hashstring_lua2(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	/* Forward stepping variant of the Lua 5.1.5 style shift-add-XOR hash.
 	 *
 	 * The accumulator is seeded with the heap hash seed XORed with the
 	 * string length.  Bytes are sampled with a stride that grows with the
 	 * string length; the start index is congruent to (len - 1) modulo the
 	 * stride so that, for non-empty input, the final sampled index is
 	 * len - 1.
 	 */
 	duk_uint32_t acc = heap->hash_seed ^ ((duk_uint32_t) len);
 	duk_size_t stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
 	duk_size_t idx = (len + stride - 1) % stride;
 	while (idx < len) {
 		acc ^= (acc << 5) + (acc >> 2) + str[idx];
 		idx += stride;
 	}
 	return acc;
 }
 #endif
 #if 1
 DUK_LOCAL duk_uint32_t duk__hashstring_lua3(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	duk_uint32_t hash;
 	duk_size_t step;
 	const duk_uint8_t *p;
 	const duk_uint8_t *p_stop;
 	/* Forward stepping variant of Lua 5.1.5 using pointers.
 	 *
 	 * The hash is seeded with the heap hash seed XORed with the string
 	 * length.  For non-empty input the first sampled byte is chosen so
 	 * that stepping forwards lands exactly on the last byte (p_stop).
 	 *
 	 * The byte at p_stop must itself be hashed: a plain
 	 * "while (p != p_stop)" loop would stop just before it, so the last
 	 * byte would never contribute (and every 1-byte string would hash to
 	 * the same value).  The loop therefore hashes first and tests for the
 	 * end afterwards, which also avoids advancing the pointer beyond the
 	 * end of the buffer.
 	 */
 	hash = heap->hash_seed ^ ((duk_uint32_t) len);
 	if (len > 0) {
 		step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
 		p = str + ((len - 1) % step);
 		p_stop = str + len - 1;
 		DUK_ASSERT(((duk_size_t) (p_stop - p) % step) == 0);  /* p eventually hits p_stop */
 		for (;;) {
 			hash = hash ^ ((hash << 5) + (hash >> 2) + *p);
 			if (p == p_stop) {
 				break;  /* last byte has been included */
 			}
 			p += step;
 		}
 	}
 	return hash;
 }
 #endif
 #if 1
 DUK_LOCAL duk_uint32_t duk__hashstring_hybrid1(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	/* Hybrid shift-add-XOR hash: strings of at most 32 bytes are hashed
 	 * in full; longer strings are sampled forwards from index 0 with a
 	 * length dependent stride, after which the last byte is mixed in
 	 * explicitly.
 	 */
 	duk_uint32_t acc = heap->hash_seed ^ ((duk_uint32_t) len);
 	duk_size_t stride;
 	duk_size_t idx = 0;
 	if (DUK_LIKELY(len <= 32)) {
 		/* Short string: every byte contributes. */
 		while (idx < len) {
 			acc ^= (acc << 5) + (acc >> 2) + str[idx];
 			idx++;
 		}
 		return acc;
 	}
 	/* Long string: sparse sampling, then the final byte. */
 	stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
 	while (idx < len) {
 		acc ^= (acc << 5) + (acc >> 2) + str[idx];
 		idx += stride;
 	}
 	DUK_ASSERT(len >= 1);
 	acc ^= (acc << 5) + (acc >> 2) + str[len - 1];
 	return acc;
 }
 #endif
 #if 1
 DUK_LOCAL duk_uint32_t duk__hashstring_hybrid2(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	/* Hybrid shift-add-XOR hash.  Strings of at most 32 bytes are hashed
 	 * in full.  For longer strings the first 8 and the last 8 bytes are
 	 * hashed entirely, the middle is sampled sparsely starting from an
 	 * offset derived from the hash so far, and finally the last byte is
 	 * mixed in once more.
 	 */
 	duk_uint32_t acc = heap->hash_seed ^ ((duk_uint32_t) len);
 	duk_size_t idx;
 	duk_size_t stride;
 	duk_size_t tail_start;
 	if (DUK_LIKELY(len <= 32)) {
 		/* Short string: every byte contributes. */
 		idx = 0;
 		while (idx < len) {
 			acc ^= (acc << 5) + (acc >> 2) + str[idx];
 			idx++;
 		}
 		return acc;
 	}
 	/* Leading 8 bytes. */
 	idx = 0;
 	while (idx < 8) {
 		acc ^= (acc << 5) + (acc >> 2) + str[idx];
 		idx++;
 	}
 	/* Trailing 8 bytes. */
 	tail_start = len - 8;
 	idx = tail_start;
 	while (idx < len) {
 		acc ^= (acc << 5) + (acc >> 2) + str[idx];
 		idx++;
 	}
 	/* Sparse sampling of the middle part. */
 	stride = (len >> 4);
 	idx = 8 + (acc & 0x07);  /* vary offset a bit */
 	while (idx < tail_start) {
 		acc ^= (acc << 5) + (acc >> 2) + str[idx];
 		idx += stride;
 	}
 	acc ^= (acc << 5) + (acc >> 2) + str[len - 1];
 	return acc;
 }
 #endif
 #if 1
 DUK_LOCAL duk_uint32_t duk__hashstring_bernstein1a(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	/* "Bernstein hash" (multiply by 33, add byte), see:
 	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 	 *
 	 * Differences: the accumulator starts from the heap hash seed, bytes
 	 * are sampled with a length dependent stride, and the walk goes
 	 * backwards from the end so that the last byte is the first one
 	 * sampled.
 	 */
 	duk_uint32_t acc = heap->hash_seed;
 	duk_size_t stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
 	duk_size_t pos = len;
 	while (pos >= stride) {
 		DUK_ASSERT(pos >= 1);  /* pos >= stride, and stride >= 1 */
 		acc = (acc * 33) + str[pos - 1];
 		pos -= stride;
 	}
 	return acc;
 }
 #endif
 #if 1
 DUK_LOCAL duk_uint32_t duk__hashstring_bernstein1b(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	/* "Bernstein hash" with the multiplication by 33 spelled out as a
 	 * shift and an add, see:
 	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 	 *
 	 * Differences: the accumulator starts from the heap hash seed, bytes
 	 * are sampled with a length dependent stride, and the walk goes
 	 * backwards from the end so that the last byte is the first one
 	 * sampled.
 	 */
 	duk_uint32_t acc = heap->hash_seed;
 	duk_size_t stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
 	duk_size_t pos = len;
 	while (pos >= stride) {
 		DUK_ASSERT(pos >= 1);  /* pos >= stride, and stride >= 1 */
 		acc = ((acc << 5) + acc) + str[pos - 1];
 		pos -= stride;
 	}
 	return acc;
 }
 #endif
 #if 1
 DUK_LOCAL duk_uint32_t duk__hashstring_bernstein2a(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	/* "Modified Bernstein" hash (multiply by 33, XOR byte), see:
 	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 	 *
 	 * Differences: the accumulator starts from the heap hash seed, bytes
 	 * are sampled with a length dependent stride, and the walk goes
 	 * backwards from the end so that the last byte is the first one
 	 * sampled.
 	 */
 	duk_uint32_t acc = heap->hash_seed;
 	duk_size_t stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
 	duk_size_t pos = len;
 	while (pos >= stride) {
 		DUK_ASSERT(pos >= 1);  /* pos >= stride, and stride >= 1 */
 		acc = (acc * 33) ^ str[pos - 1];
 		pos -= stride;
 	}
 	return acc;
 }
 #endif
 #if 1
 DUK_LOCAL duk_uint32_t duk__hashstring_bernstein2b(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	/* "Modified Bernstein" hash with the multiplication by 33 spelled
 	 * out as a shift and an add, see:
 	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 	 *
 	 * Differences: the accumulator starts from the heap hash seed, bytes
 	 * are sampled with a length dependent stride, and the walk goes
 	 * backwards from the end so that the last byte is the first one
 	 * sampled.
 	 */
 	duk_uint32_t acc = heap->hash_seed;
 	duk_size_t stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
 	duk_size_t pos = len;
 	while (pos >= stride) {
 		DUK_ASSERT(pos >= 1);  /* pos >= stride, and stride >= 1 */
 		acc = ((acc << 5) + acc) ^ str[pos - 1];
 		pos -= stride;
 	}
 	return acc;
 }
 #endif
 #if 1
 DUK_LOCAL duk_uint32_t duk__hashstring_fnv1(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	/* "FNV hash" (multiply by 16777619, XOR byte), see:
 	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 	 *
 	 * Differences: the accumulator starts from the heap hash seed, bytes
 	 * are sampled with a length dependent stride, and the walk goes
 	 * backwards from the end so that the last byte is the first one
 	 * sampled.
 	 */
 	duk_uint32_t acc = heap->hash_seed;
 	duk_size_t stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
 	duk_size_t pos = len;
 	while (pos >= stride) {
 		DUK_ASSERT(pos >= 1);  /* pos >= stride, and stride >= 1 */
 		acc = (acc * 16777619L) ^ str[pos - 1];
 		pos -= stride;
 	}
 	return acc;
 }
 #endif
 #if 1
 DUK_LOCAL duk_uint32_t duk__hashstring_oaat1(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	/* "One-at-a-Time hash", see:
 	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 	 *
 	 * Differences: the accumulator starts from the heap hash seed, bytes
 	 * are sampled with a length dependent stride, and the walk goes
 	 * backwards from the end so that the last byte is the first one
 	 * sampled.
 	 */
 	duk_uint32_t acc = heap->hash_seed;
 	duk_size_t stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
 	duk_size_t pos = len;
 	while (pos >= stride) {
 		DUK_ASSERT(pos >= 1);  /* pos >= stride, and stride >= 1 */
 		acc += str[pos - 1];
 		acc += (acc << 10);
 		acc ^= (acc >> 6);
 		pos -= stride;
 	}
 	/* Final mixing applied once after all sampled bytes. */
 	acc += (acc << 3);
 	acc ^= (acc >> 11);
 	acc += (acc << 15);
 	return acc;
 }
 #endif
 DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	duk_uint32_t hash;
 #if 0
 	hash = duk__hashstring_lua1(heap, str, len);
 	hash = duk__hashstring_lua2(heap, str, len);
 	hash = duk__hashstring_lua3(heap, str, len);
 	hash = duk__hashstring_hybrid1(heap, str, len);
 	hash = duk__hashstring_hybrid2(heap, str, len);
 	hash = duk__hashstring_bernstein1a(heap, str, len);
 	hash = duk__hashstring_bernstein1b(heap, str, len);
 	hash = duk__hashstring_bernstein2a(heap, str, len);
 	hash = duk__hashstring_bernstein2b(heap, str, len);
 	hash = duk__hashstring_fnv1(heap, str, len);
 	hash = duk__hashstring_oaat1(heap, str, len);
 #endif
 	hash = duk__hashstring_bernstein2b(heap, str, len);
 #if defined(DUK_USE_STRHASH16)
 	/* Truncate to 16 bits here, so that a computed hash can be compared
 	 * against a hash stored in a 16-bit field.