Clean up string hash algorithms

* Remove string hash algorithms which weren't chosen
* Clean up comments
9 years ago · 27ebe604b3
2 changed files with 42 additions and 288 deletions
--- a/src/duk_heap_alloc.c
+++ b/src/duk_heap_alloc.c
@@ -771,6 +771,9 @@ duk_heap *duk_heap_alloc(duk_alloc_function alloc_func,
 	 */
 	res->hash_seed = (duk_uint32_t) (duk_intptr_t) res;
 	res->rnd_state = (duk_uint32_t) (duk_intptr_t) res;
+#if !defined(DUK_USE_STRHASH_DENSE)
+	res->hash_seed ^= 5381;  /* Bernstein hash init value is normally 5381; XOR it in in case pointer low bits are 0 */
+#endif

 #ifdef DUK_USE_EXPLICIT_NULL_INIT
 	res->lj.jmpbuf_ptr = NULL;
--- a/src/duk_heap_hashstring.c
+++ b/src/duk_heap_hashstring.c
@@ -1,11 +1,22 @@
 /*
 *  String hash computation (interning).
+ *
+ *  String hashing is performance critical because a string hash is computed
+ *  for all new strings which are candidates to be added to the string table.
+ *  However, strings actually added to the string table go through a codepoint
+ *  length calculation which dominates performance because it goes through
+ *  every byte of the input string (but only for strings added).
+ *
+ *  The string hash algorithm should be fast, but on the other hand provide
+ *  good enough hashes to ensure both string table and object property table
+ *  hash tables work reasonably well (i.e., there aren't too many collisions
+ *  with real world inputs).  Unless the hash is cryptographic, it's always
+ *  possible to craft inputs with maximal hash collisions.
 */

 #include "duk_internal.h"

 #if defined(DUK_USE_STRHASH_DENSE)
-/* constants for duk_hashstring() */
 #define DUK__STRHASH_SHORTSTRING   4096L
 #define DUK__STRHASH_MEDIUMSTRING  (256L * 1024L)
 #define DUK__STRHASH_BLOCKSIZE     256L
@@ -13,25 +24,21 @@
 DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	duk_uint32_t hash;

-	/*
-	 *  Sampling long strings by byte skipping (like Lua does) is potentially
-	 *  a cache problem.  Here we do 'block skipping' instead for long strings:
-	 *  hash an initial part, and then sample the rest of the string with
-	 *  reasonably sized chunks.
+	/* Use Murmurhash2 directly for short strings, and use "block skipping"
+	 * for long strings: hash an initial part and then sample the rest of
+	 * the string with reasonably sized chunks.  An initial offset for the
+	 * sampling is computed based on a hash of the initial part of the string;
+	 * this is done to (usually) avoid the case where all long strings have
+	 * certain offset ranges which are never sampled.
 	 *
-	 *  Skip should depend on length and bound the total time to roughly
-	 *  logarithmic.
+	 * Skip should depend on length and bound the total time to roughly
+	 * logarithmic.  With current values:
 	 *
-	 *  With current values:
+	 *   1M string => 256 * 241 = 61696 bytes (0.06M) of hashing
+	 *   1G string => 256 * 16321 = 4178176 bytes (3.98M) of hashing
 	 *
-	 *    1M string => 256 * 241 = 61696 bytes (0.06M) of hashing
-	 *    1G string => 256 * 16321 = 4178176 bytes (3.98M) of hashing
-	 *
-	 *  After an initial part has been hashed, an offset is applied before
-	 *  starting the sampling.  The initial offset is computed from the
-	 *  hash of the initial part of the string.  The idea is to avoid the
-	 *  case that all long strings have certain offset ranges that are never
-	 *  sampled.
+	 * XXX: It would be better to compute the skip offset more "smoothly"
+	 * instead of having a few boundary values.
 	 */

 	/* note: mixing len into seed improves hashing when skipping */
@@ -69,291 +76,35 @@ DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t
 #endif
 	return hash;
 }
+
+#undef DUK__STRHASH_SHORTSTRING
+#undef DUK__STRHASH_MEDIUMSTRING
+#undef DUK__STRHASH_BLOCKSIZE
 #else  /* DUK_USE_STRHASH_DENSE */
-#if 1
-DUK_LOCAL duk_uint32_t duk__hashstring_lua1(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
+DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
 	duk_uint32_t hash;
 	duk_size_t step;
 	duk_size_t off;

-	/* String algorithm based on Lua 5.1.5 with small modifications.
-	 * See lstring.c:luaS_newlstr().
+	/* Slightly modified "Bernstein hash" from:
 	 *
-	 * This is basically "Shift-add-XOR hash" with skipping and reverse
-	 * direction:
-	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
-	 */
-
-	hash = heap->hash_seed ^ ((duk_uint32_t) len);
-	step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
-	for (off = len; off >= step; off -= step) {
-		DUK_ASSERT(off >= 1);  /* off >= step, and step >= 1 */
-		hash = hash ^ ((hash << 5) + (hash >> 2) + str[off - 1]);
-	}
-
-	return hash;
-}
-#endif
-
-#if 1
-DUK_LOCAL duk_uint32_t duk__hashstring_lua2(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
-	duk_uint32_t hash;
-	duk_size_t step;
-	duk_size_t off;
-
-	/* Forward stepping variant of Lua 5.1.5. */
-	hash = heap->hash_seed ^ ((duk_uint32_t) len);
-	step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
-	off = (len + step - 1) % step;
-	for (; off < len; off += step) {
-		hash = hash ^ ((hash << 5) + (hash >> 2) + str[off]);
-	}
-
-	return hash;
-}
-#endif
-
-#if 1
-DUK_LOCAL duk_uint32_t duk__hashstring_lua3(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
-	duk_uint32_t hash;
-	duk_size_t step;
-	const duk_uint8_t *p;
-	const duk_uint8_t *p_stop;
-
-	/* Forward stepping variant of Lua 5.1.5 using pointers. */
-	hash = heap->hash_seed ^ ((duk_uint32_t) len);
-	if (len > 0) {
-		step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
-		p = str + ((len - 1) % step);
-		p_stop = str + len - 1;
-		DUK_ASSERT(((duk_size_t) (p_stop - p) % step) == 0);  /* p eventually hits p_stop */
-		while (p != p_stop) {
-			hash = hash ^ ((hash << 5) + (hash >> 2) + *p);
-			p += step;
-		}
-	}
-
-	return hash;
-}
-#endif
-
-#if 1
-DUK_LOCAL duk_uint32_t duk__hashstring_hybrid1(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
-	duk_uint32_t hash;
-	duk_size_t step;
-	duk_size_t off;
-
-	/* Hybrid with a different algorithm for short and long strings. */
-	hash = heap->hash_seed ^ ((duk_uint32_t) len);
-	if (DUK_LIKELY(len <= 32)) {
-		for (off = 0; off < len; off++) {
-			hash = hash ^ ((hash << 5) + (hash >> 2) + str[off]);
-		}
-	} else {
-		step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
-		for (off = 0; off < len; off += step) {
-			hash = hash ^ ((hash << 5) + (hash >> 2) + str[off]);
-		}
-		DUK_ASSERT(len >= 1);
-		hash = hash ^ ((hash << 5) + (hash >> 2) + str[len - 1]);
-	}
-
-	return hash;
-}
-#endif
-
-#if 1
-DUK_LOCAL duk_uint32_t duk__hashstring_hybrid2(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
-	duk_uint32_t hash;
-	duk_size_t step;
-	duk_size_t off;
-	duk_size_t limit;
-
-	/* Hybrid with a different algorithm for short and long strings.
-	 * For long strings, include first and last 8 bytes entirely, and
-	 * use sparse skipping for the middle.
+	 *     http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
+	 *
+	 * Modifications: string skipping and reverse direction similar to
+	 * Lua 5.1.5, and different hash initializer.
+	 *
+	 * The reverse direction ensures the last byte is always included in the
+	 * hash which is a good default as changing parts of the string are
+	 * more often in the suffix than in the prefix.
 	 */
-	hash = heap->hash_seed ^ ((duk_uint32_t) len);
-	if (DUK_LIKELY(len <= 32)) {
-		for (off = 0; off < len; off++) {
-			hash = hash ^ ((hash << 5) + (hash >> 2) + str[off]);
-		}
-	} else {
-		for (off = 0; off < 8; off++) {
-			hash = hash ^ ((hash << 5) + (hash >> 2) + str[off]);
-		}
-		for (off = len - 8; off < len; off++) {
-			hash = hash ^ ((hash << 5) + (hash >> 2) + str[off]);
-		}
-		step = (len >> 4);
-		limit = len - 8;
-		off = 8 + (hash & 0x07);  /* vary offset a bit */
-		for (; off < limit; off += step) {
-			hash = hash ^ ((hash << 5) + (hash >> 2) + str[off]);
-		}
-		hash = hash ^ ((hash << 5) + (hash >> 2) + str[len - 1]);
-	}
-
-	return hash;
-}
-#endif
-
-#if 1
-DUK_LOCAL duk_uint32_t duk__hashstring_bernstein1a(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
-	duk_uint32_t hash;
-	duk_size_t step;
-	duk_size_t off;

-	/* "Bernstein hash" from:
-	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
-	 * but with string skipping and reverse direction (ensures
-	 * last byte is included).
-	 */
-	hash = heap->hash_seed;
+	hash = heap->hash_seed ^ ((duk_uint32_t) len);  /* Bernstein hash init value is normally 5381 */
 	step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
 	for (off = len; off >= step; off -= step) {
 		DUK_ASSERT(off >= 1);  /* off >= step, and step >= 1 */
 		hash = (hash * 33) + str[off - 1];
 	}

-	return hash;
-}
-#endif
-
-#if 1
-DUK_LOCAL duk_uint32_t duk__hashstring_bernstein1b(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
-	duk_uint32_t hash;
-	duk_size_t step;
-	duk_size_t off;
-
-	/* "Bernstein hash" from:
-	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
-	 * but with string skipping and reverse direction (ensures
-	 * last byte is included).
-	 */
-	hash = heap->hash_seed;
-	step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
-	for (off = len; off >= step; off -= step) {
-		DUK_ASSERT(off >= 1);  /* off >= step, and step >= 1 */
-		hash = ((hash << 5) + hash) + str[off - 1];
-	}
-
-	return hash;
-}
-#endif
-
-#if 1
-DUK_LOCAL duk_uint32_t duk__hashstring_bernstein2a(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
-	duk_uint32_t hash;
-	duk_size_t step;
-	duk_size_t off;
-
-	/* "Modified Bernstein" from:
-	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
-	 * but with string skipping and reverse direction (ensures
-	 * last byte is included).
-	 */
-	hash = heap->hash_seed;
-	step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
-	for (off = len; off >= step; off -= step) {
-		DUK_ASSERT(off >= 1);  /* off >= step, and step >= 1 */
-		hash = (hash * 33) ^ str[off - 1];
-	}
-
-	return hash;
-}
-#endif
-
-#if 1
-DUK_LOCAL duk_uint32_t duk__hashstring_bernstein2b(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
-	duk_uint32_t hash;
-	duk_size_t step;
-	duk_size_t off;
-
-	/* "Modified Bernstein" from:
-	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
-	 * but with string skipping and reverse direction (ensures
-	 * last byte is included).
-	 */
-	hash = heap->hash_seed;
-	step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
-	for (off = len; off >= step; off -= step) {
-		DUK_ASSERT(off >= 1);  /* off >= step, and step >= 1 */
-		hash = ((hash << 5) + hash) ^ str[off - 1];
-	}
-
-	return hash;
-}
-#endif
-
-#if 1
-DUK_LOCAL duk_uint32_t duk__hashstring_fnv1(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
-	duk_uint32_t hash;
-	duk_size_t step;
-	duk_size_t off;
-
-	/* "FNV hash" from
-	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
-	 * but with string skipping and reverse direction (ensures
-	 * last byte is included).
-	 */
-	hash = heap->hash_seed;
-	step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
-	for (off = len; off >= step; off -= step) {
-		DUK_ASSERT(off >= 1);  /* off >= step, and step >= 1 */
-		hash = (hash * 16777619L) ^ str[off - 1];
-	}
-
-	return hash;
-}
-#endif
-
-#if 1
-DUK_LOCAL duk_uint32_t duk__hashstring_oaat1(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
-	duk_uint32_t hash;
-	duk_size_t step;
-	duk_size_t off;
-
-	/* "One-at-a-Time hash" from
-	 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
-	 * but with string skipping and reverse direction (ensures
-	 * last byte is included).
-	 */
-	hash = heap->hash_seed;
-	step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
-	for (off = len; off >= step; off -= step) {
-		DUK_ASSERT(off >= 1);  /* off >= step, and step >= 1 */
-		hash += str[off - 1];
-		hash += (hash << 10);
-		hash ^= (hash >> 6);
-	}
-	hash += (hash << 3);
-	hash ^= (hash >> 11);
-	hash += (hash << 15);
-
-	return hash;
-}
-#endif
-
-DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
-	duk_uint32_t hash;
-
-#if 0
-	hash = duk__hashstring_lua1(heap, str, len);
-	hash = duk__hashstring_lua2(heap, str, len);
-	hash = duk__hashstring_lua3(heap, str, len);
-	hash = duk__hashstring_hybrid1(heap, str, len);
-	hash = duk__hashstring_hybrid2(heap, str, len);
-	hash = duk__hashstring_bernstein1a(heap, str, len);
-	hash = duk__hashstring_bernstein1b(heap, str, len);
-	hash = duk__hashstring_bernstein2a(heap, str, len);
-	hash = duk__hashstring_bernstein2b(heap, str, len);
-	hash = duk__hashstring_fnv1(heap, str, len);
-	hash = duk__hashstring_oaat1(heap, str, len);
-#endif
-
-	hash = duk__hashstring_bernstein2b(heap, str, len);
-
 #if defined(DUK_USE_STRHASH16)
 	/* Truncate to 16 bits here, so that a computed hash can be compared
 	 * against a hash stored in a 16-bit field.