Browse Source

Clean up string hash algorithms

* Remove string hash algorithms which weren't chosen

* Clean up comments
pull/432/head
Sami Vaarala 9 years ago
parent
commit
27ebe604b3
  1. 3
      src/duk_heap_alloc.c
  2. 319
      src/duk_heap_hashstring.c

3
src/duk_heap_alloc.c

@ -771,6 +771,9 @@ duk_heap *duk_heap_alloc(duk_alloc_function alloc_func,
*/
res->hash_seed = (duk_uint32_t) (duk_intptr_t) res;
res->rnd_state = (duk_uint32_t) (duk_intptr_t) res;
#if !defined(DUK_USE_STRHASH_DENSE)
res->hash_seed ^= 5381; /* Bernstein hash init value is normally 5381; XOR it in in case pointer low bits are 0 */
#endif
#ifdef DUK_USE_EXPLICIT_NULL_INIT
res->lj.jmpbuf_ptr = NULL;

319
src/duk_heap_hashstring.c

@ -1,11 +1,22 @@
/*
* String hash computation (interning).
*
* String hashing is performance critical because a string hash is computed
* for all new strings which are candidates to be added to the string table.
* However, strings actually added to the string table go through a codepoint
* length calculation which dominates performance because it goes through
* every byte of the input string (but only for strings added).
*
* The string hash algorithm should be fast, but on the other hand provide
* good enough hashes to ensure both string table and object property table
* hash tables work reasonably well (i.e., there aren't too many collisions
* with real world inputs). Unless the hash is cryptographic, it's always
* possible to craft inputs with maximal hash collisions.
*/
#include "duk_internal.h"
#if defined(DUK_USE_STRHASH_DENSE)
/* constants for duk_hashstring() */
#define DUK__STRHASH_SHORTSTRING 4096L
#define DUK__STRHASH_MEDIUMSTRING (256L * 1024L)
#define DUK__STRHASH_BLOCKSIZE 256L
@ -13,25 +24,21 @@
DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
duk_uint32_t hash;
/* Use Murmurhash2 directly for short strings, and use "block skipping"
* for long strings: hash an initial part and then sample the rest of
* the string with reasonably sized chunks. Sampling long strings by
* byte skipping (like Lua does) is potentially a cache problem, which
* is why whole blocks are sampled instead. An initial offset for the
* sampling is computed based on a hash of the initial part of the string;
* this is done to (usually) avoid the case where all long strings have
* certain offset ranges which are never sampled.
*
* Skip should depend on length and bound the total time to roughly
* logarithmic. With current values:
*
* 1M string => 256 * 241 = 61696 bytes (0.06M) of hashing
* 1G string => 256 * 16321 = 4178176 bytes (3.98M) of hashing
*
* XXX: It would be better to compute the skip offset more "smoothly"
* instead of having a few boundary values.
*/
/* note: mixing len into seed improves hashing when skipping */
@ -69,291 +76,35 @@ DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t
#endif
return hash;
}
#undef DUK__STRHASH_SHORTSTRING
#undef DUK__STRHASH_MEDIUMSTRING
#undef DUK__STRHASH_BLOCKSIZE
#else /* DUK_USE_STRHASH_DENSE */
#if 1
/* Shift-add-XOR string hash modelled on Lua 5.1.5, sampling the string
 * backwards with a length dependent step.
 *
 * heap: heap whose hash_seed is used as the initial hash value
 * str:  string data, 'len' bytes
 * len:  byte length of the string
 *
 * Returns the 32-bit hash value.
 */
DUK_LOCAL duk_uint32_t duk__hashstring_lua1(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t hash;
	duk_size_t step;
	duk_size_t off;

	/* String algorithm based on Lua 5.1.5 with small modifications.
	 * See lstring.c:luaS_newlstr().
	 *
	 * This is basically "Shift-add-XOR hash" with skipping and reverse
	 * direction:
	 *
	 *   http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
	 *
	 * Because the walk starts from the end, the first byte visited is the
	 * last byte of the string (whenever the loop runs at all).  That is a
	 * good default as the changing parts of a string are more often in
	 * the suffix than in the prefix.
	 */

	/* Mix the length into the seed so that strings of different length
	 * start from different states.
	 */
	hash = heap->hash_seed ^ ((duk_uint32_t) len);
	step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
	for (off = len; off >= step; off -= step) {
		DUK_ASSERT(off >= 1);  /* off >= step, and step >= 1 */
		hash = hash ^ ((hash << 5) + (hash >> 2) + str[off - 1]);
	}

	return hash;
}
#endif
#if 1
/* Forward stepping variant of the Lua 5.1.5 style shift-add-XOR hash.
 *
 * heap: heap whose hash_seed is used as the initial hash value
 * str:  string data, 'len' bytes
 * len:  byte length of the string
 *
 * Returns the 32-bit hash value.
 */
DUK_LOCAL duk_uint32_t duk__hashstring_lua2(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t h;
	duk_size_t stride;
	duk_size_t pos;

	h = heap->hash_seed ^ ((duk_uint32_t) len);
	stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;

	/* Start at (len - 1) mod stride so that, for non-empty input, the
	 * strided walk ends exactly on the final byte of the string.
	 */
	pos = (len + stride - 1) % stride;
	while (pos < len) {
		h ^= (h << 5) + (h >> 2) + str[pos];
		pos += stride;
	}

	return h;
}
#endif
#if 1
/* Forward stepping variant of the Lua 5.1.5 style shift-add-XOR hash,
 * implemented with pointers instead of offsets.
 *
 * heap: heap whose hash_seed is used as the initial hash value
 * str:  string data, 'len' bytes
 * len:  byte length of the string
 *
 * Returns the 32-bit hash value.  For an empty string no bytes are read
 * and the result is the seed mixed with the length only.
 */
DUK_LOCAL duk_uint32_t duk__hashstring_lua3(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t hash;
	duk_size_t step;
	const duk_uint8_t *p;
	const duk_uint8_t *p_stop;

	hash = heap->hash_seed ^ ((duk_uint32_t) len);
	if (len > 0) {
		step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
		p = str + ((len - 1) % step);
		p_stop = str + len - 1;  /* last byte of the string */
		DUK_ASSERT(((duk_size_t) (p_stop - p) % step) == 0);  /* p eventually hits p_stop */

		/* Hash the byte at 'p' before testing for the end so that the
		 * last byte (p_stop) is included too; a plain
		 * "while (p != p_stop)" loop would stop one sample short and,
		 * for len == 1, hash no bytes at all.  Breaking before the
		 * increment also avoids forming a pointer past the string.
		 */
		for (;;) {
			hash = hash ^ ((hash << 5) + (hash >> 2) + *p);
			if (p == p_stop) {
				break;
			}
			p += step;
		}
	}

	return hash;
}
#endif
#if 1
/* Hybrid shift-add-XOR hash: short strings (at most 32 bytes) are hashed
 * in full, longer strings are sampled with a length dependent stride.
 *
 * heap: heap whose hash_seed is used as the initial hash value
 * str:  string data, 'len' bytes
 * len:  byte length of the string
 *
 * Returns the 32-bit hash value.
 */
DUK_LOCAL duk_uint32_t duk__hashstring_hybrid1(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t h;
	duk_size_t stride;
	duk_size_t i;

	h = heap->hash_seed ^ ((duk_uint32_t) len);

	if (DUK_LIKELY(len <= 32)) {
		/* Short string: every byte contributes, front to back. */
		i = 0;
		while (i < len) {
			h ^= (h << 5) + (h >> 2) + str[i];
			i++;
		}
		return h;
	}

	/* Long string: sample forwards starting from the first byte... */
	stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
	i = 0;
	while (i < len) {
		h ^= (h << 5) + (h >> 2) + str[i];
		i += stride;
	}

	/* ...and always mix in the final byte, even if the sampling above
	 * already happened to land on it.
	 */
	DUK_ASSERT(len >= 1);
	h ^= (h << 5) + (h >> 2) + str[len - 1];

	return h;
}
#endif
#if 1
/* Hybrid shift-add-XOR hash: short strings (at most 32 bytes) are hashed
 * in full; for longer strings the first and last 8 bytes are hashed
 * entirely and the middle part is sampled sparsely.
 *
 * heap: heap whose hash_seed is used as the initial hash value
 * str:  string data, 'len' bytes
 * len:  byte length of the string
 *
 * Returns the 32-bit hash value.
 */
DUK_LOCAL duk_uint32_t duk__hashstring_hybrid2(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t h;
	duk_size_t stride;
	duk_size_t i;
	duk_size_t tail;

	h = heap->hash_seed ^ ((duk_uint32_t) len);

	if (DUK_LIKELY(len <= 32)) {
		/* Short string: every byte contributes, front to back. */
		i = 0;
		while (i < len) {
			h ^= (h << 5) + (h >> 2) + str[i];
			i++;
		}
		return h;
	}

	/* Here len > 32, so the 8 byte head and tail do not overlap. */
	tail = len - 8;

	/* Head: first 8 bytes. */
	for (i = 0; i < 8; i++) {
		h ^= (h << 5) + (h >> 2) + str[i];
	}

	/* Tail: last 8 bytes. */
	for (i = tail; i < len; i++) {
		h ^= (h << 5) + (h >> 2) + str[i];
	}

	/* Middle: sparse sampling.  The start offset is varied a bit using
	 * the low bits of the hash computed so far.
	 */
	stride = (len >> 4);
	i = 8 + (h & 0x07);
	while (i < tail) {
		h ^= (h << 5) + (h >> 2) + str[i];
		i += stride;
	}

	/* The final byte is mixed in once more at the very end. */
	h ^= (h << 5) + (h >> 2) + str[len - 1];

	return h;
}
#endif
#if 1
/* "Bernstein hash" (hash * 33 + byte) with string skipping and reverse
 * direction.
 *
 * heap: heap whose hash_seed is used as the initial hash value
 * str:  string data, 'len' bytes
 * len:  byte length of the string
 *
 * Returns the 32-bit hash value.
 */
DUK_LOCAL duk_uint32_t duk__hashstring_bernstein1a(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t hash;
	duk_size_t step;
	duk_size_t off;

	/* "Bernstein hash" from:
	 *
	 *   http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
	 *
	 * but with string skipping and reverse direction: because the walk
	 * starts from the end, the first byte visited is the last byte of
	 * the string (whenever the loop runs at all).
	 */

	/* Single initialization: the earlier plain "hash = heap->hash_seed"
	 * assignment was a dead store, overwritten immediately by this one.
	 */
	hash = heap->hash_seed ^ ((duk_uint32_t) len);  /* Bernstein hash init value is normally 5381 */
	step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
	for (off = len; off >= step; off -= step) {
		DUK_ASSERT(off >= 1);  /* off >= step, and step >= 1 */
		hash = (hash * 33) + str[off - 1];
	}

	return hash;
}
#endif
#if 1
/* "Bernstein hash" written in shift-and-add form, with string skipping
 * and reverse direction.  See:
 *
 *   http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 *
 * heap: heap whose hash_seed is used as the initial hash value
 * str:  string data, 'len' bytes
 * len:  byte length of the string
 *
 * Returns the 32-bit hash value.
 */
DUK_LOCAL duk_uint32_t duk__hashstring_bernstein1b(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t h;
	duk_size_t stride;
	duk_size_t pos;

	h = heap->hash_seed;
	stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;

	/* Walk backwards from the end so that the first byte visited is the
	 * last byte of the string (whenever the loop runs at all).
	 */
	pos = len;
	while (pos >= stride) {
		DUK_ASSERT(pos >= 1);  /* pos >= stride, and stride >= 1 */
		h = ((h << 5) + h) + str[pos - 1];
		pos -= stride;
	}

	return h;
}
#endif
#if 1
/* "Modified Bernstein" hash (multiply by 33, then XOR in the byte) with
 * string skipping and reverse direction.  See:
 *
 *   http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 *
 * heap: heap whose hash_seed is used as the initial hash value
 * str:  string data, 'len' bytes
 * len:  byte length of the string
 *
 * Returns the 32-bit hash value.
 */
DUK_LOCAL duk_uint32_t duk__hashstring_bernstein2a(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t h;
	duk_size_t stride;
	duk_size_t pos;

	h = heap->hash_seed;
	stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;

	/* Walk backwards from the end so that the first byte visited is the
	 * last byte of the string (whenever the loop runs at all).
	 */
	pos = len;
	while (pos >= stride) {
		DUK_ASSERT(pos >= 1);  /* pos >= stride, and stride >= 1 */
		h = (h * 33) ^ str[pos - 1];
		pos -= stride;
	}

	return h;
}
#endif
#if 1
/* "Modified Bernstein" hash in shift-and-add form (hash * 33 expressed as
 * (hash << 5) + hash, then XOR in the byte), with string skipping and
 * reverse direction.  See:
 *
 *   http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 *
 * heap: heap whose hash_seed is used as the initial hash value
 * str:  string data, 'len' bytes
 * len:  byte length of the string
 *
 * Returns the 32-bit hash value.
 */
DUK_LOCAL duk_uint32_t duk__hashstring_bernstein2b(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t h;
	duk_size_t stride;
	duk_size_t pos;

	h = heap->hash_seed;
	stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;

	/* Walk backwards from the end so that the first byte visited is the
	 * last byte of the string (whenever the loop runs at all).
	 */
	pos = len;
	while (pos >= stride) {
		DUK_ASSERT(pos >= 1);  /* pos >= stride, and stride >= 1 */
		h = ((h << 5) + h) ^ str[pos - 1];
		pos -= stride;
	}

	return h;
}
#endif
#if 1
/* "FNV hash" (multiply by 16777619, then XOR in the byte) with string
 * skipping and reverse direction.  See:
 *
 *   http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 *
 * heap: heap whose hash_seed is used as the initial hash value
 * str:  string data, 'len' bytes
 * len:  byte length of the string
 *
 * Returns the 32-bit hash value.
 */
DUK_LOCAL duk_uint32_t duk__hashstring_fnv1(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t h;
	duk_size_t stride;
	duk_size_t pos;

	h = heap->hash_seed;
	stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;

	/* Walk backwards from the end so that the first byte visited is the
	 * last byte of the string (whenever the loop runs at all).
	 */
	pos = len;
	while (pos >= stride) {
		DUK_ASSERT(pos >= 1);  /* pos >= stride, and stride >= 1 */
		h = (h * 16777619L) ^ str[pos - 1];
		pos -= stride;
	}

	return h;
}
#endif
#if 1
/* "One-at-a-Time hash" with string skipping and reverse direction.  See:
 *
 *   http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 *
 * heap: heap whose hash_seed is used as the initial hash value
 * str:  string data, 'len' bytes
 * len:  byte length of the string
 *
 * Returns the 32-bit hash value.
 */
DUK_LOCAL duk_uint32_t duk__hashstring_oaat1(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
	duk_uint32_t h;
	duk_size_t stride;
	duk_size_t pos;

	h = heap->hash_seed;
	stride = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;

	/* Per-byte mixing; walking backwards from the end makes the first
	 * byte visited the last byte of the string (whenever the loop runs
	 * at all).
	 */
	pos = len;
	while (pos >= stride) {
		DUK_ASSERT(pos >= 1);  /* pos >= stride, and stride >= 1 */
		h += str[pos - 1];
		h += (h << 10);
		h ^= (h >> 6);
		pos -= stride;
	}

	/* Final mixing steps, applied once after all sampled bytes. */
	h += (h << 3);
	h ^= (h >> 11);
	h += (h << 15);

	return h;
}
#endif
DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
duk_uint32_t hash;
#if 0
hash = duk__hashstring_lua1(heap, str, len);
hash = duk__hashstring_lua2(heap, str, len);
hash = duk__hashstring_lua3(heap, str, len);
hash = duk__hashstring_hybrid1(heap, str, len);
hash = duk__hashstring_hybrid2(heap, str, len);
hash = duk__hashstring_bernstein1a(heap, str, len);
hash = duk__hashstring_bernstein1b(heap, str, len);
hash = duk__hashstring_bernstein2a(heap, str, len);
hash = duk__hashstring_bernstein2b(heap, str, len);
hash = duk__hashstring_fnv1(heap, str, len);
hash = duk__hashstring_oaat1(heap, str, len);
#endif
hash = duk__hashstring_bernstein2b(heap, str, len);
#if defined(DUK_USE_STRHASH16)
/* Truncate to 16 bits here, so that a computed hash can be compared
* against a hash stored in a 16-bit field.

Loading…
Cancel
Save