Browse Source

WTF-8 implementation improvements

* Remove lazy charlen support.  Since we need to WTF-8 sanitize the entire
  input string, charlen can be computed while validating (avoiding extra
  book-keeping for ASCII eventually).

* Improve WTF-8 search forwards/backwards performance (no substring operations)
  when the search string is valid UTF-8.  Non-UTF-8 search strings still use
  the reference implementation, to be optimized later.

* Minor testcase improvements.
pull/2459/head
Sami Vaarala 3 years ago
parent
commit
3b2643ebf8
  1. 4
      config/config-options/DUK_USE_HSTRING_LAZY_CLEN.yaml
  2. 2
      config/examples/low_memory.yaml
  3. 18
      src-input/duk_bi_json.c
  4. 2
      src-input/duk_heap.h
  5. 5
      src-input/duk_heap_markandsweep.c
  6. 15
      src-input/duk_heap_stringcache.c
  7. 95
      src-input/duk_heap_stringtable.c
  8. 12
      src-input/duk_hstring.h
  9. 106
      src-input/duk_hstring_misc.c
  10. 20
      src-input/duk_unicode.h
  11. 346
      src-input/duk_unicode_wtf8.c
  12. 20
      tests/ecmascript/test-wtf8-string-lastindexof-2.js
  13. 2
      tests/perf/test-string-indexof-1.js
  14. 2
      tests/perf/test-string-lastindexof-1.js
  15. 2
      tests/perf/test-string-replace-ascii-1.js
  16. 2
      tests/perf/test-string-replace-ascii-2.js
  17. 2
      tests/perf/test-string-replace-nonbmp-1.js
  18. 2
      tests/perf/test-string-replace-nonbmp-2.js
  19. 29
      tests/perf/test-string-split-ascii-1.js
  20. 29
      tests/perf/test-string-split-ascii-2.js
  21. 2
      tests/perf/test-string-split-nonbmp-1.js
  22. 29
      tests/perf/test-string-split-nonbmp-2.js
  23. 1
      util/index_page_sizes.sh
  24. 2
      util/makeduk_base.yaml
  25. 1
      util/makeduk_duklow.yaml

4
config/config-options/DUK_USE_HSTRING_LAZY_CLEN.yaml

@ -1,5 +1,6 @@
define: DUK_USE_HSTRING_LAZY_CLEN
introduced: 2.2.0
removed: 3.0.0
default: true
tags:
- performance
@ -9,3 +10,6 @@ description: >
reduces unnecessary charlen calculations. When disabled, charlen is computed
during interning which has smaller code footprint at slightly slower charlen
handling.
Removed in Duktape 3.x with WTF-8 change where charlen is naturally computed
during WTF-8 sanitization.

2
config/examples/low_memory.yaml

@ -75,7 +75,6 @@ DUK_USE_STRTAB_RESIZE_CHECK_MASK: 255 # -""-
DUK_USE_LITCACHE_SIZE: false
DUK_USE_HSTRING_ARRIDX: false
DUK_USE_HSTRING_LAZY_CLEN: false # non-lazy charlen is smaller
# Only add a hash table for quite large objects to conserve memory. Even
# lower memory targets usually drop hash part support entirely.
@ -95,7 +94,6 @@ DUK_USE_CACHE_CATCHER: false
#DUK_USE_BUFLEN16: true
#DUK_USE_OBJSIZES16: true
#DUK_USE_HSTRING_CLEN: false
#DUK_USE_HSTRING_LAZY_CLEN: false
#DUK_USE_HOBJECT_HASH_PART: false
#DUK_USE_HEAPPTR16
#DUK_USE_HEAPPTR_DEC16

18
src-input/duk_bi_json.c

@ -1417,22 +1417,12 @@ DUK_LOCAL void duk__json_enc_quote_string(duk_json_enc_ctx *js_ctx, duk_hstring
if (need_esc) {
q = duk__emit_esc_auto_fast(js_ctx, cp, q);
} else {
/* Emit without escaping, but split codepoints in
* [U+10000,U+10FFFF] into surrogates for output.
/* Conceptually we should split the non-BMP codepoint
* into a surrogate pair for output. But since the
* surrogate pair would be combined in WTF-8 sanitization
* we can just emit the UTF-8 codepoint as is.
*/
DUK_RAW_WRITEINC_XUTF8(q, cp);
#if 0
if (cp >= 0x10000UL) {
duk_ucodepoint_t hi, lo;
cp -= 0x10000UL;
hi = 0xd800UL + (cp >> 10);
lo = 0xdc00UL + (cp & 0x3ffUL);
DUK_RAW_WRITEINC_XUTF8(q, hi);
DUK_RAW_WRITEINC_XUTF8(q, lo);
} else {
DUK_RAW_WRITEINC_XUTF8(q, cp);
}
#endif
}
}
}

2
src-input/duk_heap.h

@ -618,6 +618,8 @@ struct duk_heap {
duk_int_t stats_ms_try_count;
duk_int_t stats_ms_skip_count;
duk_int_t stats_ms_emergency_count;
duk_int_t stats_strtab_intern_notemp;
duk_int_t stats_strtab_intern_temp;
duk_int_t stats_strtab_intern_hit;
duk_int_t stats_strtab_intern_miss;
duk_int_t stats_strtab_resize_check;

5
src-input/duk_heap_markandsweep.c

@ -1155,9 +1155,12 @@ DUK_LOCAL void duk__dump_stats(duk_heap *heap) {
(long) heap->stats_ms_try_count,
(long) heap->stats_ms_skip_count,
(long) heap->stats_ms_emergency_count));
DUK_D(DUK_DPRINT("stats stringtable: intern_hit=%ld, intern_miss=%ld, "
DUK_D(DUK_DPRINT("stats stringtable: intern_notemp=%ld, intern_temp=%ld, "
"intern_hit=%ld, intern_miss=%ld, "
"resize_check=%ld, resize_grow=%ld, resize_shrink=%ld, "
"litcache_hit=%ld, litcache_miss=%ld, litcache_pin=%ld",
(long) heap->stats_strtab_intern_notemp,
(long) heap->stats_strtab_intern_temp,
(long) heap->stats_strtab_intern_hit,
(long) heap->stats_strtab_intern_miss,
(long) heap->stats_strtab_resize_check,

15
src-input/duk_heap_stringcache.c

@ -102,7 +102,7 @@ DUK_LOCAL void duk__strcache_scan_char2byte_wtf8_forwards_1(duk_hthread *thr,
#endif
/* Forward scan lookup. */
const duk_uint_t duk__strcache_wtf8_pstep_lookup[256] = {
DUK_LOCAL const duk_uint_t duk__strcache_wtf8_pstep_lookup[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@ -113,7 +113,7 @@ const duk_uint_t duk__strcache_wtf8_pstep_lookup[256] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
const duk_uint_t duk__strcache_wtf8_leftadj_lookup[256] = {
DUK_LOCAL const duk_uint_t duk__strcache_wtf8_leftadj_lookup[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@ -304,7 +304,6 @@ DUK_LOCAL void duk__strcache_scan_char2byte_wtf8_uncached(duk_hthread *thr,
* So prefer scanning forwards with a crude quick check.
*/
prefer_forwards = (dist_start / 2 <= dist_end);
if (prefer_forwards) {
duk_uint_fast32_t start_boff = 0;
duk_uint_fast32_t start_coff = 0;
@ -475,16 +474,6 @@ DUK_INTERNAL void duk_strcache_scan_char2byte_wtf8(duk_hthread *thr,
char_length = (duk_uint_fast32_t) duk_hstring_get_charlen(h);
DUK_ASSERT(char_offset <= char_length);
if (DUK_LIKELY(duk_hstring_is_ascii(h) != 0)) {
/* Must recheck because the 'is ascii' flag may be set
* lazily. Alternatively, we could just compare charlen
* to bytelen.
*/
*out_byteoff = char_offset;
*out_charoff = char_offset;
return;
}
/*
* For non-ASCII strings, we need to scan forwards or backwards
* from some starting point. The starting point may be the start

95
src-input/duk_heap_stringtable.c

@ -145,13 +145,12 @@ DUK_LOCAL void duk__strtable_assert_checks(duk_heap *heap) {
DUK_LOCAL duk_hstring *duk__strtable_alloc_hstring(duk_heap *heap,
const duk_uint8_t *str,
duk_uint32_t blen,
duk_uint32_t clen,
duk_uint32_t strhash,
const duk_uint8_t *extdata) {
duk_hstring *res;
const duk_uint8_t *data;
#if !defined(DUK_USE_HSTRING_ARRIDX)
duk_uarridx_t dummy;
#endif
duk_uarridx_t arridx;
DUK_ASSERT(heap != NULL);
DUK_UNREF(extdata);
@ -162,6 +161,7 @@ DUK_LOCAL duk_hstring *duk__strtable_alloc_hstring(duk_heap *heap,
DUK_D(DUK_DPRINT("16-bit string blen/clen active and blen over 16 bits, reject intern"));
goto alloc_error;
}
DUK_ASSERT(clen <= 0xffffUL);
#endif
/* XXX: Memzeroing the allocated structure is not really necessary
@ -209,52 +209,54 @@ DUK_LOCAL duk_hstring *duk__strtable_alloc_hstring(duk_heap *heap,
duk_hstring_set_bytelen(res, blen);
duk_hstring_set_hash(res, strhash);
DUK_ASSERT(!DUK_HSTRING_HAS_ARRIDX(res));
if (blen == clen) {
DUK_ASSERT(!duk_hstring_is_symbol_initial_byte(data[0])); /* blen > 0, clen = 0 for symbols */
DUK_ASSERT(duk_unicode_wtf8_charlength(data, (duk_size_t) blen) == blen);
DUK_HSTRING_SET_ASCII(res);
duk_hstring_set_charlen(res, blen);
arridx = duk_js_to_arrayindex_string(data, blen);
#if defined(DUK_USE_HSTRING_ARRIDX)
res->arridx = duk_js_to_arrayindex_string(data, blen);
if (res->arridx != DUK_HSTRING_NO_ARRAY_INDEX) {
#else
dummy = duk_js_to_arrayindex_string(data, blen);
if (dummy != DUK_HSTRING_NO_ARRAY_INDEX) {
res->arridx = arridx;
#endif
/* Array index strings cannot be symbol strings,
* and they're always pure ASCII so blen == clen.
*/
DUK_HSTRING_SET_ARRIDX(res);
DUK_HSTRING_SET_ASCII(res);
DUK_ASSERT(duk_unicode_wtf8_charlength(data, (duk_size_t) blen) == blen);
if (arridx != DUK_HSTRING_NO_ARRAY_INDEX) {
/* Array index strings cannot be symbol strings,
* and they're always pure ASCII so blen == clen.
*/
DUK_HSTRING_SET_ARRIDX(res);
} else {
DUK_ASSERT(!DUK_HSTRING_HAS_ARRIDX(res));
}
} else {
DUK_ASSERT(!DUK_HSTRING_HAS_ASCII(res));
DUK_ASSERT(!DUK_HSTRING_HAS_ARRIDX(res));
DUK_ASSERT(duk_js_to_arrayindex_string(data, blen) == DUK_HSTRING_NO_ARRAY_INDEX);
#if defined(DUK_USE_HSTRING_ARRIDX)
res->arridx = DUK_HSTRING_NO_ARRAY_INDEX;
#endif
/* Because 'data' is NUL-terminated, we don't need a
* blen > 0 check here. For NUL (0x00) the symbol
* checks will be false.
*/
duk_hstring_set_charlen(res, clen);
if (DUK_UNLIKELY(data[0] >= 0x80U)) {
if (data[0] <= 0x81) {
DUK_HSTRING_SET_SYMBOL(res);
DUK_ASSERT(clen == 0); /* Caller ensures */
} else if (data[0] == 0x82U || data[0] == 0xffU) {
DUK_HSTRING_SET_HIDDEN(res);
DUK_HSTRING_SET_SYMBOL(res);
DUK_ASSERT(clen == 0); /* Caller ensures */
}
}
/* Using an explicit 'ASCII' flag has larger footprint (one call site
* only) but is quite useful for the case when there's no explicit
* 'clen' in duk_hstring.
*
* The flag is set lazily for RAM strings.
*/
DUK_ASSERT(!DUK_HSTRING_HAS_ASCII(res));
#if defined(DUK_USE_HSTRING_LAZY_CLEN)
/* Charlen initialized to 0, updated on-the-fly. */
#else
duk_hstring_init_charlen(res); /* Also sets ASCII flag. */
#endif
}
DUK_DDD(DUK_DDDPRINT("interned string, hash=0x%08lx, blen=%ld, has_arridx=%ld, has_extdata=%ld",
DUK_DDD(DUK_DDDPRINT("interned string, hash=0x%08lx, blen=%ld, clen=%ld, has_arridx=%ld, has_extdata=%ld",
(unsigned long) duk_hstring_get_hash(res),
(long) duk_hstring_get_bytelen(res),
(long) duk_hstring_get_charlen(res),
(long) (DUK_HSTRING_HAS_ARRIDX(res) ? 1 : 0),
(long) (DUK_HSTRING_HAS_EXTDATA(res) ? 1 : 0)));
@ -564,7 +566,11 @@ DUK_LOCAL void duk__strtable_resize_torture(duk_heap *heap) {
* Raw intern; string already checked not to be present.
*/
DUK_LOCAL duk_hstring *duk__strtable_do_intern(duk_heap *heap, const duk_uint8_t *str, duk_uint32_t blen, duk_uint32_t strhash) {
DUK_LOCAL duk_hstring *duk__strtable_do_intern(duk_heap *heap,
const duk_uint8_t *str,
duk_uint32_t blen,
duk_uint32_t clen,
duk_uint32_t strhash) {
duk_hstring *res;
const duk_uint8_t *extdata;
#if defined(DUK_USE_STRTAB_PTRCOMP)
@ -573,10 +579,11 @@ DUK_LOCAL duk_hstring *duk__strtable_do_intern(duk_heap *heap, const duk_uint8_t
duk_hstring **slot;
#endif
DUK_DDD(DUK_DDDPRINT("do_intern: heap=%p, str=%p, blen=%lu, strhash=%lx, st_size=%lu, st_count=%lu, load=%lf",
DUK_DDD(DUK_DDDPRINT("do_intern: heap=%p, str=%p, blen=%lu, clen=%lu, strhash=%lx, st_size=%lu, st_count=%lu, load=%lf",
(void *) heap,
(const void *) str,
(unsigned long) blen,
(unsigned long) clen,
(unsigned long) strhash,
(unsigned long) heap->st_size,
(unsigned long) heap->st_count,
@ -635,7 +642,7 @@ DUK_LOCAL duk_hstring *duk__strtable_do_intern(duk_heap *heap, const duk_uint8_t
* a buffer used as a data area for 'str'.
*/
res = duk__strtable_alloc_hstring(heap, str, blen, strhash, extdata);
res = duk__strtable_alloc_hstring(heap, str, blen, clen, strhash, extdata);
/* Allow side effects again: GC must be avoided until duk_hstring
* result (if successful) has been INCREF'd.
@ -731,6 +738,7 @@ DUK_INTERNAL duk_hstring *duk_heap_strtable_intern(duk_heap *heap, const duk_uin
duk_uint8_t tmp[DUK__WTF8_INTERN_SHORT_LIMIT * 3];
duk_uint8_t *tmp_alloc = NULL;
duk_uint32_t blen_keep;
duk_uint32_t clen;
DUK_DDD(DUK_DDDPRINT("intern check: heap=%p, str=%p, blen=%lu", (void *) heap, (const void *) str, (unsigned long) blen));
@ -745,7 +753,7 @@ DUK_INTERNAL duk_hstring *duk_heap_strtable_intern(duk_heap *heap, const duk_uin
* with U+FFFD replacements and combining valid surrogate pairs. Optimize
* for not needing to do so, i.e. input string is already valid WTF-8.
*/
clen = 0;
blen_keep = duk_unicode_wtf8_sanitize_keepcheck(str, blen);
DUK_ASSERT(blen_keep <= blen);
if (DUK_LIKELY(blen_keep == blen)) {
@ -756,24 +764,28 @@ DUK_INTERNAL duk_hstring *duk_heap_strtable_intern(duk_heap *heap, const duk_uin
* Also Symbol strings are handled here now: keepcheck must
* return blen_keep == blen for them.
*/
DUK_STATS_INC(heap, stats_strtab_intern_notemp);
} else {
/* 'blen_keep' bytes can be kept, but the rest may need
* some rewrites.
*
* Symbols are handled above (keep without rewrite).
*/
duk_uint32_t blen_remain = blen - blen_keep;
duk_uint32_t blen_remain;
DUK_STATS_INC(heap, stats_strtab_intern_temp);
blen_remain = blen - blen_keep;
if (DUK_LIKELY(blen <= DUK__WTF8_INTERN_SHORT_LIMIT)) {
duk_uint32_t new_blen;
duk_uint32_t new_clen;
duk_memcpy((void *) tmp, (const void *) str, blen_keep);
new_blen = duk_unicode_wtf8_sanitize_string(str + blen_keep, blen_remain, tmp + blen_keep);
new_blen = duk_unicode_wtf8_sanitize_string(str + blen_keep, blen_remain, tmp + blen_keep, &new_clen);
str = tmp;
blen = blen_keep + new_blen;
} else {
duk_uint32_t blen_alloc;
duk_uint32_t new_blen;
duk_uint32_t new_clen;
heap->pf_prevent_count++;
DUK_ASSERT(heap->pf_prevent_count != 0); /* Wrap. */
@ -797,7 +809,7 @@ DUK_INTERNAL duk_hstring *duk_heap_strtable_intern(duk_heap *heap, const duk_uin
}
duk_memcpy((void *) tmp_alloc, (const void *) str, blen_keep);
new_blen = duk_unicode_wtf8_sanitize_string(str + blen_keep, blen_remain, tmp_alloc + blen_keep);
new_blen = duk_unicode_wtf8_sanitize_string(str + blen_keep, blen_remain, tmp_alloc + blen_keep, &new_clen);
str = tmp_alloc;
blen = blen_keep + new_blen;
}
@ -841,7 +853,10 @@ DUK_INTERNAL duk_hstring *duk_heap_strtable_intern(duk_heap *heap, const duk_uin
/* Not found in string table; insert. */
DUK_STATS_INC(heap, stats_strtab_intern_miss);
h = duk__strtable_do_intern(heap, str, blen, strhash);
/* For now compute final charlen here, should be done inline in WTF-8 sanitize. */
clen = duk_hstring_is_symbol_initial_byte(str[0]) ? 0 : duk_unicode_wtf8_charlength(str, blen);
h = duk__strtable_do_intern(heap, str, blen, clen, strhash);
goto done;
done:

12
src-input/duk_hstring.h

@ -36,12 +36,6 @@
/* Maximum string charlen equals maximum bytelen for the ASCII case. */
#define DUK_HSTRING_MAX_CHARLEN DUK_HSTRING_MAX_BYTELEN
/* XXX: could add flags for "is valid CESU-8" (ECMAScript compatible strings),
* "is valid UTF-8", "is valid extended UTF-8" (internal strings are not,
* regexp bytecode is), and "contains non-BMP characters". These are not
* needed right now.
*/
/* With lowmem builds the high 16 bits of duk_heaphdr are used for other
* purposes, so this leaves 7 duk_heaphdr flags and 9 duk_hstring flags.
*/
@ -173,15 +167,13 @@ struct duk_hstring_external {
DUK_INTERNAL_DECL duk_bool_t duk_hstring_is_ascii(duk_hstring *h);
DUK_INTERNAL_DECL duk_bool_t duk_hstring_is_empty(duk_hstring *h);
DUK_INTERNAL_DECL duk_bool_t duk_hstring_is_symbol_initial_byte(duk_uint8_t t);
DUK_INTERNAL_DECL duk_uint32_t duk_hstring_get_hash(duk_hstring *h);
DUK_INTERNAL_DECL void duk_hstring_set_hash(duk_hstring *h, duk_uint32_t hash);
DUK_INTERNAL_DECL duk_size_t duk_hstring_get_bytelen(duk_hstring *h);
DUK_INTERNAL_DECL void duk_hstring_set_bytelen(duk_hstring *h, duk_size_t len);
DUK_INTERNAL_DECL duk_size_t duk_hstring_get_charlen(duk_hstring *h);
#if !defined(DUK_USE_HSTRING_LAZY_CLEN)
DUK_INTERNAL_DECL void duk_hstring_init_charlen(duk_hstring *h);
#endif
/* No duk_hstring_set_charlen(), set via duk_hstring_init_charlen(). */
DUK_INTERNAL_DECL void duk_hstring_set_charlen(duk_hstring *h, duk_size_t len);
DUK_INTERNAL_DECL duk_uarridx_t duk_hstring_get_arridx_fast(duk_hstring *h);
DUK_INTERNAL_DECL duk_uarridx_t duk_hstring_get_arridx_fast_known(duk_hstring *h);
DUK_INTERNAL_DECL duk_uarridx_t duk_hstring_get_arridx_slow(duk_hstring *h);

106
src-input/duk_hstring_misc.c

@ -9,13 +9,7 @@
*/
DUK_INTERNAL duk_bool_t duk_hstring_is_ascii(duk_hstring *h) {
#if 0
/* Slightly smaller code without explicit flag, but explicit flag
* is very useful when 'clen' is dropped.
*/
return duk_hstring_get_bytelen(h) == duk_hstring_get_charlen(h);
#endif
return DUK_HSTRING_HAS_ASCII(h); /* lazily set! */
return DUK_HSTRING_HAS_ASCII(h);
}
DUK_INTERNAL duk_bool_t duk_hstring_is_empty(duk_hstring *h) {
@ -69,87 +63,33 @@ DUK_INTERNAL const duk_uint8_t *duk_hstring_get_data_end(duk_hstring *h) {
}
/*
* duk_hstring charlen, when lazy charlen disabled.
* duk_hstring charlen
*/
#if !defined(DUK_USE_HSTRING_LAZY_CLEN)
#if !defined(DUK_USE_HSTRING_CLEN)
#error non-lazy duk_hstring charlen but DUK_USE_HSTRING_CLEN not set
#endif
DUK_INTERNAL void duk_hstring_init_charlen(duk_hstring *h) {
duk_uint32_t clen;
DUK_ASSERT(h != NULL);
DUK_ASSERT(!DUK_HSTRING_HAS_ASCII(h));
DUK_ASSERT(!DUK_HEAPHDR_HAS_READONLY((duk_heaphdr *) h));
if (DUK_HSTRING_HAS_SYMBOL(h)) {
clen = 0;
} else {
clen = (duk_uint32_t) duk_unicode_wtf8_charlength(duk_hstring_get_data(h), duk_hstring_get_bytelen(h));
}
DUK_INTERNAL void duk_hstring_set_charlen(duk_hstring *h, duk_size_t len) {
#if defined(DUK_USE_HSTRING_CLEN)
#if defined(DUK_USE_STRLEN16)
DUK_ASSERT(clen <= 0xffffUL); /* Bytelength checked during interning. */
h->clen16 = (duk_uint16_t) clen;
DUK_ASSERT(len <= 0xffffUL);
h->clen16 = len;
#else
h->clen = (duk_uint32_t) clen;
DUK_ASSERT(len <= 0xffffffffUL);
h->clen = len;
#endif
if (DUK_LIKELY(clen == duk_hstring_get_bytelen(h))) {
DUK_HSTRING_SET_ASCII(h);
}
}
DUK_INTERNAL DUK_HOT duk_size_t duk_hstring_get_charlen(duk_hstring *h) {
#if defined(DUK_USE_STRLEN16)
return h->clen16;
#else
return h->clen;
DUK_UNREF(len);
#endif
}
#endif /* !DUK_USE_HSTRING_LAZY_CLEN */
/*
* duk_hstring charlen, when lazy charlen enabled.
*/
#if defined(DUK_USE_HSTRING_LAZY_CLEN)
#if defined(DUK_USE_HSTRING_CLEN)
DUK_LOCAL DUK_COLD duk_size_t duk__hstring_get_charlen_slowpath(duk_hstring *h) {
duk_size_t res;
DUK_ASSERT(h->clen == 0); /* Checked by caller. */
#if defined(DUK_USE_ROM_STRINGS)
/* ROM strings have precomputed clen, but if the computed clen is zero
* we can still come here and can't write anything.
*/
if (DUK_HEAPHDR_HAS_READONLY((duk_heaphdr *) h)) {
return 0;
}
#endif
if (DUK_HSTRING_HAS_SYMBOL(h)) {
return 0;
}
res = duk_unicode_wtf8_charlength(duk_hstring_get_data(h), duk_hstring_get_bytelen(h));
#if defined(DUK_USE_STRLEN16)
DUK_ASSERT(res <= 0xffffUL); /* Bytelength checked during interning. */
h->clen16 = (duk_uint16_t) res;
#else
h->clen = (duk_uint32_t) res;
#endif
if (DUK_LIKELY(res == duk_hstring_get_bytelen(h))) {
DUK_HSTRING_SET_ASCII(h);
}
return res;
}
#else /* DUK_USE_HSTRING_CLEN */
DUK_LOCAL duk_size_t duk__hstring_get_charlen_slowpath(duk_hstring *h) {
if (DUK_LIKELY(DUK_HSTRING_HAS_ASCII(h))) {
/* Most practical strings will go here. */
DUK_ASSERT(!DUK_HSTRING_HAS_SYMBOL(h));
return duk_hstring_get_bytelen(h);
} else {
/* ASCII flag is lazy, so set it here. */
duk_size_t res;
/* XXX: here we could use the strcache to speed up the
@ -161,34 +101,17 @@ DUK_LOCAL duk_size_t duk__hstring_get_charlen_slowpath(duk_hstring *h) {
}
res = duk_unicode_wtf8_charlength(duk_hstring_get_data(h), duk_hstring_get_bytelen(h));
#if defined(DUK_USE_ROM_STRINGS)
if (DUK_HEAPHDR_HAS_READONLY((duk_heaphdr *) h)) {
/* For ROM strings, can't write anything; ASCII flag
* is preset so we don't need to update it.
*/
return res;
}
#endif
if (DUK_LIKELY(res == duk_hstring_get_bytelen(h))) {
DUK_HSTRING_SET_ASCII(h);
}
return res;
}
}
#endif /* DUK_USE_HSTRING_CLEN */
#if defined(DUK_USE_HSTRING_CLEN)
DUK_INTERNAL DUK_HOT duk_size_t duk_hstring_get_charlen(duk_hstring *h) {
#if defined(DUK_USE_STRLEN16)
if (DUK_LIKELY(h->clen16 != 0)) {
return h->clen16;
}
return h->clen16;
#else
if (DUK_LIKELY(h->clen != 0)) {
return h->clen;
}
return h->clen;
#endif
return duk__hstring_get_charlen_slowpath(h);
}
#else /* DUK_USE_HSTRING_CLEN */
DUK_INTERNAL DUK_HOT duk_size_t duk_hstring_get_charlen(duk_hstring *h) {
@ -196,7 +119,6 @@ DUK_INTERNAL DUK_HOT duk_size_t duk_hstring_get_charlen(duk_hstring *h) {
return duk__hstring_get_charlen_slowpath(h);
}
#endif /* DUK_USE_HSTRING_CLEN */
#endif /* DUK_USE_HSTRING_LAZY_CLEN */
/*
* duk_hstring charCodeAt, with and without surrogate awareness.
@ -287,3 +209,7 @@ DUK_INTERNAL duk_bool_t duk_hstring_equals_ascii_cstring(duk_hstring *h, const c
}
return 0;
}
/* Check whether byte 't' is one of the initial bytes treated as a Symbol
 * marker: 0x80, 0x81, 0x82, or 0xff.  Returns 1 if so, 0 otherwise.
 */
DUK_INTERNAL duk_bool_t duk_hstring_is_symbol_initial_byte(duk_uint8_t t) {
	if (t < 0x80U) {
		/* ASCII range, never a Symbol marker. */
		return 0;
	}
	if (t == 0xffU) {
		return 1;
	}
	return (t <= 0x82U) ? 1 : 0;
}

20
src-input/duk_unicode.h

@ -256,9 +256,18 @@ DUK_INTERNAL_DECL duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *
DUK_INTERNAL_DECL duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t cp);
#endif
DUK_INTERNAL_DECL duk_ucodepoint_t duk_unicode_wtf8_decode_known(const duk_uint8_t *p);
DUK_INTERNAL_DECL duk_uint32_t duk_unicode_wtf8_sanitize_string(const duk_uint8_t *str, duk_uint32_t blen, duk_uint8_t *out);
DUK_INTERNAL_DECL duk_uint32_t duk_unicode_wtf8_sanitize_symbol(const duk_uint8_t *str, duk_uint32_t blen, duk_uint8_t *out);
DUK_INTERNAL_DECL duk_uint32_t duk_unicode_wtf8_sanitize_detect(const duk_uint8_t *str, duk_uint32_t blen, duk_uint8_t *out);
/* WTF-8 sanitize 'blen' bytes from 'str' into 'out'.  Returns the number of
 * bytes written to 'out'; the character length computed for the sanitized
 * output is stored to '*out_charlen' (must be non-NULL).  Caller must ensure
 * 'out' has enough space for maximum expansion, 3x input.
 */
DUK_INTERNAL_DECL duk_uint32_t duk_unicode_wtf8_sanitize_string(const duk_uint8_t *str,
duk_uint32_t blen,
duk_uint8_t *out,
duk_uint32_t *out_charlen);
DUK_INTERNAL_DECL duk_uint32_t duk_unicode_wtf8_sanitize_symbol(const duk_uint8_t *str,
duk_uint32_t blen,
duk_uint8_t *out,
duk_uint32_t *out_charlen);
DUK_INTERNAL_DECL duk_uint32_t duk_unicode_wtf8_sanitize_detect(const duk_uint8_t *str,
duk_uint32_t blen,
duk_uint8_t *out,
duk_uint32_t *out_charlen);
DUK_INTERNAL_DECL duk_uint32_t duk_unicode_wtf8_sanitize_keepcheck(const duk_uint8_t *str, duk_uint32_t blen);
DUK_INTERNAL_DECL duk_size_t duk_unicode_wtf8_charlength(const duk_uint8_t *data, duk_size_t blen);
DUK_INTERNAL_DECL duk_hstring *duk_push_wtf8_substring_hstring(duk_hthread *thr,
@ -266,6 +275,7 @@ DUK_INTERNAL_DECL duk_hstring *duk_push_wtf8_substring_hstring(duk_hthread *thr,
duk_size_t start_offset,
duk_size_t end_offset);
DUK_INTERNAL_DECL duk_bool_t duk_unicode_is_valid_wtf8(const duk_uint8_t *data, duk_size_t blen);
DUK_INTERNAL_DECL duk_bool_t duk_unicode_is_valid_utf8(const duk_uint8_t *data, duk_size_t blen);
DUK_INTERNAL_DECL void duk_unicode_wtf8_to_cesu8(duk_hthread *thr, const duk_uint8_t *data, duk_size_t blen);
DUK_INTERNAL_DECL duk_ucodepoint_t duk_unicode_wtf8_charcodeat_helper(duk_hthread *thr,
duk_hstring *h,
@ -273,11 +283,11 @@ DUK_INTERNAL_DECL duk_ucodepoint_t duk_unicode_wtf8_charcodeat_helper(duk_hthrea
duk_bool_t surrogate_aware);
DUK_INTERNAL_DECL duk_int_t duk_unicode_wtf8_search_forwards(duk_hthread *thr,
duk_hstring *h_input,
duk_hstring *h_match,
duk_hstring *h_search,
duk_uint32_t start_charoff);
DUK_INTERNAL_DECL duk_int_t duk_unicode_wtf8_search_backwards(duk_hthread *thr,
duk_hstring *h_input,
duk_hstring *h_match,
duk_hstring *h_search,
duk_uint32_t start_charoff);
#endif /* DUK_UNICODE_H_INCLUDED */

346
src-input/duk_unicode_wtf8.c

@ -4,8 +4,7 @@
#include "duk_internal.h"
/* Check whether a byte sequence is valid WTF-8. */
DUK_INTERNAL duk_bool_t duk_unicode_is_valid_wtf8(const duk_uint8_t *data, duk_size_t blen) {
DUK_LOCAL duk_bool_t duk__unicode_is_valid_wtf8_or_utf8(const duk_uint8_t *data, duk_size_t blen, duk_bool_t allow_wtf8) {
const duk_uint8_t *p;
const duk_uint8_t *p_end;
@ -34,8 +33,20 @@ DUK_INTERNAL duk_bool_t duk_unicode_is_valid_wtf8(const duk_uint8_t *data, duk_s
return 0;
}
} else if (t <= 0xefU) {
duk_uint8_t lower = (t == 0xe0U ? 0xa0U : 0x80U);
if (p_end - p >= 3 && p[1] >= lower && p[1] <= 0xbfU && p[2] >= 0x80U && p[2] <= 0xbfU) {
/* The only difference to valid UTF-8 is that in WTF-8
* codepoints U+D800 to U+DFFF are allowed (encoded
* forms ED A0 80 to ED BF BF).
*/
duk_uint8_t lower;
duk_uint8_t upper;
if (allow_wtf8) {
lower = (t == 0xe0U ? 0xa0U : 0x80U);
upper = 0xbfU;
} else {
lower = (t == 0xe0U ? 0xa0U : 0x80U);
upper = (t == 0xedU ? 0x9fU : 0xbfU);
}
if (p_end - p >= 3 && p[1] >= lower && p[1] <= upper && p[2] >= 0x80U && p[2] <= 0xbfU) {
p += 3;
} else {
return 0;
@ -61,21 +72,35 @@ DUK_INTERNAL duk_bool_t duk_unicode_is_valid_wtf8(const duk_uint8_t *data, duk_s
return 1;
}
/* Return non-zero if the 'blen' bytes at 'data' form valid WTF-8, i.e. the
 * shared validator is run with surrogate codepoints (U+D800 to U+DFFF)
 * allowed.
 */
DUK_INTERNAL duk_bool_t duk_unicode_is_valid_wtf8(const duk_uint8_t *data, duk_size_t blen) {
	duk_bool_t is_valid;

	is_valid = duk__unicode_is_valid_wtf8_or_utf8(data, blen, 1 /* allow_wtf8: accept surrogates */);
	return is_valid;
}
/* Return non-zero if the 'blen' bytes at 'data' form valid UTF-8, i.e. the
 * shared validator is run in strict mode where encoded surrogate codepoints
 * (U+D800 to U+DFFF) are rejected.
 */
DUK_INTERNAL duk_bool_t duk_unicode_is_valid_utf8(const duk_uint8_t *data, duk_size_t blen) {
	duk_bool_t is_valid;

	is_valid = duk__unicode_is_valid_wtf8_or_utf8(data, blen, 0 /* allow_wtf8: reject surrogates */);
	return is_valid;
}
/* Straightforward reference implementation for the WTF-8 sanitization algorithm.
* Caller must ensure 'out' has enough space for maximum expansion, 3x input.
*/
DUK_LOCAL duk_uint32_t duk__unicode_wtf8_sanitize_string_reference(const duk_uint8_t *str,
duk_uint32_t str_blen,
duk_uint8_t *out_data) {
duk_uint8_t *out_data,
duk_uint32_t *out_charlen) {
const duk_uint8_t *p;
const duk_uint8_t *p_end;
duk_uint8_t *q;
duk_uint32_t out_clen_sub = 0; /* Output charlen = out_blen - out_clen_sub. */
duk_uint32_t out_blen;
duk_uint32_t out_clen;
duk_bool_t have_non_bmp = 0;
duk_bool_t have_non_utf8 = 0;
DUK_ASSERT(str_blen == 0 || str != NULL);
DUK_ASSERT(out_data != NULL);
DUK_ASSERT(out_charlen != NULL);
p = str;
p_end = str + str_blen;
@ -174,9 +199,11 @@ DUK_LOCAL duk_uint32_t duk__unicode_wtf8_sanitize_string_reference(const duk_uin
p += 3;
} else {
/* Keep 'cp' as is. */
have_non_utf8 = 1;
}
} else {
/* Unpaired low surrogate, keep as is. */
have_non_utf8 = 1;
}
}
@ -184,20 +211,33 @@ DUK_LOCAL duk_uint32_t duk__unicode_wtf8_sanitize_string_reference(const duk_uin
DUK_ASSERT(cp >= 0x80UL); /* ASCII handled already. */
DUK_ASSERT(cp <= 0x10ffffUL); /* Decode upper/lower ranges ensure this. */
if (cp <= 0x7ffUL) {
*q++ = 0xc0U + (cp >> 6U);
*q++ = 0x80U + (cp & 0x3fU);
duk_uint8_t b1, b2;
b1 = 0xc0U + (cp >> 6U);
b2 = 0x80U + (cp & 0x3fU);
*q++ = b1;
*q++ = b2;
out_clen_sub += 1;
} else if (cp <= 0xffffUL) {
*q++ = 0xe0U + (cp >> 12U);
*q++ = 0x80U + ((cp >> 6U) & 0x3fU);
*q++ = 0x80U + (cp & 0x3fU);
duk_uint8_t b1, b2, b3;
b1 = 0xe0U + (cp >> 12U);
b2 = 0x80U + ((cp >> 6U) & 0x3fU);
b3 = 0x80U + (cp & 0x3fU);
*q++ = b1;
*q++ = b2;
*q++ = b3;
out_clen_sub += 2;
} else {
*q++ = 0xf0U + (cp >> 18U);
*q++ = 0x80U + ((cp >> 12U) & 0x3fU);
*q++ = 0x80U + ((cp >> 6U) & 0x3fU);
*q++ = 0x80U + (cp & 0x3fU);
duk_uint8_t b1, b2, b3, b4;
b1 = 0xf0U + (cp >> 18U);
b2 = 0x80U + ((cp >> 12U) & 0x3fU);
b3 = 0x80U + ((cp >> 6U) & 0x3fU);
b4 = 0x80U + (cp & 0x3fU);
*q++ = b1;
*q++ = b2;
*q++ = b3;
*q++ = b4;
out_clen_sub += 3;
have_non_bmp = 1;
}
continue;
@ -211,7 +251,7 @@ DUK_LOCAL duk_uint32_t duk__unicode_wtf8_sanitize_string_reference(const duk_uin
out_blen = (duk_uint32_t) (q - out_data);
DUK_ASSERT(out_clen_sub <= out_blen);
out_clen = out_blen - out_clen_sub;
DUK_UNREF(out_clen);
*out_charlen = out_clen;
return out_blen;
}
@ -219,7 +259,8 @@ DUK_LOCAL duk_uint32_t duk__unicode_wtf8_sanitize_string_reference(const duk_uin
/* Sanitize Symbol reference, for now copied 1:1. */
DUK_LOCAL duk_uint32_t duk__unicode_wtf8_sanitize_symbol_reference(const duk_uint8_t *str,
duk_uint32_t str_blen,
duk_uint8_t *out) {
duk_uint8_t *out,
duk_uint32_t *out_charlen) {
DUK_ASSERT(str_blen == 0 || str != NULL);
DUK_ASSERT(out != NULL);
@ -227,15 +268,24 @@ DUK_LOCAL duk_uint32_t duk__unicode_wtf8_sanitize_symbol_reference(const duk_uin
return str_blen;
}
DUK_INTERNAL duk_uint32_t duk_unicode_wtf8_sanitize_symbol(const duk_uint8_t *str, duk_uint32_t str_blen, duk_uint8_t *out) {
return duk__unicode_wtf8_sanitize_symbol_reference(str, str_blen, out);
DUK_INTERNAL duk_uint32_t duk_unicode_wtf8_sanitize_symbol(const duk_uint8_t *str,
duk_uint32_t str_blen,
duk_uint8_t *out,
duk_uint32_t *out_charlen) {
return duk__unicode_wtf8_sanitize_symbol_reference(str, str_blen, out, out_charlen);
}
DUK_INTERNAL duk_uint32_t duk_unicode_wtf8_sanitize_string(const duk_uint8_t *str, duk_uint32_t str_blen, duk_uint8_t *out) {
return duk__unicode_wtf8_sanitize_string_reference(str, str_blen, out);
DUK_INTERNAL duk_uint32_t duk_unicode_wtf8_sanitize_string(const duk_uint8_t *str,
duk_uint32_t str_blen,
duk_uint8_t *out,
duk_uint32_t *out_charlen) {
return duk__unicode_wtf8_sanitize_string_reference(str, str_blen, out, out_charlen);
}
DUK_INTERNAL duk_uint32_t duk_unicode_wtf8_sanitize_detect(const duk_uint8_t *str, duk_uint32_t str_blen, duk_uint8_t *out) {
DUK_INTERNAL duk_uint32_t duk_unicode_wtf8_sanitize_detect(const duk_uint8_t *str,
duk_uint32_t str_blen,
duk_uint8_t *out,
duk_uint32_t *out_charlen) {
duk_bool_t symbol = 0;
DUK_ASSERT(str_blen == 0 || str != NULL);
@ -253,9 +303,9 @@ DUK_INTERNAL duk_uint32_t duk_unicode_wtf8_sanitize_detect(const duk_uint8_t *st
}
if (DUK_UNLIKELY(symbol)) {
return duk_unicode_wtf8_sanitize_symbol(str, str_blen, out);
return duk_unicode_wtf8_sanitize_symbol(str, str_blen, out, out_charlen);
} else {
return duk_unicode_wtf8_sanitize_string(str, str_blen, out);
return duk_unicode_wtf8_sanitize_string(str, str_blen, out, out_charlen);
}
}
@ -276,7 +326,7 @@ DUK_LOCAL duk_uint32_t duk__unicode_wtf8_sanitize_asciicheck_reference(const duk
}
#endif
DUK_LOCAL DUK_NOINLINE duk_uint32_t duk__unicode_wtf8_sanitize_asciicheck_optimized(const duk_uint8_t *str, duk_uint32_t blen) {
DUK_LOCAL duk_uint32_t duk__unicode_wtf8_sanitize_asciicheck_optimized(const duk_uint8_t *str, duk_uint32_t blen) {
const duk_uint8_t *p;
const duk_uint8_t *p_end;
const duk_uint32_t *p32;
@ -441,8 +491,7 @@ DUK_INTERNAL duk_hstring *duk_push_wtf8_substring_hstring(duk_hthread *thr,
duk_uint32_t end_byteoff;
duk_uint32_t end_charoff;
/* ASCII fast path. */
if (duk_hstring_get_charlen(h_input) == duk_hstring_get_bytelen(h_input)) {
if (duk_hstring_is_ascii(h_input)) {
duk_push_lstring(thr,
(const char *) (duk_hstring_get_data(h_input) + start_offset),
(duk_size_t) (end_offset - start_offset));
@ -561,38 +610,39 @@ done:
/* Find a string from within an input string. Must account for non-BMP codepoints,
* e.g. search string may start with a low surrogate which must be correctly matched
* with combined surrogates in the input.
* with combined surrogates in the input. Empty string always matches.
*
* Empty string always matches.
* WTF-8 complicates the string search because one can't do a simple byte comparison
* in some cases.
*/
/* Naive implementation for reference. */
DUK_LOCAL duk_int_t duk__unicode_wtf8_search_forwards_reference(duk_hthread *thr,
duk_hstring *h_input,
duk_hstring *h_match,
duk_hstring *h_search,
duk_uint32_t start_charoff) {
duk_uint32_t match_charlen;
duk_uint32_t search_charlen;
duk_uint32_t input_charlen;
duk_uint32_t charoff;
input_charlen = duk_hstring_get_charlen(h_input);
match_charlen = duk_hstring_get_charlen(h_match);
DUK_DD(DUK_DDPRINT("input_charlen=%ld, match_charlen=%d, start_charoff=%ld",
search_charlen = duk_hstring_get_charlen(h_search);
DUK_DD(DUK_DDPRINT("input_charlen=%ld, search_charlen=%d, start_charoff=%ld",
(long) input_charlen,
(long) match_charlen,
(long) search_charlen,
(long) start_charoff));
for (charoff = start_charoff; charoff <= input_charlen; charoff++) {
/* Must scan to charoff == input_charlen for zero length input. */
DUK_DDD(DUK_DDDPRINT("wtf8 find, charoff=%ld", (long) charoff));
if (charoff + match_charlen <= input_charlen) {
if (charoff + search_charlen <= input_charlen) {
duk_hstring *h_tmp;
h_tmp = duk_push_wtf8_substring_hstring(thr, h_input, charoff, charoff + match_charlen);
DUK_DDD(DUK_DDDPRINT("substring=%!O, match=%!O", h_tmp, h_match));
h_tmp = duk_push_wtf8_substring_hstring(thr, h_input, charoff, charoff + search_charlen);
DUK_DDD(DUK_DDDPRINT("substring=%!O, match=%!O", h_tmp, h_search));
/* Rely on string interning! */
if (h_tmp == h_match) {
if (h_tmp == h_search) {
duk_pop_unsafe(thr);
return (duk_int_t) charoff;
}
@ -602,41 +652,134 @@ DUK_LOCAL duk_int_t duk__unicode_wtf8_search_forwards_reference(duk_hthread *thr
return -1;
}
/* Search forwards with byte compare, taking advantage of the search string being
 * valid UTF-8 (= no unpaired surrogate vs non-BMP matches).
 *
 * Returns the ECMAScript character offset (non-BMP characters count as two
 * characters) of the first match at or after 'start_charoff', or -1 if there
 * is no match.  An empty search string matches at 'start_charoff' itself,
 * same as in the reference implementation.
 */
DUK_LOCAL DUK_ALWAYS_INLINE duk_int_t duk__unicode_wtf8_search_forwards_1_utf8(duk_hthread *thr,
                                                                               duk_hstring *h_input,
                                                                               duk_hstring *h_search,
                                                                               duk_uint32_t start_charoff) {
	duk_uint32_t start_boff;
	duk_uint32_t start_coff;
	duk_uint32_t curr_coff;
	const duk_uint8_t *p;
	duk_size_t p_len;
	const duk_uint8_t *q;
	duk_size_t q_len;
	duk_int_t i;
	duk_int_t i_limit;

	/* Caller ensures. */
	DUK_ASSERT(duk_unicode_is_valid_wtf8(duk_hstring_get_data(h_input), duk_hstring_get_bytelen(h_input)));
	DUK_ASSERT(duk_unicode_is_valid_utf8(duk_hstring_get_data(h_search), duk_hstring_get_bytelen(h_search)));

	p = duk_hstring_get_data_and_bytelen(h_input, &p_len);
	q = duk_hstring_get_data_and_bytelen(h_search, &q_len);

	/* Empty search string always matches at the requested offset.  This
	 * must be handled before the start offset adjustment below: if the
	 * offset points to the middle of a non-BMP character, the adjustment
	 * would skip the character and the zero length compare would then
	 * report a match one character too late.
	 */
	if (q_len == 0U) {
		if (start_charoff <= (duk_uint32_t) duk_hstring_get_charlen(h_input)) {
			return (duk_int_t) start_charoff;
		}
		return -1;
	}

	/* For a valid UTF-8 search string we can just go on matching the
	 * search string byte-for-byte.  However, we need to keep track of
	 * the logical ECMAScript characters we pass (counting non-BMP
	 * characters as 2 characters each).  The compare is attempted at
	 * every byte offset; a compare starting at a continuation byte
	 * always fails because a non-empty valid UTF-8 search string never
	 * begins with a continuation byte.
	 */

	/* Get start offset for search.  The offset may point to the middle of
	 * a non-BMP character in which case we'll be off by 1 character offset.
	 * In that case we can skip the non-BMP character because the low surrogate
	 * can't match a valid UTF-8 search string.
	 */
	duk_strcache_scan_char2byte_wtf8(thr, h_input, start_charoff, &start_boff, &start_coff);
	if (start_charoff != start_coff) {
		start_coff += 2;
		start_boff += 4;
	}

	/* Last byte offset where the search string still fits; negative if
	 * the search string is longer than the input (loop body never runs).
	 */
	i_limit = (duk_int_t) p_len - (duk_int_t) q_len;

	curr_coff = start_coff;
	for (i = (duk_int_t) start_boff; i <= i_limit; i++) {
		duk_uint8_t t;

		DUK_ASSERT((duk_size_t) i + q_len <= p_len);
		if (duk_memcmp((const void *) (p + i), (const void *) q, q_len) == 0) {
			return (duk_int_t) curr_coff;
		}

		/* Advance the character offset based on the byte we step over:
		 * ASCII and 2-3 byte lead bytes are one character, a 4-byte
		 * lead byte (non-BMP) is two, continuation bytes are zero.
		 */
		t = p[i];
		if (t < 0x80U) {
			curr_coff++;
		} else if (t >= 0xf0U) {
			curr_coff += 2;
		} else if (t >= 0xc0U) {
			curr_coff += 1;
		}
	}

	return -1;
}
/* Forward search dispatcher.  A search string which is valid UTF-8 contains
 * no unpaired surrogates (CESU-8 style [U+D800,U+DFFF]) and can be located
 * with a plain byte compare.  Any other search string is handed to the
 * reference implementation.
 */
DUK_LOCAL duk_int_t duk__unicode_wtf8_search_forwards_1(duk_hthread *thr,
                                                        duk_hstring *h_input,
                                                        duk_hstring *h_search,
                                                        duk_uint32_t start_charoff) {
	duk_bool_t is_utf8;

	is_utf8 = duk_unicode_is_valid_utf8(duk_hstring_get_data(h_search), duk_hstring_get_bytelen(h_search));

	/* Unpaired surrogates in the search string: reference path. */
	if (DUK_UNLIKELY(!is_utf8)) {
		return duk__unicode_wtf8_search_forwards_reference(thr, h_input, h_search, start_charoff);
	}

	/* Valid UTF-8: byte compare path. */
	return duk__unicode_wtf8_search_forwards_1_utf8(thr, h_input, h_search, start_charoff);
}
DUK_INTERNAL duk_int_t duk_unicode_wtf8_search_forwards(duk_hthread *thr,
duk_hstring *h_input,
duk_hstring *h_match,
duk_hstring *h_search,
duk_uint32_t start_charoff) {
return duk__unicode_wtf8_search_forwards_reference(thr, h_input, h_match, start_charoff);
DUK_HTHREAD_ASSERT_VALID(thr);
DUK_HSTRING_ASSERT_VALID(h_input);
DUK_HSTRING_ASSERT_VALID(h_search);
#if 0
return duk__unicode_wtf8_search_forwards_reference(thr, h_input, h_search, start_charoff);
#endif
return duk__unicode_wtf8_search_forwards_1(thr, h_input, h_search, start_charoff);
}
/* Naive implementation for reference. */
DUK_LOCAL duk_int_t duk__unicode_wtf8_search_backwards_reference(duk_hthread *thr,
duk_hstring *h_input,
duk_hstring *h_match,
duk_hstring *h_search,
duk_uint32_t start_charoff) {
duk_uint32_t match_charlen;
duk_uint32_t search_charlen;
duk_uint32_t input_charlen;
duk_int_t i;
duk_uint32_t charoff;
input_charlen = duk_hstring_get_charlen(h_input);
match_charlen = duk_hstring_get_charlen(h_match);
DUK_DD(DUK_DDPRINT("input_charlen=%ld, match_charlen=%d, start_charoff=%ld",
search_charlen = duk_hstring_get_charlen(h_search);
DUK_DD(DUK_DDPRINT("input_charlen=%ld, search_charlen=%d, start_charoff=%ld",
(long) input_charlen,
(long) match_charlen,
(long) search_charlen,
(long) start_charoff));
for (i = (duk_int_t) start_charoff; i >= 0; i--) {
charoff = (duk_uint32_t) i;
DUK_DDD(DUK_DDDPRINT("wtf8 find, charoff=%ld", (long) charoff));
if (charoff + match_charlen <= input_charlen) {
if (charoff + search_charlen <= input_charlen) {
duk_hstring *h_tmp;
h_tmp = duk_push_wtf8_substring_hstring(thr, h_input, charoff, charoff + match_charlen);
DUK_DDD(DUK_DDDPRINT("substring=%!O, match=%!O", h_tmp, h_match));
h_tmp = duk_push_wtf8_substring_hstring(thr, h_input, charoff, charoff + search_charlen);
DUK_DDD(DUK_DDDPRINT("substring=%!O, match=%!O", h_tmp, h_search));
/* Rely on string interning! */
if (h_tmp == h_match) {
if (h_tmp == h_search) {
duk_pop_unsafe(thr);
return (duk_int_t) charoff;
}
@ -646,11 +789,112 @@ DUK_LOCAL duk_int_t duk__unicode_wtf8_search_backwards_reference(duk_hthread *th
return -1;
}
/* Search backwards with byte compare, taking advantage of the search string being
 * valid UTF-8 (= no unpaired surrogate vs non-BMP matches).
 *
 * Returns the ECMAScript character offset (non-BMP characters count as two
 * characters) of the last match starting at or before 'start_charoff', or -1
 * if there is no match.  An empty search string matches at 'start_charoff'
 * itself (limited to the input character length), same as in the reference
 * implementation.
 */
DUK_LOCAL duk_int_t duk__unicode_wtf8_search_backwards_1_utf8(duk_hthread *thr,
                                                              duk_hstring *h_input,
                                                              duk_hstring *h_search,
                                                              duk_uint32_t start_charoff) {
	duk_uint32_t start_boff;
	duk_uint32_t start_coff;
	duk_uint32_t curr_coff;
	duk_uint32_t input_charlen;
	const duk_uint8_t *p;
	duk_size_t p_len;
	const duk_uint8_t *q;
	duk_size_t q_len;
	duk_int_t i;
	duk_int_t i_start;

	/* Caller ensures. */
	DUK_ASSERT(duk_unicode_is_valid_wtf8(duk_hstring_get_data(h_input), duk_hstring_get_bytelen(h_input)));
	DUK_ASSERT(duk_unicode_is_valid_utf8(duk_hstring_get_data(h_search), duk_hstring_get_bytelen(h_search)));

	p = duk_hstring_get_data_and_bytelen(h_input, &p_len);
	q = duk_hstring_get_data_and_bytelen(h_search, &q_len);

	/* Empty search string always matches at the requested offset.  Handle
	 * it up front: if the offset points to the middle of a non-BMP
	 * character, the scan below resolves to the start of that character
	 * and a zero length compare would report a match one character too
	 * early.
	 */
	if (q_len == 0U) {
		input_charlen = (duk_uint32_t) duk_hstring_get_charlen(h_input);
		return (duk_int_t) ((start_charoff < input_charlen) ? start_charoff : input_charlen);
	}

	/* Get start offset for search.  The offset may point to the middle of
	 * a non-BMP character in which case we'll be off by 1 character offset.
	 * In that case we can skip the non-BMP character because the high
	 * surrogate can't match a valid UTF-8 search string.
	 */
	duk_strcache_scan_char2byte_wtf8(thr, h_input, start_charoff, &start_boff, &start_coff);
	if (start_charoff != start_coff) {
		/* No action needed. */
		DUK_ASSERT(start_coff + 1 == start_charoff);
	}

	curr_coff = start_coff;
	i_start = (duk_int_t) start_boff;
	DUK_ASSERT(i_start >= 0);

	for (i = i_start; i >= 0;) {
		duk_uint8_t t;

		/* Compare only when the search string fits into the remaining
		 * input; near the end of the input it may not.
		 */
		if ((duk_size_t) i + q_len <= p_len && duk_memcmp((const void *) (p + i), (const void *) q, q_len) == 0) {
			return (duk_int_t) curr_coff;
		}

		if (i <= 0) {
			break;
		}

		/* Scan one codepoint backwards.  With WTF-8 this is guaranteed
		 * to succeed and find a non-continuation byte.  The character
		 * offset moves back by two for a 4-byte lead byte (non-BMP)
		 * and by one for any other non-continuation byte.
		 */
		for (;;) {
			DUK_ASSERT(i > 0);
			t = p[--i];
			if (t < 0x80U) {
				curr_coff--;
				break;
			} else if (t >= 0xf0U) {
				curr_coff -= 2;
				break;
			} else if (t >= 0xc0U) {
				curr_coff -= 1;
				break;
			}
			/* Continuation byte, keep going. */
		}
	}

	return -1;
}
/* Backward search dispatcher.  A search string which is valid UTF-8 can be
 * located with a plain byte compare.  Any other search string is handed to
 * the reference implementation.
 */
DUK_LOCAL duk_int_t duk__unicode_wtf8_search_backwards_1(duk_hthread *thr,
                                                         duk_hstring *h_input,
                                                         duk_hstring *h_search,
                                                         duk_uint32_t start_charoff) {
	duk_bool_t is_utf8;

	is_utf8 = duk_unicode_is_valid_utf8(duk_hstring_get_data(h_search), duk_hstring_get_bytelen(h_search));

	/* Not valid UTF-8: reference path. */
	if (DUK_UNLIKELY(!is_utf8)) {
		return duk__unicode_wtf8_search_backwards_reference(thr, h_input, h_search, start_charoff);
	}

	/* Valid UTF-8: byte compare path. */
	return duk__unicode_wtf8_search_backwards_1_utf8(thr, h_input, h_search, start_charoff);
}
DUK_INTERNAL duk_int_t duk_unicode_wtf8_search_backwards(duk_hthread *thr,
duk_hstring *h_input,
duk_hstring *h_match,
duk_hstring *h_search,
duk_uint32_t start_charoff) {
return duk__unicode_wtf8_search_backwards_reference(thr, h_input, h_match, start_charoff);
DUK_HTHREAD_ASSERT_VALID(thr);
DUK_HSTRING_ASSERT_VALID(h_input);
DUK_HSTRING_ASSERT_VALID(h_search);
#if 0
return duk__unicode_wtf8_search_backwards_reference(thr, h_input, h_search, start_charoff);
#endif
return duk__unicode_wtf8_search_backwards_1(thr, h_input, h_search, start_charoff);
}
/* Convert a valid WTF-8 string to CESU-8 representation. This allows some
@ -786,7 +1030,7 @@ DUK_INTERNAL duk_ucodepoint_t duk_unicode_wtf8_charcodeat_helper(duk_hthread *th
DUK_ASSERT(pos < (duk_uint_t) duk_hstring_get_charlen(h));
DUK_HSTRING_ASSERT_VALID(h);
if (duk_hstring_get_charlen(h) == duk_hstring_get_bytelen(h)) {
if (duk_hstring_is_ascii(h)) {
const duk_uint8_t *p = duk_hstring_get_data(h);
return (duk_ucodepoint_t) p[pos];
}

20
tests/ecmascript/test-wtf8-string-lastindexof-2.js

@ -0,0 +1,20 @@
/*
 *  String.prototype.lastIndexOf() on an input containing both a BMP
 *  character (U+CAFE) and a non-BMP character (U+1F98A).  Search strings
 *  cover plain ASCII, a full non-BMP character, and unpaired high
 *  surrogates.  The block below lists the expected print() output, one
 *  line per print() call.
 */
/*===
7
7
7
-1
===*/
function test() {
// In ECMAScript character offsets: 'foobar' is 0-5, U+CAFE is 6, the second
// 'foo' is 7-9 and U+1F98A is the surrogate pair at 10-11.
var x = 'foobar\u{cafe}foo\u{1f98a}';
// Two occurrences of 'foo' (0 and 7); the last one is expected.
print(x.lastIndexOf('foo'));
// Search string ends with the full non-BMP character.
print(x.lastIndexOf('foo\u{1f98a}'));
// Search string ends with an unpaired high surrogate which must match the
// first half of the non-BMP character in the input.
print(x.lastIndexOf('foo\u{d83e}')); // High surrogate of U+1F98A
// A different high surrogate (U+D83F) does not occur in the input.
print(x.lastIndexOf('foo\u{d83f}'));
}
// Print the error (with stack trace if available) instead of letting it
// propagate.
try {
test();
} catch (e) {
print(e.stack || e);
}

2
tests/perf/test-string-indexof-1.js

@ -5,7 +5,7 @@ function test() {
var x;
x = 'foobar\u{1f4a9}'.repeat(1000);
console.log(x);
print(x);
for (i = 0; i < 1e5; i++) {
void x.indexOf('\u{1f4a9}');

2
tests/perf/test-string-lastindexof-1.js

@ -5,7 +5,7 @@ function test() {
var x;
x = 'foobar\u{1f4a9}'.repeat(1000);
console.log(x);
print(x);
for (i = 0; i < 1e5; i++) {
void x.lastIndexOf('\u{1f4a9}');

2
tests/perf/test-string-replace-ascii-1.js

@ -5,7 +5,7 @@ function test() {
var x;
x = 'foobarX'.repeat(1000);
console.log(x);
print(x);
for (i = 0; i < 1e5; i++) {
void x.replace('X', 'ZZ');

2
tests/perf/test-string-replace-ascii-2.js

@ -5,7 +5,7 @@ function test() {
var x;
x = 'foobarX'.repeat(1000);
console.log(x);
print(x);
for (i = 0; i < 5e2; i++) {
void x.replace(/X/g, 'ZZ');

2
tests/perf/test-string-replace-nonbmp-1.js

@ -5,7 +5,7 @@ function test() {
var x;
x = 'foobar\u{1f4a9}'.repeat(1000);
console.log(x);
print(x);
for (i = 0; i < 1e5; i++) {
void x.replace('\ud83d', 'XX');

2
tests/perf/test-string-replace-nonbmp-2.js

@ -5,7 +5,7 @@ function test() {
var x;
x = 'foobar\u{1f4a9}'.repeat(1000);
console.log(x);
print(x);
for (i = 0; i < 5e2; i++) {
void x.replace(/\ud83d/g, 'XX');

29
tests/perf/test-string-split-ascii-1.js

@ -0,0 +1,29 @@
// Perf test: String.prototype.split() on an ASCII input where the ASCII
// separator occurs frequently (every 7th character).

// Fall back to console.log when the host provides no print() function.
if (typeof print !== 'function') { print = console.log; }
function test() {
var i;
var x;
// 7000 characters with 1000 occurrences of the separator 'z'.
x = 'foobarz'.repeat(1000);
print(x);
// Ten split() calls per loop iteration, 1e5 calls in total.
for (i = 0; i < 1e4; i++) {
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
}
}
// Print the error (with stack trace if available) and rethrow it.
try {
test();
} catch (e) {
print(e.stack || e);
throw e;
}

29
tests/perf/test-string-split-ascii-2.js

@ -0,0 +1,29 @@
// Perf test: String.prototype.split() on an ASCII input where the ASCII
// separator occurs rarely (once per 7001 characters).

// Fall back to console.log when the host provides no print() function.
if (typeof print !== 'function') { print = console.log; }
function test() {
var i;
var x;
// 70010 characters with 10 occurrences of the separator 'z'.
x = ('foobarx'.repeat(1000) + 'z').repeat(10);
print(x);
// Ten split() calls per loop iteration, 1e4 calls in total.
for (i = 0; i < 1e3; i++) {
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
void x.split('z');
}
}
// Print the error (with stack trace if available) and rethrow it.
try {
test();
} catch (e) {
print(e.stack || e);
throw e;
}

2
tests/perf/test-string-split-nonbmp-1.js

@ -5,7 +5,7 @@ function test() {
var x;
x = 'foobar\u{1f4a9}'.repeat(1000);
console.log(x);
print(x);
for (i = 0; i < 1e4; i++) {
void x.split('\u{1f4a9}');

29
tests/perf/test-string-split-nonbmp-2.js

@ -0,0 +1,29 @@
// Perf test: String.prototype.split() with a non-BMP separator (U+1F4A9)
// which occurs rarely in an input that also contains a BMP non-ASCII
// character (U+CAFE).

// Fall back to console.log when the host provides no print() function.
if (typeof print !== 'function') { print = console.log; }
function test() {
var i;
var x;
// Ten repetitions of 1000 x 'foobar\ucafe' followed by one U+1F4A9, i.e.
// 10 occurrences of the separator.
x = ('foobar\ucafe'.repeat(1000) + '\u{1f4a9}').repeat(10);
print(x);
// Ten split() calls per loop iteration, 1e4 calls in total.
for (i = 0; i < 1e3; i++) {
void x.split('\u{1f4a9}');
void x.split('\u{1f4a9}');
void x.split('\u{1f4a9}');
void x.split('\u{1f4a9}');
void x.split('\u{1f4a9}');
void x.split('\u{1f4a9}');
void x.split('\u{1f4a9}');
void x.split('\u{1f4a9}');
void x.split('\u{1f4a9}');
void x.split('\u{1f4a9}');
}
}
// Print the error (with stack trace if available) and rethrow it.
try {
test();
} catch (e) {
print(e.stack || e);
throw e;
}

1
util/index_page_sizes.sh

@ -74,7 +74,6 @@ DUK_USE_STRLEN16: true
DUK_USE_BUFLEN16: true
DUK_USE_OBJSIZES16: true
DUK_USE_HSTRING_CLEN: false
DUK_USE_HSTRING_LAZY_CLEN: true # must be lazy when clen field dropped
DUK_USE_HOBJECT_HASH_PART: false
DUK_USE_HEAPPTR16: true

2
util/makeduk_base.yaml

@ -83,6 +83,4 @@ DUK_USE_JSON_STRINGIFY_FASTPATH: true
#DUK_USE_CACHE_ACTIVATION: false
#DUK_USE_CACHE_CATCHER: false
#DUK_USE_HSTRING_LAZY_CLEN: false
#DUK_USE_PROMISE_BUILTIN: true

1
util/makeduk_duklow.yaml

@ -13,7 +13,6 @@ DUK_USE_STRLEN16: true
DUK_USE_BUFLEN16: true
DUK_USE_OBJSIZES16: true
DUK_USE_HSTRING_CLEN: false
DUK_USE_HSTRING_LAZY_CLEN: true # must be lazy when clen field dropped
DUK_USE_HSTRING_ARRIDX: false
DUK_USE_HOBJECT_HASH_PART: false
DUK_USE_STRTAB_MINSIZE: 128

Loading…
Cancel
Save