duktape/src-input/duk_unicode_support.c


								/*

								 *  Various Unicode help functions for character classification predicates,

								 *  case conversion, decoding, etc.

								 */


								#include "duk_internal.h"


								/*

								 *  Fast path tables

								 */


								#if defined(DUK_USE_IDCHAR_FASTPATH)

								DUK_INTERNAL const duk_int8_t duk_is_idchar_tab[128] = {
									/* Identifier classification for ASCII, indexed by codepoint:
									 *    0 -> neither IdentifierStart nor IdentifierPart
									 *    1 -> IdentifierStart and IdentifierPart
									 *   -1 -> IdentifierPart only
									 */

									/* 0x00...0x0f */
									0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
									/* 0x10...0x1f */
									0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
									/* 0x20...0x2f: only '$' (0x24) is accepted */
									0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
									/* 0x30...0x3f: '0'...'9' are IdentifierPart only */
									-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0,
									/* 0x40...0x4f: 'A'...'O' */
									0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
									/* 0x50...0x5f: 'P'...'Z' and '_' (0x5f) */
									1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
									/* 0x60...0x6f: 'a'...'o' */
									0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
									/* 0x70...0x7f: 'p'...'z' */
									1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
								};

								#endif


								/*

								 *  XUTF-8 and CESU-8 encoding/decoding

								 */


								DUK_INTERNAL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp) {
									/* Byte length of 'cp' when encoded as extended UTF-8.  Each
									 * threshold is the first codepoint that no longer fits into the
									 * payload bits of the shorter form.
									 */
									duk_uint_fast32_t v = (duk_uint_fast32_t) cp;

									if (v < 0x80UL) {
										return 1; /* 7 bits */
									}
									if (v < 0x800UL) {
										return 2; /* 11 bits */
									}
									if (v < 0x10000UL) {
										return 3; /* 16 bits */
									}
									if (v < 0x200000UL) {
										return 4; /* 21 bits */
									}
									if (v < 0x4000000UL) {
										return 5; /* 26 bits */
									}
									if (v < (duk_ucodepoint_t) 0x80000000UL) {
										return 6; /* 31 bits */
									}
									return 7; /* 36 bits */
								}


								#if defined(DUK_USE_ASSERTIONS)

								DUK_INTERNAL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp) {
									/* Byte length of 'cp' when encoded as CESU-8. */
									duk_uint_fast32_t v = (duk_uint_fast32_t) cp;

									if (v < 0x80UL) {
										return 1; /* 7 bits */
									}
									if (v < 0x800UL) {
										return 2; /* 11 bits */
									}
									if (v < 0x10000UL) {
										return 3; /* 16 bits */
									}

									/* Everything else is written as a surrogate pair, and each
									 * surrogate takes 3 bytes.  Codepoints above U+10FFFF also
									 * report 6 bytes, see duk_unicode_encode_cesu8().
									 */
									return 6;
								}

								#endif /* DUK_USE_ASSERTIONS */


								/* Marker bits of the initial byte for each XUTF-8 encoding length;
								 * duk_unicode_encode_xutf8() indexes this table with (length - 1).
								 */
								DUK_INTERNAL const duk_uint8_t duk_unicode_xutf8_markers[7] = {
									0x00, /* 1 byte */
									0xc0, /* 2 bytes */
									0xe0, /* 3 bytes */
									0xf0, /* 4 bytes */
									0xf8, /* 5 bytes */
									0xfc, /* 6 bytes */
									0xfe  /* 7 bytes */
								};


								/* Extended UTF-8 encoder.  Any unsigned 32-bit codepoint can be
								 * encoded; the caller must supply at least
								 * DUK_UNICODE_MAX_XUTF8_LENGTH bytes of space in 'out'.  Returns the
								 * number of bytes written.
								 */
								DUK_INTERNAL duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out) {
									duk_uint_fast32_t bits = (duk_uint_fast32_t) cp;
									duk_small_int_t nbytes;
									duk_small_int_t idx;
									duk_uint8_t lead;

									nbytes = duk_unicode_get_xutf8_length(cp);
									DUK_ASSERT(nbytes > 0);

									lead = duk_unicode_xutf8_markers[nbytes - 1]; /* 64-bit OK because always >= 0 */

									/* Continuation bytes are filled from the last byte towards the
									 * first, six payload bits at a time.
									 */
									for (idx = nbytes - 1; idx > 0; idx--) {
										out[idx] = (duk_uint8_t) (0x80 + (bits & 0x3f));
										bits >>= 6;
									}

									/* The remaining bits need no masking: the length was chosen from
									 * the codepoint range, so after the shifts nothing overlaps the
									 * marker bits.
									 */
									out[0] = (duk_uint8_t) (lead + bits);

									return nbytes;
								}


								/* CESU-8 encoder.  'out' must have room for at least
								 * DUK_UNICODE_MAX_CESU8_LENGTH bytes.  Codepoints above U+10FFFF
								 * produce garbage output but never write past the sixth byte.
								 * Returns the number of bytes written.
								 */
								DUK_INTERNAL duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out) {
									duk_uint_fast32_t v = (duk_uint_fast32_t) cp;

									if (v < 0x80UL) {
										out[0] = (duk_uint8_t) v;
										return 1;
									}

									if (v < 0x800UL) {
										out[0] = (duk_uint8_t) (0xc0 + ((v >> 6) & 0x1f));
										out[1] = (duk_uint8_t) (0x80 + (v & 0x3f));
										return 2;
									}

									if (v < 0x10000UL) {
										/* Includes surrogate codepoints, which are encoded as is. */
										out[0] = (duk_uint8_t) (0xe0 + ((v >> 12) & 0x0f));
										out[1] = (duk_uint8_t) (0x80 + ((v >> 6) & 0x3f));
										out[2] = (duk_uint8_t) (0x80 + (v & 0x3f));
										return 3;
									}

									/*
									 *  Codepoints above U+FFFF are written as a surrogate pair so
									 *  that every CESU-8 codepoint is a 16-bit value as ECMAScript
									 *  expects; each surrogate takes 3 bytes.
									 *  See: http://en.wikipedia.org/wiki/Surrogate_pair
									 *
									 *  After subtracting 0x10000 the value has 20 bits, split into
									 *  two 10-bit halves A (high) and B (low):
									 *
									 *    v   = 0b00000000 0000AAAA AAAAAABB BBBBBBBB
									 *    sp1 = 0b110110AA AAAAAAAA  (0xd800 + ((v >> 10) & 0x3ff))
									 *    sp2 = 0b110111BB BBBBBBBB  (0xdc00 + (v & 0x3ff))
									 *
									 *  Their 3-byte encodings are:
									 *
									 *    sp1 -> 0b11101101  0b1010AAAA  0b10AAAAAA
									 *    sp2 -> 0b11101101  0b1011BBBB  0b10BBBBBB
									 *
									 *  The bytes are computed directly from 'v' without the sp1/sp2
									 *  temporaries, which saves around 20 bytes of code.
									 */
									v -= 0x10000UL;

									/* High surrogate. */
									out[0] = (duk_uint8_t) (0xed);
									out[1] = (duk_uint8_t) (0xa0 + ((v >> 16) & 0x0f));
									out[2] = (duk_uint8_t) (0x80 + ((v >> 10) & 0x3f));

									/* Low surrogate. */
									out[3] = (duk_uint8_t) (0xed);
									out[4] = (duk_uint8_t) (0xb0 + ((v >> 6) & 0x0f));
									out[5] = (duk_uint8_t) (0x80 + (v & 0x3f));

									return 6;
								}


								/* Decode one extended UTF-8 codepoint from the input [ptr_start,ptr_end[.
								 *
								 * On success the codepoint is stored to '*out_cp', '*ptr' is advanced
								 * past the bytes consumed, and 1 is returned.  On failure 0 is returned
								 * and neither '*ptr' nor '*out_cp' is modified.  Decoding fails when
								 * '*ptr' is outside the input range, when the initial byte is a
								 * continuation byte (10xx xxxx) or 0xff, or when the sequence would
								 * extend past 'ptr_end'.
								 *
								 * Continuation bytes are not validated: the low 6 bits of each byte
								 * following the initial byte are used as is.  'thr' is unused.
								 */
								DUK_INTERNAL duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr,
								                                                      const duk_uint8_t **ptr,
								                                                      const duk_uint8_t *ptr_start,
								                                                      const duk_uint8_t *ptr_end,
								                                                      duk_ucodepoint_t *out_cp) {
									const duk_uint8_t *p;
									duk_uint32_t res;
									duk_uint_fast8_t ch;
									duk_small_int_t n; /* number of continuation bytes still expected */

									DUK_UNREF(thr);

									p = *ptr;
									if (p < ptr_start || p >= ptr_end) {
										goto fail;
									}

									/*
									 *  UTF-8 decoder which accepts longer than standard byte sequences.
									 *  This allows full 32-bit code points to be used.
									 */

									ch = (duk_uint_fast8_t) (*p++);
									if (ch < 0x80) {
										/* 0xxx xxxx   [7 bits] */
										res = (duk_uint32_t) (ch & 0x7f);
										n = 0;
									} else if (ch < 0xc0) {
										/* 10xx xxxx -> invalid */
										goto fail;
									} else if (ch < 0xe0) {
										/* 110x xxxx   10xx xxxx   [11 bits] */
										res = (duk_uint32_t) (ch & 0x1f);
										n = 1;
									} else if (ch < 0xf0) {
										/* 1110 xxxx   10xx xxxx   10xx xxxx   [16 bits] */
										res = (duk_uint32_t) (ch & 0x0f);
										n = 2;
									} else if (ch < 0xf8) {
										/* 1111 0xxx   10xx xxxx   10xx xxxx   10xx xxxx   [21 bits] */
										res = (duk_uint32_t) (ch & 0x07);
										n = 3;
									} else if (ch < 0xfc) {
										/* 1111 10xx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [26 bits] */
										res = (duk_uint32_t) (ch & 0x03);
										n = 4;
									} else if (ch < 0xfe) {
										/* 1111 110x   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [31 bits] */
										res = (duk_uint32_t) (ch & 0x01);
										n = 5;
									} else if (ch < 0xff) {
										/* 1111 1110   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [36 bits]
										 *
										 * 'res' is 32 bits wide, so the topmost bits of a 36-bit
										 * payload are shifted out below.
										 */
										res = (duk_uint32_t) (0);
										n = 6;
									} else {
										/* 8-byte format could be:
										 * 1111 1111   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [41 bits]
										 *
										 * However, this format would not have a zero bit following the
										 * leading one bits and would not allow 0xFF to be used as an
										 * "invalid xutf-8" marker for internal keys.  Further, 8-byte
										 * encodings (up to 41 bit code points) are not currently needed.
										 */
										goto fail;
									}

									DUK_ASSERT(p >= ptr_start); /* verified at beginning */
									DUK_ASSERT(p <= ptr_end); /* p < ptr_end held before the increment */
									DUK_ASSERT(n >= 0);

									/* Compare the remaining byte count against 'n' instead of
									 * computing 'p + n': for a truncated sequence 'p + n' would
									 * point more than one past the end of the input, which is
									 * undefined behavior even if the pointer is only compared.
									 */
									if ((duk_size_t) (ptr_end - p) < (duk_size_t) n) {
										goto fail;
									}

									while (n > 0) {
										DUK_ASSERT(p >= ptr_start && p < ptr_end);
										ch = (duk_uint_fast8_t) (*p++);
								#if 0
										/* Disabled: strict continuation byte validation. */
										if ((ch & 0xc0) != 0x80) {
											/* not a continuation byte */
											p--;
											*ptr = p;
											*out_cp = DUK_UNICODE_CP_REPLACEMENT_CHARACTER;
											return 1;
										}
								#endif
										res = (res << 6) + (duk_uint32_t) (ch & 0x3f);
										n--;
									}

									*ptr = p;
									*out_cp = res;
									return 1;

								fail:
									return 0;
								}


								/* Decode one XUTF-8 codepoint and return it; a decode failure is
								 * reported with DUK_ERROR_INTERNAL() instead of a return code.  Used
								 * by e.g. duk_regexp_executor.c and string built-ins.
								 */
								DUK_INTERNAL duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr,
								                                                               const duk_uint8_t **ptr,
								                                                               const duk_uint8_t *ptr_start,
								                                                               const duk_uint8_t *ptr_end) {
									duk_ucodepoint_t decoded;

									if (!duk_unicode_decode_xutf8(thr, ptr, ptr_start, ptr_end, &decoded)) {
										DUK_ERROR_INTERNAL(thr);
										DUK_WO_NORETURN(return 0;);
									}
									return decoded;
								}


								/* Compute (extended) utf-8 length without codepoint encoding validation,

								 * used for string interning.

								 *

								 * NOTE: This algorithm is performance critical, more so than string hashing

								 * in some cases.  It is needed when interning a string and needs to scan

								 * every byte of the string with no skipping.  Having an ASCII fast path

								 * is useful if possible in the algorithm.  The current algorithms were

								 * chosen from several variants, based on x64 gcc -O2 testing.  See:

								 * https://github.com/svaarala/duktape/pull/422

								 *

								 * NOTE: must match tools/dukutil.py:duk_unicode_unvalidated_utf8_length().

								 */


								#if defined(DUK_USE_PREFER_SIZE)

								/* Size-optimized variant: a plain byte loop.  The character length is
								 * the byte length minus the number of continuation bytes.
								 */
								DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen) {
									const duk_uint8_t *cur = data;
									const duk_uint8_t *stop = data + blen;
									duk_size_t n_cont = 0; /* bytes of the form 10xx xxxx, i.e. [0x80,0xbf] */
									duk_size_t n_chars;

									for (; cur != stop; cur++) {
										if (DUK_UNLIKELY((*cur & 0xc0U) == 0x80U)) {
											n_cont++;
										}
									}

									DUK_ASSERT(n_cont <= blen);
									n_chars = blen - n_cont;
									DUK_ASSERT(n_chars <= blen);
									return n_chars;
								}

								#else /* DUK_USE_PREFER_SIZE */

								/* Fast variant.  Counts the bytes in [0x80,0xbf] (continuation bytes)

								 * and returns 'blen' minus that count; no encoding validation is done.

								 *

								 * Inputs shorter than 16 bytes are handled by the byte loop at the end.

								 * Longer inputs are processed in three phases: single bytes until 'p'

								 * is 4-byte aligned, then 4 bytes at a time with a fast path for

								 * all-ASCII words, then the remaining 0-3 tail bytes one at a time.

								 *

								 * NOTE(review): the word loop reads the byte buffer through a

								 * 'const duk_uint32_t *'; this presumably relies on the compilers in

								 * use tolerating that access pattern -- confirm before changing.

								 */

								DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen) {

									const duk_uint8_t *p;

									const duk_uint8_t *p_end;

									const duk_uint32_t *p32_end;

									const duk_uint32_t *p32;

									duk_size_t ncont;

									duk_size_t clen;


									ncont = 0; /* number of continuation (non-initial) bytes in [0x80,0xbf] */

									p = data;

									p_end = data + blen;

									if (blen < 16) {

										goto skip_fastpath;

									}


									/* Align 'p' to 4; the input data may have arbitrary alignment.

									 * The loop runs while either of the two lowest address bits is

									 * set, so it consumes at most 3 bytes.  End of string check is

									 * not needed because blen >= 16.

									 */

									while (((duk_size_t) (const void *) p) & 0x03U) {

										duk_uint8_t x;

										x = *p++;

										if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) {

											ncont++;

										}

									}


									/* Full, aligned 4-byte reads.  'p32_end' is 'p' plus the number

									 * of remaining bytes rounded down to a multiple of 4.

									 */

									p32_end = (const duk_uint32_t *) (const void *) (p + ((duk_size_t) (p_end - p) & (duk_size_t) (~0x03)));

									p32 = (const duk_uint32_t *) (const void *) p;

									while (p32 != (const duk_uint32_t *) p32_end) {

										duk_uint32_t x;

										x = *p32++;

										if (DUK_LIKELY((x & 0x80808080UL) == 0)) {

											; /* ASCII fast path */

										} else {

											/* Flip highest bit of each byte which changes

											 * the bit pattern 10xxxxxx into 00xxxxxx which

											 * allows an easy bit mask test.  All four byte

											 * lanes are tested, so the byte order within the

											 * word does not affect the count.

											 */

											x ^= 0x80808080UL;

											if (DUK_UNLIKELY(!(x & 0xc0000000UL))) {

												ncont++;

											}

											if (DUK_UNLIKELY(!(x & 0x00c00000UL))) {

												ncont++;

											}

											if (DUK_UNLIKELY(!(x & 0x0000c000UL))) {

												ncont++;

											}

											if (DUK_UNLIKELY(!(x & 0x000000c0UL))) {

												ncont++;

											}

										}

									}

									p = (const duk_uint8_t *) p32;

									/* Fall through to handle the rest. */


								skip_fastpath:

									/* Byte-at-a-time loop: whole input when blen < 16, otherwise

									 * the tail left over after the word loop.

									 */

									while (p != p_end) {

										duk_uint8_t x;

										x = *p++;

										if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) {

											ncont++;

										}

									}


									DUK_ASSERT(ncont <= blen);

									clen = blen - ncont;

									DUK_ASSERT(clen <= blen);

									return clen;

								}

								#endif /* DUK_USE_PREFER_SIZE */


								/* Check whether a byte string is UTF-8 compatible.
								 *
								 * Returns 1 if 'buf' (of 'len' bytes) consists only of ASCII bytes and
								 * well-formed 2-4 byte sequences whose decoded codepoint is not
								 * overlong (at least the minimum for its length), is at most U+10FFFF,
								 * and is not a surrogate (U+D800...U+DFFF).  Returns 0 otherwise.  An
								 * empty input returns 1.
								 */
								DUK_INTERNAL duk_bool_t duk_unicode_is_utf8_compatible(const duk_uint8_t *buf, duk_size_t len) {
									duk_size_t i = 0;
								#if !defined(DUK_USE_PREFER_SIZE)
									duk_size_t len_safe;
								#endif

									/* Many practical strings are ASCII only, so use a fast path check
									 * to check chunks of bytes at once with minimal branch cost.
									 */
								#if !defined(DUK_USE_PREFER_SIZE)
									/* Round 'len' down to a multiple of 4.  The mask is built in
									 * duk_size_t: an 'unsigned long' mask would zero-extend on
									 * platforms where duk_size_t is wider than unsigned long and
									 * drop the upper bits of 'len', cutting the fast path short.
									 */
									len_safe = len & ~((duk_size_t) 0x03U);
									for (; i < len_safe; i += 4) {
										duk_uint8_t t = buf[i] | buf[i + 1] | buf[i + 2] | buf[i + 3];
										if (DUK_UNLIKELY((t & 0x80U) != 0U)) {
											/* At least one byte was outside 0x00-0x7f, break
											 * out to slow path (and remain there).  'i' still
											 * points to the start of the offending chunk.
											 *
											 * XXX: We could also deal with the problem character
											 * and resume fast path later.
											 */
											break;
										}
									}
								#endif

									while (i < len) {
										duk_uint8_t t;
										duk_size_t left;
										duk_size_t ncont;
										duk_uint32_t cp;
										duk_uint32_t mincp;

										t = buf[i++];
										if (DUK_LIKELY((t & 0x80U) == 0U)) {
											/* Fast path, ASCII. */
											continue;
										}

										/* Non-ASCII start byte, slow path.
										 *
										 * 10xx xxxx          -> continuation byte
										 * 110x xxxx + 1*CONT -> [0x80, 0x7ff]
										 * 1110 xxxx + 2*CONT -> [0x800, 0xffff], must reject [0xd800,0xdfff]
										 * 1111 0xxx + 3*CONT -> [0x10000, 0x10ffff]
										 */
										left = len - i; /* bytes available after the start byte */
										if (t <= 0xdfU) { /* 1101 1111 = 0xdf */
											if (t <= 0xbfU) { /* 1011 1111 = 0xbf */
												/* Continuation byte in start position. */
												return 0;
											}
											ncont = 1;
											mincp = 0x80UL;
											cp = t & 0x1fU;
										} else if (t <= 0xefU) { /* 1110 1111 = 0xef */
											ncont = 2;
											mincp = 0x800UL;
											cp = t & 0x0fU;
										} else if (t <= 0xf7U) { /* 1111 0111 = 0xf7 */
											ncont = 3;
											mincp = 0x10000UL;
											cp = t & 0x07U;
										} else {
											/* 1111 1xxx: not a valid start byte. */
											return 0;
										}
										if (left < ncont) {
											/* Sequence truncated by end of input. */
											return 0;
										}
										while (ncont > 0U) {
											t = buf[i++];
											if ((t & 0xc0U) != 0x80U) { /* 10xx xxxx */
												return 0;
											}
											cp = (cp << 6) + (t & 0x3fU);
											ncont--;
										}
										/* Reject overlong encodings, values above U+10FFFF, and
										 * surrogates.
										 */
										if (cp < mincp || cp > 0x10ffffUL || (cp >= 0xd800UL && cp <= 0xdfffUL)) {
											return 0;
										}
									}

									return 1;
								}


								/*

								 *  Unicode range matcher

								 *

								 *  Matches a codepoint against a packed bitstream of character ranges.

								 *  Used for slow path Unicode matching.

								 */


								/* Read one variable-length value from the bitstream.  The format must
								 * match tools/extract_chars.py, generate_match_table3():
								 *
								 *   4 bits, 0x0...0xe    -> the value itself
								 *   4 bits 0xf, then 8 bits:
								 *     0x00...0xfd        -> value + 0x0f
								 *     0xfe, then 12 bits -> value + 0x0f + 0xfe
								 *     0xff, then 24 bits -> value + 0x0f + 0xfe + 0x1000
								 */
								DUK_LOCAL duk_uint32_t duk__uni_decode_value(duk_bitdecoder_ctx *bd_ctx) {
									duk_uint32_t v;

									v = (duk_uint32_t) duk_bd_decode(bd_ctx, 4);
									if (v < 0x0fU) {
										return v;
									}

									v = (duk_uint32_t) duk_bd_decode(bd_ctx, 8);
									if (v < 0xfeU) {
										return v + 0x0fU;
									}

									if (v != 0xfeU) {
										/* 0xff escape: 24-bit value follows. */
										v = (duk_uint32_t) duk_bd_decode(bd_ctx, 24);
										return v + 0x0fU + 0xfeU + 0x1000UL;
									}

									/* 0xfe escape: 12-bit value follows. */
									v = (duk_uint32_t) duk_bd_decode(bd_ctx, 12);
									return v + 0x0fU + 0xfeU;
								}


								DUK_LOCAL duk_small_int_t duk__uni_range_match(const duk_uint8_t *unitab, duk_size_t unilen, duk_codepoint_t cp) {
									/* Linear scan over the packed range table 'unitab' ('unilen'
									 * bytes).  Each entry is a pair of values: the distance from the
									 * end of the previous range to the start of this one, and the
									 * length of this range.  A zero start distance ends the table.
									 * Returns 1 if 'cp' falls inside some range, 0 otherwise.
									 */
									duk_bitdecoder_ctx decoder;
									duk_codepoint_t last_end = 0;
									duk_codepoint_t range_lo;
									duk_codepoint_t range_hi;

									duk_memzero(&decoder, sizeof(decoder));
									decoder.data = (const duk_uint8_t *) unitab;
									decoder.length = (duk_size_t) unilen;

									for (;;) {
										range_lo = (duk_codepoint_t) duk__uni_decode_value(&decoder);
										if (range_lo == 0) {
											/* End of table, no match. */
											break;
										}
										range_hi = (duk_codepoint_t) duk__uni_decode_value(&decoder);

										/* Convert the deltas into the absolute inclusive range
										 * [range_lo,range_hi].
										 */
										range_lo += last_end;
										range_hi += range_lo;
										last_end = range_hi;

										DUK_DDD(DUK_DDDPRINT("duk__uni_range_match: cp=%06lx range=[0x%06lx,0x%06lx]",
										                     (unsigned long) cp,
										                     (unsigned long) range_lo,
										                     (unsigned long) range_hi));
										if (cp >= range_lo && cp <= range_hi) {
											return 1;
										}
									}

									return 0;
								}


								/*

								 *  "WhiteSpace" production check.

								 */


								DUK_INTERNAL duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp) {
									/*
									 *  E5 Section 7.2 lists six characters explicitly as white space:
									 *
									 *    - 0009: <control>
									 *    - 000B: <control>
									 *    - 000C: <control>
									 *    - 0020: SPACE
									 *    - 00A0: NO-BREAK SPACE
									 *    - FEFF: ZERO WIDTH NO-BREAK SPACE
									 *
									 *  In addition every Unicode category 'Zs' character is white
									 *  space.  For Unicode 12.1.0 the 'Zs' characters are:
									 *
									 *    - 0020: SPACE
									 *    - 00A0: NO-BREAK SPACE
									 *    - 1680: OGHAM SPACE MARK
									 *    - 2000...200A: EN QUAD through HAIR SPACE
									 *    - 202F: NARROW NO-BREAK SPACE
									 *    - 205F: MEDIUM MATHEMATICAL SPACE
									 *    - 3000: IDEOGRAPHIC SPACE
									 *
									 *  The set is matched by hand below, which is probably the most
									 *  compact approach: the codepoint is split into a low byte and
									 *  the remaining high part, and the two populated "pages" (0x00
									 *  and 0x20) are checked by low byte.
									 */

									duk_uint_fast32_t page;
									duk_uint_fast8_t offset;

									/* cp == -1 (EOF) never matches and causes return value 0 */

									page = (duk_uint_fast32_t) (cp >> 8); /* does not fit into an uchar */
									offset = (duk_uint_fast8_t) (cp & 0xff);

									if (page == 0x0000UL) {
										return (offset == 0x09U || offset == 0x0bU || offset == 0x0cU || offset == 0x20U || offset == 0xa0U) ? 1 : 0;
									}

									if (page == 0x0020UL) {
										return (offset <= 0x0aU || offset == 0x2fU || offset == 0x5fU) ? 1 : 0;
									}

									return (cp == 0x1680L || cp == 0x3000L || cp == 0xfeffL) ? 1 : 0;
								}


								/*

								 *  "LineTerminator" production check.

								 */


								DUK_INTERNAL duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp) {
									/*
									 *  E5 Section 7.3
									 *
									 *  Only single codepoints are classified here.  Merging a
									 *  <CR> <LF> pair into one LineTerminatorSequence is the
									 *  caller's responsibility.
									 */

									switch (cp) {
									case 0x000a:
									case 0x000d:
									case 0x2028:
									case 0x2029:
										return 1;
									default:
										return 0;
									}
								}


								/*

								 *  "IdentifierStart" production check.

								 */


								DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp) {
									/*
									 *  E5 Section 7.6:
									 *
									 *    IdentifierStart:
									 *      UnicodeLetter
									 *      $
									 *      _
									 *      \ UnicodeEscapeSequence
									 *
									 *  The only multi-character alternative is the escape form.  The
									 *  '\' character is -not- matched here: the caller decodes the
									 *  escape and then passes the decoded character to this function
									 *  (see discussion in E5 Section 7.6).
									 *
									 *  "UnicodeLetter" covers letters from several Unicode categories;
									 *  the ranges can be extracted with "tools/extract_chars.py".
									 *  There are hundreds of ranges, so codepoints >= 0x80 are matched
									 *  with a very slow range-by-range scan over a packed range format.
									 *
									 *  ASCII (0x00 ... 0x7f) matters the most and has a fast path.
									 *  The ASCII members of IdentifierStart are:
									 *
									 *    0x0041 ... 0x005a     ['A' ... 'Z']
									 *    0x0061 ... 0x007a     ['a' ... 'z']
									 *    0x0024                ['$']
									 *    0x005f                ['_']
									 */

									/* Fast path for ASCII; negative values (EOF) are rejected here too. */
									if (cp <= 0x7fL) {
								#if defined(DUK_USE_IDCHAR_FASTPATH)
										return (cp >= 0) && (duk_is_idchar_tab[cp] > 0);
								#else
										return ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z') || cp == '_' || cp == '$') ? 1 : 0;
								#endif
									}

									/* Slow path for non-ASCII: linear scan of the range table. */

								#if defined(DUK_USE_SOURCE_NONBMP)
									return duk__uni_range_match(duk_unicode_ids_noa, (duk_size_t) sizeof(duk_unicode_ids_noa), (duk_codepoint_t) cp) ? 1 : 0;
								#else
									if (cp >= 0x10000L) {
										/* No explicit non-BMP support: every non-BMP character
										 * is accepted as an identifier character.
										 */
										return 1;
									}
									return duk__uni_range_match(duk_unicode_ids_noabmp, sizeof(duk_unicode_ids_noabmp), (duk_codepoint_t) cp) ? 1 : 0;
								#endif
								}


								/*

								 *  "IdentifierPart" production check.

								 */


								DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp) {
									/*
									 *  E5 Section 7.6:
									 *
									 *    IdentifierPart:
									 *      IdentifierStart
									 *      UnicodeCombiningMark
									 *      UnicodeDigit
									 *      UnicodeConnectorPunctuation
									 *      <ZWNJ>  [U+200C]
									 *      <ZWJ>   [U+200D]
									 *
									 *  The escape sequence alternative inherited from IdentifierStart
									 *  is not matched here (the '\' is handled by the caller), see
									 *  duk_unicode_is_identifier_start().
									 *
									 *  Codepoints >= 0x80 use a very slow linear range scan.  To keep
									 *  the range data small at the expense of speed, the codepoint is
									 *  first checked against the IdentifierStart ranges and, if that
									 *  fails, against a second set holding only the codepoints which
									 *  are in IdentifierPart but not in IdentifierStart.  Both sets
									 *  can be extracted with "tools/extract_chars.py".
									 *
									 *  The ASCII fast path accepts:
									 *
									 *    0x0030 ... 0x0039     ['0' ... '9', UnicodeDigit]
									 *    0x0041 ... 0x005a     ['A' ... 'Z', IdentifierStart]
									 *    0x0061 ... 0x007a     ['a' ... 'z', IdentifierStart]
									 *    0x0024                ['$', IdentifierStart]
									 *    0x005f                ['_', IdentifierStart and
									 *                                UnicodeConnectorPunctuation]
									 *
									 *  UnicodeCombiningMark has no code points <= 0x7f.
									 *
									 *  UnicodeCombiningMark -> categories Mn, Mc
									 *  UnicodeDigit -> categories Nd
									 *  UnicodeConnectorPunctuation -> categories Pc
									 */

									/* Fast path for ASCII; negative values (EOF) are rejected here too. */
									if (cp <= 0x7fL) {
								#if defined(DUK_USE_IDCHAR_FASTPATH)
										return (cp >= 0) && (duk_is_idchar_tab[cp] != 0);
								#else
										return ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z') || (cp >= '0' && cp <= '9') || cp == '_' || cp == '$') ? 1 : 0;
								#endif
									}

									/* Slow path for non-ASCII: linear scan of the range tables. */

								#if defined(DUK_USE_SOURCE_NONBMP)
									if (duk__uni_range_match(duk_unicode_ids_noa, sizeof(duk_unicode_ids_noa), (duk_codepoint_t) cp)) {
										return 1;
									}
									return duk__uni_range_match(duk_unicode_idp_m_ids_noa, sizeof(duk_unicode_idp_m_ids_noa), (duk_codepoint_t) cp) ? 1 : 0;
								#else
									if (cp >= 0x10000L) {
										/* No explicit non-BMP support: every non-BMP character
										 * is accepted as an identifier character.
										 */
										return 1;
									}
									if (duk__uni_range_match(duk_unicode_ids_noabmp, sizeof(duk_unicode_ids_noabmp), (duk_codepoint_t) cp)) {
										return 1;
									}
									return duk__uni_range_match(duk_unicode_idp_m_ids_noabmp,
									                            sizeof(duk_unicode_idp_m_ids_noabmp),
									                            (duk_codepoint_t) cp) ? 1 : 0;
								#endif
								}


								/*

								 *  Unicode letter check.

								 */


								DUK_INTERNAL duk_small_int_t duk_unicode_is_letter(duk_codepoint_t cp) {
									/*
									 *  A Unicode letter is currently taken to mean the categories:
									 *
									 *    Lu, Ll, Lt, Lm, Lo
									 *
									 *  (Not sure if this is exactly correct.)
									 *
									 *  The ASCII fast path accepts:
									 *
									 *    0x0041 ... 0x005a     ['A' ... 'Z']
									 *    0x0061 ... 0x007a     ['a' ... 'z']
									 *
									 *  For non-ASCII input the codepoint must match the
									 *  IdentifierStart ranges and must not match the separate
									 *  "ids_m_let" exclusion ranges.
									 */

									/* Fast path for ASCII; negative values (EOF) are rejected here too. */
									if (cp <= 0x7fL) {
										return ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z')) ? 1 : 0;
									}

									/* Slow path for non-ASCII: linear scan of the range tables. */

								#if defined(DUK_USE_SOURCE_NONBMP)
									if (!duk__uni_range_match(duk_unicode_ids_noa, sizeof(duk_unicode_ids_noa), (duk_codepoint_t) cp)) {
										return 0;
									}
									return duk__uni_range_match(duk_unicode_ids_m_let_noa, sizeof(duk_unicode_ids_m_let_noa), (duk_codepoint_t) cp) ? 0 : 1;
								#else
									if (cp >= 0x10000L) {
										/* No explicit non-BMP support: every non-BMP character
										 * is accepted as a letter.
										 */
										return 1;
									}
									if (!duk__uni_range_match(duk_unicode_ids_noabmp, sizeof(duk_unicode_ids_noabmp), (duk_codepoint_t) cp)) {
										return 0;
									}
									return duk__uni_range_match(duk_unicode_ids_m_let_noabmp,
									                            sizeof(duk_unicode_ids_m_let_noabmp),
									                            (duk_codepoint_t) cp) ? 0 : 1;
								#endif
								}


								/*

								 *  Complex case conversion helper which decodes a bit-packed conversion

								 *  control stream generated by tools/extract_caseconv.py.  The conversion

								 *  is very slow because it runs through the conversion data in a linear

								 *  fashion to save space (which is why ASCII characters have a special

								 *  fast path before arriving here).

								 *

								 *  The particular bit counts etc have been determined experimentally to

								 *  be small but still sufficient, and must match the Python script

								 *  (tools/extract_caseconv.py).

								 *

								 *  The return value is the case converted codepoint or -1 if the conversion

								 *  results in multiple characters (this is useful for regexp Canonicalization

								 *  operation).  If 'buf' is not NULL, the result codepoint(s) are also

								 *  appended to the hbuffer.

								 *

								 *  Context and locale specific rules must be checked before consulting

								 *  this function.

								 */


								DUK_LOCAL
								duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr, duk_bufwriter_ctx *bw, duk_codepoint_t cp, duk_bitdecoder_ctx *bd_ctx) {
									duk_small_int_t stride; /* step between consecutive inputs of a range ("skip") */
									duk_small_int_t entries; /* entries left in the section being scanned */
									duk_small_int_t span; /* number of codepoints covered by a range entry */
									duk_small_int_t out_len; /* number of output codepoints of a 1:n entry */
									duk_codepoint_t in_cp; /* input codepoint of the entry just decoded */
									duk_codepoint_t out_cp; /* output codepoint of the entry just decoded */
									duk_codepoint_t offset; /* distance of 'cp' from the start of a range */

									DUK_ASSERT(bd_ctx != NULL);
									DUK_UNREF(thr);

									DUK_DDD(DUK_DDDPRINT("slow case conversion for codepoint: %ld", (long) cp));

									/* Section 1: range conversions.  The stream contains one group of
									 * ranges per stride value (1, 2, 3, ...); each group begins with a
									 * 6-bit entry count and the value 0x3f terminates the section.
									 * Every entry is: 16-bit input start, 16-bit output start, 7-bit count.
									 */
									DUK_DDD(DUK_DDDPRINT("checking ranges"));
									for (stride = 1;; stride++) {
										entries = (duk_small_int_t) duk_bd_decode(bd_ctx, 6);
										if (entries == 0x3f) {
											/* end marker */
											break;
										}
										DUK_DDD(DUK_DDDPRINT("skip=%ld, n=%ld", (long) stride, (long) entries));

										for (; entries > 0; entries--) {
											/* All three fields are always decoded so that the bit
											 * position stays in sync even when the entry is rejected.
											 */
											in_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
											out_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
											span = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
											DUK_DDD(DUK_DDDPRINT("range: start_i=%ld, start_o=%ld, count=%ld, skip=%ld",
											                     (long) in_cp,
											                     (long) out_cp,
											                     (long) span,
											                     (long) stride));

											if (cp < in_cp) {
												continue;
											}
											offset = cp - in_cp; /* non-negative because cp >= in_cp */
											if (offset >= (duk_codepoint_t) span * (duk_codepoint_t) stride) {
												/* beyond the end of the range */
												continue;
											}
											if ((offset % (duk_codepoint_t) stride) != 0) {
												/* inside the range but not on the stride */
												continue;
											}

											DUK_DDD(DUK_DDDPRINT("range matches input codepoint"));
											cp = out_cp + offset;
											goto single;
										}
									}

									/* Section 2: individual 1:1 conversions.  A 7-bit entry count is
									 * followed by (16-bit input, 16-bit output) pairs.
									 */
									entries = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
									DUK_DDD(DUK_DDDPRINT("checking 1:1 conversions (count %ld)", (long) entries));
									for (; entries > 0; entries--) {
										in_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
										out_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
										DUK_DDD(DUK_DDDPRINT("1:1 conversion %ld -> %ld", (long) in_cp, (long) out_cp));
										if (cp != in_cp) {
											continue;
										}

										DUK_DDD(DUK_DDDPRINT("1:1 matches input codepoint"));
										cp = out_cp;
										goto single;
									}

									/* Section 3: 1:n (multicharacter) conversions.  A 7-bit entry count
									 * is followed by entries of: 16-bit input, 2-bit output length, and
									 * that many 16-bit output codepoints.
									 */
									entries = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
									DUK_DDD(DUK_DDDPRINT("checking 1:n conversions (count %ld)", (long) entries));
									for (; entries > 0; entries--) {
										in_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
										out_len = (duk_small_int_t) duk_bd_decode(bd_ctx, 2);
										DUK_DDD(DUK_DDDPRINT("1:n conversion %ld -> %ld chars", (long) in_cp, (long) out_len));

										if (cp != in_cp) {
											/* Not our entry: consume its outputs to reach the next one. */
											for (; out_len > 0; out_len--) {
												(void) duk_bd_decode(bd_ctx, 16);
											}
											continue;
										}

										DUK_DDD(DUK_DDDPRINT("1:n matches input codepoint"));
										if (bw != NULL) {
											for (; out_len > 0; out_len--) {
												out_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
												DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) out_cp);
											}
										}
										/* -1 signals a multi-codepoint result to the caller. */
										return -1;
									}

									/* default: no change */
									DUK_DDD(DUK_DDDPRINT("no rule matches, output is same as input"));
									/* fall through */

								single:
									/* Single codepoint result (converted or unchanged). */
									if (bw != NULL) {
										DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
									}
									return cp;
								}


								/*

								 *  Case conversion helper, with context/locale sensitivity.

								 *  For proper case conversion, one needs to know the character

								 *  and the preceding and following characters, as well as

								 *  locale/language.

								 */


								/* XXX: add 'language' argument when locale/language sensitive rule

								 * support added.

								 */

								DUK_LOCAL
								duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
								                                           duk_bufwriter_ctx *bw,
								                                           duk_codepoint_t cp,
								                                           duk_codepoint_t prev,
								                                           duk_codepoint_t next,
								                                           duk_bool_t uppercase) {
									/* Convert 'cp' to upper case (uppercase != 0) or lower case, using
									 * 'prev' and 'next' as context.  When 'bw' is non-NULL the result is
									 * also written to the buffer writer.  Returns the converted codepoint,
									 * or whatever duk__slow_case_conversion() returns (-1 for a
									 * multi-codepoint result).
									 */
									duk_bitdecoder_ctx bd_ctx;

									/* ASCII fast path. */
									if (cp < 0x80L) {
										/* XXX: there are language sensitive rules for the ASCII range.
										 * If/when language/locale support is implemented, they need to
										 * be implemented here for the fast path.  There are no context
										 * sensitive rules for ASCII range.
										 */
										if (uppercase && cp >= 'a' && cp <= 'z') {
											cp += 'A' - 'a';
										} else if (!uppercase && cp >= 'A' && cp <= 'Z') {
											cp += 'a' - 'A';
										}

										if (bw != NULL) {
											DUK_BW_WRITE_RAW_U8(thr, bw, (duk_uint8_t) cp);
										}
										return cp;
									}

									/* Context and locale specific rules which cannot currently be
									 * represented in the caseconv bitstream are hardcoded here.
									 *
									 * XXX: turkish / azeri uppercase rules are not implemented.
									 *
									 * Final sigma (lowercasing only).  This is a rather tricky rule and
									 * the handling is probably not 100% correct.  The rule is not
									 * locale/language specific so it is supported.
									 */
									if (!uppercase && cp == 0x03a3L && /* U+03A3 = GREEK CAPITAL LETTER SIGMA */
									    duk_unicode_is_letter(prev) && /* prev exists and is a letter */
									    !duk_unicode_is_letter(next)) { /* next does not exist or next is not a letter */
										/* Capital sigma occurred at "end of word", lowercase to
										 * U+03C2 = GREEK SMALL LETTER FINAL SIGMA.  In all other
										 * cases the generated rules below lowercase it to
										 * U+03C3 = GREEK SMALL LETTER SIGMA.
										 */
										cp = 0x03c2L;
										if (bw != NULL) {
											DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
										}
										return cp;
									}

									/* XXX: lithuanian not implemented */
									/* XXX: lithuanian, explicit dot rules */
									/* XXX: turkish / azeri, lowercase rules */
									/* XXX: a "no output character" result (return -1 without writing)
									 * is not needed until Turkish/Azeri rules are added.
									 */

									/* 1:1 or special conversions, but not locale/context specific:
									 * decode the script generated table for the requested direction.
									 */
									duk_memzero(&bd_ctx, sizeof(bd_ctx));
									bd_ctx.data = uppercase ? (const duk_uint8_t *) duk_unicode_caseconv_uc : (const duk_uint8_t *) duk_unicode_caseconv_lc;
									bd_ctx.length = uppercase ? (duk_size_t) sizeof(duk_unicode_caseconv_uc) : (duk_size_t) sizeof(duk_unicode_caseconv_lc);

									return duk__slow_case_conversion(thr, bw, cp, &bd_ctx);
								}


								/*

								 *  Replace valstack top with case converted version.

								 */


								DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_bool_t uppercase) {
									/* Case convert the string at the value stack top and leave the
									 * converted string in its place.  Codepoints are fed to the
									 * conversion helper through a three element sliding window
									 * (previous, current, following) so that context sensitive rules
									 * can be applied.
									 */
									duk_hstring *h_input;
									duk_bufwriter_ctx bw_alloc;
									duk_bufwriter_ctx *bw = &bw_alloc;
									const duk_uint8_t *p_start;
									const duk_uint8_t *p_end;
									const duk_uint8_t *p;
									duk_codepoint_t behind = -1; /* codepoint before 'here', -1 if none */
									duk_codepoint_t here = -1; /* codepoint being converted, -1 if none */
									duk_codepoint_t ahead = -1; /* codepoint after 'here', -1 if none */

									h_input = duk_require_hstring(thr, -1); /* Accept symbols. */
									DUK_ASSERT(h_input != NULL);

									DUK_BW_INIT_PUSHBUF(thr, bw, duk_hstring_get_bytelen(h_input));

									/* [ ... input buffer ] */

									p_start = (const duk_uint8_t *) duk_hstring_get_data(h_input);
									p_end = p_start + duk_hstring_get_bytelen(h_input);
									p = p_start;

									DUK_UNREF(behind);
									for (;;) {
										/* Slide the window forward by one codepoint. */
										behind = here;
										here = ahead;
										if (p < p_end) {
											ahead = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p, p_start, p_end);
										} else {
											ahead = -1;
											if (here < 0) {
												/* end of input and last char has been processed */
												break;
											}
										}

										if (here < 0) {
											/* Nothing to convert yet; this is the case on the first round. */
											continue;
										}

										/* XXX: could add a fast path to process chunks of input codepoints,
										 * but relative benefit would be quite small.
										 */

										/* Ensure space for maximum multi-character result; estimate is overkill. */
										DUK_BW_ENSURE(thr, bw, 8 * DUK_UNICODE_MAX_XUTF8_LENGTH);

										duk__case_transform_helper(thr, bw, (duk_codepoint_t) here, behind, ahead, uppercase);
									}

									DUK_BW_COMPACT(thr, bw);
									(void) duk_buffer_to_string(thr, -1); /* Safe, output is encoded. */
									/* The input string is dropped from the value stack next, so h_input
									 * must not be used after this point.
									 */
									duk_remove_m2(thr);
								}


								#if defined(DUK_USE_REGEXP_SUPPORT)


								/*

								 *  Canonicalize() abstract operation needed for canonicalization of individual

								 *  codepoints during regexp compilation and execution, see E5 Section 15.10.2.8.

								 *  Note that codepoints are canonicalized one character at a time, so no context

								 *  specific rules can apply.  Locale specific rules can apply, though.

								 */


								DUK_INTERNAL duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp) {
								#if defined(DUK_USE_REGEXP_CANON_WORKAROUND)
									/* Table driven variant: fast canonicalization lookup at the cost of
									 * 128kB footprint.  Codepoints at or above 0x10000 are returned
									 * unchanged.
									 */
									DUK_ASSERT(cp >= 0);
									DUK_UNREF(thr);
									return DUK_LIKELY(cp < 0x10000L) ? (duk_codepoint_t) duk_unicode_re_canon_lookup[cp] : cp;
								#else /* DUK_USE_REGEXP_CANON_WORKAROUND */
									/* Uppercase the codepoint in isolation: no output buffer and no
									 * surrounding characters.
									 */
									duk_codepoint_t upper;

									upper = duk__case_transform_helper(thr,
									                                   NULL, /* NULL is allowed, no output */
									                                   cp, /* curr char */
									                                   -1, /* prev char */
									                                   -1, /* next char */
									                                   1); /* uppercase */

									if (upper < 0) {
										/* Conversion yields multiple codepoints --> leave as is. */
										return cp;
									}
									if (cp >= 0x80 && upper < 0x80) {
										/* Non-ASCII mapped to ASCII --> leave as is. */
										return cp;
									}
									return upper;
								#endif /* DUK_USE_REGEXP_CANON_WORKAROUND */
								}


								/*

								 *  E5 Section 15.10.2.6 "IsWordChar" abstract operation.  Assume

								 *  x < 0 for characters read outside the string.

								 */


								DUK_INTERNAL duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t x) {
									/*
									 *  Returns 1 when 'x' is in [0-9a-zA-Z_] and 0 otherwise, which
									 *  includes negative values of 'x'.
									 *
									 *  Note: the description in E5 Section 15.10.2.6 has a typo, it
									 *  contains 'A' twice and lacks 'a'; the intent is [0-9a-zA-Z_].
									 */
									if (x >= '0' && x <= '9') {
										return 1;
									}
									if (x >= 'a' && x <= 'z') {
										return 1;
									}
									if (x >= 'A' && x <= 'Z') {
										return 1;
									}
									if (x == '_') {
										return 1;
									}
									return 0;
								}


								/*

								 *  Regexp range tables

								 */


								/* exposed because lexer needs these too */

								/* Decimal digits, stored as one (start, end) pair with both ends included. */
								DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_digit[2] = {
									(duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL, /* '0'...'9' */
								};

								/* White space and line terminator characters, stored as 11 (start, end)
								 * pairs with both ends included.
								 */
								DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_white[22] = {
									(duk_uint16_t) 0x0009UL, (duk_uint16_t) 0x000DUL, /* TAB, LF, VT, FF, CR */
									(duk_uint16_t) 0x0020UL, (duk_uint16_t) 0x0020UL, /* SPACE */
									(duk_uint16_t) 0x00A0UL, (duk_uint16_t) 0x00A0UL, /* NO-BREAK SPACE */
									(duk_uint16_t) 0x1680UL, (duk_uint16_t) 0x1680UL, /* OGHAM SPACE MARK */
									(duk_uint16_t) 0x180EUL, (duk_uint16_t) 0x180EUL, /* MONGOLIAN VOWEL SEPARATOR */
									(duk_uint16_t) 0x2000UL, (duk_uint16_t) 0x200AUL, /* EN QUAD...HAIR SPACE */
									(duk_uint16_t) 0x2028UL, (duk_uint16_t) 0x2029UL, /* LINE SEPARATOR, PARAGRAPH SEPARATOR */
									(duk_uint16_t) 0x202FUL, (duk_uint16_t) 0x202FUL, /* NARROW NO-BREAK SPACE */
									(duk_uint16_t) 0x205FUL, (duk_uint16_t) 0x205FUL, /* MEDIUM MATHEMATICAL SPACE */
									(duk_uint16_t) 0x3000UL, (duk_uint16_t) 0x3000UL, /* IDEOGRAPHIC SPACE */
									(duk_uint16_t) 0xFEFFUL, (duk_uint16_t) 0xFEFFUL, /* ZERO WIDTH NO-BREAK SPACE (BOM) */
								};

								/* Word characters [0-9A-Z_a-z], stored as 4 (start, end) pairs with both
								 * ends included.
								 */
								DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_wordchar[8] = {
									(duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL, /* '0'...'9' */
									(duk_uint16_t) 0x0041UL, (duk_uint16_t) 0x005AUL, /* 'A'...'Z' */
									(duk_uint16_t) 0x005FUL, (duk_uint16_t) 0x005FUL, /* '_' */
									(duk_uint16_t) 0x0061UL, (duk_uint16_t) 0x007AUL, /* 'a'...'z' */
								};

								/* Everything in 0x0000...0xFFFF except decimal digits, stored as 2
								 * (start, end) pairs with both ends included.
								 */
								DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_digit[4] = {
									(duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL, /* below '0' */
									(duk_uint16_t) 0x003AUL, (duk_uint16_t) 0xFFFFUL, /* above '9' */
								};

								/* Everything in 0x0000...0xFFFF except the white space ranges, stored as
								 * 12 (start, end) pairs with both ends included.  Each pair covers the gap
								 * between two consecutive white space ranges.
								 */
								DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_white[24] = {
									(duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x0008UL, /* below U+0009 */
									(duk_uint16_t) 0x000EUL, (duk_uint16_t) 0x001FUL, /* between U+000D and U+0020 */
									(duk_uint16_t) 0x0021UL, (duk_uint16_t) 0x009FUL, /* between U+0020 and U+00A0 */
									(duk_uint16_t) 0x00A1UL, (duk_uint16_t) 0x167FUL, /* between U+00A0 and U+1680 */
									(duk_uint16_t) 0x1681UL, (duk_uint16_t) 0x180DUL, /* between U+1680 and U+180E */
									(duk_uint16_t) 0x180FUL, (duk_uint16_t) 0x1FFFUL, /* between U+180E and U+2000 */
									(duk_uint16_t) 0x200BUL, (duk_uint16_t) 0x2027UL, /* between U+200A and U+2028 */
									(duk_uint16_t) 0x202AUL, (duk_uint16_t) 0x202EUL, /* between U+2029 and U+202F */
									(duk_uint16_t) 0x2030UL, (duk_uint16_t) 0x205EUL, /* between U+202F and U+205F */
									(duk_uint16_t) 0x2060UL, (duk_uint16_t) 0x2FFFUL, /* between U+205F and U+3000 */
									(duk_uint16_t) 0x3001UL, (duk_uint16_t) 0xFEFEUL, /* between U+3000 and U+FEFF */
									(duk_uint16_t) 0xFF00UL, (duk_uint16_t) 0xFFFFUL, /* above U+FEFF */
								};

								/* Everything in 0x0000...0xFFFF except [0-9A-Z_a-z], stored as 5
								 * (start, end) pairs with both ends included.
								 */
								DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_wordchar[10] = {
									(duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL, /* below '0' */
									(duk_uint16_t) 0x003AUL, (duk_uint16_t) 0x0040UL, /* between '9' and 'A' */
									(duk_uint16_t) 0x005BUL, (duk_uint16_t) 0x005EUL, /* between 'Z' and '_' */
									(duk_uint16_t) 0x0060UL, (duk_uint16_t) 0x0060UL, /* between '_' and 'a' */
									(duk_uint16_t) 0x007BUL, (duk_uint16_t) 0xFFFFUL, /* above 'z' */
								};


								#endif /* DUK_USE_REGEXP_SUPPORT */