duktape/src-input/duk_regexp_compiler.c


								/*

								 *  Regexp compilation.

								 *

								 *  See doc/regexp.rst for a discussion of the compilation approach and

								 *  current limitations.

								 *

								 *  Regexp bytecode assumes jumps can be expressed with signed 32-bit

								 *  integers.  Consequently the bytecode size must not exceed 0x7fffffffL.

								 *  The implementation casts duk_size_t (buffer size) to duk_(u)int32_t

								 *  in many places.  Although this could be changed, the bytecode format

								 *  limit would still prevent regexps exceeding the signed 32-bit limit

								 *  from working.

								 *

								 *  XXX: The implementation does not prevent bytecode from exceeding the

								 *  maximum supported size.  This could be done by limiting the maximum

								 *  input string size (assuming an upper bound can be computed for number

								 *  of bytecode bytes emitted per input byte) or checking buffer maximum

								 *  size when emitting bytecode (slower).

								 */


								#include "duk_internal.h"


								#if defined(DUK_USE_REGEXP_SUPPORT)


								/*

								 *  Helper macros

								 */


								#define DUK__RE_INITIAL_BUFSIZE 64


								#define DUK__RE_BUFLEN(re_ctx) \

									DUK_BW_GET_SIZE(re_ctx->thr, &re_ctx->bw)


								/*

								 *  Disjunction struct: result of parsing a disjunction

								 */


								typedef struct {

									/* Number of characters that the atom matches (e.g. 3 for 'abc'),

									 * -1 if atom is complex and number of matched characters either

									 * varies or is not known.

									 */

									duk_int32_t charlen;


								#if 0

									/* These are not needed to implement quantifier capture handling,

									 * but might be needed at some point.

									 */


									/* re_ctx->captures at start and end of atom parsing.

									 * Since 'captures' indicates highest capture number emitted

									 * so far in a DUK_REOP_SAVE, the captures numbers saved by

									 * the atom are: ]start_captures,end_captures].

									 */

									duk_uint32_t start_captures;

									duk_uint32_t end_captures;

								#endif

								} duk__re_disjunction_info;


								/*

								 *  Encoding helpers

								 *

								 *  Some of the typing is bytecode based, e.g. slice sizes are unsigned 32-bit

								 *  even though the buffer operations will use duk_size_t.

								 */


								/* XXX: the insert helpers should ensure that the bytecode result is not

								 * larger than expected (or at least assert for it).  Many things in the

								 * bytecode, like skip offsets, won't work correctly if the bytecode is

								 * larger than say 2G.

								 */


								DUK_LOCAL duk_uint32_t duk__encode_i32(duk_int32_t x) {

									if (x < 0) {

										return ((duk_uint32_t) (-x)) * 2 + 1;

									} else {

										return ((duk_uint32_t) x) * 2;

									}

								}


								/* XXX: return type should probably be duk_size_t, or explicit checks are needed for

								 * maximum size.

								 */

								DUK_LOCAL duk_uint32_t duk__insert_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t x) {

									duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH];

									duk_small_int_t len;


									len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf);

									DUK_ASSERT(len >= 0);

									DUK_BW_INSERT_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, offset, buf, (duk_size_t) len);

									return (duk_uint32_t) len;

								}


								DUK_LOCAL void duk__append_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {

									DUK_BW_WRITE_ENSURE_XUTF8(re_ctx->thr, &re_ctx->bw, x);

								}


								DUK_LOCAL void duk__append_7bit(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {

								#if defined(DUK_USE_PREFER_SIZE)

									duk__append_u32(re_ctx, x);

								#else

									DUK_ASSERT(x <= 0x7fU);

									DUK_BW_WRITE_ENSURE_U8(re_ctx->thr, &re_ctx->bw, (duk_uint8_t) x);

								#endif

								}


								#if 0

								DUK_LOCAL void duk__append_2bytes(duk_re_compiler_ctx *re_ctx, duk_uint8_t x, duk_uint8_t y) {

									DUK_BW_WRITE_ENSURE_U8_2(re_ctx->thr, &re_ctx->bw, x, y);

								}

								#endif


								DUK_LOCAL duk_uint32_t duk__insert_i32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t x) {

									return duk__insert_u32(re_ctx, offset, duk__encode_i32(x));

								}


								DUK_LOCAL void duk__append_reop(duk_re_compiler_ctx *re_ctx, duk_uint32_t reop) {

									DUK_ASSERT(reop <= 0x7fU);

									(void) duk__append_7bit(re_ctx, reop);

								}


								#if 0  /* unused */

								DUK_LOCAL void duk__append_i32(duk_re_compiler_ctx *re_ctx, duk_int32_t x) {

									duk__append_u32(re_ctx, duk__encode_i32(x));

								}

								#endif


								/* special helper for emitting u16 lists (used for character ranges for built-in char classes) */

								DUK_LOCAL void duk__append_u16_list(duk_re_compiler_ctx *re_ctx, const duk_uint16_t *values, duk_uint32_t count) {

									/* Call sites don't need the result length so it's not accumulated. */

									while (count-- > 0) {

										duk__append_u32(re_ctx, (duk_uint32_t) (*values++));

									}

								}


								DUK_LOCAL void duk__insert_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t data_offset, duk_uint32_t data_length) {

									DUK_BW_INSERT_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, offset, data_offset, data_length);

								}


								DUK_LOCAL void duk__append_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) {

									DUK_BW_WRITE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length);

								}


								DUK_LOCAL void duk__remove_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) {

									DUK_BW_REMOVE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length);

								}


								/*

								 *  Insert a jump offset at 'offset' to complete an instruction

								 *  (the jump offset is always the last component of an instruction).

								 *  The 'skip' argument must be computed relative to 'offset',

								 *  -without- taking into account the skip field being inserted.

								 *

								 *       ... A B C ins X Y Z ...   (ins may be a JUMP, SPLIT1/SPLIT2, etc)

								 *   =>  ... A B C ins SKIP X Y Z

								 *

								 *  Computing the final (adjusted) skip value, which is relative to the

								 *  first byte of the next instruction, is a bit tricky because of the

								 *  variable length UTF-8 encoding.  See doc/regexp.rst for discussion.

								 */

								DUK_LOCAL duk_uint32_t duk__insert_jump_offset(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t skip) {

								#if 0

									/* Iterative solution. */

									if (skip < 0) {

										duk_small_int_t len;

										/* two encoding attempts suffices */

										len = duk_unicode_get_xutf8_length((duk_codepoint_t) duk__encode_i32(skip));

										len = duk_unicode_get_xutf8_length((duk_codepoint_t) duk__encode_i32(skip - (duk_int32_t) len));

										DUK_ASSERT(duk_unicode_get_xutf8_length(duk__encode_i32(skip - (duk_int32_t) len)) == len);  /* no change */

										skip -= (duk_int32_t) len;

									}

								#endif


								#if defined(DUK_USE_PREFER_SIZE)

									/* Closed form solution, this produces smallest code.

									 * See re_neg_jump_offset (closed2).

									 */

									if (skip < 0) {

										skip--;

										if (skip < -0x3fL) {

											skip--;

										}

										if (skip < -0x3ffL) {

											skip--;

										}

										if (skip < -0x7fffL) {

											skip--;

										}

										if (skip < -0xfffffL) {

											skip--;

										}

										if (skip < -0x1ffffffL) {

											skip--;

										}

										if (skip < -0x3fffffffL) {

											skip--;

										}

									}

								#else  /* DUK_USE_PREFER_SIZE */

									/* Closed form solution, this produces fastest code.

									 * See re_neg_jump_offset (closed1).

									 */

									if (skip < 0) {

										if (skip >= -0x3eL) {

											skip -= 1;

										} else if (skip >= -0x3fdL) {

											skip -= 2;

										} else if (skip >= -0x7ffcL) {

											skip -= 3;

										} else if (skip >= -0xffffbL) {

											skip -= 4;

										} else if (skip >= -0x1fffffaL) {

											skip -= 5;

										} else if (skip >= -0x3ffffff9L) {

											skip -= 6;

										} else {

											skip -= 7;

										}

									}

								#endif  /* DUK_USE_PREFER_SIZE */


									return duk__insert_i32(re_ctx, offset, skip);

								}


								DUK_LOCAL duk_uint32_t duk__append_jump_offset(duk_re_compiler_ctx *re_ctx, duk_int32_t skip) {

									return (duk_uint32_t) duk__insert_jump_offset(re_ctx, (duk_uint32_t) DUK__RE_BUFLEN(re_ctx), skip);

								}


								/*

								 *  duk_re_range_callback for generating character class ranges.

								 *

								 *  When ignoreCase is false, the range is simply emitted as is.  We don't,

								 *  for instance, eliminate duplicates or overlapping ranges in a character

								 *  class.

								 *

								 *  When ignoreCase is true but the 'direct' flag is set, the caller knows

								 *  that the range canonicalizes to itself for case insensitive matching,

								 *  so the range is emitted as is.  This is mainly useful for built-in ranges

								 *  like \W.

								 *

								 *  Otherwise, when ignoreCase is true, the range needs to be normalized

								 *  through canonicalization.  Unfortunately a canonicalized version of a

								 *  continuous range is not necessarily continuous (e.g. [x-{] is continuous

								 *  but [X-{] is not).  As a result, a single input range may expand to a lot

								 *  of output ranges.  The current algorithm creates the canonicalized ranges

								 *  footprint efficiently at the cost of compile time execution time; see

								 *  doc/regexp.rst for discussion, and some more details below.

								 *

								 *  Note that the ctx->nranges is a context-wide temporary value.  This is OK

								 *  because there cannot be multiple character classes being parsed

								 *  simultaneously.

								 *

								 *  More detail on canonicalization:

								 *

								 *  Conceptually, a range is canonicalized by scanning the entire range,

								 *  normalizing each codepoint by converting it to uppercase, and generating

								 *  a set of result ranges.

								 *

								 *  Ideally a minimal set of output ranges would be emitted by merging all

								 *  possible ranges even if they're emitted out of sequence.  Because the

								 *  input string is also case normalized during matching, some codepoints

								 *  never occur at runtime; these "don't care" codepoints can be included or

								 *  excluded from ranges when merging/optimizing ranges.

								 *

								 *  The current algorithm does not do optimal range merging.  Rather, output

								 *  codepoints are generated in sequence, and when the output codepoints are

								 *  continuous (CP, CP+1, CP+2, ...), they are merged locally into as large a

								 *  range as possible.  A small canonicalization bitmap is used to reduce

								 *  actual codepoint canonicalizations which are quite slow at present.  The

								 *  bitmap provides a "codepoint block is continuous with respect to

								 *  canonicalization" for N-codepoint blocks.  This allows blocks to be

								 *  skipped quickly.

								 *

								 *  There are a number of shortcomings and future work here:

								 *

								 *    - Individual codepoint normalizations are slow because they involve

								 *      walking bit-packed rules without a lookup index.

								 *

								 *    - The conceptual algorithm needs to canonicalize every codepoint in the

								 *      input range to figure out the output range(s).  Even with the small

								 *      canonicalization bitmap the algorithm runs quite slowly for worst case

								 *      inputs.  There are many data structure alternatives to improve this.

								 *

								 *    - While the current algorithm generates maximal output ranges when the

								 *      output codepoints are emitted linearly, output ranges are not sorted or

								 *      merged otherwise.  In the worst case a lot of ranges are emitted when

								 *      most of the ranges could be merged.  In this process one could take

								 *      advantage of "don't care" codepoints, which are never matched against at

								 *      runtime due to canonicalization of input codepoints before comparison,

								 *      to merge otherwise discontinuous output ranges.

								 *

								 *    - The runtime data structure is just a linear list of ranges to match

								 *      against.  This can be quite slow if there are a lot of output ranges.

								 *      There are various ways to make matching against the ranges faster,

								 *      e.g. sorting the ranges and using a binary search; skip lists; tree

								 *      based representations; full or approximate codepoint bitmaps, etc.

								 *

								 *    - Only BMP is supported, codepoints above BMP are assumed to canonicalize

								 *      to themselves.  For now this is one place where we don't want to

								 *      support chars outside the BMP, because the exhaustive search would be

								 *      massively larger.  It would be possible to support non-BMP with a

								 *      different algorithm, or perhaps doing case normalization only at match

								 *      time.

								 */


								DUK_LOCAL void duk__regexp_emit_range(duk_re_compiler_ctx *re_ctx, duk_codepoint_t r1, duk_codepoint_t r2) {

									DUK_ASSERT(r2 >= r1);

									duk__append_u32(re_ctx, (duk_uint32_t) r1);

									duk__append_u32(re_ctx, (duk_uint32_t) r2);

									re_ctx->nranges++;

								}


								#if defined(DUK_USE_REGEXP_CANON_BITMAP)

								/* Find next canonicalization discontinuity (conservative estimate) starting

								 * from 'start', not exceeding 'end'.  If continuity is fine up to 'end'

								 * inclusive, returns end.  Minimum possible return value is start.

								 */

								DUK_LOCAL duk_codepoint_t duk__re_canon_next_discontinuity(duk_codepoint_t start, duk_codepoint_t end) {

									duk_uint_t start_blk;

									duk_uint_t end_blk;

									duk_uint_t blk;

									duk_uint_t offset;

									duk_uint8_t mask;


									/* Inclusive block range. */

									DUK_ASSERT(start >= 0);

									DUK_ASSERT(end >= 0);

									DUK_ASSERT(end >= start);

									start_blk = (duk_uint_t) (start >> DUK_CANON_BITMAP_BLKSHIFT);

									end_blk = (duk_uint_t) (end >> DUK_CANON_BITMAP_BLKSHIFT);


									for (blk = start_blk; blk <= end_blk; blk++) {

										offset = blk >> 3;

										mask = 1U << (blk & 0x07);

										if (offset >= sizeof(duk_unicode_re_canon_bitmap)) {

											/* Reached non-BMP range which is assumed continuous. */

											return end;

										}

										DUK_ASSERT(offset < sizeof(duk_unicode_re_canon_bitmap));

										if ((duk_unicode_re_canon_bitmap[offset] & mask) == 0) {

											/* Block is discontinuous, continuity is guaranteed

											 * only up to end of previous block (+1 for exclusive

											 * return value => start of current block).  Start

											 * block requires special handling.

											 */

											if (blk > start_blk) {

												return (duk_codepoint_t) (blk << DUK_CANON_BITMAP_BLKSHIFT);

											} else {

												return start;

											}

										}

									}

									DUK_ASSERT(blk == end_blk + 1);  /* Reached end block which is continuous. */

									return end;

								}

								#else  /* DUK_USE_REGEXP_CANON_BITMAP */

								DUK_LOCAL duk_codepoint_t duk__re_canon_next_discontinuity(duk_codepoint_t start, duk_codepoint_t end) {

									DUK_ASSERT(start >= 0);

									DUK_ASSERT(end >= 0);

									DUK_ASSERT(end >= start);

									if (start >= 0x10000) {

										/* Even without the bitmap, treat non-BMP as continuous. */

										return end;

									}

									return start;

								}

								#endif  /* DUK_USE_REGEXP_CANON_BITMAP */


								DUK_LOCAL void duk__regexp_generate_ranges(void *userdata, duk_codepoint_t r1, duk_codepoint_t r2, duk_bool_t direct) {

									duk_re_compiler_ctx *re_ctx = (duk_re_compiler_ctx *) userdata;

									duk_codepoint_t r_start;

									duk_codepoint_t r_end;

									duk_codepoint_t i;

									duk_codepoint_t t;

									duk_codepoint_t r_disc;


									DUK_DD(DUK_DDPRINT("duk__regexp_generate_ranges(): re_ctx=%p, range=[%ld,%ld] direct=%ld",

									                   (void *) re_ctx, (long) r1, (long) r2, (long) direct));


									DUK_ASSERT(r2 >= r1);  /* SyntaxError for out of order range. */


									if (direct || (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) == 0) {

										DUK_DD(DUK_DDPRINT("direct or not case sensitive, emit range: [%ld,%ld]", (long) r1, (long) r2));

										duk__regexp_emit_range(re_ctx, r1, r2);

										return;

									}


									DUK_DD(DUK_DDPRINT("case sensitive, process range: [%ld,%ld]", (long) r1, (long) r2));


									r_start = duk_unicode_re_canonicalize_char(re_ctx->thr, r1);

									r_end = r_start;


									for (i = r1 + 1; i <= r2;) {

										/* Input codepoint space processed up to i-1, and

										 * current range in r_{start,end} is up-to-date

										 * (inclusive) and may either break or continue.

										 */

										r_disc = duk__re_canon_next_discontinuity(i, r2);

										DUK_ASSERT(r_disc >= i);

										DUK_ASSERT(r_disc <= r2);


										r_end += r_disc - i;  /* May be zero. */

										t = duk_unicode_re_canonicalize_char(re_ctx->thr, r_disc);

										if (t == r_end + 1) {

											/* Not actually a discontinuity, continue range

											 * to r_disc and recheck.

											 */

											r_end = t;

										} else {

											duk__regexp_emit_range(re_ctx, r_start, r_end);

											r_start = t;

											r_end = t;

										}

										i = r_disc + 1;  /* Guarantees progress. */

									}

									duk__regexp_emit_range(re_ctx, r_start, r_end);


								#if 0  /* Exhaustive search, very slow. */

									r_start = duk_unicode_re_canonicalize_char(re_ctx->thr, r1);

									r_end = r_start;

									for (i = r1 + 1; i <= r2; i++) {

										t = duk_unicode_re_canonicalize_char(re_ctx->thr, i);

										if (t == r_end + 1) {

											r_end = t;

										} else {

											DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end));

											duk__append_u32(re_ctx, (duk_uint32_t) r_start);

											duk__append_u32(re_ctx, (duk_uint32_t) r_end);

											re_ctx->nranges++;

											r_start = t;

											r_end = t;

										}

									}

									DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end));

									duk__append_u32(re_ctx, (duk_uint32_t) r_start);

									duk__append_u32(re_ctx, (duk_uint32_t) r_end);

									re_ctx->nranges++;

								#endif

								}


								/*

								 *  Parse regexp Disjunction.  Most of regexp compilation happens here.

								 *

								 *  Handles Disjunction, Alternative, and Term productions directly without

								 *  recursion.  The only constructs requiring recursion are positive/negative

								 *  lookaheads, capturing parentheses, and non-capturing parentheses.

								 *

								 *  The function determines whether the entire disjunction is a 'simple atom'

								 *  (see doc/regexp.rst discussion on 'simple quantifiers') and if so,

								 *  returns the atom character length which is needed by the caller to keep

								 *  track of its own atom character length.  A disjunction with more than one

								 *  alternative is never considered a simple atom (although in some cases

								 *  that might be the case).

								 *

								 *  Return value: simple atom character length or < 0 if not a simple atom.

								 *  Appends the bytecode for the disjunction matcher to the end of the temp

								 *  buffer.

								 *

								 *  Regexp top level structure is:

								 *

								 *    Disjunction = Term*

								 *                | Term* | Disjunction

								 *

								 *    Term = Assertion

								 *         | Atom

								 *         | Atom Quantifier

								 *

								 *  An empty Term sequence is a valid disjunction alternative (e.g. /|||c||/).

								 *

								 *  Notes:

								 *

								 *    * Tracking of the 'simple-ness' of the current atom vs. the entire

								 *      disjunction are separate matters.  For instance, the disjunction

								 *      may be complex, but individual atoms may be simple.  Furthermore,

								 *      simple quantifiers are used whenever possible, even if the

								 *      disjunction as a whole is complex.

								 *

								 *    * The estimate of whether an atom is simple is conservative now,

								 *      and it would be possible to expand it.  For instance, captures

								 *      cause the disjunction to be marked complex, even though captures

								 *      -can- be handled by simple quantifiers with some minor modifications.

								 *

								 *    * Disjunction 'tainting' as 'complex' is handled at the end of the

								 *      main for loop collectively for atoms.  Assertions, quantifiers,

								 *      and '|' tokens need to taint the result manually if necessary.

								 *      Assertions cannot add to result char length, only atoms (and

								 *      quantifiers) can; currently quantifiers will taint the result

								 *      as complex though.

								 */


								DUK_LOCAL const duk_uint16_t * const duk__re_range_lookup1[3] = {

									duk_unicode_re_ranges_digit,

									duk_unicode_re_ranges_white,

									duk_unicode_re_ranges_wordchar

								};

								DUK_LOCAL const duk_uint8_t duk__re_range_lookup2[3] = {

									sizeof(duk_unicode_re_ranges_digit) / (2 * sizeof(duk_uint16_t)),

									sizeof(duk_unicode_re_ranges_white) / (2 * sizeof(duk_uint16_t)),

									sizeof(duk_unicode_re_ranges_wordchar) / (2 * sizeof(duk_uint16_t))

								};


								DUK_LOCAL void duk__append_range_atom_matcher(duk_re_compiler_ctx *re_ctx, duk_small_uint_t re_op, const duk_uint16_t *ranges, duk_small_uint_t count) {

								#if 0

									DUK_ASSERT(re_op <= 0x7fUL);

									DUK_ASSERT(count <= 0x7fUL);

									duk__append_2bytes(re_ctx, (duk_uint8_t) re_op, (duk_uint8_t) count);

								#endif

									duk__append_reop(re_ctx, re_op);

									duk__append_7bit(re_ctx, count);

									duk__append_u16_list(re_ctx, ranges, count * 2);

								}


								DUK_LOCAL void duk__parse_disjunction(duk_re_compiler_ctx *re_ctx, duk_bool_t expect_eof, duk__re_disjunction_info *out_atom_info) {

									duk_int32_t atom_start_offset = -1;                   /* negative -> no atom matched on previous round */

									duk_int32_t atom_char_length = 0;                     /* negative -> complex atom */

									duk_uint32_t atom_start_captures = re_ctx->captures;  /* value of re_ctx->captures at start of atom */

									duk_int32_t unpatched_disjunction_split = -1;

									duk_int32_t unpatched_disjunction_jump = -1;

									duk_uint32_t entry_offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);

									duk_int32_t res_charlen = 0;  /* -1 if disjunction is complex, char length if simple */

									duk__re_disjunction_info tmp_disj;


									DUK_ASSERT(out_atom_info != NULL);


									if (re_ctx->recursion_depth >= re_ctx->recursion_limit) {

										DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_REGEXP_COMPILER_RECURSION_LIMIT);

									}

									re_ctx->recursion_depth++;


								#if 0

									out_atom_info->start_captures = re_ctx->captures;

								#endif


									for (;;) {

										/* atom_char_length, atom_start_offset, atom_start_offset reflect the

										 * atom matched on the previous loop.  If a quantifier is encountered

										 * on this loop, these are needed to handle the quantifier correctly.

										 * new_atom_char_length etc are for the atom parsed on this round;

										 * they're written to atom_char_length etc at the end of the round.

										 */

										duk_int32_t new_atom_char_length;   /* char length of the atom parsed in this loop */

										duk_int32_t new_atom_start_offset;  /* bytecode start offset of the atom parsed in this loop

										                                     * (allows quantifiers to copy the atom bytecode)

										                                     */

										duk_uint32_t new_atom_start_captures;  /* re_ctx->captures at the start of the atom parsed in this loop */


										duk_lexer_parse_re_token(&re_ctx->lex, &re_ctx->curr_token);


										DUK_DD(DUK_DDPRINT("re token: %ld (num=%ld, char=%c)",

										                   (long) re_ctx->curr_token.t,

										                   (long) re_ctx->curr_token.num,

										                   (re_ctx->curr_token.num >= 0x20 && re_ctx->curr_token.num <= 0x7e) ?

										                   (int) re_ctx->curr_token.num : (int) '?'));


										/* set by atom case clauses */

										new_atom_start_offset = -1;

										new_atom_char_length = -1;

										new_atom_start_captures = re_ctx->captures;


										switch (re_ctx->curr_token.t) {

										case DUK_RETOK_DISJUNCTION: {

											/*

											 *  The handling here is a bit tricky.  If a previous '|' has been processed,

											 *  we have a pending split1 and a pending jump (for a previous match).  These

											 *  need to be back-patched carefully.  See docs for a detailed example.

											 */


											/* patch pending jump and split */

											if (unpatched_disjunction_jump >= 0) {

												duk_uint32_t offset;


												DUK_ASSERT(unpatched_disjunction_split >= 0);

												offset = (duk_uint32_t) unpatched_disjunction_jump;

												offset += duk__insert_jump_offset(re_ctx,

												                                  offset,

												                                  (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));

												/* offset is now target of the pending split (right after jump) */

												duk__insert_jump_offset(re_ctx,

												                        (duk_uint32_t) unpatched_disjunction_split,

												                        (duk_int32_t) offset - unpatched_disjunction_split);

											}


											/* add a new pending split to the beginning of the entire disjunction */

											(void) duk__insert_u32(re_ctx,

											                       entry_offset,

											                       DUK_REOP_SPLIT1);   /* prefer direct execution */

											unpatched_disjunction_split = (duk_int32_t) (entry_offset + 1);   /* +1 for opcode */


											/* add a new pending match jump for latest finished alternative */

											duk__append_reop(re_ctx, DUK_REOP_JUMP);

											unpatched_disjunction_jump = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);


											/* 'taint' result as complex */

											res_charlen = -1;

											break;

										}

										case DUK_RETOK_QUANTIFIER: {

											if (atom_start_offset < 0) {

												DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_NO_ATOM);

											}

											if (re_ctx->curr_token.qmin > re_ctx->curr_token.qmax) {

												DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_VALUES);

											}

											if (atom_char_length >= 0) {

												/*

												 *  Simple atom

												 *

												 *  If atom_char_length is zero, we'll have unbounded execution time for e.g.

												 *  /()*x/.exec('x').  We can't just skip the match because it might have some

												 *  side effects (for instance, if we allowed captures in simple atoms, the

												 *  capture needs to happen).  The simple solution below is to force the

												 *  quantifier to match at most once, since the additional matches have no effect.

												 *

												 *  With a simple atom there can be no capture groups, so no captures need

												 *  to be reset.

												 */

												duk_int32_t atom_code_length;

												duk_uint32_t offset;

												duk_uint32_t qmin, qmax;


												qmin = re_ctx->curr_token.qmin;

												qmax = re_ctx->curr_token.qmax;

												if (atom_char_length == 0) {

													/* qmin and qmax will be 0 or 1 */

													if (qmin > 1) {

														qmin = 1;

													}

													if (qmax > 1) {

														qmax = 1;

													}

												}


												duk__append_reop(re_ctx, DUK_REOP_MATCH);   /* complete 'sub atom' */

												atom_code_length = (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (duk_size_t) atom_start_offset);


												offset = (duk_uint32_t) atom_start_offset;

												if (re_ctx->curr_token.greedy) {

													offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQGREEDY);

													offset += duk__insert_u32(re_ctx, offset, qmin);

													offset += duk__insert_u32(re_ctx, offset, qmax);

													offset += duk__insert_u32(re_ctx, offset, (duk_uint32_t) atom_char_length);

													offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);

												} else {

													offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQMINIMAL);

													offset += duk__insert_u32(re_ctx, offset, qmin);

													offset += duk__insert_u32(re_ctx, offset, qmax);

													offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);

												}

												DUK_UNREF(offset);  /* silence scan-build warning */

											} else {

												/*

												 *  Complex atom

												 *

												 *  The original code is used as a template, and removed at the end

												 *  (this differs from the handling of simple quantifiers).

												 *

												 *  NOTE: there is no current solution for empty atoms in complex

												 *  quantifiers.  This would need some sort of a 'progress' instruction.

												 *

												 *  XXX: impose limit on maximum result size, i.e. atom_code_len * atom_copies?

												 */

												duk_int32_t atom_code_length;

												duk_uint32_t atom_copies;

												duk_uint32_t tmp_qmin, tmp_qmax;


												/* pre-check how many atom copies we're willing to make (atom_copies not needed below) */

												atom_copies = (re_ctx->curr_token.qmax == DUK_RE_QUANTIFIER_INFINITE) ?

												              re_ctx->curr_token.qmin : re_ctx->curr_token.qmax;

												if (atom_copies > DUK_RE_MAX_ATOM_COPIES) {

													DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_QUANTIFIER_TOO_MANY_COPIES);

												}


												/* wipe the capture range made by the atom (if any) */

												DUK_ASSERT(atom_start_captures <= re_ctx->captures);

												if (atom_start_captures != re_ctx->captures) {

													DUK_ASSERT(atom_start_captures < re_ctx->captures);

													DUK_DDD(DUK_DDDPRINT("must wipe ]atom_start_captures,re_ctx->captures]: ]%ld,%ld]",

													                     (long) atom_start_captures, (long) re_ctx->captures));


													/* insert (DUK_REOP_WIPERANGE, start, count) in reverse order so the order ends up right */

													duk__insert_u32(re_ctx, (duk_uint32_t) atom_start_offset, (re_ctx->captures - atom_start_captures) * 2U);

													duk__insert_u32(re_ctx, (duk_uint32_t) atom_start_offset, (atom_start_captures + 1) * 2);

													duk__insert_u32(re_ctx, (duk_uint32_t) atom_start_offset, DUK_REOP_WIPERANGE);

												} else {

													DUK_DDD(DUK_DDDPRINT("no need to wipe captures: atom_start_captures == re_ctx->captures == %ld",

													                     (long) atom_start_captures));

												}


												atom_code_length = (duk_int32_t) DUK__RE_BUFLEN(re_ctx) - atom_start_offset;


												/* insert the required matches (qmin) by copying the atom */

												tmp_qmin = re_ctx->curr_token.qmin;

												tmp_qmax = re_ctx->curr_token.qmax;

												while (tmp_qmin > 0) {

													duk__append_slice(re_ctx, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);

													tmp_qmin--;

													if (tmp_qmax != DUK_RE_QUANTIFIER_INFINITE) {

														tmp_qmax--;

													}

												}

												DUK_ASSERT(tmp_qmin == 0);


												/* insert code for matching the remainder - infinite or finite */

												if (tmp_qmax == DUK_RE_QUANTIFIER_INFINITE) {

													/* reuse last emitted atom for remaining 'infinite' quantifier */


													if (re_ctx->curr_token.qmin == 0) {

														/* Special case: original qmin was zero so there is nothing

														 * to repeat.  Emit an atom copy but jump over it here.

														 */

														duk__append_reop(re_ctx, DUK_REOP_JUMP);

														duk__append_jump_offset(re_ctx, atom_code_length);

														duk__append_slice(re_ctx, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);

													}

													if (re_ctx->curr_token.greedy) {

														duk__append_reop(re_ctx, DUK_REOP_SPLIT2);   /* prefer jump */

													} else {

														duk__append_reop(re_ctx, DUK_REOP_SPLIT1);   /* prefer direct */

													}

													duk__append_jump_offset(re_ctx, -atom_code_length - 1);  /* -1 for opcode */

												} else {

													/*

													 *  The remaining matches are emitted as sequence of SPLITs and atom

													 *  copies; the SPLITs skip the remaining copies and match the sequel.

													 *  This sequence needs to be emitted starting from the last copy

													 *  because the SPLITs are variable length due to the variable length

													 *  skip offset.  This causes a lot of memory copying now.

													 *

													 *  Example structure (greedy, match maximum # atoms):

													 *

													 *      SPLIT1 LSEQ

													 *      (atom)

													 *      SPLIT1 LSEQ    ; <- the byte length of this instruction is needed

													 *      (atom)         ; to encode the above SPLIT1 correctly

													 *      ...

													 *   LSEQ:

													 */

													duk_uint32_t offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);

													while (tmp_qmax > 0) {

														duk__insert_slice(re_ctx, offset, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);

														if (re_ctx->curr_token.greedy) {

															duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT1);   /* prefer direct */

														} else {

															duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT2);   /* prefer jump */

														}

														duk__insert_jump_offset(re_ctx,

														                        offset + 1,   /* +1 for opcode */

														                        (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));

														tmp_qmax--;

													}

												}


												/* remove the original 'template' atom */

												duk__remove_slice(re_ctx, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);

											}


											/* 'taint' result as complex */

											res_charlen = -1;

											break;

										}

										case DUK_RETOK_ASSERT_START: {

											duk__append_reop(re_ctx, DUK_REOP_ASSERT_START);

											break;

										}

										case DUK_RETOK_ASSERT_END: {

											duk__append_reop(re_ctx, DUK_REOP_ASSERT_END);

											break;

										}

										case DUK_RETOK_ASSERT_WORD_BOUNDARY: {

											duk__append_reop(re_ctx, DUK_REOP_ASSERT_WORD_BOUNDARY);

											break;

										}

										case DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY: {

											duk__append_reop(re_ctx, DUK_REOP_ASSERT_NOT_WORD_BOUNDARY);

											break;

										}

										case DUK_RETOK_ASSERT_START_POS_LOOKAHEAD:

										case DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD: {

											duk_uint32_t offset;

											duk_uint32_t opcode = (re_ctx->curr_token.t == DUK_RETOK_ASSERT_START_POS_LOOKAHEAD) ?

											                      DUK_REOP_LOOKPOS : DUK_REOP_LOOKNEG;


											offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);

											duk__parse_disjunction(re_ctx, 0, &tmp_disj);

											duk__append_reop(re_ctx, DUK_REOP_MATCH);


											(void) duk__insert_u32(re_ctx, offset, opcode);

											(void) duk__insert_jump_offset(re_ctx,

											                               offset + 1,   /* +1 for opcode */

											                               (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));


											/* 'taint' result as complex -- this is conservative,

											 * as lookaheads do not backtrack.

											 */

											res_charlen = -1;

											break;

										}

										case DUK_RETOK_ATOM_PERIOD: {

											new_atom_char_length = 1;

											new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);

											duk__append_reop(re_ctx, DUK_REOP_PERIOD);

											break;

										}

										case DUK_RETOK_ATOM_CHAR: {

											/* Note: successive characters could be joined into string matches

											 * but this is not trivial (consider e.g. '/xyz+/); see docs for

											 * more discussion.

											 *

											 * No support for \u{H+} yet.  While only BMP Unicode escapes are

											 * supported for RegExps at present, 'ch' may still be a non-BMP

											 * codepoint if it is decoded straight from source text UTF-8.

											 * There's no non-BMP support yet so this is handled simply by

											 * matching the non-BMP character (which is custom behavior).

											 */

											duk_uint32_t ch;


											new_atom_char_length = 1;

											new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);

											duk__append_reop(re_ctx, DUK_REOP_CHAR);

											ch = re_ctx->curr_token.num;

											if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {

												ch = (duk_uint32_t) duk_unicode_re_canonicalize_char(re_ctx->thr, (duk_codepoint_t) ch);

											}

											duk__append_u32(re_ctx, ch);

											break;

										}

										case DUK_RETOK_ATOM_DIGIT:

										case DUK_RETOK_ATOM_NOT_DIGIT:

										case DUK_RETOK_ATOM_WHITE:

										case DUK_RETOK_ATOM_NOT_WHITE:

										case DUK_RETOK_ATOM_WORD_CHAR:

										case DUK_RETOK_ATOM_NOT_WORD_CHAR: {

											duk_small_uint_t re_op;

											duk_small_uint_t idx;


											new_atom_char_length = 1;

											new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);


											DUK_ASSERT((DUK_RETOK_ATOM_DIGIT & 0x01) != 0);

											DUK_ASSERT((DUK_RETOK_ATOM_WHITE & 0x01) != 0);

											DUK_ASSERT((DUK_RETOK_ATOM_WORD_CHAR & 0x01) != 0);

											DUK_ASSERT((DUK_RETOK_ATOM_NOT_DIGIT & 0x01) == 0);

											DUK_ASSERT((DUK_RETOK_ATOM_NOT_WHITE & 0x01) == 0);

											DUK_ASSERT((DUK_RETOK_ATOM_NOT_WORD_CHAR & 0x01) == 0);

											re_op = (re_ctx->curr_token.t & 0x01) ? DUK_REOP_RANGES : DUK_REOP_INVRANGES;


											DUK_ASSERT(DUK_RETOK_ATOM_WHITE == DUK_RETOK_ATOM_DIGIT + 2);

											DUK_ASSERT(DUK_RETOK_ATOM_WORD_CHAR == DUK_RETOK_ATOM_DIGIT + 4);

											idx = (duk_small_uint_t) ((re_ctx->curr_token.t - DUK_RETOK_ATOM_DIGIT) >> 1U);

											DUK_ASSERT(idx <= 2U);  /* Assume continuous token numbers; also checks negative underflow. */


											duk__append_range_atom_matcher(re_ctx, re_op, duk__re_range_lookup1[idx], duk__re_range_lookup2[idx]);

											break;

										}

										case DUK_RETOK_ATOM_BACKREFERENCE: {

											duk_uint32_t backref = (duk_uint32_t) re_ctx->curr_token.num;

											if (backref > re_ctx->highest_backref) {

												re_ctx->highest_backref = backref;

											}

											new_atom_char_length = -1;   /* mark as complex */

											new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);

											duk__append_reop(re_ctx, DUK_REOP_BACKREFERENCE);

											duk__append_u32(re_ctx, backref);

											break;

										}

										case DUK_RETOK_ATOM_START_CAPTURE_GROUP: {

											duk_uint32_t cap;


											new_atom_char_length = -1;   /* mark as complex (capture handling) */

											new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);

											cap = ++re_ctx->captures;

											duk__append_reop(re_ctx, DUK_REOP_SAVE);

											duk__append_u32(re_ctx, cap * 2);

											duk__parse_disjunction(re_ctx, 0, &tmp_disj);  /* retval (sub-atom char length) unused, tainted as complex above */

											duk__append_reop(re_ctx, DUK_REOP_SAVE);

											duk__append_u32(re_ctx, cap * 2 + 1);

											break;

										}

										case DUK_RETOK_ATOM_START_NONCAPTURE_GROUP: {

											new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);

											duk__parse_disjunction(re_ctx, 0, &tmp_disj);

											new_atom_char_length = tmp_disj.charlen;

											break;

										}

										case DUK_RETOK_ATOM_START_CHARCLASS:

										case DUK_RETOK_ATOM_START_CHARCLASS_INVERTED: {

											/*

											 *  Range parsing is done with a special lexer function which calls

											 *  us for every range parsed.  This is different from how rest of

											 *  the parsing works, but avoids a heavy, arbitrary size intermediate

											 *  value type to hold the ranges.

											 *

											 *  Another complication is the handling of character ranges when

											 *  case insensitive matching is used (see docs for discussion).

											 *  The range handler callback given to the lexer takes care of this

											 *  as well.

											 *

											 *  Note that duplicate ranges are not eliminated when parsing character

											 *  classes, so that canonicalization of

											 *

											 *    [0-9a-fA-Fx-{]

											 *

											 *  creates the result (note the duplicate ranges):

											 *

											 *    [0-9A-FA-FX-Z{-{]

											 *

											 *  where [x-{] is split as a result of canonicalization.  The duplicate

											 *  ranges are not a semantics issue: they work correctly.

											 */


											duk_uint32_t offset;


											DUK_DD(DUK_DDPRINT("character class"));


											/* insert ranges instruction, range count patched in later */

											new_atom_char_length = 1;

											new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);

											duk__append_reop(re_ctx,

											                 (re_ctx->curr_token.t == DUK_RETOK_ATOM_START_CHARCLASS) ?

											                 DUK_REOP_RANGES : DUK_REOP_INVRANGES);

											offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);    /* patch in range count later */


											/* parse ranges until character class ends */

											re_ctx->nranges = 0;    /* note: ctx-wide temporary */

											duk_lexer_parse_re_ranges(&re_ctx->lex, duk__regexp_generate_ranges, (void *) re_ctx);


											/* insert range count */

											duk__insert_u32(re_ctx, offset, re_ctx->nranges);

											break;

										}

										case DUK_RETOK_ATOM_END_GROUP: {

											if (expect_eof) {

												DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_CLOSING_PAREN);

											}

											goto done;

										}

										case DUK_RETOK_EOF: {

											if (!expect_eof) {

												DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_END_OF_PATTERN);

											}

											goto done;

										}

										default: {

											DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_REGEXP_TOKEN);

										}

										}


										/* a complex (new) atom taints the result */

										if (new_atom_start_offset >= 0) {

											if (new_atom_char_length < 0) {

												res_charlen = -1;

											} else if (res_charlen >= 0) {

												/* only advance if not tainted */

												res_charlen += new_atom_char_length;

											}

										}


										/* record previous atom info in case next token is a quantifier */

										atom_start_offset = new_atom_start_offset;

										atom_char_length = new_atom_char_length;

										atom_start_captures = new_atom_start_captures;

									}


								 done:


									/* finish up pending jump and split for last alternative */

									if (unpatched_disjunction_jump >= 0) {

										duk_uint32_t offset;


										DUK_ASSERT(unpatched_disjunction_split >= 0);

										offset = (duk_uint32_t) unpatched_disjunction_jump;

										offset += duk__insert_jump_offset(re_ctx,

										                                  offset,

										                                  (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));

										/* offset is now target of the pending split (right after jump) */

										duk__insert_jump_offset(re_ctx,

										                        (duk_uint32_t) unpatched_disjunction_split,

										                        (duk_int32_t) offset - unpatched_disjunction_split);

									}


								#if 0

									out_atom_info->end_captures = re_ctx->captures;

								#endif

									out_atom_info->charlen = res_charlen;

									DUK_DDD(DUK_DDDPRINT("parse disjunction finished: charlen=%ld",

									                     (long) out_atom_info->charlen));


									re_ctx->recursion_depth--;

								}


								/*

								 *  Flags parsing (see E5 Section 15.10.4.1).

								 */


								DUK_LOCAL duk_uint32_t duk__parse_regexp_flags(duk_hthread *thr, duk_hstring *h) {

									const duk_uint8_t *p;

									const duk_uint8_t *p_end;

									duk_uint32_t flags = 0;


									p = DUK_HSTRING_GET_DATA(h);

									p_end = p + DUK_HSTRING_GET_BYTELEN(h);


									/* Note: can be safely scanned as bytes (undecoded) */


									while (p < p_end) {

										duk_uint8_t c = *p++;

										switch (c) {

										case (duk_uint8_t) 'g': {

											if (flags & DUK_RE_FLAG_GLOBAL) {

												goto flags_error;

											}

											flags |= DUK_RE_FLAG_GLOBAL;

											break;

										}

										case (duk_uint8_t) 'i': {

											if (flags & DUK_RE_FLAG_IGNORE_CASE) {

												goto flags_error;

											}

											flags |= DUK_RE_FLAG_IGNORE_CASE;

											break;

										}

										case (duk_uint8_t) 'm': {

											if (flags & DUK_RE_FLAG_MULTILINE) {

												goto flags_error;

											}

											flags |= DUK_RE_FLAG_MULTILINE;

											break;

										}

										default: {

											goto flags_error;

										}

										}

									}


									return flags;


								 flags_error:

									DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_REGEXP_FLAGS);

									return 0;  /* never here */

								}


								/*

								 *  Create escaped RegExp source (E5 Section 15.10.3).

								 *

								 *  The current approach is to special case the empty RegExp

								 *  ('' -> '(?:)') and otherwise replace unescaped '/' characters

								 *  with '\/' regardless of where they occur in the regexp.

								 *

								 *  Note that normalization does not seem to be necessary for

								 *  RegExp literals (e.g. '/foo/') because to be acceptable as

								 *  a RegExp literal, the text between forward slashes must

								 *  already match the escaping requirements (e.g. must not contain

								 *  unescaped forward slashes or be empty).  Escaping IS needed

								 *  for expressions like 'new Regexp("...", "")' however.

								 *  Currently, we re-escape in either case.

								 *

								 *  Also note that we process the source here in UTF-8 encoded

								 *  form.  This is correct, because any non-ASCII characters are

								 *  passed through without change.

								 */


								DUK_LOCAL void duk__create_escaped_source(duk_hthread *thr, int idx_pattern) {

									duk_hstring *h;

									const duk_uint8_t *p;

									duk_bufwriter_ctx bw_alloc;

									duk_bufwriter_ctx *bw;

									duk_uint8_t *q;

									duk_size_t i, n;

									duk_uint_fast8_t c_prev, c;


									h = duk_known_hstring(thr, idx_pattern);

									p = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h);

									n = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h);


									if (n == 0) {

										duk_push_string(thr, "(?:)");

										return;

									}


									bw = &bw_alloc;

									DUK_BW_INIT_PUSHBUF(thr, bw, n);

									q = DUK_BW_GET_PTR(thr, bw);


									c_prev = (duk_uint_fast8_t) 0;


									for (i = 0; i < n; i++) {

										c = p[i];


										q = DUK_BW_ENSURE_RAW(thr, bw, 2, q);


										if (c == (duk_uint_fast8_t) '/' && c_prev != (duk_uint_fast8_t) '\\') {

											/* Unescaped '/' ANYWHERE in the regexp (in disjunction,

											 * inside a character class, ...) => same escape works.

											 */

											*q++ = DUK_ASC_BACKSLASH;

										}

										*q++ = (duk_uint8_t) c;


										c_prev = c;

									}


									DUK_BW_SETPTR_AND_COMPACT(thr, bw, q);

									(void) duk_buffer_to_string(thr, -1);  /* Safe if input is safe. */


									/* [ ... escaped_source ] */

								}


								/*

								 *  Exposed regexp compilation primitive.

								 *

								 *  Sets up a regexp compilation context, and calls duk__parse_disjunction() to do the

								 *  actual parsing.  Handles generation of the compiled regexp header and the

								 *  "boilerplate" capture of the matching substring (save 0 and 1).  Also does some

								 *  global level regexp checks after recursive compilation has finished.

								 *

								 *  An escaped version of the regexp source, suitable for use as a RegExp instance

								 *  'source' property (see E5 Section 15.10.3), is also left on the stack.

								 *

								 *  Input stack:  [ pattern flags ]

								 *  Output stack: [ bytecode escaped_source ]  (both as strings)

								 */


								DUK_INTERNAL void duk_regexp_compile(duk_hthread *thr) {

									duk_re_compiler_ctx re_ctx;

									duk_lexer_point lex_point;

									duk_hstring *h_pattern;

									duk_hstring *h_flags;

									duk__re_disjunction_info ign_disj;


									DUK_ASSERT(thr != NULL);


									/*

									 *  Args validation

									 */


									/* TypeError if fails */

									h_pattern = duk_require_hstring_notsymbol(thr, -2);

									h_flags = duk_require_hstring_notsymbol(thr, -1);


									/*

									 *  Create normalized 'source' property (E5 Section 15.10.3).

									 */


									/* [ ... pattern flags ] */


									duk__create_escaped_source(thr, -2);


									/* [ ... pattern flags escaped_source ] */


									/*

									 *  Init compilation context

									 */


									/* [ ... pattern flags escaped_source buffer ] */


									DUK_MEMZERO(&re_ctx, sizeof(re_ctx));

									DUK_LEXER_INITCTX(&re_ctx.lex);  /* duplicate zeroing, expect for (possible) NULL inits */

									re_ctx.thr = thr;

									re_ctx.lex.thr = thr;

									re_ctx.lex.input = DUK_HSTRING_GET_DATA(h_pattern);

									re_ctx.lex.input_length = DUK_HSTRING_GET_BYTELEN(h_pattern);

									re_ctx.lex.token_limit = DUK_RE_COMPILE_TOKEN_LIMIT;

									re_ctx.recursion_limit = DUK_USE_REGEXP_COMPILER_RECLIMIT;

									re_ctx.re_flags = duk__parse_regexp_flags(thr, h_flags);


									DUK_BW_INIT_PUSHBUF(thr, &re_ctx.bw, DUK__RE_INITIAL_BUFSIZE);


									DUK_DD(DUK_DDPRINT("regexp compiler ctx initialized, flags=0x%08lx, recursion_limit=%ld",

									                   (unsigned long) re_ctx.re_flags, (long) re_ctx.recursion_limit));


									/*

									 *  Init lexer

									 */


									lex_point.offset = 0;  /* expensive init, just want to fill window */

									lex_point.line = 1;

									DUK_LEXER_SETPOINT(&re_ctx.lex, &lex_point);


									/*

									 *  Compilation

									 */


									DUK_DD(DUK_DDPRINT("starting regexp compilation"));


									duk__append_reop(&re_ctx, DUK_REOP_SAVE);

									duk__append_7bit(&re_ctx, 0);

									duk__parse_disjunction(&re_ctx, 1 /*expect_eof*/, &ign_disj);

									duk__append_reop(&re_ctx, DUK_REOP_SAVE);

									duk__append_7bit(&re_ctx, 1);

									duk__append_reop(&re_ctx, DUK_REOP_MATCH);


									/*

									 *  Check for invalid backreferences; note that it is NOT an error

									 *  to back-reference a capture group which has not yet been introduced

									 *  in the pattern (as in /\1(foo)/); in fact, the backreference will

									 *  always match!  It IS an error to back-reference a capture group

									 *  which will never be introduced in the pattern.  Thus, we can check

									 *  for such references only after parsing is complete.

									 */


									if (re_ctx.highest_backref > re_ctx.captures) {

										DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_BACKREFS);

									}


									/*

									 *  Emit compiled regexp header: flags, ncaptures

									 *  (insertion order inverted on purpose)

									 */


									duk__insert_u32(&re_ctx, 0, (re_ctx.captures + 1) * 2);

									duk__insert_u32(&re_ctx, 0, re_ctx.re_flags);


									/* [ ... pattern flags escaped_source buffer ] */


									DUK_BW_COMPACT(thr, &re_ctx.bw);

									(void) duk_buffer_to_string(thr, -1);  /* Safe because flags is at most 7 bit. */


									/* [ ... pattern flags escaped_source bytecode ] */


									/*

									 *  Finalize stack

									 */


									duk_remove(thr, -4);     /* -> [ ... flags escaped_source bytecode ] */

									duk_remove(thr, -3);     /* -> [ ... escaped_source bytecode ] */


									DUK_DD(DUK_DDPRINT("regexp compilation successful, bytecode: %!T, escaped source: %!T",

									                   (duk_tval *) duk_get_tval(thr, -1), (duk_tval *) duk_get_tval(thr, -2)));

								}


								/*

								 *  Create a RegExp instance (E5 Section 15.10.7).

								 *

								 *  Note: the output stack left by duk_regexp_compile() is directly compatible

								 *  with the input here.

								 *

								 *  Input stack:  [ escaped_source bytecode ]  (both as strings)

								 *  Output stack: [ RegExp ]

								 */


								DUK_INTERNAL void duk_regexp_create_instance(duk_hthread *thr) {

									duk_hobject *h;


									/* [ ... escaped_source bytecode ] */


									duk_push_object(thr);

									h = duk_known_hobject(thr, -1);

									duk_insert(thr, -3);


									/* [ ... regexp_object escaped_source bytecode ] */


									DUK_HOBJECT_SET_CLASS_NUMBER(h, DUK_HOBJECT_CLASS_REGEXP);

									DUK_HOBJECT_SET_PROTOTYPE_UPDREF(thr, h, thr->builtins[DUK_BIDX_REGEXP_PROTOTYPE]);


									duk_xdef_prop_stridx_short(thr, -3, DUK_STRIDX_INT_BYTECODE, DUK_PROPDESC_FLAGS_NONE);


									/* [ ... regexp_object escaped_source ] */


									/* In ES2015 .source, and the .global, .multiline, etc flags are

									 * inherited getters.  Store the escaped source as an internal

									 * property for the getter.

									 */


									duk_xdef_prop_stridx_short(thr, -2, DUK_STRIDX_INT_SOURCE, DUK_PROPDESC_FLAGS_NONE);


									/* [ ... regexp_object ] */


									duk_push_int(thr, 0);

									duk_xdef_prop_stridx_short(thr, -2, DUK_STRIDX_LAST_INDEX, DUK_PROPDESC_FLAGS_W);


									/* [ ... regexp_object ] */

								}


								#else  /* DUK_USE_REGEXP_SUPPORT */


								/* regexp support disabled */


								#endif  /* DUK_USE_REGEXP_SUPPORT */