|
|
|
/*
|
|
|
|
* Lexer for source files, ToNumber() string conversions, RegExp expressions,
|
|
|
|
* and JSON.
|
|
|
|
*
|
|
|
|
* Provides a stream of Ecmascript tokens from a UTF-8/CESU-8 buffer. The
|
|
|
|
* caller can also rewind the token stream into a certain position which is
|
|
|
|
* needed by the compiler part for multi-pass scanning. Tokens are
|
|
|
|
* represented as duk_token structures, and contain line number information.
|
|
|
|
* Token types are identified with DUK_TOK_* defines.
|
|
|
|
*
|
|
|
|
* Characters are decoded into a fixed size lookup window consisting of
|
|
|
|
* decoded Unicode code points, with window positions past the end of the
|
|
|
|
* input filled with an invalid codepoint (-1). The tokenizer can thus
|
|
|
|
* perform multiple character lookups efficiently and with few sanity
|
|
|
|
* checks (such as access outside the end of the input), which keeps the
|
|
|
|
* tokenization code small at the cost of performance.
|
|
|
|
*
|
|
|
|
* Character data in tokens, such as identifier names and string literals,
|
|
|
|
* is encoded into CESU-8 format on-the-fly while parsing the token in
|
|
|
|
* question. The string data is made reachable to garbage collection by
|
|
|
|
* placing the token-related values in value stack entries allocated for
|
|
|
|
* this purpose by the caller. The characters exist in Unicode code point
|
|
|
|
* form only in the fixed size lookup window, which keeps character data
|
|
|
|
* expansion (of especially ASCII data) low.
|
|
|
|
*
|
|
|
|
* Token parsing supports the full range of Unicode characters as described
|
|
|
|
* in the E5 specification. Parsing has been optimized for ASCII characters
|
|
|
|
* because ordinary Ecmascript code consists almost entirely of ASCII
|
|
|
|
* characters. Matching of complex Unicode codepoint sets (such as in the
|
|
|
|
* IdentifierStart and IdentifierPart productions) is optimized for size,
|
|
|
|
* and is done using a linear scan of a bit-packed list of ranges. This is
|
|
|
|
* very slow, but should never be entered unless the source code actually
|
|
|
|
* contains Unicode characters.
|
|
|
|
*
|
|
|
|
* Ecmascript tokenization is partially context sensitive. First,
|
|
|
|
* additional future reserved words are recognized in strict mode (see E5
|
|
|
|
* Section 7.6.1.2). Second, a forward slash character ('/') can be
|
|
|
|
* recognized either as starting a RegExp literal or as a division operator,
|
|
|
|
* depending on context. The caller must provide necessary context flags
|
|
|
|
* when requesting a new token.
|
|
|
|
*
|
|
|
|
* Future work:
|
|
|
|
*
|
|
|
|
* * Make line number tracking optional, as it consumes space.
|
|
|
|
*
|
|
|
|
* * Add a feature flag for disabling UTF-8 decoding of input, as most
|
|
|
|
* source code is ASCII. Because of Unicode escapes written in ASCII,
|
|
|
|
* this does not allow Unicode support to be removed from e.g.
|
|
|
|
* duk_unicode_is_identifier_start() nor does it allow removal of CESU-8
|
|
|
|
* encoding of e.g. string literals.
|
|
|
|
*
|
|
|
|
* * Add a feature flag for disabling Unicode compliance of e.g. identifier
|
|
|
|
* names. This allows for a build more than a kilobyte smaller, because
|
|
|
|
* Unicode ranges needed by duk_unicode_is_identifier_start() and
|
|
|
|
* duk_unicode_is_identifier_part() can be dropped. String literals
|
|
|
|
* should still be allowed to contain escaped Unicode, so this still does
|
|
|
|
* not allow removal of CESU-8 encoding of e.g. string literals.
|
|
|
|
*
|
|
|
|
* * Character lookup tables for codepoints above BMP could be stripped.
|
|
|
|
*
|
|
|
|
* * Strictly speaking, E5 specification requires that source code consists
|
|
|
|
* of 16-bit code units, and if not, must be conceptually converted to
|
|
|
|
* that format first. The current lexer processes Unicode code points
|
|
|
|
* and allows characters outside the BMP. These should be converted to
|
|
|
|
* surrogate pairs while reading the source characters into the window,
|
|
|
|
* not after tokens have been formed (as is done now). However, the fix
|
|
|
|
* is not trivial because two characters are decoded from one codepoint.
|
|
|
|
*
|
|
|
|
* * Optimize for speed as well as size. Large if-else ladders are (at
|
|
|
|
* least potentially) slow.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "duk_internal.h"
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Various defines and file specific helper macros
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define DUK__MAX_RE_DECESC_DIGITS 9
|
|
|
|
#define DUK__MAX_RE_QUANT_DIGITS 9 /* Does not allow e.g. 2**31-1, but one more would allow overflows of u32. */
|
|
|
|
|
|
|
|
/* whether to use macros or helper function depends on call count */
|
|
|
|
#define DUK__ISDIGIT(x) ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_9)
|
|
|
|
#define DUK__ISHEXDIGIT(x) duk__is_hex_digit((x))
|
|
|
|
#define DUK__ISOCTDIGIT(x) ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_7)
|
|
|
|
#define DUK__ISDIGIT03(x) ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_3)
|
|
|
|
#define DUK__ISDIGIT47(x) ((x) >= DUK_ASC_4 && (x) <= DUK_ASC_7)
|
|
|
|
|
|
|
|
/* lexer character window helpers */
|
|
|
|
#define DUK__LOOKUP(lex_ctx,idx) ((lex_ctx)->window[(idx)].codepoint)
|
|
|
|
#define DUK__ADVANCECHARS(lex_ctx,count) duk__advance_chars((lex_ctx), (count))
|
|
|
|
#define DUK__ADVANCEBYTES(lex_ctx,count) duk__advance_bytes((lex_ctx), (count))
|
|
|
|
#define DUK__INITBUFFER(lex_ctx) duk__initbuffer((lex_ctx))
|
|
|
|
#define DUK__APPENDBUFFER(lex_ctx,x) duk__appendbuffer((lex_ctx), (duk_codepoint_t) (x))
|
|
|
|
#define DUK__APPENDBUFFER_ASCII(lex_ctx,x) duk__appendbuffer_ascii((lex_ctx), (duk_codepoint_t) (x))
|
|
|
|
|
|
|
|
/* lookup shorthands (note: assume context variable is named 'lex_ctx') */
|
|
|
|
#define DUK__L0() DUK__LOOKUP(lex_ctx, 0)
|
|
|
|
#define DUK__L1() DUK__LOOKUP(lex_ctx, 1)
|
|
|
|
#define DUK__L2() DUK__LOOKUP(lex_ctx, 2)
|
|
|
|
#define DUK__L3() DUK__LOOKUP(lex_ctx, 3)
|
|
|
|
#define DUK__L4() DUK__LOOKUP(lex_ctx, 4)
|
|
|
|
#define DUK__L5() DUK__LOOKUP(lex_ctx, 5)
|
|
|
|
|
|
|
|
/* packed advance/token number macro used by multiple functions */
|
|
|
|
#define DUK__ADVTOK(advbytes,tok) ((((advbytes) * sizeof(duk_lexer_codepoint)) << 8) + (tok))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Advance lookup window by N characters, filling in new characters as
|
|
|
|
* necessary. After returning caller is guaranteed a character window of
|
|
|
|
* at least DUK_LEXER_WINDOW_SIZE characters.
|
|
|
|
*
|
|
|
|
* The main function duk__advance_bytes() is called at least once per every
|
|
|
|
* token so it has a major lexer/compiler performance impact. There are two
|
|
|
|
* variants for the main duk__advance_bytes() algorithm: a sliding window
|
|
|
|
* approach which is slightly faster at the cost of larger code footprint,
|
|
|
|
* and a simple copying one.
|
|
|
|
*
|
|
|
|
* Decoding directly from the source string would be another lexing option.
|
|
|
|
* But the lookup window based approach has the advantage of hiding the
|
|
|
|
* source string and its encoding effectively which gives more flexibility
|
|
|
|
* going forward to e.g. support chunked streaming of source from flash.
|
|
|
|
*
|
|
|
|
* Decodes UTF-8/CESU-8 leniently with support for code points from U+0000 to
|
|
|
|
* U+10FFFF, causing an error if the input is unparseable. Leniency means:
|
|
|
|
*
|
|
|
|
* * Unicode code point validation is intentionally not performed,
|
|
|
|
* except to check that the codepoint does not exceed 0x10ffff.
|
|
|
|
*
|
|
|
|
* * In particular, surrogate pairs are allowed and not combined, which
|
|
|
|
* allows source files to represent all SourceCharacters with CESU-8.
|
|
|
|
* Broken surrogate pairs are allowed, as Ecmascript does not mandate
|
|
|
|
* their validation.
|
|
|
|
*
|
|
|
|
* * Allow non-shortest UTF-8 encodings.
|
|
|
|
*
|
|
|
|
* Leniency here causes few security concerns because all character data is
|
|
|
|
* decoded into Unicode codepoints before lexer processing, and is then
|
|
|
|
* re-encoded into CESU-8. The source can be parsed as strict UTF-8 with
|
|
|
|
* a compiler option. However, Ecmascript source characters include -all-
|
|
|
|
* 16-bit unsigned integer codepoints, so leniency seems to be appropriate.
|
|
|
|
*
|
|
|
|
* Note that codepoints above the BMP are not strictly SourceCharacters,
|
|
|
|
* but the lexer still accepts them as such. Before ending up in a string
|
|
|
|
* or an identifier name, codepoints above BMP are converted into surrogate
|
|
|
|
* pairs and then CESU-8 encoded, resulting in 16-bit Unicode data as
|
|
|
|
* expected by Ecmascript.
|
|
|
|
*
|
|
|
|
* An alternative approach to dealing with invalid or partial sequences
|
|
|
|
* would be to skip them and replace them with e.g. the Unicode replacement
|
|
|
|
* character U+FFFD. This has limited utility because a replacement character
|
|
|
|
* will most likely cause a parse error, unless it occurs inside a string.
|
|
|
|
* Further, Ecmascript source is typically pure ASCII.
|
|
|
|
*
|
|
|
|
* See:
|
|
|
|
*
|
|
|
|
* http://en.wikipedia.org/wiki/UTF-8
|
|
|
|
* http://en.wikipedia.org/wiki/CESU-8
|
|
|
|
* http://tools.ietf.org/html/rfc3629
|
|
|
|
* http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
|
|
|
|
*
|
|
|
|
* Future work:
|
|
|
|
*
|
|
|
|
* * Reject other invalid Unicode sequences (see Wikipedia entry for examples)
|
|
|
|
* in strict UTF-8 mode.
|
|
|
|
*
|
|
|
|
* * Size optimize. An attempt to use a 16-byte lookup table for the first
|
|
|
|
* byte resulted in a code increase though.
|
|
|
|
*
|
|
|
|
* * Is checking against maximum 0x10ffff really useful? 4-byte encoding
|
|
|
|
* imposes a certain limit anyway.
|
|
|
|
*
|
|
|
|
* * Support chunked streaming of source code. Can be implemented either
|
|
|
|
* by streaming chunks of bytes or chunks of codepoints.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#if defined(DUK_USE_LEXER_SLIDING_WINDOW)
|
|
|
|
/* Decode UTF-8/CESU-8 input into the codepoint buffer, starting at byte
 * offset 'start_offset_bytes' inside lex_ctx->buffer and continuing up to
 * the end of the buffer.  Every slot records the input offset and the line
 * number in effect before its codepoint was read; slots past the end of the
 * input get codepoint -1.  lex_ctx->input_offset and lex_ctx->input_line
 * are written back on normal return and also just before a decode error is
 * raised with DUK_ERROR_SYNTAX.
 */
DUK_LOCAL void duk__fill_lexer_buffer(duk_lexer_ctx *lex_ctx, duk_small_uint_t start_offset_bytes) {
	duk_lexer_codepoint *slot;
	duk_lexer_codepoint *slot_end;
	const duk_uint8_t *src;
	const duk_uint8_t *src_end;
	duk_ucodepoint_t ch;
	duk_small_uint_t n_cont;  /* continuation bytes still to read */
	duk_int_t line;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
	duk_ucodepoint_t ch_min;  /* smallest codepoint allowed for the encoding length */
#endif

	/* Work on local copies; lex_ctx is updated only when done. */
	src = lex_ctx->input + lex_ctx->input_offset;
	src_end = lex_ctx->input + lex_ctx->input_length;
	line = lex_ctx->input_line;

	slot = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->buffer + start_offset_bytes);
	slot_end = lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE;

	for (; slot != slot_end; slot++) {
		slot->offset = (duk_size_t) (src - lex_ctx->input);
		slot->line = line;

		/* XXX: potential issue with signed pointers, src_end < src. */
		if (DUK_UNLIKELY(src >= src_end)) {
			/* End of input.  If input_offset had been assigned a
			 * negative value it would show up as a large positive
			 * value, most likely above input_length, and be caught
			 * here; in any case nothing memory unsafe happens.
			 */
			slot->codepoint = -1;
			continue;
		}

		ch = (duk_ucodepoint_t) (*src++);

		if (DUK_LIKELY(ch < 0x80UL)) {
			/* Fast path: single byte character. */
			DUK_ASSERT(ch != 0x2028UL && ch != 0x2029UL);  /* not LS/PS */

			if (DUK_UNLIKELY(ch <= 0x000dUL)) {
				/* E5 Section 7.3 newlines: LF, CR (when not
				 * followed by LF), LS, PS.  For CR LF the CR is
				 * ignored and the LF bumps the line number.  The
				 * lookahead for LF assumes shortest encoding.
				 */
				if (ch == 0x000aUL) {
					line++;
				} else if (ch == 0x000dUL) {
					if (src >= src_end || *src != 0x000aUL) {
						line++;
					}
				}
			}

			slot->codepoint = (duk_codepoint_t) ch;
			continue;
		}

		/* Slow path: multibyte sequence, classify the lead byte. */

		if (ch < 0xc0UL || ch >= 0xf8UL) {
			/* 10xx xxxx is not a valid lead byte, and there is no
			 * point in supporting encodings of 5 or more bytes.
			 */
			goto fail_decode;
		}
		if (ch < 0xe0UL) {
			/* 110x xxxx 10xx xxxx */
			n_cont = 1;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
			ch_min = 0x80UL;
#endif
			ch &= 0x1fUL;
		} else if (ch < 0xf0UL) {
			/* 1110 xxxx 10xx xxxx 10xx xxxx */
			n_cont = 2;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
			ch_min = 0x800UL;
#endif
			ch &= 0x0fUL;
		} else {
			/* 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */
			n_cont = 3;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
			ch_min = 0x10000UL;
#endif
			ch &= 0x07UL;
		}

		DUK_ASSERT(src_end >= src);
		if ((duk_size_t) n_cont > (duk_size_t) (src_end - src)) {
			/* Sequence is clipped by the end of input. */
			goto fail_decode;
		}

		for (; n_cont > 0; n_cont--) {
			duk_small_uint_t b;

			b = *src++;
			if ((b & 0xc0U) != 0x80U) {
				/* Continuation byte must be 10xx xxxx. */
				goto fail_decode;
			}
			ch = ch << 6;
			ch += b & 0x3fUL;
		}

		/* Validate the decoded codepoint. */

		if (ch > 0x10ffffUL) {
			goto fail_decode;
		}
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
		if (ch < ch_min || (ch >= 0xd800UL && ch <= 0xdfffUL) || ch == 0xfffeUL) {
			goto fail_decode;
		}
#endif

		DUK_ASSERT(ch != 0x000aUL && ch != 0x000dUL);
		if ((ch == 0x2028UL) || (ch == 0x2029UL)) {
			/* LS and PS also count as line terminators. */
			line++;
		}

		slot->codepoint = (duk_codepoint_t) ch;
	}

	lex_ctx->input_offset = (duk_size_t) (src - lex_ctx->input);
	lex_ctx->input_line = line;
	return;

 fail_decode:  /* clipped codepoint, invalid encoding, or invalid codepoint */
	lex_ctx->input_offset = (duk_size_t) (src - lex_ctx->input);
	lex_ctx->input_line = line;

	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_DECODE_FAILED);
}
|
|
|
|
|
|
|
|
/* Sliding window variant: move the lookup window forward by 'count_bytes'
 * bytes (a multiple of sizeof(duk_lexer_codepoint) in practice, zero is
 * allowed to make call sites easier).  When fewer than a full window of
 * entries remains in the buffer, the remainder is moved to the buffer
 * start and the rest of the buffer is refilled from the input.
 */
DUK_LOCAL void duk__advance_bytes(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_bytes) {
	duk_small_uint_t consumed;   /* bytes between buffer start and the new window */
	duk_small_uint_t remaining;  /* bytes between the new window and buffer end */

	DUK_ASSERT_DISABLE(count_bytes >= 0);  /* unsigned */
	DUK_ASSERT(count_bytes <= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint)));
	DUK_ASSERT(lex_ctx->window >= lex_ctx->buffer);
	DUK_ASSERT(lex_ctx->window < lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE);
	DUK_ASSERT((duk_uint8_t *) lex_ctx->window + count_bytes <= (duk_uint8_t *) lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE * sizeof(duk_lexer_codepoint));

	/* Byte based pointer arithmetic avoids a multiply and generates
	 * better code in GCC.
	 */
	lex_ctx->window = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->window + count_bytes);

	consumed = (duk_small_uint_t) ((duk_uint8_t *) lex_ctx->window - (duk_uint8_t *) lex_ctx->buffer);
	remaining = DUK_LEXER_BUFFER_SIZE * sizeof(duk_lexer_codepoint) - consumed;
	if (remaining >= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint))) {
		/* A full window is still available, nothing more to do. */
		return;
	}

	/* Not enough data for a full window: "scroll" the remaining entries
	 * to the start of the buffer and decode new entries after them.
	 */
	DUK_MEMMOVE((void *) lex_ctx->buffer,
	            (const void *) lex_ctx->window,
	            (size_t) remaining);
	lex_ctx->window = lex_ctx->buffer;
	duk__fill_lexer_buffer(lex_ctx, remaining);
}
|
|
|
|
|
|
|
|
/* Sliding window variant: point the window at the start of the buffer and
 * decode the whole buffer from the current input position.
 */
DUK_LOCAL void duk__init_lexer_window(duk_lexer_ctx *lex_ctx) {
	lex_ctx->window = lex_ctx->buffer;
	duk__fill_lexer_buffer(lex_ctx, 0 /*start_offset_bytes*/);
}
|
|
|
|
#else /* DUK_USE_LEXER_SLIDING_WINDOW */
|
|
|
|
/* Decode one codepoint from the input at lex_ctx->input_offset, advancing
 * lex_ctx->input_offset and bumping lex_ctx->input_line for line
 * terminators.  Returns the decoded codepoint, or -1 when the input offset
 * is at or past the end of input.  Undecodable input is reported with
 * DUK_ERROR_SYNTAX.
 */
DUK_LOCAL duk_codepoint_t duk__read_char(duk_lexer_ctx *lex_ctx) {
	const duk_uint8_t *src;
	duk_ucodepoint_t ch;
	duk_small_uint_t nbytes;  /* total encoded length, lead byte included */
	duk_small_uint_t n_cont;  /* continuation bytes still to read */
	duk_size_t off;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
	duk_ucodepoint_t ch_min;  /* smallest codepoint allowed for the encoding length */
#endif

	off = lex_ctx->input_offset;
	if (DUK_UNLIKELY(off >= lex_ctx->input_length)) {
		/* End of input.  If input_offset had been assigned a negative
		 * value it would show up as a large positive value, most
		 * likely above input_length, and be caught here; in any case
		 * nothing memory unsafe happens.
		 */
		return -1;
	}

	src = lex_ctx->input + off;
	ch = (duk_ucodepoint_t) (*src++);

	if (DUK_LIKELY(ch < 0x80UL)) {
		/* 0xxx xxxx -> fast path, one byte consumed. */
		lex_ctx->input_offset++;

		DUK_ASSERT(ch != 0x2028UL && ch != 0x2029UL);  /* not LS/PS */
		if (DUK_UNLIKELY(ch <= 0x000dUL)) {
			/* E5 Section 7.3 newlines: LF, CR (when not followed
			 * by LF), LS, PS.  For CR LF the CR is ignored and the
			 * LF bumps the line number.  The lookahead for LF
			 * assumes shortest encoding.
			 */
			if (ch == 0x000aUL) {
				lex_ctx->input_line++;
			} else if (ch == 0x000dUL) {
				if (lex_ctx->input_offset >= lex_ctx->input_length ||
				    lex_ctx->input[lex_ctx->input_offset] != 0x000aUL) {
					lex_ctx->input_line++;
				}
			}
		}

		return (duk_codepoint_t) ch;
	}

	/* Slow path: multibyte sequence, classify the lead byte. */

	if (ch < 0xc0UL || ch >= 0xf8UL) {
		/* 10xx xxxx is not a valid lead byte, and there is no point
		 * in supporting encodings of 5 or more bytes.
		 */
		goto fail_decode;
	}
	if (ch < 0xe0UL) {
		/* 110x xxxx 10xx xxxx */
		nbytes = 2;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
		ch_min = 0x80UL;
#endif
		ch &= 0x1fUL;
	} else if (ch < 0xf0UL) {
		/* 1110 xxxx 10xx xxxx 10xx xxxx */
		nbytes = 3;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
		ch_min = 0x800UL;
#endif
		ch &= 0x0fUL;
	} else {
		/* 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */
		nbytes = 4;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
		ch_min = 0x10000UL;
#endif
		ch &= 0x07UL;
	}

	DUK_ASSERT(lex_ctx->input_length >= lex_ctx->input_offset);
	if ((duk_size_t) nbytes > (duk_size_t) (lex_ctx->input_length - lex_ctx->input_offset)) {
		/* Sequence is clipped by the end of input. */
		goto fail_decode;
	}

	for (n_cont = nbytes - 1; n_cont > 0; n_cont--) {
		duk_small_uint_t b;

		b = *src++;
		if ((b & 0xc0U) != 0x80U) {
			/* Continuation byte must be 10xx xxxx. */
			goto fail_decode;
		}
		ch = ch << 6;
		ch += b & 0x3fUL;
	}

	/* Validate the decoded codepoint. */

	if (ch > 0x10ffffUL) {
		goto fail_decode;
	}
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
	if (ch < ch_min || (ch >= 0xd800UL && ch <= 0xdfffUL) || ch == 0xfffeUL) {
		goto fail_decode;
	}
#endif

	/* Input offset tracking. */
	lex_ctx->input_offset += nbytes;

	/* Line tracking: LS and PS also count as line terminators. */
	DUK_ASSERT(ch != 0x000aUL && ch != 0x000dUL);
	if ((ch == 0x2028UL) || (ch == 0x2029UL)) {
		lex_ctx->input_line++;
	}

	return (duk_codepoint_t) ch;

 fail_decode:  /* clipped codepoint, invalid encoding, or invalid codepoint */
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_DECODE_FAILED);
	return 0;
}
|
|
|
|
|
|
|
|
/* Copying variant: drop 'count_bytes' bytes of entries from the front of
 * the lookup window (a multiple of sizeof(duk_lexer_codepoint) in practice,
 * zero is allowed to make call sites easier), shift the kept entries to the
 * start of the window, and decode new entries into the freed tail using
 * duk__read_char().
 */
DUK_LOCAL void duk__advance_bytes(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_bytes) {
	duk_small_uint_t keep_bytes;
	duk_lexer_codepoint *cp, *cp_end;

	DUK_ASSERT_DISABLE(count_bytes >= 0);  /* unsigned */
	DUK_ASSERT(count_bytes <= (duk_small_uint_t) (DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint)));

	/* Shift the surviving entries to the front of the window. */
	keep_bytes = DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint) - count_bytes;
	DUK_MEMMOVE((void *) lex_ctx->window,
	            (const void *) ((duk_uint8_t *) lex_ctx->window + count_bytes),
	            (size_t) keep_bytes);

	/* Cast through (void *) like the sliding window variant does, so the
	 * byte pointer to codepoint pointer conversion is written the same
	 * way in both variants.
	 */
	cp = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->window + keep_bytes);
	cp_end = lex_ctx->window + DUK_LEXER_WINDOW_SIZE;
	for (; cp != cp_end; cp++) {
		/* Record position before decoding: duk__read_char() updates
		 * input_offset and input_line as a side effect.
		 */
		cp->offset = lex_ctx->input_offset;
		cp->line = lex_ctx->input_line;
		cp->codepoint = duk__read_char(lex_ctx);
	}
}
|
|
|
|
|
|
|
|
/* Copying variant: advancing by a full window's worth of bytes keeps
 * nothing and decodes every window entry from the current input position.
 */
DUK_LOCAL void duk__init_lexer_window(duk_lexer_ctx *lex_ctx) {
	duk__advance_bytes(lex_ctx, DUK_LEXER_WINDOW_SIZE * sizeof(duk_lexer_codepoint));
}
|
|
|
|
#endif /* DUK_USE_LEXER_SLIDING_WINDOW */
|
|
|
|
|
|
|
|
/* Advance the lookup window by 'count_chars' window entries; converts the
 * entry count into a byte count for duk__advance_bytes().
 */
DUK_LOCAL void duk__advance_chars(duk_lexer_ctx *lex_ctx, duk_small_uint_t count_chars) {
	duk_small_uint_t count_bytes;

	count_bytes = (duk_small_uint_t) (count_chars * sizeof(duk_lexer_codepoint));
	duk__advance_bytes(lex_ctx, count_bytes);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* (Re)initialize the temporary byte buffer. May be called extra times
|
|
|
|
* with little impact.
|
|
|
|
*/
|
|
|
|
|
|
|
|
DUK_LOCAL void duk__initbuffer(duk_lexer_ctx *lex_ctx) {
|
|
|
|
/* Reuse buffer as is unless buffer has grown large. */
|
|
|
|
if (DUK_HBUFFER_DYNAMIC_GET_SIZE(lex_ctx->buf) < DUK_LEXER_TEMP_BUF_LIMIT) {
|
|
|
|
/* Keep current size */
|
|
|
|
} else {
|
|
|
|
duk_hbuffer_resize(lex_ctx->thr, lex_ctx->buf, DUK_LEXER_TEMP_BUF_LIMIT);
|
|
|
|
}
|
|
|
|
|
|
|
|
DUK_BW_INIT_WITHBUF(lex_ctx->thr, &lex_ctx->bw, lex_ctx->buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Append a Unicode codepoint to the temporary byte buffer. Performs
|
|
|
|
* CESU-8 surrogate pair encoding for codepoints above the BMP.
|
|
|
|
* Existing surrogate pairs are allowed and also encoded into CESU-8.
|
|
|
|
*/
|
|
|
|
|
|
|
|
DUK_LOCAL void duk__appendbuffer(duk_lexer_ctx *lex_ctx, duk_codepoint_t x) {
|
|
|
|
/*
|
|
|
|
* Since character data is only generated by decoding the source or by
|
|
|
|
* the compiler itself, we rely on the input codepoints being correct
|
|
|
|
* and avoid a check here.
|
|
|
|
*
|
|
|
|
* Character data can also come here through decoding of Unicode
|
|
|
|
* escapes ("\udead\ubeef") so all 16-but unsigned values can be
|
|
|
|
* present, even when the source file itself is strict UTF-8.
|
|
|
|
*/
|
|
|
|
DUK_ASSERT(x >= 0 && x <= 0x10ffffL);
|
|
|
|
|
|
|
|
DUK_BW_WRITE_ENSURE_CESU8(lex_ctx->thr, &lex_ctx->bw, (duk_ucodepoint_t) x);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Append an ASCII character (0x00...0x7f) to the temporary byte buffer.
 * Such characters need no encoding and are written as a single byte,
 * which matters for some fast paths.
 */
DUK_LOCAL void duk__appendbuffer_ascii(duk_lexer_ctx *lex_ctx, duk_codepoint_t x) {
	DUK_ASSERT(x >= 0 && x <= 0x7f);

	DUK_BW_WRITE_ENSURE_U8(lex_ctx->thr, &lex_ctx->bw, (duk_uint8_t) x);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Intern the temporary byte buffer into a valstack slot
|
|
|
|
* (in practice, slot1 or slot2).
|
|
|
|
*/
|
|
|
|
|
|
|
|
DUK_LOCAL duk_hstring *duk__internbuffer(duk_lexer_ctx *lex_ctx, duk_idx_t valstack_idx) {
|
|
|
|
DUK_ASSERT(valstack_idx == lex_ctx->slot1_idx || valstack_idx == lex_ctx->slot2_idx);
|
|
|
|
|
|
|
|
DUK_BW_PUSH_AS_STRING(lex_ctx->thr, &lex_ctx->bw);
|
|
|
|
duk_replace(lex_ctx->thr, valstack_idx);
|
|
|
|
return duk_known_hstring(lex_ctx->thr, valstack_idx);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Init lexer context
|
|
|
|
*/
|
|
|
|
|
|
|
|
DUK_INTERNAL void duk_lexer_initctx(duk_lexer_ctx *lex_ctx) {
|
|
|
|
DUK_ASSERT(lex_ctx != NULL);
|
|
|
|
|
|
|
|
DUK_MEMZERO(lex_ctx, sizeof(*lex_ctx));
|
|
|
|
#if defined(DUK_USE_EXPLICIT_NULL_INIT)
|
|
|
|
#if defined(DUK_USE_LEXER_SLIDING_WINDOW)
|
|
|
|
lex_ctx->window = NULL;
|
|
|
|
#endif
|
|
|
|
lex_ctx->thr = NULL;
|
|
|
|
lex_ctx->input = NULL;
|
|
|
|
lex_ctx->buf = NULL;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get and set lexer input position; setting also reinitializes the lookup window.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Store the current lexer position into 'pt': the input offset and line
 * number recorded for the first entry of the lookup window.
 */
DUK_INTERNAL void duk_lexer_getpoint(duk_lexer_ctx *lex_ctx, duk_lexer_point *pt) {
	const duk_lexer_codepoint *head;

	head = &lex_ctx->window[0];
	pt->line = head->line;
	pt->offset = head->offset;
}
|
|
|
|
|
|
|
|
/* Move the lexer to the position stored in 'pt' (input offset and line
 * number) and refill the lookup window from there.
 */
DUK_INTERNAL void duk_lexer_setpoint(duk_lexer_ctx *lex_ctx, duk_lexer_point *pt) {
	DUK_ASSERT(pt->line >= 1);
	DUK_ASSERT_DISABLE(pt->offset >= 0);  /* unsigned */

	lex_ctx->input_line = pt->line;
	lex_ctx->input_offset = pt->offset;

	/* Window contents are stale after the position change. */
	duk__init_lexer_window(lex_ctx);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Lexing helpers
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Numeric value of a hex digit (also covers octal and decimal digits) or
|
|
|
|
* -1 if not a valid hex digit.
|
|
|
|
*/
|
|
|
|
DUK_LOCAL duk_codepoint_t duk__hexval_validate(duk_codepoint_t x) {
|
|
|
|
duk_small_int_t t;
|
|
|
|
|
|
|
|
/* Here 'x' is a Unicode codepoint */
|
|
|
|
if (DUK_LIKELY(x >= 0 && x <= 0xff)) {
|
|
|
|
t = duk_hex_dectab[x];
|
|
|
|
if (DUK_LIKELY(t >= 0)) {
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Just a wrapper for call sites where 'x' is known to be valid so
|
|
|
|
* we assert for it before decoding.
|
|
|
|
*/
|
|
|
|
DUK_LOCAL duk_codepoint_t duk__hexval(duk_codepoint_t x) {
|
|
|
|
duk_codepoint_t ret;
|
|
|
|
|
|
|
|
DUK_ASSERT((x >= DUK_ASC_0 && x <= DUK_ASC_9) ||
|
|
|
|
(x >= DUK_ASC_LC_A && x <= DUK_ASC_LC_F) ||
|
|
|
|
(x >= DUK_ASC_UC_A && x <= DUK_ASC_UC_F));
|
|
|
|
ret = duk__hexval_validate(x);
|
|
|
|
DUK_ASSERT(ret >= 0 && ret <= 15);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* having this as a separate function provided a size benefit */
|
|
|
|
DUK_LOCAL duk_bool_t duk__is_hex_digit(duk_codepoint_t x) {
|
|
|
|
if (DUK_LIKELY(x >= 0 && x <= 0xff)) {
|
|
|
|
return (duk_hex_dectab[x] >= 0);
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Parse a Unicode escape of the form \xHH, \uHHHH, or \u{H+}. Shared by
|
|
|
|
* source and RegExp parsing.
|
|
|
|
*/
|
|
|
|
DUK_LOCAL duk_codepoint_t duk__lexer_parse_escape(duk_lexer_ctx *lex_ctx, duk_bool_t allow_es6) {
|
|
|
|
duk_small_int_t digits; /* Initial value 2 or 4 for fixed length escapes, 0 for ES2015 \u{H+}. */
|
|
|
|
duk_codepoint_t escval;
|
|
|
|
duk_codepoint_t x;
|
|
|
|
duk_small_uint_t adv;
|
|
|
|
|
|
|
|
DUK_ASSERT(DUK__L0() == DUK_ASC_BACKSLASH); /* caller responsibilities */
|
|
|
|
DUK_ASSERT(DUK__L1() == DUK_ASC_LC_X || DUK__L1() == DUK_ASC_LC_U);
|
|
|
|
DUK_UNREF(allow_es6);
|
|
|
|
|
|
|
|
adv = 2;
|
|
|
|
digits = 2;
|
|
|
|
if (DUK__L1() == DUK_ASC_LC_U) {
|
|
|
|
digits = 4;
|
|
|
|
#if defined(DUK_USE_ES6_UNICODE_ESCAPE)
|
|
|
|
if (DUK__L2() == DUK_ASC_LCURLY && allow_es6) {
|
|
|
|
digits = 0;
|
|
|
|
adv = 3;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, adv);
|
|
|
|
|
|
|
|
escval = 0;
|
|
|
|
for (;;) {
|
|
|
|
/* One of the escape forms: \xHH, \uHHHH, \u{H+}.
|
|
|
|
* The 'digits' variable tracks parsing state and is
|
|
|
|
* initialized to:
|
|
|
|
*
|
|
|
|
* \xHH 2
|
|
|
|
* \uHH 4
|
|
|
|
* \u{H+} 0 first time, updated to -1 to indicate
|
|
|
|
* at least one digit has been parsed
|
|
|
|
*
|
|
|
|
* Octal parsing is handled separately because it can be
|
|
|
|
* done with fixed lookahead and also has validation
|
|
|
|
* rules which depend on the escape length (which is
|
|
|
|
* variable).
|
|
|
|
*
|
|
|
|
* We don't need a specific check for x < 0 (end of
|
|
|
|
* input) or duk_unicode_is_line_terminator(x)
|
|
|
|
* because the 'dig' decode will fail and lead to a
|
|
|
|
* SyntaxError.
|
|
|
|
*/
|
|
|
|
duk_codepoint_t dig;
|
|
|
|
|
|
|
|
x = DUK__L0();
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
|
|
|
|
dig = duk__hexval_validate(x);
|
|
|
|
if (digits > 0) {
|
|
|
|
digits--;
|
|
|
|
if (dig < 0) {
|
|
|
|
goto fail_escape;
|
|
|
|
}
|
|
|
|
DUK_ASSERT(dig >= 0x00 && dig <= 0x0f);
|
|
|
|
escval = (escval << 4) + dig;
|
|
|
|
if (digits == 0) {
|
|
|
|
DUK_ASSERT(escval >= 0 && escval <= 0xffffL);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
#if defined(DUK_USE_ES6_UNICODE_ESCAPE)
|
|
|
|
DUK_ASSERT(digits == 0 /* first time */ || digits == -1 /* others */);
|
|
|
|
if (dig >= 0) {
|
|
|
|
DUK_ASSERT(dig >= 0x00 && dig <= 0x0f);
|
|
|
|
escval = (escval << 4) + dig;
|
|
|
|
if (escval > 0x10ffffL) {
|
|
|
|
goto fail_escape;
|
|
|
|
}
|
|
|
|
} else if (x == DUK_ASC_RCURLY) {
|
|
|
|
if (digits == 0) {
|
|
|
|
/* Empty escape, \u{}. */
|
|
|
|
goto fail_escape;
|
|
|
|
}
|
|
|
|
DUK_ASSERT(escval >= 0 && escval <= 0x10ffffL);
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
goto fail_escape;
|
|
|
|
}
|
|
|
|
digits = -1; /* Indicate we have at least one digit. */
|
|
|
|
#else /* DUK_USE_ES6_UNICODE_ESCAPE */
|
|
|
|
DUK_ASSERT(0); /* Never happens if \u{H+} support disabled. */
|
|
|
|
#endif /* DUK_USE_ES6_UNICODE_ESCAPE */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return escval;
|
|
|
|
|
|
|
|
fail_escape:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_ESCAPE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Parse legacy octal escape of the form \N{1,3}, e.g. \0, \5, \0377. Maximum
|
|
|
|
* allowed value is \0377 (U+00FF), longest match is used. Used for both string
|
|
|
|
* RegExp octal escape parsing. Window[0] must be the slash '\' and the first
|
|
|
|
* digit must already be validated to be in [0-9] by the caller.
|
|
|
|
*/
|
|
|
|
DUK_LOCAL duk_codepoint_t duk__lexer_parse_legacy_octal(duk_lexer_ctx *lex_ctx, duk_small_uint_t *out_adv, duk_bool_t reject_annex_b) {
|
|
|
|
duk_codepoint_t cp;
|
|
|
|
duk_small_uint_t lookup_idx;
|
|
|
|
duk_small_uint_t adv;
|
|
|
|
duk_codepoint_t tmp;
|
|
|
|
|
|
|
|
DUK_ASSERT(out_adv != NULL);
|
|
|
|
DUK_ASSERT(DUK__LOOKUP(lex_ctx, 0) == DUK_ASC_BACKSLASH);
|
|
|
|
DUK_ASSERT(DUK__LOOKUP(lex_ctx, 1) >= DUK_ASC_0 && DUK__LOOKUP(lex_ctx, 1) <= DUK_ASC_9);
|
|
|
|
|
|
|
|
cp = 0;
|
|
|
|
tmp = 0;
|
|
|
|
for (lookup_idx = 1; lookup_idx <= 3; lookup_idx++) {
|
|
|
|
DUK_DDD(DUK_DDDPRINT("lookup_idx=%ld, cp=%ld", (long) lookup_idx, (long) cp));
|
|
|
|
tmp = DUK__LOOKUP(lex_ctx, lookup_idx);
|
|
|
|
if (tmp < DUK_ASC_0 || tmp > DUK_ASC_7) {
|
|
|
|
/* No more valid digits. */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
tmp = (cp << 3) + (tmp - DUK_ASC_0);
|
|
|
|
if (tmp > 0xff) {
|
|
|
|
/* Three digit octal escapes above \377 (= 0xff)
|
|
|
|
* are not allowed.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
cp = tmp;
|
|
|
|
}
|
|
|
|
DUK_DDD(DUK_DDDPRINT("final lookup_idx=%ld, cp=%ld", (long) lookup_idx, (long) cp));
|
|
|
|
|
|
|
|
adv = lookup_idx;
|
|
|
|
if (lookup_idx == 1) {
|
|
|
|
DUK_DDD(DUK_DDDPRINT("\\8 or \\9 -> treat as literal, accept in strict mode too"));
|
|
|
|
DUK_ASSERT(tmp == DUK_ASC_8 || tmp == DUK_ASC_9);
|
|
|
|
cp = tmp;
|
|
|
|
adv++; /* correction to above, eat offending character */
|
|
|
|
} else if (lookup_idx == 2 && cp == 0) {
|
|
|
|
/* Note: 'foo\0bar' is OK in strict mode, but 'foo\00bar' is not.
|
|
|
|
* It won't be interpreted as 'foo\u{0}0bar' but as a SyntaxError.
|
|
|
|
*/
|
|
|
|
DUK_DDD(DUK_DDDPRINT("\\0 -> accept in strict mode too"));
|
|
|
|
} else {
|
|
|
|
/* This clause also handles non-shortest zero, e.g. \00. */
|
|
|
|
if (reject_annex_b) {
|
|
|
|
DUK_DDD(DUK_DDDPRINT("non-zero octal literal %ld -> reject in strict-mode", (long) cp));
|
|
|
|
cp = -1;
|
|
|
|
} else {
|
|
|
|
DUK_DDD(DUK_DDDPRINT("non-zero octal literal %ld -> accepted", (long) cp));
|
|
|
|
DUK_ASSERT(cp >= 0 && cp <= 0xff);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*out_adv = adv;
|
|
|
|
|
|
|
|
DUK_ASSERT((cp >= 0 && cp <= 0xff) || (cp == -1 && reject_annex_b));
|
|
|
|
return cp;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* XXX: move strict mode to lex_ctx? */
|
|
|
|
DUK_LOCAL void duk__lexer_parse_string_literal(duk_lexer_ctx *lex_ctx, duk_token *out_token, duk_small_int_t quote, duk_bool_t strict_mode) {
|
|
|
|
duk_small_uint_t adv;
|
|
|
|
|
|
|
|
for (adv = 1 /* initial quote */ ;;) {
|
|
|
|
duk_codepoint_t x;
|
|
|
|
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, adv); /* eat opening quote on first loop */
|
|
|
|
x = DUK__L0();
|
|
|
|
|
|
|
|
adv = 1;
|
|
|
|
if (x == quote) {
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1); /* eat closing quote */
|
|
|
|
break;
|
|
|
|
} else if (x == '\\') {
|
|
|
|
/* DUK__L0 -> '\' char
|
|
|
|
* DUK__L1 ... DUK__L5 -> more lookup
|
|
|
|
*/
|
|
|
|
duk_small_int_t emitcp = -1;
|
|
|
|
|
|
|
|
x = DUK__L1();
|
|
|
|
|
|
|
|
/* How much to advance before next loop. */
|
|
|
|
adv = 2; /* note: long live range */
|
|
|
|
|
|
|
|
switch (x) {
|
|
|
|
case '\'':
|
|
|
|
emitcp = 0x0027;
|
|
|
|
break;
|
|
|
|
case '"':
|
|
|
|
emitcp = 0x0022;
|
|
|
|
break;
|
|
|
|
case '\\':
|
|
|
|
emitcp = 0x005c;
|
|
|
|
break;
|
|
|
|
case 'b':
|
|
|
|
emitcp = 0x0008;
|
|
|
|
break;
|
|
|
|
case 'f':
|
|
|
|
emitcp = 0x000c;
|
|
|
|
break;
|
|
|
|
case 'n':
|
|
|
|
emitcp = 0x000a;
|
|
|
|
break;
|
|
|
|
case 'r':
|
|
|
|
emitcp = 0x000d;
|
|
|
|
break;
|
|
|
|
case 't':
|
|
|
|
emitcp = 0x0009;
|
|
|
|
break;
|
|
|
|
case 'v':
|
|
|
|
emitcp = 0x000b;
|
|
|
|
break;
|
|
|
|
case 'x':
|
|
|
|
case 'u': {
|
|
|
|
duk_codepoint_t esc_cp;
|
|
|
|
esc_cp = duk__lexer_parse_escape(lex_ctx, 1 /*allow_es6*/);
|
|
|
|
DUK__APPENDBUFFER(lex_ctx, esc_cp);
|
|
|
|
adv = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default: {
|
|
|
|
if (duk_unicode_is_line_terminator(x)) {
|
|
|
|
/* line continuation */
|
|
|
|
if (x == 0x000d && DUK__L2() == 0x000a) {
|
|
|
|
/* CR LF again a special case */
|
|
|
|
adv = 3; /* line terminator, CR, LF */
|
|
|
|
}
|
|
|
|
} else if (DUK__ISDIGIT(x)) {
|
|
|
|
/*
|
|
|
|
* Octal escape or zero escape:
|
|
|
|
* \0 (lookahead not OctalDigit)
|
|
|
|
* \1 ... \7 (lookahead not OctalDigit)
|
|
|
|
* \ZeroToThree OctalDigit (lookahead not OctalDigit)
|
|
|
|
* \FourToSeven OctalDigit (no lookahead restrictions)
|
|
|
|
* \ZeroToThree OctalDigit OctalDigit (no lookahead restrictions)
|
|
|
|
*
|
|
|
|
* Zero escape is part of the standard syntax. Octal escapes are
|
|
|
|
* defined in E5 Section B.1.2, and are only allowed in non-strict mode.
|
|
|
|
* Any other productions starting with a decimal digit are invalid
|
|
|
|
* but are in practice treated like identity escapes.
|
|
|
|
*
|
|
|
|
* Parse octal (up to 3 digits) from the lookup window.
|
|
|
|
*/
|
|
|
|
|
|
|
|
emitcp = duk__lexer_parse_legacy_octal(lex_ctx, &adv, strict_mode /*reject_annex_b*/);
|
|
|
|
if (emitcp < 0) {
|
|
|
|
goto fail_escape;
|
|
|
|
}
|
|
|
|
} else if (x < 0) {
|
|
|
|
goto fail_unterminated;
|
|
|
|
} else {
|
|
|
|
/* escaped NonEscapeCharacter */
|
|
|
|
DUK__APPENDBUFFER(lex_ctx, x);
|
|
|
|
}
|
|
|
|
} /* end default clause */
|
|
|
|
} /* end switch */
|
|
|
|
|
|
|
|
/* Shared handling for single codepoint escapes. */
|
|
|
|
if (emitcp >= 0) {
|
|
|
|
DUK__APPENDBUFFER(lex_ctx, emitcp);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Track number of escapes; count not really needed but directive
|
|
|
|
* prologues need to detect whether there were any escapes or line
|
|
|
|
* continuations or not.
|
|
|
|
*/
|
|
|
|
out_token->num_escapes++;
|
|
|
|
} else if (x >= 0x20 && x <= 0x7f) {
|
|
|
|
/* Fast path for ASCII case, avoids line terminator
|
|
|
|
* check and CESU-8 encoding.
|
|
|
|
*/
|
|
|
|
DUK_ASSERT(x >= 0);
|
|
|
|
DUK_ASSERT(!duk_unicode_is_line_terminator(x));
|
|
|
|
DUK_ASSERT(x != quote);
|
|
|
|
DUK_ASSERT(x != DUK_ASC_BACKSLASH);
|
|
|
|
DUK__APPENDBUFFER_ASCII(lex_ctx, x);
|
|
|
|
} else if (x < 0 || duk_unicode_is_line_terminator(x)) {
|
|
|
|
goto fail_unterminated;
|
|
|
|
} else {
|
|
|
|
/* Character which is part of the string but wasn't handled
|
|
|
|
* by the fast path.
|
|
|
|
*/
|
|
|
|
DUK__APPENDBUFFER(lex_ctx, x);
|
|
|
|
}
|
|
|
|
} /* string parse loop */
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail_escape:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_ESCAPE);
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail_unterminated:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_UNTERMINATED_STRING);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Skip to end-of-line (or end-of-file), used for single line comments. */
|
|
|
|
DUK_LOCAL void duk__lexer_skip_to_endofline(duk_lexer_ctx *lex_ctx) {
|
|
|
|
for (;;) {
|
|
|
|
duk_codepoint_t x;
|
|
|
|
|
|
|
|
x = DUK__L0();
|
|
|
|
if (x < 0 || duk_unicode_is_line_terminator(x)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parse Ecmascript source InputElementDiv or InputElementRegExp
|
|
|
|
* (E5 Section 7), skipping whitespace, comments, and line terminators.
|
|
|
|
*
|
|
|
|
* Possible results are:
|
|
|
|
* (1) a token
|
|
|
|
* (2) a line terminator (skipped)
|
|
|
|
* (3) a comment (skipped)
|
|
|
|
* (4) EOF
|
|
|
|
*
|
|
|
|
* White space is automatically skipped from the current position (but
|
|
|
|
* not after the input element). If input has already ended, returns
|
|
|
|
* DUK_TOK_EOF indefinitely. If a parse error occurs, uses a DUK_ERROR()
|
|
|
|
* macro call (and hence a longjmp through current heap longjmp context).
|
|
|
|
* Comments and line terminator tokens are automatically skipped.
|
|
|
|
*
|
|
|
|
* The input element being matched is determined by regexp_mode; if set,
|
|
|
|
* parses an InputElementRegExp, otherwise an InputElementDiv. The
|
|
|
|
* difference between these is the handling of productions starting with a
|
|
|
|
* forward slash.
|
|
|
|
*
|
|
|
|
* If strict_mode is set, recognizes additional future reserved words
|
|
|
|
* specific to strict mode, and refuses to parse octal literals.
|
|
|
|
*
|
|
|
|
* The matching strategy below is to (currently) use a six character
|
|
|
|
* lookup window to quickly determine which production is the -longest-
|
|
|
|
* matching one, and then parse that. The top-level if-else clauses
|
|
|
|
* match the first character, and the code blocks for each clause
|
|
|
|
* handle -all- alternatives for that first character. Ecmascript
|
|
|
|
* specification uses the "longest match wins" semantics, so the order
|
|
|
|
* of the if-clauses matters.
|
|
|
|
*
|
|
|
|
* Misc notes:
|
|
|
|
*
|
|
|
|
* * Ecmascript numeric literals do not accept a sign character.
|
|
|
|
* Consequently e.g. "-1.0" is parsed as two tokens: a negative
|
|
|
|
* sign and a positive numeric literal. The compiler performs
|
|
|
|
* the negation during compilation, so this has no adverse impact.
|
|
|
|
*
|
|
|
|
* * There is no token for "undefined": it is just a value available
|
|
|
|
* from the global object (or simply established by doing a reference
|
|
|
|
* to an undefined value).
|
|
|
|
*
|
|
|
|
* * Some contexts want Identifier tokens, which are IdentifierNames
|
|
|
|
* excluding reserved words, while some contexts want IdentifierNames
|
|
|
|
* directly. In the latter case e.g. "while" is interpreted as an
|
|
|
|
* identifier name, not a DUK_TOK_WHILE token. The solution here is
|
|
|
|
* to provide both token types: DUK_TOK_WHILE goes to 't' while
|
|
|
|
* DUK_TOK_IDENTIFIER goes to 't_nores', and 'slot1' always contains
|
|
|
|
* the identifier / keyword name.
|
|
|
|
*
|
|
|
|
* * Directive prologue needs to identify string literals such as
|
|
|
|
* "use strict" and 'use strict', which are sensitive to line
|
|
|
|
* continuations and escape sequences. For instance, "use\u0020strict"
|
|
|
|
* is a valid directive but is distinct from "use strict". The solution
|
|
|
|
* here is to decode escapes while tokenizing, but to keep track of the
|
|
|
|
* number of escapes. Directive detection can then check that the
|
|
|
|
* number of escapes is zero.
|
|
|
|
*
|
|
|
|
* * Multi-line comments with one or more internal LineTerminator are
|
|
|
|
* treated like a line terminator to comply with automatic semicolon
|
|
|
|
* insertion.
|
|
|
|
*/
|
|
|
|
|
|
|
|
DUK_INTERNAL
|
|
|
|
void duk_lexer_parse_js_input_element(duk_lexer_ctx *lex_ctx,
|
|
|
|
duk_token *out_token,
|
|
|
|
duk_bool_t strict_mode,
|
|
|
|
duk_bool_t regexp_mode) {
|
|
|
|
duk_codepoint_t x; /* temporary, must be signed and 32-bit to hold Unicode code points */
|
|
|
|
duk_small_uint_t advtok = 0; /* (advance << 8) + token_type, updated at function end,
|
|
|
|
* init is unnecessary but suppresses "may be used uninitialized" warnings.
|
|
|
|
*/
|
|
|
|
duk_bool_t got_lineterm = 0; /* got lineterm preceding non-whitespace, non-lineterm token */
|
|
|
|
|
|
|
|
if (++lex_ctx->token_count >= lex_ctx->token_limit) {
|
|
|
|
goto fail_token_limit;
|
|
|
|
}
|
|
|
|
|
|
|
|
out_token->t = DUK_TOK_EOF;
|
|
|
|
out_token->t_nores = DUK_TOK_INVALID; /* marker: copy t if not changed */
|
|
|
|
#if 0 /* not necessary to init, disabled for faster parsing */
|
|
|
|
out_token->num = DUK_DOUBLE_NAN;
|
|
|
|
out_token->str1 = NULL;
|
|
|
|
out_token->str2 = NULL;
|
|
|
|
#endif
|
|
|
|
out_token->num_escapes = 0;
|
|
|
|
/* out_token->lineterm set by caller */
|
|
|
|
|
|
|
|
/* This would be nice, but parsing is faster without resetting the
|
|
|
|
* value slots. The only side effect is that references to temporary
|
|
|
|
* string values may linger until lexing is finished; they're then
|
|
|
|
* freed normally.
|
|
|
|
*/
|
|
|
|
#if 0
|
|
|
|
duk_to_undefined(lex_ctx->thr, lex_ctx->slot1_idx);
|
|
|
|
duk_to_undefined(lex_ctx->thr, lex_ctx->slot2_idx);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* 'advtok' indicates how much to advance and which token id to assign
|
|
|
|
* at the end. This shared functionality minimizes code size. All
|
|
|
|
* code paths are required to set 'advtok' to some value, so no default
|
|
|
|
* init value is used. Code paths calling DUK_ERROR() never return so
|
|
|
|
* they don't need to set advtok.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Matching order:
|
|
|
|
*
|
|
|
|
* Punctuator first chars, also covers comments, regexps
|
|
|
|
* LineTerminator
|
|
|
|
* Identifier or reserved word, also covers null/true/false literals
|
|
|
|
* NumericLiteral
|
|
|
|
* StringLiteral
|
|
|
|
* EOF
|
|
|
|
*
|
|
|
|
* The order does not matter as long as the longest match is
|
|
|
|
* always correctly identified. There are order dependencies
|
|
|
|
* in the clauses, so it's not trivial to convert to a switch.
|
|
|
|
*/
|
|
|
|
|
|
|
|
restart_lineupdate:
|
|
|
|
out_token->start_line = lex_ctx->window[0].line;
|
|
|
|
|
|
|
|
restart:
|
|
|
|
out_token->start_offset = lex_ctx->window[0].offset;
|
|
|
|
|
|
|
|
x = DUK__L0();
|
|
|
|
|
|
|
|
switch (x) {
|
|
|
|
case DUK_ASC_SPACE:
|
|
|
|
case DUK_ASC_HT: /* fast paths for space and tab */
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
goto restart;
|
|
|
|
case DUK_ASC_LF: /* LF line terminator; CR LF and Unicode lineterms are handled in slow path */
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
got_lineterm = 1;
|
|
|
|
goto restart_lineupdate;
|
|
|
|
#if defined(DUK_USE_SHEBANG_COMMENTS)
|
|
|
|
case DUK_ASC_HASH: /* '#' */
|
|
|
|
if (DUK__L1() == DUK_ASC_EXCLAMATION && lex_ctx->window[0].offset == 0 &&
|
|
|
|
(lex_ctx->flags & DUK_COMPILE_SHEBANG)) {
|
|
|
|
/* "Shebang" comment ('#! ...') on first line. */
|
|
|
|
/* DUK__ADVANCECHARS(lex_ctx, 2) would be correct here, but not necessary */
|
|
|
|
duk__lexer_skip_to_endofline(lex_ctx);
|
|
|
|
goto restart; /* line terminator will be handled on next round */
|
|
|
|
}
|
|
|
|
goto fail_token;
|
|
|
|
#endif /* DUK_USE_SHEBANG_COMMENTS */
|
|
|
|
case DUK_ASC_SLASH: /* '/' */
|
|
|
|
if (DUK__L1() == DUK_ASC_SLASH) {
|
|
|
|
/*
|
|
|
|
* E5 Section 7.4, allow SourceCharacter (which is any 16-bit
|
|
|
|
* code point).
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* DUK__ADVANCECHARS(lex_ctx, 2) would be correct here, but not necessary */
|
|
|
|
duk__lexer_skip_to_endofline(lex_ctx);
|
|
|
|
goto restart; /* line terminator will be handled on next round */
|
|
|
|
} else if (DUK__L1() == DUK_ASC_STAR) {
|
|
|
|
/*
|
|
|
|
* E5 Section 7.4. If the multi-line comment contains a newline,
|
|
|
|
* it is treated like a single line terminator for automatic
|
|
|
|
* semicolon insertion.
|
|
|
|
*/
|
|
|
|
|
|
|
|
duk_bool_t last_asterisk = 0;
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 2);
|
|
|
|
for (;;) {
|
|
|
|
x = DUK__L0();
|
|
|
|
if (x < 0) {
|
|
|
|
goto fail_unterm_comment;
|
|
|
|
}
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
if (last_asterisk && x == DUK_ASC_SLASH) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (duk_unicode_is_line_terminator(x)) {
|
|
|
|
got_lineterm = 1;
|
|
|
|
}
|
|
|
|
last_asterisk = (x == DUK_ASC_STAR);
|
|
|
|
}
|
|
|
|
goto restart_lineupdate;
|
|
|
|
} else if (regexp_mode) {
|
|
|
|
#if defined(DUK_USE_REGEXP_SUPPORT)
|
|
|
|
/*
|
|
|
|
* "/" followed by something in regexp mode. See E5 Section 7.8.5.
|
|
|
|
*
|
|
|
|
* RegExp parsing is a bit complex. First, the regexp body is delimited
|
|
|
|
* by forward slashes, but the body may also contain forward slashes as
|
|
|
|
* part of an escape sequence or inside a character class (delimited by
|
|
|
|
* square brackets). A mini state machine is used to implement these.
|
|
|
|
*
|
|
|
|
* Further, an early (parse time) error must be thrown if the regexp
|
|
|
|
* would cause a run-time error when used in the expression new RegExp(...).
|
|
|
|
* Parsing here simply extracts the (candidate) regexp, and also accepts
|
|
|
|
* invalid regular expressions (which are delimited properly). The caller
|
|
|
|
* (compiler) must perform final validation and regexp compilation.
|
|
|
|
*
|
|
|
|
* RegExp first char may not be '/' (single line comment) or '*' (multi-
|
|
|
|
* line comment). These have already been checked above, so there is no
|
|
|
|
* need below for special handling of the first regexp character as in
|
|
|
|
* the E5 productions.
|
|
|
|
*
|
|
|
|
* About unicode escapes within regexp literals:
|
|
|
|
*
|
|
|
|
* E5 Section 7.8.5 grammar does NOT accept \uHHHH escapes.
|
|
|
|
* However, Section 6 states that regexps accept the escapes,
|
|
|
|
* see paragraph starting with "In string literals...".
|
|
|
|
* The regexp grammar, which sees the decoded regexp literal
|
|
|
|
* (after lexical parsing) DOES have a \uHHHH unicode escape.
|
|
|
|
* So, for instance:
|
|
|
|
*
|
|
|
|
* /\u1234/
|
|
|
|
*
|
|
|
|
* should first be parsed by the lexical grammar as:
|
|
|
|
*
|
|
|
|
* '\' 'u' RegularExpressionBackslashSequence
|
|
|
|
* '1' RegularExpressionNonTerminator
|
|
|
|
* '2' RegularExpressionNonTerminator
|
|
|
|
* '3' RegularExpressionNonTerminator
|
|
|
|
* '4' RegularExpressionNonTerminator
|
|
|
|
*
|
|
|
|
* and the escape itself is then parsed by the regexp engine.
|
|
|
|
* This is the current implementation.
|
|
|
|
*
|
|
|
|
* Minor spec inconsistency:
|
|
|
|
*
|
|
|
|
* E5 Section 7.8.5 RegularExpressionBackslashSequence is:
|
|
|
|
*
|
|
|
|
* \ RegularExpressionNonTerminator
|
|
|
|
*
|
|
|
|
* while Section A.1 RegularExpressionBackslashSequence is:
|
|
|
|
*
|
|
|
|
* \ NonTerminator
|
|
|
|
*
|
|
|
|
* The latter is not normative and a typo.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* first, parse regexp body roughly */
|
|
|
|
|
|
|
|
duk_small_int_t state = 0; /* 0=base, 1=esc, 2=class, 3=class+esc */
|
|
|
|
|
|
|
|
DUK__INITBUFFER(lex_ctx);
|
|
|
|
for (;;) {
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1); /* skip opening slash on first loop */
|
|
|
|
x = DUK__L0();
|
|
|
|
if (x < 0 || duk_unicode_is_line_terminator(x)) {
|
|
|
|
goto fail_unterm_regexp;
|
|
|
|
}
|
|
|
|
x = DUK__L0(); /* re-read to avoid spill / fetch */
|
|
|
|
if (state == 0) {
|
|
|
|
if (x == DUK_ASC_SLASH) {
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1); /* eat closing slash */
|
|
|
|
break;
|
|
|
|
} else if (x == DUK_ASC_BACKSLASH) {
|
|
|
|
state = 1;
|
|
|
|
} else if (x == DUK_ASC_LBRACKET) {
|
|
|
|
state = 2;
|
|
|
|
}
|
|
|
|
} else if (state == 1) {
|
|
|
|
state = 0;
|
|
|
|
} else if (state == 2) {
|
|
|
|
if (x == DUK_ASC_RBRACKET) {
|
|
|
|
state = 0;
|
|
|
|
} else if (x == DUK_ASC_BACKSLASH) {
|
|
|
|
state = 3;
|
|
|
|
}
|
|
|
|
} else { /* state == 3 */
|
|
|
|
state = 2;
|
|
|
|
}
|
|
|
|
DUK__APPENDBUFFER(lex_ctx, x);
|
|
|
|
}
|
|
|
|
out_token->str1 = duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
|
|
|
|
|
|
|
|
/* second, parse flags */
|
|
|
|
|
|
|
|
DUK__INITBUFFER(lex_ctx);
|
|
|
|
for (;;) {
|
|
|
|
x = DUK__L0();
|
|
|
|
if (!duk_unicode_is_identifier_part(x)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
x = DUK__L0(); /* re-read to avoid spill / fetch */
|
|
|
|
DUK__APPENDBUFFER(lex_ctx, x);
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
}
|
|
|
|
out_token->str2 = duk__internbuffer(lex_ctx, lex_ctx->slot2_idx);
|
|
|
|
|
|
|
|
DUK__INITBUFFER(lex_ctx); /* free some memory */
|
|
|
|
|
|
|
|
/* validation of the regexp is caller's responsibility */
|
|
|
|
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_TOK_REGEXP);
|
|
|
|
#else /* DUK_USE_REGEXP_SUPPORT */
|
|
|
|
goto fail_regexp_support;
|
|
|
|
#endif /* DUK_USE_REGEXP_SUPPORT */
|
|
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
|
|
/* "/=" and not in regexp mode */
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_DIV_EQ);
|
|
|
|
} else {
|
|
|
|
/* "/" and not in regexp mode */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_DIV);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case DUK_ASC_LCURLY: /* '{' */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_LCURLY);
|
|
|
|
break;
|
|
|
|
case DUK_ASC_RCURLY: /* '}' */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_RCURLY);
|
|
|
|
break;
|
|
|
|
case DUK_ASC_LPAREN: /* '(' */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_LPAREN);
|
|
|
|
break;
|
|
|
|
case DUK_ASC_RPAREN: /* ')' */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_RPAREN);
|
|
|
|
break;
|
|
|
|
case DUK_ASC_LBRACKET: /* '[' */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_LBRACKET);
|
|
|
|
break;
|
|
|
|
case DUK_ASC_RBRACKET: /* ']' */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_RBRACKET);
|
|
|
|
break;
|
|
|
|
case DUK_ASC_PERIOD: /* '.' */
|
|
|
|
if (DUK__ISDIGIT(DUK__L1())) {
|
|
|
|
/* Period followed by a digit can only start DecimalLiteral
|
|
|
|
* (handled in slow path). We could jump straight into the
|
|
|
|
* DecimalLiteral handling but should avoid goto to inside
|
|
|
|
* a block.
|
|
|
|
*/
|
|
|
|
goto slow_path;
|
|
|
|
}
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_PERIOD);
|
|
|
|
break;
|
|
|
|
case DUK_ASC_SEMICOLON: /* ';' */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_SEMICOLON);
|
|
|
|
break;
|
|
|
|
case DUK_ASC_COMMA: /* ',' */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_COMMA);
|
|
|
|
break;
|
|
|
|
case DUK_ASC_LANGLE: /* '<' */
|
|
|
|
#if defined(DUK_USE_HTML_COMMENTS)
|
|
|
|
if (DUK__L1() == DUK_ASC_EXCLAMATION && DUK__L2() == DUK_ASC_MINUS && DUK__L3() == DUK_ASC_MINUS) {
|
|
|
|
/*
|
|
|
|
* ES2015: B.1.3, handle "<!--" SingleLineHTMLOpenComment
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* DUK__ADVANCECHARS(lex_ctx, 4) would be correct here, but not necessary */
|
|
|
|
duk__lexer_skip_to_endofline(lex_ctx);
|
|
|
|
goto restart; /* line terminator will be handled on next round */
|
|
|
|
}
|
|
|
|
else
|
|
|
|
#endif /* DUK_USE_HTML_COMMENTS */
|
|
|
|
if (DUK__L1() == DUK_ASC_LANGLE && DUK__L2() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(3, DUK_TOK_ALSHIFT_EQ);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_LE);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_LANGLE) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_ALSHIFT);
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_LT);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case DUK_ASC_RANGLE: /* '>' */
|
|
|
|
if (DUK__L1() == DUK_ASC_RANGLE && DUK__L2() == DUK_ASC_RANGLE && DUK__L3() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(4, DUK_TOK_RSHIFT_EQ);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_RANGLE && DUK__L2() == DUK_ASC_RANGLE) {
|
|
|
|
advtok = DUK__ADVTOK(3, DUK_TOK_RSHIFT);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_RANGLE && DUK__L2() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(3, DUK_TOK_ARSHIFT_EQ);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_GE);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_RANGLE) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_ARSHIFT);
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_GT);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case DUK_ASC_EQUALS: /* '=' */
|
|
|
|
if (DUK__L1() == DUK_ASC_EQUALS && DUK__L2() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(3, DUK_TOK_SEQ);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_EQ);
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_EQUALSIGN);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case DUK_ASC_EXCLAMATION: /* '!' */
|
|
|
|
if (DUK__L1() == DUK_ASC_EQUALS && DUK__L2() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(3, DUK_TOK_SNEQ);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_NEQ);
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_LNOT);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case DUK_ASC_PLUS: /* '+' */
|
|
|
|
if (DUK__L1() == DUK_ASC_PLUS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_INCREMENT);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_ADD_EQ);
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_ADD);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case DUK_ASC_MINUS: /* '-' */
|
|
|
|
#if defined(DUK_USE_HTML_COMMENTS)
|
|
|
|
if (got_lineterm && DUK__L1() == DUK_ASC_MINUS && DUK__L2() == DUK_ASC_RANGLE) {
|
|
|
|
/*
|
|
|
|
* ES2015: B.1.3, handle "-->" SingleLineHTMLCloseComment
|
|
|
|
* Only allowed:
|
|
|
|
* - on new line
|
|
|
|
* - preceded only by whitespace
|
|
|
|
* - preceded by end of multiline comment and optional whitespace
|
|
|
|
*
|
|
|
|
* Since whitespace generates no tokens, and multiline comments
|
|
|
|
* are treated as a line ending, consulting `got_lineterm` is
|
|
|
|
* sufficient to test for these three options.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* DUK__ADVANCECHARS(lex_ctx, 3) would be correct here, but not necessary */
|
|
|
|
duk__lexer_skip_to_endofline(lex_ctx);
|
|
|
|
goto restart; /* line terminator will be handled on next round */
|
|
|
|
} else
|
|
|
|
#endif /* DUK_USE_HTML_COMMENTS */
|
|
|
|
if (DUK__L1() == DUK_ASC_MINUS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_DECREMENT);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_SUB_EQ);
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_SUB);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case DUK_ASC_STAR: /* '*' */
|
|
|
|
#if defined(DUK_USE_ES7_EXP_OPERATOR)
|
|
|
|
if (DUK__L1() == DUK_ASC_STAR && DUK__L2() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(3, DUK_TOK_EXP_EQ);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_STAR) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_EXP);
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_MUL_EQ);
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_MUL);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case DUK_ASC_PERCENT: /* '%' */
|
|
|
|
if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_MOD_EQ);
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_MOD);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case DUK_ASC_AMP: /* '&' */
|
|
|
|
if (DUK__L1() == DUK_ASC_AMP) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_LAND);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_BAND_EQ);
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_BAND);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case DUK_ASC_PIPE: /* '|' */
|
|
|
|
if (DUK__L1() == DUK_ASC_PIPE) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_LOR);
|
|
|
|
} else if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_BOR_EQ);
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_BOR);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case DUK_ASC_CARET: /* '^' */
|
|
|
|
if (DUK__L1() == DUK_ASC_EQUALS) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_TOK_BXOR_EQ);
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_BXOR);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case DUK_ASC_TILDE: /* '~' */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_BNOT);
|
|
|
|
break;
|
|
|
|
case DUK_ASC_QUESTION: /* '?' */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_QUESTION);
|
|
|
|
break;
|
|
|
|
case DUK_ASC_COLON: /* ':' */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_TOK_COLON);
|
|
|
|
break;
|
|
|
|
case DUK_ASC_DOUBLEQUOTE: /* '"' */
|
|
|
|
case DUK_ASC_SINGLEQUOTE: { /* '\'' */
|
|
|
|
DUK__INITBUFFER(lex_ctx);
|
|
|
|
duk__lexer_parse_string_literal(lex_ctx, out_token, x /*quote*/, strict_mode);
|
|
|
|
duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
|
|
|
|
out_token->str1 = duk_known_hstring(lex_ctx->thr, lex_ctx->slot1_idx);
|
|
|
|
|
|
|
|
DUK__INITBUFFER(lex_ctx); /* free some memory */
|
|
|
|
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_TOK_STRING);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
goto slow_path;
|
|
|
|
} /* switch */
|
|
|
|
|
|
|
|
goto skip_slow_path;
|
|
|
|
|
|
|
|
slow_path:
|
|
|
|
if (duk_unicode_is_line_terminator(x)) {
|
|
|
|
if (x == 0x000d && DUK__L1() == 0x000a) {
|
|
|
|
/*
|
|
|
|
* E5 Section 7.3: CR LF is detected as a single line terminator for
|
|
|
|
* line numbers. Here we also detect it as a single line terminator
|
|
|
|
* token.
|
|
|
|
*/
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 2);
|
|
|
|
} else {
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
}
|
|
|
|
got_lineterm = 1;
|
|
|
|
goto restart_lineupdate;
|
|
|
|
} else if (duk_unicode_is_identifier_start(x) || x == DUK_ASC_BACKSLASH) {
|
|
|
|
/*
|
|
|
|
* Parse an identifier and then check whether it is:
|
|
|
|
* - reserved word (keyword or other reserved word)
|
|
|
|
* - "null" (NullLiteral)
|
|
|
|
* - "true" (BooleanLiteral)
|
|
|
|
* - "false" (BooleanLiteral)
|
|
|
|
* - anything else => identifier
|
|
|
|
*
|
|
|
|
* This does not follow the E5 productions cleanly, but is
|
|
|
|
* useful and compact.
|
|
|
|
*
|
|
|
|
* Note that identifiers may contain Unicode escapes,
|
|
|
|
* see E5 Sections 6 and 7.6. They must be decoded first,
|
|
|
|
* and the result checked against allowed characters.
|
|
|
|
* The above if-clause accepts an identifier start and an
|
|
|
|
* '\' character -- no other token can begin with a '\'.
|
|
|
|
*
|
|
|
|
* Note that "get" and "set" are not reserved words in E5
|
|
|
|
* specification so they are recognized as plain identifiers
|
|
|
|
* (the tokens DUK_TOK_GET and DUK_TOK_SET are actually not
|
|
|
|
* used now). The compiler needs to work around this.
|
|
|
|
*
|
|
|
|
* Strictly speaking, following Ecmascript longest match
|
|
|
|
* specification, an invalid escape for the first character
|
|
|
|
* should cause a syntax error. However, an invalid escape
|
|
|
|
* for IdentifierParts should just terminate the identifier
|
|
|
|
* early (longest match), and let the next tokenization
|
|
|
|
* fail. For instance Rhino croaks with 'foo\z' when
|
|
|
|
* parsing the identifier. This has little practical impact.
|
|
|
|
*/
|
|
|
|
|
|
|
|
duk_small_uint_t i, i_end;
|
|
|
|
duk_bool_t first = 1;
|
|
|
|
duk_hstring *str;
|
|
|
|
|
|
|
|
DUK__INITBUFFER(lex_ctx);
|
|
|
|
for (;;) {
|
|
|
|
/* re-lookup first char on first loop */
|
|
|
|
if (DUK__L0() == DUK_ASC_BACKSLASH) {
|
|
|
|
duk_codepoint_t esc_cp;
|
|
|
|
if (DUK__L1() != DUK_ASC_LC_U) {
|
|
|
|
goto fail_escape;
|
|
|
|
}
|
|
|
|
esc_cp = duk__lexer_parse_escape(lex_ctx, 1 /*allow_es6*/);
|
|
|
|
DUK__APPENDBUFFER(lex_ctx, esc_cp);
|
|
|
|
|
|
|
|
/* IdentifierStart is stricter than IdentifierPart, so if the first
|
|
|
|
* character is escaped, must have a stricter check here.
|
|
|
|
*/
|
|
|
|
if (!(first ? duk_unicode_is_identifier_start(esc_cp) : duk_unicode_is_identifier_part(esc_cp))) {
|
|
|
|
goto fail_escape;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Track number of escapes: necessary for proper keyword
|
|
|
|
* detection.
|
|
|
|
*/
|
|
|
|
out_token->num_escapes++;
|
|
|
|
} else {
|
|
|
|
/* Note: first character is checked against this. But because
|
|
|
|
* IdentifierPart includes all IdentifierStart characters, and
|
|
|
|
* the first character (if unescaped) has already been checked
|
|
|
|
* in the if condition, this is OK.
|
|
|
|
*/
|
|
|
|
if (!duk_unicode_is_identifier_part(DUK__L0())) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
DUK__APPENDBUFFER(lex_ctx, DUK__L0());
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
}
|
|
|
|
first = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
out_token->str1 = duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
|
|
|
|
str = out_token->str1;
|
|
|
|
out_token->t_nores = DUK_TOK_IDENTIFIER;
|
|
|
|
|
|
|
|
DUK__INITBUFFER(lex_ctx); /* free some memory */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Interned identifier is compared against reserved words, which are
|
|
|
|
* currently interned into the heap context. See genbuiltins.py.
|
|
|
|
*
|
|
|
|
* Note that an escape in the identifier disables recognition of
|
|
|
|
* keywords; e.g. "\u0069f = 1;" is a valid statement (assigns to
|
|
|
|
* identifier named "if"). This is not necessarily compliant,
|
|
|
|
* see test-dec-escaped-char-in-keyword.js.
|
|
|
|
*
|
|
|
|
* Note: "get" and "set" are awkward. They are not officially
|
|
|
|
* ReservedWords (and indeed e.g. "var set = 1;" is valid), and
|
|
|
|
* must come out as DUK_TOK_IDENTIFIER. The compiler needs to
|
|
|
|
* work around this a bit.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* XXX: optimize by adding the token numbers directly into the
|
|
|
|
* always interned duk_hstring objects (there should be enough
|
|
|
|
* flag bits free for that)?
|
|
|
|
*/
|
|
|
|
|
|
|
|
i_end = (strict_mode ? DUK_STRIDX_END_RESERVED : DUK_STRIDX_START_STRICT_RESERVED);
|
|
|
|
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_TOK_IDENTIFIER);
|
|
|
|
if (out_token->num_escapes == 0) {
|
|
|
|
for (i = DUK_STRIDX_START_RESERVED; i < i_end; i++) {
|
|
|
|
DUK_ASSERT_DISABLE(i >= 0); /* unsigned */
|
|
|
|
DUK_ASSERT(i < DUK_HEAP_NUM_STRINGS);
|
16-bit fields and heap pointer compression work
Memory optimization work for very low memory devices (96 to 256kB system RAM).
Overall changes are:
- 16-bit fields for various internal structures to reduce their size
- Heap pointer compression to reduce pointer size to 16 bits
When DUK_OPT_LIGHTFUNC_BUILTINS and the new low memory options are enabled,
Duktape initial heap memory usage is about 23kB (compared to baseline of
about 45kB) on x86.
Unless low memory feature options are enabled, there should be no visible
changes to Duktape behavior.
More detailed changes:
- 16-bit changes for duk_heaphdr: pointer compression, refcount
- 16-bit changes for duk_hstring: hash, blen, and clen can all be 16 bits,
use 0xFFFF as string byte length limit (call sites ensure this limit is
never exceeded)
- 16-bit changes for duk_hbuffer, use 0xFFFF as buffer length limit
- 16-bit fields for hobject size (entry part, array part), drop hash part
since it's not usually needed for extremely low memory environments
- 16-bit changes for duk_hcompiledfunction
- Heap pointer packing for stringtable
- Heap pointer packing for 'strs' built-in strings list (saves around 600
to 700 bytes but may not be a good tradeoff because call site size will
increase)
Other changes:
- Heaphdr NULL init fix. The original macros were broken: the double/single
linked macro variants were the wrong way around. Now sets through macro
to work properly with compressed pointers.
- Rename duk_hbuffer CURR_DATA_PTR -> DATA_PTR to reduce macro length
(previous name was tediously long)
- Rename buffer "usable_size" to "alloc_size" throughout as they have been
the same for a while now (they used to differ when buffer had an extra NUL).
- Add memory optimization markers to Duktape.env (pointer compression and
individual 16-bit field options)
- Rename a few internal fields for clarity: duk_hobject 'p' to 'props',
heap->st to heap->strtable
- Add a safety check for buffer alloc size (should not be triggered but
prevents wrapping if call sites don't properly check for sizes)
- Other minor cleanups
10 years ago
|
|
|
if (DUK_HTHREAD_GET_STRING(lex_ctx->thr, i) == str) {
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_STRIDX_TO_TOK(i));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (DUK__ISDIGIT(x) || (x == DUK_ASC_PERIOD)) {
|
|
|
|
/* Note: decimal number may start with a period, but must be followed by a digit */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pre-parsing for decimal, hex, octal (both legacy and ES2015),
|
|
|
|
* and binary literals, followed by an actual parser step
|
|
|
|
* provided by numconv.
|
|
|
|
*
|
|
|
|
* Note: the leading sign character ('+' or '-') is -not- part of
|
|
|
|
* the production in E5 grammar, and a DecimalLiteral
|
|
|
|
* starting with a '0' must be followed by a non-digit.
|
|
|
|
*
|
|
|
|
* XXX: the two step parsing process is quite awkward, it would
|
|
|
|
* be more straightforward to allow numconv to parse the longest
|
|
|
|
* valid prefix (it already does that, it only needs to indicate
|
|
|
|
* where the input ended). However, the lexer decodes characters
|
|
|
|
* using a limited lookup window, so this is not a trivial change.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* XXX: because of the final check below (that the literal is not
|
|
|
|
* followed by a digit), this could maybe be simplified, if we bail
|
|
|
|
* out early from a leading zero (and if there are no periods etc).
|
|
|
|
* Maybe too complex.
|
|
|
|
*/
|
|
|
|
|
|
|
|
duk_double_t val;
|
|
|
|
duk_bool_t legacy_oct = 0;
|
|
|
|
duk_small_int_t state; /* 0=before period/exp,
|
|
|
|
* 1=after period, before exp
|
|
|
|
* 2=after exp, allow '+' or '-'
|
|
|
|
* 3=after exp and exp sign
|
|
|
|
*/
|
|
|
|
duk_small_uint_t s2n_flags;
|
|
|
|
duk_codepoint_t y, z;
|
|
|
|
duk_small_int_t s2n_radix = 10;
|
|
|
|
duk_small_uint_t pre_adv = 0;
|
|
|
|
|
|
|
|
DUK__INITBUFFER(lex_ctx);
|
|
|
|
y = DUK__L1();
|
|
|
|
|
|
|
|
if (x == DUK_ASC_0) {
|
|
|
|
z = DUK_LOWERCASE_CHAR_ASCII(y);
|
|
|
|
|
|
|
|
pre_adv = 2; /* default for 0xNNN, 0oNNN, 0bNNN. */
|
|
|
|
if (z == DUK_ASC_LC_X) {
|
|
|
|
s2n_radix = 16;
|
|
|
|
} else if (z == DUK_ASC_LC_O) {
|
|
|
|
s2n_radix = 8;
|
|
|
|
} else if (z == DUK_ASC_LC_B) {
|
|
|
|
s2n_radix = 2;
|
|
|
|
} else {
|
|
|
|
pre_adv = 0;
|
|
|
|
if (DUK__ISDIGIT(y)) {
|
|
|
|
if (strict_mode) {
|
|
|
|
/* Reject octal like \07 but also octal-lookalike
|
|
|
|
* decimal like \08 in strict mode.
|
|
|
|
*/
|
|
|
|
goto fail_number_literal;
|
|
|
|
} else {
|
|
|
|
/* Legacy OctalIntegerLiteral or octal-lookalike
|
|
|
|
* decimal. Deciding between the two happens below
|
|
|
|
* in digit scanning.
|
|
|
|
*/
|
|
|
|
DUK__APPENDBUFFER(lex_ctx, x);
|
|
|
|
pre_adv = 1;
|
|
|
|
legacy_oct = 1;
|
|
|
|
s2n_radix = 8; /* tentative unless conflicting digits found */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, pre_adv);
|
|
|
|
|
|
|
|
/* XXX: we could parse integers here directly, and fall back
|
|
|
|
* to numconv only when encountering a fractional expression
|
|
|
|
* or when an octal literal turned out to be decimal (0778 etc).
|
|
|
|
*/
|
|
|
|
state = 0;
|
|
|
|
for (;;) {
|
|
|
|
x = DUK__L0(); /* re-lookup curr char on first round */
|
|
|
|
if (DUK__ISDIGIT(x)) {
|
|
|
|
/* Note: intentionally allow leading zeroes here, as the
|
|
|
|
* actual parser will check for them.
|
|
|
|
*/
|
|
|
|
if (state == 0 && legacy_oct && (x == DUK_ASC_8 || x == DUK_ASC_9)) {
|
|
|
|
/* Started out as an octal-lookalike
|
|
|
|
* but interpreted as decimal, e.g.
|
|
|
|
* '0779' -> 779. This also means
|
|
|
|
* that fractions are allowed, e.g.
|
|
|
|
* '0779.123' is allowed but '0777.123'
|
|
|
|
* is not!
|
|
|
|
*/
|
|
|
|
s2n_radix = 10;
|
|
|
|
}
|
|
|
|
if (state == 2) {
|
|
|
|
state = 3;
|
|
|
|
}
|
|
|
|
} else if (s2n_radix == 16 && DUK__ISHEXDIGIT(x)) {
|
|
|
|
/* Note: 'e' and 'E' are also accepted here. */
|
|
|
|
;
|
|
|
|
} else if (x == DUK_ASC_PERIOD) {
|
|
|
|
if (state >= 1 || s2n_radix != 10) {
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
state = 1;
|
|
|
|
}
|
|
|
|
} else if (x == DUK_ASC_LC_E || x == DUK_ASC_UC_E) {
|
|
|
|
if (state >= 2 || s2n_radix != 10) {
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
state = 2;
|
|
|
|
}
|
|
|
|
} else if (x == DUK_ASC_MINUS || x == DUK_ASC_PLUS) {
|
|
|
|
if (state != 2) {
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
state = 3;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
DUK__APPENDBUFFER(lex_ctx, x);
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* XXX: better coercion */
|
|
|
|
(void) duk__internbuffer(lex_ctx, lex_ctx->slot1_idx);
|
|
|
|
|
|
|
|
if (s2n_radix != 10) {
|
|
|
|
/* For bases other than 10, integer only. */
|
|
|
|
s2n_flags = DUK_S2N_FLAG_ALLOW_LEADING_ZERO;
|
|
|
|
} else {
|
|
|
|
s2n_flags = DUK_S2N_FLAG_ALLOW_EXP |
|
|
|
|
DUK_S2N_FLAG_ALLOW_FRAC |
|
|
|
|
DUK_S2N_FLAG_ALLOW_NAKED_FRAC |
|
|
|
|
DUK_S2N_FLAG_ALLOW_EMPTY_FRAC |
|
|
|
|
DUK_S2N_FLAG_ALLOW_LEADING_ZERO;
|
|
|
|
}
|
|
|
|
|
|
|
|
duk_dup(lex_ctx->thr, lex_ctx->slot1_idx);
|
|
|
|
duk_numconv_parse(lex_ctx->thr, s2n_radix, s2n_flags);
|
|
|
|
val = duk_to_number_m1(lex_ctx->thr);
|
|
|
|
if (DUK_ISNAN(val)) {
|
|
|
|
goto fail_number_literal;
|
|
|
|
}
|
|
|
|
duk_replace(lex_ctx->thr, lex_ctx->slot1_idx); /* could also just pop? */
|
|
|
|
|
|
|
|
DUK__INITBUFFER(lex_ctx); /* free some memory */
|
|
|
|
|
|
|
|
/* Section 7.8.3 (note): NumericLiteral must be followed by something other than
|
|
|
|
* IdentifierStart or DecimalDigit.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (DUK__ISDIGIT(DUK__L0()) || duk_unicode_is_identifier_start(DUK__L0())) {
|
|
|
|
goto fail_number_literal;
|
|
|
|
}
|
|
|
|
|
|
|
|
out_token->num = val;
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_TOK_NUMBER);
|
|
|
|
} else if (duk_unicode_is_whitespace(DUK__LOOKUP(lex_ctx, 0))) {
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
goto restart;
|
|
|
|
} else if (x < 0) {
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_TOK_EOF);
|
|
|
|
} else {
|
|
|
|
goto fail_token;
|
|
|
|
}
|
|
|
|
skip_slow_path:
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Shared exit path
|
|
|
|
*/
|
|
|
|
|
|
|
|
DUK__ADVANCEBYTES(lex_ctx, advtok >> 8);
|
|
|
|
out_token->t = advtok & 0xff;
|
|
|
|
if (out_token->t_nores == DUK_TOK_INVALID) {
|
|
|
|
out_token->t_nores = out_token->t;
|
|
|
|
}
|
|
|
|
out_token->lineterm = got_lineterm;
|
|
|
|
|
|
|
|
/* Automatic semicolon insertion is allowed if a token is preceded
|
|
|
|
* by line terminator(s), or terminates a statement list (right curly
|
|
|
|
* or EOF).
|
|
|
|
*/
|
|
|
|
if (got_lineterm || out_token->t == DUK_TOK_RCURLY || out_token->t == DUK_TOK_EOF) {
|
|
|
|
out_token->allow_auto_semi = 1;
|
|
|
|
} else {
|
|
|
|
out_token->allow_auto_semi = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail_token_limit:
|
|
|
|
DUK_ERROR_RANGE(lex_ctx->thr, DUK_STR_TOKEN_LIMIT);
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail_token:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_TOKEN);
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail_number_literal:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_NUMBER_LITERAL);
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail_escape:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_ESCAPE);
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail_unterm_regexp:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_UNTERMINATED_REGEXP);
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail_unterm_comment:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_UNTERMINATED_COMMENT);
|
|
|
|
return;
|
|
|
|
|
|
|
|
#if !defined(DUK_USE_REGEXP_SUPPORT)
|
|
|
|
fail_regexp_support:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_REGEXP_SUPPORT_DISABLED);
|
|
|
|
return;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
#if defined(DUK_USE_REGEXP_SUPPORT)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parse a RegExp token. The grammar is described in E5 Section 15.10.
|
|
|
|
* Terminal constructions (such as quantifiers) are parsed directly here.
|
|
|
|
*
|
|
|
|
* 0xffffffffU is used as a marker for "infinity" in quantifiers. Further,
|
|
|
|
* DUK__MAX_RE_QUANT_DIGITS limits the maximum number of digits that
|
|
|
|
* will be accepted for a quantifier.
|
|
|
|
*/
|
|
|
|
|
|
|
|
DUK_INTERNAL void duk_lexer_parse_re_token(duk_lexer_ctx *lex_ctx, duk_re_token *out_token) {
|
|
|
|
duk_small_uint_t advtok = 0; /* init is unnecessary but suppresses "may be used uninitialized" warnings */
|
|
|
|
duk_codepoint_t x, y;
|
|
|
|
|
|
|
|
if (++lex_ctx->token_count >= lex_ctx->token_limit) {
|
|
|
|
goto fail_token_limit;
|
|
|
|
}
|
|
|
|
|
|
|
|
DUK_MEMZERO(out_token, sizeof(*out_token));
|
|
|
|
|
|
|
|
x = DUK__L0();
|
|
|
|
y = DUK__L1();
|
|
|
|
|
|
|
|
DUK_DDD(DUK_DDDPRINT("parsing regexp token, L0=%ld, L1=%ld", (long) x, (long) y));
|
|
|
|
|
|
|
|
switch (x) {
|
|
|
|
case DUK_ASC_PIPE: {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_RETOK_DISJUNCTION);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case DUK_ASC_CARET: {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_RETOK_ASSERT_START);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case DUK_ASC_DOLLAR: {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_RETOK_ASSERT_END);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case DUK_ASC_QUESTION: {
|
|
|
|
out_token->qmin = 0;
|
|
|
|
out_token->qmax = 1;
|
|
|
|
if (y == DUK_ASC_QUESTION) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
|
|
|
|
out_token->greedy = 0;
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
|
|
|
|
out_token->greedy = 1;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case DUK_ASC_STAR: {
|
|
|
|
out_token->qmin = 0;
|
|
|
|
out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
|
|
|
|
if (y == DUK_ASC_QUESTION) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
|
|
|
|
out_token->greedy = 0;
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
|
|
|
|
out_token->greedy = 1;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case DUK_ASC_PLUS: {
|
|
|
|
out_token->qmin = 1;
|
|
|
|
out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
|
|
|
|
if (y == DUK_ASC_QUESTION) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_QUANTIFIER);
|
|
|
|
out_token->greedy = 0;
|
|
|
|
} else {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_RETOK_QUANTIFIER);
|
|
|
|
out_token->greedy = 1;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case DUK_ASC_LCURLY: {
|
|
|
|
/* Production allows 'DecimalDigits', including leading zeroes */
|
|
|
|
duk_uint32_t val1 = 0;
|
|
|
|
duk_uint32_t val2 = DUK_RE_QUANTIFIER_INFINITE;
|
|
|
|
duk_small_int_t digits = 0;
|
|
|
|
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
|
|
|
|
duk_lexer_point lex_pt;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
|
|
|
|
/* Store lexer position, restoring if quantifier is invalid. */
|
|
|
|
DUK_LEXER_GETPOINT(lex_ctx, &lex_pt);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1); /* eat '{' on entry */
|
|
|
|
x = DUK__L0();
|
|
|
|
if (DUK__ISDIGIT(x)) {
|
|
|
|
digits++;
|
|
|
|
val1 = val1 * 10 + (duk_uint32_t) duk__hexval(x);
|
|
|
|
} else if (x == DUK_ASC_COMMA) {
|
|
|
|
if (digits > DUK__MAX_RE_QUANT_DIGITS) {
|
|
|
|
goto invalid_quantifier;
|
|
|
|
}
|
|
|
|
if (val2 != DUK_RE_QUANTIFIER_INFINITE) {
|
|
|
|
goto invalid_quantifier;
|
|
|
|
}
|
|
|
|
if (DUK__L1() == DUK_ASC_RCURLY) {
|
|
|
|
/* form: { DecimalDigits , }, val1 = min count */
|
|
|
|
if (digits == 0) {
|
|
|
|
goto invalid_quantifier;
|
|
|
|
}
|
|
|
|
out_token->qmin = val1;
|
|
|
|
out_token->qmax = DUK_RE_QUANTIFIER_INFINITE;
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 2);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
val2 = val1;
|
|
|
|
val1 = 0;
|
|
|
|
digits = 0; /* not strictly necessary because of lookahead '}' above */
|
|
|
|
} else if (x == DUK_ASC_RCURLY) {
|
|
|
|
if (digits > DUK__MAX_RE_QUANT_DIGITS) {
|
|
|
|
goto invalid_quantifier;
|
|
|
|
}
|
|
|
|
if (digits == 0) {
|
|
|
|
goto invalid_quantifier;
|
|
|
|
}
|
|
|
|
if (val2 != DUK_RE_QUANTIFIER_INFINITE) {
|
|
|
|
/* val2 = min count, val1 = max count */
|
|
|
|
out_token->qmin = val2;
|
|
|
|
out_token->qmax = val1;
|
|
|
|
} else {
|
|
|
|
/* val1 = count */
|
|
|
|
out_token->qmin = val1;
|
|
|
|
out_token->qmax = val1;
|
|
|
|
}
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
goto invalid_quantifier;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (DUK__L0() == DUK_ASC_QUESTION) {
|
|
|
|
out_token->greedy = 0;
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1);
|
|
|
|
} else {
|
|
|
|
out_token->greedy = 1;
|
|
|
|
}
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_RETOK_QUANTIFIER);
|
|
|
|
break;
|
|
|
|
invalid_quantifier:
|
|
|
|
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
|
|
|
|
/* Failed to match the quantifier, restore lexer and parse
|
|
|
|
* opening brace as a literal.
|
|
|
|
*/
|
|
|
|
DUK_LEXER_SETPOINT(lex_ctx, &lex_pt);
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_CHAR);
|
|
|
|
out_token->num = DUK_ASC_LCURLY;
|
|
|
|
#else
|
|
|
|
goto fail_quantifier;
|
|
|
|
#endif
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case DUK_ASC_PERIOD: {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_PERIOD);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case DUK_ASC_BACKSLASH: {
|
|
|
|
/* The E5.1 specification does not seem to allow IdentifierPart characters
|
|
|
|
* to be used as identity escapes. Unfortunately this includes '$', which
|
|
|
|
* cannot be escaped as '\$'; it needs to be escaped e.g. as '\u0024'.
|
|
|
|
* Many other implementations (including V8 and Rhino, for instance) do
|
|
|
|
* accept '\$' as a valid identity escape, which is quite pragmatic, and
|
|
|
|
* ES2015 Annex B relaxes the rules to allow these (and other) real world forms.
|
|
|
|
*/
|
|
|
|
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_CHAR); /* default: char escape (two chars) */
|
|
|
|
if (y == DUK_ASC_LC_B) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_ASSERT_WORD_BOUNDARY);
|
|
|
|
} else if (y == DUK_ASC_UC_B) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY);
|
|
|
|
} else if (y == DUK_ASC_LC_F) {
|
|
|
|
out_token->num = 0x000c;
|
|
|
|
} else if (y == DUK_ASC_LC_N) {
|
|
|
|
out_token->num = 0x000a;
|
|
|
|
} else if (y == DUK_ASC_LC_T) {
|
|
|
|
out_token->num = 0x0009;
|
|
|
|
} else if (y == DUK_ASC_LC_R) {
|
|
|
|
out_token->num = 0x000d;
|
|
|
|
} else if (y == DUK_ASC_LC_V) {
|
|
|
|
out_token->num = 0x000b;
|
|
|
|
} else if (y == DUK_ASC_LC_C) {
|
|
|
|
x = DUK__L2();
|
|
|
|
if ((x >= DUK_ASC_LC_A && x <= DUK_ASC_LC_Z) ||
|
|
|
|
(x >= DUK_ASC_UC_A && x <= DUK_ASC_UC_Z)) {
|
|
|
|
out_token->num = (duk_uint32_t) (x % 32);
|
|
|
|
advtok = DUK__ADVTOK(3, DUK_RETOK_ATOM_CHAR);
|
|
|
|
} else {
|
|
|
|
goto fail_escape;
|
|
|
|
}
|
|
|
|
} else if (y == DUK_ASC_LC_X || y == DUK_ASC_LC_U) {
|
|
|
|
/* The token value is the Unicode codepoint without
|
|
|
|
* it being decode into surrogate pair characters
|
|
|
|
* here. The \u{H+} is only allowed in Unicode mode
|
|
|
|
* which we don't support yet.
|
|
|
|
*/
|
|
|
|
out_token->num = (duk_uint32_t) duk__lexer_parse_escape(lex_ctx, 0 /*allow_es6*/);
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_RETOK_ATOM_CHAR);
|
|
|
|
} else if (y == DUK_ASC_LC_D) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_DIGIT);
|
|
|
|
} else if (y == DUK_ASC_UC_D) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_DIGIT);
|
|
|
|
} else if (y == DUK_ASC_LC_S) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_WHITE);
|
|
|
|
} else if (y == DUK_ASC_UC_S) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_WHITE);
|
|
|
|
} else if (y == DUK_ASC_LC_W) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_WORD_CHAR);
|
|
|
|
} else if (y == DUK_ASC_UC_W) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_NOT_WORD_CHAR);
|
|
|
|
} else if (DUK__ISDIGIT(y)) {
|
|
|
|
/* E5 Section 15.10.2.11 */
|
|
|
|
if (y == DUK_ASC_0) {
|
|
|
|
if (DUK__ISDIGIT(DUK__L2())) {
|
|
|
|
goto fail_escape;
|
|
|
|
}
|
|
|
|
out_token->num = 0x0000;
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_CHAR);
|
|
|
|
} else {
|
|
|
|
/* XXX: shared parsing? */
|
|
|
|
duk_uint32_t val = 0;
|
|
|
|
duk_small_int_t i;
|
|
|
|
for (i = 0; ; i++) {
|
|
|
|
if (i >= DUK__MAX_RE_DECESC_DIGITS) {
|
|
|
|
goto fail_escape;
|
|
|
|
}
|
|
|
|
DUK__ADVANCECHARS(lex_ctx, 1); /* eat backslash on entry */
|
|
|
|
x = DUK__L0();
|
|
|
|
if (!DUK__ISDIGIT(x)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
val = val * 10 + (duk_uint32_t) duk__hexval(x);
|
|
|
|
}
|
|
|
|
/* DUK__L0() cannot be a digit, because the loop doesn't terminate if it is */
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_RETOK_ATOM_BACKREFERENCE);
|
|
|
|
out_token->num = val;
|
|
|
|
}
|
|
|
|
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
|
|
|
|
} else if (y >= 0) {
|
|
|
|
/* For ES2015 Annex B, accept any source character as identity
|
|
|
|
* escape except 'c' which is used for control characters.
|
|
|
|
* http://www.ecma-international.org/ecma-262/6.0/#sec-regular-expressions-patterns
|
|
|
|
* Careful not to match end-of-buffer (<0) here.
|
|
|
|
* This is not yet full ES2015 Annex B because cases above
|
|
|
|
* (like hex escape) won't backtrack.
|
|
|
|
*/
|
|
|
|
DUK_ASSERT(y != DUK_ASC_LC_C); /* covered above */
|
|
|
|
#else /* DUK_USE_ES6_REGEXP_SYNTAX */
|
|
|
|
} else if ((y >= 0 && !duk_unicode_is_identifier_part(y)) ||
|
|
|
|
y == DUK_UNICODE_CP_ZWNJ ||
|
|
|
|
y == DUK_UNICODE_CP_ZWJ) {
|
|
|
|
/* For ES5.1 identity escapes are not allowed for identifier
|
|
|
|
* parts. This conflicts with a lot of real world code as this
|
|
|
|
* doesn't e.g. allow escaping a dollar sign as /\$/, see
|
|
|
|
* test-regexp-identity-escape-dollar.js.
|
|
|
|
*/
|
|
|
|
#endif /* DUK_USE_ES6_REGEXP_SYNTAX */
|
|
|
|
out_token->num = (duk_uint32_t) y;
|
|
|
|
} else {
|
|
|
|
goto fail_escape;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case DUK_ASC_LPAREN: {
|
|
|
|
/* XXX: naming is inconsistent: ATOM_END_GROUP ends an ASSERT_START_LOOKAHEAD */
|
|
|
|
|
|
|
|
if (y == DUK_ASC_QUESTION) {
|
|
|
|
if (DUK__L2() == DUK_ASC_EQUALS) {
|
|
|
|
/* (?= */
|
|
|
|
advtok = DUK__ADVTOK(3, DUK_RETOK_ASSERT_START_POS_LOOKAHEAD);
|
|
|
|
} else if (DUK__L2() == DUK_ASC_EXCLAMATION) {
|
|
|
|
/* (?! */
|
|
|
|
advtok = DUK__ADVTOK(3, DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD);
|
|
|
|
} else if (DUK__L2() == DUK_ASC_COLON) {
|
|
|
|
/* (?: */
|
|
|
|
advtok = DUK__ADVTOK(3, DUK_RETOK_ATOM_START_NONCAPTURE_GROUP);
|
|
|
|
} else {
|
|
|
|
goto fail_group;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* ( */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_START_CAPTURE_GROUP);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case DUK_ASC_RPAREN: {
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_END_GROUP);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case DUK_ASC_LBRACKET: {
|
|
|
|
/*
|
|
|
|
* To avoid creating a heavy intermediate value for the list of ranges,
|
|
|
|
* only the start token ('[' or '[^') is parsed here. The regexp
|
|
|
|
* compiler parses the ranges itself.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* XXX: with DUK_USE_ES6_REGEXP_SYNTAX we should allow left bracket
|
|
|
|
* literal too, but it's not easy to parse without backtracking.
|
|
|
|
*/
|
|
|
|
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_START_CHARCLASS);
|
|
|
|
if (y == DUK_ASC_CARET) {
|
|
|
|
advtok = DUK__ADVTOK(2, DUK_RETOK_ATOM_START_CHARCLASS_INVERTED);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
#if !defined(DUK_USE_ES6_REGEXP_SYNTAX)
|
|
|
|
case DUK_ASC_RCURLY:
|
|
|
|
case DUK_ASC_RBRACKET: {
|
|
|
|
/* Although these could be parsed as PatternCharacters unambiguously (here),
|
|
|
|
* E5 Section 15.10.1 grammar explicitly forbids these as PatternCharacters.
|
|
|
|
*/
|
|
|
|
goto fail_invalid_char;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
case -1: {
|
|
|
|
/* EOF */
|
|
|
|
advtok = DUK__ADVTOK(0, DUK_TOK_EOF);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default: {
|
|
|
|
/* PatternCharacter, all excluded characters are matched by cases above */
|
|
|
|
advtok = DUK__ADVTOK(1, DUK_RETOK_ATOM_CHAR);
|
|
|
|
out_token->num = (duk_uint32_t) x;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Shared exit path
|
|
|
|
*/
|
|
|
|
|
|
|
|
DUK__ADVANCEBYTES(lex_ctx, advtok >> 8);
|
|
|
|
out_token->t = advtok & 0xff;
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail_token_limit:
|
|
|
|
DUK_ERROR_RANGE(lex_ctx->thr, DUK_STR_TOKEN_LIMIT);
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail_escape:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_REGEXP_ESCAPE);
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail_group:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_REGEXP_GROUP);
|
|
|
|
return;
|
|
|
|
|
|
|
|
#if !defined(DUK_USE_ES6_REGEXP_SYNTAX)
|
|
|
|
fail_invalid_char:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_REGEXP_CHARACTER);
|
|
|
|
return;
|
|
|
|
|
|
|
|
fail_quantifier:
|
|
|
|
DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_QUANTIFIER);
|
|
|
|
return;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Special parser for character classes; calls callback for every
|
|
|
|
* range parsed (the parser function itself has no return value).
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* XXX: this duplicates functionality in duk_regexp.c where a similar loop is
|
|
|
|
* required anyway. We could use that BUT we need to update the regexp compiler
|
|
|
|
* 'nranges' too. Work this out a bit more cleanly to save space.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* XXX: the handling of character range detection is a bit convoluted.
|
|
|
|
* Try to simplify and make smaller.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* XXX: logic for handling character ranges is now incorrect, it will accept
|
|
|
|
* e.g. [\d-z] whereas it should croak from it? SMJS accepts this too, though.
|
|
|
|
*
|
|
|
|
* Needs a read through and a lot of additional tests.
|
|
|
|
*/
|
|
|
|
|
|
|
|
DUK_LOCAL
|
|
|
|
void duk__emit_u16_direct_ranges(duk_lexer_ctx *lex_ctx,
|
|
|
|
duk_re_range_callback gen_range,
|
|
|
|
void *userdata,
|
|
|
|
const duk_uint16_t *ranges,
|
|
|
|
duk_small_int_t num) {
|
|
|
|
const duk_uint16_t *ranges_end;
|
|
|
|
|
|
|
|
DUK_UNREF(lex_ctx);
|
|
|
|
|
|
|
|
ranges_end = ranges + num;
|
|
|
|
while (ranges < ranges_end) {
|
|
|
|
/* mark range 'direct', bypass canonicalization (see Wiki) */
|
|
|
|
gen_range(userdata, (duk_codepoint_t) ranges[0], (duk_codepoint_t) ranges[1], 1);
|
|
|
|
ranges += 2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Parse the body of a regexp character class up to and including the
 * closing ']', calling 'gen_range(userdata, start, end, direct)' for every
 * range found.  A single character is reported as a range whose start and
 * end are equal.  Ranges coming from \d, \D, \s, \S, \w, \W are reported
 * with direct=1, all others with direct=0.
 *
 * Presumably called with the lexer positioned just after the '[' or '[^'
 * start token -- confirm against the caller.
 *
 * Throws a SyntaxError for an invalid escape, an invalid range (start
 * above end, or a class escape used as a range endpoint), or a character
 * class which is not terminated before end of input.
 */
DUK_INTERNAL void duk_lexer_parse_re_ranges(duk_lexer_ctx *lex_ctx, duk_re_range_callback gen_range, void *userdata) {
	duk_codepoint_t start = -1;  /* pending single character / range start, -1 if none */
	duk_codepoint_t ch;
	duk_codepoint_t x;
	duk_bool_t dash = 0;         /* 1 if 'start' has been followed by a range dash */
	duk_small_uint_t adv = 0;    /* characters to skip at the start of the next round */

	DUK_DD(DUK_DDPRINT("parsing regexp ranges"));

	for (;;) {
		DUK__ADVANCECHARS(lex_ctx, adv);
		adv = 1;

		x = DUK__L0();

		ch = -1;  /* not strictly necessary, but avoids "uninitialized variable" warnings */
		DUK_UNREF(ch);

		if (x < 0) {
			goto fail_unterm_charclass;
		} else if (x == DUK_ASC_RBRACKET) {
			/* Flush a pending single character, if any. */
			if (start >= 0) {
				gen_range(userdata, start, start, 0);
			}
			DUK__ADVANCECHARS(lex_ctx, 1);  /* eat ']' before finishing */
			break;
		} else if (x == DUK_ASC_MINUS) {
			if (start >= 0 && !dash && DUK__L1() != DUK_ASC_RBRACKET) {
				/* '-' as a range indicator */
				dash = 1;
				continue;
			} else {
				/* '-' verbatim */
				ch = x;
			}
		} else if (x == DUK_ASC_BACKSLASH) {
			/*
			 *  The escapes are same as outside a character class, except that \b has a
			 *  different meaning, and \B and backreferences are prohibited (see E5
			 *  Section 15.10.2.19).  However, it's difficult to share code because we
			 *  handle e.g. "\n" very differently: here we generate a single character
			 *  range for it.
			 */

			/* XXX: ES2015 surrogate pair handling. */

			x = DUK__L1();

			adv = 2;

			if (x == DUK_ASC_LC_B) {
				/* Note: '\b' in char class is different than outside (assertion).
				 * Without DUK_USE_ES6_REGEXP_SYNTAX '\B' is not allowed and is
				 * caught by the duk_unicode_is_identifier_part() check below;
				 * with it, the identity escape branch below accepts any
				 * non-EOF character.
				 */
				ch = 0x0008;
			} else if (x == DUK_ASC_LC_F) {
				ch = 0x000c;
			} else if (x == DUK_ASC_LC_N) {
				ch = 0x000a;
			} else if (x == DUK_ASC_LC_T) {
				ch = 0x0009;
			} else if (x == DUK_ASC_LC_R) {
				ch = 0x000d;
			} else if (x == DUK_ASC_LC_V) {
				ch = 0x000b;
			} else if (x == DUK_ASC_LC_C) {
				/* Control escape: backslash, 'c', ASCII letter. */
				x = DUK__L2();
				adv = 3;
				if ((x >= DUK_ASC_LC_A && x <= DUK_ASC_LC_Z) ||
				    (x >= DUK_ASC_UC_A && x <= DUK_ASC_UC_Z)) {
					ch = (x % 32);
				} else {
					goto fail_escape;
				}
			} else if (x == DUK_ASC_LC_X || x == DUK_ASC_LC_U) {
				/* The \u{H+} form is only allowed in Unicode mode which
				 * we don't support yet.
				 */
				ch = duk__lexer_parse_escape(lex_ctx, 0 /*allow_es6*/);
				adv = 0;  /* nothing left to skip on the next round */
			} else if (x == DUK_ASC_LC_D) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_digit,
				                            sizeof(duk_unicode_re_ranges_digit) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_UC_D) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_not_digit,
				                            sizeof(duk_unicode_re_ranges_not_digit) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_LC_S) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_white,
				                            sizeof(duk_unicode_re_ranges_white) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_UC_S) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_not_white,
				                            sizeof(duk_unicode_re_ranges_not_white) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_LC_W) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_wordchar,
				                            sizeof(duk_unicode_re_ranges_wordchar) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (x == DUK_ASC_UC_W) {
				duk__emit_u16_direct_ranges(lex_ctx,
				                            gen_range,
				                            userdata,
				                            duk_unicode_re_ranges_not_wordchar,
				                            sizeof(duk_unicode_re_ranges_not_wordchar) / sizeof(duk_uint16_t));
				ch = -1;
			} else if (DUK__ISDIGIT(x)) {
				/* DecimalEscape, only \0 is allowed, no leading
				 * zeroes are allowed.
				 *
				 * ES2015 Annex B also allows (maximal match) legacy
				 * octal escapes up to \377 and \8 and \9 are
				 * accepted as literal '8' and '9', also in strict mode.
				 */

#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
				ch = duk__lexer_parse_legacy_octal(lex_ctx, &adv, 0 /*reject_annex_b*/);
				DUK_ASSERT(ch >= 0);  /* no rejections */
#else
				if (x == DUK_ASC_0 && !DUK__ISDIGIT(DUK__L2())) {
					ch = 0x0000;
				} else {
					goto fail_escape;
				}
#endif
#if defined(DUK_USE_ES6_REGEXP_SYNTAX)
			} else if (x >= 0) {
				/* IdentityEscape: ES2015 Annex B allows almost all
				 * source characters here.  Match anything except
				 * EOF here.
				 */
				ch = x;
#else  /* DUK_USE_ES6_REGEXP_SYNTAX */
			} else if ((x >= 0 && !duk_unicode_is_identifier_part(x)) ||
			           x == DUK_UNICODE_CP_ZWNJ ||
			           x == DUK_UNICODE_CP_ZWJ) {
				/* IdentityEscape: ES5.1 doesn't allow identity escape
				 * for identifier part characters, which conflicts with
				 * some real world code.  For example, it doesn't allow
				 * /[\$]/ which is awkward.
				 *
				 * Same condition as for identity escapes outside a
				 * character class: end-of-input (x < 0) is never a
				 * valid escape, and ZWNJ / ZWJ are explicitly allowed.
				 */
				ch = x;
#endif  /* DUK_USE_ES6_REGEXP_SYNTAX */
			} else {
				goto fail_escape;
			}
		} else {
			/* character represents itself */
			ch = x;
		}

		/* ch is a literal character here or -1 if parsed entity was
		 * an escape such as "\s".
		 */

		if (ch < 0) {
			/* multi-character sets not allowed as part of ranges, see
			 * E5 Section 15.10.2.15, abstract operation CharacterRange.
			 */
			if (start >= 0) {
				if (dash) {
					goto fail_range;
				} else {
					gen_range(userdata, start, start, 0);
					start = -1;
					/* dash is already 0 */
				}
			}
		} else {
			if (start >= 0) {
				if (dash) {
					/* Completed range start-ch. */
					if (start > ch) {
						goto fail_range;
					}
					gen_range(userdata, start, ch, 0);
					start = -1;
					dash = 0;
				} else {
					/* Previous character stands alone; current
					 * one becomes the new potential range start.
					 */
					gen_range(userdata, start, start, 0);
					start = ch;
					/* dash is already 0 */
				}
			} else {
				start = ch;
			}
		}
	}

	return;

 fail_escape:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_REGEXP_ESCAPE);
	return;

 fail_range:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_INVALID_RANGE);
	return;

 fail_unterm_charclass:
	DUK_ERROR_SYNTAX(lex_ctx->thr, DUK_STR_UNTERMINATED_CHARCLASS);
	return;
}
|
|
|
|
|
|
|
|
#endif /* DUK_USE_REGEXP_SUPPORT */
|