mirror of https://github.com/svaarala/duktape.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
373 lines
11 KiB
373 lines
11 KiB
8 years ago
|
/*
|
||
|
* WHATWG Encoding API built-ins
|
||
|
*
|
||
|
* API specification: https://encoding.spec.whatwg.org/#api
|
||
|
* Web IDL: https://www.w3.org/TR/WebIDL/
|
||
|
*/
|
||
|
|
||
|
#include "duk_internal.h"
|
||
|
|
||
|
#if defined(DUK_USE_ENCODING_BUILTINS)
|
||
|
typedef struct {
|
||
|
duk_uint8_t *out; /* where to write next byte(s) */
|
||
|
duk_codepoint_t lead; /* lead surrogate */
|
||
|
} duk__encode_context;
|
||
|
|
||
|
typedef struct {
|
||
|
duk_bool_t bom_seen;
|
||
|
duk_bool_t fatal;
|
||
|
duk_bool_t ignore_bom;
|
||
|
|
||
|
/* UTF-8 decoding state */
|
||
|
duk_codepoint_t codepoint; /* built up incrementally */
|
||
|
duk_int_t needed; /* how many more bytes we need */
|
||
|
duk_uint8_t upper; /* max value of next byte (decode error otherwise) */
|
||
|
duk_uint8_t lower; /* min value of next byte (ditto) */
|
||
|
} duk__decode_context;
|
||
|
|
||
|
#define DUK__U8_CONTINUE 0 /* continue to next byte */
|
||
|
#define DUK__U8_CODEPOINT 1 /* have codepoint */
|
||
|
#define DUK__U8_ERROR 2 /* decoding error */
|
||
|
#define DUK__U8_RETRY 3 /* retry last byte, constraint: RETRY > ERROR */
|
||
|
|
||
|
DUK_LOCAL void duk__utf8_decode_init(duk__decode_context *dec_ctx) {
|
||
|
dec_ctx->codepoint = 0x0000L;
|
||
|
dec_ctx->needed = 0;
|
||
|
dec_ctx->upper = 0xbf;
|
||
|
dec_ctx->lower = 0x80;
|
||
|
dec_ctx->bom_seen = 0;
|
||
|
}
|
||
|
|
||
|
DUK_LOCAL int duk__utf8_decode_next(duk__decode_context *dec_ctx, duk_uint8_t byte, duk_codepoint_t *cp) {
|
||
|
/*
|
||
|
* UTF-8 algorithm based on the Encoding specification:
|
||
|
* https://encoding.spec.whatwg.org/#utf-8-decoder
|
||
|
*/
|
||
|
|
||
|
if (dec_ctx->needed == 0) {
|
||
|
/* process initial byte */
|
||
|
if (byte <= 0x7f) {
|
||
|
/* U+0000-U+007F, 1 byte (ASCII) */
|
||
|
*cp = (duk_codepoint_t) byte;
|
||
|
return DUK__U8_CODEPOINT;
|
||
|
} else if (byte >= 0xc2 && byte <= 0xdf) {
|
||
|
/* U+0080-U+07FF, 2 bytes */
|
||
|
dec_ctx->needed = 1;
|
||
|
dec_ctx->codepoint = byte & 0x1f;
|
||
|
return DUK__U8_CONTINUE;
|
||
|
} else if (byte >= 0xe0 && byte <= 0xef) {
|
||
|
/* U+0800-U+FFFF, 3 bytes */
|
||
|
if (byte == 0xe0) {
|
||
|
dec_ctx->lower = 0xa0;
|
||
|
} else if (byte == 0xed) {
|
||
|
dec_ctx->upper = 0x9f;
|
||
|
}
|
||
|
dec_ctx->needed = 2;
|
||
|
dec_ctx->codepoint = byte & 0xf;
|
||
|
return DUK__U8_CONTINUE;
|
||
|
} else if (byte >= 0xf0 && byte <= 0xf4) {
|
||
|
/* U+010000-U+10FFFF, 4 bytes */
|
||
|
if (byte == 0xf0) {
|
||
|
dec_ctx->lower = 0x90;
|
||
|
} else if (byte == 0xf4) {
|
||
|
dec_ctx->upper = 0x8f;
|
||
|
}
|
||
|
dec_ctx->needed = 3;
|
||
|
dec_ctx->codepoint = byte & 0x7;
|
||
|
return DUK__U8_CONTINUE;
|
||
|
} else {
|
||
|
/* not a legal initial byte */
|
||
|
return DUK__U8_ERROR;
|
||
|
}
|
||
|
} else {
|
||
|
/* process continuation byte */
|
||
|
if (byte >= dec_ctx->lower && byte <= dec_ctx->upper) {
|
||
|
dec_ctx->lower = 0x80;
|
||
|
dec_ctx->upper = 0xbf;
|
||
|
dec_ctx->codepoint = (dec_ctx->codepoint << 6) | (byte & 0x3f);
|
||
|
if (--dec_ctx->needed > 0) {
|
||
|
/* need more bytes */
|
||
|
return DUK__U8_CONTINUE;
|
||
|
} else {
|
||
|
/* got a codepoint */
|
||
|
*cp = dec_ctx->codepoint;
|
||
|
dec_ctx->codepoint = 0x0000L;
|
||
|
dec_ctx->needed = 0;
|
||
|
return DUK__U8_CODEPOINT;
|
||
|
}
|
||
|
} else {
|
||
|
/* We just encountered an illegal UTF-8 continuation byte. This might
|
||
|
* be the initial byte of the next character; if we return a plain
|
||
|
* error status and the decoder is in replacement mode, the character
|
||
|
* will be masked. We still need to alert the caller to the error
|
||
|
* though.
|
||
|
*/
|
||
|
dec_ctx->codepoint = 0x0000L;
|
||
|
dec_ctx->needed = 0;
|
||
|
dec_ctx->lower = 0x80;
|
||
|
dec_ctx->upper = 0xbf;
|
||
|
return DUK__U8_RETRY;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
DUK_LOCAL void duk__utf8_encode_char(void *udata, duk_codepoint_t codepoint) {
|
||
|
duk__encode_context *enc_ctx;
|
||
|
|
||
|
enc_ctx = (duk__encode_context *) udata;
|
||
|
if (codepoint > 0x10ffffL) {
|
||
|
/* cannot legally encode in UTF-8 */
|
||
|
enc_ctx->out += duk_unicode_encode_xutf8(DUK_UNICODE_CP_REPLACEMENT_CHARACTER, enc_ctx->out);
|
||
|
} else if (codepoint >= 0xd800L && codepoint <= 0xdbffL) {
|
||
|
/* high surrogate */
|
||
|
if (enc_ctx->lead != 0x0000L) {
|
||
|
/* consecutive high surrogates, consider first one unpaired */
|
||
|
enc_ctx->out += duk_unicode_encode_xutf8(DUK_UNICODE_CP_REPLACEMENT_CHARACTER, enc_ctx->out);
|
||
|
}
|
||
|
enc_ctx->lead = codepoint;
|
||
|
} else if (codepoint >= 0xdc00L && codepoint <= 0xdfffL) {
|
||
|
/* low surrogate */
|
||
|
if (enc_ctx->lead != 0x0000L) {
|
||
|
codepoint = 0x010000L | ((enc_ctx->lead - 0xd800L) << 10) | (codepoint - 0xdc00L);
|
||
|
enc_ctx->out += duk_unicode_encode_xutf8(codepoint, enc_ctx->out);
|
||
|
enc_ctx->lead = 0x0000L;
|
||
|
} else {
|
||
|
/* unpaired low surrogate */
|
||
|
enc_ctx->out += duk_unicode_encode_xutf8(DUK_UNICODE_CP_REPLACEMENT_CHARACTER, enc_ctx->out);
|
||
|
}
|
||
|
} else {
|
||
|
if (enc_ctx->lead != 0x0000L) {
|
||
|
/* unpaired high surrogate */
|
||
|
enc_ctx->out += duk_unicode_encode_xutf8(DUK_UNICODE_CP_REPLACEMENT_CHARACTER, enc_ctx->out);
|
||
|
enc_ctx->lead = 0x0000L;
|
||
|
}
|
||
|
enc_ctx->out += duk_unicode_encode_xutf8(codepoint, enc_ctx->out);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
DUK_INTERNAL duk_ret_t duk_bi_textencoder_constructor(duk_context *ctx) {
|
||
|
/*
|
||
|
* TextEncoder currently requires no persistent state, so the constructor
|
||
|
* does nothing on purpose.
|
||
|
*/
|
||
|
|
||
|
if (!duk_is_constructor_call(ctx)) {
|
||
|
duk_error(ctx, DUK_ERR_TYPE_ERROR, "constructor must use 'new'");
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
DUK_INTERNAL duk_ret_t duk_bi_textencoder_prototype_encoding_getter(duk_context *ctx) {
|
||
|
duk_push_string(ctx, "utf-8");
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
DUK_INTERNAL duk_ret_t duk_bi_textencoder_prototype_encode(duk_context *ctx) {
|
||
|
duk__encode_context enc_ctx;
|
||
|
duk_size_t len;
|
||
|
duk_uint8_t *output;
|
||
|
|
||
|
if (!duk_is_undefined(ctx, 0)) {
|
||
|
duk_to_string(ctx, 0);
|
||
|
} else {
|
||
|
duk_push_string(ctx, "");
|
||
|
duk_replace(ctx, 0);
|
||
|
}
|
||
|
|
||
|
len = duk_get_length(ctx, 0);
|
||
|
if (len >= DUK_HBUFFER_MAX_BYTELEN / 3) {
|
||
|
duk_error(ctx, DUK_ERR_TYPE_ERROR, "input too large");
|
||
|
}
|
||
|
|
||
|
/* Allowance is 3*len because all bytes can potentially be replaced with
|
||
|
* U+FFFD--which rather inconveniently encodes to 3 bytes in UTF-8.
|
||
|
*
|
||
|
* XXX: The buffer allocation strategy used here is rather inefficient.
|
||
|
* Maybe switch to a chunk-based strategy, or preprocess the string to
|
||
|
* figure out the space needed ahead of time?
|
||
|
*/
|
||
|
output = (duk_uint8_t *) duk_push_dynamic_buffer(ctx, 3 * len);
|
||
|
|
||
|
/* XXX: duk_decode_string() is used to process the input string. For strings
|
||
|
* originating in C, this means we will see the string the same way Ecmascript
|
||
|
* code would. But maybe we need something stricter in this case?
|
||
|
*/
|
||
|
enc_ctx.lead = 0x0000L;
|
||
|
enc_ctx.out = output;
|
||
|
duk_decode_string(ctx, 0, duk__utf8_encode_char, (void *) &enc_ctx);
|
||
|
if (enc_ctx.lead != 0x0000L) {
|
||
|
/* unpaired high surrogate at end of string */
|
||
|
enc_ctx.out += duk_unicode_encode_xutf8(DUK_UNICODE_CP_REPLACEMENT_CHARACTER, enc_ctx.out);
|
||
|
DUK_ASSERT(enc_ctx.out <= output + (3 * len));
|
||
|
}
|
||
|
|
||
|
#if defined(DUK_USE_BUFFEROBJECT_SUPPORT)
|
||
|
duk_push_buffer_object(ctx, -1, 0, (duk_size_t) (enc_ctx.out - output), DUK_BUFOBJ_UINT8ARRAY);
|
||
|
#else
|
||
|
/* shrink buffer to include only the encoded text */
|
||
|
duk_resize_buffer(ctx, -1, (duk_size_t) (enc_ctx.out - output));
|
||
|
#endif
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
DUK_INTERNAL duk_ret_t duk_bi_textdecoder_constructor(duk_context *ctx) {
|
||
|
duk__decode_context *dec_ctx;
|
||
|
duk_bool_t fatal = 0;
|
||
|
duk_bool_t ignore_bom = 0;
|
||
|
|
||
|
if (!duk_is_constructor_call(ctx)) {
|
||
|
duk_error(ctx, DUK_ERR_TYPE_ERROR, "constructor must use 'new'");
|
||
|
}
|
||
|
if (!duk_is_undefined(ctx, 0)) {
|
||
|
/* XXX: for now, 'label' argument is ignored */
|
||
|
duk_to_string(ctx, 0);
|
||
|
}
|
||
|
if (!duk_is_null_or_undefined(ctx, 1)) {
|
||
|
if (duk_get_prop_string(ctx, 1, "fatal")) {
|
||
|
fatal = duk_to_boolean(ctx, -1);
|
||
|
}
|
||
|
if (duk_get_prop_string(ctx, 1, "ignoreBOM")) {
|
||
|
ignore_bom = duk_to_boolean(ctx, -1);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
dec_ctx = (duk__decode_context *) duk_push_fixed_buffer(ctx, sizeof(duk__decode_context));
|
||
|
dec_ctx->fatal = fatal;
|
||
|
dec_ctx->ignore_bom = ignore_bom;
|
||
|
duk__utf8_decode_init(dec_ctx);
|
||
|
|
||
|
duk_push_this(ctx);
|
||
|
duk_dup(ctx, -2);
|
||
|
duk_put_prop_string(ctx, -2, "\xff" "Context");
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
DUK_INTERNAL duk_ret_t duk_bi_textdecoder_prototype_encoding_getter(duk_context *ctx) {
|
||
|
duk__decode_context *dec_ctx;
|
||
|
|
||
|
duk_push_this(ctx);
|
||
|
duk_get_prop_string(ctx, -1, "\xff" "Context");
|
||
|
dec_ctx = (duk__decode_context *) duk_require_buffer(ctx, -1, NULL);
|
||
|
DUK_ASSERT(dec_ctx != NULL);
|
||
|
|
||
|
/* XXX: change to look up from dec_ctx once other encodings are supported */
|
||
|
DUK_UNREF(dec_ctx);
|
||
|
duk_push_string(ctx, "utf-8");
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
DUK_INTERNAL duk_ret_t duk_bi_textdecoder_prototype_fatal_getter(duk_context *ctx) {
|
||
|
duk__decode_context *dec_ctx;
|
||
|
|
||
|
duk_push_this(ctx);
|
||
|
duk_get_prop_string(ctx, -1, "\xff" "Context");
|
||
|
dec_ctx = (duk__decode_context *) duk_require_buffer(ctx, -1, NULL);
|
||
|
DUK_ASSERT(dec_ctx != NULL);
|
||
|
|
||
|
duk_push_boolean(ctx, dec_ctx->fatal);
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
DUK_INTERNAL duk_ret_t duk_bi_textdecoder_prototype_ignorebom_getter(duk_context *ctx) {
|
||
|
duk__decode_context *dec_ctx;
|
||
|
|
||
|
duk_push_this(ctx);
|
||
|
duk_get_prop_string(ctx, -1, "\xff" "Context");
|
||
|
dec_ctx = (duk__decode_context *) duk_require_buffer(ctx, -1, NULL);
|
||
|
|
||
|
duk_push_boolean(ctx, dec_ctx->ignore_bom);
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
DUK_INTERNAL duk_ret_t duk_bi_textdecoder_prototype_decode(duk_context *ctx) {
|
||
|
const duk_uint8_t *input = (const duk_uint8_t *) "";
|
||
|
duk__decode_context *dec_ctx;
|
||
|
duk_size_t len = 0;
|
||
|
duk_bool_t stream = 0;
|
||
|
duk_codepoint_t codepoint;
|
||
|
int ret;
|
||
|
duk_uint8_t *output;
|
||
|
const duk_uint8_t *in;
|
||
|
duk_uint8_t *out;
|
||
|
|
||
|
if (!duk_is_undefined(ctx, 0)) {
|
||
|
input = (const duk_uint8_t *) duk_require_buffer_data(ctx, 0, &len);
|
||
|
}
|
||
|
if (!duk_is_null_or_undefined(ctx, 1)) {
|
||
|
if (duk_get_prop_string(ctx, 1, "stream")) {
|
||
|
stream = duk_to_boolean(ctx, -1);
|
||
|
}
|
||
|
}
|
||
|
duk_push_this(ctx);
|
||
|
duk_get_prop_string(ctx, -1, "\xff" "Context");
|
||
|
dec_ctx = (duk__decode_context *) duk_require_buffer(ctx, -1, NULL);
|
||
|
DUK_ASSERT(dec_ctx != NULL);
|
||
|
|
||
|
if (len >= (DUK_HBUFFER_MAX_BYTELEN / 3) - 3) {
|
||
|
duk_error(ctx, DUK_ERR_TYPE_ERROR, "input too large");
|
||
|
}
|
||
|
|
||
|
/* Allowance is 3*len in the general case because all bytes may potentially
|
||
|
* become U+FFFD. If the first byte completes a non-BMP codepoint it will
|
||
|
* decode to a CESU-8 surrogate pair (6 bytes) so we allow 3 extra bytes to
|
||
|
* compensate: (1*3)+3 = 6. Non-BMP codepoints are safe otherwise because
|
||
|
* the 4->6 expansion is well under the 3x allowance.
|
||
|
*
|
||
|
* XXX: As with TextEncoder, need a better buffer allocation strategy here.
|
||
|
*/
|
||
|
output = (duk_uint8_t *) duk_push_fixed_buffer(ctx, 3 + (3 * len));
|
||
|
|
||
|
in = input;
|
||
|
out = output;
|
||
|
DUK_ASSERT(DUK__U8_RETRY > DUK__U8_ERROR);
|
||
|
while (in < input + len) {
|
||
|
ret = duk__utf8_decode_next(dec_ctx, *in++, &codepoint);
|
||
|
if (ret == DUK__U8_CONTINUE) {
|
||
|
continue;
|
||
|
}
|
||
|
if (ret >= DUK__U8_ERROR) {
|
||
|
/* decoding error */
|
||
|
DUK_ASSERT(ret == DUK__U8_ERROR || ret == DUK__U8_RETRY);
|
||
|
if (dec_ctx->fatal) {
|
||
|
/* fatal mode: throw a TypeError */
|
||
|
goto fatal;
|
||
|
} else {
|
||
|
/* replacement mode: replace with U+FFFD */
|
||
|
codepoint = DUK_UNICODE_CP_REPLACEMENT_CHARACTER;
|
||
|
}
|
||
|
}
|
||
|
if (ret == DUK__U8_RETRY) {
|
||
|
--in; /* retry last byte */
|
||
|
}
|
||
|
if (!dec_ctx->bom_seen) {
|
||
|
dec_ctx->bom_seen = 1;
|
||
|
if (codepoint == 0xfeffL && !dec_ctx->ignore_bom) {
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
out += duk_unicode_encode_cesu8(codepoint, out);
|
||
|
DUK_ASSERT(out <= output + (3 + (3 * len)));
|
||
|
}
|
||
|
|
||
|
if (!stream) {
|
||
|
if (dec_ctx->needed != 0) {
|
||
|
/* truncated sequence at end of buffer */
|
||
|
if (dec_ctx->fatal) {
|
||
|
goto fatal;
|
||
|
} else {
|
||
|
out += duk_unicode_encode_cesu8(DUK_UNICODE_CP_REPLACEMENT_CHARACTER, out);
|
||
|
DUK_ASSERT(out <= output + (3 + (3 * len)));
|
||
|
}
|
||
|
}
|
||
|
duk__utf8_decode_init(dec_ctx);
|
||
|
}
|
||
|
|
||
|
duk_push_lstring(ctx, (const char *) output, (duk_size_t) (out - output));
|
||
|
return 1;
|
||
|
|
||
|
fatal:
|
||
|
duk_error(ctx, DUK_ERR_TYPE_ERROR, "cannot decode as utf-8");
|
||
|
DUK_UNREACHABLE();
|
||
|
}
|
||
|
#endif /* DUK_USE_ENCODING_BUILTINS */
|