change duk_unicode_xutf8_get_u32() to be non-checked; fix minor nits in global object encode/decode functions (now passes tests)

12 years ago · 9e8ca671d3
6 changed files with 49 additions and 16 deletions
--- a/src/duk_builtin_global.c
+++ b/src/duk_builtin_global.c
@@ -134,7 +134,7 @@ static int transform_helper(duk_context *ctx, transform_callback callback, void
 	tfm_ctx->p = tfm_ctx->p_start;

 	while (tfm_ctx->p < tfm_ctx->p_end) {
-		cp = duk_unicode_xutf8_get_u32(thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end);
+		cp = duk_unicode_xutf8_get_u32_checked(thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end);
 		callback(tfm_ctx, udata, cp);
 	}

@@ -157,15 +157,26 @@ static void transform_callback_encode_uri(duk_transform_context *tfm_ctx, void *
 		goto uri_error;
 	} else if (cp >= 0xd800 && cp <= 0xdbff) {
 		/* Needs lookahead */
-		/* FIXME: if fails, must be URIError */
-		cp2 = duk_unicode_xutf8_get_u32(tfm_ctx->thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end);
+		if (duk_unicode_xutf8_get_u32(tfm_ctx->thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end, &cp2) == 0) {
+			goto uri_error;
+		}
 		if (!(cp2 >= 0xdc00 && cp2 <= 0xdfff)) {
 			goto uri_error;
 		}
 		cp1 = cp;
 		cp = ((cp1 - 0xd800) << 10) + (cp2 - 0xdc00) + 0x10000;
+	} else if (cp > 0x10ffff) {
+		/* Although we can allow non-BMP characters (they'll decode
+		 * back into surrogate pairs), we don't allow extended UTF-8
+		 * characters; they would encode to URIs which won't decode
+		 * back because of strict UTF-8 checks in URI decoding.
+		 * (However, we could just as well allow them here.)
+		 */
+		goto uri_error;
 	} else {
-		/* FIXME: non-BMP? */
+		/* Non-BMP characters within valid UTF-8 range: encode as is.
+		 * They'll decode back into surrogate pairs.
+		 */
 		;
 	}

@@ -320,8 +331,7 @@ static void transform_callback_escape(duk_transform_context *tfm_ctx, void *udat
 		buf[1] = (duk_u8) duk_uc_nybbles[cp >> 4];
 		buf[2] = (duk_u8) duk_uc_nybbles[cp & 0x0f];
 		len = 3;
-	} else {
-		/* FIXME: non-BMP chars will now be clipped */
+	} else if (cp < 65536) {
 		buf[0] = (duk_u8) '%';
 		buf[1] = (duk_u8) 'u';
 		buf[2] = (duk_u8) duk_uc_nybbles[cp >> 12];
@@ -329,9 +339,20 @@ static void transform_callback_escape(duk_transform_context *tfm_ctx, void *udat
 		buf[4] = (duk_u8) duk_uc_nybbles[(cp >> 4) & 0x0f];
 		buf[5] = (duk_u8) duk_uc_nybbles[cp & 0x0f];
 		len = 6;
+	} else {
+		/* Characters outside BMP cannot be escape()'d.  We could
+		 * encode them as surrogate pairs (for codepoints inside
+		 * valid UTF-8 range, but not extended UTF-8).  Because
+		 * escape() and unescape() are legacy functions, we don't.
+		 */
+		goto esc_error;
 	}

 	duk_hbuffer_append_bytes(tfm_ctx->thr, tfm_ctx->h_buf, buf, len);
+	return;
+
+ esc_error:
+	DUK_ERROR(tfm_ctx->thr, DUK_ERR_TYPE_ERROR, "invalid input");
 }

 static void transform_callback_unescape(duk_transform_context *tfm_ctx, void *udata, duk_u32 cp) {
--- a/src/duk_builtin_json.c
+++ b/src/duk_builtin_json.c
@@ -754,7 +754,7 @@ static void json_enc_quote_string(duk_json_enc_ctx *js_ctx, duk_hstring *h_str)
 			/* slow path decode */

 			/* FIXME: this may currently fail, we'd prefer it never do that */
-			cp = duk_unicode_xutf8_get_u32(thr, &p, p_start, p_end);
+			cp = duk_unicode_xutf8_get_u32_checked(thr, &p, p_start, p_end);

 			if (js_ctx->flag_ascii_only) {
 				if (cp > 0xffff) {
--- a/src/duk_builtin_string.c
+++ b/src/duk_builtin_string.c
@@ -148,7 +148,7 @@ int duk_builtin_string_prototype_char_code_at(duk_context *ctx) {
 	DUK_DDDPRINT("p_start=%p, p_end=%p, p=%p", (void *) p_start, (void *) p_end, (void *) p);

 	/* FIXME: this may throw an error, though not for valid E5 strings - is this OK here? */
-	cp = duk_unicode_xutf8_get_u32(thr, &p, p_start, p_end);
+	cp = duk_unicode_xutf8_get_u32_checked(thr, &p, p_start, p_end);

 	/* FIXME: push_uint or push_u32 */
 	duk_push_number(ctx, (double) cp);
--- a/src/duk_regexp_executor.c
+++ b/src/duk_regexp_executor.c
@@ -17,14 +17,14 @@
 */

 static duk_u32 bc_get_u32(duk_re_matcher_ctx *re_ctx, duk_u8 **pc) {
-	return duk_unicode_xutf8_get_u32(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
+	return duk_unicode_xutf8_get_u32_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
 }

 static duk_i32 bc_get_i32(duk_re_matcher_ctx *re_ctx, duk_u8 **pc) {
 	duk_u32 t;

 	/* signed integer encoding needed to work with UTF-8 */
-	t = duk_unicode_xutf8_get_u32(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
+	t = duk_unicode_xutf8_get_u32_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
 	if (t & 1) {
 		return -(t >> 1);
 	} else {
@@ -103,7 +103,7 @@ static duk_u8 *utf8_advance(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, d
 * matching.
 */
 static duk_u32 inp_get_u32(duk_re_matcher_ctx *re_ctx, duk_u8 **sp) {
-	duk_u32 res = duk_unicode_xutf8_get_u32(re_ctx->thr, sp, re_ctx->input, re_ctx->input_end);
+	duk_u32 res = duk_unicode_xutf8_get_u32_checked(re_ctx->thr, sp, re_ctx->input, re_ctx->input_end);
 	if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {
 		res = duk_unicode_re_canonicalize_char(re_ctx->thr, res);
 	}
--- a/src/duk_unicode.h
+++ b/src/duk_unicode.h
@@ -48,7 +48,8 @@ extern duk_u16 duk_unicode_re_ranges_not_wordchar[10];
 int duk_unicode_get_xutf8_length(duk_u32 x);
 size_t duk_unicode_encode_xutf8(duk_u32 x, duk_u8 *out);
 size_t duk_unicode_encode_cesu8(duk_u32 x, duk_u8 *out);
-duk_u32 duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end);
+int duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end, duk_u32 *out_cp);
+duk_u32 duk_unicode_xutf8_get_u32_checked(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end);
 duk_u32 duk_unicode_unvalidated_utf8_length(duk_u8 *data, duk_u32 blen);
 int duk_unicode_is_whitespace(int x);
 int duk_unicode_is_line_terminator(int x);
--- a/src/duk_unicode_support.c
+++ b/src/duk_unicode_support.c
@@ -133,8 +133,8 @@ size_t duk_unicode_encode_cesu8(duk_u32 x, duk_u8 *out) {
 	return len;
 }

-/* used by e.g. duk_regexp_executor.c, string built-ins */
-duk_u32 duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end) {
+/* Decode helper.  Return zero on error. */
+int duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end, duk_u32 *out_cp) {
 	duk_u8 *p;
 	duk_u32 res;
 	int ch;
@@ -208,9 +208,20 @@ duk_u32 duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_st
 	}

 	*ptr = p;
-	return res;
+	*out_cp = res;
+	return 1;

 fail:
+	return 0;
+}
+
+/* used by e.g. duk_regexp_executor.c, string built-ins */
+duk_u32 duk_unicode_xutf8_get_u32_checked(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end) {
+	duk_u32 cp;
+
+	if (duk_unicode_xutf8_get_u32(thr, ptr, ptr_start, ptr_end, &cp)) {
+		return cp;
+	}
 	DUK_ERROR(thr, DUK_ERR_INTERNAL_ERROR, "utf-8 decode failed");
 	return 0;  /* never here */
 }
@@ -791,7 +802,7 @@ void duk_unicode_case_convert_string(duk_hthread *thr, int uppercase) {
 		curr = next;
 		next = -1;
 		if (p < p_end) {
-			next = (int) duk_unicode_xutf8_get_u32(thr, &p, p_start, p_end);
+			next = (int) duk_unicode_xutf8_get_u32_checked(thr, &p, p_start, p_end);
 		} else {
 			/* end of input and last char has been processed */
 			if (curr < 0) {