Make duk_codepoint_t a signed type and use it everywhere except in the UTF-8/CESU-8 encode/decode functions, which use the unsigned duk_ucodepoint_t type

11 years ago · 3731c7b068
6 changed files with 67 additions and 62 deletions
--- a/src/duk_api_string.c
+++ b/src/duk_api_string.c
@ -159,7 +159,7 @@ void duk_trim(duk_context *ctx, int index) {
 	duk_hstring *h;
 	duk_uint8_t *p, *p_start, *p_end, *p_tmp1, *p_tmp2;  /* pointers for scanning */
 	duk_uint8_t *q_start, *q_end;  /* start (incl) and end (excl) of trimmed part */
-	duk_uint32_t cp;
+	duk_codepoint_t cp;

 	index = duk_require_normalize_index(ctx, index);
 	h = duk_require_hstring(ctx, index);
@ -171,8 +171,7 @@ void duk_trim(duk_context *ctx, int index) {
 	p = p_start;
 	while (p < p_end) {
 		p_tmp1 = p;
-		/* FIXME: duk_codepoint_t */
-		cp = (duk_uint32_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp1, p_start, p_end);
+		cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp1, p_start, p_end);
 		if (!(duk_unicode_is_whitespace(cp) || duk_unicode_is_line_terminator(cp))) {
 			break;
 		}
@ -196,8 +195,7 @@ void duk_trim(duk_context *ctx, int index) {
 		}
 		p_tmp2 = p;

-		/* FIXME: duk_codepoint_t */
-		cp = (duk_uint32_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp2, p_start, p_end);
+		cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp2, p_start, p_end);
 		if (!(duk_unicode_is_whitespace(cp) || duk_unicode_is_line_terminator(cp))) {
 			p = p_tmp1;
 			break;
--- a/src/duk_builtin_global.c
+++ b/src/duk_builtin_global.c
@ -133,7 +133,7 @@ static int transform_helper(duk_context *ctx, transform_callback callback, void
 	tfm_ctx->p = tfm_ctx->p_start;

 	while (tfm_ctx->p < tfm_ctx->p_end) {
-		cp = duk_unicode_decode_xutf8_checked(thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end);
+		cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end);
 		callback(tfm_ctx, udata, cp);
 	}

@ -149,22 +149,24 @@ static void transform_callback_encode_uri(duk_transform_context *tfm_ctx, void *
 	duk_small_int_t i, t;
 	duk_uint8_t *unescaped_table = (duk_uint8_t *) udata;

-	if ((cp < 128) && CHECK_BITMASK(unescaped_table, cp)) {
+	if (cp < 0) {
+		goto uri_error;
+	} else if ((cp < 0x80L) && CHECK_BITMASK(unescaped_table, cp)) {
 		duk_hbuffer_append_byte(tfm_ctx->thr, tfm_ctx->h_buf, (duk_uint8_t) cp);
 		return;
-	} else if (cp >= 0xdc00UL && cp <= 0xdfffUL) {
+	} else if (cp >= 0xdc00L && cp <= 0xdfffL) {
 		goto uri_error;
-	} else if (cp >= 0xd800UL && cp <= 0xdbffUL) {
+	} else if (cp >= 0xd800L && cp <= 0xdbffL) {
 		/* Needs lookahead */
-		if (duk_unicode_decode_xutf8(tfm_ctx->thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end, &cp2) == 0) {
+		if (duk_unicode_decode_xutf8(tfm_ctx->thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end, (duk_ucodepoint_t *) &cp2) == 0) {
 			goto uri_error;
 		}
-		if (!(cp2 >= 0xdc00UL && cp2 <= 0xdfffUL)) {
+		if (!(cp2 >= 0xdc00L && cp2 <= 0xdfffL)) {
 			goto uri_error;
 		}
 		cp1 = cp;
-		cp = ((cp1 - 0xd800UL) << 10) + (cp2 - 0xdc00UL) + 0x10000UL;
-	} else if (cp > 0x10ffffUL) {
+		cp = ((cp1 - 0xd800L) << 10) + (cp2 - 0xdc00L) + 0x10000L;
+	} else if (cp > 0x10ffffL) {
 		/* Although we can allow non-BMP characters (they'll decode
 		 * back into surrogate pairs), we don't allow extended UTF-8
 		 * characters; they would encode to URIs which won't decode
@ -179,7 +181,7 @@ static void transform_callback_encode_uri(duk_transform_context *tfm_ctx, void *
 		;
 	}

-	len = duk_unicode_encode_xutf8(cp, xutf8_buf);
+	len = duk_unicode_encode_xutf8((duk_ucodepoint_t) cp, xutf8_buf);
 	buf[0] = (duk_uint8_t) '%';
 	for (i = 0; i < len; i++) {
 		t = (int) xutf8_buf[i];
@ -284,7 +286,7 @@ static void transform_callback_decode_uri(duk_transform_context *tfm_ctx, void *

 		DUK_DDDPRINT("final cp=%d, min_cp=%d", cp, min_cp);

-		if (cp < min_cp || cp > 0x10ffffUL || (cp >= 0xd800UL && cp <= 0xdfffUL)) {
+		if (cp < min_cp || cp > 0x10ffffL || (cp >= 0xd800L && cp <= 0xdfffL)) {
 			goto uri_error;
 		}

@ -297,18 +299,18 @@ static void transform_callback_decode_uri(duk_transform_context *tfm_ctx, void *
 		 */

 		/* utf-8 validation ensures these */
-		DUK_ASSERT(cp >= 0x80UL && cp <= 0x10ffffUL);
+		DUK_ASSERT(cp >= 0x80L && cp <= 0x10ffffL);

-		if (cp >= 0x10000UL) {
-			cp -= 0x10000UL;
+		if (cp >= 0x10000L) {
+			cp -= 0x10000L;
 			DUK_ASSERT(cp < 0x100000UL);
-			duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (cp >> 10) + 0xd800UL);
-			duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (cp & 0x03ffUL) + 0xdc00UL);
+			duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (duk_ucodepoint_t) ((cp >> 10) + 0xd800L));
+			duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (duk_ucodepoint_t) ((cp & 0x03ffUL) + 0xdc00L));
 		} else {
-			duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, cp);
+			duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (duk_ucodepoint_t) cp);
 		}
 	} else {
-		duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, cp);
+		duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (duk_ucodepoint_t) cp);
 	}
 	return;

@ -321,15 +323,17 @@ static void transform_callback_escape(duk_transform_context *tfm_ctx, void *udat
 	duk_uint8_t buf[6];
 	duk_small_int_t len;

-	if ((cp < 0x80UL) && CHECK_BITMASK(escape_unescaped_table, cp)) {
+	if (cp < 0) {
+		goto esc_error;
+	} else if ((cp < 0x80L) && CHECK_BITMASK(escape_unescaped_table, cp)) {
 		buf[0] = (duk_uint8_t) cp;
 		len = 1;
-	} else if (cp < 0x100UL) {
+	} else if (cp < 0x100L) {
 		buf[0] = (duk_uint8_t) '%';
 		buf[1] = (duk_uint8_t) duk_uc_nybbles[cp >> 4];
 		buf[2] = (duk_uint8_t) duk_uc_nybbles[cp & 0x0f];
 		len = 3;
-	} else if (cp < 0x10000UL) {
+	} else if (cp < 0x10000L) {
 		buf[0] = (duk_uint8_t) '%';
 		buf[1] = (duk_uint8_t) 'u';
 		buf[2] = (duk_uint8_t) duk_uc_nybbles[cp >> 12];
--- a/src/duk_features.h
+++ b/src/duk_features.h
@ -571,11 +571,14 @@ typedef int duk_small_int_t;
 typedef unsigned int duk_small_uint_t;

 /* Codepoint type.  Must be 32 bits or more because it is used also for
- * internal codepoints.  Signed codepoints are needed internally in some
- * algorithms (e.g. negative value used as a marker).
- */
-typedef duk_uint_fast32_t duk_codepoint_t;
-typedef duk_int_fast32_t duk_signed_codepoint_t;
+ * internal codepoints.  The type is signed because negative codepoints
+ * are used as internal markers (e.g. to mark EOF or missing argument).
+ * (X)UTF-8/CESU-8 encode/decode take and return an unsigned variant to
+ * ensure duk_uint32_t casts back and forth nicely.  Almost everything
+ * else uses the signed one.
+ */
+typedef duk_int_fast32_t duk_codepoint_t;
+typedef duk_uint_fast32_t duk_ucodepoint_t;

 /* IEEE double typedef. */
 typedef double duk_double_t;
--- a/src/duk_hbuffer_ops.c
+++ b/src/duk_hbuffer_ops.c
@ -200,7 +200,7 @@ size_t duk_hbuffer_insert_xutf8(duk_hthread *thr, duk_hbuffer_dynamic *buf, size
 	/* Intentionally no fast path: insertion is not that central */

 	/* FIXME: cp -> duk_codepoint_t */
-	len = (size_t) duk_unicode_encode_xutf8(codepoint, tmp);
+	len = (size_t) duk_unicode_encode_xutf8((duk_ucodepoint_t) codepoint, tmp);
 	duk_hbuffer_insert_bytes(thr, buf, offset, tmp, len);
 	return len;
 }
@ -223,7 +223,7 @@ size_t duk_hbuffer_insert_cesu8(duk_hthread *thr, duk_hbuffer_dynamic *buf, size
 	/* Intentionally no fast path: insertion is not that central */

 	/* FIXME: cp -> duk_codepoint_t */
-	len = (size_t) duk_unicode_encode_cesu8(codepoint, tmp);
+	len = (size_t) duk_unicode_encode_cesu8((duk_ucodepoint_t) codepoint, tmp);
 	duk_hbuffer_insert_bytes(thr, buf, offset, tmp, len);
 	return len;
 }
@ -305,7 +305,7 @@ size_t duk_hbuffer_append_xutf8(duk_hthread *thr, duk_hbuffer_dynamic *buf, duk_
 	}

 	/* FIXME: cp -> duk_codepoint_t */
-	len = (size_t) duk_unicode_encode_xutf8(codepoint, tmp);
+	len = (size_t) duk_unicode_encode_xutf8((duk_ucodepoint_t) codepoint, tmp);
 	duk_hbuffer_insert_bytes(thr, buf, DUK_HBUFFER_GET_SIZE(buf), tmp, len);
 	return len;
 }
@ -334,7 +334,7 @@ size_t duk_hbuffer_append_cesu8(duk_hthread *thr, duk_hbuffer_dynamic *buf, duk_
 	}

 	/* FIXME: cp -> duk_codepoint_t */
-	len = (size_t) duk_unicode_encode_cesu8(codepoint, tmp);
+	len = (size_t) duk_unicode_encode_cesu8((duk_ucodepoint_t) codepoint, tmp);
 	duk_hbuffer_insert_bytes(thr, buf, DUK_HBUFFER_GET_SIZE(buf), tmp, len);
 	return len;
 }
--- a/src/duk_unicode.h
+++ b/src/duk_unicode.h
@ -43,16 +43,16 @@ extern duk_uint16_t duk_unicode_re_ranges_not_wordchar[10];
 *  Prototypes
 */

-duk_small_int_t duk_unicode_get_xutf8_length(duk_codepoint_t cp);
-duk_small_int_t duk_unicode_encode_xutf8(duk_codepoint_t cp, duk_uint8_t *out);
-duk_small_int_t duk_unicode_encode_cesu8(duk_codepoint_t cp, duk_uint8_t *out);
-duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_codepoint_t *out_cp);
-duk_codepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end);
+duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp);
+duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out);
+duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out);
+duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp);
+duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end);
 duk_size_t duk_unicode_unvalidated_utf8_length(duk_uint8_t *data, duk_size_t blen);
-duk_small_int_t duk_unicode_is_whitespace(duk_signed_codepoint_t cp);
-duk_small_int_t duk_unicode_is_line_terminator(duk_signed_codepoint_t cp);
-duk_small_int_t duk_unicode_is_identifier_start(duk_signed_codepoint_t cp);
-duk_small_int_t duk_unicode_is_identifier_part(duk_signed_codepoint_t cp);
+duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp);
+duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp);
+duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp);
+duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp);
 void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase);
 duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp);
 duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t cp);
--- a/src/duk_unicode_support.c
+++ b/src/duk_unicode_support.c
@ -9,7 +9,7 @@
 *  XUTF-8 and CESU-8 encoding/decoding
 */

-duk_small_int_t duk_unicode_get_xutf8_length(duk_codepoint_t cp) {
+duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp) {
 	duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
 	if (x < 0x80UL) {
 		/* 7 bits */
@ -43,7 +43,7 @@ duk_uint8_t duk_unicode_xutf8_markers[7] = {
 * DUK_UNICODE_MAX_XUTF8_LENGTH bytes.  Allows encoding of any
 * 32-bit (unsigned) codepoint.
 */
-duk_small_int_t duk_unicode_encode_xutf8(duk_codepoint_t cp, duk_uint8_t *out) {
+duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out) {
 	duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
 	duk_small_int_t len;
 	duk_uint8_t marker;
@ -77,7 +77,7 @@ duk_small_int_t duk_unicode_encode_xutf8(duk_codepoint_t cp, duk_uint8_t *out) {
 * DUK_UNICODE_MAX_CESU8_LENGTH bytes; codepoints above U+10FFFF
 * will encode to garbage but won't overwrite the output buffer.
 */
-duk_small_int_t duk_unicode_encode_cesu8(duk_codepoint_t cp, duk_uint8_t *out) {
+duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out) {
 	duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
 	duk_small_int_t len;

@ -137,7 +137,7 @@ duk_small_int_t duk_unicode_encode_cesu8(duk_codepoint_t cp, duk_uint8_t *out) {
 }

 /* Decode helper.  Return zero on error. */
-duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_codepoint_t *out_cp) {
+duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp) {
 	duk_uint8_t *p;
 	duk_uint32_t res;
 	duk_uint_fast8_t ch;
@ -219,8 +219,8 @@ duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, du
 }

 /* used by e.g. duk_regexp_executor.c, string built-ins */
-duk_codepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end) {
-	duk_codepoint_t cp;
+duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end) {
+	duk_ucodepoint_t cp;

 	if (duk_unicode_decode_xutf8(thr, ptr, ptr_start, ptr_end, &cp)) {
 		return cp;
@ -314,7 +314,7 @@ static duk_small_int_t uni_range_match(const duk_uint8_t *unitab, duk_size_t uni
 *  "WhiteSpace" production check.
 */

-duk_small_int_t duk_unicode_is_whitespace(duk_signed_codepoint_t cp) {
+duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp) {
 	/*
 	 *  E5 Section 7.2 specifies six characters specifically as
 	 *  white space:
@ -400,7 +400,7 @@ duk_small_int_t duk_unicode_is_whitespace(duk_signed_codepoint_t cp) {
 *  "LineTerminator" production check.
 */

-duk_small_int_t duk_unicode_is_line_terminator(duk_signed_codepoint_t cp) {
+duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp) {
 	/*
 	 *  E5 Section 7.3
 	 *
@ -420,7 +420,7 @@ duk_small_int_t duk_unicode_is_line_terminator(duk_signed_codepoint_t cp) {
 *  "IdentifierStart" production check.
 */

-duk_small_int_t duk_unicode_is_identifier_start(duk_signed_codepoint_t cp) {
+duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp) {
 	/*
 	 *  E5 Section 7.6:
 	 *
@ -495,7 +495,7 @@ duk_small_int_t duk_unicode_is_identifier_start(duk_signed_codepoint_t cp) {
 *  "IdentifierPart" production check.
 */

-duk_small_int_t duk_unicode_is_identifier_part(duk_signed_codepoint_t cp) {
+duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp) {
 	/*
 	 *  E5 Section 7.6:
 	 *
@ -704,11 +704,11 @@ static duk_codepoint_t slow_case_conversion(duk_hthread *thr,
 *  locale/language.
 */

-static duk_signed_codepoint_t case_transform_helper(duk_hthread *thr,
+static duk_codepoint_t case_transform_helper(duk_hthread *thr,
                                             duk_hbuffer_dynamic *buf,
                                             duk_codepoint_t cp,
-                                                    duk_signed_codepoint_t prev,
-                                                    duk_signed_codepoint_t next,
+                                             duk_codepoint_t prev,
+                                             duk_codepoint_t next,
                                             duk_small_int_t uppercase,
                                             duk_small_int_t language) {
 	duk_bitdecoder_ctx bd_ctx;
@ -786,7 +786,7 @@ void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase
 	duk_hstring *h_input;
 	duk_hbuffer_dynamic *h_buf;
 	duk_uint8_t *p, *p_start, *p_end;
-	duk_signed_codepoint_t prev, curr, next;  /* need signed type here */
+	duk_codepoint_t prev, curr, next;

 	h_input = duk_require_hstring(ctx, -1);
 	DUK_ASSERT(h_input != NULL);
@ -846,7 +846,7 @@ void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase
 */

 duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp) {
-	duk_signed_codepoint_t y;
+	duk_codepoint_t y;

 	y = case_transform_helper(thr,
 	                          NULL,    /* buf */