diff --git a/src/duk_api_string.c b/src/duk_api_string.c index 8d371e02..c0859ccd 100644 --- a/src/duk_api_string.c +++ b/src/duk_api_string.c @@ -159,7 +159,7 @@ void duk_trim(duk_context *ctx, int index) { duk_hstring *h; duk_uint8_t *p, *p_start, *p_end, *p_tmp1, *p_tmp2; /* pointers for scanning */ duk_uint8_t *q_start, *q_end; /* start (incl) and end (excl) of trimmed part */ - duk_uint32_t cp; + duk_codepoint_t cp; index = duk_require_normalize_index(ctx, index); h = duk_require_hstring(ctx, index); @@ -171,8 +171,7 @@ void duk_trim(duk_context *ctx, int index) { p = p_start; while (p < p_end) { p_tmp1 = p; - /* FIXME: duk_codepoint_t */ - cp = (duk_uint32_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp1, p_start, p_end); + cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp1, p_start, p_end); if (!(duk_unicode_is_whitespace(cp) || duk_unicode_is_line_terminator(cp))) { break; } @@ -196,8 +195,7 @@ void duk_trim(duk_context *ctx, int index) { } p_tmp2 = p; - /* FIXME: duk_codepoint_t */ - cp = (duk_uint32_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp2, p_start, p_end); + cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp2, p_start, p_end); if (!(duk_unicode_is_whitespace(cp) || duk_unicode_is_line_terminator(cp))) { p = p_tmp1; break; diff --git a/src/duk_builtin_global.c b/src/duk_builtin_global.c index 6cf69dfe..94ad4657 100644 --- a/src/duk_builtin_global.c +++ b/src/duk_builtin_global.c @@ -133,7 +133,7 @@ static int transform_helper(duk_context *ctx, transform_callback callback, void tfm_ctx->p = tfm_ctx->p_start; while (tfm_ctx->p < tfm_ctx->p_end) { - cp = duk_unicode_decode_xutf8_checked(thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end); + cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end); callback(tfm_ctx, udata, cp); } @@ -149,22 +149,24 @@ static void transform_callback_encode_uri(duk_transform_context *tfm_ctx, void * duk_small_int_t i, t; duk_uint8_t *unescaped_table = (duk_uint8_t *) udata; - if ((cp < 128) && CHECK_BITMASK(unescaped_table, cp)) { + if (cp < 0) { + goto uri_error; + } else if ((cp < 0x80L) && CHECK_BITMASK(unescaped_table, cp)) { duk_hbuffer_append_byte(tfm_ctx->thr, tfm_ctx->h_buf, (duk_uint8_t) cp); return; - } else if (cp >= 0xdc00UL && cp <= 0xdfffUL) { + } else if (cp >= 0xdc00L && cp <= 0xdfffL) { goto uri_error; - } else if (cp >= 0xd800UL && cp <= 0xdbffUL) { + } else if (cp >= 0xd800L && cp <= 0xdbffL) { /* Needs lookahead */ - if (duk_unicode_decode_xutf8(tfm_ctx->thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end, &cp2) == 0) { + if (duk_unicode_decode_xutf8(tfm_ctx->thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end, (duk_ucodepoint_t *) &cp2) == 0) { goto uri_error; } - if (!(cp2 >= 0xdc00UL && cp2 <= 0xdfffUL)) { + if (!(cp2 >= 0xdc00L && cp2 <= 0xdfffL)) { goto uri_error; } cp1 = cp; - cp = ((cp1 - 0xd800UL) << 10) + (cp2 - 0xdc00UL) + 0x10000UL; - } else if (cp > 0x10ffffUL) { + cp = ((cp1 - 0xd800L) << 10) + (cp2 - 0xdc00L) + 0x10000L; + } else if (cp > 0x10ffffL) { /* Although we can allow non-BMP characters (they'll decode * back into surrogate pairs), we don't allow extended UTF-8 * characters; they would encode to URIs which won't decode @@ -179,7 +181,7 @@ static void transform_callback_encode_uri(duk_transform_context *tfm_ctx, void * ; } - len = duk_unicode_encode_xutf8(cp, xutf8_buf); + len = duk_unicode_encode_xutf8((duk_ucodepoint_t) cp, xutf8_buf); buf[0] = (duk_uint8_t) '%'; for (i = 0; i < len; i++) { t = (int) xutf8_buf[i]; @@ -284,7 +286,7 @@ static void transform_callback_decode_uri(duk_transform_context *tfm_ctx, void * DUK_DDDPRINT("final cp=%d, min_cp=%d", cp, min_cp); - if (cp < min_cp || cp > 0x10ffffUL || (cp >= 0xd800UL && cp <= 0xdfffUL)) { + if (cp < min_cp || cp > 0x10ffffL || (cp >= 0xd800L && cp <= 0xdfffL)) { goto uri_error; } @@ -297,18 +299,18 @@ static void transform_callback_decode_uri(duk_transform_context *tfm_ctx, void * */ /* utf-8 validation ensures these */ - DUK_ASSERT(cp >= 0x80UL && cp <= 0x10ffffUL); + DUK_ASSERT(cp >= 0x80L && cp <= 0x10ffffL); - if (cp >= 0x10000UL) { - cp -= 0x10000UL; + if (cp >= 0x10000L) { + cp -= 0x10000L; DUK_ASSERT(cp < 0x100000UL); - duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (cp >> 10) + 0xd800UL); - duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (cp & 0x03ffUL) + 0xdc00UL); + duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (duk_ucodepoint_t) ((cp >> 10) + 0xd800L)); + duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (duk_ucodepoint_t) ((cp & 0x03ffUL) + 0xdc00L)); } else { - duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, cp); + duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (duk_ucodepoint_t) cp); } } else { - duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, cp); + duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (duk_ucodepoint_t) cp); } return; @@ -321,15 +323,17 @@ static void transform_callback_escape(duk_transform_context *tfm_ctx, void *udat duk_uint8_t buf[6]; duk_small_int_t len; - if ((cp < 0x80UL) && CHECK_BITMASK(escape_unescaped_table, cp)) { + if (cp < 0) { + goto esc_error; + } else if ((cp < 0x80L) && CHECK_BITMASK(escape_unescaped_table, cp)) { buf[0] = (duk_uint8_t) cp; len = 1; - } else if (cp < 0x100UL) { + } else if (cp < 0x100L) { buf[0] = (duk_uint8_t) '%'; buf[1] = (duk_uint8_t) duk_uc_nybbles[cp >> 4]; buf[2] = (duk_uint8_t) duk_uc_nybbles[cp & 0x0f]; len = 3; - } else if (cp < 0x10000UL) { + } else if (cp < 0x10000L) { buf[0] = (duk_uint8_t) '%'; buf[1] = (duk_uint8_t) 'u'; buf[2] = (duk_uint8_t) duk_uc_nybbles[cp >> 12]; diff --git a/src/duk_features.h b/src/duk_features.h index c9b597d7..d176689b 100644 --- a/src/duk_features.h +++ b/src/duk_features.h @@ -571,11 +571,14 @@ typedef int duk_small_int_t; typedef unsigned int duk_small_uint_t; /* Codepoint type. Must be 32 bits or more because it is used also for - * internal codepoints. Signed codepoints are needed internally in some - * algorithms (e.g. negative value used as a marker). - */ -typedef duk_uint_fast32_t duk_codepoint_t; -typedef duk_int_fast32_t duk_signed_codepoint_t; + * internal codepoints. The type is signed because negative codepoints + * are used as internal markers (e.g. to mark EOF or missing argument). + * (X)UTF-8/CESU-8 encode/decode take and return an unsigned variant to + * ensure duk_uint32_t casts back and forth nicely. Almost everything + * else uses the signed one. + */ +typedef duk_int_fast32_t duk_codepoint_t; +typedef duk_uint_fast32_t duk_ucodepoint_t; /* IEEE double typedef. */ typedef double duk_double_t; diff --git a/src/duk_hbuffer_ops.c b/src/duk_hbuffer_ops.c index 21c43f5c..2bce8773 100644 --- a/src/duk_hbuffer_ops.c +++ b/src/duk_hbuffer_ops.c @@ -200,7 +200,7 @@ size_t duk_hbuffer_insert_xutf8(duk_hthread *thr, duk_hbuffer_dynamic *buf, size /* Intentionally no fast path: insertion is not that central */ /* FIXME: cp -> duk_codepoint_t */ - len = (size_t) duk_unicode_encode_xutf8(codepoint, tmp); + len = (size_t) duk_unicode_encode_xutf8((duk_ucodepoint_t) codepoint, tmp); duk_hbuffer_insert_bytes(thr, buf, offset, tmp, len); return len; } @@ -223,7 +223,7 @@ size_t duk_hbuffer_insert_cesu8(duk_hthread *thr, duk_hbuffer_dynamic *buf, size /* Intentionally no fast path: insertion is not that central */ /* FIXME: cp -> duk_codepoint_t */ - len = (size_t) duk_unicode_encode_cesu8(codepoint, tmp); + len = (size_t) duk_unicode_encode_cesu8((duk_ucodepoint_t) codepoint, tmp); duk_hbuffer_insert_bytes(thr, buf, offset, tmp, len); return len; } @@ -305,7 +305,7 @@ size_t duk_hbuffer_append_xutf8(duk_hthread *thr, duk_hbuffer_dynamic *buf, duk_ } /* FIXME: cp -> duk_codepoint_t */ - len = (size_t) duk_unicode_encode_xutf8(codepoint, tmp); + len = (size_t) duk_unicode_encode_xutf8((duk_ucodepoint_t) codepoint, tmp); duk_hbuffer_insert_bytes(thr, buf, DUK_HBUFFER_GET_SIZE(buf), tmp, len); return len; } @@ -334,7 +334,7 @@ size_t duk_hbuffer_append_cesu8(duk_hthread *thr, duk_hbuffer_dynamic *buf, duk_ } /* FIXME: cp -> duk_codepoint_t */ - len = (size_t) duk_unicode_encode_cesu8(codepoint, tmp); + len = (size_t) duk_unicode_encode_cesu8((duk_ucodepoint_t) codepoint, tmp); duk_hbuffer_insert_bytes(thr, buf, DUK_HBUFFER_GET_SIZE(buf), tmp, len); return len; } diff --git a/src/duk_unicode.h b/src/duk_unicode.h index eaaeb397..4eb23375 100644 --- a/src/duk_unicode.h +++ b/src/duk_unicode.h @@ -43,16 +43,16 @@ extern duk_uint16_t duk_unicode_re_ranges_not_wordchar[10]; * Prototypes */ -duk_small_int_t duk_unicode_get_xutf8_length(duk_codepoint_t cp); -duk_small_int_t duk_unicode_encode_xutf8(duk_codepoint_t cp, duk_uint8_t *out); -duk_small_int_t duk_unicode_encode_cesu8(duk_codepoint_t cp, duk_uint8_t *out); -duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_codepoint_t *out_cp); -duk_codepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end); +duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp); +duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out); +duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out); +duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp); +duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end); duk_size_t duk_unicode_unvalidated_utf8_length(duk_uint8_t *data, duk_size_t blen); -duk_small_int_t duk_unicode_is_whitespace(duk_signed_codepoint_t cp); -duk_small_int_t duk_unicode_is_line_terminator(duk_signed_codepoint_t cp); -duk_small_int_t duk_unicode_is_identifier_start(duk_signed_codepoint_t cp); -duk_small_int_t duk_unicode_is_identifier_part(duk_signed_codepoint_t cp); +duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp); +duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp); +duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp); +duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp); void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase); duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp); duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t cp); diff --git a/src/duk_unicode_support.c b/src/duk_unicode_support.c index 17e7e3a4..b8a6740c 100644 --- a/src/duk_unicode_support.c +++ b/src/duk_unicode_support.c @@ -9,7 +9,7 @@ * XUTF-8 and CESU-8 encoding/decoding */ -duk_small_int_t duk_unicode_get_xutf8_length(duk_codepoint_t cp) { +duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp) { duk_uint_fast32_t x = (duk_uint_fast32_t) cp; if (x < 0x80UL) { /* 7 bits */ @@ -43,7 +43,7 @@ duk_uint8_t duk_unicode_xutf8_markers[7] = { * DUK_UNICODE_MAX_XUTF8_LENGTH bytes. Allows encoding of any * 32-bit (unsigned) codepoint. */ -duk_small_int_t duk_unicode_encode_xutf8(duk_codepoint_t cp, duk_uint8_t *out) { +duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out) { duk_uint_fast32_t x = (duk_uint_fast32_t) cp; duk_small_int_t len; duk_uint8_t marker; @@ -77,7 +77,7 @@ duk_small_int_t duk_unicode_encode_xutf8(duk_codepoint_t cp, duk_uint8_t *out) { * DUK_UNICODE_MAX_CESU8_LENGTH bytes; codepoints above U+10FFFF * will encode to garbage but won't overwrite the output buffer. */ -duk_small_int_t duk_unicode_encode_cesu8(duk_codepoint_t cp, duk_uint8_t *out) { +duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out) { duk_uint_fast32_t x = (duk_uint_fast32_t) cp; duk_small_int_t len; @@ -137,7 +137,7 @@ duk_small_int_t duk_unicode_encode_cesu8(duk_codepoint_t cp, duk_uint8_t *out) { } /* Decode helper. Return zero on error. */ -duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_codepoint_t *out_cp) { +duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp) { duk_uint8_t *p; duk_uint32_t res; duk_uint_fast8_t ch; @@ -219,8 +219,8 @@ duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, du } /* used by e.g. duk_regexp_executor.c, string built-ins */ -duk_codepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end) { - duk_codepoint_t cp; +duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end) { + duk_ucodepoint_t cp; if (duk_unicode_decode_xutf8(thr, ptr, ptr_start, ptr_end, &cp)) { return cp; @@ -314,7 +314,7 @@ static duk_small_int_t uni_range_match(const duk_uint8_t *unitab, duk_size_t uni * "WhiteSpace" production check. */ -duk_small_int_t duk_unicode_is_whitespace(duk_signed_codepoint_t cp) { +duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp) { /* * E5 Section 7.2 specifies six characters specifically as * white space: @@ -400,7 +400,7 @@ duk_small_int_t duk_unicode_is_whitespace(duk_signed_codepoint_t cp) { * "LineTerminator" production check. */ -duk_small_int_t duk_unicode_is_line_terminator(duk_signed_codepoint_t cp) { +duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp) { /* * E5 Section 7.3 * @@ -420,7 +420,7 @@ duk_small_int_t duk_unicode_is_line_terminator(duk_signed_codepoint_t cp) { * "IdentifierStart" production check. */ -duk_small_int_t duk_unicode_is_identifier_start(duk_signed_codepoint_t cp) { +duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp) { /* * E5 Section 7.6: * @@ -495,7 +495,7 @@ duk_small_int_t duk_unicode_is_identifier_start(duk_signed_codepoint_t cp) { * "IdentifierPart" production check. */ -duk_small_int_t duk_unicode_is_identifier_part(duk_signed_codepoint_t cp) { +duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp) { /* * E5 Section 7.6: * @@ -704,13 +704,13 @@ static duk_codepoint_t slow_case_conversion(duk_hthread *thr, * locale/language. */ -static duk_signed_codepoint_t case_transform_helper(duk_hthread *thr, - duk_hbuffer_dynamic *buf, - duk_codepoint_t cp, - duk_signed_codepoint_t prev, - duk_signed_codepoint_t next, - duk_small_int_t uppercase, - duk_small_int_t language) { +static duk_codepoint_t case_transform_helper(duk_hthread *thr, + duk_hbuffer_dynamic *buf, + duk_codepoint_t cp, + duk_codepoint_t prev, + duk_codepoint_t next, + duk_small_int_t uppercase, + duk_small_int_t language) { duk_bitdecoder_ctx bd_ctx; /* fast path for ASCII */ @@ -786,7 +786,7 @@ void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase duk_hstring *h_input; duk_hbuffer_dynamic *h_buf; duk_uint8_t *p, *p_start, *p_end; - duk_signed_codepoint_t prev, curr, next; /* need signed type here */ + duk_codepoint_t prev, curr, next; h_input = duk_require_hstring(ctx, -1); DUK_ASSERT(h_input != NULL); @@ -846,7 +846,7 @@ void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase */ duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp) { - duk_signed_codepoint_t y; + duk_codepoint_t y; y = case_transform_helper(thr, NULL, /* buf */