Browse Source

Make duk_codepoint_t a signed type and use it everywhere except in the UTF-8/CESU-8 encode/decode functions, which use the unsigned duk_ucodepoint_t type.

pull/1/head
Sami Vaarala 11 years ago
parent
commit
3731c7b068
  1. 8
      src/duk_api_string.c
  2. 44
      src/duk_builtin_global.c
  3. 13
      src/duk_features.h
  4. 8
      src/duk_hbuffer_ops.c
  5. 18
      src/duk_unicode.h
  6. 38
      src/duk_unicode_support.c

8
src/duk_api_string.c

@ -159,7 +159,7 @@ void duk_trim(duk_context *ctx, int index) {
duk_hstring *h;
duk_uint8_t *p, *p_start, *p_end, *p_tmp1, *p_tmp2; /* pointers for scanning */
duk_uint8_t *q_start, *q_end; /* start (incl) and end (excl) of trimmed part */
duk_uint32_t cp;
duk_codepoint_t cp;
index = duk_require_normalize_index(ctx, index);
h = duk_require_hstring(ctx, index);
@ -171,8 +171,7 @@ void duk_trim(duk_context *ctx, int index) {
p = p_start;
while (p < p_end) {
p_tmp1 = p;
/* FIXME: duk_codepoint_t */
cp = (duk_uint32_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp1, p_start, p_end);
cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp1, p_start, p_end);
if (!(duk_unicode_is_whitespace(cp) || duk_unicode_is_line_terminator(cp))) {
break;
}
@ -196,8 +195,7 @@ void duk_trim(duk_context *ctx, int index) {
}
p_tmp2 = p;
/* FIXME: duk_codepoint_t */
cp = (duk_uint32_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp2, p_start, p_end);
cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp2, p_start, p_end);
if (!(duk_unicode_is_whitespace(cp) || duk_unicode_is_line_terminator(cp))) {
p = p_tmp1;
break;

44
src/duk_builtin_global.c

@ -133,7 +133,7 @@ static int transform_helper(duk_context *ctx, transform_callback callback, void
tfm_ctx->p = tfm_ctx->p_start;
while (tfm_ctx->p < tfm_ctx->p_end) {
cp = duk_unicode_decode_xutf8_checked(thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end);
cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end);
callback(tfm_ctx, udata, cp);
}
@ -149,22 +149,24 @@ static void transform_callback_encode_uri(duk_transform_context *tfm_ctx, void *
duk_small_int_t i, t;
duk_uint8_t *unescaped_table = (duk_uint8_t *) udata;
if ((cp < 128) && CHECK_BITMASK(unescaped_table, cp)) {
if (cp < 0) {
goto uri_error;
} else if ((cp < 0x80L) && CHECK_BITMASK(unescaped_table, cp)) {
duk_hbuffer_append_byte(tfm_ctx->thr, tfm_ctx->h_buf, (duk_uint8_t) cp);
return;
} else if (cp >= 0xdc00UL && cp <= 0xdfffUL) {
} else if (cp >= 0xdc00L && cp <= 0xdfffL) {
goto uri_error;
} else if (cp >= 0xd800UL && cp <= 0xdbffUL) {
} else if (cp >= 0xd800L && cp <= 0xdbffL) {
/* Needs lookahead */
if (duk_unicode_decode_xutf8(tfm_ctx->thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end, &cp2) == 0) {
if (duk_unicode_decode_xutf8(tfm_ctx->thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end, (duk_ucodepoint_t *) &cp2) == 0) {
goto uri_error;
}
if (!(cp2 >= 0xdc00UL && cp2 <= 0xdfffUL)) {
if (!(cp2 >= 0xdc00L && cp2 <= 0xdfffL)) {
goto uri_error;
}
cp1 = cp;
cp = ((cp1 - 0xd800UL) << 10) + (cp2 - 0xdc00UL) + 0x10000UL;
} else if (cp > 0x10ffffUL) {
cp = ((cp1 - 0xd800L) << 10) + (cp2 - 0xdc00L) + 0x10000L;
} else if (cp > 0x10ffffL) {
/* Although we can allow non-BMP characters (they'll decode
* back into surrogate pairs), we don't allow extended UTF-8
* characters; they would encode to URIs which won't decode
@ -179,7 +181,7 @@ static void transform_callback_encode_uri(duk_transform_context *tfm_ctx, void *
;
}
len = duk_unicode_encode_xutf8(cp, xutf8_buf);
len = duk_unicode_encode_xutf8((duk_ucodepoint_t) cp, xutf8_buf);
buf[0] = (duk_uint8_t) '%';
for (i = 0; i < len; i++) {
t = (int) xutf8_buf[i];
@ -284,7 +286,7 @@ static void transform_callback_decode_uri(duk_transform_context *tfm_ctx, void *
DUK_DDDPRINT("final cp=%d, min_cp=%d", cp, min_cp);
if (cp < min_cp || cp > 0x10ffffUL || (cp >= 0xd800UL && cp <= 0xdfffUL)) {
if (cp < min_cp || cp > 0x10ffffL || (cp >= 0xd800L && cp <= 0xdfffL)) {
goto uri_error;
}
@ -297,18 +299,18 @@ static void transform_callback_decode_uri(duk_transform_context *tfm_ctx, void *
*/
/* utf-8 validation ensures these */
DUK_ASSERT(cp >= 0x80UL && cp <= 0x10ffffUL);
DUK_ASSERT(cp >= 0x80L && cp <= 0x10ffffL);
if (cp >= 0x10000UL) {
cp -= 0x10000UL;
if (cp >= 0x10000L) {
cp -= 0x10000L;
DUK_ASSERT(cp < 0x100000UL);
duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (cp >> 10) + 0xd800UL);
duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (cp & 0x03ffUL) + 0xdc00UL);
duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (duk_ucodepoint_t) ((cp >> 10) + 0xd800L));
duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (duk_ucodepoint_t) ((cp & 0x03ffUL) + 0xdc00L));
} else {
duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, cp);
duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (duk_ucodepoint_t) cp);
}
} else {
duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, cp);
duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (duk_ucodepoint_t) cp);
}
return;
@ -321,15 +323,17 @@ static void transform_callback_escape(duk_transform_context *tfm_ctx, void *udat
duk_uint8_t buf[6];
duk_small_int_t len;
if ((cp < 0x80UL) && CHECK_BITMASK(escape_unescaped_table, cp)) {
if (cp < 0) {
goto esc_error;
} else if ((cp < 0x80L) && CHECK_BITMASK(escape_unescaped_table, cp)) {
buf[0] = (duk_uint8_t) cp;
len = 1;
} else if (cp < 0x100UL) {
} else if (cp < 0x100L) {
buf[0] = (duk_uint8_t) '%';
buf[1] = (duk_uint8_t) duk_uc_nybbles[cp >> 4];
buf[2] = (duk_uint8_t) duk_uc_nybbles[cp & 0x0f];
len = 3;
} else if (cp < 0x10000UL) {
} else if (cp < 0x10000L) {
buf[0] = (duk_uint8_t) '%';
buf[1] = (duk_uint8_t) 'u';
buf[2] = (duk_uint8_t) duk_uc_nybbles[cp >> 12];

13
src/duk_features.h

@ -571,11 +571,14 @@ typedef int duk_small_int_t;
typedef unsigned int duk_small_uint_t;
/* Codepoint type. Must be 32 bits or more because it is used also for
* internal codepoints. Signed codepoints are needed internally in some
* algorithms (e.g. negative value used as a marker).
*/
typedef duk_uint_fast32_t duk_codepoint_t;
typedef duk_int_fast32_t duk_signed_codepoint_t;
* internal codepoints. The type is signed because negative codepoints
* are used as internal markers (e.g. to mark EOF or missing argument).
* (X)UTF-8/CESU-8 encode/decode take and return an unsigned variant to
* ensure duk_uint32_t casts back and forth nicely. Almost everything
* else uses the signed one.
*/
typedef duk_int_fast32_t duk_codepoint_t;
typedef duk_uint_fast32_t duk_ucodepoint_t;
/* IEEE double typedef. */
typedef double duk_double_t;

8
src/duk_hbuffer_ops.c

@ -200,7 +200,7 @@ size_t duk_hbuffer_insert_xutf8(duk_hthread *thr, duk_hbuffer_dynamic *buf, size
/* Intentionally no fast path: insertion is not that central */
/* FIXME: cp -> duk_codepoint_t */
len = (size_t) duk_unicode_encode_xutf8(codepoint, tmp);
len = (size_t) duk_unicode_encode_xutf8((duk_ucodepoint_t) codepoint, tmp);
duk_hbuffer_insert_bytes(thr, buf, offset, tmp, len);
return len;
}
@ -223,7 +223,7 @@ size_t duk_hbuffer_insert_cesu8(duk_hthread *thr, duk_hbuffer_dynamic *buf, size
/* Intentionally no fast path: insertion is not that central */
/* FIXME: cp -> duk_codepoint_t */
len = (size_t) duk_unicode_encode_cesu8(codepoint, tmp);
len = (size_t) duk_unicode_encode_cesu8((duk_ucodepoint_t) codepoint, tmp);
duk_hbuffer_insert_bytes(thr, buf, offset, tmp, len);
return len;
}
@ -305,7 +305,7 @@ size_t duk_hbuffer_append_xutf8(duk_hthread *thr, duk_hbuffer_dynamic *buf, duk_
}
/* FIXME: cp -> duk_codepoint_t */
len = (size_t) duk_unicode_encode_xutf8(codepoint, tmp);
len = (size_t) duk_unicode_encode_xutf8((duk_ucodepoint_t) codepoint, tmp);
duk_hbuffer_insert_bytes(thr, buf, DUK_HBUFFER_GET_SIZE(buf), tmp, len);
return len;
}
@ -334,7 +334,7 @@ size_t duk_hbuffer_append_cesu8(duk_hthread *thr, duk_hbuffer_dynamic *buf, duk_
}
/* FIXME: cp -> duk_codepoint_t */
len = (size_t) duk_unicode_encode_cesu8(codepoint, tmp);
len = (size_t) duk_unicode_encode_cesu8((duk_ucodepoint_t) codepoint, tmp);
duk_hbuffer_insert_bytes(thr, buf, DUK_HBUFFER_GET_SIZE(buf), tmp, len);
return len;
}

18
src/duk_unicode.h

@ -43,16 +43,16 @@ extern duk_uint16_t duk_unicode_re_ranges_not_wordchar[10];
* Prototypes
*/
duk_small_int_t duk_unicode_get_xutf8_length(duk_codepoint_t cp);
duk_small_int_t duk_unicode_encode_xutf8(duk_codepoint_t cp, duk_uint8_t *out);
duk_small_int_t duk_unicode_encode_cesu8(duk_codepoint_t cp, duk_uint8_t *out);
duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_codepoint_t *out_cp);
duk_codepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end);
duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp);
duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out);
duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out);
duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp);
duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end);
duk_size_t duk_unicode_unvalidated_utf8_length(duk_uint8_t *data, duk_size_t blen);
duk_small_int_t duk_unicode_is_whitespace(duk_signed_codepoint_t cp);
duk_small_int_t duk_unicode_is_line_terminator(duk_signed_codepoint_t cp);
duk_small_int_t duk_unicode_is_identifier_start(duk_signed_codepoint_t cp);
duk_small_int_t duk_unicode_is_identifier_part(duk_signed_codepoint_t cp);
duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp);
duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp);
duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp);
duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp);
void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase);
duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp);
duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t cp);

38
src/duk_unicode_support.c

@ -9,7 +9,7 @@
* XUTF-8 and CESU-8 encoding/decoding
*/
duk_small_int_t duk_unicode_get_xutf8_length(duk_codepoint_t cp) {
duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp) {
duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
if (x < 0x80UL) {
/* 7 bits */
@ -43,7 +43,7 @@ duk_uint8_t duk_unicode_xutf8_markers[7] = {
* DUK_UNICODE_MAX_XUTF8_LENGTH bytes. Allows encoding of any
* 32-bit (unsigned) codepoint.
*/
duk_small_int_t duk_unicode_encode_xutf8(duk_codepoint_t cp, duk_uint8_t *out) {
duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out) {
duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
duk_small_int_t len;
duk_uint8_t marker;
@ -77,7 +77,7 @@ duk_small_int_t duk_unicode_encode_xutf8(duk_codepoint_t cp, duk_uint8_t *out) {
* DUK_UNICODE_MAX_CESU8_LENGTH bytes; codepoints above U+10FFFF
* will encode to garbage but won't overwrite the output buffer.
*/
duk_small_int_t duk_unicode_encode_cesu8(duk_codepoint_t cp, duk_uint8_t *out) {
duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out) {
duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
duk_small_int_t len;
@ -137,7 +137,7 @@ duk_small_int_t duk_unicode_encode_cesu8(duk_codepoint_t cp, duk_uint8_t *out) {
}
/* Decode helper. Return zero on error. */
duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_codepoint_t *out_cp) {
duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp) {
duk_uint8_t *p;
duk_uint32_t res;
duk_uint_fast8_t ch;
@ -219,8 +219,8 @@ duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, du
}
/* used by e.g. duk_regexp_executor.c, string built-ins */
duk_codepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end) {
duk_codepoint_t cp;
duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end) {
duk_ucodepoint_t cp;
if (duk_unicode_decode_xutf8(thr, ptr, ptr_start, ptr_end, &cp)) {
return cp;
@ -314,7 +314,7 @@ static duk_small_int_t uni_range_match(const duk_uint8_t *unitab, duk_size_t uni
* "WhiteSpace" production check.
*/
duk_small_int_t duk_unicode_is_whitespace(duk_signed_codepoint_t cp) {
duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp) {
/*
* E5 Section 7.2 specifies six characters specifically as
* white space:
@ -400,7 +400,7 @@ duk_small_int_t duk_unicode_is_whitespace(duk_signed_codepoint_t cp) {
* "LineTerminator" production check.
*/
duk_small_int_t duk_unicode_is_line_terminator(duk_signed_codepoint_t cp) {
duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp) {
/*
* E5 Section 7.3
*
@ -420,7 +420,7 @@ duk_small_int_t duk_unicode_is_line_terminator(duk_signed_codepoint_t cp) {
* "IdentifierStart" production check.
*/
duk_small_int_t duk_unicode_is_identifier_start(duk_signed_codepoint_t cp) {
duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp) {
/*
* E5 Section 7.6:
*
@ -495,7 +495,7 @@ duk_small_int_t duk_unicode_is_identifier_start(duk_signed_codepoint_t cp) {
* "IdentifierPart" production check.
*/
duk_small_int_t duk_unicode_is_identifier_part(duk_signed_codepoint_t cp) {
duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp) {
/*
* E5 Section 7.6:
*
@ -704,13 +704,13 @@ static duk_codepoint_t slow_case_conversion(duk_hthread *thr,
* locale/language.
*/
static duk_signed_codepoint_t case_transform_helper(duk_hthread *thr,
duk_hbuffer_dynamic *buf,
duk_codepoint_t cp,
duk_signed_codepoint_t prev,
duk_signed_codepoint_t next,
duk_small_int_t uppercase,
duk_small_int_t language) {
static duk_codepoint_t case_transform_helper(duk_hthread *thr,
duk_hbuffer_dynamic *buf,
duk_codepoint_t cp,
duk_codepoint_t prev,
duk_codepoint_t next,
duk_small_int_t uppercase,
duk_small_int_t language) {
duk_bitdecoder_ctx bd_ctx;
/* fast path for ASCII */
@ -786,7 +786,7 @@ void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase
duk_hstring *h_input;
duk_hbuffer_dynamic *h_buf;
duk_uint8_t *p, *p_start, *p_end;
duk_signed_codepoint_t prev, curr, next; /* need signed type here */
duk_codepoint_t prev, curr, next;
h_input = duk_require_hstring(ctx, -1);
DUK_ASSERT(h_input != NULL);
@ -846,7 +846,7 @@ void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase
*/
duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp) {
duk_signed_codepoint_t y;
duk_codepoint_t y;
y = case_transform_helper(thr,
NULL, /* buf */

Loading…
Cancel
Save