Browse Source

change duk_unicode_xutf8_get_u32() to be non-checked; fix minor nits in global object encode/decode functions (now passes tests)

pull/1/head
Sami Vaarala 12 years ago
parent
commit
9e8ca671d3
  1. 33
      src/duk_builtin_global.c
  2. 2
      src/duk_builtin_json.c
  3. 2
      src/duk_builtin_string.c
  4. 6
      src/duk_regexp_executor.c
  5. 3
      src/duk_unicode.h
  6. 19
      src/duk_unicode_support.c

33
src/duk_builtin_global.c

@ -134,7 +134,7 @@ static int transform_helper(duk_context *ctx, transform_callback callback, void
tfm_ctx->p = tfm_ctx->p_start; tfm_ctx->p = tfm_ctx->p_start;
while (tfm_ctx->p < tfm_ctx->p_end) { while (tfm_ctx->p < tfm_ctx->p_end) {
cp = duk_unicode_xutf8_get_u32(thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end); cp = duk_unicode_xutf8_get_u32_checked(thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end);
callback(tfm_ctx, udata, cp); callback(tfm_ctx, udata, cp);
} }
@ -157,15 +157,26 @@ static void transform_callback_encode_uri(duk_transform_context *tfm_ctx, void *
goto uri_error; goto uri_error;
} else if (cp >= 0xd800 && cp <= 0xdbff) { } else if (cp >= 0xd800 && cp <= 0xdbff) {
/* Needs lookahead */ /* Needs lookahead */
/* FIXME: if fails, must be URIError */ if (duk_unicode_xutf8_get_u32(tfm_ctx->thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end, &cp2) == 0) {
cp2 = duk_unicode_xutf8_get_u32(tfm_ctx->thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end); goto uri_error;
}
if (!(cp2 >= 0xdc00 && cp2 <= 0xdfff)) { if (!(cp2 >= 0xdc00 && cp2 <= 0xdfff)) {
goto uri_error; goto uri_error;
} }
cp1 = cp; cp1 = cp;
cp = ((cp1 - 0xd800) << 10) + (cp2 - 0xdc00) + 0x10000; cp = ((cp1 - 0xd800) << 10) + (cp2 - 0xdc00) + 0x10000;
} else if (cp > 0x10ffff) {
/* Although we can allow non-BMP characters (they'll decode
* back into surrogate pairs), we don't allow extended UTF-8
* characters; they would encode to URIs which won't decode
* back because of strict UTF-8 checks in URI decoding.
* (However, we could just as well allow them here.)
*/
goto uri_error;
} else { } else {
/* FIXME: non-BMP? */ /* Non-BMP characters within valid UTF-8 range: encode as is.
* They'll decode back into surrogate pairs.
*/
; ;
} }
@ -320,8 +331,7 @@ static void transform_callback_escape(duk_transform_context *tfm_ctx, void *udat
buf[1] = (duk_u8) duk_uc_nybbles[cp >> 4]; buf[1] = (duk_u8) duk_uc_nybbles[cp >> 4];
buf[2] = (duk_u8) duk_uc_nybbles[cp & 0x0f]; buf[2] = (duk_u8) duk_uc_nybbles[cp & 0x0f];
len = 3; len = 3;
} else { } else if (cp < 65536) {
/* FIXME: non-BMP chars will now be clipped */
buf[0] = (duk_u8) '%'; buf[0] = (duk_u8) '%';
buf[1] = (duk_u8) 'u'; buf[1] = (duk_u8) 'u';
buf[2] = (duk_u8) duk_uc_nybbles[cp >> 12]; buf[2] = (duk_u8) duk_uc_nybbles[cp >> 12];
@ -329,9 +339,20 @@ static void transform_callback_escape(duk_transform_context *tfm_ctx, void *udat
buf[4] = (duk_u8) duk_uc_nybbles[(cp >> 4) & 0x0f]; buf[4] = (duk_u8) duk_uc_nybbles[(cp >> 4) & 0x0f];
buf[5] = (duk_u8) duk_uc_nybbles[cp & 0x0f]; buf[5] = (duk_u8) duk_uc_nybbles[cp & 0x0f];
len = 6; len = 6;
} else {
/* Characters outside BMP cannot be escape()'d. We could
* encode them as surrogate pairs (for codepoints inside
* valid UTF-8 range, but not extended UTF-8). Because
* escape() and unescape() are legacy functions, we don't.
*/
goto esc_error;
} }
duk_hbuffer_append_bytes(tfm_ctx->thr, tfm_ctx->h_buf, buf, len); duk_hbuffer_append_bytes(tfm_ctx->thr, tfm_ctx->h_buf, buf, len);
return;
esc_error:
DUK_ERROR(tfm_ctx->thr, DUK_ERR_TYPE_ERROR, "invalid input");
} }
static void transform_callback_unescape(duk_transform_context *tfm_ctx, void *udata, duk_u32 cp) { static void transform_callback_unescape(duk_transform_context *tfm_ctx, void *udata, duk_u32 cp) {

2
src/duk_builtin_json.c

@ -754,7 +754,7 @@ static void json_enc_quote_string(duk_json_enc_ctx *js_ctx, duk_hstring *h_str)
/* slow path decode */ /* slow path decode */
/* FIXME: this may currently fail, we'd prefer it never do that */ /* FIXME: this may currently fail, we'd prefer it never do that */
cp = duk_unicode_xutf8_get_u32(thr, &p, p_start, p_end); cp = duk_unicode_xutf8_get_u32_checked(thr, &p, p_start, p_end);
if (js_ctx->flag_ascii_only) { if (js_ctx->flag_ascii_only) {
if (cp > 0xffff) { if (cp > 0xffff) {

2
src/duk_builtin_string.c

@ -148,7 +148,7 @@ int duk_builtin_string_prototype_char_code_at(duk_context *ctx) {
DUK_DDDPRINT("p_start=%p, p_end=%p, p=%p", (void *) p_start, (void *) p_end, (void *) p); DUK_DDDPRINT("p_start=%p, p_end=%p, p=%p", (void *) p_start, (void *) p_end, (void *) p);
/* FIXME: this may throw an error, though not for valid E5 strings - is this OK here? */ /* FIXME: this may throw an error, though not for valid E5 strings - is this OK here? */
cp = duk_unicode_xutf8_get_u32(thr, &p, p_start, p_end); cp = duk_unicode_xutf8_get_u32_checked(thr, &p, p_start, p_end);
/* FIXME: push_uint or push_u32 */ /* FIXME: push_uint or push_u32 */
duk_push_number(ctx, (double) cp); duk_push_number(ctx, (double) cp);

6
src/duk_regexp_executor.c

@ -17,14 +17,14 @@
*/ */
static duk_u32 bc_get_u32(duk_re_matcher_ctx *re_ctx, duk_u8 **pc) { static duk_u32 bc_get_u32(duk_re_matcher_ctx *re_ctx, duk_u8 **pc) {
return duk_unicode_xutf8_get_u32(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end); return duk_unicode_xutf8_get_u32_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
} }
static duk_i32 bc_get_i32(duk_re_matcher_ctx *re_ctx, duk_u8 **pc) { static duk_i32 bc_get_i32(duk_re_matcher_ctx *re_ctx, duk_u8 **pc) {
duk_u32 t; duk_u32 t;
/* signed integer encoding needed to work with UTF-8 */ /* signed integer encoding needed to work with UTF-8 */
t = duk_unicode_xutf8_get_u32(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end); t = duk_unicode_xutf8_get_u32_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
if (t & 1) { if (t & 1) {
return -(t >> 1); return -(t >> 1);
} else { } else {
@ -103,7 +103,7 @@ static duk_u8 *utf8_advance(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, d
* matching. * matching.
*/ */
static duk_u32 inp_get_u32(duk_re_matcher_ctx *re_ctx, duk_u8 **sp) { static duk_u32 inp_get_u32(duk_re_matcher_ctx *re_ctx, duk_u8 **sp) {
duk_u32 res = duk_unicode_xutf8_get_u32(re_ctx->thr, sp, re_ctx->input, re_ctx->input_end); duk_u32 res = duk_unicode_xutf8_get_u32_checked(re_ctx->thr, sp, re_ctx->input, re_ctx->input_end);
if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) { if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {
res = duk_unicode_re_canonicalize_char(re_ctx->thr, res); res = duk_unicode_re_canonicalize_char(re_ctx->thr, res);
} }

3
src/duk_unicode.h

@ -48,7 +48,8 @@ extern duk_u16 duk_unicode_re_ranges_not_wordchar[10];
int duk_unicode_get_xutf8_length(duk_u32 x); int duk_unicode_get_xutf8_length(duk_u32 x);
size_t duk_unicode_encode_xutf8(duk_u32 x, duk_u8 *out); size_t duk_unicode_encode_xutf8(duk_u32 x, duk_u8 *out);
size_t duk_unicode_encode_cesu8(duk_u32 x, duk_u8 *out); size_t duk_unicode_encode_cesu8(duk_u32 x, duk_u8 *out);
duk_u32 duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end); int duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end, duk_u32 *out_cp);
duk_u32 duk_unicode_xutf8_get_u32_checked(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end);
duk_u32 duk_unicode_unvalidated_utf8_length(duk_u8 *data, duk_u32 blen); duk_u32 duk_unicode_unvalidated_utf8_length(duk_u8 *data, duk_u32 blen);
int duk_unicode_is_whitespace(int x); int duk_unicode_is_whitespace(int x);
int duk_unicode_is_line_terminator(int x); int duk_unicode_is_line_terminator(int x);

19
src/duk_unicode_support.c

@ -133,8 +133,8 @@ size_t duk_unicode_encode_cesu8(duk_u32 x, duk_u8 *out) {
return len; return len;
} }
/* used by e.g. duk_regexp_executor.c, string built-ins */ /* Decode helper. On success, stores the decoded codepoint in *out_cp, updates *ptr, and returns 1. Returns zero on decode error. */
duk_u32 duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end) { int duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end, duk_u32 *out_cp) {
duk_u8 *p; duk_u8 *p;
duk_u32 res; duk_u32 res;
int ch; int ch;
@ -208,9 +208,20 @@ duk_u32 duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_st
} }
*ptr = p; *ptr = p;
return res; *out_cp = res;
return 1;
fail: fail:
return 0;
}
/* Checked variant: returns the decoded codepoint, or throws an internal error if decoding fails. Used by e.g. duk_regexp_executor.c, string built-ins. */
duk_u32 duk_unicode_xutf8_get_u32_checked(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end) {
duk_u32 cp;
if (duk_unicode_xutf8_get_u32(thr, ptr, ptr_start, ptr_end, &cp)) {
return cp;
}
DUK_ERROR(thr, DUK_ERR_INTERNAL_ERROR, "utf-8 decode failed"); DUK_ERROR(thr, DUK_ERR_INTERNAL_ERROR, "utf-8 decode failed");
return 0; /* never here */ return 0; /* never here */
} }
@ -791,7 +802,7 @@ void duk_unicode_case_convert_string(duk_hthread *thr, int uppercase) {
curr = next; curr = next;
next = -1; next = -1;
if (p < p_end) { if (p < p_end) {
next = (int) duk_unicode_xutf8_get_u32(thr, &p, p_start, p_end); next = (int) duk_unicode_xutf8_get_u32_checked(thr, &p, p_start, p_end);
} else { } else {
/* end of input and last char has been processed */ /* end of input and last char has been processed */
if (curr < 0) { if (curr < 0) {

Loading…
Cancel
Save