Browse Source

C type cleanups

pull/1/head
Sami Vaarala 11 years ago
parent
commit
12fb90f00d
  1. 6
      src/duk_api_string.c
  2. 87
      src/duk_builtin_global.c
  3. 3
      src/duk_builtin_json.c
  4. 5
      src/duk_builtin_string.c
  5. 12
      src/duk_hbuffer_ops.c
  6. 2
      src/duk_heap_stringtable.c
  7. 4
      src/duk_regexp_compiler.c
  8. 6
      src/duk_regexp_executor.c
  9. 26
      src/duk_unicode.h
  10. 410
      src/duk_unicode_support.c

6
src/duk_api_string.c

@ -185,7 +185,8 @@ void duk_trim(duk_context *ctx, int index) {
p = p_start; p = p_start;
while (p < p_end) { while (p < p_end) {
p_tmp1 = p; p_tmp1 = p;
cp = duk_unicode_xutf8_get_u32_checked(thr, &p_tmp1, p_start, p_end); /* FIXME: duk_codepoint_t */
cp = (duk_uint32_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp1, p_start, p_end);
if (!(duk_unicode_is_whitespace(cp) || duk_unicode_is_line_terminator(cp))) { if (!(duk_unicode_is_whitespace(cp) || duk_unicode_is_line_terminator(cp))) {
break; break;
} }
@ -209,7 +210,8 @@ void duk_trim(duk_context *ctx, int index) {
} }
p_tmp2 = p; p_tmp2 = p;
cp = duk_unicode_xutf8_get_u32_checked(thr, &p_tmp2, p_start, p_end); /* FIXME: duk_codepoint_t */
cp = (duk_uint32_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp2, p_start, p_end);
if (!(duk_unicode_is_whitespace(cp) || duk_unicode_is_line_terminator(cp))) { if (!(duk_unicode_is_whitespace(cp) || duk_unicode_is_line_terminator(cp))) {
p = p_tmp1; p = p_tmp1;
break; break;

87
src/duk_builtin_global.c

@ -88,12 +88,12 @@ typedef struct {
duk_uint8_t *p_end; duk_uint8_t *p_end;
} duk_transform_context; } duk_transform_context;
typedef void (*transform_callback)(duk_transform_context *tfm_ctx, void *udata, duk_uint32_t cp); typedef void (*transform_callback)(duk_transform_context *tfm_ctx, void *udata, duk_codepoint_t cp);
/* FIXME: refactor and share with other code */ /* FIXME: refactor and share with other code */
static int decode_hex_escape(duk_uint8_t *p, int n) { static duk_small_int_t decode_hex_escape(duk_uint8_t *p, duk_small_int_t n) {
int ch; duk_small_int_t ch;
int t = 0; duk_small_int_t t = 0;
while (n > 0) { while (n > 0) {
t = t * 16; t = t * 16;
@ -116,7 +116,7 @@ static int transform_helper(duk_context *ctx, transform_callback callback, void
duk_hthread *thr = (duk_hthread *) ctx; duk_hthread *thr = (duk_hthread *) ctx;
duk_transform_context tfm_ctx_alloc; duk_transform_context tfm_ctx_alloc;
duk_transform_context *tfm_ctx = &tfm_ctx_alloc; duk_transform_context *tfm_ctx = &tfm_ctx_alloc;
duk_uint32_t cp; duk_codepoint_t cp;
tfm_ctx->thr = thr; tfm_ctx->thr = thr;
@ -133,7 +133,7 @@ static int transform_helper(duk_context *ctx, transform_callback callback, void
tfm_ctx->p = tfm_ctx->p_start; tfm_ctx->p = tfm_ctx->p_start;
while (tfm_ctx->p < tfm_ctx->p_end) { while (tfm_ctx->p < tfm_ctx->p_end) {
cp = duk_unicode_xutf8_get_u32_checked(thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end); cp = duk_unicode_decode_xutf8_checked(thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end);
callback(tfm_ctx, udata, cp); callback(tfm_ctx, udata, cp);
} }
@ -141,30 +141,30 @@ static int transform_helper(duk_context *ctx, transform_callback callback, void
return 1; return 1;
} }
static void transform_callback_encode_uri(duk_transform_context *tfm_ctx, void *udata, duk_uint32_t cp) { static void transform_callback_encode_uri(duk_transform_context *tfm_ctx, void *udata, duk_codepoint_t cp) {
duk_uint8_t xutf8_buf[DUK_UNICODE_MAX_XUTF8_LENGTH]; duk_uint8_t xutf8_buf[DUK_UNICODE_MAX_XUTF8_LENGTH];
duk_uint8_t buf[3]; duk_uint8_t buf[3];
size_t len; duk_small_int_t len;
duk_uint32_t cp1, cp2; duk_codepoint_t cp1, cp2;
int i, t; duk_small_int_t i, t;
duk_uint8_t *unescaped_table = (duk_uint8_t *) udata; duk_uint8_t *unescaped_table = (duk_uint8_t *) udata;
if ((cp < 128) && CHECK_BITMASK(unescaped_table, cp)) { if ((cp < 128) && CHECK_BITMASK(unescaped_table, cp)) {
duk_hbuffer_append_byte(tfm_ctx->thr, tfm_ctx->h_buf, (duk_uint8_t) cp); duk_hbuffer_append_byte(tfm_ctx->thr, tfm_ctx->h_buf, (duk_uint8_t) cp);
return; return;
} else if (cp >= 0xdc00 && cp <= 0xdfff) { } else if (cp >= 0xdc00UL && cp <= 0xdfffUL) {
goto uri_error; goto uri_error;
} else if (cp >= 0xd800 && cp <= 0xdbff) { } else if (cp >= 0xd800UL && cp <= 0xdbffUL) {
/* Needs lookahead */ /* Needs lookahead */
if (duk_unicode_xutf8_get_u32(tfm_ctx->thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end, &cp2) == 0) { if (duk_unicode_decode_xutf8(tfm_ctx->thr, &tfm_ctx->p, tfm_ctx->p_start, tfm_ctx->p_end, &cp2) == 0) {
goto uri_error; goto uri_error;
} }
if (!(cp2 >= 0xdc00 && cp2 <= 0xdfff)) { if (!(cp2 >= 0xdc00UL && cp2 <= 0xdfffUL)) {
goto uri_error; goto uri_error;
} }
cp1 = cp; cp1 = cp;
cp = ((cp1 - 0xd800) << 10) + (cp2 - 0xdc00) + 0x10000; cp = ((cp1 - 0xd800UL) << 10) + (cp2 - 0xdc00UL) + 0x10000UL;
} else if (cp > 0x10ffff) { } else if (cp > 0x10ffffUL) {
/* Although we can allow non-BMP characters (they'll decode /* Although we can allow non-BMP characters (they'll decode
* back into surrogate pairs), we don't allow extended UTF-8 * back into surrogate pairs), we don't allow extended UTF-8
* characters; they would encode to URIs which won't decode * characters; they would encode to URIs which won't decode
@ -193,16 +193,15 @@ static void transform_callback_encode_uri(duk_transform_context *tfm_ctx, void *
DUK_ERROR(tfm_ctx->thr, DUK_ERR_URI_ERROR, "invalid input"); DUK_ERROR(tfm_ctx->thr, DUK_ERR_URI_ERROR, "invalid input");
} }
static void transform_callback_decode_uri(duk_transform_context *tfm_ctx, void *udata, duk_uint32_t cp) { static void transform_callback_decode_uri(duk_transform_context *tfm_ctx, void *udata, duk_codepoint_t cp) {
duk_uint8_t *reserved_table = (duk_uint8_t *) udata; duk_uint8_t *reserved_table = (duk_uint8_t *) udata;
int utf8_blen; duk_small_int_t utf8_blen;
int min_cp; duk_codepoint_t min_cp;
int t; duk_small_int_t t, i;
int i;
if (cp == (duk_uint32_t) '%') { if (cp == (duk_codepoint_t) '%') {
duk_uint8_t *p = tfm_ctx->p; duk_uint8_t *p = tfm_ctx->p;
size_t left = (size_t) (tfm_ctx->p_end - p); /* bytes left */ duk_size_t left = (duk_size_t) (tfm_ctx->p_end - p); /* bytes left */
DUK_DDDPRINT("percent encoding, left=%d", (int) left); DUK_DDDPRINT("percent encoding, left=%d", (int) left);
@ -244,17 +243,17 @@ static void transform_callback_decode_uri(duk_transform_context *tfm_ctx, void *
} else if (t < 0xe0) { } else if (t < 0xe0) {
/* 110x xxxx; 2 bytes */ /* 110x xxxx; 2 bytes */
utf8_blen = 2; utf8_blen = 2;
min_cp = 0x80; min_cp = 0x80UL;
cp = t & 0x1f; cp = t & 0x1f;
} else if (t < 0xf0) { } else if (t < 0xf0) {
/* 1110 xxxx; 3 bytes */ /* 1110 xxxx; 3 bytes */
utf8_blen = 3; utf8_blen = 3;
min_cp = 0x800; min_cp = 0x800UL;
cp = t & 0x0f; cp = t & 0x0f;
} else if (t < 0xf8) { } else if (t < 0xf8) {
/* 1111 0xxx; 4 bytes */ /* 1111 0xxx; 4 bytes */
utf8_blen = 4; utf8_blen = 4;
min_cp = 0x10000; min_cp = 0x10000UL;
cp = t & 0x07; cp = t & 0x07;
} else { } else {
/* extended utf-8 not allowed for URIs */ /* extended utf-8 not allowed for URIs */
@ -285,7 +284,7 @@ static void transform_callback_decode_uri(duk_transform_context *tfm_ctx, void *
DUK_DDDPRINT("final cp=%d, min_cp=%d", cp, min_cp); DUK_DDDPRINT("final cp=%d, min_cp=%d", cp, min_cp);
if (cp < min_cp || cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff)) { if (cp < min_cp || cp > 0x10ffffUL || (cp >= 0xd800UL && cp <= 0xdfffUL)) {
goto uri_error; goto uri_error;
} }
@ -298,13 +297,13 @@ static void transform_callback_decode_uri(duk_transform_context *tfm_ctx, void *
*/ */
/* utf-8 validation ensures these */ /* utf-8 validation ensures these */
DUK_ASSERT(cp >= 0x80 && cp <= 0x10ffff); DUK_ASSERT(cp >= 0x80UL && cp <= 0x10ffffUL);
if (cp >= 0x10000) { if (cp >= 0x10000UL) {
cp -= 0x10000; cp -= 0x10000UL;
DUK_ASSERT(cp < 0x100000); DUK_ASSERT(cp < 0x100000UL);
duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (cp >> 10) + 0xd800); duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (cp >> 10) + 0xd800UL);
duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (cp & 0x03ff) + 0xdc00); duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, (cp & 0x03ffUL) + 0xdc00UL);
} else { } else {
duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, cp); duk_hbuffer_append_xutf8(tfm_ctx->thr, tfm_ctx->h_buf, cp);
} }
@ -318,19 +317,19 @@ static void transform_callback_decode_uri(duk_transform_context *tfm_ctx, void *
} }
#ifdef DUK_USE_SECTION_B #ifdef DUK_USE_SECTION_B
static void transform_callback_escape(duk_transform_context *tfm_ctx, void *udata, duk_uint32_t cp) { static void transform_callback_escape(duk_transform_context *tfm_ctx, void *udata, duk_codepoint_t cp) {
duk_uint8_t buf[6]; duk_uint8_t buf[6];
size_t len; duk_small_int_t len;
if ((cp < 128) && CHECK_BITMASK(escape_unescaped_table, cp)) { if ((cp < 0x80UL) && CHECK_BITMASK(escape_unescaped_table, cp)) {
buf[0] = (duk_uint8_t) cp; buf[0] = (duk_uint8_t) cp;
len = 1; len = 1;
} else if (cp < 256) { } else if (cp < 0x100UL) {
buf[0] = (duk_uint8_t) '%'; buf[0] = (duk_uint8_t) '%';
buf[1] = (duk_uint8_t) duk_uc_nybbles[cp >> 4]; buf[1] = (duk_uint8_t) duk_uc_nybbles[cp >> 4];
buf[2] = (duk_uint8_t) duk_uc_nybbles[cp & 0x0f]; buf[2] = (duk_uint8_t) duk_uc_nybbles[cp & 0x0f];
len = 3; len = 3;
} else if (cp < 65536) { } else if (cp < 0x10000UL) {
buf[0] = (duk_uint8_t) '%'; buf[0] = (duk_uint8_t) '%';
buf[1] = (duk_uint8_t) 'u'; buf[1] = (duk_uint8_t) 'u';
buf[2] = (duk_uint8_t) duk_uc_nybbles[cp >> 12]; buf[2] = (duk_uint8_t) duk_uc_nybbles[cp >> 12];
@ -354,20 +353,20 @@ static void transform_callback_escape(duk_transform_context *tfm_ctx, void *udat
DUK_ERROR(tfm_ctx->thr, DUK_ERR_TYPE_ERROR, "invalid input"); DUK_ERROR(tfm_ctx->thr, DUK_ERR_TYPE_ERROR, "invalid input");
} }
static void transform_callback_unescape(duk_transform_context *tfm_ctx, void *udata, duk_uint32_t cp) { static void transform_callback_unescape(duk_transform_context *tfm_ctx, void *udata, duk_codepoint_t cp) {
int t; duk_small_int_t t;
if (cp == (duk_uint32_t) '%') { if (cp == (duk_codepoint_t) '%') {
duk_uint8_t *p = tfm_ctx->p; duk_uint8_t *p = tfm_ctx->p;
size_t left = (size_t) (tfm_ctx->p_end - p); /* bytes left */ duk_size_t left = (duk_size_t) (tfm_ctx->p_end - p); /* bytes left */
if (left >= 5 && p[0] == 'u' && if (left >= 5 && p[0] == 'u' &&
((t = decode_hex_escape(p + 1, 4)) >= 0)) { ((t = decode_hex_escape(p + 1, 4)) >= 0)) {
cp = (duk_uint32_t) t; cp = (duk_codepoint_t) t;
tfm_ctx->p += 5; tfm_ctx->p += 5;
} else if (left >= 2 && } else if (left >= 2 &&
((t = decode_hex_escape(p, 2)) >= 0)) { ((t = decode_hex_escape(p, 2)) >= 0)) {
cp = (duk_uint32_t) t; cp = (duk_codepoint_t) t;
tfm_ctx->p += 2; tfm_ctx->p += 2;
} }
} }

3
src/duk_builtin_json.c

@ -765,7 +765,8 @@ static void json_enc_quote_string(duk_json_enc_ctx *js_ctx, duk_hstring *h_str)
/* slow path decode */ /* slow path decode */
/* FIXME: this may currently fail, we'd prefer it never do that */ /* FIXME: this may currently fail, we'd prefer it never do that */
cp = duk_unicode_xutf8_get_u32_checked(thr, &p, p_start, p_end); /* FIXME: duk_codepoint_t */
cp = (duk_uint32_t) duk_unicode_decode_xutf8_checked(thr, &p, p_start, p_end);
if (js_ctx->flag_ascii_only) { if (js_ctx->flag_ascii_only) {
if (cp > 0xffff) { if (cp > 0xffff) {

5
src/duk_builtin_string.c

@ -158,7 +158,8 @@ int duk_builtin_string_prototype_char_code_at(duk_context *ctx) {
DUK_DDDPRINT("p_start=%p, p_end=%p, p=%p", (void *) p_start, (void *) p_end, (void *) p); DUK_DDDPRINT("p_start=%p, p_end=%p, p=%p", (void *) p_start, (void *) p_end, (void *) p);
/* FIXME: this may throw an error, though not for valid E5 strings - is this OK here? */ /* FIXME: this may throw an error, though not for valid E5 strings - is this OK here? */
cp = duk_unicode_xutf8_get_u32_checked(thr, &p, p_start, p_end); /* FIXME: duk_codepoint_t */
cp = (duk_uint32_t) duk_unicode_decode_xutf8_checked(thr, &p, p_start, p_end);
/* FIXME: push_uint or push_u32 */ /* FIXME: push_uint or push_u32 */
duk_push_number(ctx, (double) cp); duk_push_number(ctx, (double) cp);
@ -306,7 +307,7 @@ int duk_builtin_string_prototype_slice(duk_context *ctx) {
* Case conversion * Case conversion
*/ */
static int caseconv_helper(duk_context *ctx, int uppercase) { static int caseconv_helper(duk_context *ctx, duk_small_int_t uppercase) {
duk_hthread *thr = (duk_hthread *) ctx; duk_hthread *thr = (duk_hthread *) ctx;
duk_push_this_coercible_to_string(ctx); duk_push_this_coercible_to_string(ctx);

12
src/duk_hbuffer_ops.c

@ -189,7 +189,8 @@ size_t duk_hbuffer_insert_xutf8(duk_hthread *thr, duk_hbuffer_dynamic *buf, size
/* Intentionally no fast path: insertion is not that central */ /* Intentionally no fast path: insertion is not that central */
len = duk_unicode_encode_xutf8(codepoint, tmp); /* FIXME: cp -> duk_codepoint_t */
len = (size_t) duk_unicode_encode_xutf8(codepoint, tmp);
duk_hbuffer_insert_bytes(thr, buf, offset, tmp, len); duk_hbuffer_insert_bytes(thr, buf, offset, tmp, len);
return len; return len;
} }
@ -211,7 +212,8 @@ size_t duk_hbuffer_insert_cesu8(duk_hthread *thr, duk_hbuffer_dynamic *buf, size
/* Intentionally no fast path: insertion is not that central */ /* Intentionally no fast path: insertion is not that central */
len = duk_unicode_encode_cesu8(codepoint, tmp); /* FIXME: cp -> duk_codepoint_t */
len = (size_t) duk_unicode_encode_cesu8(codepoint, tmp);
duk_hbuffer_insert_bytes(thr, buf, offset, tmp, len); duk_hbuffer_insert_bytes(thr, buf, offset, tmp, len);
return len; return len;
} }
@ -292,7 +294,8 @@ size_t duk_hbuffer_append_xutf8(duk_hthread *thr, duk_hbuffer_dynamic *buf, duk_
return 1; return 1;
} }
len = duk_unicode_encode_xutf8(codepoint, tmp); /* FIXME: cp -> duk_codepoint_t */
len = (size_t) duk_unicode_encode_xutf8(codepoint, tmp);
duk_hbuffer_insert_bytes(thr, buf, DUK_HBUFFER_GET_SIZE(buf), tmp, len); duk_hbuffer_insert_bytes(thr, buf, DUK_HBUFFER_GET_SIZE(buf), tmp, len);
return len; return len;
} }
@ -320,7 +323,8 @@ size_t duk_hbuffer_append_cesu8(duk_hthread *thr, duk_hbuffer_dynamic *buf, duk_
return 1; return 1;
} }
len = duk_unicode_encode_cesu8(codepoint, tmp); /* FIXME: cp -> duk_codepoint_t */
len = (size_t) duk_unicode_encode_cesu8(codepoint, tmp);
duk_hbuffer_insert_bytes(thr, buf, DUK_HBUFFER_GET_SIZE(buf), tmp, len); duk_hbuffer_insert_bytes(thr, buf, DUK_HBUFFER_GET_SIZE(buf), tmp, len);
return len; return len;
} }

2
src/duk_heap_stringtable.c

@ -46,7 +46,7 @@ static duk_hstring *alloc_init_hstring(duk_heap *heap,
res->hash = strhash; res->hash = strhash;
res->blen = blen; res->blen = blen;
res->clen = duk_unicode_unvalidated_utf8_length(str, blen); res->clen = (duk_uint32_t) duk_unicode_unvalidated_utf8_length(str, (duk_size_t) blen); /* clen <= blen */
data = (duk_uint8_t *) (res + 1); data = (duk_uint8_t *) (res + 1);
DUK_MEMCPY(data, str, blen); DUK_MEMCPY(data, str, blen);

4
src/duk_regexp_compiler.c

@ -96,8 +96,8 @@ static duk_uint32_t insert_jump_offset(duk_re_compiler_ctx *re_ctx, duk_uint32_t
if (skip < 0) { if (skip < 0) {
/* two encoding attempts suffice */ /* two encoding attempts suffice */
len = duk_unicode_get_xutf8_length(encode_i32(skip)); len = duk_unicode_get_xutf8_length((duk_codepoint_t) encode_i32(skip));
len = duk_unicode_get_xutf8_length(encode_i32(skip - len)); len = duk_unicode_get_xutf8_length((duk_codepoint_t) encode_i32(skip - len));
DUK_ASSERT(duk_unicode_get_xutf8_length(encode_i32(skip - len)) == len); /* no change */ DUK_ASSERT(duk_unicode_get_xutf8_length(encode_i32(skip - len)) == len); /* no change */
skip -= len; skip -= len;
} }

6
src/duk_regexp_executor.c

@ -17,14 +17,14 @@
*/ */
static duk_uint32_t bc_get_u32(duk_re_matcher_ctx *re_ctx, duk_uint8_t **pc) { static duk_uint32_t bc_get_u32(duk_re_matcher_ctx *re_ctx, duk_uint8_t **pc) {
return duk_unicode_xutf8_get_u32_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end); return (duk_uint32_t) duk_unicode_decode_xutf8_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
} }
static duk_int32_t bc_get_i32(duk_re_matcher_ctx *re_ctx, duk_uint8_t **pc) { static duk_int32_t bc_get_i32(duk_re_matcher_ctx *re_ctx, duk_uint8_t **pc) {
duk_uint32_t t; duk_uint32_t t;
/* signed integer encoding needed to work with UTF-8 */ /* signed integer encoding needed to work with UTF-8 */
t = duk_unicode_xutf8_get_u32_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end); t = (duk_uint32_t) duk_unicode_decode_xutf8_checked(re_ctx->thr, pc, re_ctx->bytecode, re_ctx->bytecode_end);
if (t & 1) { if (t & 1) {
return -(t >> 1); return -(t >> 1);
} else { } else {
@ -103,7 +103,7 @@ static duk_uint8_t *utf8_advance(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_
* matching. * matching.
*/ */
static duk_uint32_t inp_get_u32(duk_re_matcher_ctx *re_ctx, duk_uint8_t **sp) { static duk_uint32_t inp_get_u32(duk_re_matcher_ctx *re_ctx, duk_uint8_t **sp) {
duk_uint32_t res = duk_unicode_xutf8_get_u32_checked(re_ctx->thr, sp, re_ctx->input, re_ctx->input_end); duk_uint32_t res = (duk_uint32_t) duk_unicode_decode_xutf8_checked(re_ctx->thr, sp, re_ctx->input, re_ctx->input_end);
if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) { if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {
res = duk_unicode_re_canonicalize_char(re_ctx->thr, res); res = duk_unicode_re_canonicalize_char(re_ctx->thr, res);
} }

26
src/duk_unicode.h

@ -43,19 +43,19 @@ extern duk_uint16_t duk_unicode_re_ranges_not_wordchar[10];
* Prototypes * Prototypes
*/ */
int duk_unicode_get_xutf8_length(duk_uint32_t x); duk_small_int_t duk_unicode_get_xutf8_length(duk_codepoint_t cp);
size_t duk_unicode_encode_xutf8(duk_uint32_t x, duk_uint8_t *out); duk_small_int_t duk_unicode_encode_xutf8(duk_codepoint_t cp, duk_uint8_t *out);
size_t duk_unicode_encode_cesu8(duk_uint32_t x, duk_uint8_t *out); duk_small_int_t duk_unicode_encode_cesu8(duk_codepoint_t cp, duk_uint8_t *out);
int duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_uint32_t *out_cp); duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_codepoint_t *out_cp);
duk_uint32_t duk_unicode_xutf8_get_u32_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end); duk_codepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end);
duk_uint32_t duk_unicode_unvalidated_utf8_length(duk_uint8_t *data, duk_uint32_t blen); duk_size_t duk_unicode_unvalidated_utf8_length(duk_uint8_t *data, duk_size_t blen);
int duk_unicode_is_whitespace(int x); duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp);
int duk_unicode_is_line_terminator(int x); duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp);
int duk_unicode_is_identifier_start(int x); duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp);
int duk_unicode_is_identifier_part(int x); duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp);
void duk_unicode_case_convert_string(duk_hthread *thr, int uppercase); void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase);
int duk_unicode_re_canonicalize_char(duk_hthread *thr, int x); duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp);
int duk_unicode_re_is_wordchar(int x); duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t cp);
#endif /* DUK_UNICODE_H_INCLUDED */ #endif /* DUK_UNICODE_H_INCLUDED */

410
src/duk_unicode_support.c

@ -9,23 +9,24 @@
* XUTF-8 and CESU-8 encoding/decoding * XUTF-8 and CESU-8 encoding/decoding
*/ */
int duk_unicode_get_xutf8_length(duk_uint32_t x) { duk_small_int_t duk_unicode_get_xutf8_length(duk_codepoint_t cp) {
if (x < 0x80) { duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
if (x < 0x80UL) {
/* 7 bits */ /* 7 bits */
return 1; return 1;
} else if (x < 0x800) { } else if (x < 0x800UL) {
/* 11 bits */ /* 11 bits */
return 2; return 2;
} else if (x < 0x10000) { } else if (x < 0x10000UL) {
/* 16 bits */ /* 16 bits */
return 3; return 3;
} else if (x < 0x200000) { } else if (x < 0x200000UL) {
/* 21 bits */ /* 21 bits */
return 4; return 4;
} else if (x < 0x4000000) { } else if (x < 0x4000000UL) {
/* 26 bits */ /* 26 bits */
return 5; return 5;
} else if (x < (duk_uint32_t) 0x80000000L) { } else if (x < (duk_uint32_t) 0x80000000UL) {
/* 31 bits */ /* 31 bits */
return 6; return 6;
} else { } else {
@ -42,12 +43,13 @@ duk_uint8_t duk_unicode_xutf8_markers[7] = {
* DUK_UNICODE_MAX_XUTF8_LENGTH bytes. Allows encoding of any * DUK_UNICODE_MAX_XUTF8_LENGTH bytes. Allows encoding of any
* 32-bit (unsigned) codepoint. * 32-bit (unsigned) codepoint.
*/ */
size_t duk_unicode_encode_xutf8(duk_uint32_t x, duk_uint8_t *out) { duk_small_int_t duk_unicode_encode_xutf8(duk_codepoint_t cp, duk_uint8_t *out) {
size_t len; duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
duk_small_int_t len;
duk_uint8_t marker; duk_uint8_t marker;
size_t i; duk_small_int_t i;
len = duk_unicode_get_xutf8_length(x); len = duk_unicode_get_xutf8_length(cp);
DUK_ASSERT(len > 0); DUK_ASSERT(len > 0);
marker = duk_unicode_xutf8_markers[len - 1]; /* 64-bit OK because always >= 0 */ marker = duk_unicode_xutf8_markers[len - 1]; /* 64-bit OK because always >= 0 */
@ -57,14 +59,14 @@ size_t duk_unicode_encode_xutf8(duk_uint32_t x, duk_uint8_t *out) {
do { do {
i--; i--;
if (i > 0) { if (i > 0) {
out[i] = 0x80 + (x & 0x3f); out[i] = (duk_uint8_t) (0x80 + (x & 0x3f));
x >>= 6; x >>= 6;
} else { } else {
/* Note: masking of 'x' is not necessary because of /* Note: masking of 'x' is not necessary because of
* range check and shifting -> no bits overlapping * range check and shifting -> no bits overlapping
* the marker should be set. * the marker should be set.
*/ */
out[0] = marker + x; out[0] = (duk_uint8_t) (marker + x);
} }
} while(i > 0); } while(i > 0);
@ -75,21 +77,22 @@ size_t duk_unicode_encode_xutf8(duk_uint32_t x, duk_uint8_t *out) {
* DUK_UNICODE_MAX_CESU8_LENGTH bytes; codepoints above U+10FFFF * DUK_UNICODE_MAX_CESU8_LENGTH bytes; codepoints above U+10FFFF
* will encode to garbage but won't overwrite the output buffer. * will encode to garbage but won't overwrite the output buffer.
*/ */
size_t duk_unicode_encode_cesu8(duk_uint32_t x, duk_uint8_t *out) { duk_small_int_t duk_unicode_encode_cesu8(duk_codepoint_t cp, duk_uint8_t *out) {
size_t len; duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
duk_small_int_t len;
if (x < 0x80) { if (x < 0x80UL) {
out[0] = x; out[0] = (duk_uint8_t) x;
len = 1; len = 1;
} else if (x < 0x800) { } else if (x < 0x800UL) {
out[0] = 0xc0 + ((x >> 6) & 0x1f); out[0] = (duk_uint8_t) (0xc0 + ((x >> 6) & 0x1f));
out[1] = 0x80 + (x & 0x3f); out[1] = (duk_uint8_t) (0x80 + (x & 0x3f));
len = 2; len = 2;
} else if (x < 0x10000) { } else if (x < 0x10000UL) {
/* surrogate pairs get encoded here */ /* surrogate pairs get encoded here */
out[0] = 0xe0 + ((x >> 12) & 0x0f); out[0] = (duk_uint8_t) (0xe0 + ((x >> 12) & 0x0f));
out[1] = 0x80 + ((x >> 6) & 0x3f); out[1] = (duk_uint8_t) (0x80 + ((x >> 6) & 0x3f));
out[2] = 0x80 + (x & 0x3f); out[2] = (duk_uint8_t) (0x80 + (x & 0x3f));
len = 3; len = 3;
} else { } else {
/* /*
@ -119,14 +122,14 @@ size_t duk_unicode_encode_cesu8(duk_uint32_t x, duk_uint8_t *out) {
* of code. * of code.
*/ */
x -= 0x10000; x -= 0x10000UL;
out[0] = 0xed; out[0] = (duk_uint8_t) (0xed);
out[1] = 0xa0 + ((x >> 16) & 0x0f); out[1] = (duk_uint8_t) (0xa0 + ((x >> 16) & 0x0f));
out[2] = 0x80 + ((x >> 10) & 0x3f); out[2] = (duk_uint8_t) (0x80 + ((x >> 10) & 0x3f));
out[3] = 0xed; out[3] = (duk_uint8_t) (0xed);
out[4] = 0xb0 + ((x >> 6) & 0x0f); out[4] = (duk_uint8_t) (0xb0 + ((x >> 6) & 0x0f));
out[5] = 0x80 + (x & 0x3f); out[5] = (duk_uint8_t) (0x80 + (x & 0x3f));
len = 6; len = 6;
} }
@ -134,11 +137,11 @@ size_t duk_unicode_encode_cesu8(duk_uint32_t x, duk_uint8_t *out) {
} }
/* Decode helper. Return zero on error. */ /* Decode helper. Return zero on error. */
int duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_uint32_t *out_cp) { duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end, duk_codepoint_t *out_cp) {
duk_uint8_t *p; duk_uint8_t *p;
duk_uint32_t res; duk_uint32_t res;
int ch; duk_uint_fast8_t ch;
int n; duk_small_int_t n;
p = *ptr; p = *ptr;
if (p < ptr_start || p >= ptr_end) { if (p < ptr_start || p >= ptr_end) {
@ -150,37 +153,37 @@ int duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *
* This allows full 32-bit code points to be used. * This allows full 32-bit code points to be used.
*/ */
ch = *p++; ch = (duk_uint_fast8_t) (*p++);
if (ch < 0x80) { if (ch < 0x80) {
/* 0xxx xxxx [7 bits] */ /* 0xxx xxxx [7 bits] */
res = ch & 0x7f; res = (duk_uint32_t) (ch & 0x7f);
n = 0; n = 0;
} else if (ch < 0xc0) { } else if (ch < 0xc0) {
/* 10xx xxxx -> invalid */ /* 10xx xxxx -> invalid */
goto fail; goto fail;
} else if (ch < 0xe0) { } else if (ch < 0xe0) {
/* 110x xxxx 10xx xxxx [11 bits] */ /* 110x xxxx 10xx xxxx [11 bits] */
res = ch & 0x1f; res = (duk_uint32_t) (ch & 0x1f);
n = 1; n = 1;
} else if (ch < 0xf0) { } else if (ch < 0xf0) {
/* 1110 xxxx 10xx xxxx 10xx xxxx [16 bits] */ /* 1110 xxxx 10xx xxxx 10xx xxxx [16 bits] */
res = ch & 0x0f; res = (duk_uint32_t) (ch & 0x0f);
n = 2; n = 2;
} else if (ch < 0xf8) { } else if (ch < 0xf8) {
/* 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx [21 bits] */ /* 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx [21 bits] */
res = ch & 0x07; res = (duk_uint32_t) (ch & 0x07);
n = 3; n = 3;
} else if (ch < 0xfc) { } else if (ch < 0xfc) {
/* 1111 10xx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [26 bits] */ /* 1111 10xx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [26 bits] */
res = ch & 0x03; res = (duk_uint32_t) (ch & 0x03);
n = 4; n = 4;
} else if (ch < 0xfe) { } else if (ch < 0xfe) {
/* 1111 110x 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [31 bits] */ /* 1111 110x 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [31 bits] */
res = ch & 0x01; res = (duk_uint32_t) (ch & 0x01);
n = 5; n = 5;
} else if (ch < 0xff) { } else if (ch < 0xff) {
/* 1111 1110 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [36 bits] */ /* 1111 1110 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [36 bits] */
res = 0; res = (duk_uint32_t) (0);
n = 6; n = 6;
} else { } else {
/* 8-byte format could be: /* 8-byte format could be:
@ -203,7 +206,7 @@ int duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *
while (n > 0) { while (n > 0) {
DUK_ASSERT(p >= ptr_start && p < ptr_end); DUK_ASSERT(p >= ptr_start && p < ptr_end);
res = res << 6; res = res << 6;
res += (*p++) & 0x3f; res += (duk_uint32_t) ((*p++) & 0x3f);
n--; n--;
} }
@ -216,37 +219,36 @@ int duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *
} }
/* used by e.g. duk_regexp_executor.c, string built-ins */ /* used by e.g. duk_regexp_executor.c, string built-ins */
duk_uint32_t duk_unicode_xutf8_get_u32_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end) { duk_codepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, duk_uint8_t **ptr, duk_uint8_t *ptr_start, duk_uint8_t *ptr_end) {
duk_uint32_t cp; duk_codepoint_t cp;
if (duk_unicode_xutf8_get_u32(thr, ptr, ptr_start, ptr_end, &cp)) { if (duk_unicode_decode_xutf8(thr, ptr, ptr_start, ptr_end, &cp)) {
return cp; return cp;
} }
DUK_ERROR(thr, DUK_ERR_INTERNAL_ERROR, "utf-8 decode failed"); DUK_ERROR(thr, DUK_ERR_INTERNAL_ERROR, "utf-8 decode failed");
return 0; /* never here */ DUK_UNREACHABLE();
return 0;
} }
/* (extended) utf-8 length without codepoint encoding validation, used /* (extended) utf-8 length without codepoint encoding validation, used
* for string interning (should probably be inlined). * for string interning (should probably be inlined).
*/ */
duk_uint32_t duk_unicode_unvalidated_utf8_length(duk_uint8_t *data, duk_uint32_t blen) { duk_size_t duk_unicode_unvalidated_utf8_length(duk_uint8_t *data, duk_size_t blen) {
duk_uint8_t *p = data; duk_uint8_t *p = data;
duk_uint8_t *p_end = data + blen; duk_uint8_t *p_end = data + blen;
duk_uint32_t clen = 0; duk_size_t clen = 0;
while (p < p_end) { while (p < p_end) {
duk_uint8_t x = *p++; duk_uint8_t x = *p++;
if (x < 0x80) { if (x < 0x80 || x >= 0xc0) {
clen++; /* 10xxxxxx = continuation chars (0x80...0xbf), above
} else if (x >= 0xc0 ) { * and below that initial bytes.
/* 10xxxxxx = continuation chars (0x80...0xbf), above that */
* initial bytes. clen++;
*/ }
clen++; }
}
} return clen;
return clen;
} }
/* /*
@ -257,8 +259,8 @@ duk_uint32_t duk_unicode_unvalidated_utf8_length(duk_uint8_t *data, duk_uint32_t
*/ */
/* Must match src/extract_chars.py, generate_match_table3(). */ /* Must match src/extract_chars.py, generate_match_table3(). */
static int uni_decode_value(duk_bitdecoder_ctx *bd_ctx) { static duk_uint32_t uni_decode_value(duk_bitdecoder_ctx *bd_ctx) {
int t; duk_uint32_t t;
t = duk_bd_decode(bd_ctx, 4); t = duk_bd_decode(bd_ctx, 4);
if (t <= 0x0e) { if (t <= 0x0e) {
@ -273,25 +275,25 @@ static int uni_decode_value(duk_bitdecoder_ctx *bd_ctx) {
return t + 0x0f + 0xfe; return t + 0x0f + 0xfe;
} else { } else {
t = duk_bd_decode(bd_ctx, 24); t = duk_bd_decode(bd_ctx, 24);
return t + 0x0f + 0xfe + 0x1000; return t + 0x0f + 0xfe + 0x1000UL;
} }
} }
static int uni_range_match(duk_uint8_t *unitab, int unilen, int x) { static duk_small_int_t uni_range_match(duk_uint8_t *unitab, duk_size_t unilen, duk_codepoint_t cp) {
duk_bitdecoder_ctx bd_ctx; duk_bitdecoder_ctx bd_ctx;
DUK_MEMSET(&bd_ctx, 0, sizeof(bd_ctx)); DUK_MEMSET(&bd_ctx, 0, sizeof(bd_ctx));
bd_ctx.data = (duk_uint8_t *) unitab; bd_ctx.data = (duk_uint8_t *) unitab;
bd_ctx.length = (duk_size_t) unilen; bd_ctx.length = (duk_size_t) unilen;
int prev_re = 0; duk_codepoint_t prev_re = 0;
for (;;) { for (;;) {
int r1, r2; duk_codepoint_t r1, r2;
r1 = uni_decode_value(&bd_ctx); r1 = (duk_codepoint_t) uni_decode_value(&bd_ctx);
if (r1 == 0) { if (r1 == 0) {
break; break;
} }
r2 = uni_decode_value(&bd_ctx); r2 = (duk_codepoint_t) uni_decode_value(&bd_ctx);
r1 = prev_re + r1; r1 = prev_re + r1;
r2 = r1 + r2; r2 = r1 + r2;
@ -300,7 +302,7 @@ static int uni_range_match(duk_uint8_t *unitab, int unilen, int x) {
/* [r1,r2] is the range */ /* [r1,r2] is the range */
DUK_DDDPRINT("uni_range_match: range=[0x%06x,0x%06x]", r1, r2); DUK_DDDPRINT("uni_range_match: range=[0x%06x,0x%06x]", r1, r2);
if (x >= r1 && x <= r2) { if (cp >= r1 && cp <= r2) {
return 1; return 1;
} }
} }
@ -312,7 +314,7 @@ static int uni_range_match(duk_uint8_t *unitab, int unilen, int x) {
* "WhiteSpace" production check. * "WhiteSpace" production check.
*/ */
int duk_unicode_is_whitespace(int x) { duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp) {
/* /*
* E5 Section 7.2 specifies six characters specifically as * E5 Section 7.2 specifies six characters specifically as
* white space: * white space:
@ -368,24 +370,24 @@ int duk_unicode_is_whitespace(int x) {
* A manual decoder (below) is probably most compact for this. * A manual decoder (below) is probably most compact for this.
*/ */
unsigned char lo; duk_uint_fast8_t lo;
int hi; duk_uint_fast32_t hi;
lo = (unsigned char) (x & 0xff); lo = (duk_uint_fast8_t) (cp & 0xff);
hi = (int) (x >> 8); /* does not fit into an uchar */ hi = (duk_uint_fast32_t) (cp >> 8); /* does not fit into an uchar */
if (hi == 0x0000) { if (hi == 0x0000UL) {
if (lo == 0x09 || lo == 0x0b || lo == 0x0c || if (lo == 0x09 || lo == 0x0b || lo == 0x0c ||
lo == 0x20 || lo == 0xa0) { lo == 0x20 || lo == 0xa0) {
return 1; return 1;
} }
} else if (hi == 0x0020) { } else if (hi == 0x0020UL) {
if (lo <= 0x0a || lo == 0x28 || lo == 0x29 || if (lo <= 0x0a || lo == 0x28 || lo == 0x29 ||
lo == 0x2f || lo == 0x5f) { lo == 0x2f || lo == 0x5f) {
return 1; return 1;
} }
} else if (x == 0x1680 || x == 0x180e || x == 0x3000 || } else if (cp == 0x1680UL || cp == 0x180eUL || cp == 0x3000UL ||
x == 0xfeff) { cp == 0xfeffUL) {
return 1; return 1;
} }
@ -396,7 +398,7 @@ int duk_unicode_is_whitespace(int x) {
* "LineTerminator" production check. * "LineTerminator" production check.
*/ */
int duk_unicode_is_line_terminator(int x) { duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp) {
/* /*
* E5 Section 7.3 * E5 Section 7.3
* *
@ -404,8 +406,8 @@ int duk_unicode_is_line_terminator(int x) {
* into a single line terminator. This must be handled by the caller. * into a single line terminator. This must be handled by the caller.
*/ */
if (x == 0x000a || x == 0x000d || x == 0x2028 || if (cp == 0x000aUL || cp == 0x000dUL || cp == 0x2028UL ||
x == 0x2029) { cp == 0x2029UL) {
return 1; return 1;
} }
@ -416,7 +418,7 @@ int duk_unicode_is_line_terminator(int x) {
* "IdentifierStart" production check. * "IdentifierStart" production check.
*/ */
int duk_unicode_is_identifier_start(int x) { duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp) {
/* /*
* E5 Section 7.6: * E5 Section 7.6:
* *
@ -452,10 +454,10 @@ int duk_unicode_is_identifier_start(int x) {
*/ */
/* ASCII fast path -- quick accept and reject */ /* ASCII fast path -- quick accept and reject */
if (x <= 0x7f) { if (cp <= 0x7fUL) {
if ((x >= 'a' && x <= 'z') || if ((cp >= 'a' && cp <= 'z') ||
(x >= 'A' && x <= 'Z') || (cp >= 'A' && cp <= 'Z') ||
x == '_' || x == '$') { cp == '_' || cp == '$') {
return 1; return 1;
} }
return 0; return 0;
@ -465,16 +467,16 @@ int duk_unicode_is_identifier_start(int x) {
#ifdef DUK_USE_SOURCE_NONBMP #ifdef DUK_USE_SOURCE_NONBMP
if (uni_range_match(duk_unicode_identifier_start_noascii, if (uni_range_match(duk_unicode_identifier_start_noascii,
sizeof(duk_unicode_identifier_start_noascii), (duk_size_t) sizeof(duk_unicode_identifier_start_noascii),
x)) { cp)) {
return 1; return 1;
} }
return 0; return 0;
#else #else
if (x < 0x10000) { if (cp < 0x10000UL) {
if (uni_range_match(duk_unicode_identifier_start_noascii_bmponly, if (uni_range_match(duk_unicode_identifier_start_noascii_bmponly,
sizeof(duk_unicode_identifier_start_noascii_bmponly), sizeof(duk_unicode_identifier_start_noascii_bmponly),
x)) { cp)) {
return 1; return 1;
} }
return 0; return 0;
@ -491,7 +493,7 @@ int duk_unicode_is_identifier_start(int x) {
* "IdentifierPart" production check. * "IdentifierPart" production check.
*/ */
int duk_unicode_is_identifier_part(int x) { duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp) {
/* /*
* E5 Section 7.6: * E5 Section 7.6:
* *
@ -538,11 +540,11 @@ int duk_unicode_is_identifier_part(int x) {
*/ */
/* ASCII fast path -- quick accept and reject */ /* ASCII fast path -- quick accept and reject */
if (x <= 0x7f) { if (cp <= 0x7fUL) {
if ((x >= 'a' && x <= 'z') || if ((cp >= 'a' && cp <= 'z') ||
(x >= 'A' && x <= 'Z') || (cp >= 'A' && cp <= 'Z') ||
(x >= '0' && x <= '9') || (cp >= '0' && cp <= '9') ||
x == '_' || x == '$') { cp == '_' || cp == '$') {
return 1; return 1;
} }
return 0; return 0;
@ -553,21 +555,21 @@ int duk_unicode_is_identifier_part(int x) {
#ifdef DUK_USE_SOURCE_NONBMP #ifdef DUK_USE_SOURCE_NONBMP
if (uni_range_match(duk_unicode_identifier_start_noascii, if (uni_range_match(duk_unicode_identifier_start_noascii,
sizeof(duk_unicode_identifier_start_noascii), sizeof(duk_unicode_identifier_start_noascii),
x) || cp) ||
uni_range_match(duk_unicode_identifier_part_minus_identifier_start_noascii, uni_range_match(duk_unicode_identifier_part_minus_identifier_start_noascii,
sizeof(duk_unicode_identifier_part_minus_identifier_start_noascii), sizeof(duk_unicode_identifier_part_minus_identifier_start_noascii),
x)) { cp)) {
return 1; return 1;
} }
return 0; return 0;
#else #else
if (x < 0x10000) { if (x < 0x10000UL) {
if (uni_range_match(duk_unicode_identifier_start_noascii_bmponly, if (uni_range_match(duk_unicode_identifier_start_noascii_bmponly,
sizeof(duk_unicode_identifier_start_noascii_bmponly), sizeof(duk_unicode_identifier_start_noascii_bmponly),
x) || cp) ||
uni_range_match(duk_unicode_identifier_part_minus_identifier_start_noascii_bmponly, uni_range_match(duk_unicode_identifier_part_minus_identifier_start_noascii_bmponly,
sizeof(duk_unicode_identifier_part_minus_identifier_start_noascii_bmponly), sizeof(duk_unicode_identifier_part_minus_identifier_start_noascii_bmponly),
x)) { cp)) {
return 1; return 1;
} }
return 0; return 0;
@ -600,15 +602,19 @@ int duk_unicode_is_identifier_part(int x) {
* this function. * this function.
*/ */
static int slow_case_conversion(duk_hthread *thr, static duk_codepoint_t slow_case_conversion(duk_hthread *thr,
duk_hbuffer_dynamic *buf, duk_hbuffer_dynamic *buf,
int x, duk_codepoint_t cp,
duk_bitdecoder_ctx *bd_ctx) { duk_bitdecoder_ctx *bd_ctx) {
int skip = 0; duk_small_int_t skip = 0;
int n, t; duk_small_int_t n;
int start_i, start_o, count; duk_small_int_t t;
duk_small_int_t count;
duk_codepoint_t tmp_cp;
duk_codepoint_t start_i;
duk_codepoint_t start_o;
DUK_DDDPRINT("slow case conversion for codepoint: %d", x); DUK_DDDPRINT("slow case conversion for codepoint: %d", (int) cp);
/* range conversion with a "skip" */ /* range conversion with a "skip" */
DUK_DDDPRINT("checking ranges"); DUK_DDDPRINT("checking ranges");
@ -619,19 +625,21 @@ static int slow_case_conversion(duk_hthread *thr,
/* end marker */ /* end marker */
break; break;
} }
DUK_DDDPRINT("skip=%d, n=%d", skip, n); DUK_DDDPRINT("skip=%d, n=%d", (int) skip, (int) n);
while (n--) { while (n--) {
start_i = duk_bd_decode(bd_ctx, 16); start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
start_o = duk_bd_decode(bd_ctx, 16); start_o = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
count = duk_bd_decode(bd_ctx, 7); count = duk_bd_decode(bd_ctx, 7);
DUK_DDDPRINT("range: start_i=%d, start_o=%d, count=%d, skip=%d", DUK_DDDPRINT("range: start_i=%d, start_o=%d, count=%d, skip=%d",
start_i, start_o, count, skip); (int) start_i, (int) start_o, (int) count, (int) skip);
t = x - start_i; tmp_cp = cp - start_i;
if (t >= 0 && t < count * skip && (t % skip) == 0) { if (tmp_cp >= 0 &&
tmp_cp < (duk_codepoint_t) count * (duk_codepoint_t) skip &&
(tmp_cp % (duk_codepoint_t) skip) == 0) {
DUK_DDDPRINT("range matches input codepoint"); DUK_DDDPRINT("range matches input codepoint");
x = start_o + t; cp = start_o + tmp_cp;
goto single; goto single;
} }
} }
@ -639,14 +647,14 @@ static int slow_case_conversion(duk_hthread *thr,
/* 1:1 conversion */ /* 1:1 conversion */
n = duk_bd_decode(bd_ctx, 6); n = duk_bd_decode(bd_ctx, 6);
DUK_DDDPRINT("checking 1:1 conversions (count %d)", n); DUK_DDDPRINT("checking 1:1 conversions (count %d)", (int) n);
while (n--) { while (n--) {
start_i = duk_bd_decode(bd_ctx, 16); start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
start_o = duk_bd_decode(bd_ctx, 16); start_o = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
DUK_DDDPRINT("1:1 conversion %d -> %d", start_i, start_o); DUK_DDDPRINT("1:1 conversion %d -> %d", (int) start_i, (int) start_o);
if (x == start_i) { if (cp == start_i) {
DUK_DDDPRINT("1:1 matches input codepoint"); DUK_DDDPRINT("1:1 matches input codepoint");
x = start_o; cp = start_o;
goto single; goto single;
} }
} }
@ -655,16 +663,16 @@ static int slow_case_conversion(duk_hthread *thr,
n = duk_bd_decode(bd_ctx, 7); n = duk_bd_decode(bd_ctx, 7);
DUK_DDDPRINT("checking 1:n conversions (count %d)", n); DUK_DDDPRINT("checking 1:n conversions (count %d)", n);
while (n--) { while (n--) {
start_i = duk_bd_decode(bd_ctx, 16); start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
t = duk_bd_decode(bd_ctx, 2); t = duk_bd_decode(bd_ctx, 2);
DUK_DDDPRINT("1:n conversion %d -> %d chars", start_i, t); DUK_DDDPRINT("1:n conversion %d -> %d chars", (int) start_i, (int) t);
if (x == start_i) { if (cp == start_i) {
DUK_DDDPRINT("1:n matches input codepoint"); DUK_DDDPRINT("1:n matches input codepoint");
if (buf) { if (buf) {
while (t--) { while (t--) {
int tmp = duk_bd_decode(bd_ctx, 16); tmp_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
DUK_ASSERT(buf != NULL); DUK_ASSERT(buf != NULL);
duk_hbuffer_append_xutf8(thr, buf, tmp); duk_hbuffer_append_xutf8(thr, buf, (duk_uint32_t) tmp_cp); /* FIXME: duk_codepoint_t */
} }
} }
return -1; return -1;
@ -681,9 +689,9 @@ static int slow_case_conversion(duk_hthread *thr,
single: single:
if (buf) { if (buf) {
duk_hbuffer_append_xutf8(thr, buf, x); duk_hbuffer_append_xutf8(thr, buf, cp);
} }
return x; return cp;
} }
/* /*
@ -693,28 +701,30 @@ static int slow_case_conversion(duk_hthread *thr,
* locale/language. * locale/language.
*/ */
static int case_transform_helper(duk_hthread *thr, static duk_signed_codepoint_t case_transform_helper(duk_hthread *thr,
duk_hbuffer_dynamic *buf, duk_hbuffer_dynamic *buf,
int x, duk_signed_codepoint_t cp,
int prev, duk_signed_codepoint_t prev,
int next, duk_signed_codepoint_t next,
int uppercase, duk_small_int_t uppercase,
int language) { duk_small_int_t language) {
duk_bitdecoder_ctx bd_ctx; duk_bitdecoder_ctx bd_ctx;
DUK_ASSERT(cp >= 0);
/* fast path for ASCII */ /* fast path for ASCII */
if (x < 0x80) { if (cp < 0x80UL) {
/* FIXME: context sensitive rules exist for ASCII range too. /* FIXME: context sensitive rules exist for ASCII range too.
* Need to add them here. * Need to add them here.
*/ */
if (uppercase) { if (uppercase) {
if (x >= 'a' && x <= 'z') { if (cp >= 'a' && cp <= 'z') {
x = x - 'a' + 'A'; cp = cp - 'a' + 'A';
} }
} else { } else {
if (x >= 'A' && x <= 'Z') { if (cp >= 'A' && cp <= 'Z') {
x = x - 'A' + 'a'; cp = cp - 'A' + 'a';
} }
} }
goto singlechar; goto singlechar;
@ -727,17 +737,17 @@ static int case_transform_helper(duk_hthread *thr,
/* FIXME: turkish / azeri */ /* FIXME: turkish / azeri */
} else { } else {
/* final sigma context specific rule */ /* final sigma context specific rule */
if (x == 0x03a3 && /* U+03A3 = GREEK CAPITAL LETTER SIGMA */ if (cp == 0x03a3UL && /* U+03A3 = GREEK CAPITAL LETTER SIGMA */
prev >= 0 && /* prev is letter */ prev >= 0 && /* prev is letter */
next < 0) { /* next is not letter */ next < 0) { /* next is not letter */
/* FIXME: fix conditions */ /* FIXME: fix conditions */
x = 0x03c2; cp = 0x03c2UL;
goto singlechar; goto singlechar;
} }
/* FIXME: lithuanian */ /* FIXME: lithuanian */
if (0 /* language == 'lt' */ && if (0 /* language == 'lt' */ &&
x == 0x0307) { /* U+0307 = COMBINING DOT ABOVE */ cp == 0x0307UL) { /* U+0307 = COMBINING DOT ABOVE */
goto nochar; goto nochar;
} }
@ -754,13 +764,13 @@ static int case_transform_helper(duk_hthread *thr,
bd_ctx.data = (duk_uint8_t *) duk_unicode_caseconv_lc; bd_ctx.data = (duk_uint8_t *) duk_unicode_caseconv_lc;
bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_lc); bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_lc);
} }
return slow_case_conversion(thr, buf, x, &bd_ctx); return slow_case_conversion(thr, buf, cp, &bd_ctx);
singlechar: singlechar:
if (buf) { if (buf) {
duk_hbuffer_append_xutf8(thr, buf, x); duk_hbuffer_append_xutf8(thr, buf, cp);
} }
return x; return cp;
nochar: nochar:
return -1; return -1;
@ -770,12 +780,12 @@ static int case_transform_helper(duk_hthread *thr,
* Replace valstack top with case converted version. * Replace valstack top with case converted version.
*/ */
void duk_unicode_case_convert_string(duk_hthread *thr, int uppercase) { void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase) {
duk_context *ctx = (duk_context *) thr; duk_context *ctx = (duk_context *) thr;
duk_hstring *h_input; duk_hstring *h_input;
duk_hbuffer_dynamic *h_buf; duk_hbuffer_dynamic *h_buf;
duk_uint8_t *p, *p_start, *p_end; duk_uint8_t *p, *p_start, *p_end;
int prev, curr, next; duk_signed_codepoint_t prev, curr, next; /* need signed type here */
h_input = duk_require_hstring(ctx, -1); h_input = duk_require_hstring(ctx, -1);
DUK_ASSERT(h_input != NULL); DUK_ASSERT(h_input != NULL);
@ -800,7 +810,7 @@ void duk_unicode_case_convert_string(duk_hthread *thr, int uppercase) {
curr = next; curr = next;
next = -1; next = -1;
if (p < p_end) { if (p < p_end) {
next = (int) duk_unicode_xutf8_get_u32_checked(thr, &p, p_start, p_end); next = (int) duk_unicode_decode_xutf8_checked(thr, &p, p_start, p_end);
} else { } else {
/* end of input and last char has been processed */ /* end of input and last char has been processed */
if (curr < 0) { if (curr < 0) {
@ -834,22 +844,22 @@ void duk_unicode_case_convert_string(duk_hthread *thr, int uppercase) {
* specific rules can apply. Locale specific rules can apply, though. * specific rules can apply. Locale specific rules can apply, though.
*/ */
int duk_unicode_re_canonicalize_char(duk_hthread *thr, int x) { duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp) {
int y; duk_codepoint_t y;
y = case_transform_helper(thr, y = case_transform_helper(thr,
NULL, /* buf */ NULL, /* buf */
x, /* curr char */ cp, /* curr char */
-1, /* prev char */ -1, /* prev char */
-1, /* next char */ -1, /* next char */
1, /* uppercase */ 1, /* uppercase */
0); /* FIXME: language */ 0); /* FIXME: language */
if ((y < 0) || (x >= 0x80 && y < 0x80)) { if ((y < 0) || (cp >= 0x80 && y < 0x80)) {
/* multiple codepoint conversion or non-ASCII mapped to ASCII /* multiple codepoint conversion or non-ASCII mapped to ASCII
* --> leave as is. * --> leave as is.
*/ */
return x; return cp;
} }
return y; return y;
@ -860,7 +870,7 @@ int duk_unicode_re_canonicalize_char(duk_hthread *thr, int x) {
* x < 0 for characters read outside the string. * x < 0 for characters read outside the string.
*/ */
int duk_unicode_re_is_wordchar(int x) { duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t x) {
/* /*
* Note: the description in E5 Section 15.10.2.6 has a typo, it * Note: the description in E5 Section 15.10.2.6 has a typo, it
* contains 'A' twice and lacks 'a'; the intent is [0-9a-zA-Z_]. * contains 'A' twice and lacks 'a'; the intent is [0-9a-zA-Z_].
@ -880,51 +890,51 @@ int duk_unicode_re_is_wordchar(int x) {
/* exposed because lexer needs these too */ /* exposed because lexer needs these too */
duk_uint16_t duk_unicode_re_ranges_digit[2] = { duk_uint16_t duk_unicode_re_ranges_digit[2] = {
(duk_uint16_t) 0x0030, (duk_uint16_t) 0x0039, (duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL,
}; };
duk_uint16_t duk_unicode_re_ranges_white[22] = { duk_uint16_t duk_unicode_re_ranges_white[22] = {
(duk_uint16_t) 0x0009, (duk_uint16_t) 0x000D, (duk_uint16_t) 0x0009UL, (duk_uint16_t) 0x000DUL,
(duk_uint16_t) 0x0020, (duk_uint16_t) 0x0020, (duk_uint16_t) 0x0020UL, (duk_uint16_t) 0x0020UL,
(duk_uint16_t) 0x00A0, (duk_uint16_t) 0x00A0, (duk_uint16_t) 0x00A0UL, (duk_uint16_t) 0x00A0UL,
(duk_uint16_t) 0x1680, (duk_uint16_t) 0x1680, (duk_uint16_t) 0x1680UL, (duk_uint16_t) 0x1680UL,
(duk_uint16_t) 0x180E, (duk_uint16_t) 0x180E, (duk_uint16_t) 0x180EUL, (duk_uint16_t) 0x180EUL,
(duk_uint16_t) 0x2000, (duk_uint16_t) 0x200A, (duk_uint16_t) 0x2000UL, (duk_uint16_t) 0x200AUL,
(duk_uint16_t) 0x2028, (duk_uint16_t) 0x2029, (duk_uint16_t) 0x2028UL, (duk_uint16_t) 0x2029UL,
(duk_uint16_t) 0x202F, (duk_uint16_t) 0x202F, (duk_uint16_t) 0x202FUL, (duk_uint16_t) 0x202FUL,
(duk_uint16_t) 0x205F, (duk_uint16_t) 0x205F, (duk_uint16_t) 0x205FUL, (duk_uint16_t) 0x205FUL,
(duk_uint16_t) 0x3000, (duk_uint16_t) 0x3000, (duk_uint16_t) 0x3000UL, (duk_uint16_t) 0x3000UL,
(duk_uint16_t) 0xFEFF, (duk_uint16_t) 0xFEFF, (duk_uint16_t) 0xFEFFUL, (duk_uint16_t) 0xFEFFUL,
}; };
duk_uint16_t duk_unicode_re_ranges_wordchar[8] = { duk_uint16_t duk_unicode_re_ranges_wordchar[8] = {
(duk_uint16_t) 0x0030, (duk_uint16_t) 0x0039, (duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL,
(duk_uint16_t) 0x0041, (duk_uint16_t) 0x005A, (duk_uint16_t) 0x0041UL, (duk_uint16_t) 0x005AUL,
(duk_uint16_t) 0x005F, (duk_uint16_t) 0x005F, (duk_uint16_t) 0x005FUL, (duk_uint16_t) 0x005FUL,
(duk_uint16_t) 0x0061, (duk_uint16_t) 0x007A, (duk_uint16_t) 0x0061UL, (duk_uint16_t) 0x007AUL,
}; };
duk_uint16_t duk_unicode_re_ranges_not_digit[4] = { duk_uint16_t duk_unicode_re_ranges_not_digit[4] = {
(duk_uint16_t) 0x0000, (duk_uint16_t) 0x002F, (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL,
(duk_uint16_t) 0x003A, (duk_uint16_t) 0xFFFF, (duk_uint16_t) 0x003AUL, (duk_uint16_t) 0xFFFFUL,
}; };
duk_uint16_t duk_unicode_re_ranges_not_white[24] = { duk_uint16_t duk_unicode_re_ranges_not_white[24] = {
(duk_uint16_t) 0x0000, (duk_uint16_t) 0x0008, (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x0008UL,
(duk_uint16_t) 0x000E, (duk_uint16_t) 0x001F, (duk_uint16_t) 0x000EUL, (duk_uint16_t) 0x001FUL,
(duk_uint16_t) 0x0021, (duk_uint16_t) 0x009F, (duk_uint16_t) 0x0021UL, (duk_uint16_t) 0x009FUL,
(duk_uint16_t) 0x00A1, (duk_uint16_t) 0x167F, (duk_uint16_t) 0x00A1UL, (duk_uint16_t) 0x167FUL,
(duk_uint16_t) 0x1681, (duk_uint16_t) 0x180D, (duk_uint16_t) 0x1681UL, (duk_uint16_t) 0x180DUL,
(duk_uint16_t) 0x180F, (duk_uint16_t) 0x1FFF, (duk_uint16_t) 0x180FUL, (duk_uint16_t) 0x1FFFUL,
(duk_uint16_t) 0x200B, (duk_uint16_t) 0x2027, (duk_uint16_t) 0x200BUL, (duk_uint16_t) 0x2027UL,
(duk_uint16_t) 0x202A, (duk_uint16_t) 0x202E, (duk_uint16_t) 0x202AUL, (duk_uint16_t) 0x202EUL,
(duk_uint16_t) 0x2030, (duk_uint16_t) 0x205E, (duk_uint16_t) 0x2030UL, (duk_uint16_t) 0x205EUL,
(duk_uint16_t) 0x2060, (duk_uint16_t) 0x2FFF, (duk_uint16_t) 0x2060UL, (duk_uint16_t) 0x2FFFUL,
(duk_uint16_t) 0x3001, (duk_uint16_t) 0xFEFE, (duk_uint16_t) 0x3001UL, (duk_uint16_t) 0xFEFEUL,
(duk_uint16_t) 0xFF00, (duk_uint16_t) 0xFFFF, (duk_uint16_t) 0xFF00UL, (duk_uint16_t) 0xFFFFUL,
}; };
duk_uint16_t duk_unicode_re_ranges_not_wordchar[10] = { duk_uint16_t duk_unicode_re_ranges_not_wordchar[10] = {
(duk_uint16_t) 0x0000, (duk_uint16_t) 0x002F, (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL,
(duk_uint16_t) 0x003A, (duk_uint16_t) 0x0040, (duk_uint16_t) 0x003AUL, (duk_uint16_t) 0x0040UL,
(duk_uint16_t) 0x005B, (duk_uint16_t) 0x005E, (duk_uint16_t) 0x005BUL, (duk_uint16_t) 0x005EUL,
(duk_uint16_t) 0x0060, (duk_uint16_t) 0x0060, (duk_uint16_t) 0x0060UL, (duk_uint16_t) 0x0060UL,
(duk_uint16_t) 0x007B, (duk_uint16_t) 0xFFFF, (duk_uint16_t) 0x007BUL, (duk_uint16_t) 0xFFFFUL,
}; };
#endif /* DUK_USE_REGEXP_SUPPORT */ #endif /* DUK_USE_REGEXP_SUPPORT */

Loading…
Cancel
Save