Browse Source

Rework unicode support for bufwriter

pull/264/head
Sami Vaarala 9 years ago
parent
commit
0f86d575d2
  1. 3
      src/duk_unicode.h
  2. 75
      src/duk_unicode_support.c

3
src/duk_unicode.h

@ -209,6 +209,9 @@ DUK_INTERNAL_DECL duk_uint16_t duk_unicode_re_ranges_not_wordchar[10];
*/ */
DUK_INTERNAL_DECL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp); DUK_INTERNAL_DECL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp);
#if defined(DUK_USE_ASSERTIONS)
DUK_INTERNAL_DECL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp);
#endif
DUK_INTERNAL_DECL duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out); DUK_INTERNAL_DECL duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out);
DUK_INTERNAL_DECL duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out); DUK_INTERNAL_DECL duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out);
DUK_INTERNAL_DECL duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp); DUK_INTERNAL_DECL duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp);

75
src/duk_unicode_support.c

@ -35,6 +35,28 @@ DUK_INTERNAL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp) {
} }
} }
#if defined(DUK_USE_ASSERTIONS)
DUK_INTERNAL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp) {
duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
if (x < 0x80UL) {
/* 7 bits */
return 1;
} else if (x < 0x800UL) {
/* 11 bits */
return 2;
} else if (x < 0x10000UL) {
/* 16 bits */
return 3;
} else {
/* Encoded as surrogate pair, each encoding to 3 bytes for
* 6 bytes total. Codepoints above U+10FFFF encode as 6 bytes
* too, see duk_unicode_encode_cesu8().
*/
return 3 + 3;
}
}
#endif /* DUK_USE_ASSERTIONS */
DUK_INTERNAL duk_uint8_t duk_unicode_xutf8_markers[7] = { DUK_INTERNAL duk_uint8_t duk_unicode_xutf8_markers[7] = {
0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe
}; };
@ -662,7 +684,7 @@ DUK_INTERNAL duk_small_int_t duk_unicode_is_letter(duk_codepoint_t cp) {
DUK_LOCAL DUK_LOCAL
duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr, duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
duk_hbuffer_dynamic *buf, duk_bufwriter_ctx *bw,
duk_codepoint_t cp, duk_codepoint_t cp,
duk_bitdecoder_ctx *bd_ctx) { duk_bitdecoder_ctx *bd_ctx) {
duk_small_int_t skip = 0; duk_small_int_t skip = 0;
@ -673,6 +695,9 @@ duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
duk_codepoint_t start_i; duk_codepoint_t start_i;
duk_codepoint_t start_o; duk_codepoint_t start_o;
DUK_UNREF(thr);
DUK_ASSERT(bd_ctx != NULL);
DUK_DDD(DUK_DDDPRINT("slow case conversion for codepoint: %ld", (long) cp)); DUK_DDD(DUK_DDDPRINT("slow case conversion for codepoint: %ld", (long) cp));
/* range conversion with a "skip" */ /* range conversion with a "skip" */
@ -728,11 +753,10 @@ duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
DUK_DDD(DUK_DDDPRINT("1:n conversion %ld -> %ld chars", (long) start_i, (long) t)); DUK_DDD(DUK_DDDPRINT("1:n conversion %ld -> %ld chars", (long) start_i, (long) t));
if (cp == start_i) { if (cp == start_i) {
DUK_DDD(DUK_DDDPRINT("1:n matches input codepoint")); DUK_DDD(DUK_DDDPRINT("1:n matches input codepoint"));
if (buf) { if (bw != NULL) {
while (t--) { while (t--) {
tmp_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16); tmp_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
DUK_ASSERT(buf != NULL); DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) tmp_cp);
duk_hbuffer_append_xutf8(thr, buf, (duk_ucodepoint_t) tmp_cp);
} }
} }
return -1; return -1;
@ -748,8 +772,8 @@ duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
/* fall through */ /* fall through */
single: single:
if (buf) { if (bw != NULL) {
duk_hbuffer_append_xutf8(thr, buf, cp); DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
} }
return cp; return cp;
} }
@ -766,7 +790,7 @@ duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
*/ */
DUK_LOCAL DUK_LOCAL
duk_codepoint_t duk__case_transform_helper(duk_hthread *thr, duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
duk_hbuffer_dynamic *buf, duk_bufwriter_ctx *bw,
duk_codepoint_t cp, duk_codepoint_t cp,
duk_codepoint_t prev, duk_codepoint_t prev,
duk_codepoint_t next, duk_codepoint_t next,
@ -790,7 +814,11 @@ duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
cp = cp - 'A' + 'a'; cp = cp - 'A' + 'a';
} }
} }
goto singlechar;
if (bw != NULL) {
DUK_BW_WRITE_RAW_U8(thr, bw, (duk_uint8_t) cp);
}
return cp;
} }
/* context and locale specific rules which cannot currently be represented /* context and locale specific rules which cannot currently be represented
@ -831,11 +859,11 @@ duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
bd_ctx.data = (duk_uint8_t *) duk_unicode_caseconv_lc; bd_ctx.data = (duk_uint8_t *) duk_unicode_caseconv_lc;
bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_lc); bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_lc);
} }
return duk__slow_case_conversion(thr, buf, cp, &bd_ctx); return duk__slow_case_conversion(thr, bw, cp, &bd_ctx);
singlechar: singlechar:
if (buf) { if (bw != NULL) {
duk_hbuffer_append_xutf8(thr, buf, cp); DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
} }
return cp; return cp;
@ -853,20 +881,16 @@ duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase) { DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase) {
duk_context *ctx = (duk_context *) thr; duk_context *ctx = (duk_context *) thr;
duk_hstring *h_input; duk_hstring *h_input;
duk_hbuffer_dynamic *h_buf; duk_bufwriter_ctx bw_alloc;
duk_bufwriter_ctx *bw;
const duk_uint8_t *p, *p_start, *p_end; const duk_uint8_t *p, *p_start, *p_end;
duk_codepoint_t prev, curr, next; duk_codepoint_t prev, curr, next;
h_input = duk_require_hstring(ctx, -1); h_input = duk_require_hstring(ctx, -1);
DUK_ASSERT(h_input != NULL); DUK_ASSERT(h_input != NULL);
/* XXX: should init the buffer with a spare of at least h_input->blen bw = &bw_alloc;
* to avoid unnecessary growth steps. DUK_BW_INIT_PUSHBUF(thr, bw, DUK_HSTRING_GET_BYTELEN(h_input));
*/
duk_push_dynamic_buffer(ctx, 0);
h_buf = (duk_hbuffer_dynamic *) duk_get_hbuffer(ctx, -1);
DUK_ASSERT(h_buf != NULL);
DUK_ASSERT(DUK_HBUFFER_HAS_DYNAMIC(h_buf));
/* [ ... input buffer ] */ /* [ ... input buffer ] */
@ -892,9 +916,15 @@ DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_in
/* on first round, skip */ /* on first round, skip */
if (curr >= 0) { if (curr >= 0) {
/* may generate any number of output codepoints */ /* XXX: could add a fast path to process chunks of input codepoints,
* but relative benefit would be quite small.
*/
/* Ensure space for maximum multi-character result; estimate is overkill. */
DUK_BW_ENSURE(thr, bw, 8 * DUK_UNICODE_MAX_XUTF8_LENGTH);
duk__case_transform_helper(thr, duk__case_transform_helper(thr,
h_buf, bw,
(duk_codepoint_t) curr, (duk_codepoint_t) curr,
prev, prev,
next, next,
@ -902,6 +932,7 @@ DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_in
} }
} }
DUK_BW_COMPACT(thr, bw);
duk_to_string(ctx, -1); /* invalidates h_buf pointer */ duk_to_string(ctx, -1); /* invalidates h_buf pointer */
duk_remove(ctx, -2); duk_remove(ctx, -2);
} }
@ -919,7 +950,7 @@ DUK_INTERNAL duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr,
duk_codepoint_t y; duk_codepoint_t y;
y = duk__case_transform_helper(thr, y = duk__case_transform_helper(thr,
NULL, /* buf */ NULL, /* NULL is allowed, no output */
cp, /* curr char */ cp, /* curr char */
-1, /* prev char */ -1, /* prev char */
-1, /* next char */ -1, /* next char */

Loading…
Cancel
Save