Rework unicode support for bufwriter

9 years ago · 0f86d575d2
2 changed files with 56 additions and 22 deletions
--- a/src/duk_unicode.h
+++ b/src/duk_unicode.h
@ -209,6 +209,9 @@ DUK_INTERNAL_DECL duk_uint16_t duk_unicode_re_ranges_not_wordchar[10];
 */

 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp);
+#if defined(DUK_USE_ASSERTIONS)
+DUK_INTERNAL_DECL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp);
+#endif
 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out);
 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out);
 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp);
--- a/src/duk_unicode_support.c
+++ b/src/duk_unicode_support.c
@ -35,6 +35,28 @@ DUK_INTERNAL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp) {
 	}
 }

+#if defined(DUK_USE_ASSERTIONS)
+DUK_INTERNAL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp) {
+	duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
+	if (x < 0x80UL) {
+		/* 7 bits */
+		return 1;
+	} else if (x < 0x800UL) {
+		/* 11 bits */
+		return 2;
+	} else if (x < 0x10000UL) {
+		/* 16 bits */
+		return 3;
+	} else {
+		/* Encoded as surrogate pair, each encoding to 3 bytes for
+		 * 6 bytes total.  Codepoints above U+10FFFF encode as 6 bytes
+		 * too, see duk_unicode_encode_cesu8().
+		  */
+		return 3 + 3;
+	}
+}
+#endif  /* DUK_USE_ASSERTIONS */
+
 DUK_INTERNAL duk_uint8_t duk_unicode_xutf8_markers[7] = {
 	0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe
 };
@ -662,7 +684,7 @@ DUK_INTERNAL duk_small_int_t duk_unicode_is_letter(duk_codepoint_t cp) {

 DUK_LOCAL
 duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
-                                          duk_hbuffer_dynamic *buf,
+                                          duk_bufwriter_ctx *bw,
                                          duk_codepoint_t cp,
                                          duk_bitdecoder_ctx *bd_ctx) {
 	duk_small_int_t skip = 0;
@ -673,6 +695,9 @@ duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
 	duk_codepoint_t start_i;
 	duk_codepoint_t start_o;

+	DUK_UNREF(thr);
+	DUK_ASSERT(bd_ctx != NULL);
+
 	DUK_DDD(DUK_DDDPRINT("slow case conversion for codepoint: %ld", (long) cp));

 	/* range conversion with a "skip" */
@ -728,11 +753,10 @@ duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
 		DUK_DDD(DUK_DDDPRINT("1:n conversion %ld -> %ld chars", (long) start_i, (long) t));
 		if (cp == start_i) {
 			DUK_DDD(DUK_DDDPRINT("1:n matches input codepoint"));
-			if (buf) {
+			if (bw != NULL) {
 				while (t--) {
 					tmp_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
-					DUK_ASSERT(buf != NULL);
-					duk_hbuffer_append_xutf8(thr, buf, (duk_ucodepoint_t) tmp_cp);
+					DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) tmp_cp);
 				}
 			}
 			return -1;
@ -748,8 +772,8 @@ duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
 	/* fall through */

 single:
-	if (buf) {
-		duk_hbuffer_append_xutf8(thr, buf, cp);
+	if (bw != NULL) {
+		DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
 	}
 	return cp;
 }
@ -766,7 +790,7 @@ duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
 */
 DUK_LOCAL
 duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
-                                           duk_hbuffer_dynamic *buf,
+                                           duk_bufwriter_ctx *bw,
                                           duk_codepoint_t cp,
                                           duk_codepoint_t prev,
                                           duk_codepoint_t next,
@ -790,7 +814,11 @@ duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
 				cp = cp - 'A' + 'a';
 			}
 		}
-		goto singlechar;
+
+		if (bw != NULL) {
+			DUK_BW_WRITE_RAW_U8(thr, bw, (duk_uint8_t) cp);
+		}
+		return cp;
 	}

 	/* context and locale specific rules which cannot currently be represented
@ -831,11 +859,11 @@ duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
 		bd_ctx.data = (duk_uint8_t *) duk_unicode_caseconv_lc;
 		bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_lc);
 	}
-	return duk__slow_case_conversion(thr, buf, cp, &bd_ctx);
+	return duk__slow_case_conversion(thr, bw, cp, &bd_ctx);

 singlechar:
-	if (buf) {
-		duk_hbuffer_append_xutf8(thr, buf, cp);
+	if (bw != NULL) {
+		DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
 	}
 	return cp;

@ -853,20 +881,16 @@ duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
 DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_int_t uppercase) {
 	duk_context *ctx = (duk_context *) thr;
 	duk_hstring *h_input;
-	duk_hbuffer_dynamic *h_buf;
+	duk_bufwriter_ctx bw_alloc;
+	duk_bufwriter_ctx *bw;
 	const duk_uint8_t *p, *p_start, *p_end;
 	duk_codepoint_t prev, curr, next;

 	h_input = duk_require_hstring(ctx, -1);
 	DUK_ASSERT(h_input != NULL);

-	/* XXX: should init the buffer with a spare of at least h_input->blen
-	 * to avoid unnecessary growth steps.
-	 */
-	duk_push_dynamic_buffer(ctx, 0);
-	h_buf = (duk_hbuffer_dynamic *) duk_get_hbuffer(ctx, -1);
-	DUK_ASSERT(h_buf != NULL);
-	DUK_ASSERT(DUK_HBUFFER_HAS_DYNAMIC(h_buf));
+	bw = &bw_alloc;
+	DUK_BW_INIT_PUSHBUF(thr, bw, DUK_HSTRING_GET_BYTELEN(h_input));

 	/* [ ... input buffer ] */

@ -892,9 +916,15 @@ DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_in

 		/* on first round, skip */
 		if (curr >= 0) {
-			/* may generate any number of output codepoints */
+			/* XXX: could add a fast path to process chunks of input codepoints,
+			 * but relative benefit would be quite small.
+			 */
+
+			/* Ensure space for maximum multi-character result; estimate is overkill. */
+			DUK_BW_ENSURE(thr, bw, 8 * DUK_UNICODE_MAX_XUTF8_LENGTH);
+
 			duk__case_transform_helper(thr,
-			                           h_buf,
+			                           bw,
 			                           (duk_codepoint_t) curr,
 			                           prev,
 			                           next,
@ -902,6 +932,7 @@ DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_small_in
 		}
 	}

+	DUK_BW_COMPACT(thr, bw);
 	duk_to_string(ctx, -1);  /* invalidates h_buf pointer */
 	duk_remove(ctx, -2);
 }
@ -919,7 +950,7 @@ DUK_INTERNAL duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr,
 	duk_codepoint_t y;

 	y = duk__case_transform_helper(thr,
-	                               NULL,    /* buf */
+	                               NULL,    /* NULL is allowed, no output */
 	                               cp,      /* curr char */
 	                               -1,      /* prev char */
 	                               -1,      /* next char */