mirror of https://github.com/svaarala/duktape.git
Sami Vaarala
12 years ago
3 changed files with 1006 additions and 0 deletions
@@ -0,0 +1,60 @@
/*
 *  Unicode helpers
 */

#ifndef __DUK_UNICODE_H
#define __DUK_UNICODE_H 1

#include "duk_features.h"

#define  DUK_UNICODE_MAX_XUTF8_LENGTH  7   /* up to 36 bit codepoints */
#define  DUK_UNICODE_MAX_CESU8_LENGTH  6   /* all codepoints up to U+10FFFF */

#define  DUK_UNICODE_CP_ZWNJ  0x200c  /* zero-width non-joiner */
#define  DUK_UNICODE_CP_ZWJ   0x200d  /* zero-width joiner */

#ifdef DUK_USE_SOURCE_NONBMP
#include "duk_unicode_ids_noa.h"
#else
#include "duk_unicode_ids_noa_bmpo.h"
#endif

#ifdef DUK_USE_SOURCE_NONBMP
#include "duk_unicode_idp_m_ids_noa.h"
#else
#include "duk_unicode_idp_m_ids_noa_bmpo.h"
#endif

#include "duk_unicode_caseconv.h"

/*
 *  Extern
 */

/* duk_unicode_support.c */
extern duk_u8 duk_unicode_xutf8_markers[7];
extern duk_u16 duk_unicode_re_ranges_digit[2];
extern duk_u16 duk_unicode_re_ranges_white[22];
extern duk_u16 duk_unicode_re_ranges_wordchar[8];
extern duk_u16 duk_unicode_re_ranges_not_digit[4];
extern duk_u16 duk_unicode_re_ranges_not_white[24];
extern duk_u16 duk_unicode_re_ranges_not_wordchar[10];

/*
 *  Prototypes
 */

int duk_unicode_get_xutf8_length(duk_u32 x);
size_t duk_unicode_encode_xutf8(duk_u32 x, duk_u8 *out);
size_t duk_unicode_encode_cesu8(duk_u32 x, duk_u8 *out);
duk_u32 duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end);
int duk_unicode_is_whitespace(int x);
int duk_unicode_is_line_terminator(int x);
int duk_unicode_is_identifier_start(int x);
int duk_unicode_is_identifier_part(int x);
void duk_unicode_case_convert_string(duk_hthread *thr, int uppercase);
int duk_unicode_re_canonicalize_char(duk_hthread *thr, int x);
int duk_unicode_re_is_wordchar(int x);

#endif  /* __DUK_UNICODE_H */

@ -0,0 +1,899 @@ |
|||
/*
|
|||
* Various Unicode help functions for character classification predicates, |
|||
* case conversion, decoding, etc. |
|||
*/ |
|||
|
|||
#include "duk_internal.h" |
|||
|
|||
/*
|
|||
* XUTF-8 and CESU-8 encoding/decoding |
|||
*/ |
|||
|
|||
int duk_unicode_get_xutf8_length(duk_u32 x) { |
|||
if (x < 0x80) { |
|||
/* 7 bits */ |
|||
return 1; |
|||
} else if (x < 0x800) { |
|||
/* 11 bits */ |
|||
return 2; |
|||
} else if (x < 0x10000) { |
|||
/* 16 bits */ |
|||
return 3; |
|||
} else if (x < 0x200000) { |
|||
/* 21 bits */ |
|||
return 4; |
|||
} else if (x < 0x4000000) { |
|||
/* 26 bits */ |
|||
return 5; |
|||
} else if (x < (duk_u32) 0x80000000L) { |
|||
/* 31 bits */ |
|||
return 6; |
|||
} else { |
|||
/* 36 bits */ |
|||
return 7; |
|||
} |
|||
} |
|||
|
|||
duk_u8 duk_unicode_xutf8_markers[7] = { |
|||
0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe |
|||
}; |
|||
|
|||
/* Encode to extended UTF-8; 'out' must have space for at least
|
|||
* DUK_UNICODE_MAX_XUTF8_LENGTH bytes. Allows encoding of any |
|||
* 32-bit (unsigned) codepoint. |
|||
*/ |
|||
size_t duk_unicode_encode_xutf8(duk_u32 x, duk_u8 *out) { |
|||
size_t len; |
|||
duk_u8 marker; |
|||
size_t i; |
|||
|
|||
len = duk_unicode_get_xutf8_length(x); |
|||
DUK_ASSERT(len > 0); |
|||
|
|||
marker = duk_unicode_xutf8_markers[len - 1]; /* 64-bit OK because always >= 0 */ |
|||
|
|||
i = len; |
|||
DUK_ASSERT(i > 0); |
|||
do { |
|||
i--; |
|||
if (i > 0) { |
|||
out[i] = 0x80 + (x & 0x3f); |
|||
x >>= 6; |
|||
} else { |
|||
/* Note: masking of 'x' is not necessary because of
|
|||
* range check and shifting -> no bits overlapping |
|||
* the marker should be set. |
|||
*/ |
|||
out[0] = marker + x; |
|||
} |
|||
} while(i > 0); |
|||
|
|||
return len; |
|||
} |
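
/* Worked example (illustrative note, not part of the original source):
 * the Euro sign U+20AC needs 16 bits, so duk_unicode_get_xutf8_length()
 * returns 3 and the marker byte is 0xe0.  duk_unicode_encode_xutf8(0x20ac, out)
 * then produces:
 *
 *   out[2] = 0x80 + (0x20ac & 0x3f)         = 0xac
 *   out[1] = 0x80 + ((0x20ac >> 6) & 0x3f)  = 0x82
 *   out[0] = 0xe0 + (0x20ac >> 12)          = 0xe2
 *
 * i.e. E2 82 AC; for codepoints representable in standard UTF-8 the XUTF-8
 * byte sequence coincides with plain UTF-8.
 */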

/* Encode to CESU-8; 'out' must have space for at least
 * DUK_UNICODE_MAX_CESU8_LENGTH bytes; codepoints above U+10FFFF
 * will encode to garbage but won't overwrite the output buffer.
 */
size_t duk_unicode_encode_cesu8(duk_u32 x, duk_u8 *out) {
	size_t len;

	if (x < 0x80) {
		out[0] = x;
		len = 1;
	} else if (x < 0x800) {
		out[0] = 0xc0 + ((x >> 6) & 0x1f);
		out[1] = 0x80 + (x & 0x3f);
		len = 2;
	} else if (x < 0x10000) {
		/* surrogate pairs get encoded here */
		out[0] = 0xe0 + ((x >> 12) & 0x0f);
		out[1] = 0x80 + ((x >> 6) & 0x3f);
		out[2] = 0x80 + (x & 0x3f);
		len = 3;
	} else {
		/*
		 *  Unicode codepoints above U+FFFF are encoded as surrogate
		 *  pairs here.  This ensures that all CESU-8 codepoints are
		 *  16-bit values as expected in Ecmascript.  The surrogate
		 *  pairs always get a 3-byte encoding (each) in CESU-8.
		 *  See: http://en.wikipedia.org/wiki/Surrogate_pair
		 *
		 *  20-bit codepoint, 10 bits (A and B) per surrogate pair:
		 *
		 *    x   = 0b00000000 0000AAAA AAAAAABB BBBBBBBB
		 *    sp1 = 0b110110AA AAAAAAAA  (0xd800 + ((x >> 10) & 0x3ff))
		 *    sp2 = 0b110111BB BBBBBBBB  (0xdc00 + (x & 0x3ff))
		 *
		 *  Encoded into CESU-8:
		 *
		 *    sp1 -> 0b11101101  (0xe0 + ((sp1 >> 12) & 0x0f))
		 *        -> 0b1010AAAA  (0x80 + ((sp1 >> 6) & 0x3f))
		 *        -> 0b10AAAAAA  (0x80 + (sp1 & 0x3f))
		 *    sp2 -> 0b11101101  (0xe0 + ((sp2 >> 12) & 0x0f))
		 *        -> 0b1011BBBB  (0x80 + ((sp2 >> 6) & 0x3f))
		 *        -> 0b10BBBBBB  (0x80 + (sp2 & 0x3f))
		 *
		 *  Note that 0x10000 must be subtracted first.  The code below
		 *  avoids the sp1, sp2 temporaries which saves around 20 bytes
		 *  of code.
		 */

		x -= 0x10000;

		out[0] = 0xed;
		out[1] = 0xa0 + ((x >> 16) & 0x0f);
		out[2] = 0x80 + ((x >> 10) & 0x3f);
		out[3] = 0xed;
		out[4] = 0xb0 + ((x >> 6) & 0x0f);
		out[5] = 0x80 + (x & 0x3f);
		len = 6;
	}

	return len;
}
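
/* Worked example (illustrative note, not part of the original source):
 * for U+1F4A9, x - 0x10000 = 0xf4a9, whose high 10 bits are 0x03d and low
 * 10 bits 0x0a9, i.e. the surrogate pair U+D83D / U+DCA9.  Each surrogate
 * gets a 3-byte encoding, so the 6-byte CESU-8 output is ED A0 BD ED B2 A9.
 */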

/* used by e.g. duk_regexp_executor.c, string built-ins */
duk_u32 duk_unicode_xutf8_get_u32(duk_hthread *thr, duk_u8 **ptr, duk_u8 *ptr_start, duk_u8 *ptr_end) {
	duk_u8 *p;
	int res;
	int ch;
	int n;

	p = *ptr;
	if (p < ptr_start || p >= ptr_end) {
		goto fail;
	}

	/*
	 *  UTF-8 decoder which accepts longer than standard byte sequences.
	 *  This allows full 32-bit code points to be used.
	 */

	ch = *p++;
	if (ch < 0x80) {
		/* 0xxx xxxx   [7 bits] */
		res = ch & 0x7f;
		n = 0;
	} else if (ch < 0xc0) {
		/* 10xx xxxx -> invalid */
		goto fail;
	} else if (ch < 0xe0) {
		/* 110x xxxx   10xx xxxx   [11 bits] */
		res = ch & 0x1f;
		n = 1;
	} else if (ch < 0xf0) {
		/* 1110 xxxx   10xx xxxx   10xx xxxx   [16 bits] */
		res = ch & 0x0f;
		n = 2;
	} else if (ch < 0xf8) {
		/* 1111 0xxx   10xx xxxx   10xx xxxx   10xx xxxx   [21 bits] */
		res = ch & 0x07;
		n = 3;
	} else if (ch < 0xfc) {
		/* 1111 10xx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [26 bits] */
		res = ch & 0x03;
		n = 4;
	} else if (ch < 0xfe) {
		/* 1111 110x   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [31 bits] */
		res = ch & 0x01;
		n = 5;
	} else if (ch < 0xff) {
		/* 1111 1110   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [36 bits] */
		res = 0;
		n = 6;
	} else {
		/* 8-byte format could be:
		 *
		 *   1111 1111   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [41 bits]
		 *
		 * However, this format would not have a zero bit following the
		 * leading one bits and would not allow 0xFF to be used as an
		 * "invalid xutf-8" marker for internal keys.  Further, 8-byte
		 * encodings (up to 41 bit code points) are not currently needed.
		 */
		goto fail;
	}

	DUK_ASSERT(p >= ptr_start);  /* verified at beginning */
	if (p + n > ptr_end) {
		/* check pointer at end */
		goto fail;
	}

	while (n > 0) {
		DUK_ASSERT(p >= ptr_start && p < ptr_end);
		res = res << 6;
		res += (*p++) & 0x3f;
		n--;
	}

	*ptr = p;
	return res;

 fail:
	DUK_ERROR(thr, DUK_ERR_INTERNAL_ERROR, "utf-8 decode failed");
	return 0;  /* never here */
}
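
/* Decoding example (illustrative note, not part of the original source):
 * for the bytes E2 82 AC the initial byte 0xe2 selects the 3-byte form
 * (res = 0x02, n = 2) and each continuation byte contributes 6 bits:
 * (((0x02 << 6) + 0x02) << 6) + 0x2c = 0x20ac, i.e. U+20AC.  On malformed
 * input, or if the sequence would run past ptr_end, the function does not
 * return but throws through DUK_ERROR().
 */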

/*
 *  Unicode range matcher
 *
 *  Matches a codepoint against a packed bitstream of character ranges.
 *  Used for slow path Unicode matching.
 */

/* Must match src/extract_chars.py, generate_match_table3(). */
static int uni_decode_value(duk_bitdecoder_ctx *bd_ctx) {
	int t;

	t = duk_bd_decode(bd_ctx, 4);
	if (t <= 0x0e) {
		return t;
	}
	t = duk_bd_decode(bd_ctx, 8);
	if (t <= 0xfd) {
		return t + 0x0f;
	}
	if (t == 0xfe) {
		t = duk_bd_decode(bd_ctx, 12);
		return t + 0x0f + 0xfe;
	} else {
		t = duk_bd_decode(bd_ctx, 24);
		return t + 0x0f + 0xfe + 0x1000;
	}
}
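
/* The decoder above implies the following variable width encoding for range
 * values (bit widths read from the code; src/extract_chars.py is the
 * authoritative definition):
 *
 *   0x00 ... 0x0e       4 bits
 *   0x0f ... 0x10c      4 + 8 bits
 *   0x10d ... 0x110c    4 + 8 + 12 bits
 *   larger values       4 + 8 + 24 bits
 */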

static int uni_range_match(char *unitab, int unilen, int x) {
	duk_bitdecoder_ctx bd_ctx;
	int prev_re = 0;

	bd_ctx.data = (duk_u8 *) unitab;
	bd_ctx.offset = (duk_u32) 0;
	bd_ctx.length = (duk_u32) unilen;
	bd_ctx.currval = (duk_u32) 0;
	bd_ctx.currbits = (duk_u32) 0;

	for (;;) {
		int r1, r2;
		r1 = uni_decode_value(&bd_ctx);
		if (r1 == 0) {
			break;
		}
		r2 = uni_decode_value(&bd_ctx);

		r1 = prev_re + r1;
		r2 = r1 + r2;
		prev_re = r2;

		/* [r1,r2] is the range */

		DUK_DDDPRINT("uni_range_match: range=[0x%06x,0x%06x]", r1, r2);
		if (x >= r1 && x <= r2) {
			return 1;
		}
	}

	return 0;
}
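
/* The stream stores each range as a delta from the end of the previous
 * range: a decoded pair (r1, r2) denotes [prev_end + r1, prev_end + r1 + r2],
 * and a first value of 0 terminates the stream.  For instance (illustrative)
 * the pairs (5, 2) and (3, 4) decode to the ranges [5,7] and [10,14].
 */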

/*
 *  "WhiteSpace" production check.
 */

int duk_unicode_is_whitespace(int x) {
	/*
	 *  E5 Section 7.2 specifies six characters specifically as
	 *  white space:
	 *
	 *    0009;<control>;Cc;0;S;;;;;N;CHARACTER TABULATION;;;;
	 *    000B;<control>;Cc;0;S;;;;;N;LINE TABULATION;;;;
	 *    000C;<control>;Cc;0;WS;;;;;N;FORM FEED (FF);;;;
	 *    0020;SPACE;Zs;0;WS;;;;;N;;;;;
	 *    00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
	 *    FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;;
	 *
	 *  It also specifies any Unicode category 'Z' characters as white
	 *  space.  These can be extracted with the "src/extract_chars.py" script,
	 *  see src/SConscript t_uni_ws target.
	 *
	 *  Current result (built as WhiteSpace-Z.txt).
	 *
	 *  RAW OUTPUT:
	 *  ===========
	 *  0020;SPACE;Zs;0;WS;;;;;N;;;;;
	 *  00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
	 *  1680;OGHAM SPACE MARK;Zs;0;WS;;;;;N;;;;;
	 *  180E;MONGOLIAN VOWEL SEPARATOR;Zs;0;WS;;;;;N;;;;;
	 *  2000;EN QUAD;Zs;0;WS;2002;;;;N;;;;;
	 *  2001;EM QUAD;Zs;0;WS;2003;;;;N;;;;;
	 *  2002;EN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	 *  2003;EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	 *  2004;THREE-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	 *  2005;FOUR-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	 *  2006;SIX-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	 *  2007;FIGURE SPACE;Zs;0;WS;<noBreak> 0020;;;;N;;;;;
	 *  2008;PUNCTUATION SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	 *  2009;THIN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	 *  200A;HAIR SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	 *  2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;;
	 *  2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;;
	 *  202F;NARROW NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;;;;;
	 *  205F;MEDIUM MATHEMATICAL SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
	 *  3000;IDEOGRAPHIC SPACE;Zs;0;WS;<wide> 0020;;;;N;;;;;
	 *
	 *  RANGES:
	 *  =======
	 *  0x0020
	 *  0x00a0
	 *  0x1680
	 *  0x180e
	 *  0x2000 ... 0x200a
	 *  0x2028 ... 0x2029
	 *  0x202f
	 *  0x205f
	 *  0x3000
	 *
	 *  A manual decoder (below) is probably most compact for this.
	 */

	unsigned char lo;
	int hi;

	lo = (unsigned char) (x & 0xff);
	hi = (int) (x >> 8);  /* does not fit into an uchar */

	if (hi == 0x0000) {
		if (lo == 0x09 || lo == 0x0b || lo == 0x0c ||
		    lo == 0x20 || lo == 0xa0) {
			return 1;
		}
	} else if (hi == 0x0020) {
		if (lo <= 0x0a || lo == 0x28 || lo == 0x29 ||
		    lo == 0x2f || lo == 0x5f) {
			return 1;
		}
	} else if (x == 0x1680 || x == 0x180e || x == 0x3000 ||
	           x == 0xfeff) {
		return 1;
	}

	return 0;
}
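
/* Quick examples (illustrative note): duk_unicode_is_whitespace(0x0009) and
 * duk_unicode_is_whitespace(0x3000) return 1, while 0x200b (ZERO WIDTH
 * SPACE, category Cf) returns 0 because it is neither in the explicit E5
 * list nor in a 'Z' category.
 */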

/*
 *  "LineTerminator" production check.
 */

int duk_unicode_is_line_terminator(int x) {
	/*
	 *  E5 Section 7.3
	 *
	 *  A LineTerminatorSequence essentially merges <CR> <LF> sequences
	 *  into a single line terminator.  This must be handled by the caller.
	 */

	if (x == 0x000a || x == 0x000d || x == 0x2028 ||
	    x == 0x2029) {
		return 1;
	}

	return 0;
}

/*
 *  "IdentifierStart" production check.
 */

int duk_unicode_is_identifier_start(int x) {
	/*
	 *  E5 Section 7.6:
	 *
	 *    IdentifierStart:
	 *      UnicodeLetter
	 *      $
	 *      _
	 *      \ UnicodeEscapeSequence
	 *
	 *  The IdentifierStart production has one multi-character production:
	 *
	 *    \ UnicodeEscapeSequence
	 *
	 *  The '\' character is -not- matched by this function.  Rather, the caller
	 *  should decode the escape and then call this function to check whether the
	 *  decoded character is acceptable (see discussion in E5 Section 7.6).
	 *
	 *  The "UnicodeLetter" alternative of the production allows letters
	 *  from various Unicode categories.  These can be extracted with the
	 *  "src/extract_chars.py" script, see src/SConscript t_uni_idstart* targets.
	 *
	 *  Because the result has hundreds of Unicode codepoint ranges, matching
	 *  for any values >= 0x80 is done using a very slow range-by-range scan
	 *  and a packed range format.
	 *
	 *  The ASCII portion (codepoints 0x00 ... 0x7f) is fast-pathed below because
	 *  it matters the most.  The ASCII related ranges of IdentifierStart are:
	 *
	 *    0x0041 ... 0x005a     ['A' ... 'Z']
	 *    0x0061 ... 0x007a     ['a' ... 'z']
	 *    0x0024                ['$']
	 *    0x005f                ['_']
	 */

	/* ASCII fast path -- quick accept and reject */
	if (x <= 0x7f) {
		if ((x >= 'a' && x <= 'z') ||
		    (x >= 'A' && x <= 'Z') ||
		    x == '_' || x == '$') {
			return 1;
		}
		return 0;
	}

	/* Non-ASCII slow path (range-by-range linear comparison), very slow */

#ifdef DUK_USE_SOURCE_NONBMP
	if (uni_range_match(duk_unicode_identifier_start_noascii,
	                    sizeof(duk_unicode_identifier_start_noascii),
	                    x)) {
		return 1;
	}
	return 0;
#else
	if (x < 0x10000) {
		if (uni_range_match(duk_unicode_identifier_start_noascii_bmponly,
		                    sizeof(duk_unicode_identifier_start_noascii_bmponly),
		                    x)) {
			return 1;
		}
		return 0;
	} else {
		/* Without explicit non-BMP support, assume non-BMP characters
		 * are always accepted as identifier characters.
		 */
		return 1;
	}
#endif
}
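
/* Examples (illustrative note): '$' and 'A' are accepted by the ASCII fast
 * path, a digit like '0' is rejected (digits are IdentifierPart but not
 * IdentifierStart), and a non-ASCII letter such as U+00E9 (LATIN SMALL
 * LETTER E WITH ACUTE, category Ll) should be accepted through the slow
 * path range match.
 */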

/*
 *  "IdentifierPart" production check.
 */

int duk_unicode_is_identifier_part(int x) {
	/*
	 *  E5 Section 7.6:
	 *
	 *    IdentifierPart:
	 *      IdentifierStart
	 *      UnicodeCombiningMark
	 *      UnicodeDigit
	 *      UnicodeConnectorPunctuation
	 *      <ZWNJ>  [U+200C]
	 *      <ZWJ>   [U+200D]
	 *
	 *  The IdentifierPart production has one multi-character production
	 *  as part of its IdentifierStart alternative.  The '\' character
	 *  of an escape sequence is not matched here, see discussion in
	 *  duk_unicode_is_identifier_start().
	 *
	 *  To match non-ASCII characters (codepoints >= 0x80), a very slow
	 *  linear range-by-range scan is used.  The codepoint is first compared
	 *  to the IdentifierStart ranges, and if it doesn't match, then to a
	 *  set consisting of code points in IdentifierPart but not in
	 *  IdentifierStart.  This is done to keep the unicode range data small,
	 *  at the expense of speed.
	 *
	 *  The ASCII fast path consists of:
	 *
	 *    0x0030 ... 0x0039     ['0' ... '9', UnicodeDigit]
	 *    0x0041 ... 0x005a     ['A' ... 'Z', IdentifierStart]
	 *    0x0061 ... 0x007a     ['a' ... 'z', IdentifierStart]
	 *    0x0024                ['$', IdentifierStart]
	 *    0x005f                ['_', IdentifierStart and
	 *                           UnicodeConnectorPunctuation]
	 *
	 *  UnicodeCombiningMark has no code points <= 0x7f.
	 *
	 *  The matching code reuses the "identifier start" tables, and then
	 *  consults a separate range set for characters in "identifier part"
	 *  but not in "identifier start".  These can be extracted with the
	 *  "src/extract_chars.py" script, see src/SConscript
	 *  t_uni_idpart_minus_idstart* targets.
	 *
	 *    UnicodeCombiningMark        -> categories Mn, Mc
	 *    UnicodeDigit                -> category Nd
	 *    UnicodeConnectorPunctuation -> category Pc
	 */

	/* ASCII fast path -- quick accept and reject */
	if (x <= 0x7f) {
		if ((x >= 'a' && x <= 'z') ||
		    (x >= 'A' && x <= 'Z') ||
		    (x >= '0' && x <= '9') ||
		    x == '_' || x == '$') {
			return 1;
		}
		return 0;
	}

	/* Non-ASCII slow path (range-by-range linear comparison), very slow */

#ifdef DUK_USE_SOURCE_NONBMP
	if (uni_range_match(duk_unicode_identifier_start_noascii,
	                    sizeof(duk_unicode_identifier_start_noascii),
	                    x) ||
	    uni_range_match(duk_unicode_identifier_part_minus_identifier_start_noascii,
	                    sizeof(duk_unicode_identifier_part_minus_identifier_start_noascii),
	                    x)) {
		return 1;
	}
	return 0;
#else
	if (x < 0x10000) {
		if (uni_range_match(duk_unicode_identifier_start_noascii_bmponly,
		                    sizeof(duk_unicode_identifier_start_noascii_bmponly),
		                    x) ||
		    uni_range_match(duk_unicode_identifier_part_minus_identifier_start_noascii_bmponly,
		                    sizeof(duk_unicode_identifier_part_minus_identifier_start_noascii_bmponly),
		                    x)) {
			return 1;
		}
		return 0;
	} else {
		/* Without explicit non-BMP support, assume non-BMP characters
		 * are always accepted as identifier characters.
		 */
		return 1;
	}
#endif
}
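
/* Examples (illustrative note): everything accepted by
 * duk_unicode_is_identifier_start() is accepted here as well; in addition
 * '0' ... '9' pass the ASCII fast path and combining marks such as U+0301
 * (COMBINING ACUTE ACCENT, category Mn) should match the "part minus start"
 * range table.
 */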

/*
 *  Complex case conversion helper which decodes a bit-packed conversion
 *  control stream generated by unicode/extract_caseconv.py.  The conversion
 *  is very slow because it runs through the conversion data in a linear
 *  fashion to save space (which is why ASCII characters have a special
 *  fast path before arriving here).
 *
 *  The particular bit counts etc. have been determined experimentally to
 *  be small but still sufficient, and must match the Python script
 *  (src/extract_caseconv.py).
 *
 *  The return value is the case converted codepoint, or -1 if the conversion
 *  results in multiple characters (this is useful for the regexp Canonicalize
 *  operation).  If 'buf' is not NULL, the result codepoint(s) are also
 *  appended to the hbuffer.
 *
 *  Context and locale specific rules must be checked before consulting
 *  this function.
 */
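
/* Bitstream layout as read by the decoder below (bit widths in parentheses;
 * src/extract_caseconv.py is the authoritative definition):
 *
 *   1. Repeated blocks of range rules, each prefixed by a count n (6);
 *      n == 0x3f terminates the blocks.  Each rule is start_i (16),
 *      start_o (16), count (7), and the 1-based block index acts as the
 *      'skip' between converted codepoints.
 *   2. 1:1 rules: count (6), then per rule start_i (16), start_o (16).
 *   3. 1:n rules: count (7), then per rule start_i (16), length (2) and
 *      'length' output codepoints (16 each).
 */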

static int slow_case_conversion(duk_hthread *thr,
                                duk_hbuffer_growable *buf,
                                int x,
                                duk_bitdecoder_ctx *bd_ctx) {
	int skip = 0;
	int n, t;
	int start_i, start_o, count;

	DUK_DDDPRINT("slow case conversion for codepoint: %d", x);

	/* range conversion with a "skip" */
	DUK_DDDPRINT("checking ranges");
	for (;;) {
		skip++;
		n = duk_bd_decode(bd_ctx, 6);
		if (n == 0x3f) {
			/* end marker */
			break;
		}
		DUK_DDDPRINT("skip=%d, n=%d", skip, n);

		while (n--) {
			start_i = duk_bd_decode(bd_ctx, 16);
			start_o = duk_bd_decode(bd_ctx, 16);
			count = duk_bd_decode(bd_ctx, 7);
			DUK_DDDPRINT("range: start_i=%d, start_o=%d, count=%d, skip=%d",
			             start_i, start_o, count, skip);

			t = x - start_i;
			if (t >= 0 && t < count * skip && (t % skip) == 0) {
				DUK_DDDPRINT("range matches input codepoint");
				x = start_o + t;
				goto single;
			}
		}
	}

	/* 1:1 conversion */
	n = duk_bd_decode(bd_ctx, 6);
	DUK_DDDPRINT("checking 1:1 conversions (count %d)", n);
	while (n--) {
		start_i = duk_bd_decode(bd_ctx, 16);
		start_o = duk_bd_decode(bd_ctx, 16);
		DUK_DDDPRINT("1:1 conversion %d -> %d", start_i, start_o);
		if (x == start_i) {
			DUK_DDDPRINT("1:1 matches input codepoint");
			x = start_o;
			goto single;
		}
	}

	/* complex, multicharacter conversion */
	n = duk_bd_decode(bd_ctx, 7);
	DUK_DDDPRINT("checking 1:n conversions (count %d)", n);
	while (n--) {
		start_i = duk_bd_decode(bd_ctx, 16);
		t = duk_bd_decode(bd_ctx, 2);
		DUK_DDDPRINT("1:n conversion %d -> %d chars", start_i, t);
		if (x == start_i) {
			DUK_DDDPRINT("1:n matches input codepoint");
			if (buf) {
				while (t--) {
					int tmp = duk_bd_decode(bd_ctx, 16);
					DUK_ASSERT(buf != NULL);
					duk_hbuffer_append_xutf8(thr, buf, tmp);
				}
			}
			return -1;
		} else {
			while (t--) {
				(void) duk_bd_decode(bd_ctx, 16);
			}
		}
	}

	/* default: no change */
	DUK_DDDPRINT("no rule matches, output is same as input");
	/* fall through */

 single:
	if (buf) {
		duk_hbuffer_append_xutf8(thr, buf, x);
	}
	return x;
}

/*
 *  Case conversion helper, with context/locale sensitivity.
 *  For proper case conversion, one needs to know the character
 *  and the preceding and following characters, as well as the
 *  locale/language.
 */

static int case_transform_helper(duk_hthread *thr,
                                 duk_hbuffer_growable *buf,
                                 int x,
                                 int prev,
                                 int next,
                                 int uppercase,
                                 int language) {
	duk_bitdecoder_ctx bd_ctx;

	/* fast path for ASCII */
	if (x < 0x80) {
		/* FIXME: context sensitive rules exist for the ASCII range too.
		 * Need to add them here.
		 */

		if (uppercase) {
			if (x >= 'a' && x <= 'z') {
				x = x - 'a' + 'A';
			}
		} else {
			if (x >= 'A' && x <= 'Z') {
				x = x - 'A' + 'a';
			}
		}
		goto singlechar;
	}

	/* Context and locale specific rules which cannot currently be represented
	 * in the caseconv bitstream: hardcoded rules in C.
	 */
	if (uppercase) {
		/* FIXME: turkish / azeri */
	} else {
		/* final sigma context specific rule */
		if (x == 0x03a3 &&   /* U+03A3 = GREEK CAPITAL LETTER SIGMA */
		    prev >= 0 &&     /* prev is letter */
		    next < 0) {      /* next is not letter */
			/* FIXME: fix conditions */
			x = 0x03c2;
			goto singlechar;
		}

		/* FIXME: lithuanian */
		if (1 /* language == 'lt' */ &&
		    x == 0x0307) {   /* U+0307 = COMBINING DOT ABOVE */
			goto nochar;
		}

		/* FIXME: lithuanian, explicit dot rules */
		/* FIXME: turkish / azeri, lowercase rules */
	}

	/* 1:1 or special conversions, but not locale/context specific: script generated rules */
	memset(&bd_ctx, 0, sizeof(bd_ctx));
	if (uppercase) {
		bd_ctx.data = (duk_u8 *) duk_unicode_caseconv_uc;
		bd_ctx.length = (duk_u32) sizeof(duk_unicode_caseconv_uc);
	} else {
		bd_ctx.data = (duk_u8 *) duk_unicode_caseconv_lc;
		bd_ctx.length = (duk_u32) sizeof(duk_unicode_caseconv_lc);
	}
	return slow_case_conversion(thr, buf, x, &bd_ctx);

 singlechar:
	if (buf) {
		duk_hbuffer_append_xutf8(thr, buf, x);
	}
	return x;

 nochar:
	return -1;
}

/*
 *  Replace valstack top with case converted version.
 */

void duk_unicode_case_convert_string(duk_hthread *thr, int uppercase) {
	duk_context *ctx = (duk_context *) thr;
	duk_hstring *h_input;
	duk_hbuffer_growable *h_buf;
	duk_u8 *p, *p_start, *p_end;
	int prev, curr, next;

	h_input = duk_require_hstring(ctx, -1);
	DUK_ASSERT(h_input != NULL);

	/* FIXME: should init with a spare of at least h_input->blen? */
	duk_push_new_growable_buffer(ctx, 0);
	h_buf = (duk_hbuffer_growable *) duk_get_hbuffer(ctx, -1);
	DUK_ASSERT(h_buf != NULL);
	DUK_ASSERT(DUK_HBUFFER_HAS_GROWABLE(h_buf));

	/* [ ... input buffer ] */

	p_start = (duk_u8 *) DUK_HSTRING_GET_DATA(h_input);
	p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_input);
	p = p_start;

	prev = -1;
	curr = -1;
	next = -1;
	for (;;) {
		prev = curr;
		curr = next;
		next = -1;
		if (p < p_end) {
			next = (int) duk_unicode_xutf8_get_u32(thr, &p, p_start, p_end);
		} else {
			/* end of input and last char has been processed */
			if (curr < 0) {
				break;
			}
		}

		/* on first round, skip */
		if (curr >= 0) {
			/* may generate any number of output codepoints */
			case_transform_helper(thr,
			                      h_buf,
			                      curr,
			                      prev,
			                      next,
			                      uppercase,
			                      0);  /* FIXME: language */
		}
	}

	duk_to_string(ctx, -1);  /* invalidates h_buf pointer */
	duk_remove(ctx, -2);
}
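
/* Usage sketch (illustrative note; assumes the usual Duktape valstack
 * conventions): a caller such as a String.prototype.toUpperCase() built-in
 * would leave the input string at the valstack top and call
 *
 *     duk_unicode_case_convert_string(thr, 1);   [1 = uppercase, 0 = lowercase]
 *
 * after which the converted string has replaced the input at the stack top.
 */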

#ifdef DUK_USE_REGEXP_SUPPORT

/*
 *  Canonicalize() abstract operation needed for canonicalization of individual
 *  codepoints during regexp compilation and execution, see E5 Section 15.10.2.8.
 *  Note that codepoints are canonicalized one character at a time, so no context
 *  specific rules can apply.  Locale specific rules can apply, though.
 */

int duk_unicode_re_canonicalize_char(duk_hthread *thr, int x) {
	int y;

	y = case_transform_helper(thr,
	                          NULL,   /* buf */
	                          x,      /* curr char */
	                          -1,     /* prev char */
	                          -1,     /* next char */
	                          1,      /* uppercase */
	                          0);     /* FIXME: language */

	if ((y < 0) || (x >= 0x80 && y < 0x80)) {
		/* multiple codepoint conversion or non-ASCII mapped to ASCII
		 * --> leave as is.
		 */
		return x;
	}

	return y;
}
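
/* Examples (illustrative note): Canonicalize('a') yields 'A' through the
 * ASCII fast path.  U+017F (LATIN SMALL LETTER LONG S) uppercases to the
 * ASCII letter 'S', so the non-ASCII-to-ASCII guard above returns the input
 * unchanged; this matches the explicit exception in E5 Section 15.10.2.8.
 */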

/*
 *  E5 Section 15.10.2.6 "IsWordChar" abstract operation.  Assume
 *  x < 0 for characters read outside the string.
 */

int duk_unicode_re_is_wordchar(int x) {
	/*
	 *  Note: the description in E5 Section 15.10.2.6 has a typo, it
	 *  contains 'A' twice and lacks 'a'; the intent is [0-9a-zA-Z_].
	 */
	if ((x >= '0' && x <= '9') ||
	    (x >= 'a' && x <= 'z') ||
	    (x >= 'A' && x <= 'Z') ||
	    (x == '_')) {
		return 1;
	}
	return 0;
}

/*
 *  Regexp range tables
 */

/* exposed because lexer needs these too */
duk_u16 duk_unicode_re_ranges_digit[2] = {
	(duk_u16) 0x0030, (duk_u16) 0x0039,
};
duk_u16 duk_unicode_re_ranges_white[22] = {
	(duk_u16) 0x0009, (duk_u16) 0x000D,
	(duk_u16) 0x0020, (duk_u16) 0x0020,
	(duk_u16) 0x00A0, (duk_u16) 0x00A0,
	(duk_u16) 0x1680, (duk_u16) 0x1680,
	(duk_u16) 0x180E, (duk_u16) 0x180E,
	(duk_u16) 0x2000, (duk_u16) 0x200A,
	(duk_u16) 0x2028, (duk_u16) 0x2029,
	(duk_u16) 0x202F, (duk_u16) 0x202F,
	(duk_u16) 0x205F, (duk_u16) 0x205F,
	(duk_u16) 0x3000, (duk_u16) 0x3000,
	(duk_u16) 0xFEFF, (duk_u16) 0xFEFF,
};
duk_u16 duk_unicode_re_ranges_wordchar[8] = {
	(duk_u16) 0x0030, (duk_u16) 0x0039,
	(duk_u16) 0x0041, (duk_u16) 0x005A,
	(duk_u16) 0x005F, (duk_u16) 0x005F,
	(duk_u16) 0x0061, (duk_u16) 0x007A,
};
duk_u16 duk_unicode_re_ranges_not_digit[4] = {
	(duk_u16) 0x0000, (duk_u16) 0x002F,
	(duk_u16) 0x003A, (duk_u16) 0xFFFF,
};
duk_u16 duk_unicode_re_ranges_not_white[24] = {
	(duk_u16) 0x0000, (duk_u16) 0x0008,
	(duk_u16) 0x000E, (duk_u16) 0x001F,
	(duk_u16) 0x0021, (duk_u16) 0x009F,
	(duk_u16) 0x00A1, (duk_u16) 0x167F,
	(duk_u16) 0x1681, (duk_u16) 0x180D,
	(duk_u16) 0x180F, (duk_u16) 0x1FFF,
	(duk_u16) 0x200B, (duk_u16) 0x2027,
	(duk_u16) 0x202A, (duk_u16) 0x202E,
	(duk_u16) 0x2030, (duk_u16) 0x205E,
	(duk_u16) 0x2060, (duk_u16) 0x2FFF,
	(duk_u16) 0x3001, (duk_u16) 0xFEFE,
	(duk_u16) 0xFF00, (duk_u16) 0xFFFF,
};
duk_u16 duk_unicode_re_ranges_not_wordchar[10] = {
	(duk_u16) 0x0000, (duk_u16) 0x002F,
	(duk_u16) 0x003A, (duk_u16) 0x0040,
	(duk_u16) 0x005B, (duk_u16) 0x005E,
	(duk_u16) 0x0060, (duk_u16) 0x0060,
	(duk_u16) 0x007B, (duk_u16) 0xFFFF,
};

#endif  /* DUK_USE_REGEXP_SUPPORT */

@@ -0,0 +1,47 @@
/*
 *  Unicode support tables automatically generated during build.
 */

#include "duk_internal.h"

/*
 *  Unicode tables containing ranges of Unicode characters in a
 *  packed format.  These tables are used to match non-ASCII
 *  characters of complex productions by resorting to a linear
 *  range-by-range comparison.  This is very slow, but such
 *  characters are expected to be very rare in practical Ecmascript
 *  source code, and thus compactness is most important.
 *
 *  The tables are matched using uni_range_match() and the format
 *  is described in src/extract_chars.py.
 */

#ifdef DUK_USE_SOURCE_NONBMP
/* IdentifierStart production with ASCII excluded */
/* duk_unicode_identifier_start_noascii[] */
#include "duk_unicode_ids_noa.c"
#else
/* IdentifierStart production with ASCII and non-BMP excluded */
/* duk_unicode_identifier_start_noascii_bmponly[] */
#include "duk_unicode_ids_noa_bmpo.c"
#endif

#ifdef DUK_USE_SOURCE_NONBMP
/* IdentifierPart production with IdentifierStart and ASCII excluded */
/* duk_unicode_identifier_part_minus_identifier_start_noascii[] */
#include "duk_unicode_idp_m_ids_noa.c"
#else
/* IdentifierPart production with IdentifierStart, ASCII, and non-BMP excluded */
/* duk_unicode_identifier_part_minus_identifier_start_noascii_bmponly[] */
#include "duk_unicode_idp_m_ids_noa_bmpo.c"
#endif

/*
 *  Case conversion tables generated using src/extract_caseconv.py.
 */

/* duk_unicode_caseconv_uc[] */
/* duk_unicode_caseconv_lc[] */

#include "duk_unicode_caseconv.c"
