|
|
@ -285,7 +285,7 @@ DUK_INTERNAL duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, |
|
|
|
* chosen from several variants, based on x64 gcc -O2 testing. See: |
|
|
|
* https://github.com/svaarala/duktape/pull/422
|
|
|
|
* |
|
|
|
* NOTE: must match src/dukutil.py:duk_unicode_unvalidated_utf8_length(). |
|
|
|
* NOTE: must match tools/dukutil.py:duk_unicode_unvalidated_utf8_length(). |
|
|
|
*/ |
|
|
|
|
|
|
|
#if defined(DUK_USE_PREFER_SIZE) |
|
|
@ -396,7 +396,7 @@ DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *d |
|
|
|
* Used for slow path Unicode matching. |
|
|
|
*/ |
|
|
|
|
|
|
|
/* Must match src/extract_chars.py, generate_match_table3(). */ |
|
|
|
/* Must match tools/extract_chars.py, generate_match_table3(). */ |
|
|
|
DUK_LOCAL duk_uint32_t duk__uni_decode_value(duk_bitdecoder_ctx *bd_ctx) { |
|
|
|
duk_uint32_t t; |
|
|
|
|
|
|
@ -467,7 +467,7 @@ DUK_INTERNAL duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp) { |
|
|
|
* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; |
|
|
|
* |
|
|
|
* It also specifies any Unicode category 'Zs' characters as white |
|
|
|
* space. These can be extracted with the "src/extract_chars.py" script. |
|
|
|
* space. These can be extracted with the "tools/extract_chars.py" script. |
|
|
|
* Current result: |
|
|
|
* |
|
|
|
* RAW OUTPUT: |
|
|
@ -574,7 +574,7 @@ DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp) |
|
|
|
* |
|
|
|
* The "UnicodeLetter" alternative of the production allows letters |
|
|
|
* from various Unicode categories. These can be extracted with the |
|
|
|
* "src/extract_chars.py" script. |
|
|
|
* "tools/extract_chars.py" script. |
|
|
|
* |
|
|
|
* Because the result has hundreds of Unicode codepoint ranges, matching |
|
|
|
* for any values >= 0x80 is done using a very slow range-by-range scan |
|
|
@ -671,7 +671,7 @@ DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp) |
|
|
|
* The matching code reuses the "identifier start" tables, and then |
|
|
|
* consults a separate range set for characters in "identifier part" |
|
|
|
* but not in "identifier start". These can be extracted with the |
|
|
|
* "src/extract_chars.py" script. |
|
|
|
* "tools/extract_chars.py" script. |
|
|
|
* |
|
|
|
* UnicodeCombiningMark -> categories Mn, Mc |
|
|
|
* UnicodeDigit -> categories Nd |
|
|
@ -786,14 +786,14 @@ DUK_INTERNAL duk_small_int_t duk_unicode_is_letter(duk_codepoint_t cp) { |
|
|
|
|
|
|
|
/*
|
|
|
|
* Complex case conversion helper which decodes a bit-packed conversion |
|
|
|
* control stream generated by unicode/extract_caseconv.py. The conversion |
|
|
|
* control stream generated by tools/extract_caseconv.py. The conversion |
|
|
|
* is very slow because it runs through the conversion data in a linear |
|
|
|
* fashion to save space (which is why ASCII characters have a special |
|
|
|
* fast path before arriving here). |
|
|
|
* |
|
|
|
* The particular bit counts, etc., have been determined experimentally to |
|
|
|
* be small but still sufficient, and must match the Python script |
|
|
|
* (src/extract_caseconv.py). |
|
|
|
* (tools/extract_caseconv.py). |
|
|
|
* |
|
|
|
* The return value is the case converted codepoint or -1 if the conversion |
|
|
|
* results in multiple characters (this is useful for regexp Canonicalization |
|
|
|