Merge pull request #2443 from svaarala/tools-wtf8-sanitize-fixes

Improvements to WTF-8 sanitize JS helper
3 years ago · f608ae3fe3
1 changed file with 370 additions and 45 deletions
--- a/src-tools/lib/util/wtf8_sanitize.js
+++ b/src-tools/lib/util/wtf8_sanitize.js
@ -172,45 +172,72 @@ function wtf8SanitizeString(u8) {
        //let iStart = i;
        let ch = u8[i++];

-        // ASCII.
+        /* Decoder based on https://encoding.spec.whatwg.org/#utf-8-decoder
+         * in "replacement" mode, except that:
+         *   - U+D800 to U+DFFF (encoded ED A0 80 to ED BF BF) are allowed
+         *     to reach WTF-8 surrogate pair processing.
+         *
+         * Note in particular that non-canonical encodings (codepoint value
+         * too low or too high) must be terminated on the first invalid byte,
+         * not after decoding the whole N-byte sequence.
+         *
+         * Maximum input expansion is 3x from an invalid initial byte expanding
+         * to 3 bytes of UTF-8 encoded U+FFFD replacement character.
+         */
+
        if (ch <= 0x7f) {
            pushWtf8(ch);
            continue;
        }

-        // Non-ASCII, decode initial (extended) UTF-8 byte:
-        //   10000000 to 10111111: invalid
-        //   110xxxxx + 1 continuation: U+0080 to U+07FF
-        //   1110xxxx + 2 continuation: U+0800 to U+FFFF
-        //   11110xxx + 3 continuation: U+10000 to U+10FFFF
-        //   11111000 to 11111111: invalid
-        //
-        // If the (extended) UTF-8 sequence is invalid, replace the
-        // maximal valid sequence with a single U+FFFD replacement
-        // character as described in http://unicode.org/review/pr-121.html.
-
+        let lower = 0x80;
+        let upper = 0xbf;
        let numCont = 0;
        let cp = 0;
        let cpMin = 0;
        let cpMax = 0;
-        if (ch >= 0b11000000 && ch <= 0b11011111) {
+
+        if (ch >= 0x80 && ch <= 0xbf) {
+            // Invalid leading continuation byte.
+            pushReplacement();
+            continue;
+        } else if (ch >= 0xc0 && ch <= 0xc1) {
+            // Invalid 2-byte sequence, initial byte too low.
+            pushReplacement();
+            continue;
+        } else if (ch >= 0xc2 && ch <= 0xdf) {
+            // 2-byte sequence, valid initial byte.
            numCont = 1;
-            cp = (ch - 0b11000000);
+            cp = ch & 0x1f;
            cpMin = 0x80;
            cpMax = 0x7ff;
-        } else if (ch >= 0b11100000 && ch <= 0b11101111) {
+            lower = 0x80;
+            upper = 0xbf;
+        } else if (ch >= 0xe0 && ch <= 0xef) {
+            // 3-byte sequence, valid initial byte.
            numCont = 2;
-            cp = (ch - 0b11100000);
+            cp = ch & 0x0f;
            cpMin = 0x800;
            cpMax = 0xffff;
-        } else if (ch >= 0b11110000 && ch <= 0b11110111) {
+            lower = (ch === 0xe0 ? 0xa0 : 0x80);
+            upper = 0xbf;
+            // TextDecoder would use the upper limit below to reject U+D800 to
+            // U+DFFF, but WTF-8 must allow them so the limit is not applied.
+            //upper = (ch === 0xed ? 0x9f : 0xbf);
+        } else if (ch >= 0xf0 && ch <= 0xf4) {
+            // 4-byte sequence, valid initial byte.
            numCont = 3;
-            cp = (ch - 0b11110000);
+            cp = ch & 0x07;
            cpMin = 0x10000;
            cpMax = 0x10ffff;
+            lower = (ch === 0xf0 ? 0x90 : 0x80);
+            upper = (ch === 0xf4 ? 0x8f : 0xbf);
+        } else if (ch >= 0xf5 && ch <= 0xf7) {
+            // Invalid 4-byte sequence, initial byte too high.
+            pushReplacement();
+            continue;
        } else {
-            // Invalid initial byte, replace with one replacement char.
-            // This branch produces the maximum 3x input expansion.
+            // Invalid UTF-8 sequence, invalid initial byte.
            pushReplacement();
            continue;
        }
@ -226,8 +253,10 @@ function wtf8SanitizeString(u8) {
            }

            let cb = u8[i++];
-            if (cb >= 0b10000000 && cb <= 0b10111111) {
-                cp = (cp * 64) + (cb - 0b10000000);
+            if (cb >= lower && cb <= upper) {
+                cp = (cp << 6) + (cb & 0x3f);
+                lower = 0x80;
+                upper = 0xbf;
            } else {
                // Encoding broken at current index, replace everything so
                // far (excluding the broken byte) with one replacement
@ -246,23 +275,11 @@ function wtf8SanitizeString(u8) {
        }

        if (cp < cpMin || cp > cpMax) {
-            // Not a canonical shortest encoding or out-of-bounds, replace the
-            // entire sequence with a single replacement character.
-            //
-            // This doesn't currently handle non-shortest encoding or codepoints
-            // above U+10FFFF correctly: we'll first decode the entire sequence
-            // and then replace it with a single U+FFFD if it's incorrect.  The
-            // correct behavior is to detect the first broken byte which necessarily
-            // makes the result invalid, and consume only the maximal valid byte
-            // prefix.
-            //
-            // For example, for U+110000 the encoding is F4 90 80 80, and the 90 is
-            // already broken so the initial F4 gets replaced with a U+FFFD and 90
-            // is then considered again.  The correct output for F4 90 80 80 is
-            // U+FFFD U+FFFD U+FFFD U+FFFD.
-
-            pushReplacement();
-            continue;
+            // This should never happen because the lower/upper continuation
+            // byte limits restrict the codepoint to the allowed range.
+            throw new TypeError('internal error: codepoint not within [cpMin,cpMax]: ' + cp + ' vs ' + cpMin + '-' + cpMax);
+            //pushReplacement();
+            //continue;
        }

        // Successfully decoded a UTF-8 codepoint.  We will either:
@ -299,10 +316,10 @@ function wtf8SanitizeString(u8) {
        // '\xee\x80\x80'

        let bytesLeft = u8.length - i;
-        let validSurrogatePair = bytesLeft >= 3 &&
+        let validSurrogatePair = (bytesLeft >= 3 &&
            u8[i] === 0xed &&
            u8[i + 1] >= 0xa0 && u8[i + 1] <= 0xbf &&
-            u8[i + 2] >= 0x80 && u8[i + 2] <= 0xbf;
+            u8[i + 2] >= 0x80 && u8[i + 2] <= 0xbf);
        if (validSurrogatePair) {
            // Valid low surrogate follows.  Decode and combine.
            let hi = cp;
@ -337,6 +354,8 @@ function wtf8SanitizeString(u8) {
        inputDebugString: getDebugStringForU8(u8),
        outputUint8ArrayWtf8,
        outputUint8ArrayCesu8,
+        outputCodepointsWtf8: codepointResultWtf8,
+        outputCodepointsCesu8: codepointResultCesu8,
        outputDebugStringWtf8,
        outputDebugStringCesu8,
        byteLengthWtf8: outputUint8ArrayWtf8.length,
@ -352,13 +371,14 @@ function wtf8SanitizeString(u8) {
    };
 }

-function sanitizeToWtf8(u8) {
+function sanitizeToWtf8(u8, args) {
    if (!(typeof u8 === 'object' && u8 !== null && u8 instanceof Uint8Array)) {
        throw new TypeError('input must be a Uint8Array');
    }

    // Special check for symbol strings.
-    if (u8.length >= 1 && (u8[0] === 0x80 || u8[0] === 0x81 || u8[0] === 0x82 || u8[0] === 0xff)) {
+    let allowSymbol = !(args && args.allowSymbol === false);
+    if (allowSymbol && u8.length >= 1 && (u8[0] === 0x80 || u8[0] === 0x81 || u8[0] === 0x82 || u8[0] === 0xff)) {
        return wtf8SanitizeSymbol(u8);
    } else {
        return wtf8SanitizeString(u8);
@ -384,6 +404,7 @@ function sanitizeToWtf8(u8) {
 // on conversion or validation.

 function testPr121() {
+    // Example given in PR-121.
    let res = sanitizeToWtf8(new Uint8Array([0x61, 0xf1, 0x80, 0x80, 0xe1, 0x80, 0xc2, 0x62]));
    //console.log(res);
    if (res.outputDebugStringWtf8 !== 'a<fffd><fffd><fffd>b') {
@ -395,16 +416,320 @@ function testPr121() {
 function testUnicode110000() {
    let res = sanitizeToWtf8(new Uint8Array([0x41, 0xf4, 0x90, 0x80, 0x80]));
    //console.log(res);
-    // Currently incorrect, A<fffd>.
    if (res.outputDebugStringWtf8 !== 'A<fffd><fffd><fffd><fffd>') {
        //console.log(res);
-        //throw new TypeError('wtf8 sanitize self test failed');
+        throw new TypeError('wtf8 sanitize self test failed');
+    }
+}
+
+function testByteRanges() {
+    // Some basic boundary testing of valid byte ranges.
+    // Also compare against TextDecoder() output which should match
+    // for decoding and replacement character behavior except when
+    // U+D800 to U+DFFF are involved.
+
+    let R = 'REPLACEMENT';
+    let I = 'INPUT';
+    let tests = [
+        { input: [ 0x00 ], output: [ I ] },
+        { input: [ 0x40 ], output: [ I ] },
+        { input: [ 0x7f ], output: [ I ] },
+
+        { input: [ 0x80 ], output: [ R ] },
+        { input: [ 0x80, 0x80 ], output: [ R, R ] },
+        { input: [ 0xa0 ], output: [ R ] },
+        { input: [ 0xbf ], output: [ R ] },
+        { input: [ 0xbf, 0xbf ], output: [ R, R ] },
+
+        { input: [ 0xc0 ], output: [ R ] },
+        { input: [ 0xc0, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xc0, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xc0, 0x80 ], output: [ R, R ] },
+        { input: [ 0xc0, 0xbf ], output: [ R, R ] },
+        { input: [ 0xc0, 0xc0 ], output: [ R, R ] },
+
+        { input: [ 0xc1 ], output: [ R ] },
+        { input: [ 0xc1, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xc1, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xc1, 0x80 ], output: [ R, R ] },
+        { input: [ 0xc1, 0xbf ], output: [ R, R ] },
+        { input: [ 0xc1, 0xc0 ], output: [ R, R ] },
+
+        { input: [ 0xc2, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xc2, 0x80 ], output: [ I ] },  // smallest valid 2-byte: U+0080
+        { input: [ 0xc2, 0xbf ], output: [ I ] },
+        { input: [ 0xc2, 0xc0 ], output: [ R, R ] },
+
+        { input: [ 0xc3, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xc3, 0x80 ], output: [ I ] },
+        { input: [ 0xc3, 0xbf ], output: [ I ] },
+        { input: [ 0xc3, 0xc0 ], output: [ R, R ] },
+
+        { input: [ 0xdf, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xdf, 0x80 ], output: [ I ] },
+        { input: [ 0xdf, 0xbf ], output: [ I ] },  // highest valid 2-byte: U+07FF
+        { input: [ 0xdf, 0xc0 ], output: [ R, R ] },
+
+        { input: [ 0xe0 ], output: [ R ] },
+        { input: [ 0xe0, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xe0, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xe0, 0x80 ], output: [ R, R ] },
+        { input: [ 0xe0, 0x80, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xe0, 0x80, 0xbf ], output: [ R, R, R ] },
+        { input: [ 0xe0, 0x9f ], output: [ R, R ] },
+        { input: [ 0xe0, 0x9f, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xe0, 0x9f, 0xbf ], output: [ R, R, R ] },
+        { input: [ 0xe0, 0xa0 ], output: [ R ] },  // lower limit for 2nd byte 0xa0 so valid here (but truncated)
+        { input: [ 0xe0, 0xa0, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xe0, 0xa0, 0x80 ], output: [ I ] },  // smallest valid 3-byte: U+0800
+        { input: [ 0xe0, 0xa0, 0xbf ], output: [ I ] },
+        { input: [ 0xe0, 0xbf, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xe0, 0xbf, 0x80 ], output: [ I ] },
+        { input: [ 0xe0, 0xbf, 0xbf ], output: [ I ] },
+
+        { input: [ 0xe1 ], output: [ R ] },
+        { input: [ 0xe1, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xe1, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xe1, 0x80 ], output: [ R ] },
+        { input: [ 0xe1, 0x80, 0x80 ], output: [ I ] },
+        { input: [ 0xe1, 0x80, 0xbf ], output: [ I ] },
+        { input: [ 0xe1, 0x9f ], output: [ R ] },
+        { input: [ 0xe1, 0x9f, 0x80 ], output: [ I ] },
+        { input: [ 0xe1, 0x9f, 0xbf ], output: [ I ] },
+        { input: [ 0xe1, 0xa0 ], output: [ R ] },
+        { input: [ 0xe1, 0xa0, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xe1, 0xa0, 0x80 ], output: [ I ] },
+        { input: [ 0xe1, 0xa0, 0xbf ], output: [ I ] },
+        { input: [ 0xe1, 0xbf, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xe1, 0xbf, 0x80 ], output: [ I ] },
+        { input: [ 0xe1, 0xbf, 0xbf ], output: [ I ] },
+
+        // Unlike with TextDecoder(), initial byte 0xED does not cause 2nd
+        // byte to have upper limit 0x9F because U+D800 to U+DFFF must be
+        // allowed for WTF-8 so these pass through as is (when otherwise valid).
+        { input: [ 0xed ], output: [ R ] },
+        { input: [ 0xed, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xed, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xed, 0x80 ], output: [ R ] },
+        { input: [ 0xed, 0x80, 0x80 ], output: [ I ] },
+        { input: [ 0xed, 0x80, 0xbf ], output: [ I ] },
+        { input: [ 0xed, 0x9f ], output: [ R ] },
+        { input: [ 0xed, 0x9f, 0x80 ], output: [ I ] },
+        { input: [ 0xed, 0x9f, 0xbf ], output: [ I ] },
+        { input: [ 0xed, 0xa0 ], output: [ R ], textDecoderOutput: [ R, R ] },  // TextDecoder() rejects the 2nd byte so two replacements
+        { input: [ 0xed, 0xa0, 0x7f ], output: [ R, 0x7f ], textDecoderOutput: [ R, R, 0x7f ] },
+        { input: [ 0xed, 0xa0, 0x80 ], output: [ I ], textDecoderOutput: [ R, R, R ] },
+        { input: [ 0xed, 0xa0, 0xbf ], output: [ I ], textDecoderOutput: [ R, R, R ] },
+        { input: [ 0xed, 0xbf, 0x7f ], output: [ R, 0x7f ], textDecoderOutput: [ R, R, 0x7f ] },
+        { input: [ 0xed, 0xbf, 0x80 ], output: [ I ], textDecoderOutput: [ R, R, R ] },
+        { input: [ 0xed, 0xbf, 0xbf ], output: [ I ], textDecoderOutput: [ R, R, R ] },
+
+        // Valid surrogate pairs get combined.  Just a few point checks here.
+        { input: [ 0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80 ], output: [ 0xf0, 0x90, 0x80, 0x80 ], textDecoderOutput: [ R, R, R, R, R, R ] },  // U+D800 U+DC00 => U+10000
+        { input: [ 0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf ], output: [ 0xf0, 0x90, 0x8f, 0xbf ], textDecoderOutput: [ R, R, R, R, R, R ] },  // U+D800 U+DFFF => U+103FF
+        { input: [ 0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80 ], output: [ 0xf4, 0x8f, 0xb0, 0x80 ], textDecoderOutput: [ R, R, R, R, R, R ] },  // U+DBFF U+DC00 => U+10FC00
+        { input: [ 0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf ], output: [ 0xf4, 0x8f, 0xbf, 0xbf ], textDecoderOutput: [ R, R, R, R, R, R ] },  // U+DBFF U+DFFF => U+10FFFF
+
+        { input: [ 0xee ], output: [ R ] },
+        { input: [ 0xee, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xee, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xee, 0x80 ], output: [ R ] },
+        { input: [ 0xee, 0x80, 0x80 ], output: [ I ] },
+        { input: [ 0xee, 0x80, 0xbf ], output: [ I ] },
+        { input: [ 0xee, 0x9f ], output: [ R ] },
+        { input: [ 0xee, 0x9f, 0x80 ], output: [ I ] },
+        { input: [ 0xee, 0x9f, 0xbf ], output: [ I ] },
+        { input: [ 0xee, 0xa0 ], output: [ R ] },
+        { input: [ 0xee, 0xa0, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xee, 0xa0, 0x80 ], output: [ I ] },
+        { input: [ 0xee, 0xa0, 0xbf ], output: [ I ] },
+        { input: [ 0xee, 0xbf, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xee, 0xbf, 0x80 ], output: [ I ] },
+        { input: [ 0xee, 0xbf, 0xbf ], output: [ I ] },
+
+        { input: [ 0xef ], output: [ R ] },
+        { input: [ 0xef, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xef, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xef, 0x80 ], output: [ R ] },
+        { input: [ 0xef, 0x80, 0x80 ], output: [ I ] },
+        { input: [ 0xef, 0x80, 0xbf ], output: [ I ] },
+        { input: [ 0xef, 0x9f ], output: [ R ] },
+        { input: [ 0xef, 0x9f, 0x80 ], output: [ I ] },
+        { input: [ 0xef, 0x9f, 0xbf ], output: [ I ] },
+        { input: [ 0xef, 0xa0 ], output: [ R ] },
+        { input: [ 0xef, 0xa0, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xef, 0xa0, 0x80 ], output: [ I ] },
+        { input: [ 0xef, 0xa0, 0xbf ], output: [ I ] },
+        { input: [ 0xef, 0xbf, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xef, 0xbf, 0x80 ], output: [ I ] },
+        { input: [ 0xef, 0xbf, 0xbf ], output: [ I ] },
+
+        { input: [ 0xf0 ], output: [ R ] },
+        { input: [ 0xf0, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xf0, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xf0, 0x80 ], output: [ R, R ] },
+        { input: [ 0xf0, 0x80, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xf0, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xf0, 0x8f ], output: [ R, R ] },
+        { input: [ 0xf0, 0x8f, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xf0, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xf0, 0x90 ], output: [ R ] },  // lower limit for 2nd byte 0x90 so valid here (but truncated)
+        { input: [ 0xf0, 0x90, 0x80 ], output: [ R ] },
+        { input: [ 0xf0, 0x90, 0x80, 0x80 ], output: [ I ] },  // smallest valid 4-byte: U+10000
+        { input: [ 0xf0, 0xbf, 0xbf, 0xbf ], output: [ I ] },
+        { input: [ 0xf0, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+
+        { input: [ 0xf1 ], output: [ R ] },
+        { input: [ 0xf1, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xf1, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xf1, 0x80 ], output: [ R ] },
+        { input: [ 0xf1, 0x80, 0x80 ], output: [ R ] },
+        { input: [ 0xf1, 0x80, 0x80, 0x80 ], output: [ I ] },  // U+40000
+        { input: [ 0xf1, 0x8f ], output: [ R ] },
+        { input: [ 0xf1, 0x8f, 0x80 ], output: [ R ] },
+        { input: [ 0xf1, 0x8f, 0x80, 0x80 ], output: [ I ] },  // U+4F000
+        { input: [ 0xf1, 0x90 ], output: [ R ] },
+        { input: [ 0xf1, 0x90, 0x80 ], output: [ R ] },
+        { input: [ 0xf1, 0x90, 0x80, 0x80 ], output: [ I ] },  // U+50000
+        { input: [ 0xf1, 0xbf, 0xbf, 0xbf ], output: [ I ] },
+        { input: [ 0xf1, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+
+        { input: [ 0xf3 ], output: [ R ] },
+        { input: [ 0xf3, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xf3, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xf3, 0x80 ], output: [ R ] },
+        { input: [ 0xf3, 0x80, 0x80 ], output: [ R ] },
+        { input: [ 0xf3, 0x80, 0x80, 0x80 ], output: [ I ] },
+        { input: [ 0xf3, 0x8f ], output: [ R ] },
+        { input: [ 0xf3, 0x8f, 0x80 ], output: [ R ] },
+        { input: [ 0xf3, 0x8f, 0x80, 0x80 ], output: [ I ] },
+        { input: [ 0xf3, 0x90 ], output: [ R ] },
+        { input: [ 0xf3, 0x90, 0x80 ], output: [ R ] },
+        { input: [ 0xf3, 0x90, 0x80, 0x80 ], output: [ I ] },
+        { input: [ 0xf3, 0xbf, 0xbf, 0xbf ], output: [ I ] },
+        { input: [ 0xf3, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+
+        { input: [ 0xf4 ], output: [ R ] },
+        { input: [ 0xf4, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xf4, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xf4, 0x80 ], output: [ R ] },
+        { input: [ 0xf4, 0x80, 0x80 ], output: [ R ] },
+        { input: [ 0xf4, 0x80, 0x80, 0x80 ], output: [ I ] },
+        { input: [ 0xf4, 0x8f ], output: [ R ] },
+        { input: [ 0xf4, 0x8f, 0x80 ], output: [ R ] },
+        { input: [ 0xf4, 0x8f, 0x80, 0x80 ], output: [ I ] },
+        { input: [ 0xf4, 0x90 ], output: [ R, R ] },  // upper limit for 2nd byte 0x8f so invalid here
+        { input: [ 0xf4, 0x90, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xf4, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xf4, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+        { input: [ 0xf4, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+
+        // F5-F7 are technically leading bytes for a 4-byte encoding but all
+        // encoded values are > U+10FFFF so F5-F7 are rejected.
+        { input: [ 0xf5 ], output: [ R ] },
+        { input: [ 0xf5, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xf5, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xf5, 0x80 ], output: [ R, R ] },
+        { input: [ 0xf5, 0x80, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xf5, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xf5, 0x8f ], output: [ R, R ] },
+        { input: [ 0xf5, 0x8f, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xf5, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xf5, 0x90 ], output: [ R, R ] },
+        { input: [ 0xf5, 0x90, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xf5, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xf5, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+        { input: [ 0xf5, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+
+        { input: [ 0xf7 ], output: [ R ] },
+        { input: [ 0xf7, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xf7, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xf7, 0x80 ], output: [ R, R ] },
+        { input: [ 0xf7, 0x80, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xf7, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xf7, 0x8f ], output: [ R, R ] },
+        { input: [ 0xf7, 0x8f, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xf7, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xf7, 0x90 ], output: [ R, R ] },
+        { input: [ 0xf7, 0x90, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xf7, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xf7, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+        { input: [ 0xf7, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+
+        // F8-FF are all invalid so rejected as initial byte.
+        { input: [ 0xf8 ], output: [ R ] },
+        { input: [ 0xf8, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xf8, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xf8, 0x80 ], output: [ R, R ] },
+        { input: [ 0xf8, 0x80, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xf8, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xf8, 0x8f ], output: [ R, R ] },
+        { input: [ 0xf8, 0x8f, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xf8, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xf8, 0x90 ], output: [ R, R ] },
+        { input: [ 0xf8, 0x90, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xf8, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xf8, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+        { input: [ 0xf8, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+
+        { input: [ 0xff ], output: [ R ] },
+        { input: [ 0xff, 0x41 ], output: [ R, 0x41 ] },
+        { input: [ 0xff, 0x7f ], output: [ R, 0x7f ] },
+        { input: [ 0xff, 0x80 ], output: [ R, R ] },
+        { input: [ 0xff, 0x80, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xff, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xff, 0x8f ], output: [ R, R ] },
+        { input: [ 0xff, 0x8f, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xff, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xff, 0x90 ], output: [ R, R ] },
+        { input: [ 0xff, 0x90, 0x80 ], output: [ R, R, R ] },
+        { input: [ 0xff, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
+        { input: [ 0xff, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+        { input: [ 0xff, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },
+    ];
+
+    function prepOutput(output, input) {
+        return output.map((v) => {
+            if (v === 'REPLACEMENT') { return [ 0xef, 0xbf, 0xbd ]; }
+            if (v === 'INPUT') { return input; }
+            return [v];
+        }).flat();
+    }
+
+    for (let { input, output, textDecoderOutput } of tests) {
+        let out = prepOutput(output, input);
+        let wtf8Res = sanitizeToWtf8(new Uint8Array(input), { allowSymbol: false });
+        let res1 = wtf8Res.outputUint8ArrayWtf8;
+        if (res1.length !== out.length) {
+            //console.log(input, out, res1);
+            throw new TypeError('wtf8 sanitize self test failed, length mismatch');
+        }
+        for (let i = 0; i < res1.length; i++) {
+            if (res1[i] !== out[i]) {
+                //console.log(input, out, res1);
+                throw new TypeError('wtf8 sanitize self test failed, difference at index ' + i);
+            }
+        }
+
+        let tdOut = prepOutput(textDecoderOutput || output, input);
+        let res2 = new TextEncoder().encode(new TextDecoder().decode(new Uint8Array(input)));
+        if (res2.length !== tdOut.length) {
+            //console.log(input, tdOut, res2);
+            throw new TypeError('wtf8 sanitize self test failed, length mismatch for TextDecoder output');
+        }
+        for (let i = 0; i < res2.length; i++) {
+            if (res2[i] !== tdOut[i]) {
+                //console.log(input, tdOut, res2);
+                throw new TypeError('wtf8 sanitize self test failed, difference at index ' + i + ' for TextDecoder output');
+            }
+        }
    }
 }

 function runSelfTests() {
    testPr121();
    testUnicode110000();
+    testByteRanges();
 }
 runSelfTests();