Browse Source

Merge pull request #2443 from svaarala/tools-wtf8-sanitize-fixes

Improvements to WTF-8 sanitize JS helper
pull/2444/head
Sami Vaarala 3 years ago
committed by GitHub
parent
commit
f608ae3fe3
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 415
      src-tools/lib/util/wtf8_sanitize.js

415
src-tools/lib/util/wtf8_sanitize.js

@ -172,45 +172,72 @@ function wtf8SanitizeString(u8) {
//let iStart = i;
let ch = u8[i++];
// ASCII.
/* Decoder based on https://encoding.spec.whatwg.org/#utf-8-decoder
* in "replacement" mode, except that:
* - U+D800 to U+DFFF (encoded ED A0 80 to ED BF BF) are allowed
* to reach WTF-8 surrogate pair processing.
*
* Note in particular that non-canonical encodings (codepoint value
* too low or too high) must be terminated on the first invalid byte,
* not after decoding the whole N-byte sequence.
*
* Maximum input expansion is 3x from an invalid initial byte expanding
* to 3 bytes of UTF-8 encoded U+FFFD replacement character.
*/
if (ch <= 0x7f) {
pushWtf8(ch);
continue;
}
// Non-ASCII, decode initial (extended) UTF-8 byte:
// 10000000 to 10111111: invalid
// 110xxxxx + 1 continuation: U+0080 to U+07FF
// 1110xxxx + 2 continuation: U+0800 to U+FFFF
// 11110xxx + 3 continuation: U+10000 to U+10FFFF
// 11111000 to 11111111: invalid
//
// If the (extended) UTF-8 sequence is invalid, replace the
// maximal valid sequence with a single U+FFFD replacement
// character as described in http://unicode.org/review/pr-121.html.
let lower = 0x80;
let upper = 0xbf;
let numCont = 0;
let cp = 0;
let cpMin = 0;
let cpMax = 0;
if (ch >= 0b11000000 && ch <= 0b11011111) {
if (ch >= 0x80 && ch <= 0xbf) {
// Invalid leading continuation byte.
pushReplacement();
continue;
} else if (ch >= 0xc0 && ch <= 0xc1) {
// Invalid 2-byte sequence, initial byte too low.
pushReplacement();
continue;
} else if (ch >= 0xc2 && ch <= 0xdf) {
// 2-byte sequence, valid initial byte.
numCont = 1;
cp = (ch - 0b11000000);
cp = ch & 0x1f;
cpMin = 0x80;
cpMax = 0x7ff;
} else if (ch >= 0b11100000 && ch <= 0b11101111) {
lower = 0x80;
upper = 0xbf;
} else if (ch >= 0xe0 && ch <= 0xef) {
// 3-byte sequence, valid initial byte.
numCont = 2;
cp = (ch - 0b11100000);
cp = ch & 0x0f;
cpMin = 0x800;
cpMax = 0xffff;
} else if (ch >= 0b11110000 && ch <= 0b11110111) {
lower = (ch === 0xe0 ? 0xa0 : 0x80);
upper = 0xbf;
// This would be the case for TextDecoder to reject U+D800 to U+DFFF,
// but we must allow them.
//upper = (ch === 0xed ? 0x9f : 0xbf);
} else if (ch >= 0xf0 && ch <= 0xf4) {
// 4-byte sequence, valid initial byte.
numCont = 3;
cp = (ch - 0b11110000);
cp = ch & 0x07;
cpMin = 0x10000;
cpMax = 0x10ffff;
lower = (ch === 0xf0 ? 0x90 : 0x80);
upper = (ch === 0xf4 ? 0x8f : 0xbf);
} else if (ch >= 0xf5 && ch <= 0xf7) {
// Invalid 4-byte sequence, initial byte too high.
pushReplacement();
continue;
} else {
// Invalid initial byte, replace with one replacement char.
// This branch produces the maximum 3x input expansion.
// Invalid UTF-8 sequence, invalid initial byte.
pushReplacement();
continue;
}
@ -226,8 +253,10 @@ function wtf8SanitizeString(u8) {
}
let cb = u8[i++];
if (cb >= 0b10000000 && cb <= 0b10111111) {
cp = (cp * 64) + (cb - 0b10000000);
if (cb >= lower && cb <= upper) {
cp = (cp << 6) + (cb & 0x3f);
lower = 0x80;
upper = 0xbf;
} else {
// Encoding broken at current index, replace everything so
// far (excluding the broken byte) with one replacement
@ -246,23 +275,11 @@ function wtf8SanitizeString(u8) {
}
if (cp < cpMin || cp > cpMax) {
// Not a canonical shortest encoding or out-of-bounds, replace the
// entire sequence with a single replacement character.
//
// This doesn't currently handle non-shortest encoding or codepoints
// above U+10FFFF correctly: we'll first decode the entire sequence
// and then replace it with a single U+FFFD if it's incorrect. The
// correct behavior is to detect the first broken byte which necessarily
// makes the result invalid, and consume only the maximal valid byte
// prefix.
//
// For example, for U+110000 the encoding is F4 90 80 80, and the 90 is
// already broken so the initial F4 gets replaced with a U+FFFD and 90
// is then considered again. The correct output for F4 90 80 80 is
// U+FFFD U+FFFD U+FFFD U+FFFD.
pushReplacement();
continue;
// This should never happen because the upper/lower continuation
// byte should restrict the codepoint to the allowed range.
throw new TypeError('internal error: codepoint not within [cpMin,cpMax]: ' + cp + ' vs ' + cpMin + '-' + cpMax);
//pushReplacement();
//continue;
}
// Successfully decoded a UTF-8 codepoint. We will either:
@ -299,10 +316,10 @@ function wtf8SanitizeString(u8) {
// '\xee\x80\x80'
let bytesLeft = u8.length - i;
let validSurrogatePair = bytesLeft >= 3 &&
let validSurrogatePair = (bytesLeft >= 3 &&
u8[i] === 0xed &&
u8[i + 1] >= 0xa0 && u8[i + 1] <= 0xbf &&
u8[i + 2] >= 0x80 && u8[i + 2] <= 0xbf;
u8[i + 2] >= 0x80 && u8[i + 2] <= 0xbf);
if (validSurrogatePair) {
// Valid low surrogate follows. Decode and combine.
let hi = cp;
@ -337,6 +354,8 @@ function wtf8SanitizeString(u8) {
inputDebugString: getDebugStringForU8(u8),
outputUint8ArrayWtf8,
outputUint8ArrayCesu8,
outputCodepointsWtf8: codepointResultWtf8,
outputCodepointsCesu8: codepointResultCesu8,
outputDebugStringWtf8,
outputDebugStringCesu8,
byteLengthWtf8: outputUint8ArrayWtf8.length,
@ -352,13 +371,14 @@ function wtf8SanitizeString(u8) {
};
}
function sanitizeToWtf8(u8) {
function sanitizeToWtf8(u8, args) {
if (!(typeof u8 === 'object' && u8 !== null && u8 instanceof Uint8Array)) {
throw new TypeError('input must be a Uint8Array');
}
// Special check for symbol strings.
if (u8.length >= 1 && (u8[0] === 0x80 || u8[0] === 0x81 || u8[0] === 0x82 || u8[0] === 0xff)) {
let allowSymbol = !(args && args.allowSymbol === false);
if (allowSymbol && u8.length >= 1 && (u8[0] === 0x80 || u8[0] === 0x81 || u8[0] === 0x82 || u8[0] === 0xff)) {
return wtf8SanitizeSymbol(u8);
} else {
return wtf8SanitizeString(u8);
@ -384,6 +404,7 @@ function sanitizeToWtf8(u8) {
// on conversion or validation.
function testPr121() {
// Example given in PR-121.
let res = sanitizeToWtf8(new Uint8Array([0x61, 0xf1, 0x80, 0x80, 0xe1, 0x80, 0xc2, 0x62]));
//console.log(res);
if (res.outputDebugStringWtf8 !== 'a<fffd><fffd><fffd>b') {
@ -395,16 +416,320 @@ function testPr121() {
// Self test for an out-of-range sequence: 'A' followed by F4 90 80 80,
// which would be the encoding of U+110000. The expected WTF-8 debug
// string is 'A' plus four replacement characters, i.e. one U+FFFD for
// each byte of the rejected sequence.
//
// Throws a TypeError when the sanitizer output differs.
function testUnicode110000() {
    const inputBytes = Uint8Array.of(0x41, 0xf4, 0x90, 0x80, 0x80);
    const expectedDebugString = 'A<fffd><fffd><fffd><fffd>';

    const { outputDebugStringWtf8 } = sanitizeToWtf8(inputBytes);
    if (outputDebugStringWtf8 === expectedDebugString) {
        return;
    }
    throw new TypeError('wtf8 sanitize self test failed');
}
// Table-driven boundary test of the valid byte ranges for each UTF-8
// initial byte class.
//
// Each table row has:
//   input:             the raw input bytes.
//   output:            the expected sanitized WTF-8 bytes, written as a
//                      list whose items are either R (one U+FFFD
//                      replacement, EF BF BD), I (the whole input passed
//                      through unchanged), or a literal byte value.
//   textDecoderOutput: optional; the expected result of a
//                      TextDecoder/TextEncoder round trip when it differs
//                      from 'output'. The rows that set it all involve
//                      U+D800 to U+DFFF, which TextDecoder() rejects but
//                      WTF-8 must let through.
//
// Every row is checked twice: once against sanitizeToWtf8() (with symbol
// handling disabled so that initial bytes 0x80-0x82 and 0xFF are treated
// as ordinary string data) and once against the TextDecoder round trip.
//
// Throws a TypeError on the first mismatch.
function testByteRanges() {
    const R = 'REPLACEMENT';
    const I = 'INPUT';

    const tests = [
        { input: [ 0x00 ], output: [ I ] },
        { input: [ 0x40 ], output: [ I ] },
        { input: [ 0x7f ], output: [ I ] },

        { input: [ 0x80 ], output: [ R ] },
        { input: [ 0x80, 0x80 ], output: [ R, R ] },
        { input: [ 0xa0 ], output: [ R ] },
        { input: [ 0xbf ], output: [ R ] },
        { input: [ 0xbf, 0xbf ], output: [ R, R ] },

        { input: [ 0xc0 ], output: [ R ] },
        { input: [ 0xc0, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xc0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xc0, 0x80 ], output: [ R, R ] },
        { input: [ 0xc0, 0xbf ], output: [ R, R ] },
        { input: [ 0xc0, 0xc0 ], output: [ R, R ] },

        { input: [ 0xc1 ], output: [ R ] },
        { input: [ 0xc1, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xc1, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xc1, 0x80 ], output: [ R, R ] },
        { input: [ 0xc1, 0xbf ], output: [ R, R ] },
        { input: [ 0xc1, 0xc0 ], output: [ R, R ] },

        { input: [ 0xc2, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xc2, 0x80 ], output: [ I ] },  // smallest valid 2-byte: U+0080
        { input: [ 0xc2, 0xbf ], output: [ I ] },
        { input: [ 0xc2, 0xc0 ], output: [ R, R ] },

        { input: [ 0xc3, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xc3, 0x80 ], output: [ I ] },
        { input: [ 0xc3, 0xbf ], output: [ I ] },
        { input: [ 0xc3, 0xc0 ], output: [ R, R ] },

        { input: [ 0xdf, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xdf, 0x80 ], output: [ I ] },
        { input: [ 0xdf, 0xbf ], output: [ I ] },  // highest valid 2-byte: U+07FF
        { input: [ 0xdf, 0xc0 ], output: [ R, R ] },

        { input: [ 0xe0 ], output: [ R ] },
        { input: [ 0xe0, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xe0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xe0, 0x80 ], output: [ R, R ] },
        { input: [ 0xe0, 0x80, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xe0, 0x80, 0xbf ], output: [ R, R, R ] },
        { input: [ 0xe0, 0x9f ], output: [ R, R ] },
        { input: [ 0xe0, 0x9f, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xe0, 0x9f, 0xbf ], output: [ R, R, R ] },
        { input: [ 0xe0, 0xa0 ], output: [ R ] },  // lower limit for 2nd byte is 0xa0 so valid here (but truncated)
        { input: [ 0xe0, 0xa0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xe0, 0xa0, 0x80 ], output: [ I ] },  // smallest valid 3-byte: U+0800
        { input: [ 0xe0, 0xa0, 0xbf ], output: [ I ] },
        { input: [ 0xe0, 0xbf, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xe0, 0xbf, 0x80 ], output: [ I ] },
        { input: [ 0xe0, 0xbf, 0xbf ], output: [ I ] },

        { input: [ 0xe1 ], output: [ R ] },
        { input: [ 0xe1, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xe1, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xe1, 0x80 ], output: [ R ] },
        { input: [ 0xe1, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xe1, 0x80, 0xbf ], output: [ I ] },
        { input: [ 0xe1, 0x9f ], output: [ R ] },
        { input: [ 0xe1, 0x9f, 0x80 ], output: [ I ] },
        { input: [ 0xe1, 0x9f, 0xbf ], output: [ I ] },
        { input: [ 0xe1, 0xa0 ], output: [ R ] },
        { input: [ 0xe1, 0xa0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xe1, 0xa0, 0x80 ], output: [ I ] },
        { input: [ 0xe1, 0xa0, 0xbf ], output: [ I ] },
        { input: [ 0xe1, 0xbf, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xe1, 0xbf, 0x80 ], output: [ I ] },
        { input: [ 0xe1, 0xbf, 0xbf ], output: [ I ] },

        // Unlike with TextDecoder(), initial byte 0xED does not give the
        // 2nd byte an upper limit of 0x9F, because U+D800 to U+DFFF must
        // be allowed for WTF-8; they pass through as is (when otherwise
        // valid).
        { input: [ 0xed ], output: [ R ] },
        { input: [ 0xed, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xed, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xed, 0x80 ], output: [ R ] },
        { input: [ 0xed, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xed, 0x80, 0xbf ], output: [ I ] },
        { input: [ 0xed, 0x9f ], output: [ R ] },
        { input: [ 0xed, 0x9f, 0x80 ], output: [ I ] },
        { input: [ 0xed, 0x9f, 0xbf ], output: [ I ] },
        { input: [ 0xed, 0xa0 ], output: [ R ], textDecoderOutput: [ R, R ] },  // TextDecoder() rejects the 2nd byte so two replacements
        { input: [ 0xed, 0xa0, 0x7f ], output: [ R, 0x7f ], textDecoderOutput: [ R, R, 0x7f ] },
        { input: [ 0xed, 0xa0, 0x80 ], output: [ I ], textDecoderOutput: [ R, R, R ] },
        { input: [ 0xed, 0xa0, 0xbf ], output: [ I ], textDecoderOutput: [ R, R, R ] },
        { input: [ 0xed, 0xbf, 0x7f ], output: [ R, 0x7f ], textDecoderOutput: [ R, R, 0x7f ] },
        { input: [ 0xed, 0xbf, 0x80 ], output: [ I ], textDecoderOutput: [ R, R, R ] },
        { input: [ 0xed, 0xbf, 0xbf ], output: [ I ], textDecoderOutput: [ R, R, R ] },

        // Valid surrogate pairs get combined. Just a few point checks here.
        { input: [ 0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80 ], output: [ 0xf0, 0x90, 0x80, 0x80 ], textDecoderOutput: [ R, R, R, R, R, R ] },  // U+D800 U+DC00 => U+10000
        { input: [ 0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf ], output: [ 0xf0, 0x90, 0x8f, 0xbf ], textDecoderOutput: [ R, R, R, R, R, R ] },  // U+D800 U+DFFF => U+103FF
        { input: [ 0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80 ], output: [ 0xf4, 0x8f, 0xb0, 0x80 ], textDecoderOutput: [ R, R, R, R, R, R ] },  // U+DBFF U+DC00 => U+10FC00
        { input: [ 0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf ], output: [ 0xf4, 0x8f, 0xbf, 0xbf ], textDecoderOutput: [ R, R, R, R, R, R ] },  // U+DBFF U+DFFF => U+10FFFF

        { input: [ 0xee ], output: [ R ] },
        { input: [ 0xee, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xee, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xee, 0x80 ], output: [ R ] },
        { input: [ 0xee, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xee, 0x80, 0xbf ], output: [ I ] },
        { input: [ 0xee, 0x9f ], output: [ R ] },
        { input: [ 0xee, 0x9f, 0x80 ], output: [ I ] },
        { input: [ 0xee, 0x9f, 0xbf ], output: [ I ] },
        { input: [ 0xee, 0xa0 ], output: [ R ] },
        { input: [ 0xee, 0xa0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xee, 0xa0, 0x80 ], output: [ I ] },
        { input: [ 0xee, 0xa0, 0xbf ], output: [ I ] },
        { input: [ 0xee, 0xbf, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xee, 0xbf, 0x80 ], output: [ I ] },
        { input: [ 0xee, 0xbf, 0xbf ], output: [ I ] },

        { input: [ 0xef ], output: [ R ] },
        { input: [ 0xef, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xef, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xef, 0x80 ], output: [ R ] },
        { input: [ 0xef, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xef, 0x80, 0xbf ], output: [ I ] },
        { input: [ 0xef, 0x9f ], output: [ R ] },
        { input: [ 0xef, 0x9f, 0x80 ], output: [ I ] },
        { input: [ 0xef, 0x9f, 0xbf ], output: [ I ] },
        { input: [ 0xef, 0xa0 ], output: [ R ] },
        { input: [ 0xef, 0xa0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xef, 0xa0, 0x80 ], output: [ I ] },
        { input: [ 0xef, 0xa0, 0xbf ], output: [ I ] },
        { input: [ 0xef, 0xbf, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xef, 0xbf, 0x80 ], output: [ I ] },
        { input: [ 0xef, 0xbf, 0xbf ], output: [ I ] },

        { input: [ 0xf0 ], output: [ R ] },
        { input: [ 0xf0, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf0, 0x80 ], output: [ R, R ] },
        { input: [ 0xf0, 0x80, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf0, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf0, 0x8f ], output: [ R, R ] },
        { input: [ 0xf0, 0x8f, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf0, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf0, 0x90 ], output: [ R ] },  // lower limit for 2nd byte is 0x90 so valid here (but truncated)
        { input: [ 0xf0, 0x90, 0x80 ], output: [ R ] },
        { input: [ 0xf0, 0x90, 0x80, 0x80 ], output: [ I ] },  // smallest valid 4-byte: U+10000
        { input: [ 0xf0, 0xbf, 0xbf, 0xbf ], output: [ I ] },
        { input: [ 0xf0, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        { input: [ 0xf1 ], output: [ R ] },
        { input: [ 0xf1, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf1, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf1, 0x80 ], output: [ R ] },
        { input: [ 0xf1, 0x80, 0x80 ], output: [ R ] },
        { input: [ 0xf1, 0x80, 0x80, 0x80 ], output: [ I ] },  // U+40000
        { input: [ 0xf1, 0x8f ], output: [ R ] },
        { input: [ 0xf1, 0x8f, 0x80 ], output: [ R ] },
        { input: [ 0xf1, 0x8f, 0x80, 0x80 ], output: [ I ] },  // U+4F000
        { input: [ 0xf1, 0x90 ], output: [ R ] },
        { input: [ 0xf1, 0x90, 0x80 ], output: [ R ] },
        { input: [ 0xf1, 0x90, 0x80, 0x80 ], output: [ I ] },  // U+50000
        { input: [ 0xf1, 0xbf, 0xbf, 0xbf ], output: [ I ] },
        { input: [ 0xf1, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        { input: [ 0xf3 ], output: [ R ] },
        { input: [ 0xf3, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf3, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf3, 0x80 ], output: [ R ] },
        { input: [ 0xf3, 0x80, 0x80 ], output: [ R ] },
        { input: [ 0xf3, 0x80, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xf3, 0x8f ], output: [ R ] },
        { input: [ 0xf3, 0x8f, 0x80 ], output: [ R ] },
        { input: [ 0xf3, 0x8f, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xf3, 0x90 ], output: [ R ] },
        { input: [ 0xf3, 0x90, 0x80 ], output: [ R ] },
        { input: [ 0xf3, 0x90, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xf3, 0xbf, 0xbf, 0xbf ], output: [ I ] },
        { input: [ 0xf3, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        { input: [ 0xf4 ], output: [ R ] },
        { input: [ 0xf4, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf4, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf4, 0x80 ], output: [ R ] },
        { input: [ 0xf4, 0x80, 0x80 ], output: [ R ] },
        { input: [ 0xf4, 0x80, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xf4, 0x8f ], output: [ R ] },
        { input: [ 0xf4, 0x8f, 0x80 ], output: [ R ] },
        { input: [ 0xf4, 0x8f, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xf4, 0x90 ], output: [ R, R ] },  // upper limit for 2nd byte is 0x8f so invalid here
        { input: [ 0xf4, 0x90, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf4, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf4, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
        { input: [ 0xf4, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        // F5-F7 is technically a leading byte for a 4-byte encoding but
        // the encoded values are > U+10FFFF so F5-F7 are rejected.
        { input: [ 0xf5 ], output: [ R ] },
        { input: [ 0xf5, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf5, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf5, 0x80 ], output: [ R, R ] },
        { input: [ 0xf5, 0x80, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf5, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf5, 0x8f ], output: [ R, R ] },
        { input: [ 0xf5, 0x8f, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf5, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf5, 0x90 ], output: [ R, R ] },
        { input: [ 0xf5, 0x90, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf5, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf5, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
        { input: [ 0xf5, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        { input: [ 0xf7 ], output: [ R ] },
        { input: [ 0xf7, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf7, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf7, 0x80 ], output: [ R, R ] },
        { input: [ 0xf7, 0x80, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf7, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf7, 0x8f ], output: [ R, R ] },
        { input: [ 0xf7, 0x8f, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf7, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf7, 0x90 ], output: [ R, R ] },
        { input: [ 0xf7, 0x90, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf7, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf7, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
        { input: [ 0xf7, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        // F8-FF are all invalid so rejected as initial byte.
        { input: [ 0xf8 ], output: [ R ] },
        { input: [ 0xf8, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf8, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf8, 0x80 ], output: [ R, R ] },
        { input: [ 0xf8, 0x80, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf8, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf8, 0x8f ], output: [ R, R ] },
        { input: [ 0xf8, 0x8f, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf8, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf8, 0x90 ], output: [ R, R ] },
        { input: [ 0xf8, 0x90, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf8, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf8, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
        { input: [ 0xf8, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        { input: [ 0xff ], output: [ R ] },
        { input: [ 0xff, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xff, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xff, 0x80 ], output: [ R, R ] },
        { input: [ 0xff, 0x80, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xff, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xff, 0x8f ], output: [ R, R ] },
        { input: [ 0xff, 0x8f, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xff, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xff, 0x90 ], output: [ R, R ] },
        { input: [ 0xff, 0x90, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xff, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xff, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
        { input: [ 0xff, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },
    ];

    // Expand an expectation list into concrete bytes: the replacement
    // placeholder becomes EF BF BD (UTF-8 for U+FFFD), the input
    // placeholder becomes a copy of the input bytes, and anything else
    // is taken as a literal byte value.
    function prepOutput(output, input) {
        const expanded = [];
        for (const item of output) {
            if (item === 'REPLACEMENT') {
                expanded.push(0xef, 0xbf, 0xbd);
            } else if (item === 'INPUT') {
                expanded.push(...input);
            } else {
                expanded.push(item);
            }
        }
        return expanded;
    }

    // Compare two byte sequences, length first and then element by
    // element. 'suffix' is appended to the failure message so that the
    // sanitizer check and the TextDecoder check stay distinguishable.
    function checkBytes(actual, expected, suffix) {
        if (actual.length !== expected.length) {
            throw new TypeError('wtf8 sanitize self test failed, length mismatch' + suffix);
        }
        for (let idx = 0; idx < actual.length; idx++) {
            if (actual[idx] !== expected[idx]) {
                throw new TypeError('wtf8 sanitize self test failed, difference at index ' + idx + suffix);
            }
        }
    }

    for (const testCase of tests) {
        const { input, output, textDecoderOutput } = testCase;

        // 1. The sanitizer itself, with symbol string detection turned off.
        const expectedWtf8 = prepOutput(output, input);
        const sanitized = sanitizeToWtf8(new Uint8Array(input), { allowSymbol: false });
        checkBytes(sanitized.outputUint8ArrayWtf8, expectedWtf8, '');

        // 2. Cross-check against the platform decoder; expectations only
        //    differ for rows that carry an explicit textDecoderOutput.
        const expectedRoundTrip = prepOutput(textDecoderOutput || output, input);
        const decodedText = new TextDecoder().decode(new Uint8Array(input));
        const roundTripped = new TextEncoder().encode(decodedText);
        checkBytes(roundTripped, expectedRoundTrip, ' for TextDecoder output');
    }
}
// Run the self tests one after another: PR-121 example first, then the
// U+110000 case, then the byte range table. Nothing is caught here, so an
// exception thrown by one test propagates and the remaining ones are not run.
function runSelfTests() {
    const selfTests = [ testPr121, testUnicode110000, testByteRanges ];
    for (const selfTest of selfTests) {
        selfTest();
    }
}
// Invoke the self test runner; a failing self test surfaces as a thrown error.
runSelfTests();

Loading…
Cancel
Save