|
|
@ -172,45 +172,72 @@ function wtf8SanitizeString(u8) { |
|
|
|
//let iStart = i;
|
|
|
|
let ch = u8[i++]; |
|
|
|
|
|
|
|
// ASCII.
|
|
|
|
/* Decoder based on https://encoding.spec.whatwg.org/#utf-8-decoder |
|
|
|
* in "replacement" mode, except that: |
|
|
|
* - U+D800 to U+DFFF (encoded ED A0 80 to ED BF BF) are allowed |
|
|
|
* to reach WTF-8 surrogate pair processing. |
|
|
|
* |
|
|
|
* Note in particular that non-canonical encodings (codepoint value |
|
|
|
* too low or too high) must be terminated on the first invalid byte, |
|
|
|
* not after decoding the whole N-byte sequence. |
|
|
|
* |
|
|
|
* Maximum input expansion is 3x from an invalid initial byte expanding |
|
|
|
* to 3 bytes of UTF-8 encoded U+FFFD replacement character. |
|
|
|
*/ |
|
|
|
|
|
|
|
if (ch <= 0x7f) { |
|
|
|
pushWtf8(ch); |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
// Non-ASCII, decode initial (extended) UTF-8 byte:
|
|
|
|
// 10000000 to 10111111: invalid
|
|
|
|
// 110xxxxx + 1 continuation: U+0080 to U+07FF
|
|
|
|
// 1110xxxx + 2 continuation: U+0800 to U+FFFF
|
|
|
|
// 11110xxx + 3 continuation: U+10000 to U+10FFFF
|
|
|
|
// 11111000 to 11111111: invalid
|
|
|
|
//
|
|
|
|
// If the (extended) UTF-8 sequence is invalid, replace the
|
|
|
|
// maximal valid sequence with a single U+FFFD replacement
|
|
|
|
// character as described in http://unicode.org/review/pr-121.html.
|
|
|
|
|
|
|
|
let lower = 0x80; |
|
|
|
let upper = 0xbf; |
|
|
|
let numCont = 0; |
|
|
|
let cp = 0; |
|
|
|
let cpMin = 0; |
|
|
|
let cpMax = 0; |
|
|
|
if (ch >= 0b11000000 && ch <= 0b11011111) { |
|
|
|
|
|
|
|
if (ch >= 0x80 && ch <= 0xbf) { |
|
|
|
// Invalid leading continuation byte.
|
|
|
|
pushReplacement(); |
|
|
|
continue; |
|
|
|
} else if (ch >= 0xc0 && ch <= 0xc1) { |
|
|
|
// Invalid 2-byte sequence, initial byte too low.
|
|
|
|
pushReplacement(); |
|
|
|
continue; |
|
|
|
} else if (ch >= 0xc2 && ch <= 0xdf) { |
|
|
|
// 2-byte sequence, valid initial byte.
|
|
|
|
numCont = 1; |
|
|
|
cp = (ch - 0b11000000); |
|
|
|
cp = ch & 0x1f; |
|
|
|
cpMin = 0x80; |
|
|
|
cpMax = 0x7ff; |
|
|
|
} else if (ch >= 0b11100000 && ch <= 0b11101111) { |
|
|
|
lower = 0x80; |
|
|
|
upper = 0xbf; |
|
|
|
} else if (ch >= 0xe0 && ch <= 0xef) { |
|
|
|
// 3-byte sequence, valid initial byte.
|
|
|
|
numCont = 2; |
|
|
|
cp = (ch - 0b11100000); |
|
|
|
cp = ch & 0x0f; |
|
|
|
cpMin = 0x800; |
|
|
|
cpMax = 0xffff; |
|
|
|
} else if (ch >= 0b11110000 && ch <= 0b11110111) { |
|
|
|
lower = (ch === 0xe0 ? 0xa0 : 0x80); |
|
|
|
upper = 0xbf; |
|
|
|
// This would be the case for TextDecoder to reject U+D800 to U+DFFF,
|
|
|
|
// but we must allow them.
|
|
|
|
//upper = (ch === 0xed ? 0x9f : 0xbf);
|
|
|
|
} else if (ch >= 0xf0 && ch <= 0xf4) { |
|
|
|
// 4-byte sequence, valid initial byte.
|
|
|
|
numCont = 3; |
|
|
|
cp = (ch - 0b11110000); |
|
|
|
cp = ch & 0x07; |
|
|
|
cpMin = 0x10000; |
|
|
|
cpMax = 0x10ffff; |
|
|
|
lower = (ch === 0xf0 ? 0x90 : 0x80); |
|
|
|
upper = (ch === 0xf4 ? 0x8f : 0xbf); |
|
|
|
} else if (ch >= 0xf5 && ch <= 0xf7) { |
|
|
|
// Invalid 4-byte sequence, initial byte too high.
|
|
|
|
pushReplacement(); |
|
|
|
continue; |
|
|
|
} else { |
|
|
|
// Invalid initial byte, replace with one replacement char.
|
|
|
|
// This branch produces the maximum 3x input expansion.
|
|
|
|
// Invalid UTF-8 sequence, invalid initial byte.
|
|
|
|
pushReplacement(); |
|
|
|
continue; |
|
|
|
} |
|
|
@ -226,8 +253,10 @@ function wtf8SanitizeString(u8) { |
|
|
|
} |
|
|
|
|
|
|
|
let cb = u8[i++]; |
|
|
|
if (cb >= 0b10000000 && cb <= 0b10111111) { |
|
|
|
cp = (cp * 64) + (cb - 0b10000000); |
|
|
|
if (cb >= lower && cb <= upper) { |
|
|
|
cp = (cp << 6) + (cb & 0x3f); |
|
|
|
lower = 0x80; |
|
|
|
upper = 0xbf; |
|
|
|
} else { |
|
|
|
// Encoding broken at current index, replace everything so
|
|
|
|
// far (excluding the broken byte) with one replacement
|
|
|
@ -246,23 +275,11 @@ function wtf8SanitizeString(u8) { |
|
|
|
} |
|
|
|
|
|
|
|
if (cp < cpMin || cp > cpMax) { |
|
|
|
// Not a canonical shortest encoding or out-of-bounds, replace the
|
|
|
|
// entire sequence with a single replacement character.
|
|
|
|
//
|
|
|
|
// This doesn't currently handle non-shortest encoding or codepoints
|
|
|
|
// above U+10FFFF correctly: we'll first decode the entire sequence
|
|
|
|
// and then replace it with a single U+FFFD if it's incorrect. The
|
|
|
|
// correct behavior is to detect the first broken byte which necessarily
|
|
|
|
// makes the result invalid, and consume only the maximal valid byte
|
|
|
|
// prefix.
|
|
|
|
//
|
|
|
|
// For example, for U+110000 the encoding is F4 90 80 80, and the 90 is
|
|
|
|
// already broken so the initial F4 gets replaced with a U+FFFD and 90
|
|
|
|
// is then considered again. The correct output for F4 90 80 80 is
|
|
|
|
// U+FFFD U+FFFD U+FFFD U+FFFD.
|
|
|
|
|
|
|
|
pushReplacement(); |
|
|
|
continue; |
|
|
|
// This should never happen because the upper/lower continuation
|
|
|
|
// byte should restrict the codepoint to the allowed range.
|
|
|
|
throw new TypeError('internal error: codepoint not within [cpMin,cpMax]: ' + cp + ' vs ' + cpMin + '-' + cpMax); |
|
|
|
//pushReplacement();
|
|
|
|
//continue;
|
|
|
|
} |
|
|
|
|
|
|
|
// Successfully decoded a UTF-8 codepoint. We will either:
|
|
|
@ -299,10 +316,10 @@ function wtf8SanitizeString(u8) { |
|
|
|
// '\xee\x80\x80'
|
|
|
|
|
|
|
|
let bytesLeft = u8.length - i; |
|
|
|
let validSurrogatePair = bytesLeft >= 3 && |
|
|
|
let validSurrogatePair = (bytesLeft >= 3 && |
|
|
|
u8[i] === 0xed && |
|
|
|
u8[i + 1] >= 0xa0 && u8[i + 1] <= 0xbf && |
|
|
|
u8[i + 2] >= 0x80 && u8[i + 2] <= 0xbf; |
|
|
|
u8[i + 2] >= 0x80 && u8[i + 2] <= 0xbf); |
|
|
|
if (validSurrogatePair) { |
|
|
|
// Valid low surrogate follows. Decode and combine.
|
|
|
|
let hi = cp; |
|
|
@ -337,6 +354,8 @@ function wtf8SanitizeString(u8) { |
|
|
|
inputDebugString: getDebugStringForU8(u8), |
|
|
|
outputUint8ArrayWtf8, |
|
|
|
outputUint8ArrayCesu8, |
|
|
|
outputCodepointsWtf8: codepointResultWtf8, |
|
|
|
outputCodepointsCesu8: codepointResultCesu8, |
|
|
|
outputDebugStringWtf8, |
|
|
|
outputDebugStringCesu8, |
|
|
|
byteLengthWtf8: outputUint8ArrayWtf8.length, |
|
|
@ -352,13 +371,14 @@ function wtf8SanitizeString(u8) { |
|
|
|
}; |
|
|
|
} |
|
|
|
|
|
|
|
function sanitizeToWtf8(u8) { |
|
|
|
function sanitizeToWtf8(u8, args) { |
|
|
|
if (!(typeof u8 === 'object' && u8 !== null && u8 instanceof Uint8Array)) { |
|
|
|
throw new TypeError('input must be a Uint8Array'); |
|
|
|
} |
|
|
|
|
|
|
|
// Special check for symbol strings.
|
|
|
|
if (u8.length >= 1 && (u8[0] === 0x80 || u8[0] === 0x81 || u8[0] === 0x82 || u8[0] === 0xff)) { |
|
|
|
let allowSymbol = !(args && args.allowSymbol === false); |
|
|
|
if (allowSymbol && u8.length >= 1 && (u8[0] === 0x80 || u8[0] === 0x81 || u8[0] === 0x82 || u8[0] === 0xff)) { |
|
|
|
return wtf8SanitizeSymbol(u8); |
|
|
|
} else { |
|
|
|
return wtf8SanitizeString(u8); |
|
|
@ -384,6 +404,7 @@ function sanitizeToWtf8(u8) { |
|
|
|
// on conversion or validation.
|
|
|
|
|
|
|
|
function testPr121() { |
|
|
|
// Example given in PR-121.
|
|
|
|
let res = sanitizeToWtf8(new Uint8Array([0x61, 0xf1, 0x80, 0x80, 0xe1, 0x80, 0xc2, 0x62])); |
|
|
|
//console.log(res);
|
|
|
|
if (res.outputDebugStringWtf8 !== 'a<fffd><fffd><fffd>b') { |
|
|
@ -395,16 +416,320 @@ function testPr121() { |
|
|
|
/**
 * Self test: an encoding of U+110000 (F4 90 80 80) must be rejected byte by
 * byte rather than after decoding the whole 4-byte sequence.
 *
 * With initial byte F4 the second byte is limited to 80..8F, so 90 already
 * breaks the sequence: F4 is replaced by one U+FFFD and each of the remaining
 * bytes (90, 80, 80) is then an invalid initial byte producing its own U+FFFD.
 *
 * @throws {TypeError} if the sanitized debug string differs from the expected one.
 */
function testUnicode110000() {
    const expected = 'A<fffd><fffd><fffd><fffd>';
    const res = sanitizeToWtf8(new Uint8Array([0x41, 0xf4, 0x90, 0x80, 0x80]));
    if (res.outputDebugStringWtf8 !== expected) {
        throw new TypeError('wtf8 sanitize self test failed');
    }
}
|
|
|
|
|
|
|
/**
 * Self test: boundary checks for the valid byte ranges of the (extended)
 * UTF-8 decoder used by sanitizeToWtf8().
 *
 * Every case is checked twice:
 *   1. sanitizeToWtf8(input, { allowSymbol: false }).outputUint8ArrayWtf8 must
 *      equal the expected `output`.
 *   2. A TextDecoder() -> TextEncoder() round trip of the same input must equal
 *      `textDecoderOutput` (or `output` when no override is given). The two
 *      are expected to agree on decoding and replacement character behavior
 *      except when U+D800 to U+DFFF are involved.
 *
 * In the table, R stands for one UTF-8 encoded U+FFFD (EF BF BD) and I stands
 * for "the input bytes, unchanged"; plain numbers are literal bytes.
 *
 * @throws {TypeError} on the first mismatch; the message includes the input,
 *         expected and actual bytes in hex.
 */
function testByteRanges() {
    const R = 'REPLACEMENT';
    const I = 'INPUT';
    const tests = [
        { input: [ 0x00 ], output: [ I ] },
        { input: [ 0x40 ], output: [ I ] },
        { input: [ 0x7f ], output: [ I ] },

        // 80-BF: continuation bytes are invalid as an initial byte.
        { input: [ 0x80 ], output: [ R ] },
        { input: [ 0x80, 0x80 ], output: [ R, R ] },
        { input: [ 0xa0 ], output: [ R ] },
        { input: [ 0xbf ], output: [ R ] },
        { input: [ 0xbf, 0xbf ], output: [ R, R ] },

        // C0-C1: 2-byte initial byte too low (non-shortest encoding).
        { input: [ 0xc0 ], output: [ R ] },
        { input: [ 0xc0, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xc0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xc0, 0x80 ], output: [ R, R ] },
        { input: [ 0xc0, 0xbf ], output: [ R, R ] },
        { input: [ 0xc0, 0xc0 ], output: [ R, R ] },

        { input: [ 0xc1 ], output: [ R ] },
        { input: [ 0xc1, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xc1, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xc1, 0x80 ], output: [ R, R ] },
        { input: [ 0xc1, 0xbf ], output: [ R, R ] },
        { input: [ 0xc1, 0xc0 ], output: [ R, R ] },

        { input: [ 0xc2, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xc2, 0x80 ], output: [ I ] },  // smallest valid 2-byte: U+0080
        { input: [ 0xc2, 0xbf ], output: [ I ] },
        { input: [ 0xc2, 0xc0 ], output: [ R, R ] },

        { input: [ 0xc3, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xc3, 0x80 ], output: [ I ] },
        { input: [ 0xc3, 0xbf ], output: [ I ] },
        { input: [ 0xc3, 0xc0 ], output: [ R, R ] },

        { input: [ 0xdf, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xdf, 0x80 ], output: [ I ] },
        { input: [ 0xdf, 0xbf ], output: [ I ] },  // highest valid 2-byte: U+07FF
        { input: [ 0xdf, 0xc0 ], output: [ R, R ] },

        { input: [ 0xe0 ], output: [ R ] },
        { input: [ 0xe0, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xe0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xe0, 0x80 ], output: [ R, R ] },
        { input: [ 0xe0, 0x80, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xe0, 0x80, 0xbf ], output: [ R, R, R ] },
        { input: [ 0xe0, 0x9f ], output: [ R, R ] },
        { input: [ 0xe0, 0x9f, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xe0, 0x9f, 0xbf ], output: [ R, R, R ] },
        { input: [ 0xe0, 0xa0 ], output: [ R ] },  // lower limit for 2nd byte 0xa0 so valid here (but truncated)
        { input: [ 0xe0, 0xa0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xe0, 0xa0, 0x80 ], output: [ I ] },  // smallest valid 3-byte: 0x800
        { input: [ 0xe0, 0xa0, 0xbf ], output: [ I ] },
        { input: [ 0xe0, 0xbf, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xe0, 0xbf, 0x80 ], output: [ I ] },
        { input: [ 0xe0, 0xbf, 0xbf ], output: [ I ] },

        { input: [ 0xe1 ], output: [ R ] },
        { input: [ 0xe1, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xe1, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xe1, 0x80 ], output: [ R ] },
        { input: [ 0xe1, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xe1, 0x80, 0xbf ], output: [ I ] },
        { input: [ 0xe1, 0x9f ], output: [ R ] },
        { input: [ 0xe1, 0x9f, 0x80 ], output: [ I ] },
        { input: [ 0xe1, 0x9f, 0xbf ], output: [ I ] },
        { input: [ 0xe1, 0xa0 ], output: [ R ] },
        { input: [ 0xe1, 0xa0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xe1, 0xa0, 0x80 ], output: [ I ] },
        { input: [ 0xe1, 0xa0, 0xbf ], output: [ I ] },
        { input: [ 0xe1, 0xbf, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xe1, 0xbf, 0x80 ], output: [ I ] },
        { input: [ 0xe1, 0xbf, 0xbf ], output: [ I ] },

        // Unlike with TextDecoder(), initial byte 0xED does not cause 2nd
        // byte to have upper limit 0x9F because U+D800 to U+DFFF must be
        // allowed for WTF-8 so these pass through as is (when otherwise valid).
        { input: [ 0xed ], output: [ R ] },
        { input: [ 0xed, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xed, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xed, 0x80 ], output: [ R ] },
        { input: [ 0xed, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xed, 0x80, 0xbf ], output: [ I ] },
        { input: [ 0xed, 0x9f ], output: [ R ] },
        { input: [ 0xed, 0x9f, 0x80 ], output: [ I ] },
        { input: [ 0xed, 0x9f, 0xbf ], output: [ I ] },
        { input: [ 0xed, 0xa0 ], output: [ R ], textDecoderOutput: [ R, R ] },  // TextDecoder() rejects the 2nd byte so two replacements
        { input: [ 0xed, 0xa0, 0x7f ], output: [ R, 0x7f ], textDecoderOutput: [ R, R, 0x7f ] },
        { input: [ 0xed, 0xa0, 0x80 ], output: [ I ], textDecoderOutput: [ R, R, R ] },
        { input: [ 0xed, 0xa0, 0xbf ], output: [ I ], textDecoderOutput: [ R, R, R ] },
        { input: [ 0xed, 0xbf, 0x7f ], output: [ R, 0x7f ], textDecoderOutput: [ R, R, 0x7f ] },
        { input: [ 0xed, 0xbf, 0x80 ], output: [ I ], textDecoderOutput: [ R, R, R ] },
        { input: [ 0xed, 0xbf, 0xbf ], output: [ I ], textDecoderOutput: [ R, R, R ] },

        // Valid surrogate pairs get combined.  Just a few point checks here.
        { input: [ 0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80 ], output: [ 0xf0, 0x90, 0x80, 0x80 ], textDecoderOutput: [ R, R, R, R, R, R ] },  // U+D800 U+DC00 => U+10000
        { input: [ 0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf ], output: [ 0xf0, 0x90, 0x8f, 0xbf ], textDecoderOutput: [ R, R, R, R, R, R ] },  // U+D800 U+DFFF => U+103FF
        { input: [ 0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80 ], output: [ 0xf4, 0x8f, 0xb0, 0x80 ], textDecoderOutput: [ R, R, R, R, R, R ] },  // U+DBFF U+DC00 => U+10FC00
        { input: [ 0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf ], output: [ 0xf4, 0x8f, 0xbf, 0xbf ], textDecoderOutput: [ R, R, R, R, R, R ] },  // U+DBFF U+DFFF => U+10FFFF

        { input: [ 0xee ], output: [ R ] },
        { input: [ 0xee, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xee, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xee, 0x80 ], output: [ R ] },
        { input: [ 0xee, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xee, 0x80, 0xbf ], output: [ I ] },
        { input: [ 0xee, 0x9f ], output: [ R ] },
        { input: [ 0xee, 0x9f, 0x80 ], output: [ I ] },
        { input: [ 0xee, 0x9f, 0xbf ], output: [ I ] },
        { input: [ 0xee, 0xa0 ], output: [ R ] },
        { input: [ 0xee, 0xa0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xee, 0xa0, 0x80 ], output: [ I ] },
        { input: [ 0xee, 0xa0, 0xbf ], output: [ I ] },
        { input: [ 0xee, 0xbf, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xee, 0xbf, 0x80 ], output: [ I ] },
        { input: [ 0xee, 0xbf, 0xbf ], output: [ I ] },

        { input: [ 0xef ], output: [ R ] },
        { input: [ 0xef, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xef, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xef, 0x80 ], output: [ R ] },
        { input: [ 0xef, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xef, 0x80, 0xbf ], output: [ I ] },
        { input: [ 0xef, 0x9f ], output: [ R ] },
        { input: [ 0xef, 0x9f, 0x80 ], output: [ I ] },
        { input: [ 0xef, 0x9f, 0xbf ], output: [ I ] },
        { input: [ 0xef, 0xa0 ], output: [ R ] },
        { input: [ 0xef, 0xa0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xef, 0xa0, 0x80 ], output: [ I ] },
        { input: [ 0xef, 0xa0, 0xbf ], output: [ I ] },
        { input: [ 0xef, 0xbf, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xef, 0xbf, 0x80 ], output: [ I ] },
        { input: [ 0xef, 0xbf, 0xbf ], output: [ I ] },

        { input: [ 0xf0 ], output: [ R ] },
        { input: [ 0xf0, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf0, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf0, 0x80 ], output: [ R, R ] },
        { input: [ 0xf0, 0x80, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf0, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf0, 0x8f ], output: [ R, R ] },
        { input: [ 0xf0, 0x8f, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf0, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf0, 0x90 ], output: [ R ] },  // lower limit for 2nd byte 0x90 so valid here (but truncated)
        { input: [ 0xf0, 0x90, 0x80 ], output: [ R ] },
        { input: [ 0xf0, 0x90, 0x80, 0x80 ], output: [ I ] },  // smallest valid 4-byte: U+10000
        { input: [ 0xf0, 0xbf, 0xbf, 0xbf ], output: [ I ] },
        { input: [ 0xf0, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        { input: [ 0xf1 ], output: [ R ] },
        { input: [ 0xf1, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf1, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf1, 0x80 ], output: [ R ] },
        { input: [ 0xf1, 0x80, 0x80 ], output: [ R ] },
        { input: [ 0xf1, 0x80, 0x80, 0x80 ], output: [ I ] },  // U+40000
        { input: [ 0xf1, 0x8f ], output: [ R ] },
        { input: [ 0xf1, 0x8f, 0x80 ], output: [ R ] },
        { input: [ 0xf1, 0x8f, 0x80, 0x80 ], output: [ I ] },  // U+4F000
        { input: [ 0xf1, 0x90 ], output: [ R ] },
        { input: [ 0xf1, 0x90, 0x80 ], output: [ R ] },
        { input: [ 0xf1, 0x90, 0x80, 0x80 ], output: [ I ] },  // U+50000
        { input: [ 0xf1, 0xbf, 0xbf, 0xbf ], output: [ I ] },
        { input: [ 0xf1, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        { input: [ 0xf3 ], output: [ R ] },
        { input: [ 0xf3, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf3, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf3, 0x80 ], output: [ R ] },
        { input: [ 0xf3, 0x80, 0x80 ], output: [ R ] },
        { input: [ 0xf3, 0x80, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xf3, 0x8f ], output: [ R ] },
        { input: [ 0xf3, 0x8f, 0x80 ], output: [ R ] },
        { input: [ 0xf3, 0x8f, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xf3, 0x90 ], output: [ R ] },
        { input: [ 0xf3, 0x90, 0x80 ], output: [ R ] },
        { input: [ 0xf3, 0x90, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xf3, 0xbf, 0xbf, 0xbf ], output: [ I ] },
        { input: [ 0xf3, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        { input: [ 0xf4 ], output: [ R ] },
        { input: [ 0xf4, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf4, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf4, 0x80 ], output: [ R ] },
        { input: [ 0xf4, 0x80, 0x80 ], output: [ R ] },
        { input: [ 0xf4, 0x80, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xf4, 0x8f ], output: [ R ] },
        { input: [ 0xf4, 0x8f, 0x80 ], output: [ R ] },
        { input: [ 0xf4, 0x8f, 0x80, 0x80 ], output: [ I ] },
        { input: [ 0xf4, 0x90 ], output: [ R, R ] },  // upper limit for 2nd byte 0x8f so invalid here
        { input: [ 0xf4, 0x90, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf4, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf4, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
        { input: [ 0xf4, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        // F5-F7 is technically a leading byte for a 4-byte encoding but
        // encoded values are > U+10FFFF so F5-F7 are rejected.
        { input: [ 0xf5 ], output: [ R ] },
        { input: [ 0xf5, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf5, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf5, 0x80 ], output: [ R, R ] },
        { input: [ 0xf5, 0x80, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf5, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf5, 0x8f ], output: [ R, R ] },
        { input: [ 0xf5, 0x8f, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf5, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf5, 0x90 ], output: [ R, R ] },
        { input: [ 0xf5, 0x90, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf5, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf5, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
        { input: [ 0xf5, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        { input: [ 0xf7 ], output: [ R ] },
        { input: [ 0xf7, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf7, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf7, 0x80 ], output: [ R, R ] },
        { input: [ 0xf7, 0x80, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf7, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf7, 0x8f ], output: [ R, R ] },
        { input: [ 0xf7, 0x8f, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf7, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf7, 0x90 ], output: [ R, R ] },
        { input: [ 0xf7, 0x90, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf7, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf7, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
        { input: [ 0xf7, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        // F8-FF are all invalid so rejected as initial byte.
        { input: [ 0xf8 ], output: [ R ] },
        { input: [ 0xf8, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xf8, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xf8, 0x80 ], output: [ R, R ] },
        { input: [ 0xf8, 0x80, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf8, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf8, 0x8f ], output: [ R, R ] },
        { input: [ 0xf8, 0x8f, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf8, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf8, 0x90 ], output: [ R, R ] },
        { input: [ 0xf8, 0x90, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xf8, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xf8, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
        { input: [ 0xf8, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },

        { input: [ 0xff ], output: [ R ] },
        { input: [ 0xff, 0x41 ], output: [ R, 0x41 ] },
        { input: [ 0xff, 0x7f ], output: [ R, 0x7f ] },
        { input: [ 0xff, 0x80 ], output: [ R, R ] },
        { input: [ 0xff, 0x80, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xff, 0x80, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xff, 0x8f ], output: [ R, R ] },
        { input: [ 0xff, 0x8f, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xff, 0x8f, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xff, 0x90 ], output: [ R, R ] },
        { input: [ 0xff, 0x90, 0x80 ], output: [ R, R, R ] },
        { input: [ 0xff, 0x90, 0x80, 0x80 ], output: [ R, R, R, R ] },
        { input: [ 0xff, 0xbf, 0xbf, 0xbf ], output: [ R, R, R, R ] },
        { input: [ 0xff, 0xc0, 0xbf, 0xbf ], output: [ R, R, R, R ] },
    ];

    // Expand the R / I placeholders of an expectation into concrete bytes.
    function prepOutput(output, input) {
        return output.flatMap((v) => {
            if (v === R) { return [ 0xef, 0xbf, 0xbd ]; }
            if (v === I) { return input; }
            return [ v ];
        });
    }

    // Render a byte sequence for failure messages, e.g. "ef bf bd".
    function toHex(bytes) {
        return Array.from(bytes, (b) => b.toString(16).padStart(2, '0')).join(' ');
    }

    // Compare actual vs. expected bytes, throwing with full context on the
    // first discrepancy.  `suffix` distinguishes the TextDecoder comparison.
    function checkBytes(actual, expected, input, suffix) {
        const detail = () => `; input=[${toHex(input)}] expected=[${toHex(expected)}] actual=[${toHex(actual)}]`;
        if (actual.length !== expected.length) {
            throw new TypeError('wtf8 sanitize self test failed, length mismatch' + suffix + detail());
        }
        for (let i = 0; i < actual.length; i++) {
            if (actual[i] !== expected[i]) {
                throw new TypeError('wtf8 sanitize self test failed, difference at index ' + i + suffix + detail());
            }
        }
    }

    // decode()/encode() without streaming options keep no state between
    // calls, so one instance of each can serve every test case.
    const textEncoder = new TextEncoder();
    const textDecoder = new TextDecoder();

    for (const { input, output, textDecoderOutput } of tests) {
        // allowSymbol: false so that inputs starting with 0x80/0x81/0x82/0xff
        // go through string sanitization instead of the symbol special case.
        const wtf8Res = sanitizeToWtf8(new Uint8Array(input), { allowSymbol: false });
        checkBytes(wtf8Res.outputUint8ArrayWtf8, prepOutput(output, input), input, '');

        const tdActual = textEncoder.encode(textDecoder.decode(new Uint8Array(input)));
        checkBytes(tdActual, prepOutput(textDecoderOutput || output, input), input, ' for TextDecoder output');
    }
}
|
|
|
|
|
|
|
/**
 * Run all WTF-8 sanitizer self tests in order.  Each test throws a TypeError
 * on failure, so the first failing test aborts the run.
 */
function runSelfTests() {
    testPr121();
    testUnicode110000();
    testByteRanges();
}
|
|
|
// Execute the self tests when this file is loaded; a failure throws.
runSelfTests();
|
|
|
|
|
|
|