You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

555 lines
14 KiB

/*
* Tests for global object URI handling functions:
*
* - encodeURI()
* - encodeURIComponent()
* - decodeURI()
* - decodeURIComponent()
*/
// Reference to the global object. Calling eval through a comma expression
// makes the call an indirect eval, and an indirect eval binds 'this' to the
// global object (E5 Section 10.4.2, step 1.a).
var g = (0, eval)('this');
/* Pure Ecmascript helper to URI encode a codepoint into URI escaped form.
 * Allows surrogate pairs to be encoded into invalid UTF-8 on purpose.
 *
 * x:          codepoint to encode, a non-negative integer of up to 36 bits
 * forced_len: optional byte count; when a number is given it replaces the
 *             shortest length computed from x (the tests use this to build
 *             non-shortest encodings)
 *
 * Returns the encoded bytes as a string of '%XX' escapes (upper case hex).
 */
function encCodePoint(x, forced_len) {
    var nybbles = "0123456789ABCDEF";
    // Marker bits of the lead byte, indexed by encoded length (index 0 unused).
    var initial_bytes = [ null, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe ];
    var len;
    var i;
    var t = [];
    var res = [];

    // Supports extended UTF-8 up to 36 bits
    if (x < 0x80) { len = 1; }
    else if (x < 0x800) { len = 2; }
    else if (x < 0x10000) { len = 3; }
    else if (x < 0x200000) { len = 4; }
    else if (x < 0x4000000) { len = 5; }
    else if (x < 0x80000000) { len = 6; }
    else { len = 7; }

    if (typeof forced_len === 'number') {
        len = forced_len;
    }

    // Continuation bytes are filled from last to first, 6 bits each. Plain
    // arithmetic is used instead of '&' and '>>>' because the bitwise
    // operators coerce their operand to 32 bits, which would silently drop
    // the top bits of 33-36 bit codepoints.
    for (i = len - 1; i > 0; i--) {
        t[i] = 0x80 + (x % 0x40);
        x = Math.floor(x / 0x40);
    }
    // Whatever is left of x goes into the lead byte next to the marker bits.
    t[0] = initial_bytes[len] + x;

    for (i = 0; i < len; i++) {
        res.push('%' + nybbles.charAt((t[i] >>> 4) & 0x0f) +
                 nybbles.charAt(t[i] & 0x0f));
    }
    return res.join('');
}
/* Render a string as its charCodeAt() values in decimal, separated by single
 * spaces, so that test output is ASCII only whatever the string contains.
 */
function dumpCodePoints(x) {
    var count = x.length;
    var codes = [];
    var idx = 0;

    while (idx < count) {
        codes[idx] = x.charCodeAt(idx);
        idx++;
    }
    return codes.join(' ');
}
/*===
basic encode
http://www.example.com/%C3%8A%D8%80%E1%88%B4#foo
104 116 116 112 58 47 47 119 119 119 46 101 120 97 109 112 108 101 46 99 111 109 47 202 1536 4660 35 102 111 111
http%3A%2F%2Fwww.example.com%2F%C3%8A%D8%80%E1%88%B4%23foo
104 116 116 112 58 47 47 119 119 119 46 101 120 97 109 112 108 101 46 99 111 109 47 202 1536 4660 35 102 111 111
===*/
/* A simple URI encoding / decoding test.
*
* Note: upper case hex escapes are required by the encoding
* algorithm in E5.1 Section 15.1.3.
*/
// Section banner; the expected output block above starts with this same line.
print('basic encode');
/* Round-trip one URI containing non-ASCII characters through
 * encodeURI()/decodeURI() and encodeURIComponent()/decodeURIComponent().
 * Each encoded form is printed as-is; each decoded result is printed as
 * decimal codepoints.
 */
function basicEncodeTest() {
    var uri = 'http://www.example.com/\u00ca\u0600\u1234#foo';
    var t;

    t = g.encodeURI(uri);
    print(t);
    t = g.decodeURI(t);
    print(dumpCodePoints(t));

    t = g.encodeURIComponent(uri);
    print(t);
    // Looked up through 'g' like the three calls above, so that all four
    // builtins are consistently taken from the global object.
    t = g.decodeURIComponent(t);
    print(dumpCodePoints(t));
}
// Run the section; anything the test lets escape is reported by error name.
try {
    basicEncodeTest();
} catch (err) {
    print(err.name);
}
/*===
encoding of ascii range
%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22#$%25&'()*+,-./0123456789:;%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22%23%24%25%26'()*%2B%2C-.%2F0123456789%3A%3B%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
===*/
/* ASCII range test for character encoding */
// Section banner; the expected output block above starts with this same line.
print('encoding of ascii range');
/* Encode a string holding every character code 0..127 with encodeURI() and
 * encodeURIComponent(), print each encoded form, then decode it again with
 * the matching decoder and print the result as decimal codepoints.
 */
function asciiEncodeTest() {
    var i;
    var txt = [];
    var t;

    for (i = 0; i < 128; i++) {
        txt.push(String.fromCharCode(i));
    }
    txt = txt.join('');

    t = g.encodeURI(txt);
    print(t);
    t = g.decodeURI(t);
    print(dumpCodePoints(t));

    t = g.encodeURIComponent(txt);
    print(t);
    // Looked up through 'g' like the three calls above, so that all four
    // builtins are consistently taken from the global object.
    t = g.decodeURIComponent(t);
    print(dumpCodePoints(t));
}
// Run the section; anything the test lets escape is reported by error name.
try {
    asciiEncodeTest();
} catch (err) {
    print(err.name);
}
/*===
decoding of ascii range
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 37 50 51 37 50 52 37 37 50 54 39 40 41 42 37 50 66 37 50 67 45 46 37 50 70 48 49 50 51 52 53 54 55 56 57 37 51 65 37 51 66 60 37 51 68 62 37 51 70 37 52 48 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
===*/
// Section banner; the expected output block above starts with this same line.
print('decoding of ascii range');
/* Decode the '%XX' escapes of codepoints 0..127, concatenated into a single
 * string, with both decodeURI() and decodeURIComponent().
 */
function asciiDecodeTest() {
    var escaped = '';
    var cp;

    for (cp = 0; cp < 128; cp++) {
        escaped += encCodePoint(cp);
    }

    // The decoded text contains e.g. control characters, so it is printed
    // as codepoints rather than as-is.
    print(dumpCodePoints(g.decodeURI(escaped)));
    print(dumpCodePoints(g.decodeURIComponent(escaped)));
}
// Run the section; anything the test lets escape is reported by error name.
try {
    asciiDecodeTest();
} catch (err) {
    print(err.name);
}
/*===
decode non-bmp
55296 56320
55296 56320
55304 57157
55304 57157
55441 56679
55441 56679
55804 56338
55804 56338
56256 56320
56256 56320
56260 56884
56260 56884
56319 57343
56319 57343
===*/
/* Decode non-BMP characters and check that surrogate pairs are decoded
* correctly. In other words, a single UTF-8 encoded codepoint becomes
* two Ecmascript codepoints.
*
* Decoding of non-BMP characters above U+10FFFF is required to result
* in URIError, and is tested separately below in invalid UTF-8 tests.
*/
// Section banner; the expected output block above starts with this same line.
print('decode non-bmp');
/* Decode the escaped form of several codepoints in the range
 * U+10000...U+10FFFF with decodeURI() and decodeURIComponent(), printing
 * each result as decimal codepoints.
 */
function decodeNonBmpTest() {
    var codepoints = [
        0x10000, 0x12345, 0x34567, 0x8f012, 0x100000, 0x101234, 0x10ffff
    ];
    var inputs = [];
    var idx;

    // Encode everything first, then decode, so no output is produced before
    // all inputs have been built.
    for (idx = 0; idx < codepoints.length; idx++) {
        inputs[idx] = encCodePoint(codepoints[idx]);
    }

    for (idx = 0; idx < inputs.length; idx++) {
        print(dumpCodePoints(g.decodeURI(inputs[idx])));
        print(dumpCodePoints(g.decodeURIComponent(inputs[idx])));
    }
}
// Run the section; anything the test lets escape is reported by error name.
try {
    decodeNonBmpTest();
} catch (err) {
    print(err.name);
}
/*===
combine surrogate pairs in encode
%F0%90%80%80
%F0%90%8F%BF
%F0%90%90%80
%F4%8F%B0%80
%F4%8F%BF%BF
===*/
/* When encoding, surrogate pairs found in Ecmascript strings must be combined,
* and encoded into UTF-8 (as a single codepoint).
*/
print('combine surrogate pairs in encode');
// Each input is one high+low surrogate pair. The first exception ends the
// run and only its name is printed.
try {
    [
        '\ud800\udc00',
        '\ud800\udfff',
        '\ud801\udc00',
        '\udbff\udc00',
        '\udbff\udfff'
    ].forEach(function (pair) {
        print(g.encodeURI(pair));
    });
} catch (err) {
    print(err.name);
}
/*===
attempt to encode invalid surrogate pairs
%ED%9F%BF
URIError
URIError
URIError
URIError
%EE%80%80
URIError
URIError
URIError
%F0%90%80%80
%F0%90%8F%BF
URIError
URIError
URIError
URIError
URIError
URIError
URIError
===*/
/* An attempt to encode an invalid surrogate pair is a URIError. */
// Section banner; the expected output block above starts with this same line.
print('attempt to encode invalid surrogate pairs');
/* Print encodeURI(x), or the name of the error thrown while producing and
 * printing it.
 */
function attemptInvalidSurrogateEncode(x) {
    var encoded;

    try {
        encoded = g.encodeURI(x);
        print(encoded);
    } catch (err) {
        print(err.name);
    }
}
// Inputs marked 'ok' are expected to encode; per the expected output above,
// every other input produces a URIError.
try {
    [
        '\ud7ff',          // ok
        '\ud800',
        '\udbff',
        '\udc00',
        '\udfff',
        '\ue000',          // ok
        '\ud800\ud7ff',
        '\ud800\ud800',
        '\ud800\udbff',
        '\ud800\udc00',    // ok
        '\ud800\udfff',    // ok
        '\ud800\ue000',
        '\udc00\ud7ff',
        '\udc00\ud800',
        '\udc00\udbff',
        '\udc00\udc00',
        '\udc00\udfff',
        '\udc00\ue000'
    ].forEach(function (input) {
        attemptInvalidSurrogateEncode(input);
    });
} catch (err) {
    print(err.name);
}
/*===
invalid utf-8 decode
%C0%80
%E0%80%A9
%F0%80%80%A9
%F8%80%80%80%A9
%FC%80%80%80%80%A9
%FE%80%80%80%80%80%A9
56319 57343
56319 57343
===*/
/* Decode only allows valid UTF-8 encodings up to 4 bytes. This technically
* allows codepoints up to U+1FFFFF, but UTF-8 further restricts the range to
* U+10FFFF. Codepoints above U+10FFFF would not fit into surrogate pairs
* anyway.
*
* Surrogate pair codepoints (U+D800...U+DFFF) encoded into UTF-8 naively
* are not allowed by UTF-8. Non-shortest encodings are not allowed by
* UTF-8 either.
*
* Things to test:
*
* - Surrogate pairs naively encoded into UTF-8 (= CESU-8) cause a URIError
*
* - Non-shortest UTF-8 encodings, e.g. URIError is required for C0 80.
*
* - U+10FFFF decodes correctly to an Ecmascript string with a surrogate pair
*
* - U+110000 causes a URIError
*
* - Codepoints with >4 byte encoding cause a URIError.
*
* From RFC 3629:
*
* Implementations of the decoding algorithm above MUST protect against
* decoding invalid sequences. For instance, a naive implementation may
* decode the overlong UTF-8 sequence C0 80 into the character U+0000,
* or the surrogate pair ED A1 8C ED BE B4 into U+233B4. Decoding
* invalid sequences may have security consequences or cause other
* problems.
*/
// Section banner; the expected output block above starts with this same line.
print('invalid utf-8 decode');
// Set to true by testInvalidUtf8Input() once it has printed its first
// 'first error' line; later failures are then not printed.
var _invalidInputErrorReported = false;
/* Check that both decodeURI(x) and decodeURIComponent(x) throw a URIError.
 *
 * Nothing is printed when both do. Otherwise a 'first error' line with the
 * input and the two outcomes is printed, but only for the first failing
 * input. Any error other than URIError is rethrown to the caller.
 */
function testInvalidUtf8Input(x) {
    // true if g[fname](x) returns normally, false if it throws a URIError
    function decodes(fname) {
        try {
            g[fname](x);
        } catch (e) {
            if (e.name !== 'URIError') {
                throw e;
            }
            return false;
        }
        return true;
    }

    var ok1 = decodes('decodeURI');
    var ok2 = decodes('decodeURIComponent');

    if (ok1 || ok2) {
        // one or both inputs did NOT produce a URIError; only report first
        // error because Rhino causes a flood otherwise
        if (!_invalidInputErrorReported) {
            _invalidInputErrorReported = true;
            print('first error', x, ok1, ok2);
        }
    }
}
/* Decode x with decodeURI() and then decodeURIComponent(), printing each
 * result as decimal codepoints. If a decoder throws, the error name is
 * printed followed by '(unexpected)'.
 */
function testValidUtf8Input(x) {
    var decoders = [ 'decodeURI', 'decodeURIComponent' ];
    var k;

    for (k = 0; k < decoders.length; k++) {
        try {
            print(dumpCodePoints(g[decoders[k]](x)));
        } catch (err) {
            print(err.name, '(unexpected)');
        }
    }
}
/* Run the invalid UTF-8 decode checks: escaped surrogate codepoints (alone
 * and in pairs), non-shortest encodings, codepoints above U+10FFFF, the
 * RFC 3629 surrogate pair example and a lead byte without a continuation
 * byte. U+10FFFF is also checked as the one input that must decode.
 *
 * The non-shortest encodings are printed before being tested; the invalid
 * inputs are otherwise handed to testInvalidUtf8Input().
 */
function invalidUtf8Test() {
    var i;
    // Declared locally: assigning to an undeclared 'j' creates an implicit
    // global, and throws a ReferenceError in strict mode code.
    var j;
    var len;
    var t;

    // individual surrogate codepoints
    for (i = 0xd800; i < 0xe000; i++) {
        testInvalidUtf8Input(encCodePoint(i));
    }

    // even valid surrogate pairs (or any surrogate character pairs)
    for (i = 0xd800; i < 0xe000; i++) {
        // we're just spot checking the second codepoint to keep the runtime reasonable
        for (j = 0xd800; j < 0xe000; j += 127) {
            testInvalidUtf8Input(encCodePoint(i) + encCodePoint(j));
        }
    }

    // non-shortest encodings (C0 80 mentioned explicitly in spec);
    // above 4 bytes rejected because not valid UTF-8 (in addition
    // to not being shortest)
    t = encCodePoint(0, 2); print(t);
    testInvalidUtf8Input(t);
    for (len = 3; len <= 7; len++) {
        t = encCodePoint(41, len); print(t);
        testInvalidUtf8Input(t);
    }

    // U+10FFFF decodes correctly
    testValidUtf8Input('%F4%8F%BF%BF');

    // codepoints above utf-8 range
    testInvalidUtf8Input(encCodePoint(0x110000));
    testInvalidUtf8Input(encCodePoint(0x200123));
    testInvalidUtf8Input(encCodePoint(0x4001234));
    testInvalidUtf8Input(encCodePoint(0x80012345));
    testInvalidUtf8Input(encCodePoint(0xfedcba98));

    // invalid surrogate pair EDA18C EDBEB4 (from RFC 3629)
    // EDA18C -> U+D84C (invalid UTF-8)
    // EDBEB4 -> U+DFB4 (invalid UTF-8)
    testInvalidUtf8Input('%ED%A1%8C%ED%BE%B4');

    // invalid UTF-8 bytes; C2 must be followed by a continuation byte
    // (0x80...0xBF)
    testInvalidUtf8Input('%C2%01');
}
// Run the section; anything the test lets escape is reported by error name.
try {
    invalidUtf8Test();
} catch (err) {
    print(err.name);
}
/*===
broken escapes
URIError
URIError
URIError
URIError
65
65
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
URIError
56319 57343
56319 57343
===*/
/* Test truncated hex encoding (e.g. '%' or '%1') and truncated UTF-8 encoding
* (e.g. '%C2' but no followup byte).
*/
// Section banner; the expected output block above starts with this same line.
print('broken escapes');
/* Decode a list of truncated escapes with decodeURI() and then
 * decodeURIComponent(). For each call either the decoded result (as decimal
 * codepoints) or the name of the thrown error is printed.
 */
function testBrokenEscapes() {
    var inputs = [
        '%',
        '%4',
        '%41',

        // partial encodings of U+10FFFF
        '%',
        '%F',
        '%F4',
        '%F4%',
        '%F4%8',
        '%F4%8F',
        '%F4%8F%',
        '%F4%8F%B',
        '%F4%8F%BF',
        '%F4%8F%BF%',
        '%F4%8F%BF%B',
        '%F4%8F%BF%BF'
    ];
    var decoders = [ 'decodeURI', 'decodeURIComponent' ];
    var i;
    var k;

    for (i = 0; i < inputs.length; i++) {
        for (k = 0; k < decoders.length; k++) {
            try {
                print(dumpCodePoints(g[decoders[k]](inputs[i])));
            } catch (err) {
                print(err.name);
            }
        }
    }
}
// Run the section; anything the test lets escape is reported by error name.
try {
    testBrokenEscapes();
} catch (err) {
    print(err.name);
}