/* * Tests for global object URI handling functions: * * - encodeURI() * - encodeURIComponent() * - decodeURI() * - decodeURIComponent() */ // indirect eval -> this is bound to the global object, E5 Section 10.4.2, step 1.a. var g = (function () { var e = eval; return e('this'); } )(); /* Pure Ecmascript helper to URI encode a codepoint into URI escaped form. * Allows surrogate pairs to be encoded into invalid UTF-8 on purpose. */ function encCodePoint(x, forced_len) { var len; var initial; var i; var nybbles = "0123456789ABCDEF"; var initial_bytes = [ null, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe ]; var t; var res; // Supports extended UTF-8 up to 36 bits if (x < 0x80) { len = 1; } else if (x < 0x800) { len = 2; } else if (x < 0x10000) { len = 3; } else if (x < 0x200000) { len = 4; } else if (x < 0x4000000) { len = 5; } else if (x < 0x80000000) { len = 6; } else { len = 7; } if (typeof forced_len === 'number') { len = forced_len; } initial = initial_bytes[len]; t = []; for (i = len - 1; i >= 0; i--) { if (i === 0) { t[i] = initial + x; } else { t[i] = (x & 0x3f) + 0x80; } x = x >>> 6; } res = []; for (i = 0; i < len; i++) { res.push('%' + nybbles.charAt((t[i] >>> 4) & 0x0f) + nybbles.charAt(t[i] & 0x0f)); } return res.join(''); } /* Dump a string as decimal codepoints, ensures that tests produce ASCII only * outputs. */ function dumpCodePoints(x) { var i; var res = []; for (i = 0; i < x.length; i++) { res.push(x.charCodeAt(i)); } return res.join(' '); } /*=== basic encode http://www.example.com/%C3%8A%D8%80%E1%88%B4#foo 104 116 116 112 58 47 47 119 119 119 46 101 120 97 109 112 108 101 46 99 111 109 47 202 1536 4660 35 102 111 111 http%3A%2F%2Fwww.example.com%2F%C3%8A%D8%80%E1%88%B4%23foo 104 116 116 112 58 47 47 119 119 119 46 101 120 97 109 112 108 101 46 99 111 109 47 202 1536 4660 35 102 111 111 ===*/ /* A simple URI encoding / decoding test. * * Note: upper case hex escapes are required by the encoding * algorithm in E5.1 Section 15.1.3. */ print('basic encode'); function basicEncodeTest() { var uri = 'http://www.example.com/\u00ca\u0600\u1234#foo'; var t; t = g.encodeURI(uri); print(t); t = g.decodeURI(t); print(dumpCodePoints(t)); t = g.encodeURIComponent(uri); print(t); t = decodeURIComponent(t); print(dumpCodePoints(t)); } try { basicEncodeTest(); } catch (e) { print(e.name); } /*=== encoding of ascii range %00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22#$%25&'()*+,-./0123456789:;%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 %00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22%23%24%25%26'()*%2B%2C-.%2F0123456789%3A%3B%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 ===*/ /* ASCII range test for character encoding */ print('encoding of ascii range'); function asciiEncodeTest() { var i; var txt = []; var t; for (i = 0; i < 128; i++) { txt.push(String.fromCharCode(i)); } txt = txt.join(''); t = g.encodeURI(txt); print(t); t = g.decodeURI(t); print(dumpCodePoints(t)); t = g.encodeURIComponent(txt); print(t); t = decodeURIComponent(t); print(dumpCodePoints(t)); } try { asciiEncodeTest(); } catch (e) { print(e.name); } /*=== decoding of ascii range 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 37 50 51 37 50 52 37 37 50 54 39 40 41 42 37 50 66 37 50 67 45 46 37 50 70 48 49 50 51 52 53 54 55 56 57 37 51 65 37 51 66 60 37 51 68 62 37 51 70 37 52 48 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 ===*/ print('decoding of ascii range'); function asciiDecodeTest() { var i; var txt = []; var res; for (i = 0; i < 128; i++) { txt.push(encCodePoint(i)); } txt = txt.join(''); // since e.g. control characters will be output, dump as codepoints print(dumpCodePoints(g.decodeURI(txt))); print(dumpCodePoints(g.decodeURIComponent(txt))); } try { asciiDecodeTest(); } catch (e) { print(e.name); } /*=== decode non-bmp 55296 56320 55296 56320 55304 57157 55304 57157 55441 56679 55441 56679 55804 56338 55804 56338 56256 56320 56256 56320 56260 56884 56260 56884 56319 57343 56319 57343 ===*/ /* Decode non-BMP characters and check that surrogate pairs are decoded * correctly. In other words, a single UTF-8 encoded codepoint becomes * two Ecmascript codepoints. * * Decoding of non-BMP characters above U+10FFFF is required to result * in URIError, and is tested separately below in invalid UTF-8 tests. */ print('decode non-bmp'); function decodeNonBmpTest() { var inputs = [ encCodePoint(0x10000), encCodePoint(0x12345), encCodePoint(0x34567), encCodePoint(0x8f012), encCodePoint(0x100000), encCodePoint(0x101234), encCodePoint(0x10ffff) ]; var i; for (i = 0; i < inputs.length; i++) { print(dumpCodePoints(g.decodeURI(inputs[i]))); print(dumpCodePoints(g.decodeURIComponent(inputs[i]))); } } try { decodeNonBmpTest(); } catch (e) { print(e.name); } /*=== combine surrogate pairs in encode %F0%90%80%80 %F0%90%8F%BF %F0%90%90%80 %F4%8F%B0%80 %F4%8F%BF%BF ===*/ /* When encoding, surrogate pairs found in Ecmascript strings must be combined, * and encoded into UTF-8 (as a single codepoint). */ print('combine surrogate pairs in encode'); try { print(g.encodeURI('\ud800\udc00')); print(g.encodeURI('\ud800\udfff')); print(g.encodeURI('\ud801\udc00')); print(g.encodeURI('\udbff\udc00')); print(g.encodeURI('\udbff\udfff')); } catch (e) { print(e.name); } /*=== attempt to encode invalid surrogate pairs %ED%9F%BF URIError URIError URIError URIError %EE%80%80 URIError URIError URIError %F0%90%80%80 %F0%90%8F%BF URIError URIError URIError URIError URIError URIError URIError ===*/ /* An attempt to encode an invalid surrogate pair is a URIError. */ print('attempt to encode invalid surrogate pairs'); function attemptInvalidSurrogateEncode(x) { try { print(g.encodeURI(x)); } catch (e) { print(e.name); } } try { attemptInvalidSurrogateEncode('\ud7ff'); // ok attemptInvalidSurrogateEncode('\ud800'); attemptInvalidSurrogateEncode('\udbff'); attemptInvalidSurrogateEncode('\udc00'); attemptInvalidSurrogateEncode('\udfff'); attemptInvalidSurrogateEncode('\ue000'); // ok attemptInvalidSurrogateEncode('\ud800\ud7ff'); attemptInvalidSurrogateEncode('\ud800\ud800'); attemptInvalidSurrogateEncode('\ud800\udbff'); attemptInvalidSurrogateEncode('\ud800\udc00'); // ok attemptInvalidSurrogateEncode('\ud800\udfff'); // ok attemptInvalidSurrogateEncode('\ud800\ue000'); attemptInvalidSurrogateEncode('\udc00\ud7ff'); attemptInvalidSurrogateEncode('\udc00\ud800'); attemptInvalidSurrogateEncode('\udc00\udbff'); attemptInvalidSurrogateEncode('\udc00\udc00'); attemptInvalidSurrogateEncode('\udc00\udfff'); attemptInvalidSurrogateEncode('\udc00\ue000'); } catch (e) { print(e.name); } /*=== invalid utf-8 decode %C0%80 %E0%80%A9 %F0%80%80%A9 %F8%80%80%80%A9 %FC%80%80%80%80%A9 %FE%80%80%80%80%80%A9 56319 57343 56319 57343 ===*/ /* Decode only allows valid UTF-8 encodings up to 4 bytes. This technically * allows codepoints up to U+1FFFFF, but UTF-8 further restricts the range to * U+10FFFF. Codepoints above U+10FFFF would not fit into surrogate pairs * anyway. * * Surrogate pair codepoints (U+D800...U+DFFF) encoded into UTF-8 naively * are not allowed by UTF-8. Non-shortest encodings are not allowed by * UTF-8 either. * * Things to test: * * - Surrogate pairs naively encoded into UTF-8 (= CESU-8) cause an URIError * * - Non-shortest UTF-8 encodings, e.g. URIError is required for C0 80. * * - U+10FFFF decodes correctly to an Ecmascript string with a surrogate pair * * - U+110000 causes an URIError * * - Codepoints with >4 byte encoding cause an URIError. * * From RFC 3629: * * Implementations of the decoding algorithm above MUST protect against * decoding invalid sequences. For instance, a naive implementation may * decode the overlong UTF-8 sequence C0 80 into the character U+0000, * or the surrogate pair ED A1 8C ED BE B4 into U+233B4. Decoding * invalid sequences may have security consequences or cause other * problems. */ print('invalid utf-8 decode'); var _invalidInputErrorReported = false; function testInvalidUtf8Input(x) { var ok1, ok2; try { g.decodeURI(x); ok1 = true; } catch (e) { if (e.name === 'URIError') { ok1 = false; } else { throw e; } } try { g.decodeURIComponent(x); ok2 = true; } catch (e) { if (e.name === 'URIError') { ok2 = false; } else { throw e; } } if (!ok1 && !ok2) { // silent } else { // one or both inputs did NOT produce a URIError; only report first // error because Rhino causes a flood otherwise if (!_invalidInputErrorReported) { _invalidInputErrorReported = true; print('first error', x, ok1, ok2); } } } function testValidUtf8Input(x) { var t; try { t = g.decodeURI(x); print(dumpCodePoints(t)); } catch (e) { print(e.name, '(unexpected)'); } try { t = g.decodeURIComponent(x); print(dumpCodePoints(t)); } catch (e) { print(e.name, '(unexpected)'); } } function invalidUtf8Test() { var i; var t; // surrogate pairs for (i = 0xd800; i < 0xe000; i++) { testInvalidUtf8Input(encCodePoint(i)); } // even valid surrogate pairs (or any surrogate character pairs) for (i = 0xd800; i < 0xe000; i++) { // we're just spot checking the second codepoint to keep the runtime reasonable for (j = 0xd800; j < 0xe000; j += 127) { testInvalidUtf8Input(encCodePoint(i) + encCodePoint(j)); } } // non-shortest encodings (C0 80 mentioned explicitly in spec); // above 4 bytes rejected because not valid UTF-8 (in addition // to not being shortest) t = encCodePoint(0, 2); print(t); testInvalidUtf8Input(t); t = encCodePoint(41, 3); print(t); testInvalidUtf8Input(t); t = encCodePoint(41, 4); print(t); testInvalidUtf8Input(t); t = encCodePoint(41, 5); print(t); testInvalidUtf8Input(t); t = encCodePoint(41, 6); print(t); testInvalidUtf8Input(t); t = encCodePoint(41, 7); print(t); testInvalidUtf8Input(t); // U+10FFFF decodes correctly testValidUtf8Input('%F4%8F%BF%BF'); // codepoints above utf-8 range testInvalidUtf8Input(encCodePoint(0x110000)); testInvalidUtf8Input(encCodePoint(0x200123)); testInvalidUtf8Input(encCodePoint(0x4001234)); testInvalidUtf8Input(encCodePoint(0x80012345)); testInvalidUtf8Input(encCodePoint(0xfedcba98)); // invalid surrogate pair EDA18C EDBEB4 (from RFC 3629) // EDA18C -> U+D84C (invalid UTF-8) // EDBEB4 -> U+DFB4 (invalid UTF-8) testInvalidUtf8Input('%ED%A1%8C%ED%BE%B4'); // invalid UTF-8 bytes; C2 must be followed by a byte >= 0x80 testInvalidUtf8Input('%C2%01'); } try { invalidUtf8Test(); } catch (e) { print(e.name); } /*=== broken escapes URIError URIError URIError URIError 65 65 URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError URIError 56319 57343 56319 57343 ===*/ /* Test truncated hex encoding (e.g. '%' or '%1') and truncated UTF-8 encoding * (e.g. '%C2' but no followup byte). */ print('broken escapes'); function testBrokenEscapes() { var inputs = [ '%', '%4', '%41', // partial encodings of U+10FFFF '%', '%F', '%F4', '%F4%', '%F4%8', '%F4%8F', '%F4%8F%', '%F4%8F%B', '%F4%8F%BF', '%F4%8F%BF%', '%F4%8F%BF%B', '%F4%8F%BF%BF', ]; var i; for (i = 0; i < inputs.length; i++) { try { print(dumpCodePoints(g.decodeURI(inputs[i]))); } catch (e) { print(e.name); } try { print(dumpCodePoints(g.decodeURIComponent(inputs[i]))); } catch (e) { print(e.name); } } } try { testBrokenEscapes(); } catch (e) { print(e.name); }