mirror of https://github.com/svaarala/duktape.git
Sami Vaarala
5 years ago
1 changed files with 155 additions and 0 deletions
@ -0,0 +1,155 @@ |
|||
/* |
|||
* Select a set of Unicode characters (based on included/excluded categories) |
|||
* and write out a compact bitstream for matching a character against |
|||
* the set at runtime. This is for the slow path, where we're especially |
|||
* concerned with compactness. |
|||
* |
|||
* Unicode categories (such as 'Z') and codepoints (such as 0x200d) can be |
|||
* used. Two pseudo-categories are also available for exclusion only: 'ASCII' |
|||
* and 'NONBMP'. 'ASCII' category excludes ASCII codepoints which is useful |
|||
* because C code typically contains an ASCII fast path so ASCII characters |
|||
* don't need to be considered in the Unicode tables. 'NONBMP' excludes |
|||
* codepoints above U+FFFF which is useful because such codepoints don't need |
|||
* to be supported in standard ECMAScript. |
|||
*/ |
|||
|
|||
'use strict'; |
|||
|
|||
const { createBareObject } = require('../util/bare'); |
|||
const { BitEncoder } = require('../util/bitencoder'); |
|||
const { assert } = require('../util/assert'); |
|||
|
|||
// Filter cpMap by first including all categories listed in 'includeList' and
|
|||
// then excluding anything in 'excludeList'.
|
|||
function filterCpMap(cpMap, includeList, excludeList) { |
|||
var filterAscii = excludeList.indexOf('ASCII') >= 0; |
|||
var filterNonBmp = excludeList.indexOf('NONBMP') >= 0; |
|||
var includeCatMap = createBareObject({}); |
|||
var excludeCatMap = createBareObject({}); |
|||
var includeCpMap = createBareObject({}); |
|||
var excludeCpMap = createBareObject({}); |
|||
var filteredCpMap; |
|||
|
|||
// Helper lookups to speed up processing.
|
|||
includeList.forEach((cat) => { |
|||
if (typeof cat === 'number') { |
|||
includeCpMap[cat] = true; |
|||
} else if (typeof cat === 'string') { |
|||
includeCatMap[cat] = true; |
|||
} else { |
|||
throw new TypeError('invalid includeList entry: ' + cat); |
|||
} |
|||
}); |
|||
excludeList.forEach((cat) => { |
|||
if (typeof cat === 'number') { |
|||
excludeCpMap[cat] = true; |
|||
} else if (typeof cat === 'string') { |
|||
excludeCatMap[cat] = true; // includes ASCII and NONBMP, does not matter
|
|||
} else { |
|||
throw new TypeError('invalid excludeList entry: ' + cat); |
|||
} |
|||
}); |
|||
|
|||
// Filter codepoint map according to our criteria.
|
|||
filteredCpMap = cpMap.filter((ent) => { |
|||
if (!ent) { |
|||
return false; |
|||
} |
|||
let cp = ent.cp; |
|||
let gc = ent.gc; |
|||
if ((filterAscii && cp <= 0x7f) || (filterNonBmp && cp >= 0x10000)) { |
|||
return false; |
|||
} |
|||
if ((includeCatMap[gc] || includeCpMap[cp]) && |
|||
!(excludeCatMap[gc] || excludeCpMap[cp])) { |
|||
// Included in one or more categories/codepoints, and not
|
|||
// excluded by any categories/codepoints.
|
|||
return true; |
|||
} |
|||
return false; |
|||
}); |
|||
|
|||
return filteredCpMap; |
|||
} |
|||
exports.filterCpMap = filterCpMap; |
|||
|
|||
// Pack match ranges into a varint encoding. For previous unused encoding
|
|||
// variants, see old Python tooling.
|
|||
function generateMatchTable3(ranges) { |
|||
var be = new BitEncoder(); |
|||
var freq = []; // informative
|
|||
while (freq.length < 0x110000) { |
|||
freq.push(0); |
|||
} |
|||
|
|||
function encCustom(x) { |
|||
freq[x]++; |
|||
|
|||
if (x <= 0x0e) { |
|||
// 4-bit encoding
|
|||
be.bits(x, 4); |
|||
return; |
|||
} |
|||
x -= 0x0e + 1; |
|||
|
|||
if (x <= 0xfd) { |
|||
// 12-bit encoding
|
|||
be.bits(0x0f, 4); |
|||
be.bits(x, 8); |
|||
return; |
|||
} |
|||
x -= 0xfd + 1; |
|||
|
|||
if (x <= 0xfff) { |
|||
// 24-bit encoding
|
|||
be.bits(0x0f, 4); |
|||
be.bits(0xfe, 8); |
|||
be.bits(x, 12); |
|||
return; |
|||
} |
|||
x -= 0xfff + 1; |
|||
|
|||
// 36-bit encoding
|
|||
be.bits(0x0f, 4); |
|||
be.bits(0xff, 8); |
|||
be.bits(x, 24); |
|||
} |
|||
|
|||
function encVaruint(x) { |
|||
be.varuint(x); |
|||
} |
|||
void encVaruint; |
|||
|
|||
var enc = encCustom; |
|||
|
|||
var prevRangeEnd = 0; |
|||
for (let i = 0; i < ranges.length; i++) { |
|||
let rangeStart = ranges[i][0]; |
|||
let rangeEnd = ranges[i][1]; |
|||
let r1 = rangeStart - prevRangeEnd; // 1 or above (no unjoined ranges)
|
|||
assert(r1 >= 1); |
|||
let r2 = rangeEnd - rangeStart; // 0 or above
|
|||
assert(r2 >= 0); |
|||
|
|||
// r1 is >= 1, so r1 == 0 is used as an end marker. Encoding an
|
|||
// explicit count and (r1 - 1) here improves total output size
|
|||
// by about 30 bytes.
|
|||
//
|
|||
// Encoding using BitEncoder varuint is more efficient and shares
|
|||
// code so maybe switch to that.
|
|||
|
|||
enc(r1); |
|||
enc(r2); |
|||
prevRangeEnd = rangeEnd; |
|||
} |
|||
|
|||
// End marker (r1 can never be 0).
|
|||
enc(0); |
|||
|
|||
return { |
|||
data: be.getBytes(), |
|||
nbits: be.getNumBits(), |
|||
freq: freq |
|||
}; |
|||
} |
|||
exports.generateMatchTable3 = generateMatchTable3; |
Loading…
Reference in new issue