mirror of https://github.com/svaarala/duktape.git
Sami Vaarala
2 years ago
19 changed files with 19 additions and 4935 deletions
@@ -1,271 +0,0 @@
#!/usr/bin/env python2
#
# Combine a set of source files into a single C file.
#
# Overview of the process:
#
# * Parse user supplied C files. Add automatic #undefs at the end
#   of each C file to avoid defines bleeding from one file to another.
#
# * Combine the C files in the specified order. If sources have ordering
#   dependencies (depends on the application), order may matter.
#
# * Process #include statements in the combined source, categorizing
#   them either as "internal" (found in the specified include path) or
#   "external". Internal includes, unless explicitly excluded, are
#   inlined into the result while external includes are left as is.
#   Duplicate internal #include statements are replaced with a comment.
#
# At every step, source and header lines are represented with explicit
# line objects which keep track of the original filename and line. The
# output contains #line directives, if requested, to ensure error
# throwing and other diagnostic info will work in a useful manner when
# deployed. It's also possible to generate a combined source with no
# #line directives.
#
# Making the process deterministic is important, so that if users have
# diffs that they apply to the combined source, such diffs would apply
# for as long as possible.
#
# Limitations and notes:
#
# * While there are automatic #undef's for #define's introduced in each
#   C file, it's not possible to "undefine" structs, unions, etc. If
#   there are structs/unions/typedefs with conflicting names, these
#   have to be resolved in the source files first.
#
# * Because duplicate #include statements are suppressed, the process
#   currently assumes that #include statements are not conditional.
#
# * A system header might be #include'd in multiple source files with
#   different feature defines (like _BSD_SOURCE). Because the #include
#   file will only appear once in the resulting source, the first
#   occurrence wins. The result may not work correctly if the feature
#   defines must actually be different between two or more source files.
#
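# A hypothetical invocation, shown for illustration only (the option names
# are the ones defined in main() below; the file and directory names are
# made up):
#
#   $ python combine_src.py \
#       --include-path src --prologue prologue.txt \
#       --output-source combined.c --output-metadata combined_metadata.json \
#       --line-directives \
#       src/duk_foo.c src/duk_bar.c
#
# With --line-directives the combined output contains markers such as
# '#line 1 "duk_foo.c"' before inlined content, so compiler diagnostics
# point back to the original files.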
|||
import logging |
|||
import sys |
|||
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='%(name)-21s %(levelname)-7s %(message)s') |
|||
logger = logging.getLogger('combine_src.py') |
|||
logger.setLevel(logging.INFO) |
|||
|
|||
import os |
|||
import re |
|||
import json |
|||
import optparse |
|||
import logging |
|||
|
|||
# Include path for finding include files which are amalgamated. |
|||
include_paths = [] |
|||
|
|||
# Include files specifically excluded from being inlined. |
|||
include_excluded = [] |
|||
|
|||
class File: |
|||
filename_full = None |
|||
filename = None |
|||
lines = None |
|||
|
|||
def __init__(self, filename, lines): |
|||
self.filename = os.path.basename(filename) |
|||
self.filename_full = filename |
|||
self.lines = lines |
|||
|
|||
class Line: |
|||
filename_full = None |
|||
filename = None |
|||
lineno = None |
|||
data = None |
|||
|
|||
def __init__(self, filename, lineno, data): |
|||
self.filename = os.path.basename(filename) |
|||
self.filename_full = filename |
|||
self.lineno = lineno |
|||
self.data = data |
|||
|
|||
def readFile(filename): |
|||
lines = [] |
|||
|
|||
with open(filename, 'rb') as f: |
|||
lineno = 0 |
|||
for line in f: |
|||
lineno += 1 |
|||
if len(line) > 0 and line[-1] == '\n': |
|||
line = line[:-1] |
|||
lines.append(Line(filename, lineno, line)) |
|||
|
|||
return File(filename, lines) |
|||
|
|||
def lookupInclude(incfn): |
|||
re_sep = re.compile(r'/|\\') |
|||
|
|||
inccomp = re.split(re_sep, incfn) # split include path, support / and \ |
|||
|
|||
for path in include_paths: |
|||
fn = apply(os.path.join, [ path ] + inccomp) |
|||
if os.path.exists(fn): |
|||
return fn # Return full path to first match |
|||
|
|||
return None |
|||
|
|||
def addAutomaticUndefs(f): |
|||
defined = {} |
|||
|
|||
re_def = re.compile(r'#define\s+(\w+).*$') |
|||
re_undef = re.compile(r'#undef\s+(\w+).*$') |
|||
|
|||
for line in f.lines: |
|||
m = re_def.match(line.data) |
|||
if m is not None: |
|||
#logger.debug('DEFINED: %s' % repr(m.group(1))) |
|||
defined[m.group(1)] = True |
|||
m = re_undef.match(line.data) |
|||
if m is not None: |
|||
# Could just ignore #undef's here: we'd then emit |
|||
# reliable #undef's (though maybe duplicates) at |
|||
# the end. |
|||
#logger.debug('UNDEFINED: %s' % repr(m.group(1))) |
|||
if defined.has_key(m.group(1)): |
|||
del defined[m.group(1)] |
|||
|
|||
# Undefine anything that seems to be left defined. This is not a 100%
# reliable process because some #undef's might be conditional, which we
# don't track at the moment. Note that it's safe to #undef something
# that's not defined.
|||
|
|||
keys = sorted(defined.keys()) # deterministic order |
|||
if len(keys) > 0: |
|||
#logger.debug('STILL DEFINED: %r' % repr(defined.keys())) |
|||
f.lines.append(Line(f.filename, len(f.lines) + 1, '')) |
|||
f.lines.append(Line(f.filename, len(f.lines) + 1, '/* automatic undefs */')) |
|||
for k in keys: |
|||
logger.debug('automatic #undef for ' + k) |
|||
f.lines.append(Line(f.filename, len(f.lines) + 1, '#undef %s' % k)) |
|||
|
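# For illustration (hypothetical file and macro names): if foo.c contains
# '#define FOO_LOCAL_BUFSIZE 256' and never #undefs it, addAutomaticUndefs()
# appends the following lines to that file so the macro cannot bleed into
# the next combined file:
#
#   /* automatic undefs */
#   #undef FOO_LOCAL_BUFSIZE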
|||
def createCombined(files, prologue_filename, line_directives): |
|||
res = [] |
|||
line_map = [] # indicate combined source lines where uncombined file/line would change |
|||
metadata = { |
|||
'line_map': line_map |
|||
} |
|||
|
|||
emit_state = [ None, None ] # curr_filename, curr_lineno |
|||
|
|||
def emit(line): |
|||
if isinstance(line, (str, unicode)): |
|||
res.append(line) |
|||
emit_state[1] += 1 |
|||
else: |
|||
if line.filename != emit_state[0] or line.lineno != emit_state[1]: |
|||
if line_directives: |
|||
res.append('#line %d "%s"' % (line.lineno, line.filename)) |
|||
line_map.append({ 'original_file': line.filename, |
|||
'original_line': line.lineno, |
|||
'combined_line': len(res) + 1 }) |
|||
res.append(line.data) |
|||
emit_state[0] = line.filename |
|||
emit_state[1] = line.lineno + 1 |
|||
|
|||
included = {} # headers already included |
|||
|
|||
if prologue_filename is not None: |
|||
with open(prologue_filename, 'rb') as f: |
|||
for line in f.read().split('\n'): |
|||
res.append(line) |
|||
|
|||
re_inc = re.compile(r'^#include\s+(<|\")(.*?)(>|\").*$') |
|||
|
|||
# Process a file, appending it to the result; the input may be a |
|||
# source or an include file. #include directives are handled |
|||
# recursively. |
|||
def processFile(f): |
|||
logger.debug('Process file: ' + f.filename) |
|||
|
|||
for line in f.lines: |
|||
if not line.data.startswith('#include'): |
|||
emit(line) |
|||
continue |
|||
|
|||
m = re_inc.match(line.data) |
|||
if m is None: |
|||
raise Exception('Couldn\'t match #include line: %s' % repr(line.data)) |
|||
incpath = m.group(2) |
|||
if incpath in include_excluded: |
|||
# Specific include files excluded from the |
|||
# inlining / duplicate suppression process. |
|||
emit(line) # keep as is |
|||
continue |
|||
|
|||
if included.has_key(incpath): |
|||
# We suppress duplicate includes, both internal and |
|||
# external, based on the assumption that includes are |
|||
# not behind #if defined() checks. This is the case for |
|||
# Duktape (except for the include files excluded). |
|||
emit('/* #include %s -> already included */' % incpath) |
|||
continue |
|||
included[incpath] = True |
|||
|
|||
# An include file is considered "internal" and is amalgamated |
|||
# if it is found in the include path provided by the user. |
|||
|
|||
incfile = lookupInclude(incpath) |
|||
if incfile is not None: |
|||
logger.debug('Include considered internal: %s -> %s' % (repr(line.data), repr(incfile))) |
|||
emit('/* #include %s */' % incpath) |
|||
processFile(readFile(incfile)) |
|||
else: |
|||
logger.debug('Include considered external: %s' % repr(line.data)) |
|||
emit(line) # keep as is |
|||
|
|||
for f in files: |
|||
processFile(f) |
|||
|
|||
return '\n'.join(res) + '\n', metadata |
|||
|
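# The metadata written by main() below is JSON whose 'line_map' entries use
# the keys added in emit() above; an illustrative (made up) entry:
#
#   { "line_map": [ { "original_file": "duk_foo.c",
#                     "original_line": 1,
#                     "combined_line": 42 } ] }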
|||
def main(): |
|||
global include_paths, include_excluded |
|||
|
|||
parser = optparse.OptionParser() |
|||
parser.add_option('--include-path', dest='include_paths', action='append', default=[], help='Include directory for "internal" includes, can be specified multiple times') |
|||
parser.add_option('--include-exclude', dest='include_excluded', action='append', default=[], help='Include file excluded from being considered internal (even if found in include dirs)') |
|||
parser.add_option('--prologue', dest='prologue', help='Prologue to prepend to start of file') |
|||
parser.add_option('--output-source', dest='output_source', help='Output source filename') |
|||
parser.add_option('--output-metadata', dest='output_metadata', help='Output metadata filename') |
|||
parser.add_option('--line-directives', dest='line_directives', action='store_true', default=False, help='Use #line directives in combined source') |
|||
parser.add_option('--quiet', dest='quiet', action='store_true', default=False, help='Suppress info messages (show warnings)') |
|||
parser.add_option('--verbose', dest='verbose', action='store_true', default=False, help='Show verbose debug messages') |
|||
(opts, args) = parser.parse_args() |
|||
|
|||
assert(opts.include_paths is not None) |
|||
include_paths = opts.include_paths # global for easy access |
|||
include_excluded = opts.include_excluded |
|||
assert(opts.output_source) |
|||
assert(opts.output_metadata) |
|||
|
|||
# Log level. |
|||
if opts.quiet: |
|||
logger.setLevel(logging.WARNING) |
|||
elif opts.verbose: |
|||
logger.setLevel(logging.DEBUG) |
|||
|
|||
# Read input files, add automatic #undefs |
|||
sources = args |
|||
files = [] |
|||
for fn in sources: |
|||
res = readFile(fn) |
|||
logger.debug('Add automatic undefs for: ' + fn) |
|||
addAutomaticUndefs(res) |
|||
files.append(res) |
|||
|
|||
combined_source, metadata = \ |
|||
createCombined(files, opts.prologue, opts.line_directives) |
|||
with open(opts.output_source, 'wb') as f: |
|||
f.write(combined_source) |
|||
with open(opts.output_metadata, 'wb') as f: |
|||
f.write(json.dumps(metadata, indent=4)) |
|||
|
|||
logger.info('Combined %d source files, %d bytes written to %s' % (len(files), len(combined_source), opts.output_source)) |
|||
|
|||
if __name__ == '__main__': |
|||
main() |
@@ -1,246 +0,0 @@
#!/usr/bin/env python2
#
# Helper to create an SPDX license file (http://spdx.org).
#
# This must be executed when the dist/ directory is otherwise complete,
# except for the SPDX license, so that the file lists and such contained
# in the SPDX license will be correct.
#
# The utility outputs RDF/XML to the specified file:
#
#   $ python create_spdx_license.py /tmp/license.spdx
#
# Then, validate with SPDXViewer and SPDXTools:
#
#   $ java -jar SPDXViewer.jar /tmp/license.spdx
#   $ java -jar spdx-tools-1.2.5-jar-with-dependencies.jar RdfToHtml /tmp/license.spdx /tmp/license.html
#
# Finally, copy to dist:
#
#   $ cp /tmp/license.spdx dist/license.spdx
#
# The SPDX FAQ indicates there is no standard extension for an SPDX license
# file, but '.spdx' is common practice.
#
# The algorithm to compute a "verification code", implemented in this file,
# can be verified as follows:
#
#   # build dist tar.xz, copy to /tmp/duktape-N.N.N.tar.xz
#   $ cd /tmp
#   $ tar xvfJ duktape-N.N.N.tar.xz
#   $ rm duktape-N.N.N/license.spdx   # remove file excluded from verification code
#   $ java -jar spdx-tools-1.2.5-jar-with-dependencies.jar GenerateVerificationCode /tmp/duktape-N.N.N/
#
# Compare the resulting verification code manually with the one in license.spdx.
#
# Resources:
#
#   - http://spdx.org/about-spdx/faqs
#   - http://wiki.spdx.org/view/Technical_Team/Best_Practices
#
|||
|
|||
import os |
|||
import sys |
|||
import re |
|||
import datetime |
|||
import sha |
|||
import rdflib |
|||
from rdflib import URIRef, BNode, Literal, Namespace |
|||
|
|||
RDF = Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') |
|||
RDFS = Namespace('http://www.w3.org/2000/01/rdf-schema#') |
|||
XSD = Namespace('http://www.w3.org/2001/XMLSchema#') |
|||
SPDX = Namespace('http://spdx.org/rdf/terms#') |
|||
DOAP = Namespace('http://usefulinc.com/ns/doap#') |
|||
DUKTAPE = Namespace('http://duktape.org/rdf/terms#') |
|||
|
|||
def checksumFile(g, filename): |
|||
f = open(filename, 'rb') |
|||
d = f.read() |
|||
f.close() |
|||
shasum = sha.sha(d).digest().encode('hex').lower() |
|||
|
|||
csum_node = BNode() |
|||
g.add((csum_node, RDF.type, SPDX.Checksum)) |
|||
g.add((csum_node, SPDX.algorithm, SPDX.checksumAlgorithm_sha1)) |
|||
g.add((csum_node, SPDX.checksumValue, Literal(shasum))) |
|||
|
|||
return csum_node |
|||
|
|||
def computePackageVerification(g, dirname, excluded): |
|||
# SPDX 1.2 Section 4.7 |
|||
# The SPDXTools command "GenerateVerificationCode" can be used to |
|||
# check the verification codes created. Note that you must manually |
|||
# remove "license.spdx" from the unpacked dist directory before |
|||
# computing the verification code. |
|||
|
|||
verify_node = BNode() |
|||
|
|||
hashes = [] |
|||
for dirpath, dirnames, filenames in os.walk(dirname): |
|||
for fn in filenames: |
|||
full_fn = os.path.join(dirpath, fn) |
|||
f = open(full_fn, 'rb') |
|||
d = f.read() |
|||
f.close() |
|||
|
|||
if full_fn in excluded: |
|||
#print('excluded in verification: ' + full_fn) |
|||
continue |
|||
#print('included in verification: ' + full_fn) |
|||
|
|||
file_sha1 = sha.sha(d).digest().encode('hex').lower() |
|||
hashes.append(file_sha1) |
|||
|
|||
#print(repr(hashes)) |
|||
hashes.sort() |
|||
#print(repr(hashes)) |
|||
verify_code = sha.sha(''.join(hashes)).digest().encode('hex').lower() |
|||
|
|||
for fn in excluded: |
|||
g.add((verify_node, SPDX.packageVerificationCodeExcludedFile, Literal(fn))) |
|||
g.add((verify_node, SPDX.packageVerificationCodeValue, Literal(verify_code))) |
|||
|
|||
return verify_node |
|||
|
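# A minimal standalone sketch of the same verification code computation
# (SPDX 1.2 Section 4.7) using hashlib instead of the deprecated 'sha'
# module; the function name is made up and the function is not used by
# this script.

def sketch_verification_code(dirname, excluded):
    import hashlib
    import os
    hashes = []
    for dirpath, dirnames, filenames in os.walk(dirname):
        for fn in filenames:
            full_fn = os.path.join(dirpath, fn)
            if full_fn in excluded:
                continue
            with open(full_fn, 'rb') as f:
                hashes.append(hashlib.sha1(f.read()).hexdigest().lower())
    hashes.sort()  # verification code is independent of file order
    return hashlib.sha1(''.join(hashes).encode('ascii')).hexdigest().lower()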
|||
def fileType(filename): |
|||
ign, ext = os.path.splitext(filename) |
|||
if ext in [ '.c', '.h', '.js' ]: |
|||
return SPDX.fileType_source |
|||
else: |
|||
return SPDX.fileType_other |
|||
|
|||
def getDuktapeVersion(): |
|||
f = open('./src/duktape.h') |
|||
re_ver = re.compile(r'^#define\s+DUK_VERSION\s+(\d+)L$') |
|||
for line in f: |
|||
line = line.strip() |
|||
m = re_ver.match(line) |
|||
if m is None: |
|||
continue |
|||
ver = int(m.group(1)) |
|||
return '%d.%d.%d' % ((ver / 10000) % 100, |
|||
(ver / 100) % 100, |
|||
ver % 100) |
|||
|
|||
raise Exception('could not figure out Duktape version') |
|||
|
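# Worked example of the version encoding decoded above: DUK_VERSION packs
# the version as (major * 10000 + minor * 100 + patch), so a value such as
# 10500L yields (10500 / 10000) % 100 = 1, (10500 / 100) % 100 = 5 and
# 10500 % 100 = 0, i.e. the version string '1.5.0'.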
|||
def main(): |
|||
outfile = sys.argv[1] |
|||
|
|||
if not (os.path.exists('CONTRIBUTING.md') and os.path.exists('tests/ecmascript')):
    sys.stderr.write('Invalid CWD, must be in Duktape root with dist/ built\n')
    sys.exit(1)
os.chdir('dist')
if not os.path.exists('Makefile.cmdline'):
    sys.stderr.write('Invalid CWD, must be in Duktape root with dist/ built\n')
    sys.exit(1)
|||
|
|||
duktape_version = getDuktapeVersion() |
|||
duktape_pkgname = 'duktape-' + duktape_version + '.tar.xz' |
|||
now = datetime.datetime.utcnow() |
|||
now = datetime.datetime(now.year, now.month, now.day, now.hour, now.minute, now.second) |
|||
creation_date = Literal(now.isoformat() + 'Z', datatype=XSD.dateTime) |
|||
duktape_org = Literal('Organization: duktape.org') |
|||
mit_license = URIRef('http://spdx.org/licenses/MIT') |
|||
duktape_copyright = Literal('Copyright 2013-2017 Duktape authors (see AUTHORS.rst in the Duktape distributable)') |
|||
|
|||
g = rdflib.Graph() |
|||
|
|||
crea_node = BNode() |
|||
g.add((crea_node, RDF.type, SPDX.CreationInfo)) |
|||
g.add((crea_node, RDFS.comment, Literal(''))) |
|||
g.add((crea_node, SPDX.creator, duktape_org)) |
|||
g.add((crea_node, SPDX.created, creation_date)) |
|||
g.add((crea_node, SPDX.licenseListVersion, Literal('1.20'))) # http://spdx.org/licenses/ |
|||
|
|||
# 'name' should not include a version number (see best practices) |
|||
pkg_node = BNode() |
|||
g.add((pkg_node, RDF.type, SPDX.Package)) |
|||
g.add((pkg_node, SPDX.name, Literal('Duktape'))) |
|||
g.add((pkg_node, SPDX.versionInfo, Literal(duktape_version))) |
|||
g.add((pkg_node, SPDX.packageFileName, Literal(duktape_pkgname))) |
|||
g.add((pkg_node, SPDX.supplier, duktape_org)) |
|||
g.add((pkg_node, SPDX.originator, duktape_org)) |
|||
g.add((pkg_node, SPDX.downloadLocation, Literal('http://duktape.org/' + duktape_pkgname, datatype=XSD.anyURI))) |
|||
g.add((pkg_node, SPDX.homePage, Literal('http://duktape.org/', datatype=XSD.anyURI))) |
|||
verify_node = computePackageVerification(g, '.', [ './license.spdx' ]) |
|||
g.add((pkg_node, SPDX.packageVerificationCode, verify_node)) |
|||
# SPDX.checksum: omitted because license is inside the package |
|||
g.add((pkg_node, SPDX.sourceInfo, Literal('Official duktape.org release built from GitHub repo https://github.com/svaarala/duktape.'))) |
|||
|
|||
# NOTE: MIT license alone is sufficient for now, because Duktape, Lua, |
|||
# Murmurhash2, and CommonJS (though probably not even relevant for |
|||
# licensing) are all MIT. |
|||
g.add((pkg_node, SPDX.licenseConcluded, mit_license)) |
|||
g.add((pkg_node, SPDX.licenseInfoFromFiles, mit_license)) |
|||
g.add((pkg_node, SPDX.licenseDeclared, mit_license)) |
|||
g.add((pkg_node, SPDX.licenseComments, Literal('Duktape is copyrighted by its authors and licensed under the MIT license. MurmurHash2 is used internally, it is also under the MIT license. Duktape module loader is based on the CommonJS module loading specification (without sharing any code), CommonJS is under the MIT license.'))) |
|||
g.add((pkg_node, SPDX.copyrightText, duktape_copyright)) |
|||
g.add((pkg_node, SPDX.summary, Literal('Duktape ECMAScript interpreter'))) |
|||
g.add((pkg_node, SPDX.description, Literal('Duktape is an embeddable Javascript engine, with a focus on portability and compact footprint'))) |
|||
# hasFile properties added separately below |
|||
|
|||
#reviewed_node = BNode() |
|||
#g.add((reviewed_node, RDF.type, SPDX.Review)) |
|||
#g.add((reviewed_node, SPDX.reviewer, XXX)) |
|||
#g.add((reviewed_node, SPDX.reviewDate, XXX)) |
|||
#g.add((reviewed_node, RDFS.comment, '')) |
|||
|
|||
spdx_doc = BNode() |
|||
g.add((spdx_doc, RDF.type, SPDX.SpdxDocument)) |
|||
g.add((spdx_doc, SPDX.specVersion, Literal('SPDX-1.2'))) |
|||
g.add((spdx_doc, SPDX.dataLicense, URIRef('http://spdx.org/licenses/CC0-1.0'))) |
|||
g.add((spdx_doc, RDFS.comment, Literal('SPDX license for Duktape ' + duktape_version))) |
|||
g.add((spdx_doc, SPDX.creationInfo, crea_node)) |
|||
g.add((spdx_doc, SPDX.describesPackage, pkg_node)) |
|||
# SPDX.hasExtractedLicensingInfo |
|||
# SPDX.reviewed |
|||
# SPDX.referencesFile: added below |
|||
|
|||
for dirpath, dirnames, filenames in os.walk('.'): |
|||
for fn in filenames: |
|||
full_fn = os.path.join(dirpath, fn) |
|||
#print('# file: ' + full_fn) |
|||
|
|||
file_node = BNode() |
|||
g.add((file_node, RDF.type, SPDX.File)) |
|||
g.add((file_node, SPDX.fileName, Literal(full_fn))) |
|||
g.add((file_node, SPDX.fileType, fileType(full_fn))) |
|||
g.add((file_node, SPDX.checksum, checksumFile(g, full_fn))) |
|||
|
|||
# Here we assume that LICENSE.txt provides the actual "in file" |
|||
# licensing information, and everything else is implicitly under |
|||
# MIT license. |
|||
g.add((file_node, SPDX.licenseConcluded, mit_license)) |
|||
if full_fn == './LICENSE.txt': |
|||
g.add((file_node, SPDX.licenseInfoInFile, mit_license)) |
|||
else: |
|||
g.add((file_node, SPDX.licenseInfoInFile, URIRef(SPDX.none))) |
|||
|
|||
# SPDX.licenseComments |
|||
g.add((file_node, SPDX.copyrightText, duktape_copyright)) |
|||
# SPDX.noticeText |
|||
# SPDX.artifactOf |
|||
# SPDX.fileDependency |
|||
# SPDX.fileContributor |
|||
|
|||
# XXX: should referencesFile include all files? |
|||
g.add((spdx_doc, SPDX.referencesFile, file_node)) |
|||
|
|||
g.add((pkg_node, SPDX.hasFile, file_node)) |
|||
|
|||
# Serialize into RDF/XML directly. We could also serialize into |
|||
# N-Triples and use external tools (like 'rapper') to get cleaner, |
|||
# abbreviated output. |
|||
|
|||
#print('# Duktape SPDX license file (autogenerated)') |
|||
#print(g.serialize(format='turtle')) |
|||
#print(g.serialize(format='nt')) |
|||
f = open(outfile, 'wb') |
|||
#f.write(g.serialize(format='rdf/xml')) |
|||
f.write(g.serialize(format='xml')) |
|||
f.close() |
|||
|
|||
if __name__ == '__main__': |
|||
main() |
@@ -1,733 +0,0 @@
#!/usr/bin/env python2
#
# Extract rules for Unicode case conversion, specifically the behavior
# required by ECMAScript E5 in Sections 15.5.4.16 to 15.5.4.19. The
# bitstream encoded rules are used for the slow path at run time, so
# compactness is favored over speed.
#
# There is no support for context or locale sensitive rules, as they
# are handled directly in C code before consulting tables generated
# here. ECMAScript requires case conversion both with and without
# locale/language specific rules (e.g. String.prototype.toLowerCase()
# and String.prototype.toLocaleLowerCase()), so they are best handled
# in C anyway.
#
# Case conversion rules for ASCII are also excluded as they are handled
# by the C fast path. Rules for non-BMP characters (codepoints above
# U+FFFF) are omitted as they're not required for standard ECMAScript.
#
|||
|
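# A concrete example of why 1:n mappings must be supported: SpecialCasing.txt
# maps U+00DF (LATIN SMALL LETTER SHARP S) to a two-character uppercase
# sequence, with a line roughly of the form:
#
#   00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
#
# i.e. lowercase stays U+00DF while titlecase is "Ss" and uppercase is "SS".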
|||
import os |
|||
import sys |
|||
import re |
|||
import math |
|||
import optparse |
|||
|
|||
import dukutil |
|||
|
|||
class UnicodeData: |
|||
"""Read UnicodeData.txt into an internal representation.""" |
|||
|
|||
def __init__(self, filename): |
|||
self.data = self.read_unicode_data(filename) |
|||
print('read %d unicode data entries' % len(self.data)) |
|||
|
|||
def read_unicode_data(self, filename): |
|||
res = [] |
|||
f = open(filename, 'rb') |
|||
for line in f: |
|||
if line.startswith('#'): |
|||
continue |
|||
line = line.strip() |
|||
if line == '': |
|||
continue |
|||
parts = line.split(';') |
|||
if len(parts) != 15: |
|||
raise Exception('invalid unicode data line') |
|||
res.append(parts) |
|||
f.close() |
|||
|
|||
# Sort based on Unicode codepoint. |
|||
def mycmp(a,b): |
|||
return cmp(long(a[0], 16), long(b[0], 16)) |
|||
|
|||
res.sort(cmp=mycmp) |
|||
return res |
|||
|
|||
class SpecialCasing: |
|||
"""Read SpecialCasing.txt into an internal representation.""" |
|||
|
|||
def __init__(self, filename): |
|||
self.data = self.read_special_casing_data(filename) |
|||
print('read %d special casing entries' % len(self.data)) |
|||
|
|||
def read_special_casing_data(self, filename): |
|||
res = [] |
|||
f = open(filename, 'rb') |
|||
for line in f: |
|||
try: |
|||
idx = line.index('#') |
|||
line = line[:idx] |
|||
except ValueError: |
|||
pass |
|||
line = line.strip() |
|||
if line == '': |
|||
continue |
|||
parts = line.split(';') |
|||
parts = [i.strip() for i in parts] |
|||
while len(parts) < 6: |
|||
parts.append('') |
|||
res.append(parts) |
|||
f.close() |
|||
return res |
|||
|
|||
def parse_unicode_sequence(x): |
|||
"""Parse a Unicode sequence like ABCD 1234 into a unicode string.""" |
|||
res = '' |
|||
for i in x.split(' '): |
|||
i = i.strip() |
|||
if i == '': |
|||
continue |
|||
res += unichr(long(i, 16)) |
|||
return res |
|||
|
|||
def get_base_conversion_maps(unicode_data): |
|||
"""Create case conversion tables without handling special casing yet.""" |
|||
|
|||
uc = {} # uppercase, codepoint (number) -> string |
|||
lc = {} # lowercase |
|||
tc = {} # titlecase |
|||
|
|||
for x in unicode_data.data: |
|||
c1 = long(x[0], 16) |
|||
|
|||
# just 16-bit support needed |
|||
if c1 >= 0x10000: |
|||
continue |
|||
|
|||
if x[12] != '': |
|||
# field 12: simple uppercase mapping |
|||
c2 = parse_unicode_sequence(x[12]) |
|||
uc[c1] = c2 |
|||
tc[c1] = c2 # titlecase default == uppercase, overridden below if necessary |
|||
if x[13] != '': |
|||
# field 13: simple lowercase mapping |
|||
c2 = parse_unicode_sequence(x[13]) |
|||
lc[c1] = c2 |
|||
if x[14] != '': |
|||
# field 14: simple titlecase mapping |
|||
c2 = parse_unicode_sequence(x[14]) |
|||
tc[c1] = c2 |
|||
|
|||
return uc, lc, tc |
|||
|
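# Illustrative UnicodeData.txt entry (semicolon separated, 15 fields):
#
#   0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041
#
# Field 12 (0041) is the simple uppercase mapping and field 14 the simple
# titlecase mapping, so get_base_conversion_maps() above sets uc[0x61] and
# tc[0x61] to u'A'; field 13 (simple lowercase mapping) is empty here.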
|||
def update_special_casings(uc, lc, tc, special_casing): |
|||
"""Update case conversion tables with special case conversion rules.""" |
|||
|
|||
for x in special_casing.data: |
|||
c1 = long(x[0], 16) |
|||
|
|||
if x[4] != '': |
|||
# conditions |
|||
continue |
|||
|
|||
lower = parse_unicode_sequence(x[1]) |
|||
title = parse_unicode_sequence(x[2]) |
|||
upper = parse_unicode_sequence(x[3]) |
|||
|
|||
if len(lower) > 1: |
|||
lc[c1] = lower |
|||
if len(upper) > 1: |
|||
uc[c1] = upper |
|||
if len(title) > 1: |
|||
tc[c1] = title |
|||
|
|||
print('- special case: %d %d %d' % (len(lower), len(upper), len(title))) |
|||
|
|||
def remove_ascii_part(convmap): |
|||
"""Remove ASCII case conversion parts (handled by C fast path).""" |
|||
|
|||
for i in xrange(128): |
|||
if convmap.has_key(i): |
|||
del convmap[i] |
|||
|
|||
def scan_range_with_skip(convmap, start_idx, skip): |
|||
"""Scan for a range of continuous case conversion with a certain 'skip'.""" |
|||
|
|||
conv_i = start_idx |
|||
if not convmap.has_key(conv_i): |
|||
return None, None, None |
|||
elif len(convmap[conv_i]) > 1: |
|||
return None, None, None |
|||
else: |
|||
conv_o = ord(convmap[conv_i]) |
|||
|
|||
start_i = conv_i |
|||
start_o = conv_o |
|||
|
|||
while True: |
|||
new_i = conv_i + skip |
|||
new_o = conv_o + skip |
|||
|
|||
if not convmap.has_key(new_i): |
|||
break |
|||
if len(convmap[new_i]) > 1: |
|||
break |
|||
if ord(convmap[new_i]) != new_o: |
|||
break |
|||
|
|||
conv_i = new_i |
|||
conv_o = new_o |
|||
|
|||
# [start_i,conv_i] maps to [start_o,conv_o], ignore ranges of 1 char. |
|||
count = (conv_i - start_i) / skip + 1 |
|||
if count <= 1: |
|||
return None, None, None |
|||
|
|||
# We have an acceptable range, remove them from the convmap here. |
|||
for i in xrange(start_i, conv_i + skip, skip): |
|||
del convmap[i] |
|||
|
|||
return start_i, start_o, count |
|||
|
|||
def find_first_range_with_skip(convmap, skip): |
|||
"""Find first range with a certain 'skip' value.""" |
|||
|
|||
for i in xrange(65536): |
|||
start_i, start_o, count = scan_range_with_skip(convmap, i, skip) |
|||
if start_i is None: |
|||
continue |
|||
return start_i, start_o, count |
|||
|
|||
return None, None, None |
|||
|
|||
def generate_caseconv_tables(convmap): |
|||
"""Generate bit-packed case conversion table for a given conversion map.""" |
|||
|
|||
# The bitstream encoding is based on manual inspection for whatever
# regularity the Unicode case conversion rules have.
#
# Start with a full description of case conversions which does not
# cover all codepoints; unmapped codepoints convert to themselves.
# Scan for range-to-range mappings with a range of skips starting from 1.
# Whenever a valid range is found, remove it from the map. Finally,
# output the remaining case conversions (1:1 and 1:n) on a per codepoint
# basis.
#
# This is very slow because we always scan from scratch, but it's the
# most reliable and simple way to scan.
|||
|
|||
print('generate caseconv tables') |
|||
|
|||
ranges = [] # range mappings (2 or more consecutive mappings with a certain skip) |
|||
singles = [] # 1:1 character mappings |
|||
multis = [] # 1:n character mappings |
|||
|
|||
# Ranges with skips |
|||
|
|||
for skip in xrange(1,6+1): # skips 1...6 are useful |
|||
while True: |
|||
start_i, start_o, count = find_first_range_with_skip(convmap, skip) |
|||
if start_i is None: |
|||
break |
|||
print('- skip %d: %d %d %d' % (skip, start_i, start_o, count)) |
|||
ranges.append([start_i, start_o, count, skip]) |
|||
|
|||
# 1:1 conversions |
|||
|
|||
k = convmap.keys() |
|||
k.sort() |
|||
for i in k: |
|||
if len(convmap[i]) > 1: |
|||
continue |
|||
singles.append([i, ord(convmap[i])]) # codepoint, codepoint |
|||
del convmap[i] |
|||
|
|||
# There are many mappings to 2-char sequences with the latter char being
# U+0399. These could be handled as a special case, but we don't do that
# right now.
|||
# |
|||
# [8064L, u'\u1f08\u0399'] |
|||
# [8065L, u'\u1f09\u0399'] |
|||
# [8066L, u'\u1f0a\u0399'] |
|||
# [8067L, u'\u1f0b\u0399'] |
|||
# [8068L, u'\u1f0c\u0399'] |
|||
# [8069L, u'\u1f0d\u0399'] |
|||
# [8070L, u'\u1f0e\u0399'] |
|||
# [8071L, u'\u1f0f\u0399'] |
|||
# ... |
|||
# |
|||
# tmp = {} |
|||
# k = convmap.keys() |
|||
# k.sort() |
|||
# for i in k: |
|||
# if len(convmap[i]) == 2 and convmap[i][1] == u'\u0399': |
|||
# tmp[i] = convmap[i][0] |
|||
# del convmap[i] |
|||
# print(repr(tmp)) |
|||
# |
|||
# skip = 1 |
|||
# while True: |
|||
# start_i, start_o, count = find_first_range_with_skip(tmp, skip) |
|||
# if start_i is None: |
|||
# break |
|||
# print('- special399, skip %d: %d %d %d' % (skip, start_i, start_o, count)) |
|||
# print(len(tmp.keys())) |
|||
# print(repr(tmp)) |
|||
# XXX: need to put 12 remaining mappings back to convmap |
|||
|
|||
# 1:n conversions |
|||
|
|||
k = convmap.keys() |
|||
k.sort() |
|||
for i in k: |
|||
multis.append([i, convmap[i]]) # codepoint, string |
|||
del convmap[i] |
|||
|
|||
for t in singles: |
|||
print '- singles: ' + repr(t) |
|||
|
|||
for t in multis: |
|||
print '- multis: ' + repr(t) |
|||
|
|||
print '- range mappings: %d' % len(ranges) |
|||
print '- single character mappings: %d' % len(singles) |
|||
print '- complex mappings (1:n): %d' % len(multis) |
|||
print '- remaining (should be zero): %d' % len(convmap.keys()) |
|||
|
|||
# XXX: opportunities for diff encoding skip=3 ranges? |
|||
prev = None |
|||
for t in ranges: |
|||
# range: [start_i, start_o, count, skip] |
|||
if t[3] != 3: |
|||
continue |
|||
if prev is not None: |
|||
print '- %d %d' % (t[0] - prev[0], t[1] - prev[1]) |
|||
else: |
|||
print '- start: %d %d' % (t[0], t[1]) |
|||
prev = t |
|||
|
|||
# Bit packed encoding. |
|||
|
|||
be = dukutil.BitEncoder() |
|||
|
|||
for curr_skip in xrange(1, 7): # 1...6 |
|||
count = 0 |
|||
for r in ranges: |
|||
start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3] |
|||
if skip != curr_skip: |
|||
continue |
|||
count += 1 |
|||
be.bits(count, 6) |
|||
print('- encode: skip=%d, count=%d' % (curr_skip, count)) |
|||
|
|||
for r in ranges: |
|||
start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3] |
|||
if skip != curr_skip: |
|||
continue |
|||
be.bits(start_i, 16) |
|||
be.bits(start_o, 16) |
|||
be.bits(r_count, 7) |
|||
be.bits(0x3f, 6) # maximum count value = end of skips |
|||
|
|||
count = len(singles) |
|||
be.bits(count, 7) |
|||
for t in singles: |
|||
cp_i, cp_o = t[0], t[1] |
|||
be.bits(cp_i, 16) |
|||
be.bits(cp_o, 16) |
|||
|
|||
count = len(multis) |
|||
be.bits(count, 7) |
|||
for t in multis: |
|||
cp_i, str_o = t[0], t[1] |
|||
be.bits(cp_i, 16) |
|||
be.bits(len(str_o), 2) |
|||
for i in xrange(len(str_o)): |
|||
be.bits(ord(str_o[i]), 16) |
|||
|
|||
return be.getBytes(), be.getNumBits() |
|||
|
|||
def generate_regexp_canonicalize_tables(convmap): |
|||
"""Generate tables for case insensitive RegExp normalization.""" |
|||
|
|||
# Generate a direct codepoint lookup for canonicalizing BMP range. |
|||
|
|||
def generate_canontab(): |
|||
res = [] |
|||
highest_nonid = -1 |
|||
|
|||
for cp in xrange(65536): |
|||
res_cp = cp # default to as is |
|||
if convmap.has_key(cp): |
|||
tmp = convmap[cp] |
|||
if len(tmp) == 1: |
|||
# If multiple codepoints from input, ignore. |
|||
res_cp = ord(tmp[0]) |
|||
if cp >= 0x80 and res_cp < 0x80: |
|||
res_cp = cp # If non-ASCII mapped to ASCII, ignore. |
|||
if cp != res_cp: |
|||
highest_nonid = cp |
|||
res.append(res_cp) |
|||
|
|||
# At the moment this is 65370, which means there's very little gain in
# assuming a 1:1 mapping above a certain BMP codepoint (though we do
# assume a 1:1 mapping for codepoints above the BMP).
|||
print('- highest non-identity mapping: %d' % highest_nonid) |
|||
|
|||
return res |
|||
|
|||
print('generate canontab') |
|||
canontab = generate_canontab() |
|||
|
|||
# Figure out which BMP values are never the result of canonicalization. |
|||
# Such codepoints are "don't care" in the sense that they are never |
|||
# matched against at runtime: ranges are canonicalized at compile time, |
|||
# and codepoint being matched is also canonicalized at run time. |
|||
# (Currently unused.) |
|||
|
|||
def generate_dontcare(): |
|||
res = [ True ] * 65536 |
|||
for cp in canontab: |
|||
res[cp] = False |
|||
res_count = 0 |
|||
for x in res: |
|||
if x: |
|||
res_count += 1 |
|||
print('- %d dontcare codepoints' % res_count) |
|||
return res |
|||
|
|||
print('generate canon dontcare') |
|||
dontcare = generate_dontcare() |
|||
|
|||
# Generate maximal continuous ranges for canonicalization. A continuous |
|||
# range is a sequence with N codepoints where IN+i canonicalizes to OUT+i |
|||
# for fixed IN, OUT, and i in 0...N-1. There are unfortunately >1000 |
|||
# of these ranges, mostly because there are a lot of individual exceptions. |
|||
# (Currently unused.) |
|||
|
|||
canon_ranges = [] |
|||
for cp in xrange(65536): |
|||
canon_ranges.append([ cp, canontab[cp], 1 ]) # 1 codepoint ranges at first |
|||
def merge_compatible_nogap(rng1, rng2): |
|||
# Merge adjacent ranges if continuity allows. |
|||
if rng1[0] + rng1[2] == rng2[0] and \ |
|||
rng1[1] + rng1[2] == rng2[1]: |
|||
return [ rng1[0], rng1[1], rng1[2] + rng2[2] ] |
|||
return None |
|||
def merge_check_nogap(): |
|||
len_start = len(canon_ranges) |
|||
for i in xrange(len(canon_ranges) - 1): |
|||
j = i + 1 |
|||
rng1 = canon_ranges[i] |
|||
rng2 = canon_ranges[j] |
|||
if rng1 is None or rng2 is None: continue |
|||
merged = merge_compatible_nogap(rng1, rng2) |
|||
if merged is not None: |
|||
canon_ranges[j] = None |
|||
canon_ranges[i] = merged |
|||
filtered = [] |
|||
for x in canon_ranges: |
|||
if x is not None: |
|||
filtered.append(x) |
|||
len_end = len(filtered) |
|||
if len_end < len_start: |
|||
return filtered |
|||
return None |
|||
|
|||
print('generate canon_ranges') |
|||
while True: |
|||
# Starting from individual ranges of 1 codepoint, merge adjacent |
|||
# ranges until no more ranges can be merged. |
|||
t = merge_check_nogap() |
|||
if t is None: |
|||
break |
|||
canon_ranges = t |
|||
print('- %d ranges' % len(canon_ranges)) |
|||
#for rng in canon_ranges: |
|||
# print('canon_ranges:') |
|||
# print(repr(rng)) |
|||
|
|||
# Generate true/false ranges for BMP codepoints where: |
|||
# - A codepoint is flagged true if continuity is broken at that point, so |
|||
# an explicit codepoint canonicalization is needed at runtime. |
|||
# - A codepoint is flagged false if case conversion is continuous from the |
|||
# previous codepoint, i.e. out_curr = out_prev + 1. |
|||
# |
|||
# The result is a lot of small ranges due to a lot of small 'false' ranges. |
|||
# Reduce the range set by checking if adjacent 'true' ranges have at most |
|||
# false_limit 'false' entries between them. If so, force the 'false' |
|||
# entries to 'true' (safe but results in an unnecessary runtime codepoint |
|||
# lookup) and merge the three ranges into a larger 'true' range. |
|||
# |
|||
# (Currently unused.) |
|||
|
|||
def generate_needcheck_straight(): |
|||
res = [ True ] * 65536 |
|||
assert(canontab[0] == 0) # can start from in == out == 0 |
|||
prev_in = -1 |
|||
prev_out = -1 |
|||
for i in xrange(65536): |
|||
# First create a straight true/false bitmap for BMP. |
|||
curr_in = i |
|||
curr_out = canontab[i] |
|||
if prev_in + 1 == curr_in and prev_out + 1 == curr_out: |
|||
res[i] = False |
|||
prev_in = curr_in |
|||
prev_out = curr_out |
|||
return res |
|||
def generate_needcheck_ranges(data): |
|||
# Generate maximal accurate ranges. |
|||
prev = None |
|||
count = 0 |
|||
ranges = [] |
|||
for i in data: |
|||
if prev is None or prev != i: |
|||
if prev is not None: |
|||
ranges.append([ prev, count ]) |
|||
prev = i |
|||
count = 1 |
|||
else: |
|||
count += 1 |
|||
if prev is not None: |
|||
ranges.append([ prev, count ]) |
|||
return ranges |
|||
def fillin_needcheck_ranges(data, false_limit): |
|||
# Fill in TRUE-FALSE*N-TRUE gaps into TRUE-TRUE*N-TRUE which is |
|||
# safe (leads to an unnecessary runtime check) but reduces |
|||
# range data size considerably. |
|||
res = [] |
|||
for r in data: |
|||
res.append([ r[0], r[1] ]) |
|||
while True: |
|||
found = False |
|||
for i in xrange(len(res) - 2): |
|||
r1 = res[i] |
|||
r2 = res[i + 1] |
|||
r3 = res[i + 2] |
|||
if r1[0] == True and r2[0] == False and r3[0] == True and \ |
|||
r2[1] <= false_limit: |
|||
#print('fillin %d falses' % r2[1]) |
|||
res.pop(i + 2) |
|||
res.pop(i + 1) |
|||
res[i] = [ True, r1[1] + r2[1] + r3[1] ] |
|||
found = True |
|||
break |
|||
if not found: |
|||
break |
|||
return res |
|||
|
|||
print('generate needcheck straight') |
|||
needcheck = generate_needcheck_straight() |
|||
|
|||
print('generate needcheck without false fillins') |
|||
needcheck_ranges1 = generate_needcheck_ranges(needcheck) |
|||
print('- %d ranges' % len(needcheck_ranges1)) |
|||
#print(needcheck_ranges1) |
|||
|
|||
print('generate needcheck with false fillins') |
|||
needcheck_ranges2 = fillin_needcheck_ranges(needcheck_ranges1, 11) |
|||
print('- %d ranges' % len(needcheck_ranges2)) |
|||
#print(needcheck_ranges2) |
|||
|
|||
# Generate a bitmap for BMP, divided into N-codepoint blocks, with each |
|||
# bit indicating: "entire codepoint block canonicalizes continuously, and |
|||
# the block is continuous with the previous and next block". A 'true' |
|||
# entry allows runtime code to just skip the block, advancing 'in' and |
|||
# 'out' by the block size, with no codepoint conversion. The block size |
|||
# should be large enough to produce a relatively small lookup table, but |
|||
# small enough to reduce codepoint conversions to a manageable number |
|||
# because the conversions are (currently) quite slow. This matters |
|||
# especially for case-insensitive RegExps; without any optimization, |
|||
# /[\u0000-\uffff]/i requires 65536 case conversions for runtime |
|||
# normalization. |
|||
|
|||
block_shift = 5 |
|||
block_size = 1 << block_shift |
|||
block_mask = block_size - 1 |
|||
num_blocks = 65536 / block_size |
|||
|
|||
def generate_block_bits(check_continuity): |
|||
res = [ True ] * num_blocks |
|||
for i in xrange(num_blocks): |
|||
base_in = i * block_size |
|||
base_out = canontab[base_in] |
|||
if check_continuity: |
|||
lower = -1 # [-1,block_size] |
|||
upper = block_size + 1 |
|||
else: |
|||
lower = 0 # [0,block_size-1] |
|||
upper = block_size |
|||
for j in xrange(lower, upper): |
|||
cp = base_in + j |
|||
if cp >= 0x0000 and cp <= 0xffff and canontab[cp] != base_out + j: |
|||
res[i] = False |
|||
break |
|||
return res |
|||
|
|||
def dump_block_bitmap(bits): |
|||
tmp = ''.join([ ({ True: 'x', False: '.' })[b] for b in bits]) |
|||
tmp = re.sub(r'.{64}', lambda x: x.group(0) + '\n', tmp) |
|||
blocks_true = tmp.count('x') |
|||
blocks_false = tmp.count('.') |
|||
print('%d codepoint blocks are continuous, %d blocks are not' % (blocks_true, blocks_false)) |
|||
sys.stdout.write(tmp) |
|||
#print(bits) |
|||
|
|||
def dump_test_lookup(bits): |
|||
sys.stdout.write('duk_uint8_t test = {'); |
|||
for b in bits: |
|||
if b: |
|||
sys.stdout.write('1,') |
|||
else: |
|||
sys.stdout.write('0,') |
|||
sys.stdout.write('};\n') |
|||
|
|||
def convert_to_bitmap(bits): |
|||
# C code looks up bits as: |
|||
# index = codepoint >> N |
|||
# bitnum = codepoint & mask |
|||
# bitmask = 1 << bitnum |
|||
# So block 0 is mask 0x01 of first byte, block 1 is mask 0x02 of |
|||
# first byte, etc. |
|||
res = [] |
|||
curr = 0 |
|||
mask = 0x01 |
|||
for b in bits: |
|||
if b: |
|||
curr += mask |
|||
mask = mask * 2 |
|||
if mask == 0x100: |
|||
res.append(curr) |
|||
curr = 0 |
|||
mask = 0x01 |
|||
assert(mask == 0x01) # no leftover |
|||
return res |
|||
|
|||
print('generate canon block bitmap without continuity') |
|||
block_bits1 = generate_block_bits(False) |
|||
dump_block_bitmap(block_bits1) |
|||
dump_test_lookup(block_bits1) |
|||
|
|||
print('generate canon block bitmap with continuity') |
|||
block_bits2 = generate_block_bits(True) |
|||
dump_block_bitmap(block_bits2) |
|||
dump_test_lookup(block_bits2) |
|||
|
|||
print('generate final canon bitmap') |
|||
block_bitmap = convert_to_bitmap(block_bits2) |
|||
print('- %d bytes' % len(block_bitmap)) |
|||
print('- ' + repr(block_bitmap)) |
|||
canon_bitmap = { |
|||
'data': block_bitmap, |
|||
'block_size': block_size, |
|||
'block_shift': block_shift, |
|||
'block_mask': block_mask |
|||
} |
|||
|
|||
# This is useful to figure out corner case test cases. |
|||
print('canon blocks which are different with and without continuity check') |
|||
for i in xrange(num_blocks): |
|||
if block_bits1[i] != block_bits2[i]: |
|||
print('- block %d ([%d,%d]) differs' % (i, i * block_size, i * block_size + block_size - 1)) |
|||
|
|||
return canontab, canon_bitmap |
|||
|
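# A sketch of how the block bitmap built above can be consulted at runtime
# (the real lookup lives in the C code; the function and argument names here
# are made up). Each bit covers one block of block_size codepoints and is
# packed LSB first per byte, as done in convert_to_bitmap() above.

def sketch_block_is_continuous(bitmap, codepoint, block_shift=5):
    block = codepoint >> block_shift   # which codepoint block
    byte_index = block >> 3            # 8 block bits per byte
    bit_mask = 1 << (block & 0x07)     # LSB-first within the byte
    return (bitmap[byte_index] & bit_mask) != 0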
|||
def clonedict(x): |
|||
"Shallow clone of input dict." |
|||
res = {} |
|||
for k in x.keys(): |
|||
res[k] = x[k] |
|||
return res |
|||
|
|||
def main(): |
|||
parser = optparse.OptionParser() |
|||
parser.add_option('--command', dest='command', default='caseconv_bitpacked') |
|||
parser.add_option('--unicode-data', dest='unicode_data') |
|||
parser.add_option('--special-casing', dest='special_casing') |
|||
parser.add_option('--out-source', dest='out_source') |
|||
parser.add_option('--out-header', dest='out_header') |
|||
parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc') |
|||
parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc') |
|||
parser.add_option('--table-name-re-canon-lookup', dest='table_name_re_canon_lookup', default='caseconv_re_canon_lookup') |
|||
parser.add_option('--table-name-re-canon-bitmap', dest='table_name_re_canon_bitmap', default='caseconv_re_canon_bitmap') |
|||
(opts, args) = parser.parse_args() |
|||
|
|||
unicode_data = UnicodeData(opts.unicode_data) |
|||
special_casing = SpecialCasing(opts.special_casing) |
|||
|
|||
uc, lc, tc = get_base_conversion_maps(unicode_data) |
|||
update_special_casings(uc, lc, tc, special_casing) |
|||
|
|||
if opts.command == 'caseconv_bitpacked': |
|||
# XXX: ASCII and non-BMP filtering could be an option but is now hardcoded |
|||
|
|||
# ASCII is handled with 'fast path' so not needed here. |
|||
t = clonedict(uc) |
|||
remove_ascii_part(t) |
|||
uc_bytes, uc_nbits = generate_caseconv_tables(t) |
|||
|
|||
t = clonedict(lc) |
|||
remove_ascii_part(t) |
|||
lc_bytes, lc_nbits = generate_caseconv_tables(t) |
|||
|
|||
# Generate C source and header files. |
|||
genc = dukutil.GenerateC() |
|||
genc.emitHeader('extract_caseconv.py') |
|||
genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True) |
|||
genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True) |
|||
f = open(opts.out_source, 'wb') |
|||
f.write(genc.getString()) |
|||
f.close() |
|||
|
|||
genc = dukutil.GenerateC() |
|||
genc.emitHeader('extract_caseconv.py') |
|||
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes))) |
|||
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes))) |
|||
f = open(opts.out_header, 'wb') |
|||
f.write(genc.getString()) |
|||
f.close() |
|||
elif opts.command == 're_canon_lookup': |
|||
# Direct canonicalization lookup for case insensitive regexps, includes ascii part. |
|||
t = clonedict(uc) |
|||
re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t) |
|||
|
|||
genc = dukutil.GenerateC() |
|||
genc.emitHeader('extract_caseconv.py') |
|||
genc.emitArray(re_canon_lookup, opts.table_name_re_canon_lookup, size=len(re_canon_lookup), typename='duk_uint16_t', intvalues=True, const=True) |
|||
f = open(opts.out_source, 'wb') |
|||
f.write(genc.getString()) |
|||
f.close() |
|||
|
|||
genc = dukutil.GenerateC() |
|||
genc.emitHeader('extract_caseconv.py') |
|||
genc.emitLine('extern const duk_uint16_t %s[%d];' % (opts.table_name_re_canon_lookup, len(re_canon_lookup))) |
|||
f = open(opts.out_header, 'wb') |
|||
f.write(genc.getString()) |
|||
f.close() |
|||
elif opts.command == 're_canon_bitmap': |
|||
# N-codepoint block bitmap for skipping continuous codepoint blocks |
|||
# quickly. |
|||
t = clonedict(uc) |
|||
re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t) |
|||
|
|||
genc = dukutil.GenerateC() |
|||
genc.emitHeader('extract_caseconv.py') |
|||
genc.emitArray(re_canon_bitmap['data'], opts.table_name_re_canon_bitmap, size=len(re_canon_bitmap['data']), typename='duk_uint8_t', intvalues=True, const=True) |
|||
f = open(opts.out_source, 'wb') |
|||
f.write(genc.getString()) |
|||
f.close() |
|||
|
|||
genc = dukutil.GenerateC() |
|||
genc.emitHeader('extract_caseconv.py') |
|||
genc.emitDefine('DUK_CANON_BITMAP_BLKSIZE', re_canon_bitmap['block_size']) |
|||
genc.emitDefine('DUK_CANON_BITMAP_BLKSHIFT', re_canon_bitmap['block_shift']) |
|||
genc.emitDefine('DUK_CANON_BITMAP_BLKMASK', re_canon_bitmap['block_mask']) |
|||
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_re_canon_bitmap, len(re_canon_bitmap['data']))) |
|||
f = open(opts.out_header, 'wb') |
|||
f.write(genc.getString()) |
|||
f.close() |
|||
else: |
|||
raise Exception('invalid command: %r' % opts.command) |
|||
|
|||
if __name__ == '__main__': |
|||
main() |
@@ -1,385 +0,0 @@
#!/usr/bin/env python2
#
# Select a set of Unicode characters (based on included/excluded categories
# etc) and write out a compact bitstream for matching a character against
# the set at runtime. This is for the slow path, where we're especially
# concerned with compactness. A C source file with the table is written,
# together with a matching C header.
#
# Unicode categories (such as 'Z') can be used. Two pseudo-categories
# are also available for exclusion only: ASCII and NONBMP. "ASCII"
# category excludes ASCII codepoints which is useful because C code
# typically contains an ASCII fast path so ASCII characters don't need
# to be considered in the Unicode tables. "NONBMP" excludes codepoints
# above U+FFFF which is useful because such codepoints don't need to be
# supported in standard ECMAScript.
#
|||
|
|||
import os |
|||
import sys |
|||
import math |
|||
import optparse |
|||
|
|||
import dukutil |
|||
|
|||
def read_unicode_data(unidata, catsinc, catsexc, filterfunc): |
|||
"Read UnicodeData.txt, including lines matching catsinc unless excluded by catsexc or filterfunc." |
|||
res = [] |
|||
f = open(unidata, 'rb') |
|||
|
|||
def filter_none(cp): |
|||
return True |
|||
if filterfunc is None: |
|||
filterfunc = filter_none |
|||
|
|||
# The Unicode parsing is slow enough to warrant some speedups. |
|||
exclude_cat_exact = {} |
|||
for cat in catsexc: |
|||
exclude_cat_exact[cat] = True |
|||
include_cat_exact = {} |
|||
for cat in catsinc: |
|||
include_cat_exact[cat] = True |
|||
|
|||
for line in f: |
|||
#line = line.strip() |
|||
parts = line.split(';') |
|||
|
|||
codepoint = parts[0] |
|||
if not filterfunc(long(codepoint, 16)): |
|||
continue |
|||
|
|||
category = parts[2] |
|||
if exclude_cat_exact.has_key(category): |
|||
continue # quick reject |
|||
|
|||
rejected = False |
|||
for cat in catsexc: |
|||
if category.startswith(cat) or codepoint == cat: |
|||
rejected = True |
|||
break |
|||
if rejected: |
|||
continue |
|||
|
|||
if include_cat_exact.has_key(category): |
|||
res.append(line) |
|||
continue |
|||
|
|||
accepted = False |
|||
for cat in catsinc: |
|||
if category.startswith(cat) or codepoint == cat: |
|||
accepted = True |
|||
break |
|||
if accepted: |
|||
res.append(line) |
|||
|
|||
f.close() |
|||
|
|||
# Sort based on Unicode codepoint |
|||
def mycmp(a,b): |
|||
t1 = a.split(';') |
|||
t2 = b.split(';') |
|||
n1 = long(t1[0], 16) |
|||
n2 = long(t2[0], 16) |
|||
return cmp(n1, n2) |
|||
|
|||
res.sort(cmp=mycmp) |
|||
|
|||
return res |
|||
|
|||
def scan_ranges(lines): |
|||
"Scan continuous ranges from (filtered) UnicodeData.txt lines." |
|||
ranges = [] |
|||
range_start = None |
|||
prev = None |
|||
|
|||
for line in lines: |
|||
t = line.split(';') |
|||
n = long(t[0], 16) |
|||
if range_start is None: |
|||
range_start = n |
|||
else: |
|||
if n == prev + 1: |
|||
# continue range |
|||
pass |
|||
else: |
|||
ranges.append((range_start, prev)) |
|||
range_start = n |
|||
prev = n |
|||
|
|||
if range_start is not None: |
|||
ranges.append((range_start, prev)) |
|||
|
|||
return ranges |
|||
|
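# For illustration (hypothetical input): if the filtered lines cover exactly
# the codepoints 0x41..0x5A and 0x61..0x7A, scan_ranges() above returns
# [(0x41, 0x5A), (0x61, 0x7A)], i.e. maximal runs of consecutive codepoints.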
|||
def generate_png(lines, fname): |
|||
"Generate an illustrative PNG of the character set." |
|||
from PIL import Image |
|||
|
|||
m = {} |
|||
for line in lines: |
|||
t = line.split(';') |
|||
n = long(t[0], 16) |
|||
m[n] = 1 |
|||
|
|||
codepoints = 0x10ffff + 1 |
|||
width = int(256) |
|||
height = int(math.ceil(float(codepoints) / float(width))) |
|||
im = Image.new('RGB', (width, height)) |
|||
black = (0,0,0) |
|||
white = (255,255,255) |
|||
for cp in xrange(codepoints): |
|||
y = cp / width |
|||
x = cp % width |
|||
|
|||
if m.has_key(long(cp)): |
|||
im.putpixel((x,y), black) |
|||
else: |
|||
im.putpixel((x,y), white) |
|||
|
|||
im.save(fname) |
|||
|
|||
def generate_match_table1(ranges): |
|||
"Unused match table format." |
|||
|
|||
# This is an earlier match table format which is no longer used. |
|||
# IdentifierStart-UnicodeLetter has 445 ranges and generates a |
|||
# match table of 2289 bytes. |
|||
|
|||
data = [] |
|||
prev_re = None |
|||
|
|||
def genrange(rs, re): |
|||
if (rs > re): |
|||
raise Exception('assumption failed: rs=%d re=%d' % (rs, re)) |
|||
|
|||
while True: |
|||
now = re - rs + 1 |
|||
if now > 255: |
|||
now = 255 |
|||
data.append(now) # range now |
|||
data.append(0) # skip 0 |
|||
rs = rs + now |
|||
else: |
|||
data.append(now) # range now |
|||
break |
|||
|
|||
def genskip(ss, se): |
|||
if (ss > se): |
|||
raise Exception('assumption failed: ss=%d se=%s' % (ss, se)) |
|||
|
|||
while True: |
|||
now = se - ss + 1 |
|||
if now > 255: |
|||
now = 255 |
|||
data.append(now) # skip now |
|||
data.append(0) # range 0 |
|||
ss = ss + now |
|||
else: |
|||
data.append(now) # skip now |
|||
break |
|||
|
|||
for rs, re in ranges: |
|||
if prev_re is not None: |
|||
genskip(prev_re + 1, rs - 1) |
|||
genrange(rs, re) |
|||
prev_re = re |
|||
|
|||
num_entries = len(data) |
|||
|
|||
# header: start of first range |
|||
# num entries |
|||
hdr = [] |
|||
hdr.append(ranges[0][0] >> 8) # XXX: check that not 0x10000 or over |
|||
hdr.append(ranges[0][1] & 0xff) |
|||
hdr.append(num_entries >> 8) |
|||
hdr.append(num_entries & 0xff) |
|||
|
|||
return hdr + data |
|||
|
|||
def generate_match_table2(ranges): |
|||
"Unused match table format." |
|||
|
|||
# Another attempt at a match table which is also unused. |
|||
# Total tables for all current classes is now 1472 bytes. |
|||
|
|||
data = [] |
|||
|
|||
def enc(x): |
|||
while True: |
|||
if x < 0x80: |
|||
data.append(x) |
|||
break |
|||
data.append(0x80 + (x & 0x7f)) |
|||
x = x >> 7 |
|||
|
|||
prev_re = 0 |
|||
|
|||
for rs, re in ranges: |
|||
r1 = rs - prev_re # 1 or above (no unjoined ranges) |
|||
r2 = re - rs # 0 or above |
|||
enc(r1) |
|||
enc(r2) |
|||
prev_re = re |
|||
|
|||
enc(0) # end marker |
|||
|
|||
return data |
|||
|
|||
def generate_match_table3(ranges): |
|||
"Current match table format." |
|||
|
|||
# Yet another attempt, similar to generate_match_table2 except |
|||
# in packing format. |
|||
# |
|||
# Total match size now (at time of writing): 1194 bytes. |
|||
# |
|||
# This is the current encoding format used in duk_lexer.c. |
|||
|
|||
be = dukutil.BitEncoder() |
|||
|
|||
freq = [0] * (0x10ffff + 1) # informative |
|||
|
|||
def enc(x): |
|||
freq[x] += 1 |
|||
|
|||
if x <= 0x0e: |
|||
# 4-bit encoding |
|||
be.bits(x, 4) |
|||
return |
|||
x -= 0x0e + 1 |
|||
if x <= 0xfd: |
|||
# 12-bit encoding |
|||
be.bits(0x0f, 4) |
|||
be.bits(x, 8) |
|||
return |
|||
x -= 0xfd + 1 |
|||
if x <= 0xfff: |
|||
# 24-bit encoding |
|||
be.bits(0x0f, 4) |
|||
be.bits(0xfe, 8) |
|||
be.bits(x, 12) |
|||
return |
|||
x -= 0xfff + 1 |
|||
if True: |
|||
# 36-bit encoding |
|||
be.bits(0x0f, 4) |
|||
be.bits(0xff, 8) |
|||
be.bits(x, 24) |
|||
return |
|||
|
|||
raise Exception('cannot encode') |
|||
|
|||
prev_re = 0 |
|||
|
|||
for rs, re in ranges: |
|||
r1 = rs - prev_re # 1 or above (no unjoined ranges) |
|||
r2 = re - rs # 0 or above |
|||
enc(r1) |
|||
enc(r2) |
|||
prev_re = re |
|||
|
|||
enc(0) # end marker |
|||
|
|||
data, nbits = be.getBytes(), be.getNumBits() |
|||
return data, freq |
|||
|
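# Worked examples of the variable-length encoding in enc() above:
#
#   x = 5  -> emitted as the single 4-bit value 0x5.
#   x = 20 -> 20 - 15 = 5, emitted as the 4-bit escape 0xf followed by the
#             8-bit value 0x05 (12 bits total).
#   x = 0  -> the 4-bit value 0x0, used as the end marker.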
|||
def main(): |
|||
parser = optparse.OptionParser() |
|||
parser.add_option('--unicode-data', dest='unicode_data') # UnicodeData.txt |
|||
parser.add_option('--special-casing', dest='special_casing') # SpecialCasing.txt |
|||
parser.add_option('--include-categories', dest='include_categories') |
|||
parser.add_option('--exclude-categories', dest='exclude_categories', default='NONE') |
|||
parser.add_option('--out-source', dest='out_source') |
|||
parser.add_option('--out-header', dest='out_header') |
|||
parser.add_option('--out-png', dest='out_png') |
|||
parser.add_option('--table-name', dest='table_name', default='match_table') |
|||
(opts, args) = parser.parse_args() |
|||
|
|||
unidata = opts.unicode_data |
|||
catsinc = [] |
|||
if opts.include_categories != '': |
|||
catsinc = opts.include_categories.split(',') |
|||
catsexc = [] |
|||
if opts.exclude_categories != 'NONE': |
|||
catsexc = opts.exclude_categories.split(',') |
|||
|
|||
print 'CATSEXC: %s' % repr(catsexc) |
|||
print 'CATSINC: %s' % repr(catsinc) |
|||
|
|||
# pseudocategories |
|||
filter_ascii = ('ASCII' in catsexc) |
|||
filter_nonbmp = ('NONBMP' in catsexc) |
|||
|
|||
# Read raw result |
|||
def filter1(x): |
|||
if filter_ascii and x <= 0x7f: |
|||
# exclude ascii |
|||
return False |
|||
if filter_nonbmp and x >= 0x10000: |
|||
# exclude non-bmp |
|||
return False |
|||
return True |
|||
|
|||
print('read unicode data') |
|||
uni_filtered = read_unicode_data(unidata, catsinc, catsexc, filter1) |
|||
print('done reading unicode data') |
|||
|
|||
# Raw output |
|||
#print('RAW OUTPUT:') |
|||
#print('===========') |
|||
#print('\n'.join(uni_filtered)) |
|||
|
|||
# Scan ranges |
|||
#print('') |
|||
#print('RANGES:') |
|||
#print('=======') |
|||
ranges = scan_ranges(uni_filtered) |
|||
#for i in ranges: |
|||
# if i[0] == i[1]: |
|||
# print('0x%04x' % i[0]) |
|||
# else: |
|||
# print('0x%04x ... 0x%04x' % (i[0], i[1])) |
|||
#print('') |
|||
print('%d ranges total' % len(ranges)) |
|||
|
|||
# Generate match table |
|||
#print('') |
|||
#print('MATCH TABLE:') |
|||
#print('============') |
|||
#matchtable1 = generate_match_table1(ranges) |
|||
#matchtable2 = generate_match_table2(ranges) |
|||
matchtable3, freq = generate_match_table3(ranges) |
|||
#print 'match table: %s' % repr(matchtable3) |
|||
print 'match table length: %d bytes' % len(matchtable3) |
|||
print 'encoding freq:' |
|||
for i in xrange(len(freq)): |
|||
if freq[i] == 0: |
|||
continue |
|||
print ' %6d: %d' % (i, freq[i]) |
|||
|
|||
print('') |
|||
print('MATCH C TABLE -> file %s' % repr(opts.out_header)) |
|||
|
|||
# Create C source and header files |
|||
genc = dukutil.GenerateC() |
|||
genc.emitHeader('extract_chars.py') |
|||
genc.emitArray(matchtable3, opts.table_name, size=len(matchtable3), typename='duk_uint8_t', intvalues=True, const=True) |
|||
if opts.out_source is not None: |
|||
f = open(opts.out_source, 'wb') |
|||
f.write(genc.getString()) |
|||
f.close() |
|||
|
|||
genc = dukutil.GenerateC() |
|||
genc.emitHeader('extract_chars.py') |
|||
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name, len(matchtable3))) |
|||
if opts.out_header is not None: |
|||
f = open(opts.out_header, 'wb') |
|||
f.write(genc.getString()) |
|||
f.close() |
|||
|
|||
# Image (for illustrative purposes only) |
|||
if opts.out_png is not None: |
|||
generate_png(uni_filtered, opts.out_png) |
|||
|
|||
if __name__ == '__main__': |
|||
main() |
File diff suppressed because it is too large
@@ -1,5 +0,0 @@
import os, sys, json, yaml

if __name__ == '__main__':
    # Use safe_dump() instead of dump() to avoid tags like "!!python/unicode".
    print(yaml.safe_dump(json.load(sys.stdin), default_flow_style=False))
@@ -1,51 +0,0 @@
|||
#!/usr/bin/env python2 |
|||
# |
|||
# UnicodeData.txt may contain ranges in addition to individual characters. |
|||
# Unpack the ranges into individual characters for the other scripts to use. |
|||
# |
|||
|
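# UnicodeData.txt represents large ranges with paired "First>"/"Last>"
# entries, for example (fields abbreviated):
#
#   AC00;<Hangul Syllable, First>;Lo;...
#   D7A3;<Hangul Syllable, Last>;Lo;...
#
# main() below expands such a pair into one line per codepoint so that the
# other scripts can treat every codepoint uniformly.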
|||
import os |
|||
import sys |
|||
import optparse |
|||
|
|||
def main(): |
|||
parser = optparse.OptionParser() |
|||
parser.add_option('--unicode-data', dest='unicode_data') |
|||
parser.add_option('--output', dest='output') |
|||
parser.add_option('--quiet', dest='quiet', action='store_true', default=False, help='Suppress info messages (show warnings)') |
|||
parser.add_option('--verbose', dest='verbose', action='store_true', default=False, help='Show verbose debug messages') |
|||
(opts, args) = parser.parse_args() |
|||
assert(opts.unicode_data is not None) |
|||
assert(opts.output is not None) |
|||
|
|||
f_in = open(opts.unicode_data, 'rb') |
|||
f_out = open(opts.output, 'wb') |
|||
while True: |
|||
line = f_in.readline() |
|||
if line == '' or line == '\n': |
|||
break |
|||
parts = line.split(';') # keep newline |
|||
if parts[1].endswith('First>'): |
|||
line2 = f_in.readline() |
|||
parts2 = line2.split(';') |
|||
if not parts2[1].endswith('Last>'): |
|||
raise Exception('cannot parse range') |
|||
cp1 = long(parts[0], 16) |
|||
cp2 = long(parts2[0], 16) |
|||
|
|||
tmp = parts[1:] |
|||
tmp[0] = '-""-' |
|||
suffix = ';'.join(tmp) |
|||
f_out.write(line) |
|||
for i in xrange(cp1 + 1, cp2): |
|||
f_out.write('%04X;%s' % (i, suffix)) |
|||
f_out.write(line2) |
|||
else: |
|||
f_out.write(line) |
|||
|
|||
f_in.close() |
|||
f_out.flush() |
|||
f_out.close() |
|||
|
|||
if __name__ == '__main__': |
|||
main() |
@@ -1,4 +0,0 @@
import os, sys, json, yaml

if __name__ == '__main__':
    print(json.dumps(yaml.safe_load(sys.stdin)))