
Remove some unused Python tooling

pull/2505/head
Sami Vaarala 2 years ago
commit 8b1999473b
  1. doc/objects-in-code-section.rst (2 lines changed)
  2. src-input/duk_bi_buffer.c (2 lines changed)
  3. src-input/duk_bi_date.c (2 lines changed)
  4. src-input/duk_bi_math.c (4 lines changed)
  5. src-input/duk_bi_protos.h (2 lines changed)
  6. src-input/duk_heap_alloc.c (2 lines changed)
  7. src-input/duk_hthread_builtins.c (12 lines changed)
  8. src-input/duk_lexer.c (2 lines changed)
  9. src-input/duk_util_bitdecoder.c (4 lines changed)
  10. src-input/duktape.h.in (4 lines changed)
  11. tools/combine_src.py (271 lines removed)
  12. tools/configure.py (2 lines changed)
  13. tools/create_spdx_license.py (246 lines removed)
  14. tools/extract_caseconv.py (733 lines removed)
  15. tools/extract_chars.py (385 lines removed)
  16. tools/genbuiltins.py (3221 lines changed)
  17. tools/json2yaml.py (5 lines removed)
  18. tools/prepare_unicode_data.py (51 lines removed)
  19. tools/yaml2json.py (4 lines removed)

doc/objects-in-code-section.rst (2 lines changed)

@@ -135,7 +135,7 @@ duk_hobject
 duk_hobject properties allocation
 ---------------------------------
-* There are three layouts: static initializers generated by ``genbuiltins.py``
+* There are three layouts: static initializers generated by configure tooling
 must duplicate each and choose active layout at compile time.
 * Property key pointer is *not* compressed at the moment.

src-input/duk_bi_buffer.c (2 lines changed)

@@ -680,7 +680,7 @@ DUK_INTERNAL duk_ret_t duk_bi_typedarray_constructor(duk_hthread *thr) {
 duk_require_constructor_call(thr);
 /* We could fit built-in index into magic but that'd make the magic
- * number dependent on built-in numbering (genbuiltins.py doesn't
+ * number dependent on built-in numbering (configure tooling doesn't
 * handle that yet). So map both class and prototype from the
 * element type.
 */

src-input/duk_bi_date.c (2 lines changed)

@@ -1374,7 +1374,7 @@ DUK_LOCAL void duk__set_parts_from_args(duk_hthread *thr, duk_double_t *dparts,
 * magic value is set to an index pointing to the array of control flags
 * below.
 *
- * This must be kept in strict sync with genbuiltins.py!
+ * This must be kept in strict sync with configure tooling!
 */
 static duk_uint16_t duk__date_magics[] = {

src-input/duk_bi_math.c (4 lines changed)

@@ -258,7 +258,7 @@ DUK_LOCAL double duk__atan2_fixed(double x, double y) {
 }
 #endif /* DUK_USE_AVOID_PLATFORM_FUNCPTRS */
-/* order must match constants in genbuiltins.py */
+/* order must match constants in configure tooling */
 DUK_LOCAL const duk__one_arg_func duk__one_arg_funcs[] = {
 #if defined(DUK_USE_AVOID_PLATFORM_FUNCPTRS)
 duk__fabs, duk__acos, duk__asin, duk__atan, duk__ceil, duk__cos, duk__exp,
@@ -275,7 +275,7 @@ DUK_LOCAL const duk__one_arg_func duk__one_arg_funcs[] = {
 #endif /* DUK_USE_AVOID_PLATFORM_FUNCPTRS */
 };
-/* order must match constants in genbuiltins.py */
+/* order must match constants in configure tooling */
 DUK_LOCAL const duk__two_arg_func duk__two_arg_funcs[] = {
 #if defined(DUK_USE_AVOID_PLATFORM_FUNCPTRS)
 duk__atan2_fixed,

src-input/duk_bi_protos.h (2 lines changed)

@@ -1,6 +1,6 @@
 /*
 * Prototypes for built-in functions not automatically covered by the
- * header declarations emitted by genbuiltins.py.
+ * header declarations emitted by configure tooling.
 */
 #if !defined(DUK_BUILTIN_PROTOS_H_INCLUDED)

src-input/duk_heap_alloc.c (2 lines changed)

@@ -448,7 +448,7 @@ DUK_INTERNAL void duk_heap_free(duk_heap *heap) {
 /*
 * Allocate a heap.
 *
- * String table is initialized with built-in strings from genbuiltins.py,
+ * String table is initialized with built-in strings from configure tooling
 * either by dynamically creating the strings or by referring to ROM strings.
 */

src-input/duk_hthread_builtins.c (12 lines changed)

@@ -7,7 +7,7 @@
 #include "duk_internal.h"
 /*
- * Encoding constants, must match genbuiltins.py
+ * Encoding constants, must match configure tooling.
 */
 #define DUK__PROP_FLAGS_BITS 3
@@ -28,7 +28,7 @@
 /*
 * Create built-in objects by parsing an init bitstream generated
- * by genbuiltins.py.
+ * by configure tooling.
 */
 #if defined(DUK_USE_ROM_OBJECTS)
@@ -183,7 +183,7 @@ DUK_LOCAL void duk__push_double(duk_hthread *thr, duk_bitdecoder_ctx *bd) {
 for (i = 0; i < 8; i++) {
 /* Encoding endianness must match target memory layout,
- * build scripts and genbuiltins.py must ensure this.
+ * build scripts and configure tooling must ensure this.
 */
 du.uc[i] = (duk_uint8_t) duk_bd_decode(bd, 8);
 }
@@ -366,8 +366,8 @@ DUK_INTERNAL void duk_hthread_create_builtin_objects(duk_hthread *thr) {
 }
 /*
- * Then decode the builtins init data (see genbuiltins.py) to
- * init objects. Internal prototypes are set at this stage,
+ * Then decode the builtins init data (see configure tooling)
+ * to init objects. Internal prototypes are set at this stage,
 * with thr->builtins[] populated.
 */
@@ -532,7 +532,7 @@ DUK_INTERNAL void duk_hthread_create_builtin_objects(duk_hthread *thr) {
 }
 /* Writable flag doesn't make sense for an accessor. */
-DUK_ASSERT((defprop_flags & DUK_PROPDESC_FLAG_WRITABLE) == 0); /* genbuiltins.py ensures */
+DUK_ASSERT((defprop_flags & DUK_PROPDESC_FLAG_WRITABLE) == 0); /* configure tooling ensures */
 defprop_flags &= ~(DUK_DEFPROP_HAVE_VALUE | DUK_DEFPROP_HAVE_WRITABLE);
 defprop_flags |= DUK_DEFPROP_HAVE_ENUMERABLE | DUK_DEFPROP_HAVE_CONFIGURABLE;
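
For context on the duk__push_double() hunk above: the decoder reads a double constant as eight 8-bit groups whose byte order must match the target's memory layout. A minimal Python 3 sketch of the matching encode side is shown below; emit_double_bits() and its (value, width) output format are illustrative assumptions, not the actual generator API.

import struct

def emit_double_bits(value, big_endian=True):
    """Return (bits, width) pairs for one IEEE-754 double constant."""
    fmt = '>d' if big_endian else '<d'
    raw = struct.pack(fmt, value)       # 8 bytes, IEEE-754 binary64
    return [(byte, 8) for byte in raw]  # the decode loop reads 8 bits per byte

if __name__ == '__main__':
    for bits, width in emit_double_bits(0.5):
        print('emit %3d as an %d-bit group' % (bits, width))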

src-input/duk_lexer.c (2 lines changed)

@@ -1558,7 +1558,7 @@ slow_path:
 /*
 * Interned identifier is compared against reserved words, which are
- * currently interned into the heap context. See genbuiltins.py.
+ * currently interned into the heap context. See configure tooling.
 *
 * Note that an escape in the identifier disables recognition of
 * keywords; e.g. "\u0069f = 1;" is a valid statement (assigns to

src-input/duk_util_bitdecoder.c (4 lines changed)

@@ -78,7 +78,7 @@ DUK_INTERNAL duk_uint32_t duk_bd_decode_varuint(duk_bitdecoder_ctx *ctx) {
 duk_small_uint_t t;
 /* The bit encoding choices here are based on manual testing against
- * the actual varuints generated by genbuiltins.py.
+ * the actual varuints generated by configure tooling.
 */
 switch (duk_bd_decode(ctx, 2)) {
 case 0:
@@ -96,7 +96,7 @@ DUK_INTERNAL duk_uint32_t duk_bd_decode_varuint(duk_bitdecoder_ctx *ctx) {
 }
 }
-/* Decode a bit packed string from a custom format used by genbuiltins.py.
+/* Decode a bit packed string from a custom format used by configure tooling.
 * This function is here because it's used for both heap and thread inits.
 * Caller must supply the output buffer whose size is NOT checked!
 */

src-input/duktape.h.in (4 lines changed)

@@ -1250,7 +1250,7 @@ DUK_EXTERNAL_DECL duk_double_t duk_components_to_time(duk_context *ctx, duk_time
 * depend on the specific ordering, so change with care. 16 bits are not
 * enough for all parts (year, specifically).
 *
- * Must be in-sync with genbuiltins.py.
+ * Must be in-sync with configure tooling.
 */
 #define DUK_DATE_IDX_YEAR 0 /* year */
 #define DUK_DATE_IDX_MONTH 1 /* month: 0 to 11 */
@@ -1270,7 +1270,7 @@ DUK_EXTERNAL_DECL duk_double_t duk_components_to_time(duk_context *ctx, duk_time
 * The unused top bits of the flags field are also used to pass values
 * to helpers (duk__get_part_helper() and duk__set_part_helper()).
 *
- * Must be in-sync with genbuiltins.py.
+ * Must be in-sync with configure tooling.
 */
 /* NOTE: when writing a Date provider you only need a few specific

tools/combine_src.py (271 lines removed)

@@ -1,271 +0,0 @@
#!/usr/bin/env python2
#
# Combine a set of a source files into a single C file.
#
# Overview of the process:
#
# * Parse user supplied C files. Add automatic #undefs at the end
# of each C file to avoid defines bleeding from one file to another.
#
# * Combine the C files in specified order. If sources have ordering
# dependencies (depends on application), order may matter.
#
# * Process #include statements in the combined source, categorizing
# them either as "internal" (found in specified include path) or
# "external". Internal includes, unless explicitly excluded, are
# inlined into the result while external includes are left as is.
# Duplicate internal #include statements are replaced with a comment.
#
# At every step, source and header lines are represented with explicit
# line objects which keep track of original filename and line. The
# output contains #line directives, if requested, to ensure error
# throwing and other diagnostic info will work in a useful manner when
# deployed. It's also possible to generate a combined source with no
# #line directives.
#
# Making the process deterministic is important, so that if users have
# diffs that they apply to the combined source, such diffs would apply
# for as long as possible.
#
# Limitations and notes:
#
# * While there are automatic #undef's for #define's introduced in each
# C file, it's not possible to "undefine" structs, unions, etc. If
# there are structs/unions/typedefs with conflicting names, these
# have to be resolved in the source files first.
#
# * Because duplicate #include statements are suppressed, currently
# assumes #include statements are not conditional.
#
# * A system header might be #include'd in multiple source files with
# different feature defines (like _BSD_SOURCE). Because the #include
# file will only appear once in the resulting source, the first
# occurrence wins. The result may not work correctly if the feature
# defines must actually be different between two or more source files.
#
import logging
import sys
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='%(name)-21s %(levelname)-7s %(message)s')
logger = logging.getLogger('combine_src.py')
logger.setLevel(logging.INFO)
import os
import re
import json
import optparse
import logging
# Include path for finding include files which are amalgamated.
include_paths = []
# Include files specifically excluded from being inlined.
include_excluded = []
class File:
filename_full = None
filename = None
lines = None
def __init__(self, filename, lines):
self.filename = os.path.basename(filename)
self.filename_full = filename
self.lines = lines
class Line:
filename_full = None
filename = None
lineno = None
data = None
def __init__(self, filename, lineno, data):
self.filename = os.path.basename(filename)
self.filename_full = filename
self.lineno = lineno
self.data = data
def readFile(filename):
lines = []
with open(filename, 'rb') as f:
lineno = 0
for line in f:
lineno += 1
if len(line) > 0 and line[-1] == '\n':
line = line[:-1]
lines.append(Line(filename, lineno, line))
return File(filename, lines)
def lookupInclude(incfn):
re_sep = re.compile(r'/|\\')
inccomp = re.split(re_sep, incfn) # split include path, support / and \
for path in include_paths:
fn = apply(os.path.join, [ path ] + inccomp)
if os.path.exists(fn):
return fn # Return full path to first match
return None
def addAutomaticUndefs(f):
defined = {}
re_def = re.compile(r'#define\s+(\w+).*$')
re_undef = re.compile(r'#undef\s+(\w+).*$')
for line in f.lines:
m = re_def.match(line.data)
if m is not None:
#logger.debug('DEFINED: %s' % repr(m.group(1)))
defined[m.group(1)] = True
m = re_undef.match(line.data)
if m is not None:
# Could just ignore #undef's here: we'd then emit
# reliable #undef's (though maybe duplicates) at
# the end.
#logger.debug('UNDEFINED: %s' % repr(m.group(1)))
if defined.has_key(m.group(1)):
del defined[m.group(1)]
# Undefine anything that seems to be left defined. This not a 100%
# process because some #undef's might be conditional which we don't
# track at the moment. Note that it's safe to #undef something that's
# not defined.
keys = sorted(defined.keys()) # deterministic order
if len(keys) > 0:
#logger.debug('STILL DEFINED: %r' % repr(defined.keys()))
f.lines.append(Line(f.filename, len(f.lines) + 1, ''))
f.lines.append(Line(f.filename, len(f.lines) + 1, '/* automatic undefs */'))
for k in keys:
logger.debug('automatic #undef for ' + k)
f.lines.append(Line(f.filename, len(f.lines) + 1, '#undef %s' % k))
def createCombined(files, prologue_filename, line_directives):
res = []
line_map = [] # indicate combined source lines where uncombined file/line would change
metadata = {
'line_map': line_map
}
emit_state = [ None, None ] # curr_filename, curr_lineno
def emit(line):
if isinstance(line, (str, unicode)):
res.append(line)
emit_state[1] += 1
else:
if line.filename != emit_state[0] or line.lineno != emit_state[1]:
if line_directives:
res.append('#line %d "%s"' % (line.lineno, line.filename))
line_map.append({ 'original_file': line.filename,
'original_line': line.lineno,
'combined_line': len(res) + 1 })
res.append(line.data)
emit_state[0] = line.filename
emit_state[1] = line.lineno + 1
included = {} # headers already included
if prologue_filename is not None:
with open(prologue_filename, 'rb') as f:
for line in f.read().split('\n'):
res.append(line)
re_inc = re.compile(r'^#include\s+(<|\")(.*?)(>|\").*$')
# Process a file, appending it to the result; the input may be a
# source or an include file. #include directives are handled
# recursively.
def processFile(f):
logger.debug('Process file: ' + f.filename)
for line in f.lines:
if not line.data.startswith('#include'):
emit(line)
continue
m = re_inc.match(line.data)
if m is None:
raise Exception('Couldn\'t match #include line: %s' % repr(line.data))
incpath = m.group(2)
if incpath in include_excluded:
# Specific include files excluded from the
# inlining / duplicate suppression process.
emit(line) # keep as is
continue
if included.has_key(incpath):
# We suppress duplicate includes, both internal and
# external, based on the assumption that includes are
# not behind #if defined() checks. This is the case for
# Duktape (except for the include files excluded).
emit('/* #include %s -> already included */' % incpath)
continue
included[incpath] = True
# An include file is considered "internal" and is amalgamated
# if it is found in the include path provided by the user.
incfile = lookupInclude(incpath)
if incfile is not None:
logger.debug('Include considered internal: %s -> %s' % (repr(line.data), repr(incfile)))
emit('/* #include %s */' % incpath)
processFile(readFile(incfile))
else:
logger.debug('Include considered external: %s' % repr(line.data))
emit(line) # keep as is
for f in files:
processFile(f)
return '\n'.join(res) + '\n', metadata
def main():
global include_paths, include_excluded
parser = optparse.OptionParser()
parser.add_option('--include-path', dest='include_paths', action='append', default=[], help='Include directory for "internal" includes, can be specified multiple times')
parser.add_option('--include-exclude', dest='include_excluded', action='append', default=[], help='Include file excluded from being considered internal (even if found in include dirs)')
parser.add_option('--prologue', dest='prologue', help='Prologue to prepend to start of file')
parser.add_option('--output-source', dest='output_source', help='Output source filename')
parser.add_option('--output-metadata', dest='output_metadata', help='Output metadata filename')
parser.add_option('--line-directives', dest='line_directives', action='store_true', default=False, help='Use #line directives in combined source')
parser.add_option('--quiet', dest='quiet', action='store_true', default=False, help='Suppress info messages (show warnings)')
parser.add_option('--verbose', dest='verbose', action='store_true', default=False, help='Show verbose debug messages')
(opts, args) = parser.parse_args()
assert(opts.include_paths is not None)
include_paths = opts.include_paths # global for easy access
include_excluded = opts.include_excluded
assert(opts.output_source)
assert(opts.output_metadata)
# Log level.
if opts.quiet:
logger.setLevel(logging.WARNING)
elif opts.verbose:
logger.setLevel(logging.DEBUG)
# Read input files, add automatic #undefs
sources = args
files = []
for fn in sources:
res = readFile(fn)
logger.debug('Add automatic undefs for: ' + fn)
addAutomaticUndefs(res)
files.append(res)
combined_source, metadata = \
createCombined(files, opts.prologue, opts.line_directives)
with open(opts.output_source, 'wb') as f:
f.write(combined_source)
with open(opts.output_metadata, 'wb') as f:
f.write(json.dumps(metadata, indent=4))
logger.info('Combined %d source files, %d bytes written to %s' % (len(files), len(combined_source), opts.output_source))
if __name__ == '__main__':
main()
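
The header comment of the removed combine_src.py describes the amalgamation steps in prose. As a rough illustration only, the include inlining and duplicate suppression it performed could be sketched in Python 3 as follows; this simplified sketch uses assumed names, omits the automatic #undef, prologue and #line handling, and is not a replacement for the removed tool.

import os
import re

INCLUDE_RE = re.compile(r'^#include\s+[<"](.*?)[>"]')

def combine(source_files, include_dirs, out_lines, seen=None):
    """Inline 'internal' headers once; keep external includes as-is."""
    seen = set() if seen is None else seen
    for path in source_files:
        with open(path, 'r') as f:
            for line in f:
                m = INCLUDE_RE.match(line)
                if m is None:
                    out_lines.append(line.rstrip('\n'))
                    continue
                inc = m.group(1)
                if inc in seen:
                    # Duplicate include, internal or external: keep a comment only.
                    out_lines.append('/* #include %s -> already included */' % inc)
                    continue
                seen.add(inc)
                candidates = [os.path.join(d, inc) for d in include_dirs]
                internal = next((c for c in candidates if os.path.exists(c)), None)
                if internal is not None:
                    out_lines.append('/* #include %s */' % inc)
                    combine([internal], include_dirs, out_lines, seen)  # inline recursively
                else:
                    out_lines.append(line.rstrip('\n'))                 # external, keep as-is
    return out_lines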

tools/configure.py (2 lines changed)

@@ -101,7 +101,7 @@ def main():
 parser.add_option('--separate-sources', dest='separate_sources', action='store_true', default=False, help='Output separate sources instead of combined source (default is combined)')
 parser.add_option('--line-directives', dest='line_directives', action='store_true', default=False, help='Output #line directives in combined source (default is false)')
-# Options forwarded to genbuiltins.py.
+# Options for built-ins.
 parser.add_option('--rom-support', dest='rom_support', action='store_true', help='Add support for ROM strings/objects (increases duktape.c size considerably)')
 parser.add_option('--rom-auto-lightfunc', dest='rom_auto_lightfunc', action='store_true', default=False, help='Convert ROM built-in function properties into lightfuncs automatically whenever possible')
 parser.add_option('--user-builtin-metadata', dest='obsolete_builtin_metadata', default=None, help=optparse.SUPPRESS_HELP)
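
For reference, the options in the hunk above are ordinary command-line flags of configure.py; an invocation might look like the line below. The --output-directory option is an assumption based on typical configure.py usage, while the remaining flags appear in the diff above.

$ python2 tools/configure.py --output-directory /tmp/duk-prepared --rom-support --rom-auto-lightfunc --line-directives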

tools/create_spdx_license.py (246 lines removed)

@@ -1,246 +0,0 @@
#!/usr/bin/env python2
#
# Helper to create an SPDX license file (http://spdx.org)
#
# This must be executed when the dist/ directory is otherwise complete,
# except for the SPDX license, so that the file lists and such contained
# in the SPDX license will be correct.
#
# The utility outputs RDF/XML to specified file:
#
# $ python create_spdx_license.py /tmp/license.spdx
#
# Then, validate with SPDXViewer and SPDXTools:
#
# $ java -jar SPDXViewer.jar /tmp/license.spdx
# $ java -jar java -jar spdx-tools-1.2.5-jar-with-dependencies.jar RdfToHtml /tmp/license.spdx /tmp/license.html
#
# Finally, copy to dist:
#
# $ cp /tmp/license.spdx dist/license.spdx
#
# SPDX FAQ indicates there is no standard extension for an SPDX license file
# but '.spdx' is a common practice.
#
# The algorithm to compute a "verification code", implemented in this file,
# can be verified as follows:
#
# # build dist tar.xz, copy to /tmp/duktape-N.N.N.tar.xz
# $ cd /tmp
# $ tar xvfJ duktape-N.N.N.tar.xz
# $ rm duktape-N.N.N/license.spdx # remove file excluded from verification code
# $ java -jar spdx-tools-1.2.5-jar-with-dependencies.jar GenerateVerificationCode /tmp/duktape-N.N.N/
#
# Compare the resulting verification code manually with the one in license.spdx.
#
# Resources:
#
# - http://spdx.org/about-spdx/faqs
# - http://wiki.spdx.org/view/Technical_Team/Best_Practices
#
import os
import sys
import re
import datetime
import sha
import rdflib
from rdflib import URIRef, BNode, Literal, Namespace
RDF = Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
RDFS = Namespace('http://www.w3.org/2000/01/rdf-schema#')
XSD = Namespace('http://www.w3.org/2001/XMLSchema#')
SPDX = Namespace('http://spdx.org/rdf/terms#')
DOAP = Namespace('http://usefulinc.com/ns/doap#')
DUKTAPE = Namespace('http://duktape.org/rdf/terms#')
def checksumFile(g, filename):
f = open(filename, 'rb')
d = f.read()
f.close()
shasum = sha.sha(d).digest().encode('hex').lower()
csum_node = BNode()
g.add((csum_node, RDF.type, SPDX.Checksum))
g.add((csum_node, SPDX.algorithm, SPDX.checksumAlgorithm_sha1))
g.add((csum_node, SPDX.checksumValue, Literal(shasum)))
return csum_node
def computePackageVerification(g, dirname, excluded):
# SPDX 1.2 Section 4.7
# The SPDXTools command "GenerateVerificationCode" can be used to
# check the verification codes created. Note that you must manually
# remove "license.spdx" from the unpacked dist directory before
# computing the verification code.
verify_node = BNode()
hashes = []
for dirpath, dirnames, filenames in os.walk(dirname):
for fn in filenames:
full_fn = os.path.join(dirpath, fn)
f = open(full_fn, 'rb')
d = f.read()
f.close()
if full_fn in excluded:
#print('excluded in verification: ' + full_fn)
continue
#print('included in verification: ' + full_fn)
file_sha1 = sha.sha(d).digest().encode('hex').lower()
hashes.append(file_sha1)
#print(repr(hashes))
hashes.sort()
#print(repr(hashes))
verify_code = sha.sha(''.join(hashes)).digest().encode('hex').lower()
for fn in excluded:
g.add((verify_node, SPDX.packageVerificationCodeExcludedFile, Literal(fn)))
g.add((verify_node, SPDX.packageVerificationCodeValue, Literal(verify_code)))
return verify_node
def fileType(filename):
ign, ext = os.path.splitext(filename)
if ext in [ '.c', '.h', '.js' ]:
return SPDX.fileType_source
else:
return SPDX.fileType_other
def getDuktapeVersion():
f = open('./src/duktape.h')
re_ver = re.compile(r'^#define\s+DUK_VERSION\s+(\d+)L$')
for line in f:
line = line.strip()
m = re_ver.match(line)
if m is None:
continue
ver = int(m.group(1))
return '%d.%d.%d' % ((ver / 10000) % 100,
(ver / 100) % 100,
ver % 100)
raise Exception('could not figure out Duktape version')
def main():
outfile = sys.argv[1]
if not os.path.exists('CONTRIBUTING.md') and os.path.exists('tests/ecmascript'):
sys.stderr.write('Invalid CWD, must be in Duktape root with dist/ built')
sys.exit(1)
os.chdir('dist')
if not os.path.exists('Makefile.cmdline'):
sys.stderr.write('Invalid CWD, must be in Duktape root with dist/ built')
sys.exit(1)
duktape_version = getDuktapeVersion()
duktape_pkgname = 'duktape-' + duktape_version + '.tar.xz'
now = datetime.datetime.utcnow()
now = datetime.datetime(now.year, now.month, now.day, now.hour, now.minute, now.second)
creation_date = Literal(now.isoformat() + 'Z', datatype=XSD.dateTime)
duktape_org = Literal('Organization: duktape.org')
mit_license = URIRef('http://spdx.org/licenses/MIT')
duktape_copyright = Literal('Copyright 2013-2017 Duktape authors (see AUTHORS.rst in the Duktape distributable)')
g = rdflib.Graph()
crea_node = BNode()
g.add((crea_node, RDF.type, SPDX.CreationInfo))
g.add((crea_node, RDFS.comment, Literal('')))
g.add((crea_node, SPDX.creator, duktape_org))
g.add((crea_node, SPDX.created, creation_date))
g.add((crea_node, SPDX.licenseListVersion, Literal('1.20'))) # http://spdx.org/licenses/
# 'name' should not include a version number (see best practices)
pkg_node = BNode()
g.add((pkg_node, RDF.type, SPDX.Package))
g.add((pkg_node, SPDX.name, Literal('Duktape')))
g.add((pkg_node, SPDX.versionInfo, Literal(duktape_version)))
g.add((pkg_node, SPDX.packageFileName, Literal(duktape_pkgname)))
g.add((pkg_node, SPDX.supplier, duktape_org))
g.add((pkg_node, SPDX.originator, duktape_org))
g.add((pkg_node, SPDX.downloadLocation, Literal('http://duktape.org/' + duktape_pkgname, datatype=XSD.anyURI)))
g.add((pkg_node, SPDX.homePage, Literal('http://duktape.org/', datatype=XSD.anyURI)))
verify_node = computePackageVerification(g, '.', [ './license.spdx' ])
g.add((pkg_node, SPDX.packageVerificationCode, verify_node))
# SPDX.checksum: omitted because license is inside the package
g.add((pkg_node, SPDX.sourceInfo, Literal('Official duktape.org release built from GitHub repo https://github.com/svaarala/duktape.')))
# NOTE: MIT license alone is sufficient for now, because Duktape, Lua,
# Murmurhash2, and CommonJS (though probably not even relevant for
# licensing) are all MIT.
g.add((pkg_node, SPDX.licenseConcluded, mit_license))
g.add((pkg_node, SPDX.licenseInfoFromFiles, mit_license))
g.add((pkg_node, SPDX.licenseDeclared, mit_license))
g.add((pkg_node, SPDX.licenseComments, Literal('Duktape is copyrighted by its authors and licensed under the MIT license. MurmurHash2 is used internally, it is also under the MIT license. Duktape module loader is based on the CommonJS module loading specification (without sharing any code), CommonJS is under the MIT license.')))
g.add((pkg_node, SPDX.copyrightText, duktape_copyright))
g.add((pkg_node, SPDX.summary, Literal('Duktape ECMAScript interpreter')))
g.add((pkg_node, SPDX.description, Literal('Duktape is an embeddable Javascript engine, with a focus on portability and compact footprint')))
# hasFile properties added separately below
#reviewed_node = BNode()
#g.add((reviewed_node, RDF.type, SPDX.Review))
#g.add((reviewed_node, SPDX.reviewer, XXX))
#g.add((reviewed_node, SPDX.reviewDate, XXX))
#g.add((reviewed_node, RDFS.comment, ''))
spdx_doc = BNode()
g.add((spdx_doc, RDF.type, SPDX.SpdxDocument))
g.add((spdx_doc, SPDX.specVersion, Literal('SPDX-1.2')))
g.add((spdx_doc, SPDX.dataLicense, URIRef('http://spdx.org/licenses/CC0-1.0')))
g.add((spdx_doc, RDFS.comment, Literal('SPDX license for Duktape ' + duktape_version)))
g.add((spdx_doc, SPDX.creationInfo, crea_node))
g.add((spdx_doc, SPDX.describesPackage, pkg_node))
# SPDX.hasExtractedLicensingInfo
# SPDX.reviewed
# SPDX.referencesFile: added below
for dirpath, dirnames, filenames in os.walk('.'):
for fn in filenames:
full_fn = os.path.join(dirpath, fn)
#print('# file: ' + full_fn)
file_node = BNode()
g.add((file_node, RDF.type, SPDX.File))
g.add((file_node, SPDX.fileName, Literal(full_fn)))
g.add((file_node, SPDX.fileType, fileType(full_fn)))
g.add((file_node, SPDX.checksum, checksumFile(g, full_fn)))
# Here we assume that LICENSE.txt provides the actual "in file"
# licensing information, and everything else is implicitly under
# MIT license.
g.add((file_node, SPDX.licenseConcluded, mit_license))
if full_fn == './LICENSE.txt':
g.add((file_node, SPDX.licenseInfoInFile, mit_license))
else:
g.add((file_node, SPDX.licenseInfoInFile, URIRef(SPDX.none)))
# SPDX.licenseComments
g.add((file_node, SPDX.copyrightText, duktape_copyright))
# SPDX.noticeText
# SPDX.artifactOf
# SPDX.fileDependency
# SPDX.fileContributor
# XXX: should referencesFile include all files?
g.add((spdx_doc, SPDX.referencesFile, file_node))
g.add((pkg_node, SPDX.hasFile, file_node))
# Serialize into RDF/XML directly. We could also serialize into
# N-Triples and use external tools (like 'rapper') to get cleaner,
# abbreviated output.
#print('# Duktape SPDX license file (autogenerated)')
#print(g.serialize(format='turtle'))
#print(g.serialize(format='nt'))
f = open(outfile, 'wb')
#f.write(g.serialize(format='rdf/xml'))
f.write(g.serialize(format='xml'))
f.close()
if __name__ == '__main__':
main()
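
The package verification code computed by the removed script follows SPDX 1.2 section 4.7: take the SHA-1 of every packaged file, sort the hex digests, and take the SHA-1 of their concatenation, excluding the license file itself. A short Python 3 restatement of computePackageVerification() above is sketched here for illustration; it is not a drop-in replacement.

import hashlib
import os

def package_verification_code(root, excluded=()):
    """SHA-1 over the sorted SHA-1 digests of all files under root."""
    digests = []
    for dirpath, dirnames, filenames in os.walk(root):
        for fn in filenames:
            full = os.path.join(dirpath, fn)
            if full in excluded:
                continue                      # e.g. the license.spdx file itself
            with open(full, 'rb') as f:
                digests.append(hashlib.sha1(f.read()).hexdigest())
    digests.sort()                            # order-independent result
    return hashlib.sha1(''.join(digests).encode('ascii')).hexdigest()

# Example, mirroring the removed script's exclusion of the license file:
# print(package_verification_code('.', excluded=(os.path.join('.', 'license.spdx'),)))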

tools/extract_caseconv.py (733 lines removed)

@@ -1,733 +0,0 @@
#!/usr/bin/env python2
#
# Extract rules for Unicode case conversion, specifically the behavior
# required by ECMAScript E5 in Sections 15.5.4.16 to 15.5.4.19. The
# bitstream encoded rules are used for the slow path at run time, so
# compactness is favored over speed.
#
# There is no support for context or locale sensitive rules, as they
# are handled directly in C code before consulting tables generated
# here. ECMAScript requires case conversion both with and without
# locale/language specific rules (e.g. String.prototype.toLowerCase()
# and String.prototype.toLocaleLowerCase()), so they are best handled
# in C anyway.
#
# Case conversion rules for ASCII are also excluded as they are handled
# by the C fast path. Rules for non-BMP characters (codepoints above
# U+FFFF) are omitted as they're not required for standard ECMAScript.
#
import os
import sys
import re
import math
import optparse
import dukutil
class UnicodeData:
"""Read UnicodeData.txt into an internal representation."""
def __init__(self, filename):
self.data = self.read_unicode_data(filename)
print('read %d unicode data entries' % len(self.data))
def read_unicode_data(self, filename):
res = []
f = open(filename, 'rb')
for line in f:
if line.startswith('#'):
continue
line = line.strip()
if line == '':
continue
parts = line.split(';')
if len(parts) != 15:
raise Exception('invalid unicode data line')
res.append(parts)
f.close()
# Sort based on Unicode codepoint.
def mycmp(a,b):
return cmp(long(a[0], 16), long(b[0], 16))
res.sort(cmp=mycmp)
return res
class SpecialCasing:
"""Read SpecialCasing.txt into an internal representation."""
def __init__(self, filename):
self.data = self.read_special_casing_data(filename)
print('read %d special casing entries' % len(self.data))
def read_special_casing_data(self, filename):
res = []
f = open(filename, 'rb')
for line in f:
try:
idx = line.index('#')
line = line[:idx]
except ValueError:
pass
line = line.strip()
if line == '':
continue
parts = line.split(';')
parts = [i.strip() for i in parts]
while len(parts) < 6:
parts.append('')
res.append(parts)
f.close()
return res
def parse_unicode_sequence(x):
"""Parse a Unicode sequence like ABCD 1234 into a unicode string."""
res = ''
for i in x.split(' '):
i = i.strip()
if i == '':
continue
res += unichr(long(i, 16))
return res
def get_base_conversion_maps(unicode_data):
"""Create case conversion tables without handling special casing yet."""
uc = {} # uppercase, codepoint (number) -> string
lc = {} # lowercase
tc = {} # titlecase
for x in unicode_data.data:
c1 = long(x[0], 16)
# just 16-bit support needed
if c1 >= 0x10000:
continue
if x[12] != '':
# field 12: simple uppercase mapping
c2 = parse_unicode_sequence(x[12])
uc[c1] = c2
tc[c1] = c2 # titlecase default == uppercase, overridden below if necessary
if x[13] != '':
# field 13: simple lowercase mapping
c2 = parse_unicode_sequence(x[13])
lc[c1] = c2
if x[14] != '':
# field 14: simple titlecase mapping
c2 = parse_unicode_sequence(x[14])
tc[c1] = c2
return uc, lc, tc
def update_special_casings(uc, lc, tc, special_casing):
"""Update case conversion tables with special case conversion rules."""
for x in special_casing.data:
c1 = long(x[0], 16)
if x[4] != '':
# conditions
continue
lower = parse_unicode_sequence(x[1])
title = parse_unicode_sequence(x[2])
upper = parse_unicode_sequence(x[3])
if len(lower) > 1:
lc[c1] = lower
if len(upper) > 1:
uc[c1] = upper
if len(title) > 1:
tc[c1] = title
print('- special case: %d %d %d' % (len(lower), len(upper), len(title)))
def remove_ascii_part(convmap):
"""Remove ASCII case conversion parts (handled by C fast path)."""
for i in xrange(128):
if convmap.has_key(i):
del convmap[i]
def scan_range_with_skip(convmap, start_idx, skip):
"""Scan for a range of continuous case conversion with a certain 'skip'."""
conv_i = start_idx
if not convmap.has_key(conv_i):
return None, None, None
elif len(convmap[conv_i]) > 1:
return None, None, None
else:
conv_o = ord(convmap[conv_i])
start_i = conv_i
start_o = conv_o
while True:
new_i = conv_i + skip
new_o = conv_o + skip
if not convmap.has_key(new_i):
break
if len(convmap[new_i]) > 1:
break
if ord(convmap[new_i]) != new_o:
break
conv_i = new_i
conv_o = new_o
# [start_i,conv_i] maps to [start_o,conv_o], ignore ranges of 1 char.
count = (conv_i - start_i) / skip + 1
if count <= 1:
return None, None, None
# We have an acceptable range, remove them from the convmap here.
for i in xrange(start_i, conv_i + skip, skip):
del convmap[i]
return start_i, start_o, count
def find_first_range_with_skip(convmap, skip):
"""Find first range with a certain 'skip' value."""
for i in xrange(65536):
start_i, start_o, count = scan_range_with_skip(convmap, i, skip)
if start_i is None:
continue
return start_i, start_o, count
return None, None, None
def generate_caseconv_tables(convmap):
"""Generate bit-packed case conversion table for a given conversion map."""
# The bitstream encoding is based on manual inspection for whatever
# regularity the Unicode case conversion rules have.
#
# Start with a full description of case conversions which does not
# cover all codepoints; unmapped codepoints convert to themselves.
# Scan for range-to-range mappings with a range of skips starting from 1.
# Whenever a valid range is found, remove it from the map. Finally,
# output the remaining case conversions (1:1 and 1:n) on a per codepoint
# basis.
#
# This is very slow because we always scan from scratch, but its the
# most reliable and simple way to scan
print('generate caseconv tables')
ranges = [] # range mappings (2 or more consecutive mappings with a certain skip)
singles = [] # 1:1 character mappings
multis = [] # 1:n character mappings
# Ranges with skips
for skip in xrange(1,6+1): # skips 1...6 are useful
while True:
start_i, start_o, count = find_first_range_with_skip(convmap, skip)
if start_i is None:
break
print('- skip %d: %d %d %d' % (skip, start_i, start_o, count))
ranges.append([start_i, start_o, count, skip])
# 1:1 conversions
k = convmap.keys()
k.sort()
for i in k:
if len(convmap[i]) > 1:
continue
singles.append([i, ord(convmap[i])]) # codepoint, codepoint
del convmap[i]
# There are many mappings to 2-char sequences with latter char being U+0399.
# These could be handled as a special case, but we don't do that right now.
#
# [8064L, u'\u1f08\u0399']
# [8065L, u'\u1f09\u0399']
# [8066L, u'\u1f0a\u0399']
# [8067L, u'\u1f0b\u0399']
# [8068L, u'\u1f0c\u0399']
# [8069L, u'\u1f0d\u0399']
# [8070L, u'\u1f0e\u0399']
# [8071L, u'\u1f0f\u0399']
# ...
#
# tmp = {}
# k = convmap.keys()
# k.sort()
# for i in k:
# if len(convmap[i]) == 2 and convmap[i][1] == u'\u0399':
# tmp[i] = convmap[i][0]
# del convmap[i]
# print(repr(tmp))
#
# skip = 1
# while True:
# start_i, start_o, count = find_first_range_with_skip(tmp, skip)
# if start_i is None:
# break
# print('- special399, skip %d: %d %d %d' % (skip, start_i, start_o, count))
# print(len(tmp.keys()))
# print(repr(tmp))
# XXX: need to put 12 remaining mappings back to convmap
# 1:n conversions
k = convmap.keys()
k.sort()
for i in k:
multis.append([i, convmap[i]]) # codepoint, string
del convmap[i]
for t in singles:
print '- singles: ' + repr(t)
for t in multis:
print '- multis: ' + repr(t)
print '- range mappings: %d' % len(ranges)
print '- single character mappings: %d' % len(singles)
print '- complex mappings (1:n): %d' % len(multis)
print '- remaining (should be zero): %d' % len(convmap.keys())
# XXX: opportunities for diff encoding skip=3 ranges?
prev = None
for t in ranges:
# range: [start_i, start_o, count, skip]
if t[3] != 3:
continue
if prev is not None:
print '- %d %d' % (t[0] - prev[0], t[1] - prev[1])
else:
print '- start: %d %d' % (t[0], t[1])
prev = t
# Bit packed encoding.
be = dukutil.BitEncoder()
for curr_skip in xrange(1, 7): # 1...6
count = 0
for r in ranges:
start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
if skip != curr_skip:
continue
count += 1
be.bits(count, 6)
print('- encode: skip=%d, count=%d' % (curr_skip, count))
for r in ranges:
start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
if skip != curr_skip:
continue
be.bits(start_i, 16)
be.bits(start_o, 16)
be.bits(r_count, 7)
be.bits(0x3f, 6) # maximum count value = end of skips
count = len(singles)
be.bits(count, 7)
for t in singles:
cp_i, cp_o = t[0], t[1]
be.bits(cp_i, 16)
be.bits(cp_o, 16)
count = len(multis)
be.bits(count, 7)
for t in multis:
cp_i, str_o = t[0], t[1]
be.bits(cp_i, 16)
be.bits(len(str_o), 2)
for i in xrange(len(str_o)):
be.bits(ord(str_o[i]), 16)
return be.getBytes(), be.getNumBits()
def generate_regexp_canonicalize_tables(convmap):
"""Generate tables for case insensitive RegExp normalization."""
# Generate a direct codepoint lookup for canonicalizing BMP range.
def generate_canontab():
res = []
highest_nonid = -1
for cp in xrange(65536):
res_cp = cp # default to as is
if convmap.has_key(cp):
tmp = convmap[cp]
if len(tmp) == 1:
# If multiple codepoints from input, ignore.
res_cp = ord(tmp[0])
if cp >= 0x80 and res_cp < 0x80:
res_cp = cp # If non-ASCII mapped to ASCII, ignore.
if cp != res_cp:
highest_nonid = cp
res.append(res_cp)
# At the moment this is 65370, which means there's very little
# gain in assuming 1:1 mapping above a certain BMP codepoint
# (though we do assume 1:1 mapping for above BMP codepoints).
print('- highest non-identity mapping: %d' % highest_nonid)
return res
print('generate canontab')
canontab = generate_canontab()
# Figure out which BMP values are never the result of canonicalization.
# Such codepoints are "don't care" in the sense that they are never
# matched against at runtime: ranges are canonicalized at compile time,
# and codepoint being matched is also canonicalized at run time.
# (Currently unused.)
def generate_dontcare():
res = [ True ] * 65536
for cp in canontab:
res[cp] = False
res_count = 0
for x in res:
if x:
res_count += 1
print('- %d dontcare codepoints' % res_count)
return res
print('generate canon dontcare')
dontcare = generate_dontcare()
# Generate maximal continuous ranges for canonicalization. A continuous
# range is a sequence with N codepoints where IN+i canonicalizes to OUT+i
# for fixed IN, OUT, and i in 0...N-1. There are unfortunately >1000
# of these ranges, mostly because there are a lot of individual exceptions.
# (Currently unused.)
canon_ranges = []
for cp in xrange(65536):
canon_ranges.append([ cp, canontab[cp], 1 ]) # 1 codepoint ranges at first
def merge_compatible_nogap(rng1, rng2):
# Merge adjacent ranges if continuity allows.
if rng1[0] + rng1[2] == rng2[0] and \
rng1[1] + rng1[2] == rng2[1]:
return [ rng1[0], rng1[1], rng1[2] + rng2[2] ]
return None
def merge_check_nogap():
len_start = len(canon_ranges)
for i in xrange(len(canon_ranges) - 1):
j = i + 1
rng1 = canon_ranges[i]
rng2 = canon_ranges[j]
if rng1 is None or rng2 is None: continue
merged = merge_compatible_nogap(rng1, rng2)
if merged is not None:
canon_ranges[j] = None
canon_ranges[i] = merged
filtered = []
for x in canon_ranges:
if x is not None:
filtered.append(x)
len_end = len(filtered)
if len_end < len_start:
return filtered
return None
print('generate canon_ranges')
while True:
# Starting from individual ranges of 1 codepoint, merge adjacent
# ranges until no more ranges can be merged.
t = merge_check_nogap()
if t is None:
break
canon_ranges = t
print('- %d ranges' % len(canon_ranges))
#for rng in canon_ranges:
# print('canon_ranges:')
# print(repr(rng))
# Generate true/false ranges for BMP codepoints where:
# - A codepoint is flagged true if continuity is broken at that point, so
# an explicit codepoint canonicalization is needed at runtime.
# - A codepoint is flagged false if case conversion is continuous from the
# previous codepoint, i.e. out_curr = out_prev + 1.
#
# The result is a lot of small ranges due to a lot of small 'false' ranges.
# Reduce the range set by checking if adjacent 'true' ranges have at most
# false_limit 'false' entries between them. If so, force the 'false'
# entries to 'true' (safe but results in an unnecessary runtime codepoint
# lookup) and merge the three ranges into a larger 'true' range.
#
# (Currently unused.)
def generate_needcheck_straight():
res = [ True ] * 65536
assert(canontab[0] == 0) # can start from in == out == 0
prev_in = -1
prev_out = -1
for i in xrange(65536):
# First create a straight true/false bitmap for BMP.
curr_in = i
curr_out = canontab[i]
if prev_in + 1 == curr_in and prev_out + 1 == curr_out:
res[i] = False
prev_in = curr_in
prev_out = curr_out
return res
def generate_needcheck_ranges(data):
# Generate maximal accurate ranges.
prev = None
count = 0
ranges = []
for i in data:
if prev is None or prev != i:
if prev is not None:
ranges.append([ prev, count ])
prev = i
count = 1
else:
count += 1
if prev is not None:
ranges.append([ prev, count ])
return ranges
def fillin_needcheck_ranges(data, false_limit):
# Fill in TRUE-FALSE*N-TRUE gaps into TRUE-TRUE*N-TRUE which is
# safe (leads to an unnecessary runtime check) but reduces
# range data size considerably.
res = []
for r in data:
res.append([ r[0], r[1] ])
while True:
found = False
for i in xrange(len(res) - 2):
r1 = res[i]
r2 = res[i + 1]
r3 = res[i + 2]
if r1[0] == True and r2[0] == False and r3[0] == True and \
r2[1] <= false_limit:
#print('fillin %d falses' % r2[1])
res.pop(i + 2)
res.pop(i + 1)
res[i] = [ True, r1[1] + r2[1] + r3[1] ]
found = True
break
if not found:
break
return res
print('generate needcheck straight')
needcheck = generate_needcheck_straight()
print('generate needcheck without false fillins')
needcheck_ranges1 = generate_needcheck_ranges(needcheck)
print('- %d ranges' % len(needcheck_ranges1))
#print(needcheck_ranges1)
print('generate needcheck with false fillins')
needcheck_ranges2 = fillin_needcheck_ranges(needcheck_ranges1, 11)
print('- %d ranges' % len(needcheck_ranges2))
#print(needcheck_ranges2)
# Generate a bitmap for BMP, divided into N-codepoint blocks, with each
# bit indicating: "entire codepoint block canonicalizes continuously, and
# the block is continuous with the previous and next block". A 'true'
# entry allows runtime code to just skip the block, advancing 'in' and
# 'out' by the block size, with no codepoint conversion. The block size
# should be large enough to produce a relatively small lookup table, but
# small enough to reduce codepoint conversions to a manageable number
# because the conversions are (currently) quite slow. This matters
# especially for case-insensitive RegExps; without any optimization,
# /[\u0000-\uffff]/i requires 65536 case conversions for runtime
# normalization.
block_shift = 5
block_size = 1 << block_shift
block_mask = block_size - 1
num_blocks = 65536 / block_size
def generate_block_bits(check_continuity):
res = [ True ] * num_blocks
for i in xrange(num_blocks):
base_in = i * block_size
base_out = canontab[base_in]
if check_continuity:
lower = -1 # [-1,block_size]
upper = block_size + 1
else:
lower = 0 # [0,block_size-1]
upper = block_size
for j in xrange(lower, upper):
cp = base_in + j
if cp >= 0x0000 and cp <= 0xffff and canontab[cp] != base_out + j:
res[i] = False
break
return res
def dump_block_bitmap(bits):
tmp = ''.join([ ({ True: 'x', False: '.' })[b] for b in bits])
tmp = re.sub(r'.{64}', lambda x: x.group(0) + '\n', tmp)
blocks_true = tmp.count('x')
blocks_false = tmp.count('.')
print('%d codepoint blocks are continuous, %d blocks are not' % (blocks_true, blocks_false))
sys.stdout.write(tmp)
#print(bits)
def dump_test_lookup(bits):
sys.stdout.write('duk_uint8_t test = {');
for b in bits:
if b:
sys.stdout.write('1,')
else:
sys.stdout.write('0,')
sys.stdout.write('};\n')
def convert_to_bitmap(bits):
# C code looks up bits as:
# index = codepoint >> N
# bitnum = codepoint & mask
# bitmask = 1 << bitnum
# So block 0 is mask 0x01 of first byte, block 1 is mask 0x02 of
# first byte, etc.
res = []
curr = 0
mask = 0x01
for b in bits:
if b:
curr += mask
mask = mask * 2
if mask == 0x100:
res.append(curr)
curr = 0
mask = 0x01
assert(mask == 0x01) # no leftover
return res
print('generate canon block bitmap without continuity')
block_bits1 = generate_block_bits(False)
dump_block_bitmap(block_bits1)
dump_test_lookup(block_bits1)
print('generate canon block bitmap with continuity')
block_bits2 = generate_block_bits(True)
dump_block_bitmap(block_bits2)
dump_test_lookup(block_bits2)
print('generate final canon bitmap')
block_bitmap = convert_to_bitmap(block_bits2)
print('- %d bytes' % len(block_bitmap))
print('- ' + repr(block_bitmap))
canon_bitmap = {
'data': block_bitmap,
'block_size': block_size,
'block_shift': block_shift,
'block_mask': block_mask
}
# This is useful to figure out corner case test cases.
print('canon blocks which are different with and without continuity check')
for i in xrange(num_blocks):
if block_bits1[i] != block_bits2[i]:
print('- block %d ([%d,%d]) differs' % (i, i * block_size, i * block_size + block_size - 1))
return canontab, canon_bitmap
def clonedict(x):
"Shallow clone of input dict."
res = {}
for k in x.keys():
res[k] = x[k]
return res
def main():
parser = optparse.OptionParser()
parser.add_option('--command', dest='command', default='caseconv_bitpacked')
parser.add_option('--unicode-data', dest='unicode_data')
parser.add_option('--special-casing', dest='special_casing')
parser.add_option('--out-source', dest='out_source')
parser.add_option('--out-header', dest='out_header')
parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc')
parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc')
parser.add_option('--table-name-re-canon-lookup', dest='table_name_re_canon_lookup', default='caseconv_re_canon_lookup')
parser.add_option('--table-name-re-canon-bitmap', dest='table_name_re_canon_bitmap', default='caseconv_re_canon_bitmap')
(opts, args) = parser.parse_args()
unicode_data = UnicodeData(opts.unicode_data)
special_casing = SpecialCasing(opts.special_casing)
uc, lc, tc = get_base_conversion_maps(unicode_data)
update_special_casings(uc, lc, tc, special_casing)
if opts.command == 'caseconv_bitpacked':
# XXX: ASCII and non-BMP filtering could be an option but is now hardcoded
# ASCII is handled with 'fast path' so not needed here.
t = clonedict(uc)
remove_ascii_part(t)
uc_bytes, uc_nbits = generate_caseconv_tables(t)
t = clonedict(lc)
remove_ascii_part(t)
lc_bytes, lc_nbits = generate_caseconv_tables(t)
# Generate C source and header files.
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
f = open(opts.out_source, 'wb')
f.write(genc.getString())
f.close()
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes)))
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes)))
f = open(opts.out_header, 'wb')
f.write(genc.getString())
f.close()
elif opts.command == 're_canon_lookup':
# Direct canonicalization lookup for case insensitive regexps, includes ascii part.
t = clonedict(uc)
re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitArray(re_canon_lookup, opts.table_name_re_canon_lookup, size=len(re_canon_lookup), typename='duk_uint16_t', intvalues=True, const=True)
f = open(opts.out_source, 'wb')
f.write(genc.getString())
f.close()
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitLine('extern const duk_uint16_t %s[%d];' % (opts.table_name_re_canon_lookup, len(re_canon_lookup)))
f = open(opts.out_header, 'wb')
f.write(genc.getString())
f.close()
elif opts.command == 're_canon_bitmap':
# N-codepoint block bitmap for skipping continuous codepoint blocks
# quickly.
t = clonedict(uc)
re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitArray(re_canon_bitmap['data'], opts.table_name_re_canon_bitmap, size=len(re_canon_bitmap['data']), typename='duk_uint8_t', intvalues=True, const=True)
f = open(opts.out_source, 'wb')
f.write(genc.getString())
f.close()
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitDefine('DUK_CANON_BITMAP_BLKSIZE', re_canon_bitmap['block_size'])
genc.emitDefine('DUK_CANON_BITMAP_BLKSHIFT', re_canon_bitmap['block_shift'])
genc.emitDefine('DUK_CANON_BITMAP_BLKMASK', re_canon_bitmap['block_mask'])
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_re_canon_bitmap, len(re_canon_bitmap['data'])))
f = open(opts.out_header, 'wb')
f.write(genc.getString())
f.close()
else:
raise Exception('invalid command: %r' % opts.command)
if __name__ == '__main__':
main()
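
The core trick in generate_caseconv_tables() above is detecting runs where both the input and output codepoints advance by a fixed skip, so a whole run collapses into one (start_in, start_out, count, skip) record. An illustrative Python 3 sketch of that scan follows; unlike the removed script it does not delete matched entries from the conversion map.

def scan_range_with_skip(convmap, start, skip):
    """Return (start_in, start_out, count, skip) for a run of 2+ mappings, else None."""
    if start not in convmap or len(convmap[start]) > 1:
        return None                        # no mapping, or maps to a multi-char string
    out = ord(convmap[start])
    count = 1
    while (start + count * skip) in convmap \
            and len(convmap[start + count * skip]) == 1 \
            and ord(convmap[start + count * skip]) == out + count * skip:
        count += 1
    return (start, out, count, skip) if count > 1 else None

# Example: 'a'..'z' -> 'A'..'Z' is a single range with skip=1 and count=26.
lowercase_to_upper = {cp: chr(cp - 0x20) for cp in range(ord('a'), ord('z') + 1)}
print(scan_range_with_skip(lowercase_to_upper, ord('a'), 1))   # (97, 65, 26, 1)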

tools/extract_chars.py (385 lines removed)

@@ -1,385 +0,0 @@
#!/usr/bin/env python2
#
# Select a set of Unicode characters (based on included/excluded categories
# etc) and write out a compact bitstream for matching a character against
# the set at runtime. This is for the slow path, where we're especially
# concerned with compactness. A C source file with the table is written,
# together with a matching C header.
#
# Unicode categories (such as 'Z') can be used. Two pseudo-categories
# are also available for exclusion only: ASCII and NONBMP. "ASCII"
# category excludes ASCII codepoints which is useful because C code
# typically contains an ASCII fast path so ASCII characters don't need
# to be considered in the Unicode tables. "NONBMP" excludes codepoints
# above U+FFFF which is useful because such codepoints don't need to be
# supported in standard ECMAScript.
#
import os
import sys
import math
import optparse
import dukutil
def read_unicode_data(unidata, catsinc, catsexc, filterfunc):
"Read UnicodeData.txt, including lines matching catsinc unless excluded by catsexc or filterfunc."
res = []
f = open(unidata, 'rb')
def filter_none(cp):
return True
if filterfunc is None:
filterfunc = filter_none
# The Unicode parsing is slow enough to warrant some speedups.
exclude_cat_exact = {}
for cat in catsexc:
exclude_cat_exact[cat] = True
include_cat_exact = {}
for cat in catsinc:
include_cat_exact[cat] = True
for line in f:
#line = line.strip()
parts = line.split(';')
codepoint = parts[0]
if not filterfunc(long(codepoint, 16)):
continue
category = parts[2]
if exclude_cat_exact.has_key(category):
continue # quick reject
rejected = False
for cat in catsexc:
if category.startswith(cat) or codepoint == cat:
rejected = True
break
if rejected:
continue
if include_cat_exact.has_key(category):
res.append(line)
continue
accepted = False
for cat in catsinc:
if category.startswith(cat) or codepoint == cat:
accepted = True
break
if accepted:
res.append(line)
f.close()
# Sort based on Unicode codepoint
def mycmp(a,b):
t1 = a.split(';')
t2 = b.split(';')
n1 = long(t1[0], 16)
n2 = long(t2[0], 16)
return cmp(n1, n2)
res.sort(cmp=mycmp)
return res
def scan_ranges(lines):
"Scan continuous ranges from (filtered) UnicodeData.txt lines."
ranges = []
range_start = None
prev = None
for line in lines:
t = line.split(';')
n = long(t[0], 16)
if range_start is None:
range_start = n
else:
if n == prev + 1:
# continue range
pass
else:
ranges.append((range_start, prev))
range_start = n
prev = n
if range_start is not None:
ranges.append((range_start, prev))
return ranges
def generate_png(lines, fname):
"Generate an illustrative PNG of the character set."
from PIL import Image
m = {}
for line in lines:
t = line.split(';')
n = long(t[0], 16)
m[n] = 1
codepoints = 0x10ffff + 1
width = int(256)
height = int(math.ceil(float(codepoints) / float(width)))
im = Image.new('RGB', (width, height))
black = (0,0,0)
white = (255,255,255)
for cp in xrange(codepoints):
y = cp / width
x = cp % width
if m.has_key(long(cp)):
im.putpixel((x,y), black)
else:
im.putpixel((x,y), white)
im.save(fname)
def generate_match_table1(ranges):
"Unused match table format."
# This is an earlier match table format which is no longer used.
# IdentifierStart-UnicodeLetter has 445 ranges and generates a
# match table of 2289 bytes.
data = []
prev_re = None
def genrange(rs, re):
if (rs > re):
raise Exception('assumption failed: rs=%d re=%d' % (rs, re))
while True:
now = re - rs + 1
if now > 255:
now = 255
data.append(now) # range now
data.append(0) # skip 0
rs = rs + now
else:
data.append(now) # range now
break
def genskip(ss, se):
if (ss > se):
raise Exception('assumption failed: ss=%d se=%s' % (ss, se))
while True:
now = se - ss + 1
if now > 255:
now = 255
data.append(now) # skip now
data.append(0) # range 0
ss = ss + now
else:
data.append(now) # skip now
break
for rs, re in ranges:
if prev_re is not None:
genskip(prev_re + 1, rs - 1)
genrange(rs, re)
prev_re = re
num_entries = len(data)
# header: start of first range
# num entries
hdr = []
hdr.append(ranges[0][0] >> 8) # XXX: check that not 0x10000 or over
hdr.append(ranges[0][1] & 0xff)
hdr.append(num_entries >> 8)
hdr.append(num_entries & 0xff)
return hdr + data
def generate_match_table2(ranges):
"Unused match table format."
# Another attempt at a match table which is also unused.
# Total tables for all current classes is now 1472 bytes.
data = []
def enc(x):
while True:
if x < 0x80:
data.append(x)
break
data.append(0x80 + (x & 0x7f))
x = x >> 7
prev_re = 0
for rs, re in ranges:
r1 = rs - prev_re # 1 or above (no unjoined ranges)
r2 = re - rs # 0 or above
enc(r1)
enc(r2)
prev_re = re
enc(0) # end marker
return data
def generate_match_table3(ranges):
"Current match table format."
# Yet another attempt, similar to generate_match_table2 except
# in packing format.
#
# Total match size now (at time of writing): 1194 bytes.
#
# This is the current encoding format used in duk_lexer.c.
be = dukutil.BitEncoder()
freq = [0] * (0x10ffff + 1) # informative
def enc(x):
freq[x] += 1
if x <= 0x0e:
# 4-bit encoding
be.bits(x, 4)
return
x -= 0x0e + 1
if x <= 0xfd:
# 12-bit encoding
be.bits(0x0f, 4)
be.bits(x, 8)
return
x -= 0xfd + 1
if x <= 0xfff:
# 24-bit encoding
be.bits(0x0f, 4)
be.bits(0xfe, 8)
be.bits(x, 12)
return
x -= 0xfff + 1
if True:
# 36-bit encoding
be.bits(0x0f, 4)
be.bits(0xff, 8)
be.bits(x, 24)
return
raise Exception('cannot encode')
prev_re = 0
for rs, re in ranges:
r1 = rs - prev_re # 1 or above (no unjoined ranges)
r2 = re - rs # 0 or above
enc(r1)
enc(r2)
prev_re = re
enc(0) # end marker
data, nbits = be.getBytes(), be.getNumBits()
return data, freq
def main():
parser = optparse.OptionParser()
parser.add_option('--unicode-data', dest='unicode_data') # UnicodeData.txt
parser.add_option('--special-casing', dest='special_casing') # SpecialCasing.txt
parser.add_option('--include-categories', dest='include_categories')
parser.add_option('--exclude-categories', dest='exclude_categories', default='NONE')
parser.add_option('--out-source', dest='out_source')
parser.add_option('--out-header', dest='out_header')
parser.add_option('--out-png', dest='out_png')
parser.add_option('--table-name', dest='table_name', default='match_table')
(opts, args) = parser.parse_args()
unidata = opts.unicode_data
catsinc = []
if opts.include_categories != '':
catsinc = opts.include_categories.split(',')
catsexc = []
if opts.exclude_categories != 'NONE':
catsexc = opts.exclude_categories.split(',')
print 'CATSEXC: %s' % repr(catsexc)
print 'CATSINC: %s' % repr(catsinc)
# pseudocategories
filter_ascii = ('ASCII' in catsexc)
filter_nonbmp = ('NONBMP' in catsexc)
# Read raw result
def filter1(x):
if filter_ascii and x <= 0x7f:
# exclude ascii
return False
if filter_nonbmp and x >= 0x10000:
# exclude non-bmp
return False
return True
print('read unicode data')
uni_filtered = read_unicode_data(unidata, catsinc, catsexc, filter1)
print('done reading unicode data')
# Raw output
#print('RAW OUTPUT:')
#print('===========')
#print('\n'.join(uni_filtered))
# Scan ranges
#print('')
#print('RANGES:')
#print('=======')
ranges = scan_ranges(uni_filtered)
#for i in ranges:
# if i[0] == i[1]:
# print('0x%04x' % i[0])
# else:
# print('0x%04x ... 0x%04x' % (i[0], i[1]))
#print('')
print('%d ranges total' % len(ranges))
# Generate match table
#print('')
#print('MATCH TABLE:')
#print('============')
#matchtable1 = generate_match_table1(ranges)
#matchtable2 = generate_match_table2(ranges)
matchtable3, freq = generate_match_table3(ranges)
#print 'match table: %s' % repr(matchtable3)
print 'match table length: %d bytes' % len(matchtable3)
print 'encoding freq:'
for i in xrange(len(freq)):
if freq[i] == 0:
continue
print ' %6d: %d' % (i, freq[i])
print('')
print('MATCH C TABLE -> file %s' % repr(opts.out_header))
# Create C source and header files
genc = dukutil.GenerateC()
genc.emitHeader('extract_chars.py')
genc.emitArray(matchtable3, opts.table_name, size=len(matchtable3), typename='duk_uint8_t', intvalues=True, const=True)
if opts.out_source is not None:
f = open(opts.out_source, 'wb')
f.write(genc.getString())
f.close()
genc = dukutil.GenerateC()
genc.emitHeader('extract_chars.py')
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name, len(matchtable3)))
if opts.out_header is not None:
f = open(opts.out_header, 'wb')
f.write(genc.getString())
f.close()
# Image (for illustrative purposes only)
if opts.out_png is not None:
generate_png(uni_filtered, opts.out_png)
if __name__ == '__main__':
main()
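
The match table emitted by generate_match_table3() above encodes each range delta with a variable-length scheme: values up to 0x0e fit in 4 bits, larger values escape to 12-, 24- or 36-bit forms. The Python 3 sketch below restates that encoder for illustration; the BitWriter class is an assumed stand-in for dukutil.BitEncoder and merely records (value, width) pairs.

class BitWriter:
    def __init__(self):
        self.out = []                      # list of (value, width) pairs
    def bits(self, value, nbits):
        self.out.append((value, nbits))

def encode_value(be, x):
    if x <= 0x0e:
        be.bits(x, 4)                                          # 4-bit form
        return
    x -= 0x0f
    if x <= 0xfd:
        be.bits(0x0f, 4); be.bits(x, 8)                        # 12-bit form
        return
    x -= 0xfe
    if x <= 0xfff:
        be.bits(0x0f, 4); be.bits(0xfe, 8); be.bits(x, 12)     # 24-bit form
        return
    x -= 0x1000
    be.bits(0x0f, 4); be.bits(0xff, 8); be.bits(x, 24)         # 36-bit form

bw = BitWriter()
for delta in (3, 200, 5000):
    encode_value(bw, delta)
print(bw.out)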

tools/genbuiltins.py (3221 lines changed)

File diff suppressed because it is too large

tools/json2yaml.py (5 lines removed)

@@ -1,5 +0,0 @@
import os, sys, json, yaml
if __name__ == '__main__':
# Use safe_dump() instead of dump() to avoid tags like "!!python/unicode"
print(yaml.safe_dump(json.load(sys.stdin), default_flow_style=False))

tools/prepare_unicode_data.py (51 lines removed)

@@ -1,51 +0,0 @@
#!/usr/bin/env python2
#
# UnicodeData.txt may contain ranges in addition to individual characters.
# Unpack the ranges into individual characters for the other scripts to use.
#
import os
import sys
import optparse
def main():
parser = optparse.OptionParser()
parser.add_option('--unicode-data', dest='unicode_data')
parser.add_option('--output', dest='output')
parser.add_option('--quiet', dest='quiet', action='store_true', default=False, help='Suppress info messages (show warnings)')
parser.add_option('--verbose', dest='verbose', action='store_true', default=False, help='Show verbose debug messages')
(opts, args) = parser.parse_args()
assert(opts.unicode_data is not None)
assert(opts.output is not None)
f_in = open(opts.unicode_data, 'rb')
f_out = open(opts.output, 'wb')
while True:
line = f_in.readline()
if line == '' or line == '\n':
break
parts = line.split(';') # keep newline
if parts[1].endswith('First>'):
line2 = f_in.readline()
parts2 = line2.split(';')
if not parts2[1].endswith('Last>'):
raise Exception('cannot parse range')
cp1 = long(parts[0], 16)
cp2 = long(parts2[0], 16)
tmp = parts[1:]
tmp[0] = '-""-'
suffix = ';'.join(tmp)
f_out.write(line)
for i in xrange(cp1 + 1, cp2):
f_out.write('%04X;%s' % (i, suffix))
f_out.write(line2)
else:
f_out.write(line)
f_in.close()
f_out.flush()
f_out.close()
if __name__ == '__main__':
main()
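
The removed prepare_unicode_data.py unpacks "<..., First>" / "<..., Last>" line pairs in UnicodeData.txt into one entry per codepoint so the other extraction scripts can treat every codepoint uniformly. A condensed Python 3 sketch of that expansion is shown below for illustration; it operates on newline-stripped lines rather than a file stream.

def expand_ranges(lines):
    out = []
    it = iter(lines)
    for line in it:
        parts = line.split(';')
        if len(parts) > 1 and parts[1].endswith('First>'):
            last = next(it)                       # the matching "<..., Last>" line
            parts2 = last.split(';')
            if not parts2[1].endswith('Last>'):
                raise ValueError('cannot parse range')
            cp1, cp2 = int(parts[0], 16), int(parts2[0], 16)
            suffix = ';'.join(['-""-'] + parts[2:])
            out.append(line)
            out.extend('%04X;%s' % (cp, suffix) for cp in range(cp1 + 1, cp2))
            out.append(last)
        else:
            out.append(line)
    return out

# Example:
# expand_ranges(open('UnicodeData.txt').read().splitlines())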

tools/yaml2json.py (4 lines removed)

@@ -1,4 +0,0 @@
import os, sys, json, yaml
if __name__ == '__main__':
print(json.dumps(yaml.safe_load(sys.stdin)))