duktape/tools/scan_strings.py


								#!/usr/bin/env python2

								#

								#  Scan potential external strings from ECMAScript and C files.

								#

								#  Very simplistic example with a lot of limitations:

								#

								#    - Doesn't handle multiple variables in a variable declaration

								#

								#    - Only extracts strings from C files, these may correspond to

								#      Duktape/C bindings (but in many cases don't)

								#


								import ast
								import base64
								import json
								import os
								import re
								import sys


								# Strings found so far, kept as a dict used as a set: each key is a

								# scanned string and every value is True.  Filled in by scan() and

								# read back by main().

								strmap = {}


								# ECMAScript function declaration; group 1 is the function name that

								# follows the 'function' keyword.

								re_funcname = re.compile(r'function\s+(\w+)', re.UNICODE)


								# ECMAScript variable declaration; group 1 is the identifier directly

								# after 'var'.

								# XXX: doesn't handle multiple variables

								re_vardecl = re.compile(r'var\s+(\w+)', re.UNICODE)


								# ECMAScript variable assignment; group 1 is the identifier directly to

								# the left of '='.  The pattern only looks for a single '=', so the left

								# operand of '==' / '===' is captured too.

								re_varassign = re.compile(r'(\w+)\s*=\s*', re.UNICODE)


								# ECMAScript dotted property reference: two or more word components

								# joined by '.' (also matches numbers like '4.0', which are separately

								# rejected in scan() by testing the first component against re_digits).

								re_propref = re.compile(r'(\w+(?:\.\w+)+)', re.UNICODE)

								re_digits = re.compile(r'^\d+$', re.UNICODE)


								# ECMAScript or C string literal.  Group 1 is the whole literal

								# including its surrounding quotes; an escaped quote or an escaped

								# backslash inside the literal does not terminate it.

								re_strlit_dquot = re.compile(r'("(?:\\"|\\\\|[^"])*")', re.UNICODE)

								re_strlit_squot = re.compile(r'(\'(?:\\\'|\\\\|[^\'])*\')', re.UNICODE)


								def strDecode(x):
								    """Decode the source text of a quoted string literal.

								    x is the literal exactly as it appeared in the scanned source,
								    including its surrounding quotes.  Python string literal syntax is
								    close enough to C and ECMAScript that the Python parser is used to
								    interpret hex, unicode, and other escapes.

								    Returns the decoded (unicode) string.  If x cannot be parsed as a
								    string literal, a diagnostic is written to stderr and None is
								    returned.
								    """
								    try:
								        # NOTE: this used to be eval().  The text comes from scanned
								        # files, so ast.literal_eval() is used instead: it decodes a
								        # string literal the same way but refuses anything that is not
								        # a plain literal, so nothing in x can be executed.  The 'u'
								        # prefix makes the parser interpret x as a unicode string.
								        return ast.literal_eval('u' + x)
								    except (SyntaxError, ValueError):
								        # Only parse failures are treated as "skip this literal"; a bare
								        # except here would also swallow KeyboardInterrupt/SystemExit.
								        sys.stderr.write('Failed to parse: ' + repr(x) + ', ignoring\n')
								        return None


								def scan(f, fn):
								    """Collect candidate strings from one open file into strmap.

								    f is iterated line by line and every line is decoded as UTF-8, so it
								    must yield byte strings (main() opens the files in binary mode).  fn
								    is the file name and is only used to choose the scan rules: a name
								    ending in '.c' is scanned for double-quoted string literals only,
								    any other name gets the full ECMAScript rule set.

								    Nothing is returned; each string found becomes a key of the
								    module-level strmap.
								    """
								    is_c_file = fn.endswith('.c')

								    # Patterns whose first group is recorded verbatim.
								    if is_c_file:
								        name_patterns = []
								    else:
								        name_patterns = [re_funcname, re_vardecl, re_varassign]

								    # Patterns whose first group is a quoted literal that still has to
								    # be decoded by strDecode().
								    literal_patterns = [re_strlit_dquot]
								    if not is_c_file:
								        literal_patterns.append(re_strlit_squot)

								    for raw_line in f:
								        # Assume input data is UTF-8
								        text = raw_line.decode('utf-8')

								        for pattern in name_patterns:
								            for match in pattern.finditer(text):
								                strmap[match.group(1)] = True

								        if not is_c_file:
								            for match in re_propref.finditer(text):
								                components = match.group(1).split('.')
								                if re_digits.match(components[0]) is not None:
								                    # Probably a number ('4.0' or such), skip it
								                    continue
								                for component in components:
								                    strmap[component] = True

								        for pattern in literal_patterns:
								            for match in pattern.finditer(text):
								                decoded = strDecode(match.group(1))
								                if decoded is not None:
								                    strmap[decoded] = True


								def main():
								    """Scan the files named on the command line and print the result.

								    Every argument in sys.argv[1:] is opened and passed to scan().  The
								    collected strings are then printed to stdout as one JSON document
								    with two parallel, sorted lists: the strings themselves and the same
								    strings as base64-encoded UTF-8.
								    """
								    for fn in sys.argv[1:]:
								        # Binary mode because scan() decodes each line itself; the
								        # 'with' block closes the file even if scan() raises.
								        with open(fn, 'rb') as f:
								            scan(f, fn)

								    strs = sorted(strmap)

								    # base64.b64encode() produces a single line.  The 'base64' string
								    # codec used previously exists only on Python 2 and breaks its
								    # output into 76-character lines, which left embedded newlines in
								    # the encoded form of long strings.
								    strs_base64 = [
								        base64.b64encode(s.encode('utf-8')).decode('ascii') for s in strs
								    ]

								    doc = {
								        # Strings as Unicode strings
								        'scanned_strings': strs,

								        # Strings as base64-encoded UTF-8 data, which should be ready
								        # to be used in C code (Duktape internal string representation
								        # is UTF-8)
								        'scanned_strings_base64': strs_base64
								    }

								    print(json.dumps(doc, indent=4, ensure_ascii=True, sort_keys=True))


								# Script entry point: run main() only when this file is executed

								# directly, not when it is imported as a module.

								if __name__ == '__main__':

								    main()