# duktape/tools/scan_strings.py

#!/usr/bin/env python2
#
#  Scan potential external strings from ECMAScript and C files.
#
#  Very simplistic example with a lot of limitations:
#
#    - Doesn't handle multiple variables in a variable declaration
#
#    - Only extracts strings from C files, these may correspond to
#      Duktape/C bindings (but in many cases don't)
#

import ast
import base64
import json
import os
import re
import sys

# Collected strings.  Used as a set: every scanned string is stored as a
# key with the value True.  Filled in by scan() and written out by main().
strmap = {}

# ECMAScript function declaration: captures the identifier that follows
# the 'function' keyword.
re_funcname = re.compile(r'function\s+(\w+)', re.UNICODE)

# ECMAScript variable declaration: captures the first identifier that
# follows the 'var' keyword.
# XXX: doesn't handle multiple variables
re_vardecl = re.compile(r'var\s+(\w+)', re.UNICODE)

# ECMAScript variable assignment: captures the word directly before an
# '=' sign.  Nothing after the '=' is checked, so the left-hand side of an
# '==' comparison is captured as well.
re_varassign = re.compile(r'(\w+)\s*=\s*', re.UNICODE)

# ECMAScript dotted property reference (also matches numbers like
# '4.0', which are separately rejected below)
re_propref = re.compile(r'(\w+(?:\.\w+)+)', re.UNICODE)
# Matches a string made up of digits only; scan() applies it to the first
# component of a dotted reference to recognize numbers.
re_digits = re.compile(r'^\d+$', re.UNICODE)

# ECMAScript or C string literal: captures the whole literal including its
# quotes.  Inside the literal a backslash-escaped quote and a doubled
# backslash are consumed as units; any other character except the closing
# quote is consumed one at a time.
re_strlit_dquot = re.compile(r'("(?:\\"|\\\\|[^"])*")', re.UNICODE)
re_strlit_squot = re.compile(r'(\'(?:\\\'|\\\\|[^\'])*\')', re.UNICODE)

def strDecode(x):
    """Decode a quoted string literal into a Unicode string.

    x is the source text of a string literal including its surrounding
    quotes (single or double).  Hex, unicode, and other escapes are
    decoded by parsing the text as a Python unicode literal; Python syntax
    is close enough to C and ECMAScript for this purpose.

    Returns the decoded Unicode string, or None if the text cannot be
    parsed as a single string literal.  Failures are reported on stderr
    and otherwise ignored.
    """

    # The input comes from scanned source files, so it is parsed with
    # ast.literal_eval() rather than eval(): only literals are accepted
    # and no expression in the scanned text is ever executed.
    try:
        value = ast.literal_eval('u' + x)  # interpret as unicode string
    except Exception:
        # Best effort: a literal Python cannot parse is skipped.  Unlike a
        # bare 'except:', this lets KeyboardInterrupt/SystemExit through.
        value = None

    # Anything that is not a text string (for example a tuple produced by
    # '"a", "b"') is not a decoded string literal and is rejected too.
    if not isinstance(value, type(u'')):
        sys.stderr.write('Failed to parse: ' + repr(x) + ', ignoring\n')
        return None
    return value

def scan(f, fn):
    """Scan the lines of one file and record candidate strings in strmap.

    f is an iterable of UTF-8 encoded lines and fn is the file name, which
    selects the rule set: names ending in '.c' only contribute
    double-quoted string literals, every other file is scanned with all
    rules (function names, variable declarations and assignments, dotted
    property references, and both kinds of string literal).
    """

    # Anything that is not a '.c' file gets the full rule set.
    all_rules = fn[-2:] != '.c'

    # Patterns whose first group is recorded as-is.
    name_patterns = []
    # Patterns whose first group is a quoted literal needing decoding.
    literal_patterns = [re_strlit_dquot]
    if all_rules:
        name_patterns = [re_funcname, re_vardecl, re_varassign]
        literal_patterns.append(re_strlit_squot)

    for raw in f:
        # Assume input data is UTF-8
        text = raw.decode('utf-8')

        for pattern in name_patterns:
            for found in pattern.finditer(text):
                strmap[found.group(1)] = True

        if all_rules:
            for found in re_propref.finditer(text):
                segments = found.group(1).split('.')
                if re_digits.match(segments[0]) is not None:
                    # Probably a number ('4.0' or such), skip it
                    continue
                for segment in segments:
                    strmap[segment] = True

        for pattern in literal_patterns:
            for found in pattern.finditer(text):
                decoded = strDecode(found.group(1))
                if decoded is not None:
                    strmap[decoded] = True

def main():
    """Scan every file named on the command line and print a JSON document.

    The document has two parallel lists, both ordered by the sorted
    scanned strings:

      - 'scanned_strings': the strings as Unicode strings
      - 'scanned_strings_base64': the same strings as base64-encoded UTF-8
        data, which should be ready to be used in C code (Duktape internal
        string representation is UTF-8)
    """

    for fn in sys.argv[1:]:
        # 'with' closes the file even if scan() raises.
        with open(fn, 'rb') as f:
            scan(f, fn)

    strs = []
    strs_base64 = []
    for s in sorted(strmap):
        strs.append(s)
        # b64encode() never inserts line breaks, so long strings come out
        # as one unbroken base64 string.  Decoding the ASCII result keeps
        # the value JSON serializable when b64encode() returns bytes.
        strs_base64.append(base64.b64encode(s.encode('utf-8')).decode('ascii'))

    doc = {
        'scanned_strings': strs,
        'scanned_strings_base64': strs_base64
    }
    print(json.dumps(doc, indent=4, ensure_ascii=True, sort_keys=True))

if __name__ == '__main__':
    main()
Split dist util, reorg tools, Python PEP8 8 years ago			`#!/usr/bin/env python2`
			`#`
Use 'ECMAScript' spelling in misc places 7 years ago			`# Scan potential external strings from ECMAScript and C files.`
Split dist util, reorg tools, Python PEP8 8 years ago			`#`
			`# Very simplistic example with a lot of limitations:`
			`#`
			`# - Doesn't handle multiple variables in a variable declaration`
			`#`
			`# - Only extracts strings from C files, these may correspond to`
			`# Duktape/C bindings (but in many cases don't)`
			`#`

			`import os`
			`import sys`
			`import re`
			`import json`

			`strmap = {}`

Use 'ECMAScript' spelling in misc places 7 years ago			`# ECMAScript function declaration`
Split dist util, reorg tools, Python PEP8 8 years ago			`re_funcname = re.compile(r'function\s+(\w+)', re.UNICODE)`

Use 'ECMAScript' spelling in misc places 7 years ago			`# ECMAScript variable declaration`
Split dist util, reorg tools, Python PEP8 8 years ago			`# XXX: doesn't handle multiple variables`
			`re_vardecl = re.compile(r'var\s+(\w+)', re.UNICODE)`

Use 'ECMAScript' spelling in misc places 7 years ago			`# ECMAScript variable assignment`
Split dist util, reorg tools, Python PEP8 8 years ago			`re_varassign = re.compile(r'(\w+)\s=\s', re.UNICODE)`

Use 'ECMAScript' spelling in misc places 7 years ago			`# ECMAScript dotted property reference (also matches numbers like`
Split dist util, reorg tools, Python PEP8 8 years ago			`# '4.0', which are separately rejected below)`
			`re_propref = re.compile(r'(\w+(?:\.\w+)+)', re.UNICODE)`
			`re_digits = re.compile(r'^\d+$', re.UNICODE)`

Use 'ECMAScript' spelling in misc places 7 years ago			`# ECMAScript or C string literal`
Split dist util, reorg tools, Python PEP8 8 years ago			`re_strlit_dquot = re.compile(r'("(?:\\"\|\\\\\|[^"])*")', re.UNICODE)`
			`re_strlit_squot = re.compile(r'(\'(?:\\\'\|\\\\\|[^\'])*\')', re.UNICODE)`

			`def strDecode(x):`
			`# Need to decode hex, unicode, and other escapes. Python syntax`
Use 'ECMAScript' spelling in misc places 7 years ago			`# is close enough to C and ECMAScript so use eval for now.`
Split dist util, reorg tools, Python PEP8 8 years ago
			`try:`
			`return eval('u' + x) # interpret as unicode string`
			`except:`
			`sys.stderr.write('Failed to parse: ' + repr(x) + ', ignoring\n')`
			`return None`

			`def scan(f, fn):`
			`global strmap`

			`# Scan rules depend on file type`
			`if fn[-2:] == '.c':`
			`use_funcname = False`
			`use_vardecl = False`
			`use_varassign = False`
			`use_propref = False`
			`use_strlit_dquot = True`
			`use_strlit_squot = False`
			`else:`
			`use_funcname = True`
			`use_vardecl = True`
			`use_varassign = True`
			`use_propref = True`
			`use_strlit_dquot = True`
			`use_strlit_squot = True`

			`for line in f:`
			`# Assume input data is UTF-8`
			`line = line.decode('utf-8')`

			`if use_funcname:`
			`for m in re_funcname.finditer(line):`
			`strmap[m.group(1)] = True`

			`if use_vardecl:`
			`for m in re_vardecl.finditer(line):`
			`strmap[m.group(1)] = True`

			`if use_varassign:`
			`for m in re_varassign.finditer(line):`
			`strmap[m.group(1)] = True`

			`if use_propref:`
			`for m in re_propref.finditer(line):`
			`parts = m.group(1).split('.')`
			`if re_digits.match(parts[0]) is not None:`
			`# Probably a number ('4.0' or such)`
			`pass`
			`else:`
			`for part in parts:`
			`strmap[part] = True`

			`if use_strlit_dquot:`
			`for m in re_strlit_dquot.finditer(line):`
			`s = strDecode(m.group(1))`
			`if s is not None:`
			`strmap[s] = True`

			`if use_strlit_squot:`
			`for m in re_strlit_squot.finditer(line):`
			`s = strDecode(m.group(1))`
			`if s is not None:`
			`strmap[s] = True`

			`def main():`
			`for fn in sys.argv[1:]:`
			`f = open(fn, 'rb')`
			`scan(f, fn)`
			`f.close()`

			`strs = []`
			`strs_base64 = []`
			`doc = {`
			`# Strings as Unicode strings`
			`'scanned_strings': strs,`

			`# Strings as base64-encoded UTF-8 data, which should be ready`
			`# to be used in C code (Duktape internal string representation`
			`# is UTF-8)`
			`'scanned_strings_base64': strs_base64`
			`}`
			`k = strmap.keys()`
			`k.sort()`
			`for s in k:`
			`strs.append(s)`
			`t = s.encode('utf-8').encode('base64')`
			`if len(t) > 0 and t[-1] == '\n':`
			`t = t[0:-1]`
			`strs_base64.append(t)`

			`print(json.dumps(doc, indent=4, ensure_ascii=True, sort_keys=True))`

			`if __name__ == '__main__':`
			`main()`