#!/usr/bin/env python # Copyright 2018 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. """Runs bcanalyzer to extract data from LLVM Bitcode (BC) files. IsBitcodeFile(): Reads the magic header of a file to quickly decide whether it is a BC file. ParseTag(): Heuristically parses a single-line tag from bcanalyzer dump (exporeted for testing). RunBcAnalyzerOnIntermediates(): BulkForkAndCall() target: Given BC file [paths], runs (llvm-)bcanalyzer on each path, parses the output, extracts strings, and returns {path: [strings]}. This file can also be run stand-alone in order to test out the logic on smaller sample sizes. """ from __future__ import print_function import argparse import os import re import subprocess import concurrent import path_util # Upper bound on number of bytes per character in strings. 4-byte / 32-bit # strings are rare and are likely confused with 32-bit int arrays. So by # default, only accept up to 2-byte / 16-bit strings. _CHAR_WIDTH_LIMIT = 2 _RE_SPLIT = re.compile(r'=(\d+)') # children tags that should not be counted as types. # - is meta data. # - with the following (or other tag) are counted # as a single type entry. _NON_TYPE_TAGS = set(['NUMENTRY', 'STRUCT_NAME']) # Use bit-fields for tag types: 1 => Opening tag, 2 => Closed tag. OPENING_TAG = 1 CLOSING_TAG = 2 SELF_CLOSING_TAG = OPENING_TAG | CLOSING_TAG def _IsOpeningTag(tag_type): return tag_type & 1 def _IsClosingTag(tag_type): return tag_type & 2 def IsBitcodeFile(path): try: with open(path, 'rb') as f: return f.read(4) == 'BC\xc0\xde' except IOError: return False def ParseTag(line): """Heuristically parses a single-line tag from bcanalyzer dump. Since input data are machine-generated, so we only need "good enough" parsing logic that favors simplicity. For example, '' is accepted. Args: line: Stripped line that may have a single-line tag with trailing text. Returns: (tag_type, tag, attrib_pos) if successful, else (None) * 3. Details: tag_type: One of {OPENING_TAG, CLOSING_TAG, SELF_CLOSING_TAG}. tag: The tag name. attrib_pos: Position in |line| to start parsing attributes. """ # # ==> (OPENING_TAG, 'TYPE_BLOCK_ID', 14). # Trailing text! # ==> (SELF_CLOSING_TAG, 'ARRAY', 6). # # ==> (CLOSING_TAG, 'TYPE_BLOCK_ID', 15). # Assumes |line| is stripped, i.e., so no indent and no trailing new line. if len(line) < 2 or line[0] != '<': return (None, None, None) tag_type, pos = (CLOSING_TAG, 2) if line[1] == '/' else (OPENING_TAG, 1) for i in xrange(pos, len(line)): if not line[i].isalnum() and line[i] != '_': if i == pos or not line[i] in ' >/': break end = line.find('>', i) if end < 0: break if line[end - 1] == '/': return (SELF_CLOSING_TAG, line[pos:i], i) return (tag_type, line[pos:i], i) return (None, None, None) def _ParseOpItems(line, pos): """Heuristically extracts op0=# op1=# ... values from a single-line tag.""" # # ^ pos = 8 # ==> iter([42]). # # ^ pos = 8 # ==> iter([84, 101, 115, 116, 56, 97]). # # ^ pos = 7 # ==> iter([1, 0, 0, 1, 1, 0]). # # ^ pos = 5 # ==> iter([8412, 101, 1150, 116, 5200, 98, 0]). # In particular, skip 'abbrevid=#'. start = line.index(' op', pos) end = line.index('>', start) for t in _RE_SPLIT.finditer(line[start:end]): yield int(t.group(1)) # Emits uint16 values as a stream of 2 bytes (little-endian). def _UnpackUint16ListToBytes(items): for item in items: yield item & 0xFF yield (item >> 8) & 0xFF # Emits uint32 values as a stream of 4 bytes (little-endian). def _UnpackUint32ListToBytes(items): for item in items: yield item & 0xFF yield (item >> 8) & 0xFF yield (item >> 16) & 0xFF yield (item >> 24) & 0xFF class _BcIntArrayType: """The specs of an integer array type.""" # Lookup table to map from width to an unpacker that splits ints into bytes. _UNPACKER_MAP = { 1: iter, 2: _UnpackUint16ListToBytes, 4: _UnpackUint32ListToBytes } def __init__(self, length, width): # Number of elements in the array. self.length = length # Number of bytes per element. self.width = width def ParseOpItemsAsString(self, line, attrib_pos, add_null_at_end): """Reads op0=# op=# ... values and returns them as a list of bytes. Interprets each op0=# op1=# ... value as a |self.width|-byte integer, splits them into component bytes (little-endian), and returns the result as string. Args: line: Stripped line of single-line tag with op0=# op1=# ... data. attrib_pos: Position in |line| where attribute list starts. add_null_add_end: Whether to append |'\x00' * self.width|. """ items = _ParseOpItems(line, attrib_pos) unpacker = _BcIntArrayType._UNPACKER_MAP[self.width] s = ''.join(chr(t) for t in unpacker(items)) if add_null_at_end: s += '\x00' * self.width # Rather stringent check to ensure exact size match. assert len(s) == self.length * self.width return s class _BcTypeInfo: """Stateful parser of , specialized for integer arrays.""" # # # Type ids should be in [0, 8]. # # Type id = 0: int8. # # Type id = 1: Pointer to type id 0 # # ==> int8*. # # Type id = 2: Array with 4 elements # # of type id 0 ==> int8[4] # # Joins next Tag. # # Type id = 3: Struct (unused). # # Type id = 4: Function (unused). # # Type id = 5: int16. # # Type id = 6: Pointer to type id 5 # # ==> int16*. # # Type id = 7: int32. # # Type id = 8: Array with 4 elements # # of type id 5 ==> int16[4] # def __init__(self): # Auto-incrementing current type id. self.cur_type_id = 0 # Maps from type id (of an integer) to number of bits. self.int_types = {} # Maps from type id (of an integer array) to _BcIntArrayType. self.int_array_types = {} def Feed(self, line, tag, attrib_pos): """Parses a single-line tag and store integer and integer array types. Args: line: Stripped line of single-line tag with op0=# op1=# ... data. tag: The tag type in |line| (child tag of ). attrib_pos: Position in |line| where attribute list starts. """ if tag in _NON_TYPE_TAGS: return if tag == 'INTEGER': num_bits = next(_ParseOpItems(line, attrib_pos)) # op0. self.int_types[self.cur_type_id] = num_bits elif tag == 'ARRAY': [size, item_type_id] = list(_ParseOpItems(line, attrib_pos)) # op0, op1. bits = self.int_types.get(item_type_id) if bits is not None: # |bits| can be None for non-int arrays. self.int_array_types[self.cur_type_id] = _BcIntArrayType(size, bits / 8) self.cur_type_id += 1 def GetArrayType(self, idx): return self.int_array_types.get(idx) def _ParseBcAnalyzer(lines): """A generator to extract strings from bcanalyzer dump of a BC file.""" # ... # # ... (See above; parsed by _BcTypeInfo) # # ... # # # Current type id := 1 ==> int8*. # # # Current type id := 2 ==> int8[4]. # record string = 'Foo' # # {'F','o','o',1}. # # Current type id := 7 ==> int32. # # Stores 1000. # # Stores -1000. # # Current type id := 8 ==> int16[4]. # # # # ... # Notes: # - Only parse first and first . # - is stateful: A "current type id" exists, and that's set # by , with op0= referring to type id. # - For array lengths one needs to refer to the corresponding . # - Strings / arrays are in , , and . # - abbrevid=# is redundant (repeats tag type) and unused # - Character data are stored in op0=# op1=# ..., one per character. These # values should fit in the proper range, and can be fairly large. # - has implicit 0 at end. # - Data lengths agree with the length in the matching entry. # - "record string" text is not very useful: It only appears if all # characters are printable. # - Signed vs. unsigned types are undistinguished. # - In , the op0= value is stored as 2 * abs(x) + (signed ? 0 : 1). # - In of int, values are coerced to unsigned type. # - Strings and int arrays are undistinguished. # - : If an uint8 array happens to end with 0, then this gets used! # - Arrays (or integers) of all-0 appear as . Presumably this gets # placed into .bss section. STATE_VOID = 0 STATE_TYPE_BLOCK = 1 STATE_CONST_BLOCK = 2 state = STATE_VOID type_info = None consts_cur_type = None # State machine to parse the *first* to initialize # |type_info|, then the *first* to yield strings. for line in lines: line = line.lstrip() (tag_type, tag, attrib_pos) = ParseTag(line) if tag_type is None: continue if state == STATE_VOID: if _IsOpeningTag(tag_type): if tag == 'TYPE_BLOCK_ID': if type_info is None: state = STATE_TYPE_BLOCK type_info = _BcTypeInfo() elif tag == 'CONSTANTS_BLOCK': if type_info is not None: state = STATE_CONST_BLOCK elif state == STATE_TYPE_BLOCK: if _IsClosingTag(tag_type) and tag == 'TYPE_BLOCK_ID': state = STATE_VOID else: type_info.Feed(line, tag, attrib_pos) elif state == STATE_CONST_BLOCK: if _IsClosingTag(tag_type) and tag == 'CONSTANTS_BLOCK': # Skip remaining data, including subsequent s. break elif tag == 'SETTYPE': consts_cur_type_id = next(_ParseOpItems(line, attrib_pos)) # op0. consts_cur_type = type_info.GetArrayType(consts_cur_type_id) elif consts_cur_type and consts_cur_type.width <= _CHAR_WIDTH_LIMIT: if tag in ['CSTRING', 'STRING', 'DATA']: # Exclude 32-bit / 4-byte strings since they're rarely used, and are # likely confused with 32-bit int arrays. s = consts_cur_type.ParseOpItemsAsString( line, attrib_pos, tag == 'CSTRING') yield (consts_cur_type, s) class _BcAnalyzerRunner: """Helper to run bcanalyzer and extract output lines. """ def __init__(self, tool_prefix, output_directory): self._args = [path_util.GetBcAnalyzerPath(tool_prefix), '--dump', '--disable-histogram'] self._output_directory = output_directory def RunOnFile(self, obj_file): output = subprocess.check_output(self._args + [obj_file], cwd=self._output_directory) return output.splitlines() # This is a target for BulkForkAndCall(). def RunBcAnalyzerOnIntermediates(target, tool_prefix, output_directory): """Calls bcanalyzer and returns encoded map from path to strings. Args: target: A list of BC file paths. """ assert isinstance(target, list) runner = _BcAnalyzerRunner(tool_prefix, output_directory) strings_by_path = {} for t in target: strings_by_path[t] = [s for _, s in _ParseBcAnalyzer(runner.RunOnFile(t))] # Escape strings by repr() so there will be no special characters to interfere # concurrent.EncodeDictOfLists() and decoding. return concurrent.EncodeDictOfLists(strings_by_path, value_transform=repr) def main(): parser = argparse.ArgumentParser() parser.add_argument('--tool-prefix', required=True) parser.add_argument('--output-directory', default='.') parser.add_argument('--char-width-limit', type=int) parser.add_argument('objects', type=os.path.realpath, nargs='+') args = parser.parse_args() base_path = os.path.normpath(args.output_directory) runner = _BcAnalyzerRunner(args.tool_prefix, args.output_directory) if args.char_width_limit is not None: global _CHAR_WIDTH_LIMIT _CHAR_WIDTH_LIMIT = args.char_width_limit for obj_path in args.objects: rel_path = os.path.relpath(obj_path, base_path) print('File: %s' % rel_path) for cur_type, s in _ParseBcAnalyzer(runner.RunOnFile(obj_path)): print(' char%d[%d]: %r' % (cur_type.width * 8, cur_type.length, s)) print('') if __name__ == '__main__': main()