mirror of
https://github.com/klzgrad/naiveproxy.git
synced 2024-11-24 22:36:09 +03:00
380 lines
14 KiB
Python
380 lines
14 KiB
Python
|
#!/usr/bin/env python
|
||
|
# Copyright 2018 The Chromium Authors. All rights reserved.
|
||
|
# Use of this source code is governed by a BSD-style license that can be
|
||
|
# found in the LICENSE file.
|
||
|
|
||
|
"""Runs bcanalyzer to extract data from LLVM Bitcode (BC) files.
|
||
|
|
||
|
IsBitcodeFile():
|
||
|
Reads the magic header of a file to quickly decide whether it is a BC file.
|
||
|
|
||
|
ParseTag():
|
||
|
Heuristically parses a single-line tag from bcanalyzer dump (exporeted for
|
||
|
testing).
|
||
|
|
||
|
RunBcAnalyzerOnIntermediates():
|
||
|
BulkForkAndCall() target: Given BC file [paths], runs (llvm-)bcanalyzer on
|
||
|
each path, parses the output, extracts strings, and returns {path: [strings]}.
|
||
|
|
||
|
This file can also be run stand-alone in order to test out the logic on smaller
|
||
|
sample sizes.
|
||
|
"""
|
||
|
|
||
|
from __future__ import print_function
|
||
|
|
||
|
import argparse
|
||
|
import os
|
||
|
import re
|
||
|
import subprocess
|
||
|
|
||
|
import concurrent
|
||
|
import path_util
|
||
|
|
||
|
|
||
|
# Upper bound on number of bytes per character in strings. 4-byte / 32-bit
|
||
|
# strings are rare and are likely confused with 32-bit int arrays. So by
|
||
|
# default, only accept up to 2-byte / 16-bit strings.
|
||
|
_CHAR_WIDTH_LIMIT = 2
|
||
|
|
||
|
_RE_SPLIT = re.compile(r'=(\d+)')
|
||
|
# <TYPE_BLOCK_ID> children tags that should not be counted as types.
|
||
|
# - <NUMENTRY> is meta data.
|
||
|
# - <STRUCT_NAME> with the following <STRUCT_NAMED> (or other tag) are counted
|
||
|
# as a single type entry.
|
||
|
_NON_TYPE_TAGS = set(['NUMENTRY', 'STRUCT_NAME'])
|
||
|
|
||
|
# Use bit-fields for tag types: 1 => Opening tag, 2 => Closed tag.
|
||
|
OPENING_TAG = 1
|
||
|
CLOSING_TAG = 2
|
||
|
SELF_CLOSING_TAG = OPENING_TAG | CLOSING_TAG
|
||
|
|
||
|
|
||
|
def _IsOpeningTag(tag_type):
|
||
|
return tag_type & 1
|
||
|
|
||
|
|
||
|
def _IsClosingTag(tag_type):
|
||
|
return tag_type & 2
|
||
|
|
||
|
|
||
|
def IsBitcodeFile(path):
|
||
|
try:
|
||
|
with open(path, 'rb') as f:
|
||
|
return f.read(4) == 'BC\xc0\xde'
|
||
|
except IOError:
|
||
|
return False
|
||
|
|
||
|
|
||
|
def ParseTag(line):
|
||
|
"""Heuristically parses a single-line tag from bcanalyzer dump.
|
||
|
|
||
|
Since input data are machine-generated, so we only need "good enough" parsing
|
||
|
logic that favors simplicity. For example, '</FOO/>' is accepted.
|
||
|
|
||
|
Args:
|
||
|
line: Stripped line that may have a single-line tag with trailing text.
|
||
|
|
||
|
Returns:
|
||
|
(tag_type, tag, attrib_pos) if successful, else (None) * 3. Details:
|
||
|
tag_type: One of {OPENING_TAG, CLOSING_TAG, SELF_CLOSING_TAG}.
|
||
|
tag: The tag name.
|
||
|
attrib_pos: Position in |line| to start parsing attributes.
|
||
|
"""
|
||
|
# <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4>
|
||
|
# ==> (OPENING_TAG, 'TYPE_BLOCK_ID', 14).
|
||
|
# <ARRAY abbrevid=9 op0=1 op1=7/> Trailing text!
|
||
|
# ==> (SELF_CLOSING_TAG, 'ARRAY', 6).
|
||
|
# </TYPE_BLOCK_ID>
|
||
|
# ==> (CLOSING_TAG, 'TYPE_BLOCK_ID', 15).
|
||
|
|
||
|
# Assumes |line| is stripped, i.e., so no indent and no trailing new line.
|
||
|
if len(line) < 2 or line[0] != '<':
|
||
|
return (None, None, None)
|
||
|
tag_type, pos = (CLOSING_TAG, 2) if line[1] == '/' else (OPENING_TAG, 1)
|
||
|
for i in xrange(pos, len(line)):
|
||
|
if not line[i].isalnum() and line[i] != '_':
|
||
|
if i == pos or not line[i] in ' >/':
|
||
|
break
|
||
|
end = line.find('>', i)
|
||
|
if end < 0:
|
||
|
break
|
||
|
if line[end - 1] == '/':
|
||
|
return (SELF_CLOSING_TAG, line[pos:i], i)
|
||
|
return (tag_type, line[pos:i], i)
|
||
|
return (None, None, None)
|
||
|
|
||
|
|
||
|
def _ParseOpItems(line, pos):
|
||
|
"""Heuristically extracts op0=# op1=# ... values from a single-line tag."""
|
||
|
# <SETTYPE abbrevid=4 op0=42/>
|
||
|
# ^ pos = 8
|
||
|
# ==> iter([42]).
|
||
|
# <CSTRING abbrevid=11 op0=84 op1=101 op2=115 op3=116 op4=56 op5=97/>
|
||
|
# ^ pos = 8
|
||
|
# ==> iter([84, 101, 115, 116, 56, 97]).
|
||
|
# <STRING abbrevid=9 op0=1 op1=0 op2=0 op3=1 op4=1 op5=0/>
|
||
|
# ^ pos = 7
|
||
|
# ==> iter([1, 0, 0, 1, 1, 0]).
|
||
|
# <DATA op0=8412 op1=101 op2=1150 op3=116 op4=5200 op5=98 op6=0/>
|
||
|
# ^ pos = 5
|
||
|
# ==> iter([8412, 101, 1150, 116, 5200, 98, 0]).
|
||
|
|
||
|
# In particular, skip 'abbrevid=#'.
|
||
|
start = line.index(' op', pos)
|
||
|
end = line.index('>', start)
|
||
|
for t in _RE_SPLIT.finditer(line[start:end]):
|
||
|
yield int(t.group(1))
|
||
|
|
||
|
|
||
|
# Emits uint16 values as a stream of 2 bytes (little-endian).
|
||
|
def _UnpackUint16ListToBytes(items):
|
||
|
for item in items:
|
||
|
yield item & 0xFF
|
||
|
yield (item >> 8) & 0xFF
|
||
|
|
||
|
|
||
|
# Emits uint32 values as a stream of 4 bytes (little-endian).
|
||
|
def _UnpackUint32ListToBytes(items):
|
||
|
for item in items:
|
||
|
yield item & 0xFF
|
||
|
yield (item >> 8) & 0xFF
|
||
|
yield (item >> 16) & 0xFF
|
||
|
yield (item >> 24) & 0xFF
|
||
|
|
||
|
|
||
|
class _BcIntArrayType:
|
||
|
"""The specs of an integer array type."""
|
||
|
|
||
|
# Lookup table to map from width to an unpacker that splits ints into bytes.
|
||
|
_UNPACKER_MAP = {
|
||
|
1: iter,
|
||
|
2: _UnpackUint16ListToBytes,
|
||
|
4: _UnpackUint32ListToBytes
|
||
|
}
|
||
|
|
||
|
def __init__(self, length, width):
|
||
|
# Number of elements in the array.
|
||
|
self.length = length
|
||
|
# Number of bytes per element.
|
||
|
self.width = width
|
||
|
|
||
|
def ParseOpItemsAsString(self, line, attrib_pos, add_null_at_end):
|
||
|
"""Reads op0=# op=# ... values and returns them as a list of bytes.
|
||
|
|
||
|
Interprets each op0=# op1=# ... value as a |self.width|-byte integer, splits
|
||
|
them into component bytes (little-endian), and returns the result as string.
|
||
|
|
||
|
Args:
|
||
|
line: Stripped line of single-line tag with op0=# op1=# ... data.
|
||
|
attrib_pos: Position in |line| where attribute list starts.
|
||
|
add_null_add_end: Whether to append |'\x00' * self.width|.
|
||
|
"""
|
||
|
items = _ParseOpItems(line, attrib_pos)
|
||
|
unpacker = _BcIntArrayType._UNPACKER_MAP[self.width]
|
||
|
s = ''.join(chr(t) for t in unpacker(items))
|
||
|
if add_null_at_end:
|
||
|
s += '\x00' * self.width
|
||
|
# Rather stringent check to ensure exact size match.
|
||
|
assert len(s) == self.length * self.width
|
||
|
return s
|
||
|
|
||
|
|
||
|
class _BcTypeInfo:
|
||
|
"""Stateful parser of <TYPE_BLOCK_ID>, specialized for integer arrays."""
|
||
|
|
||
|
# <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4>
|
||
|
# <NUMENTRY op0=9/> # Type ids should be in [0, 8].
|
||
|
# <INTEGER op0=8/> # Type id = 0: int8.
|
||
|
# <POINTER abbrevid=4 op0=0 op1=0/> # Type id = 1: Pointer to type id 0
|
||
|
# # ==> int8*.
|
||
|
# <ARRAY abbrevid=9 op0=4 op1=0/> # Type id = 2: Array with 4 elements
|
||
|
# # of type id 0 ==> int8[4]
|
||
|
# <STRUCT_NAME op0=115 op1=116 op2=114/> # Joins next Tag.
|
||
|
# <STRUCT_NAMED abbrevid=8 op0=0 op1=1/> # Type id = 3: Struct (unused).
|
||
|
# <FUNCTION abbrevid=5 op0=0 op1=12/> # Type id = 4: Function (unused).
|
||
|
# <INTEGER op0=16/> # Type id = 5: int16.
|
||
|
# <POINTER abbrevid=4 op0=5 op1=0/> # Type id = 6: Pointer to type id 5
|
||
|
# # ==> int16*.
|
||
|
# <INTEGER op0=32/> # Type id = 7: int32.
|
||
|
# <ARRAY abbrevid=9 op0=5 op1=4/> # Type id = 8: Array with 4 elements
|
||
|
# # of type id 5 ==> int16[4]
|
||
|
# <TYPE_BLOCK_ID>
|
||
|
|
||
|
def __init__(self):
|
||
|
# Auto-incrementing current type id.
|
||
|
self.cur_type_id = 0
|
||
|
# Maps from type id (of an integer) to number of bits.
|
||
|
self.int_types = {}
|
||
|
# Maps from type id (of an integer array) to _BcIntArrayType.
|
||
|
self.int_array_types = {}
|
||
|
|
||
|
def Feed(self, line, tag, attrib_pos):
|
||
|
"""Parses a single-line tag and store integer and integer array types.
|
||
|
|
||
|
Args:
|
||
|
line: Stripped line of single-line tag with op0=# op1=# ... data.
|
||
|
tag: The tag type in |line| (child tag of <TYPE_BLOCK_ID>).
|
||
|
attrib_pos: Position in |line| where attribute list starts.
|
||
|
"""
|
||
|
if tag in _NON_TYPE_TAGS:
|
||
|
return
|
||
|
if tag == 'INTEGER':
|
||
|
num_bits = next(_ParseOpItems(line, attrib_pos)) # op0.
|
||
|
self.int_types[self.cur_type_id] = num_bits
|
||
|
elif tag == 'ARRAY':
|
||
|
[size, item_type_id] = list(_ParseOpItems(line, attrib_pos)) # op0, op1.
|
||
|
bits = self.int_types.get(item_type_id)
|
||
|
if bits is not None: # |bits| can be None for non-int arrays.
|
||
|
self.int_array_types[self.cur_type_id] = _BcIntArrayType(size, bits / 8)
|
||
|
self.cur_type_id += 1
|
||
|
|
||
|
def GetArrayType(self, idx):
|
||
|
return self.int_array_types.get(idx)
|
||
|
|
||
|
|
||
|
def _ParseBcAnalyzer(lines):
|
||
|
"""A generator to extract strings from bcanalyzer dump of a BC file."""
|
||
|
|
||
|
# ...
|
||
|
# <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4>
|
||
|
# ... (See above; parsed by _BcTypeInfo)
|
||
|
# <TYPE_BLOCK_ID>
|
||
|
# ...
|
||
|
# <CONSTANTS_BLOCK NumWords=93 BlockCodeSize=4>
|
||
|
# <SETTYPE abbrevid=4 op0=1/> # Current type id := 1 ==> int8*.
|
||
|
# <CE_INBOUNDS_GEP op0=3 op1=4 op2=1 op3=12 op4=57 op5=12 op6=57/>
|
||
|
# <SETTYPE abbrevid=4 op0=2/> # Current type id := 2 ==> int8[4].
|
||
|
# <CSTRING abbrevid=11 op0=70 op1=111 op2=111/> record string = 'Foo'
|
||
|
# <STRING abbrevid=11 op0=70 op1=111 op2=111 op3=1/> # {'F','o','o',1}.
|
||
|
# <SETTYPE abbrevid=6 op0=7/> # Current type id := 7 ==> int32.
|
||
|
# <INTEGER abbrevid=5 op0=2000/> # Stores 1000.
|
||
|
# <INTEGER abbrevid=5 op0=2001/> # Stores -1000.
|
||
|
# <SETTYPE abbrevid=4 op0=8/> # Current type id := 8 ==> int16[4].
|
||
|
# <NULL/>
|
||
|
# <DATA abbrevid=11 op0=8400 op1=10100 op2=11500 op3=0/>
|
||
|
# </CONSTANTS_BLOCK>
|
||
|
# ...
|
||
|
|
||
|
# Notes:
|
||
|
# - Only parse first <TYPE_BLOCK_ID> and first <CONSTANTS_BLOCK>.
|
||
|
# - <CONSTANTS_BLOCK> is stateful: A "current type id" exists, and that's set
|
||
|
# by <SETTYPE>, with op0= referring to type id.
|
||
|
# - For array lengths one needs to refer to the corresponding <ARRAY>.
|
||
|
# - Strings / arrays are in <CSTRING>, <STRING>, and <DATA>.
|
||
|
# - abbrevid=# is redundant (repeats tag type) and unused
|
||
|
# - Character data are stored in op0=# op1=# ..., one per character. These
|
||
|
# values should fit in the proper range, and can be fairly large.
|
||
|
# - <CSTRING> has implicit 0 at end.
|
||
|
# - Data lengths agree with the length in the matching <ARRAY> entry.
|
||
|
# - "record string" text is not very useful: It only appears if all
|
||
|
# characters are printable.
|
||
|
# - Signed vs. unsigned types are undistinguished.
|
||
|
# - In <INTEGER>, the op0= value is stored as 2 * abs(x) + (signed ? 0 : 1).
|
||
|
# - In <ARRAY> of int, values are coerced to unsigned type.
|
||
|
# - Strings and int arrays are undistinguished.
|
||
|
# - <CSTRING>: If an uint8 array happens to end with 0, then this gets used!
|
||
|
# - Arrays (or integers) of all-0 appear as <NULL/>. Presumably this gets
|
||
|
# placed into .bss section.
|
||
|
|
||
|
STATE_VOID = 0
|
||
|
STATE_TYPE_BLOCK = 1
|
||
|
STATE_CONST_BLOCK = 2
|
||
|
state = STATE_VOID
|
||
|
|
||
|
type_info = None
|
||
|
consts_cur_type = None
|
||
|
|
||
|
# State machine to parse the *first* <TYPE_BLOCK_ID> to initialize
|
||
|
# |type_info|, then the *first* <CONSTANTS_BLOCK> to yield strings.
|
||
|
for line in lines:
|
||
|
line = line.lstrip()
|
||
|
(tag_type, tag, attrib_pos) = ParseTag(line)
|
||
|
if tag_type is None:
|
||
|
continue
|
||
|
if state == STATE_VOID:
|
||
|
if _IsOpeningTag(tag_type):
|
||
|
if tag == 'TYPE_BLOCK_ID':
|
||
|
if type_info is None:
|
||
|
state = STATE_TYPE_BLOCK
|
||
|
type_info = _BcTypeInfo()
|
||
|
elif tag == 'CONSTANTS_BLOCK':
|
||
|
if type_info is not None:
|
||
|
state = STATE_CONST_BLOCK
|
||
|
|
||
|
elif state == STATE_TYPE_BLOCK:
|
||
|
if _IsClosingTag(tag_type) and tag == 'TYPE_BLOCK_ID':
|
||
|
state = STATE_VOID
|
||
|
else:
|
||
|
type_info.Feed(line, tag, attrib_pos)
|
||
|
|
||
|
elif state == STATE_CONST_BLOCK:
|
||
|
if _IsClosingTag(tag_type) and tag == 'CONSTANTS_BLOCK':
|
||
|
# Skip remaining data, including subsequent <CONSTANTS_BLOCK>s.
|
||
|
break
|
||
|
elif tag == 'SETTYPE':
|
||
|
consts_cur_type_id = next(_ParseOpItems(line, attrib_pos)) # op0.
|
||
|
consts_cur_type = type_info.GetArrayType(consts_cur_type_id)
|
||
|
elif consts_cur_type and consts_cur_type.width <= _CHAR_WIDTH_LIMIT:
|
||
|
if tag in ['CSTRING', 'STRING', 'DATA']:
|
||
|
# Exclude 32-bit / 4-byte strings since they're rarely used, and are
|
||
|
# likely confused with 32-bit int arrays.
|
||
|
s = consts_cur_type.ParseOpItemsAsString(
|
||
|
line, attrib_pos, tag == 'CSTRING')
|
||
|
yield (consts_cur_type, s)
|
||
|
|
||
|
|
||
|
class _BcAnalyzerRunner:
|
||
|
"""Helper to run bcanalyzer and extract output lines. """
|
||
|
def __init__(self, tool_prefix, output_directory):
|
||
|
self._args = [path_util.GetBcAnalyzerPath(tool_prefix), '--dump',
|
||
|
'--disable-histogram']
|
||
|
self._output_directory = output_directory
|
||
|
|
||
|
def RunOnFile(self, obj_file):
|
||
|
output = subprocess.check_output(self._args + [obj_file],
|
||
|
cwd=self._output_directory)
|
||
|
return output.splitlines()
|
||
|
|
||
|
|
||
|
# This is a target for BulkForkAndCall().
|
||
|
def RunBcAnalyzerOnIntermediates(target, tool_prefix, output_directory):
|
||
|
"""Calls bcanalyzer and returns encoded map from path to strings.
|
||
|
|
||
|
Args:
|
||
|
target: A list of BC file paths.
|
||
|
"""
|
||
|
assert isinstance(target, list)
|
||
|
runner = _BcAnalyzerRunner(tool_prefix, output_directory)
|
||
|
strings_by_path = {}
|
||
|
for t in target:
|
||
|
strings_by_path[t] = [s for _, s in _ParseBcAnalyzer(runner.RunOnFile(t))]
|
||
|
# Escape strings by repr() so there will be no special characters to interfere
|
||
|
# concurrent.EncodeDictOfLists() and decoding.
|
||
|
return concurrent.EncodeDictOfLists(strings_by_path, value_transform=repr)
|
||
|
|
||
|
|
||
|
def main():
|
||
|
parser = argparse.ArgumentParser()
|
||
|
parser.add_argument('--tool-prefix', required=True)
|
||
|
parser.add_argument('--output-directory', default='.')
|
||
|
parser.add_argument('--char-width-limit', type=int)
|
||
|
parser.add_argument('objects', type=os.path.realpath, nargs='+')
|
||
|
|
||
|
args = parser.parse_args()
|
||
|
base_path = os.path.normpath(args.output_directory)
|
||
|
runner = _BcAnalyzerRunner(args.tool_prefix, args.output_directory)
|
||
|
if args.char_width_limit is not None:
|
||
|
global _CHAR_WIDTH_LIMIT
|
||
|
_CHAR_WIDTH_LIMIT = args.char_width_limit
|
||
|
|
||
|
for obj_path in args.objects:
|
||
|
rel_path = os.path.relpath(obj_path, base_path)
|
||
|
print('File: %s' % rel_path)
|
||
|
for cur_type, s in _ParseBcAnalyzer(runner.RunOnFile(obj_path)):
|
||
|
print(' char%d[%d]: %r' % (cur_type.width * 8, cur_type.length, s))
|
||
|
print('')
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
main()
|