#!/usr/bin/env python
# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Analyzer for Object Files.

This file works around Python's lack of concurrency.

_BulkObjectFileAnalyzerWorker:
  Performs the actual work. Uses Process Pools to shard out per-object-file
  work and then aggregates results.

_BulkObjectFileAnalyzerMaster:
  Creates a subprocess and sends IPCs to it asking it to do work.

_BulkObjectFileAnalyzerSlave:
  Receives IPCs and delegates logic to _BulkObjectFileAnalyzerWorker.
  Runs _BulkObjectFileAnalyzerWorker on a background thread in order to stay
  responsive to IPCs.

BulkObjectFileAnalyzer:
  Extracts information from .o files. Alias for _BulkObjectFileAnalyzerMaster,
  but when SUPERSIZE_DISABLE_ASYNC=1, alias for _BulkObjectFileAnalyzerWorker.
  * AnalyzePaths(): Processes all .o files to collect symbol names that exist
    within each. Does not work with thin archives (expand them first).
  * SortPaths(): Sorts results of AnalyzePaths().
  * AnalyzeStringLiterals(): Must be run after AnalyzePaths() has completed.
    Extracts string literals from .o files, and then locates them within the
    "** merge strings" sections within an ELF's .rodata section.
  * GetSymbolNames(): Accessor.
  * Close(): Disposes data.

This file can also be run stand-alone in order to test out the logic on smaller
sample sizes.
"""

from __future__ import print_function

import argparse
import atexit
import collections
import errno
import logging
import multiprocessing
import os
import Queue
import signal
import sys
import threading
import traceback

import bcanalyzer
import concurrent
import demangle
import nm
import string_extract


_MSG_ANALYZE_PATHS = 1
_MSG_SORT_PATHS = 2
_MSG_ANALYZE_STRINGS = 3
_MSG_GET_SYMBOL_NAMES = 4
_MSG_GET_STRINGS = 5
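# Each IPC message is a plain tuple sent over a multiprocessing.Pipe: the
# first element is one of the _MSG_* constants above, and any remaining
# elements are that message's arguments, e.g. (_MSG_ANALYZE_PATHS, payload).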

_active_pids = None


def _DecodePosition(x):
  # Encoded as "123:123".
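  # Example: _DecodePosition('123:456') returns (123, 456).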
  sep_idx = x.index(':')
  return (int(x[:sep_idx]), int(x[sep_idx + 1:]))


def _MakeToolPrefixAbsolute(tool_prefix):
  # Ensure tool_prefix is absolute so that CWD does not affect it.
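  # For example (hypothetical prefix), 'clang_bin/llvm-' would become
  # '/abs/cwd/clang_bin/llvm-', while a prefix with no path separator
  # (e.g. 'llvm-') is returned unchanged.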
  if os.path.sep in tool_prefix:
    # Use abspath() on the dirname to avoid it stripping a trailing /.
    dirname = os.path.dirname(tool_prefix)
    tool_prefix = os.path.abspath(dirname) + tool_prefix[len(dirname):]
  return tool_prefix


class _PathsByType:
  def __init__(self, arch, obj, bc):
    self.arch = arch
    self.obj = obj
    self.bc = bc


class _BulkObjectFileAnalyzerWorker(object):
  def __init__(self, tool_prefix, output_directory,
               track_string_literals=True):
    self._tool_prefix = _MakeToolPrefixAbsolute(tool_prefix)
    self._output_directory = output_directory
    self._track_string_literals = track_string_literals
    self._list_of_encoded_elf_string_ranges_by_path = None
    self._paths_by_name = collections.defaultdict(list)
    self._encoded_string_addresses_by_path_chunks = []
    self._encoded_strings_by_path_chunks = []

  def _ClassifyPaths(self, paths):
    """Classifies |paths| (.o and .a files) by file type into separate lists.

    Returns:
      A _PathsByType instance storing classified disjoint sublists of |paths|.
    """
    arch_paths = []
    obj_paths = []
    bc_paths = []
    for path in paths:
      if path.endswith('.a'):
        # .a files are typically system libraries containing .o files that are
        # ELF files (and never BC files).
        arch_paths.append(path)
      elif bcanalyzer.IsBitcodeFile(os.path.join(self._output_directory, path)):
        # Chromium build tools create BC files with .o extension. As a result,
        # IsBitcodeFile() is needed to distinguish BC files from ELF .o files.
        bc_paths.append(path)
      else:
        obj_paths.append(path)
    return _PathsByType(arch=arch_paths, obj=obj_paths, bc=bc_paths)

  def _MakeBatches(self, paths, size=None):
    if size is None:
      # Create 1-tuples of strings.
      return [(p,) for p in paths]
    # Create 1-tuples of arrays of strings.
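    # e.g. size=2 and paths=['a', 'b', 'c'] yields [(['a', 'b'],), (['c'],)].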
    return [(paths[i:i + size],) for i in xrange(0, len(paths), size)]

  def _DoBulkFork(self, runner, batches):
    # Order of the jobs doesn't matter since each job owns independent paths,
    # and our output is a dict where paths are the key.
    return concurrent.BulkForkAndCall(
        runner, batches, tool_prefix=self._tool_prefix,
        output_directory=self._output_directory)

  def _RunNm(self, paths_by_type):
    """Calls nm to get symbols and (for non-BC files) string addresses."""
    # Downstream functions rely upon .a not being grouped.
    batches = self._MakeBatches(paths_by_type.arch, None)
    # Combine object files and Bitcode files for nm.
    BATCH_SIZE = 50  # Arbitrarily chosen.
    batches.extend(
        self._MakeBatches(paths_by_type.obj + paths_by_type.bc, BATCH_SIZE))
    results = self._DoBulkFork(nm.RunNmOnIntermediates, batches)

    # Names are still mangled.
    all_paths_by_name = self._paths_by_name
    for encoded_syms, encoded_strs in results:
      symbol_names_by_path = concurrent.DecodeDictOfLists(encoded_syms)
      for path, names in symbol_names_by_path.iteritems():
        for name in names:
          all_paths_by_name[name].append(path)

      if encoded_strs != concurrent.EMPTY_ENCODED_DICT:
        self._encoded_string_addresses_by_path_chunks.append(encoded_strs)

  def _RunLlvmBcAnalyzer(self, paths_by_type):
    """Calls llvm-bcanalyzer to extract string data (for LLD-LTO)."""
    BATCH_SIZE = 50  # Arbitrarily chosen.
    batches = self._MakeBatches(paths_by_type.bc, BATCH_SIZE)
    results = self._DoBulkFork(
        bcanalyzer.RunBcAnalyzerOnIntermediates, batches)
    for encoded_strs in results:
      if encoded_strs != concurrent.EMPTY_ENCODED_DICT:
        self._encoded_strings_by_path_chunks.append(encoded_strs)

  def AnalyzePaths(self, paths):
    logging.debug('worker: AnalyzePaths() started.')
    paths_by_type = self._ClassifyPaths(paths)
    logging.info('File counts: {\'arch\': %d, \'obj\': %d, \'bc\': %d}',
                 len(paths_by_type.arch), len(paths_by_type.obj),
                 len(paths_by_type.bc))
    self._RunNm(paths_by_type)
    if self._track_string_literals:
      self._RunLlvmBcAnalyzer(paths_by_type)
    logging.debug('worker: AnalyzePaths() completed.')

  def SortPaths(self):
    # Demangle all names, which can result in some merging of lists.
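    # (Distinct mangled names can demangle to the same name, causing their
    # path lists to be combined.)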
    self._paths_by_name = demangle.DemangleKeysAndMergeLists(
        self._paths_by_name, self._tool_prefix)
    # Sort and uniquefy.
    for key in self._paths_by_name.iterkeys():
      self._paths_by_name[key] = sorted(set(self._paths_by_name[key]))

  def _ReadElfStringData(self, elf_path, elf_string_ranges):
    # Read string_data from elf_path, to be shared with forked processes.
    address, offset, _ = string_extract.LookupElfRodataInfo(
        elf_path, self._tool_prefix)
    adjust = address - offset
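    # |adjust| is the delta between .rodata's virtual address and its file
    # offset; subtracting it converts virtual addresses into file offsets.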
    abs_elf_string_ranges = (
        (addr - adjust, s) for addr, s in elf_string_ranges)
    return string_extract.ReadFileChunks(elf_path, abs_elf_string_ranges)

  def _GetEncodedRangesFromStringAddresses(self, string_data):
    params = ((chunk,)
              for chunk in self._encoded_string_addresses_by_path_chunks)
    # Order of the jobs doesn't matter since each job owns independent paths,
    # and our output is a dict where paths are the key.
    results = concurrent.BulkForkAndCall(
        string_extract.ResolveStringPiecesIndirect, params,
        string_data=string_data, tool_prefix=self._tool_prefix,
        output_directory=self._output_directory)
    return list(results)

  def _GetEncodedRangesFromStrings(self, string_data):
    params = ((chunk,) for chunk in self._encoded_strings_by_path_chunks)
    # Order of the jobs doesn't matter since each job owns independent paths,
    # and our output is a dict where paths are the key.
    results = concurrent.BulkForkAndCall(
        string_extract.ResolveStringPieces, params, string_data=string_data)
    return list(results)

  def AnalyzeStringLiterals(self, elf_path, elf_string_ranges):
    logging.debug('worker: AnalyzeStringLiterals() started.')
    string_data = self._ReadElfStringData(elf_path, elf_string_ranges)

    # [source_idx][batch_idx][section_idx] -> Encoded {path: [string_ranges]}.
    encoded_ranges_sources = [
        self._GetEncodedRangesFromStringAddresses(string_data),
        self._GetEncodedRangesFromStrings(string_data),
    ]
    # [section_idx] -> {path: [string_ranges]}.
    self._list_of_encoded_elf_string_ranges_by_path = []
    # Contract [source_idx] and [batch_idx], then decode and join.
    for section_idx in xrange(len(elf_string_ranges)):  # Fetch result.
      t = []
      for encoded_ranges in encoded_ranges_sources:  # [source_idx].
        t.extend([b[section_idx] for b in encoded_ranges])  # [batch_idx].
      self._list_of_encoded_elf_string_ranges_by_path.append(
          concurrent.JoinEncodedDictOfLists(t))
    logging.debug('worker: AnalyzeStringLiterals() completed.')

  def GetSymbolNames(self):
    return self._paths_by_name

  def GetStringPositions(self):
    return [concurrent.DecodeDictOfLists(x, value_transform=_DecodePosition)
            for x in self._list_of_encoded_elf_string_ranges_by_path]

  def GetEncodedStringPositions(self):
    return self._list_of_encoded_elf_string_ranges_by_path

  def Close(self):
    pass


def _TerminateSubprocesses():
  global _active_pids
  if _active_pids:
    for pid in _active_pids:
      os.kill(pid, signal.SIGKILL)
    _active_pids = []


class _BulkObjectFileAnalyzerMaster(object):
  """Runs BulkObjectFileAnalyzer in a subprocess."""
  def __init__(self, tool_prefix, output_directory,
               track_string_literals=True):
    self._tool_prefix = tool_prefix
    self._output_directory = output_directory
    self._track_string_literals = track_string_literals
    self._child_pid = None
    self._pipe = None

  def _Spawn(self):
    global _active_pids
    parent_conn, child_conn = multiprocessing.Pipe()
    self._child_pid = os.fork()
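    # After fork(), both processes hold both ends of the pipe; the parent
    # uses parent_conn and the child uses child_conn.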
    if self._child_pid:
      # We are the parent process.
      if _active_pids is None:
        _active_pids = []
        atexit.register(_TerminateSubprocesses)
      _active_pids.append(self._child_pid)
      self._pipe = parent_conn
    else:
      # We are the child process.
      logging.root.handlers[0].setFormatter(logging.Formatter(
          'obj_analyzer: %(levelname).1s %(relativeCreated)6d %(message)s'))
      worker_analyzer = _BulkObjectFileAnalyzerWorker(
          self._tool_prefix, self._output_directory,
          track_string_literals=self._track_string_literals)
      slave = _BulkObjectFileAnalyzerSlave(worker_analyzer, child_conn)
      slave.Run()

  def AnalyzePaths(self, paths):
    if self._child_pid is None:
      self._Spawn()

    logging.debug('Sending batch of %d paths to subprocess', len(paths))
    payload = '\x01'.join(paths)
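    # Paths are sent as one '\x01'-joined string; the slave splits them back
    # apart in _HandleMessage().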
    self._pipe.send((_MSG_ANALYZE_PATHS, payload))

  def SortPaths(self):
    self._pipe.send((_MSG_SORT_PATHS,))

  def AnalyzeStringLiterals(self, elf_path, elf_string_ranges):
    self._pipe.send((_MSG_ANALYZE_STRINGS, elf_path, elf_string_ranges))

  def GetSymbolNames(self):
    self._pipe.send((_MSG_GET_SYMBOL_NAMES,))
    self._pipe.recv()  # None
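    # The slave sends None only after its pending analysis jobs finish, so
    # this recv() also acts as a completion barrier.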
    logging.debug('Decoding nm results from forked process')
    encoded_paths_by_name = self._pipe.recv()
    return concurrent.DecodeDictOfLists(encoded_paths_by_name)

  def GetStringPositions(self):
    self._pipe.send((_MSG_GET_STRINGS,))
    self._pipe.recv()  # None
    logging.debug('Decoding string symbol results from forked process')
    result = self._pipe.recv()
    return [concurrent.DecodeDictOfLists(x, value_transform=_DecodePosition)
            for x in result]

  def Close(self):
    self._pipe.close()
    # Child process should terminate gracefully at this point, but leave it in
    # _active_pids to be killed just in case.


class _BulkObjectFileAnalyzerSlave(object):
  """The subprocess entry point."""
  def __init__(self, worker_analyzer, pipe):
    self._worker_analyzer = worker_analyzer
    self._pipe = pipe
    # Use a worker thread so that AnalyzeStringLiterals() is non-blocking. The
    # thread allows the main thread to process a call to GetSymbolNames()
    # while AnalyzeStringLiterals() is in progress.
    self._job_queue = Queue.Queue()
    self._worker_thread = threading.Thread(target=self._WorkerThreadMain)
    self._allow_analyze_paths = True

  def _WorkerThreadMain(self):
    while True:
      # Handle exceptions so test failure will be explicit and not block.
      try:
        func = self._job_queue.get()
        func()
      except Exception:
        traceback.print_exc()
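      # Mark the job done even if func() raised, so that join() in
      # _WaitForAnalyzePathJobs() cannot block forever.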
      self._job_queue.task_done()

  def _WaitForAnalyzePathJobs(self):
    if self._allow_analyze_paths:
      self._job_queue.join()
      self._allow_analyze_paths = False

  # Handle messages in a function outside the event loop, so local variables
  # are independent across messages, and can be bound to jobs by lambdas
  # using closures instead of functools.partial().
  def _HandleMessage(self, message):
    if message[0] == _MSG_ANALYZE_PATHS:
      assert self._allow_analyze_paths, (
          'Cannot call AnalyzePaths() after AnalyzeStringLiterals().')
      paths = message[1].split('\x01')
      self._job_queue.put(lambda: self._worker_analyzer.AnalyzePaths(paths))
    elif message[0] == _MSG_SORT_PATHS:
      assert self._allow_analyze_paths, (
          'Cannot call SortPaths() after AnalyzeStringLiterals().')
      self._job_queue.put(self._worker_analyzer.SortPaths)
    elif message[0] == _MSG_ANALYZE_STRINGS:
      self._WaitForAnalyzePathJobs()
      elf_path, string_positions = message[1:]
      self._job_queue.put(
          lambda: self._worker_analyzer.AnalyzeStringLiterals(
              elf_path, string_positions))
    elif message[0] == _MSG_GET_SYMBOL_NAMES:
      self._WaitForAnalyzePathJobs()
      self._pipe.send(None)
      paths_by_name = self._worker_analyzer.GetSymbolNames()
      self._pipe.send(concurrent.EncodeDictOfLists(paths_by_name))
    elif message[0] == _MSG_GET_STRINGS:
      self._job_queue.join()
      # Send a None packet so that the other side can measure IPC transfer
      # time.
      self._pipe.send(None)
      self._pipe.send(self._worker_analyzer.GetEncodedStringPositions())

  def Run(self):
    try:
      self._worker_thread.start()
      while True:
        self._HandleMessage(self._pipe.recv())
    except EOFError:
      pass
    except EnvironmentError, e:
      # Parent process exited, so don't log.
      if e.errno in (errno.EPIPE, errno.ECONNRESET):
        sys.exit(1)

    logging.debug('bulk subprocess finished.')
    sys.exit(0)


BulkObjectFileAnalyzer = _BulkObjectFileAnalyzerMaster
if concurrent.DISABLE_ASYNC:
  BulkObjectFileAnalyzer = _BulkObjectFileAnalyzerWorker
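
# Typical call sequence (a sketch; both aliased classes expose the same
# methods, mirroring main() below):
#   analyzer = BulkObjectFileAnalyzer(tool_prefix, output_directory)
#   analyzer.AnalyzePaths(paths)
#   analyzer.SortPaths()
#   names_to_paths = analyzer.GetSymbolNames()
#   analyzer.AnalyzeStringLiterals(elf_path, elf_string_ranges)
#   positions = analyzer.GetStringPositions()
#   analyzer.Close()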


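# Example stand-alone invocation (hypothetical paths and tool prefix):
#   ./obj_analyzer.py --tool-prefix=/path/to/llvm-bin/llvm- \
#       --output-directory=out/Release out/Release/obj/foo/*.o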
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('--multiprocess', action='store_true')
  parser.add_argument('--tool-prefix', required=True)
  parser.add_argument('--output-directory', required=True)
  parser.add_argument('--elf-file', type=os.path.realpath)
  parser.add_argument('--show-names', action='store_true')
  parser.add_argument('--show-strings', action='store_true')
  parser.add_argument('objects', type=os.path.realpath, nargs='+')

  args = parser.parse_args()
  logging.basicConfig(level=logging.DEBUG,
                      format='%(levelname).1s %(relativeCreated)6d %(message)s')

  if args.multiprocess:
    bulk_analyzer = _BulkObjectFileAnalyzerMaster(
        args.tool_prefix, args.output_directory)
  else:
    concurrent.DISABLE_ASYNC = True
    bulk_analyzer = _BulkObjectFileAnalyzerWorker(
        args.tool_prefix, args.output_directory)

  # Pass individually to test multiple calls.
  for path in args.objects:
    bulk_analyzer.AnalyzePaths([path])
  bulk_analyzer.SortPaths()

  names_to_paths = bulk_analyzer.GetSymbolNames()
  print('Found {} names'.format(len(names_to_paths)))
  if args.show_names:
    for name, paths in names_to_paths.iteritems():
      print('{}: {!r}'.format(name, paths))

  if args.elf_file:
    address, offset, size = string_extract.LookupElfRodataInfo(
        args.elf_file, args.tool_prefix)
    bulk_analyzer.AnalyzeStringLiterals(args.elf_file, ((address, size),))

    positions_by_path = bulk_analyzer.GetStringPositions()[0]
    print('Found {} string literals'.format(sum(
        len(v) for v in positions_by_path.itervalues())))
    if args.show_strings:
      logging.debug('.rodata adjust=%d', address - offset)
      for path, positions in positions_by_path.iteritems():
        strs = string_extract.ReadFileChunks(
            args.elf_file, ((offset + addr, size) for addr, size in positions))
        print('{}: {!r}'.format(
            path, [s if len(s) < 20 else s[:20] + '...' for s in strs]))


if __name__ == '__main__':
  main()