# Copyright 2017 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. """Main Python API for analyzing binary size.""" import argparse import calendar import collections import datetime import gzip import itertools import logging import os import posixpath import re import subprocess import sys import tempfile import zipfile import apkanalyzer import ar import concurrent import demangle import describe import file_format import function_signature import linker_map_parser import models import ninja_parser import nm import path_util sys.path.insert(1, os.path.join(path_util.SRC_ROOT, 'tools', 'grit')) from grit.format import data_pack # Holds computation state that is live only when an output directory exists. _OutputDirectoryContext = collections.namedtuple('_OutputDirectoryContext', [ 'elf_object_paths', # Only when elf_path is also provided. 'known_inputs', # Only when elf_path is also provided. 'output_directory', 'source_mapper', 'thin_archives', ]) # Tunable "knobs" for CreateSectionSizesAndSymbols(). class SectionSizeKnobs(object): def __init__(self): # A limit on the number of symbols an address can have, before these symbols # are compacted into shared symbols. Increasing this value causes more data # to be stored .size files, but is also more expensive. # Effect of max_same_name_alias_count (as of Oct 2017, with min_pss = max): # 1: shared .text syms = 1772874 bytes, file size = 9.43MiB (645476 syms). # 2: shared .text syms = 1065654 bytes, file size = 9.58MiB (669952 syms). # 6: shared .text syms = 464058 bytes, file size = 10.11MiB (782693 syms). # 10: shared .text syms = 365648 bytes, file size = 10.24MiB (813758 syms). # 20: shared .text syms = 86202 bytes, file size = 10.38MiB (854548 syms). # 40: shared .text syms = 48424 bytes, file size = 10.50MiB (890396 syms). # 50: shared .text syms = 41860 bytes, file size = 10.54MiB (902304 syms). # max: shared .text syms = 0 bytes, file size = 11.10MiB (1235449 syms). self.max_same_name_alias_count = 40 # 50kb is basically negligable. # An estimate of pak translation compression ratio to make comparisons # between .size files reasonable. Otherwise this can differ every pak # change. self.pak_compression_ratio = 0.33 # File name: Source file. self.apk_other_files = { 'assets/icudtl.dat': '../../third_party/icu/android/icudtl.dat', 'assets/snapshot_blob_32.bin': '../../v8/snapshot_blob_32.bin', 'assets/snapshot_blob_64.bin': '../../v8/snapshot_blob_64.bin', 'assets/natives_blob.bin': '../../v8/natives_blob.bin', 'assets/unwind_cfi_32': '../../base/trace_event/cfi_backtrace_android.cc', 'assets/webapk_dex_version.txt': ( '../../chrome/android/webapk/libs/runtime_library_version.gni'), 'lib/armeabi-v7a/libarcore_sdk_c_minimal.so': ( '../../third_party/arcore-android-sdk'), } self.apk_expected_other_files = set([ # From Monochrome.apk 'AndroidManifest.xml', 'resources.arsc', 'assets/AndroidManifest.xml', 'assets/metaresources.arsc', 'META-INF/CHROMIUM.SF', 'META-INF/CHROMIUM.RSA', 'META-INF/MANIFEST.MF', ]) def _OpenMaybeGz(path): """Calls `gzip.open()` if |path| ends in ".gz", otherwise calls `open()`.""" if path.endswith('.gz'): return gzip.open(path, 'rb') return open(path, 'rb') def _StripLinkerAddedSymbolPrefixes(raw_symbols): """Removes prefixes sometimes added to symbol names during link Removing prefixes make symbol names match up with those found in .o files. """ for symbol in raw_symbols: full_name = symbol.full_name if full_name.startswith('startup.'): symbol.flags |= models.FLAG_STARTUP symbol.full_name = full_name[8:] elif full_name.startswith('unlikely.'): symbol.flags |= models.FLAG_UNLIKELY symbol.full_name = full_name[9:] elif full_name.startswith('rel.local.'): symbol.flags |= models.FLAG_REL_LOCAL symbol.full_name = full_name[10:] elif full_name.startswith('rel.'): symbol.flags |= models.FLAG_REL symbol.full_name = full_name[4:] elif full_name.startswith('hot.'): symbol.flags |= models.FLAG_HOT symbol.full_name = full_name[4:] elif full_name.startswith('.L.str'): symbol.full_name = models.STRING_LITERAL_NAME def _NormalizeNames(raw_symbols): """Ensures that all names are formatted in a useful way. This includes: - Deriving |name| and |template_name| from |full_name|. - Stripping of return types (for functions). - Moving "vtable for" and the like to be suffixes rather than prefixes. """ found_prefixes = set() for symbol in raw_symbols: full_name = symbol.full_name # See comment in _CalculatePadding() about when this can happen. Don't # process names for non-native sections. if (full_name.startswith('*') or symbol.IsOverhead() or symbol.IsOther() or symbol.IsPak()): symbol.template_name = full_name symbol.name = full_name elif symbol.IsDex(): symbol.full_name, symbol.template_name, symbol.name = ( function_signature.ParseJava(full_name)) elif symbol.IsNative(): # Remove [clone] suffix, and set flag accordingly. # Search from left-to-right, as multiple [clone]s can exist. # Example name suffixes: # [clone .part.322] # GCC # [clone .isra.322] # GCC # [clone .constprop.1064] # GCC # [clone .11064] # clang # http://unix.stackexchange.com/questions/223013/function-symbol-gets-part-suffix-after-compilation idx = full_name.find(' [clone ') if idx != -1: full_name = full_name[:idx] symbol.flags |= models.FLAG_CLONE # Clones for C symbols. if symbol.section == 't': idx = full_name.rfind('.') if idx != -1 and full_name[idx + 1:].isdigit(): new_name = full_name[:idx] # Generated symbols that end with .123 but are not clones. # Find these via: # size_info.symbols.WhereInSection('t').WhereIsGroup().SortedByCount() if new_name not in ('__tcf_0', 'startup'): full_name = new_name symbol.flags |= models.FLAG_CLONE # Remove .part / .isra / .constprop. idx = full_name.rfind('.', 0, idx) if idx != -1: full_name = full_name[:idx] # E.g.: vtable for FOO idx = full_name.find(' for ', 0, 30) if idx != -1: found_prefixes.add(full_name[:idx + 4]) full_name = '{} [{}]'.format(full_name[idx + 5:], full_name[:idx]) # E.g.: virtual thunk to FOO idx = full_name.find(' to ', 0, 30) if idx != -1: found_prefixes.add(full_name[:idx + 3]) full_name = '{} [{}]'.format(full_name[idx + 4:], full_name[:idx]) # Strip out return type, and split out name, template_name. # Function parsing also applies to non-text symbols. # E.g. Function statics. symbol.full_name, symbol.template_name, symbol.name = ( function_signature.Parse(full_name)) # Remove anonymous namespaces (they just harm clustering). symbol.template_name = symbol.template_name.replace( '(anonymous namespace)::', '') symbol.full_name = symbol.full_name.replace( '(anonymous namespace)::', '') non_anonymous_name = symbol.name.replace('(anonymous namespace)::', '') if symbol.name != non_anonymous_name: symbol.flags |= models.FLAG_ANONYMOUS symbol.name = non_anonymous_name # Allow using "is" to compare names (and should help with RAM). This applies # to all symbols. function_signature.InternSameNames(symbol) logging.debug('Found name prefixes of: %r', found_prefixes) def _NormalizeObjectPath(path): if path.startswith('obj/'): # Convert obj/third_party/... -> third_party/... path = path[4:] elif path.startswith('../../'): # Convert ../../third_party/... -> third_party/... path = path[6:] if path.endswith(')'): # Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o so that hierarchical # breakdowns consider the .o part to be a separate node. start_idx = path.rindex('(') path = os.path.join(path[:start_idx], path[start_idx + 1:-1]) return path def _NormalizeSourcePath(path): """Returns (is_generated, normalized_path)""" if path.startswith('gen/'): # Convert gen/third_party/... -> third_party/... return True, path[4:] if path.startswith('../../'): # Convert ../../third_party/... -> third_party/... return False, path[6:] return True, path def _ExtractSourcePathsAndNormalizeObjectPaths(raw_symbols, source_mapper): """Fills in the |source_path| attribute and normalizes |object_path|.""" if source_mapper: logging.info('Looking up source paths from ninja files') for symbol in raw_symbols: object_path = symbol.object_path if symbol.IsDex() or symbol.IsOther(): if symbol.source_path: symbol.generated_source, symbol.source_path = _NormalizeSourcePath( symbol.source_path) elif object_path: # We don't have source info for prebuilt .a files. if not os.path.isabs(object_path) and not object_path.startswith('..'): source_path = source_mapper.FindSourceForPath(object_path) if source_path: symbol.generated_source, symbol.source_path = ( _NormalizeSourcePath(source_path)) symbol.object_path = _NormalizeObjectPath(object_path) assert source_mapper.unmatched_paths_count == 0, ( 'One or more source file paths could not be found. Likely caused by ' '.ninja files being generated at a different time than the .map file.') else: logging.info('Normalizing object paths') for symbol in raw_symbols: if symbol.object_path: symbol.object_path = _NormalizeObjectPath(symbol.object_path) def _ComputeAncestorPath(path_list, symbol_count): """Returns the common ancestor of the given paths.""" if not path_list: return '' prefix = os.path.commonprefix(path_list) # Check if all paths were the same. if prefix == path_list[0]: return prefix # Put in buckets to cut down on the number of unique paths. if symbol_count >= 100: symbol_count_str = '100+' elif symbol_count >= 50: symbol_count_str = '50-99' elif symbol_count >= 20: symbol_count_str = '20-49' elif symbol_count >= 10: symbol_count_str = '10-19' else: symbol_count_str = str(symbol_count) # Put the path count as a subdirectory so that grouping by path will show # "{shared}" as a bucket, and the symbol counts as leafs. if not prefix: return os.path.join('{shared}', symbol_count_str) return os.path.join(os.path.dirname(prefix), '{shared}', symbol_count_str) def _CompactLargeAliasesIntoSharedSymbols(raw_symbols, knobs): """Converts symbols with large number of aliases into single symbols. The merged symbol's path fields are changed to common-ancestor paths in the form: common/dir/{shared}/$SYMBOL_COUNT Assumes aliases differ only by path (not by name). """ num_raw_symbols = len(raw_symbols) num_shared_symbols = 0 src_cursor = 0 dst_cursor = 0 while src_cursor < num_raw_symbols: symbol = raw_symbols[src_cursor] raw_symbols[dst_cursor] = symbol dst_cursor += 1 aliases = symbol.aliases if aliases and len(aliases) > knobs.max_same_name_alias_count: symbol.source_path = _ComputeAncestorPath( [s.source_path for s in aliases if s.source_path], len(aliases)) symbol.object_path = _ComputeAncestorPath( [s.object_path for s in aliases if s.object_path], len(aliases)) symbol.generated_source = all(s.generated_source for s in aliases) symbol.aliases = None num_shared_symbols += 1 src_cursor += len(aliases) else: src_cursor += 1 raw_symbols[dst_cursor:] = [] num_removed = src_cursor - dst_cursor logging.debug('Converted %d aliases into %d shared-path symbols', num_removed, num_shared_symbols) def _ConnectNmAliases(raw_symbols): """Ensures |aliases| is set correctly for all symbols.""" prev_sym = raw_symbols[0] for sym in raw_symbols[1:]: # Don't merge bss symbols. if sym.address > 0 and prev_sym.address == sym.address: # Don't merge padding-only symbols (** symbol gaps). if prev_sym.size > 0: # Don't merge if already merged. if prev_sym.aliases is None or prev_sym.aliases is not sym.aliases: if prev_sym.aliases: prev_sym.aliases.append(sym) else: prev_sym.aliases = [prev_sym, sym] sym.aliases = prev_sym.aliases prev_sym = sym def _AssignNmAliasPathsAndCreatePathAliases(raw_symbols, object_paths_by_name): num_found_paths = 0 num_unknown_names = 0 num_path_mismatches = 0 num_aliases_created = 0 ret = [] for symbol in raw_symbols: ret.append(symbol) full_name = symbol.full_name if (symbol.IsBss() or symbol.IsStringLiteral() or not full_name or full_name[0] in '*.' or # e.g. ** merge symbols, .Lswitch.table full_name == 'startup'): continue object_paths = object_paths_by_name.get(full_name) if object_paths: num_found_paths += 1 else: if num_unknown_names < 10: logging.warning('Symbol not found in any .o files: %r', symbol) num_unknown_names += 1 continue if symbol.object_path and symbol.object_path not in object_paths: if num_path_mismatches < 10: logging.warning('Symbol path reported by .map not found by nm.') logging.warning('sym=%r', symbol) logging.warning('paths=%r', object_paths) object_paths.append(symbol.object_path) object_paths.sort() num_path_mismatches += 1 symbol.object_path = object_paths[0] if len(object_paths) > 1: # Create one symbol for each object_path. aliases = symbol.aliases or [symbol] symbol.aliases = aliases num_aliases_created += len(object_paths) - 1 for object_path in object_paths[1:]: new_sym = models.Symbol( symbol.section_name, symbol.size, address=symbol.address, full_name=full_name, object_path=object_path, aliases=aliases) aliases.append(new_sym) ret.append(new_sym) logging.debug('Cross-referenced %d symbols with nm output. ' 'num_unknown_names=%d num_path_mismatches=%d ' 'num_aliases_created=%d', num_found_paths, num_unknown_names, num_path_mismatches, num_aliases_created) return ret def _DiscoverMissedObjectPaths(raw_symbols, known_inputs): # Missing object paths are caused by .a files added by -l flags, which are not # listed as explicit inputs within .ninja rules. missed_inputs = set() for symbol in raw_symbols: path = symbol.object_path if path.endswith(')'): # Convert foo/bar.a(baz.o) -> foo/bar.a path = path[:path.rindex('(')] if path and path not in known_inputs: missed_inputs.add(path) return missed_inputs def _CreateMergeStringsReplacements(merge_string_syms, list_of_positions_by_object_path): """Creates replacement symbols for |merge_syms|.""" ret = [] STRING_LITERAL_NAME = models.STRING_LITERAL_NAME assert len(merge_string_syms) == len(list_of_positions_by_object_path) tups = itertools.izip(merge_string_syms, list_of_positions_by_object_path) for merge_sym, positions_by_object_path in tups: merge_sym_address = merge_sym.address new_symbols = [] ret.append(new_symbols) for object_path, positions in positions_by_object_path.iteritems(): for offset, size in positions: address = merge_sym_address + offset symbol = models.Symbol( models.SECTION_RODATA, size, address, STRING_LITERAL_NAME, object_path=object_path) new_symbols.append(symbol) logging.debug('Created %d string literal symbols', sum(len(x) for x in ret)) logging.debug('Sorting string literals') for symbols in ret: # In order to achieve a total ordering in the presense of aliases, need to # include both |address| and |object_path|. # In order to achieve consistent deduping, need to include |size|. symbols.sort(key=lambda x: (x.address, -x.size, x.object_path)) logging.debug('Deduping string literals') num_removed = 0 size_removed = 0 num_aliases = 0 for i, symbols in enumerate(ret): if not symbols: continue prev_symbol = symbols[0] new_symbols = [prev_symbol] for symbol in symbols[1:]: padding = symbol.address - prev_symbol.end_address if (prev_symbol.address == symbol.address and prev_symbol.size == symbol.size): # String is an alias. num_aliases += 1 aliases = prev_symbol.aliases if aliases: aliases.append(symbol) symbol.aliases = aliases else: aliases = [prev_symbol, symbol] prev_symbol.aliases = aliases symbol.aliases = aliases elif padding + symbol.size <= 0: # String is a substring of prior one. num_removed += 1 size_removed += symbol.size continue elif padding < 0: # String overlaps previous one. Adjust to not overlap. symbol.address -= padding symbol.size += padding new_symbols.append(symbol) prev_symbol = symbol ret[i] = new_symbols # Aliases come out in random order, so sort to be deterministic. ret[i].sort(key=lambda s: (s.address, s.object_path)) logging.debug( 'Removed %d overlapping string literals (%d bytes) & created %d aliases', num_removed, size_removed, num_aliases) return ret def _CalculatePadding(raw_symbols): """Populates the |padding| field based on symbol addresses. Symbols must already be sorted by |address|. """ seen_sections = set() for i, symbol in enumerate(raw_symbols[1:]): prev_symbol = raw_symbols[i] if symbol.IsOverhead(): # Overhead symbols are not actionable so should be padding-only. symbol.padding = symbol.size if prev_symbol.section_name != symbol.section_name: assert symbol.section_name not in seen_sections, ( 'Input symbols must be sorted by section, then address.') seen_sections.add(symbol.section_name) continue if (symbol.address <= 0 or prev_symbol.address <= 0 or not symbol.IsNative() or not prev_symbol.IsNative()): continue if symbol.address == prev_symbol.address: if symbol.aliases and symbol.aliases is prev_symbol.aliases: symbol.padding = prev_symbol.padding symbol.size = prev_symbol.size continue # Padding-only symbols happen for ** symbol gaps. assert prev_symbol.size_without_padding == 0, ( 'Found duplicate symbols:\n%r\n%r' % (prev_symbol, symbol)) padding = symbol.address - prev_symbol.end_address # These thresholds were found by experimenting with arm32 Chrome. # E.g.: Set them to 0 and see what warnings get logged, then take max value. # TODO(agrieve): See if these thresholds make sense for architectures # other than arm32. if (not symbol.full_name.startswith('*') and not symbol.IsStringLiteral() and ( symbol.section in 'rd' and padding >= 256 or symbol.section in 't' and padding >= 64)): # Should not happen. logging.warning('Large padding of %d between:\n A) %r\n B) %r' % ( padding, prev_symbol, symbol)) symbol.padding = padding symbol.size += padding assert symbol.size >= 0, ( 'Symbol has negative size (likely not sorted propertly): ' '%r\nprev symbol: %r' % (symbol, prev_symbol)) def _AddNmAliases(raw_symbols, names_by_address): """Adds symbols that were removed by identical code folding.""" # Step 1: Create list of (index_of_symbol, name_list). logging.debug('Creating alias list') replacements = [] num_new_symbols = 0 missing_names = collections.defaultdict(list) for i, s in enumerate(raw_symbols): # Don't alias padding-only symbols (e.g. ** symbol gap) if s.size_without_padding == 0: continue name_list = names_by_address.get(s.address) if name_list: if s.full_name not in name_list: missing_names[s.full_name].append(s.address) logging.warning('Name missing from aliases: %s %s', s.full_name, name_list) continue replacements.append((i, name_list)) num_new_symbols += len(name_list) - 1 if missing_names and logging.getLogger().isEnabledFor(logging.INFO): for address, names in names_by_address.iteritems(): for name in names: if name in missing_names: logging.info('Missing name %s is at address %x instead of [%s]' % (name, address, ','.join('%x' % a for a in missing_names[name]))) if float(num_new_symbols) / len(raw_symbols) < .05: logging.warning('Number of aliases is oddly low (%.0f%%). It should ' 'usually be around 25%%. Ensure --tool-prefix is correct. ', float(num_new_symbols) / len(raw_symbols) * 100) # Step 2: Create new symbols as siblings to each existing one. logging.debug('Creating %d new symbols from nm output', num_new_symbols) expected_num_symbols = len(raw_symbols) + num_new_symbols ret = [] prev_src = 0 for cur_src, name_list in replacements: ret += raw_symbols[prev_src:cur_src] prev_src = cur_src + 1 sym = raw_symbols[cur_src] # Create symbols (|sym| gets recreated and discarded). new_syms = [] for full_name in name_list: # Do not set |aliases| in order to avoid being pruned by # _CompactLargeAliasesIntoSharedSymbols(), which assumes aliases differ # only by path. The field will be set afterwards by _ConnectNmAliases(). new_syms.append(models.Symbol( sym.section_name, sym.size, address=sym.address, full_name=full_name)) ret += new_syms ret += raw_symbols[prev_src:] assert expected_num_symbols == len(ret) return ret def LoadAndPostProcessSizeInfo(path): """Returns a SizeInfo for the given |path|.""" logging.debug('Loading results from: %s', path) size_info = file_format.LoadSizeInfo(path) logging.info('Normalizing symbol names') _NormalizeNames(size_info.raw_symbols) logging.info('Calculating padding') _CalculatePadding(size_info.raw_symbols) logging.info('Loaded %d symbols', len(size_info.raw_symbols)) return size_info def CreateMetadata(map_path, elf_path, apk_path, tool_prefix, output_directory): metadata = None if elf_path: logging.debug('Constructing metadata') git_rev = _DetectGitRevision(os.path.dirname(elf_path)) architecture = _ArchFromElf(elf_path, tool_prefix) build_id = BuildIdFromElf(elf_path, tool_prefix) timestamp_obj = datetime.datetime.utcfromtimestamp(os.path.getmtime( elf_path)) timestamp = calendar.timegm(timestamp_obj.timetuple()) relative_tool_prefix = path_util.ToSrcRootRelative(tool_prefix) metadata = { models.METADATA_GIT_REVISION: git_rev, models.METADATA_ELF_ARCHITECTURE: architecture, models.METADATA_ELF_MTIME: timestamp, models.METADATA_ELF_BUILD_ID: build_id, models.METADATA_TOOL_PREFIX: relative_tool_prefix, } if output_directory: relative_to_out = lambda path: os.path.relpath(path, output_directory) gn_args = _ParseGnArgs(os.path.join(output_directory, 'args.gn')) metadata[models.METADATA_MAP_FILENAME] = relative_to_out(map_path) metadata[models.METADATA_ELF_FILENAME] = relative_to_out(elf_path) metadata[models.METADATA_GN_ARGS] = gn_args if apk_path: metadata[models.METADATA_APK_FILENAME] = relative_to_out(apk_path) metadata[models.METADATA_APK_SIZE] = os.path.getsize(apk_path) return metadata def _ResolveThinArchivePaths(raw_symbols, thin_archives): """Converts object_paths for thin archives to external .o paths.""" for symbol in raw_symbols: object_path = symbol.object_path if object_path.endswith(')'): start_idx = object_path.rindex('(') archive_path = object_path[:start_idx] if archive_path in thin_archives: subpath = object_path[start_idx + 1:-1] symbol.object_path = ar.CreateThinObjectPath(archive_path, subpath) def _ParseElfInfo(map_path, elf_path, tool_prefix, track_string_literals, outdir_context=None): """Adds ELF section sizes and symbols.""" if elf_path: # Run nm on the elf file to retrieve the list of symbol names per-address. # This list is required because the .map file contains only a single name # for each address, yet multiple symbols are often coalesced when they are # identical. This coalescing happens mainly for small symbols and for C++ # templates. Such symbols make up ~500kb of libchrome.so on Android. elf_nm_result = nm.CollectAliasesByAddressAsync(elf_path, tool_prefix) # Run nm on all .o/.a files to retrieve the symbol names within them. # The list is used to detect when mutiple .o files contain the same symbol # (e.g. inline functions), and to update the object_path / source_path # fields accordingly. # Looking in object files is required because the .map file choses a # single path for these symbols. # Rather than record all paths for each symbol, set the paths to be the # common ancestor of all paths. if outdir_context: bulk_analyzer = nm.BulkObjectFileAnalyzer( tool_prefix, outdir_context.output_directory) bulk_analyzer.AnalyzePaths(outdir_context.elf_object_paths) logging.info('Parsing Linker Map') with _OpenMaybeGz(map_path) as map_file: section_sizes, raw_symbols = ( linker_map_parser.MapFileParser().Parse(map_file)) if outdir_context and outdir_context.thin_archives: _ResolveThinArchivePaths(raw_symbols, outdir_context.thin_archives) if elf_path: logging.debug('Validating section sizes') elf_section_sizes = _SectionSizesFromElf(elf_path, tool_prefix) for k, v in elf_section_sizes.iteritems(): if v != section_sizes.get(k): logging.error('ELF file and .map file do not agree on section sizes.') logging.error('.map file: %r', section_sizes) logging.error('readelf: %r', elf_section_sizes) sys.exit(1) if elf_path and outdir_context: missed_object_paths = _DiscoverMissedObjectPaths( raw_symbols, outdir_context.known_inputs) missed_object_paths = ar.ExpandThinArchives( missed_object_paths, outdir_context.output_directory)[0] bulk_analyzer.AnalyzePaths(missed_object_paths) bulk_analyzer.SortPaths() if track_string_literals: merge_string_syms = [s for s in raw_symbols if s.full_name == '** merge strings' or s.full_name == '** lld merge strings'] # More likely for there to be a bug in supersize than an ELF to not have a # single string literal. assert merge_string_syms string_positions = [(s.address, s.size) for s in merge_string_syms] bulk_analyzer.AnalyzeStringLiterals(elf_path, string_positions) logging.info('Stripping linker prefixes from symbol names') _StripLinkerAddedSymbolPrefixes(raw_symbols) # Map file for some reason doesn't demangle all names. # Demangle prints its own log statement. demangle.DemangleRemainingSymbols(raw_symbols, tool_prefix) if elf_path: logging.info( 'Adding symbols removed by identical code folding (as reported by nm)') # This normally does not block (it's finished by this time). names_by_address = elf_nm_result.get() raw_symbols = _AddNmAliases(raw_symbols, names_by_address) if outdir_context: object_paths_by_name = bulk_analyzer.GetSymbolNames() logging.debug( 'Fetched path information for %d symbols from %d files', len(object_paths_by_name), len(outdir_context.elf_object_paths) + len(missed_object_paths)) # For aliases, this provides path information where there wasn't any. logging.info('Creating aliases for symbols shared by multiple paths') raw_symbols = _AssignNmAliasPathsAndCreatePathAliases( raw_symbols, object_paths_by_name) if track_string_literals: logging.info('Waiting for string literal extraction to complete.') list_of_positions_by_object_path = bulk_analyzer.GetStringPositions() bulk_analyzer.Close() if track_string_literals: logging.info('Deconstructing ** merge strings into literals') replacements = _CreateMergeStringsReplacements(merge_string_syms, list_of_positions_by_object_path) for merge_sym, literal_syms in itertools.izip( merge_string_syms, replacements): # Don't replace if no literals were found. if literal_syms: # Re-find the symbols since aliases cause their indices to change. idx = raw_symbols.index(merge_sym) # This assignment is a bit slow (causes array to be shifted), but # is fast enough since len(merge_string_syms) < 10. raw_symbols[idx:idx + 1] = literal_syms return section_sizes, raw_symbols def _ComputePakFileSymbols( file_name, contents, res_info, symbols_by_id, compression_ratio=1): id_map = {id(v): k for k, v in sorted(contents.resources.items(), reverse=True)} alias_map = {k: id_map[id(v)] for k, v in contents.resources.iteritems() if id_map[id(v)] != k} # Longest locale pak is es-419.pak if len(os.path.basename(file_name)) <= 9: section_name = models.SECTION_PAK_TRANSLATIONS else: section_name = models.SECTION_PAK_NONTRANSLATED overhead = (12 + 6) * compression_ratio # Header size plus extra offset symbols_by_id[file_name] = models.Symbol( section_name, overhead, full_name='{}: overhead'.format(file_name)) for resource_id in sorted(contents.resources): if resource_id in alias_map: # 4 extra bytes of metadata (2 16-bit ints) size = 4 resource_id = alias_map[resource_id] else: # 6 extra bytes of metadata (1 32-bit int, 1 16-bit int) size = len(contents.resources[resource_id]) + 6 name, source_path = res_info[resource_id] if resource_id not in symbols_by_id: full_name = '{}: {}'.format(source_path, name) symbols_by_id[resource_id] = models.Symbol( section_name, 0, address=resource_id, full_name=full_name) size *= compression_ratio symbols_by_id[resource_id].size += size class _ResourceSourceMapper(object): def __init__(self, apk_path, output_directory, knobs): self._knobs = knobs self._res_info = self._LoadResInfo(apk_path, output_directory) self._pattern_dollar_underscore = re.compile(r'\$(.*?)__\d+') self._pattern_version_suffix = re.compile(r'-v\d+/') @staticmethod def _ParseResInfoFile(res_info_path): with open(res_info_path, 'r') as info_file: res_info = {} renames = {} for line in info_file.readlines(): dest, source = line.strip().split(',') # Allow indirection due to renames. if dest.startswith('Rename:'): dest = dest.split(':', 1)[1] renames[dest] = source else: res_info[dest] = source for dest, renamed_dest in renames.iteritems(): # Allow one more level of indirection due to renaming renamed files renamed_dest = renames.get(renamed_dest, renamed_dest) actual_source = res_info.get(renamed_dest); if actual_source: res_info[dest] = actual_source return res_info def _LoadResInfo(self, apk_path, output_directory): apk_name = os.path.basename(apk_path) apk_res_info_name = apk_name + '.res.info' apk_res_info_path = os.path.join( output_directory, 'size-info', apk_res_info_name) res_info_without_root = self._ParseResInfoFile(apk_res_info_path) # We package resources in the res/ folder only in the apk. res_info = { os.path.join('res', dest): source for dest, source in res_info_without_root.iteritems() } res_info.update(self._knobs.apk_other_files) return res_info def FindSourceForPath(self, path): original_path = path # Sometimes android adds $ in front and __# before extension. path = self._pattern_dollar_underscore.sub(r'\1', path) ret = self._res_info.get(path) if ret: return ret # Android build tools may append extra -v flags for the root dir. path = self._pattern_version_suffix.sub('/', path) ret = self._res_info.get(path) if ret: return ret if original_path not in self._knobs.apk_expected_other_files: logging.warning('Unexpected file in apk: %s', original_path) return None def _ParsePakInfoFile(pak_info_path): with open(pak_info_path, 'r') as info_file: res_info = {} for line in info_file.readlines(): name, res_id, path = line.split(',') res_info[int(res_id)] = (name, path.strip()) return res_info def _ParsePakSymbols( section_sizes, object_paths, output_directory, symbols_by_id): for path in object_paths: whitelist_path = os.path.join(output_directory, path + '.whitelist') if (not os.path.exists(whitelist_path) or os.path.getsize(whitelist_path) == 0): continue with open(whitelist_path, 'r') as f: for line in f: resource_id = int(line.rstrip()) # There may be object files in static libraries that are removed by the # linker when there are no external references to its symbols. These # files may be included in object_paths which our apk does not use, # resulting in resource_ids that don't end up being in the final apk. if resource_id not in symbols_by_id: continue symbols_by_id[resource_id].object_path = path raw_symbols = sorted(symbols_by_id.values(), key=lambda s: (s.section_name, s.address)) raw_total = 0.0 int_total = 0 for symbol in raw_symbols: raw_total += symbol.size # We truncate rather than round to ensure that we do not over attribute. It # is easier to add another symbol to make up the difference. symbol.size = int(symbol.size) int_total += symbol.size # Attribute excess to translations since only those are compressed. raw_symbols.append(models.Symbol( models.SECTION_PAK_TRANSLATIONS, int(round(raw_total - int_total)), full_name='Overhead: Pak compression artifacts')) for symbol in raw_symbols: prev = section_sizes.setdefault(symbol.section_name, 0) section_sizes[symbol.section_name] = prev + symbol.size return raw_symbols def _ParseApkElfSectionSize(section_sizes, metadata, apk_elf_result): if metadata: logging.debug('Extracting section sizes from .so within .apk') apk_build_id, apk_section_sizes, elf_overhead_size = apk_elf_result.get() assert apk_build_id == metadata[models.METADATA_ELF_BUILD_ID], ( 'BuildID from apk_elf_result did not match') packed_section_name = None architecture = metadata[models.METADATA_ELF_ARCHITECTURE] # Packing occurs enabled only arm32 & arm64. if architecture == 'arm': packed_section_name = '.rel.dyn' elif architecture == 'arm64': packed_section_name = '.rela.dyn' if packed_section_name: logging.debug('Recording size of unpacked relocations') if packed_section_name not in section_sizes: logging.warning('Packed section not present: %s', packed_section_name) else: apk_section_sizes['%s (unpacked)' % packed_section_name] = ( section_sizes.get(packed_section_name)) return apk_section_sizes, elf_overhead_size return section_sizes, 0 def _ParseDexSymbols(section_sizes, apk_path, output_directory): symbols = apkanalyzer.CreateDexSymbols(apk_path, output_directory) prev = section_sizes.setdefault(models.SECTION_DEX, 0) section_sizes[models.SECTION_DEX] = prev + sum(s.size for s in symbols) return symbols def _ParseApkOtherSymbols(section_sizes, apk_path, apk_so_path, output_directory, knobs): res_source_mapper = _ResourceSourceMapper(apk_path, output_directory, knobs) apk_symbols = [] zip_info_total = 0 with zipfile.ZipFile(apk_path) as z: for zip_info in z.infolist(): zip_info_total += zip_info.compress_size # Skip main shared library, pak, and dex files as they are accounted for. if (zip_info.filename == apk_so_path or zip_info.filename.endswith('.dex') or zip_info.filename.endswith('.pak')): continue source_path = res_source_mapper.FindSourceForPath(zip_info.filename) if source_path is None: source_path = os.path.join(models.APK_PREFIX_PATH, zip_info.filename) apk_symbols.append(models.Symbol( models.SECTION_OTHER, zip_info.compress_size, source_path=source_path, full_name=zip_info.filename)) # Full name must disambiguate overhead_size = os.path.getsize(apk_path) - zip_info_total assert overhead_size >= 0, 'Apk overhead must be non-negative' zip_overhead_symbol = models.Symbol( models.SECTION_OTHER, overhead_size, full_name='Overhead: APK file') apk_symbols.append(zip_overhead_symbol) prev = section_sizes.setdefault(models.SECTION_OTHER, 0) section_sizes[models.SECTION_OTHER] = prev + sum(s.size for s in apk_symbols) return apk_symbols def _FindPakSymbolsFromApk(apk_path, output_directory, knobs): with zipfile.ZipFile(apk_path) as z: pak_zip_infos = (f for f in z.infolist() if f.filename.endswith('.pak')) apk_info_name = os.path.basename(apk_path) + '.pak.info' pak_info_path = os.path.join(output_directory, 'size-info', apk_info_name) res_info = _ParsePakInfoFile(pak_info_path) symbols_by_id = {} total_compressed_size = 0 total_uncompressed_size = 0 for zip_info in pak_zip_infos: contents = data_pack.ReadDataPackFromString(z.read(zip_info)) compression_ratio = 1.0 if zip_info.compress_size < zip_info.file_size: total_compressed_size += zip_info.compress_size total_uncompressed_size += zip_info.file_size compression_ratio = knobs.pak_compression_ratio _ComputePakFileSymbols( zip_info.filename, contents, res_info, symbols_by_id, compression_ratio=compression_ratio) if total_uncompressed_size > 0: actual_ratio = ( float(total_compressed_size) / total_uncompressed_size) logging.info('Pak Compression Ratio: %f Actual: %f Diff: %.0f', knobs.pak_compression_ratio, actual_ratio, (knobs.pak_compression_ratio - actual_ratio) * total_uncompressed_size) return symbols_by_id def _FindPakSymbolsFromFiles(pak_files, pak_info_path, output_directory): """Uses files from args to find and add pak symbols.""" res_info = _ParsePakInfoFile(pak_info_path) symbols_by_id = {} for pak_file_path in pak_files: with open(pak_file_path, 'r') as f: contents = data_pack.ReadDataPackFromString(f.read()) _ComputePakFileSymbols( os.path.relpath(pak_file_path, output_directory), contents, res_info, symbols_by_id) return symbols_by_id def _CalculateElfOverhead(section_sizes, elf_path): if elf_path: section_sizes_total_without_bss = sum( s for k, s in section_sizes.iteritems() if k != models.SECTION_BSS) elf_overhead_size = ( os.path.getsize(elf_path) - section_sizes_total_without_bss) assert elf_overhead_size >= 0, ( 'Negative ELF overhead {}'.format(elf_overhead_size)) return elf_overhead_size return 0 def CreateSectionSizesAndSymbols( map_path=None, tool_prefix=None, output_directory=None, elf_path=None, apk_path=None, track_string_literals=True, metadata=None, apk_so_path=None, pak_files=None, pak_info_file=None, knobs=SectionSizeKnobs()): """Creates sections sizes and symbols for a SizeInfo. Args: map_path: Path to the linker .map(.gz) file to parse. elf_path: Path to the corresponding unstripped ELF file. Used to find symbol aliases and inlined functions. Can be None. tool_prefix: Prefix for c++filt & nm (required). output_directory: Build output directory. If None, source_paths and symbol alias information will not be recorded. track_string_literals: Whether to break down "** merge string" sections into smaller symbols (requires output_directory). """ if apk_path and elf_path: # Extraction takes around 1 second, so do it in parallel. apk_elf_result = concurrent.ForkAndCall( _ElfInfoFromApk, (apk_path, apk_so_path, tool_prefix)) outdir_context = None source_mapper = None if output_directory: # Start by finding the elf_object_paths, so that nm can run on them while # the linker .map is being parsed. logging.info('Parsing ninja files.') source_mapper, ninja_elf_object_paths = ( ninja_parser.Parse(output_directory, elf_path)) logging.debug('Parsed %d .ninja files.', source_mapper.parsed_file_count) assert not elf_path or ninja_elf_object_paths, ( 'Failed to find link command in ninja files for ' + os.path.relpath(elf_path, output_directory)) if ninja_elf_object_paths: elf_object_paths, thin_archives = ar.ExpandThinArchives( ninja_elf_object_paths, output_directory) known_inputs = set(elf_object_paths) known_inputs.update(ninja_elf_object_paths) else: elf_object_paths = None known_inputs = None # When we don't know which elf file is used, just search all paths. thin_archives = set( p for p in source_mapper.IterAllPaths() if p.endswith('.a') and ar.IsThinArchive( os.path.join(output_directory, p))) outdir_context = _OutputDirectoryContext( elf_object_paths=elf_object_paths, known_inputs=known_inputs, output_directory=output_directory, source_mapper=source_mapper, thin_archives=thin_archives) section_sizes, raw_symbols = _ParseElfInfo( map_path, elf_path, tool_prefix, track_string_literals, outdir_context) elf_overhead_size = _CalculateElfOverhead(section_sizes, elf_path) pak_symbols_by_id = None if apk_path: pak_symbols_by_id = _FindPakSymbolsFromApk(apk_path, output_directory, knobs) if elf_path: section_sizes, elf_overhead_size = _ParseApkElfSectionSize( section_sizes, metadata, apk_elf_result) raw_symbols.extend( _ParseDexSymbols(section_sizes, apk_path, output_directory)) raw_symbols.extend( _ParseApkOtherSymbols(section_sizes, apk_path, apk_so_path, output_directory, knobs)) elif pak_files and pak_info_file: pak_symbols_by_id = _FindPakSymbolsFromFiles( pak_files, pak_info_file, output_directory) if elf_path: elf_overhead_symbol = models.Symbol( models.SECTION_OTHER, elf_overhead_size, full_name='Overhead: ELF file') prev = section_sizes.setdefault(models.SECTION_OTHER, 0) section_sizes[models.SECTION_OTHER] = prev + elf_overhead_size raw_symbols.append(elf_overhead_symbol) if pak_symbols_by_id: object_paths = (p for p in source_mapper.IterAllPaths() if p.endswith('.o')) pak_raw_symbols = _ParsePakSymbols( section_sizes, object_paths, output_directory, pak_symbols_by_id) raw_symbols.extend(pak_raw_symbols) _ExtractSourcePathsAndNormalizeObjectPaths(raw_symbols, source_mapper) logging.info('Converting excessive aliases into shared-path symbols') _CompactLargeAliasesIntoSharedSymbols(raw_symbols, knobs) logging.debug('Connecting nm aliases') _ConnectNmAliases(raw_symbols) return section_sizes, raw_symbols def CreateSizeInfo( section_sizes, raw_symbols, metadata=None, normalize_names=True): """Performs operations on all symbols and creates a SizeInfo object.""" logging.debug('Sorting %d symbols', len(raw_symbols)) # TODO(agrieve): Either change this sort so that it's only sorting by section # (and not using .sort()), or have it specify a total ordering (which must # also include putting padding-only symbols before others of the same # address). Note: The sort as-is takes ~1.5 seconds. raw_symbols.sort(key=lambda s: ( s.IsPak(), s.IsBss(), s.section_name, s.address)) logging.info('Processed %d symbols', len(raw_symbols)) # Padding not really required, but it is useful to check for large padding and # log a warning. logging.info('Calculating padding') _CalculatePadding(raw_symbols) # Do not call _NormalizeNames() during archive since that method tends to need # tweaks over time. Calling it only when loading .size files allows for more # flexability. if normalize_names: _NormalizeNames(raw_symbols) return models.SizeInfo(section_sizes, raw_symbols, metadata=metadata) def _DetectGitRevision(directory): try: git_rev = subprocess.check_output( ['git', '-C', directory, 'rev-parse', 'HEAD']) return git_rev.rstrip() except Exception: logging.warning('Failed to detect git revision for file metadata.') return None def BuildIdFromElf(elf_path, tool_prefix): args = [path_util.GetReadElfPath(tool_prefix), '-n', elf_path] stdout = subprocess.check_output(args) match = re.search(r'Build ID: (\w+)', stdout) assert match, 'Build ID not found from running: ' + ' '.join(args) return match.group(1) def _SectionSizesFromElf(elf_path, tool_prefix): args = [path_util.GetReadElfPath(tool_prefix), '-S', '--wide', elf_path] stdout = subprocess.check_output(args) section_sizes = {} # Matches [ 2] .hash HASH 00000000006681f0 0001f0 003154 04 A 3 0 8 for match in re.finditer(r'\[[\s\d]+\] (\..*)$', stdout, re.MULTILINE): items = match.group(1).split() section_sizes[items[0]] = int(items[4], 16) return section_sizes def _ArchFromElf(elf_path, tool_prefix): args = [path_util.GetReadElfPath(tool_prefix), '-h', elf_path] stdout = subprocess.check_output(args) machine = re.search('Machine:\s*(.+)', stdout).group(1) if machine == 'Intel 80386': return 'x86' if machine == 'Advanced Micro Devices X86-64': return 'x64' elif machine == 'ARM': return 'arm' elif machine == 'AArch64': return 'arm64' return machine def _ParseGnArgs(args_path): """Returns a list of normalized "key=value" strings.""" args = {} with open(args_path) as f: for l in f: # Strips #s even if within string literal. Not a problem in practice. parts = l.split('#')[0].split('=') if len(parts) != 2: continue args[parts[0].strip()] = parts[1].strip() return ["%s=%s" % x for x in sorted(args.iteritems())] def _DetectLinkerName(map_path): with _OpenMaybeGz(map_path) as map_file: return linker_map_parser.DetectLinkerNameFromMapFileHeader(next(map_file)) def _ElfInfoFromApk(apk_path, apk_so_path, tool_prefix): """Returns a tuple of (build_id, section_sizes).""" with zipfile.ZipFile(apk_path) as apk, \ tempfile.NamedTemporaryFile() as f: f.write(apk.read(apk_so_path)) f.flush() build_id = BuildIdFromElf(f.name, tool_prefix) section_sizes = _SectionSizesFromElf(f.name, tool_prefix) elf_overhead_size = _CalculateElfOverhead(section_sizes, f.name) return build_id, section_sizes, elf_overhead_size def _AutoIdentifyInputFile(args): file_output = subprocess.check_output(['file', args.f]) format_text = file_output[file_output.find(': ') + 2:] # File-not-found -> 'cannot ...' and directory -> 'directory', which don't # match anything here, so they are handled by the final 'return False'. if (format_text.startswith('Java archive data') or format_text.startswith('Zip archive data')): logging.info('Auto-identified --apk-file.') args.apk_file = args.f return True if format_text.startswith('ELF '): logging.info('Auto-identified --elf-file.') args.elf_file = args.f return True if format_text.startswith('ASCII text'): logging.info('Auto-identified --map-file.') args.map_file = args.f return True return False def AddMainPathsArguments(parser): """Add arguments for DeduceMainPaths().""" parser.add_argument('-f', metavar='FILE', help='Auto-identify input file type.') parser.add_argument('--apk-file', help='.apk file to measure. When set, --elf-file will be ' 'derived (if unset). Providing the .apk allows ' 'for the size of packed relocations to be recorded') parser.add_argument('--elf-file', help='Path to input ELF file. Currently used for ' 'capturing metadata.') parser.add_argument('--map-file', help='Path to input .map(.gz) file. Defaults to ' '{{elf_file}}.map(.gz)?. If given without ' '--elf-file, no size metadata will be recorded.') parser.add_argument('--no-source-paths', action='store_true', help='Do not use .ninja files to map ' 'object_path -> source_path') parser.add_argument('--output-directory', help='Path to the root build directory.') parser.add_argument('--tool-prefix', help='Path prefix for c++filt, nm, readelf.') def AddArguments(parser): parser.add_argument('size_file', help='Path to output .size file.') parser.add_argument('--pak-file', action='append', help='Paths to pak files.') parser.add_argument('--pak-info-file', help='This file should contain all ids found in the pak ' 'files that have been passed in.') parser.add_argument('--no-string-literals', dest='track_string_literals', default=True, action='store_false', help='Disable breaking down "** merge strings" into more ' 'granular symbols.') AddMainPathsArguments(parser) def DeduceMainPaths(args, parser): """Computes main paths based on input, and deduces them if needed.""" if args.f is not None: if not _AutoIdentifyInputFile(args): parser.error('Cannot find or identify file %s' % args.f) apk_path = args.apk_file elf_path = args.elf_file map_path = args.map_file any_input = apk_path or elf_path or map_path if not any_input: parser.error('Must pass at least one of --apk-file, --elf-file, --map-file') output_directory_finder = path_util.OutputDirectoryFinder( value=args.output_directory, any_path_within_output_directory=any_input) apk_so_path = None if apk_path: with zipfile.ZipFile(apk_path) as z: lib_infos = [f for f in z.infolist() if f.filename.endswith('.so') and f.file_size > 0] assert lib_infos, 'APK has no .so files.' # TODO(agrieve): Add support for multiple .so files, and take into account # secondary architectures. apk_so_path = max(lib_infos, key=lambda x:x.file_size).filename logging.debug('Sub-apk path=%s', apk_so_path) if not elf_path and output_directory_finder.Tentative(): elf_path = os.path.join( output_directory_finder.Tentative(), 'lib.unstripped', os.path.basename(apk_so_path.replace('crazy.', ''))) logging.debug('Detected --elf-file=%s', elf_path) if map_path: if not map_path.endswith('.map') and not map_path.endswith('.map.gz'): parser.error('Expected --map-file to end with .map or .map.gz') else: map_path = elf_path + '.map' if not os.path.exists(map_path): map_path += '.gz' if not os.path.exists(map_path): parser.error('Could not find .map(.gz)? file. Ensure you have built with ' 'is_official_build=true, or use --map-file to point me a ' 'linker map file.') linker_name = _DetectLinkerName(map_path) tool_prefix_finder = path_util.ToolPrefixFinder( value=args.tool_prefix, output_directory_finder=output_directory_finder, linker_name=linker_name) tool_prefix = tool_prefix_finder.Finalized() output_directory = None if not args.no_source_paths: output_directory = output_directory_finder.Finalized() return (output_directory, tool_prefix, apk_path, apk_so_path, elf_path, map_path) def Run(args, parser): if not args.size_file.endswith('.size'): parser.error('size_file must end with .size') (output_directory, tool_prefix, apk_path, apk_so_path, elf_path, map_path) = ( DeduceMainPaths(args, parser)) metadata = CreateMetadata(map_path, elf_path, apk_path, tool_prefix, output_directory) section_sizes, raw_symbols = CreateSectionSizesAndSymbols( map_path=map_path, tool_prefix=tool_prefix, elf_path=elf_path, apk_path=apk_path, output_directory=output_directory, track_string_literals=args.track_string_literals, metadata=metadata, apk_so_path=apk_so_path, pak_files=args.pak_file, pak_info_file=args.pak_info_file) size_info = CreateSizeInfo( section_sizes, raw_symbols, metadata=metadata, normalize_names=False) if logging.getLogger().isEnabledFor(logging.INFO): for line in describe.DescribeSizeInfoCoverage(size_info): logging.info(line) logging.info('Recorded info for %d symbols', len(size_info.raw_symbols)) logging.info('Recording metadata: \n %s', '\n '.join(describe.DescribeMetadata(size_info.metadata))) logging.info('Saving result to %s', args.size_file) file_format.SaveSizeInfo(size_info, args.size_file) size_in_mb = os.path.getsize(args.size_file) / 1024.0 / 1024.0 logging.info('Done. File size is %.2fMiB.', size_in_mb)