# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Main Python API for analyzing binary size."""

import argparse
import calendar
import collections
import datetime
import gzip
import itertools
import logging
import os
import posixpath
import re
import subprocess
import sys
import tempfile
import zipfile

import apkanalyzer
import ar
import concurrent
import demangle
import describe
import file_format
import function_signature
import linker_map_parser
import models
import ninja_parser
import nm
import obj_analyzer
import path_util

sys.path.insert(1, os.path.join(path_util.SRC_ROOT, 'tools', 'grit'))
from grit.format import data_pack

_OWNERS_FILENAME = 'OWNERS'
_COMPONENT_REGEX = re.compile(r'\s*#\s*COMPONENT\s*:\s*(\S+)')
_FILE_PATH_REGEX = re.compile(r'\s*file://(\S+)')


# Holds computation state that is live only when an output directory exists.
_OutputDirectoryContext = collections.namedtuple('_OutputDirectoryContext', [
    'elf_object_paths',  # Only when elf_path is also provided.
    'known_inputs',  # Only when elf_path is also provided.
    'output_directory',
    'source_mapper',
    'thin_archives',
])


# Tunable "knobs" for CreateSectionSizesAndSymbols().
class SectionSizeKnobs(object):
  def __init__(self):
    # A limit on the number of symbols an address can have, before these
    # symbols are compacted into shared symbols. Increasing this value causes
    # more data to be stored in .size files, but is also more expensive.
    # Effect of max_same_name_alias_count (as of Oct 2017, with min_pss = max):
    # 1: shared .text syms = 1772874 bytes, file size = 9.43MiB (645476 syms).
    # 2: shared .text syms = 1065654 bytes, file size = 9.58MiB (669952 syms).
    # 6: shared .text syms = 464058 bytes, file size = 10.11MiB (782693 syms).
    # 10: shared .text syms = 365648 bytes, file size = 10.24MiB (813758 syms).
    # 20: shared .text syms = 86202 bytes, file size = 10.38MiB (854548 syms).
    # 40: shared .text syms = 48424 bytes, file size = 10.50MiB (890396 syms).
    # 50: shared .text syms = 41860 bytes, file size = 10.54MiB (902304 syms).
    # max: shared .text syms = 0 bytes, file size = 11.10MiB (1235449 syms).
    self.max_same_name_alias_count = 40  # 50kb is basically negligible.

    # An estimate of pak translation compression ratio to make comparisons
    # between .size files reasonable. Otherwise this can differ every pak
    # change.
    self.pak_compression_ratio = 0.33
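
    # Tuning sketch (illustrative, hypothetical values): callers that only
    # need aggregate numbers can trade alias detail for smaller .size files
    # by overriding these attributes after construction, e.g.:
    #   knobs = SectionSizeKnobs()
    #   knobs.max_same_name_alias_count = 10  # More compaction, less detail.
    #   knobs.pak_compression_ratio = 0.35  # If your paks compress better.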

    # File name: Source file.
    self.apk_other_files = {
        'assets/icudtl.dat': '../../third_party/icu/android/icudtl.dat',
        'assets/snapshot_blob_32.bin': '../../v8/snapshot_blob_32.bin',
        'assets/snapshot_blob_64.bin': '../../v8/snapshot_blob_64.bin',
        'assets/natives_blob.bin': '../../v8/natives_blob.bin',
        'assets/unwind_cfi_32': (
            '../../base/trace_event/cfi_backtrace_android.cc'),
        'assets/webapk_dex_version.txt': (
            '../../chrome/android/webapk/libs/runtime_library_version.gni'),
        'lib/armeabi-v7a/libarcore_sdk_c_minimal.so': (
            '../../third_party/arcore-android-sdk'),
    }

    self.apk_expected_other_files = set([
        # From Monochrome.apk
        'AndroidManifest.xml',
        'resources.arsc',
        'assets/AndroidManifest.xml',
        'assets/metaresources.arsc',
        'META-INF/CHROMIUM.SF',
        'META-INF/CHROMIUM.RSA',
        'META-INF/MANIFEST.MF',
    ])

    self.src_root = path_util.SRC_ROOT


def _OpenMaybeGz(path):
  """Calls `gzip.open()` if |path| ends in ".gz", otherwise calls `open()`."""
  if path.endswith('.gz'):
    return gzip.open(path, 'rb')
  return open(path, 'rb')


def _StripLinkerAddedSymbolPrefixes(raw_symbols):
  """Removes prefixes sometimes added to symbol names during linking.

  Removing prefixes makes symbol names match up with those found in .o files.
  """
  for symbol in raw_symbols:
    full_name = symbol.full_name
    if full_name.startswith('startup.'):
      symbol.flags |= models.FLAG_STARTUP
      symbol.full_name = full_name[8:]
    elif full_name.startswith('unlikely.'):
      symbol.flags |= models.FLAG_UNLIKELY
      symbol.full_name = full_name[9:]
    elif full_name.startswith('rel.local.'):
      symbol.flags |= models.FLAG_REL_LOCAL
      symbol.full_name = full_name[10:]
    elif full_name.startswith('rel.'):
      symbol.flags |= models.FLAG_REL
      symbol.full_name = full_name[4:]
    elif full_name.startswith('hot.'):
      symbol.flags |= models.FLAG_HOT
      symbol.full_name = full_name[4:]
    elif full_name.startswith('.L.str'):
      symbol.full_name = models.STRING_LITERAL_NAME
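

# For example (illustrative symbol name): the linker map may contain
# "unlikely.DoRareThing()"; after stripping, the symbol becomes
# "DoRareThing()" with models.FLAG_UNLIKELY set, which matches the name that
# nm reports for the corresponding .o file.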


def _NormalizeNames(raw_symbols):
  """Ensures that all names are formatted in a useful way.

  This includes:
    - Deriving |name| and |template_name| from |full_name|.
    - Stripping of return types (for functions).
    - Moving "vtable for" and the like to be suffixes rather than prefixes.
  """
  found_prefixes = set()
  for symbol in raw_symbols:
    full_name = symbol.full_name

    # See comment in _CalculatePadding() about when this can happen. Don't
    # process names for non-native sections.
    if symbol.IsPak():
      # full_name: "about_ui_resources.grdp: IDR_ABOUT_UI_CREDITS_HTML".
      space_idx = full_name.rindex(' ')
      name = full_name[space_idx + 1:]
      symbol.template_name = name
      symbol.name = name
    elif (full_name.startswith('*') or symbol.IsOverhead() or
          symbol.IsOther()):
      symbol.template_name = full_name
      symbol.name = full_name
    elif symbol.IsDex():
      symbol.full_name, symbol.template_name, symbol.name = (
          function_signature.ParseJava(full_name))
    elif symbol.IsNative():
      # Remove [clone] suffix, and set flag accordingly.
      # Search from left-to-right, as multiple [clone]s can exist.
      # Example name suffixes:
      #     [clone .part.322]  # GCC
      #     [clone .isra.322]  # GCC
      #     [clone .constprop.1064]  # GCC
      #     [clone .11064]  # clang
      # http://unix.stackexchange.com/questions/223013/function-symbol-gets-part-suffix-after-compilation
      idx = full_name.find(' [clone ')
      if idx != -1:
        full_name = full_name[:idx]
        symbol.flags |= models.FLAG_CLONE

      # Clones for C symbols.
      if symbol.section == 't':
        idx = full_name.rfind('.')
        if idx != -1 and full_name[idx + 1:].isdigit():
          new_name = full_name[:idx]
          # Generated symbols that end with .123 but are not clones.
          # Find these via:
          # size_info.symbols.WhereInSection('t').WhereIsGroup().SortedByCount()
          if new_name not in ('__tcf_0', 'startup'):
            full_name = new_name
            symbol.flags |= models.FLAG_CLONE
            # Remove .part / .isra / .constprop.
            idx = full_name.rfind('.', 0, idx)
            if idx != -1:
              full_name = full_name[:idx]

      # E.g.: vtable for FOO
      idx = full_name.find(' for ', 0, 30)
      if idx != -1:
        found_prefixes.add(full_name[:idx + 4])
        full_name = '{} [{}]'.format(full_name[idx + 5:], full_name[:idx])

      # E.g.: virtual thunk to FOO
      idx = full_name.find(' to ', 0, 30)
      if idx != -1:
        found_prefixes.add(full_name[:idx + 3])
        full_name = '{} [{}]'.format(full_name[idx + 4:], full_name[:idx])

      # Strip out return type, and split out name, template_name.
      # Function parsing also applies to non-text symbols.
      # E.g. Function statics.
      symbol.full_name, symbol.template_name, symbol.name = (
          function_signature.Parse(full_name))

      # Remove anonymous namespaces (they just harm clustering).
      symbol.template_name = symbol.template_name.replace(
          '(anonymous namespace)::', '')
      symbol.full_name = symbol.full_name.replace(
          '(anonymous namespace)::', '')
      non_anonymous_name = symbol.name.replace('(anonymous namespace)::', '')
      if symbol.name != non_anonymous_name:
        symbol.flags |= models.FLAG_ANONYMOUS
        symbol.name = non_anonymous_name

    # Allow using "is" to compare names (and should help with RAM). This
    # applies to all symbols.
    function_signature.InternSameNames(symbol)
  logging.debug('Found name prefixes of: %r', found_prefixes)


def _NormalizeObjectPath(path):
  """Normalizes object paths.

  Prefixes are removed: obj/, ../../
  Archive names made more pathy: foo/bar.a(baz.o) -> foo/bar.a/baz.o
  """
  if path.startswith('obj/'):
    # Convert obj/third_party/... -> third_party/...
    path = path[4:]
  elif path.startswith('../../'):
    # Convert ../../third_party/... -> third_party/...
    path = path[6:]
  if path.endswith(')'):
    # Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o so that hierarchical
    # breakdowns consider the .o part to be a separate node.
    start_idx = path.rindex('(')
    path = os.path.join(path[:start_idx], path[start_idx + 1:-1])
  return path
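

# For example (illustrative inputs):
#   _NormalizeObjectPath('obj/base/base/time.o') -> 'base/base/time.o'
#   _NormalizeObjectPath('../../v8/libv8.a(api.o)') -> 'v8/libv8.a/api.o'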


def _NormalizeSourcePath(path):
  """Returns (is_generated, normalized_path)"""
  if path.startswith('gen/'):
    # Convert gen/third_party/... -> third_party/...
    return True, path[4:]
  if path.startswith('../../'):
    # Convert ../../third_party/... -> third_party/...
    return False, path[6:]
  return True, path


def _ExtractSourcePathsAndNormalizeObjectPaths(raw_symbols, source_mapper):
  """Fills in the |source_path| attribute and normalizes |object_path|."""
  if source_mapper:
    logging.info('Looking up source paths from ninja files')
    for symbol in raw_symbols:
      object_path = symbol.object_path
      if symbol.IsDex() or symbol.IsOther():
        if symbol.source_path:
          symbol.generated_source, symbol.source_path = _NormalizeSourcePath(
              symbol.source_path)
      elif object_path:
        # We don't have source info for prebuilt .a files.
        if not os.path.isabs(object_path) and not object_path.startswith('..'):
          source_path = source_mapper.FindSourceForPath(object_path)
          if source_path:
            symbol.generated_source, symbol.source_path = (
                _NormalizeSourcePath(source_path))
        symbol.object_path = _NormalizeObjectPath(object_path)
    assert source_mapper.unmatched_paths_count == 0, (
        'One or more source file paths could not be found. Likely caused by '
        '.ninja files being generated at a different time than the .map file.')
  else:
    logging.info('Normalizing object paths')
    for symbol in raw_symbols:
      if symbol.object_path:
        symbol.object_path = _NormalizeObjectPath(symbol.object_path)


def _ComputeAncestorPath(path_list, symbol_count):
  """Returns the common ancestor of the given paths."""
  if not path_list:
    return ''

  prefix = os.path.commonprefix(path_list)
  # Check if all paths were the same.
  if prefix == path_list[0]:
    return prefix

  # Put in buckets to cut down on the number of unique paths.
  if symbol_count >= 100:
    symbol_count_str = '100+'
  elif symbol_count >= 50:
    symbol_count_str = '50-99'
  elif symbol_count >= 20:
    symbol_count_str = '20-49'
  elif symbol_count >= 10:
    symbol_count_str = '10-19'
  else:
    symbol_count_str = str(symbol_count)

  # Put the path count as a subdirectory so that grouping by path will show
  # "{shared}" as a bucket, and the symbol counts as leafs.
  if not prefix:
    return os.path.join('{shared}', symbol_count_str)
  return os.path.join(os.path.dirname(prefix), '{shared}', symbol_count_str)
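

# For example (illustrative): 30 aliases spread across
#   ['base/foo/a.cc', 'base/foo/b.cc', 'base/bar/c.cc']
# share the character prefix 'base/', so the merged path becomes
#   'base/{shared}/20-49'.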


def _CompactLargeAliasesIntoSharedSymbols(raw_symbols, knobs):
  """Converts symbols with large number of aliases into single symbols.

  The merged symbol's path fields are changed to common-ancestor paths in
  the form: common/dir/{shared}/$SYMBOL_COUNT

  Assumes aliases differ only by path (not by name).
  """
  num_raw_symbols = len(raw_symbols)
  num_shared_symbols = 0
  src_cursor = 0
  dst_cursor = 0
  while src_cursor < num_raw_symbols:
    symbol = raw_symbols[src_cursor]
    raw_symbols[dst_cursor] = symbol
    dst_cursor += 1
    aliases = symbol.aliases
    if aliases and len(aliases) > knobs.max_same_name_alias_count:
      symbol.source_path = _ComputeAncestorPath(
          [s.source_path for s in aliases if s.source_path], len(aliases))
      symbol.object_path = _ComputeAncestorPath(
          [s.object_path for s in aliases if s.object_path], len(aliases))
      symbol.generated_source = all(s.generated_source for s in aliases)
      symbol.aliases = None
      num_shared_symbols += 1
      src_cursor += len(aliases)
    else:
      src_cursor += 1
  raw_symbols[dst_cursor:] = []
  num_removed = src_cursor - dst_cursor
  logging.debug('Converted %d aliases into %d shared-path symbols',
                num_removed, num_shared_symbols)


def _ConnectNmAliases(raw_symbols):
  """Ensures |aliases| is set correctly for all symbols."""
  prev_sym = raw_symbols[0]
  for sym in raw_symbols[1:]:
    # Don't merge bss symbols.
    if sym.address > 0 and prev_sym.address == sym.address:
      # Don't merge padding-only symbols (** symbol gaps).
      if prev_sym.size > 0:
        # Don't merge if already merged.
        if prev_sym.aliases is None or prev_sym.aliases is not sym.aliases:
          if prev_sym.aliases:
            prev_sym.aliases.append(sym)
          else:
            prev_sym.aliases = [prev_sym, sym]
          sym.aliases = prev_sym.aliases
    prev_sym = sym


def _AssignNmAliasPathsAndCreatePathAliases(raw_symbols, object_paths_by_name):
  num_found_paths = 0
  num_unknown_names = 0
  num_path_mismatches = 0
  num_aliases_created = 0
  ret = []
  for symbol in raw_symbols:
    ret.append(symbol)
    full_name = symbol.full_name
    if (symbol.IsBss() or
        symbol.IsStringLiteral() or
        not full_name or
        full_name[0] in '*.' or  # e.g. ** merge symbols, .Lswitch.table
        full_name == 'startup'):
      continue

    object_paths = object_paths_by_name.get(full_name)
    if object_paths:
      num_found_paths += 1
    else:
      if num_unknown_names < 10:
        logging.warning('Symbol not found in any .o files: %r', symbol)
      num_unknown_names += 1
      continue

    if symbol.object_path and symbol.object_path not in object_paths:
      if num_path_mismatches < 10:
        logging.warning('Symbol path reported by .map not found by nm.')
        logging.warning('sym=%r', symbol)
        logging.warning('paths=%r', object_paths)
      object_paths.append(symbol.object_path)
      object_paths.sort()
      num_path_mismatches += 1

    symbol.object_path = object_paths[0]

    if len(object_paths) > 1:
      # Create one symbol for each object_path.
      aliases = symbol.aliases or [symbol]
      symbol.aliases = aliases
      num_aliases_created += len(object_paths) - 1
      for object_path in object_paths[1:]:
        new_sym = models.Symbol(
            symbol.section_name, symbol.size, address=symbol.address,
            full_name=full_name, object_path=object_path, aliases=aliases)
        aliases.append(new_sym)
        ret.append(new_sym)

  logging.debug('Cross-referenced %d symbols with nm output. '
                'num_unknown_names=%d num_path_mismatches=%d '
                'num_aliases_created=%d',
                num_found_paths, num_unknown_names, num_path_mismatches,
                num_aliases_created)
  return ret


def _DiscoverMissedObjectPaths(raw_symbols, known_inputs):
  # Missing object paths are caused by .a files added by -l flags, which are
  # not listed as explicit inputs within .ninja rules.
  missed_inputs = set()
  for symbol in raw_symbols:
    path = symbol.object_path
    if path.endswith(')'):
      # Convert foo/bar.a(baz.o) -> foo/bar.a
      path = path[:path.rindex('(')]
    if path and path not in known_inputs:
      missed_inputs.add(path)
  return missed_inputs


def _CreateMergeStringsReplacements(merge_string_syms,
                                    list_of_positions_by_object_path):
  """Creates replacement symbols for |merge_string_syms|."""
  ret = []
  STRING_LITERAL_NAME = models.STRING_LITERAL_NAME
  assert len(merge_string_syms) == len(list_of_positions_by_object_path)
  tups = itertools.izip(merge_string_syms, list_of_positions_by_object_path)
  for merge_sym, positions_by_object_path in tups:
    merge_sym_address = merge_sym.address
    new_symbols = []
    ret.append(new_symbols)
    for object_path, positions in positions_by_object_path.iteritems():
      for offset, size in positions:
        address = merge_sym_address + offset
        symbol = models.Symbol(
            models.SECTION_RODATA, size, address, STRING_LITERAL_NAME,
            object_path=object_path)
        new_symbols.append(symbol)

  logging.debug('Created %d string literal symbols', sum(len(x) for x in ret))
  logging.debug('Sorting string literals')
  for symbols in ret:
    # In order to achieve a total ordering in the presence of aliases, need to
    # include both |address| and |object_path|.
    # In order to achieve consistent deduping, need to include |size|.
    symbols.sort(key=lambda x: (x.address, -x.size, x.object_path))

  logging.debug('Deduping string literals')
  num_removed = 0
  size_removed = 0
  num_aliases = 0
  for i, symbols in enumerate(ret):
    if not symbols:
      continue
    prev_symbol = symbols[0]
    new_symbols = [prev_symbol]
    for symbol in symbols[1:]:
      padding = symbol.address - prev_symbol.end_address
      if (prev_symbol.address == symbol.address and
          prev_symbol.size == symbol.size):
        # String is an alias.
        num_aliases += 1
        aliases = prev_symbol.aliases
        if aliases:
          aliases.append(symbol)
          symbol.aliases = aliases
        else:
          aliases = [prev_symbol, symbol]
          prev_symbol.aliases = aliases
          symbol.aliases = aliases
      elif padding + symbol.size <= 0:
        # String is a substring of prior one.
        num_removed += 1
        size_removed += symbol.size
        continue
      elif padding < 0:
        # String overlaps previous one. Adjust to not overlap.
        symbol.address -= padding
        symbol.size += padding
      new_symbols.append(symbol)
      prev_symbol = symbol
    ret[i] = new_symbols
    # Aliases come out in random order, so sort to be deterministic.
    ret[i].sort(key=lambda s: (s.address, s.object_path))

  logging.debug(
      'Removed %d overlapping string literals (%d bytes) & created %d aliases',
      num_removed, size_removed, num_aliases)
  return ret


def _CalculatePadding(raw_symbols):
  """Populates the |padding| field based on symbol addresses.

  Symbols must already be sorted by |address|.
  """
  seen_sections = set()
  for i, symbol in enumerate(raw_symbols[1:]):
    prev_symbol = raw_symbols[i]
    if symbol.IsOverhead():
      # Overhead symbols are not actionable so should be padding-only.
      symbol.padding = symbol.size
    if prev_symbol.section_name != symbol.section_name:
      assert symbol.section_name not in seen_sections, (
          'Input symbols must be sorted by section, then address.')
      seen_sections.add(symbol.section_name)
      continue
    if (symbol.address <= 0 or prev_symbol.address <= 0 or
        not symbol.IsNative() or not prev_symbol.IsNative()):
      continue

    if symbol.address == prev_symbol.address:
      if symbol.aliases and symbol.aliases is prev_symbol.aliases:
        symbol.padding = prev_symbol.padding
        symbol.size = prev_symbol.size
        continue
      # Padding-only symbols happen for ** symbol gaps.
      assert prev_symbol.size_without_padding == 0, (
          'Found duplicate symbols:\n%r\n%r' % (prev_symbol, symbol))

    padding = symbol.address - prev_symbol.end_address
    # These thresholds were found by experimenting with arm32 Chrome.
    # E.g.: Set them to 0 and see what warnings get logged, then take max
    # value.
    # TODO(agrieve): See if these thresholds make sense for architectures
    #     other than arm32.
    if (not symbol.full_name.startswith('*') and
        not symbol.IsStringLiteral() and (
        symbol.section in 'rd' and padding >= 256 or
        symbol.section in 't' and padding >= 64)):
      # Should not happen.
      logging.warning('Large padding of %d between:\n  A) %r\n  B) %r' % (
                      padding, prev_symbol, symbol))
    symbol.padding = padding
    symbol.size += padding
    assert symbol.size >= 0, (
        'Symbol has negative size (likely not sorted properly): '
        '%r\nprev symbol: %r' % (symbol, prev_symbol))


def _ParseComponentFromOwners(filename):
  """Searches an OWNERS file for lines that start with `# COMPONENT:`.

  If an OWNERS file has no COMPONENT but references another OWNERS file,
  follow the reference and check that file instead.

  Args:
    filename: Path to the file to parse.
  Returns:
    The text that follows the `# COMPONENT:` prefix, such as 'component>name'.
    Empty string if no component found or the file didn't exist.
  """
  reference_paths = []
  try:
    with open(filename) as f:
      for line in f:
        component_matches = _COMPONENT_REGEX.match(line)
        path_matches = _FILE_PATH_REGEX.match(line)
        if component_matches:
          return component_matches.group(1)
        elif path_matches:
          reference_paths.append(path_matches.group(1))
  except IOError:
    return ''

  if len(reference_paths) == 1:
    newpath = os.path.join(path_util.SRC_ROOT, reference_paths[0])
    return _ParseComponentFromOwners(newpath)
  else:
    return ''
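

# An OWNERS file that this parser understands looks like (illustrative):
#   someone@chromium.org
#   # COMPONENT: UI>Browser>Bookmarks
# or, with one level of indirection to another OWNERS file:
#   file://other/dir/OWNERS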


def _FindComponentRoot(start_path, cache, knobs):
  """Searches all parent directories for COMPONENT in OWNERS files.

  Args:
    start_path: Path of directory to start searching from. Must be relative
      to SRC_ROOT.
    cache: Dict of OWNERS paths. Used instead of filesystem if paths are
      present in the dict.
    knobs: Instance of SectionSizeKnobs. Tunable knobs and options.

  Returns:
    COMPONENT belonging to |start_path|, or empty string if not found.
  """
  prev_dir = None
  test_dir = start_path
  # This loop will traverse the directory structure upwards until reaching
  # SRC_ROOT, where test_dir and prev_dir will both equal an empty string.
  while test_dir != prev_dir:
    cached_component = cache.get(test_dir)
    if cached_component:
      return cached_component
    elif cached_component is None:
      owners_path = os.path.join(knobs.src_root, test_dir, _OWNERS_FILENAME)
      component = _ParseComponentFromOwners(owners_path)
      cache[test_dir] = component
      if component:
        return component
    prev_dir = test_dir
    test_dir = os.path.dirname(test_dir)
  return ''


def _PopulateComponents(raw_symbols, knobs):
  """Populates the |component| field based on |source_path|.

  Symbols without a |source_path| are skipped.

  Args:
    raw_symbols: list of Symbol objects.
    knobs: Instance of SectionSizeKnobs. Tunable knobs and options.
  """
  seen_paths = {}
  for symbol in raw_symbols:
    if symbol.source_path:
      folder_path = os.path.dirname(symbol.source_path)
      symbol.component = _FindComponentRoot(folder_path, seen_paths, knobs)
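

# Background (illustrative example): with identical code folding (ICF), the
# linker keeps one copy of byte-identical functions, so nm may report, say,
# both "base::Foo()" and "net::Bar()" at address 0x2ac00. The .map file names
# only one of them; _AddNmAliases() below re-creates the folded-away names as
# sibling symbols at the same address.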


def _AddNmAliases(raw_symbols, names_by_address):
  """Adds symbols that were removed by identical code folding."""
  # Step 1: Create list of (index_of_symbol, name_list).
  logging.debug('Creating alias list')
  replacements = []
  num_new_symbols = 0
  missing_names = collections.defaultdict(list)
  for i, s in enumerate(raw_symbols):
    # Don't alias padding-only symbols (e.g. ** symbol gap)
    if s.size_without_padding == 0:
      continue
    name_list = names_by_address.get(s.address)
    if name_list:
      if s.full_name not in name_list:
        missing_names[s.full_name].append(s.address)
        logging.warning('Name missing from aliases: %s %s', s.full_name,
                        name_list)
        continue
      replacements.append((i, name_list))
      num_new_symbols += len(name_list) - 1

  if missing_names and logging.getLogger().isEnabledFor(logging.INFO):
    for address, names in names_by_address.iteritems():
      for name in names:
        if name in missing_names:
          logging.info('Missing name %s is at address %x instead of [%s]' %
              (name, address, ','.join('%x' % a for a in missing_names[name])))

  if float(num_new_symbols) / len(raw_symbols) < .05:
    logging.warning('Number of aliases is oddly low (%.0f%%). It should '
                    'usually be around 25%%. Ensure --tool-prefix is correct.',
                    float(num_new_symbols) / len(raw_symbols) * 100)

  # Step 2: Create new symbols as siblings to each existing one.
  logging.debug('Creating %d new symbols from nm output', num_new_symbols)
  expected_num_symbols = len(raw_symbols) + num_new_symbols
  ret = []
  prev_src = 0
  for cur_src, name_list in replacements:
    ret += raw_symbols[prev_src:cur_src]
    prev_src = cur_src + 1
    sym = raw_symbols[cur_src]
    # Create symbols (|sym| gets recreated and discarded).
    new_syms = []
    for full_name in name_list:
      # Do not set |aliases| in order to avoid being pruned by
      # _CompactLargeAliasesIntoSharedSymbols(), which assumes aliases differ
      # only by path. The field will be set afterwards by _ConnectNmAliases().
      new_syms.append(models.Symbol(
          sym.section_name, sym.size, address=sym.address,
          full_name=full_name))
    ret += new_syms
  ret += raw_symbols[prev_src:]
  assert expected_num_symbols == len(ret)
  return ret


def LoadAndPostProcessSizeInfo(path, file_obj=None):
  """Returns a SizeInfo for the given |path|."""
  logging.debug('Loading results from: %s', path)
  size_info = file_format.LoadSizeInfo(path, file_obj=file_obj)
  logging.info('Normalizing symbol names')
  _NormalizeNames(size_info.raw_symbols)
  logging.info('Calculating padding')
  _CalculatePadding(size_info.raw_symbols)
  logging.info('Loaded %d symbols', len(size_info.raw_symbols))
  return size_info
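

# Minimal usage sketch (illustrative path):
#   size_info = LoadAndPostProcessSizeInfo('out/Release/chrome.size')
#   largest = max(size_info.raw_symbols, key=lambda s: s.size)
#   logging.info('Largest symbol: %r', largest)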


def CreateMetadata(map_path, elf_path, apk_path, tool_prefix, output_directory,
                   linker_name):
  """Creates metadata dict.

  Args:
    map_path: Path to the linker .map(.gz) file to parse.
    elf_path: Path to the corresponding unstripped ELF file. Used to find
        symbol aliases and inlined functions. Can be None.
    apk_path: Path to the .apk file to measure.
    tool_prefix: Prefix for c++filt & nm.
    output_directory: Build output directory.
    linker_name: 'gold', 'lld_v#' (# is a number), 'lld-lto_v#', or None.

  Returns:
    None if |elf_path| is not supplied. Otherwise returns dict mapping string
    constants to values. If |elf_path| is supplied, git revision and elf info
    are included. If |output_directory| is also supplied, then filenames will
    be included.
  """
  metadata = None
  if elf_path:
    logging.debug('Constructing metadata')
    git_rev = _DetectGitRevision(os.path.dirname(elf_path))
    architecture = _ArchFromElf(elf_path, tool_prefix)
    build_id = BuildIdFromElf(elf_path, tool_prefix)
    timestamp_obj = datetime.datetime.utcfromtimestamp(
        os.path.getmtime(elf_path))
    timestamp = calendar.timegm(timestamp_obj.timetuple())
    relative_tool_prefix = path_util.ToSrcRootRelative(tool_prefix)

    metadata = {
        models.METADATA_GIT_REVISION: git_rev,
        models.METADATA_ELF_ARCHITECTURE: architecture,
        models.METADATA_ELF_MTIME: timestamp,
        models.METADATA_ELF_BUILD_ID: build_id,
        models.METADATA_LINKER_NAME: linker_name,
        models.METADATA_TOOL_PREFIX: relative_tool_prefix,
    }

    if output_directory:
      relative_to_out = lambda path: os.path.relpath(path, output_directory)
      gn_args = _ParseGnArgs(os.path.join(output_directory, 'args.gn'))
      metadata[models.METADATA_MAP_FILENAME] = relative_to_out(map_path)
      metadata[models.METADATA_ELF_FILENAME] = relative_to_out(elf_path)
      metadata[models.METADATA_GN_ARGS] = gn_args

      if apk_path:
        metadata[models.METADATA_APK_FILENAME] = relative_to_out(apk_path)
        metadata[models.METADATA_APK_SIZE] = os.path.getsize(apk_path)
  return metadata


def _ResolveThinArchivePaths(raw_symbols, thin_archives):
  """Converts object_paths for thin archives to external .o paths."""
  for symbol in raw_symbols:
    object_path = symbol.object_path
    if object_path.endswith(')'):
      start_idx = object_path.rindex('(')
      archive_path = object_path[:start_idx]
      if archive_path in thin_archives:
        subpath = object_path[start_idx + 1:-1]
        symbol.object_path = ar.CreateThinObjectPath(archive_path, subpath)


def _ParseElfInfo(map_path, elf_path, tool_prefix, track_string_literals,
                  outdir_context=None, linker_name=None):
  """Adds ELF section sizes and symbols."""
  if elf_path:
    # Run nm on the elf file to retrieve the list of symbol names per-address.
    # This list is required because the .map file contains only a single name
    # for each address, yet multiple symbols are often coalesced when they are
    # identical. This coalescing happens mainly for small symbols and for C++
    # templates. Such symbols make up ~500kb of libchrome.so on Android.
    elf_nm_result = nm.CollectAliasesByAddressAsync(elf_path, tool_prefix)

    # Run nm on all .o/.a files to retrieve the symbol names within them.
    # The list is used to detect when multiple .o files contain the same
    # symbol (e.g. inline functions), and to update the object_path /
    # source_path fields accordingly.
    # Looking in object files is required because the .map file chooses a
    # single path for these symbols.
    # Rather than record all paths for each symbol, set the paths to be the
    # common ancestor of all paths.
    if outdir_context:
      bulk_analyzer = obj_analyzer.BulkObjectFileAnalyzer(
          tool_prefix, outdir_context.output_directory,
          track_string_literals=track_string_literals)
      bulk_analyzer.AnalyzePaths(outdir_context.elf_object_paths)

  logging.info('Parsing Linker Map')
  with _OpenMaybeGz(map_path) as map_file:
    section_sizes, raw_symbols = (
        linker_map_parser.MapFileParser().Parse(linker_name, map_file))

    if outdir_context and outdir_context.thin_archives:
      _ResolveThinArchivePaths(raw_symbols, outdir_context.thin_archives)

  if elf_path:
    logging.debug('Validating section sizes')
    elf_section_sizes = _SectionSizesFromElf(elf_path, tool_prefix)
    for k, v in elf_section_sizes.iteritems():
      if v != section_sizes.get(k):
        logging.error('ELF file and .map file do not agree on section sizes.')
        logging.error('.map file: %r', section_sizes)
        logging.error('readelf: %r', elf_section_sizes)
        sys.exit(1)

  if elf_path and outdir_context:
    missed_object_paths = _DiscoverMissedObjectPaths(
        raw_symbols, outdir_context.known_inputs)
    missed_object_paths = ar.ExpandThinArchives(
        missed_object_paths, outdir_context.output_directory)[0]
    bulk_analyzer.AnalyzePaths(missed_object_paths)
    bulk_analyzer.SortPaths()
    if track_string_literals:
      merge_string_syms = [s for s in raw_symbols if
                           s.full_name == '** merge strings' or
                           s.full_name == '** lld merge strings']
      # More likely for there to be a bug in supersize than for an ELF to not
      # have a single string literal.
      assert merge_string_syms
      string_ranges = [(s.address, s.size) for s in merge_string_syms]
      bulk_analyzer.AnalyzeStringLiterals(elf_path, string_ranges)

  logging.info('Stripping linker prefixes from symbol names')
  _StripLinkerAddedSymbolPrefixes(raw_symbols)
  # Map file for some reason doesn't demangle all names.
  # Demangle prints its own log statement.
  demangle.DemangleRemainingSymbols(raw_symbols, tool_prefix)

  object_paths_by_name = {}
  if elf_path:
    logging.info(
        'Adding symbols removed by identical code folding (as reported by nm)')
    # This normally does not block (it's finished by this time).
    names_by_address = elf_nm_result.get()
    raw_symbols = _AddNmAliases(raw_symbols, names_by_address)

    if outdir_context:
      object_paths_by_name = bulk_analyzer.GetSymbolNames()
      logging.debug(
          'Fetched path information for %d symbols from %d files',
          len(object_paths_by_name),
          len(outdir_context.elf_object_paths) + len(missed_object_paths))

      # For aliases, this provides path information where there wasn't any.
      logging.info('Creating aliases for symbols shared by multiple paths')
      raw_symbols = _AssignNmAliasPathsAndCreatePathAliases(
          raw_symbols, object_paths_by_name)

      if track_string_literals:
        logging.info('Waiting for string literal extraction to complete.')
        list_of_positions_by_object_path = bulk_analyzer.GetStringPositions()
      bulk_analyzer.Close()

      if track_string_literals:
        logging.info('Deconstructing ** merge strings into literals')
        replacements = _CreateMergeStringsReplacements(
            merge_string_syms, list_of_positions_by_object_path)
        for merge_sym, literal_syms in itertools.izip(
            merge_string_syms, replacements):
          # Don't replace if no literals were found.
          if literal_syms:
            # Re-find the symbols since aliases cause their indices to change.
            idx = raw_symbols.index(merge_sym)
            # This assignment is a bit slow (causes array to be shifted), but
            # is fast enough since len(merge_string_syms) < 10.
            raw_symbols[idx:idx + 1] = literal_syms

  return section_sizes, raw_symbols, object_paths_by_name


def _ComputePakFileSymbols(
    file_name, contents, res_info, symbols_by_id, compression_ratio=1):
  id_map = {id(v): k
            for k, v in sorted(contents.resources.items(), reverse=True)}
  alias_map = {k: id_map[id(v)] for k, v in contents.resources.iteritems()
               if id_map[id(v)] != k}
  # Longest locale pak is es-419.pak
  if len(os.path.basename(file_name)) <= 9:
    section_name = models.SECTION_PAK_TRANSLATIONS
  else:
    section_name = models.SECTION_PAK_NONTRANSLATED
  overhead = (12 + 6) * compression_ratio  # Header size plus extra offset
  symbols_by_id[-1] = models.Symbol(
      section_name, overhead, full_name='Overhead: {}'.format(file_name))
  for resource_id in sorted(contents.resources):
    if resource_id in alias_map:
      # 4 extra bytes of metadata (2 16-bit ints)
      size = 4
      resource_id = alias_map[resource_id]
    else:
      # 6 extra bytes of metadata (1 32-bit int, 1 16-bit int)
      size = len(contents.resources[resource_id]) + 6
      name, source_path = res_info[resource_id]
      if resource_id not in symbols_by_id:
        full_name = '{}: {}'.format(source_path, name)
        symbols_by_id[resource_id] = models.Symbol(
            section_name, 0, address=resource_id, full_name=full_name)
    size *= compression_ratio
    symbols_by_id[resource_id].size += size
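

# For example (illustrative numbers): a 900-byte IDS_FOO entry in a locale
# .pak that is stored compressed contributes (900 + 6) * 0.33 ≈ 299 bytes,
# while an alias of an already-seen entry contributes only 4 * 0.33 ≈ 1.3
# bytes, attributed to the first resource id that carried the data.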


class _ResourceSourceMapper(object):
  def __init__(self, apk_path, output_directory, knobs):
    self._knobs = knobs
    self._res_info = self._LoadResInfo(apk_path, output_directory)
    self._pattern_dollar_underscore = re.compile(r'\$(.*?)__\d+')
    self._pattern_version_suffix = re.compile(r'-v\d+/')

  @staticmethod
  def _ParseResInfoFile(res_info_path):
    with open(res_info_path, 'r') as info_file:
      res_info = {}
      renames = {}
      for line in info_file.readlines():
        dest, source = line.strip().split(',')
        # Allow indirection due to renames.
        if dest.startswith('Rename:'):
          dest = dest.split(':', 1)[1]
          renames[dest] = source
        else:
          res_info[dest] = source
      for dest, renamed_dest in renames.iteritems():
        # Allow one more level of indirection due to renaming renamed files.
        renamed_dest = renames.get(renamed_dest, renamed_dest)
        actual_source = res_info.get(renamed_dest)
        if actual_source:
          res_info[dest] = actual_source
      return res_info

  def _LoadResInfo(self, apk_path, output_directory):
    apk_name = os.path.basename(apk_path)
    apk_res_info_name = apk_name + '.res.info'
    apk_res_info_path = os.path.join(
        output_directory, 'size-info', apk_res_info_name)
    res_info_without_root = self._ParseResInfoFile(apk_res_info_path)
    # We package resources in the res/ folder only in the apk.
    res_info = {
        os.path.join('res', dest): source
        for dest, source in res_info_without_root.iteritems()
    }
    res_info.update(self._knobs.apk_other_files)
    return res_info

  def FindSourceForPath(self, path):
    original_path = path
    # Sometimes android adds $ in front and __# before extension.
    path = self._pattern_dollar_underscore.sub(r'\1', path)
    ret = self._res_info.get(path)
    if ret:
      return ret
    # Android build tools may append extra -v flags for the root dir.
    path = self._pattern_version_suffix.sub('/', path)
    ret = self._res_info.get(path)
    if ret:
      return ret
    if original_path not in self._knobs.apk_expected_other_files:
      logging.warning('Unexpected file in apk: %s', original_path)
    return None


def _ParsePakInfoFile(pak_info_path):
  with open(pak_info_path, 'r') as info_file:
    res_info = {}
    for line in info_file.readlines():
      name, res_id, path = line.split(',')
      res_info[int(res_id)] = (name, path.strip())
  return res_info
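

# Each .pak.info line is "name,id,path", e.g. (illustrative):
#   IDS_HELLO_WORLD,12345,../../chrome/app/generated_resources.grd
# which maps resource id 12345 to ('IDS_HELLO_WORLD', the .grd path).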


def _ParsePakSymbols(
    section_sizes, symbols_by_id, object_paths_by_pak_id):
  raw_symbols = []
  for resource_id, symbol in symbols_by_id.iteritems():
    raw_symbols.append(symbol)
    paths = object_paths_by_pak_id.get(resource_id)
    if not paths:
      continue
    symbol.object_path = paths.pop()
    if not paths:
      continue
    aliases = symbol.aliases or [symbol]
    symbol.aliases = aliases
    for path in paths:
      new_sym = models.Symbol(
          symbol.section_name, symbol.size, address=symbol.address,
          full_name=symbol.full_name, object_path=path, aliases=aliases)
      aliases.append(new_sym)
      raw_symbols.append(new_sym)
  raw_symbols.sort(key=lambda s: (s.section_name, s.address, s.object_path))
  raw_total = 0.0
  int_total = 0
  for symbol in raw_symbols:
    raw_total += symbol.size
    # We truncate rather than round to ensure that we do not over attribute.
    # It is easier to add another symbol to make up the difference.
    symbol.size = int(symbol.size)
    int_total += symbol.size
  # Attribute excess to translations since only those are compressed.
  raw_symbols.append(models.Symbol(
      models.SECTION_PAK_TRANSLATIONS, int(round(raw_total - int_total)),
      full_name='Overhead: Pak compression artifacts'))
  for symbol in raw_symbols:
    prev = section_sizes.setdefault(symbol.section_name, 0)
    section_sizes[symbol.section_name] = prev + symbol.size
  return raw_symbols


def _ParseApkElfSectionSize(section_sizes, metadata, apk_elf_result):
  if metadata:
    logging.debug('Extracting section sizes from .so within .apk')
    apk_build_id, apk_section_sizes, elf_overhead_size = apk_elf_result.get()
    assert apk_build_id == metadata[models.METADATA_ELF_BUILD_ID], (
        'BuildID from apk_elf_result did not match')

    packed_section_name = None
    architecture = metadata[models.METADATA_ELF_ARCHITECTURE]
    # Relocation packing is enabled only on arm32 & arm64.
    if architecture == 'arm':
      packed_section_name = '.rel.dyn'
    elif architecture == 'arm64':
      packed_section_name = '.rela.dyn'

    if packed_section_name:
      logging.debug('Recording size of unpacked relocations')
      if packed_section_name not in section_sizes:
        logging.warning('Packed section not present: %s', packed_section_name)
      else:
        apk_section_sizes['%s (unpacked)' % packed_section_name] = (
            section_sizes.get(packed_section_name))
    return apk_section_sizes, elf_overhead_size
  return section_sizes, 0


def _ParseDexSymbols(section_sizes, apk_path, output_directory):
  symbols = apkanalyzer.CreateDexSymbols(apk_path, output_directory)
  prev = section_sizes.setdefault(models.SECTION_DEX, 0)
  section_sizes[models.SECTION_DEX] = prev + sum(s.size for s in symbols)
  return symbols


def _ParseApkOtherSymbols(section_sizes, apk_path, apk_so_path,
                          output_directory, knobs):
  res_source_mapper = _ResourceSourceMapper(apk_path, output_directory, knobs)
  apk_symbols = []
  zip_info_total = 0
  with zipfile.ZipFile(apk_path) as z:
    for zip_info in z.infolist():
      zip_info_total += zip_info.compress_size
      # Skip main shared library, pak, and dex files as they are accounted
      # for elsewhere.
      if (zip_info.filename == apk_so_path or
          zip_info.filename.endswith('.dex') or
          zip_info.filename.endswith('.pak')):
        continue
      source_path = res_source_mapper.FindSourceForPath(zip_info.filename)
      if source_path is None:
        source_path = os.path.join(models.APK_PREFIX_PATH, zip_info.filename)
      apk_symbols.append(models.Symbol(
          models.SECTION_OTHER, zip_info.compress_size,
          source_path=source_path,
          full_name=zip_info.filename))  # Full name must disambiguate
  overhead_size = os.path.getsize(apk_path) - zip_info_total
  assert overhead_size >= 0, 'Apk overhead must be non-negative'
  zip_overhead_symbol = models.Symbol(
      models.SECTION_OTHER, overhead_size, full_name='Overhead: APK file')
  apk_symbols.append(zip_overhead_symbol)
  prev = section_sizes.setdefault(models.SECTION_OTHER, 0)
  section_sizes[models.SECTION_OTHER] = prev + sum(s.size for s in apk_symbols)
  return apk_symbols
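

# For example (illustrative): nm output containing the templated name
#   "void ui::WhitelistedResource<12345>()"
# lets _CreatePakObjectMap() below attribute pak resource 12345 to the .o
# files whose source referenced that resource via the IDS_ macros.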


def _CreatePakObjectMap(object_paths_by_name):
  # IDS_ macro usages result in templated function calls that contain the
  # resource ID in them. These names are collected along with all other
  # symbols by running "nm" on them. We just need to extract the values
  # from them.
  object_paths_by_pak_id = {}
  PREFIX = 'void ui::WhitelistedResource<'
  id_start_idx = len(PREFIX)
  id_end_idx = -len('>()')
  for name in object_paths_by_name:
    if name.startswith(PREFIX):
      pak_id = int(name[id_start_idx:id_end_idx])
      logging.info('PAK ID: %d', pak_id)
      object_paths_by_pak_id[pak_id] = object_paths_by_name[name]
  return object_paths_by_pak_id


def _FindPakSymbolsFromApk(apk_path, output_directory, knobs):
  with zipfile.ZipFile(apk_path) as z:
    pak_zip_infos = (f for f in z.infolist() if f.filename.endswith('.pak'))
    apk_info_name = os.path.basename(apk_path) + '.pak.info'
    pak_info_path = os.path.join(output_directory, 'size-info', apk_info_name)
    res_info = _ParsePakInfoFile(pak_info_path)
    symbols_by_id = {}
    total_compressed_size = 0
    total_uncompressed_size = 0
    for zip_info in pak_zip_infos:
      contents = data_pack.ReadDataPackFromString(z.read(zip_info))
      compression_ratio = 1.0
      if zip_info.compress_size < zip_info.file_size:
        total_compressed_size += zip_info.compress_size
        total_uncompressed_size += zip_info.file_size
        compression_ratio = knobs.pak_compression_ratio
      _ComputePakFileSymbols(
          zip_info.filename, contents, res_info, symbols_by_id,
          compression_ratio=compression_ratio)
    if total_uncompressed_size > 0:
      actual_ratio = (
          float(total_compressed_size) / total_uncompressed_size)
      logging.info('Pak Compression Ratio: %f Actual: %f Diff: %.0f',
                   knobs.pak_compression_ratio, actual_ratio,
                   (knobs.pak_compression_ratio - actual_ratio) *
                       total_uncompressed_size)
  return symbols_by_id


def _FindPakSymbolsFromFiles(pak_files, pak_info_path, output_directory):
  """Uses files from args to find and add pak symbols."""
  res_info = _ParsePakInfoFile(pak_info_path)
  symbols_by_id = {}
  for pak_file_path in pak_files:
    with open(pak_file_path, 'r') as f:
      contents = data_pack.ReadDataPackFromString(f.read())
    _ComputePakFileSymbols(
        os.path.relpath(pak_file_path, output_directory), contents,
        res_info, symbols_by_id)
  return symbols_by_id


def _CalculateElfOverhead(section_sizes, elf_path):
  if elf_path:
    section_sizes_total_without_bss = sum(
        s for k, s in section_sizes.iteritems() if k != models.SECTION_BSS)
    elf_overhead_size = (
        os.path.getsize(elf_path) - section_sizes_total_without_bss)
    assert elf_overhead_size >= 0, (
        'Negative ELF overhead {}'.format(elf_overhead_size))
    return elf_overhead_size
  return 0
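

# For example (illustrative numbers): a 40 MiB libchrome.so whose non-.bss
# sections sum to 39.2 MiB has 0.8 MiB of "overhead" (ELF headers, section
# table, etc.). .bss is excluded because it occupies no bytes in the file.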


def CreateSectionSizesAndSymbols(
    map_path=None, tool_prefix=None, output_directory=None, elf_path=None,
    apk_path=None, track_string_literals=True, metadata=None,
    apk_so_path=None, pak_files=None, pak_info_file=None, linker_name=None,
    knobs=SectionSizeKnobs()):
  """Creates section sizes and symbols for a SizeInfo.

  Args:
    map_path: Path to the linker .map(.gz) file to parse.
    elf_path: Path to the corresponding unstripped ELF file. Used to find
        symbol aliases and inlined functions. Can be None.
    tool_prefix: Prefix for c++filt & nm (required).
    output_directory: Build output directory. If None, source_paths and symbol
        alias information will not be recorded.
    track_string_literals: Whether to break down "** merge string" sections
        into smaller symbols (requires output_directory).

  Returns:
    A tuple of (section_sizes, raw_symbols).
    section_sizes is a dict mapping section names to their size.
    raw_symbols is a list of Symbol objects.
  """
  if apk_path and elf_path:
    # Extraction takes around 1 second, so do it in parallel.
    apk_elf_result = concurrent.ForkAndCall(
        _ElfInfoFromApk, (apk_path, apk_so_path, tool_prefix))

  outdir_context = None
  source_mapper = None
  if output_directory:
    # Start by finding the elf_object_paths, so that nm can run on them while
    # the linker .map is being parsed.
    logging.info('Parsing ninja files.')
    source_mapper, ninja_elf_object_paths = (
        ninja_parser.Parse(output_directory, elf_path))
    logging.debug('Parsed %d .ninja files.', source_mapper.parsed_file_count)
    assert not elf_path or ninja_elf_object_paths, (
        'Failed to find link command in ninja files for ' +
        os.path.relpath(elf_path, output_directory))

    if ninja_elf_object_paths:
      elf_object_paths, thin_archives = ar.ExpandThinArchives(
          ninja_elf_object_paths, output_directory)
      known_inputs = set(elf_object_paths)
      known_inputs.update(ninja_elf_object_paths)
    else:
      elf_object_paths = None
      known_inputs = None
      # When we don't know which elf file is used, just search all paths.
      thin_archives = set(
          p for p in source_mapper.IterAllPaths() if p.endswith('.a')
          and ar.IsThinArchive(os.path.join(output_directory, p)))

    outdir_context = _OutputDirectoryContext(
        elf_object_paths=elf_object_paths,
        known_inputs=known_inputs,
        output_directory=output_directory,
        source_mapper=source_mapper,
        thin_archives=thin_archives)

  section_sizes, raw_symbols, object_paths_by_name = _ParseElfInfo(
      map_path, elf_path, tool_prefix, track_string_literals,
      outdir_context=outdir_context, linker_name=linker_name)
  elf_overhead_size = _CalculateElfOverhead(section_sizes, elf_path)

  pak_symbols_by_id = None
  if apk_path:
    pak_symbols_by_id = _FindPakSymbolsFromApk(
        apk_path, output_directory, knobs)
    if elf_path:
      section_sizes, elf_overhead_size = _ParseApkElfSectionSize(
          section_sizes, metadata, apk_elf_result)
    raw_symbols.extend(
        _ParseDexSymbols(section_sizes, apk_path, output_directory))
    raw_symbols.extend(
        _ParseApkOtherSymbols(section_sizes, apk_path, apk_so_path,
                              output_directory, knobs))
  elif pak_files and pak_info_file:
    pak_symbols_by_id = _FindPakSymbolsFromFiles(
        pak_files, pak_info_file, output_directory)

  if elf_path:
    elf_overhead_symbol = models.Symbol(
        models.SECTION_OTHER, elf_overhead_size,
        full_name='Overhead: ELF file')
    prev = section_sizes.setdefault(models.SECTION_OTHER, 0)
    section_sizes[models.SECTION_OTHER] = prev + elf_overhead_size
    raw_symbols.append(elf_overhead_symbol)

  if pak_symbols_by_id:
    logging.debug('Extracting pak IDs from symbol names, and creating symbols')
    object_paths_by_pak_id = _CreatePakObjectMap(object_paths_by_name)
    pak_raw_symbols = _ParsePakSymbols(
        section_sizes, pak_symbols_by_id, object_paths_by_pak_id)
    raw_symbols.extend(pak_raw_symbols)

  _ExtractSourcePathsAndNormalizeObjectPaths(raw_symbols, source_mapper)
  _PopulateComponents(raw_symbols, knobs)
  logging.info('Converting excessive aliases into shared-path symbols')
  _CompactLargeAliasesIntoSharedSymbols(raw_symbols, knobs)
  logging.debug('Connecting nm aliases')
  _ConnectNmAliases(raw_symbols)

  return section_sizes, raw_symbols
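

# Minimal end-to-end sketch (illustrative paths; normally driven by Run()):
#   section_sizes, raw_symbols = CreateSectionSizesAndSymbols(
#       map_path='out/Release/libchrome.so.map.gz',
#       elf_path='out/Release/lib.unstripped/libchrome.so',
#       tool_prefix='<ndk toolchain prefix>',  # Hypothetical placeholder.
#       output_directory='out/Release', linker_name='gold')
#   size_info = CreateSizeInfo(section_sizes, raw_symbols)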


def CreateSizeInfo(
    section_sizes, raw_symbols, metadata=None, normalize_names=True):
  """Performs operations on all symbols and creates a SizeInfo object."""
  logging.debug('Sorting %d symbols', len(raw_symbols))
  # TODO(agrieve): Either change this sort so that it's only sorting by section
  #     (and not using .sort()), or have it specify a total ordering (which
  #     must also include putting padding-only symbols before others of the
  #     same address). Note: The sort as-is takes ~1.5 seconds.
  raw_symbols.sort(key=lambda s: (
      s.IsPak(), s.IsBss(), s.section_name, s.address))
  logging.info('Processed %d symbols', len(raw_symbols))

  # Padding is not strictly required here, but it is useful to check for
  # large padding and log a warning.
  logging.info('Calculating padding')
  _CalculatePadding(raw_symbols)

  # Do not call _NormalizeNames() during archive since that method tends to
  # need tweaks over time. Calling it only when loading .size files allows
  # for more flexibility.
  if normalize_names:
    _NormalizeNames(raw_symbols)

  return models.SizeInfo(section_sizes, raw_symbols, metadata=metadata)


def _DetectGitRevision(directory):
  """Runs git rev-parse to get the SHA1 hash of the current revision.

  Args:
    directory: Path to directory where rev-parse command will be run.

  Returns:
    A string with the SHA1 hash, or None if an error occurred.
  """
  try:
    git_rev = subprocess.check_output(
        ['git', '-C', directory, 'rev-parse', 'HEAD'])
    return git_rev.rstrip()
  except Exception:
    logging.warning('Failed to detect git revision for file metadata.')
    return None


def BuildIdFromElf(elf_path, tool_prefix):
  args = [path_util.GetReadElfPath(tool_prefix), '-n', elf_path]
  stdout = subprocess.check_output(args)
  match = re.search(r'Build ID: (\w+)', stdout)
  assert match, 'Build ID not found from running: ' + ' '.join(args)
  return match.group(1)


def _SectionSizesFromElf(elf_path, tool_prefix):
  args = [path_util.GetReadElfPath(tool_prefix), '-S', '--wide', elf_path]
  stdout = subprocess.check_output(args)
  section_sizes = {}
  # Matches  [ 2] .hash HASH 00000000006681f0 0001f0 003154 04 A 3 0 8
  for match in re.finditer(r'\[[\s\d]+\] (\..*)$', stdout, re.MULTILINE):
    items = match.group(1).split()
    section_sizes[items[0]] = int(items[4], 16)
  return section_sizes


def _ArchFromElf(elf_path, tool_prefix):
  args = [path_util.GetReadElfPath(tool_prefix), '-h', elf_path]
  stdout = subprocess.check_output(args)
  machine = re.search(r'Machine:\s*(.+)', stdout).group(1)
  if machine == 'Intel 80386':
    return 'x86'
  elif machine == 'Advanced Micro Devices X86-64':
    return 'x64'
  elif machine == 'ARM':
    return 'arm'
  elif machine == 'AArch64':
    return 'arm64'
  return machine


def _ParseGnArgs(args_path):
  """Returns a list of normalized "key=value" strings."""
  args = {}
  with open(args_path) as f:
    for l in f:
      # Strips #s even if within string literal. Not a problem in practice.
      parts = l.split('#')[0].split('=')
      if len(parts) != 2:
        continue
      args[parts[0].strip()] = parts[1].strip()
  return ["%s=%s" % x for x in sorted(args.iteritems())]
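

# For example (illustrative): an args.gn containing
#   is_official_build = true
#   target_cpu = "arm"  # 32-bit.
# normalizes to ['is_official_build=true', 'target_cpu="arm"'].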


def _DetectLinkerName(map_path):
  with _OpenMaybeGz(map_path) as map_file:
    return linker_map_parser.DetectLinkerNameFromMapFile(map_file)


def _ElfInfoFromApk(apk_path, apk_so_path, tool_prefix):
  """Returns a tuple of (build_id, section_sizes, elf_overhead_size)."""
  with zipfile.ZipFile(apk_path) as apk, \
       tempfile.NamedTemporaryFile() as f:
    f.write(apk.read(apk_so_path))
    f.flush()
    build_id = BuildIdFromElf(f.name, tool_prefix)
    section_sizes = _SectionSizesFromElf(f.name, tool_prefix)
    elf_overhead_size = _CalculateElfOverhead(section_sizes, f.name)
    return build_id, section_sizes, elf_overhead_size


def _AutoIdentifyInputFile(args):
  file_output = subprocess.check_output(['file', args.f])
  format_text = file_output[file_output.find(': ') + 2:]
  # File-not-found -> 'cannot ...' and directory -> 'directory', which don't
  # match anything here, so they are handled by the final 'return False'.
  if (format_text.startswith('Java archive data') or
      format_text.startswith('Zip archive data')):
    logging.info('Auto-identified --apk-file.')
    args.apk_file = args.f
    return True
  if format_text.startswith('ELF '):
    logging.info('Auto-identified --elf-file.')
    args.elf_file = args.f
    return True
  if format_text.startswith('ASCII text'):
    logging.info('Auto-identified --map-file.')
    args.map_file = args.f
    return True
  return False


def AddMainPathsArguments(parser):
  """Adds arguments for DeduceMainPaths()."""
  parser.add_argument('-f', metavar='FILE',
                      help='Auto-identify input file type.')
  parser.add_argument('--apk-file',
                      help='.apk file to measure. When set, --elf-file will '
                           'be derived (if unset). Providing the .apk allows '
                           'for the size of packed relocations to be '
                           'recorded.')
  parser.add_argument('--elf-file',
                      help='Path to input ELF file. Currently used for '
                           'capturing metadata.')
  parser.add_argument('--map-file',
                      help='Path to input .map(.gz) file. Defaults to '
                           '{{elf_file}}.map(.gz)?. If given without '
                           '--elf-file, no size metadata will be recorded.')
  parser.add_argument('--no-source-paths', action='store_true',
                      help='Do not use .ninja files to map '
                           'object_path -> source_path')
  parser.add_argument('--output-directory',
                      help='Path to the root build directory.')
  parser.add_argument('--tool-prefix',
                      help='Path prefix for c++filt, nm, readelf.')


def AddArguments(parser):
  parser.add_argument('size_file', help='Path to output .size file.')
  parser.add_argument('--pak-file', action='append',
                      help='Paths to pak files.')
  parser.add_argument('--pak-info-file',
                      help='This file should contain all ids found in the '
                           'pak files that have been passed in.')
  parser.add_argument('--no-string-literals', dest='track_string_literals',
                      default=True, action='store_false',
                      help='Disable breaking down "** merge strings" into '
                           'more granular symbols.')
  parser.add_argument('--source-directory',
                      help='Custom path to the root source directory.')
  AddMainPathsArguments(parser)
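

# Typical invocation (illustrative paths; this module is normally reached via
# the supersize front-end's "archive" command):
#   tools/binary_size/supersize archive chrome.size \
#       --apk-file out/Release/apks/ChromePublic.apk \
#       --output-directory out/Release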


def DeduceMainPaths(args, parser):
  """Computes main paths based on input, and deduces them if needed."""
  if args.f is not None:
    if not _AutoIdentifyInputFile(args):
      parser.error('Cannot find or identify file %s' % args.f)

  apk_path = args.apk_file
  elf_path = args.elf_file
  map_path = args.map_file
  any_input = apk_path or elf_path or map_path
  if not any_input:
    parser.error('Must pass at least one of --apk-file, --elf-file, '
                 '--map-file')
  output_directory_finder = path_util.OutputDirectoryFinder(
      value=args.output_directory,
      any_path_within_output_directory=any_input)

  apk_so_path = None
  if apk_path:
    with zipfile.ZipFile(apk_path) as z:
      lib_infos = [f for f in z.infolist()
                   if f.filename.endswith('.so') and f.file_size > 0]
    assert lib_infos, 'APK has no .so files.'
    # TODO(agrieve): Add support for multiple .so files, and take into account
    #     secondary architectures.
    apk_so_path = max(lib_infos, key=lambda x: x.file_size).filename
    logging.debug('Sub-apk path=%s', apk_so_path)
    if not elf_path and output_directory_finder.Tentative():
      elf_path = os.path.join(
          output_directory_finder.Tentative(), 'lib.unstripped',
          os.path.basename(apk_so_path.replace('crazy.', '')))
      logging.debug('Detected --elf-file=%s', elf_path)

  if map_path:
    if not map_path.endswith('.map') and not map_path.endswith('.map.gz'):
      parser.error('Expected --map-file to end with .map or .map.gz')
  else:
    map_path = elf_path + '.map'
    if not os.path.exists(map_path):
      map_path += '.gz'
    if not os.path.exists(map_path):
      parser.error('Could not find .map(.gz)? file. Ensure you have built '
                   'with is_official_build=true, or use --map-file to point '
                   'to a linker map file.')

  linker_name = _DetectLinkerName(map_path)
  logging.info('Linker name: %s', linker_name)

  tool_prefix_finder = path_util.ToolPrefixFinder(
      value=args.tool_prefix,
      output_directory_finder=output_directory_finder,
      linker_name=linker_name)
  tool_prefix = tool_prefix_finder.Finalized()
  output_directory = None
  if not args.no_source_paths:
    output_directory = output_directory_finder.Finalized()
  return (output_directory, tool_prefix, apk_path, apk_so_path, elf_path,
          map_path, linker_name)


def Run(args, parser):
  if not args.size_file.endswith('.size'):
    parser.error('size_file must end with .size')

  (output_directory, tool_prefix, apk_path, apk_so_path, elf_path, map_path,
   linker_name) = DeduceMainPaths(args, parser)

  metadata = CreateMetadata(map_path, elf_path, apk_path, tool_prefix,
                            output_directory, linker_name)

  knobs = SectionSizeKnobs()
  if args.source_directory:
    knobs.src_root = args.source_directory

  section_sizes, raw_symbols = CreateSectionSizesAndSymbols(
      map_path=map_path, tool_prefix=tool_prefix, elf_path=elf_path,
      apk_path=apk_path, output_directory=output_directory,
      track_string_literals=args.track_string_literals,
      metadata=metadata, apk_so_path=apk_so_path,
      pak_files=args.pak_file, pak_info_file=args.pak_info_file,
      linker_name=linker_name, knobs=knobs)
  size_info = CreateSizeInfo(
      section_sizes, raw_symbols, metadata=metadata, normalize_names=False)

  if logging.getLogger().isEnabledFor(logging.INFO):
    for line in describe.DescribeSizeInfoCoverage(size_info):
      logging.info(line)
  logging.info('Recorded info for %d symbols', len(size_info.raw_symbols))
  logging.info('Recording metadata: \n  %s',
               '\n  '.join(describe.DescribeMetadata(size_info.metadata)))
  logging.info('Saving result to %s', args.size_file)
  file_format.SaveSizeInfo(size_info, args.size_file)
  size_in_mb = os.path.getsize(args.size_file) / 1024.0 / 1024.0
  logging.info('Done. File size is %.2fMiB.', size_in_mb)