# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Utilities to extract string literals from object files.

LookupElfRodataInfo():
  Runs readelf to extract and return .rodata section spec of an ELF file.

ReadFileChunks():
  Reads raw data from a file, given a list of ranges in the file.

ResolveStringPiecesIndirect():
  BulkForkAndCall() target: Given {path: [string addresses]} and
  [raw_string_data for each string_section]:
  - Reads {path: [src_strings]}.
  - For each path, searches for src_strings in at most 1 raw_string_data over
    each string_section. If found, translates to string_range and annotates it
    to the string_section.
  - Returns [{path: [string_ranges]} for each string_section].

ResolveStringPieces():
  BulkForkAndCall() target: Given {path: [strings]} and
  [raw_string_data for each string_section]:
  - For each path, searches for its strings in at most 1 raw_string_data over
    each string_section. If found, translates to string_range and annotates it
    to the string_section.
  - Returns [{path: [string_ranges]} for each string_section].
"""

import ast
import collections
import itertools
import logging
import os
import subprocess

import ar
import concurrent
import models
import path_util


def LookupElfRodataInfo(elf_path, tool_prefix):
  """Returns (address, offset, size) for the .rodata section."""
  args = [path_util.GetReadElfPath(tool_prefix), '-S', '--wide', elf_path]
  output = subprocess.check_output(args)
  lines = output.splitlines()
  for line in lines:
    # [Nr] Name      Type      Addr     Off     Size   ES Flg Lk Inf Al
    # [07] .rodata   PROGBITS  025e7000 237c000 5ec4f6 00   A  0   0 256
    if '.rodata ' in line:
      fields = line[line.index(models.SECTION_RODATA):].split()
      return int(fields[2], 16), int(fields[3], 16), int(fields[4], 16)
  raise AssertionError('No .rodata for command: ' + repr(args))


def ReadFileChunks(path, section_ranges):
  """Returns a list of raw data from |path|, specified by |section_ranges|.

  Args:
    section_ranges: List of (offset, size).
  """
  ret = []
  if not section_ranges:
    return ret
  with open(path, 'rb') as f:
    for offset, size in section_ranges:
      f.seek(offset)
      ret.append(f.read(size))
  return ret


def _ExtractArchivePath(path):
  # E.g. foo/bar.a(baz.o)
  if path.endswith(')'):
    start_idx = path.index('(')
    return path[:start_idx]
  return None


def _LookupStringSectionPositions(target, tool_prefix, output_directory):
  """Returns a dict of object_path -> [(offset, size)...] of .rodata sections.

  Args:
    target: An archive path string (e.g., "foo.a") or a list of object paths.
  """
  is_archive = isinstance(target, basestring)
  args = [path_util.GetReadElfPath(tool_prefix), '-S', '--wide']
  if is_archive:
    args.append(target)
  else:
    # Assign path for when len(target) == 1 (no File: line exists).
    path = target[0]
    args.extend(target)

  output = subprocess.check_output(args, cwd=output_directory)
  lines = output.splitlines()
  section_positions_by_path = {}
  cur_offsets = []
  for line in lines:
    # File: base/third_party/libevent/libevent.a(buffer.o)
    # [Nr] Name             Type      Addr     Off    Size   ES Flg Lk Inf Al
    # [11] .rodata.str1.1   PROGBITS  00000000 0000b4 000004 01 AMS  0   0  1
    # [11] .rodata.str4.4   PROGBITS  00000000 0000b4 000004 01 AMS  0   0  4
    # [11] .rodata.str8.8   PROGBITS  00000000 0000b4 000004 01 AMS  0   0  8
    # [80] .rodata..L.str   PROGBITS  00000000 000530 000002 00   A  0   0  1
    # The various string sections differ by alignment.
    # The presence of a wchar_t literal (L"asdf") seems to make a str4 section.
    # When multiple sections exist, nm gives us no indication as to which
    # section each string corresponds to.
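    # As a concrete reading of the sample rows above, the .rodata.str1.1 line
    # is parsed below into the (offset, size) pair (0xb4, 0x4), i.e. the
    # values of the "Off" and "Size" columns.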
    if line.startswith('File: '):
      if cur_offsets:
        section_positions_by_path[path] = cur_offsets
        cur_offsets = []
      path = line[6:]
    elif '.rodata.' in line:
      progbits_idx = line.find('PROGBITS ')
      if progbits_idx != -1:
        fields = line[progbits_idx:].split()
        position = (int(fields[2], 16), int(fields[3], 16))
        # The heuristics in _IterStringLiterals rely on str1 coming first.
        if fields[-1] == '1':
          cur_offsets.insert(0, position)
        else:
          cur_offsets.append(position)
  if cur_offsets:
    section_positions_by_path[path] = cur_offsets
  return section_positions_by_path


def _ReadStringSections(target, output_directory, positions_by_path):
  """Returns a dict of object_path -> [string...] of .rodata chunks.

  Args:
    target: An archive path string (e.g., "foo.a") or a list of object paths.
    positions_by_path: A dict of object_path -> [(offset, size)...]
  """
  is_archive = isinstance(target, basestring)
  string_sections_by_path = {}
  if is_archive:
    for subpath, chunk in ar.IterArchiveChunks(
        os.path.join(output_directory, target)):
      path = '{}({})'.format(target, subpath)
      positions = positions_by_path.get(path)
      # No positions if file has no string literals.
      if positions:
        string_sections_by_path[path] = (
            [chunk[offset:offset + size] for offset, size in positions])
  else:
    for path in target:
      positions = positions_by_path.get(path)
      # We already log a warning about this in _IterStringLiterals().
      if positions:
        string_sections_by_path[path] = ReadFileChunks(
            os.path.join(output_directory, path), positions)
  return string_sections_by_path


def _IterStringLiterals(path, addresses, obj_sections):
  """Yields all string literals (including \0) for the given object path.

  Args:
    path: Object file path.
    addresses: List of string offsets encoded as hex strings.
    obj_sections: List of contents of .rodata.str sections read from the given
        object file.
  """
  next_offsets = sorted(int(a, 16) for a in addresses)
  if not obj_sections:
    # Happens when there is an address for a symbol which is not actually a
    # string literal, or when string_sections_by_path is missing an entry.
    logging.warning('Object has %d strings but no string sections: %s',
                    len(addresses), path)
    return
  for section_data in obj_sections:
    cur_offsets = next_offsets
    # Always assume first element is 0. I'm not entirely sure why this is
    # necessary, but strings get missed without it.
    next_offsets = [0]
    prev_offset = 0
    # TODO(agrieve): Switch to using nm --print-size in order to capture the
    #     address+size of each string rather than just the address.
    for offset in cur_offsets[1:]:
      if offset >= len(section_data):
        # Remaining offsets are for next section.
        next_offsets.append(offset)
        continue
      # Figure out which offsets apply to this section via heuristic of them
      # all ending with a null character.
      if offset == prev_offset or section_data[offset - 1] != '\0':
        next_offsets.append(offset)
        continue
      yield section_data[prev_offset:offset]
      prev_offset = offset

    if prev_offset < len(section_data):
      yield section_data[prev_offset:]


def _AnnotateStringData(string_data, path_value_gen):
  """Annotates each |string_data| section data with paths and ranges.

  Args:
    string_data: [raw_string_data for each string_section] from an ELF file.
    path_value_gen: A generator of (path, value) pairs, where |path| is the
        path to an object file and |value| is a string to annotate.

  Returns:
    [{path: [string_ranges]} for each string_section].
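    For example, a 5-byte |value| found at offset 12 of the first section's
    data is recorded under its |path| as the encoded range "12:5" in the
    first returned dict.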
""" ret = [collections.defaultdict(list) for _ in string_data] # Brute-force search ** merge strings sections in |string_data| for string # values from |path_value_gen|. This is by far the slowest part of # AnalyzeStringLiterals(). # TODO(agrieve): Pre-process |string_data| into a dict of literal->address (at # least for ASCII strings). for path, value in path_value_gen: first_match = -1 first_match_dict = None for target_dict, data in itertools.izip(ret, string_data): # Set offset so that it will be 0 when len(value) is added to it below. offset = -len(value) while True: offset = data.find(value, offset + len(value)) if offset == -1: break # Preferring exact matches (those following \0) over substring matches # significantly increases accuracy (although shows that linker isn't # being optimal). if offset == 0 or data[offset - 1] == '\0': break if first_match == -1: first_match = offset first_match_dict = target_dict if offset != -1: break if offset == -1: # Exact match not found, so take suffix match if it exists. offset = first_match target_dict = first_match_dict # Missing strings happen when optimization make them unused. if offset != -1: # Encode tuple as a string for easier mashalling. target_dict[path].append(str(offset) + ':' + str(len(value))) return ret # This is a target for BulkForkAndCall(). def ResolveStringPiecesIndirect(encoded_string_addresses_by_path, string_data, tool_prefix, output_directory): string_addresses_by_path = concurrent.DecodeDictOfLists( encoded_string_addresses_by_path) # Assign |target| as archive path, or a list of object paths. any_path = next(string_addresses_by_path.iterkeys()) target = _ExtractArchivePath(any_path) if not target: target = string_addresses_by_path.keys() # Run readelf to find location of .rodata within the .o files. section_positions_by_path = _LookupStringSectionPositions( target, tool_prefix, output_directory) # Load the .rodata sections (from object files) as strings. string_sections_by_path = _ReadStringSections( target, output_directory, section_positions_by_path) def GeneratePathAndValues(): for path, object_addresses in string_addresses_by_path.iteritems(): for value in _IterStringLiterals( path, object_addresses, string_sections_by_path.get(path)): yield path, value ret = _AnnotateStringData(string_data, GeneratePathAndValues()) return [concurrent.EncodeDictOfLists(x) for x in ret] # This is a target for BulkForkAndCall(). def ResolveStringPieces(encoded_strings_by_path, string_data): # ast.literal_eval() undoes repr() applied to strings. strings_by_path = concurrent.DecodeDictOfLists( encoded_strings_by_path, value_transform=ast.literal_eval) def GeneratePathAndValues(): for path, strings in strings_by_path.iteritems(): for value in strings: yield path, value ret = _AnnotateStringData(string_data, GeneratePathAndValues()) return [concurrent.EncodeDictOfLists(x) for x in ret]