naiveproxy/tools/gn/string_utils.cc

// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "tools/gn/string_utils.h"

#include <stddef.h>
#include <cctype>

#include "base/strings/string_number_conversions.h"
#include "tools/gn/err.h"
#include "tools/gn/input_file.h"
#include "tools/gn/parser.h"
#include "tools/gn/scope.h"
#include "tools/gn/token.h"
#include "tools/gn/tokenizer.h"
#include "tools/gn/value.h"

namespace {

// Constructs an Err indicating a range inside a string. We assume that the
// token has quotes around it that are not counted by the offset.
Err ErrInsideStringToken(const Token& token, size_t offset, size_t size,
                         const std::string& msg,
                         const std::string& help = std::string()) {
  // The "+1" is skipping over the " at the beginning of the token.
  int int_offset = static_cast<int>(offset);
  Location begin_loc(token.location().file(),
                     token.location().line_number(),
                     token.location().column_number() + int_offset + 1,
                     token.location().byte() + int_offset + 1);
  Location end_loc(
      token.location().file(),
      token.location().line_number(),
      token.location().column_number() + int_offset + 1 +
          static_cast<int>(size),
      token.location().byte() + int_offset + 1 + static_cast<int>(size));
  return Err(LocationRange(begin_loc, end_loc), msg, help);
}

// Notes about expression interpolation. This is based loosly on Dart but is
// slightly less flexible. In Dart, seeing the ${ in a string is something
// the toplevel parser knows about, and it will recurse into the block
// treating it as a first-class {...} block. So even things like this work:
//   "hello ${"foo}"*2+"bar"}"  =>  "hello foo}foo}bar"
// (you can see it did not get confused by the nested strings or the nested "}"
// inside the block).
//
// This is cool but complicates the parser for almost no benefit for this
// non-general-purpose programming language. The main reason expressions are
// supported here at all are to support "${scope.variable}" and "${list[0]}",
// neither of which have any of these edge-cases.
//
// In this simplified approach, we search for the terminating '}' and execute
// the result. This means we can't support any expressions with embedded '}'
// or '"'. To keep people from getting confusing about what's supported and
// what's not, only identifier and accessor expressions are allowed (neither
// of these run into any of these edge-cases).
bool AppendInterpolatedExpression(Scope* scope,
                                  const Token& token,
                                  const char* input,
                                  size_t begin_offset,
                                  size_t end_offset,
                                  std::string* output,
                                  Err* err) {
  SourceFile empty_source_file;  // Prevent most vexing parse.
  InputFile input_file(empty_source_file);
  input_file.SetContents(
      std::string(&input[begin_offset], end_offset - begin_offset));

  // Tokenize.
  std::vector<Token> tokens = Tokenizer::Tokenize(&input_file, err);
  if (err->has_error()) {
    // The error will point into our temporary buffer, rewrite it to refer
    // to the original token. This will make the location information less
    // precise, but generally there won't be complicated things in string
    // interpolations.
    *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
                                err->message(), err->help_text());
    return false;
  }

  // Parse.
  std::unique_ptr<ParseNode> node = Parser::ParseExpression(tokens, err);
  if (err->has_error()) {
    // Rewrite error as above.
    *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
                                err->message(), err->help_text());
    return false;
  }
  if (!(node->AsIdentifier() || node->AsAccessor())) {
    *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
        "Invalid string interpolation.",
        "The thing inside the ${} must be an identifier ${foo},\n"
        "a scope access ${foo.bar}, or a list access ${foo[0]}.");
    return false;
  }

  // Evaluate.
  Value result = node->Execute(scope, err);
  if (err->has_error()) {
    // Rewrite error as above.
    *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
                                err->message(), err->help_text());
    return false;
  }

  output->append(result.ToString(false));
  return true;
}

bool AppendInterpolatedIdentifier(Scope* scope,
                                  const Token& token,
                                  const char* input,
                                  size_t begin_offset,
                                  size_t end_offset,
                                  std::string* output,
                                  Err* err) {
  base::StringPiece identifier(&input[begin_offset],
                               end_offset - begin_offset);
  const Value* value = scope->GetValue(identifier, true);
  if (!value) {
    // We assume the input points inside the token.
    *err = ErrInsideStringToken(
        token, identifier.data() - token.value().data() - 1, identifier.size(),
        "Undefined identifier in string expansion.",
        std::string("\"") + identifier + "\" is not currently in scope.");
    return false;
  }

  output->append(value->ToString(false));
  return true;
}

// Handles string interpolations: $identifier and ${expression}
//
// |*i| is the index into |input| after the $. This will be updated to point to
// the last character consumed on success. The token is the original string
// to blame on failure.
//
// On failure, returns false and sets the error. On success, appends the
// result of the interpolation to |*output|.
bool AppendStringInterpolation(Scope* scope,
                               const Token& token,
                               const char* input, size_t size,
                               size_t* i,
                               std::string* output,
                               Err* err) {
  size_t dollars_index = *i - 1;

  if (input[*i] == '{') {
    // Bracketed expression.
    (*i)++;
    size_t begin_offset = *i;

    // Find the closing } and check for non-identifier chars. Don't need to
    // bother checking for the more-restricted first character of an identifier
    // since the {} unambiguously denotes the range, and identifiers with
    // invalid names just won't be found later.
    bool has_non_ident_chars = false;
    while (*i < size && input[*i] != '}') {
      has_non_ident_chars |= Tokenizer::IsIdentifierContinuingChar(input[*i]);
      (*i)++;
    }
    if (*i == size) {
      *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index,
                                  "Unterminated ${...");
      return false;
    }

    // In the common case, the thing inside the {} will actually be a
    // simple identifier. Avoid all the complicated parsing of accessors
    // in this case.
    if (!has_non_ident_chars) {
      return AppendInterpolatedIdentifier(scope, token, input, begin_offset,
                                          *i, output, err);
    }
    return AppendInterpolatedExpression(scope, token, input, begin_offset, *i,
                                        output, err);
  }

  // Simple identifier.
  // The first char of an identifier is more restricted.
  if (!Tokenizer::IsIdentifierFirstChar(input[*i])) {
    *err = ErrInsideStringToken(
        token, dollars_index, *i - dollars_index + 1,
        "$ not followed by an identifier char.",
        "It you want a literal $ use \"\\$\".");
    return false;
  }
  size_t begin_offset = *i;
  (*i)++;

  // Find the first non-identifier char following the string.
  while (*i < size && Tokenizer::IsIdentifierContinuingChar(input[*i]))
    (*i)++;
  size_t end_offset = *i;
  (*i)--;  // Back up to mark the last character consumed.
  return AppendInterpolatedIdentifier(scope, token, input, begin_offset,
                                      end_offset, output, err);
}

// Handles a hex literal: $0xFF
//
// |*i| is the index into |input| after the $. This will be updated to point to
// the last character consumed on success. The token is the original string
// to blame on failure.
//
// On failure, returns false and sets the error. On success, appends the
// char with the given hex value to |*output|.
bool AppendHexByte(Scope* scope,
                   const Token& token,
                   const char* input, size_t size,
                   size_t* i,
                   std::string* output,
                   Err* err) {
  size_t dollars_index = *i - 1;
  // "$0" is already known to exist.
  if (*i + 3 >= size || input[*i + 1] != 'x' || !std::isxdigit(input[*i + 2]) ||
      !std::isxdigit(input[*i + 3])) {
    *err = ErrInsideStringToken(
        token, dollars_index, *i - dollars_index + 1,
        "Invalid hex character. Hex values must look like 0xFF.");
    return false;
  }
  int value = 0;
  if (!base::HexStringToInt(base::StringPiece(&input[*i + 2], 2), &value)) {
    *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1,
                                "Could not convert hex value.");
    return false;
  }
  *i += 3;
  output->push_back(value);
  return true;
}

}  // namespace

bool ExpandStringLiteral(Scope* scope,
                         const Token& literal,
                         Value* result,
                         Err* err) {
  DCHECK(literal.type() == Token::STRING);
  DCHECK(literal.value().size() > 1);  // Should include quotes.
  DCHECK(result->type() == Value::STRING);  // Should be already set.

  // The token includes the surrounding quotes, so strip those off.
  const char* input = &literal.value().data()[1];
  size_t size = literal.value().size() - 2;

  std::string& output = result->string_value();
  output.reserve(size);
  for (size_t i = 0; i < size; i++) {
    if (input[i] == '\\') {
      if (i < size - 1) {
        switch (input[i + 1]) {
          case '\\':
          case '"':
          case '$':
            output.push_back(input[i + 1]);
            i++;
            continue;
          default:  // Everything else has no meaning: pass the literal.
            break;
        }
      }
      output.push_back(input[i]);
    } else if (input[i] == '$') {
      i++;
      if (i == size) {
        *err = ErrInsideStringToken(literal, i - 1, 1, "$ at end of string.",
            "I was expecting an identifier, 0xFF, or {...} after the $.");
        return false;
      }
      if (input[i] == '0') {
        if (!AppendHexByte(scope, literal, input, size, &i, &output, err))
          return false;
      } else if (!AppendStringInterpolation(scope, literal, input, size, &i,
                                     &output, err))
        return false;
    } else {
      output.push_back(input[i]);
    }
  }
  return true;
}

size_t EditDistance(const base::StringPiece& s1,
                    const base::StringPiece& s2,
                    size_t max_edit_distance) {
  // The algorithm implemented below is the "classic"
  // dynamic-programming algorithm for computing the Levenshtein
  // distance, which is described here:
  //
  //   http://en.wikipedia.org/wiki/Levenshtein_distance
  //
  // Although the algorithm is typically described using an m x n
  // array, only one row plus one element are used at a time, so this
  // implementation just keeps one vector for the row.  To update one entry,
  // only the entries to the left, top, and top-left are needed.  The left
  // entry is in row[x-1], the top entry is what's in row[x] from the last
  // iteration, and the top-left entry is stored in previous.
  size_t m = s1.size();
  size_t n = s2.size();

  std::vector<size_t> row(n + 1);
  for (size_t i = 1; i <= n; ++i)
    row[i] = i;

  for (size_t y = 1; y <= m; ++y) {
    row[0] = y;
    size_t best_this_row = row[0];

    size_t previous = y - 1;
    for (size_t x = 1; x <= n; ++x) {
      size_t old_row = row[x];
      row[x] = std::min(previous + (s1[y - 1] == s2[x - 1] ? 0u : 1u),
                        std::min(row[x - 1], row[x]) + 1u);
      previous = old_row;
      best_this_row = std::min(best_this_row, row[x]);
    }

    if (max_edit_distance && best_this_row > max_edit_distance)
      return max_edit_distance + 1;
  }

  return row[n];
}

base::StringPiece SpellcheckString(
    const base::StringPiece& text,
    const std::vector<base::StringPiece>& words) {
  const size_t kMaxValidEditDistance = 3u;

  size_t min_distance = kMaxValidEditDistance + 1u;
  base::StringPiece result;
  for (base::StringPiece word : words) {
    size_t distance = EditDistance(word, text, kMaxValidEditDistance);
    if (distance < min_distance) {
      min_distance = distance;
      result = word;
    }
  }
  return result;
}