naiveproxy/tools/gn/string_utils.cc

// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "tools/gn/string_utils.h"

#include <stddef.h>
#include <cctype>

#include "base/strings/string_number_conversions.h"
#include "tools/gn/err.h"
#include "tools/gn/input_file.h"
#include "tools/gn/parser.h"
#include "tools/gn/scope.h"
#include "tools/gn/token.h"
#include "tools/gn/tokenizer.h"
#include "tools/gn/value.h"

namespace {

// Builds an Err whose location range covers |size| positions starting |offset|
// positions into the contents of the string |token|. |offset| is measured from
// the first character after the token's opening quote, so one extra position
// is added to step over that quote. |msg| and |help| are passed through to the
// Err unchanged.
Err ErrInsideStringToken(const Token& token, size_t offset, size_t size,
                         const std::string& msg,
                         const std::string& help = std::string()) {
  const int offset_in_contents = static_cast<int>(offset);
  const int range_length = static_cast<int>(size);

  // The "+ 1" steps over the " that begins the token.
  const int begin_column =
      token.location().column_number() + offset_in_contents + 1;
  const int begin_byte = token.location().byte() + offset_in_contents + 1;

  // Both ends of the range are reported on the token's own line.
  Location range_begin(token.location().file(),
                       token.location().line_number(),
                       begin_column,
                       begin_byte);
  Location range_end(token.location().file(),
                     token.location().line_number(),
                     begin_column + range_length,
                     begin_byte + range_length);
  return Err(LocationRange(range_begin, range_end), msg, help);
}

// Notes about expression interpolation. This is based loosely on Dart but is
// slightly less flexible. In Dart, seeing the ${ in a string is something
// the toplevel parser knows about, and it will recurse into the block
// treating it as a first-class {...} block. So even things like this work:
//   "hello ${"foo}"*2+"bar"}"  =>  "hello foo}foo}bar"
// (you can see it did not get confused by the nested strings or the nested "}"
// inside the block).
//
// This is cool but complicates the parser for almost no benefit for this
// non-general-purpose programming language. The main reason expressions are
// supported here at all is to support "${scope.variable}" and "${list[0]}",
// neither of which have any of these edge-cases.
//
// In this simplified approach, we search for the terminating '}' and execute
// the result. This means we can't support any expressions with embedded '}'
// or '"'. To keep people from getting confused about what's supported and
// what's not, only identifier and accessor expressions are allowed (neither
// of these run into any of these edge-cases).
bool AppendInterpolatedExpression(Scope* scope,
                                  const Token& token,
                                  const char* input,
                                  size_t begin_offset,
                                  size_t end_offset,
                                  std::string* output,
                                  Err* err) {
  // Evaluates the expression text input[begin_offset, end_offset) in |scope|
  // and appends the string form of its value to |*output|. Returns false with
  // |*err| set if tokenizing, parsing or executing fails, or if the expression
  // is neither an identifier nor an accessor.
  const size_t expression_length = end_offset - begin_offset;

  // Errors produced below point into the temporary InputFile. This rewrites
  // the pending error so it refers to the expression's range inside the
  // original string token instead. That loses some precision, but string
  // interpolations are generally simple. Always yields false so that callers
  // can return its result directly.
  auto blame_original_token = [&]() {
    *err = ErrInsideStringToken(token, begin_offset, expression_length,
                                err->message(), err->help_text());
    return false;
  };

  SourceFile no_source_file;  // Named local prevents the most vexing parse.
  InputFile expression_file(no_source_file);
  expression_file.SetContents(
      std::string(input + begin_offset, expression_length));

  // Step 1: tokenize the expression text.
  std::vector<Token> expression_tokens =
      Tokenizer::Tokenize(&expression_file, err);
  if (err->has_error())
    return blame_original_token();

  // Step 2: parse the tokens into a single expression node.
  std::unique_ptr<ParseNode> root =
      Parser::ParseExpression(expression_tokens, err);
  if (err->has_error())
    return blame_original_token();

  // Only identifiers and accessors are accepted inside ${}.
  const bool is_supported_form = root->AsIdentifier() || root->AsAccessor();
  if (!is_supported_form) {
    *err = ErrInsideStringToken(token, begin_offset, expression_length,
        "Invalid string interpolation.",
        "The thing inside the ${} must be an identifier ${foo},\n"
        "a scope access ${foo.bar}, or a list access ${foo[0]}.");
    return false;
  }

  // Step 3: evaluate the node and append its unquoted string form.
  Value evaluated = root->Execute(scope, err);
  if (err->has_error())
    return blame_original_token();

  output->append(evaluated.ToString(false));
  return true;
}

// Looks up the identifier spelled by input[begin_offset, end_offset) in
// |scope| and appends the string form of its value to |*output|. If the
// identifier has no value in |scope|, sets |*err| to an error covering the
// identifier's position inside |token| and returns false.
bool AppendInterpolatedIdentifier(Scope* scope,
                                  const Token& token,
                                  const char* input,
                                  size_t begin_offset,
                                  size_t end_offset,
                                  std::string* output,
                                  Err* err) {
  base::StringPiece identifier(input + begin_offset,
                               end_offset - begin_offset);

  if (const Value* found = scope->GetValue(identifier, true)) {
    output->append(found->ToString(false));
    return true;
  }

  // The error offset is derived from pointer arithmetic against the token's
  // text, which assumes |input| points inside that token. The "- 1" discounts
  // the opening quote, since ErrInsideStringToken expects offsets relative to
  // the string contents.
  const size_t offset_in_contents =
      identifier.data() - token.value().data() - 1;
  *err = ErrInsideStringToken(
      token, offset_in_contents, identifier.size(),
      "Undefined identifier in string expansion.",
      std::string("\"") + identifier + "\" is not currently in scope.");
  return false;
}

// Handles string interpolations: $identifier and ${expression}
//
// |*i| is the index into |input| after the $ and must be less than |size|
// (the character at |*i| is read unconditionally). On success it is updated to
// point to the last character consumed: the closing } for the bracketed form,
// or the final identifier character for the bare form. The token is the
// original string to blame on failure.
//
// On failure, returns false and sets the error. On success, appends the
// result of the interpolation to |*output|.
bool AppendStringInterpolation(Scope* scope,
                               const Token& token,
                               const char* input, size_t size,
                               size_t* i,
                               std::string* output,
                               Err* err) {
  size_t dollars_index = *i - 1;

  if (input[*i] == '{') {
    // Bracketed expression.
    (*i)++;
    size_t begin_offset = *i;

    // Find the closing } and check for non-identifier chars. Don't need to
    // bother checking for the more-restricted first character of an identifier
    // since the {} unambiguously denotes the range, and identifiers with
    // invalid names just won't be found later.
    bool has_non_ident_chars = false;
    while (*i < size && input[*i] != '}') {
      // The flag records the presence of a character that can NOT be part of
      // an identifier, hence the negation of the tokenizer predicate.
      has_non_ident_chars |= !Tokenizer::IsIdentifierContinuingChar(input[*i]);
      (*i)++;
    }
    if (*i == size) {
      *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index,
                                  "Unterminated ${...");
      return false;
    }

    // In the common case, the thing inside the {} will actually be a
    // simple identifier. Avoid all the complicated parsing of accessors
    // in this case.
    if (!has_non_ident_chars) {
      return AppendInterpolatedIdentifier(scope, token, input, begin_offset,
                                          *i, output, err);
    }
    return AppendInterpolatedExpression(scope, token, input, begin_offset, *i,
                                        output, err);
  }

  // Simple identifier.
  // The first char of an identifier is more restricted.
  if (!Tokenizer::IsIdentifierFirstChar(input[*i])) {
    // The blamed range covers the $ and the offending character after it.
    *err = ErrInsideStringToken(
        token, dollars_index, *i - dollars_index + 1,
        "$ not followed by an identifier char.",
        "If you want a literal $ use \"\\$\".");
    return false;
  }
  size_t begin_offset = *i;
  (*i)++;

  // Find the first non-identifier char following the string.
  while (*i < size && Tokenizer::IsIdentifierContinuingChar(input[*i]))
    (*i)++;
  size_t end_offset = *i;
  (*i)--;  // Back up to mark the last character consumed.
  return AppendInterpolatedIdentifier(scope, token, input, begin_offset,
                                      end_offset, output, err);
}

// Handles a hex literal: $0xFF
//
// |*i| is the index into |input| after the $, i.e. the index of the '0'. On
// success it is advanced to the second hex digit, the last character consumed.
// The token is the original string to blame on failure. |scope| is not used.
//
// On failure, returns false and sets the error. On success, appends the
// char with the given hex value to |*output|.
bool AppendHexByte(Scope* scope,
                   const Token& token,
                   const char* input, size_t size,
                   size_t* i,
                   std::string* output,
                   Err* err) {
  size_t dollars_index = *i - 1;
  // "$0" is already known to exist. What must follow is an 'x' and exactly two
  // hex digits, so index |*i + 3| has to be inside the input.
  //
  // The <cctype> classifiers have undefined behavior for arguments that are
  // neither EOF nor representable as unsigned char, and string contents may
  // hold bytes >= 0x80 that are negative as plain char, so convert first.
  if (*i + 3 >= size || input[*i + 1] != 'x' ||
      !std::isxdigit(static_cast<unsigned char>(input[*i + 2])) ||
      !std::isxdigit(static_cast<unsigned char>(input[*i + 3]))) {
    *err = ErrInsideStringToken(
        token, dollars_index, *i - dollars_index + 1,
        "Invalid hex character. Hex values must look like 0xFF.");
    return false;
  }
  int value = 0;
  if (!base::HexStringToInt(base::StringPiece(&input[*i + 2], 2), &value)) {
    *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1,
                                "Could not convert hex value.");
    return false;
  }
  *i += 3;
  // Two hex digits give a value in [0, 255]; store it as a single byte.
  output->push_back(static_cast<char>(value));
  return true;
}

}  // namespace

// Expands the escapes and $-interpolations in the string token |literal|,
// appending the expanded text to the string already held by |*result|.
// Returns false and sets |*err| if an interpolation or hex escape is invalid.
//
// Recognized escapes are \\, \" and \$; a backslash followed by anything else
// (or ending the string) is copied through literally.
bool ExpandStringLiteral(Scope* scope,
                         const Token& literal,
                         Value* result,
                         Err* err) {
  DCHECK(literal.type() == Token::STRING);
  DCHECK(literal.value().size() > 1);  // Should include quotes.
  DCHECK(result->type() == Value::STRING);  // Should be already set.

  // Work on the text between the surrounding quotes.
  const char* input = literal.value().data() + 1;
  const size_t size = literal.value().size() - 2;

  std::string& output = result->string_value();
  output.reserve(size);

  size_t i = 0;
  while (i < size) {
    const char current = input[i];

    if (current == '$') {
      ++i;  // Move to the character following the $.
      if (i == size) {
        *err = ErrInsideStringToken(literal, i - 1, 1, "$ at end of string.",
            "I was expecting an identifier, 0xFF, or {...} after the $.");
        return false;
      }
      // A '0' after the $ starts a hex byte; anything else is treated as an
      // identifier or ${...} interpolation. Both helpers leave |i| on the last
      // character they consumed.
      const bool expanded =
          input[i] == '0'
              ? AppendHexByte(scope, literal, input, size, &i, &output, err)
              : AppendStringInterpolation(scope, literal, input, size, &i,
                                          &output, err);
      if (!expanded)
        return false;
      ++i;
      continue;
    }

    if (current == '\\' && i + 1 < size) {
      const char escaped = input[i + 1];
      if (escaped == '\\' || escaped == '"' || escaped == '$') {
        // Emit the escaped character and skip both input characters.
        output.push_back(escaped);
        i += 2;
        continue;
      }
      // Everything else has no meaning: fall through and pass the backslash
      // itself.
    }

    output.push_back(current);
    ++i;
  }
  return true;
}

// Returns the Levenshtein distance between |s1| and |s2|:
//
//   http://en.wikipedia.org/wiki/Levenshtein_distance
//
// If |max_edit_distance| is nonzero and every entry of some row of the
// dynamic-programming table exceeds it, the computation stops early and
// |max_edit_distance| + 1 is returned. A value of zero disables the early
// exit.
size_t EditDistance(const base::StringPiece& s1,
                    const base::StringPiece& s2,
                    size_t max_edit_distance) {
  // The textbook formulation fills an m x n table, but each cell only depends
  // on its left, upper and upper-left neighbours. A single row is therefore
  // enough: while computing cell x, distances[x - 1] is the left neighbour,
  // distances[x] still holds the upper neighbour from the previous pass, and
  // the upper-left neighbour is carried along in |upper_left|.
  const size_t s1_length = s1.size();
  const size_t s2_length = s2.size();

  std::vector<size_t> distances(s2_length + 1);
  for (size_t x = 1; x <= s2_length; ++x)
    distances[x] = x;

  for (size_t y = 1; y <= s1_length; ++y) {
    distances[0] = y;
    size_t row_minimum = distances[0];

    size_t upper_left = y - 1;
    for (size_t x = 1; x <= s2_length; ++x) {
      const size_t upper = distances[x];
      const size_t left = distances[x - 1];

      const size_t via_substitution =
          upper_left + (s1[y - 1] == s2[x - 1] ? 0u : 1u);
      const size_t via_insert_or_delete = std::min(left, upper) + 1u;

      distances[x] = std::min(via_substitution, via_insert_or_delete);
      upper_left = upper;
      row_minimum = std::min(row_minimum, distances[x]);
    }

    if (max_edit_distance && row_minimum > max_edit_distance)
      return max_edit_distance + 1;
  }

  return distances[s2_length];
}

// Returns the entry of |words| with the smallest edit distance to |text|,
// provided that distance is at most 3. When several entries tie, the earliest
// one wins. Returns an empty StringPiece when no entry is close enough.
base::StringPiece SpellcheckString(
    const base::StringPiece& text,
    const std::vector<base::StringPiece>& words) {
  const size_t kMaxValidEditDistance = 3u;

  base::StringPiece closest_word;
  // Start one past the limit so only distances within the limit can win.
  size_t closest_distance = kMaxValidEditDistance + 1u;

  for (auto candidate = words.begin(); candidate != words.end(); ++candidate) {
    const size_t candidate_distance =
        EditDistance(*candidate, text, kMaxValidEditDistance);
    if (candidate_distance >= closest_distance)
      continue;  // Not strictly better than what we already have.
    closest_distance = candidate_distance;
    closest_word = *candidate;
  }
  return closest_word;
}
Import chromium-63.0.3239.132 2018-01-28 19:30:36 +03:00			`// Copyright (c) 2013 The Chromium Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style license that can be`
			`// found in the LICENSE file.`

			`#include "tools/gn/string_utils.h"`

			`#include <stddef.h>`
			`#include <cctype>`

			`#include "base/strings/string_number_conversions.h"`
			`#include "tools/gn/err.h"`
			`#include "tools/gn/input_file.h"`
			`#include "tools/gn/parser.h"`
			`#include "tools/gn/scope.h"`
			`#include "tools/gn/token.h"`
			`#include "tools/gn/tokenizer.h"`
			`#include "tools/gn/value.h"`

			`namespace {`

			`// Constructs an Err indicating a range inside a string. We assume that the`
			`// token has quotes around it that are not counted by the offset.`
			`Err ErrInsideStringToken(const Token& token, size_t offset, size_t size,`
			`const std::string& msg,`
			`const std::string& help = std::string()) {`
			`// The "+1" is skipping over the " at the beginning of the token.`
			`int int_offset = static_cast<int>(offset);`
			`Location begin_loc(token.location().file(),`
			`token.location().line_number(),`
			`token.location().column_number() + int_offset + 1,`
			`token.location().byte() + int_offset + 1);`
			`Location end_loc(`
			`token.location().file(),`
			`token.location().line_number(),`
			`token.location().column_number() + int_offset + 1 +`
			`static_cast<int>(size),`
			`token.location().byte() + int_offset + 1 + static_cast<int>(size));`
			`return Err(LocationRange(begin_loc, end_loc), msg, help);`
			`}`

			`// Notes about expression interpolation. This is based loosly on Dart but is`
			`// slightly less flexible. In Dart, seeing the ${ in a string is something`
			`// the toplevel parser knows about, and it will recurse into the block`
			`// treating it as a first-class {...} block. So even things like this work:`
			`// "hello ${"foo}"*2+"bar"}" => "hello foo}foo}bar"`
			`// (you can see it did not get confused by the nested strings or the nested "}"`
			`// inside the block).`
			`//`
			`// This is cool but complicates the parser for almost no benefit for this`
			`// non-general-purpose programming language. The main reason expressions are`
			`// supported here at all are to support "${scope.variable}" and "${list[0]}",`
			`// neither of which have any of these edge-cases.`
			`//`
			`// In this simplified approach, we search for the terminating '}' and execute`
			`// the result. This means we can't support any expressions with embedded '}'`
			`// or '"'. To keep people from getting confusing about what's supported and`
			`// what's not, only identifier and accessor expressions are allowed (neither`
			`// of these run into any of these edge-cases).`
			`bool AppendInterpolatedExpression(Scope* scope,`
			`const Token& token,`
			`const char* input,`
			`size_t begin_offset,`
			`size_t end_offset,`
			`std::string* output,`
			`Err* err) {`
			`SourceFile empty_source_file; // Prevent most vexing parse.`
			`InputFile input_file(empty_source_file);`
			`input_file.SetContents(`
			`std::string(&input[begin_offset], end_offset - begin_offset));`

			`// Tokenize.`
			`std::vector<Token> tokens = Tokenizer::Tokenize(&input_file, err);`
			`if (err->has_error()) {`
			`// The error will point into our temporary buffer, rewrite it to refer`
			`// to the original token. This will make the location information less`
			`// precise, but generally there won't be complicated things in string`
			`// interpolations.`
			`*err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,`
			`err->message(), err->help_text());`
			`return false;`
			`}`

			`// Parse.`
			`std::unique_ptr<ParseNode> node = Parser::ParseExpression(tokens, err);`
			`if (err->has_error()) {`
			`// Rewrite error as above.`
			`*err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,`
			`err->message(), err->help_text());`
			`return false;`
			`}`
			`if (!(node->AsIdentifier() \|\| node->AsAccessor())) {`
			`*err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,`
			`"Invalid string interpolation.",`
			`"The thing inside the ${} must be an identifier ${foo},\n"`
			`"a scope access ${foo.bar}, or a list access ${foo[0]}.");`
			`return false;`
			`}`

			`// Evaluate.`
			`Value result = node->Execute(scope, err);`
			`if (err->has_error()) {`
			`// Rewrite error as above.`
			`*err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,`
			`err->message(), err->help_text());`
			`return false;`
			`}`

			`output->append(result.ToString(false));`
			`return true;`
			`}`

			`bool AppendInterpolatedIdentifier(Scope* scope,`
			`const Token& token,`
			`const char* input,`
			`size_t begin_offset,`
			`size_t end_offset,`
			`std::string* output,`
			`Err* err) {`
			`base::StringPiece identifier(&input[begin_offset],`
			`end_offset - begin_offset);`
			`const Value* value = scope->GetValue(identifier, true);`
			`if (!value) {`
			`// We assume the input points inside the token.`
			`*err = ErrInsideStringToken(`
			`token, identifier.data() - token.value().data() - 1, identifier.size(),`
			`"Undefined identifier in string expansion.",`
			`std::string("\"") + identifier + "\" is not currently in scope.");`
			`return false;`
			`}`

			`output->append(value->ToString(false));`
			`return true;`
			`}`

			`// Handles string interpolations: $identifier and ${expression}`
			`//`
			`// \|*i\| is the index into \|input\| after the $. This will be updated to point to`
			`// the last character consumed on success. The token is the original string`
			`// to blame on failure.`
			`//`
			`// On failure, returns false and sets the error. On success, appends the`
			`// result of the interpolation to \|*output\|.`
			`bool AppendStringInterpolation(Scope* scope,`
			`const Token& token,`
			`const char* input, size_t size,`
			`size_t* i,`
			`std::string* output,`
			`Err* err) {`
			`size_t dollars_index = *i - 1;`

			`if (input[*i] == '{') {`
			`// Bracketed expression.`
			`(*i)++;`
			`size_t begin_offset = *i;`

			`// Find the closing } and check for non-identifier chars. Don't need to`
			`// bother checking for the more-restricted first character of an identifier`
			`// since the {} unambiguously denotes the range, and identifiers with`
			`// invalid names just won't be found later.`
			`bool has_non_ident_chars = false;`
			`while (i < size && input[i] != '}') {`
			`has_non_ident_chars \|= Tokenizer::IsIdentifierContinuingChar(input[*i]);`
			`(*i)++;`
			`}`
			`if (*i == size) {`
			`err = ErrInsideStringToken(token, dollars_index, i - dollars_index,`
			`"Unterminated ${...");`
			`return false;`
			`}`

			`// In the common case, the thing inside the {} will actually be a`
			`// simple identifier. Avoid all the complicated parsing of accessors`
			`// in this case.`
			`if (!has_non_ident_chars) {`
			`return AppendInterpolatedIdentifier(scope, token, input, begin_offset,`
			`*i, output, err);`
			`}`
			`return AppendInterpolatedExpression(scope, token, input, begin_offset, *i,`
			`output, err);`
			`}`

			`// Simple identifier.`
			`// The first char of an identifier is more restricted.`
			`if (!Tokenizer::IsIdentifierFirstChar(input[*i])) {`
			`*err = ErrInsideStringToken(`
			`token, dollars_index, *i - dollars_index + 1,`
			`"$ not followed by an identifier char.",`
			`"It you want a literal $ use \"\\$\".");`
			`return false;`
			`}`
			`size_t begin_offset = *i;`
			`(*i)++;`

			`// Find the first non-identifier char following the string.`
			`while (i < size && Tokenizer::IsIdentifierContinuingChar(input[i]))`
			`(*i)++;`
			`size_t end_offset = *i;`
			`(*i)--; // Back up to mark the last character consumed.`
			`return AppendInterpolatedIdentifier(scope, token, input, begin_offset,`
			`end_offset, output, err);`
			`}`

			`// Handles a hex literal: $0xFF`
			`//`
			`// \|*i\| is the index into \|input\| after the $. This will be updated to point to`
			`// the last character consumed on success. The token is the original string`
			`// to blame on failure.`
			`//`
			`// On failure, returns false and sets the error. On success, appends the`
			`// char with the given hex value to \|*output\|.`
			`bool AppendHexByte(Scope* scope,`
			`const Token& token,`
			`const char* input, size_t size,`
			`size_t* i,`
			`std::string* output,`
			`Err* err) {`
			`size_t dollars_index = *i - 1;`
			`// "$0" is already known to exist.`
			`if (i + 3 >= size \|\| input[i + 1] != 'x' \|\| !std::isxdigit(input[*i + 2]) \|\|`
			`!std::isxdigit(input[*i + 3])) {`
			`*err = ErrInsideStringToken(`
			`token, dollars_index, *i - dollars_index + 1,`
			`"Invalid hex character. Hex values must look like 0xFF.");`
			`return false;`
			`}`
			`int value = 0;`
			`if (!base::HexStringToInt(base::StringPiece(&input[*i + 2], 2), &value)) {`
			`err = ErrInsideStringToken(token, dollars_index, i - dollars_index + 1,`
			`"Could not convert hex value.");`
			`return false;`
			`}`
			`*i += 3;`
			`output->push_back(value);`
			`return true;`
			`}`

			`} // namespace`

			`bool ExpandStringLiteral(Scope* scope,`
			`const Token& literal,`
			`Value* result,`
			`Err* err) {`
			`DCHECK(literal.type() == Token::STRING);`
			`DCHECK(literal.value().size() > 1); // Should include quotes.`
			`DCHECK(result->type() == Value::STRING); // Should be already set.`

			`// The token includes the surrounding quotes, so strip those off.`
			`const char* input = &literal.value().data()[1];`
			`size_t size = literal.value().size() - 2;`

			`std::string& output = result->string_value();`
			`output.reserve(size);`
			`for (size_t i = 0; i < size; i++) {`
			`if (input[i] == '\\') {`
			`if (i < size - 1) {`
			`switch (input[i + 1]) {`
			`case '\\':`
			`case '"':`
			`case '$':`
			`output.push_back(input[i + 1]);`
			`i++;`
			`continue;`
			`default: // Everything else has no meaning: pass the literal.`
			`break;`
			`}`
			`}`
			`output.push_back(input[i]);`
			`} else if (input[i] == '$') {`
			`i++;`
			`if (i == size) {`
			`*err = ErrInsideStringToken(literal, i - 1, 1, "$ at end of string.",`
			`"I was expecting an identifier, 0xFF, or {...} after the $.");`
			`return false;`
			`}`
			`if (input[i] == '0') {`
			`if (!AppendHexByte(scope, literal, input, size, &i, &output, err))`
			`return false;`
			`} else if (!AppendStringInterpolation(scope, literal, input, size, &i,`
			`&output, err))`
			`return false;`
			`} else {`
			`output.push_back(input[i]);`
			`}`
			`}`
			`return true;`
			`}`

			`size_t EditDistance(const base::StringPiece& s1,`
			`const base::StringPiece& s2,`
			`size_t max_edit_distance) {`
			`// The algorithm implemented below is the "classic"`
			`// dynamic-programming algorithm for computing the Levenshtein`
			`// distance, which is described here:`
			`//`
			`// http://en.wikipedia.org/wiki/Levenshtein_distance`
			`//`
			`// Although the algorithm is typically described using an m x n`
			`// array, only one row plus one element are used at a time, so this`
			`// implementation just keeps one vector for the row. To update one entry,`
			`// only the entries to the left, top, and top-left are needed. The left`
			`// entry is in row[x-1], the top entry is what's in row[x] from the last`
			`// iteration, and the top-left entry is stored in previous.`
			`size_t m = s1.size();`
			`size_t n = s2.size();`

			`std::vector<size_t> row(n + 1);`
			`for (size_t i = 1; i <= n; ++i)`
			`row[i] = i;`

			`for (size_t y = 1; y <= m; ++y) {`
			`row[0] = y;`
			`size_t best_this_row = row[0];`

			`size_t previous = y - 1;`
			`for (size_t x = 1; x <= n; ++x) {`
			`size_t old_row = row[x];`
			`row[x] = std::min(previous + (s1[y - 1] == s2[x - 1] ? 0u : 1u),`
			`std::min(row[x - 1], row[x]) + 1u);`
			`previous = old_row;`
			`best_this_row = std::min(best_this_row, row[x]);`
			`}`

			`if (max_edit_distance && best_this_row > max_edit_distance)`
			`return max_edit_distance + 1;`
			`}`

			`return row[n];`
			`}`

			`base::StringPiece SpellcheckString(`
			`const base::StringPiece& text,`
			`const std::vector<base::StringPiece>& words) {`
			`const size_t kMaxValidEditDistance = 3u;`

			`size_t min_distance = kMaxValidEditDistance + 1u;`
			`base::StringPiece result;`
			`for (base::StringPiece word : words) {`
			`size_t distance = EditDistance(word, text, kMaxValidEditDistance);`
			`if (distance < min_distance) {`
			`min_distance = distance;`
			`result = word;`
			`}`
			`}`
			`return result;`
			`}`