mirror of
https://github.com/klzgrad/naiveproxy.git
synced 2024-12-01 09:46:09 +03:00
346 lines
13 KiB
C++
346 lines
13 KiB
C++
|
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style license that can be
|
||
|
// found in the LICENSE file.
|
||
|
|
||
|
#include "tools/gn/string_utils.h"
|
||
|
|
||
|
#include <stddef.h>
|
||
|
#include <cctype>
|
||
|
|
||
|
#include "base/strings/string_number_conversions.h"
|
||
|
#include "tools/gn/err.h"
|
||
|
#include "tools/gn/input_file.h"
|
||
|
#include "tools/gn/parser.h"
|
||
|
#include "tools/gn/scope.h"
|
||
|
#include "tools/gn/token.h"
|
||
|
#include "tools/gn/tokenizer.h"
|
||
|
#include "tools/gn/value.h"
|
||
|
|
||
|
namespace {
|
||
|
|
||
|
// Constructs an Err indicating a range inside a string. We assume that the
|
||
|
// token has quotes around it that are not counted by the offset.
|
||
|
Err ErrInsideStringToken(const Token& token, size_t offset, size_t size,
|
||
|
const std::string& msg,
|
||
|
const std::string& help = std::string()) {
|
||
|
// The "+1" is skipping over the " at the beginning of the token.
|
||
|
int int_offset = static_cast<int>(offset);
|
||
|
Location begin_loc(token.location().file(),
|
||
|
token.location().line_number(),
|
||
|
token.location().column_number() + int_offset + 1,
|
||
|
token.location().byte() + int_offset + 1);
|
||
|
Location end_loc(
|
||
|
token.location().file(),
|
||
|
token.location().line_number(),
|
||
|
token.location().column_number() + int_offset + 1 +
|
||
|
static_cast<int>(size),
|
||
|
token.location().byte() + int_offset + 1 + static_cast<int>(size));
|
||
|
return Err(LocationRange(begin_loc, end_loc), msg, help);
|
||
|
}
|
||
|
|
||
|
// Notes about expression interpolation. This is based loosly on Dart but is
|
||
|
// slightly less flexible. In Dart, seeing the ${ in a string is something
|
||
|
// the toplevel parser knows about, and it will recurse into the block
|
||
|
// treating it as a first-class {...} block. So even things like this work:
|
||
|
// "hello ${"foo}"*2+"bar"}" => "hello foo}foo}bar"
|
||
|
// (you can see it did not get confused by the nested strings or the nested "}"
|
||
|
// inside the block).
|
||
|
//
|
||
|
// This is cool but complicates the parser for almost no benefit for this
|
||
|
// non-general-purpose programming language. The main reason expressions are
|
||
|
// supported here at all are to support "${scope.variable}" and "${list[0]}",
|
||
|
// neither of which have any of these edge-cases.
|
||
|
//
|
||
|
// In this simplified approach, we search for the terminating '}' and execute
|
||
|
// the result. This means we can't support any expressions with embedded '}'
|
||
|
// or '"'. To keep people from getting confusing about what's supported and
|
||
|
// what's not, only identifier and accessor expressions are allowed (neither
|
||
|
// of these run into any of these edge-cases).
|
||
|
bool AppendInterpolatedExpression(Scope* scope,
|
||
|
const Token& token,
|
||
|
const char* input,
|
||
|
size_t begin_offset,
|
||
|
size_t end_offset,
|
||
|
std::string* output,
|
||
|
Err* err) {
|
||
|
SourceFile empty_source_file; // Prevent most vexing parse.
|
||
|
InputFile input_file(empty_source_file);
|
||
|
input_file.SetContents(
|
||
|
std::string(&input[begin_offset], end_offset - begin_offset));
|
||
|
|
||
|
// Tokenize.
|
||
|
std::vector<Token> tokens = Tokenizer::Tokenize(&input_file, err);
|
||
|
if (err->has_error()) {
|
||
|
// The error will point into our temporary buffer, rewrite it to refer
|
||
|
// to the original token. This will make the location information less
|
||
|
// precise, but generally there won't be complicated things in string
|
||
|
// interpolations.
|
||
|
*err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
|
||
|
err->message(), err->help_text());
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// Parse.
|
||
|
std::unique_ptr<ParseNode> node = Parser::ParseExpression(tokens, err);
|
||
|
if (err->has_error()) {
|
||
|
// Rewrite error as above.
|
||
|
*err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
|
||
|
err->message(), err->help_text());
|
||
|
return false;
|
||
|
}
|
||
|
if (!(node->AsIdentifier() || node->AsAccessor())) {
|
||
|
*err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
|
||
|
"Invalid string interpolation.",
|
||
|
"The thing inside the ${} must be an identifier ${foo},\n"
|
||
|
"a scope access ${foo.bar}, or a list access ${foo[0]}.");
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// Evaluate.
|
||
|
Value result = node->Execute(scope, err);
|
||
|
if (err->has_error()) {
|
||
|
// Rewrite error as above.
|
||
|
*err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
|
||
|
err->message(), err->help_text());
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
output->append(result.ToString(false));
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
// Expands a plain identifier found inside a string. The identifier text is
// |input[begin_offset, end_offset)| and is looked up in |scope|. When a value
// is found its string form is appended to |*output| and true is returned.
// Otherwise |*err| is set to an error pointing at the identifier inside
// |token| and false is returned.
bool AppendInterpolatedIdentifier(Scope* scope,
                                  const Token& token,
                                  const char* input,
                                  size_t begin_offset,
                                  size_t end_offset,
                                  std::string* output,
                                  Err* err) {
  base::StringPiece identifier(&input[begin_offset],
                               end_offset - begin_offset);

  if (const Value* value = scope->GetValue(identifier, true)) {
    output->append(value->ToString(false));
    return true;
  }

  // The offset is derived by pointer arithmetic, so this assumes |input|
  // points inside the token's text. The -1 discounts the opening quote, which
  // ErrInsideStringToken adds back itself.
  *err = ErrInsideStringToken(
      token, identifier.data() - token.value().data() - 1, identifier.size(),
      "Undefined identifier in string expansion.",
      std::string("\"") + identifier + "\" is not currently in scope.");
  return false;
}
|
||
|
|
||
|
// Handles string interpolations: $identifier and ${expression}
|
||
|
//
|
||
|
// |*i| is the index into |input| after the $. This will be updated to point to
|
||
|
// the last character consumed on success. The token is the original string
|
||
|
// to blame on failure.
|
||
|
//
|
||
|
// On failure, returns false and sets the error. On success, appends the
|
||
|
// result of the interpolation to |*output|.
|
||
|
bool AppendStringInterpolation(Scope* scope,
|
||
|
const Token& token,
|
||
|
const char* input, size_t size,
|
||
|
size_t* i,
|
||
|
std::string* output,
|
||
|
Err* err) {
|
||
|
size_t dollars_index = *i - 1;
|
||
|
|
||
|
if (input[*i] == '{') {
|
||
|
// Bracketed expression.
|
||
|
(*i)++;
|
||
|
size_t begin_offset = *i;
|
||
|
|
||
|
// Find the closing } and check for non-identifier chars. Don't need to
|
||
|
// bother checking for the more-restricted first character of an identifier
|
||
|
// since the {} unambiguously denotes the range, and identifiers with
|
||
|
// invalid names just won't be found later.
|
||
|
bool has_non_ident_chars = false;
|
||
|
while (*i < size && input[*i] != '}') {
|
||
|
has_non_ident_chars |= Tokenizer::IsIdentifierContinuingChar(input[*i]);
|
||
|
(*i)++;
|
||
|
}
|
||
|
if (*i == size) {
|
||
|
*err = ErrInsideStringToken(token, dollars_index, *i - dollars_index,
|
||
|
"Unterminated ${...");
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// In the common case, the thing inside the {} will actually be a
|
||
|
// simple identifier. Avoid all the complicated parsing of accessors
|
||
|
// in this case.
|
||
|
if (!has_non_ident_chars) {
|
||
|
return AppendInterpolatedIdentifier(scope, token, input, begin_offset,
|
||
|
*i, output, err);
|
||
|
}
|
||
|
return AppendInterpolatedExpression(scope, token, input, begin_offset, *i,
|
||
|
output, err);
|
||
|
}
|
||
|
|
||
|
// Simple identifier.
|
||
|
// The first char of an identifier is more restricted.
|
||
|
if (!Tokenizer::IsIdentifierFirstChar(input[*i])) {
|
||
|
*err = ErrInsideStringToken(
|
||
|
token, dollars_index, *i - dollars_index + 1,
|
||
|
"$ not followed by an identifier char.",
|
||
|
"It you want a literal $ use \"\\$\".");
|
||
|
return false;
|
||
|
}
|
||
|
size_t begin_offset = *i;
|
||
|
(*i)++;
|
||
|
|
||
|
// Find the first non-identifier char following the string.
|
||
|
while (*i < size && Tokenizer::IsIdentifierContinuingChar(input[*i]))
|
||
|
(*i)++;
|
||
|
size_t end_offset = *i;
|
||
|
(*i)--; // Back up to mark the last character consumed.
|
||
|
return AppendInterpolatedIdentifier(scope, token, input, begin_offset,
|
||
|
end_offset, output, err);
|
||
|
}
|
||
|
|
||
|
// Handles a hex literal: $0xFF
|
||
|
//
|
||
|
// |*i| is the index into |input| after the $. This will be updated to point to
|
||
|
// the last character consumed on success. The token is the original string
|
||
|
// to blame on failure.
|
||
|
//
|
||
|
// On failure, returns false and sets the error. On success, appends the
|
||
|
// char with the given hex value to |*output|.
|
||
|
bool AppendHexByte(Scope* scope,
|
||
|
const Token& token,
|
||
|
const char* input, size_t size,
|
||
|
size_t* i,
|
||
|
std::string* output,
|
||
|
Err* err) {
|
||
|
size_t dollars_index = *i - 1;
|
||
|
// "$0" is already known to exist.
|
||
|
if (*i + 3 >= size || input[*i + 1] != 'x' || !std::isxdigit(input[*i + 2]) ||
|
||
|
!std::isxdigit(input[*i + 3])) {
|
||
|
*err = ErrInsideStringToken(
|
||
|
token, dollars_index, *i - dollars_index + 1,
|
||
|
"Invalid hex character. Hex values must look like 0xFF.");
|
||
|
return false;
|
||
|
}
|
||
|
int value = 0;
|
||
|
if (!base::HexStringToInt(base::StringPiece(&input[*i + 2], 2), &value)) {
|
||
|
*err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1,
|
||
|
"Could not convert hex value.");
|
||
|
return false;
|
||
|
}
|
||
|
*i += 3;
|
||
|
output->push_back(value);
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
} // namespace
|
||
|
|
||
|
// Expands the escapes and $-interpolations of the string token |literal|,
// appending the expanded text to the string already held by |*result|.
//
// A backslash followed by a backslash, '"' or '$' produces that second
// character; any other backslash is copied through literally. A '$' followed
// by '0' is handed to the hex-byte handler, and any other '$' to the
// identifier/expression interpolation handler.
//
// Returns true on success. On failure returns false with |*err| set.
bool ExpandStringLiteral(Scope* scope,
                         const Token& literal,
                         Value* result,
                         Err* err) {
  DCHECK(literal.type() == Token::STRING);
  DCHECK(literal.value().size() > 1);       // Should include quotes.
  DCHECK(result->type() == Value::STRING);  // Should be already set.

  // The token includes the surrounding quotes, so strip those off.
  const char* input = literal.value().data() + 1;
  const size_t size = literal.value().size() - 2;

  std::string& output = result->string_value();
  output.reserve(size);

  for (size_t i = 0; i < size; i++) {
    const char current = input[i];

    if (current == '\\') {
      const bool has_next = i + 1 < size;
      const bool is_escape =
          has_next &&
          (input[i + 1] == '\\' || input[i + 1] == '"' || input[i + 1] == '$');
      if (is_escape) {
        // Emit the escaped character and skip past it.
        output.push_back(input[i + 1]);
        i++;
      } else {
        // Everything else has no meaning: pass the literal backslash.
        output.push_back(current);
      }
      continue;
    }

    if (current != '$') {
      output.push_back(current);
      continue;
    }

    // Step over the '$'; the handlers expect |i| to index what follows it.
    i++;
    if (i == size) {
      *err = ErrInsideStringToken(literal, i - 1, 1, "$ at end of string.",
          "I was expecting an identifier, 0xFF, or {...} after the $.");
      return false;
    }

    const bool expanded =
        input[i] == '0'
            ? AppendHexByte(scope, literal, input, size, &i, &output, err)
            : AppendStringInterpolation(scope, literal, input, size, &i,
                                        &output, err);
    if (!expanded)
      return false;
  }
  return true;
}
|
||
|
|
||
|
// Computes the Levenshtein distance between |s1| and |s2|.
//
// When |max_edit_distance| is nonzero it acts as a cut-off: as soon as every
// entry of a completed row exceeds it, |max_edit_distance + 1| is returned
// without finishing the computation. A value of 0 disables the cut-off.
size_t EditDistance(const base::StringPiece& s1,
                    const base::StringPiece& s2,
                    size_t max_edit_distance) {
  // This is the classic dynamic-programming algorithm described at
  //
  //   http://en.wikipedia.org/wiki/Levenshtein_distance
  //
  // It is usually presented with an m x n table, but only one row plus one
  // extra element are live at any time, so a single vector is kept. Updating
  // a cell needs its left, upper and upper-left neighbours: the left one is
  // cells[col - 1], the upper one is whatever cells[col] still holds from the
  // previous pass, and the upper-left one is carried in |diagonal|.
  const size_t row_count = s1.size();
  const size_t col_count = s2.size();

  std::vector<size_t> cells(col_count + 1);
  for (size_t col = 1; col <= col_count; ++col)
    cells[col] = col;

  for (size_t row = 1; row <= row_count; ++row) {
    size_t diagonal = row - 1;  // Column 0 of the previous row.
    cells[0] = row;
    size_t row_minimum = cells[0];

    for (size_t col = 1; col <= col_count; ++col) {
      const size_t above = cells[col];
      const size_t left = cells[col - 1];

      const size_t substitute =
          diagonal + (s1[row - 1] == s2[col - 1] ? 0u : 1u);
      const size_t insert_or_delete = std::min(left, above) + 1u;

      cells[col] = std::min(substitute, insert_or_delete);
      diagonal = above;
      row_minimum = std::min(row_minimum, cells[col]);
    }

    if (max_edit_distance != 0 && row_minimum > max_edit_distance)
      return max_edit_distance + 1;
  }

  return cells[col_count];
}
|
||
|
|
||
|
// Returns the entry of |words| with the smallest edit distance to |text|,
// considering only entries whose distance is at most 3. Ties go to the entry
// that appears first. Returns an empty StringPiece when nothing is close
// enough.
base::StringPiece SpellcheckString(
    const base::StringPiece& text,
    const std::vector<base::StringPiece>& words) {
  const size_t kMaxValidEditDistance = 3u;

  base::StringPiece closest_word;
  // Starting one past the limit means only words within the limit can win.
  size_t closest_distance = kMaxValidEditDistance + 1u;

  for (base::StringPiece candidate : words) {
    const size_t candidate_distance =
        EditDistance(candidate, text, kMaxValidEditDistance);
    if (candidate_distance >= closest_distance)
      continue;
    closest_distance = candidate_distance;
    closest_word = candidate;
  }
  return closest_word;
}
|