// Copyright (c) 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "tools/gn/tokenizer.h" #include "base/logging.h" #include "base/strings/string_util.h" #include "tools/gn/input_file.h" namespace { bool CouldBeTwoCharOperatorBegin(char c) { return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' || c == '+' || c == '|' || c == '&'; } bool CouldBeTwoCharOperatorEnd(char c) { return c == '=' || c == '|' || c == '&'; } bool CouldBeOneCharOperator(char c) { return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' || c == ':' || c == '|' || c == '&' || c == '-'; } bool CouldBeOperator(char c) { return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c); } bool IsScoperChar(char c) { return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}'; } Token::Type GetSpecificOperatorType(base::StringPiece value) { if (value == "=") return Token::EQUAL; if (value == "+") return Token::PLUS; if (value == "-") return Token::MINUS; if (value == "+=") return Token::PLUS_EQUALS; if (value == "-=") return Token::MINUS_EQUALS; if (value == "==") return Token::EQUAL_EQUAL; if (value == "!=") return Token::NOT_EQUAL; if (value == "<=") return Token::LESS_EQUAL; if (value == ">=") return Token::GREATER_EQUAL; if (value == "<") return Token::LESS_THAN; if (value == ">") return Token::GREATER_THAN; if (value == "&&") return Token::BOOLEAN_AND; if (value == "||") return Token::BOOLEAN_OR; if (value == "!") return Token::BANG; if (value == ".") return Token::DOT; return Token::INVALID; } } // namespace Tokenizer::Tokenizer(const InputFile* input_file, Err* err) : input_file_(input_file), input_(input_file->contents()), err_(err), cur_(0), line_number_(1), column_number_(1) { } Tokenizer::~Tokenizer() = default; // static std::vector Tokenizer::Tokenize(const InputFile* input_file, Err* err) { Tokenizer t(input_file, err); return t.Run(); } std::vector Tokenizer::Run() { DCHECK(tokens_.empty()); while (!done()) { AdvanceToNextToken(); if (done()) break; Location location = GetCurrentLocation(); Token::Type type = ClassifyCurrent(); if (type == Token::INVALID) { *err_ = GetErrorForInvalidToken(location); break; } size_t token_begin = cur_; AdvanceToEndOfToken(location, type); if (has_error()) break; size_t token_end = cur_; base::StringPiece token_value(&input_.data()[token_begin], token_end - token_begin); if (type == Token::UNCLASSIFIED_OPERATOR) { type = GetSpecificOperatorType(token_value); } else if (type == Token::IDENTIFIER) { if (token_value == "if") type = Token::IF; else if (token_value == "else") type = Token::ELSE; else if (token_value == "true") type = Token::TRUE_TOKEN; else if (token_value == "false") type = Token::FALSE_TOKEN; } else if (type == Token::UNCLASSIFIED_COMMENT) { if (AtStartOfLine(token_begin) && // If it's a standalone comment, but is a continuation of a comment on // a previous line, then instead make it a continued suffix comment. (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT || tokens_.back().location().line_number() + 1 != location.line_number() || tokens_.back().location().column_number() != location.column_number())) { type = Token::LINE_COMMENT; if (!at_end()) // Could be EOF. Advance(); // The current \n. // If this comment is separated from the next syntax element, then we // want to tag it as a block comment. This will become a standalone // statement at the parser level to keep this comment separate, rather // than attached to the subsequent statement. while (!at_end() && IsCurrentWhitespace()) { if (IsCurrentNewline()) { type = Token::BLOCK_COMMENT; break; } Advance(); } } else { type = Token::SUFFIX_COMMENT; } } tokens_.push_back(Token(location, type, token_value)); } if (err_->has_error()) tokens_.clear(); return tokens_; } // static size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) { DCHECK_GT(n, 0); if (n == 1) return 0; int cur_line = 1; size_t cur_byte = 0; while (cur_byte < buf.size()) { if (IsNewline(buf, cur_byte)) { cur_line++; if (cur_line == n) return cur_byte + 1; } cur_byte++; } return static_cast(-1); } // static bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) { DCHECK(offset < buffer.size()); // We may need more logic here to handle different line ending styles. return buffer[offset] == '\n'; } // static bool Tokenizer::IsIdentifierFirstChar(char c) { return base::IsAsciiAlpha(c) || c == '_'; } // static bool Tokenizer::IsIdentifierContinuingChar(char c) { // Also allow digits after the first char. return IsIdentifierFirstChar(c) || base::IsAsciiDigit(c); } void Tokenizer::AdvanceToNextToken() { while (!at_end() && IsCurrentWhitespace()) Advance(); } Token::Type Tokenizer::ClassifyCurrent() const { DCHECK(!at_end()); char next_char = cur_char(); if (base::IsAsciiDigit(next_char)) return Token::INTEGER; if (next_char == '"') return Token::STRING; // Note: '-' handled specially below. if (next_char != '-' && CouldBeOperator(next_char)) return Token::UNCLASSIFIED_OPERATOR; if (IsIdentifierFirstChar(next_char)) return Token::IDENTIFIER; if (next_char == '[') return Token::LEFT_BRACKET; if (next_char == ']') return Token::RIGHT_BRACKET; if (next_char == '(') return Token::LEFT_PAREN; if (next_char == ')') return Token::RIGHT_PAREN; if (next_char == '{') return Token::LEFT_BRACE; if (next_char == '}') return Token::RIGHT_BRACE; if (next_char == '.') return Token::DOT; if (next_char == ',') return Token::COMMA; if (next_char == '#') return Token::UNCLASSIFIED_COMMENT; // For the case of '-' differentiate between a negative number and anything // else. if (next_char == '-') { if (!CanIncrement()) return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of // file. char following_char = input_[cur_ + 1]; if (base::IsAsciiDigit(following_char)) return Token::INTEGER; return Token::UNCLASSIFIED_OPERATOR; } return Token::INVALID; } void Tokenizer::AdvanceToEndOfToken(const Location& location, Token::Type type) { switch (type) { case Token::INTEGER: do { Advance(); } while (!at_end() && base::IsAsciiDigit(cur_char())); if (!at_end()) { // Require the char after a number to be some kind of space, scope, // or operator. char c = cur_char(); if (!IsCurrentWhitespace() && !CouldBeOperator(c) && !IsScoperChar(c) && c != ',') { *err_ = Err(GetCurrentLocation(), "This is not a valid number.", "Learn to count."); // Highlight the number. err_->AppendRange(LocationRange(location, GetCurrentLocation())); } } break; case Token::STRING: { char initial = cur_char(); Advance(); // Advance past initial " for (;;) { if (at_end()) { *err_ = Err(LocationRange(location, GetCurrentLocation()), "Unterminated string literal.", "Don't leave me hanging like this!"); break; } if (IsCurrentStringTerminator(initial)) { Advance(); // Skip past last " break; } else if (IsCurrentNewline()) { *err_ = Err(LocationRange(location, GetCurrentLocation()), "Newline in string constant."); } Advance(); } break; } case Token::UNCLASSIFIED_OPERATOR: // Some operators are two characters, some are one. if (CouldBeTwoCharOperatorBegin(cur_char())) { if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1])) Advance(); } Advance(); break; case Token::IDENTIFIER: while (!at_end() && IsIdentifierContinuingChar(cur_char())) Advance(); break; case Token::LEFT_BRACKET: case Token::RIGHT_BRACKET: case Token::LEFT_BRACE: case Token::RIGHT_BRACE: case Token::LEFT_PAREN: case Token::RIGHT_PAREN: case Token::DOT: case Token::COMMA: Advance(); // All are one char. break; case Token::UNCLASSIFIED_COMMENT: // Eat to EOL. while (!at_end() && !IsCurrentNewline()) Advance(); break; case Token::INVALID: default: *err_ = Err(location, "Everything is all messed up", "Please insert system disk in drive A: and press any key."); NOTREACHED(); return; } } bool Tokenizer::AtStartOfLine(size_t location) const { while (location > 0) { --location; char c = input_[location]; if (c == '\n') return true; if (c != ' ') return false; } return true; } bool Tokenizer::IsCurrentWhitespace() const { DCHECK(!at_end()); char c = input_[cur_]; // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal. return c == 0x0A || c == 0x0D || c == 0x20; } bool Tokenizer::IsCurrentStringTerminator(char quote_char) const { DCHECK(!at_end()); if (cur_char() != quote_char) return false; // Check for escaping. \" is not a string terminator, but \\" is. Count // the number of preceding backslashes. int num_backslashes = 0; for (int i = static_cast(cur_) - 1; i >= 0 && input_[i] == '\\'; i--) num_backslashes++; // Even backslashes mean that they were escaping each other and don't count // as escaping this quote. return (num_backslashes % 2) == 0; } bool Tokenizer::IsCurrentNewline() const { return IsNewline(input_, cur_); } void Tokenizer::Advance() { DCHECK(cur_ < input_.size()); if (IsCurrentNewline()) { line_number_++; column_number_ = 1; } else { column_number_++; } cur_++; } Location Tokenizer::GetCurrentLocation() const { return Location( input_file_, line_number_, column_number_, static_cast(cur_)); } Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { std::string help; if (cur_char() == ';') { // Semicolon. help = "Semicolons are not needed, delete this one."; } else if (cur_char() == '\t') { // Tab. help = "You got a tab character in here. Tabs are evil. " "Convert to spaces."; } else if (cur_char() == '/' && cur_ + 1 < input_.size() && (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { // Different types of comments. help = "Comments should start with # instead"; } else if (cur_char() == '\'') { help = "Strings are delimited by \" characters, not apostrophes."; } else { help = "I have no idea what this is."; } return Err(location, "Invalid token.", help); }