// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef BASE_JSON_JSON_PARSER_H_ #define BASE_JSON_JSON_PARSER_H_ #include #include #include #include #include "base/base_export.h" #include "base/compiler_specific.h" #include "base/gtest_prod_util.h" #include "base/json/json_reader.h" #include "base/macros.h" #include "base/optional.h" #include "base/strings/string_piece.h" namespace base { class Value; namespace internal { class JSONParserTest; // The implementation behind the JSONReader interface. This class is not meant // to be used directly; it encapsulates logic that need not be exposed publicly. // // This parser guarantees O(n) time through the input string. It also optimizes // base::Value by using StringPiece where possible when returning Value // objects by using "hidden roots," discussed in the implementation. // // Iteration happens on the byte level, with the functions CanConsume and // NextChar. The conversion from byte to JSON token happens without advancing // the parser in GetNextToken/ParseToken, that is tokenization operates on // the current parser position without advancing. // // Built on top of these are a family of Consume functions that iterate // internally. Invariant: on entry of a Consume function, the parser is wound // to the first byte of a valid JSON token. On exit, it is on the last byte // of a token, such that the next iteration of the parser will be at the byte // immediately following the token, which would likely be the first byte of the // next token. class BASE_EXPORT JSONParser { public: explicit JSONParser(int options); ~JSONParser(); // Parses the input string according to the set options and returns the // result as a Value. // Wrap this in base::FooValue::From() to check the Value is of type Foo and // convert to a FooValue at the same time. std::unique_ptr Parse(StringPiece input); // Returns the error code. JSONReader::JsonParseError error_code() const; // Returns the human-friendly error message. std::string GetErrorMessage() const; // Returns the error line number if parse error happened. Otherwise always // returns 0. int error_line() const; // Returns the error column number if parse error happened. Otherwise always // returns 0. int error_column() const; private: enum Token { T_OBJECT_BEGIN, // { T_OBJECT_END, // } T_ARRAY_BEGIN, // [ T_ARRAY_END, // ] T_STRING, T_NUMBER, T_BOOL_TRUE, // true T_BOOL_FALSE, // false T_NULL, // null T_LIST_SEPARATOR, // , T_OBJECT_PAIR_SEPARATOR, // : T_END_OF_INPUT, T_INVALID_TOKEN, }; // A helper class used for parsing strings. One optimization performed is to // create base::Value with a StringPiece to avoid unnecessary std::string // copies. This is not possible if the input string needs to be decoded from // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped. // This class centralizes that logic. class StringBuilder { public: // Empty constructor. Used for creating a builder with which to assign to. StringBuilder(); // |pos| is the beginning of an input string, excluding the |"|. explicit StringBuilder(const char* pos); ~StringBuilder(); StringBuilder& operator=(StringBuilder&& other); // Either increases the |length_| of the string or copies the character if // the StringBuilder has been converted. |c| must be in the basic ASCII // plane; all other characters need to be in UTF-8 units, appended with // AppendString below. void Append(const char& c); // Appends a string to the std::string. Must be Convert()ed to use. void AppendString(const char* str, size_t len); // Converts the builder from its default StringPiece to a full std::string, // performing a copy. Once a builder is converted, it cannot be made a // StringPiece again. void Convert(); // Returns the builder as a StringPiece. StringPiece AsStringPiece(); // Returns the builder as a std::string. const std::string& AsString(); // Returns the builder as a string, invalidating all state. This allows // the internal string buffer representation to be destructively moved // in cases where the builder will not be needed any more. std::string DestructiveAsString(); private: // The beginning of the input string. const char* pos_; // Number of bytes in |pos_| that make up the string being built. size_t length_; // The copied string representation. Will be unset until Convert() is // called. base::Optional string_; }; // Quick check that the stream has capacity to consume |length| more bytes. bool CanConsume(int length); // The basic way to consume a single character in the stream. Consumes one // byte of the input stream and returns a pointer to the rest of it. const char* NextChar(); // Performs the equivalent of NextChar N times. void NextNChars(int n); // Skips over whitespace and comments to find the next token in the stream. // This does not advance the parser for non-whitespace or comment chars. Token GetNextToken(); // Consumes whitespace characters and comments until the next non-that is // encountered. void EatWhitespaceAndComments(); // Helper function that consumes a comment, assuming that the parser is // currently wound to a '/'. bool EatComment(); // Calls GetNextToken() and then ParseToken(). std::unique_ptr ParseNextToken(); // Takes a token that represents the start of a Value ("a structural token" // in RFC terms) and consumes it, returning the result as a Value. std::unique_ptr ParseToken(Token token); // Assuming that the parser is currently wound to '{', this parses a JSON // object into a DictionaryValue. std::unique_ptr ConsumeDictionary(); // Assuming that the parser is wound to '[', this parses a JSON list into a // std::unique_ptr. std::unique_ptr ConsumeList(); // Calls through ConsumeStringRaw and wraps it in a value. std::unique_ptr ConsumeString(); // Assuming that the parser is wound to a double quote, this parses a string, // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on // success and places result into |out|. Returns false on failure with // error information set. bool ConsumeStringRaw(StringBuilder* out); // Helper function for ConsumeStringRaw() that consumes the next four or 10 // bytes (parser is wound to the first character of a HEX sequence, with the // potential for consuming another \uXXXX for a surrogate). Returns true on // success and places the UTF8 code units in |dest_string|, and false on // failure. bool DecodeUTF16(std::string* dest_string); // Helper function for ConsumeStringRaw() that takes a single code point, // decodes it into UTF-8 units, and appends it to the given builder. The // point must be valid. void DecodeUTF8(const int32_t& point, StringBuilder* dest); // Assuming that the parser is wound to the start of a valid JSON number, // this parses and converts it to either an int or double value. std::unique_ptr ConsumeNumber(); // Helper that reads characters that are ints. Returns true if a number was // read and false on error. bool ReadInt(bool allow_leading_zeros); // Consumes the literal values of |true|, |false|, and |null|, assuming the // parser is wound to the first character of any of those. std::unique_ptr ConsumeLiteral(); // Compares two string buffers of a given length. static bool StringsAreEqual(const char* left, const char* right, size_t len); // Sets the error information to |code| at the current column, based on // |index_| and |index_last_line_|, with an optional positive/negative // adjustment by |column_adjust|. void ReportError(JSONReader::JsonParseError code, int column_adjust); // Given the line and column number of an error, formats one of the error // message contants from json_reader.h for human display. static std::string FormatErrorMessage(int line, int column, const std::string& description); // base::JSONParserOptions that control parsing. const int options_; // Pointer to the start of the input data. const char* start_pos_; // Pointer to the current position in the input data. Equivalent to // |start_pos_ + index_|. const char* pos_; // Pointer to the last character of the input data. const char* end_pos_; // The index in the input stream to which the parser is wound. int index_; // The number of times the parser has recursed (current stack depth). int stack_depth_; // The line number that the parser is at currently. int line_number_; // The last value of |index_| on the previous line. int index_last_line_; // Error information. JSONReader::JsonParseError error_code_; int error_line_; int error_column_; friend class JSONParserTest; FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar); FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary); FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList); FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString); FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals); FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers); FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages); FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidCharacters); FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidUTF16EscapeSequence); DISALLOW_COPY_AND_ASSIGN(JSONParser); }; // Used when decoding and an invalid utf-8 sequence is encountered. BASE_EXPORT extern const char kUnicodeReplacementString[]; } // namespace internal } // namespace base #endif // BASE_JSON_JSON_PARSER_H_