diff --git a/src/Language/Lexer.php b/src/Language/Lexer.php index 121016d..6dc975d 100644 --- a/src/Language/Lexer.php +++ b/src/Language/Lexer.php @@ -10,6 +10,8 @@ use GraphQL\Utils; * source lexes, the final Token emitted by the lexer will be of kind * EOF, after which the lexer will repeatedly return the same EOF token * whenever called. + * + * Algorithm is O(N) both on memory and time */ class Lexer { @@ -51,6 +53,26 @@ class Lexer */ public $lineStart; + /** + * Current cursor position for UTF8 encoding of the source + * + * @var int + */ + private $position; + + /** + * Current cursor position for ASCII representation of the source + * + * @var int + */ + private $byteStreamPosition; + + /** + * Lexer constructor. + * + * @param Source $source + * @param array $options + */ public function __construct(Source $source, array $options = []) { $startOfFileToken = new Token(Token::SOF, 0, 0, 0, 0, null); @@ -61,6 +83,7 @@ class Lexer $this->token = $startOfFileToken; $this->line = 1; $this->lineStart = 0; + $this->position = $this->byteStreamPosition = 0; } /** @@ -95,10 +118,11 @@ class Lexer */ private function readToken(Token $prev) { - $body = $this->source->body; $bodyLength = $this->source->length; - $position = $this->positionAfterWhitespace($prev->end); + $this->positionAfterWhitespace(); + $position = $this->position; + $line = $this->line; $col = 1 + $position - $this->lineStart; @@ -106,7 +130,8 @@ class Lexer return new Token(Token::EOF, $bodyLength, $bodyLength, $line, $col, $prev); } - $code = Utils::charCodeAt($body, $position); + // Read next char and advance string cursor: + list (, $code, $bytes) = $this->readChar(true); // SourceCharacter if ($code < 0x0020 && $code !== 0x0009 && $code !== 0x000A && $code !== 0x000D) { @@ -121,7 +146,8 @@ class Lexer case 33: // ! return new Token(Token::BANG, $position, $position + 1, $line, $col, $prev); case 35: // # - return $this->readComment($position, $line, $col, $prev); + $this->moveStringCursor(-1, -1 * $bytes); + return $this->readComment($line, $col, $prev); case 36: // $ return new Token(Token::DOLLAR, $position, $position + 1, $line, $col, $prev); case 40: // ( @@ -129,8 +155,10 @@ class Lexer case 41: // ) return new Token(Token::PAREN_R, $position, $position + 1, $line, $col, $prev); case 46: // . - if (Utils::charCodeAt($body, $position+1) === 46 && - Utils::charCodeAt($body, $position+2) === 46) { + list (, $charCode1) = $this->readChar(true); + list (, $charCode2) = $this->readChar(true); + + if ($charCode1 === 46 && $charCode2 === 46) { return new Token(Token::SPREAD, $position, $position + 3, $line, $col, $prev); } break; @@ -162,21 +190,24 @@ class Lexer case 105: case 106: case 107: case 108: case 109: case 110: case 111: case 112: case 113: case 114: case 115: case 116: case 117: case 118: case 119: case 120: case 121: case 122: - return $this->readName($position, $line, $col, $prev); + return $this->moveStringCursor(-1, -1 * $bytes) + ->readName($line, $col, $prev); // - case 45: // 0-9 case 48: case 49: case 50: case 51: case 52: case 53: case 54: case 55: case 56: case 57: - return $this->readNumber($position, $code, $line, $col, $prev); + return $this->moveStringCursor(-1, -1 * $bytes) + ->readNumber($line, $col, $prev); // " case 34: - return $this->readString($position, $line, $col, $prev); + return $this->moveStringCursor(-1, -1 * $bytes) + ->readString($line, $col, $prev); } $errMessage = $code === 39 - ? "Unexpected single quote character ('), did you mean to use ". 'a double quote (")?' - : 'Cannot parse the unexpected character ' . Utils::printCharCode($code) . '.'; + ? "Unexpected single quote character ('), did you mean to use ". 'a double quote (")?' + : 'Cannot parse the unexpected character ' . Utils::printCharCode($code) . '.'; throw new SyntaxError( $this->source, @@ -190,38 +221,34 @@ class Lexer * * [_A-Za-z][_0-9A-Za-z]* * - * @param int $position * @param int $line * @param int $col * @param Token $prev * @return Token */ - private function readName($position, $line, $col, Token $prev) + private function readName($line, $col, Token $prev) { - $body = $this->source->body; - $bodyLength = $this->source->length; - $end = $position + 1; + $value = ''; + $start = $this->position; + list ($char, $code) = $this->readChar(); - while ( - $end !== $bodyLength && - ($code = Utils::charCodeAt($body, $end)) && - ( - $code === 95 || // _ - $code >= 48 && $code <= 57 || // 0-9 - $code >= 65 && $code <= 90 || // A-Z - $code >= 97 && $code <= 122 // a-z - ) - ) { - ++$end; + while ($code && ( + $code === 95 || // _ + $code >= 48 && $code <= 57 || // 0-9 + $code >= 65 && $code <= 90 || // A-Z + $code >= 97 && $code <= 122 // a-z + )) { + $value .= $char; + list ($char, $code) = $this->moveStringCursor(1, 1)->readChar(); } return new Token( Token::NAME, - $position, - $end, + $start, + $this->position, $line, $col, $prev, - mb_substr($body, $position, $end - $position, 'UTF-8') + $value ); } @@ -232,126 +259,130 @@ class Lexer * Int: -?(0|[1-9][0-9]*) * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)? * - * @param int $start - * @param string $firstCode * @param int $line * @param int $col * @param Token $prev * @return Token * @throws SyntaxError */ - private function readNumber($start, $firstCode, $line, $col, Token $prev) + private function readNumber($line, $col, Token $prev) { - $code = $firstCode; - $body = $this->source->body; - $position = $start; + $value = ''; + $start = $this->position; + list ($char, $code) = $this->readChar(); + $isFloat = false; if ($code === 45) { // - - $code = Utils::charCodeAt($body, ++$position); + $value .= $char; + list ($char, $code) = $this->moveStringCursor(1, 1)->readChar(); } // guard against leading zero's if ($code === 48) { // 0 - $code = Utils::charCodeAt($body, ++$position); + $value .= $char; + list ($char, $code) = $this->moveStringCursor(1, 1)->readChar(); if ($code >= 48 && $code <= 57) { - throw new SyntaxError($this->source, $position, "Invalid number, unexpected digit after 0: " . Utils::printCharCode($code)); + throw new SyntaxError($this->source, $this->position, "Invalid number, unexpected digit after 0: " . Utils::printCharCode($code)); } } else { - $position = $this->readDigits($position, $code); - $code = Utils::charCodeAt($body, $position); + $value .= $this->readDigits(); + list ($char, $code) = $this->readChar(); } if ($code === 46) { // . $isFloat = true; + $this->moveStringCursor(1, 1); - $code = Utils::charCodeAt($body, ++$position); - $position = $this->readDigits($position, $code); - $code = Utils::charCodeAt($body, $position); + $value .= $char; + $value .= $this->readDigits(); + list ($char, $code) = $this->readChar(); } if ($code === 69 || $code === 101) { // E e $isFloat = true; - $code = Utils::charCodeAt($body, ++$position); + $value .= $char; + list ($char, $code) = $this->moveStringCursor(1, 1)->readChar(); if ($code === 43 || $code === 45) { // + - - $code = Utils::charCodeAt($body, ++$position); + $value .= $char; + $this->moveStringCursor(1, 1); } - $position = $this->readDigits($position, $code); + $value .= $this->readDigits(); } return new Token( $isFloat ? Token::FLOAT : Token::INT, $start, - $position, + $this->position, $line, $col, $prev, - mb_substr($body, $start, $position - $start, 'UTF-8') + $value ); } /** - * Returns the new position in the source after reading digits. + * Returns string with all digits + changes current string cursor position to point to the first char after digits */ - private function readDigits($start, $firstCode) + private function readDigits() { - $body = $this->source->body; - $position = $start; - $code = $firstCode; + list ($char, $code) = $this->readChar(); if ($code >= 48 && $code <= 57) { // 0 - 9 + $value = ''; + do { - $code = Utils::charCodeAt($body, ++$position); + $value .= $char; + list ($char, $code) = $this->moveStringCursor(1, 1)->readChar(); } while ($code >= 48 && $code <= 57); // 0 - 9 - return $position; + return $value; } - if ($position > $this->source->length - 1) { + if ($this->position > $this->source->length - 1) { $code = null; } throw new SyntaxError( $this->source, - $position, + $this->position, 'Invalid number, expected digit but got: ' . Utils::printCharCode($code) ); } /** - * @param int $start * @param int $line * @param int $col * @param Token $prev * @return Token * @throws SyntaxError */ - private function readString($start, $line, $col, Token $prev) + private function readString($line, $col, Token $prev) { - $body = $this->source->body; - $bodyLength = $this->source->length; + $start = $this->position; - $position = $start + 1; - $chunkStart = $position; - $code = null; + // Skip leading quote and read first string char: + list ($char, $code, $bytes) = $this->moveStringCursor(1, 1)->readChar(); + + $chunk = ''; $value = ''; while ( - $position < $bodyLength && - ($code = Utils::charCodeAt($body, $position)) && + $code && // not LineTerminator - $code !== 0x000A && $code !== 0x000D && + $code !== 10 && $code !== 13 && // not Quote (") $code !== 34 ) { - $this->assertValidStringCharacterCode($code, $position); + $this->assertValidStringCharacterCode($code, $this->position); + $this->moveStringCursor(1, $bytes); - ++$position; if ($code === 92) { // \ - $value .= mb_substr($body, $chunkStart, $position - 1 - $chunkStart, 'UTF-8'); - $code = Utils::charCodeAt($body, $position); + $value .= $chunk; + list (, $code) = $this->readChar(true); + switch ($code) { case 34: $value .= '"'; break; case 47: $value .= '/'; break; @@ -362,45 +393,51 @@ class Lexer case 114: $value .= "\r"; break; case 116: $value .= "\t"; break; case 117: - $hex = mb_substr($body, $position + 1, 4, 'UTF-8'); + $position = $this->position; + list ($hex) = $this->readChars(4, true); if (!preg_match('/[0-9a-fA-F]{4}/', $hex)) { throw new SyntaxError( $this->source, - $position, + $position - 1, 'Invalid character escape sequence: \\u' . $hex ); } $code = hexdec($hex); - $this->assertValidStringCharacterCode($code, $position - 1); + $this->assertValidStringCharacterCode($code, $position - 2); $value .= Utils::chr($code); - $position += 4; break; default: throw new SyntaxError( $this->source, - $position, + $this->position - 1, 'Invalid character escape sequence: \\' . Utils::chr($code) ); } - ++$position; - $chunkStart = $position; + $chunk = ''; + } else { + $chunk .= $char; } + + list ($char, $code, $bytes) = $this->readChar(); } if ($code !== 34) { throw new SyntaxError( $this->source, - $position, + $this->position, 'Unterminated string.' ); } - $value .= mb_substr($body, $chunkStart, $position - $chunkStart, 'UTF-8'); + $value .= $chunk; + + // Skip trailing quote: + $this->moveStringCursor(1, 1); return new Token( Token::STRING, $start, - $position + 1, + $this->position, $line, $col, $prev, @@ -422,43 +459,33 @@ class Lexer /** * Reads from body starting at startPosition until it finds a non-whitespace - * or commented character, then returns the position of that character for - * lexing. - * - * @param $startPosition - * @return int + * or commented character, then places cursor to the position of that character. */ - private function positionAfterWhitespace($startPosition) + private function positionAfterWhitespace() { - $body = $this->source->body; - $bodyLength = $this->source->length; - $position = $startPosition; - - while ($position < $bodyLength) { - $code = Utils::charCodeAt($body, $position); + while ($this->position < $this->source->length) { + list(, $code, $bytes) = $this->readChar(); // Skip whitespace // tab | space | comma | BOM if ($code === 9 || $code === 32 || $code === 44 || $code === 0xFEFF) { - $position++; + $this->moveStringCursor(1, $bytes); } else if ($code === 10) { // new line - $position++; + $this->moveStringCursor(1, $bytes); $this->line++; - $this->lineStart = $position; + $this->lineStart = $this->position; } else if ($code === 13) { // carriage return - if (Utils::charCodeAt($body, $position + 1) === 10) { - $position += 2; - } else { - $position ++; + list(, $nextCode, $nextBytes) = $this->moveStringCursor(1, $bytes)->readChar(); + + if ($nextCode === 10) { // lf after cr + $this->moveStringCursor(1, $nextBytes); } $this->line++; - $this->lineStart = $position; + $this->lineStart = $this->position; } else { break; } } - - return $position; } /** @@ -466,21 +493,22 @@ class Lexer * * #[\u0009\u0020-\uFFFF]* * - * @param $start * @param $line * @param $col * @param Token $prev * @return Token */ - private function readComment($start, $line, $col, Token $prev) + private function readComment($line, $col, Token $prev) { - $body = $this->source->body; - $position = $start; + $start = $this->position; + $value = ''; + $bytes = 1; do { - $code = Utils::charCodeAt($body, ++$position); + list ($char, $code, $bytes) = $this->moveStringCursor(1, $bytes)->readChar(); + $value .= $char; } while ( - $code !== null && + $code && // SourceCharacter but not LineTerminator ($code > 0x001F || $code === 0x0009) ); @@ -488,11 +516,97 @@ class Lexer return new Token( Token::COMMENT, $start, - $position, + $this->position, $line, $col, $prev, - mb_substr($body, $start + 1, $position - $start, 'UTF-8') + $value ); } + + /** + * Reads next UTF8Character from the byte stream, starting from $byteStreamPosition. + * + * @param bool $advance + * @param int $byteStreamPosition + * @return array + */ + private function readChar($advance = false, $byteStreamPosition = null) + { + if ($byteStreamPosition === null) { + $byteStreamPosition = $this->byteStreamPosition; + } + + $code = 0; + $utf8char = ''; + $bytes = 0; + $positionOffset = 0; + + if (isset($this->source->body[$byteStreamPosition])) { + $ord = ord($this->source->body[$byteStreamPosition]); + + if ($ord < 128) { + $bytes = 1; + } else if ($ord < 224) { + $bytes = 2; + } elseif ($ord < 240) { + $bytes = 3; + } else { + $bytes = 4; + } + + $utf8char = ''; + for ($pos = $byteStreamPosition; $pos < $byteStreamPosition + $bytes; $pos++) { + $utf8char .= $this->source->body[$pos]; + } + $positionOffset = 1; + $code = $bytes === 1 ? $ord : Utils::ord($utf8char); + } + + if ($advance) { + $this->moveStringCursor($positionOffset, $bytes); + } + + return [$utf8char, $code, $bytes]; + } + + /** + * Reads next $numberOfChars UTF8 characters from the byte stream, starting from $byteStreamPosition. + * + * @param $numberOfChars + * @param bool $advance + * @param null $byteStreamPosition + * @return array + */ + private function readChars($numberOfChars, $advance = false, $byteStreamPosition = null) + { + $result = ''; + $totalBytes = 0; + $byteOffset = $byteStreamPosition ?: $this->byteStreamPosition; + + for ($i = 0; $i < $numberOfChars; $i++) { + list ($char, $code, $bytes) = $this->readChar(false, $byteOffset); + $totalBytes += $bytes; + $byteOffset += $bytes; + $result .= $char; + } + if ($advance) { + $this->moveStringCursor($numberOfChars, $totalBytes); + } + return [$result, $totalBytes]; + } + + /** + * Moves internal string cursor position + * + * @param $positionOffset + * @param $byteStreamOffset + * @return $this + */ + private function moveStringCursor($positionOffset, $byteStreamOffset) + { + $this->position += $positionOffset; + $this->byteStreamPosition += $byteStreamOffset; + return $this; + } } diff --git a/src/Utils.php b/src/Utils.php index 1507182..6a41f57 100644 --- a/src/Utils.php +++ b/src/Utils.php @@ -290,15 +290,17 @@ class Utils */ public static function ord($char, $encoding = 'UTF-8') { + if (!$char && '0' !== $char) { + return 0; + } if (!isset($char[1])) { return ord($char); } - if ($encoding === 'UCS-4BE') { - list(, $ord) = (strlen($char) === 4) ? unpack('N', $char) : unpack('n', $char); - return $ord; - } else { - return self::ord(mb_convert_encoding($char, 'UCS-4BE', $encoding), 'UCS-4BE'); + if ($encoding !== 'UCS-4BE') { + $char = mb_convert_encoding($char, 'UCS-4BE', $encoding); } + list(, $ord) = unpack('N', $char); + return $ord; } /**