Refactored Lexer algorithm for better performance. Now O(N) vs O(N^2) previously (#137)

This commit is contained in:
Vladimir Razuvaev 2017-06-24 22:42:55 +07:00
parent ffc4542cd0
commit a0657b7847
2 changed files with 233 additions and 117 deletions

View File

@ -10,6 +10,8 @@ use GraphQL\Utils;
* source lexes, the final Token emitted by the lexer will be of kind * source lexes, the final Token emitted by the lexer will be of kind
* EOF, after which the lexer will repeatedly return the same EOF token * EOF, after which the lexer will repeatedly return the same EOF token
* whenever called. * whenever called.
*
* The algorithm is O(N) in both memory and time
*/ */
class Lexer class Lexer
{ {
@ -51,6 +53,26 @@ class Lexer
*/ */
public $lineStart; public $lineStart;
/**
* Current cursor position, counted in UTF-8 characters of the source
*
* @var int
*/
private $position;
/**
* Current cursor position, counted in bytes of the source body
*
* @var int
*/
private $byteStreamPosition;
/**
* Lexer constructor.
*
* @param Source $source
* @param array $options
*/
public function __construct(Source $source, array $options = []) public function __construct(Source $source, array $options = [])
{ {
$startOfFileToken = new Token(Token::SOF, 0, 0, 0, 0, null); $startOfFileToken = new Token(Token::SOF, 0, 0, 0, 0, null);
@ -61,6 +83,7 @@ class Lexer
$this->token = $startOfFileToken; $this->token = $startOfFileToken;
$this->line = 1; $this->line = 1;
$this->lineStart = 0; $this->lineStart = 0;
$this->position = $this->byteStreamPosition = 0;
} }
/** /**
@ -95,10 +118,11 @@ class Lexer
*/ */
private function readToken(Token $prev) private function readToken(Token $prev)
{ {
$body = $this->source->body;
$bodyLength = $this->source->length; $bodyLength = $this->source->length;
$position = $this->positionAfterWhitespace($prev->end); $this->positionAfterWhitespace();
$position = $this->position;
$line = $this->line; $line = $this->line;
$col = 1 + $position - $this->lineStart; $col = 1 + $position - $this->lineStart;
@ -106,7 +130,8 @@ class Lexer
return new Token(Token::EOF, $bodyLength, $bodyLength, $line, $col, $prev); return new Token(Token::EOF, $bodyLength, $bodyLength, $line, $col, $prev);
} }
$code = Utils::charCodeAt($body, $position); // Read next char and advance string cursor:
list (, $code, $bytes) = $this->readChar(true);
// SourceCharacter // SourceCharacter
if ($code < 0x0020 && $code !== 0x0009 && $code !== 0x000A && $code !== 0x000D) { if ($code < 0x0020 && $code !== 0x0009 && $code !== 0x000A && $code !== 0x000D) {
@ -121,7 +146,8 @@ class Lexer
case 33: // ! case 33: // !
return new Token(Token::BANG, $position, $position + 1, $line, $col, $prev); return new Token(Token::BANG, $position, $position + 1, $line, $col, $prev);
case 35: // # case 35: // #
return $this->readComment($position, $line, $col, $prev); $this->moveStringCursor(-1, -1 * $bytes);
return $this->readComment($line, $col, $prev);
case 36: // $ case 36: // $
return new Token(Token::DOLLAR, $position, $position + 1, $line, $col, $prev); return new Token(Token::DOLLAR, $position, $position + 1, $line, $col, $prev);
case 40: // ( case 40: // (
@ -129,8 +155,10 @@ class Lexer
case 41: // ) case 41: // )
return new Token(Token::PAREN_R, $position, $position + 1, $line, $col, $prev); return new Token(Token::PAREN_R, $position, $position + 1, $line, $col, $prev);
case 46: // . case 46: // .
if (Utils::charCodeAt($body, $position+1) === 46 && list (, $charCode1) = $this->readChar(true);
Utils::charCodeAt($body, $position+2) === 46) { list (, $charCode2) = $this->readChar(true);
if ($charCode1 === 46 && $charCode2 === 46) {
return new Token(Token::SPREAD, $position, $position + 3, $line, $col, $prev); return new Token(Token::SPREAD, $position, $position + 3, $line, $col, $prev);
} }
break; break;
@ -162,16 +190,19 @@ class Lexer
case 105: case 106: case 107: case 108: case 109: case 110: case 111: case 105: case 106: case 107: case 108: case 109: case 110: case 111:
case 112: case 113: case 114: case 115: case 116: case 117: case 118: case 112: case 113: case 114: case 115: case 116: case 117: case 118:
case 119: case 120: case 121: case 122: case 119: case 120: case 121: case 122:
return $this->readName($position, $line, $col, $prev); return $this->moveStringCursor(-1, -1 * $bytes)
->readName($line, $col, $prev);
// - // -
case 45: case 45:
// 0-9 // 0-9
case 48: case 49: case 50: case 51: case 52: case 48: case 49: case 50: case 51: case 52:
case 53: case 54: case 55: case 56: case 57: case 53: case 54: case 55: case 56: case 57:
return $this->readNumber($position, $code, $line, $col, $prev); return $this->moveStringCursor(-1, -1 * $bytes)
->readNumber($line, $col, $prev);
// " // "
case 34: case 34:
return $this->readString($position, $line, $col, $prev); return $this->moveStringCursor(-1, -1 * $bytes)
->readString($line, $col, $prev);
} }
$errMessage = $code === 39 $errMessage = $code === 39
@ -190,38 +221,34 @@ class Lexer
* *
* [_A-Za-z][_0-9A-Za-z]* * [_A-Za-z][_0-9A-Za-z]*
* *
* @param int $position
* @param int $line * @param int $line
* @param int $col * @param int $col
* @param Token $prev * @param Token $prev
* @return Token * @return Token
*/ */
private function readName($position, $line, $col, Token $prev) private function readName($line, $col, Token $prev)
{ {
$body = $this->source->body; $value = '';
$bodyLength = $this->source->length; $start = $this->position;
$end = $position + 1; list ($char, $code) = $this->readChar();
while ( while ($code && (
$end !== $bodyLength &&
($code = Utils::charCodeAt($body, $end)) &&
(
$code === 95 || // _ $code === 95 || // _
$code >= 48 && $code <= 57 || // 0-9 $code >= 48 && $code <= 57 || // 0-9
$code >= 65 && $code <= 90 || // A-Z $code >= 65 && $code <= 90 || // A-Z
$code >= 97 && $code <= 122 // a-z $code >= 97 && $code <= 122 // a-z
) )) {
) { $value .= $char;
++$end; list ($char, $code) = $this->moveStringCursor(1, 1)->readChar();
} }
return new Token( return new Token(
Token::NAME, Token::NAME,
$position, $start,
$end, $this->position,
$line, $line,
$col, $col,
$prev, $prev,
mb_substr($body, $position, $end - $position, 'UTF-8') $value
); );
} }
@ -232,126 +259,130 @@ class Lexer
* Int: -?(0|[1-9][0-9]*) * Int: -?(0|[1-9][0-9]*)
* Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)? * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
* *
* @param int $start
* @param string $firstCode
* @param int $line * @param int $line
* @param int $col * @param int $col
* @param Token $prev * @param Token $prev
* @return Token * @return Token
* @throws SyntaxError * @throws SyntaxError
*/ */
private function readNumber($start, $firstCode, $line, $col, Token $prev) private function readNumber($line, $col, Token $prev)
{ {
$code = $firstCode; $value = '';
$body = $this->source->body; $start = $this->position;
$position = $start; list ($char, $code) = $this->readChar();
$isFloat = false; $isFloat = false;
if ($code === 45) { // - if ($code === 45) { // -
$code = Utils::charCodeAt($body, ++$position); $value .= $char;
list ($char, $code) = $this->moveStringCursor(1, 1)->readChar();
} }
// guard against leading zero's // guard against leading zero's
if ($code === 48) { // 0 if ($code === 48) { // 0
$code = Utils::charCodeAt($body, ++$position); $value .= $char;
list ($char, $code) = $this->moveStringCursor(1, 1)->readChar();
if ($code >= 48 && $code <= 57) { if ($code >= 48 && $code <= 57) {
throw new SyntaxError($this->source, $position, "Invalid number, unexpected digit after 0: " . Utils::printCharCode($code)); throw new SyntaxError($this->source, $this->position, "Invalid number, unexpected digit after 0: " . Utils::printCharCode($code));
} }
} else { } else {
$position = $this->readDigits($position, $code); $value .= $this->readDigits();
$code = Utils::charCodeAt($body, $position); list ($char, $code) = $this->readChar();
} }
if ($code === 46) { // . if ($code === 46) { // .
$isFloat = true; $isFloat = true;
$this->moveStringCursor(1, 1);
$code = Utils::charCodeAt($body, ++$position); $value .= $char;
$position = $this->readDigits($position, $code); $value .= $this->readDigits();
$code = Utils::charCodeAt($body, $position); list ($char, $code) = $this->readChar();
} }
if ($code === 69 || $code === 101) { // E e if ($code === 69 || $code === 101) { // E e
$isFloat = true; $isFloat = true;
$code = Utils::charCodeAt($body, ++$position); $value .= $char;
list ($char, $code) = $this->moveStringCursor(1, 1)->readChar();
if ($code === 43 || $code === 45) { // + - if ($code === 43 || $code === 45) { // + -
$code = Utils::charCodeAt($body, ++$position); $value .= $char;
$this->moveStringCursor(1, 1);
} }
$position = $this->readDigits($position, $code); $value .= $this->readDigits();
} }
return new Token( return new Token(
$isFloat ? Token::FLOAT : Token::INT, $isFloat ? Token::FLOAT : Token::INT,
$start, $start,
$position, $this->position,
$line, $line,
$col, $col,
$prev, $prev,
mb_substr($body, $start, $position - $start, 'UTF-8') $value
); );
} }
/** /**
* Returns the new position in the source after reading digits. * Returns string with all digits + changes current string cursor position to point to the first char after digits
*/ */
private function readDigits($start, $firstCode) private function readDigits()
{ {
$body = $this->source->body; list ($char, $code) = $this->readChar();
$position = $start;
$code = $firstCode;
if ($code >= 48 && $code <= 57) { // 0 - 9 if ($code >= 48 && $code <= 57) { // 0 - 9
$value = '';
do { do {
$code = Utils::charCodeAt($body, ++$position); $value .= $char;
list ($char, $code) = $this->moveStringCursor(1, 1)->readChar();
} while ($code >= 48 && $code <= 57); // 0 - 9 } while ($code >= 48 && $code <= 57); // 0 - 9
return $position; return $value;
} }
if ($position > $this->source->length - 1) { if ($this->position > $this->source->length - 1) {
$code = null; $code = null;
} }
throw new SyntaxError( throw new SyntaxError(
$this->source, $this->source,
$position, $this->position,
'Invalid number, expected digit but got: ' . Utils::printCharCode($code) 'Invalid number, expected digit but got: ' . Utils::printCharCode($code)
); );
} }
/** /**
* @param int $start
* @param int $line * @param int $line
* @param int $col * @param int $col
* @param Token $prev * @param Token $prev
* @return Token * @return Token
* @throws SyntaxError * @throws SyntaxError
*/ */
private function readString($start, $line, $col, Token $prev) private function readString($line, $col, Token $prev)
{ {
$body = $this->source->body; $start = $this->position;
$bodyLength = $this->source->length;
$position = $start + 1; // Skip leading quote and read first string char:
$chunkStart = $position; list ($char, $code, $bytes) = $this->moveStringCursor(1, 1)->readChar();
$code = null;
$chunk = '';
$value = ''; $value = '';
while ( while (
$position < $bodyLength && $code &&
($code = Utils::charCodeAt($body, $position)) &&
// not LineTerminator // not LineTerminator
$code !== 0x000A && $code !== 0x000D && $code !== 10 && $code !== 13 &&
// not Quote (") // not Quote (")
$code !== 34 $code !== 34
) { ) {
$this->assertValidStringCharacterCode($code, $position); $this->assertValidStringCharacterCode($code, $this->position);
$this->moveStringCursor(1, $bytes);
++$position;
if ($code === 92) { // \ if ($code === 92) { // \
$value .= mb_substr($body, $chunkStart, $position - 1 - $chunkStart, 'UTF-8'); $value .= $chunk;
$code = Utils::charCodeAt($body, $position); list (, $code) = $this->readChar(true);
switch ($code) { switch ($code) {
case 34: $value .= '"'; break; case 34: $value .= '"'; break;
case 47: $value .= '/'; break; case 47: $value .= '/'; break;
@ -362,45 +393,51 @@ class Lexer
case 114: $value .= "\r"; break; case 114: $value .= "\r"; break;
case 116: $value .= "\t"; break; case 116: $value .= "\t"; break;
case 117: case 117:
$hex = mb_substr($body, $position + 1, 4, 'UTF-8'); $position = $this->position;
list ($hex) = $this->readChars(4, true);
if (!preg_match('/[0-9a-fA-F]{4}/', $hex)) { if (!preg_match('/[0-9a-fA-F]{4}/', $hex)) {
throw new SyntaxError( throw new SyntaxError(
$this->source, $this->source,
$position, $position - 1,
'Invalid character escape sequence: \\u' . $hex 'Invalid character escape sequence: \\u' . $hex
); );
} }
$code = hexdec($hex); $code = hexdec($hex);
$this->assertValidStringCharacterCode($code, $position - 1); $this->assertValidStringCharacterCode($code, $position - 2);
$value .= Utils::chr($code); $value .= Utils::chr($code);
$position += 4;
break; break;
default: default:
throw new SyntaxError( throw new SyntaxError(
$this->source, $this->source,
$position, $this->position - 1,
'Invalid character escape sequence: \\' . Utils::chr($code) 'Invalid character escape sequence: \\' . Utils::chr($code)
); );
} }
++$position; $chunk = '';
$chunkStart = $position; } else {
$chunk .= $char;
} }
list ($char, $code, $bytes) = $this->readChar();
} }
if ($code !== 34) { if ($code !== 34) {
throw new SyntaxError( throw new SyntaxError(
$this->source, $this->source,
$position, $this->position,
'Unterminated string.' 'Unterminated string.'
); );
} }
$value .= mb_substr($body, $chunkStart, $position - $chunkStart, 'UTF-8'); $value .= $chunk;
// Skip trailing quote:
$this->moveStringCursor(1, 1);
return new Token( return new Token(
Token::STRING, Token::STRING,
$start, $start,
$position + 1, $this->position,
$line, $line,
$col, $col,
$prev, $prev,
@ -422,43 +459,33 @@ class Lexer
/** /**
* Reads from body starting at startPosition until it finds a non-whitespace * Reads from body starting at startPosition until it finds a non-whitespace
* or commented character, then returns the position of that character for * or commented character, then places cursor to the position of that character.
* lexing.
*
* @param $startPosition
* @return int
*/ */
private function positionAfterWhitespace($startPosition) private function positionAfterWhitespace()
{ {
$body = $this->source->body; while ($this->position < $this->source->length) {
$bodyLength = $this->source->length; list(, $code, $bytes) = $this->readChar();
$position = $startPosition;
while ($position < $bodyLength) {
$code = Utils::charCodeAt($body, $position);
// Skip whitespace // Skip whitespace
// tab | space | comma | BOM // tab | space | comma | BOM
if ($code === 9 || $code === 32 || $code === 44 || $code === 0xFEFF) { if ($code === 9 || $code === 32 || $code === 44 || $code === 0xFEFF) {
$position++; $this->moveStringCursor(1, $bytes);
} else if ($code === 10) { // new line } else if ($code === 10) { // new line
$position++; $this->moveStringCursor(1, $bytes);
$this->line++; $this->line++;
$this->lineStart = $position; $this->lineStart = $this->position;
} else if ($code === 13) { // carriage return } else if ($code === 13) { // carriage return
if (Utils::charCodeAt($body, $position + 1) === 10) { list(, $nextCode, $nextBytes) = $this->moveStringCursor(1, $bytes)->readChar();
$position += 2;
} else { if ($nextCode === 10) { // lf after cr
$position ++; $this->moveStringCursor(1, $nextBytes);
} }
$this->line++; $this->line++;
$this->lineStart = $position; $this->lineStart = $this->position;
} else { } else {
break; break;
} }
} }
return $position;
} }
/** /**
@ -466,21 +493,22 @@ class Lexer
* *
* #[\u0009\u0020-\uFFFF]* * #[\u0009\u0020-\uFFFF]*
* *
* @param $start
* @param $line * @param $line
* @param $col * @param $col
* @param Token $prev * @param Token $prev
* @return Token * @return Token
*/ */
private function readComment($start, $line, $col, Token $prev) private function readComment($line, $col, Token $prev)
{ {
$body = $this->source->body; $start = $this->position;
$position = $start; $value = '';
$bytes = 1;
do { do {
$code = Utils::charCodeAt($body, ++$position); list ($char, $code, $bytes) = $this->moveStringCursor(1, $bytes)->readChar();
$value .= $char;
} while ( } while (
$code !== null && $code &&
// SourceCharacter but not LineTerminator // SourceCharacter but not LineTerminator
($code > 0x001F || $code === 0x0009) ($code > 0x001F || $code === 0x0009)
); );
@ -488,11 +516,97 @@ class Lexer
return new Token( return new Token(
Token::COMMENT, Token::COMMENT,
$start, $start,
$position, $this->position,
$line, $line,
$col, $col,
$prev, $prev,
mb_substr($body, $start + 1, $position - $start, 'UTF-8') $value
); );
} }
/**
 * Reads the next UTF-8 character from the byte stream, starting at $byteStreamPosition.
 *
 * The expected length of the character (1 to 4 bytes) is derived from its lead byte.
 * If the source body ends in the middle of a multi-byte sequence, only the bytes that
 * actually exist are read, so the returned byte count never reaches past the end of
 * the body.
 *
 * @param bool     $advance            when true, the string cursor is moved past the character that was read
 * @param int|null $byteStreamPosition byte offset to read from; null means the current byte cursor
 * @return array [string $utf8char, int $code, int $bytes]; ['', 0, 0] when there is no byte at the offset
 */
private function readChar($advance = false, $byteStreamPosition = null)
{
    if ($byteStreamPosition === null) {
        $byteStreamPosition = $this->byteStreamPosition;
    }

    $body = $this->source->body;

    $code = 0;
    $utf8char = '';
    $bytes = 0;
    $positionOffset = 0;

    if (isset($body[$byteStreamPosition])) {
        $ord = ord($body[$byteStreamPosition]);

        // Sequence length as announced by the lead byte.
        if ($ord < 128) {
            $expectedBytes = 1;
        } elseif ($ord < 224) {
            $expectedBytes = 2;
        } elseif ($ord < 240) {
            $expectedBytes = 3;
        } else {
            $expectedBytes = 4;
        }

        // Copy the sequence byte by byte, stopping early if the body is truncated
        // so that no offset beyond the end of the string is ever dereferenced.
        $end = $byteStreamPosition + $expectedBytes;
        for ($pos = $byteStreamPosition; $pos < $end && isset($body[$pos]); $pos++) {
            $utf8char .= $body[$pos];
        }

        // Number of bytes really consumed (equals $expectedBytes for well-formed input).
        $bytes = $pos - $byteStreamPosition;
        $positionOffset = 1;
        $code = $bytes === 1 ? $ord : Utils::ord($utf8char);
    }

    if ($advance) {
        // One character forward, $bytes bytes forward (both zero at end of input).
        $this->moveStringCursor($positionOffset, $bytes);
    }

    return [$utf8char, $code, $bytes];
}
/**
 * Reads up to $numberOfChars UTF-8 characters from the byte stream, starting at $byteStreamPosition.
 *
 * Reading stops early when the end of the source body is reached, so the returned
 * string may be shorter than requested.
 *
 * @param int      $numberOfChars      maximum number of characters to read
 * @param bool     $advance            when true, the string cursor is moved past the characters that were read
 * @param int|null $byteStreamPosition byte offset to start from; null means the current byte cursor
 * @return array [string $chars, int $totalBytes]
 */
private function readChars($numberOfChars, $advance = false, $byteStreamPosition = null)
{
    $result = '';
    $totalBytes = 0;
    $charsRead = 0;

    // Strict null check: an explicit offset of 0 is a valid starting point and must
    // not fall back to the current cursor (same convention as readChar()).
    $byteOffset = $byteStreamPosition === null ? $this->byteStreamPosition : $byteStreamPosition;

    for ($i = 0; $i < $numberOfChars; $i++) {
        list ($char, , $bytes) = $this->readChar(false, $byteOffset);

        // readChar() reports zero bytes when there is nothing left to read.
        if ($bytes === 0) {
            break;
        }

        $totalBytes += $bytes;
        $byteOffset += $bytes;
        $result .= $char;
        $charsRead++;
    }

    if ($advance) {
        // Advance by what was actually consumed so that the character cursor and
        // the byte cursor stay in step even when the input ends early.
        $this->moveStringCursor($charsRead, $totalBytes);
    }

    return [$result, $totalBytes];
}
/**
 * Shifts the internal string cursor.
 *
 * Both offsets are relative and may be negative (to step back over a character).
 *
 * @param int $positionOffset   delta applied to the character position
 * @param int $byteStreamOffset delta applied to the byte-stream position
 * @return $this for call chaining
 */
private function moveStringCursor($positionOffset, $byteStreamOffset)
{
    $this->byteStreamPosition = $this->byteStreamPosition + $byteStreamOffset;
    $this->position = $this->position + $positionOffset;

    return $this;
}
} }

View File

@ -290,15 +290,17 @@ class Utils
*/ */
public static function ord($char, $encoding = 'UTF-8') public static function ord($char, $encoding = 'UTF-8')
{ {
if (!$char && '0' !== $char) {
return 0;
}
if (!isset($char[1])) { if (!isset($char[1])) {
return ord($char); return ord($char);
} }
if ($encoding === 'UCS-4BE') { if ($encoding !== 'UCS-4BE') {
list(, $ord) = (strlen($char) === 4) ? unpack('N', $char) : unpack('n', $char); $char = mb_convert_encoding($char, 'UCS-4BE', $encoding);
return $ord;
} else {
return self::ord(mb_convert_encoding($char, 'UCS-4BE', $encoding), 'UCS-4BE');
} }
list(, $ord) = unpack('N', $char);
return $ord;
} }
/** /**