Refactored Lexer algorithm for better performance. Now O(N) vs O(N^2) previously (#137)

2025-02-15 12:03:12 +03:00 · 2017-06-24 22:42:55 +07:00 · 2017-06-24 22:42:55 +07:00 · a0657b7847
commit a0657b7847
parent ffc4542cd0
2 changed files with 233 additions and 117 deletions
--- a/src/Language/Lexer.php
+++ b/src/Language/Lexer.php
@ -10,6 +10,8 @@ use GraphQL\Utils;
 * source lexes, the final Token emitted by the lexer will be of kind
 * EOF, after which the lexer will repeatedly return the same EOF token
 * whenever called.
 *
 * The algorithm is O(N) in both time and memory.
 */
 class Lexer
 {
@ -51,6 +53,26 @@ class Lexer
     */
    public $lineStart;
    /**
     * Current cursor position, counted in UTF-8 characters from the start of the source
     *
     * @var int
     */
    private $position;
    /**
     * Current cursor position, counted in bytes from the start of the source
     *
     * @var int
     */
    private $byteStreamPosition;
    /**
     * Lexer constructor.
     *
     * @param Source $source
     * @param array $options
     */
    public function __construct(Source $source, array $options = [])
    {
        $startOfFileToken = new Token(Token::SOF, 0, 0, 0, 0, null);
@ -61,6 +83,7 @@ class Lexer
        $this->token = $startOfFileToken;
        $this->line = 1;
        $this->lineStart = 0;
        $this->position = $this->byteStreamPosition = 0;
    }
    /**
@ -95,10 +118,11 @@ class Lexer
     */
    private function readToken(Token $prev)
    {
        $body = $this->source->body;
        $bodyLength = $this->source->length;
-        $position = $this->positionAfterWhitespace($prev->end);
+        $this->positionAfterWhitespace();
        $position = $this->position;
        $line = $this->line;
        $col = 1 + $position - $this->lineStart;
@ -106,7 +130,8 @@ class Lexer
            return new Token(Token::EOF, $bodyLength, $bodyLength, $line, $col, $prev);
        }
-        $code = Utils::charCodeAt($body, $position);
+        // Read next char and advance string cursor:
        list (, $code, $bytes) = $this->readChar(true);
        // SourceCharacter
        if ($code < 0x0020 && $code !== 0x0009 && $code !== 0x000A && $code !== 0x000D) {
@ -121,7 +146,8 @@ class Lexer
            case 33: // !
                return new Token(Token::BANG, $position, $position + 1, $line, $col, $prev);
            case 35: // #
-                return $this->readComment($position, $line, $col, $prev);
+                $this->moveStringCursor(-1, -1 * $bytes);
                return $this->readComment($line, $col, $prev);
            case 36: // $
                return new Token(Token::DOLLAR, $position, $position + 1, $line, $col, $prev);
            case 40: // (
@ -129,8 +155,10 @@ class Lexer
            case 41: // )
                return new Token(Token::PAREN_R, $position, $position + 1, $line, $col, $prev);
            case 46: // .
-                if (Utils::charCodeAt($body, $position+1) === 46 &&
+                list (, $charCode1) = $this->readChar(true);
-                    Utils::charCodeAt($body, $position+2) === 46) {
+                list (, $charCode2) = $this->readChar(true);
                if ($charCode1 === 46 && $charCode2 === 46) {
                    return new Token(Token::SPREAD, $position, $position + 3, $line, $col, $prev);
                }
                break;
@ -162,16 +190,19 @@ class Lexer
            case 105: case 106: case 107: case 108: case 109: case 110: case 111:
            case 112: case 113: case 114: case 115: case 116: case 117: case 118:
            case 119: case 120: case 121: case 122:
-                return $this->readName($position, $line, $col, $prev);
+                return $this->moveStringCursor(-1, -1 * $bytes)
                    ->readName($line, $col, $prev);
            // -
            case 45:
            // 0-9
            case 48: case 49: case 50: case 51: case 52:
            case 53: case 54: case 55: case 56: case 57:
-                return $this->readNumber($position, $code, $line, $col, $prev);
+                return $this->moveStringCursor(-1, -1 * $bytes)
                    ->readNumber($line, $col, $prev);
            // "
            case 34:
-                return $this->readString($position, $line, $col, $prev);
+                return $this->moveStringCursor(-1, -1 * $bytes)
                    ->readString($line, $col, $prev);
        }
        $errMessage = $code === 39
@ -190,38 +221,34 @@ class Lexer
     *
     * [_A-Za-z][_0-9A-Za-z]*
     *
     * @param int $position
     * @param int $line
     * @param int $col
     * @param Token $prev
     * @return Token
     */
-    private function readName($position, $line, $col, Token $prev)
+    private function readName($line, $col, Token $prev)
    {
-        $body = $this->source->body;
+        $value = '';
-        $bodyLength = $this->source->length;
+        $start = $this->position;
-        $end = $position + 1;
+        list ($char, $code) = $this->readChar();
-        while (
+        while ($code && (
            $end !== $bodyLength &&
            ($code = Utils::charCodeAt($body, $end)) &&
            (
            $code === 95 || // _
            $code >= 48 && $code <= 57 || // 0-9
            $code >= 65 && $code <= 90 || // A-Z
            $code >= 97 && $code <= 122 // a-z
-            )
+        )) {
-        ) {
+            $value .= $char;
-            ++$end;
+            list ($char, $code) = $this->moveStringCursor(1, 1)->readChar();
        }
        return new Token(
            Token::NAME,
-            $position,
+            $start,
-            $end,
+            $this->position,
            $line,
            $col,
            $prev,
-            mb_substr($body, $position, $end - $position, 'UTF-8')
+            $value
        );
    }
@ -232,126 +259,130 @@ class Lexer
     * Int:   -?(0|[1-9][0-9]*)
     * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
     *
     * @param int $start
     * @param string $firstCode
     * @param int $line
     * @param int $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxError
     */
-    private function readNumber($start, $firstCode, $line, $col, Token $prev)
+    private function readNumber($line, $col, Token $prev)
    {
-        $code = $firstCode;
+        $value = '';
-        $body = $this->source->body;
+        $start = $this->position;
-        $position = $start;
+        list ($char, $code) = $this->readChar();
        $isFloat = false;
        if ($code === 45) { // -
-            $code = Utils::charCodeAt($body, ++$position);
+            $value .= $char;
            list ($char, $code) = $this->moveStringCursor(1, 1)->readChar();
        }
        // Guard against leading zeros
        if ($code === 48) { // 0
-            $code = Utils::charCodeAt($body, ++$position);
+            $value .= $char;
            list ($char, $code) = $this->moveStringCursor(1, 1)->readChar();
            if ($code >= 48 && $code <= 57) {
-                throw new SyntaxError($this->source, $position, "Invalid number, unexpected digit after 0: " . Utils::printCharCode($code));
+                throw new SyntaxError($this->source, $this->position, "Invalid number, unexpected digit after 0: " . Utils::printCharCode($code));
            }
        } else {
-            $position = $this->readDigits($position, $code);
+            $value .= $this->readDigits();
-            $code = Utils::charCodeAt($body, $position);
+            list ($char, $code) = $this->readChar();
        }
        if ($code === 46) { // .
            $isFloat = true;
            $this->moveStringCursor(1, 1);
-            $code = Utils::charCodeAt($body, ++$position);
+            $value .= $char;
-            $position = $this->readDigits($position, $code);
+            $value .= $this->readDigits();
-            $code = Utils::charCodeAt($body, $position);
+            list ($char, $code) = $this->readChar();
        }
        if ($code === 69 || $code === 101) { // E e
            $isFloat = true;
-            $code = Utils::charCodeAt($body, ++$position);
+            $value .= $char;
            list ($char, $code) = $this->moveStringCursor(1, 1)->readChar();
            if ($code === 43 || $code === 45) { // + -
-                $code = Utils::charCodeAt($body, ++$position);
+                $value .= $char;
                $this->moveStringCursor(1, 1);
            }
-            $position = $this->readDigits($position, $code);
+            $value .= $this->readDigits();
        }
        return new Token(
            $isFloat ? Token::FLOAT : Token::INT,
            $start,
-            $position,
+            $this->position,
            $line,
            $col,
            $prev,
-            mb_substr($body, $start, $position - $start, 'UTF-8')
+            $value
        );
    }
    /**
-     * Returns the new position in the source after reading digits.
+     * Returns string with all digits + changes current string cursor position to point to the first char after digits
     */
-    private function readDigits($start, $firstCode)
+    private function readDigits()
    {
-        $body = $this->source->body;
+        list ($char, $code) = $this->readChar();
        $position = $start;
        $code = $firstCode;
        if ($code >= 48 && $code <= 57) { // 0 - 9
            $value = '';
            do {
-                $code = Utils::charCodeAt($body, ++$position);
+                $value .= $char;
                list ($char, $code) = $this->moveStringCursor(1, 1)->readChar();
            } while ($code >= 48 && $code <= 57); // 0 - 9
-            return $position;
+            return $value;
        }
-        if ($position > $this->source->length - 1) {
+        if ($this->position > $this->source->length - 1) {
            $code = null;
        }
        throw new SyntaxError(
            $this->source,
-            $position,
+            $this->position,
            'Invalid number, expected digit but got: ' . Utils::printCharCode($code)
        );
    }
    /**
     * @param int $start
     * @param int $line
     * @param int $col
     * @param Token $prev
     * @return Token
     * @throws SyntaxError
     */
-    private function readString($start, $line, $col, Token $prev)
+    private function readString($line, $col, Token $prev)
    {
-        $body = $this->source->body;
+        $start = $this->position;
        $bodyLength = $this->source->length;
-        $position = $start + 1;
+        // Skip leading quote and read first string char:
-        $chunkStart = $position;
+        list ($char, $code, $bytes) = $this->moveStringCursor(1, 1)->readChar();
-        $code = null;
+
        $chunk = '';
        $value = '';
        while (
-            $position < $bodyLength &&
+            $code &&
            ($code = Utils::charCodeAt($body, $position)) &&
            // not LineTerminator
-            $code !== 0x000A && $code !== 0x000D &&
+            $code !== 10 && $code !== 13 &&
            // not Quote (")
            $code !== 34
        ) {
-            $this->assertValidStringCharacterCode($code, $position);
+            $this->assertValidStringCharacterCode($code, $this->position);
            $this->moveStringCursor(1, $bytes);
            ++$position;
            if ($code === 92) { // \
-                $value .= mb_substr($body, $chunkStart, $position - 1 - $chunkStart, 'UTF-8');
+                $value .= $chunk;
-                $code = Utils::charCodeAt($body, $position);
+                list (, $code) = $this->readChar(true);
                switch ($code) {
                    case 34: $value .= '"'; break;
                    case 47: $value .= '/'; break;
@ -362,45 +393,51 @@ class Lexer
                    case 114: $value .= "\r"; break;
                    case 116: $value .= "\t"; break;
                    case 117:
-                        $hex = mb_substr($body, $position + 1, 4, 'UTF-8');
+                        $position = $this->position;
                        list ($hex) = $this->readChars(4, true);
                        if (!preg_match('/[0-9a-fA-F]{4}/', $hex)) {
                            throw new SyntaxError(
                                $this->source,
-                                $position,
+                                $position - 1,
                                'Invalid character escape sequence: \\u' . $hex
                            );
                        }
                        $code = hexdec($hex);
-                        $this->assertValidStringCharacterCode($code, $position - 1);
+                        $this->assertValidStringCharacterCode($code, $position - 2);
                        $value .= Utils::chr($code);
                        $position += 4;
                        break;
                    default:
                        throw new SyntaxError(
                            $this->source,
-                            $position,
+                            $this->position - 1,
                            'Invalid character escape sequence: \\' . Utils::chr($code)
                        );
                }
-                ++$position;
+                $chunk = '';
-                $chunkStart = $position;
+            } else {
                $chunk .= $char;
            }
            list ($char, $code, $bytes) = $this->readChar();
        }
        if ($code !== 34) {
            throw new SyntaxError(
                $this->source,
-                $position,
+                $this->position,
                'Unterminated string.'
            );
        }
-        $value .= mb_substr($body, $chunkStart, $position - $chunkStart, 'UTF-8');
+        $value .= $chunk;
        // Skip trailing quote:
        $this->moveStringCursor(1, 1);
        return new Token(
            Token::STRING,
            $start,
-            $position + 1,
+            $this->position,
            $line,
            $col,
            $prev,
@ -422,43 +459,33 @@ class Lexer
    /**
     * Reads from body starting at startPosition until it finds a non-whitespace
-     * or commented character, then returns the position of that character for
+     * or commented character, then places cursor to the position of that character.
     * lexing.
     *
     * @param $startPosition
     * @return int
     */
-    private function positionAfterWhitespace($startPosition)
+    private function positionAfterWhitespace()
    {
-        $body = $this->source->body;
+        while ($this->position < $this->source->length) {
-        $bodyLength = $this->source->length;
+            list(, $code, $bytes) = $this->readChar();
        $position = $startPosition;
        while ($position < $bodyLength) {
            $code = Utils::charCodeAt($body, $position);
            // Skip whitespace
            // tab | space | comma | BOM
            if ($code === 9 || $code === 32 || $code === 44 || $code === 0xFEFF) {
-                $position++;
+                $this->moveStringCursor(1, $bytes);
            } else if ($code === 10) { // new line
-                $position++;
+                $this->moveStringCursor(1, $bytes);
                $this->line++;
-                $this->lineStart = $position;
+                $this->lineStart = $this->position;
            } else if ($code === 13) { // carriage return
-                if (Utils::charCodeAt($body, $position + 1) === 10) {
+                list(, $nextCode, $nextBytes) = $this->moveStringCursor(1, $bytes)->readChar();
-                    $position += 2;
+
-                } else {
+                if ($nextCode === 10) { // lf after cr
-                    $position ++;
+                    $this->moveStringCursor(1, $nextBytes);
                }
                $this->line++;
-                $this->lineStart = $position;
+                $this->lineStart = $this->position;
            } else {
                break;
            }
        }
        return $position;
    }
    /**
@ -466,21 +493,22 @@ class Lexer
     *
     * #[\u0009\u0020-\uFFFF]*
     *
     * @param $start
     * @param $line
     * @param $col
     * @param Token $prev
     * @return Token
     */
-    private function readComment($start, $line, $col, Token $prev)
+    private function readComment($line, $col, Token $prev)
    {
-        $body = $this->source->body;
+        $start = $this->position;
-        $position = $start;
+        $value = '';
        $bytes = 1;
        do {
-            $code = Utils::charCodeAt($body, ++$position);
+            list ($char, $code, $bytes) = $this->moveStringCursor(1, $bytes)->readChar();
            $value .= $char;
        } while (
-            $code !== null &&
+            $code &&
            // SourceCharacter but not LineTerminator
            ($code > 0x001F || $code === 0x0009)
        );
@ -488,11 +516,97 @@ class Lexer
        return new Token(
            Token::COMMENT,
            $start,
-            $position,
+            $this->position,
            $line,
            $col,
            $prev,
-            mb_substr($body, $start + 1, $position - $start, 'UTF-8')
+            $value
        );
    }
    /**
     * Reads one UTF-8 character from the source body.
     *
     * The byte length of the character is derived from its leading byte only
     * (< 128: 1 byte, < 224: 2 bytes, < 240: 3 bytes, otherwise 4 bytes);
     * continuation bytes are not validated.
     *
     * @param bool $advance When true, the string cursor is moved past the character that was read
     * @param int|null $byteStreamPosition Byte offset to read from; defaults to the current byte cursor
     * @return array [string $utf8char, int $code, int $bytes]; ['', 0, 0] when nothing exists at the offset
     */
    private function readChar($advance = false, $byteStreamPosition = null)
    {
        if ($byteStreamPosition === null) {
            $byteStreamPosition = $this->byteStreamPosition;
        }

        $body = $this->source->body;
        $code = 0;
        $utf8char = '';
        $bytes = 0;
        $positionOffset = 0;

        if (isset($body[$byteStreamPosition])) {
            $ord = ord($body[$byteStreamPosition]);

            if ($ord < 128) {
                $bytes = 1;
            } elseif ($ord < 224) {
                $bytes = 2;
            } elseif ($ord < 240) {
                $bytes = 3;
            } else {
                $bytes = 4;
            }

            // substr() stops at the end of the body, so a multi-byte sequence that is
            // truncated by the end of input does not read uninitialized string offsets.
            $utf8char = substr($body, $byteStreamPosition, $bytes);
            $positionOffset = 1;
            // Single-byte characters are their own code point; the rest are decoded by Utils::ord()
            $code = $bytes === 1 ? $ord : Utils::ord($utf8char);
        }

        if ($advance) {
            // At end of input both offsets are 0, so the cursor stays where it is
            $this->moveStringCursor($positionOffset, $bytes);
        }

        return [$utf8char, $code, $bytes];
    }
    /**
     * Reads up to $numberOfChars consecutive UTF-8 characters from the source body.
     *
     * Fewer characters are returned when the end of the body is reached first.
     *
     * @param int $numberOfChars Maximum number of characters to read
     * @param bool $advance When true, the string cursor is moved past the characters that were read
     * @param int|null $byteStreamPosition Byte offset to start from; defaults to the current byte cursor
     * @return array [string $chars, int $totalBytes]
     */
    private function readChars($numberOfChars, $advance = false, $byteStreamPosition = null)
    {
        // Compare against null explicitly (as readChar() does): an explicit offset of 0
        // is a valid starting point and must not fall back to the current cursor.
        $byteOffset = $byteStreamPosition === null ? $this->byteStreamPosition : $byteStreamPosition;

        $result = '';
        $totalBytes = 0;
        $charsRead = 0;

        for ($i = 0; $i < $numberOfChars; $i++) {
            list ($char, , $bytes) = $this->readChar(false, $byteOffset);

            if ($bytes === 0) {
                // Nothing left to read at this offset; further reads would return the same
                break;
            }

            $result .= $char;
            $totalBytes += $bytes;
            $byteOffset += $bytes;
            $charsRead++;
        }

        if ($advance) {
            // Advance by the characters actually consumed, so the character cursor
            // does not run past the end of the source at end of input.
            $this->moveStringCursor($charsRead, $totalBytes);
        }

        return [$result, $totalBytes];
    }
    /**
     * Shifts both string cursors by the given amounts and returns the lexer for chaining.
     *
     * @param int $positionOffset Amount added to the character cursor ($position)
     * @param int $byteStreamOffset Amount added to the byte cursor ($byteStreamPosition)
     * @return $this
     */
    private function moveStringCursor($positionOffset, $byteStreamOffset)
    {
        $this->byteStreamPosition = $this->byteStreamPosition + $byteStreamOffset;
        $this->position = $this->position + $positionOffset;

        return $this;
    }
 }
--- a/src/Utils.php
+++ b/src/Utils.php
@ -290,15 +290,17 @@ class Utils
     */
    public static function ord($char, $encoding = 'UTF-8')
    {
        if (!$char && '0' !== $char) {
            return 0;
        }
        if (!isset($char[1])) {
            return ord($char);
        }
-        if ($encoding === 'UCS-4BE') {
+        if ($encoding !== 'UCS-4BE') {
-            list(, $ord) = (strlen($char) === 4) ? unpack('N', $char) : unpack('n', $char);
+            $char = mb_convert_encoding($char, 'UCS-4BE', $encoding);
            return $ord;
        } else {
            return self::ord(mb_convert_encoding($char, 'UCS-4BE', $encoding), 'UCS-4BE');
        }
        list(, $ord) = unpack('N', $char);
        return $ord;
    }
    /**