SqlLexer.php

PHP

Path: src/Parsing/SQL/SqlLexer.php

<?php

namespace mini\Parsing\SQL;

/**
 * SQL Lexer - Tokenizes SQL strings
 *
 * Uses a single combined regex with named groups for fast tokenization.
 * Patterns are ordered long-to-short to ensure correct matching.
 */
class SqlLexer
{
    // Token types
    public const T_SELECT = 'SELECT';
    public const T_INSERT = 'INSERT';
    public const T_REPLACE = 'REPLACE';
    public const T_UPDATE = 'UPDATE';
    public const T_DELETE = 'DELETE';
    public const T_FROM   = 'FROM';
    public const T_INTO   = 'INTO';
    public const T_VALUES = 'VALUES';
    public const T_SET    = 'SET';
    public const T_WHERE  = 'WHERE';
    public const T_AND    = 'AND';
    public const T_OR     = 'OR';
    public const T_IN     = 'IN';
    public const T_ORDER  = 'ORDER';
    public const T_BY     = 'BY';
    public const T_LIMIT  = 'LIMIT';
    public const T_OFFSET = 'OFFSET';
    public const T_AS     = 'AS';
    public const T_ASC    = 'ASC';
    public const T_DESC   = 'DESC';
    public const T_NOT    = 'NOT';
    public const T_IS     = 'IS';
    public const T_NULL   = 'NULL';
    public const T_TRUE   = 'TRUE';
    public const T_FALSE  = 'FALSE';
    public const T_LIKE   = 'LIKE';
    public const T_JOIN   = 'JOIN';
    public const T_LEFT   = 'LEFT';
    public const T_RIGHT  = 'RIGHT';
    public const T_INNER  = 'INNER';
    public const T_OUTER  = 'OUTER';
    public const T_FULL   = 'FULL';
    public const T_CROSS  = 'CROSS';
    public const T_ON     = 'ON';
    public const T_DISTINCT = 'DISTINCT';
    public const T_GROUP  = 'GROUP';
    public const T_HAVING = 'HAVING';
    public const T_BETWEEN = 'BETWEEN';
    public const T_EXISTS = 'EXISTS';
    public const T_UNION = 'UNION';
    public const T_INTERSECT = 'INTERSECT';
    public const T_EXCEPT = 'EXCEPT';
    public const T_ALL = 'ALL';
    public const T_ANY = 'ANY';
    public const T_SOME = 'SOME';
    public const T_CASE = 'CASE';
    public const T_WHEN = 'WHEN';
    public const T_THEN = 'THEN';
    public const T_ELSE = 'ELSE';
    public const T_END = 'END';
    public const T_OVER = 'OVER';
    public const T_PARTITION = 'PARTITION';
    public const T_WITH = 'WITH';
    public const T_RECURSIVE = 'RECURSIVE';
    public const T_CURRENT_DATE = 'CURRENT_DATE';
    public const T_CURRENT_TIME = 'CURRENT_TIME';
    public const T_CURRENT_TIMESTAMP = 'CURRENT_TIMESTAMP';
    public const T_FETCH = 'FETCH';
    public const T_FIRST = 'FIRST';
    public const T_NEXT  = 'NEXT';
    public const T_ROWS  = 'ROWS';
    public const T_ROW   = 'ROW';
    public const T_ONLY  = 'ONLY';
    public const T_CREATE = 'CREATE';
    public const T_DROP = 'DROP';
    public const T_ALTER = 'ALTER';
    public const T_TABLE = 'TABLE';
    public const T_INDEX = 'INDEX';
    public const T_VIEW = 'VIEW';
    public const T_IF = 'IF';
    public const T_PRIMARY = 'PRIMARY';
    public const T_KEY = 'KEY';
    public const T_UNIQUE = 'UNIQUE';
    public const T_FOREIGN = 'FOREIGN';
    public const T_REFERENCES = 'REFERENCES';
    public const T_CONSTRAINT = 'CONSTRAINT';
    public const T_DEFAULT = 'DEFAULT';
    public const T_AUTOINCREMENT = 'AUTOINCREMENT';
    public const T_CHECK = 'CHECK';
    public const T_CASCADE = 'CASCADE';
    public const T_RESTRICT = 'RESTRICT';
    public const T_ACTION = 'ACTION';
    public const T_NO = 'NO';
    public const T_TEMPORARY = 'TEMPORARY';
    public const T_TEMP = 'TEMP';

    public const T_IDENTIFIER = 'IDENTIFIER';
    public const T_STRING = 'STRING';
    public const T_NUMBER = 'NUMBER';
    public const T_PLACEHOLDER = 'PLACEHOLDER';
    public const T_COMMA  = 'COMMA';
    public const T_DOT    = 'DOT';
    public const T_STAR   = 'STAR';
    public const T_LPAREN = 'LPAREN';
    public const T_RPAREN = 'RPAREN';
    public const T_OP     = 'OPERATOR';
    public const T_EOF    = 'EOF';

    /** @var array<string, string> Keyword to token type */
    private const KEYWORDS = [
        'SELECT' => self::T_SELECT, 'INSERT' => self::T_INSERT, 'REPLACE' => self::T_REPLACE,
        'UPDATE' => self::T_UPDATE, 'DELETE' => self::T_DELETE, 'FROM' => self::T_FROM,
        'INTO' => self::T_INTO, 'VALUES' => self::T_VALUES, 'SET' => self::T_SET,
        'WHERE' => self::T_WHERE, 'AND' => self::T_AND, 'OR' => self::T_OR,
        'IN' => self::T_IN, 'ORDER' => self::T_ORDER, 'BY' => self::T_BY,
        'LIMIT' => self::T_LIMIT, 'OFFSET' => self::T_OFFSET, 'AS' => self::T_AS,
        'ASC' => self::T_ASC, 'DESC' => self::T_DESC, 'NOT' => self::T_NOT,
        'IS' => self::T_IS, 'NULL' => self::T_NULL, 'TRUE' => self::T_TRUE,
        'FALSE' => self::T_FALSE, 'LIKE' => self::T_LIKE, 'JOIN' => self::T_JOIN,
        'LEFT' => self::T_LEFT, 'RIGHT' => self::T_RIGHT, 'INNER' => self::T_INNER,
        'OUTER' => self::T_OUTER, 'FULL' => self::T_FULL, 'CROSS' => self::T_CROSS,
        'ON' => self::T_ON, 'DISTINCT' => self::T_DISTINCT, 'GROUP' => self::T_GROUP,
        'HAVING' => self::T_HAVING, 'BETWEEN' => self::T_BETWEEN, 'EXISTS' => self::T_EXISTS,
        'UNION' => self::T_UNION, 'INTERSECT' => self::T_INTERSECT, 'EXCEPT' => self::T_EXCEPT,
        'ALL' => self::T_ALL, 'ANY' => self::T_ANY, 'SOME' => self::T_SOME,
        'CASE' => self::T_CASE, 'WHEN' => self::T_WHEN, 'THEN' => self::T_THEN,
        'ELSE' => self::T_ELSE, 'END' => self::T_END, 'OVER' => self::T_OVER,
        'PARTITION' => self::T_PARTITION, 'WITH' => self::T_WITH, 'RECURSIVE' => self::T_RECURSIVE,
        'CURRENT_DATE' => self::T_CURRENT_DATE, 'CURRENT_TIME' => self::T_CURRENT_TIME,
        'CURRENT_TIMESTAMP' => self::T_CURRENT_TIMESTAMP, 'FETCH' => self::T_FETCH,
        'FIRST' => self::T_FIRST, 'NEXT' => self::T_NEXT, 'ROWS' => self::T_ROWS,
        'ROW' => self::T_ROW, 'ONLY' => self::T_ONLY, 'CREATE' => self::T_CREATE,
        'DROP' => self::T_DROP, 'ALTER' => self::T_ALTER, 'TABLE' => self::T_TABLE,
        'INDEX' => self::T_INDEX, 'VIEW' => self::T_VIEW, 'IF' => self::T_IF,
        'PRIMARY' => self::T_PRIMARY, 'KEY' => self::T_KEY, 'UNIQUE' => self::T_UNIQUE,
        'FOREIGN' => self::T_FOREIGN, 'REFERENCES' => self::T_REFERENCES,
        'CONSTRAINT' => self::T_CONSTRAINT, 'DEFAULT' => self::T_DEFAULT,
        'AUTOINCREMENT' => self::T_AUTOINCREMENT, 'CHECK' => self::T_CHECK,
        'CASCADE' => self::T_CASCADE, 'RESTRICT' => self::T_RESTRICT,
        'ACTION' => self::T_ACTION, 'NO' => self::T_NO,
        'TEMPORARY' => self::T_TEMPORARY, 'TEMP' => self::T_TEMP,
    ];

    /**
     * Combined regex pattern built once at class load time.
     * Named groups: WS (whitespace), CMT (comment), NUM, STR, HEX, POS, NAM, ID, BT, DQ, OP
     */
    private static string $pattern;

    private string $sql;

    public static function init(): void
    {
        // Operators sorted long-to-short for correct matching
        $ops = ['<>', '>=', '<=', '!=', '||', '>', '<', '=', '!', '+', '-', '*', '/', '%', '|', '(', ')', ',', '.'];
        $opPattern = implode('|', array_map(fn($op) => preg_quote($op, '~'), $ops));

        // Build combined pattern with named capture groups
        // Uses preg_match_all for batch processing - much faster than iterative preg_match
        self::$pattern = '~' .
            '(?<WS>\s+)|' .                                       // whitespace (skipped)
            '(?<CMT>--[^\n]*|\#[^\n]*|/\*[\s\S]*?\*/)|' .         // comments (skipped)
            '(?<NUM>\d+(?:\.\d*)?)|' .                            // numbers
            "'(?<STR>(?:[^'\\\\]|''|\\\\.)*)'|" .                 // strings
            "[xX]'(?<HEX>[0-9a-fA-F]*)'|" .                       // hex blobs
            '(?<POS>\?)|' .                                       // positional placeholder
            '(?<NAM>:[a-zA-Z_]\w*)|' .                            // named placeholder
            '(?<ID>[a-zA-Z_]\w*)|' .                              // identifiers
            '`(?<BT>[^`]*)`|' .                                   // backtick quoted
            '"(?<DQ>[^"]*)"' .                                    // double quoted
            '|(?<OP>' . $opPattern . ')|' .                       // operators (sorted)
            '(?<ERR>[\s\S])' .                                    // catch-all for errors
        '~';
    }

    public function __construct(string $sql)
    {
        $this->sql = $sql;
    }

    /**
     * Tokenize SQL string into array of tokens
     *
     * @return array Array of tokens with type, value, and position
     * @throws SqlSyntaxException
     */
    public function tokenize(): array
    {
        $sql = $this->sql;
        $count = preg_match_all(self::$pattern, $sql, $m, PREG_PATTERN_ORDER | PREG_UNMATCHED_AS_NULL);

        // Local refs for hot path
        $keywords = self::KEYWORDS;
        $raw = $m[0];
        $mWS = $m['WS'];
        $mCMT = $m['CMT'];
        $mNUM = $m['NUM'];
        $mSTR = $m['STR'];
        $mHEX = $m['HEX'];
        $mPOS = $m['POS'];
        $mNAM = $m['NAM'];
        $mID = $m['ID'];
        $mBT = $m['BT'];
        $mDQ = $m['DQ'];
        $mOP = $m['OP'];
        $mERR = $m['ERR'];

        // Single pass: calculate offset and build tokens together
        $tokens = [];
        $pos = 0;

        for ($i = 0; $i < $count; $i++) {
            $len = strlen($raw[$i]);

            // Error check
            if ($mERR[$i] !== null) {
                throw new SqlSyntaxException("Unexpected character '{$mERR[$i]}'", $sql, $pos);
            }

            // Skip whitespace and comments
            if ($mWS[$i] !== null || $mCMT[$i] !== null) {
                $pos += $len;
                continue;
            }

            // Numbers
            if ($mNUM[$i] !== null) {
                $tokens[] = ['type' => self::T_NUMBER, 'value' => $mNUM[$i], 'pos' => $pos];
                $pos += $len;
                continue;
            }

            // Strings
            if ($mSTR[$i] !== null) {
                $str = $mSTR[$i];
                $str = str_replace("''", "'", $str);
                $str = str_replace("\\'", "'", $str);
                $tokens[] = ['type' => self::T_STRING, 'value' => $str, 'pos' => $pos];
                $pos += $len;
                continue;
            }

            // Hex blobs
            if ($mHEX[$i] !== null) {
                $hex = $mHEX[$i];
                $binary = $hex === '' ? '' : hex2bin($hex);
                if ($binary === false) {
                    throw new SqlSyntaxException("Invalid hex literal: x'$hex'", $sql, $pos);
                }
                $tokens[] = ['type' => self::T_STRING, 'value' => $binary, 'pos' => $pos];
                $pos += $len;
                continue;
            }

            // Positional placeholder
            if ($mPOS[$i] !== null) {
                $tokens[] = ['type' => self::T_PLACEHOLDER, 'value' => '?', 'pos' => $pos];
                $pos += $len;
                continue;
            }

            // Named placeholder
            if ($mNAM[$i] !== null) {
                $tokens[] = ['type' => self::T_PLACEHOLDER, 'value' => $mNAM[$i], 'pos' => $pos];
                $pos += $len;
                continue;
            }

            // Identifiers/keywords
            if ($mID[$i] !== null) {
                $word = $mID[$i];
                $type = $keywords[strtoupper($word)] ?? self::T_IDENTIFIER;
                $tokens[] = ['type' => $type, 'value' => $word, 'pos' => $pos];
                $pos += $len;
                continue;
            }

            // Backtick quoted
            if ($mBT[$i] !== null) {
                $tokens[] = ['type' => self::T_IDENTIFIER, 'value' => $mBT[$i], 'pos' => $pos];
                $pos += $len;
                continue;
            }

            // Double quoted
            if ($mDQ[$i] !== null) {
                $tokens[] = ['type' => self::T_IDENTIFIER, 'value' => $mDQ[$i], 'pos' => $pos];
                $pos += $len;
                continue;
            }

            // Operators
            if ($mOP[$i] !== null) {
                $op = $mOP[$i];
                $type = match ($op) {
                    '(' => self::T_LPAREN,
                    ')' => self::T_RPAREN,
                    ',' => self::T_COMMA,
                    '.' => self::T_DOT,
                    '*' => self::T_STAR,
                    default => self::T_OP,
                };
                $tokens[] = ['type' => $type, 'value' => $op, 'pos' => $pos];
            }
            $pos += $len;
        }

        $tokens[] = ['type' => self::T_EOF, 'value' => null, 'pos' => $pos];
        return $tokens;
    }
}

// Initialize static pattern
SqlLexer::init();