CssSelectorParser.php

PHP

Path: src/Html/CssSelectorParser.php

<?php

declare(strict_types=1);

namespace mini\Html;

/**
 * CSS Selector Parser — regex lexer + recursive descent parser.
 *
 * Supports: tag, #id, .class, [attr], [attr="value"], [attr=value], *,
 * descendant (whitespace), child (>), and selector lists (comma).
 * Anything else (pseudo-classes, other combinators, attribute operators such
 * as ~= or ^=, ...) is rejected with an \InvalidArgumentException.
 *
 * Grammar (WS* = optional whitespace):
 *   selectorList     = WS* complexSelector (WS* ',' WS* complexSelector)* WS*
 *   complexSelector  = compoundSelector (combinator compoundSelector)*
 *   combinator       = WS* '>' WS* | WS+
 *   compoundSelector = simpleSelector+
 *   simpleSelector   = '*' | IDENT | HASH | '.' IDENT | attribute
 *   attribute        = '[' WS* IDENT WS* ('=' WS* (STRING | IDENT) WS*)? ']'
 *
 * Notes:
 *  - Identifiers are lexed as [\w-]+ and strings have no escape sequences.
 *  - A type or universal selector is accepted at any position inside a
 *    compound selector (e.g. "[a]div"), which is laxer than the CSS grammar.
 *  - Positions reported in error messages are byte offsets into the input.
 */
class CssSelectorParser
{
    private const T_IDENT    = 'IDENT';
    private const T_HASH     = 'HASH';
    private const T_DOT      = 'DOT';
    private const T_LBRACKET = 'LBRACKET';
    private const T_RBRACKET = 'RBRACKET';
    private const T_EQUALS   = 'EQUALS';
    private const T_STRING   = 'STRING';
    private const T_COMMA    = 'COMMA';
    private const T_GT       = 'GT';
    private const T_STAR     = 'STAR';
    private const T_WS       = 'WS';
    private const T_EOF      = 'EOF';

    /**
     * Lexer pattern: one named group per token kind. The trailing ERR group
     * matches any single character nothing else claimed, so consecutive
     * matches always cover the whole input without gaps.
     */
    private const PATTERN = '~
        (?<HASH>\#[\w-]+)|
        (?<STR>"[^"]*"|\'[^\']*\')|
        (?<ID>[\w-]+)|
        (?<DOT>\.)|
        (?<LB>\[)|
        (?<RB>\])|
        (?<EQ>=)|
        (?<COMMA>,)|
        (?<GT>>)|
        (?<STAR>\*)|
        (?<WS>\s+)|
        (?<ERR>[\s\S])
    ~x';

    /** Maps each lexer group name (except ERR) to the token type it produces. */
    private const GROUP_TYPES = [
        'HASH'  => self::T_HASH,
        'STR'   => self::T_STRING,
        'ID'    => self::T_IDENT,
        'DOT'   => self::T_DOT,
        'LB'    => self::T_LBRACKET,
        'RB'    => self::T_RBRACKET,
        'EQ'    => self::T_EQUALS,
        'COMMA' => self::T_COMMA,
        'GT'    => self::T_GT,
        'STAR'  => self::T_STAR,
        'WS'    => self::T_WS,
    ];

    private string $selector;
    /** @var array<int, array{type: string, value: string, pos: int}> */
    private array $tokens;
    private int $cursor;

    private function __construct(string $selector)
    {
        $this->selector = $selector;
    }

    /**
     * Parse a CSS selector string.
     *
     * The result is a list of complex selectors (one per comma-separated
     * entry). Each complex selector is a list of segments; a segment holds a
     * compound selector (list of simple-selector tuples such as
     * ['type', 'div'], ['id', 'x'], ['class', 'y'], ['attr', 'name'],
     * ['attr', 'name', 'value'] or ['universal']) and the combinator that
     * links it to the previous segment (null for the first, '>' or ' ').
     *
     * @return array<int, array<int, array{compound: array, combinator: ?string}>>
     *
     * @throws \InvalidArgumentException on any syntax the parser does not support
     */
    public static function parse(string $selector): array
    {
        $parser = new self($selector);
        $parser->tokenize();
        return $parser->selectorList();
    }

    /**
     * Split the selector into tokens and reset the cursor.
     *
     * HASH tokens drop the leading '#', STRING tokens drop their quotes; all
     * other tokens keep the matched text as value. An EOF token is appended so
     * the parser can always inspect the current token.
     *
     * @throws \InvalidArgumentException on a character no token rule accepts,
     *                                   or when the regex engine itself fails
     */
    private function tokenize(): void
    {
        $count = preg_match_all(
            self::PATTERN,
            $this->selector,
            $m,
            PREG_PATTERN_ORDER | PREG_UNMATCHED_AS_NULL
        );

        // A false return means PCRE gave up (e.g. backtrack/JIT limits). Report
        // that explicitly instead of continuing with an empty token stream.
        if ($count === false) {
            throw new \InvalidArgumentException(
                'CSS selector could not be tokenized (PCRE error code ' . preg_last_error()
                . ") in \"{$this->selector}\""
            );
        }

        $tokens = [];
        $pos = 0;

        for ($i = 0; $i < $count; $i++) {
            if ($m['ERR'][$i] !== null) {
                throw new \InvalidArgumentException(
                    "CSS selector syntax error: unexpected '{$m['ERR'][$i]}' at position {$pos} in \"{$this->selector}\""
                );
            }

            foreach (self::GROUP_TYPES as $group => $type) {
                $text = $m[$group][$i];
                if ($text === null) {
                    continue;
                }

                if ($type === self::T_HASH) {
                    $value = substr($text, 1);      // strip '#'
                } elseif ($type === self::T_STRING) {
                    $value = substr($text, 1, -1);  // strip surrounding quotes
                } else {
                    $value = $text;
                }

                $tokens[] = ['type' => $type, 'value' => $value, 'pos' => $pos];
                break; // exactly one group matches per token
            }

            $pos += strlen($m[0][$i]);
        }

        $tokens[] = ['type' => self::T_EOF, 'value' => '', 'pos' => $pos];
        $this->tokens = $tokens;
        $this->cursor = 0;
    }

    /** Return the token under the cursor without consuming it. */
    private function current(): array
    {
        return $this->tokens[$this->cursor];
    }

    /** Return the token under the cursor and move past it. */
    private function advance(): array
    {
        return $this->tokens[$this->cursor++];
    }

    /** Move the cursor past any run of whitespace tokens. */
    private function skipWs(): void
    {
        while ($this->current()['type'] === self::T_WS) {
            $this->cursor++;
        }
    }

    /**
     * Consume and return the current token if it has the given type.
     *
     * @throws \InvalidArgumentException when the current token has another type
     */
    private function expect(string $type): array
    {
        $tok = $this->current();
        if ($tok['type'] !== $type) {
            throw new \InvalidArgumentException(
                "CSS selector syntax error: expected {$type}, got '{$tok['value']}' at position {$tok['pos']} in \"{$this->selector}\""
            );
        }
        $this->cursor++;
        return $tok;
    }

    /**
     * selectorList = complexSelector (',' complexSelector)*, then EOF.
     *
     * @return array<int, array<int, array{compound: array, combinator: ?string}>>
     *
     * @throws \InvalidArgumentException when tokens remain after the last selector
     */
    private function selectorList(): array
    {
        $this->skipWs();
        $list = [$this->complexSelector()];

        while ($this->current()['type'] === self::T_COMMA) {
            $this->cursor++; // consume ','
            $this->skipWs();
            $list[] = $this->complexSelector();
        }

        $tok = $this->current();
        if ($tok['type'] !== self::T_EOF) {
            throw new \InvalidArgumentException(
                "CSS selector syntax error: unexpected '{$tok['value']}' at position {$tok['pos']} in \"{$this->selector}\""
            );
        }

        return $list;
    }

    /**
     * complexSelector = compoundSelector (combinator compoundSelector)*.
     *
     * Trailing whitespace before ',' or EOF is consumed here and is not
     * treated as a descendant combinator.
     *
     * @return array<int, array{compound: array, combinator: ?string}>
     */
    private function complexSelector(): array
    {
        $segments = [['compound' => $this->compoundSelector(), 'combinator' => null]];

        while (true) {
            $sawWs = $this->current()['type'] === self::T_WS;
            $this->skipWs();
            $next = $this->current()['type'];

            // Child combinator, with or without surrounding whitespace.
            if ($next === self::T_GT) {
                $this->cursor++;
                $this->skipWs();
                $segments[] = ['compound' => $this->compoundSelector(), 'combinator' => '>'];
                continue;
            }

            // No whitespace means no combinator at all; whitespace directly
            // before ',' or EOF is just padding. Either way this selector ends.
            if (!$sawWs || $next === self::T_COMMA || $next === self::T_EOF) {
                break;
            }

            // Whitespace followed by anything else is a descendant combinator.
            $segments[] = ['compound' => $this->compoundSelector(), 'combinator' => ' '];
        }

        return $segments;
    }

    /**
     * compoundSelector = simpleSelector+.
     *
     * Pseudo-classes/pseudo-elements never reach this method: ':' is not a
     * token, so the lexer already rejects it.
     *
     * @return array<int, array>
     *
     * @throws \InvalidArgumentException when no simple selector is present
     */
    private function compoundSelector(): array
    {
        $parts = [];

        while (true) {
            switch ($this->current()['type']) {
                case self::T_STAR:
                    $this->cursor++;
                    $parts[] = ['universal'];
                    break;

                case self::T_IDENT:
                    $parts[] = ['type', $this->advance()['value']];
                    break;

                case self::T_HASH:
                    $parts[] = ['id', $this->advance()['value']];
                    break;

                case self::T_DOT:
                    $this->cursor++; // consume '.'
                    $ident = $this->expect(self::T_IDENT);
                    $parts[] = ['class', $ident['value']];
                    break;

                case self::T_LBRACKET:
                    $parts[] = $this->attributeSelector();
                    break;

                default:
                    break 2; // not a simple-selector start: compound is complete
            }
        }

        if ($parts === []) {
            $tok = $this->current();
            throw new \InvalidArgumentException(
                "CSS selector syntax error: expected selector, got '{$tok['value']}' at position {$tok['pos']} in \"{$this->selector}\""
            );
        }

        return $parts;
    }

    /**
     * attribute = '[' WS* IDENT WS* ('=' WS* (STRING | IDENT) WS*)? ']'.
     *
     * @return array ['attr', name] for presence tests, ['attr', name, value] for equality
     *
     * @throws \InvalidArgumentException on a malformed attribute selector
     */
    private function attributeSelector(): array
    {
        $this->expect(self::T_LBRACKET);
        $this->skipWs();
        $attr = $this->expect(self::T_IDENT);
        $this->skipWs();

        if ($this->current()['type'] === self::T_RBRACKET) {
            $this->cursor++;
            return ['attr', $attr['value']];
        }

        $this->expect(self::T_EQUALS);
        $this->skipWs();

        $valTok = $this->current();
        if ($valTok['type'] !== self::T_STRING && $valTok['type'] !== self::T_IDENT) {
            throw new \InvalidArgumentException(
                "CSS selector syntax error: expected attribute value, got '{$valTok['value']}' at position {$valTok['pos']} in \"{$this->selector}\""
            );
        }
        $this->cursor++;
        $this->skipWs();

        $this->expect(self::T_RBRACKET);
        return ['attr', $attr['value'], $valTok['value']];
    }
}