GenericParser.php

PHP

Path: src/Parsing/GenericParser.php

<?php
namespace mini\Parsing;

use ArrayAccess;
use Countable;
use InvalidArgumentException;
use IteratorAggregate;
use LogicException;
use OutOfRangeException;
use Stringable;
use Traversable;
use UnexpectedValueException;

/**
 * GenericParser
 *
 * A small, general-purpose, lossless structural parser for arbitrary strings.
 *
 * The parser walks an input string and builds a lightweight syntax tree:
 *
 * - Text spans are represented as TextNode instances.
 * - Delimited regions (quotes and bracket pairs) are represented as DelimitedNode instances.
 * - The root of the tree is a NodeList, which:
 *   - implements ArrayAccess, IteratorAggregate and Countable,
 *   - can be cast to string to reconstruct the original input,
 *   - allows `$tree[1]` to access the second root-level node.
 *
 * Configuration is provided to the constructor:
 *
 * - `$quotes`      : map of opening => closing quote characters (e.g. ['"' => '"', "'" => "'", '`' => '`']).
 * - `$escapeStyle` : how strings are escaped:
 *      - GenericParser::ESCAPE_NONE           : no escape handling.
 *      - GenericParser::ESCAPE_C              : C-style backslash escapes (\" \\ \n \r \t ...).
 *      - GenericParser::ESCAPE_JSON           : JSON-style backslash escapes (subset of C-style).
 *      - GenericParser::ESCAPE_QUOTE_DOUBLING : quote-doubling ("" or '' inside strings).
 * - `$pairs`       : associative array of opening => closing delimiters
 *                    (e.g. ['(' => ')', '[' => ']', '{' => '}']).
 *
 * Example:
 *
 *  $parser = new GenericParser(
 *      quotes: ['"' => '"', "'" => "'", '`' => '`'],
 *      escapeStyle: GenericParser::ESCAPE_C,
 *      pairs: ['(' => ')', '[' => ']', '{' => '}']
 *  );
 *
 *  $tree = $parser->parse($input);
 *
 *  echo $tree;      // outputs the exact string that was parsed
 *  echo $tree[1];   // second root-level node
 *
 * This parser is intentionally minimal: it only understands quotes and bracket
 * pairs. Everything else is left as raw text. Quoted regions are treated as
 * opaque: their contents are not further structured into DelimitedNode
 * instances; they simply contain a single TextNode child.
 */
final class GenericParser
{
    public const ESCAPE_NONE            = 'none';
    public const ESCAPE_C               = 'c';
    public const ESCAPE_JSON            = 'json';
    public const ESCAPE_QUOTE_DOUBLING  = 'double';

    /** @var array<string,string> Quote open => close (same char for symmetric) */
    private array $quotes;

    /** @var array<string,string> Pair open => close (nested structure) */
    private array $pairs;

    private string $escapeStyle;

    // Per-parse cursor state; reset at the start of every parse() call.
    private string $input = '';
    private int $length = 0;
    private int $pos = 0;

    /**
     * @param array<string,string>|list<string> $quotes Map of opening => closing quote characters.
     *                                     Use same char for symmetric: ['"' => '"', "'" => "'"]
     *                                     Use different for asymmetric: ['[' => ']']
     *                                     A plain list such as ['"', "'"] is also accepted and is
     *                                     treated as symmetric quotes.
     * @param array<string,string> $pairs Map of opening => closing for nested structures (e.g., ['(' => ')'])
     * @param string               $escapeStyle One of the ESCAPE_* constants. Any other value
     *                                          behaves like ESCAPE_NONE.
     */
    public function __construct(
        array $quotes = ['"' => '"', "'" => "'", '`' => '`', '[' => ']', '{' => '}'],
        array $pairs = ['(' => ')'],
        string $escapeStyle = self::ESCAPE_QUOTE_DOUBLING
    ) {
        $this->quotes = self::normalizeQuotes($quotes);
        $this->pairs = $pairs;
        $this->escapeStyle = $escapeStyle;
    }

    /**
     * Create a parser configured for SQL
     */
    public static function sql(): self
    {
        return new self(
            quotes: ['"' => '"', "'" => "'", '`' => '`', '[' => ']'],
            pairs: ['(' => ')'],
            escapeStyle: self::ESCAPE_QUOTE_DOUBLING
        );
    }

    /**
     * Parse the given string into a NodeList tree.
     *
     * The returned NodeList:
     * - can be cast to string to reconstruct the original input,
     * - is indexable via ArrayAccess (`$tree[1]`),
     * - is iterable (`foreach ($tree as $node)`).
     *
     * The input is scanned one byte at a time, so delimiters are matched as
     * single bytes.
     *
     * @param  string   $input
     * @return NodeList
     */
    public function parse(string $input): NodeList
    {
        $this->input  = $input;
        $this->length = strlen($input);
        $this->pos    = 0;

        // No end delimiters at the root: the list runs to the end of the input.
        return $this->parseList([]);
    }

    /**
     * Turn a list of quote characters into an opening => closing map.
     *
     * Opening characters are looked up by array key, so a plain list such as
     * ['"', "'"] (keys 0 and 1) would never match a quote and would instead
     * make the digits "0" and "1" act as openers. A list is therefore expanded
     * to a symmetric map; anything that is not a list is returned untouched.
     *
     * Note: PHP stores the keys '0', '1', ... as integers, so a map whose keys
     * are exactly the digits 0..n-1 in order cannot be told apart from a list
     * and is treated as one.
     *
     * @param  array<array-key,string> $quotes
     * @return array<string,string>
     */
    private static function normalizeQuotes(array $quotes): array
    {
        if ($quotes === [] || !array_is_list($quotes)) {
            return $quotes;
        }

        $map = [];
        foreach ($quotes as $char) {
            $char = (string) $char;
            $map[$char] = $char;
        }

        return $map;
    }

    /**
     * Parse until the end of the string or until one of the given delimiters is encountered.
     *
     * The terminating delimiter itself is NOT consumed; the caller decides
     * what to do with it.
     *
     * @param  string[] $endDelimiters Single-character strings that terminate this list.
     * @return NodeList
     */
    private function parseList(array $endDelimiters): NodeList
    {
        $nodes = [];
        $text  = '';

        while ($this->pos < $this->length) {
            $ch = $this->input[$this->pos];

            // Stop if we hit any of the configured end delimiters
            if (in_array($ch, $endDelimiters, true)) {
                break;
            }

            // Opening of a delimited region: quote or pair
            if (array_key_exists($ch, $this->quotes) || array_key_exists($ch, $this->pairs)) {
                // Flush any accumulated text first so node order matches input order
                if ($text !== '') {
                    $nodes[] = new TextNode($text);
                    $text = '';
                }

                // parseDelimited() advances $this->pos past the region
                $nodes[] = $this->parseDelimited($ch);
                continue;
            }

            // Plain text
            $text .= $ch;
            $this->pos++;
        }

        if ($text !== '') {
            $nodes[] = new TextNode($text);
        }

        return new NodeList($nodes);
    }

    /**
     * Parse a delimited region starting at the current position.
     *
     * This handles:
     * - quotes (opaque, no nested structure), and
     * - pairs (with nested structure).
     *
     * A character configured both as a quote and as a pair is treated as a quote.
     *
     * @param  string       $startChar
     * @return DelimitedNode
     */
    private function parseDelimited(string $startChar): DelimitedNode
    {
        // Quotes: opaque content
        if (array_key_exists($startChar, $this->quotes)) {
            return $this->parseQuoted($startChar, $this->quotes[$startChar]);
        }

        // Pairs: nested structure
        $open  = $startChar;
        $close = $this->pairs[$open];

        // Consume the opening delimiter
        $this->pos++;

        $children = $this->parseList([$close]);

        // parseList() stops either on the closing delimiter or at end of input;
        // only the former counts as a properly closed region.
        $closed = false;
        if ($this->pos < $this->length && $this->input[$this->pos] === $close) {
            $this->pos++; // consume closing delimiter
            $closed = true;
        }

        return new DelimitedNode($open, $close, $children, $closed);
    }

    /**
     * Parse a quoted region starting at the current position.
     *
     * Quoted regions are treated as opaque: their content is represented as a
     * single TextNode child of the DelimitedNode (an empty TextNode for an
     * empty quote). Escape sequences are copied verbatim, never decoded.
     *
     * Escape behavior inside the quote depends on $escapeStyle:
     * - ESCAPE_NONE           : no escapes, quote closes on the next matching char.
     * - ESCAPE_C / ESCAPE_JSON: backslash escapes; backslash plus next char are
     *                           treated as literal content.
     * - ESCAPE_QUOTE_DOUBLING : doubled close char inside the quoted region
     *                           is treated as an escaped quote.
     *
     * @param  string $open  Opening quote character
     * @param  string $close Closing quote character (same as open for symmetric quotes)
     * @return DelimitedNode
     */
    private function parseQuoted(string $open, string $close): DelimitedNode
    {
        $backslashEscapes = $this->escapeStyle === self::ESCAPE_C
            || $this->escapeStyle === self::ESCAPE_JSON;
        $quoteDoubling = $this->escapeStyle === self::ESCAPE_QUOTE_DOUBLING;

        // Consume the opening quote
        $this->pos++;

        $buf    = '';
        $closed = false;

        while ($this->pos < $this->length) {
            $ch = $this->input[$this->pos];

            // Backslash escape: copy the backslash and the byte after it (if any)
            // so the escaped byte can never terminate the quote. A trailing
            // backslash at end of input is copied on its own.
            if ($backslashEscapes && $ch === '\\') {
                $escape = substr($this->input, $this->pos, 2);
                $buf .= $escape;
                $this->pos += strlen($escape);
                continue;
            }

            if ($ch === $close) {
                // Quote doubling: "" / '' / ]] inside the region is an escaped quote
                if (
                    $quoteDoubling
                    && $this->pos + 1 < $this->length
                    && $this->input[$this->pos + 1] === $close
                ) {
                    $buf .= $close . $close;
                    $this->pos += 2;
                    continue;
                }

                // A lone close char terminates the region
                $this->pos++;
                $closed = true;
                break;
            }

            $buf .= $ch;
            $this->pos++;
        }

        $children = new NodeList([
            new TextNode($buf),
        ]);

        return new DelimitedNode($open, $close, $children, $closed);
    }
}

/**
 * Node
 *
 * Base interface for all nodes in the syntax tree.
 *
 * Nodes are:
 * - TextNode        : raw text spans.
 * - DelimitedNode   : regions enclosed in delimiters such as quotes or brackets.
 */
interface Node extends Stringable
{
    /**
     * Return the nodes nested directly inside this node.
     *
     * Only the immediate level is returned; callers that need deeper levels
     * call children() again on the returned nodes.
     *
     * @return Node[] Direct child nodes (empty for TextNode).
     */
    public function children(): array;
}

/**
 * TextNode
 *
 * Represents a contiguous span of plain text.
 */
final class TextNode implements Node
{
    /** The exact text covered by this node; immutable once constructed. */
    public readonly string $text;

    /**
     * @param string $text The exact text content of this node.
     */
    public function __construct(string $text)
    {
        $this->text = $text;
    }

    /**
     * Casting a text node to string yields its text unchanged.
     */
    public function __toString(): string
    {
        return $this->text;
    }

    /**
     * A text node is a leaf, so this is always an empty array.
     *
     * @return Node[]
     */
    public function children(): array
    {
        return [];
    }
}

/**
 * DelimitedNode
 *
 * Represents a region enclosed by a pair of delimiters, such as:
 * - quotes     : " ... ", ' ... ', ` ... `
 * - brackets   : ( ... ), [ ... ], { ... }
 *
 * The node retains the exact opening and closing delimiters and a NodeList of
 * its inner content. If the closing delimiter was not found (unbalanced input),
 * $closed will be false, and __toString() will not append the closing delimiter,
 * preserving the original input faithfully.
 */
final class DelimitedNode implements Node
{
    /**
     * @param string   $open     Delimiter that starts the region.
     * @param string   $close    Delimiter that ends the region.
     * @param NodeList $children Content found between the two delimiters.
     * @param bool     $closed   Whether the closing delimiter was actually present in the input.
     */
    public function __construct(
        public readonly string $open,
        public readonly string $close,
        public readonly NodeList $children,
        public readonly bool $closed = true
    ) {
    }

    /**
     * Render the region as text: opening delimiter, inner content, and the
     * closing delimiter only when the region was actually closed.
     */
    public function __toString(): string
    {
        $text = $this->open . $this->children;

        // An unterminated region must not gain a closing delimiter.
        if ($this->closed) {
            $text .= $this->close;
        }

        return $text;
    }

    /**
     * Expose the inner NodeList as a plain array.
     *
     * @return Node[] Direct children of this delimited region.
     */
    public function children(): array
    {
        return $this->children->all();
    }
}

/**
 * NodeList
 *
 * A list of Node instances that:
 * - can be echoed to reconstruct the exact concatenated text of all nodes,
 * - supports array access (`$list[0]`),
 * - is iterable, and
 * - implements Countable.
 *
 * The root of a parsed tree is a NodeList, but NodeList is also used inside
 * DelimitedNode instances for nested content.
 */
final class NodeList implements ArrayAccess, IteratorAggregate, Countable, Stringable
{
    /** @var list<Node> Dense, zero-based list of nodes in document order. */
    private array $nodes;

    /**
     * @param Node[] $nodes Nodes in document order; the array is re-indexed from zero.
     *
     * @throws InvalidArgumentException If any element is not a Node.
     */
    public function __construct(array $nodes)
    {
        foreach ($nodes as $node) {
            self::assertNode($node);
        }

        $this->nodes = array_values($nodes);
    }

    /**
     * Reconstruct the original text represented by this list of nodes.
     */
    public function __toString(): string
    {
        return implode('', array_map(strval(...), $this->nodes));
    }

    /**
     * @inheritDoc
     */
    public function offsetExists(mixed $offset): bool
    {
        return isset($this->nodes[$offset]);
    }

    /**
     * @inheritDoc
     *
     * @return Node|null The node at $offset, or null when there is none.
     */
    public function offsetGet(mixed $offset): ?Node
    {
        return $this->nodes[$offset] ?? null;
    }

    /**
     * Replace the node at an existing index, or append a node.
     *
     * Appending is done with `$list[] = $node` or by assigning to the index
     * equal to count(). Any other offset is rejected so the list never gets
     * gaps or non-integer keys.
     *
     * @inheritDoc
     *
     * @throws OutOfRangeException      If $offset is not null or an integer in 0..count().
     * @throws InvalidArgumentException If $value is not a Node.
     */
    public function offsetSet(mixed $offset, mixed $value): void
    {
        $count = count($this->nodes);

        // `$list[] = $node` arrives with a null offset: treat it as "one past the end".
        $index = $offset ?? $count;

        // Native arrays treat canonical integer strings ('3') as integers; do the same.
        if (is_string($index) && (string) (int) $index === $index) {
            $index = (int) $index;
        }

        if (!is_int($index) || $index < 0 || $index > $count) {
            throw new OutOfRangeException(sprintf(
                'NodeList offset must be null or an integer from 0 to %d, %s given.',
                $count,
                is_int($index) ? (string) $index : get_debug_type($index)
            ));
        }

        self::assertNode($value);

        $this->nodes[$index] = $value;
    }

    /**
     * Remove the node at $offset (if any) and close the gap it leaves.
     *
     * @inheritDoc
     */
    public function offsetUnset(mixed $offset): void
    {
        unset($this->nodes[$offset]);
        $this->nodes = array_values($this->nodes);
    }

    /**
     * Iterate over all nodes.
     *
     * @return Traversable<Node>
     */
    public function getIterator(): Traversable
    {
        yield from $this->nodes;
    }

    /**
     * Number of nodes in this list.
     */
    public function count(): int
    {
        return count($this->nodes);
    }

    /**
     * Return the underlying node array.
     *
     * @return Node[]
     */
    public function all(): array
    {
        return $this->nodes;
    }

    /**
     * Walk all nodes depth-first, applying a callback to each
     *
     * The callback receives each node and can return:
     * - A Node to replace it
     * - A string to replace it with a TextNode
     * - null to keep the node unchanged
     *
     * For DelimitedNode, children are walked first (depth-first), then
     * the callback is applied to the DelimitedNode itself. Replacements are
     * made in place, both in this list and in the nested lists.
     *
     * @param \Closure(Node): (Node|string|null) $fn
     *
     * @throws UnexpectedValueException If the callback returns anything other
     *                                  than a Node, a string or null.
     */
    public function walk(\Closure $fn): void
    {
        foreach ($this->nodes as $i => $node) {
            // Recurse into children first
            if ($node instanceof DelimitedNode) {
                $node->children->walk($fn);
            }

            // Apply callback
            $result = $fn($node);
            if ($result === null) {
                continue;
            }

            if (is_string($result)) {
                $result = new TextNode($result);
            } elseif (!$result instanceof Node) {
                throw new UnexpectedValueException(sprintf(
                    'NodeList::walk() callback must return a %s, a string or null, %s returned.',
                    Node::class,
                    get_debug_type($result)
                ));
            }

            $this->nodes[$i] = $result;
        }
    }

    /**
     * Guard used by every write path so the list only ever holds Node instances.
     *
     * @throws InvalidArgumentException If $value is not a Node.
     */
    private static function assertNode(mixed $value): void
    {
        if (!$value instanceof Node) {
            throw new InvalidArgumentException(sprintf(
                'NodeList can only contain %s instances, %s given.',
                Node::class,
                get_debug_type($value)
            ));
        }
    }
}