<?php
namespace JmesPath;

/**
 * Tokenizes JMESPath expressions.
 *
 * The lexer walks the expression one byte at a time (the input is split with
 * str_split()) and uses the first character of each token to decide which
 * kind of token is being consumed.
 */
class Lexer
{
    const T_DOT = 'dot';
    const T_STAR = 'star';
    const T_COMMA = 'comma';
    const T_COLON = 'colon';
    const T_CURRENT = 'current';
    const T_EXPREF = 'expref';
    const T_LPAREN = 'lparen';
    const T_RPAREN = 'rparen';
    const T_LBRACE = 'lbrace';
    const T_RBRACE = 'rbrace';
    const T_LBRACKET = 'lbracket';
    const T_RBRACKET = 'rbracket';
    const T_FLATTEN = 'flatten';
    const T_IDENTIFIER = 'identifier';
    const T_NUMBER = 'number';
    const T_QUOTED_IDENTIFIER = 'quoted_identifier';
    const T_UNKNOWN = 'unknown';
    const T_PIPE = 'pipe';
    const T_OR = 'or';
    const T_AND = 'and';
    const T_NOT = 'not';
    const T_FILTER = 'filter';
    const T_LITERAL = 'literal';
    const T_EOF = 'eof';
    const T_COMPARATOR = 'comparator';

    const STATE_IDENTIFIER = 0;
    const STATE_NUMBER = 1;
    const STATE_SINGLE_CHAR = 2;
    const STATE_WHITESPACE = 3;
    const STATE_STRING_LITERAL = 4;
    const STATE_QUOTED_STRING = 5;
    const STATE_JSON_LITERAL = 6;
    const STATE_LBRACKET = 7;
    const STATE_PIPE = 8;
    const STATE_LT = 9;
    const STATE_GT = 10;
    const STATE_EQ = 11;
    const STATE_NOT = 12;
    const STATE_AND = 13;

    /** @var array Maps the first character of a token to the lexer state that consumes it */
    private static $transitionTable = [
        '<'  => self::STATE_LT,
        '>'  => self::STATE_GT,
        '='  => self::STATE_EQ,
        '!'  => self::STATE_NOT,
        '['  => self::STATE_LBRACKET,
        '|'  => self::STATE_PIPE,
        '&'  => self::STATE_AND,
        '`'  => self::STATE_JSON_LITERAL,
        '"'  => self::STATE_QUOTED_STRING,
        "'"  => self::STATE_STRING_LITERAL,
        '-'  => self::STATE_NUMBER,
        '0'  => self::STATE_NUMBER,
        '1'  => self::STATE_NUMBER,
        '2'  => self::STATE_NUMBER,
        '3'  => self::STATE_NUMBER,
        '4'  => self::STATE_NUMBER,
        '5'  => self::STATE_NUMBER,
        '6'  => self::STATE_NUMBER,
        '7'  => self::STATE_NUMBER,
        '8'  => self::STATE_NUMBER,
        '9'  => self::STATE_NUMBER,
        ' '  => self::STATE_WHITESPACE,
        "\t" => self::STATE_WHITESPACE,
        "\n" => self::STATE_WHITESPACE,
        "\r" => self::STATE_WHITESPACE,
        '.'  => self::STATE_SINGLE_CHAR,
        '*'  => self::STATE_SINGLE_CHAR,
        ']'  => self::STATE_SINGLE_CHAR,
        ','  => self::STATE_SINGLE_CHAR,
        ':'  => self::STATE_SINGLE_CHAR,
        '@'  => self::STATE_SINGLE_CHAR,
        '('  => self::STATE_SINGLE_CHAR,
        ')'  => self::STATE_SINGLE_CHAR,
        '{'  => self::STATE_SINGLE_CHAR,
        '}'  => self::STATE_SINGLE_CHAR,
        '_'  => self::STATE_IDENTIFIER,
        'A'  => self::STATE_IDENTIFIER,
        'B'  => self::STATE_IDENTIFIER,
        'C'  => self::STATE_IDENTIFIER,
        'D'  => self::STATE_IDENTIFIER,
        'E'  => self::STATE_IDENTIFIER,
        'F'  => self::STATE_IDENTIFIER,
        'G'  => self::STATE_IDENTIFIER,
        'H'  => self::STATE_IDENTIFIER,
        'I'  => self::STATE_IDENTIFIER,
        'J'  => self::STATE_IDENTIFIER,
        'K'  => self::STATE_IDENTIFIER,
        'L'  => self::STATE_IDENTIFIER,
        'M'  => self::STATE_IDENTIFIER,
        'N'  => self::STATE_IDENTIFIER,
        'O'  => self::STATE_IDENTIFIER,
        'P'  => self::STATE_IDENTIFIER,
        'Q'  => self::STATE_IDENTIFIER,
        'R'  => self::STATE_IDENTIFIER,
        'S'  => self::STATE_IDENTIFIER,
        'T'  => self::STATE_IDENTIFIER,
        'U'  => self::STATE_IDENTIFIER,
        'V'  => self::STATE_IDENTIFIER,
        'W'  => self::STATE_IDENTIFIER,
        'X'  => self::STATE_IDENTIFIER,
        'Y'  => self::STATE_IDENTIFIER,
        'Z'  => self::STATE_IDENTIFIER,
        'a'  => self::STATE_IDENTIFIER,
        'b'  => self::STATE_IDENTIFIER,
        'c'  => self::STATE_IDENTIFIER,
        'd'  => self::STATE_IDENTIFIER,
        'e'  => self::STATE_IDENTIFIER,
        'f'  => self::STATE_IDENTIFIER,
        'g'  => self::STATE_IDENTIFIER,
        'h'  => self::STATE_IDENTIFIER,
        'i'  => self::STATE_IDENTIFIER,
        'j'  => self::STATE_IDENTIFIER,
        'k'  => self::STATE_IDENTIFIER,
        'l'  => self::STATE_IDENTIFIER,
        'm'  => self::STATE_IDENTIFIER,
        'n'  => self::STATE_IDENTIFIER,
        'o'  => self::STATE_IDENTIFIER,
        'p'  => self::STATE_IDENTIFIER,
        'q'  => self::STATE_IDENTIFIER,
        'r'  => self::STATE_IDENTIFIER,
        's'  => self::STATE_IDENTIFIER,
        't'  => self::STATE_IDENTIFIER,
        'u'  => self::STATE_IDENTIFIER,
        'v'  => self::STATE_IDENTIFIER,
        'w'  => self::STATE_IDENTIFIER,
        'x'  => self::STATE_IDENTIFIER,
        'y'  => self::STATE_IDENTIFIER,
        'z'  => self::STATE_IDENTIFIER,
    ];

    /** @var array Valid identifier characters after first character */
    private $validIdentifier = [
        'A' => true, 'B' => true, 'C' => true, 'D' => true, 'E' => true,
        'F' => true, 'G' => true, 'H' => true, 'I' => true, 'J' => true,
        'K' => true, 'L' => true, 'M' => true, 'N' => true, 'O' => true,
        'P' => true, 'Q' => true, 'R' => true, 'S' => true, 'T' => true,
        'U' => true, 'V' => true, 'W' => true, 'X' => true, 'Y' => true,
        'Z' => true, 'a' => true, 'b' => true, 'c' => true, 'd' => true,
        'e' => true, 'f' => true, 'g' => true, 'h' => true, 'i' => true,
        'j' => true, 'k' => true, 'l' => true, 'm' => true, 'n' => true,
        'o' => true, 'p' => true, 'q' => true, 'r' => true, 's' => true,
        't' => true, 'u' => true, 'v' => true, 'w' => true, 'x' => true,
        'y' => true, 'z' => true, '_' => true, '0' => true, '1' => true,
        '2' => true, '3' => true, '4' => true, '5' => true, '6' => true,
        '7' => true, '8' => true, '9' => true,
    ];

    /** @var array Valid number characters after the first character */
    private $numbers = [
        '0' => true, '1' => true, '2' => true, '3' => true, '4' => true,
        '5' => true, '6' => true, '7' => true, '8' => true, '9' => true
    ];

    /** @var array Map of simple single character tokens */
    private $simpleTokens = [
        '.' => self::T_DOT,
        '*' => self::T_STAR,
        ']' => self::T_RBRACKET,
        ',' => self::T_COMMA,
        ':' => self::T_COLON,
        '@' => self::T_CURRENT,
        '(' => self::T_LPAREN,
        ')' => self::T_RPAREN,
        '{' => self::T_LBRACE,
        '}' => self::T_RBRACE,
    ];

    /**
     * Tokenize the JMESPath expression into an array of token hashes that
     * each contain a 'type', a 'value', and a 'pos' (the offset at which the
     * token starts in the input).
     *
     * Characters that cannot start a token, unterminated delimited tokens,
     * undecodable JSON and a lone "-" are reported as T_UNKNOWN tokens; the
     * lexer itself does not raise on bad input. The returned list always
     * ends with a T_EOF token.
     *
     * @param string $input JMESPath input
     *
     * @return array
     */
    public function tokenize($input)
    {
        $tokens = [];

        // str_split('') returns [''] on PHP < 8.2, which would produce a
        // bogus empty "unknown" token, so the empty expression is handled
        // explicitly as "no characters".
        $chars = $input === '' ? [] : str_split($input);

        while (false !== ($current = current($chars))) {
            // Every character must be in the transition character table.
            if (!isset(self::$transitionTable[$current])) {
                $tokens[] = [
                    'type'  => self::T_UNKNOWN,
                    'pos'   => key($chars),
                    'value' => $current
                ];
                next($chars);
                continue;
            }

            switch (self::$transitionTable[$current]) {
                case self::STATE_SINGLE_CHAR:
                    // Consume simple tokens like ".", ",", "@", etc.
                    $tokens[] = [
                        'type'  => $this->simpleTokens[$current],
                        'pos'   => key($chars),
                        'value' => $current
                    ];
                    next($chars);
                    break;

                case self::STATE_IDENTIFIER:
                    // Consume identifiers
                    $start = key($chars);
                    $buffer = '';
                    do {
                        $buffer .= $current;
                        $current = next($chars);
                    } while ($current !== false && isset($this->validIdentifier[$current]));
                    $tokens[] = [
                        'type'  => self::T_IDENTIFIER,
                        'value' => $buffer,
                        'pos'   => $start
                    ];
                    break;

                case self::STATE_WHITESPACE:
                    // Skip whitespace
                    next($chars);
                    break;

                case self::STATE_LBRACKET:
                    // Consume "[", "[?", and "[]"
                    $position = key($chars);
                    $actual = next($chars);
                    if ($actual === ']') {
                        next($chars);
                        $tokens[] = [
                            'type'  => self::T_FLATTEN,
                            'pos'   => $position,
                            'value' => '[]'
                        ];
                    } elseif ($actual === '?') {
                        next($chars);
                        $tokens[] = [
                            'type'  => self::T_FILTER,
                            'pos'   => $position,
                            'value' => '[?'
                        ];
                    } else {
                        // Plain "[": the character after it is left for the
                        // next loop iteration.
                        $tokens[] = [
                            'type'  => self::T_LBRACKET,
                            'pos'   => $position,
                            'value' => '['
                        ];
                    }
                    break;

                case self::STATE_STRING_LITERAL:
                    // Consume raw string literals
                    $t = $this->inside($chars, "'", self::T_LITERAL);
                    $t['value'] = str_replace("\\'", "'", $t['value']);
                    $tokens[] = $t;
                    break;

                case self::STATE_PIPE:
                    // Consume pipe and OR
                    $tokens[] = $this->matchOr($chars, '|', '|', self::T_OR, self::T_PIPE);
                    break;

                case self::STATE_JSON_LITERAL:
                    // Consume JSON literals
                    $token = $this->inside($chars, '`', self::T_LITERAL);
                    if ($token['type'] === self::T_LITERAL) {
                        $token['value'] = str_replace('\\`', '`', $token['value']);
                        $token = $this->parseJson($token);
                    }
                    $tokens[] = $token;
                    break;

                case self::STATE_NUMBER:
                    // Consume numbers: an optional leading "-" followed by digits
                    $start = key($chars);
                    $buffer = '';
                    do {
                        $buffer .= $current;
                        $current = next($chars);
                    } while ($current !== false && isset($this->numbers[$current]));
                    if ($buffer === '-') {
                        // A minus sign with no digits is not a number;
                        // (int) '-' would silently turn it into 0.
                        $tokens[] = [
                            'type'  => self::T_UNKNOWN,
                            'value' => $buffer,
                            'pos'   => $start
                        ];
                    } else {
                        $tokens[] = [
                            'type'  => self::T_NUMBER,
                            'value' => (int) $buffer,
                            'pos'   => $start
                        ];
                    }
                    break;

                case self::STATE_QUOTED_STRING:
                    // Consume quoted identifiers. The quotes are restored so
                    // the value can be decoded as a JSON string.
                    $token = $this->inside($chars, '"', self::T_QUOTED_IDENTIFIER);
                    if ($token['type'] === self::T_QUOTED_IDENTIFIER) {
                        $token['value'] = '"' . $token['value'] . '"';
                        $token = $this->parseJson($token);
                    }
                    $tokens[] = $token;
                    break;

                case self::STATE_EQ:
                    // Consume equals; a single "=" is not a valid token
                    $tokens[] = $this->matchOr($chars, '=', '=', self::T_COMPARATOR, self::T_UNKNOWN);
                    break;

                case self::STATE_AND:
                    // Consume "&&" (and) or "&" (expression reference)
                    $tokens[] = $this->matchOr($chars, '&', '&', self::T_AND, self::T_EXPREF);
                    break;

                case self::STATE_NOT:
                    // Consume not equal, or a bare "!" (not)
                    $tokens[] = $this->matchOr($chars, '!', '=', self::T_COMPARATOR, self::T_NOT);
                    break;

                default:
                    // either '<' or '>'
                    // Consume less than and greater than, with optional "="
                    $tokens[] = $this->matchOr($chars, $current, '=', self::T_COMPARATOR, self::T_COMPARATOR);
                    break;
            }
        }

        // NOTE(review): every other 'pos' is an index into the byte array
        // produced by str_split(), while this one counts UTF-8 characters.
        // The two differ for multi-byte input -- confirm which one the
        // consumers of the EOF position expect.
        $tokens[] = [
            'type'  => self::T_EOF,
            'pos'   => mb_strlen($input, 'UTF-8'),
            'value' => null
        ];

        return $tokens;
    }

    /**
     * Returns a token based on whether or not the next character matches the
     * expected value. If it does, a token of "$type" is returned and both
     * characters are consumed. Otherwise, a token of "$orElse" type is
     * returned and only the current character is consumed.
     *
     * The reported 'pos' is always the offset of the first character of the
     * token.
     *
     * @param array  $chars    Array of characters by reference.
     * @param string $current  The current character.
     * @param string $expected Expected character.
     * @param string $type     Expected result type.
     * @param string $orElse   Otherwise return a token of this type.
     *
     * @return array Returns a conditional token.
     */
    private function matchOr(array &$chars, $current, $expected, $type, $orElse)
    {
        // Remember where the token starts before moving the array pointer:
        // once the pointer runs past the last element key() returns null,
        // so the position cannot be derived afterwards.
        $position = key($chars);

        if (next($chars) === $expected) {
            next($chars);
            return [
                'type'  => $type,
                'pos'   => $position,
                'value' => $current . $expected
            ];
        }

        return [
            'type'  => $orElse,
            'pos'   => $position,
            'value' => $current
        ];
    }

    /**
     * Returns a token that is the result of consuming inside of delimiter
     * characters. A backslash and the character following it are copied to
     * the value verbatim (so an escaped delimiter does not close the token);
     * unescaping is left to the caller. If the token is not closed,
     * "unknown" is returned.
     *
     * @param array  $chars Array of characters by reference.
     * @param string $delim The delimiter character.
     * @param string $type  Token type.
     *
     * @return array Returns the consumed token.
     */
    private function inside(array &$chars, $delim, $type)
    {
        $position = key($chars);
        $current = next($chars);
        $buffer = '';

        while ($current !== $delim) {
            if ($current === '\\') {
                // Keep the backslash and step to the escaped character so
                // that it is appended below without being tested as the
                // closing delimiter.
                $buffer .= '\\';
                $current = next($chars);
            }
            if ($current === false) {
                // Unclosed delimiter
                return [
                    'type'  => self::T_UNKNOWN,
                    'value' => $buffer,
                    'pos'   => $position
                ];
            }
            $buffer .= $current;
            $current = next($chars);
        }

        // Step past the closing delimiter.
        next($chars);

        return ['type' => $type, 'value' => $buffer, 'pos' => $position];
    }

    /**
     * Parses a JSON token or sets the token type to "unknown" on error.
     *
     * @param array $token Token that needs parsing.
     *
     * @return array Returns a token with a parsed value.
     */
    private function parseJson(array $token)
    {
        $value = json_decode($token['value'], true);

        if (json_last_error() !== JSON_ERROR_NONE) {
            // Legacy support for elided quotes. Try to parse again by adding
            // quotes around the bad input value.
            $value = json_decode('"' . $token['value'] . '"', true);
            if (json_last_error() !== JSON_ERROR_NONE) {
                $token['type'] = self::T_UNKNOWN;
                return $token;
            }
        }

        $token['value'] = $value;

        return $token;
    }
}