123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444 |
- <?php
- namespace JmesPath;
/**
 * Tokenizes JMESPath expressions.
 *
 * The lexer walks the expression one byte at a time. The first byte of each
 * token selects a scanning state through a lookup table, and the state decides
 * how many further bytes belong to the token. Input that cannot be tokenized
 * is reported as a token of type T_UNKNOWN instead of raising an error here.
 */
class Lexer
{
    const T_DOT = 'dot';
    const T_STAR = 'star';
    const T_COMMA = 'comma';
    const T_COLON = 'colon';
    const T_CURRENT = 'current';
    const T_EXPREF = 'expref';
    const T_LPAREN = 'lparen';
    const T_RPAREN = 'rparen';
    const T_LBRACE = 'lbrace';
    const T_RBRACE = 'rbrace';
    const T_LBRACKET = 'lbracket';
    const T_RBRACKET = 'rbracket';
    const T_FLATTEN = 'flatten';
    const T_IDENTIFIER = 'identifier';
    const T_NUMBER = 'number';
    const T_QUOTED_IDENTIFIER = 'quoted_identifier';
    const T_UNKNOWN = 'unknown';
    const T_PIPE = 'pipe';
    const T_OR = 'or';
    const T_AND = 'and';
    const T_NOT = 'not';
    const T_FILTER = 'filter';
    const T_LITERAL = 'literal';
    const T_EOF = 'eof';
    const T_COMPARATOR = 'comparator';

    const STATE_IDENTIFIER = 0;
    const STATE_NUMBER = 1;
    const STATE_SINGLE_CHAR = 2;
    const STATE_WHITESPACE = 3;
    const STATE_STRING_LITERAL = 4;
    const STATE_QUOTED_STRING = 5;
    const STATE_JSON_LITERAL = 6;
    const STATE_LBRACKET = 7;
    const STATE_PIPE = 8;
    const STATE_LT = 9;
    const STATE_GT = 10;
    const STATE_EQ = 11;
    const STATE_NOT = 12;
    const STATE_AND = 13;

    /** @var array Maps the first character of a token to its scanning state */
    private static $transitionTable = [
        '<'  => self::STATE_LT,
        '>'  => self::STATE_GT,
        '='  => self::STATE_EQ,
        '!'  => self::STATE_NOT,
        '['  => self::STATE_LBRACKET,
        '|'  => self::STATE_PIPE,
        '&'  => self::STATE_AND,
        '`'  => self::STATE_JSON_LITERAL,
        '"'  => self::STATE_QUOTED_STRING,
        "'"  => self::STATE_STRING_LITERAL,
        '-'  => self::STATE_NUMBER,
        '0'  => self::STATE_NUMBER,
        '1'  => self::STATE_NUMBER,
        '2'  => self::STATE_NUMBER,
        '3'  => self::STATE_NUMBER,
        '4'  => self::STATE_NUMBER,
        '5'  => self::STATE_NUMBER,
        '6'  => self::STATE_NUMBER,
        '7'  => self::STATE_NUMBER,
        '8'  => self::STATE_NUMBER,
        '9'  => self::STATE_NUMBER,
        ' '  => self::STATE_WHITESPACE,
        "\t" => self::STATE_WHITESPACE,
        "\n" => self::STATE_WHITESPACE,
        "\r" => self::STATE_WHITESPACE,
        '.'  => self::STATE_SINGLE_CHAR,
        '*'  => self::STATE_SINGLE_CHAR,
        ']'  => self::STATE_SINGLE_CHAR,
        ','  => self::STATE_SINGLE_CHAR,
        ':'  => self::STATE_SINGLE_CHAR,
        '@'  => self::STATE_SINGLE_CHAR,
        '('  => self::STATE_SINGLE_CHAR,
        ')'  => self::STATE_SINGLE_CHAR,
        '{'  => self::STATE_SINGLE_CHAR,
        '}'  => self::STATE_SINGLE_CHAR,
        '_'  => self::STATE_IDENTIFIER,
        'A'  => self::STATE_IDENTIFIER,
        'B'  => self::STATE_IDENTIFIER,
        'C'  => self::STATE_IDENTIFIER,
        'D'  => self::STATE_IDENTIFIER,
        'E'  => self::STATE_IDENTIFIER,
        'F'  => self::STATE_IDENTIFIER,
        'G'  => self::STATE_IDENTIFIER,
        'H'  => self::STATE_IDENTIFIER,
        'I'  => self::STATE_IDENTIFIER,
        'J'  => self::STATE_IDENTIFIER,
        'K'  => self::STATE_IDENTIFIER,
        'L'  => self::STATE_IDENTIFIER,
        'M'  => self::STATE_IDENTIFIER,
        'N'  => self::STATE_IDENTIFIER,
        'O'  => self::STATE_IDENTIFIER,
        'P'  => self::STATE_IDENTIFIER,
        'Q'  => self::STATE_IDENTIFIER,
        'R'  => self::STATE_IDENTIFIER,
        'S'  => self::STATE_IDENTIFIER,
        'T'  => self::STATE_IDENTIFIER,
        'U'  => self::STATE_IDENTIFIER,
        'V'  => self::STATE_IDENTIFIER,
        'W'  => self::STATE_IDENTIFIER,
        'X'  => self::STATE_IDENTIFIER,
        'Y'  => self::STATE_IDENTIFIER,
        'Z'  => self::STATE_IDENTIFIER,
        'a'  => self::STATE_IDENTIFIER,
        'b'  => self::STATE_IDENTIFIER,
        'c'  => self::STATE_IDENTIFIER,
        'd'  => self::STATE_IDENTIFIER,
        'e'  => self::STATE_IDENTIFIER,
        'f'  => self::STATE_IDENTIFIER,
        'g'  => self::STATE_IDENTIFIER,
        'h'  => self::STATE_IDENTIFIER,
        'i'  => self::STATE_IDENTIFIER,
        'j'  => self::STATE_IDENTIFIER,
        'k'  => self::STATE_IDENTIFIER,
        'l'  => self::STATE_IDENTIFIER,
        'm'  => self::STATE_IDENTIFIER,
        'n'  => self::STATE_IDENTIFIER,
        'o'  => self::STATE_IDENTIFIER,
        'p'  => self::STATE_IDENTIFIER,
        'q'  => self::STATE_IDENTIFIER,
        'r'  => self::STATE_IDENTIFIER,
        's'  => self::STATE_IDENTIFIER,
        't'  => self::STATE_IDENTIFIER,
        'u'  => self::STATE_IDENTIFIER,
        'v'  => self::STATE_IDENTIFIER,
        'w'  => self::STATE_IDENTIFIER,
        'x'  => self::STATE_IDENTIFIER,
        'y'  => self::STATE_IDENTIFIER,
        'z'  => self::STATE_IDENTIFIER,
    ];

    /** @var array Valid identifier characters after first character */
    private $validIdentifier = [
        'A' => true, 'B' => true, 'C' => true, 'D' => true, 'E' => true,
        'F' => true, 'G' => true, 'H' => true, 'I' => true, 'J' => true,
        'K' => true, 'L' => true, 'M' => true, 'N' => true, 'O' => true,
        'P' => true, 'Q' => true, 'R' => true, 'S' => true, 'T' => true,
        'U' => true, 'V' => true, 'W' => true, 'X' => true, 'Y' => true,
        'Z' => true, 'a' => true, 'b' => true, 'c' => true, 'd' => true,
        'e' => true, 'f' => true, 'g' => true, 'h' => true, 'i' => true,
        'j' => true, 'k' => true, 'l' => true, 'm' => true, 'n' => true,
        'o' => true, 'p' => true, 'q' => true, 'r' => true, 's' => true,
        't' => true, 'u' => true, 'v' => true, 'w' => true, 'x' => true,
        'y' => true, 'z' => true, '_' => true, '0' => true, '1' => true,
        '2' => true, '3' => true, '4' => true, '5' => true, '6' => true,
        '7' => true, '8' => true, '9' => true,
    ];

    /** @var array Valid number characters after the first character */
    private $numbers = [
        '0' => true, '1' => true, '2' => true, '3' => true, '4' => true,
        '5' => true, '6' => true, '7' => true, '8' => true, '9' => true
    ];

    /** @var array Map of simple single character tokens */
    private $simpleTokens = [
        '.' => self::T_DOT,
        '*' => self::T_STAR,
        ']' => self::T_RBRACKET,
        ',' => self::T_COMMA,
        ':' => self::T_COLON,
        '@' => self::T_CURRENT,
        '(' => self::T_LPAREN,
        ')' => self::T_RPAREN,
        '{' => self::T_LBRACE,
        '}' => self::T_RBRACE,
    ];

    /**
     * Tokenize the JMESPath expression into an array of token hashes that
     * each contain a 'type', 'value', and 'pos' entry.
     *
     * 'pos' is the byte offset at which the token starts. Input that cannot
     * be tokenized (unexpected characters, unterminated delimiters, invalid
     * JSON, a "-" with no digits) is returned as a token of type T_UNKNOWN.
     * The final token is always T_EOF.
     *
     * @param string $input JMESPath input
     *
     * @return array
     */
    public function tokenize($input)
    {
        $tokens = [];

        // str_split('') returns [''] on PHP versions before 8.2, which would
        // yield a bogus unknown token, so an empty expression skips the scan.
        $chars = $input === '' ? [] : str_split($input);

        while (false !== ($current = current($chars))) {
            // Every character must be in the transition character table.
            if (!isset(self::$transitionTable[$current])) {
                $tokens[] = [
                    'type'  => self::T_UNKNOWN,
                    'pos'   => key($chars),
                    'value' => $current
                ];
                next($chars);
                continue;
            }

            switch (self::$transitionTable[$current]) {
                case self::STATE_SINGLE_CHAR:
                    // Consume simple tokens like ".", ",", "@", etc.
                    $tokens[] = [
                        'type'  => $this->simpleTokens[$current],
                        'pos'   => key($chars),
                        'value' => $current
                    ];
                    next($chars);
                    break;

                case self::STATE_IDENTIFIER:
                    // Consume identifiers
                    $start = key($chars);
                    $tokens[] = [
                        'type'  => self::T_IDENTIFIER,
                        'value' => $this->consumeRun($chars, $this->validIdentifier),
                        'pos'   => $start
                    ];
                    break;

                case self::STATE_WHITESPACE:
                    // Skip whitespace
                    next($chars);
                    break;

                case self::STATE_LBRACKET:
                    // Consume "[", "[?", and "[]"
                    $position = key($chars);
                    $actual = next($chars);
                    if ($actual === ']') {
                        next($chars);
                        $tokens[] = [
                            'type'  => self::T_FLATTEN,
                            'pos'   => $position,
                            'value' => '[]'
                        ];
                    } elseif ($actual === '?') {
                        next($chars);
                        $tokens[] = [
                            'type'  => self::T_FILTER,
                            'pos'   => $position,
                            'value' => '[?'
                        ];
                    } else {
                        // Leave the lookahead character for the next pass.
                        $tokens[] = [
                            'type'  => self::T_LBRACKET,
                            'pos'   => $position,
                            'value' => '['
                        ];
                    }
                    break;

                case self::STATE_STRING_LITERAL:
                    // Consume raw string literals; only \' is an escape.
                    $t = $this->inside($chars, "'", self::T_LITERAL);
                    $t['value'] = str_replace("\\'", "'", $t['value']);
                    $tokens[] = $t;
                    break;

                case self::STATE_PIPE:
                    // Consume pipe and OR
                    $tokens[] = $this->matchOr($chars, '|', '|', self::T_OR, self::T_PIPE);
                    break;

                case self::STATE_JSON_LITERAL:
                    // Consume JSON literals
                    $token = $this->inside($chars, '`', self::T_LITERAL);
                    if ($token['type'] === self::T_LITERAL) {
                        $token['value'] = str_replace('\\`', '`', $token['value']);
                        $token = $this->parseJson($token);
                    }
                    $tokens[] = $token;
                    break;

                case self::STATE_NUMBER:
                    // Consume numbers: an optional leading "-" then digits.
                    $start = key($chars);
                    $buffer = $this->consumeRun($chars, $this->numbers);
                    if ($buffer === '-') {
                        // A sign with no digits is not a number; casting it
                        // to int would silently produce 0.
                        $tokens[] = [
                            'type'  => self::T_UNKNOWN,
                            'value' => $buffer,
                            'pos'   => $start
                        ];
                    } else {
                        $tokens[] = [
                            'type'  => self::T_NUMBER,
                            'value' => (int) $buffer,
                            'pos'   => $start
                        ];
                    }
                    break;

                case self::STATE_QUOTED_STRING:
                    // Consume quoted identifiers; the quotes are restored so
                    // the value can be decoded as a JSON string.
                    $token = $this->inside($chars, '"', self::T_QUOTED_IDENTIFIER);
                    if ($token['type'] === self::T_QUOTED_IDENTIFIER) {
                        $token['value'] = '"' . $token['value'] . '"';
                        $token = $this->parseJson($token);
                    }
                    $tokens[] = $token;
                    break;

                case self::STATE_EQ:
                    // Consume equals; a single "=" is not a valid token.
                    $tokens[] = $this->matchOr($chars, '=', '=', self::T_COMPARATOR, self::T_UNKNOWN);
                    break;

                case self::STATE_AND:
                    // Consume "&&" (and) or "&" (expression reference)
                    $tokens[] = $this->matchOr($chars, '&', '&', self::T_AND, self::T_EXPREF);
                    break;

                case self::STATE_NOT:
                    // Consume not equal or logical not
                    $tokens[] = $this->matchOr($chars, '!', '=', self::T_COMPARATOR, self::T_NOT);
                    break;

                default:
                    // either '<' or '>'
                    // Consume less than and greater than
                    $tokens[] = $this->matchOr($chars, $current, '=', self::T_COMPARATOR, self::T_COMPARATOR);
                    break;
            }
        }

        // NOTE: this position is a UTF-8 character count whereas every other
        // token position is a byte offset; the two agree for ASCII input.
        $tokens[] = [
            'type'  => self::T_EOF,
            'pos'   => mb_strlen($input, 'UTF-8'),
            'value' => null
        ];

        return $tokens;
    }

    /**
     * Consumes the current character plus every following character that is
     * a key of $allowed, leaving the array pointer on the first character
     * that is not part of the run (or past the end of the input).
     *
     * The current character is taken unconditionally because the caller has
     * already classified it through the transition table.
     *
     * @param array $chars   Array of characters by reference.
     * @param array $allowed Set of characters (as keys) that extend the run.
     *
     * @return string Returns the consumed characters.
     */
    private function consumeRun(array &$chars, array $allowed)
    {
        $buffer = '';
        $current = current($chars);
        do {
            $buffer .= $current;
            $current = next($chars);
        } while ($current !== false && isset($allowed[$current]));

        return $buffer;
    }

    /**
     * Returns a token based on whether or not the next character matches the
     * expected value. If it does, a two character token of "$type" is
     * returned. Otherwise, a single character token of "$orElse" type is
     * returned and the lookahead character is left unconsumed.
     *
     * The token position is always the offset of the first character.
     *
     * @param array  $chars    Array of characters by reference.
     * @param string $current  The current character.
     * @param string $expected Expected character.
     * @param string $type     Expected result type.
     * @param string $orElse   Otherwise return a token of this type.
     *
     * @return array Returns a conditional token.
     */
    private function matchOr(array &$chars, $current, $expected, $type, $orElse)
    {
        // Record the offset before advancing: once the pointer runs past the
        // end of the input key() returns null and can no longer be used.
        $position = key($chars);

        if (next($chars) === $expected) {
            next($chars);
            return [
                'type'  => $type,
                'pos'   => $position,
                'value' => $current . $expected
            ];
        }

        return [
            'type'  => $orElse,
            'pos'   => $position,
            'value' => $current
        ];
    }

    /**
     * Returns a token that is the result of consuming the characters between
     * a pair of delimiter characters. The pointer must be on the opening
     * delimiter; on success it is left just after the closing delimiter.
     *
     * A backslash and the character after it are copied to the value
     * verbatim, so an escaped delimiter does not end the token. Removing the
     * escape is left to the caller. If the token is not closed, a token of
     * type "unknown" holding the text read so far is returned.
     *
     * @param array  $chars Array of characters by reference.
     * @param string $delim The delimiter character.
     * @param string $type  Token type.
     *
     * @return array Returns the consumed token.
     */
    private function inside(array &$chars, $delim, $type)
    {
        $position = key($chars);
        $current = next($chars);
        $buffer = '';

        while ($current !== $delim) {
            if ($current === '\\') {
                // Keep the backslash and take the next character blindly.
                $buffer .= '\\';
                $current = next($chars);
            }
            if ($current === false) {
                // Unclosed delimiter
                return [
                    'type'  => self::T_UNKNOWN,
                    'value' => $buffer,
                    'pos'   => $position
                ];
            }
            $buffer .= $current;
            $current = next($chars);
        }

        // Step over the closing delimiter.
        next($chars);

        return ['type' => $type, 'value' => $buffer, 'pos' => $position];
    }

    /**
     * Parses a JSON token or sets the token type to "unknown" on error.
     *
     * @param array $token Token that needs parsing.
     *
     * @return array Returns a token with a parsed value.
     */
    private function parseJson(array $token)
    {
        $value = json_decode($token['value'], true);

        if (json_last_error() !== JSON_ERROR_NONE) {
            // Legacy support for elided quotes. Try to parse again by adding
            // quotes around the bad input value.
            $value = json_decode('"' . $token['value'] . '"', true);
            if (json_last_error() !== JSON_ERROR_NONE) {
                $token['type'] = self::T_UNKNOWN;
                return $token;
            }
        }

        $token['value'] = $value;

        return $token;
    }
}
|