Lexer.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444
  1. <?php
  2. namespace JmesPath;
  /**
   * Tokenizes JMESPath expressions.
   *
   * The expression is split into characters and scanned from left to right.
   * The first character of every token selects a scanner state through
   * self::$transitionTable. Characters that are not in that table, as well as
   * malformed tokens (unclosed delimiters, undecodable JSON, a stray "=" or
   * "-"), are reported as T_UNKNOWN tokens; this class never throws.
   */
  class Lexer
  {
      const T_DOT = 'dot';
      const T_STAR = 'star';
      const T_COMMA = 'comma';
      const T_COLON = 'colon';
      const T_CURRENT = 'current';
      const T_EXPREF = 'expref';
      const T_LPAREN = 'lparen';
      const T_RPAREN = 'rparen';
      const T_LBRACE = 'lbrace';
      const T_RBRACE = 'rbrace';
      const T_LBRACKET = 'lbracket';
      const T_RBRACKET = 'rbracket';
      const T_FLATTEN = 'flatten';
      const T_IDENTIFIER = 'identifier';
      const T_NUMBER = 'number';
      const T_QUOTED_IDENTIFIER = 'quoted_identifier';
      const T_UNKNOWN = 'unknown';
      const T_PIPE = 'pipe';
      const T_OR = 'or';
      const T_AND = 'and';
      const T_NOT = 'not';
      const T_FILTER = 'filter';
      const T_LITERAL = 'literal';
      const T_EOF = 'eof';
      const T_COMPARATOR = 'comparator';

      const STATE_IDENTIFIER = 0;
      const STATE_NUMBER = 1;
      const STATE_SINGLE_CHAR = 2;
      const STATE_WHITESPACE = 3;
      const STATE_STRING_LITERAL = 4;
      const STATE_QUOTED_STRING = 5;
      const STATE_JSON_LITERAL = 6;
      const STATE_LBRACKET = 7;
      const STATE_PIPE = 8;
      const STATE_LT = 9;
      const STATE_GT = 10;
      const STATE_EQ = 11;
      const STATE_NOT = 12;
      const STATE_AND = 13;

      /** @var array Maps the first character of a token to the scanner state that consumes it */
      private static $transitionTable = [
          '<' => self::STATE_LT,
          '>' => self::STATE_GT,
          '=' => self::STATE_EQ,
          '!' => self::STATE_NOT,
          '[' => self::STATE_LBRACKET,
          '|' => self::STATE_PIPE,
          '&' => self::STATE_AND,
          '`' => self::STATE_JSON_LITERAL,
          '"' => self::STATE_QUOTED_STRING,
          "'" => self::STATE_STRING_LITERAL,
          '-' => self::STATE_NUMBER,
          '0' => self::STATE_NUMBER, '1' => self::STATE_NUMBER,
          '2' => self::STATE_NUMBER, '3' => self::STATE_NUMBER,
          '4' => self::STATE_NUMBER, '5' => self::STATE_NUMBER,
          '6' => self::STATE_NUMBER, '7' => self::STATE_NUMBER,
          '8' => self::STATE_NUMBER, '9' => self::STATE_NUMBER,
          ' ' => self::STATE_WHITESPACE,
          "\t" => self::STATE_WHITESPACE,
          "\n" => self::STATE_WHITESPACE,
          "\r" => self::STATE_WHITESPACE,
          '.' => self::STATE_SINGLE_CHAR,
          '*' => self::STATE_SINGLE_CHAR,
          ']' => self::STATE_SINGLE_CHAR,
          ',' => self::STATE_SINGLE_CHAR,
          ':' => self::STATE_SINGLE_CHAR,
          '@' => self::STATE_SINGLE_CHAR,
          '(' => self::STATE_SINGLE_CHAR,
          ')' => self::STATE_SINGLE_CHAR,
          '{' => self::STATE_SINGLE_CHAR,
          '}' => self::STATE_SINGLE_CHAR,
          '_' => self::STATE_IDENTIFIER,
          'A' => self::STATE_IDENTIFIER, 'B' => self::STATE_IDENTIFIER,
          'C' => self::STATE_IDENTIFIER, 'D' => self::STATE_IDENTIFIER,
          'E' => self::STATE_IDENTIFIER, 'F' => self::STATE_IDENTIFIER,
          'G' => self::STATE_IDENTIFIER, 'H' => self::STATE_IDENTIFIER,
          'I' => self::STATE_IDENTIFIER, 'J' => self::STATE_IDENTIFIER,
          'K' => self::STATE_IDENTIFIER, 'L' => self::STATE_IDENTIFIER,
          'M' => self::STATE_IDENTIFIER, 'N' => self::STATE_IDENTIFIER,
          'O' => self::STATE_IDENTIFIER, 'P' => self::STATE_IDENTIFIER,
          'Q' => self::STATE_IDENTIFIER, 'R' => self::STATE_IDENTIFIER,
          'S' => self::STATE_IDENTIFIER, 'T' => self::STATE_IDENTIFIER,
          'U' => self::STATE_IDENTIFIER, 'V' => self::STATE_IDENTIFIER,
          'W' => self::STATE_IDENTIFIER, 'X' => self::STATE_IDENTIFIER,
          'Y' => self::STATE_IDENTIFIER, 'Z' => self::STATE_IDENTIFIER,
          'a' => self::STATE_IDENTIFIER, 'b' => self::STATE_IDENTIFIER,
          'c' => self::STATE_IDENTIFIER, 'd' => self::STATE_IDENTIFIER,
          'e' => self::STATE_IDENTIFIER, 'f' => self::STATE_IDENTIFIER,
          'g' => self::STATE_IDENTIFIER, 'h' => self::STATE_IDENTIFIER,
          'i' => self::STATE_IDENTIFIER, 'j' => self::STATE_IDENTIFIER,
          'k' => self::STATE_IDENTIFIER, 'l' => self::STATE_IDENTIFIER,
          'm' => self::STATE_IDENTIFIER, 'n' => self::STATE_IDENTIFIER,
          'o' => self::STATE_IDENTIFIER, 'p' => self::STATE_IDENTIFIER,
          'q' => self::STATE_IDENTIFIER, 'r' => self::STATE_IDENTIFIER,
          's' => self::STATE_IDENTIFIER, 't' => self::STATE_IDENTIFIER,
          'u' => self::STATE_IDENTIFIER, 'v' => self::STATE_IDENTIFIER,
          'w' => self::STATE_IDENTIFIER, 'x' => self::STATE_IDENTIFIER,
          'y' => self::STATE_IDENTIFIER, 'z' => self::STATE_IDENTIFIER,
      ];

      /** @var array Valid identifier characters after first character */
      private $validIdentifier = [
          'A' => true, 'B' => true, 'C' => true, 'D' => true, 'E' => true,
          'F' => true, 'G' => true, 'H' => true, 'I' => true, 'J' => true,
          'K' => true, 'L' => true, 'M' => true, 'N' => true, 'O' => true,
          'P' => true, 'Q' => true, 'R' => true, 'S' => true, 'T' => true,
          'U' => true, 'V' => true, 'W' => true, 'X' => true, 'Y' => true,
          'Z' => true, 'a' => true, 'b' => true, 'c' => true, 'd' => true,
          'e' => true, 'f' => true, 'g' => true, 'h' => true, 'i' => true,
          'j' => true, 'k' => true, 'l' => true, 'm' => true, 'n' => true,
          'o' => true, 'p' => true, 'q' => true, 'r' => true, 's' => true,
          't' => true, 'u' => true, 'v' => true, 'w' => true, 'x' => true,
          'y' => true, 'z' => true, '_' => true, '0' => true, '1' => true,
          '2' => true, '3' => true, '4' => true, '5' => true, '6' => true,
          '7' => true, '8' => true, '9' => true,
      ];

      /** @var array Valid number characters after the first character */
      private $numbers = [
          '0' => true, '1' => true, '2' => true, '3' => true, '4' => true,
          '5' => true, '6' => true, '7' => true, '8' => true, '9' => true
      ];

      /** @var array Map of simple single character tokens */
      private $simpleTokens = [
          '.' => self::T_DOT,
          '*' => self::T_STAR,
          ']' => self::T_RBRACKET,
          ',' => self::T_COMMA,
          ':' => self::T_COLON,
          '@' => self::T_CURRENT,
          '(' => self::T_LPAREN,
          ')' => self::T_RPAREN,
          '{' => self::T_LBRACE,
          '}' => self::T_RBRACE,
      ];

      /**
       * Tokenize the JMESPath expression into a list of token arrays that
       * each contain a 'type', a 'value' and a 'pos' (the offset of the
       * token's first character in $input).
       *
       * Offsets count UTF-8 characters. When $input is not valid UTF-8 it is
       * scanned byte by byte instead and offsets count bytes. The list always
       * ends with a T_EOF token whose 'pos' is one past the last character.
       * Nothing is thrown from here: input that cannot be tokenized is emitted
       * as T_UNKNOWN tokens for the caller to reject.
       *
       * @param string $input JMESPath input
       *
       * @return array
       */
      public function tokenize($input)
      {
          // Split on character boundaries so every 'pos' (including EOF) is
          // measured in the same unit. An empty input yields an empty list.
          $chars = preg_split('//u', $input, -1, PREG_SPLIT_NO_EMPTY);
          if ($chars === false) {
              // Invalid UTF-8: degrade to a byte-wise scan. str_split('') can
              // return [''] on older PHP versions, hence the explicit guard.
              $chars = $input === '' ? [] : str_split($input);
          }

          $tokens = [];

          while (false !== ($current = current($chars))) {
              // Every character must be in the transition character table.
              if (!isset(self::$transitionTable[$current])) {
                  $tokens[] = [
                      'type' => self::T_UNKNOWN,
                      'pos' => key($chars),
                      'value' => $current
                  ];
                  next($chars);
                  continue;
              }

              switch (self::$transitionTable[$current]) {
                  case self::STATE_SINGLE_CHAR:
                      // Consume simple tokens like ".", ",", "@", etc.
                      $tokens[] = [
                          'type' => $this->simpleTokens[$current],
                          'pos' => key($chars),
                          'value' => $current
                      ];
                      next($chars);
                      break;

                  case self::STATE_IDENTIFIER:
                      // Consume identifiers
                      $start = key($chars);
                      $tokens[] = [
                          'type' => self::T_IDENTIFIER,
                          'value' => $this->consumeWhile($chars, $this->validIdentifier),
                          'pos' => $start
                      ];
                      break;

                  case self::STATE_WHITESPACE:
                      // Skip whitespace
                      next($chars);
                      break;

                  case self::STATE_LBRACKET:
                      // Consume "[", "[?", and "[]"
                      $tokens[] = $this->matchBracket($chars);
                      break;

                  case self::STATE_STRING_LITERAL:
                      // Consume raw string literals; only \' is an escape.
                      $token = $this->inside($chars, "'", self::T_LITERAL);
                      $token['value'] = str_replace("\\'", "'", $token['value']);
                      $tokens[] = $token;
                      break;

                  case self::STATE_PIPE:
                      // Consume pipe and OR
                      $tokens[] = $this->matchOr($chars, '|', '|', self::T_OR, self::T_PIPE);
                      break;

                  case self::STATE_JSON_LITERAL:
                      // Consume JSON literals
                      $token = $this->inside($chars, '`', self::T_LITERAL);
                      if ($token['type'] === self::T_LITERAL) {
                          $token['value'] = str_replace('\\`', '`', $token['value']);
                          $token = $this->parseJson($token);
                      }
                      $tokens[] = $token;
                      break;

                  case self::STATE_NUMBER:
                      // Consume numbers: an optional "-" followed by digits.
                      $start = key($chars);
                      $digits = $this->consumeWhile($chars, $this->numbers);
                      // A "-" with no digits after it is not a number; report
                      // it instead of silently turning it into 0.
                      $isNumber = $digits !== '-';
                      $tokens[] = [
                          'type' => $isNumber ? self::T_NUMBER : self::T_UNKNOWN,
                          'value' => $isNumber ? (int) $digits : $digits,
                          'pos' => $start
                      ];
                      break;

                  case self::STATE_QUOTED_STRING:
                      // Consume quoted identifiers; the quotes are put back so
                      // the value can be decoded as a JSON string.
                      $token = $this->inside($chars, '"', self::T_QUOTED_IDENTIFIER);
                      if ($token['type'] === self::T_QUOTED_IDENTIFIER) {
                          $token['value'] = '"' . $token['value'] . '"';
                          $token = $this->parseJson($token);
                      }
                      $tokens[] = $token;
                      break;

                  case self::STATE_EQ:
                      // Consume equals; a single "=" is not a valid token.
                      $tokens[] = $this->matchOr($chars, '=', '=', self::T_COMPARATOR, self::T_UNKNOWN);
                      break;

                  case self::STATE_AND:
                      // Consume "&&" (and) or "&" (expression reference)
                      $tokens[] = $this->matchOr($chars, '&', '&', self::T_AND, self::T_EXPREF);
                      break;

                  case self::STATE_NOT:
                      // Consume not equal or a bare "!"
                      $tokens[] = $this->matchOr($chars, '!', '=', self::T_COMPARATOR, self::T_NOT);
                      break;

                  default:
                      // either '<' or '>'
                      // Consume less than and greater than
                      $tokens[] = $this->matchOr($chars, $current, '=', self::T_COMPARATOR, self::T_COMPARATOR);
                      break;
              }
          }

          $tokens[] = [
              'type' => self::T_EOF,
              // Same unit as every other 'pos': one past the last character.
              'pos' => count($chars),
              'value' => null
          ];

          return $tokens;
      }

      /**
       * Consumes the character under the cursor unconditionally, then keeps
       * consuming while the following characters are keys of $allowed.
       *
       * The cursor is left on the first character that was not consumed.
       *
       * @param array $chars   Array of characters by reference.
       * @param array $allowed Set of characters (as keys) that may follow.
       *
       * @return string Returns the consumed characters.
       */
      private function consumeWhile(array &$chars, array $allowed)
      {
          $buffer = '';
          $current = current($chars);
          do {
              $buffer .= $current;
              $current = next($chars);
          } while ($current !== false && isset($allowed[$current]));

          return $buffer;
      }

      /**
       * Consumes "[" and, when it is immediately followed by "]" or "?", the
       * second character as well.
       *
       * @param array $chars Array of characters by reference.
       *
       * @return array Returns a flatten ("[]"), filter ("[?") or lbracket token.
       */
      private function matchBracket(array &$chars)
      {
          $position = key($chars);
          $following = next($chars);

          if ($following === ']') {
              next($chars);
              return ['type' => self::T_FLATTEN, 'pos' => $position, 'value' => '[]'];
          }

          if ($following === '?') {
              next($chars);
              return ['type' => self::T_FILTER, 'pos' => $position, 'value' => '[?'];
          }

          return ['type' => self::T_LBRACKET, 'pos' => $position, 'value' => '['];
      }

      /**
       * Returns a token based on whether or not the next token matches the
       * expected value. If it does, a token of "$type" is returned. Otherwise,
       * a token of "$orElse" type is returned.
       *
       * The returned 'pos' is always the offset of $current, i.e. the first
       * character of the token, including when the token ends the input.
       *
       * @param array  $chars    Array of characters by reference.
       * @param string $current  The current character.
       * @param string $expected Expected character.
       * @param string $type     Expected result type.
       * @param string $orElse   Otherwise return a token of this type.
       *
       * @return array Returns a conditional token.
       */
      private function matchOr(array &$chars, $current, $expected, $type, $orElse)
      {
          // Record the offset before moving the cursor: key() is null once
          // the cursor runs off the end, and after a two-character match it
          // no longer points next to the token's first character.
          $position = key($chars);

          if (next($chars) === $expected) {
              next($chars);
              return [
                  'type' => $type,
                  'pos' => $position,
                  'value' => $current . $expected
              ];
          }

          return [
              'type' => $orElse,
              'pos' => $position,
              'value' => $current
          ];
      }

      /**
       * Returns a token that is the result of consuming inside of delimiter
       * characters. A backslash and the character after it are copied to the
       * value verbatim, so an escaped delimiter does not close the token;
       * removing or decoding the escapes is left to the caller. If the token
       * is not closed, "unknown" is returned.
       *
       * @param array  $chars Array of characters by reference.
       * @param string $delim The delimiter character.
       * @param string $type  Token type.
       *
       * @return array Returns the consumed token.
       */
      private function inside(array &$chars, $delim, $type)
      {
          $position = key($chars);
          $current = next($chars);
          $buffer = '';

          while ($current !== $delim) {
              if ($current === '\\') {
                  // Keep the backslash and step onto the escaped character.
                  $buffer .= '\\';
                  $current = next($chars);
              }
              if ($current === false) {
                  // Unclosed delimiter
                  return [
                      'type' => self::T_UNKNOWN,
                      'value' => $buffer,
                      'pos' => $position
                  ];
              }
              $buffer .= $current;
              $current = next($chars);
          }

          // Step past the closing delimiter.
          next($chars);

          return ['type' => $type, 'value' => $buffer, 'pos' => $position];
      }

      /**
       * Parses a JSON token or sets the token type to "unknown" on error.
       *
       * @param array $token Token that needs parsing.
       *
       * @return array Returns a token with a parsed value.
       */
      private function parseJson(array $token)
      {
          $value = json_decode($token['value'], true);

          if (json_last_error() !== JSON_ERROR_NONE) {
              // Legacy support for elided quotes. Try to parse again by adding
              // quotes around the bad input value.
              $value = json_decode('"' . $token['value'] . '"', true);
              if (json_last_error() !== JSON_ERROR_NONE) {
                  $token['type'] = self::T_UNKNOWN;
                  return $token;
              }
          }

          $token['value'] = $value;

          return $token;
      }
  }