<?php

namespace JmesPath;

/**
 * Tokenizes JMESPath expressions.
 *
 * The expression is split into single bytes with str_split() and consumed
 * left to right. The first byte of every token selects a lexer state from
 * the transition table; the state then decides how many further bytes
 * belong to the token. All 'pos' values are therefore byte offsets into the
 * original expression.
 */
class Lexer
{
    const T_DOT = 'dot';
    const T_STAR = 'star';
    const T_COMMA = 'comma';
    const T_COLON = 'colon';
    const T_CURRENT = 'current';
    const T_EXPREF = 'expref';
    const T_LPAREN = 'lparen';
    const T_RPAREN = 'rparen';
    const T_LBRACE = 'lbrace';
    const T_RBRACE = 'rbrace';
    const T_LBRACKET = 'lbracket';
    const T_RBRACKET = 'rbracket';
    const T_FLATTEN = 'flatten';
    const T_IDENTIFIER = 'identifier';
    const T_NUMBER = 'number';
    const T_QUOTED_IDENTIFIER = 'quoted_identifier';
    const T_UNKNOWN = 'unknown';
    const T_PIPE = 'pipe';
    const T_OR = 'or';
    const T_AND = 'and';
    const T_NOT = 'not';
    const T_FILTER = 'filter';
    const T_LITERAL = 'literal';
    const T_EOF = 'eof';
    const T_COMPARATOR = 'comparator';

    const STATE_IDENTIFIER = 0;
    const STATE_NUMBER = 1;
    const STATE_SINGLE_CHAR = 2;
    const STATE_WHITESPACE = 3;
    const STATE_STRING_LITERAL = 4;
    const STATE_QUOTED_STRING = 5;
    const STATE_JSON_LITERAL = 6;
    const STATE_LBRACKET = 7;
    const STATE_PIPE = 8;
    const STATE_LT = 9;
    const STATE_GT = 10;
    const STATE_EQ = 11;
    const STATE_NOT = 12;
    const STATE_AND = 13;

    /**
     * @var array Maps the first character of a token to the lexer state that
     *            consumes it. Characters missing from this table are emitted
     *            as T_UNKNOWN tokens.
     */
    private static $transitionTable = [
        // Operators that may be one or two characters long.
        '<' => self::STATE_LT,
        '>' => self::STATE_GT,
        '=' => self::STATE_EQ,
        '!' => self::STATE_NOT,
        '[' => self::STATE_LBRACKET,
        '|' => self::STATE_PIPE,
        '&' => self::STATE_AND,
        // Delimited tokens.
        '`' => self::STATE_JSON_LITERAL,
        '"' => self::STATE_QUOTED_STRING,
        "'" => self::STATE_STRING_LITERAL,
        // Numbers (optionally negative).
        '-' => self::STATE_NUMBER,
        '0' => self::STATE_NUMBER,
        '1' => self::STATE_NUMBER,
        '2' => self::STATE_NUMBER,
        '3' => self::STATE_NUMBER,
        '4' => self::STATE_NUMBER,
        '5' => self::STATE_NUMBER,
        '6' => self::STATE_NUMBER,
        '7' => self::STATE_NUMBER,
        '8' => self::STATE_NUMBER,
        '9' => self::STATE_NUMBER,
        // Whitespace is skipped.
        ' ' => self::STATE_WHITESPACE,
        "\t" => self::STATE_WHITESPACE,
        "\n" => self::STATE_WHITESPACE,
        "\r" => self::STATE_WHITESPACE,
        // Tokens that are always exactly one character.
        '.' => self::STATE_SINGLE_CHAR,
        '*' => self::STATE_SINGLE_CHAR,
        ']' => self::STATE_SINGLE_CHAR,
        ',' => self::STATE_SINGLE_CHAR,
        ':' => self::STATE_SINGLE_CHAR,
        '@' => self::STATE_SINGLE_CHAR,
        '(' => self::STATE_SINGLE_CHAR,
        ')' => self::STATE_SINGLE_CHAR,
        '{' => self::STATE_SINGLE_CHAR,
        '}' => self::STATE_SINGLE_CHAR,
        // Characters that may start an unquoted identifier.
        '_' => self::STATE_IDENTIFIER,
        'A' => self::STATE_IDENTIFIER,
        'B' => self::STATE_IDENTIFIER,
        'C' => self::STATE_IDENTIFIER,
        'D' => self::STATE_IDENTIFIER,
        'E' => self::STATE_IDENTIFIER,
        'F' => self::STATE_IDENTIFIER,
        'G' => self::STATE_IDENTIFIER,
        'H' => self::STATE_IDENTIFIER,
        'I' => self::STATE_IDENTIFIER,
        'J' => self::STATE_IDENTIFIER,
        'K' => self::STATE_IDENTIFIER,
        'L' => self::STATE_IDENTIFIER,
        'M' => self::STATE_IDENTIFIER,
        'N' => self::STATE_IDENTIFIER,
        'O' => self::STATE_IDENTIFIER,
        'P' => self::STATE_IDENTIFIER,
        'Q' => self::STATE_IDENTIFIER,
        'R' => self::STATE_IDENTIFIER,
        'S' => self::STATE_IDENTIFIER,
        'T' => self::STATE_IDENTIFIER,
        'U' => self::STATE_IDENTIFIER,
        'V' => self::STATE_IDENTIFIER,
        'W' => self::STATE_IDENTIFIER,
        'X' => self::STATE_IDENTIFIER,
        'Y' => self::STATE_IDENTIFIER,
        'Z' => self::STATE_IDENTIFIER,
        'a' => self::STATE_IDENTIFIER,
        'b' => self::STATE_IDENTIFIER,
        'c' => self::STATE_IDENTIFIER,
        'd' => self::STATE_IDENTIFIER,
        'e' => self::STATE_IDENTIFIER,
        'f' => self::STATE_IDENTIFIER,
        'g' => self::STATE_IDENTIFIER,
        'h' => self::STATE_IDENTIFIER,
        'i' => self::STATE_IDENTIFIER,
        'j' => self::STATE_IDENTIFIER,
        'k' => self::STATE_IDENTIFIER,
        'l' => self::STATE_IDENTIFIER,
        'm' => self::STATE_IDENTIFIER,
        'n' => self::STATE_IDENTIFIER,
        'o' => self::STATE_IDENTIFIER,
        'p' => self::STATE_IDENTIFIER,
        'q' => self::STATE_IDENTIFIER,
        'r' => self::STATE_IDENTIFIER,
        's' => self::STATE_IDENTIFIER,
        't' => self::STATE_IDENTIFIER,
        'u' => self::STATE_IDENTIFIER,
        'v' => self::STATE_IDENTIFIER,
        'w' => self::STATE_IDENTIFIER,
        'x' => self::STATE_IDENTIFIER,
        'y' => self::STATE_IDENTIFIER,
        'z' => self::STATE_IDENTIFIER,
    ];

    /** @var array Valid identifier characters after first character */
    private $validIdentifier = [
        'A' => true, 'B' => true, 'C' => true, 'D' => true, 'E' => true,
        'F' => true, 'G' => true, 'H' => true, 'I' => true, 'J' => true,
        'K' => true, 'L' => true, 'M' => true, 'N' => true, 'O' => true,
        'P' => true, 'Q' => true, 'R' => true, 'S' => true, 'T' => true,
        'U' => true, 'V' => true, 'W' => true, 'X' => true, 'Y' => true,
        'Z' => true, 'a' => true, 'b' => true, 'c' => true, 'd' => true,
        'e' => true, 'f' => true, 'g' => true, 'h' => true, 'i' => true,
        'j' => true, 'k' => true, 'l' => true, 'm' => true, 'n' => true,
        'o' => true, 'p' => true, 'q' => true, 'r' => true, 's' => true,
        't' => true, 'u' => true, 'v' => true, 'w' => true, 'x' => true,
        'y' => true, 'z' => true, '_' => true, '0' => true, '1' => true,
        '2' => true, '3' => true, '4' => true, '5' => true, '6' => true,
        '7' => true, '8' => true, '9' => true,
    ];

    /** @var array Valid number characters after the first character */
    private $numbers = [
        '0' => true, '1' => true, '2' => true, '3' => true, '4' => true,
        '5' => true, '6' => true, '7' => true, '8' => true, '9' => true,
    ];

    /** @var array Map of simple single character tokens */
    private $simpleTokens = [
        '.' => self::T_DOT,
        '*' => self::T_STAR,
        ']' => self::T_RBRACKET,
        ',' => self::T_COMMA,
        ':' => self::T_COLON,
        '@' => self::T_CURRENT,
        '(' => self::T_LPAREN,
        ')' => self::T_RPAREN,
        '{' => self::T_LBRACE,
        '}' => self::T_RBRACE,
    ];

    /**
     * Tokenize the JMESPath expression into an array of token hashes that
     * each contain a 'type', 'value', and 'pos'.
     *
     * 'pos' is the byte offset at which the token starts. Input that cannot
     * be tokenized (an unsupported character, an unterminated delimiter, a
     * lone "=" or "-") is not rejected here; it is emitted as a token of
     * type T_UNKNOWN. The returned list always ends with a T_EOF token whose
     * 'pos' is the byte length of the input and whose 'value' is null.
     *
     * @param string $input JMESPath input
     *
     * @return array
     */
    public function tokenize($input)
    {
        $tokens = [];

        // str_split('') yields [''] before PHP 8.2, which would be reported
        // as a bogus unknown token, so the empty expression is special-cased.
        $chars = $input === '' ? [] : str_split($input);

        while (false !== ($current = current($chars))) {

            // Every character must be in the transition character table.
            if (!isset(self::$transitionTable[$current])) {
                $tokens[] = [
                    'type'  => self::T_UNKNOWN,
                    'pos'   => key($chars),
                    'value' => $current
                ];
                next($chars);
                continue;
            }

            $state = self::$transitionTable[$current];

            if ($state === self::STATE_SINGLE_CHAR) {

                // Consume simple tokens like ".", ",", "@", etc.
                $tokens[] = [
                    'type'  => $this->simpleTokens[$current],
                    'pos'   => key($chars),
                    'value' => $current
                ];
                next($chars);

            } elseif ($state === self::STATE_IDENTIFIER) {

                // Consume identifiers: keep appending while the next
                // character is a letter, digit or underscore.
                $start = key($chars);
                $buffer = '';
                do {
                    $buffer .= $current;
                    $current = next($chars);
                } while ($current !== false && isset($this->validIdentifier[$current]));
                $tokens[] = [
                    'type'  => self::T_IDENTIFIER,
                    'value' => $buffer,
                    'pos'   => $start
                ];

            } elseif ($state === self::STATE_WHITESPACE) {

                // Skip whitespace
                next($chars);

            } elseif ($state === self::STATE_LBRACKET) {

                // Consume "[", "[?", and "[]"
                $position = key($chars);
                $actual = next($chars);
                if ($actual === ']') {
                    next($chars);
                    $tokens[] = [
                        'type'  => self::T_FLATTEN,
                        'pos'   => $position,
                        'value' => '[]'
                    ];
                } elseif ($actual === '?') {
                    next($chars);
                    $tokens[] = [
                        'type'  => self::T_FILTER,
                        'pos'   => $position,
                        'value' => '[?'
                    ];
                } else {
                    // The lookahead character is left for the next iteration.
                    $tokens[] = [
                        'type'  => self::T_LBRACKET,
                        'pos'   => $position,
                        'value' => '['
                    ];
                }

            } elseif ($state === self::STATE_STRING_LITERAL) {

                // Consume raw string literals; only \' is unescaped.
                $token = $this->inside($chars, "'", self::T_LITERAL);
                $token['value'] = str_replace("\\'", "'", $token['value']);
                $tokens[] = $token;

            } elseif ($state === self::STATE_PIPE) {

                // Consume pipe and OR
                $tokens[] = $this->matchOr($chars, '|', '|', self::T_OR, self::T_PIPE);

            } elseif ($state === self::STATE_JSON_LITERAL) {

                // Consume JSON literals
                $token = $this->inside($chars, '`', self::T_LITERAL);
                if ($token['type'] === self::T_LITERAL) {
                    $token['value'] = str_replace('\\`', '`', $token['value']);
                    $token = $this->parseJson($token);
                }
                $tokens[] = $token;

            } elseif ($state === self::STATE_NUMBER) {

                // Consume numbers: an optional leading "-" followed by digits.
                $start = key($chars);
                $buffer = '';
                do {
                    $buffer .= $current;
                    $current = next($chars);
                } while ($current !== false && isset($this->numbers[$current]));

                if ($buffer === '-') {
                    // A minus sign without digits is not a number. Casting it
                    // would silently produce 0, so report it as unknown.
                    $tokens[] = [
                        'type'  => self::T_UNKNOWN,
                        'value' => $buffer,
                        'pos'   => $start
                    ];
                } else {
                    $tokens[] = [
                        'type'  => self::T_NUMBER,
                        'value' => (int) $buffer,
                        'pos'   => $start
                    ];
                }

            } elseif ($state === self::STATE_QUOTED_STRING) {

                // Consume quoted identifiers. The quotes are put back so the
                // content can be decoded as a JSON string.
                $token = $this->inside($chars, '"', self::T_QUOTED_IDENTIFIER);
                if ($token['type'] === self::T_QUOTED_IDENTIFIER) {
                    $token['value'] = '"' . $token['value'] . '"';
                    $token = $this->parseJson($token);
                }
                $tokens[] = $token;

            } elseif ($state === self::STATE_EQ) {

                // Consume equals; a single "=" is not a valid token.
                $tokens[] = $this->matchOr($chars, '=', '=', self::T_COMPARATOR, self::T_UNKNOWN);

            } elseif ($state === self::STATE_AND) {

                // Consume "&&" (and) or "&" (expression reference)
                $tokens[] = $this->matchOr($chars, '&', '&', self::T_AND, self::T_EXPREF);

            } elseif ($state === self::STATE_NOT) {

                // Consume not equal, or a bare "!" (not)
                $tokens[] = $this->matchOr($chars, '!', '=', self::T_COMPARATOR, self::T_NOT);

            } else {

                // either '<' or '>'
                // Consume less than and greater than
                $tokens[] = $this->matchOr($chars, $current, '=', self::T_COMPARATOR, self::T_COMPARATOR);

            }
        }

        // count($chars) is the byte length of the input, which keeps the EOF
        // position in the same unit as every other token position.
        $tokens[] = [
            'type'  => self::T_EOF,
            'pos'   => count($chars),
            'value' => null
        ];

        return $tokens;
    }

    /**
     * Returns a token based on whether or not the next character matches the
     * expected value. If it does, a token of "$type" is returned and both
     * characters are consumed. Otherwise, a token of "$orElse" type is
     * returned and only the current character is consumed.
     *
     * The token's 'pos' is the offset of the first character in both cases.
     *
     * @param array  $chars    Array of characters by reference.
     * @param string $current  The current character.
     * @param string $expected Expected character.
     * @param string $type     Expected result type.
     * @param string $orElse   Otherwise return a token of this type.
     *
     * @return array Returns a conditional token.
     */
    private function matchOr(array &$chars, $current, $expected, $type, $orElse)
    {
        // Record the start offset before moving the cursor: once the cursor
        // runs past the last element key() returns null, so the offset can
        // no longer be derived afterwards.
        $position = key($chars);

        if (next($chars) === $expected) {
            next($chars);
            return [
                'type'  => $type,
                'pos'   => $position,
                'value' => $current . $expected
            ];
        }

        return [
            'type'  => $orElse,
            'pos'   => $position,
            'value' => $current
        ];
    }

    /**
     * Returns a token that is the result of consuming everything between a
     * pair of delimiter characters. A backslash and the character following
     * it are copied to the value verbatim, so an escaped delimiter does not
     * end the token; callers strip the escape afterwards. If the token is
     * not closed, a token of type "unknown" is returned.
     *
     * @param array  $chars Array of characters by reference.
     * @param string $delim The delimiter character.
     * @param string $type  Token type.
     *
     * @return array Returns the consumed token.
     */
    private function inside(array &$chars, $delim, $type)
    {
        $position = key($chars);
        $current = next($chars);
        $buffer = '';

        while ($current !== $delim) {
            if ($current === '\\') {
                // Keep the backslash and step onto the escaped character so
                // that it is appended below without being tested as a
                // delimiter.
                $buffer .= '\\';
                $current = next($chars);
            }
            if ($current === false) {
                // Unclosed delimiter
                return [
                    'type'  => self::T_UNKNOWN,
                    'value' => $buffer,
                    'pos'   => $position
                ];
            }
            $buffer .= $current;
            $current = next($chars);
        }

        // Step over the closing delimiter.
        next($chars);

        return ['type' => $type, 'value' => $buffer, 'pos' => $position];
    }

    /**
     * Parses a JSON token or sets the token type to "unknown" on error.
     *
     * @param array $token Token that needs parsing.
     *
     * @return array Returns a token with a parsed value.
     */
    private function parseJson(array $token)
    {
        $value = json_decode($token['value'], true);

        if (json_last_error() !== JSON_ERROR_NONE) {
            // Legacy support for elided quotes. Try to parse again by adding
            // quotes around the bad input value.
            $value = json_decode('"' . $token['value'] . '"', true);
            if (json_last_error() !== JSON_ERROR_NONE) {
                $token['type'] = self::T_UNKNOWN;
                return $token;
            }
        }

        $token['value'] = $value;
        return $token;
    }
}