1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/query/advanced_query_parser/parser.h"
16
17 #include <memory>
18 #include <string_view>
19
20 #include "icing/absl_ports/canonical_errors.h"
21 #include "icing/legacy/core/icing-string-util.h"
22 #include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
23 #include "icing/util/status-macros.h"
24
25 namespace icing {
26 namespace lib {
27
28 namespace {
29
CreateNaryNode(std::string_view operator_text,std::vector<std::unique_ptr<Node>> && operands)30 std::unique_ptr<Node> CreateNaryNode(
31 std::string_view operator_text,
32 std::vector<std::unique_ptr<Node>>&& operands) {
33 if (operands.empty()) {
34 return nullptr;
35 }
36 if (operands.size() == 1) {
37 return std::move(operands.at(0));
38 }
39 return std::make_unique<NaryOperatorNode>(std::string(operator_text),
40 std::move(operands));
41 }
42
43 } // namespace
44
Consume(Lexer::TokenType token_type)45 libtextclassifier3::Status Parser::Consume(Lexer::TokenType token_type) {
46 if (!Match(token_type)) {
47 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
48 "Unable to consume token %d.", static_cast<int>(token_type)));
49 }
50 ++current_token_;
51 return libtextclassifier3::Status::OK;
52 }
53
ConsumeText()54 libtextclassifier3::StatusOr<std::unique_ptr<TextNode>> Parser::ConsumeText() {
55 if (!Match(Lexer::TokenType::TEXT)) {
56 return absl_ports::InvalidArgumentError("Unable to consume token as TEXT.");
57 }
58 auto text_node = std::make_unique<TextNode>(std::move(current_token_->text),
59 current_token_->original_text);
60 ++current_token_;
61 return text_node;
62 }
63
64 libtextclassifier3::StatusOr<std::unique_ptr<FunctionNameNode>>
ConsumeFunctionName()65 Parser::ConsumeFunctionName() {
66 if (!Match(Lexer::TokenType::FUNCTION_NAME)) {
67 return absl_ports::InvalidArgumentError(
68 "Unable to consume token as FUNCTION_NAME.");
69 }
70 auto function_name_node =
71 std::make_unique<FunctionNameNode>(std::move(current_token_->text));
72 ++current_token_;
73 return function_name_node;
74 }
75
76 // stringElement
77 // : STRING STAR?
78 libtextclassifier3::StatusOr<std::unique_ptr<StringNode>>
ConsumeStringElement()79 Parser::ConsumeStringElement() {
80 if (!Match(Lexer::TokenType::STRING)) {
81 return absl_ports::InvalidArgumentError(
82 "Unable to consume token as STRING.");
83 }
84 std::string text = std::move(current_token_->text);
85 std::string_view raw_text = current_token_->original_text;
86 ++current_token_;
87
88 bool is_prefix = false;
89 if (Match(Lexer::TokenType::STAR)) {
90 is_prefix = true;
91 ++current_token_;
92 }
93
94 return std::make_unique<StringNode>(std::move(text), raw_text, is_prefix);
95 }
96
ConsumeComparator()97 libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() {
98 if (!Match(Lexer::TokenType::COMPARATOR)) {
99 return absl_ports::InvalidArgumentError(
100 "Unable to consume token as COMPARATOR.");
101 }
102 std::string comparator = std::move(current_token_->text);
103 ++current_token_;
104 return comparator;
105 }
106
107 // member
108 // : TEXT (DOT TEXT)* (DOT function)?
109 // | TEXT STAR
110 // ;
111 libtextclassifier3::StatusOr<std::unique_ptr<MemberNode>>
ConsumeMember()112 Parser::ConsumeMember() {
113 ICING_ASSIGN_OR_RETURN(std::unique_ptr<TextNode> text_node, ConsumeText());
114 std::vector<std::unique_ptr<TextNode>> children;
115
116 // Member could be either `TEXT (DOT TEXT)* (DOT function)?` or `TEXT STAR`
117 // at this point. So check for 'STAR' to differentiate the two cases.
118 if (Match(Lexer::TokenType::STAR)) {
119 Consume(Lexer::TokenType::STAR);
120 std::string_view raw_text = text_node->raw_value();
121 std::string text = std::move(*text_node).value();
122 text_node = std::make_unique<TextNode>(std::move(text), raw_text,
123 /*is_prefix=*/true);
124 children.push_back(std::move(text_node));
125 } else {
126 children.push_back(std::move(text_node));
127 while (Match(Lexer::TokenType::DOT)) {
128 Consume(Lexer::TokenType::DOT);
129 if (MatchFunction()) {
130 ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNode> function_node,
131 ConsumeFunction());
132 // Once a function is matched, we should exit the current rule based on
133 // the grammar.
134 return std::make_unique<MemberNode>(std::move(children),
135 std::move(function_node));
136 }
137 ICING_ASSIGN_OR_RETURN(text_node, ConsumeText());
138 children.push_back(std::move(text_node));
139 }
140 }
141 return std::make_unique<MemberNode>(std::move(children),
142 /*function=*/nullptr);
143 }
144
145 // function
146 // : FUNCTION_NAME LPAREN argList? RPAREN
147 // ;
148 libtextclassifier3::StatusOr<std::unique_ptr<FunctionNode>>
ConsumeFunction()149 Parser::ConsumeFunction() {
150 ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNameNode> function_name,
151 ConsumeFunctionName());
152 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN));
153
154 std::vector<std::unique_ptr<Node>> args;
155 if (Match(Lexer::TokenType::RPAREN)) {
156 // Got empty argument.
157 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
158 } else {
159 ICING_ASSIGN_OR_RETURN(args, ConsumeArgs());
160 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
161 }
162 return std::make_unique<FunctionNode>(std::move(function_name),
163 std::move(args));
164 }
165
166 // comparable
167 // : stringElement
168 // | member
169 // | function
170 // ;
171 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeComparable()172 Parser::ConsumeComparable() {
173 if (Match(Lexer::TokenType::STRING)) {
174 return ConsumeStringElement();
175 } else if (MatchMember()) {
176 return ConsumeMember();
177 }
178 // The current token sequence isn't a STRING or member. Therefore, it must be
179 // a function.
180 return ConsumeFunction();
181 }
182
183 // composite
184 // : LPAREN expression RPAREN
185 // ;
ConsumeComposite()186 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeComposite() {
187 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN));
188
189 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> expression, ConsumeExpression());
190
191 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
192 return expression;
193 }
194
195 // argList
196 // : expression (COMMA expression)*
197 // ;
198 libtextclassifier3::StatusOr<std::vector<std::unique_ptr<Node>>>
ConsumeArgs()199 Parser::ConsumeArgs() {
200 std::vector<std::unique_ptr<Node>> args;
201 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> arg, ConsumeExpression());
202 args.push_back(std::move(arg));
203 while (Match(Lexer::TokenType::COMMA)) {
204 Consume(Lexer::TokenType::COMMA);
205 ICING_ASSIGN_OR_RETURN(arg, ConsumeExpression());
206 args.push_back(std::move(arg));
207 }
208 return args;
209 }
210
211 // restriction
212 // : comparable (COMPARATOR MINUS? (comparable | composite))?
213 // ;
214 // COMPARATOR will not be produced in Scoring Lexer.
215 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeRestriction()216 Parser::ConsumeRestriction() {
217 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> comparable, ConsumeComparable());
218
219 if (!Match(Lexer::TokenType::COMPARATOR)) {
220 return comparable;
221 }
222 ICING_ASSIGN_OR_RETURN(std::string operator_text, ConsumeComparator());
223
224 bool has_minus = Match(Lexer::TokenType::MINUS);
225 if (has_minus) {
226 Consume(Lexer::TokenType::MINUS);
227 }
228
229 std::unique_ptr<Node> arg;
230 if (MatchComposite()) {
231 ICING_ASSIGN_OR_RETURN(arg, ConsumeComposite());
232 } else if (MatchComparable()) {
233 ICING_ASSIGN_OR_RETURN(arg, ConsumeComparable());
234 } else {
235 return absl_ports::InvalidArgumentError(
236 "ARG: must begin with LPAREN or FIRST(comparable)");
237 }
238
239 if (has_minus) {
240 arg = std::make_unique<UnaryOperatorNode>("MINUS", std::move(arg));
241 }
242
243 std::vector<std::unique_ptr<Node>> args;
244 args.push_back(std::move(comparable));
245 args.push_back(std::move(arg));
246 return std::make_unique<NaryOperatorNode>(std::move(operator_text),
247 std::move(args));
248 }
249
250 // simple
251 // : restriction
252 // | composite
253 // ;
ConsumeSimple()254 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSimple() {
255 if (MatchComposite()) {
256 return ConsumeComposite();
257 } else if (MatchRestriction()) {
258 return ConsumeRestriction();
259 }
260 return absl_ports::InvalidArgumentError(
261 "SIMPLE: must be a restriction or composite");
262 }
263
264 // term
265 // : NOT? simple
266 // | MINUS simple
267 // ;
268 // NOT will not be produced in Scoring Lexer.
ConsumeTerm()269 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeTerm() {
270 if (!Match(Lexer::TokenType::NOT) && !Match(Lexer::TokenType::MINUS)) {
271 return ConsumeSimple();
272 }
273 std::string operator_text;
274 if (language_ == Lexer::Language::SCORING) {
275 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
276 operator_text = "MINUS";
277 } else {
278 if (Match(Lexer::TokenType::NOT)) {
279 Consume(Lexer::TokenType::NOT);
280 operator_text = "NOT";
281 } else {
282 Consume(Lexer::TokenType::MINUS);
283 operator_text = "MINUS";
284 }
285 }
286 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> simple, ConsumeSimple());
287 return std::make_unique<UnaryOperatorNode>(operator_text, std::move(simple));
288 }
289
290 // factor
291 // : term (OR term)*
292 // ;
ConsumeFactor()293 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeFactor() {
294 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> term, ConsumeTerm());
295 std::vector<std::unique_ptr<Node>> terms;
296 terms.push_back(std::move(term));
297
298 while (Match(Lexer::TokenType::OR)) {
299 Consume(Lexer::TokenType::OR);
300 ICING_ASSIGN_OR_RETURN(term, ConsumeTerm());
301 terms.push_back(std::move(term));
302 }
303
304 return CreateNaryNode("OR", std::move(terms));
305 }
306
307 // sequence
308 // : (factor)+
309 // ;
ConsumeSequence()310 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSequence() {
311 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> factor, ConsumeFactor());
312 std::vector<std::unique_ptr<Node>> factors;
313 factors.push_back(std::move(factor));
314
315 while (MatchFactor()) {
316 ICING_ASSIGN_OR_RETURN(factor, ConsumeFactor());
317 factors.push_back(std::move(factor));
318 }
319
320 return CreateNaryNode("AND", std::move(factors));
321 }
322
323 // expression
324 // : sequence (AND sequence)*
325 // ;
326 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeQueryExpression()327 Parser::ConsumeQueryExpression() {
328 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> sequence, ConsumeSequence());
329 std::vector<std::unique_ptr<Node>> sequences;
330 sequences.push_back(std::move(sequence));
331
332 while (Match(Lexer::TokenType::AND)) {
333 Consume(Lexer::TokenType::AND);
334 ICING_ASSIGN_OR_RETURN(sequence, ConsumeSequence());
335 sequences.push_back(std::move(sequence));
336 }
337
338 return CreateNaryNode("AND", std::move(sequences));
339 }
340
341 // multExpr
342 // : term ((TIMES | DIV) term)*
343 // ;
ConsumeMultExpr()344 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeMultExpr() {
345 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeTerm());
346 std::vector<std::unique_ptr<Node>> stack;
347 stack.push_back(std::move(node));
348
349 while (Match(Lexer::TokenType::TIMES) || Match(Lexer::TokenType::DIV)) {
350 while (Match(Lexer::TokenType::TIMES)) {
351 Consume(Lexer::TokenType::TIMES);
352 ICING_ASSIGN_OR_RETURN(node, ConsumeTerm());
353 stack.push_back(std::move(node));
354 }
355 node = CreateNaryNode("TIMES", std::move(stack));
356 stack.clear();
357 stack.push_back(std::move(node));
358
359 while (Match(Lexer::TokenType::DIV)) {
360 Consume(Lexer::TokenType::DIV);
361 ICING_ASSIGN_OR_RETURN(node, ConsumeTerm());
362 stack.push_back(std::move(node));
363 }
364 node = CreateNaryNode("DIV", std::move(stack));
365 stack.clear();
366 stack.push_back(std::move(node));
367 }
368
369 return std::move(stack[0]);
370 }
371
372 // expression
373 // : multExpr ((PLUS | MINUS) multExpr)*
374 // ;
375 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeScoringExpression()376 Parser::ConsumeScoringExpression() {
377 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeMultExpr());
378 std::vector<std::unique_ptr<Node>> stack;
379 stack.push_back(std::move(node));
380
381 while (Match(Lexer::TokenType::PLUS) || Match(Lexer::TokenType::MINUS)) {
382 while (Match(Lexer::TokenType::PLUS)) {
383 Consume(Lexer::TokenType::PLUS);
384 ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr());
385 stack.push_back(std::move(node));
386 }
387 node = CreateNaryNode("PLUS", std::move(stack));
388 stack.clear();
389 stack.push_back(std::move(node));
390
391 while (Match(Lexer::TokenType::MINUS)) {
392 Consume(Lexer::TokenType::MINUS);
393 ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr());
394 stack.push_back(std::move(node));
395 }
396 node = CreateNaryNode("MINUS", std::move(stack));
397 stack.clear();
398 stack.push_back(std::move(node));
399 }
400
401 return std::move(stack[0]);
402 }
403
404 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeExpression()405 Parser::ConsumeExpression() {
406 switch (language_) {
407 case Lexer::Language::QUERY:
408 return ConsumeQueryExpression();
409 case Lexer::Language::SCORING:
410 return ConsumeScoringExpression();
411 }
412 }
413
414 // query
415 // : expression? EOF
416 // ;
ConsumeQuery()417 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeQuery() {
418 language_ = Lexer::Language::QUERY;
419 std::unique_ptr<Node> node;
420 if (current_token_ != lexer_tokens_.end()) {
421 ICING_ASSIGN_OR_RETURN(node, ConsumeExpression());
422 }
423 if (current_token_ != lexer_tokens_.end()) {
424 return absl_ports::InvalidArgumentError(
425 "Error parsing Query. Must reach EOF after parsing Expression!");
426 }
427 return node;
428 }
429
430 // scoring
431 // : expression EOF
432 // ;
ConsumeScoring()433 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeScoring() {
434 language_ = Lexer::Language::SCORING;
435 std::unique_ptr<Node> node;
436 if (current_token_ == lexer_tokens_.end()) {
437 return absl_ports::InvalidArgumentError("Got empty scoring expression!");
438 }
439 ICING_ASSIGN_OR_RETURN(node, ConsumeExpression());
440 if (current_token_ != lexer_tokens_.end()) {
441 return absl_ports::InvalidArgumentError(
442 "Error parsing the scoring expression. Must reach EOF after parsing "
443 "Expression!");
444 }
445 return node;
446 }
447
448 } // namespace lib
449 } // namespace icing
450