• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/query/advanced_query_parser/parser.h"
16 
17 #include <memory>
18 #include <string_view>
19 
20 #include "icing/absl_ports/canonical_errors.h"
21 #include "icing/legacy/core/icing-string-util.h"
22 #include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
23 #include "icing/util/status-macros.h"
24 
25 namespace icing {
26 namespace lib {
27 
28 namespace {
29 
CreateNaryNode(std::string_view operator_text,std::vector<std::unique_ptr<Node>> && operands)30 std::unique_ptr<Node> CreateNaryNode(
31     std::string_view operator_text,
32     std::vector<std::unique_ptr<Node>>&& operands) {
33   if (operands.empty()) {
34     return nullptr;
35   }
36   if (operands.size() == 1) {
37     return std::move(operands.at(0));
38   }
39   return std::make_unique<NaryOperatorNode>(std::string(operator_text),
40                                             std::move(operands));
41 }
42 
43 }  // namespace
44 
Consume(Lexer::TokenType token_type)45 libtextclassifier3::Status Parser::Consume(Lexer::TokenType token_type) {
46   if (!Match(token_type)) {
47     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
48         "Unable to consume token %d.", static_cast<int>(token_type)));
49   }
50   ++current_token_;
51   return libtextclassifier3::Status::OK;
52 }
53 
ConsumeText()54 libtextclassifier3::StatusOr<std::unique_ptr<TextNode>> Parser::ConsumeText() {
55   if (!Match(Lexer::TokenType::TEXT)) {
56     return absl_ports::InvalidArgumentError("Unable to consume token as TEXT.");
57   }
58   auto text_node = std::make_unique<TextNode>(std::move(current_token_->text),
59                                               current_token_->original_text);
60   ++current_token_;
61   return text_node;
62 }
63 
64 libtextclassifier3::StatusOr<std::unique_ptr<FunctionNameNode>>
ConsumeFunctionName()65 Parser::ConsumeFunctionName() {
66   if (!Match(Lexer::TokenType::FUNCTION_NAME)) {
67     return absl_ports::InvalidArgumentError(
68         "Unable to consume token as FUNCTION_NAME.");
69   }
70   auto function_name_node =
71       std::make_unique<FunctionNameNode>(std::move(current_token_->text));
72   ++current_token_;
73   return function_name_node;
74 }
75 
76 // stringElement
77 //    : STRING STAR?
78 libtextclassifier3::StatusOr<std::unique_ptr<StringNode>>
ConsumeStringElement()79 Parser::ConsumeStringElement() {
80   if (!Match(Lexer::TokenType::STRING)) {
81     return absl_ports::InvalidArgumentError(
82         "Unable to consume token as STRING.");
83   }
84   std::string text = std::move(current_token_->text);
85   std::string_view raw_text = current_token_->original_text;
86   ++current_token_;
87 
88   bool is_prefix = false;
89   if (Match(Lexer::TokenType::STAR)) {
90     is_prefix = true;
91     ++current_token_;
92   }
93 
94   return std::make_unique<StringNode>(std::move(text), raw_text, is_prefix);
95 }
96 
ConsumeComparator()97 libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() {
98   if (!Match(Lexer::TokenType::COMPARATOR)) {
99     return absl_ports::InvalidArgumentError(
100         "Unable to consume token as COMPARATOR.");
101   }
102   std::string comparator = std::move(current_token_->text);
103   ++current_token_;
104   return comparator;
105 }
106 
107 // member
108 //    :  TEXT (DOT TEXT)* (DOT function)?
109 //    |  TEXT STAR
110 //    ;
111 libtextclassifier3::StatusOr<std::unique_ptr<MemberNode>>
ConsumeMember()112 Parser::ConsumeMember() {
113   ICING_ASSIGN_OR_RETURN(std::unique_ptr<TextNode> text_node, ConsumeText());
114   std::vector<std::unique_ptr<TextNode>> children;
115 
116   // Member could be either `TEXT (DOT TEXT)* (DOT function)?` or `TEXT STAR`
117   // at this point. So check for 'STAR' to differentiate the two cases.
118   if (Match(Lexer::TokenType::STAR)) {
119     Consume(Lexer::TokenType::STAR);
120     std::string_view raw_text = text_node->raw_value();
121     std::string text = std::move(*text_node).value();
122     text_node = std::make_unique<TextNode>(std::move(text), raw_text,
123                                            /*is_prefix=*/true);
124     children.push_back(std::move(text_node));
125   } else {
126     children.push_back(std::move(text_node));
127     while (Match(Lexer::TokenType::DOT)) {
128       Consume(Lexer::TokenType::DOT);
129       if (MatchFunction()) {
130         ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNode> function_node,
131                                ConsumeFunction());
132         // Once a function is matched, we should exit the current rule based on
133         // the grammar.
134         return std::make_unique<MemberNode>(std::move(children),
135                                             std::move(function_node));
136       }
137       ICING_ASSIGN_OR_RETURN(text_node, ConsumeText());
138       children.push_back(std::move(text_node));
139     }
140   }
141   return std::make_unique<MemberNode>(std::move(children),
142                                       /*function=*/nullptr);
143 }
144 
145 // function
146 //    : FUNCTION_NAME LPAREN argList? RPAREN
147 //    ;
148 libtextclassifier3::StatusOr<std::unique_ptr<FunctionNode>>
ConsumeFunction()149 Parser::ConsumeFunction() {
150   ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNameNode> function_name,
151                          ConsumeFunctionName());
152   ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN));
153 
154   std::vector<std::unique_ptr<Node>> args;
155   if (Match(Lexer::TokenType::RPAREN)) {
156     // Got empty argument.
157     ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
158   } else {
159     ICING_ASSIGN_OR_RETURN(args, ConsumeArgs());
160     ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
161   }
162   return std::make_unique<FunctionNode>(std::move(function_name),
163                                         std::move(args));
164 }
165 
166 // comparable
167 //     : stringElement
168 //     | member
169 //     | function
170 //     ;
171 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeComparable()172 Parser::ConsumeComparable() {
173   if (Match(Lexer::TokenType::STRING)) {
174     return ConsumeStringElement();
175   } else if (MatchMember()) {
176     return ConsumeMember();
177   }
178   // The current token sequence isn't a STRING or member. Therefore, it must be
179   // a function.
180   return ConsumeFunction();
181 }
182 
183 // composite
184 //    : LPAREN expression RPAREN
185 //    ;
ConsumeComposite()186 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeComposite() {
187   ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN));
188 
189   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> expression, ConsumeExpression());
190 
191   ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
192   return expression;
193 }
194 
195 // argList
196 //    : expression (COMMA expression)*
197 //    ;
198 libtextclassifier3::StatusOr<std::vector<std::unique_ptr<Node>>>
ConsumeArgs()199 Parser::ConsumeArgs() {
200   std::vector<std::unique_ptr<Node>> args;
201   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> arg, ConsumeExpression());
202   args.push_back(std::move(arg));
203   while (Match(Lexer::TokenType::COMMA)) {
204     Consume(Lexer::TokenType::COMMA);
205     ICING_ASSIGN_OR_RETURN(arg, ConsumeExpression());
206     args.push_back(std::move(arg));
207   }
208   return args;
209 }
210 
211 // restriction
212 //     : comparable (COMPARATOR MINUS? (comparable | composite))?
213 //     ;
214 // COMPARATOR will not be produced in Scoring Lexer.
215 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeRestriction()216 Parser::ConsumeRestriction() {
217   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> comparable, ConsumeComparable());
218 
219   if (!Match(Lexer::TokenType::COMPARATOR)) {
220     return comparable;
221   }
222   ICING_ASSIGN_OR_RETURN(std::string operator_text, ConsumeComparator());
223 
224   bool has_minus = Match(Lexer::TokenType::MINUS);
225   if (has_minus) {
226     Consume(Lexer::TokenType::MINUS);
227   }
228 
229   std::unique_ptr<Node> arg;
230   if (MatchComposite()) {
231     ICING_ASSIGN_OR_RETURN(arg, ConsumeComposite());
232   } else if (MatchComparable()) {
233     ICING_ASSIGN_OR_RETURN(arg, ConsumeComparable());
234   } else {
235     return absl_ports::InvalidArgumentError(
236         "ARG: must begin with LPAREN or FIRST(comparable)");
237   }
238 
239   if (has_minus) {
240     arg = std::make_unique<UnaryOperatorNode>("MINUS", std::move(arg));
241   }
242 
243   std::vector<std::unique_ptr<Node>> args;
244   args.push_back(std::move(comparable));
245   args.push_back(std::move(arg));
246   return std::make_unique<NaryOperatorNode>(std::move(operator_text),
247                                             std::move(args));
248 }
249 
250 // simple
251 //     : restriction
252 //     | composite
253 //     ;
ConsumeSimple()254 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSimple() {
255   if (MatchComposite()) {
256     return ConsumeComposite();
257   } else if (MatchRestriction()) {
258     return ConsumeRestriction();
259   }
260   return absl_ports::InvalidArgumentError(
261       "SIMPLE: must be a restriction or composite");
262 }
263 
264 // term
265 //     : NOT? simple
266 //     | MINUS simple
267 //     ;
268 // NOT will not be produced in Scoring Lexer.
ConsumeTerm()269 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeTerm() {
270   if (!Match(Lexer::TokenType::NOT) && !Match(Lexer::TokenType::MINUS)) {
271     return ConsumeSimple();
272   }
273   std::string operator_text;
274   if (language_ == Lexer::Language::SCORING) {
275     ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
276     operator_text = "MINUS";
277   } else {
278     if (Match(Lexer::TokenType::NOT)) {
279       Consume(Lexer::TokenType::NOT);
280       operator_text = "NOT";
281     } else {
282       Consume(Lexer::TokenType::MINUS);
283       operator_text = "MINUS";
284     }
285   }
286   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> simple, ConsumeSimple());
287   return std::make_unique<UnaryOperatorNode>(operator_text, std::move(simple));
288 }
289 
290 // factor
291 //     : term (OR term)*
292 //     ;
ConsumeFactor()293 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeFactor() {
294   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> term, ConsumeTerm());
295   std::vector<std::unique_ptr<Node>> terms;
296   terms.push_back(std::move(term));
297 
298   while (Match(Lexer::TokenType::OR)) {
299     Consume(Lexer::TokenType::OR);
300     ICING_ASSIGN_OR_RETURN(term, ConsumeTerm());
301     terms.push_back(std::move(term));
302   }
303 
304   return CreateNaryNode("OR", std::move(terms));
305 }
306 
307 // sequence
308 //    : (factor)+
309 //    ;
ConsumeSequence()310 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSequence() {
311   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> factor, ConsumeFactor());
312   std::vector<std::unique_ptr<Node>> factors;
313   factors.push_back(std::move(factor));
314 
315   while (MatchFactor()) {
316     ICING_ASSIGN_OR_RETURN(factor, ConsumeFactor());
317     factors.push_back(std::move(factor));
318   }
319 
320   return CreateNaryNode("AND", std::move(factors));
321 }
322 
323 // expression
324 //    : sequence (AND sequence)*
325 //    ;
326 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeQueryExpression()327 Parser::ConsumeQueryExpression() {
328   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> sequence, ConsumeSequence());
329   std::vector<std::unique_ptr<Node>> sequences;
330   sequences.push_back(std::move(sequence));
331 
332   while (Match(Lexer::TokenType::AND)) {
333     Consume(Lexer::TokenType::AND);
334     ICING_ASSIGN_OR_RETURN(sequence, ConsumeSequence());
335     sequences.push_back(std::move(sequence));
336   }
337 
338   return CreateNaryNode("AND", std::move(sequences));
339 }
340 
341 // multExpr
342 //     : term ((TIMES | DIV) term)*
343 //     ;
ConsumeMultExpr()344 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeMultExpr() {
345   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeTerm());
346   std::vector<std::unique_ptr<Node>> stack;
347   stack.push_back(std::move(node));
348 
349   while (Match(Lexer::TokenType::TIMES) || Match(Lexer::TokenType::DIV)) {
350     while (Match(Lexer::TokenType::TIMES)) {
351       Consume(Lexer::TokenType::TIMES);
352       ICING_ASSIGN_OR_RETURN(node, ConsumeTerm());
353       stack.push_back(std::move(node));
354     }
355     node = CreateNaryNode("TIMES", std::move(stack));
356     stack.clear();
357     stack.push_back(std::move(node));
358 
359     while (Match(Lexer::TokenType::DIV)) {
360       Consume(Lexer::TokenType::DIV);
361       ICING_ASSIGN_OR_RETURN(node, ConsumeTerm());
362       stack.push_back(std::move(node));
363     }
364     node = CreateNaryNode("DIV", std::move(stack));
365     stack.clear();
366     stack.push_back(std::move(node));
367   }
368 
369   return std::move(stack[0]);
370 }
371 
372 // expression
373 //    : multExpr ((PLUS | MINUS) multExpr)*
374 //    ;
375 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeScoringExpression()376 Parser::ConsumeScoringExpression() {
377   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeMultExpr());
378   std::vector<std::unique_ptr<Node>> stack;
379   stack.push_back(std::move(node));
380 
381   while (Match(Lexer::TokenType::PLUS) || Match(Lexer::TokenType::MINUS)) {
382     while (Match(Lexer::TokenType::PLUS)) {
383       Consume(Lexer::TokenType::PLUS);
384       ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr());
385       stack.push_back(std::move(node));
386     }
387     node = CreateNaryNode("PLUS", std::move(stack));
388     stack.clear();
389     stack.push_back(std::move(node));
390 
391     while (Match(Lexer::TokenType::MINUS)) {
392       Consume(Lexer::TokenType::MINUS);
393       ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr());
394       stack.push_back(std::move(node));
395     }
396     node = CreateNaryNode("MINUS", std::move(stack));
397     stack.clear();
398     stack.push_back(std::move(node));
399   }
400 
401   return std::move(stack[0]);
402 }
403 
404 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeExpression()405 Parser::ConsumeExpression() {
406   switch (language_) {
407     case Lexer::Language::QUERY:
408       return ConsumeQueryExpression();
409     case Lexer::Language::SCORING:
410       return ConsumeScoringExpression();
411   }
412 }
413 
414 // query
415 //     : expression? EOF
416 //     ;
ConsumeQuery()417 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeQuery() {
418   language_ = Lexer::Language::QUERY;
419   std::unique_ptr<Node> node;
420   if (current_token_ != lexer_tokens_.end()) {
421     ICING_ASSIGN_OR_RETURN(node, ConsumeExpression());
422   }
423   if (current_token_ != lexer_tokens_.end()) {
424     return absl_ports::InvalidArgumentError(
425         "Error parsing Query. Must reach EOF after parsing Expression!");
426   }
427   return node;
428 }
429 
430 // scoring
431 //     : expression EOF
432 //     ;
ConsumeScoring()433 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeScoring() {
434   language_ = Lexer::Language::SCORING;
435   std::unique_ptr<Node> node;
436   if (current_token_ == lexer_tokens_.end()) {
437     return absl_ports::InvalidArgumentError("Got empty scoring expression!");
438   }
439   ICING_ASSIGN_OR_RETURN(node, ConsumeExpression());
440   if (current_token_ != lexer_tokens_.end()) {
441     return absl_ports::InvalidArgumentError(
442         "Error parsing the scoring expression. Must reach EOF after parsing "
443         "Expression!");
444   }
445   return node;
446 }
447 
448 }  // namespace lib
449 }  // namespace icing
450