• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/result/snippet-retriever.h"
16 
17 #include <algorithm>
18 #include <iterator>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <unordered_set>
24 #include <utility>
25 #include <vector>
26 
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/proto/document.pb.h"
31 #include "icing/proto/search.pb.h"
32 #include "icing/proto/term.pb.h"
33 #include "icing/query/query-terms.h"
34 #include "icing/schema/property-util.h"
35 #include "icing/schema/schema-store.h"
36 #include "icing/schema/section.h"
37 #include "icing/store/document-filter-data.h"
38 #include "icing/tokenization/language-segmenter.h"
39 #include "icing/tokenization/token.h"
40 #include "icing/tokenization/tokenizer-factory.h"
41 #include "icing/tokenization/tokenizer.h"
42 #include "icing/transform/normalizer.h"
43 #include "icing/util/character-iterator.h"
44 #include "icing/util/i18n-utils.h"
45 #include "icing/util/logging.h"
46 #include "icing/util/status-macros.h"
47 
48 namespace icing {
49 namespace lib {
50 
51 namespace {
52 
AddIndexToPath(int values_size,int index,const std::string & property_path)53 inline std::string AddIndexToPath(int values_size, int index,
54                                   const std::string& property_path) {
55   if (values_size == 1) {
56     return property_path;
57   }
58   return absl_ports::StrCat(
59       property_path, property_util::ConvertToPropertyExprIndexStr(index));
60 }
61 
62 // Returns a string of the normalized text of the input Token. Normalization
63 // is applied based on the Token's type.
NormalizeToken(const Normalizer & normalizer,const Token & token)64 std::string NormalizeToken(const Normalizer& normalizer, const Token& token) {
65   switch (token.type) {
66     case Token::Type::RFC822_NAME:
67       [[fallthrough]];
68     case Token::Type::RFC822_COMMENT:
69       [[fallthrough]];
70     case Token::Type::RFC822_LOCAL_ADDRESS:
71       [[fallthrough]];
72     case Token::Type::RFC822_HOST_ADDRESS:
73       [[fallthrough]];
74     case Token::Type::RFC822_ADDRESS:
75       [[fallthrough]];
76     case Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL:
77       [[fallthrough]];
78     case Token::Type::RFC822_ADDRESS_COMPONENT_HOST:
79       [[fallthrough]];
80     case Token::Type::RFC822_TOKEN:
81       [[fallthrough]];
82     case Token::Type::URL_SCHEME:
83       [[fallthrough]];
84     case Token::Type::URL_USERNAME:
85       [[fallthrough]];
86     case Token::Type::URL_PASSWORD:
87       [[fallthrough]];
88     case Token::Type::URL_HOST_COMMON_PART:
89       [[fallthrough]];
90     case Token::Type::URL_HOST_SIGNIFICANT_PART:
91       [[fallthrough]];
92     case Token::Type::URL_PORT:
93       [[fallthrough]];
94     case Token::Type::URL_PATH_PART:
95       [[fallthrough]];
96     case Token::Type::URL_QUERY:
97       [[fallthrough]];
98     case Token::Type::URL_REF:
99       [[fallthrough]];
100     case Token::Type::URL_SUFFIX:
101       [[fallthrough]];
102     case Token::Type::URL_SUFFIX_INNERMOST:
103       [[fallthrough]];
104     case Token::Type::REGULAR:
105       return normalizer.NormalizeTerm(token.text);
106     case Token::Type::VERBATIM:
107       return std::string(token.text);
108     case Token::Type::QUERY_EXCLUSION:
109       [[fallthrough]];
110     case Token::Type::QUERY_LEFT_PARENTHESES:
111       [[fallthrough]];
112     case Token::Type::QUERY_RIGHT_PARENTHESES:
113       [[fallthrough]];
114     case Token::Type::QUERY_OR:
115       [[fallthrough]];
116     case Token::Type::QUERY_PROPERTY:
117       [[fallthrough]];
118     case Token::Type::INVALID:
119       ICING_LOG(WARNING) << "Unable to normalize token of type: "
120                          << static_cast<int>(token.type);
121       return std::string(token.text);
122   }
123 }
124 
125 // Returns a CharacterIterator for token's text, advancing one past the last
126 // matching character from the query term.
FindMatchEnd(const Normalizer & normalizer,const Token & token,const std::string & match_query_term)127 CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
128                                const std::string& match_query_term) {
129   switch (token.type) {
130     case Token::Type::VERBATIM: {
131       // VERBATIM tokens are not normalized. This means the non-normalized
132       // matched query term must be either equal to or a prefix of the token's
133       // text. Therefore, the match must end at the end of the matched query
134       // term.
135       CharacterIterator verbatim_match_end =
136           CharacterIterator(token.text, 0, 0, 0);
137       verbatim_match_end.AdvanceToUtf8(match_query_term.length());
138       return verbatim_match_end;
139     }
140     case Token::Type::QUERY_EXCLUSION:
141       [[fallthrough]];
142     case Token::Type::QUERY_LEFT_PARENTHESES:
143       [[fallthrough]];
144     case Token::Type::QUERY_RIGHT_PARENTHESES:
145       [[fallthrough]];
146     case Token::Type::QUERY_OR:
147       [[fallthrough]];
148     case Token::Type::QUERY_PROPERTY:
149       [[fallthrough]];
150     case Token::Type::INVALID:
151       ICING_LOG(WARNING)
152           << "Unexpected Token type " << static_cast<int>(token.type)
153           << " found when finding match end of query term and token.";
154       [[fallthrough]];
155     case Token::Type::RFC822_NAME:
156       [[fallthrough]];
157     case Token::Type::RFC822_COMMENT:
158       [[fallthrough]];
159     case Token::Type::RFC822_LOCAL_ADDRESS:
160       [[fallthrough]];
161     case Token::Type::RFC822_HOST_ADDRESS:
162       [[fallthrough]];
163     case Token::Type::RFC822_ADDRESS:
164       [[fallthrough]];
165     case Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL:
166       [[fallthrough]];
167     case Token::Type::RFC822_ADDRESS_COMPONENT_HOST:
168       [[fallthrough]];
169     case Token::Type::RFC822_TOKEN:
170       [[fallthrough]];
171     case Token::Type::URL_SCHEME:
172       [[fallthrough]];
173     case Token::Type::URL_USERNAME:
174       [[fallthrough]];
175     case Token::Type::URL_PASSWORD:
176       [[fallthrough]];
177     case Token::Type::URL_HOST_COMMON_PART:
178       [[fallthrough]];
179     case Token::Type::URL_HOST_SIGNIFICANT_PART:
180       [[fallthrough]];
181     case Token::Type::URL_PORT:
182       [[fallthrough]];
183     case Token::Type::URL_QUERY:
184       [[fallthrough]];
185     case Token::Type::URL_PATH_PART:
186       [[fallthrough]];
187     case Token::Type::URL_REF:
188       [[fallthrough]];
189     case Token::Type::URL_SUFFIX:
190       [[fallthrough]];
191     case Token::Type::URL_SUFFIX_INNERMOST:
192       [[fallthrough]];
193     case Token::Type::REGULAR:
194       return normalizer.FindNormalizedMatchEndPosition(token.text,
195                                                        match_query_term);
196   }
197 }
198 
// Abstract interface used to decide whether a token matches any query term.
// Concrete implementations differ in match semantics (exact vs prefix).
class TokenMatcher {
 public:
  virtual ~TokenMatcher() = default;

  // Returns a CharacterIterator pointing just past the end of the substring in
  // token.text that matches a query term. Note that the utf* indices will be
  // in relation to token.text's start.
  //
  // If there is no match, then it will construct a CharacterIterator with all
  // of its indices set to -1.
  //
  // Ex. With an exact matcher, query terms=["foo","bar"] and token.text="bar",
  // Matches will return a CharacterIterator(u8:3, u16:3, u32:3).
  virtual CharacterIterator Matches(Token token) const = 0;
};
214 
215 class TokenMatcherExact : public TokenMatcher {
216  public:
TokenMatcherExact(const std::unordered_set<std::string> & unrestricted_query_terms,const std::unordered_set<std::string> & restricted_query_terms,const Normalizer & normalizer)217   explicit TokenMatcherExact(
218       const std::unordered_set<std::string>& unrestricted_query_terms,
219       const std::unordered_set<std::string>& restricted_query_terms,
220       const Normalizer& normalizer)
221       : unrestricted_query_terms_(unrestricted_query_terms),
222         restricted_query_terms_(restricted_query_terms),
223         normalizer_(normalizer) {}
224 
Matches(Token token) const225   CharacterIterator Matches(Token token) const override {
226     std::string s = NormalizeToken(normalizer_, token);
227     auto itr = unrestricted_query_terms_.find(s);
228     if (itr == unrestricted_query_terms_.end()) {
229       itr = restricted_query_terms_.find(s);
230     }
231     if (itr != unrestricted_query_terms_.end() &&
232         itr != restricted_query_terms_.end()) {
233       return FindMatchEnd(normalizer_, token, *itr);
234     }
235 
236     return CharacterIterator(token.text, -1, -1, -1);
237   }
238 
239  private:
240   const std::unordered_set<std::string>& unrestricted_query_terms_;
241   const std::unordered_set<std::string>& restricted_query_terms_;
242   const Normalizer& normalizer_;
243 };
244 
245 class TokenMatcherPrefix : public TokenMatcher {
246  public:
TokenMatcherPrefix(const std::unordered_set<std::string> & unrestricted_query_terms,const std::unordered_set<std::string> & restricted_query_terms,const Normalizer & normalizer)247   explicit TokenMatcherPrefix(
248       const std::unordered_set<std::string>& unrestricted_query_terms,
249       const std::unordered_set<std::string>& restricted_query_terms,
250       const Normalizer& normalizer)
251       : unrestricted_query_terms_(unrestricted_query_terms),
252         restricted_query_terms_(restricted_query_terms),
253         normalizer_(normalizer) {}
254 
Matches(Token token) const255   CharacterIterator Matches(Token token) const override {
256     std::string s = NormalizeToken(normalizer_, token);
257     for (const std::string& query_term : unrestricted_query_terms_) {
258       if (query_term.length() <= s.length() &&
259           s.compare(0, query_term.length(), query_term) == 0) {
260         return FindMatchEnd(normalizer_, token, query_term);
261       }
262     }
263     for (const std::string& query_term : restricted_query_terms_) {
264       if (query_term.length() <= s.length() &&
265           s.compare(0, query_term.length(), query_term) == 0) {
266         return FindMatchEnd(normalizer_, token, query_term);
267       }
268     }
269     return CharacterIterator(token.text, -1, -1, -1);
270   }
271 
272  private:
273   const std::unordered_set<std::string>& unrestricted_query_terms_;
274   const std::unordered_set<std::string>& restricted_query_terms_;
275   const Normalizer& normalizer_;
276 };
277 
CreateTokenMatcher(TermMatchType::Code match_type,const std::unordered_set<std::string> & unrestricted_query_terms,const std::unordered_set<std::string> & restricted_query_terms,const Normalizer & normalizer)278 libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> CreateTokenMatcher(
279     TermMatchType::Code match_type,
280     const std::unordered_set<std::string>& unrestricted_query_terms,
281     const std::unordered_set<std::string>& restricted_query_terms,
282     const Normalizer& normalizer) {
283   switch (match_type) {
284     case TermMatchType::EXACT_ONLY:
285       return std::make_unique<TokenMatcherExact>(
286           unrestricted_query_terms, restricted_query_terms, normalizer);
287     case TermMatchType::PREFIX:
288       return std::make_unique<TokenMatcherPrefix>(
289           unrestricted_query_terms, restricted_query_terms, normalizer);
290     case TermMatchType::UNKNOWN:
291       [[fallthrough]];
292     default:
293       return absl_ports::InvalidArgumentError("Invalid match type provided.");
294   }
295 }
296 
297 // Finds the start position of a valid token that is after
298 // window_start_min_exclusive_utf32
299 //
300 // Returns:
301 //   the position of the window start if successful
302 //   INTERNAL_ERROR - if a tokenizer error is encountered
DetermineWindowStart(const ResultSpecProto::SnippetSpecProto & snippet_spec,std::string_view value,int window_start_min_exclusive_utf32,Tokenizer::Iterator * iterator)303 libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart(
304     const ResultSpecProto::SnippetSpecProto& snippet_spec,
305     std::string_view value, int window_start_min_exclusive_utf32,
306     Tokenizer::Iterator* iterator) {
307   if (!iterator->ResetToTokenStartingAfter(window_start_min_exclusive_utf32)) {
308     return absl_ports::InternalError(
309         "Couldn't reset tokenizer to determine snippet window!");
310   }
311   return iterator->CalculateTokenStart();
312 }
313 
314 // Increments window_end_exclusive so long as the character at the position
315 // of window_end_exclusive is punctuation and does not exceed
316 // window_end_max_exclusive_utf32.
IncludeTrailingPunctuation(std::string_view value,CharacterIterator window_end_exclusive,int window_end_max_exclusive_utf32)317 CharacterIterator IncludeTrailingPunctuation(
318     std::string_view value, CharacterIterator window_end_exclusive,
319     int window_end_max_exclusive_utf32) {
320   size_t max_search_index = value.length() - 1;
321   while (window_end_exclusive.utf8_index() <= max_search_index &&
322          window_end_exclusive.utf32_index() < window_end_max_exclusive_utf32) {
323     int char_len = 0;
324     if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive.utf8_index(),
325                                      &char_len)) {
326       break;
327     }
328     // Expand window by char_len and check the next character.
329     window_end_exclusive.AdvanceToUtf32(window_end_exclusive.utf32_index() + 1);
330   }
331   return window_end_exclusive;
332 }
333 
334 // Finds the end position of a valid token that is before the
335 // window_end_max_exclusive_utf32.
336 //
337 // Returns:
338 //   the position of the window end if successful
339 //   INTERNAL_ERROR - if a tokenizer error is encountered
DetermineWindowEnd(const ResultSpecProto::SnippetSpecProto & snippet_spec,std::string_view value,int window_end_max_exclusive_utf32,Tokenizer::Iterator * iterator)340 libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd(
341     const ResultSpecProto::SnippetSpecProto& snippet_spec,
342     std::string_view value, int window_end_max_exclusive_utf32,
343     Tokenizer::Iterator* iterator) {
344   if (!iterator->ResetToTokenEndingBefore(window_end_max_exclusive_utf32)) {
345     return absl_ports::InternalError(
346         "Couldn't reset tokenizer to determine snippet window!");
347   }
348   ICING_ASSIGN_OR_RETURN(CharacterIterator end_exclusive,
349                          iterator->CalculateTokenEndExclusive());
350   return IncludeTrailingPunctuation(value, end_exclusive,
351                                     window_end_max_exclusive_utf32);
352 }
353 
// Bundles a section's property path with the specific string value currently
// being snippeted from that section. Both views borrow storage owned by the
// caller.
struct SectionData {
  std::string_view section_name;
  std::string_view section_subcontent;
};
358 
// Creates a snippet match proto for the match pointed to by the iterator,
// between start_itr and end_itr.
// Returns:
//   the SnippetMatchProto for this match if successful
//   INTERNAL_ERROR - if a tokenizer error is encountered and iterator is left
//     in an invalid state
//   ABORTED_ERROR - if an invalid utf-8 sequence is encountered
libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
    const ResultSpecProto::SnippetSpecProto& snippet_spec,
    const SectionData& value, Tokenizer::Iterator* iterator,
    const CharacterIterator& start_itr, const CharacterIterator& end_itr) {
  SnippetMatchProto snippet_match;
  // When finding boundaries, we have a few cases:
  //
  // Case 1:
  //   If we have an odd length match an odd length window, the window surrounds
  //   the match perfectly.
  //     match  = "bar" in "foo bar baz"
  //     window =              |---|
  //
  // Case 2:
  //   If we have an even length match with an even length window, the window
  //   surrounds the match perfectly.
  //     match  = "baar" in "foo baar baz"
  //     window =               |----|
  //
  // Case 3:
  //   If we have an odd length match with an even length window, we allocate
  //   that extra window byte to the beginning.
  //     match  = "bar" in "foo bar baz"
  //     window =             |----|
  //
  // Case 4:
  //   If we have an even length match with an odd length window, we allocate
  //   that extra window byte to the end.
  //     match  = "baar" in "foo baar baz"
  //     window =               |-----|
  //
  // We have to do +1/-1 below to get the math to match up.
  int match_pos_utf32 = start_itr.utf32_index();
  int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32;
  int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2;
  int window_start_min_exclusive_utf32 =
      (match_mid_utf32 - snippet_spec.max_window_utf32_length() / 2) - 1;
  int window_end_max_exclusive_utf32 =
      match_mid_utf32 + (snippet_spec.max_window_utf32_length() + 1) / 2;

  // Record the exact match position/length in both byte and utf16 units so
  // clients in either encoding can locate the match.
  snippet_match.set_exact_match_byte_position(start_itr.utf8_index());
  snippet_match.set_exact_match_utf16_position(start_itr.utf16_index());
  snippet_match.set_exact_match_byte_length(end_itr.utf8_index() -
                                            start_itr.utf8_index());
  snippet_match.set_exact_match_utf16_length(end_itr.utf16_index() -
                                             start_itr.utf16_index());

  // Only include windows if it'll at least include the matched text. Otherwise,
  // it'll just be an empty string anyways.
  if (snippet_spec.max_window_utf32_length() >= match_len_utf32) {
    // Find the beginning of the window.
    ICING_ASSIGN_OR_RETURN(
        CharacterIterator window_start,
        DetermineWindowStart(snippet_spec, value.section_subcontent,
                             window_start_min_exclusive_utf32, iterator));

    // Check. Did we get fewer characters than we requested? If so, then add it
    // on to the window_end.
    int extra_window_space =
        window_start.utf32_index() - 1 - window_start_min_exclusive_utf32;
    window_end_max_exclusive_utf32 += extra_window_space;

    // Find the end of the window.
    ICING_ASSIGN_OR_RETURN(
        CharacterIterator window_end,
        DetermineWindowEnd(snippet_spec, value.section_subcontent,
                           window_end_max_exclusive_utf32, iterator));

    // Check one more time. Did we get fewer characters than we requested? If
    // so, then see if we can push the start back again.
    extra_window_space =
        window_end_max_exclusive_utf32 - window_end.utf32_index();
    if (extra_window_space > 0) {
      window_start_min_exclusive_utf32 =
          window_start.utf32_index() - 1 - extra_window_space;
      ICING_ASSIGN_OR_RETURN(
          window_start,
          DetermineWindowStart(snippet_spec, value.section_subcontent,
                               window_start_min_exclusive_utf32, iterator));
    }

    snippet_match.set_window_byte_position(window_start.utf8_index());
    snippet_match.set_window_utf16_position(window_start.utf16_index());
    snippet_match.set_window_byte_length(window_end.utf8_index() -
                                         window_start.utf8_index());
    snippet_match.set_window_utf16_length(window_end.utf16_index() -
                                          window_start.utf16_index());

    // DetermineWindowStart/End may change the position of the iterator, but it
    // will be reset once the entire batch of tokens is checked.
  }

  return snippet_match;
}
460 
// Carries the snippet spec plus a mutable budget of how many more matches may
// still be added; snippeting helpers decrement max_matches_remaining as they
// emit entries.
struct MatchOptions {
  const ResultSpecProto::SnippetSpecProto& snippet_spec;
  int max_matches_remaining;
};
465 
466 // Retrieves snippets in the string values of current_property.
467 // Tokenizer is provided to tokenize string content and matcher is provided to
468 // indicate when a token matches content in the query.
469 //
470 // current_property is the property with the string values to snippet.
471 // property_path is the path in the document to current_property.
472 //
473 // MatchOptions holds the snippet spec and number of desired matches remaining.
474 // Each call to GetEntriesFromProperty will decrement max_matches_remaining
475 // by the number of entries that it adds to snippet_proto.
476 //
477 // The SnippetEntries found for matched content will be added to snippet_proto.
GetEntriesFromProperty(const PropertyProto * current_property,const std::string & property_path,const TokenMatcher * matcher,const Tokenizer * tokenizer,MatchOptions * match_options,SnippetProto * snippet_proto)478 void GetEntriesFromProperty(const PropertyProto* current_property,
479                             const std::string& property_path,
480                             const TokenMatcher* matcher,
481                             const Tokenizer* tokenizer,
482                             MatchOptions* match_options,
483                             SnippetProto* snippet_proto) {
484   // We're at the end. Let's check our values.
485   for (int i = 0; i < current_property->string_values_size(); ++i) {
486     SnippetProto::EntryProto snippet_entry;
487     snippet_entry.set_property_name(AddIndexToPath(
488         current_property->string_values_size(), /*index=*/i, property_path));
489     std::string_view value = current_property->string_values(i);
490     std::unique_ptr<Tokenizer::Iterator> iterator =
491         tokenizer->Tokenize(value).ValueOrDie();
492     // All iterators are moved through positions sequentially. Constructing them
493     // each time resets them to the beginning of the string. This means that,
494     // for t tokens and in a string of n chars, each MoveToUtf8 call from the
495     // beginning of the string is on average O(n/2), whereas calling MoveToUtf8
496     // from the token immediately prior to the desired one is O(n/t).
497     // Constructing each outside of the while-loop ensures that performance will
498     // be O(t * (n/t)) = O(n) rather than O(t * n / 2).
499     CharacterIterator start_itr(value);
500     CharacterIterator end_itr(value);
501     CharacterIterator reset_itr(value);
502     bool encountered_error = false;
503     while (iterator->Advance()) {
504       std::vector<Token> batch_tokens = iterator->GetTokens();
505       if (batch_tokens.empty()) {
506         continue;
507       }
508 
509       bool needs_reset = false;
510       reset_itr.MoveToUtf8(batch_tokens.at(0).text.begin() - value.begin());
511       start_itr = reset_itr;
512       end_itr = start_itr;
513       for (int i = 0; i < batch_tokens.size(); ++i) {
514         const Token& token = batch_tokens.at(i);
515         CharacterIterator submatch_end = matcher->Matches(token);
516         // If the token matched a query term, then submatch_end will point to an
517         // actual position within token.text.
518         if (submatch_end.utf8_index() == -1) {
519           continue;
520         }
521         // As snippet matching may move iterator around, we save a reset
522         // iterator so that we can reset to the initial iterator state, and
523         // continue Advancing in order in the next round.
524         if (!start_itr.MoveToUtf8(token.text.begin() - value.begin())) {
525           encountered_error = true;
526           break;
527         }
528         if (!end_itr.MoveToUtf8(token.text.end() - value.begin())) {
529           encountered_error = true;
530           break;
531         }
532         SectionData data = {property_path, value};
533         auto match_or = RetrieveMatch(match_options->snippet_spec, data,
534                                       iterator.get(), start_itr, end_itr);
535         if (!match_or.ok()) {
536           if (absl_ports::IsAborted(match_or.status())) {
537             // Only an aborted. We can't get this match, but we might be able
538             // to retrieve others. Just continue.
539             continue;
540           } else {
541             encountered_error = true;
542             break;
543           }
544         }
545         SnippetMatchProto match = std::move(match_or).ValueOrDie();
546         if (match.window_byte_length() > 0) {
547           needs_reset = true;
548         }
549         // submatch_end refers to a position *within* token.text.
550         // This, conveniently enough, means that index that submatch_end
551         // points to is the length of the submatch (because the submatch
552         // starts at 0 in token.text).
553         match.set_submatch_byte_length(submatch_end.utf8_index());
554         match.set_submatch_utf16_length(submatch_end.utf16_index());
555         // Add the values for the submatch.
556         snippet_entry.mutable_snippet_matches()->Add(std::move(match));
557 
558         if (--match_options->max_matches_remaining <= 0) {
559           *snippet_proto->add_entries() = std::move(snippet_entry);
560           return;
561         }
562       }
563 
564       if (encountered_error) {
565         break;
566       }
567 
568       // RetrieveMatch may call DetermineWindowStart/End if windowing is
569       // requested, which may change the position of the iterator. So, reset the
570       // iterator back to the original position. The first token of the token
571       // batch will be the token to reset to.
572       if (needs_reset) {
573         if (reset_itr.utf8_index() > 0) {
574           encountered_error =
575               !iterator->ResetToTokenStartingAfter(reset_itr.utf32_index() - 1);
576         } else {
577           encountered_error = !iterator->ResetToStart();
578         }
579       }
580       if (encountered_error) {
581         break;
582       }
583     }
584     if (!snippet_entry.snippet_matches().empty()) {
585       *snippet_proto->add_entries() = std::move(snippet_entry);
586     }
587   }
588 }
589 
590 // Retrieves snippets in document from content at section_path.
591 // Tokenizer is provided to tokenize string content and matcher is provided to
592 // indicate when a token matches content in the query.
593 //
594 // section_path_index refers to the current property that is held by document.
595 // current_path is equivalent to the first section_path_index values in
596 // section_path, but with value indices present.
597 //
598 // For example, suppose that a hit appeared somewhere in the "bcc.emailAddress".
599 // The arguments for RetrieveSnippetForSection might be
600 // {section_path=["bcc", "emailAddress"], section_path_index=0, current_path=""}
601 // on the first call and
602 // {section_path=["bcc", "emailAddress"], section_path_index=1,
603 // current_path="bcc[1]"} on the second recursive call.
604 //
605 // MatchOptions holds the snippet spec and number of desired matches remaining.
606 // Each call to RetrieveSnippetForSection will decrement max_matches_remaining
607 // by the number of entries that it adds to snippet_proto.
608 //
609 // The SnippetEntries found for matched content will be added to snippet_proto.
RetrieveSnippetForSection(const DocumentProto & document,const TokenMatcher * matcher,const Tokenizer * tokenizer,const std::vector<std::string_view> & section_path,int section_path_index,const std::string & current_path,MatchOptions * match_options,SnippetProto * snippet_proto)610 void RetrieveSnippetForSection(
611     const DocumentProto& document, const TokenMatcher* matcher,
612     const Tokenizer* tokenizer,
613     const std::vector<std::string_view>& section_path, int section_path_index,
614     const std::string& current_path, MatchOptions* match_options,
615     SnippetProto* snippet_proto) {
616   std::string_view next_property_name = section_path.at(section_path_index);
617   const PropertyProto* current_property =
618       property_util::GetPropertyProto(document, next_property_name);
619   if (current_property == nullptr) {
620     ICING_VLOG(1) << "No property " << next_property_name << " found at path "
621                   << current_path;
622     return;
623   }
624   std::string property_path = property_util::ConcatenatePropertyPathExpr(
625       current_path, next_property_name);
626   if (section_path_index == section_path.size() - 1) {
627     // We're at the end. Let's check our values.
628     GetEntriesFromProperty(current_property, property_path, matcher, tokenizer,
629                            match_options, snippet_proto);
630   } else {
631     // Still got more to go. Let's look through our subdocuments.
632     std::vector<SnippetProto::EntryProto> entries;
633     for (int i = 0; i < current_property->document_values_size(); ++i) {
634       std::string new_path = AddIndexToPath(
635           current_property->document_values_size(), /*index=*/i, property_path);
636       RetrieveSnippetForSection(current_property->document_values(i), matcher,
637                                 tokenizer, section_path, section_path_index + 1,
638                                 new_path, match_options, snippet_proto);
639       if (match_options->max_matches_remaining <= 0) {
640         break;
641       }
642     }
643   }
644 }
645 
646 }  // namespace
647 
// Factory for SnippetRetriever. Validates that every dependency is non-null
// before constructing; the retriever borrows (does not own) all three.
//
// Returns:
//   a SnippetRetriever on success
//   FAILED_PRECONDITION - if any argument is null
libtextclassifier3::StatusOr<std::unique_ptr<SnippetRetriever>>
SnippetRetriever::Create(const SchemaStore* schema_store,
                         const LanguageSegmenter* language_segmenter,
                         const Normalizer* normalizer) {
  ICING_RETURN_ERROR_IF_NULL(schema_store);
  ICING_RETURN_ERROR_IF_NULL(language_segmenter);
  ICING_RETURN_ERROR_IF_NULL(normalizer);

  // Raw `new` rather than std::make_unique — presumably the constructor is
  // not public; confirm against snippet-retriever.h.
  return std::unique_ptr<SnippetRetriever>(
      new SnippetRetriever(schema_store, language_segmenter, normalizer));
}
659 
// Walks every section flagged in section_id_mask and collects snippet entries
// for query-term matches in the document's content at those sections. Errors
// for an individual section (unknown metadata, tokenizer/matcher creation
// failure) cause that section to be skipped rather than failing the whole
// retrieval.
SnippetProto SnippetRetriever::RetrieveSnippet(
    const SectionRestrictQueryTermsMap& query_terms,
    TermMatchType::Code match_type,
    const ResultSpecProto::SnippetSpecProto& snippet_spec,
    const DocumentProto& document, SectionIdMask section_id_mask) const {
  SnippetProto snippet_proto;
  // Three-arg form: on error, returns the (empty) snippet_proto instead of a
  // status.
  ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
                         schema_store_.GetSchemaTypeId(document.schema()),
                         snippet_proto);
  const std::unordered_set<std::string> empty_set;
  // Terms keyed by "" apply to all sections (unrestricted).
  auto itr = query_terms.find("");
  const std::unordered_set<std::string>& unrestricted_set =
      (itr != query_terms.end()) ? itr->second : empty_set;
  while (section_id_mask != kSectionIdMaskNone) {
    // Lowest set bit is the next section id to process.
    SectionId section_id = __builtin_ctzll(section_id_mask);
    // Remove this section from the mask.
    section_id_mask &= ~(UINT64_C(1) << section_id);

    // Each section gets its own fresh match budget.
    MatchOptions match_options = {snippet_spec};
    match_options.max_matches_remaining =
        snippet_spec.num_matches_per_property();

    // Determine the section name and match type.
    auto section_metadata_or =
        schema_store_.GetSectionMetadata(type_id, section_id);
    if (!section_metadata_or.ok()) {
      continue;
    }
    const SectionMetadata* metadata = section_metadata_or.ValueOrDie();
    std::vector<std::string_view> section_path =
        property_util::SplitPropertyPathExpr(metadata->path);

    // Match type must be as restrictive as possible. Prefix matches for a
    // snippet should only be included if both the query is Prefix and the
    // section has prefixes enabled.
    TermMatchType::Code section_match_type = TermMatchType::EXACT_ONLY;
    if (match_type == TermMatchType::PREFIX &&
        metadata->term_match_type == TermMatchType::PREFIX) {
      section_match_type = TermMatchType::PREFIX;
    }

    // Terms restricted to this specific section path, if any.
    itr = query_terms.find(metadata->path);
    const std::unordered_set<std::string>& restricted_set =
        (itr != query_terms.end()) ? itr->second : empty_set;
    libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> matcher_or =
        CreateTokenMatcher(section_match_type, unrestricted_set, restricted_set,
                           normalizer_);
    if (!matcher_or.ok()) {
      continue;
    }
    std::unique_ptr<TokenMatcher> matcher = std::move(matcher_or).ValueOrDie();

    auto tokenizer_or = tokenizer_factory::CreateIndexingTokenizer(
        metadata->tokenizer, &language_segmenter_);
    if (!tokenizer_or.ok()) {
      // If we couldn't create the tokenizer properly, just skip this section.
      continue;
    }
    std::unique_ptr<Tokenizer> tokenizer = std::move(tokenizer_or).ValueOrDie();
    RetrieveSnippetForSection(
        document, matcher.get(), tokenizer.get(), section_path,
        /*section_path_index=*/0, "", &match_options, &snippet_proto);
  }
  return snippet_proto;
}
725 
726 }  // namespace lib
727 }  // namespace icing
728