1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/result/snippet-retriever.h"
16
17 #include <algorithm>
18 #include <iterator>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <unordered_set>
24 #include <utility>
25 #include <vector>
26
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/proto/document.pb.h"
31 #include "icing/proto/search.pb.h"
32 #include "icing/proto/term.pb.h"
33 #include "icing/query/query-terms.h"
34 #include "icing/schema/property-util.h"
35 #include "icing/schema/schema-store.h"
36 #include "icing/schema/section.h"
37 #include "icing/store/document-filter-data.h"
38 #include "icing/tokenization/language-segmenter.h"
39 #include "icing/tokenization/token.h"
40 #include "icing/tokenization/tokenizer-factory.h"
41 #include "icing/tokenization/tokenizer.h"
42 #include "icing/transform/normalizer.h"
43 #include "icing/util/character-iterator.h"
44 #include "icing/util/i18n-utils.h"
45 #include "icing/util/logging.h"
46 #include "icing/util/status-macros.h"
47
48 namespace icing {
49 namespace lib {
50
51 namespace {
52
AddIndexToPath(int values_size,int index,const std::string & property_path)53 inline std::string AddIndexToPath(int values_size, int index,
54 const std::string& property_path) {
55 if (values_size == 1) {
56 return property_path;
57 }
58 return absl_ports::StrCat(
59 property_path, property_util::ConvertToPropertyExprIndexStr(index));
60 }
61
62 // Returns a string of the normalized text of the input Token. Normalization
63 // is applied based on the Token's type.
NormalizeToken(const Normalizer & normalizer,const Token & token)64 std::string NormalizeToken(const Normalizer& normalizer, const Token& token) {
65 switch (token.type) {
66 case Token::Type::RFC822_NAME:
67 [[fallthrough]];
68 case Token::Type::RFC822_COMMENT:
69 [[fallthrough]];
70 case Token::Type::RFC822_LOCAL_ADDRESS:
71 [[fallthrough]];
72 case Token::Type::RFC822_HOST_ADDRESS:
73 [[fallthrough]];
74 case Token::Type::RFC822_ADDRESS:
75 [[fallthrough]];
76 case Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL:
77 [[fallthrough]];
78 case Token::Type::RFC822_ADDRESS_COMPONENT_HOST:
79 [[fallthrough]];
80 case Token::Type::RFC822_TOKEN:
81 [[fallthrough]];
82 case Token::Type::URL_SCHEME:
83 [[fallthrough]];
84 case Token::Type::URL_USERNAME:
85 [[fallthrough]];
86 case Token::Type::URL_PASSWORD:
87 [[fallthrough]];
88 case Token::Type::URL_HOST_COMMON_PART:
89 [[fallthrough]];
90 case Token::Type::URL_HOST_SIGNIFICANT_PART:
91 [[fallthrough]];
92 case Token::Type::URL_PORT:
93 [[fallthrough]];
94 case Token::Type::URL_PATH_PART:
95 [[fallthrough]];
96 case Token::Type::URL_QUERY:
97 [[fallthrough]];
98 case Token::Type::URL_REF:
99 [[fallthrough]];
100 case Token::Type::URL_SUFFIX:
101 [[fallthrough]];
102 case Token::Type::URL_SUFFIX_INNERMOST:
103 [[fallthrough]];
104 case Token::Type::REGULAR:
105 return normalizer.NormalizeTerm(token.text);
106 case Token::Type::VERBATIM:
107 return std::string(token.text);
108 case Token::Type::QUERY_EXCLUSION:
109 [[fallthrough]];
110 case Token::Type::QUERY_LEFT_PARENTHESES:
111 [[fallthrough]];
112 case Token::Type::QUERY_RIGHT_PARENTHESES:
113 [[fallthrough]];
114 case Token::Type::QUERY_OR:
115 [[fallthrough]];
116 case Token::Type::QUERY_PROPERTY:
117 [[fallthrough]];
118 case Token::Type::INVALID:
119 ICING_LOG(WARNING) << "Unable to normalize token of type: "
120 << static_cast<int>(token.type);
121 return std::string(token.text);
122 }
123 }
124
125 // Returns a CharacterIterator for token's text, advancing one past the last
126 // matching character from the query term.
FindMatchEnd(const Normalizer & normalizer,const Token & token,const std::string & match_query_term)127 CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
128 const std::string& match_query_term) {
129 switch (token.type) {
130 case Token::Type::VERBATIM: {
131 // VERBATIM tokens are not normalized. This means the non-normalized
132 // matched query term must be either equal to or a prefix of the token's
133 // text. Therefore, the match must end at the end of the matched query
134 // term.
135 CharacterIterator verbatim_match_end =
136 CharacterIterator(token.text, 0, 0, 0);
137 verbatim_match_end.AdvanceToUtf8(match_query_term.length());
138 return verbatim_match_end;
139 }
140 case Token::Type::QUERY_EXCLUSION:
141 [[fallthrough]];
142 case Token::Type::QUERY_LEFT_PARENTHESES:
143 [[fallthrough]];
144 case Token::Type::QUERY_RIGHT_PARENTHESES:
145 [[fallthrough]];
146 case Token::Type::QUERY_OR:
147 [[fallthrough]];
148 case Token::Type::QUERY_PROPERTY:
149 [[fallthrough]];
150 case Token::Type::INVALID:
151 ICING_LOG(WARNING)
152 << "Unexpected Token type " << static_cast<int>(token.type)
153 << " found when finding match end of query term and token.";
154 [[fallthrough]];
155 case Token::Type::RFC822_NAME:
156 [[fallthrough]];
157 case Token::Type::RFC822_COMMENT:
158 [[fallthrough]];
159 case Token::Type::RFC822_LOCAL_ADDRESS:
160 [[fallthrough]];
161 case Token::Type::RFC822_HOST_ADDRESS:
162 [[fallthrough]];
163 case Token::Type::RFC822_ADDRESS:
164 [[fallthrough]];
165 case Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL:
166 [[fallthrough]];
167 case Token::Type::RFC822_ADDRESS_COMPONENT_HOST:
168 [[fallthrough]];
169 case Token::Type::RFC822_TOKEN:
170 [[fallthrough]];
171 case Token::Type::URL_SCHEME:
172 [[fallthrough]];
173 case Token::Type::URL_USERNAME:
174 [[fallthrough]];
175 case Token::Type::URL_PASSWORD:
176 [[fallthrough]];
177 case Token::Type::URL_HOST_COMMON_PART:
178 [[fallthrough]];
179 case Token::Type::URL_HOST_SIGNIFICANT_PART:
180 [[fallthrough]];
181 case Token::Type::URL_PORT:
182 [[fallthrough]];
183 case Token::Type::URL_QUERY:
184 [[fallthrough]];
185 case Token::Type::URL_PATH_PART:
186 [[fallthrough]];
187 case Token::Type::URL_REF:
188 [[fallthrough]];
189 case Token::Type::URL_SUFFIX:
190 [[fallthrough]];
191 case Token::Type::URL_SUFFIX_INNERMOST:
192 [[fallthrough]];
193 case Token::Type::REGULAR:
194 return normalizer.FindNormalizedMatchEndPosition(token.text,
195 match_query_term);
196 }
197 }
198
// Interface for determining whether a token's text matches any query term.
// Concrete implementations below provide exact and prefix matching.
class TokenMatcher {
 public:
  virtual ~TokenMatcher() = default;

  // Returns a CharacterIterator pointing just past the end of the substring in
  // token.text that matches a query term. Note that the utf* indices will be
  // in relation to token.text's start.
  //
  // If there is no match, then it will construct a CharacterIterator with all
  // of its indices set to -1.
  //
  // Ex. With an exact matcher, query terms=["foo","bar"] and token.text="bar",
  // Matches will return a CharacterIterator(u8:3, u16:3, u32:3).
  virtual CharacterIterator Matches(Token token) const = 0;
};
214
215 class TokenMatcherExact : public TokenMatcher {
216 public:
TokenMatcherExact(const std::unordered_set<std::string> & unrestricted_query_terms,const std::unordered_set<std::string> & restricted_query_terms,const Normalizer & normalizer)217 explicit TokenMatcherExact(
218 const std::unordered_set<std::string>& unrestricted_query_terms,
219 const std::unordered_set<std::string>& restricted_query_terms,
220 const Normalizer& normalizer)
221 : unrestricted_query_terms_(unrestricted_query_terms),
222 restricted_query_terms_(restricted_query_terms),
223 normalizer_(normalizer) {}
224
Matches(Token token) const225 CharacterIterator Matches(Token token) const override {
226 std::string s = NormalizeToken(normalizer_, token);
227 auto itr = unrestricted_query_terms_.find(s);
228 if (itr == unrestricted_query_terms_.end()) {
229 itr = restricted_query_terms_.find(s);
230 }
231 if (itr != unrestricted_query_terms_.end() &&
232 itr != restricted_query_terms_.end()) {
233 return FindMatchEnd(normalizer_, token, *itr);
234 }
235
236 return CharacterIterator(token.text, -1, -1, -1);
237 }
238
239 private:
240 const std::unordered_set<std::string>& unrestricted_query_terms_;
241 const std::unordered_set<std::string>& restricted_query_terms_;
242 const Normalizer& normalizer_;
243 };
244
245 class TokenMatcherPrefix : public TokenMatcher {
246 public:
TokenMatcherPrefix(const std::unordered_set<std::string> & unrestricted_query_terms,const std::unordered_set<std::string> & restricted_query_terms,const Normalizer & normalizer)247 explicit TokenMatcherPrefix(
248 const std::unordered_set<std::string>& unrestricted_query_terms,
249 const std::unordered_set<std::string>& restricted_query_terms,
250 const Normalizer& normalizer)
251 : unrestricted_query_terms_(unrestricted_query_terms),
252 restricted_query_terms_(restricted_query_terms),
253 normalizer_(normalizer) {}
254
Matches(Token token) const255 CharacterIterator Matches(Token token) const override {
256 std::string s = NormalizeToken(normalizer_, token);
257 for (const std::string& query_term : unrestricted_query_terms_) {
258 if (query_term.length() <= s.length() &&
259 s.compare(0, query_term.length(), query_term) == 0) {
260 return FindMatchEnd(normalizer_, token, query_term);
261 }
262 }
263 for (const std::string& query_term : restricted_query_terms_) {
264 if (query_term.length() <= s.length() &&
265 s.compare(0, query_term.length(), query_term) == 0) {
266 return FindMatchEnd(normalizer_, token, query_term);
267 }
268 }
269 return CharacterIterator(token.text, -1, -1, -1);
270 }
271
272 private:
273 const std::unordered_set<std::string>& unrestricted_query_terms_;
274 const std::unordered_set<std::string>& restricted_query_terms_;
275 const Normalizer& normalizer_;
276 };
277
CreateTokenMatcher(TermMatchType::Code match_type,const std::unordered_set<std::string> & unrestricted_query_terms,const std::unordered_set<std::string> & restricted_query_terms,const Normalizer & normalizer)278 libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> CreateTokenMatcher(
279 TermMatchType::Code match_type,
280 const std::unordered_set<std::string>& unrestricted_query_terms,
281 const std::unordered_set<std::string>& restricted_query_terms,
282 const Normalizer& normalizer) {
283 switch (match_type) {
284 case TermMatchType::EXACT_ONLY:
285 return std::make_unique<TokenMatcherExact>(
286 unrestricted_query_terms, restricted_query_terms, normalizer);
287 case TermMatchType::PREFIX:
288 return std::make_unique<TokenMatcherPrefix>(
289 unrestricted_query_terms, restricted_query_terms, normalizer);
290 case TermMatchType::UNKNOWN:
291 [[fallthrough]];
292 default:
293 return absl_ports::InvalidArgumentError("Invalid match type provided.");
294 }
295 }
296
297 // Finds the start position of a valid token that is after
298 // window_start_min_exclusive_utf32
299 //
300 // Returns:
301 // the position of the window start if successful
302 // INTERNAL_ERROR - if a tokenizer error is encountered
DetermineWindowStart(const ResultSpecProto::SnippetSpecProto & snippet_spec,std::string_view value,int window_start_min_exclusive_utf32,Tokenizer::Iterator * iterator)303 libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart(
304 const ResultSpecProto::SnippetSpecProto& snippet_spec,
305 std::string_view value, int window_start_min_exclusive_utf32,
306 Tokenizer::Iterator* iterator) {
307 if (!iterator->ResetToTokenStartingAfter(window_start_min_exclusive_utf32)) {
308 return absl_ports::InternalError(
309 "Couldn't reset tokenizer to determine snippet window!");
310 }
311 return iterator->CalculateTokenStart();
312 }
313
314 // Increments window_end_exclusive so long as the character at the position
315 // of window_end_exclusive is punctuation and does not exceed
316 // window_end_max_exclusive_utf32.
IncludeTrailingPunctuation(std::string_view value,CharacterIterator window_end_exclusive,int window_end_max_exclusive_utf32)317 CharacterIterator IncludeTrailingPunctuation(
318 std::string_view value, CharacterIterator window_end_exclusive,
319 int window_end_max_exclusive_utf32) {
320 size_t max_search_index = value.length() - 1;
321 while (window_end_exclusive.utf8_index() <= max_search_index &&
322 window_end_exclusive.utf32_index() < window_end_max_exclusive_utf32) {
323 int char_len = 0;
324 if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive.utf8_index(),
325 &char_len)) {
326 break;
327 }
328 // Expand window by char_len and check the next character.
329 window_end_exclusive.AdvanceToUtf32(window_end_exclusive.utf32_index() + 1);
330 }
331 return window_end_exclusive;
332 }
333
334 // Finds the end position of a valid token that is before the
335 // window_end_max_exclusive_utf32.
336 //
337 // Returns:
338 // the position of the window end if successful
339 // INTERNAL_ERROR - if a tokenizer error is encountered
DetermineWindowEnd(const ResultSpecProto::SnippetSpecProto & snippet_spec,std::string_view value,int window_end_max_exclusive_utf32,Tokenizer::Iterator * iterator)340 libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd(
341 const ResultSpecProto::SnippetSpecProto& snippet_spec,
342 std::string_view value, int window_end_max_exclusive_utf32,
343 Tokenizer::Iterator* iterator) {
344 if (!iterator->ResetToTokenEndingBefore(window_end_max_exclusive_utf32)) {
345 return absl_ports::InternalError(
346 "Couldn't reset tokenizer to determine snippet window!");
347 }
348 ICING_ASSIGN_OR_RETURN(CharacterIterator end_exclusive,
349 iterator->CalculateTokenEndExclusive());
350 return IncludeTrailingPunctuation(value, end_exclusive,
351 window_end_max_exclusive_utf32);
352 }
353
// The property path of a section and the string value currently being
// snippeted within it. Both views borrow storage owned by the caller.
struct SectionData {
  std::string_view section_name;
  std::string_view section_subcontent;
};
358
359 // Creates a snippet match proto for the match pointed to by the iterator,
360 // between start_itr and end_itr
361 // Returns:
362 // the position of the window start if successful
363 // INTERNAL_ERROR - if a tokenizer error is encountered and iterator is left
364 // in an invalid state
365 // ABORTED_ERROR - if an invalid utf-8 sequence is encountered
libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
    const ResultSpecProto::SnippetSpecProto& snippet_spec,
    const SectionData& value, Tokenizer::Iterator* iterator,
    const CharacterIterator& start_itr, const CharacterIterator& end_itr) {
  SnippetMatchProto snippet_match;
  // When finding boundaries, we have a few cases:
  //
  // Case 1:
  // If we have an odd length match and an odd length window, the window
  // surrounds the match perfectly.
  //   match  = "bar" in "foo bar baz"
  //   window =          |---|
  //
  // Case 2:
  // If we have an even length match with an even length window, the window
  // surrounds the match perfectly.
  //   match  = "baar" in "foo baar baz"
  //   window =          |----|
  //
  // Case 3:
  // If we have an odd length match with an even length window, we allocate
  // that extra window byte to the beginning.
  //   match  = "bar" in "foo bar baz"
  //   window =         |----|
  //
  // Case 4:
  // If we have an even length match with an odd length window, we allocate
  // that extra window byte to the end.
  //   match  = "baar" in "foo baar baz"
  //   window =          |-----|
  //
  // We have to do +1/-1 below to get the math to match up.
  int match_pos_utf32 = start_itr.utf32_index();
  int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32;
  int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2;
  // Exclusive window bounds centered on the middle of the match; the -1 /
  // (max+1)/2 asymmetry implements cases 3 and 4 above.
  int window_start_min_exclusive_utf32 =
      (match_mid_utf32 - snippet_spec.max_window_utf32_length() / 2) - 1;
  int window_end_max_exclusive_utf32 =
      match_mid_utf32 + (snippet_spec.max_window_utf32_length() + 1) / 2;

  // Record the exact match position/length in both utf-8 and utf-16 units.
  snippet_match.set_exact_match_byte_position(start_itr.utf8_index());
  snippet_match.set_exact_match_utf16_position(start_itr.utf16_index());
  snippet_match.set_exact_match_byte_length(end_itr.utf8_index() -
                                            start_itr.utf8_index());
  snippet_match.set_exact_match_utf16_length(end_itr.utf16_index() -
                                             start_itr.utf16_index());

  // Only include windows if it'll at least include the matched text. Otherwise,
  // it'll just be an empty string anyways.
  if (snippet_spec.max_window_utf32_length() >= match_len_utf32) {
    // Find the beginning of the window.
    ICING_ASSIGN_OR_RETURN(
        CharacterIterator window_start,
        DetermineWindowStart(snippet_spec, value.section_subcontent,
                             window_start_min_exclusive_utf32, iterator));

    // Check. Did we get fewer characters than we requested? If so, then add it
    // on to the window_end.
    int extra_window_space =
        window_start.utf32_index() - 1 - window_start_min_exclusive_utf32;
    window_end_max_exclusive_utf32 += extra_window_space;

    // Find the end of the window.
    ICING_ASSIGN_OR_RETURN(
        CharacterIterator window_end,
        DetermineWindowEnd(snippet_spec, value.section_subcontent,
                           window_end_max_exclusive_utf32, iterator));

    // Check one more time. Did we get fewer characters than we requested? If
    // so, then see if we can push the start back again.
    extra_window_space =
        window_end_max_exclusive_utf32 - window_end.utf32_index();
    if (extra_window_space > 0) {
      window_start_min_exclusive_utf32 =
          window_start.utf32_index() - 1 - extra_window_space;
      ICING_ASSIGN_OR_RETURN(
          window_start,
          DetermineWindowStart(snippet_spec, value.section_subcontent,
                               window_start_min_exclusive_utf32, iterator));
    }

    snippet_match.set_window_byte_position(window_start.utf8_index());
    snippet_match.set_window_utf16_position(window_start.utf16_index());
    snippet_match.set_window_byte_length(window_end.utf8_index() -
                                         window_start.utf8_index());
    snippet_match.set_window_utf16_length(window_end.utf16_index() -
                                          window_start.utf16_index());

    // DetermineWindowStart/End may change the position of the iterator, but it
    // will be reset once the entire batch of tokens is checked.
  }

  return snippet_match;
}
460
// Options threaded through the snippet-retrieval helpers: the caller-provided
// snippet spec plus a mutable budget of matches still to be collected.
struct MatchOptions {
  const ResultSpecProto::SnippetSpecProto& snippet_spec;
  // Decremented as matches are added; retrieval stops when it reaches zero.
  int max_matches_remaining;
};
465
466 // Retrieves snippets in the string values of current_property.
467 // Tokenizer is provided to tokenize string content and matcher is provided to
468 // indicate when a token matches content in the query.
469 //
470 // current_property is the property with the string values to snippet.
471 // property_path is the path in the document to current_property.
472 //
473 // MatchOptions holds the snippet spec and number of desired matches remaining.
474 // Each call to GetEntriesFromProperty will decrement max_matches_remaining
475 // by the number of entries that it adds to snippet_proto.
476 //
477 // The SnippetEntries found for matched content will be added to snippet_proto.
GetEntriesFromProperty(const PropertyProto * current_property,const std::string & property_path,const TokenMatcher * matcher,const Tokenizer * tokenizer,MatchOptions * match_options,SnippetProto * snippet_proto)478 void GetEntriesFromProperty(const PropertyProto* current_property,
479 const std::string& property_path,
480 const TokenMatcher* matcher,
481 const Tokenizer* tokenizer,
482 MatchOptions* match_options,
483 SnippetProto* snippet_proto) {
484 // We're at the end. Let's check our values.
485 for (int i = 0; i < current_property->string_values_size(); ++i) {
486 SnippetProto::EntryProto snippet_entry;
487 snippet_entry.set_property_name(AddIndexToPath(
488 current_property->string_values_size(), /*index=*/i, property_path));
489 std::string_view value = current_property->string_values(i);
490 std::unique_ptr<Tokenizer::Iterator> iterator =
491 tokenizer->Tokenize(value).ValueOrDie();
492 // All iterators are moved through positions sequentially. Constructing them
493 // each time resets them to the beginning of the string. This means that,
494 // for t tokens and in a string of n chars, each MoveToUtf8 call from the
495 // beginning of the string is on average O(n/2), whereas calling MoveToUtf8
496 // from the token immediately prior to the desired one is O(n/t).
497 // Constructing each outside of the while-loop ensures that performance will
498 // be O(t * (n/t)) = O(n) rather than O(t * n / 2).
499 CharacterIterator start_itr(value);
500 CharacterIterator end_itr(value);
501 CharacterIterator reset_itr(value);
502 bool encountered_error = false;
503 while (iterator->Advance()) {
504 std::vector<Token> batch_tokens = iterator->GetTokens();
505 if (batch_tokens.empty()) {
506 continue;
507 }
508
509 bool needs_reset = false;
510 reset_itr.MoveToUtf8(batch_tokens.at(0).text.begin() - value.begin());
511 start_itr = reset_itr;
512 end_itr = start_itr;
513 for (int i = 0; i < batch_tokens.size(); ++i) {
514 const Token& token = batch_tokens.at(i);
515 CharacterIterator submatch_end = matcher->Matches(token);
516 // If the token matched a query term, then submatch_end will point to an
517 // actual position within token.text.
518 if (submatch_end.utf8_index() == -1) {
519 continue;
520 }
521 // As snippet matching may move iterator around, we save a reset
522 // iterator so that we can reset to the initial iterator state, and
523 // continue Advancing in order in the next round.
524 if (!start_itr.MoveToUtf8(token.text.begin() - value.begin())) {
525 encountered_error = true;
526 break;
527 }
528 if (!end_itr.MoveToUtf8(token.text.end() - value.begin())) {
529 encountered_error = true;
530 break;
531 }
532 SectionData data = {property_path, value};
533 auto match_or = RetrieveMatch(match_options->snippet_spec, data,
534 iterator.get(), start_itr, end_itr);
535 if (!match_or.ok()) {
536 if (absl_ports::IsAborted(match_or.status())) {
537 // Only an aborted. We can't get this match, but we might be able
538 // to retrieve others. Just continue.
539 continue;
540 } else {
541 encountered_error = true;
542 break;
543 }
544 }
545 SnippetMatchProto match = std::move(match_or).ValueOrDie();
546 if (match.window_byte_length() > 0) {
547 needs_reset = true;
548 }
549 // submatch_end refers to a position *within* token.text.
550 // This, conveniently enough, means that index that submatch_end
551 // points to is the length of the submatch (because the submatch
552 // starts at 0 in token.text).
553 match.set_submatch_byte_length(submatch_end.utf8_index());
554 match.set_submatch_utf16_length(submatch_end.utf16_index());
555 // Add the values for the submatch.
556 snippet_entry.mutable_snippet_matches()->Add(std::move(match));
557
558 if (--match_options->max_matches_remaining <= 0) {
559 *snippet_proto->add_entries() = std::move(snippet_entry);
560 return;
561 }
562 }
563
564 if (encountered_error) {
565 break;
566 }
567
568 // RetrieveMatch may call DetermineWindowStart/End if windowing is
569 // requested, which may change the position of the iterator. So, reset the
570 // iterator back to the original position. The first token of the token
571 // batch will be the token to reset to.
572 if (needs_reset) {
573 if (reset_itr.utf8_index() > 0) {
574 encountered_error =
575 !iterator->ResetToTokenStartingAfter(reset_itr.utf32_index() - 1);
576 } else {
577 encountered_error = !iterator->ResetToStart();
578 }
579 }
580 if (encountered_error) {
581 break;
582 }
583 }
584 if (!snippet_entry.snippet_matches().empty()) {
585 *snippet_proto->add_entries() = std::move(snippet_entry);
586 }
587 }
588 }
589
590 // Retrieves snippets in document from content at section_path.
591 // Tokenizer is provided to tokenize string content and matcher is provided to
592 // indicate when a token matches content in the query.
593 //
594 // section_path_index refers to the current property that is held by document.
595 // current_path is equivalent to the first section_path_index values in
596 // section_path, but with value indices present.
597 //
598 // For example, suppose that a hit appeared somewhere in the "bcc.emailAddress".
599 // The arguments for RetrieveSnippetForSection might be
600 // {section_path=["bcc", "emailAddress"], section_path_index=0, current_path=""}
601 // on the first call and
602 // {section_path=["bcc", "emailAddress"], section_path_index=1,
603 // current_path="bcc[1]"} on the second recursive call.
604 //
605 // MatchOptions holds the snippet spec and number of desired matches remaining.
606 // Each call to RetrieveSnippetForSection will decrement max_matches_remaining
607 // by the number of entries that it adds to snippet_proto.
608 //
609 // The SnippetEntries found for matched content will be added to snippet_proto.
RetrieveSnippetForSection(const DocumentProto & document,const TokenMatcher * matcher,const Tokenizer * tokenizer,const std::vector<std::string_view> & section_path,int section_path_index,const std::string & current_path,MatchOptions * match_options,SnippetProto * snippet_proto)610 void RetrieveSnippetForSection(
611 const DocumentProto& document, const TokenMatcher* matcher,
612 const Tokenizer* tokenizer,
613 const std::vector<std::string_view>& section_path, int section_path_index,
614 const std::string& current_path, MatchOptions* match_options,
615 SnippetProto* snippet_proto) {
616 std::string_view next_property_name = section_path.at(section_path_index);
617 const PropertyProto* current_property =
618 property_util::GetPropertyProto(document, next_property_name);
619 if (current_property == nullptr) {
620 ICING_VLOG(1) << "No property " << next_property_name << " found at path "
621 << current_path;
622 return;
623 }
624 std::string property_path = property_util::ConcatenatePropertyPathExpr(
625 current_path, next_property_name);
626 if (section_path_index == section_path.size() - 1) {
627 // We're at the end. Let's check our values.
628 GetEntriesFromProperty(current_property, property_path, matcher, tokenizer,
629 match_options, snippet_proto);
630 } else {
631 // Still got more to go. Let's look through our subdocuments.
632 std::vector<SnippetProto::EntryProto> entries;
633 for (int i = 0; i < current_property->document_values_size(); ++i) {
634 std::string new_path = AddIndexToPath(
635 current_property->document_values_size(), /*index=*/i, property_path);
636 RetrieveSnippetForSection(current_property->document_values(i), matcher,
637 tokenizer, section_path, section_path_index + 1,
638 new_path, match_options, snippet_proto);
639 if (match_options->max_matches_remaining <= 0) {
640 break;
641 }
642 }
643 }
644 }
645
646 } // namespace
647
648 libtextclassifier3::StatusOr<std::unique_ptr<SnippetRetriever>>
libtextclassifier3::StatusOr<std::unique_ptr<SnippetRetriever>>
SnippetRetriever::Create(const SchemaStore* schema_store,
                         const LanguageSegmenter* language_segmenter,
                         const Normalizer* normalizer) {
  // All three collaborators are required; fail fast on null.
  ICING_RETURN_ERROR_IF_NULL(schema_store);
  ICING_RETURN_ERROR_IF_NULL(language_segmenter);
  ICING_RETURN_ERROR_IF_NULL(normalizer);

  // NOTE(review): raw `new` instead of std::make_unique — presumably because
  // the SnippetRetriever constructor is not public; confirm in the header.
  return std::unique_ptr<SnippetRetriever>(
      new SnippetRetriever(schema_store, language_segmenter, normalizer));
}
659
SnippetProto SnippetRetriever::RetrieveSnippet(
    const SectionRestrictQueryTermsMap& query_terms,
    TermMatchType::Code match_type,
    const ResultSpecProto::SnippetSpecProto& snippet_spec,
    const DocumentProto& document, SectionIdMask section_id_mask) const {
  SnippetProto snippet_proto;
  // Three-arg form: on lookup failure, return the (still empty) snippet_proto.
  ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
                         schema_store_.GetSchemaTypeId(document.schema()),
                         snippet_proto);
  // Terms keyed by "" apply to every section (no section restriction).
  const std::unordered_set<std::string> empty_set;
  auto itr = query_terms.find("");
  const std::unordered_set<std::string>& unrestricted_set =
      (itr != query_terms.end()) ? itr->second : empty_set;
  // Process each section whose bit is set in the mask, lowest bit first.
  while (section_id_mask != kSectionIdMaskNone) {
    SectionId section_id = __builtin_ctzll(section_id_mask);
    // Remove this section from the mask.
    section_id_mask &= ~(UINT64_C(1) << section_id);

    MatchOptions match_options = {snippet_spec};
    match_options.max_matches_remaining =
        snippet_spec.num_matches_per_property();

    // Determine the section name and match type.
    auto section_metadata_or =
        schema_store_.GetSectionMetadata(type_id, section_id);
    if (!section_metadata_or.ok()) {
      // Unknown section for this schema type; skip it.
      continue;
    }
    const SectionMetadata* metadata = section_metadata_or.ValueOrDie();
    std::vector<std::string_view> section_path =
        property_util::SplitPropertyPathExpr(metadata->path);

    // Match type must be as restrictive as possible. Prefix matches for a
    // snippet should only be included if both the query is Prefix and the
    // section has prefixes enabled.
    TermMatchType::Code section_match_type = TermMatchType::EXACT_ONLY;
    if (match_type == TermMatchType::PREFIX &&
        metadata->term_match_type == TermMatchType::PREFIX) {
      section_match_type = TermMatchType::PREFIX;
    }

    // Combine the unrestricted terms with any terms restricted to this
    // section's property path.
    itr = query_terms.find(metadata->path);
    const std::unordered_set<std::string>& restricted_set =
        (itr != query_terms.end()) ? itr->second : empty_set;
    libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> matcher_or =
        CreateTokenMatcher(section_match_type, unrestricted_set, restricted_set,
                           normalizer_);
    if (!matcher_or.ok()) {
      continue;
    }
    std::unique_ptr<TokenMatcher> matcher = std::move(matcher_or).ValueOrDie();

    auto tokenizer_or = tokenizer_factory::CreateIndexingTokenizer(
        metadata->tokenizer, &language_segmenter_);
    if (!tokenizer_or.ok()) {
      // If we couldn't create the tokenizer properly, just skip this section.
      continue;
    }
    std::unique_ptr<Tokenizer> tokenizer = std::move(tokenizer_or).ValueOrDie();
    RetrieveSnippetForSection(
        document, matcher.get(), tokenizer.get(), section_path,
        /*section_path_index=*/0, "", &match_options, &snippet_proto);
  }
  return snippet_proto;
}
725
726 } // namespace lib
727 } // namespace icing
728