1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_ 16 #define ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <vector> 23 24 #include "icing/text_classifier/lib3/utils/base/statusor.h" 25 #include "icing/absl_ports/mutex.h" 26 #include "icing/tokenization/language-segmenter.h" 27 #include "unicode/ubrk.h" 28 29 namespace icing { 30 namespace lib { 31 32 // This class is used to segment sentences into words based on rules 33 // (https://unicode.org/reports/tr29/#Word_Boundaries) and language 34 // understanding. Based on the basic segmentation done by UBreakIterator, 35 // some extra rules are applied in this class: 36 // 37 // 1. All ASCII terms will be returned. 38 // 2. For non-ASCII terms, only the alphabetic terms are returned, which means 39 // non-ASCII punctuation and special characters are left out. 40 // 3. Multiple continuous whitespaces are treated as one. 41 // 42 // The rules above are common to the high-level tokenizers that might use this 43 // class. Other special tokenization logic will be in each tokenizer. 44 class IcuLanguageSegmenter : public LanguageSegmenter { 45 public: 46 static libtextclassifier3::StatusOr<std::unique_ptr<IcuLanguageSegmenter>> 47 Create(std::string&& locale); 48 ~IcuLanguageSegmenter()49 ~IcuLanguageSegmenter() override { 50 if (cached_break_iterator_ != nullptr) { 51 ubrk_close(cached_break_iterator_); 52 } 53 } 54 55 IcuLanguageSegmenter(const IcuLanguageSegmenter&) = delete; 56 IcuLanguageSegmenter& operator=(const IcuLanguageSegmenter&) = delete; 57 58 // The segmentation depends on the language detected in the input text. 59 // 60 // Note: It could happen that the language detected from text is wrong, then 61 // there would be a small chance that the text is segmented incorrectly. 62 // 63 // Returns: 64 // An iterator of terms on success 65 // INTERNAL_ERROR if any error occurs 66 libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> 67 Segment(std::string_view text) const override; 68 69 // The segmentation depends on the language detected in the input text. 70 // 71 // Note: It could happen that the language detected from text is wrong, then 72 // there would be a small chance that the text is segmented incorrectly. 73 // 74 // Returns: 75 // A list of terms on success 76 // INTERNAL_ERROR if any error occurs 77 libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms( 78 std::string_view text) const override; 79 80 private: 81 // Declared a friend so that it can call AcceptBreakIterator. 82 friend class IcuLanguageSegmenterIterator; 83 IcuLanguageSegmenter(std::string && locale,UBreakIterator * iterator)84 explicit IcuLanguageSegmenter(std::string&& locale, UBreakIterator* iterator) 85 : locale_(std::move(locale)), cached_break_iterator_(iterator) {} 86 87 // Returns a UBreakIterator that the caller owns. 88 // If cached_break_iterator_ is non-null, transfers ownership to caller and 89 // sets cached_break_iterator_ to null. 90 // If cached_break_iterator is null, creates a new UBreakIterator and 91 // transfers ownership to caller. 92 UBreakIterator* ProduceBreakIterator() const; 93 94 // Caller transfers ownership of itr to IcuLanguageSegmenter. 95 // If cached_break_iterator_ is null, itr becomes the cached_break_iterator_ 96 // If cached_break_iterator_ is non-null, then itr will be closed. 97 void ReturnBreakIterator(UBreakIterator* itr) const; 98 99 // Used to help segment text 100 const std::string locale_; 101 102 // The underlying class that does the segmentation, ubrk_close() must be 103 // called after using. 104 mutable UBreakIterator* cached_break_iterator_ ICING_GUARDED_BY(mutex_); 105 106 mutable absl_ports::shared_mutex mutex_; 107 }; 108 109 } // namespace lib 110 } // namespace icing 111 112 #endif // ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_ 113