1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_ 16 #define ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <vector> 23 24 #include "icing/text_classifier/lib3/utils/base/statusor.h" 25 #include "icing/tokenization/language-segmenter.h" 26 27 namespace icing { 28 namespace lib { 29 30 // This class is used to segment sentences into words based on rules 31 // (https://unicode.org/reports/tr29/#Word_Boundaries) and language 32 // understanding. Based on the basic segmentation done by UBreakIterator, 33 // some extra rules are applied in this class: 34 // 35 // 1. All ASCII terms will be returned. 36 // 2. For non-ASCII terms, only the alphabetic terms are returned, which means 37 // non-ASCII punctuation and special characters are left out. 38 // 3. Multiple continuous whitespaces are treated as one. 39 // 40 // The rules above are common to the high-level tokenizers that might use this 41 // class. Other special tokenization logic will be in each tokenizer. 42 class IcuLanguageSegmenter : public LanguageSegmenter { 43 public: 44 explicit IcuLanguageSegmenter(std::string locale); 45 46 IcuLanguageSegmenter(const IcuLanguageSegmenter&) = delete; 47 IcuLanguageSegmenter& operator=(const IcuLanguageSegmenter&) = delete; 48 49 // The segmentation depends on the language detected in the input text. 50 // 51 // Note: It could happen that the language detected from text is wrong, then 52 // there would be a small chance that the text is segmented incorrectly. 53 // 54 // Returns: 55 // An iterator of terms on success 56 // INTERNAL_ERROR if any error occurs 57 libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> 58 Segment(std::string_view text) const override; 59 60 // The segmentation depends on the language detected in the input text. 61 // 62 // Note: It could happen that the language detected from text is wrong, then 63 // there would be a small chance that the text is segmented incorrectly. 64 // 65 // Returns: 66 // A list of terms on success 67 // INTERNAL_ERROR if any error occurs 68 libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms( 69 std::string_view text) const override; 70 71 private: 72 // Used to help segment text 73 const std::string locale_; 74 }; 75 76 } // namespace lib 77 } // namespace icing 78 79 #endif // ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_ 80