• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_
16 #define ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <vector>
23 
24 #include "icing/text_classifier/lib3/utils/base/statusor.h"
25 #include "icing/tokenization/language-segmenter.h"
26 
27 namespace icing {
28 namespace lib {
29 
30 // This class is used to segment sentences into words based on rules
31 // (https://unicode.org/reports/tr29/#Word_Boundaries) and language
32 // understanding. Based on the basic segmentation done by UBreakIterator,
33 // some extra rules are applied in this class:
34 //
35 // 1. All ASCII terms will be returned.
36 // 2. For non-ASCII terms, only the alphabetic terms are returned, which means
37 //    non-ASCII punctuation and special characters are left out.
38 // 3. Multiple consecutive whitespace characters are treated as one.
39 //
40 // The rules above are common to the high-level tokenizers that might use this
41 // class. Other special tokenization logic will be in each tokenizer.
42 class IcuLanguageSegmenter : public LanguageSegmenter {
43  public:
     // Creates a segmenter for the given locale string.
     //
     // NOTE(review): `locale` is presumably an ICU locale identifier that ends
     // up stored in locale_; the constructor definition is not in this header,
     // so confirm against the .cc file.
44   explicit IcuLanguageSegmenter(std::string locale);
45 
     // Copying is disabled. Because the copy operations are user-declared, no
     // implicit move operations are generated either.
46   IcuLanguageSegmenter(const IcuLanguageSegmenter&) = delete;
47   IcuLanguageSegmenter& operator=(const IcuLanguageSegmenter&) = delete;
48 
49   // The segmentation depends on the language detected in the input text.
50   //
51   // Note: If the language detected from the text is wrong, there is a small
52   // chance that the text is segmented incorrectly.
53   //
     // NOTE(review): `text` is a non-owning view; the returned iterator
     // presumably refers back to it, so callers likely need to keep the
     // underlying buffer alive while iterating - confirm.
     //
54   // Returns:
55   //   An iterator of terms on success
56   //   INTERNAL_ERROR if any error occurs
57   libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
58   Segment(std::string_view text) const override;
59 
60   // Segments the whole input and returns every term at once. The segmentation
61   // depends on the language detected in the input text.
62   //
63   // Note: If the language detected from the text is wrong, there is a small
64   // chance that the text is segmented incorrectly.
     //
     // NOTE(review): the returned elements are std::string_view, i.e.
     // non-owning; they presumably point into `text`, so the underlying buffer
     // likely has to outlive the returned vector - confirm.
     //
65   // Returns:
66   //   A list of terms on success
67   //   INTERNAL_ERROR if any error occurs
68   libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
69       std::string_view text) const override;
70 
71  private:
72   // Used to help segment text. Declared const, so it cannot change after
     // construction.
73   const std::string locale_;
74 };
75 
76 }  // namespace lib
77 }  // namespace icing
78 
79 #endif  // ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_
80