• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_
16 #define ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <vector>
23 
24 #include "icing/text_classifier/lib3/utils/base/statusor.h"
25 #include "icing/absl_ports/mutex.h"
26 #include "icing/tokenization/language-segmenter.h"
27 #include "unicode/ubrk.h"
28 
29 namespace icing {
30 namespace lib {
31 
32 // This class is used to segment sentences into words based on rules
33 // (https://unicode.org/reports/tr29/#Word_Boundaries) and language
34 // understanding. Based on the basic segmentation done by UBreakIterator,
35 // some extra rules are applied in this class:
36 //
37 // 1. All ASCII terms will be returned.
38 // 2. For non-ASCII terms, only the alphabetic terms are returned, which means
39 //    non-ASCII punctuation and special characters are left out.
40 // 3. Multiple continuous whitespaces are treated as one.
41 //
42 // The rules above are common to the high-level tokenizers that might use this
43 // class. Other special tokenization logic will be in each tokenizer.
44 class IcuLanguageSegmenter : public LanguageSegmenter {
45  public:
46   static libtextclassifier3::StatusOr<std::unique_ptr<IcuLanguageSegmenter>>
47   Create(std::string&& locale);
48 
~IcuLanguageSegmenter()49   ~IcuLanguageSegmenter() override {
50     if (cached_break_iterator_ != nullptr) {
51       ubrk_close(cached_break_iterator_);
52     }
53   }
54 
55   IcuLanguageSegmenter(const IcuLanguageSegmenter&) = delete;
56   IcuLanguageSegmenter& operator=(const IcuLanguageSegmenter&) = delete;
57 
58   // The segmentation depends on the language detected in the input text.
59   //
60   // Note: It could happen that the language detected from text is wrong, then
61   // there would be a small chance that the text is segmented incorrectly.
62   //
63   // Returns:
64   //   An iterator of terms on success
65   //   INTERNAL_ERROR if any error occurs
66   libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
67   Segment(std::string_view text) const override;
68 
69   // The segmentation depends on the language detected in the input text.
70   //
71   // Note: It could happen that the language detected from text is wrong, then
72   // there would be a small chance that the text is segmented incorrectly.
73   //
74   // Returns:
75   //   A list of terms on success
76   //   INTERNAL_ERROR if any error occurs
77   libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
78       std::string_view text) const override;
79 
80  private:
81   // Declared a friend so that it can call AcceptBreakIterator.
82   friend class IcuLanguageSegmenterIterator;
83 
IcuLanguageSegmenter(std::string && locale,UBreakIterator * iterator)84   explicit IcuLanguageSegmenter(std::string&& locale, UBreakIterator* iterator)
85       : locale_(std::move(locale)), cached_break_iterator_(iterator) {}
86 
87   // Returns a UBreakIterator that the caller owns.
88   // If cached_break_iterator_ is non-null, transfers ownership to caller and
89   // sets cached_break_iterator_ to null.
90   // If cached_break_iterator is null, creates a new UBreakIterator and
91   // transfers ownership to caller.
92   UBreakIterator* ProduceBreakIterator() const;
93 
94   // Caller transfers ownership of itr to IcuLanguageSegmenter.
95   // If cached_break_iterator_ is null, itr becomes the cached_break_iterator_
96   // If cached_break_iterator_ is non-null, then itr will be closed.
97   void ReturnBreakIterator(UBreakIterator* itr) const;
98 
99   // Used to help segment text
100   const std::string locale_;
101 
102   // The underlying class that does the segmentation, ubrk_close() must be
103   // called after using.
104   mutable UBreakIterator* cached_break_iterator_ ICING_GUARDED_BY(mutex_);
105 
106   mutable absl_ports::shared_mutex mutex_;
107 };
108 
109 }  // namespace lib
110 }  // namespace icing
111 
112 #endif  // ICING_TOKENIZATION_ICU_ICU_LANGUAGE_SEGMENTER_H_
113