• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/tokenization/icu/icu-language-segmenter.h"
16 #include "icing/tokenization/language-segmenter-factory.h"
17 #include "icing/util/logging.h"
18 #include "unicode/uloc.h"
19 
20 namespace icing {
21 namespace lib {
22 
23 namespace language_segmenter_factory {
24 
namespace {
// ICU locale identifier for "American English (Computer)". Create() below
// compares the requested locale against this value and swaps it for ULOC_US.
constexpr std::string_view kLocaleAmericanEnglishComputer{"en_US_POSIX"};
}  // namespace
28 
29 // Creates a language segmenter with the given locale.
30 //
31 // Returns:
32 //   A LanguageSegmenter on success
33 //   INVALID_ARGUMENT if locale string is invalid
34 //
35 // TODO(b/156383798): Figure out if we want to verify locale strings and notify
36 // users. Right now illegal locale strings will be ignored by ICU. ICU
37 // components will be created with its default locale.
Create(SegmenterOptions options)38 libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
39     SegmenterOptions options) {
40   // Word connector rules for "en_US_POSIX" (American English (Computer)) are
41   // different from other locales. E.g. "email.subject" will be split into 3
42   // terms in "en_US_POSIX": "email", ".", and "subject", while it's just one
43   // term in other locales. Our current LanguageSegmenter doesn't handle this
44   // special rule, so we replace it with "en_US".
45   if (options.locale == kLocaleAmericanEnglishComputer) {
46     ICING_LOG(WARNING) << "Locale " << kLocaleAmericanEnglishComputer
47                        << " not supported. Converting to locale " << ULOC_US;
48     options.locale = ULOC_US;
49   }
50   return std::make_unique<IcuLanguageSegmenter>(std::move(options.locale));
51 }
52 
53 }  // namespace language_segmenter_factory
54 
55 }  // namespace lib
56 }  // namespace icing
57