1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "icing/tokenization/icu/icu-language-segmenter.h" 16 #include "icing/tokenization/language-segmenter-factory.h" 17 #include "icing/util/logging.h" 18 #include "unicode/uloc.h" 19 20 namespace icing { 21 namespace lib { 22 23 namespace language_segmenter_factory { 24 25 namespace { 26 constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX"; 27 } // namespace 28 29 // Creates a language segmenter with the given locale. 30 // 31 // Returns: 32 // A LanguageSegmenter on success 33 // INVALID_ARGUMENT if locale string is invalid 34 // 35 // TODO(b/156383798): Figure out if we want to verify locale strings and notify 36 // users. Right now illegal locale strings will be ignored by ICU. ICU 37 // components will be created with its default locale. Create(SegmenterOptions options)38libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create( 39 SegmenterOptions options) { 40 // Word connector rules for "en_US_POSIX" (American English (Computer)) are 41 // different from other locales. E.g. "email.subject" will be split into 3 42 // terms in "en_US_POSIX": "email", ".", and "subject", while it's just one 43 // term in other locales. Our current LanguageSegmenter doesn't handle this 44 // special rule, so we replace it with "en_US". 45 if (options.locale == kLocaleAmericanEnglishComputer) { 46 ICING_LOG(WARNING) << "Locale " << kLocaleAmericanEnglishComputer 47 << " not supported. Converting to locale " << ULOC_US; 48 options.locale = ULOC_US; 49 } 50 return std::make_unique<IcuLanguageSegmenter>(std::move(options.locale)); 51 } 52 53 } // namespace language_segmenter_factory 54 55 } // namespace lib 56 } // namespace icing 57