1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_RELEVANT_SCRIPT_FEATURE_H_ 18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_RELEVANT_SCRIPT_FEATURE_H_ 19 20 #include <memory> 21 22 #include "lang_id/common/fel/feature-extractor.h" 23 #include "lang_id/common/fel/task-context.h" 24 #include "lang_id/common/fel/workspace.h" 25 #include "lang_id/features/light-sentence-features.h" 26 #include "lang_id/light-sentence.h" 27 #include "lang_id/script/script-detector.h" 28 29 namespace libtextclassifier3 { 30 namespace mobile { 31 namespace lang_id { 32 33 // Given a sentence, generates one FloatFeatureValue for each "relevant" Unicode 34 // script (see below): each such feature indicates the script and the ratio of 35 // UTF8 characters in that script, in the given sentence. 36 // 37 // What is a relevant script? Recognizing all 100+ Unicode scripts would 38 // require too much code size and runtime. Instead, we focus only on a few 39 // scripts that communicate a lot of language information: e.g., the use of 40 // Hiragana characters almost always indicates Japanese, so Hiragana is a 41 // "relevant" script for us. The Latin script is used by dozens of languages, so 42 // Latin is not relevant in this context. 
43 class RelevantScriptFeature : public LightSentenceFeature { 44 public: 45 bool Setup(TaskContext *context) override; 46 bool Init(TaskContext *context) override; 47 48 // Appends the features computed from the sentence to the feature vector. 49 void Evaluate(const WorkspaceSet &workspaces, 50 const LightSentence &sentence, 51 FeatureVector *result) const override; 52 53 SAFTM_DEFINE_REGISTRATION_METHOD("continuous-bag-of-relevant-scripts", 54 RelevantScriptFeature); 55 56 private: 57 // Detects script of individual UTF8 characters. 58 std::unique_ptr<ScriptDetector> script_detector_; 59 60 // Current model supports scripts in [0, num_supported_scripts_). 61 int num_supported_scripts_ = 0; 62 }; 63 64 } // namespace lang_id 65 } // namespace mobile 66 } // namespace nlp_saft 67 68 #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_RELEVANT_SCRIPT_FEATURE_H_ 69