1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_ 18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_ 19 20 21 #include <stddef.h> 22 23 #include <memory> 24 #include <string> 25 #include <utility> 26 #include <vector> 27 28 #include "lang_id/common/lite_base/macros.h" 29 #include "lang_id/model-provider.h" 30 31 namespace libtextclassifier3 { 32 namespace mobile { 33 namespace lang_id { 34 35 // Forward-declaration of the class that performs all underlying work. 36 class LangIdImpl; 37 38 struct LangIdResult { 39 // An n-best list of possible language codes for a given input sorted in 40 // descending order according to each code's respective probability. 41 // 42 // This list is guaranteed to be non-empty after calling 43 // LangId::FindLanguages. The most likely language code is always the first 44 // item in this array. 45 // 46 // If the model cannot make a prediction, this array contains a single result: 47 // a language code LangId::kUnknownLanguageCode with probability 1. 48 std::vector<std::pair<std::string, float>> predictions; 49 }; 50 51 // Class for detecting the language of a document. 52 // 53 // Note: this class does not handle the details of loading the actual model. 54 // Those details have been "outsourced" to the ModelProvider class. 55 // 56 // This class is thread safe. 57 class LangId { 58 public: 59 // Standard BCP-47 language code for Unknown/Undetermined language. 60 static const char kUnknownLanguageCode[]; 61 62 // Constructs a LangId object, based on |model_provider|. 63 // 64 // Note: we don't crash if we detect a problem at construction time (e.g., the 65 // model provider can't read an underlying file). Instead, we mark the 66 // newly-constructed object as invalid; clients can invoke FindLanguage() on 67 // an invalid object: nothing crashes, but accuracy will be bad. 68 explicit LangId(std::unique_ptr<ModelProvider> model_provider); 69 70 virtual ~LangId(); 71 72 // Computes the n-best list of language codes and probabilities corresponding 73 // to the most likely languages the given input text is written in. That list 74 // includes the most likely |max_results| languages and is sorted in 75 // descending order by language probability. 76 // 77 // The input text consists of the |num_bytes| bytes that starts at |data|. 78 // 79 // If max_results <= 0, we report probabilities for all languages known by 80 // this LangId object (as always, in decreasing order of their probabilities). 81 // 82 // Note: If this LangId object is not valid (see is_valid()) or if this LangId 83 // object can't make a prediction, this method sets the LangIdResult to 84 // contain a single entry with kUnknownLanguageCode with probability 1. 85 // 86 void FindLanguages(const char *data, size_t num_bytes, LangIdResult *result, 87 int max_results = 0) const; 88 89 // Convenience version of FindLanguages(const char *, size_t, LangIdResult *). 90 void FindLanguages(const std::string &text, LangIdResult *result, 91 int max_results = 0) const { 92 FindLanguages(text.data(), text.size(), result, max_results); 93 } 94 95 // Returns language code for the most likely language for a piece of text. 96 // 97 // The input text consists of the |num_bytes| bytes that start at |data|. 98 // 99 // Note: this method reports the most likely (1-best) language only if its 100 // probability is high enough; otherwise, it returns 101 // LangId::kUnknownLanguageCode. The specific probability threshold is tuned 102 // to the needs of an early client. If you need a different threshold, you 103 // can use FindLanguages (plural) to get the full LangIdResult, and apply your 104 // own threshold. 105 // 106 // Note: if this LangId object is not valid (see is_valid()) or if this LangId 107 // object can't make a prediction, then this method returns 108 // LangId::kUnknownLanguageCode. 109 // 110 std::string FindLanguage(const char *data, size_t num_bytes) const; 111 112 // Convenience version of FindLanguage(const char *, size_t). FindLanguage(const std::string & text)113 std::string FindLanguage(const std::string &text) const { 114 return FindLanguage(text.data(), text.size()); 115 } 116 117 // Returns true if this object has been correctly initialized and is ready to 118 // perform predictions. For more info, see doc for LangId 119 // constructor above. 120 bool is_valid() const; 121 122 // Returns the version of the model used by this LangId object. On success, 123 // the returned version number is a strictly positive integer. Returns 0 if 124 // the model version can not be determined (e.g., for old models that do not 125 // specify a version number). 126 int GetModelVersion() const; 127 128 // Returns a typed property stored in the model file. 129 float GetFloatProperty(const std::string &property, 130 float default_value) const; 131 132 private: 133 // Pimpl ("pointer to implementation") pattern, to hide all internals from our 134 // clients. 135 std::unique_ptr<LangIdImpl> pimpl_; 136 137 SAFTM_DISALLOW_COPY_AND_ASSIGN(LangId); 138 }; 139 140 } // namespace lang_id 141 } // namespace mobile 142 } // namespace nlp_saft 143 144 #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_ 145