1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_ 18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_ 19 20 #include <mutex> // NOLINT: see comments for state_mutex_ 21 #include <string> 22 #include <vector> 23 24 #include "lang_id/common/fel/feature-extractor.h" 25 #include "lang_id/common/fel/task-context.h" 26 #include "lang_id/common/fel/workspace.h" 27 #include "lang_id/features/light-sentence-features.h" 28 #include "lang_id/light-sentence.h" 29 30 // TODO(abakalov): Add a test. 31 namespace libtextclassifier3 { 32 namespace mobile { 33 namespace lang_id { 34 35 // Class for computing continuous char ngram features. 36 // 37 // Feature function descriptor parameters: 38 // include_terminators(bool, false): 39 // If 'true', then splits the text based on spaces to get tokens, adds "^" 40 // to the beginning of each token, and adds "$" to the end of each token. 41 // NOTE: currently, we support only include_terminators=true. 42 // include_spaces(bool, false): 43 // If 'true', then includes char ngrams containing spaces. 44 // NOTE: currently, we support only include_spaces=false. 45 // use_equal_weight(bool, false): 46 // If 'true', then weighs each unique ngram by 1.0 / (number of unique 47 // ngrams in the input). Otherwise, weighs each unique ngram by (ngram 48 // count) / (total number of ngrams). 49 // NOTE: currently, we support only use_equal_weight=false. 50 // id_dim(int, 10000): 51 // The integer id of each char ngram is computed as follows: 52 // Hash32WithDefault(char ngram) % id_dim. 53 // size(int, 3): 54 // Only ngrams of this size will be extracted. 55 // 56 // NOTE: this class is not thread-safe. TODO(salcianu): make it thread-safe. 57 class ContinuousBagOfNgramsFunction : public LightSentenceFeature { 58 public: 59 bool Setup(TaskContext *context) override; 60 bool Init(TaskContext *context) override; 61 62 // Appends the features computed from the sentence to the feature vector. 63 void Evaluate(const WorkspaceSet &workspaces, const LightSentence &sentence, 64 FeatureVector *result) const override; 65 66 SAFTM_DEFINE_REGISTRATION_METHOD("continuous-bag-of-ngrams", 67 ContinuousBagOfNgramsFunction); 68 69 private: 70 // Auxiliary for Evaluate(). Fills counts_ and non_zero_count_indices_ (see 71 // below), and returns the total ngram count. 72 int ComputeNgramCounts(const LightSentence &sentence) const; 73 74 // Guards counts_ and non_zero_count_indices_. NOTE: we use std::* constructs 75 // (instead of absl::Mutex & co) to simplify porting to Android and to avoid 76 // pulling in absl (which increases our code size). 77 mutable std::mutex state_mutex_; 78 79 // counts_[i] is the count of all ngrams with id i. Work data for Evaluate(). 80 // NOTE: we declare this vector as a field, such that its underlying capacity 81 // stays allocated in between calls to Evaluate(). 82 mutable std::vector<int> counts_; 83 84 // Indices of non-zero elements of counts_. See comments for counts_. 85 mutable std::vector<int> non_zero_count_indices_; 86 87 // The integer id of each char ngram is computed as follows: 88 // Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_. 89 int ngram_id_dimension_; 90 91 // Only ngrams of size ngram_size_ will be extracted. 92 int ngram_size_; 93 }; 94 95 } // namespace lang_id 96 } // namespace mobile 97 } // namespace nlp_saft 98 99 #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_FEATURES_CHAR_NGRAM_FEATURE_H_ 100