1 /* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_LANG_ID_LANGUAGE_IDENTIFIER_FEATURES_H_ 18 #define LIBTEXTCLASSIFIER_LANG_ID_LANGUAGE_IDENTIFIER_FEATURES_H_ 19 20 #include <string> 21 22 #include "common/feature-extractor.h" 23 #include "common/task-context.h" 24 #include "common/workspace.h" 25 #include "lang_id/light-sentence-features.h" 26 #include "lang_id/light-sentence.h" 27 28 namespace libtextclassifier { 29 namespace nlp_core { 30 namespace lang_id { 31 32 // Class for computing continuous char ngram features. 33 // 34 // Feature function descriptor parameters: 35 // id_dim(int, 10000): 36 // The integer id of each char ngram is computed as follows: 37 // Hash32WithDefaultSeed(char ngram) % id_dim. 38 // size(int, 3): 39 // Only ngrams of this size will be extracted. 40 // 41 // NOTE: this class is not thread-safe. TODO(salcianu): make it thread-safe. 42 class ContinuousBagOfNgramsFunction : public LightSentenceFeature { 43 public: 44 bool Setup(TaskContext *context) override; 45 bool Init(TaskContext *context) override; 46 47 // Appends the features computed from the sentence to the feature vector. 48 void Evaluate(const WorkspaceSet &workspaces, const LightSentence &sentence, 49 FeatureVector *result) const override; 50 51 TC_DEFINE_REGISTRATION_METHOD("continuous-bag-of-ngrams", 52 ContinuousBagOfNgramsFunction); 53 54 private: 55 // Auxiliary for Evaluate(). Fills counts_ and non_zero_count_indices_ (see 56 // below), and returns the total ngram count. 57 int ComputeNgramCounts(const LightSentence &sentence) const; 58 59 // counts_[i] is the count of all ngrams with id i. Work data for Evaluate(). 60 // NOTE: we declare this vector as a field, such that its underlying capacity 61 // stays allocated in between calls to Evaluate(). 62 mutable std::vector<int> counts_; 63 64 // Indices of non-zero elements of counts_. See comments for counts_. 65 mutable std::vector<int> non_zero_count_indices_; 66 67 // The integer id of each char ngram is computed as follows: 68 // Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_. 69 int ngram_id_dimension_; 70 71 // Only ngrams of size ngram_size_ will be extracted. 72 int ngram_size_; 73 }; 74 75 } // namespace lang_id 76 } // namespace nlp_core 77 } // namespace libtextclassifier 78 79 #endif // LIBTEXTCLASSIFIER_LANG_ID_LANGUAGE_IDENTIFIER_FEATURES_H_ 80