• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_LANG_ID_LANGUAGE_IDENTIFIER_FEATURES_H_
18 #define LIBTEXTCLASSIFIER_LANG_ID_LANGUAGE_IDENTIFIER_FEATURES_H_
19 
#include <string>
#include <vector>

#include "common/feature-extractor.h"
#include "common/task-context.h"
#include "common/workspace.h"
#include "lang_id/light-sentence-features.h"
#include "lang_id/light-sentence.h"
27 
28 namespace libtextclassifier {
29 namespace nlp_core {
30 namespace lang_id {
31 
32 // Class for computing continuous char ngram features.
33 //
34 // Feature function descriptor parameters:
35 //   id_dim(int, 10000):
36 //     The integer id of each char ngram is computed as follows:
37 //     Hash32WithDefaultSeed(char ngram) % id_dim.
38 //   size(int, 3):
39 //     Only ngrams of this size will be extracted.
40 //
41 // NOTE: this class is not thread-safe.  TODO(salcianu): make it thread-safe.
42 class ContinuousBagOfNgramsFunction : public LightSentenceFeature {
43  public:
44   bool Setup(TaskContext *context) override;
45   bool Init(TaskContext *context) override;
46 
47   // Appends the features computed from the sentence to the feature vector.
48   void Evaluate(const WorkspaceSet &workspaces, const LightSentence &sentence,
49                 FeatureVector *result) const override;
50 
51   TC_DEFINE_REGISTRATION_METHOD("continuous-bag-of-ngrams",
52                                 ContinuousBagOfNgramsFunction);
53 
54  private:
55   // Auxiliary for Evaluate().  Fills counts_ and non_zero_count_indices_ (see
56   // below), and returns the total ngram count.
57   int ComputeNgramCounts(const LightSentence &sentence) const;
58 
59   // counts_[i] is the count of all ngrams with id i.  Work data for Evaluate().
60   // NOTE: we declare this vector as a field, such that its underlying capacity
61   // stays allocated in between calls to Evaluate().
62   mutable std::vector<int> counts_;
63 
64   // Indices of non-zero elements of counts_.  See comments for counts_.
65   mutable std::vector<int> non_zero_count_indices_;
66 
67   // The integer id of each char ngram is computed as follows:
68   // Hash32WithDefaultSeed(char_ngram) % ngram_id_dimension_.
69   int ngram_id_dimension_;
70 
71   // Only ngrams of size ngram_size_ will be extracted.
72   int ngram_size_;
73 };
74 
75 }  // namespace lang_id
76 }  // namespace nlp_core
77 }  // namespace libtextclassifier
78 
79 #endif  // LIBTEXTCLASSIFIER_LANG_ID_LANGUAGE_IDENTIFIER_FEATURES_H_
80