• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "actions/ngram-model.h"
18 
19 #include <algorithm>
20 
21 #include "actions/feature-processor.h"
22 #include "utils/hash/farmhash.h"
23 #include "utils/strings/stringpiece.h"
24 
25 namespace libtextclassifier3 {
26 namespace {
27 
28 // An iterator to iterate over the initial tokens of the n-grams of a model.
29 class FirstTokenIterator
30     : public std::iterator<std::random_access_iterator_tag,
31                            /*value_type=*/uint32, /*difference_type=*/ptrdiff_t,
32                            /*pointer=*/const uint32*,
33                            /*reference=*/uint32&> {
34  public:
FirstTokenIterator(const NGramLinearRegressionModel * model,int index)35   explicit FirstTokenIterator(const NGramLinearRegressionModel* model,
36                               int index)
37       : model_(model), index_(index) {}
38 
operator ++()39   FirstTokenIterator& operator++() {
40     index_++;
41     return *this;
42   }
operator +=(ptrdiff_t dist)43   FirstTokenIterator& operator+=(ptrdiff_t dist) {
44     index_ += dist;
45     return *this;
46   }
operator -(const FirstTokenIterator & other_it) const47   ptrdiff_t operator-(const FirstTokenIterator& other_it) const {
48     return index_ - other_it.index_;
49   }
operator *() const50   uint32 operator*() const {
51     const uint32 token_offset = (*model_->ngram_start_offsets())[index_];
52     return (*model_->hashed_ngram_tokens())[token_offset];
53   }
index() const54   int index() const { return index_; }
55 
56  private:
57   const NGramLinearRegressionModel* model_;
58   int index_;
59 };
60 
61 }  // anonymous namespace
62 
Create(const UniLib * unilib,const NGramLinearRegressionModel * model,const Tokenizer * tokenizer)63 std::unique_ptr<NGramModel> NGramModel::Create(
64     const UniLib* unilib, const NGramLinearRegressionModel* model,
65     const Tokenizer* tokenizer) {
66   if (model == nullptr) {
67     return nullptr;
68   }
69   if (tokenizer == nullptr && model->tokenizer_options() == nullptr) {
70     TC3_LOG(ERROR) << "No tokenizer options specified.";
71     return nullptr;
72   }
73   return std::unique_ptr<NGramModel>(new NGramModel(unilib, model, tokenizer));
74 }
75 
NGramModel(const UniLib * unilib,const NGramLinearRegressionModel * model,const Tokenizer * tokenizer)76 NGramModel::NGramModel(const UniLib* unilib,
77                        const NGramLinearRegressionModel* model,
78                        const Tokenizer* tokenizer)
79     : model_(model) {
80   // Create new tokenizer if options are specified, reuse feature processor
81   // tokenizer otherwise.
82   if (model->tokenizer_options() != nullptr) {
83     owned_tokenizer_ = CreateTokenizer(model->tokenizer_options(), unilib);
84     tokenizer_ = owned_tokenizer_.get();
85   } else {
86     tokenizer_ = tokenizer;
87   }
88 }
89 
90 // Returns whether a given n-gram matches the token stream.
IsNGramMatch(const uint32 * tokens,size_t num_tokens,const uint32 * ngram_tokens,size_t num_ngram_tokens,int max_skips) const91 bool NGramModel::IsNGramMatch(const uint32* tokens, size_t num_tokens,
92                               const uint32* ngram_tokens,
93                               size_t num_ngram_tokens, int max_skips) const {
94   int token_idx = 0, ngram_token_idx = 0, skip_remain = 0;
95   for (; token_idx < num_tokens && ngram_token_idx < num_ngram_tokens;) {
96     if (tokens[token_idx] == ngram_tokens[ngram_token_idx]) {
97       // Token matches. Advance both and reset the skip budget.
98       ++token_idx;
99       ++ngram_token_idx;
100       skip_remain = max_skips;
101     } else if (skip_remain > 0) {
102       // No match, but we have skips left, so just advance over the token.
103       ++token_idx;
104       skip_remain--;
105     } else {
106       // No match and we're out of skips. Reject.
107       return false;
108     }
109   }
110   return ngram_token_idx == num_ngram_tokens;
111 }
112 
113 // Calculates the total number of skip-grams that can be created for a stream
114 // with the given number of tokens.
GetNumSkipGrams(int num_tokens,int max_ngram_length,int max_skips)115 uint64 NGramModel::GetNumSkipGrams(int num_tokens, int max_ngram_length,
116                                    int max_skips) {
117   // Start with unigrams.
118   uint64 total = num_tokens;
119   for (int ngram_len = 2;
120        ngram_len <= max_ngram_length && ngram_len <= num_tokens; ++ngram_len) {
121     // We can easily compute the expected length of the n-gram (with skips),
122     // but it doesn't account for the fact that they may be longer than the
123     // input and should be pruned.
124     // Instead, we iterate over the distribution of effective n-gram lengths
125     // and add each length individually.
126     const int num_gaps = ngram_len - 1;
127     const int len_min = ngram_len;
128     const int len_max = ngram_len + num_gaps * max_skips;
129     const int len_mid = (len_max + len_min) / 2;
130     for (int len_i = len_min; len_i <= len_max; ++len_i) {
131       if (len_i > num_tokens) continue;
132       const int num_configs_of_len_i =
133           len_i <= len_mid ? len_i - len_min + 1 : len_max - len_i + 1;
134       const int num_start_offsets = num_tokens - len_i + 1;
135       total += num_configs_of_len_i * num_start_offsets;
136     }
137   }
138   return total;
139 }
140 
GetFirstTokenMatches(uint32 token_hash) const141 std::pair<int, int> NGramModel::GetFirstTokenMatches(uint32 token_hash) const {
142   const int num_ngrams = model_->ngram_weights()->size();
143   const auto start_it = FirstTokenIterator(model_, 0);
144   const auto end_it = FirstTokenIterator(model_, num_ngrams);
145   const int start = std::lower_bound(start_it, end_it, token_hash).index();
146   const int end = std::upper_bound(start_it, end_it, token_hash).index();
147   return std::make_pair(start, end);
148 }
149 
Eval(const UnicodeText & text,float * score) const150 bool NGramModel::Eval(const UnicodeText& text, float* score) const {
151   const std::vector<Token> raw_tokens = tokenizer_->Tokenize(text);
152 
153   // If we have no tokens, then just bail early.
154   if (raw_tokens.empty()) {
155     if (score != nullptr) {
156       *score = model_->default_token_weight();
157     }
158     return false;
159   }
160 
161   // Hash the tokens.
162   std::vector<uint32> tokens;
163   tokens.reserve(raw_tokens.size());
164   for (const Token& raw_token : raw_tokens) {
165     tokens.push_back(tc3farmhash::Fingerprint32(raw_token.value.data(),
166                                                 raw_token.value.length()));
167   }
168 
169   // Calculate the total number of skip-grams that can be generated for the
170   // input text.
171   const uint64 num_candidates = GetNumSkipGrams(
172       tokens.size(), model_->max_denom_ngram_length(), model_->max_skips());
173 
174   // For each token, see whether it denotes the start of an n-gram in the model.
175   int num_matches = 0;
176   float weight_matches = 0.f;
177   for (size_t start_i = 0; start_i < tokens.size(); ++start_i) {
178     const std::pair<int, int> ngram_range =
179         GetFirstTokenMatches(tokens[start_i]);
180     for (int ngram_idx = ngram_range.first; ngram_idx < ngram_range.second;
181          ++ngram_idx) {
182       const uint16 ngram_tokens_begin =
183           (*model_->ngram_start_offsets())[ngram_idx];
184       const uint16 ngram_tokens_end =
185           (*model_->ngram_start_offsets())[ngram_idx + 1];
186       if (IsNGramMatch(
187               /*tokens=*/tokens.data() + start_i,
188               /*num_tokens=*/tokens.size() - start_i,
189               /*ngram_tokens=*/model_->hashed_ngram_tokens()->data() +
190                   ngram_tokens_begin,
191               /*num_ngram_tokens=*/ngram_tokens_end - ngram_tokens_begin,
192               /*max_skips=*/model_->max_skips())) {
193         ++num_matches;
194         weight_matches += (*model_->ngram_weights())[ngram_idx];
195       }
196     }
197   }
198 
199   // Calculate the score.
200   const int num_misses = num_candidates - num_matches;
201   const float internal_score =
202       (weight_matches + (model_->default_token_weight() * num_misses)) /
203       num_candidates;
204   if (score != nullptr) {
205     *score = internal_score;
206   }
207   return internal_score > model_->threshold();
208 }
209 
EvalConversation(const Conversation & conversation,const int num_messages) const210 bool NGramModel::EvalConversation(const Conversation& conversation,
211                                   const int num_messages) const {
212   for (int i = 1; i <= num_messages; i++) {
213     const std::string& message =
214         conversation.messages[conversation.messages.size() - i].text;
215     const UnicodeText message_unicode(
216         UTF8ToUnicodeText(message, /*do_copy=*/false));
217     // Run ngram linear regression model.
218     if (Eval(message_unicode)) {
219       return true;
220     }
221   }
222   return false;
223 }
224 
225 }  // namespace libtextclassifier3
226