1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ACTIONS_NGRAM_MODEL_H_ 18 #define LIBTEXTCLASSIFIER_ACTIONS_NGRAM_MODEL_H_ 19 20 #include <memory> 21 22 #include "actions/actions_model_generated.h" 23 #include "utils/tokenizer.h" 24 #include "utils/utf8/unicodetext.h" 25 #include "utils/utf8/unilib.h" 26 27 namespace libtextclassifier3 { 28 29 class NGramModel { 30 public: 31 static std::unique_ptr<NGramModel> Create( 32 const NGramLinearRegressionModel* model, const Tokenizer* tokenizer, 33 const UniLib* unilib); 34 35 // Evaluates an n-gram linear regression model, and tests against the 36 // threshold. Returns true in case of a positive classification. The caller 37 // may also optionally query the score. 38 bool Eval(const UnicodeText& text, float* score = nullptr) const; 39 40 // Exposed for testing only. 41 static uint64 GetNumSkipGrams(int num_tokens, int max_ngram_length, 42 int max_skips); 43 44 private: 45 NGramModel(const NGramLinearRegressionModel* model, 46 const Tokenizer* tokenizer, const UniLib* unilib); 47 48 // Returns the (begin,end] range of n-grams where the first hashed token 49 // matches the given value. 50 std::pair<int, int> GetFirstTokenMatches(uint32 token_hash) const; 51 52 // Returns whether a given n-gram matches the token stream. 53 bool IsNGramMatch(const uint32* tokens, size_t num_tokens, 54 const uint32* ngram_tokens, size_t num_ngram_tokens, 55 int max_skips) const; 56 57 const NGramLinearRegressionModel* model_; 58 const Tokenizer* tokenizer_; 59 std::unique_ptr<Tokenizer> owned_tokenizer_; 60 }; 61 62 } // namespace libtextclassifier3 63 64 #endif // LIBTEXTCLASSIFIER_ACTIONS_NGRAM_MODEL_H_ 65