1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ACTIONS_NGRAM_MODEL_H_ 18 #define LIBTEXTCLASSIFIER_ACTIONS_NGRAM_MODEL_H_ 19 20 #include <memory> 21 22 #include "actions/actions_model_generated.h" 23 #include "actions/types.h" 24 #include "utils/tokenizer.h" 25 #include "utils/utf8/unicodetext.h" 26 #include "utils/utf8/unilib.h" 27 28 namespace libtextclassifier3 { 29 30 class NGramModel { 31 public: 32 static std::unique_ptr<NGramModel> Create( 33 const UniLib* unilib, const NGramLinearRegressionModel* model, 34 const Tokenizer* tokenizer); 35 36 // Evaluates an n-gram linear regression model, and tests against the 37 // threshold. Returns true in case of a positive classification. The caller 38 // may also optionally query the score. 39 bool Eval(const UnicodeText& text, float* score = nullptr) const; 40 41 // Evaluates an n-gram linear regression model against all messages in a 42 // conversation and returns true in case of any positive classification. 43 bool EvalConversation(const Conversation& conversation, 44 const int num_messages) const; 45 46 // Exposed for testing only. 47 static uint64 GetNumSkipGrams(int num_tokens, int max_ngram_length, 48 int max_skips); 49 50 private: 51 NGramModel(const UniLib* unilib, const NGramLinearRegressionModel* model, 52 const Tokenizer* tokenizer); 53 54 // Returns the (begin,end] range of n-grams where the first hashed token 55 // matches the given value. 56 std::pair<int, int> GetFirstTokenMatches(uint32 token_hash) const; 57 58 // Returns whether a given n-gram matches the token stream. 59 bool IsNGramMatch(const uint32* tokens, size_t num_tokens, 60 const uint32* ngram_tokens, size_t num_ngram_tokens, 61 int max_skips) const; 62 63 const NGramLinearRegressionModel* model_; 64 const Tokenizer* tokenizer_; 65 std::unique_ptr<Tokenizer> owned_tokenizer_; 66 }; 67 68 } // namespace libtextclassifier3 69 70 #endif // LIBTEXTCLASSIFIER_ACTIONS_NGRAM_MODEL_H_ 71