1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ACTIONS_NGRAM_MODEL_H_ 18 #define LIBTEXTCLASSIFIER_ACTIONS_NGRAM_MODEL_H_ 19 20 #include <memory> 21 22 #include "actions/actions_model_generated.h" 23 #include "actions/sensitive-classifier-base.h" 24 #include "actions/types.h" 25 #include "utils/tokenizer.h" 26 #include "utils/utf8/unicodetext.h" 27 #include "utils/utf8/unilib.h" 28 29 namespace libtextclassifier3 { 30 31 class NGramSensitiveModel : public SensitiveTopicModelBase { 32 public: 33 static std::unique_ptr<NGramSensitiveModel> Create( 34 const UniLib* unilib, const NGramLinearRegressionModel* model, 35 const Tokenizer* tokenizer); 36 37 // Evaluates an n-gram linear regression model, and tests against the 38 // threshold. Returns true in case of a positive classification. The caller 39 // may also optionally query the score. 40 std::pair<bool, float> Eval(const UnicodeText& text) const override; 41 42 // Evaluates an n-gram linear regression model against all messages in a 43 // conversation and returns true in case of any positive classification. 44 std::pair<bool, float> EvalConversation(const Conversation& conversation, 45 int num_messages) const override; 46 47 // Exposed for testing only. 48 static uint64 GetNumSkipGrams(int num_tokens, int max_ngram_length, 49 int max_skips); 50 51 private: 52 explicit NGramSensitiveModel(const UniLib* unilib, 53 const NGramLinearRegressionModel* model, 54 const Tokenizer* tokenizer); 55 56 // Returns the (begin,end] range of n-grams where the first hashed token 57 // matches the given value. 58 std::pair<int, int> GetFirstTokenMatches(uint32 token_hash) const; 59 60 // Returns whether a given n-gram matches the token stream. 61 bool IsNGramMatch(const uint32* tokens, size_t num_tokens, 62 const uint32* ngram_tokens, size_t num_ngram_tokens, 63 int max_skips) const; 64 65 const NGramLinearRegressionModel* model_; 66 const Tokenizer* tokenizer_; 67 std::unique_ptr<Tokenizer> owned_tokenizer_; 68 }; 69 70 } // namespace libtextclassifier3 71 72 #endif // LIBTEXTCLASSIFIER_ACTIONS_NGRAM_MODEL_H_ 73