1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_POD_NER_POD_NER_IMPL_H_ 18 #define LIBTEXTCLASSIFIER_ANNOTATOR_POD_NER_POD_NER_IMPL_H_ 19 20 #include <memory> 21 22 #include "annotator/model_generated.h" 23 #include "annotator/types.h" 24 #include "utils/bert_tokenizer.h" 25 #include "utils/utf8/unicodetext.h" 26 #include "utils/utf8/unilib.h" 27 #include "tensorflow/lite/context.h" 28 #include "tensorflow/lite/interpreter.h" 29 #include "tensorflow/lite/kernels/register.h" 30 #include "tensorflow/lite/string_util.h" 31 32 namespace libtextclassifier3 { 33 34 // Uses POD NER BERT-based model for annotating various types of entities. 35 class PodNerAnnotator { 36 public: 37 static std::unique_ptr<PodNerAnnotator> Create(const PodNerModel *model, 38 const UniLib &unilib); 39 40 bool Annotate(const UnicodeText &context, 41 std::vector<AnnotatedSpan> *results) const; 42 43 // Returns true if an entity was detected under 'click', and the selection 44 // indices expanded and assigned to 'result'. Otherwise returns false, and 45 // resets 'result'. 46 bool SuggestSelection(const UnicodeText &context, CodepointSpan click, 47 AnnotatedSpan *result) const; 48 49 bool ClassifyText(const UnicodeText &context, CodepointSpan click, 50 ClassificationResult *result) const; 51 52 std::vector<std::string> GetSupportedCollections() const; 53 54 private: PodNerAnnotator(const UniLib & unilib)55 explicit PodNerAnnotator(const UniLib &unilib) : unilib_(unilib) {} 56 57 std::vector<PodNerModel_::LabelT> ReadResultsFromInterpreter( 58 tflite::Interpreter &interpreter) const; 59 60 std::vector<PodNerModel_::LabelT> ExecuteModel( 61 const VectorSpan<int> &wordpiece_indices, 62 const VectorSpan<int32_t> &token_starts, 63 const VectorSpan<Token> &tokens) const; 64 65 bool PrepareText(const UnicodeText &text_unicode, 66 std::vector<int32_t> *wordpiece_indices, 67 std::vector<int32_t> *token_starts, 68 std::vector<Token> *tokens) const; 69 70 bool AnnotateAroundSpanOfInterest(const UnicodeText &context, 71 const CodepointSpan &span_of_interest, 72 std::vector<AnnotatedSpan> *results) const; 73 74 const UniLib &unilib_; 75 bool lowercase_input_; 76 int logits_index_in_output_tensor_; 77 bool append_final_period_; 78 int max_num_effective_wordpieces_; 79 int sliding_window_num_wordpieces_overlap_; 80 float max_ratio_unknown_wordpieces_; 81 int min_number_of_tokens_; 82 int min_number_of_wordpieces_; 83 int cls_wordpiece_id_; 84 int sep_wordpiece_id_; 85 int period_wordpiece_id_; 86 int unknown_wordpiece_id_; 87 std::vector<PodNerModel_::CollectionT> collections_; 88 std::vector<PodNerModel_::LabelT> labels_; 89 std::unique_ptr<BertTokenizer> tokenizer_; 90 const PodNerModel *model_; 91 }; 92 93 } // namespace libtextclassifier3 94 95 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_POD_NER_POD_NER_IMPL_H_ 96