1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_TRANSLATE_TRANSLATE_H_ 18 #define LIBTEXTCLASSIFIER_ANNOTATOR_TRANSLATE_TRANSLATE_H_ 19 20 #include "annotator/model_generated.h" 21 #include "annotator/types.h" 22 #include "utils/utf8/unicodetext.h" 23 #include "utils/utf8/unilib.h" 24 #include "lang_id/lang-id.h" 25 26 namespace libtextclassifier3 { 27 28 // Returns classification with "translate" when the input text is in a language 29 // not understood by the user. 30 class TranslateAnnotator { 31 public: TranslateAnnotator(const TranslateAnnotatorOptions * options,const libtextclassifier3::mobile::lang_id::LangId * langid_model,const UniLib * unilib)32 TranslateAnnotator(const TranslateAnnotatorOptions* options, 33 const libtextclassifier3::mobile::lang_id::LangId* langid_model, 34 const UniLib* unilib) 35 : options_(options), langid_model_(langid_model), unilib_(unilib) {} 36 37 // Returns true if a classification_result was filled with "translate" 38 // classification. 39 bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices, 40 const std::string& user_familiar_language_tags, 41 ClassificationResult* classification_result) const; 42 43 protected: 44 struct LanguageConfidence { 45 std::string language; 46 float confidence = -1.0; 47 }; 48 49 // Detects language of the selection in given context using the "Backoff 50 // algorithm", sorted by the score descendingly. It is based on several 51 // heuristics, see the code. This is the same algorithm that TextClassifier 52 // uses in Android Q. 53 std::vector<LanguageConfidence> BackoffDetectLanguages( 54 const UnicodeText& context, CodepointSpan selection_indices) const; 55 56 // Returns the iterator of the next whitespace/punctuation character in given 57 // text, starting from given position and going forward (iff direction == 1), 58 // and backward (iff direction == -1). 59 UnicodeText::const_iterator FindIndexOfNextWhitespaceOrPunctuation( 60 const UnicodeText& text, int start_index, int direction) const; 61 62 // Returns substring from given text, centered around the specified indices, 63 // of certain minimum length. The substring is token aligned, so it is 64 // guaranteed that the words won't be broken down. 65 UnicodeText TokenAlignedSubstringAroundSpan(const UnicodeText& text, 66 CodepointSpan indices, 67 int minimum_length) const; 68 69 private: 70 std::string CreateSerializedEntityData( 71 const std::vector<TranslateAnnotator::LanguageConfidence>& confidences) 72 const; 73 74 const TranslateAnnotatorOptions* options_; 75 const libtextclassifier3::mobile::lang_id::LangId* langid_model_; 76 const UniLib* unilib_; 77 }; 78 79 } // namespace libtextclassifier3 80 81 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_TRANSLATE_TRANSLATE_H_ 82