/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
#define LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_

#include <string>
#include <unordered_set>
#include <vector>

#include "annotator/model_generated.h"
#include "annotator/types.h"
#include "utils/base/logging.h"
#include "utils/container/sorted-strings-table.h"
#include "utils/tokenizer.h"
#include "utils/utf8/unicodetext.h"

namespace libtextclassifier3 {

// Annotator of numbers in text.
//
// Supported integer values are in the range [-1 000 000 000, 1 000 000 000].
// Supported double values are in the range [-999999999.999999999,
// 999999999.999999999].
38 class NumberAnnotator { 39 public: NumberAnnotator(const NumberAnnotatorOptions * options,const UniLib * unilib)40 explicit NumberAnnotator(const NumberAnnotatorOptions* options, 41 const UniLib* unilib) 42 : options_(options), 43 unilib_(unilib), 44 tokenizer_(Tokenizer(TokenizationType_LETTER_DIGIT, unilib, 45 /*codepoint_ranges=*/{}, 46 /*internal_tokenizer_codepoint_ranges=*/{}, 47 /*split_on_script_change=*/false, 48 /*icu_preserve_whitespace_tokens=*/true)), 49 percent_suffixes_(FromFlatbufferStringToUnordredSet( 50 options_->percentage_pieces_string())), 51 max_number_of_digits_(options->max_number_of_digits()) {} 52 53 // Classifies given text, and if it is a number, it passes the result in 54 // 'classification_result' and returns true, otherwise returns false. 55 bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices, 56 AnnotationUsecase annotation_usecase, 57 ClassificationResult* classification_result) const; 58 59 // Finds all number instances in the input text. Returns true in any case. 60 bool FindAll(const UnicodeText& context_unicode, 61 AnnotationUsecase annotation_usecase, 62 std::vector<AnnotatedSpan>* result) const; 63 64 private: 65 // Converts a Flatbuffer string containing zero-separated percent suffixes 66 // to an unordered set. 67 static std::unordered_set<std::string> FromFlatbufferStringToUnordredSet( 68 const flatbuffers::String* flatbuffer_percent_strings); 69 70 // Checks if the annotated numbers from the context represent percentages. 71 // If yes, replaces the collection type and the annotation boundary in the 72 // result. 73 void FindPercentages(const UnicodeText& context, 74 std::vector<AnnotatedSpan>* result) const; 75 76 // Checks if the tokens from in the interval [start_index-2, start_index] are 77 // valid characters that can preced a number context. 
78 bool TokensAreValidStart(const std::vector<Token>& tokens, 79 int start_index) const; 80 81 // Checks if the tokens in the interval (..., prefix_end_index] are a valid 82 // number prefix. 83 bool TokensAreValidNumberPrefix(const std::vector<Token>& tokens, 84 int prefix_end_index) const; 85 86 // Checks if the tokens from in the interval [ending_index, ending_index+2] 87 // are valid characters that can follow a number context. 88 bool TokensAreValidEnding(const std::vector<Token>& tokens, 89 int ending_index) const; 90 91 // Checks if the tokens in the interval [suffix_start_index, ...) are a valid 92 // number suffix. 93 bool TokensAreValidNumberSuffix(const std::vector<Token>& tokens, 94 int suffix_start_index) const; 95 96 // Checks if the tokens in the interval [suffix_start_index, ...) are a valid 97 // percent suffix. If false, returns -1, else returns the end codepoint. 98 int FindPercentSuffixEndCodepoint(const std::vector<Token>& tokens, 99 int suffix_token_start_index) const; 100 101 // Checks if the given text represents a number (either int or double). 102 bool TryParseNumber(const UnicodeText& token_text, bool is_negative, 103 int64* parsed_int_value, 104 double* parsed_double_value) const; 105 106 // Checks if a word contains only CJT characters. 107 bool IsCJTterm(UnicodeText::const_iterator token_begin_it, 108 int token_length) const; 109 110 AnnotatedSpan CreateAnnotatedSpan(int start, int end, int int_value, 111 double double_value, 112 const std::string collection, float score, 113 float priority_score) const; 114 115 const NumberAnnotatorOptions* options_; 116 const UniLib* unilib_; 117 const Tokenizer tokenizer_; 118 const std::unordered_set<std::string> percent_suffixes_; 119 const int max_number_of_digits_; 120 }; 121 122 } // namespace libtextclassifier3 123 124 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_ 125