1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_ 18 #define LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_ 19 20 #include <string> 21 #include <unordered_map> 22 #include <unordered_set> 23 #include <vector> 24 25 #include "annotator/feature-processor.h" 26 #include "annotator/model_generated.h" 27 #include "annotator/types.h" 28 #include "utils/utf8/unicodetext.h" 29 #include "utils/utf8/unilib.h" 30 31 namespace libtextclassifier3 { 32 33 namespace internal { 34 enum class DurationUnit { 35 UNKNOWN = -1, 36 WEEK = 0, 37 DAY = 1, 38 HOUR = 2, 39 MINUTE = 3, 40 SECOND = 4 41 42 // NOTE: If we want to add MONTH and YEAR we'll have to think of different 43 // parsing format, because MONTH and YEAR don't have a fixed number of 44 // milliseconds, unlike week/day/hour/minute/second. We ignore the daylight 45 // savings time and assume the day is always 24 hours. 46 }; 47 48 // Prepares the mapping between token values and duration unit types. 49 std::unordered_map<std::string, internal::DurationUnit> 50 BuildTokenToDurationUnitMapping(const DurationAnnotatorOptions* options, 51 const UniLib* unilib); 52 53 // Creates a set of strings from a flatbuffer string vector. 54 std::unordered_set<std::string> BuildStringSet( 55 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>* 56 strings, 57 const UniLib* unilib); 58 59 // Creates a set of ints from a flatbuffer int vector. 60 std::unordered_set<int32> BuildInt32Set(const flatbuffers::Vector<int32>* ints); 61 62 } // namespace internal 63 64 // Annotator of duration expressions like "3 minutes 30 seconds". 65 class DurationAnnotator { 66 public: DurationAnnotator(const DurationAnnotatorOptions * options,const FeatureProcessor * feature_processor,const UniLib * unilib)67 explicit DurationAnnotator(const DurationAnnotatorOptions* options, 68 const FeatureProcessor* feature_processor, 69 const UniLib* unilib) 70 : options_(options), 71 feature_processor_(feature_processor), 72 unilib_(unilib), 73 token_value_to_duration_unit_( 74 internal::BuildTokenToDurationUnitMapping(options, unilib)), 75 filler_expressions_( 76 internal::BuildStringSet(options->filler_expressions(), unilib)), 77 half_expressions_( 78 internal::BuildStringSet(options->half_expressions(), unilib)), 79 sub_token_separator_codepoints_(internal::BuildInt32Set( 80 options->sub_token_separator_codepoints())) {} 81 82 // Classifies given text, and if it is a duration, it passes the result in 83 // 'classification_result' and returns true, otherwise returns false. 84 bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices, 85 AnnotationUsecase annotation_usecase, 86 ClassificationResult* classification_result) const; 87 88 // Finds all duration instances in the input text. 89 bool FindAll(const UnicodeText& context, const std::vector<Token>& tokens, 90 AnnotationUsecase annotation_usecase, ModeFlag mode, 91 std::vector<AnnotatedSpan>* results) const; 92 93 private: 94 // Represents a component of duration parsed from text (e.g. "3 hours" from 95 // the expression "3 hours and 20 minutes"). 96 struct ParsedDurationAtom { 97 // Unit of the duration. 98 internal::DurationUnit unit = internal::DurationUnit::UNKNOWN; 99 100 // Quantity of the duration unit. 101 double value = 0; 102 103 // True, if half an unit was specified (either in addition, or exclusively). 104 // E.g. "hour and a half". 105 // NOTE: Quarter, three-quarters etc. is not supported. 106 bool plus_half = false; 107 HalfParsedDurationAtom108 static ParsedDurationAtom Half() { 109 ParsedDurationAtom result; 110 result.plus_half = true; 111 return result; 112 } 113 }; 114 115 // Starts consuming tokens and returns the index past the last consumed token. 116 int FindDurationStartingAt(const UnicodeText& context, 117 const std::vector<Token>& tokens, 118 int start_token_index, 119 AnnotatedSpan* result) const; 120 121 bool ParseQuantityToken(const Token& token, ParsedDurationAtom* value) const; 122 bool ParseDurationUnitToken(const Token& token, 123 internal::DurationUnit* duration_unit) const; 124 bool ParseQuantityDurationUnitToken(const Token& token, 125 ParsedDurationAtom* value) const; 126 bool ParseFillerToken(const Token& token) const; 127 128 int64 ParsedDurationAtomsToMillis( 129 const std::vector<ParsedDurationAtom>& atoms) const; 130 131 const DurationAnnotatorOptions* options_; 132 const FeatureProcessor* feature_processor_; 133 const UniLib* unilib_; 134 const std::unordered_map<std::string, internal::DurationUnit> 135 token_value_to_duration_unit_; 136 const std::unordered_set<std::string> filler_expressions_; 137 const std::unordered_set<std::string> half_expressions_; 138 const std::unordered_set<int32> sub_token_separator_codepoints_; 139 }; 140 141 } // namespace libtextclassifier3 142 143 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_ 144