1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_ 18 #define LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_ 19 20 #include <string> 21 #include <unordered_map> 22 #include <vector> 23 24 #include "annotator/model_generated.h" 25 #include "annotator/types.h" 26 #include "utils/strings/stringpiece.h" 27 #include "utils/utf8/unicodetext.h" 28 #include "utils/utf8/unilib.h" 29 30 namespace libtextclassifier3 { 31 32 struct CompiledRule { 33 // The compiled regular expression. 34 std::unique_ptr<const UniLib::RegexPattern> compiled_regex; 35 36 // The uncompiled pattern and information about the pattern groups. 37 const DatetimeModelPattern_::Regex* regex; 38 39 // DatetimeModelPattern which 'regex' is part of and comes from. 40 const DatetimeModelPattern* pattern; 41 }; 42 43 // A helper class for DatetimeParser that extracts structured data 44 // (DateParseDate) from the current match of the passed RegexMatcher. 45 class DatetimeExtractor { 46 public: DatetimeExtractor(const CompiledRule & rule,const UniLib::RegexMatcher & matcher,int locale_id,const UniLib * unilib,const std::vector<std::unique_ptr<const UniLib::RegexPattern>> & extractor_rules,const std::unordered_map<DatetimeExtractorType,std::unordered_map<int,int>> & type_and_locale_to_extractor_rule)47 explicit DatetimeExtractor( 48 const CompiledRule& rule, const UniLib::RegexMatcher& matcher, 49 int locale_id, const UniLib* unilib, 50 const std::vector<std::unique_ptr<const UniLib::RegexPattern>>& 51 extractor_rules, 52 const std::unordered_map<DatetimeExtractorType, 53 std::unordered_map<int, int>>& 54 type_and_locale_to_extractor_rule) 55 : rule_(rule), 56 matcher_(matcher), 57 locale_id_(locale_id), 58 unilib_(*unilib), 59 rules_(extractor_rules), 60 type_and_locale_to_rule_(type_and_locale_to_extractor_rule) {} 61 bool Extract(DatetimeParsedData* result, CodepointSpan* result_span) const; 62 63 private: 64 bool RuleIdForType(DatetimeExtractorType type, int* rule_id) const; 65 66 // Returns true if the rule for given extractor matched. If it matched, 67 // match_result will contain the first group of the rule (if match_result not 68 // nullptr). 69 bool ExtractType(const UnicodeText& input, 70 DatetimeExtractorType extractor_type, 71 UnicodeText* match_result = nullptr) const; 72 73 bool GroupTextFromMatch(int group_id, UnicodeText* result) const; 74 75 // Updates the span to include the current match for the given group. 76 bool UpdateMatchSpan(int group_id, CodepointSpan* span) const; 77 78 // Returns true if any of the extractors from 'mapping' matched. If it did, 79 // will fill 'result' with the associated value from 'mapping'. 80 template <typename T> 81 bool MapInput(const UnicodeText& input, 82 const std::vector<std::pair<DatetimeExtractorType, T>>& mapping, 83 T* result) const; 84 85 bool ParseDigits(const UnicodeText& input, int* parsed_digits) const; 86 bool ParseWrittenNumber(const UnicodeText& input, int* parsed_number) const; 87 bool ParseYear(const UnicodeText& input, int* parsed_year) const; 88 bool ParseMonth(const UnicodeText& input, int* parsed_month) const; 89 bool ParseMeridiem(const UnicodeText& input, int* parsed_meridiem) const; 90 bool ParseRelativeValue( 91 const UnicodeText& input, 92 DatetimeComponent::RelativeQualifier* parsed_relative_value) const; 93 bool ParseRelationDistance(const UnicodeText& input, 94 int* parsed_distance) const; 95 bool ParseFieldType( 96 const UnicodeText& input, 97 DatetimeComponent::ComponentType* parsed_field_type) const; 98 bool ParseDayOfWeek(const UnicodeText& input, int* parsed_day_of_week) const; 99 100 bool ParseRelationAndConvertToRelativeCount(const UnicodeText& input, 101 int* relative_count) const; 102 103 // There are some special words which represent multiple date time components 104 // e.g. if the text says “by noon” it clearly indicates that the hour is 12, 105 // minute is 0 and meridiam is PM. 106 // The method handles such tokens and translates them into multiple date time 107 // components. 108 bool ParseAbsoluteDateValues( 109 const UnicodeText& input, 110 std::unordered_map<DatetimeComponent::ComponentType, int>* values) const; 111 112 const CompiledRule& rule_; 113 const UniLib::RegexMatcher& matcher_; 114 int locale_id_; 115 const UniLib& unilib_; 116 const std::vector<std::unique_ptr<const UniLib::RegexPattern>>& rules_; 117 const std::unordered_map<DatetimeExtractorType, std::unordered_map<int, int>>& 118 type_and_locale_to_rule_; 119 }; 120 121 } // namespace libtextclassifier3 122 123 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_ 124