1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_ 18 #define LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_ 19 20 #include <string> 21 #include <unordered_map> 22 #include <vector> 23 24 #include "annotator/model_generated.h" 25 #include "annotator/types.h" 26 #include "utils/strings/stringpiece.h" 27 #include "utils/utf8/unicodetext.h" 28 #include "utils/utf8/unilib.h" 29 30 namespace libtextclassifier3 { 31 32 struct CompiledRule { 33 // The compiled regular expression. 34 std::unique_ptr<const UniLib::RegexPattern> compiled_regex; 35 36 // The uncompiled pattern and information about the pattern groups. 37 const DatetimeModelPattern_::Regex* regex; 38 39 // DatetimeModelPattern which 'regex' is part of and comes from. 40 const DatetimeModelPattern* pattern; 41 }; 42 43 // A helper class for DatetimeParser that extracts structured data 44 // (DateParseDate) from the current match of the passed RegexMatcher. 45 class DatetimeExtractor { 46 public: DatetimeExtractor(const CompiledRule & rule,const UniLib::RegexMatcher & matcher,int locale_id,const UniLib & unilib,const std::vector<std::unique_ptr<const UniLib::RegexPattern>> & extractor_rules,const std::unordered_map<DatetimeExtractorType,std::unordered_map<int,int>> & type_and_locale_to_extractor_rule)47 DatetimeExtractor( 48 const CompiledRule& rule, const UniLib::RegexMatcher& matcher, 49 int locale_id, const UniLib& unilib, 50 const std::vector<std::unique_ptr<const UniLib::RegexPattern>>& 51 extractor_rules, 52 const std::unordered_map<DatetimeExtractorType, 53 std::unordered_map<int, int>>& 54 type_and_locale_to_extractor_rule) 55 : rule_(rule), 56 matcher_(matcher), 57 locale_id_(locale_id), 58 unilib_(unilib), 59 rules_(extractor_rules), 60 type_and_locale_to_rule_(type_and_locale_to_extractor_rule) {} 61 bool Extract(DateParseData* result, CodepointSpan* result_span) const; 62 63 private: 64 bool RuleIdForType(DatetimeExtractorType type, int* rule_id) const; 65 66 // Returns true if the rule for given extractor matched. If it matched, 67 // match_result will contain the first group of the rule (if match_result not 68 // nullptr). 69 bool ExtractType(const UnicodeText& input, 70 DatetimeExtractorType extractor_type, 71 UnicodeText* match_result = nullptr) const; 72 73 bool GroupTextFromMatch(int group_id, UnicodeText* result) const; 74 75 // Updates the span to include the current match for the given group. 76 bool UpdateMatchSpan(int group_id, CodepointSpan* span) const; 77 78 // Returns true if any of the extractors from 'mapping' matched. If it did, 79 // will fill 'result' with the associated value from 'mapping'. 80 template <typename T> 81 bool MapInput(const UnicodeText& input, 82 const std::vector<std::pair<DatetimeExtractorType, T>>& mapping, 83 T* result) const; 84 85 bool ParseDigits(const UnicodeText& input, int* parsed_digits) const; 86 bool ParseWrittenNumber(const UnicodeText& input, int* parsed_number) const; 87 bool ParseYear(const UnicodeText& input, int* parsed_year) const; 88 bool ParseMonth(const UnicodeText& input, int* parsed_month) const; 89 bool ParseAMPM(const UnicodeText& input, 90 DateParseData::AMPM* parsed_ampm) const; 91 bool ParseRelation(const UnicodeText& input, 92 DateParseData::Relation* parsed_relation) const; 93 bool ParseRelationDistance(const UnicodeText& input, 94 int* parsed_distance) const; 95 bool ParseTimeUnit(const UnicodeText& input, 96 DateParseData::TimeUnit* parsed_time_unit) const; 97 bool ParseRelationType( 98 const UnicodeText& input, 99 DateParseData::RelationType* parsed_relation_type) const; 100 bool ParseWeekday(const UnicodeText& input, 101 DateParseData::RelationType* parsed_weekday) const; 102 103 const CompiledRule& rule_; 104 const UniLib::RegexMatcher& matcher_; 105 int locale_id_; 106 const UniLib& unilib_; 107 const std::vector<std::unique_ptr<const UniLib::RegexPattern>>& rules_; 108 const std::unordered_map<DatetimeExtractorType, std::unordered_map<int, int>>& 109 type_and_locale_to_rule_; 110 }; 111 112 } // namespace libtextclassifier3 113 114 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_ 115