1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_PARSER_H_ 18 #define LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_PARSER_H_ 19 20 #include <memory> 21 #include <string> 22 #include <unordered_map> 23 #include <unordered_set> 24 #include <vector> 25 26 #include "annotator/datetime/extractor.h" 27 #include "annotator/model_generated.h" 28 #include "annotator/types.h" 29 #include "utils/base/integral_types.h" 30 #include "utils/calendar/calendar.h" 31 #include "utils/utf8/unicodetext.h" 32 #include "utils/utf8/unilib.h" 33 #include "utils/zlib/zlib.h" 34 35 namespace libtextclassifier3 { 36 37 // Parses datetime expressions in the input and resolves them to actual absolute 38 // time. 39 class DatetimeParser { 40 public: 41 static std::unique_ptr<DatetimeParser> Instance( 42 const DatetimeModel* model, const UniLib* unilib, 43 const CalendarLib* calendarlib, ZlibDecompressor* decompressor); 44 45 // Parses the dates in 'input' and fills result. Makes sure that the results 46 // do not overlap. 47 // If 'anchor_start_end' is true the extracted results need to start at the 48 // beginning of 'input' and end at the end of it. 49 bool Parse(const std::string& input, int64 reference_time_ms_utc, 50 const std::string& reference_timezone, const std::string& locales, 51 ModeFlag mode, AnnotationUsecase annotation_usecase, 52 bool anchor_start_end, 53 std::vector<DatetimeParseResultSpan>* results) const; 54 55 // Same as above but takes UnicodeText. 56 bool Parse(const UnicodeText& input, int64 reference_time_ms_utc, 57 const std::string& reference_timezone, const std::string& locales, 58 ModeFlag mode, AnnotationUsecase annotation_usecase, 59 bool anchor_start_end, 60 std::vector<DatetimeParseResultSpan>* results) const; 61 62 protected: 63 explicit DatetimeParser(const DatetimeModel* model, const UniLib* unilib, 64 const CalendarLib* calendarlib, 65 ZlibDecompressor* decompressor); 66 67 // Returns a list of locale ids for given locale spec string (comma-separated 68 // locale names). Assigns the first parsed locale to reference_locale. 69 std::vector<int> ParseAndExpandLocales(const std::string& locales, 70 std::string* reference_locale) const; 71 72 // Helper function that finds datetime spans, only using the rules associated 73 // with the given locales. 74 bool FindSpansUsingLocales( 75 const std::vector<int>& locale_ids, const UnicodeText& input, 76 const int64 reference_time_ms_utc, const std::string& reference_timezone, 77 ModeFlag mode, AnnotationUsecase annotation_usecase, 78 bool anchor_start_end, const std::string& reference_locale, 79 std::unordered_set<int>* executed_rules, 80 std::vector<DatetimeParseResultSpan>* found_spans) const; 81 82 bool ParseWithRule(const CompiledRule& rule, const UnicodeText& input, 83 int64 reference_time_ms_utc, 84 const std::string& reference_timezone, 85 const std::string& reference_locale, const int locale_id, 86 bool anchor_start_end, 87 std::vector<DatetimeParseResultSpan>* result) const; 88 89 // Converts the current match in 'matcher' into DatetimeParseResult. 90 bool ExtractDatetime(const CompiledRule& rule, 91 const UniLib::RegexMatcher& matcher, 92 int64 reference_time_ms_utc, 93 const std::string& reference_timezone, 94 const std::string& reference_locale, int locale_id, 95 std::vector<DatetimeParseResult>* results, 96 CodepointSpan* result_span) const; 97 98 // Parse and extract information from current match in 'matcher'. 99 bool HandleParseMatch(const CompiledRule& rule, 100 const UniLib::RegexMatcher& matcher, 101 int64 reference_time_ms_utc, 102 const std::string& reference_timezone, 103 const std::string& reference_locale, int locale_id, 104 std::vector<DatetimeParseResultSpan>* result) const; 105 106 private: 107 bool initialized_; 108 const UniLib& unilib_; 109 const CalendarLib& calendarlib_; 110 std::vector<CompiledRule> rules_; 111 std::unordered_map<int, std::vector<int>> locale_to_rules_; 112 std::vector<std::unique_ptr<const UniLib::RegexPattern>> extractor_rules_; 113 std::unordered_map<DatetimeExtractorType, std::unordered_map<int, int>> 114 type_and_locale_to_extractor_rule_; 115 std::unordered_map<std::string, int> locale_string_to_id_; 116 std::vector<int> default_locale_ids_; 117 bool use_extractors_for_locating_; 118 bool generate_alternative_interpretations_when_ambiguous_; 119 bool prefer_future_for_unspecified_date_; 120 }; 121 122 } // namespace libtextclassifier3 123 124 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_PARSER_H_ 125