1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_REGEX_PARSER_H_ 18 #define LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_REGEX_PARSER_H_ 19 20 #include <memory> 21 #include <string> 22 #include <unordered_map> 23 #include <unordered_set> 24 #include <vector> 25 26 #include "annotator/datetime/extractor.h" 27 #include "annotator/datetime/parser.h" 28 #include "annotator/model_generated.h" 29 #include "annotator/types.h" 30 #include "utils/base/integral_types.h" 31 #include "utils/base/statusor.h" 32 #include "utils/calendar/calendar.h" 33 #include "utils/strings/stringpiece.h" 34 #include "utils/utf8/unicodetext.h" 35 #include "utils/utf8/unilib.h" 36 #include "utils/zlib/zlib.h" 37 38 namespace libtextclassifier3 { 39 40 // Parses datetime expressions in the input and resolves them to actual absolute 41 // time. 42 class RegexDatetimeParser : public DatetimeParser { 43 public: 44 static std::unique_ptr<DatetimeParser> Instance( 45 const DatetimeModel* model, const UniLib* unilib, 46 const CalendarLib* calendarlib, ZlibDecompressor* decompressor); 47 48 // Parses the dates in 'input' and fills result. Makes sure that the results 49 // do not overlap. 50 // If 'anchor_start_end' is true the extracted results need to start at the 51 // beginning of 'input' and end at the end of it. 52 StatusOr<std::vector<DatetimeParseResultSpan>> Parse( 53 const std::string& input, int64 reference_time_ms_utc, 54 const std::string& reference_timezone, const LocaleList& locale_list, 55 ModeFlag mode, AnnotationUsecase annotation_usecase, 56 bool anchor_start_end) const override; 57 58 // Same as above but takes UnicodeText. 59 StatusOr<std::vector<DatetimeParseResultSpan>> Parse( 60 const UnicodeText& input, int64 reference_time_ms_utc, 61 const std::string& reference_timezone, const LocaleList& locale_list, 62 ModeFlag mode, AnnotationUsecase annotation_usecase, 63 bool anchor_start_end) const override; 64 65 protected: 66 explicit RegexDatetimeParser(const DatetimeModel* model, const UniLib* unilib, 67 const CalendarLib* calendarlib, 68 ZlibDecompressor* decompressor); 69 70 // Returns a list of locale ids for given locale spec string (collection of 71 // locale names). 72 std::vector<int> ParseAndExpandLocales( 73 const std::vector<StringPiece>& locales) const; 74 75 // Helper function that finds datetime spans, only using the rules associated 76 // with the given locales. 77 StatusOr<std::vector<DatetimeParseResultSpan>> FindSpansUsingLocales( 78 const std::vector<int>& locale_ids, const UnicodeText& input, 79 const int64 reference_time_ms_utc, const std::string& reference_timezone, 80 ModeFlag mode, AnnotationUsecase annotation_usecase, 81 bool anchor_start_end, const std::string& reference_locale, 82 std::unordered_set<int>* executed_rules) const; 83 84 StatusOr<std::vector<DatetimeParseResultSpan>> ParseWithRule( 85 const CompiledRule& rule, const UnicodeText& input, 86 int64 reference_time_ms_utc, const std::string& reference_timezone, 87 const std::string& reference_locale, const int locale_id, 88 bool anchor_start_end) const; 89 90 // Converts the current match in 'matcher' into DatetimeParseResult. 91 bool ExtractDatetime(const CompiledRule& rule, 92 const UniLib::RegexMatcher& matcher, 93 int64 reference_time_ms_utc, 94 const std::string& reference_timezone, 95 const std::string& reference_locale, int locale_id, 96 std::vector<DatetimeParseResult>* results, 97 CodepointSpan* result_span) const; 98 99 // Parse and extract information from current match in 'matcher'. 100 StatusOr<std::vector<DatetimeParseResultSpan>> HandleParseMatch( 101 const CompiledRule& rule, const UniLib::RegexMatcher& matcher, 102 int64 reference_time_ms_utc, const std::string& reference_timezone, 103 const std::string& reference_locale, int locale_id) const; 104 105 private: 106 bool initialized_; 107 const UniLib& unilib_; 108 const CalendarLib& calendarlib_; 109 std::vector<CompiledRule> rules_; 110 std::unordered_map<int, std::vector<int>> locale_to_rules_; 111 std::vector<std::unique_ptr<const UniLib::RegexPattern>> extractor_rules_; 112 std::unordered_map<DatetimeExtractorType, std::unordered_map<int, int>> 113 type_and_locale_to_extractor_rule_; 114 std::unordered_map<std::string, int> locale_string_to_id_; 115 std::vector<int> default_locale_ids_; 116 bool use_extractors_for_locating_; 117 bool generate_alternative_interpretations_when_ambiguous_; 118 bool prefer_future_for_unspecified_date_; 119 }; 120 121 } // namespace libtextclassifier3 122 123 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_REGEX_PARSER_H_ 124