1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_PARSER_H_ 18 #define LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_PARSER_H_ 19 20 #include <memory> 21 #include <string> 22 #include <unordered_map> 23 #include <unordered_set> 24 #include <vector> 25 26 #include "annotator/datetime/extractor.h" 27 #include "annotator/model_generated.h" 28 #include "annotator/types.h" 29 #include "utils/base/integral_types.h" 30 #include "utils/calendar/calendar.h" 31 #include "utils/utf8/unilib.h" 32 #include "utils/zlib/zlib.h" 33 34 namespace libtextclassifier3 { 35 36 // Parses datetime expressions in the input and resolves them to actual absolute 37 // time. 38 class DatetimeParser { 39 public: 40 static std::unique_ptr<DatetimeParser> Instance( 41 const DatetimeModel* model, const UniLib& unilib, 42 const CalendarLib& calendarlib, ZlibDecompressor* decompressor); 43 44 // Parses the dates in 'input' and fills result. Makes sure that the results 45 // do not overlap. 46 // If 'anchor_start_end' is true the extracted results need to start at the 47 // beginning of 'input' and end at the end of it. 48 bool Parse(const std::string& input, int64 reference_time_ms_utc, 49 const std::string& reference_timezone, const std::string& locales, 50 ModeFlag mode, AnnotationUsecase annotation_usecase, 51 bool anchor_start_end, 52 std::vector<DatetimeParseResultSpan>* results) const; 53 54 // Same as above but takes UnicodeText. 55 bool Parse(const UnicodeText& input, int64 reference_time_ms_utc, 56 const std::string& reference_timezone, const std::string& locales, 57 ModeFlag mode, AnnotationUsecase annotation_usecase, 58 bool anchor_start_end, 59 std::vector<DatetimeParseResultSpan>* results) const; 60 61 #ifdef TC3_TEST_ONLY TestOnlySetGenerateAlternativeInterpretationsWhenAmbiguous(bool value)62 void TestOnlySetGenerateAlternativeInterpretationsWhenAmbiguous(bool value) { 63 generate_alternative_interpretations_when_ambiguous_ = value; 64 } 65 #endif // TC3_TEST_ONLY 66 67 protected: 68 DatetimeParser(const DatetimeModel* model, const UniLib& unilib, 69 const CalendarLib& calendarlib, 70 ZlibDecompressor* decompressor); 71 72 // Returns a list of locale ids for given locale spec string (comma-separated 73 // locale names). Assigns the first parsed locale to reference_locale. 74 std::vector<int> ParseAndExpandLocales(const std::string& locales, 75 std::string* reference_locale) const; 76 77 // Helper function that finds datetime spans, only using the rules associated 78 // with the given locales. 79 bool FindSpansUsingLocales( 80 const std::vector<int>& locale_ids, const UnicodeText& input, 81 const int64 reference_time_ms_utc, const std::string& reference_timezone, 82 ModeFlag mode, AnnotationUsecase annotation_usecase, 83 bool anchor_start_end, const std::string& reference_locale, 84 std::unordered_set<int>* executed_rules, 85 std::vector<DatetimeParseResultSpan>* found_spans) const; 86 87 bool ParseWithRule(const CompiledRule& rule, const UnicodeText& input, 88 int64 reference_time_ms_utc, 89 const std::string& reference_timezone, 90 const std::string& reference_locale, const int locale_id, 91 bool anchor_start_end, 92 std::vector<DatetimeParseResultSpan>* result) const; 93 94 void FillInterpretations(const DateParseData& parse, 95 std::vector<DateParseData>* interpretations) const; 96 97 // Converts the current match in 'matcher' into DatetimeParseResult. 98 bool ExtractDatetime(const CompiledRule& rule, 99 const UniLib::RegexMatcher& matcher, 100 int64 reference_time_ms_utc, 101 const std::string& reference_timezone, 102 const std::string& reference_locale, int locale_id, 103 std::vector<DatetimeParseResult>* results, 104 CodepointSpan* result_span) const; 105 106 // Parse and extract information from current match in 'matcher'. 107 bool HandleParseMatch(const CompiledRule& rule, 108 const UniLib::RegexMatcher& matcher, 109 int64 reference_time_ms_utc, 110 const std::string& reference_timezone, 111 const std::string& reference_locale, int locale_id, 112 std::vector<DatetimeParseResultSpan>* result) const; 113 114 private: 115 bool initialized_; 116 const UniLib& unilib_; 117 const CalendarLib& calendarlib_; 118 std::vector<CompiledRule> rules_; 119 std::unordered_map<int, std::vector<int>> locale_to_rules_; 120 std::vector<std::unique_ptr<const UniLib::RegexPattern>> extractor_rules_; 121 std::unordered_map<DatetimeExtractorType, std::unordered_map<int, int>> 122 type_and_locale_to_extractor_rule_; 123 std::unordered_map<std::string, int> locale_string_to_id_; 124 std::vector<int> default_locale_ids_; 125 bool use_extractors_for_locating_; 126 bool generate_alternative_interpretations_when_ambiguous_; 127 }; 128 129 } // namespace libtextclassifier3 130 131 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_PARSER_H_ 132