/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_PARSER_H_
18 #define LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_PARSER_H_
19 
20 #include <memory>
21 #include <string>
22 #include <unordered_map>
23 #include <unordered_set>
24 #include <vector>
25 
26 #include "annotator/datetime/extractor.h"
27 #include "annotator/model_generated.h"
28 #include "annotator/types.h"
29 #include "utils/base/integral_types.h"
30 #include "utils/calendar/calendar.h"
31 #include "utils/utf8/unilib.h"
32 #include "utils/zlib/zlib.h"
33 
34 namespace libtextclassifier3 {
35 
36 // Parses datetime expressions in the input and resolves them to actual absolute
37 // time.
38 class DatetimeParser {
39  public:
40   static std::unique_ptr<DatetimeParser> Instance(
41       const DatetimeModel* model, const UniLib& unilib,
42       const CalendarLib& calendarlib, ZlibDecompressor* decompressor);
43 
44   // Parses the dates in 'input' and fills result. Makes sure that the results
45   // do not overlap.
46   // If 'anchor_start_end' is true the extracted results need to start at the
47   // beginning of 'input' and end at the end of it.
48   bool Parse(const std::string& input, int64 reference_time_ms_utc,
49              const std::string& reference_timezone, const std::string& locales,
50              ModeFlag mode, AnnotationUsecase annotation_usecase,
51              bool anchor_start_end,
52              std::vector<DatetimeParseResultSpan>* results) const;
53 
54   // Same as above but takes UnicodeText.
55   bool Parse(const UnicodeText& input, int64 reference_time_ms_utc,
56              const std::string& reference_timezone, const std::string& locales,
57              ModeFlag mode, AnnotationUsecase annotation_usecase,
58              bool anchor_start_end,
59              std::vector<DatetimeParseResultSpan>* results) const;
60 
61 #ifdef TC3_TEST_ONLY
TestOnlySetGenerateAlternativeInterpretationsWhenAmbiguous(bool value)62   void TestOnlySetGenerateAlternativeInterpretationsWhenAmbiguous(bool value) {
63     generate_alternative_interpretations_when_ambiguous_ = value;
64   }
65 #endif  // TC3_TEST_ONLY
66 
67  protected:
68   DatetimeParser(const DatetimeModel* model, const UniLib& unilib,
69                  const CalendarLib& calendarlib,
70                  ZlibDecompressor* decompressor);
71 
72   // Returns a list of locale ids for given locale spec string (comma-separated
73   // locale names). Assigns the first parsed locale to reference_locale.
74   std::vector<int> ParseAndExpandLocales(const std::string& locales,
75                                          std::string* reference_locale) const;
76 
77   // Helper function that finds datetime spans, only using the rules associated
78   // with the given locales.
79   bool FindSpansUsingLocales(
80       const std::vector<int>& locale_ids, const UnicodeText& input,
81       const int64 reference_time_ms_utc, const std::string& reference_timezone,
82       ModeFlag mode, AnnotationUsecase annotation_usecase,
83       bool anchor_start_end, const std::string& reference_locale,
84       std::unordered_set<int>* executed_rules,
85       std::vector<DatetimeParseResultSpan>* found_spans) const;
86 
87   bool ParseWithRule(const CompiledRule& rule, const UnicodeText& input,
88                      int64 reference_time_ms_utc,
89                      const std::string& reference_timezone,
90                      const std::string& reference_locale, const int locale_id,
91                      bool anchor_start_end,
92                      std::vector<DatetimeParseResultSpan>* result) const;
93 
94   void FillInterpretations(const DateParseData& parse,
95                            std::vector<DateParseData>* interpretations) const;
96 
97   // Converts the current match in 'matcher' into DatetimeParseResult.
98   bool ExtractDatetime(const CompiledRule& rule,
99                        const UniLib::RegexMatcher& matcher,
100                        int64 reference_time_ms_utc,
101                        const std::string& reference_timezone,
102                        const std::string& reference_locale, int locale_id,
103                        std::vector<DatetimeParseResult>* results,
104                        CodepointSpan* result_span) const;
105 
106   // Parse and extract information from current match in 'matcher'.
107   bool HandleParseMatch(const CompiledRule& rule,
108                         const UniLib::RegexMatcher& matcher,
109                         int64 reference_time_ms_utc,
110                         const std::string& reference_timezone,
111                         const std::string& reference_locale, int locale_id,
112                         std::vector<DatetimeParseResultSpan>* result) const;
113 
114  private:
115   bool initialized_;
116   const UniLib& unilib_;
117   const CalendarLib& calendarlib_;
118   std::vector<CompiledRule> rules_;
119   std::unordered_map<int, std::vector<int>> locale_to_rules_;
120   std::vector<std::unique_ptr<const UniLib::RegexPattern>> extractor_rules_;
121   std::unordered_map<DatetimeExtractorType, std::unordered_map<int, int>>
122       type_and_locale_to_extractor_rule_;
123   std::unordered_map<std::string, int> locale_string_to_id_;
124   std::vector<int> default_locale_ids_;
125   bool use_extractors_for_locating_;
126   bool generate_alternative_interpretations_when_ambiguous_;
127 };
128 
129 }  // namespace libtextclassifier3
130 
131 #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_PARSER_H_
132