• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "annotator/datetime/extractor.h"
18 
19 #include <algorithm>
20 
21 #include "annotator/datetime/utils.h"
22 #include "annotator/model_generated.h"
23 #include "annotator/types.h"
24 #include "utils/base/logging.h"
25 
26 namespace libtextclassifier3 {
27 
Extract(DatetimeParsedData * result,CodepointSpan * result_span) const28 bool DatetimeExtractor::Extract(DatetimeParsedData* result,
29                                 CodepointSpan* result_span) const {
30   *result_span = {kInvalidIndex, kInvalidIndex};
31 
32   if (rule_.regex->groups() == nullptr) {
33     return false;
34   }
35 
36   // In the current implementation of extractor, the assumption is that there
37   // can only be one relative field.
38   DatetimeComponent::ComponentType component_type;
39   DatetimeComponent::RelativeQualifier relative_qualifier =
40       DatetimeComponent::RelativeQualifier::UNSPECIFIED;
41   int relative_count = 0;
42 
43   for (int group_id = 0; group_id < rule_.regex->groups()->size(); group_id++) {
44     UnicodeText group_text;
45     const int group_type = rule_.regex->groups()->Get(group_id);
46     if (group_type == DatetimeGroupType_GROUP_UNUSED) {
47       continue;
48     }
49     if (!GroupTextFromMatch(group_id, &group_text)) {
50       TC3_LOG(ERROR) << "Couldn't retrieve group.";
51       return false;
52     }
53     // The pattern can have a group defined in a part that was not matched,
54     // e.g. an optional part. In this case we'll get an empty content here.
55     if (group_text.empty()) {
56       continue;
57     }
58 
59     switch (group_type) {
60       case DatetimeGroupType_GROUP_YEAR: {
61         int year;
62         if (!ParseYear(group_text, &(year))) {
63           TC3_LOG(ERROR) << "Couldn't extract YEAR.";
64           return false;
65         }
66         result->SetAbsoluteValue(DatetimeComponent::ComponentType::YEAR, year);
67         break;
68       }
69       case DatetimeGroupType_GROUP_MONTH: {
70         int month;
71         if (!ParseMonth(group_text, &(month))) {
72           TC3_LOG(ERROR) << "Couldn't extract MONTH.";
73           return false;
74         }
75         result->SetAbsoluteValue(DatetimeComponent::ComponentType::MONTH,
76                                  month);
77         break;
78       }
79       case DatetimeGroupType_GROUP_DAY: {
80         int day_of_month;
81         if (!ParseDigits(group_text, &(day_of_month))) {
82           TC3_LOG(ERROR) << "Couldn't extract DAY.";
83           return false;
84         }
85         result->SetAbsoluteValue(DatetimeComponent::ComponentType::DAY_OF_MONTH,
86                                  day_of_month);
87         break;
88       }
89       case DatetimeGroupType_GROUP_HOUR: {
90         int hour;
91         if (!ParseDigits(group_text, &(hour))) {
92           TC3_LOG(ERROR) << "Couldn't extract HOUR.";
93           return false;
94         }
95         result->SetAbsoluteValue(DatetimeComponent::ComponentType::HOUR, hour);
96         break;
97       }
98       case DatetimeGroupType_GROUP_MINUTE: {
99         int minute;
100         if (!ParseDigits(group_text, &(minute)) &&
101             !ParseWrittenNumber(group_text, &(minute))) {
102           TC3_LOG(ERROR) << "Couldn't extract MINUTE.";
103           return false;
104         }
105         result->SetAbsoluteValue(DatetimeComponent::ComponentType::MINUTE,
106                                  minute);
107         break;
108       }
109       case DatetimeGroupType_GROUP_SECOND: {
110         int second;
111         if (!ParseDigits(group_text, &(second))) {
112           TC3_LOG(ERROR) << "Couldn't extract SECOND.";
113           return false;
114         }
115         result->SetAbsoluteValue(DatetimeComponent::ComponentType::SECOND,
116                                  second);
117         break;
118       }
119       case DatetimeGroupType_GROUP_AMPM: {
120         int meridiem;
121         if (!ParseMeridiem(group_text, &(meridiem))) {
122           TC3_LOG(ERROR) << "Couldn't extract AMPM.";
123           return false;
124         }
125         result->SetAbsoluteValue(DatetimeComponent::ComponentType::MERIDIEM,
126                                  meridiem);
127         break;
128       }
129       case DatetimeGroupType_GROUP_RELATIONDISTANCE: {
130         relative_count = 0;
131         if (!ParseRelationDistance(group_text, &(relative_count))) {
132           TC3_LOG(ERROR) << "Couldn't extract RELATION_DISTANCE_FIELD.";
133           return false;
134         }
135         break;
136       }
137       case DatetimeGroupType_GROUP_RELATION: {
138         if (!ParseRelativeValue(group_text, &relative_qualifier)) {
139           TC3_LOG(ERROR) << "Couldn't extract RELATION_FIELD.";
140           return false;
141         }
142         ParseRelationAndConvertToRelativeCount(group_text, &relative_count);
143         if (relative_qualifier ==
144                 DatetimeComponent::RelativeQualifier::TOMORROW ||
145             relative_qualifier == DatetimeComponent::RelativeQualifier::NOW ||
146             relative_qualifier ==
147                 DatetimeComponent::RelativeQualifier::YESTERDAY) {
148           if (!ParseFieldType(group_text, &component_type)) {
149             TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
150             return false;
151           }
152         }
153         break;
154       }
155       case DatetimeGroupType_GROUP_RELATIONTYPE: {
156         if (!ParseFieldType(group_text, &component_type)) {
157           TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
158           return false;
159         }
160         if (component_type == DatetimeComponent::ComponentType::DAY_OF_WEEK) {
161           int day_of_week;
162           if (!ParseDayOfWeek(group_text, &day_of_week)) {
163             TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
164             return false;
165           }
166           result->SetAbsoluteValue(component_type, day_of_week);
167         }
168         break;
169       }
170       case DatetimeGroupType_GROUP_ABSOLUTETIME: {
171         std::unordered_map<DatetimeComponent::ComponentType, int> values;
172         if (!ParseAbsoluteDateValues(group_text, &values)) {
173           TC3_LOG(ERROR) << "Couldn't extract Component values.";
174           return false;
175         }
176         for (const std::pair<const DatetimeComponent::ComponentType, int>&
177                  date_time_pair : values) {
178           result->SetAbsoluteValue(date_time_pair.first, date_time_pair.second);
179         }
180         break;
181       }
182       case DatetimeGroupType_GROUP_DUMMY1:
183       case DatetimeGroupType_GROUP_DUMMY2:
184         break;
185       default:
186         TC3_LOG(INFO) << "Unknown group type.";
187         continue;
188     }
189     if (!UpdateMatchSpan(group_id, result_span)) {
190       TC3_LOG(ERROR) << "Couldn't update span.";
191       return false;
192     }
193   }
194 
195   if (relative_qualifier != DatetimeComponent::RelativeQualifier::UNSPECIFIED) {
196     result->SetRelativeValue(component_type, relative_qualifier);
197     result->SetRelativeCount(component_type, relative_count);
198   }
199 
200   if (result_span->first == kInvalidIndex ||
201       result_span->second == kInvalidIndex) {
202     *result_span = {kInvalidIndex, kInvalidIndex};
203   }
204 
205   return true;
206 }
207 
RuleIdForType(DatetimeExtractorType type,int * rule_id) const208 bool DatetimeExtractor::RuleIdForType(DatetimeExtractorType type,
209                                       int* rule_id) const {
210   auto type_it = type_and_locale_to_rule_.find(type);
211   if (type_it == type_and_locale_to_rule_.end()) {
212     return false;
213   }
214 
215   auto locale_it = type_it->second.find(locale_id_);
216   if (locale_it == type_it->second.end()) {
217     return false;
218   }
219   *rule_id = locale_it->second;
220   return true;
221 }
222 
ExtractType(const UnicodeText & input,DatetimeExtractorType extractor_type,UnicodeText * match_result) const223 bool DatetimeExtractor::ExtractType(const UnicodeText& input,
224                                     DatetimeExtractorType extractor_type,
225                                     UnicodeText* match_result) const {
226   int rule_id;
227   if (!RuleIdForType(extractor_type, &rule_id)) {
228     return false;
229   }
230 
231   std::unique_ptr<UniLib::RegexMatcher> matcher =
232       rules_[rule_id]->Matcher(input);
233   if (!matcher) {
234     return false;
235   }
236 
237   int status;
238   if (!matcher->Find(&status)) {
239     return false;
240   }
241 
242   if (match_result != nullptr) {
243     *match_result = matcher->Group(&status);
244     if (status != UniLib::RegexMatcher::kNoError) {
245       return false;
246     }
247   }
248   return true;
249 }
250 
GroupTextFromMatch(int group_id,UnicodeText * result) const251 bool DatetimeExtractor::GroupTextFromMatch(int group_id,
252                                            UnicodeText* result) const {
253   int status;
254   *result = matcher_.Group(group_id, &status);
255   if (status != UniLib::RegexMatcher::kNoError) {
256     return false;
257   }
258   return true;
259 }
260 
UpdateMatchSpan(int group_id,CodepointSpan * span) const261 bool DatetimeExtractor::UpdateMatchSpan(int group_id,
262                                         CodepointSpan* span) const {
263   int status;
264   const int match_start = matcher_.Start(group_id, &status);
265   if (status != UniLib::RegexMatcher::kNoError) {
266     return false;
267   }
268   const int match_end = matcher_.End(group_id, &status);
269   if (status != UniLib::RegexMatcher::kNoError) {
270     return false;
271   }
272   if (span->first == kInvalidIndex || span->first > match_start) {
273     span->first = match_start;
274   }
275   if (span->second == kInvalidIndex || span->second < match_end) {
276     span->second = match_end;
277   }
278 
279   return true;
280 }
281 
282 template <typename T>
MapInput(const UnicodeText & input,const std::vector<std::pair<DatetimeExtractorType,T>> & mapping,T * result) const283 bool DatetimeExtractor::MapInput(
284     const UnicodeText& input,
285     const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
286     T* result) const {
287   for (const auto& type_value_pair : mapping) {
288     if (ExtractType(input, type_value_pair.first)) {
289       *result = type_value_pair.second;
290       return true;
291     }
292   }
293   return false;
294 }
295 
ParseWrittenNumber(const UnicodeText & input,int * parsed_number) const296 bool DatetimeExtractor::ParseWrittenNumber(const UnicodeText& input,
297                                            int* parsed_number) const {
298   std::vector<std::pair<int, int>> found_numbers;
299   for (const auto& type_value_pair :
300        std::vector<std::pair<DatetimeExtractorType, int>>{
301            {DatetimeExtractorType_ZERO, 0},
302            {DatetimeExtractorType_ONE, 1},
303            {DatetimeExtractorType_TWO, 2},
304            {DatetimeExtractorType_THREE, 3},
305            {DatetimeExtractorType_FOUR, 4},
306            {DatetimeExtractorType_FIVE, 5},
307            {DatetimeExtractorType_SIX, 6},
308            {DatetimeExtractorType_SEVEN, 7},
309            {DatetimeExtractorType_EIGHT, 8},
310            {DatetimeExtractorType_NINE, 9},
311            {DatetimeExtractorType_TEN, 10},
312            {DatetimeExtractorType_ELEVEN, 11},
313            {DatetimeExtractorType_TWELVE, 12},
314            {DatetimeExtractorType_THIRTEEN, 13},
315            {DatetimeExtractorType_FOURTEEN, 14},
316            {DatetimeExtractorType_FIFTEEN, 15},
317            {DatetimeExtractorType_SIXTEEN, 16},
318            {DatetimeExtractorType_SEVENTEEN, 17},
319            {DatetimeExtractorType_EIGHTEEN, 18},
320            {DatetimeExtractorType_NINETEEN, 19},
321            {DatetimeExtractorType_TWENTY, 20},
322            {DatetimeExtractorType_THIRTY, 30},
323            {DatetimeExtractorType_FORTY, 40},
324            {DatetimeExtractorType_FIFTY, 50},
325            {DatetimeExtractorType_SIXTY, 60},
326            {DatetimeExtractorType_SEVENTY, 70},
327            {DatetimeExtractorType_EIGHTY, 80},
328            {DatetimeExtractorType_NINETY, 90},
329            {DatetimeExtractorType_HUNDRED, 100},
330            {DatetimeExtractorType_THOUSAND, 1000},
331        }) {
332     int rule_id;
333     if (!RuleIdForType(type_value_pair.first, &rule_id)) {
334       return false;
335     }
336 
337     std::unique_ptr<UniLib::RegexMatcher> matcher =
338         rules_[rule_id]->Matcher(input);
339     if (!matcher) {
340       return false;
341     }
342     int status;
343     while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {
344       int span_start = matcher->Start(&status);
345       if (status != UniLib::RegexMatcher::kNoError) {
346         return false;
347       }
348       found_numbers.push_back({span_start, type_value_pair.second});
349     }
350   }
351 
352   std::stable_sort(
353       found_numbers.begin(), found_numbers.end(),
354       [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
355         return a.first < b.first;
356       });
357 
358   int sum = 0;
359   int running_value = -1;
360   // Simple math to make sure we handle written numerical modifiers correctly
361   // so that :="fifty one  thousand and one" maps to 51001 and not 50 1 1000 1.
362   for (const std::pair<int, int>& position_number_pair : found_numbers) {
363     if (running_value >= 0) {
364       if (running_value > position_number_pair.second) {
365         sum += running_value;
366         running_value = position_number_pair.second;
367       } else {
368         running_value *= position_number_pair.second;
369       }
370     } else {
371       running_value = position_number_pair.second;
372     }
373   }
374   sum += running_value;
375   *parsed_number = sum;
376   return true;
377 }
378 
ParseDigits(const UnicodeText & input,int * parsed_digits) const379 bool DatetimeExtractor::ParseDigits(const UnicodeText& input,
380                                     int* parsed_digits) const {
381   UnicodeText digit;
382   if (!ExtractType(input, DatetimeExtractorType_DIGITS, &digit)) {
383     return false;
384   }
385 
386   if (!unilib_.ParseInt32(digit, parsed_digits)) {
387     return false;
388   }
389   return true;
390 }
391 
ParseYear(const UnicodeText & input,int * parsed_year) const392 bool DatetimeExtractor::ParseYear(const UnicodeText& input,
393                                   int* parsed_year) const {
394   if (!ParseDigits(input, parsed_year)) {
395     return false;
396   }
397   *parsed_year = GetAdjustedYear(*parsed_year);
398 
399   return true;
400 }
401 
ParseMonth(const UnicodeText & input,int * parsed_month) const402 bool DatetimeExtractor::ParseMonth(const UnicodeText& input,
403                                    int* parsed_month) const {
404   if (ParseDigits(input, parsed_month)) {
405     return true;
406   }
407 
408   if (MapInput(input,
409                {
410                    {DatetimeExtractorType_JANUARY, 1},
411                    {DatetimeExtractorType_FEBRUARY, 2},
412                    {DatetimeExtractorType_MARCH, 3},
413                    {DatetimeExtractorType_APRIL, 4},
414                    {DatetimeExtractorType_MAY, 5},
415                    {DatetimeExtractorType_JUNE, 6},
416                    {DatetimeExtractorType_JULY, 7},
417                    {DatetimeExtractorType_AUGUST, 8},
418                    {DatetimeExtractorType_SEPTEMBER, 9},
419                    {DatetimeExtractorType_OCTOBER, 10},
420                    {DatetimeExtractorType_NOVEMBER, 11},
421                    {DatetimeExtractorType_DECEMBER, 12},
422                },
423                parsed_month)) {
424     return true;
425   }
426 
427   return false;
428 }
429 
ParseAbsoluteDateValues(const UnicodeText & input,std::unordered_map<DatetimeComponent::ComponentType,int> * values) const430 bool DatetimeExtractor::ParseAbsoluteDateValues(
431     const UnicodeText& input,
432     std::unordered_map<DatetimeComponent::ComponentType, int>* values) const {
433   if (MapInput(input,
434                {
435                    {DatetimeExtractorType_NOON,
436                     {{DatetimeComponent::ComponentType::MERIDIEM, 1},
437                      {DatetimeComponent::ComponentType::MINUTE, 0},
438                      {DatetimeComponent::ComponentType::HOUR, 12}}},
439                    {DatetimeExtractorType_MIDNIGHT,
440                     {{DatetimeComponent::ComponentType::MERIDIEM, 0},
441                      {DatetimeComponent::ComponentType::MINUTE, 0},
442                      {DatetimeComponent::ComponentType::HOUR, 0}}},
443                },
444                values)) {
445     return true;
446   }
447   return false;
448 }
449 
ParseMeridiem(const UnicodeText & input,int * parsed_meridiem) const450 bool DatetimeExtractor::ParseMeridiem(const UnicodeText& input,
451                                       int* parsed_meridiem) const {
452   return MapInput(input,
453                   {
454                       {DatetimeExtractorType_AM, 0 /* AM */},
455                       {DatetimeExtractorType_PM, 1 /* PM */},
456                   },
457                   parsed_meridiem);
458 }
459 
ParseRelationDistance(const UnicodeText & input,int * parsed_distance) const460 bool DatetimeExtractor::ParseRelationDistance(const UnicodeText& input,
461                                               int* parsed_distance) const {
462   if (ParseDigits(input, parsed_distance)) {
463     return true;
464   }
465   if (ParseWrittenNumber(input, parsed_distance)) {
466     return true;
467   }
468   return false;
469 }
470 
ParseRelativeValue(const UnicodeText & input,DatetimeComponent::RelativeQualifier * parsed_relative_value) const471 bool DatetimeExtractor::ParseRelativeValue(
472     const UnicodeText& input,
473     DatetimeComponent::RelativeQualifier* parsed_relative_value) const {
474   return MapInput(input,
475                   {
476                       {DatetimeExtractorType_NOW,
477                        DatetimeComponent::RelativeQualifier::NOW},
478                       {DatetimeExtractorType_YESTERDAY,
479                        DatetimeComponent::RelativeQualifier::YESTERDAY},
480                       {DatetimeExtractorType_TOMORROW,
481                        DatetimeComponent::RelativeQualifier::TOMORROW},
482                       {DatetimeExtractorType_NEXT,
483                        DatetimeComponent::RelativeQualifier::NEXT},
484                       {DatetimeExtractorType_NEXT_OR_SAME,
485                        DatetimeComponent::RelativeQualifier::THIS},
486                       {DatetimeExtractorType_LAST,
487                        DatetimeComponent::RelativeQualifier::LAST},
488                       {DatetimeExtractorType_PAST,
489                        DatetimeComponent::RelativeQualifier::PAST},
490                       {DatetimeExtractorType_FUTURE,
491                        DatetimeComponent::RelativeQualifier::FUTURE},
492                   },
493                   parsed_relative_value);
494 }
495 
ParseRelationAndConvertToRelativeCount(const UnicodeText & input,int * relative_count) const496 bool DatetimeExtractor::ParseRelationAndConvertToRelativeCount(
497     const UnicodeText& input, int* relative_count) const {
498   return MapInput(input,
499                   {
500                       {DatetimeExtractorType_NOW, 0},
501                       {DatetimeExtractorType_YESTERDAY, -1},
502                       {DatetimeExtractorType_TOMORROW, 1},
503                       {DatetimeExtractorType_NEXT, 1},
504                       {DatetimeExtractorType_NEXT_OR_SAME, 1},
505                       {DatetimeExtractorType_LAST, -1},
506                       {DatetimeExtractorType_PAST, -1},
507                   },
508                   relative_count);
509 }
510 
ParseDayOfWeek(const UnicodeText & input,int * parsed_day_of_week) const511 bool DatetimeExtractor::ParseDayOfWeek(const UnicodeText& input,
512                                        int* parsed_day_of_week) const {
513   return MapInput(input,
514                   {
515                       {DatetimeExtractorType_SUNDAY, kSunday},
516                       {DatetimeExtractorType_MONDAY, kMonday},
517                       {DatetimeExtractorType_TUESDAY, kTuesday},
518                       {DatetimeExtractorType_WEDNESDAY, kWednesday},
519                       {DatetimeExtractorType_THURSDAY, kThursday},
520                       {DatetimeExtractorType_FRIDAY, kFriday},
521                       {DatetimeExtractorType_SATURDAY, kSaturday},
522                   },
523                   parsed_day_of_week);
524 }
525 
ParseFieldType(const UnicodeText & input,DatetimeComponent::ComponentType * parsed_field_type) const526 bool DatetimeExtractor::ParseFieldType(
527     const UnicodeText& input,
528     DatetimeComponent::ComponentType* parsed_field_type) const {
529   return MapInput(
530       input,
531       {
532           {DatetimeExtractorType_MONDAY,
533            DatetimeComponent::ComponentType::DAY_OF_WEEK},
534           {DatetimeExtractorType_TUESDAY,
535            DatetimeComponent::ComponentType::DAY_OF_WEEK},
536           {DatetimeExtractorType_WEDNESDAY,
537            DatetimeComponent::ComponentType::DAY_OF_WEEK},
538           {DatetimeExtractorType_THURSDAY,
539            DatetimeComponent::ComponentType::DAY_OF_WEEK},
540           {DatetimeExtractorType_FRIDAY,
541            DatetimeComponent::ComponentType::DAY_OF_WEEK},
542           {DatetimeExtractorType_SATURDAY,
543            DatetimeComponent::ComponentType::DAY_OF_WEEK},
544           {DatetimeExtractorType_SUNDAY,
545            DatetimeComponent::ComponentType::DAY_OF_WEEK},
546           {DatetimeExtractorType_SECONDS,
547            DatetimeComponent::ComponentType::SECOND},
548           {DatetimeExtractorType_MINUTES,
549            DatetimeComponent::ComponentType::MINUTE},
550           {DatetimeExtractorType_NOW,
551            DatetimeComponent::ComponentType::DAY_OF_MONTH},
552           {DatetimeExtractorType_HOURS, DatetimeComponent::ComponentType::HOUR},
553           {DatetimeExtractorType_DAY,
554            DatetimeComponent::ComponentType::DAY_OF_MONTH},
555           {DatetimeExtractorType_TOMORROW,
556            DatetimeComponent::ComponentType::DAY_OF_MONTH},
557           {DatetimeExtractorType_YESTERDAY,
558            DatetimeComponent::ComponentType::DAY_OF_MONTH},
559           {DatetimeExtractorType_WEEK, DatetimeComponent::ComponentType::WEEK},
560           {DatetimeExtractorType_MONTH,
561            DatetimeComponent::ComponentType::MONTH},
562           {DatetimeExtractorType_YEAR, DatetimeComponent::ComponentType::YEAR},
563       },
564       parsed_field_type);
565 }
566 
567 }  // namespace libtextclassifier3
568