• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #include "annotator/datetime/extractor.h"
18 
19 #include "utils/base/logging.h"
20 
21 namespace libtextclassifier3 {
22 
Extract(DatetimeParsedData * result,CodepointSpan * result_span) const23 bool DatetimeExtractor::Extract(DatetimeParsedData* result,
24                                 CodepointSpan* result_span) const {
25   *result_span = {kInvalidIndex, kInvalidIndex};
26 
27   if (rule_.regex->groups() == nullptr) {
28     return false;
29   }
30 
31   // In the current implementation of extractor, the assumption is that there
32   // can only be one relative field.
33   DatetimeComponent::ComponentType component_type;
34   DatetimeComponent::RelativeQualifier relative_qualifier =
35       DatetimeComponent::RelativeQualifier::UNSPECIFIED;
36   int relative_count = 0;
37 
38   for (int group_id = 0; group_id < rule_.regex->groups()->size(); group_id++) {
39     UnicodeText group_text;
40     const int group_type = rule_.regex->groups()->Get(group_id);
41     if (group_type == DatetimeGroupType_GROUP_UNUSED) {
42       continue;
43     }
44     if (!GroupTextFromMatch(group_id, &group_text)) {
45       TC3_LOG(ERROR) << "Couldn't retrieve group.";
46       return false;
47     }
48     // The pattern can have a group defined in a part that was not matched,
49     // e.g. an optional part. In this case we'll get an empty content here.
50     if (group_text.empty()) {
51       continue;
52     }
53 
54     switch (group_type) {
55       case DatetimeGroupType_GROUP_YEAR: {
56         int year;
57         if (!ParseYear(group_text, &(year))) {
58           TC3_LOG(ERROR) << "Couldn't extract YEAR.";
59           return false;
60         }
61         result->SetAbsoluteValue(DatetimeComponent::ComponentType::YEAR, year);
62         break;
63       }
64       case DatetimeGroupType_GROUP_MONTH: {
65         int month;
66         if (!ParseMonth(group_text, &(month))) {
67           TC3_LOG(ERROR) << "Couldn't extract MONTH.";
68           return false;
69         }
70         result->SetAbsoluteValue(DatetimeComponent::ComponentType::MONTH,
71                                  month);
72         break;
73       }
74       case DatetimeGroupType_GROUP_DAY: {
75         int day_of_month;
76         if (!ParseDigits(group_text, &(day_of_month))) {
77           TC3_LOG(ERROR) << "Couldn't extract DAY.";
78           return false;
79         }
80         result->SetAbsoluteValue(DatetimeComponent::ComponentType::DAY_OF_MONTH,
81                                  day_of_month);
82         break;
83       }
84       case DatetimeGroupType_GROUP_HOUR: {
85         int hour;
86         if (!ParseDigits(group_text, &(hour))) {
87           TC3_LOG(ERROR) << "Couldn't extract HOUR.";
88           return false;
89         }
90         result->SetAbsoluteValue(DatetimeComponent::ComponentType::HOUR, hour);
91         break;
92       }
93       case DatetimeGroupType_GROUP_MINUTE: {
94         int minute;
95         if (!ParseDigits(group_text, &(minute)) &&
96             !ParseWrittenNumber(group_text, &(minute))) {
97           TC3_LOG(ERROR) << "Couldn't extract MINUTE.";
98           return false;
99         }
100         result->SetAbsoluteValue(DatetimeComponent::ComponentType::MINUTE,
101                                  minute);
102         break;
103       }
104       case DatetimeGroupType_GROUP_SECOND: {
105         int second;
106         if (!ParseDigits(group_text, &(second))) {
107           TC3_LOG(ERROR) << "Couldn't extract SECOND.";
108           return false;
109         }
110         result->SetAbsoluteValue(DatetimeComponent::ComponentType::SECOND,
111                                  second);
112         break;
113       }
114       case DatetimeGroupType_GROUP_AMPM: {
115         int meridiem;
116         if (!ParseMeridiem(group_text, &(meridiem))) {
117           TC3_LOG(ERROR) << "Couldn't extract AMPM.";
118           return false;
119         }
120         result->SetAbsoluteValue(DatetimeComponent::ComponentType::MERIDIEM,
121                                  meridiem);
122         break;
123       }
124       case DatetimeGroupType_GROUP_RELATIONDISTANCE: {
125         relative_count = 0;
126         if (!ParseRelationDistance(group_text, &(relative_count))) {
127           TC3_LOG(ERROR) << "Couldn't extract RELATION_DISTANCE_FIELD.";
128           return false;
129         }
130         break;
131       }
132       case DatetimeGroupType_GROUP_RELATION: {
133         if (!ParseRelativeValue(group_text, &relative_qualifier)) {
134           TC3_LOG(ERROR) << "Couldn't extract RELATION_FIELD.";
135           return false;
136         }
137         ParseRelationAndConvertToRelativeCount(group_text, &relative_count);
138         if (relative_qualifier ==
139                 DatetimeComponent::RelativeQualifier::TOMORROW ||
140             relative_qualifier == DatetimeComponent::RelativeQualifier::NOW ||
141             relative_qualifier ==
142                 DatetimeComponent::RelativeQualifier::YESTERDAY) {
143           if (!ParseFieldType(group_text, &component_type)) {
144             TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
145             return false;
146           }
147         }
148         break;
149       }
150       case DatetimeGroupType_GROUP_RELATIONTYPE: {
151         if (!ParseFieldType(group_text, &component_type)) {
152           TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
153           return false;
154         }
155         if (component_type == DatetimeComponent::ComponentType::DAY_OF_WEEK) {
156           int day_of_week;
157           if (!ParseDayOfWeek(group_text, &day_of_week)) {
158             TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
159             return false;
160           }
161           result->SetAbsoluteValue(component_type, day_of_week);
162         }
163         break;
164       }
165       case DatetimeGroupType_GROUP_DUMMY1:
166       case DatetimeGroupType_GROUP_DUMMY2:
167         break;
168       default:
169         TC3_LOG(INFO) << "Unknown group type.";
170         continue;
171     }
172     if (!UpdateMatchSpan(group_id, result_span)) {
173       TC3_LOG(ERROR) << "Couldn't update span.";
174       return false;
175     }
176   }
177 
178   if (relative_qualifier != DatetimeComponent::RelativeQualifier::UNSPECIFIED) {
179     result->SetRelativeValue(component_type, relative_qualifier);
180     result->SetRelativeCount(component_type, relative_count);
181   }
182 
183   if (result_span->first == kInvalidIndex ||
184       result_span->second == kInvalidIndex) {
185     *result_span = {kInvalidIndex, kInvalidIndex};
186   }
187 
188   return true;
189 }
190 
RuleIdForType(DatetimeExtractorType type,int * rule_id) const191 bool DatetimeExtractor::RuleIdForType(DatetimeExtractorType type,
192                                       int* rule_id) const {
193   auto type_it = type_and_locale_to_rule_.find(type);
194   if (type_it == type_and_locale_to_rule_.end()) {
195     return false;
196   }
197 
198   auto locale_it = type_it->second.find(locale_id_);
199   if (locale_it == type_it->second.end()) {
200     return false;
201   }
202   *rule_id = locale_it->second;
203   return true;
204 }
205 
ExtractType(const UnicodeText & input,DatetimeExtractorType extractor_type,UnicodeText * match_result) const206 bool DatetimeExtractor::ExtractType(const UnicodeText& input,
207                                     DatetimeExtractorType extractor_type,
208                                     UnicodeText* match_result) const {
209   int rule_id;
210   if (!RuleIdForType(extractor_type, &rule_id)) {
211     return false;
212   }
213 
214   std::unique_ptr<UniLib::RegexMatcher> matcher =
215       rules_[rule_id]->Matcher(input);
216   if (!matcher) {
217     return false;
218   }
219 
220   int status;
221   if (!matcher->Find(&status)) {
222     return false;
223   }
224 
225   if (match_result != nullptr) {
226     *match_result = matcher->Group(&status);
227     if (status != UniLib::RegexMatcher::kNoError) {
228       return false;
229     }
230   }
231   return true;
232 }
233 
GroupTextFromMatch(int group_id,UnicodeText * result) const234 bool DatetimeExtractor::GroupTextFromMatch(int group_id,
235                                            UnicodeText* result) const {
236   int status;
237   *result = matcher_.Group(group_id, &status);
238   if (status != UniLib::RegexMatcher::kNoError) {
239     return false;
240   }
241   return true;
242 }
243 
UpdateMatchSpan(int group_id,CodepointSpan * span) const244 bool DatetimeExtractor::UpdateMatchSpan(int group_id,
245                                         CodepointSpan* span) const {
246   int status;
247   const int match_start = matcher_.Start(group_id, &status);
248   if (status != UniLib::RegexMatcher::kNoError) {
249     return false;
250   }
251   const int match_end = matcher_.End(group_id, &status);
252   if (status != UniLib::RegexMatcher::kNoError) {
253     return false;
254   }
255   if (span->first == kInvalidIndex || span->first > match_start) {
256     span->first = match_start;
257   }
258   if (span->second == kInvalidIndex || span->second < match_end) {
259     span->second = match_end;
260   }
261 
262   return true;
263 }
264 
265 template <typename T>
MapInput(const UnicodeText & input,const std::vector<std::pair<DatetimeExtractorType,T>> & mapping,T * result) const266 bool DatetimeExtractor::MapInput(
267     const UnicodeText& input,
268     const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
269     T* result) const {
270   for (const auto& type_value_pair : mapping) {
271     if (ExtractType(input, type_value_pair.first)) {
272       *result = type_value_pair.second;
273       return true;
274     }
275   }
276   return false;
277 }
278 
ParseWrittenNumber(const UnicodeText & input,int * parsed_number) const279 bool DatetimeExtractor::ParseWrittenNumber(const UnicodeText& input,
280                                            int* parsed_number) const {
281   std::vector<std::pair<int, int>> found_numbers;
282   for (const auto& type_value_pair :
283        std::vector<std::pair<DatetimeExtractorType, int>>{
284            {DatetimeExtractorType_ZERO, 0},
285            {DatetimeExtractorType_ONE, 1},
286            {DatetimeExtractorType_TWO, 2},
287            {DatetimeExtractorType_THREE, 3},
288            {DatetimeExtractorType_FOUR, 4},
289            {DatetimeExtractorType_FIVE, 5},
290            {DatetimeExtractorType_SIX, 6},
291            {DatetimeExtractorType_SEVEN, 7},
292            {DatetimeExtractorType_EIGHT, 8},
293            {DatetimeExtractorType_NINE, 9},
294            {DatetimeExtractorType_TEN, 10},
295            {DatetimeExtractorType_ELEVEN, 11},
296            {DatetimeExtractorType_TWELVE, 12},
297            {DatetimeExtractorType_THIRTEEN, 13},
298            {DatetimeExtractorType_FOURTEEN, 14},
299            {DatetimeExtractorType_FIFTEEN, 15},
300            {DatetimeExtractorType_SIXTEEN, 16},
301            {DatetimeExtractorType_SEVENTEEN, 17},
302            {DatetimeExtractorType_EIGHTEEN, 18},
303            {DatetimeExtractorType_NINETEEN, 19},
304            {DatetimeExtractorType_TWENTY, 20},
305            {DatetimeExtractorType_THIRTY, 30},
306            {DatetimeExtractorType_FORTY, 40},
307            {DatetimeExtractorType_FIFTY, 50},
308            {DatetimeExtractorType_SIXTY, 60},
309            {DatetimeExtractorType_SEVENTY, 70},
310            {DatetimeExtractorType_EIGHTY, 80},
311            {DatetimeExtractorType_NINETY, 90},
312            {DatetimeExtractorType_HUNDRED, 100},
313            {DatetimeExtractorType_THOUSAND, 1000},
314        }) {
315     int rule_id;
316     if (!RuleIdForType(type_value_pair.first, &rule_id)) {
317       return false;
318     }
319 
320     std::unique_ptr<UniLib::RegexMatcher> matcher =
321         rules_[rule_id]->Matcher(input);
322     if (!matcher) {
323       return false;
324     }
325     int status;
326     while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {
327       int span_start = matcher->Start(&status);
328       if (status != UniLib::RegexMatcher::kNoError) {
329         return false;
330       }
331       found_numbers.push_back({span_start, type_value_pair.second});
332     }
333   }
334 
335   std::sort(found_numbers.begin(), found_numbers.end(),
336             [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
337               return a.first < b.first;
338             });
339 
340   int sum = 0;
341   int running_value = -1;
342   // Simple math to make sure we handle written numerical modifiers correctly
343   // so that :="fifty one  thousand and one" maps to 51001 and not 50 1 1000 1.
344   for (const std::pair<int, int>& position_number_pair : found_numbers) {
345     if (running_value >= 0) {
346       if (running_value > position_number_pair.second) {
347         sum += running_value;
348         running_value = position_number_pair.second;
349       } else {
350         running_value *= position_number_pair.second;
351       }
352     } else {
353       running_value = position_number_pair.second;
354     }
355   }
356   sum += running_value;
357   *parsed_number = sum;
358   return true;
359 }
360 
ParseDigits(const UnicodeText & input,int * parsed_digits) const361 bool DatetimeExtractor::ParseDigits(const UnicodeText& input,
362                                     int* parsed_digits) const {
363   UnicodeText digit;
364   if (!ExtractType(input, DatetimeExtractorType_DIGITS, &digit)) {
365     return false;
366   }
367 
368   if (!unilib_.ParseInt32(digit, parsed_digits)) {
369     return false;
370   }
371   return true;
372 }
373 
ParseYear(const UnicodeText & input,int * parsed_year) const374 bool DatetimeExtractor::ParseYear(const UnicodeText& input,
375                                   int* parsed_year) const {
376   if (!ParseDigits(input, parsed_year)) {
377     return false;
378   }
379 
380   // Logic to decide if XX will be 20XX or 19XX
381   if (*parsed_year < 100) {
382     if (*parsed_year < 50) {
383       *parsed_year += 2000;
384     } else {
385       *parsed_year += 1900;
386     }
387   }
388 
389   return true;
390 }
391 
ParseMonth(const UnicodeText & input,int * parsed_month) const392 bool DatetimeExtractor::ParseMonth(const UnicodeText& input,
393                                    int* parsed_month) const {
394   if (ParseDigits(input, parsed_month)) {
395     return true;
396   }
397 
398   if (MapInput(input,
399                {
400                    {DatetimeExtractorType_JANUARY, 1},
401                    {DatetimeExtractorType_FEBRUARY, 2},
402                    {DatetimeExtractorType_MARCH, 3},
403                    {DatetimeExtractorType_APRIL, 4},
404                    {DatetimeExtractorType_MAY, 5},
405                    {DatetimeExtractorType_JUNE, 6},
406                    {DatetimeExtractorType_JULY, 7},
407                    {DatetimeExtractorType_AUGUST, 8},
408                    {DatetimeExtractorType_SEPTEMBER, 9},
409                    {DatetimeExtractorType_OCTOBER, 10},
410                    {DatetimeExtractorType_NOVEMBER, 11},
411                    {DatetimeExtractorType_DECEMBER, 12},
412                },
413                parsed_month)) {
414     return true;
415   }
416 
417   return false;
418 }
419 
ParseMeridiem(const UnicodeText & input,int * parsed_meridiem) const420 bool DatetimeExtractor::ParseMeridiem(const UnicodeText& input,
421                                       int* parsed_meridiem) const {
422   return MapInput(input,
423                   {
424                       {DatetimeExtractorType_AM, 0 /* AM */},
425                       {DatetimeExtractorType_PM, 1 /* PM */},
426                   },
427                   parsed_meridiem);
428 }
429 
ParseRelationDistance(const UnicodeText & input,int * parsed_distance) const430 bool DatetimeExtractor::ParseRelationDistance(const UnicodeText& input,
431                                               int* parsed_distance) const {
432   if (ParseDigits(input, parsed_distance)) {
433     return true;
434   }
435   if (ParseWrittenNumber(input, parsed_distance)) {
436     return true;
437   }
438   return false;
439 }
440 
ParseRelativeValue(const UnicodeText & input,DatetimeComponent::RelativeQualifier * parsed_relative_value) const441 bool DatetimeExtractor::ParseRelativeValue(
442     const UnicodeText& input,
443     DatetimeComponent::RelativeQualifier* parsed_relative_value) const {
444   return MapInput(input,
445                   {
446                       {DatetimeExtractorType_NOW,
447                        DatetimeComponent::RelativeQualifier::NOW},
448                       {DatetimeExtractorType_YESTERDAY,
449                        DatetimeComponent::RelativeQualifier::YESTERDAY},
450                       {DatetimeExtractorType_TOMORROW,
451                        DatetimeComponent::RelativeQualifier::TOMORROW},
452                       {DatetimeExtractorType_NEXT,
453                        DatetimeComponent::RelativeQualifier::NEXT},
454                       {DatetimeExtractorType_NEXT_OR_SAME,
455                        DatetimeComponent::RelativeQualifier::THIS},
456                       {DatetimeExtractorType_LAST,
457                        DatetimeComponent::RelativeQualifier::LAST},
458                       {DatetimeExtractorType_PAST,
459                        DatetimeComponent::RelativeQualifier::PAST},
460                       {DatetimeExtractorType_FUTURE,
461                        DatetimeComponent::RelativeQualifier::FUTURE},
462                   },
463                   parsed_relative_value);
464 }
465 
ParseRelationAndConvertToRelativeCount(const UnicodeText & input,int * relative_count) const466 bool DatetimeExtractor::ParseRelationAndConvertToRelativeCount(
467     const UnicodeText& input, int* relative_count) const {
468   return MapInput(input,
469                   {
470                       {DatetimeExtractorType_NOW, 0},
471                       {DatetimeExtractorType_YESTERDAY, -1},
472                       {DatetimeExtractorType_TOMORROW, 1},
473                       {DatetimeExtractorType_NEXT, 1},
474                       {DatetimeExtractorType_NEXT_OR_SAME, 1},
475                       {DatetimeExtractorType_LAST, -1},
476                       {DatetimeExtractorType_PAST, -1},
477                   },
478                   relative_count);
479 }
480 
ParseDayOfWeek(const UnicodeText & input,int * parsed_day_of_week) const481 bool DatetimeExtractor::ParseDayOfWeek(const UnicodeText& input,
482                                        int* parsed_day_of_week) const {
483   return MapInput(input,
484                   {
485                       {DatetimeExtractorType_SUNDAY, kSunday},
486                       {DatetimeExtractorType_MONDAY, kMonday},
487                       {DatetimeExtractorType_TUESDAY, kTuesday},
488                       {DatetimeExtractorType_WEDNESDAY, kWednesday},
489                       {DatetimeExtractorType_THURSDAY, kThursday},
490                       {DatetimeExtractorType_FRIDAY, kFriday},
491                       {DatetimeExtractorType_SATURDAY, kSaturday},
492                   },
493                   parsed_day_of_week);
494 }
495 
ParseFieldType(const UnicodeText & input,DatetimeComponent::ComponentType * parsed_field_type) const496 bool DatetimeExtractor::ParseFieldType(
497     const UnicodeText& input,
498     DatetimeComponent::ComponentType* parsed_field_type) const {
499   return MapInput(
500       input,
501       {
502           {DatetimeExtractorType_MONDAY,
503            DatetimeComponent::ComponentType::DAY_OF_WEEK},
504           {DatetimeExtractorType_TUESDAY,
505            DatetimeComponent::ComponentType::DAY_OF_WEEK},
506           {DatetimeExtractorType_WEDNESDAY,
507            DatetimeComponent::ComponentType::DAY_OF_WEEK},
508           {DatetimeExtractorType_THURSDAY,
509            DatetimeComponent::ComponentType::DAY_OF_WEEK},
510           {DatetimeExtractorType_FRIDAY,
511            DatetimeComponent::ComponentType::DAY_OF_WEEK},
512           {DatetimeExtractorType_SATURDAY,
513            DatetimeComponent::ComponentType::DAY_OF_WEEK},
514           {DatetimeExtractorType_SUNDAY,
515            DatetimeComponent::ComponentType::DAY_OF_WEEK},
516           {DatetimeExtractorType_SECONDS,
517            DatetimeComponent::ComponentType::SECOND},
518           {DatetimeExtractorType_MINUTES,
519            DatetimeComponent::ComponentType::MINUTE},
520           {DatetimeExtractorType_NOW,
521            DatetimeComponent::ComponentType::DAY_OF_MONTH},
522           {DatetimeExtractorType_HOURS, DatetimeComponent::ComponentType::HOUR},
523           {DatetimeExtractorType_DAY,
524            DatetimeComponent::ComponentType::DAY_OF_MONTH},
525           {DatetimeExtractorType_TOMORROW,
526            DatetimeComponent::ComponentType::DAY_OF_MONTH},
527           {DatetimeExtractorType_YESTERDAY,
528            DatetimeComponent::ComponentType::DAY_OF_MONTH},
529           {DatetimeExtractorType_WEEK, DatetimeComponent::ComponentType::WEEK},
530           {DatetimeExtractorType_MONTH,
531            DatetimeComponent::ComponentType::MONTH},
532           {DatetimeExtractorType_YEAR, DatetimeComponent::ComponentType::YEAR},
533       },
534       parsed_field_type);
535 }
536 
537 }  // namespace libtextclassifier3
538