1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "annotator/datetime/extractor.h"
18
19 #include <algorithm>
20
21 #include "annotator/datetime/utils.h"
22 #include "annotator/model_generated.h"
23 #include "annotator/types.h"
24 #include "utils/base/logging.h"
25
26 namespace libtextclassifier3 {
27
Extract(DatetimeParsedData * result,CodepointSpan * result_span) const28 bool DatetimeExtractor::Extract(DatetimeParsedData* result,
29 CodepointSpan* result_span) const {
30 *result_span = {kInvalidIndex, kInvalidIndex};
31
32 if (rule_.regex->groups() == nullptr) {
33 return false;
34 }
35
36 // In the current implementation of extractor, the assumption is that there
37 // can only be one relative field.
38 DatetimeComponent::ComponentType component_type;
39 DatetimeComponent::RelativeQualifier relative_qualifier =
40 DatetimeComponent::RelativeQualifier::UNSPECIFIED;
41 int relative_count = 0;
42
43 for (int group_id = 0; group_id < rule_.regex->groups()->size(); group_id++) {
44 UnicodeText group_text;
45 const int group_type = rule_.regex->groups()->Get(group_id);
46 if (group_type == DatetimeGroupType_GROUP_UNUSED) {
47 continue;
48 }
49 if (!GroupTextFromMatch(group_id, &group_text)) {
50 TC3_LOG(ERROR) << "Couldn't retrieve group.";
51 return false;
52 }
53 // The pattern can have a group defined in a part that was not matched,
54 // e.g. an optional part. In this case we'll get an empty content here.
55 if (group_text.empty()) {
56 continue;
57 }
58
59 switch (group_type) {
60 case DatetimeGroupType_GROUP_YEAR: {
61 int year;
62 if (!ParseYear(group_text, &(year))) {
63 TC3_LOG(ERROR) << "Couldn't extract YEAR.";
64 return false;
65 }
66 result->SetAbsoluteValue(DatetimeComponent::ComponentType::YEAR, year);
67 break;
68 }
69 case DatetimeGroupType_GROUP_MONTH: {
70 int month;
71 if (!ParseMonth(group_text, &(month))) {
72 TC3_LOG(ERROR) << "Couldn't extract MONTH.";
73 return false;
74 }
75 result->SetAbsoluteValue(DatetimeComponent::ComponentType::MONTH,
76 month);
77 break;
78 }
79 case DatetimeGroupType_GROUP_DAY: {
80 int day_of_month;
81 if (!ParseDigits(group_text, &(day_of_month))) {
82 TC3_LOG(ERROR) << "Couldn't extract DAY.";
83 return false;
84 }
85 result->SetAbsoluteValue(DatetimeComponent::ComponentType::DAY_OF_MONTH,
86 day_of_month);
87 break;
88 }
89 case DatetimeGroupType_GROUP_HOUR: {
90 int hour;
91 if (!ParseDigits(group_text, &(hour))) {
92 TC3_LOG(ERROR) << "Couldn't extract HOUR.";
93 return false;
94 }
95 result->SetAbsoluteValue(DatetimeComponent::ComponentType::HOUR, hour);
96 break;
97 }
98 case DatetimeGroupType_GROUP_MINUTE: {
99 int minute;
100 if (!ParseDigits(group_text, &(minute)) &&
101 !ParseWrittenNumber(group_text, &(minute))) {
102 TC3_LOG(ERROR) << "Couldn't extract MINUTE.";
103 return false;
104 }
105 result->SetAbsoluteValue(DatetimeComponent::ComponentType::MINUTE,
106 minute);
107 break;
108 }
109 case DatetimeGroupType_GROUP_SECOND: {
110 int second;
111 if (!ParseDigits(group_text, &(second))) {
112 TC3_LOG(ERROR) << "Couldn't extract SECOND.";
113 return false;
114 }
115 result->SetAbsoluteValue(DatetimeComponent::ComponentType::SECOND,
116 second);
117 break;
118 }
119 case DatetimeGroupType_GROUP_AMPM: {
120 int meridiem;
121 if (!ParseMeridiem(group_text, &(meridiem))) {
122 TC3_LOG(ERROR) << "Couldn't extract AMPM.";
123 return false;
124 }
125 result->SetAbsoluteValue(DatetimeComponent::ComponentType::MERIDIEM,
126 meridiem);
127 break;
128 }
129 case DatetimeGroupType_GROUP_RELATIONDISTANCE: {
130 relative_count = 0;
131 if (!ParseRelationDistance(group_text, &(relative_count))) {
132 TC3_LOG(ERROR) << "Couldn't extract RELATION_DISTANCE_FIELD.";
133 return false;
134 }
135 break;
136 }
137 case DatetimeGroupType_GROUP_RELATION: {
138 if (!ParseRelativeValue(group_text, &relative_qualifier)) {
139 TC3_LOG(ERROR) << "Couldn't extract RELATION_FIELD.";
140 return false;
141 }
142 ParseRelationAndConvertToRelativeCount(group_text, &relative_count);
143 if (relative_qualifier ==
144 DatetimeComponent::RelativeQualifier::TOMORROW ||
145 relative_qualifier == DatetimeComponent::RelativeQualifier::NOW ||
146 relative_qualifier ==
147 DatetimeComponent::RelativeQualifier::YESTERDAY) {
148 if (!ParseFieldType(group_text, &component_type)) {
149 TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
150 return false;
151 }
152 }
153 break;
154 }
155 case DatetimeGroupType_GROUP_RELATIONTYPE: {
156 if (!ParseFieldType(group_text, &component_type)) {
157 TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
158 return false;
159 }
160 if (component_type == DatetimeComponent::ComponentType::DAY_OF_WEEK) {
161 int day_of_week;
162 if (!ParseDayOfWeek(group_text, &day_of_week)) {
163 TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
164 return false;
165 }
166 result->SetAbsoluteValue(component_type, day_of_week);
167 }
168 break;
169 }
170 case DatetimeGroupType_GROUP_ABSOLUTETIME: {
171 std::unordered_map<DatetimeComponent::ComponentType, int> values;
172 if (!ParseAbsoluteDateValues(group_text, &values)) {
173 TC3_LOG(ERROR) << "Couldn't extract Component values.";
174 return false;
175 }
176 for (const std::pair<const DatetimeComponent::ComponentType, int>&
177 date_time_pair : values) {
178 result->SetAbsoluteValue(date_time_pair.first, date_time_pair.second);
179 }
180 break;
181 }
182 case DatetimeGroupType_GROUP_DUMMY1:
183 case DatetimeGroupType_GROUP_DUMMY2:
184 break;
185 default:
186 TC3_LOG(INFO) << "Unknown group type.";
187 continue;
188 }
189 if (!UpdateMatchSpan(group_id, result_span)) {
190 TC3_LOG(ERROR) << "Couldn't update span.";
191 return false;
192 }
193 }
194
195 if (relative_qualifier != DatetimeComponent::RelativeQualifier::UNSPECIFIED) {
196 result->SetRelativeValue(component_type, relative_qualifier);
197 result->SetRelativeCount(component_type, relative_count);
198 }
199
200 if (result_span->first == kInvalidIndex ||
201 result_span->second == kInvalidIndex) {
202 *result_span = {kInvalidIndex, kInvalidIndex};
203 }
204
205 return true;
206 }
207
RuleIdForType(DatetimeExtractorType type,int * rule_id) const208 bool DatetimeExtractor::RuleIdForType(DatetimeExtractorType type,
209 int* rule_id) const {
210 auto type_it = type_and_locale_to_rule_.find(type);
211 if (type_it == type_and_locale_to_rule_.end()) {
212 return false;
213 }
214
215 auto locale_it = type_it->second.find(locale_id_);
216 if (locale_it == type_it->second.end()) {
217 return false;
218 }
219 *rule_id = locale_it->second;
220 return true;
221 }
222
ExtractType(const UnicodeText & input,DatetimeExtractorType extractor_type,UnicodeText * match_result) const223 bool DatetimeExtractor::ExtractType(const UnicodeText& input,
224 DatetimeExtractorType extractor_type,
225 UnicodeText* match_result) const {
226 int rule_id;
227 if (!RuleIdForType(extractor_type, &rule_id)) {
228 return false;
229 }
230
231 std::unique_ptr<UniLib::RegexMatcher> matcher =
232 rules_[rule_id]->Matcher(input);
233 if (!matcher) {
234 return false;
235 }
236
237 int status;
238 if (!matcher->Find(&status)) {
239 return false;
240 }
241
242 if (match_result != nullptr) {
243 *match_result = matcher->Group(&status);
244 if (status != UniLib::RegexMatcher::kNoError) {
245 return false;
246 }
247 }
248 return true;
249 }
250
GroupTextFromMatch(int group_id,UnicodeText * result) const251 bool DatetimeExtractor::GroupTextFromMatch(int group_id,
252 UnicodeText* result) const {
253 int status;
254 *result = matcher_.Group(group_id, &status);
255 if (status != UniLib::RegexMatcher::kNoError) {
256 return false;
257 }
258 return true;
259 }
260
UpdateMatchSpan(int group_id,CodepointSpan * span) const261 bool DatetimeExtractor::UpdateMatchSpan(int group_id,
262 CodepointSpan* span) const {
263 int status;
264 const int match_start = matcher_.Start(group_id, &status);
265 if (status != UniLib::RegexMatcher::kNoError) {
266 return false;
267 }
268 const int match_end = matcher_.End(group_id, &status);
269 if (status != UniLib::RegexMatcher::kNoError) {
270 return false;
271 }
272 if (span->first == kInvalidIndex || span->first > match_start) {
273 span->first = match_start;
274 }
275 if (span->second == kInvalidIndex || span->second < match_end) {
276 span->second = match_end;
277 }
278
279 return true;
280 }
281
282 template <typename T>
MapInput(const UnicodeText & input,const std::vector<std::pair<DatetimeExtractorType,T>> & mapping,T * result) const283 bool DatetimeExtractor::MapInput(
284 const UnicodeText& input,
285 const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
286 T* result) const {
287 for (const auto& type_value_pair : mapping) {
288 if (ExtractType(input, type_value_pair.first)) {
289 *result = type_value_pair.second;
290 return true;
291 }
292 }
293 return false;
294 }
295
ParseWrittenNumber(const UnicodeText & input,int * parsed_number) const296 bool DatetimeExtractor::ParseWrittenNumber(const UnicodeText& input,
297 int* parsed_number) const {
298 std::vector<std::pair<int, int>> found_numbers;
299 for (const auto& type_value_pair :
300 std::vector<std::pair<DatetimeExtractorType, int>>{
301 {DatetimeExtractorType_ZERO, 0},
302 {DatetimeExtractorType_ONE, 1},
303 {DatetimeExtractorType_TWO, 2},
304 {DatetimeExtractorType_THREE, 3},
305 {DatetimeExtractorType_FOUR, 4},
306 {DatetimeExtractorType_FIVE, 5},
307 {DatetimeExtractorType_SIX, 6},
308 {DatetimeExtractorType_SEVEN, 7},
309 {DatetimeExtractorType_EIGHT, 8},
310 {DatetimeExtractorType_NINE, 9},
311 {DatetimeExtractorType_TEN, 10},
312 {DatetimeExtractorType_ELEVEN, 11},
313 {DatetimeExtractorType_TWELVE, 12},
314 {DatetimeExtractorType_THIRTEEN, 13},
315 {DatetimeExtractorType_FOURTEEN, 14},
316 {DatetimeExtractorType_FIFTEEN, 15},
317 {DatetimeExtractorType_SIXTEEN, 16},
318 {DatetimeExtractorType_SEVENTEEN, 17},
319 {DatetimeExtractorType_EIGHTEEN, 18},
320 {DatetimeExtractorType_NINETEEN, 19},
321 {DatetimeExtractorType_TWENTY, 20},
322 {DatetimeExtractorType_THIRTY, 30},
323 {DatetimeExtractorType_FORTY, 40},
324 {DatetimeExtractorType_FIFTY, 50},
325 {DatetimeExtractorType_SIXTY, 60},
326 {DatetimeExtractorType_SEVENTY, 70},
327 {DatetimeExtractorType_EIGHTY, 80},
328 {DatetimeExtractorType_NINETY, 90},
329 {DatetimeExtractorType_HUNDRED, 100},
330 {DatetimeExtractorType_THOUSAND, 1000},
331 }) {
332 int rule_id;
333 if (!RuleIdForType(type_value_pair.first, &rule_id)) {
334 return false;
335 }
336
337 std::unique_ptr<UniLib::RegexMatcher> matcher =
338 rules_[rule_id]->Matcher(input);
339 if (!matcher) {
340 return false;
341 }
342 int status;
343 while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {
344 int span_start = matcher->Start(&status);
345 if (status != UniLib::RegexMatcher::kNoError) {
346 return false;
347 }
348 found_numbers.push_back({span_start, type_value_pair.second});
349 }
350 }
351
352 std::stable_sort(
353 found_numbers.begin(), found_numbers.end(),
354 [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
355 return a.first < b.first;
356 });
357
358 int sum = 0;
359 int running_value = -1;
360 // Simple math to make sure we handle written numerical modifiers correctly
361 // so that :="fifty one thousand and one" maps to 51001 and not 50 1 1000 1.
362 for (const std::pair<int, int>& position_number_pair : found_numbers) {
363 if (running_value >= 0) {
364 if (running_value > position_number_pair.second) {
365 sum += running_value;
366 running_value = position_number_pair.second;
367 } else {
368 running_value *= position_number_pair.second;
369 }
370 } else {
371 running_value = position_number_pair.second;
372 }
373 }
374 sum += running_value;
375 *parsed_number = sum;
376 return true;
377 }
378
ParseDigits(const UnicodeText & input,int * parsed_digits) const379 bool DatetimeExtractor::ParseDigits(const UnicodeText& input,
380 int* parsed_digits) const {
381 UnicodeText digit;
382 if (!ExtractType(input, DatetimeExtractorType_DIGITS, &digit)) {
383 return false;
384 }
385
386 if (!unilib_.ParseInt32(digit, parsed_digits)) {
387 return false;
388 }
389 return true;
390 }
391
ParseYear(const UnicodeText & input,int * parsed_year) const392 bool DatetimeExtractor::ParseYear(const UnicodeText& input,
393 int* parsed_year) const {
394 if (!ParseDigits(input, parsed_year)) {
395 return false;
396 }
397 *parsed_year = GetAdjustedYear(*parsed_year);
398
399 return true;
400 }
401
ParseMonth(const UnicodeText & input,int * parsed_month) const402 bool DatetimeExtractor::ParseMonth(const UnicodeText& input,
403 int* parsed_month) const {
404 if (ParseDigits(input, parsed_month)) {
405 return true;
406 }
407
408 if (MapInput(input,
409 {
410 {DatetimeExtractorType_JANUARY, 1},
411 {DatetimeExtractorType_FEBRUARY, 2},
412 {DatetimeExtractorType_MARCH, 3},
413 {DatetimeExtractorType_APRIL, 4},
414 {DatetimeExtractorType_MAY, 5},
415 {DatetimeExtractorType_JUNE, 6},
416 {DatetimeExtractorType_JULY, 7},
417 {DatetimeExtractorType_AUGUST, 8},
418 {DatetimeExtractorType_SEPTEMBER, 9},
419 {DatetimeExtractorType_OCTOBER, 10},
420 {DatetimeExtractorType_NOVEMBER, 11},
421 {DatetimeExtractorType_DECEMBER, 12},
422 },
423 parsed_month)) {
424 return true;
425 }
426
427 return false;
428 }
429
ParseAbsoluteDateValues(const UnicodeText & input,std::unordered_map<DatetimeComponent::ComponentType,int> * values) const430 bool DatetimeExtractor::ParseAbsoluteDateValues(
431 const UnicodeText& input,
432 std::unordered_map<DatetimeComponent::ComponentType, int>* values) const {
433 if (MapInput(input,
434 {
435 {DatetimeExtractorType_NOON,
436 {{DatetimeComponent::ComponentType::MERIDIEM, 1},
437 {DatetimeComponent::ComponentType::MINUTE, 0},
438 {DatetimeComponent::ComponentType::HOUR, 12}}},
439 {DatetimeExtractorType_MIDNIGHT,
440 {{DatetimeComponent::ComponentType::MERIDIEM, 0},
441 {DatetimeComponent::ComponentType::MINUTE, 0},
442 {DatetimeComponent::ComponentType::HOUR, 0}}},
443 },
444 values)) {
445 return true;
446 }
447 return false;
448 }
449
ParseMeridiem(const UnicodeText & input,int * parsed_meridiem) const450 bool DatetimeExtractor::ParseMeridiem(const UnicodeText& input,
451 int* parsed_meridiem) const {
452 return MapInput(input,
453 {
454 {DatetimeExtractorType_AM, 0 /* AM */},
455 {DatetimeExtractorType_PM, 1 /* PM */},
456 },
457 parsed_meridiem);
458 }
459
ParseRelationDistance(const UnicodeText & input,int * parsed_distance) const460 bool DatetimeExtractor::ParseRelationDistance(const UnicodeText& input,
461 int* parsed_distance) const {
462 if (ParseDigits(input, parsed_distance)) {
463 return true;
464 }
465 if (ParseWrittenNumber(input, parsed_distance)) {
466 return true;
467 }
468 return false;
469 }
470
ParseRelativeValue(const UnicodeText & input,DatetimeComponent::RelativeQualifier * parsed_relative_value) const471 bool DatetimeExtractor::ParseRelativeValue(
472 const UnicodeText& input,
473 DatetimeComponent::RelativeQualifier* parsed_relative_value) const {
474 return MapInput(input,
475 {
476 {DatetimeExtractorType_NOW,
477 DatetimeComponent::RelativeQualifier::NOW},
478 {DatetimeExtractorType_YESTERDAY,
479 DatetimeComponent::RelativeQualifier::YESTERDAY},
480 {DatetimeExtractorType_TOMORROW,
481 DatetimeComponent::RelativeQualifier::TOMORROW},
482 {DatetimeExtractorType_NEXT,
483 DatetimeComponent::RelativeQualifier::NEXT},
484 {DatetimeExtractorType_NEXT_OR_SAME,
485 DatetimeComponent::RelativeQualifier::THIS},
486 {DatetimeExtractorType_LAST,
487 DatetimeComponent::RelativeQualifier::LAST},
488 {DatetimeExtractorType_PAST,
489 DatetimeComponent::RelativeQualifier::PAST},
490 {DatetimeExtractorType_FUTURE,
491 DatetimeComponent::RelativeQualifier::FUTURE},
492 },
493 parsed_relative_value);
494 }
495
ParseRelationAndConvertToRelativeCount(const UnicodeText & input,int * relative_count) const496 bool DatetimeExtractor::ParseRelationAndConvertToRelativeCount(
497 const UnicodeText& input, int* relative_count) const {
498 return MapInput(input,
499 {
500 {DatetimeExtractorType_NOW, 0},
501 {DatetimeExtractorType_YESTERDAY, -1},
502 {DatetimeExtractorType_TOMORROW, 1},
503 {DatetimeExtractorType_NEXT, 1},
504 {DatetimeExtractorType_NEXT_OR_SAME, 1},
505 {DatetimeExtractorType_LAST, -1},
506 {DatetimeExtractorType_PAST, -1},
507 },
508 relative_count);
509 }
510
ParseDayOfWeek(const UnicodeText & input,int * parsed_day_of_week) const511 bool DatetimeExtractor::ParseDayOfWeek(const UnicodeText& input,
512 int* parsed_day_of_week) const {
513 return MapInput(input,
514 {
515 {DatetimeExtractorType_SUNDAY, kSunday},
516 {DatetimeExtractorType_MONDAY, kMonday},
517 {DatetimeExtractorType_TUESDAY, kTuesday},
518 {DatetimeExtractorType_WEDNESDAY, kWednesday},
519 {DatetimeExtractorType_THURSDAY, kThursday},
520 {DatetimeExtractorType_FRIDAY, kFriday},
521 {DatetimeExtractorType_SATURDAY, kSaturday},
522 },
523 parsed_day_of_week);
524 }
525
ParseFieldType(const UnicodeText & input,DatetimeComponent::ComponentType * parsed_field_type) const526 bool DatetimeExtractor::ParseFieldType(
527 const UnicodeText& input,
528 DatetimeComponent::ComponentType* parsed_field_type) const {
529 return MapInput(
530 input,
531 {
532 {DatetimeExtractorType_MONDAY,
533 DatetimeComponent::ComponentType::DAY_OF_WEEK},
534 {DatetimeExtractorType_TUESDAY,
535 DatetimeComponent::ComponentType::DAY_OF_WEEK},
536 {DatetimeExtractorType_WEDNESDAY,
537 DatetimeComponent::ComponentType::DAY_OF_WEEK},
538 {DatetimeExtractorType_THURSDAY,
539 DatetimeComponent::ComponentType::DAY_OF_WEEK},
540 {DatetimeExtractorType_FRIDAY,
541 DatetimeComponent::ComponentType::DAY_OF_WEEK},
542 {DatetimeExtractorType_SATURDAY,
543 DatetimeComponent::ComponentType::DAY_OF_WEEK},
544 {DatetimeExtractorType_SUNDAY,
545 DatetimeComponent::ComponentType::DAY_OF_WEEK},
546 {DatetimeExtractorType_SECONDS,
547 DatetimeComponent::ComponentType::SECOND},
548 {DatetimeExtractorType_MINUTES,
549 DatetimeComponent::ComponentType::MINUTE},
550 {DatetimeExtractorType_NOW,
551 DatetimeComponent::ComponentType::DAY_OF_MONTH},
552 {DatetimeExtractorType_HOURS, DatetimeComponent::ComponentType::HOUR},
553 {DatetimeExtractorType_DAY,
554 DatetimeComponent::ComponentType::DAY_OF_MONTH},
555 {DatetimeExtractorType_TOMORROW,
556 DatetimeComponent::ComponentType::DAY_OF_MONTH},
557 {DatetimeExtractorType_YESTERDAY,
558 DatetimeComponent::ComponentType::DAY_OF_MONTH},
559 {DatetimeExtractorType_WEEK, DatetimeComponent::ComponentType::WEEK},
560 {DatetimeExtractorType_MONTH,
561 DatetimeComponent::ComponentType::MONTH},
562 {DatetimeExtractorType_YEAR, DatetimeComponent::ComponentType::YEAR},
563 },
564 parsed_field_type);
565 }
566
567 } // namespace libtextclassifier3
568