1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "annotator/datetime/extractor.h"
18
19 #include "annotator/datetime/utils.h"
20 #include "annotator/model_generated.h"
21 #include "annotator/types.h"
22 #include "utils/base/logging.h"
23
24 namespace libtextclassifier3 {
25
Extract(DatetimeParsedData * result,CodepointSpan * result_span) const26 bool DatetimeExtractor::Extract(DatetimeParsedData* result,
27 CodepointSpan* result_span) const {
28 *result_span = {kInvalidIndex, kInvalidIndex};
29
30 if (rule_.regex->groups() == nullptr) {
31 return false;
32 }
33
34 // In the current implementation of extractor, the assumption is that there
35 // can only be one relative field.
36 DatetimeComponent::ComponentType component_type;
37 DatetimeComponent::RelativeQualifier relative_qualifier =
38 DatetimeComponent::RelativeQualifier::UNSPECIFIED;
39 int relative_count = 0;
40
41 for (int group_id = 0; group_id < rule_.regex->groups()->size(); group_id++) {
42 UnicodeText group_text;
43 const int group_type = rule_.regex->groups()->Get(group_id);
44 if (group_type == DatetimeGroupType_GROUP_UNUSED) {
45 continue;
46 }
47 if (!GroupTextFromMatch(group_id, &group_text)) {
48 TC3_LOG(ERROR) << "Couldn't retrieve group.";
49 return false;
50 }
51 // The pattern can have a group defined in a part that was not matched,
52 // e.g. an optional part. In this case we'll get an empty content here.
53 if (group_text.empty()) {
54 continue;
55 }
56
57 switch (group_type) {
58 case DatetimeGroupType_GROUP_YEAR: {
59 int year;
60 if (!ParseYear(group_text, &(year))) {
61 TC3_LOG(ERROR) << "Couldn't extract YEAR.";
62 return false;
63 }
64 result->SetAbsoluteValue(DatetimeComponent::ComponentType::YEAR, year);
65 break;
66 }
67 case DatetimeGroupType_GROUP_MONTH: {
68 int month;
69 if (!ParseMonth(group_text, &(month))) {
70 TC3_LOG(ERROR) << "Couldn't extract MONTH.";
71 return false;
72 }
73 result->SetAbsoluteValue(DatetimeComponent::ComponentType::MONTH,
74 month);
75 break;
76 }
77 case DatetimeGroupType_GROUP_DAY: {
78 int day_of_month;
79 if (!ParseDigits(group_text, &(day_of_month))) {
80 TC3_LOG(ERROR) << "Couldn't extract DAY.";
81 return false;
82 }
83 result->SetAbsoluteValue(DatetimeComponent::ComponentType::DAY_OF_MONTH,
84 day_of_month);
85 break;
86 }
87 case DatetimeGroupType_GROUP_HOUR: {
88 int hour;
89 if (!ParseDigits(group_text, &(hour))) {
90 TC3_LOG(ERROR) << "Couldn't extract HOUR.";
91 return false;
92 }
93 result->SetAbsoluteValue(DatetimeComponent::ComponentType::HOUR, hour);
94 break;
95 }
96 case DatetimeGroupType_GROUP_MINUTE: {
97 int minute;
98 if (!ParseDigits(group_text, &(minute)) &&
99 !ParseWrittenNumber(group_text, &(minute))) {
100 TC3_LOG(ERROR) << "Couldn't extract MINUTE.";
101 return false;
102 }
103 result->SetAbsoluteValue(DatetimeComponent::ComponentType::MINUTE,
104 minute);
105 break;
106 }
107 case DatetimeGroupType_GROUP_SECOND: {
108 int second;
109 if (!ParseDigits(group_text, &(second))) {
110 TC3_LOG(ERROR) << "Couldn't extract SECOND.";
111 return false;
112 }
113 result->SetAbsoluteValue(DatetimeComponent::ComponentType::SECOND,
114 second);
115 break;
116 }
117 case DatetimeGroupType_GROUP_AMPM: {
118 int meridiem;
119 if (!ParseMeridiem(group_text, &(meridiem))) {
120 TC3_LOG(ERROR) << "Couldn't extract AMPM.";
121 return false;
122 }
123 result->SetAbsoluteValue(DatetimeComponent::ComponentType::MERIDIEM,
124 meridiem);
125 break;
126 }
127 case DatetimeGroupType_GROUP_RELATIONDISTANCE: {
128 relative_count = 0;
129 if (!ParseRelationDistance(group_text, &(relative_count))) {
130 TC3_LOG(ERROR) << "Couldn't extract RELATION_DISTANCE_FIELD.";
131 return false;
132 }
133 break;
134 }
135 case DatetimeGroupType_GROUP_RELATION: {
136 if (!ParseRelativeValue(group_text, &relative_qualifier)) {
137 TC3_LOG(ERROR) << "Couldn't extract RELATION_FIELD.";
138 return false;
139 }
140 ParseRelationAndConvertToRelativeCount(group_text, &relative_count);
141 if (relative_qualifier ==
142 DatetimeComponent::RelativeQualifier::TOMORROW ||
143 relative_qualifier == DatetimeComponent::RelativeQualifier::NOW ||
144 relative_qualifier ==
145 DatetimeComponent::RelativeQualifier::YESTERDAY) {
146 if (!ParseFieldType(group_text, &component_type)) {
147 TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
148 return false;
149 }
150 }
151 break;
152 }
153 case DatetimeGroupType_GROUP_RELATIONTYPE: {
154 if (!ParseFieldType(group_text, &component_type)) {
155 TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
156 return false;
157 }
158 if (component_type == DatetimeComponent::ComponentType::DAY_OF_WEEK) {
159 int day_of_week;
160 if (!ParseDayOfWeek(group_text, &day_of_week)) {
161 TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
162 return false;
163 }
164 result->SetAbsoluteValue(component_type, day_of_week);
165 }
166 break;
167 }
168 case DatetimeGroupType_GROUP_ABSOLUTETIME: {
169 std::unordered_map<DatetimeComponent::ComponentType, int> values;
170 if (!ParseAbsoluteDateValues(group_text, &values)) {
171 TC3_LOG(ERROR) << "Couldn't extract Component values.";
172 return false;
173 }
174 for (const std::pair<const DatetimeComponent::ComponentType, int>&
175 date_time_pair : values) {
176 result->SetAbsoluteValue(date_time_pair.first, date_time_pair.second);
177 }
178 break;
179 }
180 case DatetimeGroupType_GROUP_DUMMY1:
181 case DatetimeGroupType_GROUP_DUMMY2:
182 break;
183 default:
184 TC3_LOG(INFO) << "Unknown group type.";
185 continue;
186 }
187 if (!UpdateMatchSpan(group_id, result_span)) {
188 TC3_LOG(ERROR) << "Couldn't update span.";
189 return false;
190 }
191 }
192
193 if (relative_qualifier != DatetimeComponent::RelativeQualifier::UNSPECIFIED) {
194 result->SetRelativeValue(component_type, relative_qualifier);
195 result->SetRelativeCount(component_type, relative_count);
196 }
197
198 if (result_span->first == kInvalidIndex ||
199 result_span->second == kInvalidIndex) {
200 *result_span = {kInvalidIndex, kInvalidIndex};
201 }
202
203 return true;
204 }
205
RuleIdForType(DatetimeExtractorType type,int * rule_id) const206 bool DatetimeExtractor::RuleIdForType(DatetimeExtractorType type,
207 int* rule_id) const {
208 auto type_it = type_and_locale_to_rule_.find(type);
209 if (type_it == type_and_locale_to_rule_.end()) {
210 return false;
211 }
212
213 auto locale_it = type_it->second.find(locale_id_);
214 if (locale_it == type_it->second.end()) {
215 return false;
216 }
217 *rule_id = locale_it->second;
218 return true;
219 }
220
ExtractType(const UnicodeText & input,DatetimeExtractorType extractor_type,UnicodeText * match_result) const221 bool DatetimeExtractor::ExtractType(const UnicodeText& input,
222 DatetimeExtractorType extractor_type,
223 UnicodeText* match_result) const {
224 int rule_id;
225 if (!RuleIdForType(extractor_type, &rule_id)) {
226 return false;
227 }
228
229 std::unique_ptr<UniLib::RegexMatcher> matcher =
230 rules_[rule_id]->Matcher(input);
231 if (!matcher) {
232 return false;
233 }
234
235 int status;
236 if (!matcher->Find(&status)) {
237 return false;
238 }
239
240 if (match_result != nullptr) {
241 *match_result = matcher->Group(&status);
242 if (status != UniLib::RegexMatcher::kNoError) {
243 return false;
244 }
245 }
246 return true;
247 }
248
GroupTextFromMatch(int group_id,UnicodeText * result) const249 bool DatetimeExtractor::GroupTextFromMatch(int group_id,
250 UnicodeText* result) const {
251 int status;
252 *result = matcher_.Group(group_id, &status);
253 if (status != UniLib::RegexMatcher::kNoError) {
254 return false;
255 }
256 return true;
257 }
258
UpdateMatchSpan(int group_id,CodepointSpan * span) const259 bool DatetimeExtractor::UpdateMatchSpan(int group_id,
260 CodepointSpan* span) const {
261 int status;
262 const int match_start = matcher_.Start(group_id, &status);
263 if (status != UniLib::RegexMatcher::kNoError) {
264 return false;
265 }
266 const int match_end = matcher_.End(group_id, &status);
267 if (status != UniLib::RegexMatcher::kNoError) {
268 return false;
269 }
270 if (span->first == kInvalidIndex || span->first > match_start) {
271 span->first = match_start;
272 }
273 if (span->second == kInvalidIndex || span->second < match_end) {
274 span->second = match_end;
275 }
276
277 return true;
278 }
279
280 template <typename T>
MapInput(const UnicodeText & input,const std::vector<std::pair<DatetimeExtractorType,T>> & mapping,T * result) const281 bool DatetimeExtractor::MapInput(
282 const UnicodeText& input,
283 const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
284 T* result) const {
285 for (const auto& type_value_pair : mapping) {
286 if (ExtractType(input, type_value_pair.first)) {
287 *result = type_value_pair.second;
288 return true;
289 }
290 }
291 return false;
292 }
293
ParseWrittenNumber(const UnicodeText & input,int * parsed_number) const294 bool DatetimeExtractor::ParseWrittenNumber(const UnicodeText& input,
295 int* parsed_number) const {
296 std::vector<std::pair<int, int>> found_numbers;
297 for (const auto& type_value_pair :
298 std::vector<std::pair<DatetimeExtractorType, int>>{
299 {DatetimeExtractorType_ZERO, 0},
300 {DatetimeExtractorType_ONE, 1},
301 {DatetimeExtractorType_TWO, 2},
302 {DatetimeExtractorType_THREE, 3},
303 {DatetimeExtractorType_FOUR, 4},
304 {DatetimeExtractorType_FIVE, 5},
305 {DatetimeExtractorType_SIX, 6},
306 {DatetimeExtractorType_SEVEN, 7},
307 {DatetimeExtractorType_EIGHT, 8},
308 {DatetimeExtractorType_NINE, 9},
309 {DatetimeExtractorType_TEN, 10},
310 {DatetimeExtractorType_ELEVEN, 11},
311 {DatetimeExtractorType_TWELVE, 12},
312 {DatetimeExtractorType_THIRTEEN, 13},
313 {DatetimeExtractorType_FOURTEEN, 14},
314 {DatetimeExtractorType_FIFTEEN, 15},
315 {DatetimeExtractorType_SIXTEEN, 16},
316 {DatetimeExtractorType_SEVENTEEN, 17},
317 {DatetimeExtractorType_EIGHTEEN, 18},
318 {DatetimeExtractorType_NINETEEN, 19},
319 {DatetimeExtractorType_TWENTY, 20},
320 {DatetimeExtractorType_THIRTY, 30},
321 {DatetimeExtractorType_FORTY, 40},
322 {DatetimeExtractorType_FIFTY, 50},
323 {DatetimeExtractorType_SIXTY, 60},
324 {DatetimeExtractorType_SEVENTY, 70},
325 {DatetimeExtractorType_EIGHTY, 80},
326 {DatetimeExtractorType_NINETY, 90},
327 {DatetimeExtractorType_HUNDRED, 100},
328 {DatetimeExtractorType_THOUSAND, 1000},
329 }) {
330 int rule_id;
331 if (!RuleIdForType(type_value_pair.first, &rule_id)) {
332 return false;
333 }
334
335 std::unique_ptr<UniLib::RegexMatcher> matcher =
336 rules_[rule_id]->Matcher(input);
337 if (!matcher) {
338 return false;
339 }
340 int status;
341 while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {
342 int span_start = matcher->Start(&status);
343 if (status != UniLib::RegexMatcher::kNoError) {
344 return false;
345 }
346 found_numbers.push_back({span_start, type_value_pair.second});
347 }
348 }
349
350 std::sort(found_numbers.begin(), found_numbers.end(),
351 [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
352 return a.first < b.first;
353 });
354
355 int sum = 0;
356 int running_value = -1;
357 // Simple math to make sure we handle written numerical modifiers correctly
358 // so that :="fifty one thousand and one" maps to 51001 and not 50 1 1000 1.
359 for (const std::pair<int, int>& position_number_pair : found_numbers) {
360 if (running_value >= 0) {
361 if (running_value > position_number_pair.second) {
362 sum += running_value;
363 running_value = position_number_pair.second;
364 } else {
365 running_value *= position_number_pair.second;
366 }
367 } else {
368 running_value = position_number_pair.second;
369 }
370 }
371 sum += running_value;
372 *parsed_number = sum;
373 return true;
374 }
375
ParseDigits(const UnicodeText & input,int * parsed_digits) const376 bool DatetimeExtractor::ParseDigits(const UnicodeText& input,
377 int* parsed_digits) const {
378 UnicodeText digit;
379 if (!ExtractType(input, DatetimeExtractorType_DIGITS, &digit)) {
380 return false;
381 }
382
383 if (!unilib_.ParseInt32(digit, parsed_digits)) {
384 return false;
385 }
386 return true;
387 }
388
ParseYear(const UnicodeText & input,int * parsed_year) const389 bool DatetimeExtractor::ParseYear(const UnicodeText& input,
390 int* parsed_year) const {
391 if (!ParseDigits(input, parsed_year)) {
392 return false;
393 }
394 *parsed_year = GetAdjustedYear(*parsed_year);
395
396 return true;
397 }
398
ParseMonth(const UnicodeText & input,int * parsed_month) const399 bool DatetimeExtractor::ParseMonth(const UnicodeText& input,
400 int* parsed_month) const {
401 if (ParseDigits(input, parsed_month)) {
402 return true;
403 }
404
405 if (MapInput(input,
406 {
407 {DatetimeExtractorType_JANUARY, 1},
408 {DatetimeExtractorType_FEBRUARY, 2},
409 {DatetimeExtractorType_MARCH, 3},
410 {DatetimeExtractorType_APRIL, 4},
411 {DatetimeExtractorType_MAY, 5},
412 {DatetimeExtractorType_JUNE, 6},
413 {DatetimeExtractorType_JULY, 7},
414 {DatetimeExtractorType_AUGUST, 8},
415 {DatetimeExtractorType_SEPTEMBER, 9},
416 {DatetimeExtractorType_OCTOBER, 10},
417 {DatetimeExtractorType_NOVEMBER, 11},
418 {DatetimeExtractorType_DECEMBER, 12},
419 },
420 parsed_month)) {
421 return true;
422 }
423
424 return false;
425 }
426
ParseAbsoluteDateValues(const UnicodeText & input,std::unordered_map<DatetimeComponent::ComponentType,int> * values) const427 bool DatetimeExtractor::ParseAbsoluteDateValues(
428 const UnicodeText& input,
429 std::unordered_map<DatetimeComponent::ComponentType, int>* values) const {
430 if (MapInput(input,
431 {
432 {DatetimeExtractorType_NOON,
433 {{DatetimeComponent::ComponentType::MERIDIEM, 1},
434 {DatetimeComponent::ComponentType::MINUTE, 0},
435 {DatetimeComponent::ComponentType::HOUR, 12}}},
436 {DatetimeExtractorType_MIDNIGHT,
437 {{DatetimeComponent::ComponentType::MERIDIEM, 0},
438 {DatetimeComponent::ComponentType::MINUTE, 0},
439 {DatetimeComponent::ComponentType::HOUR, 0}}},
440 },
441 values)) {
442 return true;
443 }
444 return false;
445 }
446
ParseMeridiem(const UnicodeText & input,int * parsed_meridiem) const447 bool DatetimeExtractor::ParseMeridiem(const UnicodeText& input,
448 int* parsed_meridiem) const {
449 return MapInput(input,
450 {
451 {DatetimeExtractorType_AM, 0 /* AM */},
452 {DatetimeExtractorType_PM, 1 /* PM */},
453 },
454 parsed_meridiem);
455 }
456
ParseRelationDistance(const UnicodeText & input,int * parsed_distance) const457 bool DatetimeExtractor::ParseRelationDistance(const UnicodeText& input,
458 int* parsed_distance) const {
459 if (ParseDigits(input, parsed_distance)) {
460 return true;
461 }
462 if (ParseWrittenNumber(input, parsed_distance)) {
463 return true;
464 }
465 return false;
466 }
467
ParseRelativeValue(const UnicodeText & input,DatetimeComponent::RelativeQualifier * parsed_relative_value) const468 bool DatetimeExtractor::ParseRelativeValue(
469 const UnicodeText& input,
470 DatetimeComponent::RelativeQualifier* parsed_relative_value) const {
471 return MapInput(input,
472 {
473 {DatetimeExtractorType_NOW,
474 DatetimeComponent::RelativeQualifier::NOW},
475 {DatetimeExtractorType_YESTERDAY,
476 DatetimeComponent::RelativeQualifier::YESTERDAY},
477 {DatetimeExtractorType_TOMORROW,
478 DatetimeComponent::RelativeQualifier::TOMORROW},
479 {DatetimeExtractorType_NEXT,
480 DatetimeComponent::RelativeQualifier::NEXT},
481 {DatetimeExtractorType_NEXT_OR_SAME,
482 DatetimeComponent::RelativeQualifier::THIS},
483 {DatetimeExtractorType_LAST,
484 DatetimeComponent::RelativeQualifier::LAST},
485 {DatetimeExtractorType_PAST,
486 DatetimeComponent::RelativeQualifier::PAST},
487 {DatetimeExtractorType_FUTURE,
488 DatetimeComponent::RelativeQualifier::FUTURE},
489 },
490 parsed_relative_value);
491 }
492
ParseRelationAndConvertToRelativeCount(const UnicodeText & input,int * relative_count) const493 bool DatetimeExtractor::ParseRelationAndConvertToRelativeCount(
494 const UnicodeText& input, int* relative_count) const {
495 return MapInput(input,
496 {
497 {DatetimeExtractorType_NOW, 0},
498 {DatetimeExtractorType_YESTERDAY, -1},
499 {DatetimeExtractorType_TOMORROW, 1},
500 {DatetimeExtractorType_NEXT, 1},
501 {DatetimeExtractorType_NEXT_OR_SAME, 1},
502 {DatetimeExtractorType_LAST, -1},
503 {DatetimeExtractorType_PAST, -1},
504 },
505 relative_count);
506 }
507
ParseDayOfWeek(const UnicodeText & input,int * parsed_day_of_week) const508 bool DatetimeExtractor::ParseDayOfWeek(const UnicodeText& input,
509 int* parsed_day_of_week) const {
510 return MapInput(input,
511 {
512 {DatetimeExtractorType_SUNDAY, kSunday},
513 {DatetimeExtractorType_MONDAY, kMonday},
514 {DatetimeExtractorType_TUESDAY, kTuesday},
515 {DatetimeExtractorType_WEDNESDAY, kWednesday},
516 {DatetimeExtractorType_THURSDAY, kThursday},
517 {DatetimeExtractorType_FRIDAY, kFriday},
518 {DatetimeExtractorType_SATURDAY, kSaturday},
519 },
520 parsed_day_of_week);
521 }
522
ParseFieldType(const UnicodeText & input,DatetimeComponent::ComponentType * parsed_field_type) const523 bool DatetimeExtractor::ParseFieldType(
524 const UnicodeText& input,
525 DatetimeComponent::ComponentType* parsed_field_type) const {
526 return MapInput(
527 input,
528 {
529 {DatetimeExtractorType_MONDAY,
530 DatetimeComponent::ComponentType::DAY_OF_WEEK},
531 {DatetimeExtractorType_TUESDAY,
532 DatetimeComponent::ComponentType::DAY_OF_WEEK},
533 {DatetimeExtractorType_WEDNESDAY,
534 DatetimeComponent::ComponentType::DAY_OF_WEEK},
535 {DatetimeExtractorType_THURSDAY,
536 DatetimeComponent::ComponentType::DAY_OF_WEEK},
537 {DatetimeExtractorType_FRIDAY,
538 DatetimeComponent::ComponentType::DAY_OF_WEEK},
539 {DatetimeExtractorType_SATURDAY,
540 DatetimeComponent::ComponentType::DAY_OF_WEEK},
541 {DatetimeExtractorType_SUNDAY,
542 DatetimeComponent::ComponentType::DAY_OF_WEEK},
543 {DatetimeExtractorType_SECONDS,
544 DatetimeComponent::ComponentType::SECOND},
545 {DatetimeExtractorType_MINUTES,
546 DatetimeComponent::ComponentType::MINUTE},
547 {DatetimeExtractorType_NOW,
548 DatetimeComponent::ComponentType::DAY_OF_MONTH},
549 {DatetimeExtractorType_HOURS, DatetimeComponent::ComponentType::HOUR},
550 {DatetimeExtractorType_DAY,
551 DatetimeComponent::ComponentType::DAY_OF_MONTH},
552 {DatetimeExtractorType_TOMORROW,
553 DatetimeComponent::ComponentType::DAY_OF_MONTH},
554 {DatetimeExtractorType_YESTERDAY,
555 DatetimeComponent::ComponentType::DAY_OF_MONTH},
556 {DatetimeExtractorType_WEEK, DatetimeComponent::ComponentType::WEEK},
557 {DatetimeExtractorType_MONTH,
558 DatetimeComponent::ComponentType::MONTH},
559 {DatetimeExtractorType_YEAR, DatetimeComponent::ComponentType::YEAR},
560 },
561 parsed_field_type);
562 }
563
564 } // namespace libtextclassifier3
565