1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "annotator/datetime/extractor.h"
18
19 #include "utils/base/logging.h"
20
21 namespace libtextclassifier3 {
22
Extract(DatetimeParsedData * result,CodepointSpan * result_span) const23 bool DatetimeExtractor::Extract(DatetimeParsedData* result,
24 CodepointSpan* result_span) const {
25 *result_span = {kInvalidIndex, kInvalidIndex};
26
27 if (rule_.regex->groups() == nullptr) {
28 return false;
29 }
30
31 // In the current implementation of extractor, the assumption is that there
32 // can only be one relative field.
33 DatetimeComponent::ComponentType component_type;
34 DatetimeComponent::RelativeQualifier relative_qualifier =
35 DatetimeComponent::RelativeQualifier::UNSPECIFIED;
36 int relative_count = 0;
37
38 for (int group_id = 0; group_id < rule_.regex->groups()->size(); group_id++) {
39 UnicodeText group_text;
40 const int group_type = rule_.regex->groups()->Get(group_id);
41 if (group_type == DatetimeGroupType_GROUP_UNUSED) {
42 continue;
43 }
44 if (!GroupTextFromMatch(group_id, &group_text)) {
45 TC3_LOG(ERROR) << "Couldn't retrieve group.";
46 return false;
47 }
48 // The pattern can have a group defined in a part that was not matched,
49 // e.g. an optional part. In this case we'll get an empty content here.
50 if (group_text.empty()) {
51 continue;
52 }
53
54 switch (group_type) {
55 case DatetimeGroupType_GROUP_YEAR: {
56 int year;
57 if (!ParseYear(group_text, &(year))) {
58 TC3_LOG(ERROR) << "Couldn't extract YEAR.";
59 return false;
60 }
61 result->SetAbsoluteValue(DatetimeComponent::ComponentType::YEAR, year);
62 break;
63 }
64 case DatetimeGroupType_GROUP_MONTH: {
65 int month;
66 if (!ParseMonth(group_text, &(month))) {
67 TC3_LOG(ERROR) << "Couldn't extract MONTH.";
68 return false;
69 }
70 result->SetAbsoluteValue(DatetimeComponent::ComponentType::MONTH,
71 month);
72 break;
73 }
74 case DatetimeGroupType_GROUP_DAY: {
75 int day_of_month;
76 if (!ParseDigits(group_text, &(day_of_month))) {
77 TC3_LOG(ERROR) << "Couldn't extract DAY.";
78 return false;
79 }
80 result->SetAbsoluteValue(DatetimeComponent::ComponentType::DAY_OF_MONTH,
81 day_of_month);
82 break;
83 }
84 case DatetimeGroupType_GROUP_HOUR: {
85 int hour;
86 if (!ParseDigits(group_text, &(hour))) {
87 TC3_LOG(ERROR) << "Couldn't extract HOUR.";
88 return false;
89 }
90 result->SetAbsoluteValue(DatetimeComponent::ComponentType::HOUR, hour);
91 break;
92 }
93 case DatetimeGroupType_GROUP_MINUTE: {
94 int minute;
95 if (!ParseDigits(group_text, &(minute)) &&
96 !ParseWrittenNumber(group_text, &(minute))) {
97 TC3_LOG(ERROR) << "Couldn't extract MINUTE.";
98 return false;
99 }
100 result->SetAbsoluteValue(DatetimeComponent::ComponentType::MINUTE,
101 minute);
102 break;
103 }
104 case DatetimeGroupType_GROUP_SECOND: {
105 int second;
106 if (!ParseDigits(group_text, &(second))) {
107 TC3_LOG(ERROR) << "Couldn't extract SECOND.";
108 return false;
109 }
110 result->SetAbsoluteValue(DatetimeComponent::ComponentType::SECOND,
111 second);
112 break;
113 }
114 case DatetimeGroupType_GROUP_AMPM: {
115 int meridiem;
116 if (!ParseMeridiem(group_text, &(meridiem))) {
117 TC3_LOG(ERROR) << "Couldn't extract AMPM.";
118 return false;
119 }
120 result->SetAbsoluteValue(DatetimeComponent::ComponentType::MERIDIEM,
121 meridiem);
122 break;
123 }
124 case DatetimeGroupType_GROUP_RELATIONDISTANCE: {
125 relative_count = 0;
126 if (!ParseRelationDistance(group_text, &(relative_count))) {
127 TC3_LOG(ERROR) << "Couldn't extract RELATION_DISTANCE_FIELD.";
128 return false;
129 }
130 break;
131 }
132 case DatetimeGroupType_GROUP_RELATION: {
133 if (!ParseRelativeValue(group_text, &relative_qualifier)) {
134 TC3_LOG(ERROR) << "Couldn't extract RELATION_FIELD.";
135 return false;
136 }
137 ParseRelationAndConvertToRelativeCount(group_text, &relative_count);
138 if (relative_qualifier ==
139 DatetimeComponent::RelativeQualifier::TOMORROW ||
140 relative_qualifier == DatetimeComponent::RelativeQualifier::NOW ||
141 relative_qualifier ==
142 DatetimeComponent::RelativeQualifier::YESTERDAY) {
143 if (!ParseFieldType(group_text, &component_type)) {
144 TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
145 return false;
146 }
147 }
148 break;
149 }
150 case DatetimeGroupType_GROUP_RELATIONTYPE: {
151 if (!ParseFieldType(group_text, &component_type)) {
152 TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
153 return false;
154 }
155 if (component_type == DatetimeComponent::ComponentType::DAY_OF_WEEK) {
156 int day_of_week;
157 if (!ParseDayOfWeek(group_text, &day_of_week)) {
158 TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
159 return false;
160 }
161 result->SetAbsoluteValue(component_type, day_of_week);
162 }
163 break;
164 }
165 case DatetimeGroupType_GROUP_DUMMY1:
166 case DatetimeGroupType_GROUP_DUMMY2:
167 break;
168 default:
169 TC3_LOG(INFO) << "Unknown group type.";
170 continue;
171 }
172 if (!UpdateMatchSpan(group_id, result_span)) {
173 TC3_LOG(ERROR) << "Couldn't update span.";
174 return false;
175 }
176 }
177
178 if (relative_qualifier != DatetimeComponent::RelativeQualifier::UNSPECIFIED) {
179 result->SetRelativeValue(component_type, relative_qualifier);
180 result->SetRelativeCount(component_type, relative_count);
181 }
182
183 if (result_span->first == kInvalidIndex ||
184 result_span->second == kInvalidIndex) {
185 *result_span = {kInvalidIndex, kInvalidIndex};
186 }
187
188 return true;
189 }
190
RuleIdForType(DatetimeExtractorType type,int * rule_id) const191 bool DatetimeExtractor::RuleIdForType(DatetimeExtractorType type,
192 int* rule_id) const {
193 auto type_it = type_and_locale_to_rule_.find(type);
194 if (type_it == type_and_locale_to_rule_.end()) {
195 return false;
196 }
197
198 auto locale_it = type_it->second.find(locale_id_);
199 if (locale_it == type_it->second.end()) {
200 return false;
201 }
202 *rule_id = locale_it->second;
203 return true;
204 }
205
ExtractType(const UnicodeText & input,DatetimeExtractorType extractor_type,UnicodeText * match_result) const206 bool DatetimeExtractor::ExtractType(const UnicodeText& input,
207 DatetimeExtractorType extractor_type,
208 UnicodeText* match_result) const {
209 int rule_id;
210 if (!RuleIdForType(extractor_type, &rule_id)) {
211 return false;
212 }
213
214 std::unique_ptr<UniLib::RegexMatcher> matcher =
215 rules_[rule_id]->Matcher(input);
216 if (!matcher) {
217 return false;
218 }
219
220 int status;
221 if (!matcher->Find(&status)) {
222 return false;
223 }
224
225 if (match_result != nullptr) {
226 *match_result = matcher->Group(&status);
227 if (status != UniLib::RegexMatcher::kNoError) {
228 return false;
229 }
230 }
231 return true;
232 }
233
GroupTextFromMatch(int group_id,UnicodeText * result) const234 bool DatetimeExtractor::GroupTextFromMatch(int group_id,
235 UnicodeText* result) const {
236 int status;
237 *result = matcher_.Group(group_id, &status);
238 if (status != UniLib::RegexMatcher::kNoError) {
239 return false;
240 }
241 return true;
242 }
243
UpdateMatchSpan(int group_id,CodepointSpan * span) const244 bool DatetimeExtractor::UpdateMatchSpan(int group_id,
245 CodepointSpan* span) const {
246 int status;
247 const int match_start = matcher_.Start(group_id, &status);
248 if (status != UniLib::RegexMatcher::kNoError) {
249 return false;
250 }
251 const int match_end = matcher_.End(group_id, &status);
252 if (status != UniLib::RegexMatcher::kNoError) {
253 return false;
254 }
255 if (span->first == kInvalidIndex || span->first > match_start) {
256 span->first = match_start;
257 }
258 if (span->second == kInvalidIndex || span->second < match_end) {
259 span->second = match_end;
260 }
261
262 return true;
263 }
264
265 template <typename T>
MapInput(const UnicodeText & input,const std::vector<std::pair<DatetimeExtractorType,T>> & mapping,T * result) const266 bool DatetimeExtractor::MapInput(
267 const UnicodeText& input,
268 const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
269 T* result) const {
270 for (const auto& type_value_pair : mapping) {
271 if (ExtractType(input, type_value_pair.first)) {
272 *result = type_value_pair.second;
273 return true;
274 }
275 }
276 return false;
277 }
278
ParseWrittenNumber(const UnicodeText & input,int * parsed_number) const279 bool DatetimeExtractor::ParseWrittenNumber(const UnicodeText& input,
280 int* parsed_number) const {
281 std::vector<std::pair<int, int>> found_numbers;
282 for (const auto& type_value_pair :
283 std::vector<std::pair<DatetimeExtractorType, int>>{
284 {DatetimeExtractorType_ZERO, 0},
285 {DatetimeExtractorType_ONE, 1},
286 {DatetimeExtractorType_TWO, 2},
287 {DatetimeExtractorType_THREE, 3},
288 {DatetimeExtractorType_FOUR, 4},
289 {DatetimeExtractorType_FIVE, 5},
290 {DatetimeExtractorType_SIX, 6},
291 {DatetimeExtractorType_SEVEN, 7},
292 {DatetimeExtractorType_EIGHT, 8},
293 {DatetimeExtractorType_NINE, 9},
294 {DatetimeExtractorType_TEN, 10},
295 {DatetimeExtractorType_ELEVEN, 11},
296 {DatetimeExtractorType_TWELVE, 12},
297 {DatetimeExtractorType_THIRTEEN, 13},
298 {DatetimeExtractorType_FOURTEEN, 14},
299 {DatetimeExtractorType_FIFTEEN, 15},
300 {DatetimeExtractorType_SIXTEEN, 16},
301 {DatetimeExtractorType_SEVENTEEN, 17},
302 {DatetimeExtractorType_EIGHTEEN, 18},
303 {DatetimeExtractorType_NINETEEN, 19},
304 {DatetimeExtractorType_TWENTY, 20},
305 {DatetimeExtractorType_THIRTY, 30},
306 {DatetimeExtractorType_FORTY, 40},
307 {DatetimeExtractorType_FIFTY, 50},
308 {DatetimeExtractorType_SIXTY, 60},
309 {DatetimeExtractorType_SEVENTY, 70},
310 {DatetimeExtractorType_EIGHTY, 80},
311 {DatetimeExtractorType_NINETY, 90},
312 {DatetimeExtractorType_HUNDRED, 100},
313 {DatetimeExtractorType_THOUSAND, 1000},
314 }) {
315 int rule_id;
316 if (!RuleIdForType(type_value_pair.first, &rule_id)) {
317 return false;
318 }
319
320 std::unique_ptr<UniLib::RegexMatcher> matcher =
321 rules_[rule_id]->Matcher(input);
322 if (!matcher) {
323 return false;
324 }
325 int status;
326 while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {
327 int span_start = matcher->Start(&status);
328 if (status != UniLib::RegexMatcher::kNoError) {
329 return false;
330 }
331 found_numbers.push_back({span_start, type_value_pair.second});
332 }
333 }
334
335 std::sort(found_numbers.begin(), found_numbers.end(),
336 [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
337 return a.first < b.first;
338 });
339
340 int sum = 0;
341 int running_value = -1;
342 // Simple math to make sure we handle written numerical modifiers correctly
343 // so that :="fifty one thousand and one" maps to 51001 and not 50 1 1000 1.
344 for (const std::pair<int, int>& position_number_pair : found_numbers) {
345 if (running_value >= 0) {
346 if (running_value > position_number_pair.second) {
347 sum += running_value;
348 running_value = position_number_pair.second;
349 } else {
350 running_value *= position_number_pair.second;
351 }
352 } else {
353 running_value = position_number_pair.second;
354 }
355 }
356 sum += running_value;
357 *parsed_number = sum;
358 return true;
359 }
360
ParseDigits(const UnicodeText & input,int * parsed_digits) const361 bool DatetimeExtractor::ParseDigits(const UnicodeText& input,
362 int* parsed_digits) const {
363 UnicodeText digit;
364 if (!ExtractType(input, DatetimeExtractorType_DIGITS, &digit)) {
365 return false;
366 }
367
368 if (!unilib_.ParseInt32(digit, parsed_digits)) {
369 return false;
370 }
371 return true;
372 }
373
ParseYear(const UnicodeText & input,int * parsed_year) const374 bool DatetimeExtractor::ParseYear(const UnicodeText& input,
375 int* parsed_year) const {
376 if (!ParseDigits(input, parsed_year)) {
377 return false;
378 }
379
380 // Logic to decide if XX will be 20XX or 19XX
381 if (*parsed_year < 100) {
382 if (*parsed_year < 50) {
383 *parsed_year += 2000;
384 } else {
385 *parsed_year += 1900;
386 }
387 }
388
389 return true;
390 }
391
ParseMonth(const UnicodeText & input,int * parsed_month) const392 bool DatetimeExtractor::ParseMonth(const UnicodeText& input,
393 int* parsed_month) const {
394 if (ParseDigits(input, parsed_month)) {
395 return true;
396 }
397
398 if (MapInput(input,
399 {
400 {DatetimeExtractorType_JANUARY, 1},
401 {DatetimeExtractorType_FEBRUARY, 2},
402 {DatetimeExtractorType_MARCH, 3},
403 {DatetimeExtractorType_APRIL, 4},
404 {DatetimeExtractorType_MAY, 5},
405 {DatetimeExtractorType_JUNE, 6},
406 {DatetimeExtractorType_JULY, 7},
407 {DatetimeExtractorType_AUGUST, 8},
408 {DatetimeExtractorType_SEPTEMBER, 9},
409 {DatetimeExtractorType_OCTOBER, 10},
410 {DatetimeExtractorType_NOVEMBER, 11},
411 {DatetimeExtractorType_DECEMBER, 12},
412 },
413 parsed_month)) {
414 return true;
415 }
416
417 return false;
418 }
419
ParseMeridiem(const UnicodeText & input,int * parsed_meridiem) const420 bool DatetimeExtractor::ParseMeridiem(const UnicodeText& input,
421 int* parsed_meridiem) const {
422 return MapInput(input,
423 {
424 {DatetimeExtractorType_AM, 0 /* AM */},
425 {DatetimeExtractorType_PM, 1 /* PM */},
426 },
427 parsed_meridiem);
428 }
429
ParseRelationDistance(const UnicodeText & input,int * parsed_distance) const430 bool DatetimeExtractor::ParseRelationDistance(const UnicodeText& input,
431 int* parsed_distance) const {
432 if (ParseDigits(input, parsed_distance)) {
433 return true;
434 }
435 if (ParseWrittenNumber(input, parsed_distance)) {
436 return true;
437 }
438 return false;
439 }
440
ParseRelativeValue(const UnicodeText & input,DatetimeComponent::RelativeQualifier * parsed_relative_value) const441 bool DatetimeExtractor::ParseRelativeValue(
442 const UnicodeText& input,
443 DatetimeComponent::RelativeQualifier* parsed_relative_value) const {
444 return MapInput(input,
445 {
446 {DatetimeExtractorType_NOW,
447 DatetimeComponent::RelativeQualifier::NOW},
448 {DatetimeExtractorType_YESTERDAY,
449 DatetimeComponent::RelativeQualifier::YESTERDAY},
450 {DatetimeExtractorType_TOMORROW,
451 DatetimeComponent::RelativeQualifier::TOMORROW},
452 {DatetimeExtractorType_NEXT,
453 DatetimeComponent::RelativeQualifier::NEXT},
454 {DatetimeExtractorType_NEXT_OR_SAME,
455 DatetimeComponent::RelativeQualifier::THIS},
456 {DatetimeExtractorType_LAST,
457 DatetimeComponent::RelativeQualifier::LAST},
458 {DatetimeExtractorType_PAST,
459 DatetimeComponent::RelativeQualifier::PAST},
460 {DatetimeExtractorType_FUTURE,
461 DatetimeComponent::RelativeQualifier::FUTURE},
462 },
463 parsed_relative_value);
464 }
465
ParseRelationAndConvertToRelativeCount(const UnicodeText & input,int * relative_count) const466 bool DatetimeExtractor::ParseRelationAndConvertToRelativeCount(
467 const UnicodeText& input, int* relative_count) const {
468 return MapInput(input,
469 {
470 {DatetimeExtractorType_NOW, 0},
471 {DatetimeExtractorType_YESTERDAY, -1},
472 {DatetimeExtractorType_TOMORROW, 1},
473 {DatetimeExtractorType_NEXT, 1},
474 {DatetimeExtractorType_NEXT_OR_SAME, 1},
475 {DatetimeExtractorType_LAST, -1},
476 {DatetimeExtractorType_PAST, -1},
477 },
478 relative_count);
479 }
480
ParseDayOfWeek(const UnicodeText & input,int * parsed_day_of_week) const481 bool DatetimeExtractor::ParseDayOfWeek(const UnicodeText& input,
482 int* parsed_day_of_week) const {
483 return MapInput(input,
484 {
485 {DatetimeExtractorType_SUNDAY, kSunday},
486 {DatetimeExtractorType_MONDAY, kMonday},
487 {DatetimeExtractorType_TUESDAY, kTuesday},
488 {DatetimeExtractorType_WEDNESDAY, kWednesday},
489 {DatetimeExtractorType_THURSDAY, kThursday},
490 {DatetimeExtractorType_FRIDAY, kFriday},
491 {DatetimeExtractorType_SATURDAY, kSaturday},
492 },
493 parsed_day_of_week);
494 }
495
ParseFieldType(const UnicodeText & input,DatetimeComponent::ComponentType * parsed_field_type) const496 bool DatetimeExtractor::ParseFieldType(
497 const UnicodeText& input,
498 DatetimeComponent::ComponentType* parsed_field_type) const {
499 return MapInput(
500 input,
501 {
502 {DatetimeExtractorType_MONDAY,
503 DatetimeComponent::ComponentType::DAY_OF_WEEK},
504 {DatetimeExtractorType_TUESDAY,
505 DatetimeComponent::ComponentType::DAY_OF_WEEK},
506 {DatetimeExtractorType_WEDNESDAY,
507 DatetimeComponent::ComponentType::DAY_OF_WEEK},
508 {DatetimeExtractorType_THURSDAY,
509 DatetimeComponent::ComponentType::DAY_OF_WEEK},
510 {DatetimeExtractorType_FRIDAY,
511 DatetimeComponent::ComponentType::DAY_OF_WEEK},
512 {DatetimeExtractorType_SATURDAY,
513 DatetimeComponent::ComponentType::DAY_OF_WEEK},
514 {DatetimeExtractorType_SUNDAY,
515 DatetimeComponent::ComponentType::DAY_OF_WEEK},
516 {DatetimeExtractorType_SECONDS,
517 DatetimeComponent::ComponentType::SECOND},
518 {DatetimeExtractorType_MINUTES,
519 DatetimeComponent::ComponentType::MINUTE},
520 {DatetimeExtractorType_NOW,
521 DatetimeComponent::ComponentType::DAY_OF_MONTH},
522 {DatetimeExtractorType_HOURS, DatetimeComponent::ComponentType::HOUR},
523 {DatetimeExtractorType_DAY,
524 DatetimeComponent::ComponentType::DAY_OF_MONTH},
525 {DatetimeExtractorType_TOMORROW,
526 DatetimeComponent::ComponentType::DAY_OF_MONTH},
527 {DatetimeExtractorType_YESTERDAY,
528 DatetimeComponent::ComponentType::DAY_OF_MONTH},
529 {DatetimeExtractorType_WEEK, DatetimeComponent::ComponentType::WEEK},
530 {DatetimeExtractorType_MONTH,
531 DatetimeComponent::ComponentType::MONTH},
532 {DatetimeExtractorType_YEAR, DatetimeComponent::ComponentType::YEAR},
533 },
534 parsed_field_type);
535 }
536
537 } // namespace libtextclassifier3
538