1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "annotator/duration/duration.h"
18
19 #include <climits>
20 #include <cstdlib>
21
22 #include "annotator/collections.h"
23 #include "annotator/types.h"
24 #include "utils/base/logging.h"
25 #include "utils/strings/numbers.h"
26
27 namespace libtextclassifier3 {
28
29 using DurationUnit = internal::DurationUnit;
30
31 namespace internal {
32
33 namespace {
FillDurationUnitMap(const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> * expressions,DurationUnit duration_unit,std::unordered_map<std::string,DurationUnit> * target_map)34 void FillDurationUnitMap(
35 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*
36 expressions,
37 DurationUnit duration_unit,
38 std::unordered_map<std::string, DurationUnit>* target_map) {
39 if (expressions == nullptr) {
40 return;
41 }
42
43 for (const flatbuffers::String* expression_string : *expressions) {
44 (*target_map)[expression_string->c_str()] = duration_unit;
45 }
46 }
47 } // namespace
48
BuildTokenToDurationUnitMapping(const DurationAnnotatorOptions * options)49 std::unordered_map<std::string, DurationUnit> BuildTokenToDurationUnitMapping(
50 const DurationAnnotatorOptions* options) {
51 std::unordered_map<std::string, DurationUnit> mapping;
52 FillDurationUnitMap(options->week_expressions(), DurationUnit::WEEK,
53 &mapping);
54 FillDurationUnitMap(options->day_expressions(), DurationUnit::DAY, &mapping);
55 FillDurationUnitMap(options->hour_expressions(), DurationUnit::HOUR,
56 &mapping);
57 FillDurationUnitMap(options->minute_expressions(), DurationUnit::MINUTE,
58 &mapping);
59 FillDurationUnitMap(options->second_expressions(), DurationUnit::SECOND,
60 &mapping);
61 return mapping;
62 }
63
BuildStringSet(const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> * strings)64 std::unordered_set<std::string> BuildStringSet(
65 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*
66 strings) {
67 std::unordered_set<std::string> result;
68 if (strings == nullptr) {
69 return result;
70 }
71
72 for (const flatbuffers::String* string_value : *strings) {
73 result.insert(string_value->c_str());
74 }
75
76 return result;
77 }
78
79 } // namespace internal
80
ClassifyText(const UnicodeText & context,CodepointSpan selection_indices,AnnotationUsecase annotation_usecase,ClassificationResult * classification_result) const81 bool DurationAnnotator::ClassifyText(
82 const UnicodeText& context, CodepointSpan selection_indices,
83 AnnotationUsecase annotation_usecase,
84 ClassificationResult* classification_result) const {
85 if (!options_->enabled() || ((options_->enabled_annotation_usecases() &
86 (1 << annotation_usecase))) == 0) {
87 return false;
88 }
89
90 const UnicodeText selection =
91 UnicodeText::Substring(context, selection_indices.first,
92 selection_indices.second, /*do_copy=*/false);
93 const std::vector<Token> tokens = feature_processor_->Tokenize(selection);
94
95 AnnotatedSpan annotated_span;
96 if (FindDurationStartingAt(context, tokens, 0, &annotated_span) !=
97 tokens.size()) {
98 return false;
99 }
100
101 TC3_CHECK(!annotated_span.classification.empty());
102
103 *classification_result = annotated_span.classification[0];
104 return true;
105 }
106
FindAll(const UnicodeText & context,const std::vector<Token> & tokens,AnnotationUsecase annotation_usecase,std::vector<AnnotatedSpan> * results) const107 bool DurationAnnotator::FindAll(const UnicodeText& context,
108 const std::vector<Token>& tokens,
109 AnnotationUsecase annotation_usecase,
110 std::vector<AnnotatedSpan>* results) const {
111 if (!options_->enabled() || ((options_->enabled_annotation_usecases() &
112 (1 << annotation_usecase))) == 0) {
113 return true;
114 }
115
116 for (int i = 0; i < tokens.size();) {
117 AnnotatedSpan span;
118 const int next_i = FindDurationStartingAt(context, tokens, i, &span);
119 if (next_i != i) {
120 results->push_back(span);
121 i = next_i;
122 } else {
123 i++;
124 }
125 }
126 return true;
127 }
128
FindDurationStartingAt(const UnicodeText & context,const std::vector<Token> & tokens,int start_token_index,AnnotatedSpan * result) const129 int DurationAnnotator::FindDurationStartingAt(const UnicodeText& context,
130 const std::vector<Token>& tokens,
131 int start_token_index,
132 AnnotatedSpan* result) const {
133 CodepointIndex start_index = kInvalidIndex;
134 CodepointIndex end_index = kInvalidIndex;
135
136 bool has_quantity = false;
137 ParsedDurationAtom parsed_duration;
138
139 std::vector<ParsedDurationAtom> parsed_duration_atoms;
140
141 // This is the core algorithm for finding the duration expressions. It
142 // basically iterates over tokens and changes the state variables above as it
143 // goes.
144 int token_index;
145 for (token_index = start_token_index; token_index < tokens.size();
146 token_index++) {
147 const Token& token = tokens[token_index];
148
149 if (ParseQuantityToken(token, &parsed_duration)) {
150 has_quantity = true;
151 if (start_index == kInvalidIndex) {
152 start_index = token.start;
153 }
154 end_index = token.end;
155 } else if (ParseDurationUnitToken(token, &parsed_duration.unit)) {
156 if (start_index == kInvalidIndex) {
157 start_index = token.start;
158 }
159 end_index = token.end;
160 parsed_duration_atoms.push_back(parsed_duration);
161 has_quantity = false;
162 parsed_duration = ParsedDurationAtom();
163 } else if (ParseFillerToken(token)) {
164 } else {
165 break;
166 }
167 }
168
169 if (parsed_duration_atoms.empty()) {
170 return start_token_index;
171 }
172
173 const bool parse_ended_without_unit_for_last_mentioned_quantity =
174 has_quantity;
175
176 ClassificationResult classification{Collections::Duration(),
177 options_->score()};
178 classification.priority_score = options_->priority_score();
179 classification.duration_ms =
180 ParsedDurationAtomsToMillis(parsed_duration_atoms);
181
182 // Process suffix expressions like "and half" that don't have the
183 // duration_unit explicitly mentioned.
184 if (parse_ended_without_unit_for_last_mentioned_quantity &&
185 parsed_duration.plus_half) {
186 ParsedDurationAtom atom = ParsedDurationAtom::Half();
187 atom.unit = parsed_duration_atoms.rbegin()->unit;
188 classification.duration_ms += ParsedDurationAtomsToMillis({atom});
189 }
190
191 result->span = feature_processor_->StripBoundaryCodepoints(
192 context, {start_index, end_index});
193 result->classification.push_back(classification);
194 result->source = AnnotatedSpan::Source::DURATION;
195
196 return token_index;
197 }
198
ParsedDurationAtomsToMillis(const std::vector<ParsedDurationAtom> & atoms) const199 int64 DurationAnnotator::ParsedDurationAtomsToMillis(
200 const std::vector<ParsedDurationAtom>& atoms) const {
201 int64 result = 0;
202 for (auto atom : atoms) {
203 int multiplier;
204 switch (atom.unit) {
205 case DurationUnit::WEEK:
206 multiplier = 7 * 24 * 60 * 60 * 1000;
207 break;
208 case DurationUnit::DAY:
209 multiplier = 24 * 60 * 60 * 1000;
210 break;
211 case DurationUnit::HOUR:
212 multiplier = 60 * 60 * 1000;
213 break;
214 case DurationUnit::MINUTE:
215 multiplier = 60 * 1000;
216 break;
217 case DurationUnit::SECOND:
218 multiplier = 1000;
219 break;
220 case DurationUnit::UNKNOWN:
221 TC3_LOG(ERROR) << "Requesting parse of UNKNOWN duration duration_unit.";
222 return -1;
223 break;
224 }
225
226 int value = atom.value;
227 // This condition handles expressions like "an hour", where the quantity is
228 // not specified. In this case we assume quantity 1. Except for cases like
229 // "half hour".
230 if (value == 0 && !atom.plus_half) {
231 value = 1;
232 }
233 result += value * multiplier;
234 result += atom.plus_half * multiplier / 2;
235 }
236 return result;
237 }
238
ParseQuantityToken(const Token & token,ParsedDurationAtom * value) const239 bool DurationAnnotator::ParseQuantityToken(const Token& token,
240 ParsedDurationAtom* value) const {
241 if (token.value.empty()) {
242 return false;
243 }
244
245 std::string token_value_buffer;
246 const std::string& token_value = feature_processor_->StripBoundaryCodepoints(
247 token.value, &token_value_buffer);
248
249 if (half_expressions_.find(token_value) != half_expressions_.end()) {
250 value->plus_half = true;
251 return true;
252 }
253
254 int32 parsed_value;
255 if (ParseInt32(token_value.c_str(), &parsed_value)) {
256 value->value = parsed_value;
257 return true;
258 }
259
260 return false;
261 }
262
ParseDurationUnitToken(const Token & token,DurationUnit * duration_unit) const263 bool DurationAnnotator::ParseDurationUnitToken(
264 const Token& token, DurationUnit* duration_unit) const {
265 std::string token_value_buffer;
266 const std::string& token_value = feature_processor_->StripBoundaryCodepoints(
267 token.value, &token_value_buffer);
268
269 const auto it = token_value_to_duration_unit_.find(token_value);
270 if (it == token_value_to_duration_unit_.end()) {
271 return false;
272 }
273
274 *duration_unit = it->second;
275 return true;
276 }
277
ParseFillerToken(const Token & token) const278 bool DurationAnnotator::ParseFillerToken(const Token& token) const {
279 std::string token_value_buffer;
280 const std::string& token_value = feature_processor_->StripBoundaryCodepoints(
281 token.value, &token_value_buffer);
282
283 if (filler_expressions_.find(token_value) == filler_expressions_.end()) {
284 return false;
285 }
286
287 return true;
288 }
289
290 } // namespace libtextclassifier3
291