1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "annotator/number/number_test-include.h"
18
#include <set>
#include <string>
#include <vector>
21
22 #include "annotator/collections.h"
23 #include "annotator/model_generated.h"
24 #include "annotator/types-test-util.h"
25 #include "annotator/types.h"
26 #include "utils/tokenizer-utils.h"
27 #include "utils/utf8/unicodetext.h"
28 #include "gmock/gmock.h"
29 #include "gtest/gtest.h"
30
31 namespace libtextclassifier3 {
32 namespace test_internal {
33
34 using ::testing::AllOf;
35 using ::testing::ElementsAre;
36 using ::testing::Field;
37 using ::testing::Matcher;
38 using ::testing::UnorderedElementsAre;
39
40 const NumberAnnotatorOptions*
TestingNumberAnnotatorOptions()41 NumberAnnotatorTest::TestingNumberAnnotatorOptions() {
42 static const flatbuffers::DetachedBuffer* options_data = []() {
43 NumberAnnotatorOptionsT options;
44 options.enabled = true;
45 options.priority_score = -10.0;
46 options.float_number_priority_score = 1.0;
47 options.enabled_annotation_usecases =
48 1 << AnnotationUsecase_ANNOTATION_USECASE_RAW;
49 options.max_number_of_digits = 20;
50
51 options.percentage_priority_score = 1.0;
52 options.percentage_annotation_usecases =
53 (1 << AnnotationUsecase_ANNOTATION_USECASE_RAW) +
54 (1 << AnnotationUsecase_ANNOTATION_USECASE_SMART);
55 std::set<std::string> percent_suffixes({"パーセント", "percent", "pércént",
56 "pc", "pct", "%", "٪", "﹪", "%"});
57 for (const std::string& string_value : percent_suffixes) {
58 options.percentage_pieces_string.append(string_value);
59 options.percentage_pieces_string.push_back('\0');
60 }
61
62 flatbuffers::FlatBufferBuilder builder;
63 builder.Finish(NumberAnnotatorOptions::Pack(builder, &options));
64 return new flatbuffers::DetachedBuffer(builder.Release());
65 }();
66
67 return flatbuffers::GetRoot<NumberAnnotatorOptions>(options_data->data());
68 }
69
70 MATCHER_P(IsCorrectCollection, collection, "collection is " + collection) {
71 return arg.collection == collection;
72 }
73
74 MATCHER_P(IsCorrectNumericValue, numeric_value,
75 "numeric value is " + std::to_string(numeric_value)) {
76 return arg.numeric_value == numeric_value;
77 }
78
79 MATCHER_P(IsCorrectNumericDoubleValue, numeric_double_value,
80 "numeric double value is " + std::to_string(numeric_double_value)) {
81 return arg.numeric_double_value == numeric_double_value;
82 }
83
84 MATCHER_P(IsCorrectScore, score, "score is " + std::to_string(score)) {
85 return arg.score == score;
86 }
87
88 MATCHER_P(IsCorrectPriortyScore, priority_score,
89 "priority score is " + std::to_string(priority_score)) {
90 return arg.priority_score == priority_score;
91 }
92
93 MATCHER_P(IsCorrectSpan, span,
94 "span is (" + std::to_string(span.first) + "," +
95 std::to_string(span.second) + ")") {
96 return arg.span == span;
97 }
98
99 MATCHER_P(Classification, inner, "") {
100 return testing::ExplainMatchResult(inner, arg.classification,
101 result_listener);
102 }
103
IsAnnotatedSpan(const CodepointSpan & codepoint_span,const std::string & collection,const int int_value,const double double_value,const float priority_score=-10,const float score=1)104 static Matcher<AnnotatedSpan> IsAnnotatedSpan(
105 const CodepointSpan& codepoint_span, const std::string& collection,
106 const int int_value, const double double_value,
107 const float priority_score = -10, const float score = 1) {
108 return AllOf(
109 IsCorrectSpan(codepoint_span),
110 Classification(ElementsAre(AllOf(
111 IsCorrectCollection(collection), IsCorrectNumericValue(int_value),
112 IsCorrectNumericDoubleValue(double_value), IsCorrectScore(score),
113 IsCorrectPriortyScore(priority_score)))));
114 }
115
// An integer selected inside surrounding text is classified as "number" with
// matching integer and double values.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberCorrectly) {
  const auto context = UTF8ToUnicodeText("... 12345 ...");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 9}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_FLOAT_EQ(classification.numeric_double_value, 12345);
}
126
// A selection with a decimal part is classified as "number"; the test expects
// numeric_value 12345 and numeric_double_value 12345.12345.
TEST_F(NumberAnnotatorTest, ClassifiesAndParsesNumberAsFloatCorrectly) {
  const auto context = UTF8ToUnicodeText("... 12345.12345 ...");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 15}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_FLOAT_EQ(classification.numeric_double_value, 12345.12345);
}
137
// A dot that directly follows a number is punctuation, not the start of a
// decimal part: selecting only the digits classifies as a number, while a
// selection that also covers the dot is rejected.
TEST_F(NumberAnnotatorTest,
       ClassifiesAndParsesNumberAsFloatCorrectlyWithoutDecimals) {
  const auto context = UTF8ToUnicodeText("... 12345. ...");

  // Selecting just the digits succeeds. The parsed values are checked right
  // after the successful call so they cannot be affected by later calls.
  ClassificationResult classification_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 9}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification_result));
  EXPECT_EQ(classification_result.collection, "number");
  EXPECT_EQ(classification_result.numeric_value, 12345);
  EXPECT_FLOAT_EQ(classification_result.numeric_double_value, 12345);

  // Including the trailing dot in the selection is rejected. A separate result
  // object is used so that the rejected call does not share an out-parameter
  // with the verified results.
  ClassificationResult rejected_result;
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {4, 10}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &rejected_result));

  // The digits-only selection still classifies after a rejected call.
  ClassificationResult repeated_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 9}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &repeated_result));
  EXPECT_EQ(repeated_result.collection, "number");
  EXPECT_EQ(repeated_result.numeric_value, 12345);
  EXPECT_FLOAT_EQ(repeated_result.numeric_double_value, 12345);
}
161
// FindAll reports integers, floats and percentages found in a sentence.
TEST_F(NumberAnnotatorTest, FindsAllIntegerAndFloatNumbersInText) {
  // "68.9#" is annotated because a single punctuation sign may follow a
  // number; "68.9#?" is not, because two punctuation signs follow it.
  const auto text = UTF8ToUnicodeText(
      "how much is 2 plus 5 divided by 7% minus 3.14 "
      "what about 68.9# or 68.9#?");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(12, 13), "number",
                                  /*int_value=*/2, /*double_value=*/2.0),
                  IsAnnotatedSpan(CodepointSpan(19, 20), "number",
                                  /*int_value=*/5, /*double_value=*/5.0),
                  IsAnnotatedSpan(CodepointSpan(32, 33), "number",
                                  /*int_value=*/7, /*double_value=*/7.0),
                  IsAnnotatedSpan(CodepointSpan(32, 34), "percentage",
                                  /*int_value=*/7, /*double_value=*/7.0,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(41, 45), "number",
                                  /*int_value=*/3, /*double_value=*/3.14,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(57, 61), "number",
                                  /*int_value=*/68, /*double_value=*/68.9,
                                  /*priority_score=*/1)));
}
190
// Selections that mix digits with letters or contain a double dot are not
// classified.
TEST_F(NumberAnnotatorTest, ClassifiesNonNumberCorrectly) {
  ClassificationResult classification;

  // Letter in the middle of the digits.
  const auto letter_inside = UTF8ToUnicodeText("... 123a45 ...");
  EXPECT_FALSE(number_annotator_.ClassifyText(
      letter_inside, {4, 10}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  // Two consecutive dots between digit groups.
  const auto double_dot = UTF8ToUnicodeText("... 12345..12345 ...");
  EXPECT_FALSE(number_annotator_.ClassifyText(
      double_dot, {4, 16}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  // Letter directly after the digits.
  const auto letter_suffix = UTF8ToUnicodeText("... 12345a ...");
  EXPECT_FALSE(number_annotator_.ClassifyText(
      letter_suffix, {4, 11}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
203
// Punctuation after a number is not part of the number: the digits-only
// selection classifies, the selection including the comma does not.
TEST_F(NumberAnnotatorTest, ClassifiesNumberSelectionCorrectly) {
  const auto context = UTF8ToUnicodeText("... 14, ...");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 6}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 14);
  EXPECT_EQ(classification.numeric_double_value, 14);

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {4, 7}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
218
// A number immediately followed by "%" is classified as "percentage".
TEST_F(NumberAnnotatorTest, ClassifiesPercentageSignCorrectly) {
  const auto context = UTF8ToUnicodeText("... 99% ...");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 7}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 99);
  EXPECT_EQ(classification.numeric_double_value, 99);
}
229
// A number followed by the word "percent" is classified as "percentage".
TEST_F(NumberAnnotatorTest, ClassifiesPercentageWordCorrectly) {
  const auto context = UTF8ToUnicodeText("... 15 percent ...");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {4, 14}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 15);
  EXPECT_EQ(classification.numeric_double_value, 15);
}
240
// A number followed by a non-ASCII word that is not a configured percent
// suffix is not classified.
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiPercentageIncorrectSuffix) {
  const auto context = UTF8ToUnicodeText("15 café");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 7}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
247
// A number followed by the accented suffix "pércént" is classified as
// "percentage".
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiFrPercentageCorrectSuffix) {
  const auto context = UTF8ToUnicodeText("25 pércént");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {0, 10}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 25);
  EXPECT_EQ(classification.numeric_double_value, 25);
}
258
// The Japanese percent suffix is recognized both when classifying a selection
// and when scanning a whole sentence.
TEST_F(NumberAnnotatorTest, ClassifiesNonAsciiJaPercentageCorrectSuffix) {
  // Classification of an explicit selection.
  const auto selection_context = UTF8ToUnicodeText("10パーセント");
  ClassificationResult classification;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      selection_context, {0, 7}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 10);
  EXPECT_EQ(classification.numeric_double_value, 10);

  // Annotation of a full sentence.
  const auto sentence =
      UTF8ToUnicodeText("明日の降水確率は10パーセント 音量を12にセット");
  std::vector<AnnotatedSpan> annotations;
  EXPECT_TRUE(number_annotator_.FindAll(
      sentence, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(8, 10), "number",
                                  /*int_value=*/10, /*double_value=*/10.0),
                  IsAnnotatedSpan(CodepointSpan(8, 15), "percentage",
                                  /*int_value=*/10, /*double_value=*/10.0,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(20, 22), "number",
                                  /*int_value=*/12, /*double_value=*/12.0)));
}
282
// FindAll reports every number and percentage in the sentence; the trailing
// "$99." is expected to produce no annotation.
TEST_F(NumberAnnotatorTest, FindsAllNumbersInText) {
  const auto text = UTF8ToUnicodeText(
      "... 12345 ... 9 is my number and 27% or 68# #38 #39 "
      "but not $99.");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(
      annotations,
      UnorderedElementsAre(
          IsAnnotatedSpan(CodepointSpan(4, 9), "number",
                          /*int_value=*/12345, /*double_value=*/12345.0),
          IsAnnotatedSpan(CodepointSpan(14, 15), "number",
                          /*int_value=*/9, /*double_value=*/9.0),
          IsAnnotatedSpan(CodepointSpan(33, 35), "number",
                          /*int_value=*/27, /*double_value=*/27.0),
          IsAnnotatedSpan(CodepointSpan(33, 36), "percentage",
                          /*int_value=*/27, /*double_value=*/27.0,
                          /*priority_score=*/1),
          IsAnnotatedSpan(CodepointSpan(40, 42), "number",
                          /*int_value=*/68, /*double_value=*/68.0),
          IsAnnotatedSpan(CodepointSpan(45, 47), "number",
                          /*int_value=*/38, /*double_value=*/38.0),
          IsAnnotatedSpan(CodepointSpan(49, 51), "number",
                          /*int_value=*/39, /*double_value=*/39.0)));
}
309
// FindAll succeeds but produces no annotations when the text only contains
// malformed number-like tokens.
TEST_F(NumberAnnotatorTest, FindsNoNumberInText) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... 12345a ... 12345..12345 and 123a45 are not valid. "
                        "And -#5% is also bad."),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));

  // empty() avoids comparing the unsigned size() against a signed literal and
  // matches how the other "nothing found" test in this file checks its result.
  EXPECT_TRUE(result.empty());
}
318
// Numbers followed by a single punctuation sign are annotated.
TEST_F(NumberAnnotatorTest, FindsNumberWithPunctuation) {
  // A number may be followed by only one punctuation sign, so "15???" is not
  // expected among the annotations.
  const auto text = UTF8ToUnicodeText(
      "It's 12, 13, 14! Or 15??? For sure 16: 17; 18. and -19");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(5, 7), "number",
                                  /*int_value=*/12, /*double_value=*/12.0),
                  IsAnnotatedSpan(CodepointSpan(9, 11), "number",
                                  /*int_value=*/13, /*double_value=*/13.0),
                  IsAnnotatedSpan(CodepointSpan(13, 15), "number",
                                  /*int_value=*/14, /*double_value=*/14.0),
                  IsAnnotatedSpan(CodepointSpan(35, 37), "number",
                                  /*int_value=*/16, /*double_value=*/16.0),
                  IsAnnotatedSpan(CodepointSpan(39, 41), "number",
                                  /*int_value=*/17, /*double_value=*/17.0),
                  IsAnnotatedSpan(CodepointSpan(43, 45), "number",
                                  /*int_value=*/18, /*double_value=*/18.0),
                  IsAnnotatedSpan(CodepointSpan(51, 54), "number",
                                  /*int_value=*/-19, /*double_value=*/-19.0)));
}
345
// Floating point numbers followed by a single punctuation sign are annotated
// with the float priority score.
TEST_F(NumberAnnotatorTest, FindsFloatNumberWithPunctuation) {
  const auto text = UTF8ToUnicodeText(
      "It's 12.123, 13.45, 14.54321! Or 15.1? Maybe 16.33: "
      "17.21; but for sure 18.90.");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(5, 11), "number",
                                  /*int_value=*/12, /*double_value=*/12.123,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(13, 18), "number",
                                  /*int_value=*/13, /*double_value=*/13.45,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(20, 28), "number",
                                  /*int_value=*/14, /*double_value=*/14.54321,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(33, 37), "number",
                                  /*int_value=*/15, /*double_value=*/15.1,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(45, 50), "number",
                                  /*int_value=*/16, /*double_value=*/16.33,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(52, 57), "number",
                                  /*int_value=*/17, /*double_value=*/17.21,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(72, 77), "number",
                                  /*int_value=*/18, /*double_value=*/18.9,
                                  /*priority_score=*/1)));
}
377
// A negative number that makes up the whole text is annotated, sign included.
TEST_F(NumberAnnotatorTest, HandlesNumbersAtBeginning) {
  const auto text = UTF8ToUnicodeText("-5");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(IsAnnotatedSpan(
                  CodepointSpan(0, 2), "number",
                  /*int_value=*/-5, /*double_value=*/-5)));
}
388
// A single leading minus is part of the number or percentage; "--5%" is
// expected to produce no annotation.
TEST_F(NumberAnnotatorTest, HandlesNegativeNumbers) {
  const auto text =
      UTF8ToUnicodeText("Number -5 and -5% and not number --5%");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(7, 9), "number",
                                  /*int_value=*/-5, /*double_value=*/-5),
                  IsAnnotatedSpan(CodepointSpan(14, 16), "number",
                                  /*int_value=*/-5, /*double_value=*/-5),
                  IsAnnotatedSpan(CodepointSpan(14, 17), "percentage",
                                  /*int_value=*/-5, /*double_value=*/-5,
                                  /*priority_score=*/1)));
}
405
// Each number followed by a percent word or sign yields both a "number"
// annotation for the digits and a "percentage" annotation that includes the
// suffix.
TEST_F(NumberAnnotatorTest, FindGoodPercentageContexts) {
  const auto text = UTF8ToUnicodeText(
      "5 percent, 10 pct, 25 pc and 17%, -5 percent, 10% are percentages");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(0, 9), "percentage",
                                  /*int_value=*/5, /*double_value=*/5,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(11, 13), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(11, 17), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(19, 21), "number",
                                  /*int_value=*/25, /*double_value=*/25),
                  IsAnnotatedSpan(CodepointSpan(19, 24), "percentage",
                                  /*int_value=*/25, /*double_value=*/25,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(29, 31), "number",
                                  /*int_value=*/17, /*double_value=*/17),
                  IsAnnotatedSpan(CodepointSpan(29, 32), "percentage",
                                  /*int_value=*/17, /*double_value=*/17,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(34, 36), "number",
                                  /*int_value=*/-5, /*double_value=*/-5),
                  IsAnnotatedSpan(CodepointSpan(34, 44), "percentage",
                                  /*int_value=*/-5, /*double_value=*/-5,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(46, 48), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(46, 49), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
446
// A text consisting only of "5%" yields a "number" annotation for the digit
// and a "percentage" annotation for the whole text.
TEST_F(NumberAnnotatorTest, FindSinglePercentageInContext) {
  const auto text = UTF8ToUnicodeText("5%");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(0, 2), "percentage",
                                  /*int_value=*/5, /*double_value=*/5,
                                  /*priority_score=*/1)));
}
460
// Numbers that are not directly followed by a valid percent suffix are only
// annotated as plain numbers, if at all.
TEST_F(NumberAnnotatorTest, IgnoreBadPercentageContexts) {
  // A valid number is followed by at most one punctuation element, so "5#:"
  // is not expected among the annotations.
  const auto text =
      UTF8ToUnicodeText("10, pct, 25 prc, 5#: percentage are not percentages");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 2), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(9, 11), "number",
                                  /*int_value=*/25, /*double_value=*/25)));
}
475
// Numbers preceded by two punctuation signs produce neither number nor
// percentage annotations.
TEST_F(NumberAnnotatorTest, IgnoreBadPercentagePunctuationContexts) {
  const auto text = UTF8ToUnicodeText(
      "#!24% or :?33 percent are not valid percentages, nor numbers.");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_TRUE(annotations.empty());
}
485
// Percentages are found at the expected codepoint offsets when the text
// contains a non-ASCII character before them.
TEST_F(NumberAnnotatorTest, FindPercentageInNonAsciiContext) {
  const auto text = UTF8ToUnicodeText(
      "At the café 10% or 25 percent of people are nice. Only 10%!");
  std::vector<AnnotatedSpan> annotations;

  EXPECT_TRUE(number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations));

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(12, 14), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(12, 15), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(19, 21), "number",
                                  /*int_value=*/25, /*double_value=*/25),
                  IsAnnotatedSpan(CodepointSpan(19, 29), "percentage",
                                  /*int_value=*/25, /*double_value=*/25,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(55, 57), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(55, 58), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
511
// Several punctuation signs between the number and the percent word make the
// whole selection unclassifiable.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalIgnoredCharactersDoesNotParseIt) {
  const auto context = UTF8ToUnicodeText("23#!? percent");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 13}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
519
// A selection with unrelated tokens around the number and percent suffix is
// not classified.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomTokensDoesNotParseIt) {
  const auto context = UTF8ToUnicodeText("23 asdf 3.14 pct asdf");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 21}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
527
// Letters glued to the front of the number and to the end of the percent word
// prevent classification.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomPrefixSuffixDoesNotParseIt) {
  const auto context = UTF8ToUnicodeText("abdf23 percentabdf");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 18}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
535
// Runs of punctuation before the number and after the percent word prevent
// classification.
TEST_F(NumberAnnotatorTest,
       WhenPercentSuffixWithAdditionalRandomStringsDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("#?!23 percent#!?");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 16}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
543
// A selection containing both the percent sign and the percent word is not
// classified.
TEST_F(NumberAnnotatorTest, WhenBothPercentSymbolAndSuffixDoesNotParseIt) {
  const auto context = UTF8ToUnicodeText("23% percent");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 11}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
550
// Punctuation in front of a percentage makes the full selection
// unclassifiable.
TEST_F(NumberAnnotatorTest,
       WhenPercentSymbolWithAdditionalPrefixCharactersDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("#?23%");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 5}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
558
// A selection covering a number plus several punctuation signs is not
// classified.
TEST_F(NumberAnnotatorTest, WhenNumberWithAdditionalCharactersDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("23#!?");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 5}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
565
// The "!" after a percentage is not part of the percentage: "23%" classifies,
// "23%!" does not.
TEST_F(NumberAnnotatorTest,
       WhenPercentSymbolWithAdditionalCharactersDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("23%!");
  ClassificationResult classification;

  // Selection without the trailing "!".
  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {0, 3}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 23);
  EXPECT_EQ(classification.numeric_double_value, 23);

  // Selection including the trailing "!".
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
581
// A percent sign separated from the number by several punctuation signs does
// not form a percentage.
TEST_F(NumberAnnotatorTest,
       WhenAdditionalCharactersWithMisplacedPercentSymbolDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("23.:;%");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 6}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
589
// With two leading minus signs, the selection starting at the second minus is
// classified as -11, while the selection covering both is rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("--11");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {1, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, -11),
                    Field(&ClassificationResult::numeric_double_value, -11)));

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
604
// With two leading minus signs before a percentage, the selection starting at
// the second minus is classified as -11 percent, while the selection covering
// both is rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleMinusSignsPercentSignDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("--11%");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {1, 5}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "percentage"),
                    Field(&ClassificationResult::numeric_value, -11),
                    Field(&ClassificationResult::numeric_double_value, -11)));

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 5}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
619
// For "+-11", the selection starting at the minus is classified as -11, while
// the selection that also covers the plus is rejected.
TEST_F(NumberAnnotatorTest, WhenPlusMinusSignsDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("+-11");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {1, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, -11),
                    Field(&ClassificationResult::numeric_double_value, -11)));

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
634
// A "+" directly before a number is not included in the number annotation, so
// neither "+11" nor "-+11" is classified.
TEST_F(NumberAnnotatorTest, WhenMinusPlusSignsDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("-+11");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {1, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
645
// A selection covering a number and a trailing minus sign is not classified.
TEST_F(NumberAnnotatorTest, WhenMinusSignSuffixDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("10-");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 3}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
652
// For "10**", only the digits-only selection is classified; selections that
// include one or both asterisks are rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleCharSuffixDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("10**");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {0, 2}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_THAT(classification,
              AllOf(Field(&ClassificationResult::collection, "number"),
                    Field(&ClassificationResult::numeric_value, 10),
                    Field(&ClassificationResult::numeric_double_value, 10)));

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 3}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
670
// For "**10", selections that include one or both leading asterisks are
// rejected.
TEST_F(NumberAnnotatorTest, WhenMultipleCharPrefixDoesNotParsesIt) {
  const auto context = UTF8ToUnicodeText("**10");
  ClassificationResult classification;

  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {1, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
  EXPECT_FALSE(number_annotator_.ClassifyText(
      context, {0, 4}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));
}
680
// -1000000000 is classified as a number with matching integer and double
// values.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedNumberParsesIt) {
  const auto context = UTF8ToUnicodeText("-1000000000");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {0, 11}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_THAT(
      classification,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -1000000000),
            Field(&ClassificationResult::numeric_double_value, -1000000000)));
}
693
// 1000000000 is classified as a number with matching integer and double
// values.
TEST_F(NumberAnnotatorTest, WhenLargestSupportedNumberParsesIt) {
  const auto context = UTF8ToUnicodeText("1000000000");
  ClassificationResult classification;

  EXPECT_TRUE(number_annotator_.ClassifyText(
      context, {0, 10}, AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification));

  EXPECT_THAT(
      classification,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, 1000000000),
            Field(&ClassificationResult::numeric_double_value, 1000000000)));
}
706
// A negative float with nine integer and nine fractional digits is classified
// as a "number". Note that the expected integer value is the rounded
// -1000000000, not the truncated -999999999.
TEST_F(NumberAnnotatorTest, WhenLowestSupportedFloatNumberParsesIt) {
  const auto text = UTF8ToUnicodeText("-999999999.999999999");
  ClassificationResult classification;

  const bool parsed = number_annotator_.ClassifyText(
      text, CodepointSpan(0, 20), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);
  EXPECT_TRUE(parsed);

  EXPECT_THAT(
      classification,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, -1000000000),
            Field(&ClassificationResult::numeric_double_value,
                  -999999999.999999999)));
}
719
// A positive float with nine integer and nine fractional digits is classified
// as a "number". Note that the expected integer value is the rounded
// 1000000000, not the truncated 999999999.
TEST_F(NumberAnnotatorTest, WhenLargestFloatSupportedNumberParsesIt) {
  const auto text = UTF8ToUnicodeText("999999999.999999999");
  ClassificationResult classification;

  const bool parsed = number_annotator_.ClassifyText(
      text, CodepointSpan(0, 19), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &classification);
  EXPECT_TRUE(parsed);

  EXPECT_THAT(
      classification,
      AllOf(Field(&ClassificationResult::collection, "number"),
            Field(&ClassificationResult::numeric_value, 1000000000),
            Field(&ClassificationResult::numeric_double_value,
                  999999999.999999999)));
}
732
// A 40-digit input must not be classified. The fixture options configure
// max_number_of_digits = 20, so this input is twice that length.
TEST_F(NumberAnnotatorTest, WhenLargeNumberDoesNotParseIt) {
  const auto forty_digits =
      UTF8ToUnicodeText("1234567890123456789012345678901234567890");
  ClassificationResult classification;

  const bool parsed = number_annotator_.ClassifyText(
      forty_digits, CodepointSpan(0, 40),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification);
  EXPECT_FALSE(parsed);
}
739
// A selection covering "2016-2017" (digits joined by an inner '-') must not
// be classified as a single number.
TEST_F(NumberAnnotatorTest, WhenMinusInTheMiddleDoesNotParseIt) {
  const auto year_range = UTF8ToUnicodeText("2016-2017");
  ClassificationResult classification;

  const bool parsed = number_annotator_.ClassifyText(
      year_range, CodepointSpan(0, 9),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification);
  EXPECT_FALSE(parsed);
}
746
// A lone "%" (one of the percent suffixes configured in the fixture options)
// that is not attached to any number must produce no annotations, while
// FindAll itself still reports success.
TEST_F(NumberAnnotatorTest, WhenSuffixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... % ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &result));

  // Use empty() instead of comparing the unsigned size() against the signed
  // literal 0; this avoids a sign-compare warning and matches the style used
  // by BracketsContextNotAnnotatedFindAll.
  EXPECT_TRUE(result.empty());
}
755
// A lone "$" (the prefix this test's name refers to) that is not attached to
// any number must produce no annotations, while FindAll itself still reports
// success.
TEST_F(NumberAnnotatorTest, WhenPrefixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... $ ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &result));

  // Use empty() instead of comparing the unsigned size() against the signed
  // literal 0; this avoids a sign-compare warning and matches the style used
  // by BracketsContextNotAnnotatedFindAll.
  EXPECT_TRUE(result.empty());
}
764
// "$%" (a prefix immediately followed by a suffix, with no digits between
// them) must produce no annotations, while FindAll itself still reports
// success.
TEST_F(NumberAnnotatorTest, WhenPrefixAndSuffixWithoutNumberDoesNotParseIt) {
  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("... $% ..."), AnnotationUsecase_ANNOTATION_USECASE_RAW,
      &result));

  // Use empty() instead of comparing the unsigned size() against the signed
  // literal 0; this avoids a sign-compare warning and matches the style used
  // by BracketsContextNotAnnotatedFindAll.
  EXPECT_TRUE(result.empty());
}
773
// Integer numbers: ClassifyText must report score 1 and the priority score
// configured for plain numbers (options.priority_score = -10.0), and FindAll
// must annotate every integer in the sentence.
TEST_F(NumberAnnotatorTest, ForNumberAnnotationsSetsScoreAndPriorityScore) {
  const auto usecase = AnnotationUsecase_ANNOTATION_USECASE_RAW;

  // Part 1: classify the "12345" selection.
  ClassificationResult classification;
  const bool classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345 ..."), CodepointSpan(4, 9), usecase,
      &classification);
  EXPECT_TRUE(classified);

  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_EQ(classification.numeric_double_value, 12345);
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, -10);

  // Part 2: find both integers ("9" and "10") in free text.
  std::vector<AnnotatedSpan> annotations;
  const bool found = number_annotator_.FindAll(
      UTF8ToUnicodeText("Come at 9 or 10 ok?"), usecase, &annotations);
  EXPECT_TRUE(found);

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(8, 9), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  IsAnnotatedSpan(CodepointSpan(13, 15), "number",
                                  /*int_value=*/10, /*double_value=*/10)));
}
798
// Float numbers: ClassifyText must report score 1 and the priority score
// configured for floats (options.float_number_priority_score = 1.0), and
// FindAll must annotate every float in the sentence with that priority.
TEST_F(NumberAnnotatorTest,
       ForFloatNumberAnnotationsSetsScoreAndPriorityScore) {
  ClassificationResult classification_result;
  EXPECT_TRUE(number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345.12345 ..."), {4, 15},
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &classification_result));
  EXPECT_EQ(classification_result.collection, "number");
  EXPECT_EQ(classification_result.numeric_value, 12345);
  // 12345.12345 is not exactly representable as a double, so compare with
  // gtest's ULP-based floating-point assertion instead of exact equality;
  // this keeps the test independent of how the parser rounds the value.
  EXPECT_DOUBLE_EQ(classification_result.numeric_double_value, 12345.12345);
  EXPECT_EQ(classification_result.score, 1);
  EXPECT_EQ(classification_result.priority_score, 1);

  std::vector<AnnotatedSpan> result;
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("Results are between 12.5 and 13.5, right?"),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));
  EXPECT_THAT(result,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(20, 24), "number",
                                  /*int_value=*/12, /*double_value=*/12.5,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(29, 33), "number",
                                  /*int_value=*/13, /*double_value=*/13.5,
                                  /*priority_score=*/1)));
}
824
// Percentages: a number followed by "%" or by " percent" is classified as
// "percentage" with score 1 and the configured
// options.percentage_priority_score = 1.0. In the RAW usecase FindAll reports
// both the bare number and the enclosing percentage span.
TEST_F(NumberAnnotatorTest, ForPercentageAnnotationsSetsScoreAndPriorityScore) {
  const auto usecase = AnnotationUsecase_ANNOTATION_USECASE_RAW;
  ClassificationResult classification;

  // Symbol suffix directly attached to the number.
  const bool symbol_classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345% ..."), CodepointSpan(4, 10), usecase,
      &classification);
  EXPECT_TRUE(symbol_classified);
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_EQ(classification.numeric_double_value, 12345);
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, 1);

  // Word suffix separated from the number by a space.
  const bool word_classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345 percent ..."), CodepointSpan(4, 17),
      usecase, &classification);
  EXPECT_TRUE(word_classified);
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_EQ(classification.numeric_double_value, 12345);
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, 1);

  // Both suffix forms inside one sentence.
  std::vector<AnnotatedSpan> annotations;
  const bool found = number_annotator_.FindAll(
      UTF8ToUnicodeText("Results are between 9% and 10 percent."), usecase,
      &annotations);
  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  // "9" and "9%".
                  IsAnnotatedSpan(CodepointSpan(20, 21), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  IsAnnotatedSpan(CodepointSpan(20, 22), "percentage",
                                  /*int_value=*/9, /*double_value=*/9,
                                  /*priority_score=*/1),
                  // "10" and "10 percent".
                  IsAnnotatedSpan(CodepointSpan(27, 29), "number",
                                  /*int_value=*/10, /*double_value=*/10),
                  IsAnnotatedSpan(CodepointSpan(27, 37), "percentage",
                                  /*int_value=*/10, /*double_value=*/10,
                                  /*priority_score=*/1)));
}
862
// The fixture options enable plain numbers only for the RAW usecase, while
// percentages are enabled for both RAW and SMART. Under SMART, therefore, a
// bare number must be rejected and a percentage must still be annotated.
TEST_F(NumberAnnotatorTest, NumberDisabledPercentageEnabledForSmartUsecase) {
  const auto usecase = AnnotationUsecase_ANNOTATION_USECASE_SMART;
  ClassificationResult classification;

  // Bare number: not classified in SMART.
  const bool number_classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345 ..."), CodepointSpan(4, 9), usecase,
      &classification);
  EXPECT_FALSE(number_classified);

  // Number with a "%" suffix.
  const bool symbol_classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345% ..."), CodepointSpan(4, 10), usecase,
      &classification);
  EXPECT_TRUE(symbol_classified);
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_EQ(classification.numeric_double_value, 12345.0);
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, 1);

  // Number with a "percent" suffix attached without a space.
  const bool word_classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("... 12345percent ..."), CodepointSpan(4, 16), usecase,
      &classification);
  EXPECT_TRUE(word_classified);
  EXPECT_EQ(classification.collection, "percentage");
  EXPECT_EQ(classification.numeric_value, 12345);
  EXPECT_EQ(classification.numeric_double_value, 12345);
  EXPECT_EQ(classification.score, 1);
  EXPECT_EQ(classification.priority_score, 1);

  // In free text only "9%" is expected; the bare "3" yields no annotation.
  std::vector<AnnotatedSpan> annotations;
  const bool found = number_annotator_.FindAll(
      UTF8ToUnicodeText("Accuracy for experiment 3 is 9%."), usecase,
      &annotations);
  EXPECT_TRUE(found);
  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(29, 31), "percentage",
                                  /*int_value=*/9, /*double_value=*/9.0,
                                  /*priority_score=*/1)));
}
896
// In an arithmetic expression only the operands are annotated; the
// space-separated operators "+", "-" and "*" yield no spans, and "- 96" is
// reported as the positive 96.
TEST_F(NumberAnnotatorTest, MathOperatorsNotAnnotatedAsNumbersFindAll) {
  const auto expression = UTF8ToUnicodeText("how much is 2 + 2 or 5 - 96 * 89");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      expression, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations);
  EXPECT_TRUE(found);

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(12, 13), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(16, 17), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(21, 22), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  IsAnnotatedSpan(CodepointSpan(25, 27), "number",
                                  /*int_value=*/96, /*double_value=*/96),
                  IsAnnotatedSpan(CodepointSpan(30, 32), "number",
                                  /*int_value=*/89, /*double_value=*/89)));
}
916
// Selecting just the operator character (codepoint 2) of an arithmetic
// expression must not be classified as a number.
TEST_F(NumberAnnotatorTest, MathOperatorsNotAnnotatedAsNumbersClassifyText) {
  const auto usecase = AnnotationUsecase_ANNOTATION_USECASE_RAW;
  const CodepointSpan operator_span(2, 3);
  ClassificationResult classification;

  // The selection is "+".
  const bool plus_classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("2 + 2"), operator_span, usecase, &classification);
  EXPECT_FALSE(plus_classified);

  // The selection is "-".
  const bool minus_classified = number_annotator_.ClassifyText(
      UTF8ToUnicodeText("2 - 96 * 89"), operator_span, usecase,
      &classification);
  EXPECT_FALSE(minus_classified);
}
926
// A '/' between two numbers, with or without surrounding spaces, separates
// them: every operand of the expression is annotated individually.
TEST_F(NumberAnnotatorTest, SlashSeparatesTwoNumbersFindAll) {
  const auto expression = UTF8ToUnicodeText("what's 1 + 2/3 * 4/5 * 6 / 7");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      expression, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations);
  EXPECT_TRUE(found);

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(7, 8), "number",
                                  /*int_value=*/1, /*double_value=*/1),
                  // "2/3"
                  IsAnnotatedSpan(CodepointSpan(11, 12), "number",
                                  /*int_value=*/2, /*double_value=*/2),
                  IsAnnotatedSpan(CodepointSpan(13, 14), "number",
                                  /*int_value=*/3, /*double_value=*/3),
                  // "4/5"
                  IsAnnotatedSpan(CodepointSpan(17, 18), "number",
                                  /*int_value=*/4, /*double_value=*/4),
                  IsAnnotatedSpan(CodepointSpan(19, 20), "number",
                                  /*int_value=*/5, /*double_value=*/5),
                  // "6 / 7"
                  IsAnnotatedSpan(CodepointSpan(23, 24), "number",
                                  /*int_value=*/6, /*double_value=*/6),
                  IsAnnotatedSpan(CodepointSpan(27, 28), "number",
                                  /*int_value=*/7, /*double_value=*/7)));
}
950
// Each side of "2/3" can be selected and classified as its own number.
TEST_F(NumberAnnotatorTest, SlashSeparatesTwoNumbersClassifyText) {
  const auto expression = UTF8ToUnicodeText("what's 1 + 2/3 * 4");
  const auto usecase = AnnotationUsecase_ANNOTATION_USECASE_RAW;
  ClassificationResult classification;

  // Numerator "2".
  const bool numerator_classified = number_annotator_.ClassifyText(
      expression, CodepointSpan(11, 12), usecase, &classification);
  EXPECT_TRUE(numerator_classified);
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 2);
  EXPECT_EQ(classification.numeric_double_value, 2);
  EXPECT_EQ(classification.score, 1);

  // Denominator "3".
  const bool denominator_classified = number_annotator_.ClassifyText(
      expression, CodepointSpan(13, 14), usecase, &classification);
  EXPECT_TRUE(denominator_classified);
  EXPECT_EQ(classification.collection, "number");
  EXPECT_EQ(classification.numeric_value, 3);
  EXPECT_EQ(classification.numeric_double_value, 3);
  EXPECT_EQ(classification.score, 1);
}
969
// Tokens that mix digits with letters or with leading/doubled slashes are not
// annotated. The only expected annotation is the "2" of the standalone "2/"
// token: there the trailing '/' counts as punctuation.
TEST_F(NumberAnnotatorTest, SlashDoesNotSeparatesTwoNumbersFindAll) {
  const auto text =
      UTF8ToUnicodeText("what's 2a2/3 or 2/s4 or 2/ or /3 or //3 or 2//");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations);
  EXPECT_TRUE(found);

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(24, 25), "number",
                                  /*int_value=*/2, /*double_value=*/2)));
}
981
// Numbers enclosed in interval brackets "(", ")" and "[" are still annotated,
// including negative values; the float -4.5 carries priority score 1.
TEST_F(NumberAnnotatorTest, BracketsContextAnnotatedFindAll) {
  const auto text =
      UTF8ToUnicodeText("The interval is: (12, 13) or [-12, -4.5)");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations);
  EXPECT_TRUE(found);

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  // "(12, 13)"
                  IsAnnotatedSpan(CodepointSpan(18, 20), "number",
                                  /*int_value=*/12, /*double_value=*/12),
                  IsAnnotatedSpan(CodepointSpan(22, 24), "number",
                                  /*int_value=*/13, /*double_value=*/13),
                  // "[-12, -4.5)"
                  IsAnnotatedSpan(CodepointSpan(30, 33), "number",
                                  /*int_value=*/-12, /*double_value=*/-12),
                  IsAnnotatedSpan(CodepointSpan(35, 39), "number",
                                  /*int_value=*/-4, /*double_value=*/-4.5,
                                  /*priority_score=*/1)));
}
1000
// Bracketed numbers with extra symbols attached ("-(12," and "138*)") must
// not be annotated at all.
TEST_F(NumberAnnotatorTest, BracketsContextNotAnnotatedFindAll) {
  const auto text = UTF8ToUnicodeText("The interval is: -(12, 138*)");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations);
  EXPECT_TRUE(found);

  EXPECT_TRUE(annotations.empty());
}
1009
// Floats written with different period characters are all annotated as
// fractional numbers with priority score 1.
TEST_F(NumberAnnotatorTest, FractionalNumberDotsFindAll) {
  // Dots source: https://unicode-search.net/unicode-namesearch.pl?term=period
  const auto text = UTF8ToUnicodeText("3.1 3﹒2 3.3");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations);
  EXPECT_TRUE(found);

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 3), "number",
                                  /*int_value=*/3, /*double_value=*/3.1,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(4, 7), "number",
                                  /*int_value=*/3, /*double_value=*/3.2,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(8, 11), "number",
                                  /*int_value=*/3, /*double_value=*/3.3,
                                  /*priority_score=*/1)));
}
1028
// Numbers written with non-ASCII digits must be annotated the same way as
// their ASCII equivalents: an integer, two floats (priority score 1) and a
// percentage (priority score 1).
TEST_F(NumberAnnotatorTest, NonAsciiDigitsFindAll) {
  std::vector<AnnotatedSpan> result;
  // Dots source: https://unicode-search.net/unicode-namesearch.pl?term=period
  // Digits source: https://unicode-search.net/unicode-namesearch.pl?term=digit
  //
  // The digits below are FULLWIDTH DIGIT TWO/THREE (U+FF12/U+FF13). The input
  // previously contained only ASCII digits, so the test did not exercise what
  // its name promises. Each fullwidth digit is a single codepoint, so the
  // expected spans are unchanged.
  // NOTE(review): assumes the UniLib backing this fixture recognises
  // fullwidth digits — confirm on all supported UniLib implementations.
  EXPECT_TRUE(number_annotator_.FindAll(
      UTF8ToUnicodeText("３ ３﹒２ ３.３%"),
      AnnotationUsecase_ANNOTATION_USECASE_RAW, &result));

  EXPECT_THAT(result, UnorderedElementsAre(
                          IsAnnotatedSpan(CodepointSpan(0, 1), "number",
                                          /*int_value=*/3, /*double_value=*/3),
                          IsAnnotatedSpan(CodepointSpan(2, 5), "number",
                                          /*int_value=*/3, /*double_value=*/3.2,
                                          /*priority_score=*/1),
                          IsAnnotatedSpan(CodepointSpan(6, 9), "number",
                                          /*int_value=*/3, /*double_value=*/3.3,
                                          /*priority_score=*/1),
                          IsAnnotatedSpan(CodepointSpan(6, 10), "percentage",
                                          /*int_value=*/3, /*double_value=*/3.3,
                                          /*priority_score=*/1)));
}
1050
// Numbers with leading zeros are annotated, and the leading zeros do not
// contribute to the parsed value ("09" -> 9, "032310" -> 32310).
TEST_F(NumberAnnotatorTest, AnnotatedZeroPrecededNumbersFindAll) {
  const auto text = UTF8ToUnicodeText("Numbers: 0.9 or 09 or 09.9 or 032310");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations);
  EXPECT_TRUE(found);

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  // "0.9"
                  IsAnnotatedSpan(CodepointSpan(9, 12), "number",
                                  /*int_value=*/0, /*double_value=*/0.9,
                                  /*priority_score=*/1),
                  // "09"
                  IsAnnotatedSpan(CodepointSpan(16, 18), "number",
                                  /*int_value=*/9, /*double_value=*/9),
                  // "09.9"
                  IsAnnotatedSpan(CodepointSpan(22, 26), "number",
                                  /*int_value=*/9, /*double_value=*/9.9,
                                  /*priority_score=*/1),
                  // "032310"
                  IsAnnotatedSpan(CodepointSpan(30, 36), "number",
                                  /*int_value=*/32310,
                                  /*double_value=*/32310)));
}
1070
// Numbers whose fractional part is all zeros ("15.0", "16.00") are annotated
// over their full span with whole-number values. Unlike the other float
// expectations in this file, no explicit priority score is passed to
// IsAnnotatedSpan here.
TEST_F(NumberAnnotatorTest, ZeroAfterDotFindAll) {
  const auto text = UTF8ToUnicodeText("15.0 16.00");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations);
  EXPECT_TRUE(found);

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 4), "number",
                                  /*int_value=*/15, /*double_value=*/15),
                  IsAnnotatedSpan(CodepointSpan(5, 10), "number",
                                  /*int_value=*/16, /*double_value=*/16)));
}
1084
// Floats made only of nines, with one to four fractional digits, keep their
// exact double value and a truncated integer value (9 or 99); none of them is
// rounded up.
TEST_F(NumberAnnotatorTest, NineDotNineFindAll) {
  const auto text = UTF8ToUnicodeText("9.9 9.99 99.99 99.999 99.9999");
  std::vector<AnnotatedSpan> annotations;

  const bool found = number_annotator_.FindAll(
      text, AnnotationUsecase_ANNOTATION_USECASE_RAW, &annotations);
  EXPECT_TRUE(found);

  EXPECT_THAT(annotations,
              UnorderedElementsAre(
                  IsAnnotatedSpan(CodepointSpan(0, 3), "number",
                                  /*int_value=*/9, /*double_value=*/9.9,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(4, 8), "number",
                                  /*int_value=*/9, /*double_value=*/9.99,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(9, 14), "number",
                                  /*int_value=*/99, /*double_value=*/99.99,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(15, 21), "number",
                                  /*int_value=*/99, /*double_value=*/99.999,
                                  /*priority_score=*/1),
                  IsAnnotatedSpan(CodepointSpan(22, 29), "number",
                                  /*int_value=*/99, /*double_value=*/99.9999,
                                  /*priority_score=*/1)));
}
1109
1110 } // namespace test_internal
1111 } // namespace libtextclassifier3
1112