1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "feature-processor.h"
18
19 #include "model-executor.h"
20 #include "tensor-view.h"
21
22 #include "gmock/gmock.h"
23 #include "gtest/gtest.h"
24
25 namespace libtextclassifier2 {
26 namespace {
27
28 using testing::ElementsAreArray;
29 using testing::FloatEq;
30 using testing::Matcher;
31
PackFeatureProcessorOptions(const FeatureProcessorOptionsT & options)32 flatbuffers::DetachedBuffer PackFeatureProcessorOptions(
33 const FeatureProcessorOptionsT& options) {
34 flatbuffers::FlatBufferBuilder builder;
35 builder.Finish(CreateFeatureProcessorOptions(builder, &options));
36 return builder.Release();
37 }
38
// Returns a copy of the elements of `vector` in the half-open index range
// [start, end).
//
// The range is clamped to the bounds of `vector`, and an empty or inverted
// range yields an empty result, so a wrong index produces a plain matcher
// mismatch in the calling test rather than undefined iterator arithmetic.
template <typename T>
std::vector<T> Subvector(const std::vector<T>& vector, int start, int end) {
  const int size = static_cast<int>(vector.size());
  if (start < 0) {
    start = 0;
  }
  if (end > size) {
    end = size;
  }
  if (start >= end) {
    return std::vector<T>();
  }
  return std::vector<T>(vector.begin() + start, vector.begin() + end);
}
43
ElementsAreFloat(const std::vector<float> & values)44 Matcher<std::vector<float>> ElementsAreFloat(const std::vector<float>& values) {
45 std::vector<Matcher<float>> matchers;
46 for (const float value : values) {
47 matchers.push_back(FloatEq(value));
48 }
49 return ElementsAreArray(matchers);
50 }
51
52 class TestingFeatureProcessor : public FeatureProcessor {
53 public:
54 using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;
55 using FeatureProcessor::FeatureProcessor;
56 using FeatureProcessor::ICUTokenize;
57 using FeatureProcessor::IsCodepointInRanges;
58 using FeatureProcessor::SpanToLabel;
59 using FeatureProcessor::StripTokensFromOtherLines;
60 using FeatureProcessor::supported_codepoint_ranges_;
61 using FeatureProcessor::SupportedCodepointsRatio;
62 };
63
64 // EmbeddingExecutor that always returns features based on
65 class FakeEmbeddingExecutor : public EmbeddingExecutor {
66 public:
AddEmbedding(const TensorView<int> & sparse_features,float * dest,int dest_size) const67 bool AddEmbedding(const TensorView<int>& sparse_features, float* dest,
68 int dest_size) const override {
69 TC_CHECK_GE(dest_size, 4);
70 EXPECT_EQ(sparse_features.size(), 1);
71 dest[0] = sparse_features.data()[0];
72 dest[1] = sparse_features.data()[0];
73 dest[2] = -sparse_features.data()[0];
74 dest[3] = -sparse_features.data()[0];
75 return true;
76 }
77
78 private:
79 std::vector<float> storage_;
80 };
81
// The selection {9, 12} starts and ends strictly inside the second token, so
// that token is expected to be split into three pieces while its neighbours
// stay untouched.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};

  internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("Hělló", 0, 5),
      Token("fěě", 6, 9),
      Token("bař", 9, 12),
      Token("@google.com", 12, 23),
      Token("heře!", 24, 29)};
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
98
// The selection {6, 12} starts exactly at the beginning of the second token,
// so only its end boundary is expected to produce a split.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};

  internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař", 6, 12),
      Token("@google.com", 12, 23),
      Token("heře!", 24, 29)};
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
114
// The selection {9, 23} ends exactly at the end of the second token, so only
// its start boundary is expected to produce a split.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};

  internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("Hělló", 0, 5),
      Token("fěě", 6, 9),
      Token("bař@google.com", 9, 23),
      Token("heře!", 24, 29)};
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
130
// The selection {6, 23} coincides with the whole second token, so the token
// list is expected to come back unchanged.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};

  internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29)};
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
145
// The selection {2, 9} starts inside the first token and ends inside the
// second one, so each of those two tokens is expected to be split once.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};

  internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("Hě", 0, 2),
      Token("lló", 2, 5),
      Token("fěě", 6, 9),
      Token("bař@google.com", 9, 23),
      Token("heře!", 24, 29)};
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
162
// With only_use_line_with_click enabled, a click on the first line is
// expected to leave only the tokens of that line.
TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT processor_options;
  processor_options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(processor_options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {0, 5};
  // clang-format off
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),
      Token("Lině", 6, 10),
      Token("Sěcond", 11, 17),
      Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),
      Token("Lině", 29, 33)};
  // clang-format on

  // Keeps the first line.
  processor.StripTokensFromOtherLines(text, click, &tokens);

  const std::vector<Token> first_line = {Token("Fiřst", 0, 5),
                                         Token("Lině", 6, 10)};
  EXPECT_THAT(tokens, ElementsAreArray(first_line));
}
188
// With only_use_line_with_click enabled, a click on the second line is
// expected to leave only the tokens of that line.
TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT processor_options;
  processor_options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(processor_options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {18, 22};
  // clang-format off
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),
      Token("Lině", 6, 10),
      Token("Sěcond", 11, 17),
      Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),
      Token("Lině", 29, 33)};
  // clang-format on

  // Keeps the second line.
  processor.StripTokensFromOtherLines(text, click, &tokens);

  const std::vector<Token> second_line = {Token("Sěcond", 11, 17),
                                          Token("Lině", 18, 22)};
  EXPECT_THAT(tokens, ElementsAreArray(second_line));
}
214
// With only_use_line_with_click enabled, a click on the third line is
// expected to leave only the tokens of that line.
TEST(FeatureProcessorTest, KeepLineWithClickThird) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT processor_options;
  processor_options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(processor_options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {24, 33};
  // clang-format off
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),
      Token("Lině", 6, 10),
      Token("Sěcond", 11, 17),
      Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),
      Token("Lině", 29, 33)};
  // clang-format on

  // Keeps the third line.
  processor.StripTokensFromOtherLines(text, click, &tokens);

  const std::vector<Token> third_line = {Token("Thiřd", 23, 28),
                                         Token("Lině", 29, 33)};
  EXPECT_THAT(tokens, ElementsAreArray(third_line));
}
240
// Same as KeepLineWithClickSecond, but the first two lines are separated by
// '|' instead of '\n'. The expected result shows that the pipe is treated as
// a line boundary as well.
TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT processor_options;
  processor_options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(processor_options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib);

  const std::string text = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  const CodepointSpan click = {18, 22};
  // clang-format off
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),
      Token("Lině", 6, 10),
      Token("Sěcond", 11, 17),
      Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),
      Token("Lině", 29, 33)};
  // clang-format on

  // Keeps the second line (the part between '|' and '\n').
  processor.StripTokensFromOtherLines(text, click, &tokens);

  const std::vector<Token> second_line = {Token("Sěcond", 11, 17),
                                          Token("Lině", 18, 22)};
  EXPECT_THAT(tokens, ElementsAreArray(second_line));
}
266
// A click span that reaches across line breaks ({5, 23}) is expected to leave
// the token list untouched: no line is stripped.
TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT processor_options;
  processor_options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(processor_options);
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
      &unilib);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {5, 23};
  // NOTE(review): the spans of the two second-line tokens (18-23 and 19-23)
  // differ from the sibling KeepLine tests; they are kept as in the original.
  // clang-format off
  std::vector<Token> tokens = {
      Token("Fiřst", 0, 5),
      Token("Lině", 6, 10),
      Token("Sěcond", 18, 23),
      Token("Lině", 19, 23),
      Token("Thiřd", 23, 28),
      Token("Lině", 29, 33)};
  // clang-format on

  // Keeps every line, because the click crosses line boundaries.
  processor.StripTokensFromOtherLines(text, click, &tokens);

  // clang-format off
  const std::vector<Token> all_lines = {
      Token("Fiřst", 0, 5),
      Token("Lině", 6, 10),
      Token("Sěcond", 18, 23),
      Token("Lině", 19, 23),
      Token("Thiřd", 23, 28),
      Token("Lině", 29, 33)};
  // clang-format on
  EXPECT_THAT(tokens, ElementsAreArray(all_lines));
}
294
// Exercises SpanToLabel / LabelToTokenSpan with and without snapping of the
// span boundaries to the containing tokens, first for single-token and then
// for multi-token selections.
TEST(FeatureProcessorTest, SpanToLabel) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT options;
  options.context_size = 1;
  options.max_selection_span = 1;
  options.snap_label_span_boundaries_to_containing_tokens = false;

  // Codepoint 32 is the space character; the range is presumably
  // end-exclusive, so only spaces separate tokens.
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& space_range = options.tokenization_codepoint_config.back();
  space_range->start = 32;
  space_range->end = 33;
  space_range->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  flatbuffers::DetachedBuffer exact_options_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor exact_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(exact_options_fb.data()),
      &unilib);
  std::vector<Token> tokens = exact_processor.Tokenize("one, two, three");
  ASSERT_EQ(3, tokens.size());

  // Without snapping, {5, 8} yields the invalid label while {5, 9} yields a
  // valid one.
  int reference_label;
  ASSERT_TRUE(exact_processor.SpanToLabel({5, 8}, tokens, &reference_label));
  EXPECT_EQ(kInvalidLabel, reference_label);
  ASSERT_TRUE(exact_processor.SpanToLabel({5, 9}, tokens, &reference_label));
  EXPECT_NE(kInvalidLabel, reference_label);
  TokenSpan label_span;
  exact_processor.LabelToTokenSpan(reference_label, &label_span);
  EXPECT_EQ(0, label_span.first);
  EXPECT_EQ(0, label_span.second);

  // Reconfigure with snapping enabled: spans inside the same token must all
  // map to the reference label.
  options.snap_label_span_boundaries_to_containing_tokens = true;
  flatbuffers::DetachedBuffer snapping_options_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor snapping_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(
          snapping_options_fb.data()),
      &unilib);
  int probe_label;
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 8}, tokens, &probe_label));
  EXPECT_EQ(reference_label, probe_label);
  ASSERT_TRUE(snapping_processor.SpanToLabel({6, 9}, tokens, &probe_label));
  EXPECT_EQ(reference_label, probe_label);
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 9}, tokens, &probe_label));
  EXPECT_EQ(reference_label, probe_label);

  // Cross a token boundary.
  ASSERT_TRUE(snapping_processor.SpanToLabel({4, 9}, tokens, &probe_label));
  EXPECT_EQ(kInvalidLabel, probe_label);
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 10}, tokens, &probe_label));
  EXPECT_EQ(kInvalidLabel, probe_label);

  // Multiple tokens.
  options.context_size = 2;
  options.max_selection_span = 2;
  flatbuffers::DetachedBuffer multi_options_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor multi_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(multi_options_fb.data()),
      &unilib);
  tokens = multi_processor.Tokenize("zero, one, two, three, four");
  ASSERT_TRUE(multi_processor.SpanToLabel({6, 15}, tokens, &probe_label));
  EXPECT_NE(kInvalidLabel, probe_label);
  multi_processor.LabelToTokenSpan(probe_label, &label_span);
  EXPECT_EQ(1, label_span.first);
  EXPECT_EQ(0, label_span.second);

  // Spans that snap to the same pair of tokens must produce the same label.
  int variant_label;
  ASSERT_TRUE(multi_processor.SpanToLabel({6, 14}, tokens, &variant_label));
  EXPECT_EQ(probe_label, variant_label);
  ASSERT_TRUE(multi_processor.SpanToLabel({6, 13}, tokens, &variant_label));
  EXPECT_EQ(probe_label, variant_label);
  ASSERT_TRUE(multi_processor.SpanToLabel({7, 13}, tokens, &variant_label));
  EXPECT_EQ(probe_label, variant_label);
}
369
// Exercises SpanToLabel / LabelToTokenSpan with and without snapping of the
// span boundaries to the containing tokens.
//
// NOTE(review): the statements of this test are identical to those of
// FeatureProcessorTest.SpanToLabel, and no punctuation-specific option is set
// here. Confirm whether a dedicated configuration was intended.
TEST(FeatureProcessorTest, SpanToLabelIgnoresPunctuation) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT options;
  options.context_size = 1;
  options.max_selection_span = 1;
  options.snap_label_span_boundaries_to_containing_tokens = false;

  // Codepoint 32 is the space character; the range is presumably
  // end-exclusive, so only spaces separate tokens.
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& space_range = options.tokenization_codepoint_config.back();
  space_range->start = 32;
  space_range->end = 33;
  space_range->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  flatbuffers::DetachedBuffer exact_options_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor exact_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(exact_options_fb.data()),
      &unilib);
  std::vector<Token> tokens = exact_processor.Tokenize("one, two, three");
  ASSERT_EQ(3, tokens.size());

  // Without snapping, {5, 8} yields the invalid label while {5, 9} yields a
  // valid one.
  int reference_label;
  ASSERT_TRUE(exact_processor.SpanToLabel({5, 8}, tokens, &reference_label));
  EXPECT_EQ(kInvalidLabel, reference_label);
  ASSERT_TRUE(exact_processor.SpanToLabel({5, 9}, tokens, &reference_label));
  EXPECT_NE(kInvalidLabel, reference_label);
  TokenSpan label_span;
  exact_processor.LabelToTokenSpan(reference_label, &label_span);
  EXPECT_EQ(0, label_span.first);
  EXPECT_EQ(0, label_span.second);

  // Reconfigure with snapping enabled: spans inside the same token must all
  // map to the reference label.
  options.snap_label_span_boundaries_to_containing_tokens = true;
  flatbuffers::DetachedBuffer snapping_options_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor snapping_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(
          snapping_options_fb.data()),
      &unilib);
  int probe_label;
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 8}, tokens, &probe_label));
  EXPECT_EQ(reference_label, probe_label);
  ASSERT_TRUE(snapping_processor.SpanToLabel({6, 9}, tokens, &probe_label));
  EXPECT_EQ(reference_label, probe_label);
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 9}, tokens, &probe_label));
  EXPECT_EQ(reference_label, probe_label);

  // Cross a token boundary.
  ASSERT_TRUE(snapping_processor.SpanToLabel({4, 9}, tokens, &probe_label));
  EXPECT_EQ(kInvalidLabel, probe_label);
  ASSERT_TRUE(snapping_processor.SpanToLabel({5, 10}, tokens, &probe_label));
  EXPECT_EQ(kInvalidLabel, probe_label);

  // Multiple tokens.
  options.context_size = 2;
  options.max_selection_span = 2;
  flatbuffers::DetachedBuffer multi_options_fb =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor multi_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(multi_options_fb.data()),
      &unilib);
  tokens = multi_processor.Tokenize("zero, one, two, three, four");
  ASSERT_TRUE(multi_processor.SpanToLabel({6, 15}, tokens, &probe_label));
  EXPECT_NE(kInvalidLabel, probe_label);
  multi_processor.LabelToTokenSpan(probe_label, &label_span);
  EXPECT_EQ(1, label_span.first);
  EXPECT_EQ(0, label_span.second);

  // Spans that snap to the same pair of tokens must produce the same label.
  int variant_label;
  ASSERT_TRUE(multi_processor.SpanToLabel({6, 14}, tokens, &variant_label));
  EXPECT_EQ(probe_label, variant_label);
  ASSERT_TRUE(multi_processor.SpanToLabel({6, 13}, tokens, &variant_label));
  EXPECT_EQ(probe_label, variant_label);
  ASSERT_TRUE(multi_processor.SpanToLabel({7, 13}, tokens, &variant_label));
  EXPECT_EQ(probe_label, variant_label);
}
444
// Checks which token index internal::CenterTokenFromClick reports for click
// spans that match, lie inside, or span more than one token.
TEST(FeatureProcessorTest, CenterTokenFromClick) {
  // The same three tokens are used for every scenario.
  const std::vector<Token> tokens = {Token("Hělló", 0, 5),
                                     Token("world", 6, 11),
                                     Token("heře!", 12, 17)};

  // Exactly aligned indices.
  EXPECT_EQ(internal::CenterTokenFromClick({6, 11}, tokens), 1);

  // Click is contained in a token.
  EXPECT_EQ(internal::CenterTokenFromClick({13, 17}, tokens), 2);

  // Click spans two tokens.
  EXPECT_EQ(internal::CenterTokenFromClick({6, 17}, tokens), kInvalidIndex);
}
466
// Checks which token index internal::CenterTokenFromMiddleOfSelection reports
// for selections of different lengths and alignments.
TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
  // The same five tokens are used for every non-empty scenario.
  const std::vector<Token> tokens = {
      Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
      Token("Token4", 21, 27), Token("Token5", 28, 34)};

  // Selection of length 3. Exactly aligned indices.
  EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({7, 27}, tokens), 2);

  // Selection of length 1 token. Exactly aligned indices.
  EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({21, 27}, tokens), 3);

  // Selection marks sub-token range, with no tokens in it.
  EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({29, 33}, tokens),
            kInvalidIndex);

  // Selection of length 2. Sub-token indices.
  EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({3, 25}, tokens), 1);

  // Selection of length 1. Sub-token indices.
  EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({22, 34}, tokens), 4);

  // Some invalid ones: an empty token list.
  EXPECT_EQ(internal::CenterTokenFromMiddleOfSelection({7, 27}, {}), -1);
}
509
// Covers three related pieces of FeatureProcessor: the ratio of supported
// codepoints in a token span, the range membership test, and the
// min_supported_codepoint_ratio threshold of HasEnoughSupportedCodepoints.
TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
  FeatureProcessorOptionsT options;
  options.context_size = 2;
  options.max_selection_span = 2;
  options.snap_label_span_boundaries_to_containing_tokens = false;
  options.feature_version = 2;
  options.embedding_size = 4;

  options.bounds_sensitive_features.reset(
      new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
  auto& bounds_sensitive = *options.bounds_sensitive_features;
  bounds_sensitive.enabled = true;
  bounds_sensitive.num_tokens_before = 5;
  bounds_sensitive.num_tokens_inside_left = 3;
  bounds_sensitive.num_tokens_inside_right = 3;
  bounds_sensitive.num_tokens_after = 5;
  bounds_sensitive.include_inside_bag = true;
  bounds_sensitive.include_inside_length = true;

  // Codepoint 32 is the space character.
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& space_range = options.tokenization_codepoint_config.back();
  space_range->start = 32;
  space_range->end = 33;
  space_range->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  // Appends one supported codepoint range to the options.
  const auto add_supported_range = [&options](int start, int end) {
    options.supported_codepoint_ranges.emplace_back(
        new FeatureProcessorOptions_::CodepointRangeT());
    auto& range = options.supported_codepoint_ranges.back();
    range->start = start;
    range->end = end;
  };
  add_supported_range(0, 128);
  add_supported_range(10000, 10001);
  add_supported_range(20000, 30000);

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  CREATE_UNILIB_FOR_TESTING;
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  // Ratio of supported codepoints over the three tokens of each sentence.
  EXPECT_THAT(processor.SupportedCodepointsRatio(
                  {0, 3}, processor.Tokenize("aaa bbb ccc")),
              FloatEq(1.0));
  EXPECT_THAT(processor.SupportedCodepointsRatio(
                  {0, 3}, processor.Tokenize("aaa bbb ěěě")),
              FloatEq(2.0 / 3));
  EXPECT_THAT(processor.SupportedCodepointsRatio(
                  {0, 3}, processor.Tokenize("ěěě řřř ěěě")),
              FloatEq(0.0));

  // Range membership: the expectations below treat `end` as exclusive
  // (127 is inside [0, 128), 128 is not).
  const auto in_supported_ranges = [&processor](int codepoint) {
    return processor.IsCodepointInRanges(
        codepoint, processor.supported_codepoint_ranges_);
  };
  EXPECT_FALSE(in_supported_ranges(-1));
  EXPECT_TRUE(in_supported_ranges(0));
  EXPECT_TRUE(in_supported_ranges(10));
  EXPECT_TRUE(in_supported_ranges(127));
  EXPECT_FALSE(in_supported_ranges(128));
  EXPECT_FALSE(in_supported_ranges(9999));
  EXPECT_TRUE(in_supported_ranges(10000));
  EXPECT_FALSE(in_supported_ranges(10001));
  EXPECT_TRUE(in_supported_ranges(25000));

  // Only the last of these three tokens consists of supported codepoints.
  const std::vector<Token> tokens = {Token("ěěě", 0, 3), Token("řřř", 4, 7),
                                     Token("eee", 8, 11)};

  // Builds a processor with the given minimum ratio and asks it whether
  // `tokens` has enough supported codepoints.
  const auto has_enough_with_min_ratio = [&](double min_ratio) {
    options.min_supported_codepoint_ratio = min_ratio;
    flatbuffers::DetachedBuffer ratio_options_fb =
        PackFeatureProcessorOptions(options);
    TestingFeatureProcessor ratio_processor(
        flatbuffers::GetRoot<FeatureProcessorOptions>(ratio_options_fb.data()),
        &unilib);
    return ratio_processor.HasEnoughSupportedCodepoints(
        tokens, /*token_span=*/{0, 3});
  };
  EXPECT_TRUE(has_enough_with_min_ratio(0.0));
  EXPECT_TRUE(has_enough_with_min_ratio(0.2));
  EXPECT_FALSE(has_enough_with_min_ratio(0.5));
}
621
// With extract_selection_mask_feature enabled, every token's feature vector
// (5 floats here: embedding_size 4 plus one extra) is expected to end with a
// flag that is 1.0 for the tokens "bbb" and "ccc", which lie inside the
// selection {4, 11}, and 0.0 for every other position.
TEST(FeatureProcessorTest, InSpanFeature) {
  FeatureProcessorOptionsT options;
  options.context_size = 2;
  options.max_selection_span = 2;
  options.snap_label_span_boundaries_to_containing_tokens = false;
  options.feature_version = 2;
  options.embedding_size = 4;
  options.extract_selection_mask_feature = true;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  CREATE_UNILIB_FOR_TESTING;
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  FakeEmbeddingExecutor fake_executor;
  const std::vector<Token> tokens = {Token("aaa", 0, 3), Token("bbb", 4, 7),
                                     Token("ccc", 8, 11), Token("ddd", 12, 15)};

  std::unique_ptr<CachedFeatures> cache;
  EXPECT_TRUE(processor.ExtractFeatures(
      tokens, /*token_span=*/{0, 4},
      /*selection_span_for_feature=*/{4, 11}, &fake_executor,
      /*embedding_cache=*/nullptr, /*feature_vector_size=*/5, &cache));

  std::vector<float> click_features;
  cache->AppendClickContextFeaturesForClick(1, &click_features);
  ASSERT_EQ(click_features.size(), 25);

  // Last element of each of the five 5-float chunks.
  EXPECT_THAT(click_features[4], FloatEq(0.0));
  EXPECT_THAT(click_features[9], FloatEq(0.0));
  EXPECT_THAT(click_features[14], FloatEq(1.0));
  EXPECT_THAT(click_features[19], FloatEq(1.0));
  EXPECT_THAT(click_features[24], FloatEq(0.0));
}
658
// Verifies that ExtractFeatures reads embeddings from a pre-populated
// embedding cache where entries exist, and stores newly computed embeddings
// in that cache otherwise.
TEST(FeatureProcessorTest, EmbeddingCache) {
  FeatureProcessorOptionsT options;
  options.context_size = 2;
  options.max_selection_span = 2;
  options.snap_label_span_boundaries_to_containing_tokens = false;
  options.feature_version = 2;
  options.embedding_size = 4;

  options.bounds_sensitive_features.reset(
      new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
  auto& bounds_sensitive = *options.bounds_sensitive_features;
  bounds_sensitive.enabled = true;
  bounds_sensitive.num_tokens_before = 3;
  bounds_sensitive.num_tokens_inside_left = 2;
  bounds_sensitive.num_tokens_inside_right = 2;
  bounds_sensitive.num_tokens_after = 3;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  CREATE_UNILIB_FOR_TESTING;
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  FakeEmbeddingExecutor fake_executor;
  const std::vector<Token> tokens = {
      Token("aaa", 0, 3),   Token("bbb", 4, 7),   Token("ccc", 8, 11),
      Token("ddd", 12, 15), Token("eee", 16, 19), Token("fff", 20, 23)};

  // We pre-populate the cache with dummy embeddings, to make sure they are
  // used when populating the features vector.
  const std::vector<float> padding_embedding = {10.0, -10.0, 10.0, -10.0};
  const std::vector<float> bbb_embedding = {1.0, 2.0, 3.0, 4.0};
  const std::vector<float> ddd_embedding = {5.0, 6.0, 7.0, 8.0};
  FeatureProcessor::EmbeddingCache embedding_cache = {
      {{kInvalidIndex, kInvalidIndex}, padding_embedding},
      {{4, 7}, bbb_embedding},
      {{12, 15}, ddd_embedding},
  };

  std::unique_ptr<CachedFeatures> cache;
  EXPECT_TRUE(processor.ExtractFeatures(
      tokens, /*token_span=*/{0, 6},
      /*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},
      &fake_executor, &embedding_cache, /*feature_vector_size=*/4, &cache));

  std::vector<float> features;
  cache->AppendBoundsSensitiveFeaturesForSpan({2, 4}, &features);
  ASSERT_EQ(features.size(), 40);

  // The 40 floats are read as ten consecutive 4-float embeddings; this helper
  // returns the embedding at position `slot`.
  const auto embedding_at = [&features](int slot) {
    return Subvector(features, 4 * slot, 4 * slot + 4);
  };

  // Check that the dummy embeddings were used.
  EXPECT_THAT(embedding_at(0), ElementsAreFloat(padding_embedding));
  EXPECT_THAT(embedding_at(2), ElementsAreFloat(bbb_embedding));
  EXPECT_THAT(embedding_at(4), ElementsAreFloat(ddd_embedding));
  EXPECT_THAT(embedding_at(6), ElementsAreFloat(ddd_embedding));
  EXPECT_THAT(embedding_at(9), ElementsAreFloat(padding_embedding));

  // Check that the real embeddings were cached: three pre-populated entries
  // plus one for each of "aaa", "ccc", "eee" and "fff".
  EXPECT_EQ(embedding_cache.size(), 7);
  EXPECT_THAT(embedding_at(1), ElementsAreFloat(embedding_cache.at({0, 3})));
  EXPECT_THAT(embedding_at(3), ElementsAreFloat(embedding_cache.at({8, 11})));
  EXPECT_THAT(embedding_at(5), ElementsAreFloat(embedding_cache.at({8, 11})));
  EXPECT_THAT(embedding_at(7), ElementsAreFloat(embedding_cache.at({16, 19})));
  EXPECT_THAT(embedding_at(8), ElementsAreFloat(embedding_cache.at({20, 23})));
}
728
// internal::StripOrPadTokens with a {0, 0} relative click span and a context
// size of 2: the result is expected to be a five-token window centered on the
// click, padded with default tokens where the input runs out.
TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
  const std::vector<Token> all_tokens = {
      Token("0", 0, 0), Token("1", 0, 0),  Token("2", 0, 0),  Token("3", 0, 0),
      Token("4", 0, 0), Token("5", 0, 0),  Token("6", 0, 0),  Token("7", 0, 0),
      Token("8", 0, 0), Token("9", 0, 0),  Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Try to click first token and see if it gets padded from left.
  {
    std::vector<Token> window = all_tokens;
    int click = 0;
    internal::StripOrPadTokens({0, 0}, 2, &window, &click);
    // clang-format off
    const std::vector<Token> expected = {
        Token(),
        Token(),
        Token("0", 0, 0),
        Token("1", 0, 0),
        Token("2", 0, 0)};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click, 2);
  }

  // When we click the token at index 2 (the third token) nothing should get
  // padded.
  {
    std::vector<Token> window = all_tokens;
    int click = 2;
    internal::StripOrPadTokens({0, 0}, 2, &window, &click);
    // clang-format off
    const std::vector<Token> expected = {
        Token("0", 0, 0),
        Token("1", 0, 0),
        Token("2", 0, 0),
        Token("3", 0, 0),
        Token("4", 0, 0)};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click, 2);
  }

  // When we click the last token tokens should get padded from the right.
  {
    std::vector<Token> window = all_tokens;
    int click = 12;
    internal::StripOrPadTokens({0, 0}, 2, &window, &click);
    // clang-format off
    const std::vector<Token> expected = {
        Token("10", 0, 0),
        Token("11", 0, 0),
        Token("12", 0, 0),
        Token(),
        Token()};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click, 2);
  }
}
778
// internal::StripOrPadTokens with a non-zero relative click span and a
// context size of 2: checks the resulting token window and the adjusted
// click index at the start, in the middle and near the end of the input.
TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
  const std::vector<Token> all_tokens = {
      Token("0", 0, 0), Token("1", 0, 0),  Token("2", 0, 0),  Token("3", 0, 0),
      Token("4", 0, 0), Token("5", 0, 0),  Token("6", 0, 0),  Token("7", 0, 0),
      Token("8", 0, 0), Token("9", 0, 0),  Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Try to click first token and see if it gets padded from left to maximum
  // context_size.
  {
    std::vector<Token> window = all_tokens;
    int click = 0;
    internal::StripOrPadTokens({2, 3}, 2, &window, &click);
    // clang-format off
    const std::vector<Token> expected = {
        Token(),
        Token(),
        Token("0", 0, 0),
        Token("1", 0, 0),
        Token("2", 0, 0),
        Token("3", 0, 0),
        Token("4", 0, 0),
        Token("5", 0, 0)};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click, 2);
  }

  // Clicking to the middle with enough context should not produce any padding.
  {
    std::vector<Token> window = all_tokens;
    int click = 6;
    internal::StripOrPadTokens({3, 1}, 2, &window, &click);
    // clang-format off
    const std::vector<Token> expected = {
        Token("1", 0, 0),
        Token("2", 0, 0),
        Token("3", 0, 0),
        Token("4", 0, 0),
        Token("5", 0, 0),
        Token("6", 0, 0),
        Token("7", 0, 0),
        Token("8", 0, 0),
        Token("9", 0, 0)};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click, 5);
  }

  // Clicking at the end should pad right to maximum context_size.
  {
    std::vector<Token> window = all_tokens;
    int click = 11;
    internal::StripOrPadTokens({3, 1}, 2, &window, &click);
    // clang-format off
    const std::vector<Token> expected = {
        Token("6", 0, 0),
        Token("7", 0, 0),
        Token("8", 0, 0),
        Token("9", 0, 0),
        Token("10", 0, 0),
        Token("11", 0, 0),
        Token("12", 0, 0),
        Token(),
        Token()};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click, 5);
  }
}
840
// Checks that the tokenize_on_script_change option decides whether a string
// mixing Hangul and digits comes back as one token or is split into three.
TEST(FeatureProcessorTest, InternalTokenizeOnScriptChange) {
  CREATE_UNILIB_FOR_TESTING;

  // Input shared by both cases: Hangul, then digits, then Hangul again.
  const char* const kMixedScriptText = "앨라배마123웹사이트";

  FeatureProcessorOptionsT options;
  // One codepoint range (start 0, end 256) with the default role and
  // script_id 1.
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& range_config = options.tokenization_codepoint_config.back();
  range_config->start = 0;
  range_config->end = 256;
  range_config->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
  range_config->script_id = 1;

  // Option switched off: the whole text is expected as a single token.
  {
    options.tokenize_on_script_change = false;
    flatbuffers::DetachedBuffer packed_options =
        PackFeatureProcessorOptions(options);
    TestingFeatureProcessor processor(
        flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
        &unilib);

    const std::vector<Token> expected{Token("앨라배마123웹사이트", 0, 11)};
    EXPECT_EQ(processor.Tokenize(kMixedScriptText), expected);
  }

  // Option switched on: a token boundary is expected at each script change.
  {
    options.tokenize_on_script_change = true;
    flatbuffers::DetachedBuffer packed_options =
        PackFeatureProcessorOptions(options);
    TestingFeatureProcessor processor(
        flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()),
        &unilib);

    const std::vector<Token> expected{Token("앨라배마", 0, 4),
                                      Token("123", 4, 7),
                                      Token("웹사이트", 7, 11)};
    EXPECT_EQ(processor.Tokenize(kMixedScriptText), expected);
  }
}
874
#ifdef LIBTEXTCLASSIFIER_TEST_ICU
// With the ICU tokenization type selected, a Thai string written without
// spaces is expected to be split into five tokens.
TEST(FeatureProcessorTest, ICUTokenize) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;

  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  // NOTE(review): unlike the non-ICU tests in this file, no UniLib pointer is
  // handed to the constructor here - confirm this form is intended.
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()));

  // clang-format off
  const std::vector<Token> expected_tokens{Token("พระบาท", 0, 6),
                                           Token("สมเด็จ", 6, 12),
                                           Token("พระ", 12, 15),
                                           Token("ปร", 15, 17),
                                           Token("มิ", 17, 19)};
  // clang-format on

  const std::vector<Token> actual_tokens =
      processor.Tokenize("พระบาทสมเด็จพระปรมิ");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
#endif
894
#ifdef LIBTEXTCLASSIFIER_TEST_ICU
// With ICU tokenization and icu_preserve_whitespace_tokens enabled, each space
// between the Thai words is expected to come back as a token of its own.
TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;
  options.icu_preserve_whitespace_tokens = true;

  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  // NOTE(review): unlike the non-ICU tests in this file, no UniLib pointer is
  // handed to the constructor here - confirm this form is intended.
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()));

  // clang-format off
  const std::vector<Token> expected_tokens{Token("พระบาท", 0, 6),
                                           Token(" ", 6, 7),
                                           Token("สมเด็จ", 7, 13),
                                           Token(" ", 13, 14),
                                           Token("พระ", 14, 17),
                                           Token(" ", 17, 18),
                                           Token("ปร", 18, 20),
                                           Token(" ", 20, 21),
                                           Token("มิ", 21, 23)};
  // clang-format on

  const std::vector<Token> actual_tokens =
      processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
#endif
920
#ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Runs the MIXED tokenization type on text combining Japanese, accented Latin
// and a URL, with four codepoint ranges registered for the internal tokenizer
// and the space character configured as a whitespace separator.
TEST(FeatureProcessorTest, MixedTokenize) {
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_MIXED;

  // Codepoint 32 (ASCII space) acts as a whitespace separator.
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  auto& whitespace_config = options.tokenization_codepoint_config.back();
  whitespace_config->start = 32;
  whitespace_config->end = 33;
  whitespace_config->role =
      TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;

  // Appends one entry to internal_tokenizer_codepoint_ranges.
  const auto add_internal_tokenizer_range = [&options](int start, int end) {
    options.internal_tokenizer_codepoint_ranges.emplace_back(
        new FeatureProcessorOptions_::CodepointRangeT());
    auto& range = options.internal_tokenizer_codepoint_ranges.back();
    range->start = start;
    range->end = end;
  };
  add_internal_tokenizer_range(0, 128);
  add_internal_tokenizer_range(128, 256);
  add_internal_tokenizer_range(256, 384);
  add_internal_tokenizer_range(384, 592);

  flatbuffers::DetachedBuffer packed_options =
      PackFeatureProcessorOptions(options);
  // NOTE(review): unlike the non-ICU tests in this file, no UniLib pointer is
  // handed to the constructor here - confirm this form is intended.
  TestingFeatureProcessor processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_options.data()));

  // clang-format off
  const std::vector<Token> expected_tokens{
      Token("こんにちは", 0, 5),
      Token("Japanese-ląnguagę", 5, 22),
      Token("text", 23, 27),
      Token("世界", 28, 30),
      Token("http://www.google.com/", 31, 53)};
  // clang-format on

  const std::vector<Token> actual_tokens = processor.Tokenize(
      "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
  ASSERT_EQ(actual_tokens, expected_tokens);
}
#endif
980
// Covers two pieces of behaviour driven by ignored_span_boundary_codepoints
// (here '.', ',', '[' and ']'): counting such codepoints at either end of a
// codepoint range, and stripping them from the ends of a span.
TEST(FeatureProcessorTest, IgnoredSpanBoundaryCodepoints) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT options;
  for (const char ignored : {'.', ',', '[', ']'}) {
    options.ignored_span_boundary_codepoints.push_back(ignored);
  }

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  // Shorthands for the two counting directions.
  const auto count_leading = [&feature_processor](
                                 const UnicodeText::const_iterator& first,
                                 const UnicodeText::const_iterator& last) {
    return feature_processor.CountIgnoredSpanBoundaryCodepoints(
        first, last, /*count_from_beginning=*/true);
  };
  const auto count_trailing = [&feature_processor](
                                  const UnicodeText::const_iterator& first,
                                  const UnicodeText::const_iterator& last) {
    return feature_processor.CountIgnoredSpanBoundaryCodepoints(
        first, last, /*count_from_beginning=*/false);
  };

  // The std::string objects below stay named locals on purpose: the
  // UnicodeText values are created with do_copy=false.

  // No ignored codepoints at all.
  const std::string plain_utf8 = "ěščř";
  const UnicodeText plain = UTF8ToUnicodeText(plain_utf8, /*do_copy=*/false);
  EXPECT_EQ(count_leading(plain.begin(), plain.end()), 0);
  EXPECT_EQ(count_trailing(plain.begin(), plain.end()), 0);

  // Ignored codepoints only at the front.
  const std::string front_utf8 = ".,abčd";
  const UnicodeText front = UTF8ToUnicodeText(front_utf8, /*do_copy=*/false);
  EXPECT_EQ(count_leading(front.begin(), front.end()), 2);
  EXPECT_EQ(count_trailing(front.begin(), front.end()), 0);

  // Two ignored codepoints on each side.
  const std::string both_utf8 = ".,abčd[]";
  const UnicodeText both = UTF8ToUnicodeText(both_utf8, /*do_copy=*/false);
  EXPECT_EQ(count_leading(both.begin(), both.end()), 2);
  EXPECT_EQ(count_trailing(both.begin(), both.end()), 2);

  // One ignored codepoint on each side.
  const std::string bracketed_utf8 = "[abčd]";
  const UnicodeText bracketed =
      UTF8ToUnicodeText(bracketed_utf8, /*do_copy=*/false);
  EXPECT_EQ(count_leading(bracketed.begin(), bracketed.end()), 1);
  EXPECT_EQ(count_trailing(bracketed.begin(), bracketed.end()), 1);

  // Empty text.
  const std::string empty_utf8 = "";
  const UnicodeText empty = UTF8ToUnicodeText(empty_utf8, /*do_copy=*/false);
  EXPECT_EQ(count_leading(empty.begin(), empty.end()), 0);
  EXPECT_EQ(count_trailing(empty.begin(), empty.end()), 0);

  // Range that begins six codepoints into the text, with nothing to ignore.
  const std::string offset_utf8 = "012345ěščř";
  const UnicodeText offset = UTF8ToUnicodeText(offset_utf8, /*do_copy=*/false);
  UnicodeText::const_iterator offset_begin = offset.begin();
  std::advance(offset_begin, 6);
  EXPECT_EQ(count_leading(offset_begin, offset.end()), 0);
  EXPECT_EQ(count_trailing(offset_begin, offset.end()), 0);

  // Sub-ranges whose boundary touches the ".," in the middle of the text:
  // first a range starting at it, then a range ending right after it.
  const std::string inner_utf8 = "012345.,ěščř";
  const UnicodeText inner = UTF8ToUnicodeText(inner_utf8, /*do_copy=*/false);
  UnicodeText::const_iterator inner_begin = inner.begin();
  std::advance(inner_begin, 6);
  EXPECT_EQ(count_leading(inner_begin, inner.end()), 2);
  UnicodeText::const_iterator inner_end = inner.begin();
  std::advance(inner_end, 8);
  EXPECT_EQ(count_trailing(inner.begin(), inner_end), 2);

  // Span with no ignored codepoints at its ends: returned unchanged.
  EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
                "Hello [[[Wořld]] or not?", {0, 24}),
            std::make_pair(0, 24));
  // Span wrapped in brackets: both ends move inwards.
  EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
                "Hello [[[Wořld]] or not?", {6, 16}),
            std::make_pair(9, 14));
  // Span made up of ignored codepoints only: an empty span at its start.
  EXPECT_EQ(
      feature_processor.StripBoundaryCodepoints("Hello [[[]] or not?", {6, 11}),
      std::make_pair(6, 6));
  // Empty text with an empty span.
  EXPECT_EQ(feature_processor.StripBoundaryCodepoints("", {0, 0}),
            std::make_pair(0, 0));
}
1093
// Maps codepoint spans onto token spans for three tokens, both with the
// default two-argument call and with the third argument set to true (which,
// per the expectations below, snaps the span to the tokens containing it).
TEST(FeatureProcessorTest, CodepointSpanToTokenSpan) {
  const std::vector<Token> tokens{Token("Hělló", 0, 5),
                                  Token("fěěbař@google.com", 6, 23),
                                  Token("heře!", 24, 29)};
  // Value passed as the optional third argument in the snapping cases.
  const bool kSnapToContainingTokens = true;

  // Spans whose ends coincide with token boundaries: the same token span is
  // expected with and without snapping.
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {0, 5}), TokenSpan(0, 1));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {0, 5}, kSnapToContainingTokens),
            TokenSpan(0, 1));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {6, 23}), TokenSpan(1, 2));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {6, 23}, kSnapToContainingTokens),
            TokenSpan(1, 2));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {24, 29}), TokenSpan(2, 3));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {24, 29}, kSnapToContainingTokens),
            TokenSpan(2, 3));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {0, 23}), TokenSpan(0, 2));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {0, 23}, kSnapToContainingTokens),
            TokenSpan(0, 2));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {6, 29}), TokenSpan(1, 3));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {6, 29}, kSnapToContainingTokens),
            TokenSpan(1, 3));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {0, 29}), TokenSpan(0, 3));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {0, 29}, kSnapToContainingTokens),
            TokenSpan(0, 3));

  // Span that starts and ends inside tokens: only the fully covered token is
  // expected by default, all three tokens when snapping.
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {1, 28}), TokenSpan(1, 2));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {1, 28}, kSnapToContainingTokens),
            TokenSpan(0, 3));

  // Span that merely touches the neighbouring tokens without overlapping
  // them: the neighbours are not expected in the result either way.
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {5, 24}), TokenSpan(1, 2));
  EXPECT_EQ(CodepointSpanToTokenSpan(tokens, {5, 24}, kSnapToContainingTokens),
            TokenSpan(1, 2));
}
1123
1124 } // namespace
1125 } // namespace libtextclassifier2
1126