• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "feature-processor.h"
18 
19 #include "model-executor.h"
20 #include "tensor-view.h"
21 
22 #include "gmock/gmock.h"
23 #include "gtest/gtest.h"
24 
25 namespace libtextclassifier2 {
26 namespace {
27 
28 using testing::ElementsAreArray;
29 using testing::FloatEq;
30 using testing::Matcher;
31 
PackFeatureProcessorOptions(const FeatureProcessorOptionsT & options)32 flatbuffers::DetachedBuffer PackFeatureProcessorOptions(
33     const FeatureProcessorOptionsT& options) {
34   flatbuffers::FlatBufferBuilder builder;
35   builder.Finish(CreateFeatureProcessorOptions(builder, &options));
36   return builder.Release();
37 }
38 
// Returns a copy of the elements of `vector` at positions [start, end).
// No bounds checking is performed; callers must pass offsets that lie within
// the vector.
template <typename T>
std::vector<T> Subvector(const std::vector<T>& vector, int start, int end) {
  const auto first = vector.begin() + start;
  const auto last = vector.begin() + end;
  std::vector<T> slice(first, last);
  return slice;
}
43 
ElementsAreFloat(const std::vector<float> & values)44 Matcher<std::vector<float>> ElementsAreFloat(const std::vector<float>& values) {
45   std::vector<Matcher<float>> matchers;
46   for (const float value : values) {
47     matchers.push_back(FloatEq(value));
48   }
49   return ElementsAreArray(matchers);
50 }
51 
52 class TestingFeatureProcessor : public FeatureProcessor {
53  public:
54   using FeatureProcessor::CountIgnoredSpanBoundaryCodepoints;
55   using FeatureProcessor::FeatureProcessor;
56   using FeatureProcessor::ICUTokenize;
57   using FeatureProcessor::IsCodepointInRanges;
58   using FeatureProcessor::SpanToLabel;
59   using FeatureProcessor::StripTokensFromOtherLines;
60   using FeatureProcessor::supported_codepoint_ranges_;
61   using FeatureProcessor::SupportedCodepointsRatio;
62 };
63 
64 // EmbeddingExecutor that always returns features based on
65 class FakeEmbeddingExecutor : public EmbeddingExecutor {
66  public:
AddEmbedding(const TensorView<int> & sparse_features,float * dest,int dest_size) const67   bool AddEmbedding(const TensorView<int>& sparse_features, float* dest,
68                     int dest_size) const override {
69     TC_CHECK_GE(dest_size, 4);
70     EXPECT_EQ(sparse_features.size(), 1);
71     dest[0] = sparse_features.data()[0];
72     dest[1] = sparse_features.data()[0];
73     dest[2] = -sparse_features.data()[0];
74     dest[3] = -sparse_features.data()[0];
75     return true;
76   }
77 
78  private:
79   std::vector<float> storage_;
80 };
81 
// A selection lying strictly inside the middle token is expected to cut that
// token at both selection boundaries and leave the neighbours untouched.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
  std::vector<Token> actual = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };
  const std::vector<Token> expected = {
      Token("Hělló", 0, 5),
      Token("fěě", 6, 9),
      Token("bař", 9, 12),
      Token("@google.com", 12, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries({9, 12}, &actual);

  EXPECT_THAT(actual, ElementsAreArray(expected));
}
98 
// A selection that starts exactly at the start of the middle token is expected
// to produce a single cut, at the selection end.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
  std::vector<Token> actual = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };
  const std::vector<Token> expected = {
      Token("Hělló", 0, 5),
      Token("fěěbař", 6, 12),
      Token("@google.com", 12, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries({6, 12}, &actual);

  EXPECT_THAT(actual, ElementsAreArray(expected));
}
114 
// A selection that ends exactly at the end of the middle token is expected to
// produce a single cut, at the selection start.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
  std::vector<Token> actual = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };
  const std::vector<Token> expected = {
      Token("Hělló", 0, 5),
      Token("fěě", 6, 9),
      Token("bař@google.com", 9, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries({9, 23}, &actual);

  EXPECT_THAT(actual, ElementsAreArray(expected));
}
130 
// A selection that coincides with a whole token is expected to leave the token
// list unchanged.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
  std::vector<Token> actual = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };
  const std::vector<Token> expected = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries({6, 23}, &actual);

  EXPECT_THAT(actual, ElementsAreArray(expected));
}
145 
// A selection that starts inside the first token and ends inside the second
// one is expected to split each of those two tokens once.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
  std::vector<Token> actual = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29),
  };
  const std::vector<Token> expected = {
      Token("Hě", 0, 2),
      Token("lló", 2, 5),
      Token("fěě", 6, 9),
      Token("bař@google.com", 9, 23),
      Token("heře!", 24, 29),
  };

  internal::SplitTokensOnSelectionBoundaries({2, 9}, &actual);

  EXPECT_THAT(actual, ElementsAreArray(expected));
}
162 
// With only_use_line_with_click enabled, a click on the first line is expected
// to leave only the first line's tokens.
TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed = PackFeatureProcessorOptions(options);
  const auto* fb_options =
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed.data());
  TestingFeatureProcessor processor(fb_options, &unilib);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {0, 5};
  std::vector<Token> line_tokens = {
      Token("Fiřst", 0, 5),   Token("Lině", 6, 10),
      Token("Sěcond", 11, 17), Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),  Token("Lině", 29, 33),
  };
  const std::vector<Token> expected = {
      Token("Fiřst", 0, 5),
      Token("Lině", 6, 10),
  };

  // Only the first line should survive.
  processor.StripTokensFromOtherLines(text, click, &line_tokens);
  EXPECT_THAT(line_tokens, ElementsAreArray(expected));
}
188 
// With only_use_line_with_click enabled, a click on the second line is
// expected to leave only the second line's tokens.
TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed = PackFeatureProcessorOptions(options);
  const auto* fb_options =
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed.data());
  TestingFeatureProcessor processor(fb_options, &unilib);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {18, 22};
  std::vector<Token> line_tokens = {
      Token("Fiřst", 0, 5),   Token("Lině", 6, 10),
      Token("Sěcond", 11, 17), Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),  Token("Lině", 29, 33),
  };
  const std::vector<Token> expected = {
      Token("Sěcond", 11, 17),
      Token("Lině", 18, 22),
  };

  // Only the second line should survive.
  processor.StripTokensFromOtherLines(text, click, &line_tokens);
  EXPECT_THAT(line_tokens, ElementsAreArray(expected));
}
214 
// With only_use_line_with_click enabled, a click on the third line is expected
// to leave only the third line's tokens.
TEST(FeatureProcessorTest, KeepLineWithClickThird) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed = PackFeatureProcessorOptions(options);
  const auto* fb_options =
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed.data());
  TestingFeatureProcessor processor(fb_options, &unilib);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {24, 33};
  std::vector<Token> line_tokens = {
      Token("Fiřst", 0, 5),   Token("Lině", 6, 10),
      Token("Sěcond", 11, 17), Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),  Token("Lině", 29, 33),
  };
  const std::vector<Token> expected = {
      Token("Thiřd", 23, 28),
      Token("Lině", 29, 33),
  };

  // Only the third line should survive.
  processor.StripTokensFromOtherLines(text, click, &line_tokens);
  EXPECT_THAT(line_tokens, ElementsAreArray(expected));
}
240 
// Same as the second-line case, except that the first two "lines" of the
// context are separated by '|' instead of '\n'. The expectation below is that
// the '|' still acts as a line boundary.
TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed = PackFeatureProcessorOptions(options);
  const auto* fb_options =
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed.data());
  TestingFeatureProcessor processor(fb_options, &unilib);

  const std::string text = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  const CodepointSpan click = {18, 22};
  std::vector<Token> line_tokens = {
      Token("Fiřst", 0, 5),   Token("Lině", 6, 10),
      Token("Sěcond", 11, 17), Token("Lině", 18, 22),
      Token("Thiřd", 23, 28),  Token("Lině", 29, 33),
  };
  const std::vector<Token> expected = {
      Token("Sěcond", 11, 17),
      Token("Lině", 18, 22),
  };

  // Only the tokens between the '|' and the '\n' should survive.
  processor.StripTokensFromOtherLines(text, click, &line_tokens);
  EXPECT_THAT(line_tokens, ElementsAreArray(expected));
}
266 
// When the click span itself crosses line boundaries, the expectation is that
// no tokens are stripped at all.
TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT options;
  options.only_use_line_with_click = true;
  flatbuffers::DetachedBuffer packed = PackFeatureProcessorOptions(options);
  const auto* fb_options =
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed.data());
  TestingFeatureProcessor processor(fb_options, &unilib);

  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click = {5, 23};
  std::vector<Token> line_tokens = {
      Token("Fiřst", 0, 5),   Token("Lině", 6, 10),
      Token("Sěcond", 18, 23), Token("Lině", 19, 23),
      Token("Thiřd", 23, 28),  Token("Lině", 29, 33),
  };
  const std::vector<Token> expected = {
      Token("Fiřst", 0, 5),   Token("Lině", 6, 10),
      Token("Sěcond", 18, 23), Token("Lině", 19, 23),
      Token("Thiřd", 23, 28),  Token("Lině", 29, 33),
  };

  // Every token should be kept.
  processor.StripTokensFromOtherLines(text, click, &line_tokens);
  EXPECT_THAT(line_tokens, ElementsAreArray(expected));
}
294 
TEST(FeatureProcessorTest,SpanToLabel)295 TEST(FeatureProcessorTest, SpanToLabel) {
296   CREATE_UNILIB_FOR_TESTING;
297   FeatureProcessorOptionsT options;
298   options.context_size = 1;
299   options.max_selection_span = 1;
300   options.snap_label_span_boundaries_to_containing_tokens = false;
301 
302   options.tokenization_codepoint_config.emplace_back(
303       new TokenizationCodepointRangeT());
304   auto& config = options.tokenization_codepoint_config.back();
305   config->start = 32;
306   config->end = 33;
307   config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
308 
309   flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
310   TestingFeatureProcessor feature_processor(
311       flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
312       &unilib);
313   std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
314   ASSERT_EQ(3, tokens.size());
315   int label;
316   ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
317   EXPECT_EQ(kInvalidLabel, label);
318   ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
319   EXPECT_NE(kInvalidLabel, label);
320   TokenSpan token_span;
321   feature_processor.LabelToTokenSpan(label, &token_span);
322   EXPECT_EQ(0, token_span.first);
323   EXPECT_EQ(0, token_span.second);
324 
325   // Reconfigure with snapping enabled.
326   options.snap_label_span_boundaries_to_containing_tokens = true;
327   flatbuffers::DetachedBuffer options2_fb =
328       PackFeatureProcessorOptions(options);
329   TestingFeatureProcessor feature_processor2(
330       flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
331       &unilib);
332   int label2;
333   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
334   EXPECT_EQ(label, label2);
335   ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
336   EXPECT_EQ(label, label2);
337   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
338   EXPECT_EQ(label, label2);
339 
340   // Cross a token boundary.
341   ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
342   EXPECT_EQ(kInvalidLabel, label2);
343   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
344   EXPECT_EQ(kInvalidLabel, label2);
345 
346   // Multiple tokens.
347   options.context_size = 2;
348   options.max_selection_span = 2;
349   flatbuffers::DetachedBuffer options3_fb =
350       PackFeatureProcessorOptions(options);
351   TestingFeatureProcessor feature_processor3(
352       flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
353       &unilib);
354   tokens = feature_processor3.Tokenize("zero, one, two, three, four");
355   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
356   EXPECT_NE(kInvalidLabel, label2);
357   feature_processor3.LabelToTokenSpan(label2, &token_span);
358   EXPECT_EQ(1, token_span.first);
359   EXPECT_EQ(0, token_span.second);
360 
361   int label3;
362   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
363   EXPECT_EQ(label2, label3);
364   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
365   EXPECT_EQ(label2, label3);
366   ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
367   EXPECT_EQ(label2, label3);
368 }
369 
TEST(FeatureProcessorTest,SpanToLabelIgnoresPunctuation)370 TEST(FeatureProcessorTest, SpanToLabelIgnoresPunctuation) {
371   CREATE_UNILIB_FOR_TESTING;
372   FeatureProcessorOptionsT options;
373   options.context_size = 1;
374   options.max_selection_span = 1;
375   options.snap_label_span_boundaries_to_containing_tokens = false;
376 
377   options.tokenization_codepoint_config.emplace_back(
378       new TokenizationCodepointRangeT());
379   auto& config = options.tokenization_codepoint_config.back();
380   config->start = 32;
381   config->end = 33;
382   config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
383 
384   flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
385   TestingFeatureProcessor feature_processor(
386       flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
387       &unilib);
388   std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
389   ASSERT_EQ(3, tokens.size());
390   int label;
391   ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
392   EXPECT_EQ(kInvalidLabel, label);
393   ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
394   EXPECT_NE(kInvalidLabel, label);
395   TokenSpan token_span;
396   feature_processor.LabelToTokenSpan(label, &token_span);
397   EXPECT_EQ(0, token_span.first);
398   EXPECT_EQ(0, token_span.second);
399 
400   // Reconfigure with snapping enabled.
401   options.snap_label_span_boundaries_to_containing_tokens = true;
402   flatbuffers::DetachedBuffer options2_fb =
403       PackFeatureProcessorOptions(options);
404   TestingFeatureProcessor feature_processor2(
405       flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
406       &unilib);
407   int label2;
408   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
409   EXPECT_EQ(label, label2);
410   ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
411   EXPECT_EQ(label, label2);
412   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
413   EXPECT_EQ(label, label2);
414 
415   // Cross a token boundary.
416   ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
417   EXPECT_EQ(kInvalidLabel, label2);
418   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
419   EXPECT_EQ(kInvalidLabel, label2);
420 
421   // Multiple tokens.
422   options.context_size = 2;
423   options.max_selection_span = 2;
424   flatbuffers::DetachedBuffer options3_fb =
425       PackFeatureProcessorOptions(options);
426   TestingFeatureProcessor feature_processor3(
427       flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
428       &unilib);
429   tokens = feature_processor3.Tokenize("zero, one, two, three, four");
430   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
431   EXPECT_NE(kInvalidLabel, label2);
432   feature_processor3.LabelToTokenSpan(label2, &token_span);
433   EXPECT_EQ(1, token_span.first);
434   EXPECT_EQ(0, token_span.second);
435 
436   int label3;
437   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
438   EXPECT_EQ(label2, label3);
439   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
440   EXPECT_EQ(label2, label3);
441   ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
442   EXPECT_EQ(label2, label3);
443 }
444 
// Checks which token index internal::CenterTokenFromClick reports for clicks
// that match a token exactly, fall inside a token, or cover several tokens.
TEST(FeatureProcessorTest, CenterTokenFromClick) {
  // Click {6, 11} coincides with the second token.
  const int exact_match = internal::CenterTokenFromClick(
      {6, 11},
      {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
  EXPECT_EQ(exact_match, 1);

  // Click {13, 17} lies within the third token.
  const int inside_token = internal::CenterTokenFromClick(
      {13, 17},
      {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
  EXPECT_EQ(inside_token, 2);

  // Click {6, 17} covers the second and third tokens; no single center token
  // is expected.
  const int across_tokens = internal::CenterTokenFromClick(
      {6, 17},
      {Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)});
  EXPECT_EQ(across_tokens, kInvalidIndex);
}
466 
// Checks which token index internal::CenterTokenFromMiddleOfSelection reports
// for selections that are aligned or unaligned with token boundaries, and for
// an empty token list.
TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
  // {7, 27} runs from the start of Token2 to the end of Token4.
  const int aligned_three = internal::CenterTokenFromMiddleOfSelection(
      {7, 27},
      {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
       Token("Token4", 21, 27), Token("Token5", 28, 34)});
  EXPECT_EQ(aligned_three, 2);

  // {21, 27} is exactly Token4.
  const int aligned_one = internal::CenterTokenFromMiddleOfSelection(
      {21, 27},
      {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
       Token("Token4", 21, 27), Token("Token5", 28, 34)});
  EXPECT_EQ(aligned_one, 3);

  // {29, 33} lies strictly inside Token5 and contains no whole token.
  const int inside_only = internal::CenterTokenFromMiddleOfSelection(
      {29, 33},
      {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
       Token("Token4", 21, 27), Token("Token5", 28, 34)});
  EXPECT_EQ(inside_only, kInvalidIndex);

  // {3, 25} starts inside Token1 and ends inside Token4.
  const int unaligned_both = internal::CenterTokenFromMiddleOfSelection(
      {3, 25},
      {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
       Token("Token4", 21, 27), Token("Token5", 28, 34)});
  EXPECT_EQ(unaligned_both, 1);

  // {22, 34} starts inside Token4 and ends at the end of Token5.
  const int unaligned_start = internal::CenterTokenFromMiddleOfSelection(
      {22, 34},
      {Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
       Token("Token4", 21, 27), Token("Token5", 28, 34)});
  EXPECT_EQ(unaligned_start, 4);

  // Empty token list. NOTE(review): the literal -1 is presumably the value of
  // kInvalidIndex used above — confirm.
  const int no_tokens = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
  EXPECT_EQ(no_tokens, -1);
}
509 
TEST(FeatureProcessorTest,SupportedCodepointsRatio)510 TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
511   FeatureProcessorOptionsT options;
512   options.context_size = 2;
513   options.max_selection_span = 2;
514   options.snap_label_span_boundaries_to_containing_tokens = false;
515   options.feature_version = 2;
516   options.embedding_size = 4;
517   options.bounds_sensitive_features.reset(
518       new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
519   options.bounds_sensitive_features->enabled = true;
520   options.bounds_sensitive_features->num_tokens_before = 5;
521   options.bounds_sensitive_features->num_tokens_inside_left = 3;
522   options.bounds_sensitive_features->num_tokens_inside_right = 3;
523   options.bounds_sensitive_features->num_tokens_after = 5;
524   options.bounds_sensitive_features->include_inside_bag = true;
525   options.bounds_sensitive_features->include_inside_length = true;
526 
527   options.tokenization_codepoint_config.emplace_back(
528       new TokenizationCodepointRangeT());
529   auto& config = options.tokenization_codepoint_config.back();
530   config->start = 32;
531   config->end = 33;
532   config->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
533 
534   {
535     options.supported_codepoint_ranges.emplace_back(
536         new FeatureProcessorOptions_::CodepointRangeT());
537     auto& range = options.supported_codepoint_ranges.back();
538     range->start = 0;
539     range->end = 128;
540   }
541 
542   {
543     options.supported_codepoint_ranges.emplace_back(
544         new FeatureProcessorOptions_::CodepointRangeT());
545     auto& range = options.supported_codepoint_ranges.back();
546     range->start = 10000;
547     range->end = 10001;
548   }
549 
550   {
551     options.supported_codepoint_ranges.emplace_back(
552         new FeatureProcessorOptions_::CodepointRangeT());
553     auto& range = options.supported_codepoint_ranges.back();
554     range->start = 20000;
555     range->end = 30000;
556   }
557 
558   flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
559   CREATE_UNILIB_FOR_TESTING;
560   TestingFeatureProcessor feature_processor(
561       flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
562       &unilib);
563   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
564                   {0, 3}, feature_processor.Tokenize("aaa bbb ccc")),
565               FloatEq(1.0));
566   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
567                   {0, 3}, feature_processor.Tokenize("aaa bbb ěěě")),
568               FloatEq(2.0 / 3));
569   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
570                   {0, 3}, feature_processor.Tokenize("ěěě řřř ěěě")),
571               FloatEq(0.0));
572   EXPECT_FALSE(feature_processor.IsCodepointInRanges(
573       -1, feature_processor.supported_codepoint_ranges_));
574   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
575       0, feature_processor.supported_codepoint_ranges_));
576   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
577       10, feature_processor.supported_codepoint_ranges_));
578   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
579       127, feature_processor.supported_codepoint_ranges_));
580   EXPECT_FALSE(feature_processor.IsCodepointInRanges(
581       128, feature_processor.supported_codepoint_ranges_));
582   EXPECT_FALSE(feature_processor.IsCodepointInRanges(
583       9999, feature_processor.supported_codepoint_ranges_));
584   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
585       10000, feature_processor.supported_codepoint_ranges_));
586   EXPECT_FALSE(feature_processor.IsCodepointInRanges(
587       10001, feature_processor.supported_codepoint_ranges_));
588   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
589       25000, feature_processor.supported_codepoint_ranges_));
590 
591   const std::vector<Token> tokens = {Token("ěěě", 0, 3), Token("řřř", 4, 7),
592                                      Token("eee", 8, 11)};
593 
594   options.min_supported_codepoint_ratio = 0.0;
595   flatbuffers::DetachedBuffer options2_fb =
596       PackFeatureProcessorOptions(options);
597   TestingFeatureProcessor feature_processor2(
598       flatbuffers::GetRoot<FeatureProcessorOptions>(options2_fb.data()),
599       &unilib);
600   EXPECT_TRUE(feature_processor2.HasEnoughSupportedCodepoints(
601       tokens, /*token_span=*/{0, 3}));
602 
603   options.min_supported_codepoint_ratio = 0.2;
604   flatbuffers::DetachedBuffer options3_fb =
605       PackFeatureProcessorOptions(options);
606   TestingFeatureProcessor feature_processor3(
607       flatbuffers::GetRoot<FeatureProcessorOptions>(options3_fb.data()),
608       &unilib);
609   EXPECT_TRUE(feature_processor3.HasEnoughSupportedCodepoints(
610       tokens, /*token_span=*/{0, 3}));
611 
612   options.min_supported_codepoint_ratio = 0.5;
613   flatbuffers::DetachedBuffer options4_fb =
614       PackFeatureProcessorOptions(options);
615   TestingFeatureProcessor feature_processor4(
616       flatbuffers::GetRoot<FeatureProcessorOptions>(options4_fb.data()),
617       &unilib);
618   EXPECT_FALSE(feature_processor4.HasEnoughSupportedCodepoints(
619       tokens, /*token_span=*/{0, 3}));
620 }
621 
// Verifies the extra per-token feature produced when
// extract_selection_mask_feature is enabled: with feature_vector_size 5 the
// last entry of each 5-wide token slot is expected to be 1.0 for the tokens
// covered by the selection span and 0.0 for the others.
TEST(FeatureProcessorTest, InSpanFeature) {
  FeatureProcessorOptionsT options;
  options.context_size = 2;
  options.max_selection_span = 2;
  options.snap_label_span_boundaries_to_containing_tokens = false;
  options.feature_version = 2;
  options.embedding_size = 4;
  options.extract_selection_mask_feature = true;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  CREATE_UNILIB_FOR_TESTING;
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);

  std::unique_ptr<CachedFeatures> cached_features;

  FakeEmbeddingExecutor embedding_executor;

  const std::vector<Token> tokens = {Token("aaa", 0, 3), Token("bbb", 4, 7),
                                     Token("ccc", 8, 11), Token("ddd", 12, 15)};

  // Extraction must succeed before cached_features may be dereferenced, so a
  // failure here has to abort the test instead of merely being recorded.
  ASSERT_TRUE(feature_processor.ExtractFeatures(
      tokens, /*token_span=*/{0, 4},
      /*selection_span_for_feature=*/{4, 11}, &embedding_executor,
      /*embedding_cache=*/nullptr, /*feature_vector_size=*/5,
      &cached_features));
  ASSERT_TRUE(cached_features != nullptr);

  std::vector<float> features;
  cached_features->AppendClickContextFeaturesForClick(1, &features);
  ASSERT_EQ(features.size(), 25);
  // Indices 4, 9, 14, 19, 24 are the last entry of each of the five 5-wide
  // slots; the selection span {4, 11} covers the "bbb" and "ccc" tokens.
  EXPECT_THAT(features[4], FloatEq(0.0));
  EXPECT_THAT(features[9], FloatEq(0.0));
  EXPECT_THAT(features[14], FloatEq(1.0));
  EXPECT_THAT(features[19], FloatEq(1.0));
  EXPECT_THAT(features[24], FloatEq(0.0));
}
658 
TEST(FeatureProcessorTest,EmbeddingCache)659 TEST(FeatureProcessorTest, EmbeddingCache) {
660   FeatureProcessorOptionsT options;
661   options.context_size = 2;
662   options.max_selection_span = 2;
663   options.snap_label_span_boundaries_to_containing_tokens = false;
664   options.feature_version = 2;
665   options.embedding_size = 4;
666   options.bounds_sensitive_features.reset(
667       new FeatureProcessorOptions_::BoundsSensitiveFeaturesT());
668   options.bounds_sensitive_features->enabled = true;
669   options.bounds_sensitive_features->num_tokens_before = 3;
670   options.bounds_sensitive_features->num_tokens_inside_left = 2;
671   options.bounds_sensitive_features->num_tokens_inside_right = 2;
672   options.bounds_sensitive_features->num_tokens_after = 3;
673 
674   flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
675   CREATE_UNILIB_FOR_TESTING;
676   TestingFeatureProcessor feature_processor(
677       flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
678       &unilib);
679 
680   std::unique_ptr<CachedFeatures> cached_features;
681 
682   FakeEmbeddingExecutor embedding_executor;
683 
684   const std::vector<Token> tokens = {
685       Token("aaa", 0, 3),   Token("bbb", 4, 7),   Token("ccc", 8, 11),
686       Token("ddd", 12, 15), Token("eee", 16, 19), Token("fff", 20, 23)};
687 
688   // We pre-populate the cache with dummy embeddings, to make sure they are
689   // used when populating the features vector.
690   const std::vector<float> cached_padding_features = {10.0, -10.0, 10.0, -10.0};
691   const std::vector<float> cached_features1 = {1.0, 2.0, 3.0, 4.0};
692   const std::vector<float> cached_features2 = {5.0, 6.0, 7.0, 8.0};
693   FeatureProcessor::EmbeddingCache embedding_cache = {
694       {{kInvalidIndex, kInvalidIndex}, cached_padding_features},
695       {{4, 7}, cached_features1},
696       {{12, 15}, cached_features2},
697   };
698 
699   EXPECT_TRUE(feature_processor.ExtractFeatures(
700       tokens, /*token_span=*/{0, 6},
701       /*selection_span_for_feature=*/{kInvalidIndex, kInvalidIndex},
702       &embedding_executor, &embedding_cache, /*feature_vector_size=*/4,
703       &cached_features));
704   std::vector<float> features;
705   cached_features->AppendBoundsSensitiveFeaturesForSpan({2, 4}, &features);
706   ASSERT_EQ(features.size(), 40);
707   // Check that the dummy embeddings were used.
708   EXPECT_THAT(Subvector(features, 0, 4),
709               ElementsAreFloat(cached_padding_features));
710   EXPECT_THAT(Subvector(features, 8, 12), ElementsAreFloat(cached_features1));
711   EXPECT_THAT(Subvector(features, 16, 20), ElementsAreFloat(cached_features2));
712   EXPECT_THAT(Subvector(features, 24, 28), ElementsAreFloat(cached_features2));
713   EXPECT_THAT(Subvector(features, 36, 40),
714               ElementsAreFloat(cached_padding_features));
715   // Check that the real embeddings were cached.
716   EXPECT_EQ(embedding_cache.size(), 7);
717   EXPECT_THAT(Subvector(features, 4, 8),
718               ElementsAreFloat(embedding_cache.at({0, 3})));
719   EXPECT_THAT(Subvector(features, 12, 16),
720               ElementsAreFloat(embedding_cache.at({8, 11})));
721   EXPECT_THAT(Subvector(features, 20, 24),
722               ElementsAreFloat(embedding_cache.at({8, 11})));
723   EXPECT_THAT(Subvector(features, 28, 32),
724               ElementsAreFloat(embedding_cache.at({16, 19})));
725   EXPECT_THAT(Subvector(features, 32, 36),
726               ElementsAreFloat(embedding_cache.at({20, 23})));
727 }
728 
// Exercises internal::StripOrPadTokens() with a {0, 0} relative click span and
// a context size of 2. In each scenario below the result holds five tokens
// with the click index at 2; default-constructed Token() values fill in where
// the input has no tokens on one side of the click.
TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
  const std::vector<Token> all_tokens = {
      Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0),  Token("3", 0, 0),
      Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0),  Token("7", 0, 0),
      Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Click on the very first token: two padding tokens appear on the left.
  {
    std::vector<Token> window = all_tokens;
    int click_pos = 0;
    internal::StripOrPadTokens({0, 0}, 2, &window, &click_pos);
    // clang-format off
    const std::vector<Token> expected = {Token(),
                                         Token(),
                                         Token("0", 0, 0),
                                         Token("1", 0, 0),
                                         Token("2", 0, 0)};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click_pos, 2);
  }

  // Click on index 2 (the third token): there is enough context on both
  // sides, so no padding is added.
  {
    std::vector<Token> window = all_tokens;
    int click_pos = 2;
    internal::StripOrPadTokens({0, 0}, 2, &window, &click_pos);
    // clang-format off
    const std::vector<Token> expected = {Token("0", 0, 0),
                                         Token("1", 0, 0),
                                         Token("2", 0, 0),
                                         Token("3", 0, 0),
                                         Token("4", 0, 0)};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click_pos, 2);
  }

  // Click on the last token: two padding tokens appear on the right.
  {
    std::vector<Token> window = all_tokens;
    int click_pos = 12;
    internal::StripOrPadTokens({0, 0}, 2, &window, &click_pos);
    // clang-format off
    const std::vector<Token> expected = {Token("10", 0, 0),
                                         Token("11", 0, 0),
                                         Token("12", 0, 0),
                                         Token(),
                                         Token()};
    // clang-format on
    EXPECT_EQ(window, expected);
    EXPECT_EQ(click_pos, 2);
  }
}
778 
TEST(FeatureProcessorTest,StripUnusedTokensWithRelativeClick)779 TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
780   std::vector<Token> tokens_orig{
781       Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0),  Token("3", 0, 0),
782       Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0),  Token("7", 0, 0),
783       Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
784       Token("12", 0, 0)};
785 
786   std::vector<Token> tokens;
787   int click_index;
788 
789   // Try to click first token and see if it gets padded from left to maximum
790   // context_size.
791   tokens = tokens_orig;
792   click_index = 0;
793   internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);
794   // clang-format off
795   EXPECT_EQ(tokens, std::vector<Token>({Token(),
796                                         Token(),
797                                         Token("0", 0, 0),
798                                         Token("1", 0, 0),
799                                         Token("2", 0, 0),
800                                         Token("3", 0, 0),
801                                         Token("4", 0, 0),
802                                         Token("5", 0, 0)}));
803   // clang-format on
804   EXPECT_EQ(click_index, 2);
805 
806   // Clicking to the middle with enough context should not produce any padding.
807   tokens = tokens_orig;
808   click_index = 6;
809   internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
810   // clang-format off
811   EXPECT_EQ(tokens, std::vector<Token>({Token("1", 0, 0),
812                                         Token("2", 0, 0),
813                                         Token("3", 0, 0),
814                                         Token("4", 0, 0),
815                                         Token("5", 0, 0),
816                                         Token("6", 0, 0),
817                                         Token("7", 0, 0),
818                                         Token("8", 0, 0),
819                                         Token("9", 0, 0)}));
820   // clang-format on
821   EXPECT_EQ(click_index, 5);
822 
823   // Clicking at the end should pad right to maximum context_size.
824   tokens = tokens_orig;
825   click_index = 11;
826   internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
827   // clang-format off
828   EXPECT_EQ(tokens, std::vector<Token>({Token("6", 0, 0),
829                                         Token("7", 0, 0),
830                                         Token("8", 0, 0),
831                                         Token("9", 0, 0),
832                                         Token("10", 0, 0),
833                                         Token("11", 0, 0),
834                                         Token("12", 0, 0),
835                                         Token(),
836                                         Token()}));
837   // clang-format on
838   EXPECT_EQ(click_index, 5);
839 }
840 
// Checks the effect of the tokenize_on_script_change option: the same mixed
// Hangul/digit input is tokenized once with the option off (one token) and
// once with it on (split where the configured script id changes).
TEST(FeatureProcessorTest, InternalTokenizeOnScriptChange) {
  CREATE_UNILIB_FOR_TESTING;

  // Input shared by both tokenizer configurations.
  const std::string mixed_script_text = "앨라배마123웹사이트";

  FeatureProcessorOptionsT options;
  // Codepoints [0, 256) get script id 1; the Hangul codepoints in the input
  // are not covered by any configured range.
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  {
    auto& low_range = options.tokenization_codepoint_config.back();
    low_range->start = 0;
    low_range->end = 256;
    low_range->role = TokenizationCodepointRange_::Role_DEFAULT_ROLE;
    low_range->script_id = 1;
  }

  // Option disabled: the whole input comes back as a single token.
  options.tokenize_on_script_change = false;
  flatbuffers::DetachedBuffer packed_without_split =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor_without_split(
      flatbuffers::GetRoot<FeatureProcessorOptions>(
          packed_without_split.data()),
      &unilib);

  const std::vector<Token> expected_single = {
      Token("앨라배마123웹사이트", 0, 11)};
  EXPECT_EQ(processor_without_split.Tokenize(mixed_script_text),
            expected_single);

  // Option enabled: the digits are separated from the surrounding Hangul.
  options.tokenize_on_script_change = true;
  flatbuffers::DetachedBuffer packed_with_split =
      PackFeatureProcessorOptions(options);
  TestingFeatureProcessor processor_with_split(
      flatbuffers::GetRoot<FeatureProcessorOptions>(packed_with_split.data()),
      &unilib);

  const std::vector<Token> expected_split = {
      Token("앨라배마", 0, 4), Token("123", 4, 7), Token("웹사이트", 7, 11)};
  EXPECT_EQ(processor_with_split.Tokenize(mixed_script_text), expected_split);
}
874 
875 #ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Checks ICU-based tokenization (TokenizationType_ICU) of Thai-script text
// that contains no whitespace: the input is expected to be split into five
// tokens with contiguous codepoint spans.
TEST(FeatureProcessorTest, ICUTokenize) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  // Pass the UniLib explicitly, the same way the non-ICU tests construct
  // their TestingFeatureProcessor.
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);
  const std::vector<Token> tokens =
      feature_processor.Tokenize("พระบาทสมเด็จพระปรมิ");
  // clang-format off
  const std::vector<Token> expected = {Token("พระบาท", 0, 6),
                                       Token("สมเด็จ", 6, 12),
                                       Token("พระ", 12, 15),
                                       Token("ปร", 15, 17),
                                       Token("มิ", 17, 19)};
  // clang-format on
  ASSERT_EQ(tokens, expected);
}
893 #endif
894 
895 #ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Checks ICU-based tokenization with icu_preserve_whitespace_tokens enabled:
// each single space between the Thai-script words is expected to come back
// as its own one-codepoint token.
TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_ICU;
  options.icu_preserve_whitespace_tokens = true;

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  // Pass the UniLib explicitly, the same way the non-ICU tests construct
  // their TestingFeatureProcessor.
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);
  const std::vector<Token> tokens =
      feature_processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  // clang-format off
  const std::vector<Token> expected = {Token("พระบาท", 0, 6),
                                       Token(" ", 6, 7),
                                       Token("สมเด็จ", 7, 13),
                                       Token(" ", 13, 14),
                                       Token("พระ", 14, 17),
                                       Token(" ", 17, 18),
                                       Token("ปร", 18, 20),
                                       Token(" ", 20, 21),
                                       Token("มิ", 21, 23)};
  // clang-format on
  ASSERT_EQ(tokens, expected);
}
919 #endif
920 
921 #ifdef LIBTEXTCLASSIFIER_TEST_ICU
// Checks TokenizationType_MIXED on text that mixes Japanese, Latin (with
// diacritics) and a URL. The space codepoint is configured as a whitespace
// separator and codepoints [0, 592) are listed as internal-tokenizer ranges;
// the expected tokens below pin the resulting segmentation.
TEST(FeatureProcessorTest, MixedTokenize) {
  CREATE_UNILIB_FOR_TESTING;
  FeatureProcessorOptionsT options;
  options.tokenization_type = FeatureProcessorOptions_::TokenizationType_MIXED;

  // Codepoint 32 (space) acts as a whitespace separator.
  options.tokenization_codepoint_config.emplace_back(
      new TokenizationCodepointRangeT());
  {
    auto& separator = options.tokenization_codepoint_config.back();
    separator->start = 32;
    separator->end = 33;
    separator->role = TokenizationCodepointRange_::Role_WHITESPACE_SEPARATOR;
  }

  // Four adjacent {start, end} codepoint ranges registered for the internal
  // tokenizer.
  const int kInternalTokenizerRanges[][2] = {
      {0, 128}, {128, 256}, {256, 384}, {384, 592}};
  for (const auto& bounds : kInternalTokenizerRanges) {
    options.internal_tokenizer_codepoint_ranges.emplace_back(
        new FeatureProcessorOptions_::CodepointRangeT());
    auto& range = options.internal_tokenizer_codepoint_ranges.back();
    range->start = bounds[0];
    range->end = bounds[1];
  }

  flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
  // Pass the UniLib explicitly, the same way the non-ICU tests construct
  // their TestingFeatureProcessor.
  TestingFeatureProcessor feature_processor(
      flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
      &unilib);
  const std::vector<Token> tokens = feature_processor.Tokenize(
      "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
  // clang-format off
  const std::vector<Token> expected = {
      Token("こんにちは", 0, 5),
      Token("Japanese-ląnguagę", 5, 22),
      Token("text", 23, 27),
      Token("世界", 28, 30),
      Token("http://www.google.com/", 31, 53)};
  // clang-format on
  ASSERT_EQ(tokens, expected);
}
979 #endif
980 
TEST(FeatureProcessorTest,IgnoredSpanBoundaryCodepoints)981 TEST(FeatureProcessorTest, IgnoredSpanBoundaryCodepoints) {
982   CREATE_UNILIB_FOR_TESTING;
983   FeatureProcessorOptionsT options;
984   options.ignored_span_boundary_codepoints.push_back('.');
985   options.ignored_span_boundary_codepoints.push_back(',');
986   options.ignored_span_boundary_codepoints.push_back('[');
987   options.ignored_span_boundary_codepoints.push_back(']');
988 
989   flatbuffers::DetachedBuffer options_fb = PackFeatureProcessorOptions(options);
990   TestingFeatureProcessor feature_processor(
991       flatbuffers::GetRoot<FeatureProcessorOptions>(options_fb.data()),
992       &unilib);
993 
994   const std::string text1_utf8 = "ěščř";
995   const UnicodeText text1 = UTF8ToUnicodeText(text1_utf8, /*do_copy=*/false);
996   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
997                 text1.begin(), text1.end(),
998                 /*count_from_beginning=*/true),
999             0);
1000   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1001                 text1.begin(), text1.end(),
1002                 /*count_from_beginning=*/false),
1003             0);
1004 
1005   const std::string text2_utf8 = ".,abčd";
1006   const UnicodeText text2 = UTF8ToUnicodeText(text2_utf8, /*do_copy=*/false);
1007   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1008                 text2.begin(), text2.end(),
1009                 /*count_from_beginning=*/true),
1010             2);
1011   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1012                 text2.begin(), text2.end(),
1013                 /*count_from_beginning=*/false),
1014             0);
1015 
1016   const std::string text3_utf8 = ".,abčd[]";
1017   const UnicodeText text3 = UTF8ToUnicodeText(text3_utf8, /*do_copy=*/false);
1018   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1019                 text3.begin(), text3.end(),
1020                 /*count_from_beginning=*/true),
1021             2);
1022   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1023                 text3.begin(), text3.end(),
1024                 /*count_from_beginning=*/false),
1025             2);
1026 
1027   const std::string text4_utf8 = "[abčd]";
1028   const UnicodeText text4 = UTF8ToUnicodeText(text4_utf8, /*do_copy=*/false);
1029   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1030                 text4.begin(), text4.end(),
1031                 /*count_from_beginning=*/true),
1032             1);
1033   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1034                 text4.begin(), text4.end(),
1035                 /*count_from_beginning=*/false),
1036             1);
1037 
1038   const std::string text5_utf8 = "";
1039   const UnicodeText text5 = UTF8ToUnicodeText(text5_utf8, /*do_copy=*/false);
1040   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1041                 text5.begin(), text5.end(),
1042                 /*count_from_beginning=*/true),
1043             0);
1044   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1045                 text5.begin(), text5.end(),
1046                 /*count_from_beginning=*/false),
1047             0);
1048 
1049   const std::string text6_utf8 = "012345ěščř";
1050   const UnicodeText text6 = UTF8ToUnicodeText(text6_utf8, /*do_copy=*/false);
1051   UnicodeText::const_iterator text6_begin = text6.begin();
1052   std::advance(text6_begin, 6);
1053   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1054                 text6_begin, text6.end(),
1055                 /*count_from_beginning=*/true),
1056             0);
1057   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1058                 text6_begin, text6.end(),
1059                 /*count_from_beginning=*/false),
1060             0);
1061 
1062   const std::string text7_utf8 = "012345.,ěščř";
1063   const UnicodeText text7 = UTF8ToUnicodeText(text7_utf8, /*do_copy=*/false);
1064   UnicodeText::const_iterator text7_begin = text7.begin();
1065   std::advance(text7_begin, 6);
1066   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1067                 text7_begin, text7.end(),
1068                 /*count_from_beginning=*/true),
1069             2);
1070   UnicodeText::const_iterator text7_end = text7.begin();
1071   std::advance(text7_end, 8);
1072   EXPECT_EQ(feature_processor.CountIgnoredSpanBoundaryCodepoints(
1073                 text7.begin(), text7_end,
1074                 /*count_from_beginning=*/false),
1075             2);
1076 
1077   // Test not stripping.
1078   EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
1079                 "Hello [[[Wořld]] or not?", {0, 24}),
1080             std::make_pair(0, 24));
1081   // Test basic stripping.
1082   EXPECT_EQ(feature_processor.StripBoundaryCodepoints(
1083                 "Hello [[[Wořld]] or not?", {6, 16}),
1084             std::make_pair(9, 14));
1085   // Test stripping when everything is stripped.
1086   EXPECT_EQ(
1087       feature_processor.StripBoundaryCodepoints("Hello [[[]] or not?", {6, 11}),
1088       std::make_pair(6, 6));
1089   // Test stripping empty string.
1090   EXPECT_EQ(feature_processor.StripBoundaryCodepoints("", {0, 0}),
1091             std::make_pair(0, 0));
1092 }
1093 
// Checks CodepointSpanToTokenSpan() on three tokens covering codepoints
// [0, 5), [6, 23) and [24, 29), both with the default behaviour and with the
// optional third argument set to true (snapping to containing tokens).
TEST(FeatureProcessorTest, CodepointSpanToTokenSpan) {
  const std::vector<Token> three_tokens = {Token("Hělló", 0, 5),
                                           Token("fěěbař@google.com", 6, 23),
                                           Token("heře!", 24, 29)};
  // Value for the optional third argument that enables snapping.
  const bool kSnapToContainingTokens = true;

  // Codepoint spans that line up exactly with token boundaries.
  EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(three_tokens, {0, 5}));
  EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(three_tokens, {6, 23}));
  EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(three_tokens, {24, 29}));
  EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(three_tokens, {0, 23}));
  EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(three_tokens, {6, 29}));
  EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(three_tokens, {0, 29}));

  // With exact boundaries, snapping gives the same results as above.
  EXPECT_EQ(TokenSpan(0, 1), CodepointSpanToTokenSpan(three_tokens, {0, 5},
                                                      kSnapToContainingTokens));
  EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(three_tokens, {6, 23},
                                                      kSnapToContainingTokens));
  EXPECT_EQ(TokenSpan(2, 3), CodepointSpanToTokenSpan(three_tokens, {24, 29},
                                                      kSnapToContainingTokens));
  EXPECT_EQ(TokenSpan(0, 2), CodepointSpanToTokenSpan(three_tokens, {0, 23},
                                                      kSnapToContainingTokens));
  EXPECT_EQ(TokenSpan(1, 3), CodepointSpanToTokenSpan(three_tokens, {6, 29},
                                                      kSnapToContainingTokens));
  EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(three_tokens, {0, 29},
                                                      kSnapToContainingTokens));

  // Boundaries falling inside the first and last tokens: without snapping
  // only the fully covered middle token is returned; with snapping the
  // partially covered tokens are included as well.
  EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(three_tokens, {1, 28}));
  EXPECT_EQ(TokenSpan(0, 3), CodepointSpanToTokenSpan(three_tokens, {1, 28},
                                                      kSnapToContainingTokens));

  // The span touches the neighbouring tokens without overlapping them, so
  // they are excluded in both modes.
  EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(three_tokens, {5, 24}));
  EXPECT_EQ(TokenSpan(1, 2), CodepointSpanToTokenSpan(three_tokens, {5, 24},
                                                      kSnapToContainingTokens));
}
1123 
1124 }  // namespace
1125 }  // namespace libtextclassifier2
1126