• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "smartselect/feature-processor.h"
18 
19 #include "gmock/gmock.h"
20 #include "gtest/gtest.h"
21 
22 namespace libtextclassifier {
23 namespace {
24 
25 using testing::ElementsAreArray;
26 using testing::FloatEq;
27 
// A selection that lies strictly inside one token: the middle token is
// expected to be cut into three pieces at offsets 9 and 12.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
  // clang-format off
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected_tokens = {Token("Hělló", 0, 5),
                                              Token("fěě", 6, 9),
                                              Token("bař", 9, 12),
                                              Token("@google.com", 12, 23),
                                              Token("heře!", 24, 29)};
  // clang-format on

  internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);

  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
44 
// A selection that starts exactly at a token start: only the selection end
// (offset 12) is expected to introduce a split.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
  // clang-format off
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected_tokens = {Token("Hělló", 0, 5),
                                              Token("fěěbař", 6, 12),
                                              Token("@google.com", 12, 23),
                                              Token("heře!", 24, 29)};
  // clang-format on

  internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);

  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
60 
// A selection that ends exactly at a token end: only the selection start
// (offset 9) is expected to introduce a split.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
  // clang-format off
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected_tokens = {Token("Hělló", 0, 5),
                                              Token("fěě", 6, 9),
                                              Token("bař@google.com", 9, 23),
                                              Token("heře!", 24, 29)};
  // clang-format on

  internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);

  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
76 
// A selection that coincides with a whole token: the token list is expected
// to come back unchanged.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
  // clang-format off
  const std::vector<Token> original_tokens = {
      Token("Hělló", 0, 5),
      Token("fěěbař@google.com", 6, 23),
      Token("heře!", 24, 29)};
  // clang-format on
  std::vector<Token> tokens = original_tokens;

  internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);

  EXPECT_THAT(tokens, ElementsAreArray(original_tokens));
}
91 
// A selection that starts inside the first token and ends inside the second:
// both tokens are expected to be split, at offsets 2 and 9 respectively.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
  // clang-format off
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected_tokens = {Token("Hě", 0, 2),
                                              Token("lló", 2, 5),
                                              Token("fěě", 6, 9),
                                              Token("bař@google.com", 9, 23),
                                              Token("heře!", 24, 29)};
  // clang-format on

  internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);

  EXPECT_THAT(tokens, ElementsAreArray(expected_tokens));
}
108 
// A click inside the first line of a three-line context: only the two tokens
// of the first line are expected to remain.
TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan span = {0, 5};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  const std::vector<Token> first_line = {Token("Fiřst", 0, 5),
                                         Token("Lině", 6, 10)};
  // clang-format on

  internal::StripTokensFromOtherLines(context, span, &tokens);

  // Keeps the first line.
  EXPECT_THAT(tokens, ElementsAreArray(first_line));
}
126 
// A click inside the second line of a three-line context: only the two tokens
// of the second line are expected to remain.
TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan span = {18, 22};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  const std::vector<Token> second_line = {Token("Sěcond", 11, 17),
                                          Token("Lině", 18, 22)};
  // clang-format on

  internal::StripTokensFromOtherLines(context, span, &tokens);

  // Keeps the second line (the one containing the click).
  EXPECT_THAT(tokens, ElementsAreArray(second_line));
}
144 
// A click inside the third line of a three-line context: only the two tokens
// of the third line are expected to remain.
TEST(FeatureProcessorTest, KeepLineWithClickThird) {
  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan span = {24, 33};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  const std::vector<Token> third_line = {Token("Thiřd", 23, 28),
                                         Token("Lině", 29, 33)};
  // clang-format on

  internal::StripTokensFromOtherLines(context, span, &tokens);

  // Keeps the third line (the one containing the click).
  EXPECT_THAT(tokens, ElementsAreArray(third_line));
}
162 
// Same as KeepLineWithClickSecond, but the first and second segments of the
// context are separated by '|' instead of '\n'. The expectation shows that
// the pipe also acts as a line boundary: only the tokens between '|' and the
// following '\n' remain.
TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
  const std::string context = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  const CodepointSpan span = {18, 22};
  // clang-format off
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),
                               Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17),
                               Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),
                               Token("Lině", 29, 33)};
  const std::vector<Token> second_segment = {Token("Sěcond", 11, 17),
                                             Token("Lině", 18, 22)};
  // clang-format on

  internal::StripTokensFromOtherLines(context, span, &tokens);

  // Keeps the segment between the pipe and the newline.
  EXPECT_THAT(tokens, ElementsAreArray(second_segment));
}
180 
// A click span that reaches across line boundaries: the token list is
// expected to come back unchanged, i.e. no line is stripped.
TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
  const std::string context = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan span = {5, 23};
  // NOTE(review): the offsets of the two second-line tokens (18-23 and 19-23)
  // do not match their positions in `context` (11-17 and 18-22, as used by
  // the sibling tests). Left as-is because the expectation mirrors the input;
  // confirm whether this is intentional.
  // clang-format off
  const std::vector<Token> all_tokens = {Token("Fiřst", 0, 5),
                                         Token("Lině", 6, 10),
                                         Token("Sěcond", 18, 23),
                                         Token("Lině", 19, 23),
                                         Token("Thiřd", 23, 28),
                                         Token("Lině", 29, 33)};
  // clang-format on
  std::vector<Token> tokens = all_tokens;

  internal::StripTokensFromOtherLines(context, span, &tokens);

  // Keeps every line: nothing is removed for a cross-line click.
  EXPECT_THAT(tokens, ElementsAreArray(all_tokens));
}
200 
201 class TestingFeatureProcessor : public FeatureProcessor {
202  public:
203   using FeatureProcessor::FeatureProcessor;
204   using FeatureProcessor::SpanToLabel;
205   using FeatureProcessor::SupportedCodepointsRatio;
206   using FeatureProcessor::IsCodepointInRanges;
207   using FeatureProcessor::ICUTokenize;
208   using FeatureProcessor::supported_codepoint_ranges_;
209 };
210 
TEST(FeatureProcessorTest,SpanToLabel)211 TEST(FeatureProcessorTest, SpanToLabel) {
212   FeatureProcessorOptions options;
213   options.set_context_size(1);
214   options.set_max_selection_span(1);
215   options.set_snap_label_span_boundaries_to_containing_tokens(false);
216 
217   TokenizationCodepointRange* config =
218       options.add_tokenization_codepoint_config();
219   config->set_start(32);
220   config->set_end(33);
221   config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
222 
223   TestingFeatureProcessor feature_processor(options);
224   std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
225   ASSERT_EQ(3, tokens.size());
226   int label;
227   ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
228   EXPECT_EQ(kInvalidLabel, label);
229   ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
230   EXPECT_NE(kInvalidLabel, label);
231   TokenSpan token_span;
232   feature_processor.LabelToTokenSpan(label, &token_span);
233   EXPECT_EQ(0, token_span.first);
234   EXPECT_EQ(0, token_span.second);
235 
236   // Reconfigure with snapping enabled.
237   options.set_snap_label_span_boundaries_to_containing_tokens(true);
238   TestingFeatureProcessor feature_processor2(options);
239   int label2;
240   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
241   EXPECT_EQ(label, label2);
242   ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
243   EXPECT_EQ(label, label2);
244   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
245   EXPECT_EQ(label, label2);
246 
247   // Cross a token boundary.
248   ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
249   EXPECT_EQ(kInvalidLabel, label2);
250   ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
251   EXPECT_EQ(kInvalidLabel, label2);
252 
253   // Multiple tokens.
254   options.set_context_size(2);
255   options.set_max_selection_span(2);
256   TestingFeatureProcessor feature_processor3(options);
257   tokens = feature_processor3.Tokenize("zero, one, two, three, four");
258   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
259   EXPECT_NE(kInvalidLabel, label2);
260   feature_processor3.LabelToTokenSpan(label2, &token_span);
261   EXPECT_EQ(1, token_span.first);
262   EXPECT_EQ(0, token_span.second);
263 
264   int label3;
265   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
266   EXPECT_EQ(label2, label3);
267   ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
268   EXPECT_EQ(label2, label3);
269   ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
270   EXPECT_EQ(label2, label3);
271 }
272 
// Checks which token index CenterTokenFromClick reports for click spans that
// match a token exactly, fall inside a token, or cover two tokens.
TEST(FeatureProcessorTest, CenterTokenFromClick) {
  // The same three tokens are used for every case.
  const std::vector<Token> tokens = {
      Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)};

  // Exactly aligned indices.
  const int aligned_index = internal::CenterTokenFromClick({6, 11}, tokens);
  EXPECT_EQ(aligned_index, 1);

  // Click is contained in a token.
  const int contained_index = internal::CenterTokenFromClick({13, 17}, tokens);
  EXPECT_EQ(contained_index, 2);

  // Click spans two tokens.
  const int spanning_index = internal::CenterTokenFromClick({6, 17}, tokens);
  EXPECT_EQ(spanning_index, kInvalidIndex);
}
294 
// Checks which token index CenterTokenFromMiddleOfSelection reports for
// selections of various lengths, both aligned and not aligned to token
// boundaries, and for an empty token list.
TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
  // The same five tokens are used for every non-empty case.
  const std::vector<Token> tokens = {
      Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
      Token("Token4", 21, 27), Token("Token5", 28, 34)};

  // Selection of length 3. Exactly aligned indices.
  const int three_token_index =
      internal::CenterTokenFromMiddleOfSelection({7, 27}, tokens);
  EXPECT_EQ(three_token_index, 2);

  // Selection of length 1 token. Exactly aligned indices.
  const int one_token_index =
      internal::CenterTokenFromMiddleOfSelection({21, 27}, tokens);
  EXPECT_EQ(one_token_index, 3);

  // Selection marks sub-token range, with no tokens in it.
  const int sub_token_index =
      internal::CenterTokenFromMiddleOfSelection({29, 33}, tokens);
  EXPECT_EQ(sub_token_index, kInvalidIndex);

  // Selection of length 2. Sub-token indices.
  const int unaligned_two_index =
      internal::CenterTokenFromMiddleOfSelection({3, 25}, tokens);
  EXPECT_EQ(unaligned_two_index, 1);

  // Selection of length 1. Sub-token indices.
  const int unaligned_one_index =
      internal::CenterTokenFromMiddleOfSelection({22, 34}, tokens);
  EXPECT_EQ(unaligned_one_index, 4);

  // Some invalid ones: an empty token list.
  const int empty_index =
      internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
  EXPECT_EQ(empty_index, -1);
}
337 
TEST(FeatureProcessorTest,SupportedCodepointsRatio)338 TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
339   FeatureProcessorOptions options;
340   options.set_context_size(2);
341   options.set_max_selection_span(2);
342   options.set_snap_label_span_boundaries_to_containing_tokens(false);
343 
344   TokenizationCodepointRange* config =
345       options.add_tokenization_codepoint_config();
346   config->set_start(32);
347   config->set_end(33);
348   config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);
349 
350   FeatureProcessorOptions::CodepointRange* range;
351   range = options.add_supported_codepoint_ranges();
352   range->set_start(0);
353   range->set_end(128);
354 
355   range = options.add_supported_codepoint_ranges();
356   range->set_start(10000);
357   range->set_end(10001);
358 
359   range = options.add_supported_codepoint_ranges();
360   range->set_start(20000);
361   range->set_end(30000);
362 
363   TestingFeatureProcessor feature_processor(options);
364   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
365                   1, feature_processor.Tokenize("aaa bbb ccc")),
366               FloatEq(1.0));
367   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
368                   1, feature_processor.Tokenize("aaa bbb ěěě")),
369               FloatEq(2.0 / 3));
370   EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
371                   1, feature_processor.Tokenize("ěěě řřř ěěě")),
372               FloatEq(0.0));
373   EXPECT_FALSE(feature_processor.IsCodepointInRanges(
374       -1, feature_processor.supported_codepoint_ranges_));
375   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
376       0, feature_processor.supported_codepoint_ranges_));
377   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
378       10, feature_processor.supported_codepoint_ranges_));
379   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
380       127, feature_processor.supported_codepoint_ranges_));
381   EXPECT_FALSE(feature_processor.IsCodepointInRanges(
382       128, feature_processor.supported_codepoint_ranges_));
383   EXPECT_FALSE(feature_processor.IsCodepointInRanges(
384       9999, feature_processor.supported_codepoint_ranges_));
385   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
386       10000, feature_processor.supported_codepoint_ranges_));
387   EXPECT_FALSE(feature_processor.IsCodepointInRanges(
388       10001, feature_processor.supported_codepoint_ranges_));
389   EXPECT_TRUE(feature_processor.IsCodepointInRanges(
390       25000, feature_processor.supported_codepoint_ranges_));
391 
392   std::vector<Token> tokens;
393   int click_pos;
394   std::vector<float> extra_features;
395   std::unique_ptr<CachedFeatures> cached_features;
396 
397   auto feature_fn = [](const std::vector<int>& sparse_features,
398                        const std::vector<float>& dense_features,
399                        float* embedding) { return true; };
400 
401   options.set_min_supported_codepoint_ratio(0.0);
402   TestingFeatureProcessor feature_processor2(options);
403   EXPECT_TRUE(feature_processor2.ExtractFeatures("ěěě řřř eee", {4, 7}, {0, 0},
404                                                  feature_fn, 2, &tokens,
405                                                  &click_pos, &cached_features));
406 
407   options.set_min_supported_codepoint_ratio(0.2);
408   TestingFeatureProcessor feature_processor3(options);
409   EXPECT_TRUE(feature_processor3.ExtractFeatures("ěěě řřř eee", {4, 7}, {0, 0},
410                                                  feature_fn, 2, &tokens,
411                                                  &click_pos, &cached_features));
412 
413   options.set_min_supported_codepoint_ratio(0.5);
414   TestingFeatureProcessor feature_processor4(options);
415   EXPECT_FALSE(feature_processor4.ExtractFeatures(
416       "ěěě řřř eee", {4, 7}, {0, 0}, feature_fn, 2, &tokens, &click_pos,
417       &cached_features));
418 }
419 
// StripOrPadTokens with a {0, 0} first argument and a second argument of 2:
// every case is expected to end with five tokens and the click at index 2,
// padded with default-constructed tokens where the input runs out.
TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
  const std::vector<Token> all_tokens = {
      Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0),  Token("3", 0, 0),
      Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0),  Token("7", 0, 0),
      Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Try to click first token and see if it gets padded from left.
  {
    std::vector<Token> tokens = all_tokens;
    int click_index = 0;
    internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
    // clang-format off
    const std::vector<Token> expected = {Token(),
                                         Token(),
                                         Token("0", 0, 0),
                                         Token("1", 0, 0),
                                         Token("2", 0, 0)};
    // clang-format on
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }

  // When we click the third token (index 2) there are already two tokens on
  // each side, so nothing should get padded.
  {
    std::vector<Token> tokens = all_tokens;
    int click_index = 2;
    internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
    // clang-format off
    const std::vector<Token> expected = {Token("0", 0, 0),
                                         Token("1", 0, 0),
                                         Token("2", 0, 0),
                                         Token("3", 0, 0),
                                         Token("4", 0, 0)};
    // clang-format on
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }

  // When we click the last token tokens should get padded from the right.
  {
    std::vector<Token> tokens = all_tokens;
    int click_index = 12;
    internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);
    // clang-format off
    const std::vector<Token> expected = {Token("10", 0, 0),
                                         Token("11", 0, 0),
                                         Token("12", 0, 0),
                                         Token(),
                                         Token()};
    // clang-format on
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }
}
469 
TEST(FeatureProcessorTest,StripUnusedTokensWithRelativeClick)470 TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
471   std::vector<Token> tokens_orig{
472       Token("0", 0, 0), Token("1", 0, 0), Token("2", 0, 0),  Token("3", 0, 0),
473       Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0),  Token("7", 0, 0),
474       Token("8", 0, 0), Token("9", 0, 0), Token("10", 0, 0), Token("11", 0, 0),
475       Token("12", 0, 0)};
476 
477   std::vector<Token> tokens;
478   int click_index;
479 
480   // Try to click first token and see if it gets padded from left to maximum
481   // context_size.
482   tokens = tokens_orig;
483   click_index = 0;
484   internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);
485   // clang-format off
486   EXPECT_EQ(tokens, std::vector<Token>({Token(),
487                                         Token(),
488                                         Token("0", 0, 0),
489                                         Token("1", 0, 0),
490                                         Token("2", 0, 0),
491                                         Token("3", 0, 0),
492                                         Token("4", 0, 0),
493                                         Token("5", 0, 0)}));
494   // clang-format on
495   EXPECT_EQ(click_index, 2);
496 
497   // Clicking to the middle with enough context should not produce any padding.
498   tokens = tokens_orig;
499   click_index = 6;
500   internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
501   // clang-format off
502   EXPECT_EQ(tokens, std::vector<Token>({Token("1", 0, 0),
503                                         Token("2", 0, 0),
504                                         Token("3", 0, 0),
505                                         Token("4", 0, 0),
506                                         Token("5", 0, 0),
507                                         Token("6", 0, 0),
508                                         Token("7", 0, 0),
509                                         Token("8", 0, 0),
510                                         Token("9", 0, 0)}));
511   // clang-format on
512   EXPECT_EQ(click_index, 5);
513 
514   // Clicking at the end should pad right to maximum context_size.
515   tokens = tokens_orig;
516   click_index = 11;
517   internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);
518   // clang-format off
519   EXPECT_EQ(tokens, std::vector<Token>({Token("6", 0, 0),
520                                         Token("7", 0, 0),
521                                         Token("8", 0, 0),
522                                         Token("9", 0, 0),
523                                         Token("10", 0, 0),
524                                         Token("11", 0, 0),
525                                         Token("12", 0, 0),
526                                         Token(),
527                                         Token()}));
528   // clang-format on
529   EXPECT_EQ(click_index, 5);
530 }
531 
// With the ICU tokenization type, a Thai string containing no spaces is
// expected to be split into five tokens.
TEST(FeatureProcessorTest, ICUTokenize) {
  FeatureProcessorOptions options;
  options.set_tokenization_type(
      libtextclassifier::FeatureProcessorOptions::ICU);
  TestingFeatureProcessor feature_processor(options);

  // clang-format off
  const std::vector<Token> expected_tokens = {Token("พระบาท", 0, 6),
                                              Token("สมเด็จ", 6, 12),
                                              Token("พระ", 12, 15),
                                              Token("ปร", 15, 17),
                                              Token("มิ", 17, 19)};
  // clang-format on

  const std::vector<Token> tokens =
      feature_processor.Tokenize("พระบาทสมเด็จพระปรมิ");
  ASSERT_EQ(tokens, expected_tokens);
}
548 
// With the ICU tokenization type and icu_preserve_whitespace_tokens enabled,
// each space in the input is expected to come back as its own token.
TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
  FeatureProcessorOptions options;
  options.set_tokenization_type(
      libtextclassifier::FeatureProcessorOptions::ICU);
  options.set_icu_preserve_whitespace_tokens(true);
  TestingFeatureProcessor feature_processor(options);

  // clang-format off
  const std::vector<Token> expected_tokens = {Token("พระบาท", 0, 6),
                                              Token(" ", 6, 7),
                                              Token("สมเด็จ", 7, 13),
                                              Token(" ", 13, 14),
                                              Token("พระ", 14, 17),
                                              Token(" ", 17, 18),
                                              Token("ปร", 18, 20),
                                              Token(" ", 20, 21),
                                              Token("มิ", 21, 23)};
  // clang-format on

  const std::vector<Token> tokens =
      feature_processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  ASSERT_EQ(tokens, expected_tokens);
}
571 
// With the MIXED tokenization type, codepoints 0..592 are registered as
// internal-tokenizer ranges. The expectation shows Japanese text being
// separated from adjacent Latin text even without whitespace between them,
// while the URL stays a single token.
TEST(FeatureProcessorTest, MixedTokenize) {
  FeatureProcessorOptions options;
  options.set_tokenization_type(
      libtextclassifier::FeatureProcessorOptions::MIXED);

  // Tokenize on codepoint 32 (the ASCII space).
  TokenizationCodepointRange* space_range =
      options.add_tokenization_codepoint_config();
  space_range->set_start(32);
  space_range->set_end(33);
  space_range->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  // Four adjacent ranges that are handled by the internal tokenizer.
  const auto add_internal_range = [&options](int start, int end) {
    FeatureProcessorOptions::CodepointRange* range =
        options.add_internal_tokenizer_codepoint_ranges();
    range->set_start(start);
    range->set_end(end);
  };
  add_internal_range(0, 128);
  add_internal_range(128, 256);
  add_internal_range(256, 384);
  add_internal_range(384, 592);

  TestingFeatureProcessor feature_processor(options);

  // clang-format off
  const std::vector<Token> expected_tokens = {
      Token("こんにちは", 0, 5),
      Token("Japanese-ląnguagę", 5, 22),
      Token("text", 23, 27),
      Token("世界", 28, 30),
      Token("http://www.google.com/", 31, 53)};
  // clang-format on

  const std::vector<Token> tokens = feature_processor.Tokenize(
      "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
  ASSERT_EQ(tokens, expected_tokens);
}
612 
613 }  // namespace
614 }  // namespace libtextclassifier
615