1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "smartselect/feature-processor.h"
18
19 #include "gmock/gmock.h"
20 #include "gtest/gtest.h"
21
22 namespace libtextclassifier {
23 namespace {
24
25 using testing::ElementsAreArray;
26 using testing::FloatEq;
27
// A selection lying strictly inside a single token must cut that token into
// three pieces (before / selection / after) and leave the neighbours alone.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesMiddle) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected = {Token("Hělló", 0, 5),
                                       Token("fěě", 6, 9),
                                       Token("bař", 9, 12),
                                       Token("@google.com", 12, 23),
                                       Token("heře!", 24, 29)};

  internal::SplitTokensOnSelectionBoundaries({9, 12}, &tokens);

  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
44
// A selection that starts exactly at a token's start only needs one cut, at
// the selection end, so the token is split into two pieces.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesBegin) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected = {Token("Hělló", 0, 5),
                                       Token("fěěbař", 6, 12),
                                       Token("@google.com", 12, 23),
                                       Token("heře!", 24, 29)};

  internal::SplitTokensOnSelectionBoundaries({6, 12}, &tokens);

  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
60
// A selection that ends exactly at a token's end only needs one cut, at the
// selection start, so the token is split into two pieces.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesEnd) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected = {Token("Hělló", 0, 5),
                                       Token("fěě", 6, 9),
                                       Token("bař@google.com", 9, 23),
                                       Token("heře!", 24, 29)};

  internal::SplitTokensOnSelectionBoundaries({9, 23}, &tokens);

  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
76
// A selection that coincides with a whole token must not change the token
// sequence at all.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesWhole) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected = {Token("Hělló", 0, 5),
                                       Token("fěěbař@google.com", 6, 23),
                                       Token("heře!", 24, 29)};

  internal::SplitTokensOnSelectionBoundaries({6, 23}, &tokens);

  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
91
// A selection that begins inside one token and ends inside the next one must
// split each of the two tokens once, at the respective selection boundary.
TEST(FeatureProcessorTest, SplitTokensOnSelectionBoundariesCrossToken) {
  std::vector<Token> tokens = {Token("Hělló", 0, 5),
                               Token("fěěbař@google.com", 6, 23),
                               Token("heře!", 24, 29)};
  const std::vector<Token> expected = {Token("Hě", 0, 2),
                                       Token("lló", 2, 5),
                                       Token("fěě", 6, 9),
                                       Token("bař@google.com", 9, 23),
                                       Token("heře!", 24, 29)};

  internal::SplitTokensOnSelectionBoundaries({2, 9}, &tokens);

  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
108
// A click on the first line must leave only the first line's tokens.
TEST(FeatureProcessorTest, KeepLineWithClickFirst) {
  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click_span = {0, 5};
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),    Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17), Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),  Token("Lině", 29, 33)};
  const std::vector<Token> expected = {Token("Fiřst", 0, 5),
                                       Token("Lině", 6, 10)};

  internal::StripTokensFromOtherLines(text, click_span, &tokens);

  // Only the first line survives.
  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
126
// A click on the second line must leave only the second line's tokens.
TEST(FeatureProcessorTest, KeepLineWithClickSecond) {
  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click_span = {18, 22};
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),    Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17), Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),  Token("Lině", 29, 33)};
  const std::vector<Token> expected = {Token("Sěcond", 11, 17),
                                       Token("Lině", 18, 22)};

  internal::StripTokensFromOtherLines(text, click_span, &tokens);

  // Only the second line survives.
  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
144
// A click on the third line must leave only the third line's tokens.
TEST(FeatureProcessorTest, KeepLineWithClickThird) {
  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click_span = {24, 33};
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),    Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17), Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),  Token("Lině", 29, 33)};
  const std::vector<Token> expected = {Token("Thiřd", 23, 28),
                                       Token("Lině", 29, 33)};

  internal::StripTokensFromOtherLines(text, click_span, &tokens);

  // Only the third line survives.
  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
162
// Same as the second-line click case, but the first and second "lines" are
// separated by a '|' character instead of a newline; the expected result is
// still just the tokens of the second segment.
TEST(FeatureProcessorTest, KeepLineWithClickSecondWithPipe) {
  const std::string text = "Fiřst Lině|Sěcond Lině\nThiřd Lině";
  const CodepointSpan click_span = {18, 22};
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),    Token("Lině", 6, 10),
                               Token("Sěcond", 11, 17), Token("Lině", 18, 22),
                               Token("Thiřd", 23, 28),  Token("Lině", 29, 33)};
  const std::vector<Token> expected = {Token("Sěcond", 11, 17),
                                       Token("Lině", 18, 22)};

  internal::StripTokensFromOtherLines(text, click_span, &tokens);

  // Only the segment between '|' and '\n' survives.
  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
180
// When the click span stretches over more than one line, no tokens are
// expected to be stripped.
TEST(FeatureProcessorTest, KeepLineWithCrosslineClick) {
  const std::string text = "Fiřst Lině\nSěcond Lině\nThiřd Lině";
  const CodepointSpan click_span = {5, 23};
  // NOTE(review): the offsets given for the second-line tokens ("Sěcond" at
  // 18-23, "Lině" at 19-23) do not line up with their positions in the
  // context string (11-17 and 18-22) — confirm whether this is intentional.
  std::vector<Token> tokens = {Token("Fiřst", 0, 5),    Token("Lině", 6, 10),
                               Token("Sěcond", 18, 23), Token("Lině", 19, 23),
                               Token("Thiřd", 23, 28),  Token("Lině", 29, 33)};
  const std::vector<Token> expected = {
      Token("Fiřst", 0, 5),    Token("Lině", 6, 10),
      Token("Sěcond", 18, 23), Token("Lině", 19, 23),
      Token("Thiřd", 23, 28),  Token("Lině", 29, 33)};

  internal::StripTokensFromOtherLines(text, click_span, &tokens);

  // Every token is retained.
  EXPECT_THAT(tokens, ElementsAreArray(expected));
}
200
201 class TestingFeatureProcessor : public FeatureProcessor {
202 public:
203 using FeatureProcessor::FeatureProcessor;
204 using FeatureProcessor::SpanToLabel;
205 using FeatureProcessor::SupportedCodepointsRatio;
206 using FeatureProcessor::IsCodepointInRanges;
207 using FeatureProcessor::ICUTokenize;
208 using FeatureProcessor::supported_codepoint_ranges_;
209 };
210
// Exercises SpanToLabel / LabelToTokenSpan with and without snapping of the
// span boundaries to the containing tokens, and with a selection that covers
// more than one token.
TEST(FeatureProcessorTest, SpanToLabel) {
  FeatureProcessorOptions options;
  options.set_context_size(1);
  options.set_max_selection_span(1);
  options.set_snap_label_span_boundaries_to_containing_tokens(false);

  // Codepoint range [32, 33) (the space character) acts as the whitespace
  // separator for tokenization.
  TokenizationCodepointRange* config =
      options.add_tokenization_codepoint_config();
  config->set_start(32);
  config->set_end(33);
  config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  TestingFeatureProcessor feature_processor(options);
  std::vector<Token> tokens = feature_processor.Tokenize("one, two, three");
  // Unsigned literal: tokens.size() is unsigned, and comparing it against a
  // plain int mixes signedness inside the assertion's comparison.
  ASSERT_EQ(3u, tokens.size());
  int label;
  // Without snapping, {5, 8} (the middle token minus its trailing comma) is
  // not a valid selection, while {5, 9} is.
  ASSERT_TRUE(feature_processor.SpanToLabel({5, 8}, tokens, &label));
  EXPECT_EQ(kInvalidLabel, label);
  ASSERT_TRUE(feature_processor.SpanToLabel({5, 9}, tokens, &label));
  EXPECT_NE(kInvalidLabel, label);
  TokenSpan token_span;
  feature_processor.LabelToTokenSpan(label, &token_span);
  EXPECT_EQ(0, token_span.first);
  EXPECT_EQ(0, token_span.second);

  // Reconfigure with snapping enabled: spans that stay within the middle
  // token now all map to the same label as the exact span {5, 9}.
  options.set_snap_label_span_boundaries_to_containing_tokens(true);
  TestingFeatureProcessor feature_processor2(options);
  int label2;
  ASSERT_TRUE(feature_processor2.SpanToLabel({5, 8}, tokens, &label2));
  EXPECT_EQ(label, label2);
  ASSERT_TRUE(feature_processor2.SpanToLabel({6, 9}, tokens, &label2));
  EXPECT_EQ(label, label2);
  ASSERT_TRUE(feature_processor2.SpanToLabel({5, 9}, tokens, &label2));
  EXPECT_EQ(label, label2);

  // Cross a token boundary: these spans reach one codepoint past the middle
  // token on either side and must yield the invalid label.
  ASSERT_TRUE(feature_processor2.SpanToLabel({4, 9}, tokens, &label2));
  EXPECT_EQ(kInvalidLabel, label2);
  ASSERT_TRUE(feature_processor2.SpanToLabel({5, 10}, tokens, &label2));
  EXPECT_EQ(kInvalidLabel, label2);

  // Multiple tokens.
  options.set_context_size(2);
  options.set_max_selection_span(2);
  TestingFeatureProcessor feature_processor3(options);
  tokens = feature_processor3.Tokenize("zero, one, two, three, four");
  ASSERT_TRUE(feature_processor3.SpanToLabel({6, 15}, tokens, &label2));
  EXPECT_NE(kInvalidLabel, label2);
  feature_processor3.LabelToTokenSpan(label2, &token_span);
  EXPECT_EQ(1, token_span.first);
  EXPECT_EQ(0, token_span.second);

  // With snapping still enabled, slightly shrunken variants of the same
  // two-token selection must produce the same label.
  int label3;
  ASSERT_TRUE(feature_processor3.SpanToLabel({6, 14}, tokens, &label3));
  EXPECT_EQ(label2, label3);
  ASSERT_TRUE(feature_processor3.SpanToLabel({6, 13}, tokens, &label3));
  EXPECT_EQ(label2, label3);
  ASSERT_TRUE(feature_processor3.SpanToLabel({7, 13}, tokens, &label3));
  EXPECT_EQ(label2, label3);
}
272
// Checks which token index CenterTokenFromClick reports for click spans that
// match a token exactly, lie inside a token, or stretch over two tokens.
TEST(FeatureProcessorTest, CenterTokenFromClick) {
  const std::vector<Token> tokens = {
      Token("Hělló", 0, 5), Token("world", 6, 11), Token("heře!", 12, 17)};

  // Exactly aligned indices.
  const int aligned = internal::CenterTokenFromClick({6, 11}, tokens);
  EXPECT_EQ(aligned, 1);

  // Click is contained in a token.
  const int contained = internal::CenterTokenFromClick({13, 17}, tokens);
  EXPECT_EQ(contained, 2);

  // Click spans two tokens.
  const int spanning = internal::CenterTokenFromClick({6, 17}, tokens);
  EXPECT_EQ(spanning, kInvalidIndex);
}
294
// Checks which token index CenterTokenFromMiddleOfSelection reports for
// selections of various lengths, both token-aligned and sub-token.
TEST(FeatureProcessorTest, CenterTokenFromMiddleOfSelection) {
  const std::vector<Token> tokens = {
      Token("Token1", 0, 6), Token("Token2", 7, 13), Token("Token3", 14, 20),
      Token("Token4", 21, 27), Token("Token5", 28, 34)};

  // Selection of length 3. Exactly aligned indices.
  const int three_aligned =
      internal::CenterTokenFromMiddleOfSelection({7, 27}, tokens);
  EXPECT_EQ(three_aligned, 2);

  // Selection of length 1 token. Exactly aligned indices.
  const int one_aligned =
      internal::CenterTokenFromMiddleOfSelection({21, 27}, tokens);
  EXPECT_EQ(one_aligned, 3);

  // Selection marks sub-token range, with no tokens in it.
  const int inside_token =
      internal::CenterTokenFromMiddleOfSelection({29, 33}, tokens);
  EXPECT_EQ(inside_token, kInvalidIndex);

  // Selection of length 2. Sub-token indices.
  const int two_unaligned =
      internal::CenterTokenFromMiddleOfSelection({3, 25}, tokens);
  EXPECT_EQ(two_unaligned, 1);

  // Selection of length 1. Sub-token indices.
  const int one_unaligned =
      internal::CenterTokenFromMiddleOfSelection({22, 34}, tokens);
  EXPECT_EQ(one_unaligned, 4);

  // Some invalid ones: an empty token list.
  const int no_tokens = internal::CenterTokenFromMiddleOfSelection({7, 27}, {});
  EXPECT_EQ(no_tokens, -1);
}
337
// Covers three related behaviours:
//  * SupportedCodepointsRatio over tokenized text,
//  * IsCodepointInRanges at the edges of the configured ranges,
//  * ExtractFeatures accepting or rejecting input depending on
//    min_supported_codepoint_ratio.
TEST(FeatureProcessorTest, SupportedCodepointsRatio) {
  FeatureProcessorOptions options;
  options.set_context_size(2);
  options.set_max_selection_span(2);
  options.set_snap_label_span_boundaries_to_containing_tokens(false);

  // Codepoint range [32, 33) (the space character) acts as the whitespace
  // separator for tokenization.
  TokenizationCodepointRange* config =
      options.add_tokenization_codepoint_config();
  config->set_start(32);
  config->set_end(33);
  config->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  // Supported codepoint ranges: 0..128, 10000..10001 and 20000..30000.
  FeatureProcessorOptions::CodepointRange* range;
  range = options.add_supported_codepoint_ranges();
  range->set_start(0);
  range->set_end(128);

  range = options.add_supported_codepoint_ranges();
  range->set_start(10000);
  range->set_end(10001);

  range = options.add_supported_codepoint_ranges();
  range->set_start(20000);
  range->set_end(30000);

  TestingFeatureProcessor feature_processor(options);

  // The accented letters used below lie outside every configured range, so
  // the expected ratios are 3/3, 2/3 and 0/3 respectively.
  EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
                  1, feature_processor.Tokenize("aaa bbb ccc")),
              FloatEq(1.0));
  EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
                  1, feature_processor.Tokenize("aaa bbb ěěě")),
              FloatEq(2.0 / 3));
  EXPECT_THAT(feature_processor.SupportedCodepointsRatio(
                  1, feature_processor.Tokenize("ěěě řřř ěěě")),
              FloatEq(0.0));

  // The expectations below pin the ranges as half-open: the start value is
  // inside, the end value is outside.
  const auto& ranges = feature_processor.supported_codepoint_ranges_;
  EXPECT_FALSE(feature_processor.IsCodepointInRanges(-1, ranges));
  EXPECT_TRUE(feature_processor.IsCodepointInRanges(0, ranges));
  EXPECT_TRUE(feature_processor.IsCodepointInRanges(10, ranges));
  EXPECT_TRUE(feature_processor.IsCodepointInRanges(127, ranges));
  EXPECT_FALSE(feature_processor.IsCodepointInRanges(128, ranges));
  EXPECT_FALSE(feature_processor.IsCodepointInRanges(9999, ranges));
  EXPECT_TRUE(feature_processor.IsCodepointInRanges(10000, ranges));
  EXPECT_FALSE(feature_processor.IsCodepointInRanges(10001, ranges));
  EXPECT_TRUE(feature_processor.IsCodepointInRanges(25000, ranges));

  std::vector<Token> tokens;
  int click_pos;
  std::unique_ptr<CachedFeatures> cached_features;

  // Feature callback stub: ignores its inputs and always reports success.
  auto feature_fn = [](const std::vector<int>&, const std::vector<float>&,
                       float*) { return true; };

  // Only one of the three tokens in "ěěě řřř eee" uses supported codepoints
  // (presumably giving a ratio of 1/3): thresholds of 0.0 and 0.2 are
  // expected to pass, 0.5 is expected to be rejected.
  options.set_min_supported_codepoint_ratio(0.0);
  TestingFeatureProcessor feature_processor2(options);
  EXPECT_TRUE(feature_processor2.ExtractFeatures("ěěě řřř eee", {4, 7}, {0, 0},
                                                 feature_fn, 2, &tokens,
                                                 &click_pos, &cached_features));

  options.set_min_supported_codepoint_ratio(0.2);
  TestingFeatureProcessor feature_processor3(options);
  EXPECT_TRUE(feature_processor3.ExtractFeatures("ěěě řřř eee", {4, 7}, {0, 0},
                                                 feature_fn, 2, &tokens,
                                                 &click_pos, &cached_features));

  options.set_min_supported_codepoint_ratio(0.5);
  TestingFeatureProcessor feature_processor4(options);
  EXPECT_FALSE(feature_processor4.ExtractFeatures(
      "ěěě řřř eee", {4, 7}, {0, 0}, feature_fn, 2, &tokens, &click_pos,
      &cached_features));
}
419
// StripOrPadTokens with a {0, 0} relative click span and a context size of 2:
// the result always holds five tokens with the click at index 2, padded with
// default-constructed tokens where the input runs out.
TEST(FeatureProcessorTest, StripUnusedTokensWithNoRelativeClick) {
  const std::vector<Token> all_tokens = {
      Token("0", 0, 0), Token("1", 0, 0),  Token("2", 0, 0),  Token("3", 0, 0),
      Token("4", 0, 0), Token("5", 0, 0),  Token("6", 0, 0),  Token("7", 0, 0),
      Token("8", 0, 0), Token("9", 0, 0),  Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Clicking the first token: padding is expected on the left.
  {
    std::vector<Token> tokens = all_tokens;
    int click_index = 0;
    internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);

    const std::vector<Token> expected = {Token(), Token(), Token("0", 0, 0),
                                         Token("1", 0, 0), Token("2", 0, 0)};
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }

  // Clicking the token at index 2: enough context on both sides, so no
  // padding is expected.
  {
    std::vector<Token> tokens = all_tokens;
    int click_index = 2;
    internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);

    const std::vector<Token> expected = {Token("0", 0, 0), Token("1", 0, 0),
                                         Token("2", 0, 0), Token("3", 0, 0),
                                         Token("4", 0, 0)};
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }

  // Clicking the last token: padding is expected on the right.
  {
    std::vector<Token> tokens = all_tokens;
    int click_index = 12;
    internal::StripOrPadTokens({0, 0}, 2, &tokens, &click_index);

    const std::vector<Token> expected = {Token("10", 0, 0), Token("11", 0, 0),
                                         Token("12", 0, 0), Token(), Token()};
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }
}
469
// StripOrPadTokens with a non-zero relative click span and a context size of
// 2: the kept window grows by the relative span on each side, and padding
// tokens are only added up to the context size.
TEST(FeatureProcessorTest, StripUnusedTokensWithRelativeClick) {
  const std::vector<Token> all_tokens = {
      Token("0", 0, 0), Token("1", 0, 0),  Token("2", 0, 0),  Token("3", 0, 0),
      Token("4", 0, 0), Token("5", 0, 0),  Token("6", 0, 0),  Token("7", 0, 0),
      Token("8", 0, 0), Token("9", 0, 0),  Token("10", 0, 0), Token("11", 0, 0),
      Token("12", 0, 0)};

  // Clicking the first token: the left side is padded to the maximum
  // context_size.
  {
    std::vector<Token> tokens = all_tokens;
    int click_index = 0;
    internal::StripOrPadTokens({2, 3}, 2, &tokens, &click_index);

    const std::vector<Token> expected = {
        Token(),          Token(),          Token("0", 0, 0), Token("1", 0, 0),
        Token("2", 0, 0), Token("3", 0, 0), Token("4", 0, 0), Token("5", 0, 0)};
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 2);
  }

  // Clicking in the middle with enough context: no padding is expected.
  {
    std::vector<Token> tokens = all_tokens;
    int click_index = 6;
    internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);

    const std::vector<Token> expected = {
        Token("1", 0, 0), Token("2", 0, 0), Token("3", 0, 0),
        Token("4", 0, 0), Token("5", 0, 0), Token("6", 0, 0),
        Token("7", 0, 0), Token("8", 0, 0), Token("9", 0, 0)};
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 5);
  }

  // Clicking near the end: the right side is padded to the maximum
  // context_size.
  {
    std::vector<Token> tokens = all_tokens;
    int click_index = 11;
    internal::StripOrPadTokens({3, 1}, 2, &tokens, &click_index);

    const std::vector<Token> expected = {
        Token("6", 0, 0),  Token("7", 0, 0),  Token("8", 0, 0),
        Token("9", 0, 0),  Token("10", 0, 0), Token("11", 0, 0),
        Token("12", 0, 0), Token(),           Token()};
    EXPECT_EQ(tokens, expected);
    EXPECT_EQ(click_index, 5);
  }
}
531
// With the ICU tokenization type, unspaced Thai text is expected to be split
// into the five tokens listed below.
TEST(FeatureProcessorTest, ICUTokenize) {
  FeatureProcessorOptions options;
  options.set_tokenization_type(FeatureProcessorOptions::ICU);
  TestingFeatureProcessor feature_processor(options);

  const std::vector<Token> expected = {Token("พระบาท", 0, 6),
                                       Token("สมเด็จ", 6, 12),
                                       Token("พระ", 12, 15),
                                       Token("ปร", 15, 17),
                                       Token("มิ", 17, 19)};

  const std::vector<Token> tokens =
      feature_processor.Tokenize("พระบาทสมเด็จพระปรมิ");
  ASSERT_EQ(tokens, expected);
}
548
// With ICU tokenization and icu_preserve_whitespace_tokens enabled, every
// space between the Thai words is expected to come back as its own token.
TEST(FeatureProcessorTest, ICUTokenizeWithWhitespaces) {
  FeatureProcessorOptions options;
  options.set_tokenization_type(FeatureProcessorOptions::ICU);
  options.set_icu_preserve_whitespace_tokens(true);
  TestingFeatureProcessor feature_processor(options);

  const std::vector<Token> expected = {Token("พระบาท", 0, 6),
                                       Token(" ", 6, 7),
                                       Token("สมเด็จ", 7, 13),
                                       Token(" ", 13, 14),
                                       Token("พระ", 14, 17),
                                       Token(" ", 17, 18),
                                       Token("ปร", 18, 20),
                                       Token(" ", 20, 21),
                                       Token("มิ", 21, 23)};

  const std::vector<Token> tokens =
      feature_processor.Tokenize("พระบาท สมเด็จ พระ ปร มิ");
  ASSERT_EQ(tokens, expected);
}
571
// With the MIXED tokenization type, text made of codepoints inside the
// internal-tokenizer ranges is expected to be split on whitespace only (note
// that "Japanese-ląnguagę" and the URL each stay one token), while the
// Japanese and Chinese runs come out as separate tokens.
TEST(FeatureProcessorTest, MixedTokenize) {
  FeatureProcessorOptions options;
  options.set_tokenization_type(FeatureProcessorOptions::MIXED);

  // Codepoint range [32, 33) (the space character) acts as the whitespace
  // separator.
  TokenizationCodepointRange* separator =
      options.add_tokenization_codepoint_config();
  separator->set_start(32);
  separator->set_end(33);
  separator->set_role(TokenizationCodepointRange::WHITESPACE_SEPARATOR);

  // Codepoint ranges registered for the internal tokenizer, as
  // {start, end} pairs, added in this order.
  const int internal_ranges[][2] = {
      {0, 128}, {128, 256}, {256, 384}, {384, 592}};
  for (const auto& bounds : internal_ranges) {
    FeatureProcessorOptions::CodepointRange* range =
        options.add_internal_tokenizer_codepoint_ranges();
    range->set_start(bounds[0]);
    range->set_end(bounds[1]);
  }

  TestingFeatureProcessor feature_processor(options);

  const std::vector<Token> expected = {
      Token("こんにちは", 0, 5),
      Token("Japanese-ląnguagę", 5, 22),
      Token("text", 23, 27),
      Token("世界", 28, 30),
      Token("http://www.google.com/", 31, 53)};

  const std::vector<Token> tokens = feature_processor.Tokenize(
      "こんにちはJapanese-ląnguagę text 世界 http://www.google.com/");
  ASSERT_EQ(tokens, expected);
}
612
613 } // namespace
614 } // namespace libtextclassifier
615