1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include <vector>
16
17 #include "tensorflow/core/framework/fake_input.h"
18 #include "tensorflow/core/framework/node_def_builder.h"
19 #include "tensorflow/core/framework/shape_inference.h"
20 #include "tensorflow/core/framework/shape_inference_testutil.h"
21 #include "tensorflow/core/framework/tensor.h"
22 #include "tensorflow/core/framework/tensor_shape.h"
23 #include "tensorflow/core/framework/tensor_testutil.h"
24 #include "tensorflow/core/framework/types.pb.h"
25 #include "tensorflow/core/kernels/ops_testutil.h"
26 #include "tensorflow/core/lib/core/status.h"
27 #include "tensorflow/core/lib/core/status_test_util.h"
28
29 namespace tensorflow {
30 namespace text {
31
32 using tensorflow::FakeInput;
33 using tensorflow::NodeDefBuilder;
34 using tensorflow::Status;
35 using tensorflow::TensorShape;
36
37 class NgramKernelTest : public tensorflow::OpsTestBase {
38 public:
MakeOp(string separator,std::vector<int> ngram_width,string left_pad,string right_pad,int pad_width,bool preserve)39 void MakeOp(string separator, std::vector<int> ngram_width, string left_pad,
40 string right_pad, int pad_width, bool preserve) {
41 TF_ASSERT_OK(NodeDefBuilder("tested_op", "StringNGrams")
42 .Attr("separator", separator)
43 .Attr("ngram_widths", ngram_width)
44 .Attr("left_pad", left_pad)
45 .Attr("right_pad", right_pad)
46 .Attr("pad_width", pad_width)
47 .Attr("preserve_short_sequences", preserve)
48 .Input(FakeInput())
49 .Input(FakeInput())
50 .Finalize(node_def()));
51 TF_ASSERT_OK(InitOp());
52 }
53
assert_string_equal(const std::vector<tstring> & expected,const Tensor & value)54 void assert_string_equal(const std::vector<tstring> &expected,
55 const Tensor &value) {
56 Tensor expected_tensor(allocator(), DT_STRING,
57 TensorShape({static_cast<int64>(expected.size())}));
58 test::FillValues<tstring>(&expected_tensor, expected);
59 test::ExpectTensorEqual<tstring>(expected_tensor, value);
60 }
assert_int64_equal(const std::vector<int64> & expected,const Tensor & value)61 void assert_int64_equal(const std::vector<int64> &expected,
62 const Tensor &value) {
63 Tensor expected_tensor(allocator(), DT_INT64,
64 TensorShape({static_cast<int64>(expected.size())}));
65 test::FillValues<int64>(&expected_tensor, expected);
66 test::ExpectTensorEqual<int64>(expected_tensor, value);
67 }
68 };
69
// Trigrams with pad_width = -1: the expected output carries two pad tokens
// (n - 1) on each side of every row.
TEST_F(NgramKernelTest, TestPaddedTrigrams) {
  MakeOp("|", {3}, "LP", "RP", -1, false);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|LP|a", "LP|a|b", "a|b|c", "b|c|d", "c|d|RP", "d|RP|RP",
      // row 1
      "LP|LP|e", "LP|e|f", "e|f|RP", "f|RP|RP"};
  const std::vector<int64> want_splits = {0, 6, 10};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
87
// Bigrams and trigrams requested together with pad_width = -1. Within each
// row the expected output lists all bigrams first, then all trigrams.
TEST_F(NgramKernelTest, TestPaddedBigramsAndTrigrams) {
  MakeOp("|", {2, 3}, "LP", "RP", -1, false);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0: bigrams, then trigrams
      "LP|a", "a|b", "b|c", "c|d", "d|RP",                         //
      "LP|LP|a", "LP|a|b", "a|b|c", "b|c|d", "c|d|RP", "d|RP|RP",  //
      // row 1: bigrams, then trigrams
      "LP|e", "e|f", "f|RP",  //
      "LP|LP|e", "LP|e|f", "e|f|RP", "f|RP|RP"};
  const std::vector<int64> want_splits = {0, 11, 18};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
106
// Bigrams with pad_width = -1: the expected output carries one pad token on
// each side of every row.
TEST_F(NgramKernelTest, TestPaddedBigrams) {
  MakeOp("|", {2}, "LP", "RP", -1, false);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1
      "LP|e", "e|f", "f|RP"};
  const std::vector<int64> want_splits = {0, 5, 8};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
124
// A pad_width (4) larger than the n-gram width minus one must not yield extra
// padding: the expected bigrams contain at most one pad token each.
TEST_F(NgramKernelTest, TestPaddingIsAtMostNGramSizeMinus1) {
  MakeOp("|", {2}, "LP", "RP", 4, false);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1
      "LP|e", "e|f", "f|RP"};
  const std::vector<int64> want_splits = {0, 5, 8};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
142
// Unigrams and bigrams requested together with pad_width = -1. The expected
// unigrams contain no pad tokens; the bigrams contain one per side.
TEST_F(NgramKernelTest, TestPaddedUnigramAndBigrams) {
  MakeOp("|", {1, 2}, "LP", "RP", -1, false);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0: unigrams, then bigrams
      "a", "b", "c", "d", "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1: unigrams, then bigrams
      "e", "f", "LP|e", "e|f", "f|RP"};
  const std::vector<int64> want_splits = {0, 9, 14};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
160
// Checks n-grams in which a single n-gram contains both left and right
// padding (e.g. "LP|a|RP" for the one-token row).
TEST_F(NgramKernelTest, TestOverlappingPaddedNGrams) {
  MakeOp("|", {3}, "LP", "RP", -1, false);

  // Ragged input, three rows:
  //   row 0 = [a]
  //   row 1 = [b, c, d]
  //   row 2 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|LP|a", "LP|a|RP", "a|RP|RP",
      // row 1
      "LP|LP|b", "LP|b|c", "b|c|d", "c|d|RP", "d|RP|RP",
      // row 2
      "LP|LP|e", "LP|e|f", "e|f|RP", "f|RP|RP"};
  const std::vector<int64> want_splits = {0, 3, 8, 12};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
182
// Same layout as the overlapping padded trigram case, but every input token is
// two characters long, so the joined n-grams are built from multi-character
// pieces.
TEST_F(NgramKernelTest, TestOverlappingPaddedMultiCharNGrams) {
  MakeOp("|", {3}, "LP", "RP", -1, false);
  // Batch items are (note the two-character tokens):
  // 0: "aa"
  // 1: "bb", "cc", "dd"
  // 2: "ee", "ff"
  AddInputFromArray<tstring>(TensorShape({6}),
                             {"aa", "bb", "cc", "dd", "ee", "ff"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  std::vector<tstring> expected_values(  //
      {"LP|LP|aa", "LP|aa|RP", "aa|RP|RP",                          // 0
       "LP|LP|bb", "LP|bb|cc", "bb|cc|dd", "cc|dd|RP", "dd|RP|RP",  // 1
       "LP|LP|ee", "LP|ee|ff", "ee|ff|RP", "ff|RP|RP"});            // 2
  std::vector<int64> expected_splits({0, 3, 8, 12});

  assert_string_equal(expected_values, *GetOutput(0));
  assert_int64_equal(expected_splits, *GetOutput(1));
}
203
// Checks 5-grams over a single-token row, where each n-gram holds more than
// one pad token on at least one side.
TEST_F(NgramKernelTest, TestMultiOverlappingPaddedNGrams) {
  MakeOp("|", {5}, "LP", "RP", -1, false);

  // Ragged input, one row:
  //   row 0 = [a]
  AddInputFromArray<tstring>(TensorShape({1}), {"a"});
  AddInputFromArray<int64>(TensorShape({2}), {0, 1});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      "LP|LP|LP|LP|a",  //
      "LP|LP|LP|a|RP",  //
      "LP|LP|a|RP|RP",  //
      "LP|a|RP|RP|RP",  //
      "a|RP|RP|RP|RP"};
  const std::vector<int64> want_splits = {0, 5};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
222
// Trigrams with empty pad strings and pad_width = 0: the two-token row is
// expected to contribute nothing.
TEST_F(NgramKernelTest, TestUnpaddedTrigrams) {
  MakeOp("|", {3}, "", "", 0, false);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Only row 0 is long enough to form trigrams.
  const std::vector<tstring> want_ngrams = {"a|b|c", "b|c|d"};
  const std::vector<int64> want_splits = {0, 2, 2};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
238
// Unpadded trigrams where the batch contains an empty sequence between two
// non-empty ones. The empty row is expected to yield an empty output row.
TEST_F(NgramKernelTest, TestUnpaddedTrigramsWithEmptySequence) {
  MakeOp("|", {3}, "", "", 0, false);
  // Batch items are (the splits {0, 4, 4, 6} define three rows):
  // 0: "a", "b", "c", "d"
  // 1: (empty)
  // 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 4, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Only item 0 is long enough for a trigram; items 1 and 2 contribute
  // nothing, so the last two output splits repeat.
  std::vector<tstring> expected_values({"a|b|c", "b|c|d"});
  std::vector<int64> expected_splits({0, 2, 2, 2});

  assert_string_equal(expected_values, *GetOutput(0));
  assert_int64_equal(expected_splits, *GetOutput(1));
}
254
// Unpadded trigrams with preserve_short_sequences enabled: the two-token row
// is expected to come back as a single shorter n-gram ("e|f").
TEST_F(NgramKernelTest, TestUnpaddedTrigramsWithPreserveShort) {
  MakeOp("|", {3}, "", "", 0, true);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "a|b|c", "b|c|d",
      // row 1 (preserved short sequence)
      "e|f"};
  const std::vector<int64> want_splits = {0, 2, 3};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
270
// Unpadded trigrams with preserve_short_sequences enabled and an empty
// sequence in the batch. The empty row is expected to stay empty, while the
// two-token row is expected to be preserved as "e|f".
TEST_F(NgramKernelTest, TestUnpaddedTrigramsWithPreserveShortAndEmptySequence) {
  MakeOp("|", {3}, "", "", 0, true);
  // Batch items are (the splits {0, 4, 4, 6} define three rows):
  // 0: "a", "b", "c", "d"
  // 1: (empty)
  // 2: "e", "f"
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 4, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Item 1 has no tokens, so nothing is emitted for it even with
  // preserve_short_sequences set (splits 2 -> 2).
  std::vector<tstring> expected_values({"a|b|c", "b|c|d", "e|f"});
  std::vector<int64> expected_splits({0, 2, 2, 3});

  assert_string_equal(expected_values, *GetOutput(0));
  assert_int64_equal(expected_splits, *GetOutput(1));
}
286
// Widths {4, 3} without padding and with preserve_short_sequences enabled.
// Expected output follows the order of the requested widths (4-grams first).
TEST_F(NgramKernelTest, TestUnpaddedTrigramsAndQuadgramsWithPreserveShort) {
  MakeOp("|", {4, 3}, "", "", 0, true);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0: 4-gram, then trigrams
      "a|b|c|d", "a|b|c", "b|c|d",
      // row 1 (preserved short sequence)
      "e|f"};
  const std::vector<int64> want_splits = {0, 3, 4};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
302
// Widths {2, 3} without padding: the two-token row is expected to produce its
// one bigram and no trigram.
TEST_F(NgramKernelTest, TestUnpaddedBigramsAndTrigrams) {
  MakeOp("|", {2, 3}, "", "", 0, false);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0: bigrams, then trigrams
      "a|b", "b|c", "c|d", "a|b|c", "b|c|d",
      // row 1: bigram only
      "e|f"};
  const std::vector<int64> want_splits = {0, 5, 6};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
319
// Widths {2, 3} without padding and with preserve_short_sequences enabled.
TEST_F(NgramKernelTest, TestUnpaddedBigramsAndTrigramsWithPreserveShort) {
  MakeOp("|", {2, 3}, "", "", 0, true);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Row 1 already yields the regular bigram 'e|f', so the op is expected not
  // to add a separate preserve_short n-gram for it.
  const std::vector<tstring> want_ngrams = {
      // row 0: bigrams, then trigrams
      "a|b", "b|c", "c|d", "a|b|c", "b|c|d",
      // row 1
      "e|f"};
  const std::vector<int64> want_splits = {0, 5, 6};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
338
// Widths {3, 2} (trigrams listed before bigrams) without padding and with
// preserve_short_sequences enabled.
TEST_F(NgramKernelTest, TestUnpaddedTrigramsAndBigramsWithPreserveShort) {
  MakeOp("|", {3, 2}, "", "", 0, true);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Row 1 already yields the regular bigram 'e|f', so the op is expected not
  // to add a separate preserve_short n-gram for it.
  const std::vector<tstring> want_ngrams = {
      // row 0: trigrams, then bigrams
      "a|b|c", "b|c|d", "a|b", "b|c", "c|d",
      // row 1
      "e|f"};
  const std::vector<int64> want_splits = {0, 5, 6};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
357
// Bigrams with empty pad strings and pad_width = 0.
TEST_F(NgramKernelTest, TestUnpaddedBigrams) {
  MakeOp("|", {2}, "", "", 0, false);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "a|b", "b|c", "c|d",
      // row 1
      "e|f"};
  const std::vector<int64> want_splits = {0, 3, 4};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
373
// Unpadded trigrams over three rows of which only one has three tokens.
TEST_F(NgramKernelTest, TestOverlappingUnpaddedNGrams) {
  MakeOp("|", {3}, "", "", 0, false);

  // Ragged input, three rows:
  //   row 0 = [a]
  //   row 1 = [b, c, d]
  //   row 2 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  // Rows 0 and 2 are shorter than three tokens and contribute nothing.
  const std::vector<tstring> want_ngrams = {"b|c|d"};
  const std::vector<int64> want_splits = {0, 0, 1, 1};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
390
// Unpadded 5-grams over rows that are all shorter than five tokens: no
// n-grams are expected and every output split is zero.
TEST_F(NgramKernelTest, TestOverlappingUnpaddedNGramsNoOutput) {
  MakeOp("|", {5}, "", "", 0, false);

  // Ragged input, three rows:
  //   row 0 = [a]
  //   row 1 = [b, c, d]
  //   row 2 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{};
  const std::vector<int64> want_splits = {0, 0, 0, 0};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
407
// Trigrams with pad_width = 1: the expected output carries exactly one pad
// token on each side of every row.
TEST_F(NgramKernelTest, TestSinglyPaddedTrigrams) {
  MakeOp("|", {3}, "LP", "RP", 1, false);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a|b", "a|b|c", "b|c|d", "c|d|RP",
      // row 1
      "LP|e|f", "e|f|RP"};
  const std::vector<int64> want_splits = {0, 4, 6};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
425
// Bigrams with pad_width = 1.
TEST_F(NgramKernelTest, TestSinglyPaddedBigrams) {
  MakeOp("|", {2}, "LP", "RP", 1, false);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a", "a|b", "b|c", "c|d", "d|RP",
      // row 1
      "LP|e", "e|f", "f|RP"};
  const std::vector<int64> want_splits = {0, 5, 8};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
442
// Widths {2, 5} with pad_width = 1. The two-token row is expected to yield
// bigrams only; no 5-gram is expected for it.
TEST_F(NgramKernelTest, TestSinglyPaddedBigramsAnd5grams) {
  MakeOp("|", {2, 5}, "LP", "RP", 1, false);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0: bigrams, then 5-grams
      "LP|a", "a|b", "b|c", "c|d", "d|RP", "LP|a|b|c|d", "a|b|c|d|RP",
      // row 1: bigrams only
      "LP|e", "e|f", "f|RP"};
  const std::vector<int64> want_splits = {0, 7, 10};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
460
// 5-grams with pad_width = 1 and preserve_short_sequences enabled: the
// two-token row is expected to be emitted as one padded, shorter n-gram.
TEST_F(NgramKernelTest, TestSinglyPadded5gramsWithPreserveShort) {
  MakeOp("|", {5}, "LP", "RP", 1, true);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a|b|c|d", "a|b|c|d|RP",
      // row 1 (preserved short sequence, including its padding)
      "LP|e|f|RP"};
  const std::vector<int64> want_splits = {0, 2, 3};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
478
// Trigrams with pad_width = 1 over three rows, including a one-token row whose
// only trigram holds both pad tokens.
TEST_F(NgramKernelTest, TestOverlappingSinglyPaddedNGrams) {
  MakeOp("|", {3}, "LP", "RP", 1, false);

  // Ragged input, three rows:
  //   row 0 = [a]
  //   row 1 = [b, c, d]
  //   row 2 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "LP|a|RP",
      // row 1
      "LP|b|c", "b|c|d", "c|d|RP",
      // row 2
      "LP|e|f", "e|f|RP"};
  const std::vector<int64> want_splits = {0, 1, 4, 6};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
498
// 5-grams with pad_width = 1 over three short rows. Despite the test name,
// one n-gram is expected: the three-token row plus one pad token per side is
// exactly five tokens long. The other two rows are expected to yield nothing.
TEST_F(NgramKernelTest, TestOverlappingSinglyPaddedNGramsNoOutput) {
  MakeOp("|", {5}, "LP", "RP", 1, false);

  // Ragged input, three rows:
  //   row 0 = [a]
  //   row 1 = [b, c, d]
  //   row 2 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({4}), {0, 1, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {"LP|b|c|d|RP"};
  const std::vector<int64> want_splits = {0, 0, 1, 1};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
515
// Unigrams with pad_width = 1: the expected output is the input tokens
// unchanged, with no pad tokens, and the splits match the input splits.
TEST_F(NgramKernelTest, TestSinglyPaddedUnigrams) {
  MakeOp("|", {1}, "LP", "RP", 1, false);

  // Ragged input, two rows:
  //   row 0 = [a, b, c, d]
  //   row 1 = [e, f]
  AddInputFromArray<tstring>(TensorShape({6}), {"a", "b", "c", "d", "e", "f"});
  AddInputFromArray<int64>(TensorShape({3}), {0, 4, 6});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams = {
      // row 0
      "a", "b", "c", "d",
      // row 1
      "e", "f"};
  const std::vector<int64> want_splits = {0, 4, 6};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
531
// Zero-length data and splits inputs: the kernel is expected to run
// successfully and produce two empty outputs.
TEST_F(NgramKernelTest, TestEmptyInput) {
  MakeOp("|", {1}, "LP", "RP", 3, false);

  // Both inputs are empty rank-1 tensors.
  AddInputFromArray<tstring>(TensorShape({0}), {});
  AddInputFromArray<int64>(TensorShape({0}), {});
  TF_ASSERT_OK(RunOpKernel());

  const std::vector<tstring> want_ngrams{};
  const std::vector<int64> want_splits{};

  const Tensor &ngrams_out = *GetOutput(0);
  const Tensor &splits_out = *GetOutput(1);
  assert_string_equal(want_ngrams, ngrams_out);
  assert_int64_equal(want_splits, splits_out);
}
544
// Exercises shape inference for "StringNGrams": both outputs are rank-1, the
// second output takes the shape of the second input once that shape is known,
// and a rank-0 input on either side is rejected.
TEST_F(NgramKernelTest, ShapeFn) {
  static constexpr char kRankZeroError[] = "Shape must be rank 1 but is rank 0";
  ShapeInferenceTestOp ngrams_op("StringNGrams");

  // Accepted input shapes, from fully unknown to fully known.
  INFER_OK(ngrams_op, "?;?", "[?];[?]");
  INFER_OK(ngrams_op, "[1];?", "[?];[?]");
  INFER_OK(ngrams_op, "[1];[2]", "[?];in1");

  // A scalar in either input position must fail the rank check.
  INFER_ERROR(kRankZeroError, ngrams_op, "[];?");
  INFER_ERROR(kRankZeroError, ngrams_op, "?;[]");
}
553
554 } // namespace text
555 } // namespace tensorflow
556