/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <functional>
#include <memory>
#include <random>
#include <vector>

#include "tensorflow/cc/ops/array_ops.h"
#include "tensorflow/cc/ops/const_op.h"
#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/fake_input.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_testutil.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test_benchmark.h"

namespace tensorflow {
namespace {

class DequantizeOpTest : public OpsTestBase {
 protected:
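  // Eigen-based reference for MIN_COMBINED dequantization: the quantized
  // value is shifted by half_range (recentering signed types onto
  // [0, 2^bits - 1]) and then mapped linearly onto [min_range, max_range].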
  template <typename T>
  void ComputeDequantizeMinCombinedUsingEigen(const Tensor& input,
                                              float min_range, float max_range,
                                              Tensor* output) {
    float half_range =
        !std::is_signed<T>::value
            ? 0.0f
            : (static_cast<float>(std::numeric_limits<T>::max()) -
               std::numeric_limits<T>::min() + 1) /
                  2.0f;
    const float scale_factor =
        (max_range - min_range) /
        (static_cast<float>(std::numeric_limits<T>::max()) -
         std::numeric_limits<T>::min());
    output->flat<float>() =
        ((input.flat<T>().template cast<int>().template cast<float>() +
          half_range) *
         scale_factor) +
        min_range;
  }

  // Runs the Dequantize op in MIN_COMBINED mode over the full range of T and
  // compares the result against the Eigen-based reference above, so that a
  // non-Eigen implementation can be checked for equivalence.
  template <typename T>
  void RunDequantizeMinCombinedTest(float min_range, float max_range,
                                    const string& op_name) {
    TF_ASSERT_OK(NodeDefBuilder("dequantize_op", op_name)
                     .Input(FakeInput(DataTypeToEnum<T>::v()))
                     .Input(FakeInput(DT_FLOAT))
                     .Input(FakeInput(DT_FLOAT))
                     .Attr("T", DataTypeToEnum<T>::v())
                     .Attr("mode", "MIN_COMBINED")
                     .Finalize(node_def()));
    TF_ASSERT_OK(InitOp());

    std::vector<T> input;
    for (int64_t i = std::numeric_limits<T>::min();
         i < std::numeric_limits<T>::max(); ++i) {
      input.push_back(static_cast<T>(i));
    }
    TensorShape shape({static_cast<int64_t>(input.size())});
    AddInputFromArray<T>(shape, input);
    AddInputFromArray<float>(TensorShape({}), {min_range});
    AddInputFromArray<float>(TensorShape({}), {max_range});
    TF_ASSERT_OK(RunOpKernel());
    Tensor expected(allocator(), DT_FLOAT, shape);
    ComputeDequantizeMinCombinedUsingEigen<T>(GetInput(0), min_range,
                                              max_range, &expected);
    test::ExpectTensorEqual<float>(expected, *GetOutput(0));
  }

  // Same as RunDequantizeMinCombinedTest, but with the op's output dtype set
  // to bfloat16; the expected values are the float reference results cast to
  // bfloat16.
94 template <typename T>
RunDequantizeBfloat16MinCombinedTest(float min_range,float max_range)95 void RunDequantizeBfloat16MinCombinedTest(float min_range, float max_range) {
96 TF_ASSERT_OK(NodeDefBuilder("dequantize_op_bfloat16", "Dequantize")
97 .Input(FakeInput(DataTypeToEnum<T>::v()))
98 .Input(FakeInput(DT_FLOAT))
99 .Input(FakeInput(DT_FLOAT))
100 .Attr("T", DataTypeToEnum<T>::v())
101 .Attr("mode", "MIN_COMBINED")
102 .Attr("dtype", DT_BFLOAT16)
103 .Finalize(node_def()));
104 TF_ASSERT_OK(InitOp());
105
106 std::vector<T> input;
107 for (int64_t i = std::numeric_limits<T>::min();
108 i < std::numeric_limits<T>::max(); ++i) {
109 input.push_back(static_cast<T>(i));
110 }
111 TensorShape shape({static_cast<int64>(input.size())});
112 AddInputFromArray<T>(shape, input);
113 AddInputFromArray<float>(TensorShape({}), {min_range});
114 AddInputFromArray<float>(TensorShape({}), {max_range});
115 TF_ASSERT_OK(RunOpKernel());
116
117 Tensor expected_float32(allocator(), DT_FLOAT, shape);
118 ComputeDequantizeMinCombinedUsingEigen<T>(GetInput(0), min_range, max_range,
119 &expected_float32);
120 Tensor expected(allocator(), DT_BFLOAT16, shape);
121 expected.flat<bfloat16>() = expected_float32.flat<float>().cast<bfloat16>();
122
123 test::ExpectTensorEqual<bfloat16>(expected, *GetOutput(0));
124 }
125
  // Creates a tensor with the specified dims, using values chosen randomly
  // from data, multiplied by (1 + index) along the axis dimension.
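  // For example, with dims={2, 3}, axis=0, and data={1}, the result is
  // {1, 1, 1, 2, 2, 2}: the second slice along axis 0 is scaled by 2.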
  template <typename T>
  std::vector<T> ScalePerSliceAlongAxis(std::vector<int64_t> dims, int axis,
                                        const std::vector<T>& data) {
    uint32 seed = 123;
    std::minstd_rand rng(seed);
    int64_t out_size = 1;
    for (int dim : dims) {
      out_size *= dim;
    }
    int minor_size = 1;
    for (int i = axis + 1; i < dims.size(); ++i) {
      minor_size *= dims[i];
    }
    std::vector<T> out(out_size);
    int num_slices = (axis == -1) ? 1 : dims[axis];
    for (int out_idx = 0; out_idx < out_size; ++out_idx) {
      int in_idx = rng() % data.size();
      T multiplier = ((out_idx / minor_size) % num_slices) + 1;
      out[out_idx] = data[in_idx] * multiplier;
    }
    return out;
  }

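  // Runs Dequantize in SCALED mode with a per-slice range along `axis` and
  // checks the output against `expected`, scaled per slice. The input values
  // are left unscaled (axis = -1) while both the ranges and the expected
  // outputs grow with the slice index; the fixed RNG seed in
  // ScalePerSliceAlongAxis keeps the random picks aligned between the two
  // calls.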
  template <typename T>
  void RunDequantizeScaledTest(float min_range, float max_range, int axis,
                               const std::vector<T>& values,
                               const std::vector<float>& expected) {
    const std::vector<int64_t> dims = {2, 3, 4, 5};
    int num_slices = (axis == -1) ? 1 : dims[axis];
    TF_ASSERT_OK(NodeDefBuilder("dequantize_op", "Dequantize")
                     .Input(FakeInput(DataTypeToEnum<T>::v()))
                     .Input(FakeInput(DT_FLOAT))
                     .Input(FakeInput(DT_FLOAT))
                     .Attr("T", DataTypeToEnum<T>::v())
                     .Attr("mode", "SCALED")
                     .Attr("axis", axis)
                     .Finalize(node_def()));
    TF_ASSERT_OK(InitOp());

    AddInputFromArray<T>(TensorShape(dims),
                         ScalePerSliceAlongAxis(dims, -1, values));
    std::vector<float> min_ranges(num_slices), max_ranges(num_slices);
    for (int slice_idx = 0; slice_idx < num_slices; ++slice_idx) {
      min_ranges[slice_idx] = (slice_idx + 1) * min_range;
      max_ranges[slice_idx] = (slice_idx + 1) * max_range;
    }
    AddInputFromArray<float>(TensorShape({num_slices}), min_ranges);
    AddInputFromArray<float>(TensorShape({num_slices}), max_ranges);
    TF_ASSERT_OK(RunOpKernel());

    Tensor expected_tensor(allocator(), DT_FLOAT, TensorShape(dims));
    test::FillValues<float>(&expected_tensor,
                            ScalePerSliceAlongAxis(dims, axis, expected));
    test::ExpectClose(expected_tensor, *GetOutput(0));
  }
};

struct ParameterizedDequantizeOpTest
    : public OpsTestBase,
      public ::testing::WithParamInterface<int> {};

TEST_F(DequantizeOpTest, DequantizeMinCombinedQuint8) {
  RunDequantizeMinCombinedTest<quint8>(0, 255.0f, "Dequantize");
}
TEST_F(DequantizeOpTest, DequantizeMinCombinedQint8) {
  RunDequantizeMinCombinedTest<qint8>(0, 255.0f, "Dequantize");
}
TEST_F(DequantizeOpTest, DequantizeMinCombinedQint16) {
  RunDequantizeMinCombinedTest<qint16>(0, 255.0f, "Dequantize");
}
TEST_F(DequantizeOpTest, DequantizeMinCombinedQuint16) {
  RunDequantizeMinCombinedTest<quint16>(0, 255.0f, "Dequantize");
}

TEST_F(DequantizeOpTest, DequantizeBfloat16MinCombinedQuint8) {
  RunDequantizeBfloat16MinCombinedTest<quint8>(0, 255.0f);
}
TEST_F(DequantizeOpTest, DequantizeBfloat16MinCombinedQint8) {
  RunDequantizeBfloat16MinCombinedTest<qint8>(0, 255.0f);
}
TEST_F(DequantizeOpTest, DequantizeBfloat16MinCombinedQint16) {
  RunDequantizeBfloat16MinCombinedTest<qint16>(0, 255.0f);
}
TEST_F(DequantizeOpTest, DequantizeBfloat16MinCombinedQuint16) {
  RunDequantizeBfloat16MinCombinedTest<quint16>(0, 255.0f);
}

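// In SCALED mode the output is input * scale_factor, with the scale derived
// from the range; a negative min_range is ignored for unsigned types (see
// DequantizeScaledQuint8CheckIgnoresNegative). For example, for qint8 with
// range [-1, 300] the scale is 300/127, so 42 * 300 / 127 ~= 99.2126.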
TEST_F(DequantizeOpTest, DequantizeScaledQuint8Zero) {
  RunDequantizeScaledTest<quint8>(-255.0f, 127.0f, -1, {0}, {0.0});
}
TEST_F(DequantizeOpTest, DequantizeScaledQuint8CheckIgnoresNegative) {
  RunDequantizeScaledTest<quint8>(-512.0f, 255.0f, -1, {255}, {255.0});
}
TEST_F(DequantizeOpTest, DequantizeScaledQuint8ScaleDown) {
  RunDequantizeScaledTest<quint8>(-1.0f, 2.0f, -1, {255}, {2.0});
}
TEST_F(DequantizeOpTest, DequantizeScaledQuint8ScaleUp) {
  RunDequantizeScaledTest<quint8>(200.0f, 400.0f, -1, {255}, {400.0});
}

TEST_F(DequantizeOpTest, DequantizeScaledQint8Zero) {
  RunDequantizeScaledTest<qint8>(-255.0f, 127.0f, -1, {0}, {0.0});
}
TEST_F(DequantizeOpTest, DequantizeScaledQint8ScaleIdentity) {
  RunDequantizeScaledTest<qint8>(-10.0f, 127.0f, -1, {-127}, {-127.0});
}
TEST_F(DequantizeOpTest, DequantizeScaledQint8ScaleDown) {
  RunDequantizeScaledTest<qint8>(-2.0f, 1.0f, -1, {-128}, {-2.0});
}
TEST_F(DequantizeOpTest, DequantizeScaledQint8ScaleUp) {
  RunDequantizeScaledTest<qint8>(-1.0f, 300.0f, -1, {42}, {99.212601});
}
TEST_F(DequantizeOpTest, DequantizeScaledQint8Axis1) {
  RunDequantizeScaledTest<qint8>(-12.8f, 12.7f, 1, {-20, -10, 0, 1, 10, 20},
                                 {-2.0, -1.0, 0.0, 0.1, 1.0, 2.0});
}
TEST_F(DequantizeOpTest, DequantizeScaledQint8Axis3) {
  RunDequantizeScaledTest<qint8>(-12.8f, 12.7f, 3, {-20, -10, 0, 1, 10, 20},
                                 {-2.0, -1.0, 0.0, 0.1, 1.0, 2.0});
}

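// Benchmarks MIN_COMBINED dequantization of 1500 * 250 values on CPU. Bytes
// processed counts both the quantized input and the float output.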
template <typename T>
static void BM_DequantizeMinCombinedCpu(::testing::benchmark::State& state) {
  auto root = Scope::NewRootScope().ExitOnError();
  const int64_t num_values = 1500 * 250;
  std::vector<T> inputs;

  inputs.reserve(num_values);
  for (int i = 0; i < num_values; ++i) inputs.push_back(i);

  ops::Dequantize(root, test::AsTensor<T>(inputs), test::AsScalar<float>(-1.5f),
                  test::AsScalar<float>(20.5f),
                  ops::Dequantize::Attrs().Mode("MIN_COMBINED"));
  TF_CHECK_OK(root.status());
  Graph* g = new Graph(OpRegistry::Global());
  TF_CHECK_OK(root.ToGraph(g));

  test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
  state.SetBytesProcessed(state.iterations() * num_values *
                          (sizeof(float) + sizeof(T)));
  state.SetItemsProcessed(state.iterations());
}

void BM_DequantizeMinCombinedCpuQuint16(::testing::benchmark::State& state) {
  BM_DequantizeMinCombinedCpu<quint16>(state);
}

void BM_DequantizeMinCombinedCpuQint16(::testing::benchmark::State& state) {
  BM_DequantizeMinCombinedCpu<qint16>(state);
}

void BM_DequantizeMinCombinedCpuQuint8(::testing::benchmark::State& state) {
  BM_DequantizeMinCombinedCpu<quint8>(state);
}

void BM_DequantizeMinCombinedCpuQint8(::testing::benchmark::State& state) {
  BM_DequantizeMinCombinedCpu<qint8>(state);
}

BENCHMARK(BM_DequantizeMinCombinedCpuQuint16);
BENCHMARK(BM_DequantizeMinCombinedCpuQint16);
BENCHMARK(BM_DequantizeMinCombinedCpuQuint8);
BENCHMARK(BM_DequantizeMinCombinedCpuQint8);

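// Same benchmark as above, but with the op's output dtype set to bfloat16.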
template <typename T>
static void BM_DequantizeBfloat16MinCombinedCpu(
    ::testing::benchmark::State& state) {
  auto root = Scope::NewRootScope().ExitOnError();
  const int64_t num_values = 1500 * 250;
  std::vector<T> inputs;

  inputs.reserve(num_values);
  for (int i = 0; i < num_values; ++i) inputs.push_back(i);

  ops::Dequantize(root, test::AsTensor<T>(inputs), test::AsScalar<float>(-1.5f),
                  test::AsScalar<float>(20.5f),
                  ops::Dequantize::Attrs().Dtype(DT_BFLOAT16));
  TF_CHECK_OK(root.status());
  Graph* g = new Graph(OpRegistry::Global());
  TF_CHECK_OK(root.ToGraph(g));

  test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
  state.SetBytesProcessed(state.iterations() * num_values *
                          (sizeof(bfloat16) + sizeof(T)));
  state.SetItemsProcessed(state.iterations());
}

void BM_DequantizeBfloat16MinCombinedCpuQuint16(
    ::testing::benchmark::State& state) {
  BM_DequantizeBfloat16MinCombinedCpu<quint16>(state);
}

void BM_DequantizeBfloat16MinCombinedCpuQint16(
    ::testing::benchmark::State& state) {
  BM_DequantizeBfloat16MinCombinedCpu<qint16>(state);
}

void BM_DequantizeBfloat16MinCombinedCpuQuint8(
    ::testing::benchmark::State& state) {
  BM_DequantizeBfloat16MinCombinedCpu<quint8>(state);
}

void BM_DequantizeBfloat16MinCombinedCpuQint8(
    ::testing::benchmark::State& state) {
  BM_DequantizeBfloat16MinCombinedCpu<qint8>(state);
}

BENCHMARK(BM_DequantizeBfloat16MinCombinedCpuQuint16);
BENCHMARK(BM_DequantizeBfloat16MinCombinedCpuQint16);
BENCHMARK(BM_DequantizeBfloat16MinCombinedCpuQuint8);
BENCHMARK(BM_DequantizeBfloat16MinCombinedCpuQint8);

}  // namespace
}  // namespace tensorflow