1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
// Implements a quantized eight-bit version of the element-wise multiply
// operation (QuantizedMul).
17
18 #define EIGEN_USE_THREADS
19
20 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
21 #define USE_NEON
22 #include <arm_neon.h>
23 #endif
24
25 #include "tensorflow/core/framework/op_kernel.h"
26 #include "tensorflow/core/framework/tensor.h"
27 #include "tensorflow/core/kernels/meta_support.h"
28 #include "tensorflow/core/kernels/quantization_utils.h"
29 #include "tensorflow/core/lib/core/errors.h"
30 #include "tensorflow/core/util/bcast.h"
31
32 namespace tensorflow {
33 namespace {
34
35 template <class T, class Toutput>
ScalarMultiply(OpKernelContext * context,const T * full_input,int32 full_input_offset,int64 num_elements,T scalar_input,int32 scalar_input_offset,Toutput * output)36 void ScalarMultiply(OpKernelContext* context, const T* full_input,
37 int32 full_input_offset, int64 num_elements, T scalar_input,
38 int32 scalar_input_offset, Toutput* output) {
39 const int32 scalar_minus_offset =
40 static_cast<int32>(scalar_input) - scalar_input_offset;
41 for (int i = 0; i < num_elements; ++i) {
42 output[i] = (static_cast<int32>(full_input[i]) - full_input_offset) *
43 scalar_minus_offset;
44 }
45 }
46
47 #ifdef USE_NEON
48
49 template <>
ScalarMultiply(OpKernelContext * context,const quint8 * full_input,int32 full_input_offset,int64 num_elements,quint8 scalar_input,int32 scalar_input_offset,qint32 * output)50 void ScalarMultiply<quint8, qint32>(OpKernelContext* context,
51 const quint8* full_input,
52 int32 full_input_offset, int64 num_elements,
53 quint8 scalar_input,
54 int32 scalar_input_offset, qint32* output) {
55 const int16 scalar_minus_offset =
56 static_cast<int16>(scalar_input) - scalar_input_offset;
57 const int16x4_t scalar_minus_offset_16x4 = vmov_n_s16(scalar_minus_offset);
58 const uint8x8_t full_input_offset_8x8 = vmov_n_u8(full_input_offset);
59 // Go through the results in 16-element chunks for NEON acceleration.
60 int i;
61 for (i = 0; i < (num_elements - 15); i += 16) {
62 // Load the tensor inputs.
63 const uint8* full_input_ptr = &(full_input->value) + i;
64 const uint8x16_t full_input_8x16 = vld1q_u8(full_input_ptr);
65
66 // Break into two sets of vectors so we can do further calculations
67 // easily.
68 const uint8x8_t full_input_high_8x8 = vget_high_u8(full_input_8x16);
69 const uint8x8_t full_input_low_8x8 = vget_low_u8(full_input_8x16);
70
71 // Subtract off the offset value to get 16-bit results.
72 const int16x8_t full_input_minus_offset_high_16x8 = vreinterpretq_s16_u16(
73 vsubl_u8(full_input_high_8x8, full_input_offset_8x8));
74 const int16x8_t full_input_minus_offset_low_16x8 = vreinterpretq_s16_u16(
75 vsubl_u8(full_input_low_8x8, full_input_offset_8x8));
76
77 // We have to work with 4-wide vectors, so extract them.
78 const int16x4_t x_high_high_16x4 =
79 vget_high_s16(full_input_minus_offset_high_16x8);
80 const int16x4_t x_high_low_16x4 =
81 vget_low_s16(full_input_minus_offset_high_16x8);
82 const int16x4_t x_low_high_16x4 =
83 vget_high_s16(full_input_minus_offset_low_16x8);
84 const int16x4_t x_low_low_16x4 =
85 vget_low_s16(full_input_minus_offset_low_16x8);
86
87 // Perform the multiplication.
88 const int32x4_t z_high_high_32x4 =
89 vmull_s16(x_high_high_16x4, scalar_minus_offset_16x4);
90 const int32x4_t z_high_low_32x4 =
91 vmull_s16(x_high_low_16x4, scalar_minus_offset_16x4);
92 const int32x4_t z_low_high_32x4 =
93 vmull_s16(x_low_high_16x4, scalar_minus_offset_16x4);
94 const int32x4_t z_low_low_32x4 =
95 vmull_s16(x_low_low_16x4, scalar_minus_offset_16x4);
96
97 // Write out the results.
98 int32* output_ptr = &(output->value) + i;
99 vst1q_s32(output_ptr + 0, z_low_low_32x4);
100 vst1q_s32(output_ptr + 4, z_low_high_32x4);
101 vst1q_s32(output_ptr + 8, z_high_low_32x4);
102 vst1q_s32(output_ptr + 12, z_high_high_32x4);
103 }
104 // Finish up any remaining elements that weren't a multiple of 16.
105 for (; i < num_elements; ++i) {
106 output[i] = (static_cast<int32>(full_input[i]) - full_input_offset) *
107 scalar_minus_offset;
108 }
109 }
110 #endif // USE_NEON
111
112 template <class T, class Toutput>
VectorMultiply(OpKernelContext * context,const T * x_data,int32 offset_x,const T * y_data,int32 offset_y,int64 num_elements,Toutput * output)113 void VectorMultiply(OpKernelContext* context, const T* x_data, int32 offset_x,
114 const T* y_data, int32 offset_y, int64 num_elements,
115 Toutput* output) {
116 for (int i = 0; i < num_elements; ++i) {
117 output[i] = (static_cast<int32>(x_data[i]) - offset_x) *
118 (static_cast<int32>(y_data[i]) - offset_y);
119 }
120 }
121
122 #ifdef USE_NEON
123 template <>
VectorMultiply(OpKernelContext * context,const quint8 * x_data,int32 offset_x,const quint8 * y_data,int32 offset_y,int64 num_elements,qint32 * output)124 void VectorMultiply<quint8, qint32>(OpKernelContext* context,
125 const quint8* x_data, int32 offset_x,
126 const quint8* y_data, int32 offset_y,
127 int64 num_elements, qint32* output) {
128 const uint8x8_t offset_x_8x8 = vmov_n_u8(offset_x);
129 const uint8x8_t offset_y_8x8 = vmov_n_u8(offset_y);
130 int i;
131 // Go through the results in 16-element chunks for NEON acceleration.
132 for (i = 0; i < (num_elements - 15); i += 16) {
133 // Load the vector inputs.
134 const uint8* x_data_ptr = &(x_data->value) + i;
135 const uint8x16_t x_8x16 = vld1q_u8(x_data_ptr);
136 const uint8* y_data_ptr = &(y_data->value) + i;
137 const uint8x16_t y_8x16 = vld1q_u8(y_data_ptr);
138
139 // Break into two sets of vectors so we can do further calculations easily.
140 const uint8x8_t x_high_8x8 = vget_high_u8(x_8x16);
141 const uint8x8_t x_low_8x8 = vget_low_u8(x_8x16);
142 const uint8x8_t y_high_8x8 = vget_high_u8(y_8x16);
143 const uint8x8_t y_low_8x8 = vget_low_u8(y_8x16);
144
145 // Subtract off the offset values to get 16-bit results.
146 const int16x8_t x_minus_offset_high_16x8 =
147 vreinterpretq_s16_u16(vsubl_u8(x_high_8x8, offset_x_8x8));
148 const int16x8_t x_minus_offset_low_16x8 =
149 vreinterpretq_s16_u16(vsubl_u8(x_low_8x8, offset_x_8x8));
150 const int16x8_t y_minus_offset_high_16x8 =
151 vreinterpretq_s16_u16(vsubl_u8(y_high_8x8, offset_y_8x8));
152 const int16x8_t y_minus_offset_low_16x8 =
153 vreinterpretq_s16_u16(vsubl_u8(y_low_8x8, offset_y_8x8));
154
155 // We have to work with 4-wide vectors, so extract them.
156 const int16x4_t x_high_high_16x4 = vget_high_s16(x_minus_offset_high_16x8);
157 const int16x4_t x_high_low_16x4 = vget_low_s16(x_minus_offset_high_16x8);
158 const int16x4_t x_low_high_16x4 = vget_high_s16(x_minus_offset_low_16x8);
159 const int16x4_t x_low_low_16x4 = vget_low_s16(x_minus_offset_low_16x8);
160 const int16x4_t y_high_high_16x4 = vget_high_s16(y_minus_offset_high_16x8);
161 const int16x4_t y_high_low_16x4 = vget_low_s16(y_minus_offset_high_16x8);
162 const int16x4_t y_low_high_16x4 = vget_high_s16(y_minus_offset_low_16x8);
163 const int16x4_t y_low_low_16x4 = vget_low_s16(y_minus_offset_low_16x8);
164
165 // Perform the multiplication.
166 const int32x4_t z_high_high_32x4 =
167 vmull_s16(x_high_high_16x4, y_high_high_16x4);
168 const int32x4_t z_high_low_32x4 =
169 vmull_s16(x_high_low_16x4, y_high_low_16x4);
170 const int32x4_t z_low_high_32x4 =
171 vmull_s16(x_low_high_16x4, y_low_high_16x4);
172 const int32x4_t z_low_low_32x4 = vmull_s16(x_low_low_16x4, y_low_low_16x4);
173
174 // Write out the results.
175 int32* output_ptr = &(output->value) + i;
176 vst1q_s32(output_ptr + 0, z_low_low_32x4);
177 vst1q_s32(output_ptr + 4, z_low_high_32x4);
178 vst1q_s32(output_ptr + 8, z_high_low_32x4);
179 vst1q_s32(output_ptr + 12, z_high_high_32x4);
180 }
181 for (; i < num_elements; ++i) {
182 output[i] = (static_cast<int32>(x_data[i]) - offset_x) *
183 (static_cast<int32>(y_data[i]) - offset_y);
184 }
185 }
186 #endif // USE_NEON
187
188 template <class T, class Toutput>
VectorTensorMultiply(const T * vector_data,int32 vector_offset,int64 vector_num_elements,const T * tensor_data,int32 tensor_offset,int64 tensor_num_elements,Toutput * output)189 void VectorTensorMultiply(const T* vector_data, int32 vector_offset,
190 int64 vector_num_elements, const T* tensor_data,
191 int32 tensor_offset, int64 tensor_num_elements,
192 Toutput* output) {
193 for (int i = 0; i < tensor_num_elements; ++i) {
194 const int64 vector_i = i % vector_num_elements;
195 output[i] = (static_cast<int32>(vector_data[vector_i]) - vector_offset) *
196 (static_cast<int32>(tensor_data[i]) - tensor_offset);
197 }
198 }
199
200 #ifdef USE_NEON
201 template <>
VectorTensorMultiply(const quint8 * vector_data,int32 vector_offset,int64 vector_num_elements,const quint8 * tensor_data,int32 tensor_offset,int64 tensor_num_elements,qint32 * output)202 void VectorTensorMultiply<quint8, qint32>(
203 const quint8* vector_data, int32 vector_offset, int64 vector_num_elements,
204 const quint8* tensor_data, int32 tensor_offset, int64 tensor_num_elements,
205 qint32* output) {
206 const uint8x8_t offset_x_8x8 = vmov_n_u8(vector_offset);
207 const uint8x8_t offset_y_8x8 = vmov_n_u8(tensor_offset);
208 CHECK_EQ(0, tensor_num_elements % vector_num_elements);
209 for (int base_i = 0; base_i < tensor_num_elements;
210 base_i += vector_num_elements) {
211 int i = base_i;
212 const int end_i = base_i + vector_num_elements;
213 // Go through the results in 16-element chunks for NEON acceleration.
214 int vector_i;
215 for (vector_i = 0; vector_i < (vector_num_elements - 15);
216 vector_i += 16, i += 16) {
217 // Load the vector inputs.
218 const uint8* x_data_ptr = &(vector_data->value) + vector_i;
219 const uint8x16_t x_8x16 = vld1q_u8(x_data_ptr);
220 const uint8* y_data_ptr = &(tensor_data->value) + i;
221 const uint8x16_t y_8x16 = vld1q_u8(y_data_ptr);
222
223 // Break into two sets of vectors so we can do further calculations
224 // easily.
225 const uint8x8_t x_high_8x8 = vget_high_u8(x_8x16);
226 const uint8x8_t x_low_8x8 = vget_low_u8(x_8x16);
227 const uint8x8_t y_high_8x8 = vget_high_u8(y_8x16);
228 const uint8x8_t y_low_8x8 = vget_low_u8(y_8x16);
229
230 // Subtract off the offset values to get 16-bit results.
231 const int16x8_t x_minus_offset_high_16x8 =
232 vreinterpretq_s16_u16(vsubl_u8(x_high_8x8, offset_x_8x8));
233 const int16x8_t x_minus_offset_low_16x8 =
234 vreinterpretq_s16_u16(vsubl_u8(x_low_8x8, offset_x_8x8));
235 const int16x8_t y_minus_offset_high_16x8 =
236 vreinterpretq_s16_u16(vsubl_u8(y_high_8x8, offset_y_8x8));
237 const int16x8_t y_minus_offset_low_16x8 =
238 vreinterpretq_s16_u16(vsubl_u8(y_low_8x8, offset_y_8x8));
239
240 // We have to work with 4-wide vectors, so extract them.
241 const int16x4_t x_high_high_16x4 =
242 vget_high_s16(x_minus_offset_high_16x8);
243 const int16x4_t x_high_low_16x4 = vget_low_s16(x_minus_offset_high_16x8);
244 const int16x4_t x_low_high_16x4 = vget_high_s16(x_minus_offset_low_16x8);
245 const int16x4_t x_low_low_16x4 = vget_low_s16(x_minus_offset_low_16x8);
246 const int16x4_t y_high_high_16x4 =
247 vget_high_s16(y_minus_offset_high_16x8);
248 const int16x4_t y_high_low_16x4 = vget_low_s16(y_minus_offset_high_16x8);
249 const int16x4_t y_low_high_16x4 = vget_high_s16(y_minus_offset_low_16x8);
250 const int16x4_t y_low_low_16x4 = vget_low_s16(y_minus_offset_low_16x8);
251
252 // Perform the multiplication.
253 const int32x4_t z_high_high_32x4 =
254 vmull_s16(x_high_high_16x4, y_high_high_16x4);
255 const int32x4_t z_high_low_32x4 =
256 vmull_s16(x_high_low_16x4, y_high_low_16x4);
257 const int32x4_t z_low_high_32x4 =
258 vmull_s16(x_low_high_16x4, y_low_high_16x4);
259 const int32x4_t z_low_low_32x4 =
260 vmull_s16(x_low_low_16x4, y_low_low_16x4);
261
262 // Write out the results.
263 int32* output_ptr = &(output->value) + i;
264 vst1q_s32(output_ptr + 0, z_low_low_32x4);
265 vst1q_s32(output_ptr + 4, z_low_high_32x4);
266 vst1q_s32(output_ptr + 8, z_high_low_32x4);
267 vst1q_s32(output_ptr + 12, z_high_high_32x4);
268 }
269 for (; i < end_i; ++i, ++vector_i) {
270 output[i] = (static_cast<int32>(vector_data[vector_i]) - vector_offset) *
271 (static_cast<int32>(tensor_data[i]) - tensor_offset);
272 }
273 }
274 }
275 #endif // USE_NEON
276
277 } // namespace
278
279 template <class T, class Toutput>
280 class QuantizedMulOp : public OpKernel {
281 public:
QuantizedMulOp(OpKernelConstruction * context)282 explicit QuantizedMulOp(OpKernelConstruction* context) : OpKernel(context) {}
283
Compute(OpKernelContext * context)284 void Compute(OpKernelContext* context) override {
285 const Tensor& x = context->input(0);
286 const Tensor& y = context->input(1);
287 const float min_x = context->input(2).flat<float>()(0);
288 const float max_x = context->input(3).flat<float>()(0);
289 const float min_y = context->input(4).flat<float>()(0);
290 const float max_y = context->input(5).flat<float>()(0);
291
292 BCast bcast(BCast::FromShape(x.shape()), BCast::FromShape(y.shape()));
293 if (!bcast.IsValid()) {
294 context->SetStatus(errors::InvalidArgument(
295 "Incompatible shapes: ", x.shape().DebugString(), " vs. ",
296 y.shape().DebugString()));
297 return;
298 }
299 Tensor* z;
300 OP_REQUIRES_OK(context, context->allocate_output(
301 0, BCast::ToShape(bcast.output_shape()), &z));
302
303 // Make sure that we have valid quantization ranges for the input buffers.
304 // If the difference between the min and max is negative or zero, it makes
305 // it hard to do meaningful intermediate operations on the values.
306 OP_REQUIRES(context, (max_x > min_x),
307 errors::InvalidArgument("max_x must be larger than min_a."));
308 OP_REQUIRES(context, (max_y > min_y),
309 errors::InvalidArgument("max_x must be larger than min_b."));
310 const int32 offset_x = FloatToQuantizedUnclamped<T>(0.0f, min_x, max_x);
311 const int32 offset_y = FloatToQuantizedUnclamped<T>(0.0f, min_y, max_y);
312 const T* x_data = x.flat<T>().data();
313 const T* y_data = y.flat<T>().data();
314 Toutput* z_data = z->flat<Toutput>().data();
315
316 const int ndims = bcast.x_reshape().size();
317 if (ndims <= 1) {
318 if (x.NumElements() == 1) {
319 ScalarMultiply<T, Toutput>(context, y_data, offset_y, y.NumElements(),
320 x_data[0], offset_x, z_data);
321 } else if (y.NumElements() == 1) {
322 ScalarMultiply<T, Toutput>(context, x_data, offset_x, x.NumElements(),
323 y_data[0], offset_y, z_data);
324 } else {
325 VectorMultiply<T, Toutput>(context, x_data, offset_x, y_data, offset_y,
326 x.NumElements(), z_data);
327 }
328 } else if (ndims == 2) {
329 const T* vector_data;
330 int64 vector_num_elements;
331 int32 vector_offset;
332 const T* tensor_data;
333 int64 tensor_num_elements;
334 int32 tensor_offset;
335 if (x.NumElements() < y.NumElements()) {
336 vector_data = x_data;
337 vector_num_elements = x.NumElements();
338 vector_offset = offset_x;
339 tensor_data = y_data;
340 tensor_num_elements = y.NumElements();
341 tensor_offset = offset_y;
342 } else {
343 vector_data = y_data;
344 vector_num_elements = y.NumElements();
345 vector_offset = offset_y;
346 tensor_data = x_data;
347 tensor_num_elements = x.NumElements();
348 tensor_offset = offset_x;
349 }
350 VectorTensorMultiply<T, Toutput>(
351 vector_data, vector_offset, vector_num_elements, tensor_data,
352 tensor_offset, tensor_num_elements, z_data);
353 } else {
354 LOG(INFO) << "ndims=" << ndims;
355 LOG(INFO) << "bcast.x_reshape()="
356 << TensorShape(bcast.x_reshape()).DebugString();
357 LOG(INFO) << "bcast.y_reshape()="
358 << TensorShape(bcast.y_reshape()).DebugString();
359 LOG(INFO) << "bcast.x_bcast()="
360 << TensorShape(bcast.x_bcast()).DebugString();
361 LOG(INFO) << "bcast.y_bcast()="
362 << TensorShape(bcast.y_bcast()).DebugString();
363
364 context->SetStatus(errors::Unimplemented(
365 "Broadcast between ", context->input(0).shape().DebugString(),
366 " and ", context->input(1).shape().DebugString(),
367 " is not supported yet."));
368 return;
369 }
370
371 float min_z_value;
372 float max_z_value;
373 QuantizationRangeForMultiplication<T, T, Toutput>(
374 min_x, max_x, min_y, max_y, &min_z_value, &max_z_value);
375 Tensor* z_min = nullptr;
376 OP_REQUIRES_OK(context, context->allocate_output(1, {}, &z_min));
377 z_min->flat<float>()(0) = min_z_value;
378
379 Tensor* z_max = nullptr;
380 OP_REQUIRES_OK(context, context->allocate_output(2, {}, &z_max));
381 z_max->flat<float>()(0) = max_z_value;
382 }
383 };
384
// Registers QuantizedMulOp<quint8, qint32> as the CPU kernel for the
// "QuantizedMul" op when T1 and T2 are quint8 and Toutput is qint32.
REGISTER_KERNEL_BUILDER(Name("QuantizedMul")
                            .Device(DEVICE_CPU)
                            .TypeConstraint<quint8>("T1")
                            .TypeConstraint<quint8>("T2")
                            .TypeConstraint<qint32>("Toutput"),
                        QuantizedMulOp<quint8, qint32>);
391
392 } // namespace tensorflow
393