• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
// Implements a quantized eight-bit version of the element-wise multiplication
// (QuantizedMul) operation.
17 
18 #define EIGEN_USE_THREADS
19 
20 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
21 #define USE_NEON
22 #include <arm_neon.h>
23 #endif
24 
25 #include "tensorflow/core/framework/op_kernel.h"
26 #include "tensorflow/core/framework/tensor.h"
27 #include "tensorflow/core/kernels/meta_support.h"
28 #include "tensorflow/core/kernels/quantization_utils.h"
29 #include "tensorflow/core/lib/core/errors.h"
30 #include "tensorflow/core/util/bcast.h"
31 
32 namespace tensorflow {
33 namespace {
34 
// Multiplies every element of a quantized buffer by one quantized scalar.
//
// Each operand has its zero-point offset subtracted before the multiply:
//   output[i] = (full_input[i] - full_input_offset) *
//               (scalar_input - scalar_input_offset)
// The subtraction and product are evaluated as int32 and then stored as
// Toutput.
//
// Args:
//   context: not used by this generic implementation.
//   full_input: buffer holding at least `num_elements` values.
//   full_input_offset: value subtracted from every element of `full_input`.
//   num_elements: number of elements to read and to write.
//   scalar_input: the single value every element is multiplied by.
//   scalar_input_offset: value subtracted from `scalar_input`.
//   output: destination buffer holding at least `num_elements` values.
template <class T, class Toutput>
void ScalarMultiply(OpKernelContext* context, const T* full_input,
                    int32 full_input_offset, int64 num_elements, T scalar_input,
                    int32 scalar_input_offset, Toutput* output) {
  // Loop invariant: the offset-corrected scalar operand.
  const int32 scalar_minus_offset =
      static_cast<int32>(scalar_input) - scalar_input_offset;
  // The counter is int64 to match `num_elements`; an `int` counter would
  // overflow before reaching the end of a buffer larger than INT_MAX.
  for (int64 i = 0; i < num_elements; ++i) {
    output[i] = (static_cast<int32>(full_input[i]) - full_input_offset) *
                scalar_minus_offset;
  }
}
46 
47 #ifdef USE_NEON
48 
49 template <>
ScalarMultiply(OpKernelContext * context,const quint8 * full_input,int32 full_input_offset,int64 num_elements,quint8 scalar_input,int32 scalar_input_offset,qint32 * output)50 void ScalarMultiply<quint8, qint32>(OpKernelContext* context,
51                                     const quint8* full_input,
52                                     int32 full_input_offset, int64 num_elements,
53                                     quint8 scalar_input,
54                                     int32 scalar_input_offset, qint32* output) {
55   const int16 scalar_minus_offset =
56       static_cast<int16>(scalar_input) - scalar_input_offset;
57   const int16x4_t scalar_minus_offset_16x4 = vmov_n_s16(scalar_minus_offset);
58   const uint8x8_t full_input_offset_8x8 = vmov_n_u8(full_input_offset);
59   // Go through the results in 16-element chunks for NEON acceleration.
60   int i;
61   for (i = 0; i < (num_elements - 15); i += 16) {
62     // Load the tensor inputs.
63     const uint8* full_input_ptr = &(full_input->value) + i;
64     const uint8x16_t full_input_8x16 = vld1q_u8(full_input_ptr);
65 
66     // Break into two sets of vectors so we can do further calculations
67     // easily.
68     const uint8x8_t full_input_high_8x8 = vget_high_u8(full_input_8x16);
69     const uint8x8_t full_input_low_8x8 = vget_low_u8(full_input_8x16);
70 
71     // Subtract off the offset value to get 16-bit results.
72     const int16x8_t full_input_minus_offset_high_16x8 = vreinterpretq_s16_u16(
73         vsubl_u8(full_input_high_8x8, full_input_offset_8x8));
74     const int16x8_t full_input_minus_offset_low_16x8 = vreinterpretq_s16_u16(
75         vsubl_u8(full_input_low_8x8, full_input_offset_8x8));
76 
77     // We have to work with 4-wide vectors, so extract them.
78     const int16x4_t x_high_high_16x4 =
79         vget_high_s16(full_input_minus_offset_high_16x8);
80     const int16x4_t x_high_low_16x4 =
81         vget_low_s16(full_input_minus_offset_high_16x8);
82     const int16x4_t x_low_high_16x4 =
83         vget_high_s16(full_input_minus_offset_low_16x8);
84     const int16x4_t x_low_low_16x4 =
85         vget_low_s16(full_input_minus_offset_low_16x8);
86 
87     // Perform the multiplication.
88     const int32x4_t z_high_high_32x4 =
89         vmull_s16(x_high_high_16x4, scalar_minus_offset_16x4);
90     const int32x4_t z_high_low_32x4 =
91         vmull_s16(x_high_low_16x4, scalar_minus_offset_16x4);
92     const int32x4_t z_low_high_32x4 =
93         vmull_s16(x_low_high_16x4, scalar_minus_offset_16x4);
94     const int32x4_t z_low_low_32x4 =
95         vmull_s16(x_low_low_16x4, scalar_minus_offset_16x4);
96 
97     // Write out the results.
98     int32* output_ptr = &(output->value) + i;
99     vst1q_s32(output_ptr + 0, z_low_low_32x4);
100     vst1q_s32(output_ptr + 4, z_low_high_32x4);
101     vst1q_s32(output_ptr + 8, z_high_low_32x4);
102     vst1q_s32(output_ptr + 12, z_high_high_32x4);
103   }
104   // Finish up any remaining elements that weren't a multiple of 16.
105   for (; i < num_elements; ++i) {
106     output[i] = (static_cast<int32>(full_input[i]) - full_input_offset) *
107                 scalar_minus_offset;
108   }
109 }
110 #endif  // USE_NEON
111 
// Element-wise product of two quantized buffers of equal length.
//
// Each operand has its zero-point offset subtracted before the multiply:
//   output[i] = (x_data[i] - offset_x) * (y_data[i] - offset_y)
// The subtraction and product are evaluated as int32 and then stored as
// Toutput.
//
// Args:
//   context: not used by this generic implementation.
//   x_data: first operand, at least `num_elements` values.
//   offset_x: value subtracted from every element of `x_data`.
//   y_data: second operand, at least `num_elements` values.
//   offset_y: value subtracted from every element of `y_data`.
//   num_elements: number of elements to read from each input and to write.
//   output: destination buffer holding at least `num_elements` values.
template <class T, class Toutput>
void VectorMultiply(OpKernelContext* context, const T* x_data, int32 offset_x,
                    const T* y_data, int32 offset_y, int64 num_elements,
                    Toutput* output) {
  // The counter is int64 to match `num_elements`; an `int` counter would
  // overflow before reaching the end of a buffer larger than INT_MAX.
  for (int64 i = 0; i < num_elements; ++i) {
    output[i] = (static_cast<int32>(x_data[i]) - offset_x) *
                (static_cast<int32>(y_data[i]) - offset_y);
  }
}
121 
122 #ifdef USE_NEON
123 template <>
VectorMultiply(OpKernelContext * context,const quint8 * x_data,int32 offset_x,const quint8 * y_data,int32 offset_y,int64 num_elements,qint32 * output)124 void VectorMultiply<quint8, qint32>(OpKernelContext* context,
125                                     const quint8* x_data, int32 offset_x,
126                                     const quint8* y_data, int32 offset_y,
127                                     int64 num_elements, qint32* output) {
128   const uint8x8_t offset_x_8x8 = vmov_n_u8(offset_x);
129   const uint8x8_t offset_y_8x8 = vmov_n_u8(offset_y);
130   int i;
131   // Go through the results in 16-element chunks for NEON acceleration.
132   for (i = 0; i < (num_elements - 15); i += 16) {
133     // Load the vector inputs.
134     const uint8* x_data_ptr = &(x_data->value) + i;
135     const uint8x16_t x_8x16 = vld1q_u8(x_data_ptr);
136     const uint8* y_data_ptr = &(y_data->value) + i;
137     const uint8x16_t y_8x16 = vld1q_u8(y_data_ptr);
138 
139     // Break into two sets of vectors so we can do further calculations easily.
140     const uint8x8_t x_high_8x8 = vget_high_u8(x_8x16);
141     const uint8x8_t x_low_8x8 = vget_low_u8(x_8x16);
142     const uint8x8_t y_high_8x8 = vget_high_u8(y_8x16);
143     const uint8x8_t y_low_8x8 = vget_low_u8(y_8x16);
144 
145     // Subtract off the offset values to get 16-bit results.
146     const int16x8_t x_minus_offset_high_16x8 =
147         vreinterpretq_s16_u16(vsubl_u8(x_high_8x8, offset_x_8x8));
148     const int16x8_t x_minus_offset_low_16x8 =
149         vreinterpretq_s16_u16(vsubl_u8(x_low_8x8, offset_x_8x8));
150     const int16x8_t y_minus_offset_high_16x8 =
151         vreinterpretq_s16_u16(vsubl_u8(y_high_8x8, offset_y_8x8));
152     const int16x8_t y_minus_offset_low_16x8 =
153         vreinterpretq_s16_u16(vsubl_u8(y_low_8x8, offset_y_8x8));
154 
155     // We have to work with 4-wide vectors, so extract them.
156     const int16x4_t x_high_high_16x4 = vget_high_s16(x_minus_offset_high_16x8);
157     const int16x4_t x_high_low_16x4 = vget_low_s16(x_minus_offset_high_16x8);
158     const int16x4_t x_low_high_16x4 = vget_high_s16(x_minus_offset_low_16x8);
159     const int16x4_t x_low_low_16x4 = vget_low_s16(x_minus_offset_low_16x8);
160     const int16x4_t y_high_high_16x4 = vget_high_s16(y_minus_offset_high_16x8);
161     const int16x4_t y_high_low_16x4 = vget_low_s16(y_minus_offset_high_16x8);
162     const int16x4_t y_low_high_16x4 = vget_high_s16(y_minus_offset_low_16x8);
163     const int16x4_t y_low_low_16x4 = vget_low_s16(y_minus_offset_low_16x8);
164 
165     // Perform the multiplication.
166     const int32x4_t z_high_high_32x4 =
167         vmull_s16(x_high_high_16x4, y_high_high_16x4);
168     const int32x4_t z_high_low_32x4 =
169         vmull_s16(x_high_low_16x4, y_high_low_16x4);
170     const int32x4_t z_low_high_32x4 =
171         vmull_s16(x_low_high_16x4, y_low_high_16x4);
172     const int32x4_t z_low_low_32x4 = vmull_s16(x_low_low_16x4, y_low_low_16x4);
173 
174     // Write out the results.
175     int32* output_ptr = &(output->value) + i;
176     vst1q_s32(output_ptr + 0, z_low_low_32x4);
177     vst1q_s32(output_ptr + 4, z_low_high_32x4);
178     vst1q_s32(output_ptr + 8, z_high_low_32x4);
179     vst1q_s32(output_ptr + 12, z_high_high_32x4);
180   }
181   for (; i < num_elements; ++i) {
182     output[i] = (static_cast<int32>(x_data[i]) - offset_x) *
183                 (static_cast<int32>(y_data[i]) - offset_y);
184   }
185 }
186 #endif  // USE_NEON
187 
// Multiplies a quantized tensor by a shorter quantized vector that is reused
// cyclically across the tensor.
//
// Element `i` of the tensor is paired with element `i % vector_num_elements`
// of the vector, and each operand has its zero-point offset subtracted first:
//   output[i] = (vector_data[i % vector_num_elements] - vector_offset) *
//               (tensor_data[i] - tensor_offset)
// The subtraction and product are evaluated as int32 and then stored as
// Toutput.
//
// Args:
//   vector_data: the repeated operand, `vector_num_elements` values.
//   vector_offset: value subtracted from every element of `vector_data`.
//   vector_num_elements: length of `vector_data`. When it is not positive the
//     function returns without writing anything.
//   tensor_data: the full-size operand, `tensor_num_elements` values.
//   tensor_offset: value subtracted from every element of `tensor_data`.
//   tensor_num_elements: number of elements to read from `tensor_data` and to
//     write to `output`.
//   output: destination buffer holding at least `tensor_num_elements` values.
template <class T, class Toutput>
void VectorTensorMultiply(const T* vector_data, int32 vector_offset,
                          int64 vector_num_elements, const T* tensor_data,
                          int32 tensor_offset, int64 tensor_num_elements,
                          Toutput* output) {
  // An empty vector has no element to pair with, and the modulo below would
  // divide by zero, so there is nothing that can be computed.
  if (vector_num_elements <= 0) {
    return;
  }
  // The counter is int64 to match `tensor_num_elements`; an `int` counter
  // would overflow before reaching the end of a buffer larger than INT_MAX.
  for (int64 i = 0; i < tensor_num_elements; ++i) {
    const int64 vector_i = i % vector_num_elements;
    output[i] = (static_cast<int32>(vector_data[vector_i]) - vector_offset) *
                (static_cast<int32>(tensor_data[i]) - tensor_offset);
  }
}
199 
200 #ifdef USE_NEON
201 template <>
VectorTensorMultiply(const quint8 * vector_data,int32 vector_offset,int64 vector_num_elements,const quint8 * tensor_data,int32 tensor_offset,int64 tensor_num_elements,qint32 * output)202 void VectorTensorMultiply<quint8, qint32>(
203     const quint8* vector_data, int32 vector_offset, int64 vector_num_elements,
204     const quint8* tensor_data, int32 tensor_offset, int64 tensor_num_elements,
205     qint32* output) {
206   const uint8x8_t offset_x_8x8 = vmov_n_u8(vector_offset);
207   const uint8x8_t offset_y_8x8 = vmov_n_u8(tensor_offset);
208   CHECK_EQ(0, tensor_num_elements % vector_num_elements);
209   for (int base_i = 0; base_i < tensor_num_elements;
210        base_i += vector_num_elements) {
211     int i = base_i;
212     const int end_i = base_i + vector_num_elements;
213     // Go through the results in 16-element chunks for NEON acceleration.
214     int vector_i;
215     for (vector_i = 0; vector_i < (vector_num_elements - 15);
216          vector_i += 16, i += 16) {
217       // Load the vector inputs.
218       const uint8* x_data_ptr = &(vector_data->value) + vector_i;
219       const uint8x16_t x_8x16 = vld1q_u8(x_data_ptr);
220       const uint8* y_data_ptr = &(tensor_data->value) + i;
221       const uint8x16_t y_8x16 = vld1q_u8(y_data_ptr);
222 
223       // Break into two sets of vectors so we can do further calculations
224       // easily.
225       const uint8x8_t x_high_8x8 = vget_high_u8(x_8x16);
226       const uint8x8_t x_low_8x8 = vget_low_u8(x_8x16);
227       const uint8x8_t y_high_8x8 = vget_high_u8(y_8x16);
228       const uint8x8_t y_low_8x8 = vget_low_u8(y_8x16);
229 
230       // Subtract off the offset values to get 16-bit results.
231       const int16x8_t x_minus_offset_high_16x8 =
232           vreinterpretq_s16_u16(vsubl_u8(x_high_8x8, offset_x_8x8));
233       const int16x8_t x_minus_offset_low_16x8 =
234           vreinterpretq_s16_u16(vsubl_u8(x_low_8x8, offset_x_8x8));
235       const int16x8_t y_minus_offset_high_16x8 =
236           vreinterpretq_s16_u16(vsubl_u8(y_high_8x8, offset_y_8x8));
237       const int16x8_t y_minus_offset_low_16x8 =
238           vreinterpretq_s16_u16(vsubl_u8(y_low_8x8, offset_y_8x8));
239 
240       // We have to work with 4-wide vectors, so extract them.
241       const int16x4_t x_high_high_16x4 =
242           vget_high_s16(x_minus_offset_high_16x8);
243       const int16x4_t x_high_low_16x4 = vget_low_s16(x_minus_offset_high_16x8);
244       const int16x4_t x_low_high_16x4 = vget_high_s16(x_minus_offset_low_16x8);
245       const int16x4_t x_low_low_16x4 = vget_low_s16(x_minus_offset_low_16x8);
246       const int16x4_t y_high_high_16x4 =
247           vget_high_s16(y_minus_offset_high_16x8);
248       const int16x4_t y_high_low_16x4 = vget_low_s16(y_minus_offset_high_16x8);
249       const int16x4_t y_low_high_16x4 = vget_high_s16(y_minus_offset_low_16x8);
250       const int16x4_t y_low_low_16x4 = vget_low_s16(y_minus_offset_low_16x8);
251 
252       // Perform the multiplication.
253       const int32x4_t z_high_high_32x4 =
254           vmull_s16(x_high_high_16x4, y_high_high_16x4);
255       const int32x4_t z_high_low_32x4 =
256           vmull_s16(x_high_low_16x4, y_high_low_16x4);
257       const int32x4_t z_low_high_32x4 =
258           vmull_s16(x_low_high_16x4, y_low_high_16x4);
259       const int32x4_t z_low_low_32x4 =
260           vmull_s16(x_low_low_16x4, y_low_low_16x4);
261 
262       // Write out the results.
263       int32* output_ptr = &(output->value) + i;
264       vst1q_s32(output_ptr + 0, z_low_low_32x4);
265       vst1q_s32(output_ptr + 4, z_low_high_32x4);
266       vst1q_s32(output_ptr + 8, z_high_low_32x4);
267       vst1q_s32(output_ptr + 12, z_high_high_32x4);
268     }
269     for (; i < end_i; ++i, ++vector_i) {
270       output[i] = (static_cast<int32>(vector_data[vector_i]) - vector_offset) *
271                   (static_cast<int32>(tensor_data[i]) - tensor_offset);
272     }
273   }
274 }
275 #endif  // USE_NEON
276 
277 }  // namespace
278 
279 template <class T, class Toutput>
280 class QuantizedMulOp : public OpKernel {
281  public:
QuantizedMulOp(OpKernelConstruction * context)282   explicit QuantizedMulOp(OpKernelConstruction* context) : OpKernel(context) {}
283 
Compute(OpKernelContext * context)284   void Compute(OpKernelContext* context) override {
285     const Tensor& x = context->input(0);
286     const Tensor& y = context->input(1);
287     const float min_x = context->input(2).flat<float>()(0);
288     const float max_x = context->input(3).flat<float>()(0);
289     const float min_y = context->input(4).flat<float>()(0);
290     const float max_y = context->input(5).flat<float>()(0);
291 
292     BCast bcast(BCast::FromShape(x.shape()), BCast::FromShape(y.shape()));
293     if (!bcast.IsValid()) {
294       context->SetStatus(errors::InvalidArgument(
295           "Incompatible shapes: ", x.shape().DebugString(), " vs. ",
296           y.shape().DebugString()));
297       return;
298     }
299     Tensor* z;
300     OP_REQUIRES_OK(context, context->allocate_output(
301                                 0, BCast::ToShape(bcast.output_shape()), &z));
302 
303     // Make sure that we have valid quantization ranges for the input buffers.
304     // If the difference between the min and max is negative or zero, it makes
305     // it hard to do meaningful intermediate operations on the values.
306     OP_REQUIRES(context, (max_x > min_x),
307                 errors::InvalidArgument("max_x must be larger than min_a."));
308     OP_REQUIRES(context, (max_y > min_y),
309                 errors::InvalidArgument("max_x must be larger than min_b."));
310     const int32 offset_x = FloatToQuantizedUnclamped<T>(0.0f, min_x, max_x);
311     const int32 offset_y = FloatToQuantizedUnclamped<T>(0.0f, min_y, max_y);
312     const T* x_data = x.flat<T>().data();
313     const T* y_data = y.flat<T>().data();
314     Toutput* z_data = z->flat<Toutput>().data();
315 
316     const int ndims = bcast.x_reshape().size();
317     if (ndims <= 1) {
318       if (x.NumElements() == 1) {
319         ScalarMultiply<T, Toutput>(context, y_data, offset_y, y.NumElements(),
320                                    x_data[0], offset_x, z_data);
321       } else if (y.NumElements() == 1) {
322         ScalarMultiply<T, Toutput>(context, x_data, offset_x, x.NumElements(),
323                                    y_data[0], offset_y, z_data);
324       } else {
325         VectorMultiply<T, Toutput>(context, x_data, offset_x, y_data, offset_y,
326                                    x.NumElements(), z_data);
327       }
328     } else if (ndims == 2) {
329       const T* vector_data;
330       int64 vector_num_elements;
331       int32 vector_offset;
332       const T* tensor_data;
333       int64 tensor_num_elements;
334       int32 tensor_offset;
335       if (x.NumElements() < y.NumElements()) {
336         vector_data = x_data;
337         vector_num_elements = x.NumElements();
338         vector_offset = offset_x;
339         tensor_data = y_data;
340         tensor_num_elements = y.NumElements();
341         tensor_offset = offset_y;
342       } else {
343         vector_data = y_data;
344         vector_num_elements = y.NumElements();
345         vector_offset = offset_y;
346         tensor_data = x_data;
347         tensor_num_elements = x.NumElements();
348         tensor_offset = offset_x;
349       }
350       VectorTensorMultiply<T, Toutput>(
351           vector_data, vector_offset, vector_num_elements, tensor_data,
352           tensor_offset, tensor_num_elements, z_data);
353     } else {
354       LOG(INFO) << "ndims=" << ndims;
355       LOG(INFO) << "bcast.x_reshape()="
356                 << TensorShape(bcast.x_reshape()).DebugString();
357       LOG(INFO) << "bcast.y_reshape()="
358                 << TensorShape(bcast.y_reshape()).DebugString();
359       LOG(INFO) << "bcast.x_bcast()="
360                 << TensorShape(bcast.x_bcast()).DebugString();
361       LOG(INFO) << "bcast.y_bcast()="
362                 << TensorShape(bcast.y_bcast()).DebugString();
363 
364       context->SetStatus(errors::Unimplemented(
365           "Broadcast between ", context->input(0).shape().DebugString(),
366           " and ", context->input(1).shape().DebugString(),
367           " is not supported yet."));
368       return;
369     }
370 
371     float min_z_value;
372     float max_z_value;
373     QuantizationRangeForMultiplication<T, T, Toutput>(
374         min_x, max_x, min_y, max_y, &min_z_value, &max_z_value);
375     Tensor* z_min = nullptr;
376     OP_REQUIRES_OK(context, context->allocate_output(1, {}, &z_min));
377     z_min->flat<float>()(0) = min_z_value;
378 
379     Tensor* z_max = nullptr;
380     OP_REQUIRES_OK(context, context->allocate_output(2, {}, &z_max));
381     z_max->flat<float>()(0) = max_z_value;
382   }
383 };
384 
// Registers the CPU kernel for the "QuantizedMul" op for the one supported
// type combination: quint8 for both inputs (T1, T2) and qint32 for the output.
REGISTER_KERNEL_BUILDER(Name("QuantizedMul")
                            .Device(DEVICE_CPU)
                            .TypeConstraint<quint8>("T1")
                            .TypeConstraint<quint8>("T2")
                            .TypeConstraint<qint32>("Toutput"),
                        QuantizedMulOp<quint8, qint32>);
391 
392 }  // namespace tensorflow
393