• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
17 
#include <algorithm>
#include <cstdint>
#include <limits>

#include "tensorflow/lite/kernels/internal/common.h"
21 
22 namespace tflite {
23 
24 namespace reference_ops {
25 
26 template <typename T>
DivCheckArithmeticParams(const ArithmeticParams & params)27 inline void DivCheckArithmeticParams(const ArithmeticParams& params) {
28   TFLITE_DCHECK_LE(params.quantized_activation_min,
29                    params.quantized_activation_max);
30   // Input offset is negative input zero point. Activation tensors are
31   // asymmetric quantized so they span the full int8 range.
32   constexpr int32_t max_value =
33       static_cast<int32_t>(std::numeric_limits<T>::max());
34   TFLITE_DCHECK_GE(params.input1_offset, -max_value);
35   TFLITE_DCHECK_LE(params.input1_offset, max_value);
36   TFLITE_DCHECK_GE(params.input2_offset, -max_value);
37   TFLITE_DCHECK_LE(params.input2_offset, max_value);
38   TFLITE_DCHECK_GE(params.output_offset, -max_value);
39   TFLITE_DCHECK_LE(params.output_offset, max_value);
40 }
41 
// Element-wise div that can often be used for inner loop of broadcast Div as
// well as the non-broadcast Div.
//
// Inputs and output are quantized with the offsets/multiplier/shift carried in
// `params`; the activation range in `params` clamps the result.
template <typename T>
inline void DivElementwise(int size, const ArithmeticParams& params,
                           const T* input1_data, const T* input2_data,
                           T* output_data) {
  DivCheckArithmeticParams<T>(params);

  for (int i = 0; i < size; ++i) {
    // Add the (negated zero point) offsets to recover centered values.
    const int32_t input1_val = params.input1_offset + input1_data[i];
    const int32_t input2_val = params.input2_offset + input2_data[i];
    // Division by a zero-valued (after offset) element is not defined.
    TFLITE_DCHECK_NE(input2_val, 0);
    // GetReciprocal returns a normalized fixed-point reciprocal of the
    // magnitude and reports the normalization shift in recip_shift; the sign
    // is reapplied manually for negative divisors.
    int recip_shift;
    const int32_t input2_inv =
        (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
                         : -GetReciprocal(-input2_val, 31, &recip_shift);
    // Headroom is how far input1_val can be shifted left without overflow;
    // it is passed to the multiply to keep maximum precision and undone in
    // total_shift below.
    const int headroom = CountLeadingSignBits(input1_val);
    const int32_t unscaled_quotient =
        MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
                                                    headroom);
    // Fold the reciprocal normalization and headroom back into the output
    // rescaling shift.
    const int total_shift = params.output_shift - recip_shift - headroom;
    const int32_t unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            unscaled_quotient, params.output_multiplier, total_shift);
    // Clamp to the fused activation range before narrowing back to T.
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[i] = static_cast<T>(clamped_output);
  }
}
73 
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8_t * input1_data,const RuntimeShape & input2_shape,const uint8_t * input2_data,const RuntimeShape & output_shape,uint8_t * output_data)74 inline void Div(const ArithmeticParams& params,
75                 const RuntimeShape& input1_shape, const uint8_t* input1_data,
76                 const RuntimeShape& input2_shape, const uint8_t* input2_data,
77                 const RuntimeShape& output_shape, uint8_t* output_data) {
78   TFLITE_DCHECK_LE(params.quantized_activation_min,
79                    params.quantized_activation_max);
80   const int flat_size =
81       MatchingElementsSize(input1_shape, input2_shape, output_shape);
82 
83   DivElementwise(flat_size, params, input1_data, input2_data, output_data);
84 }
85 
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const int8_t * input1_data,const RuntimeShape & input2_shape,const int8_t * input2_data,const RuntimeShape & output_shape,int8_t * output_data)86 inline void Div(const ArithmeticParams& params,
87                 const RuntimeShape& input1_shape, const int8_t* input1_data,
88                 const RuntimeShape& input2_shape, const int8_t* input2_data,
89                 const RuntimeShape& output_shape, int8_t* output_data) {
90   TFLITE_DCHECK_LE(params.quantized_activation_min,
91                    params.quantized_activation_max);
92   const int flat_size =
93       MatchingElementsSize(input1_shape, input2_shape, output_shape);
94 
95   DivElementwise(flat_size, params, input1_data, input2_data, output_data);
96 }
97 
// Broadcasting quantized Div. Shapes are extended to rank N (default 5) and
// each output element is computed by the same fixed-point arithmetic as
// DivElementwise, with per-element indices resolved through broadcast
// descriptors.
template <typename T, int N = 5>
inline void BroadcastDivSlowQuantized(
    const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape,
    const T* input1_data, const RuntimeShape& unextended_input2_shape,
    const T* input2_data, const RuntimeShape& unextended_output_shape,
    T* output_data) {
  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);

  // Descriptors map an N-dimensional output index to flat offsets in each
  // (possibly broadcast) input and in the output.
  NdArrayDesc<N> desc1;
  NdArrayDesc<N> desc2;
  NdArrayDesc<N> output_desc;
  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
                                      unextended_input2_shape, &desc1, &desc2);
  CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
                 &output_desc);

  DivCheckArithmeticParams<T>(params);

  auto div_func = [&](int indexes[N]) {
    // Add the (negated zero point) offsets to recover centered values.
    const int32_t input1_val =
        params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
    const int32_t input2_val =
        params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
    // Division by a zero-valued (after offset) element is not defined.
    TFLITE_DCHECK_NE(input2_val, 0);
    // Normalized fixed-point reciprocal of the magnitude; recip_shift
    // records the normalization, and the sign is reapplied for negatives.
    int recip_shift;
    const int32_t input2_inv =
        (input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
                         : -GetReciprocal(-input2_val, 31, &recip_shift);
    // Shift the dividend up by its headroom for precision; undone below.
    const int headroom = CountLeadingSignBits(input1_val);
    const int32_t unscaled_quotient =
        MultiplyByQuantizedMultiplierGreaterThanOne(input1_val, input2_inv,
                                                    headroom);
    // Fold the reciprocal normalization and headroom into the output shift.
    const int total_shift = params.output_shift - recip_shift - headroom;
    const int32_t unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            unscaled_quotient, params.output_multiplier, total_shift);
    // Clamp to the fused activation range before narrowing back to T.
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[SubscriptToIndex(output_desc, indexes)] =
        static_cast<T>(clamped_output);
  };
  NDOpsHelper<N>(output_desc, div_func);
}
145 
// Broadcasting Div for uint8 tensors; forwards to the shared quantized
// broadcast implementation.
template <int N = 5>
inline void BroadcastDivSlow(const ArithmeticParams& params,
                             const RuntimeShape& unextended_input1_shape,
                             const uint8_t* input1_data,
                             const RuntimeShape& unextended_input2_shape,
                             const uint8_t* input2_data,
                             const RuntimeShape& unextended_output_shape,
                             uint8_t* output_data) {
  BroadcastDivSlowQuantized<uint8_t, N>(
      params, unextended_input1_shape, input1_data, unextended_input2_shape,
      input2_data, unextended_output_shape, output_data);
}
158 
// Broadcasting Div for int8 tensors; forwards to the shared quantized
// broadcast implementation.
template <int N = 5>
inline void BroadcastDivSlow(const ArithmeticParams& params,
                             const RuntimeShape& unextended_input1_shape,
                             const int8_t* input1_data,
                             const RuntimeShape& unextended_input2_shape,
                             const int8_t* input2_data,
                             const RuntimeShape& unextended_output_shape,
                             int8_t* output_data) {
  BroadcastDivSlowQuantized<int8_t, N>(
      params, unextended_input1_shape, input1_data, unextended_input2_shape,
      input2_data, unextended_output_shape, output_data);
}
171 
172 // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
173 // dimensionality if the runtime code does a single loop over one dimension
174 // that handles broadcasting as the base case. The code generator would then
175 // generate max(D1, D2) nested for loops.
176 template <typename T, int N = 5>
BroadcastDivSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)177 void BroadcastDivSlow(const ArithmeticParams& params,
178                       const RuntimeShape& unextended_input1_shape,
179                       const T* input1_data,
180                       const RuntimeShape& unextended_input2_shape,
181                       const T* input2_data,
182                       const RuntimeShape& unextended_output_shape,
183                       T* output_data) {
184   T output_activation_min;
185   T output_activation_max;
186   GetActivationParams(params, &output_activation_min, &output_activation_max);
187 
188   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
189   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
190   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
191 
192   NdArrayDesc<N> desc1;
193   NdArrayDesc<N> desc2;
194   NdArrayDesc<N> output_desc;
195   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
196                                       unextended_input2_shape, &desc1, &desc2);
197   CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
198                  &output_desc);
199 
200   // In Tensorflow, the dimensions are canonically named (batch_number, row,
201   // col, channel), with extents (batches, height, width, depth), with the
202   // trailing dimension changing most rapidly (channels has the smallest
203   // stride, typically 1 element).
204   //
205   // In generated C code, we store arrays with the dimensions reversed. The
206   // first dimension has smallest stride.
207 
208   auto div_func = [&](int indexes[N]) {
209     output_data[SubscriptToIndex(output_desc, indexes)] =
210         ActivationFunctionWithMinMax(
211             input1_data[SubscriptToIndex(desc1, indexes)] /
212                 input2_data[SubscriptToIndex(desc2, indexes)],
213             output_activation_min, output_activation_max);
214   };
215   NDOpsHelper<N>(output_desc, div_func);
216 }
217 
218 template <typename T>
Div(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)219 inline void Div(const ArithmeticParams& params,
220                 const RuntimeShape& input1_shape, const T* input1_data,
221                 const RuntimeShape& input2_shape, const T* input2_data,
222                 const RuntimeShape& output_shape, T* output_data) {
223   T output_activation_min;
224   T output_activation_max;
225   GetActivationParams(params, &output_activation_min, &output_activation_max);
226 
227   const int flat_size =
228       MatchingElementsSize(input1_shape, input2_shape, output_shape);
229   for (int i = 0; i < flat_size; ++i) {
230     output_data[i] = ActivationFunctionWithMinMax(
231         input1_data[i] / input2_data[i], output_activation_min,
232         output_activation_max);
233   }
234 }
235 
236 }  // namespace reference_ops
237 }  // namespace tflite
238 
239 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DIV_H_
240