/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/micro/kernels/softmax.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/softmax.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/xtensa/xtensa.h"

namespace tflite {
namespace {

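// Per-op persistent state for the optimized targets: the HiFi Mini kernel
// caches a precomputed exponent lookup table, while the Fusion F1 kernel
// caches the quantized softmax parameters and its scratch buffer index.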
#if defined(HIFIMINI)
struct OpData {
  uint16_t* exp_lut;
};
#elif defined(FUSION_F1)
struct OpData {
  SoftmaxParams params;
  int scratch_tensor_index;
};
#endif

#if defined(HIFIMINI)
// Number of unique int8_t and int16_t values. Used in exponent lookup table
// computation.
constexpr int kInt8Range =
    std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min() + 1;
constexpr int kInt16Range = std::numeric_limits<int16_t>::max() -
                            std::numeric_limits<int16_t>::min() + 1;
// Each 16-bit precalculated exponent is expressed as a Q0.16 fixed-point
// value. We special-case e^0 since representing 1.0 requires one integer bit,
// which Q0.16 does not have.
constexpr int kExpFractionalBits = 16;
// e^0 expressed in Q0.16 is 1 << 16 = 65536, which exceeds the uint16_t
// range, so it must be handled specially.
constexpr int kMaxExponentValue = (1 << kExpFractionalBits);
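// For example, e^-1 ~= 0.367879 is stored as round(0.367879 * 65536) = 24109,
// whereas e^0 = 1.0 would map to 65536 and overflow a uint16_t.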

// Quantized softmax with int8_t input and int16_t output.
// Passing OpData by value does not save much in this op, but we follow that
// convention as a best practice, at least for the xtensa kernels. See
// b/155656675 for more details.
TfLiteStatus SoftmaxHifimini(OpData op_data, const RuntimeShape& input_shape,
                             const int8_t* input_data,
                             const RuntimeShape& output_shape,
                             int16_t* output_data) {
  // The last dimension is depth. Outer size is the total input size
  // divided by depth.
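  // For example, an input of shape {2, 3, 4} has depth 4 and
  // outer_size 2 * 3 = 6, i.e. softmax runs over six rows of four values.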
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);

  for (int i = 0; i < outer_size; ++i) {
    int8_t max_in_row = std::numeric_limits<int8_t>::min();
    for (int c = 0; c < depth; ++c) {
      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
    }

    uint32_t sum_of_exps = 0;
    for (int c = 0; c < depth; ++c) {
      TFLITE_DCHECK(max_in_row >= input_data[i * depth + c]);
      uint8_t input_diff = max_in_row - input_data[i * depth + c];

      sum_of_exps +=
          input_diff == 0 ? kMaxExponentValue : op_data.exp_lut[input_diff];
    }

    // Ensure we cannot overflow full_range_output below. Each unscaled
    // exponent is at most kMaxExponentValue, so it suffices to guarantee
    // sum_of_exps >= kMaxExponentValue; this holds because the element equal
    // to max_in_row has input_diff == 0 and contributes exactly
    // kMaxExponentValue to the sum.
    TFLITE_DCHECK(sum_of_exps >= kMaxExponentValue);

    for (int c = 0; c < depth; ++c) {
      uint8_t input_diff = max_in_row - input_data[i * depth + c];
      // Special case for diff == 0
      uint32_t unscaled_output =
          input_diff == 0 ? kMaxExponentValue : op_data.exp_lut[input_diff];
      int64_t scaled_output = static_cast<int64_t>(unscaled_output) *
                              static_cast<int64_t>(kInt16Range);
      int32_t full_range_output =
          scaled_output / sum_of_exps + std::numeric_limits<int16_t>::min();
      // Round half up: if the remainder is at least half of the divisor,
      // bump the quotient (e.g. 10 / 4 = 2 remainder 2; 2 * 2 >= 4, so 3).
      uint32_t remainder = scaled_output % sum_of_exps;
      if (remainder * 2 >= sum_of_exps) {
        full_range_output++;
      }
      output_data[i * depth + c] = static_cast<int16_t>(std::max(
          std::min(full_range_output,
                   static_cast<int32_t>(std::numeric_limits<int16_t>::max())),
          static_cast<int32_t>(std::numeric_limits<int16_t>::min())));
    }
  }
  return kTfLiteOk;
}
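
// Usage sketch (illustrative only, not part of the kernel): given an OpData
// whose exp_lut was filled by CalculateSoftmaxOpDataHifimini() below, one
// 1x4 row could be processed as
//   int8_t input[4] = {10, 20, 30, 40};
//   int16_t output[4];
//   SoftmaxHifimini(op_data, RuntimeShape({1, 4}), input,
//                   RuntimeShape({1, 4}), output);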

TfLiteStatus CalculateSoftmaxOpDataHifimini(TfLiteContext* context,
                                            const TfLiteTensor* input,
                                            TfLiteTensor* output,
                                            const TfLiteSoftmaxParams* params,
                                            OpData* op_data) {
  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
    if (input->type == kTfLiteUInt8) {
      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
    } else {
      if (output->type == kTfLiteInt16) {
        TF_LITE_ENSURE_EQ(context, output->params.zero_point,
                          std::numeric_limits<int16_t>::min());
        // NOTE: Current int16_t softmax output does not require symmetric
        // scaling, so there is no need to verify the scale here.
      } else {
        TF_LITE_ENSURE_EQ(context, output->params.zero_point,
                          std::numeric_limits<int8_t>::min());
        TF_LITE_ENSURE(context, output->params.scale == 1.f / 256);
      }
    }

    // Precompute e^(-x * input_scale * beta) for every possible int8_t input.
    // This computation is used for every iteration of Softmax. We must
    // compute using pre-scaled inputs to avoid introducing additional error,
    // while restricting our input range to the int8_t range. This is valid
    // since beta and the input scale are constant for a given op in the
    // graph. Skip index 0, since that is a special case which requires one
    // integer bit instead of zero.
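    // For example (illustrative): with input scale 0.1 and beta 1.0, entry 5
    // stores round(e^(-0.5) * 65536) = round(0.606531 * 65536) = 39750.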
    for (int i = 1; i <= kInt8Range; i++) {
      float scaled_input = i * input->params.scale;
      float exp_value =
          std::exp((-scaled_input) * static_cast<float>(params->beta));

      float exponent_scaled =
          std::round(exp_value * static_cast<float>(1 << kExpFractionalBits));
      op_data->exp_lut[i] = static_cast<uint16_t>(exponent_scaled);
    }
  }
  return kTfLiteOk;
}

TfLiteStatus PrepareHifimini(TfLiteContext* context, TfLiteNode* node) {
  auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);

  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
  const TfLiteTensor* input = GetInput(context, node, 0);
  TfLiteTensor* output = GetOutput(context, node, 0);
  TF_LITE_ENSURE(context, NumDimensions(input) >= 1);

  TFLITE_DCHECK(node->user_data != nullptr);
  OpData* op_data = static_cast<OpData*>(node->user_data);

  // Allocate an array to precompute exponents over all int8_t inputs,
  // applying the scale and beta before calculating exp. It is mandatory to
  // apply beta and scale here, since each softmax op may have different beta
  // and scale values. Beta and scale will remain constant for a given softmax
  // op.
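  // The table has kInt8Range + 1 = 257 entries; index 0 is left unused
  // because a zero diff is handled inline as kMaxExponentValue.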
  op_data->exp_lut = static_cast<uint16_t*>(context->AllocatePersistentBuffer(
      context, (kInt8Range + 1) * sizeof(uint16_t)));
  TF_LITE_ENSURE(context, op_data->exp_lut != nullptr);

  TF_LITE_ENSURE_STATUS(
      CalculateSoftmaxOpDataHifimini(context, input, output, params, op_data));

  return kTfLiteOk;
}
#endif  // defined(HIFIMINI)

#if defined(FUSION_F1)
TfLiteStatus PrepareHifi4(TfLiteContext* context, TfLiteNode* node) {
  TF_LITE_ENSURE_OK(context, SoftmaxPrepare(context, node));

  // Calculate scratch memory requirements and request a scratch buffer.
  const TfLiteTensor* input = GetInput(context, node, 0);
  const TfLiteTensor* output = GetOutput(context, node, 0);

  const RuntimeShape& input_shape = GetTensorShape(input);
  const RuntimeShape& output_shape = GetTensorShape(output);
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);

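  // get_softmax_scratch_size() and xa_nn_vec_softmax_asym8s_16() below are
  // assumed to come from the Xtensa NNLib (pulled in via xtensa.h);
  // PREC_ASYM8S selects the asymmetric signed 8-bit precision variant.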
  if (input->type == kTfLiteInt8) {
    int required_scratch =
        get_softmax_scratch_size(PREC_ASYM8S, PREC_ASYM8S, depth);
    TF_LITE_ENSURE(context, required_scratch > 0);

    auto* data = static_cast<OpData*>(node->user_data);
    TF_LITE_ENSURE_OK(
        context, context->RequestScratchBufferInArena(
                     context, required_scratch, &(data->scratch_tensor_index)));
  }

  return kTfLiteOk;
}

TfLiteStatus EvalHifi4(const OpData* op_data, const TfLiteEvalTensor* input,
                       TfLiteEvalTensor* output, TfLiteContext* context) {
  const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
  const int8_t* input_data = tflite::micro::GetTensorData<int8_t>(input);
  const RuntimeShape& output_shape = tflite::micro::GetTensorShape(output);
  int16_t* output_data = tflite::micro::GetTensorData<int16_t>(output);
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);

  void* p_scratch = static_cast<void*>(
      context->GetScratchBuffer(context, op_data->scratch_tensor_index));

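  // Process each row of `depth` elements with the NNLib vector softmax
  // kernel; diff_min, input_left_shift and input_multiplier were computed
  // during Prepare.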
  for (int i = 0; i < outer_size; ++i) {
    int err = xa_nn_vec_softmax_asym8s_16(
        &output_data[i * depth], &input_data[i * depth],
        op_data->params.diff_min, op_data->params.input_left_shift,
        op_data->params.input_multiplier, depth, p_scratch);
    TF_LITE_ENSURE(context, err == 0);
  }
  return kTfLiteOk;
}

#endif  // defined(FUSION_F1)

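// On the HiFi Mini and Fusion F1 targets, OpData lives in the persistent
// arena allocated below; all other targets fall back to the reference
// implementation's init/prepare/eval path.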
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
#if defined(HIFIMINI) || defined(FUSION_F1)
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
  return context->AllocatePersistentBuffer(context, sizeof(OpData));
#else
  return SoftmaxInit(context, buffer, length);
#endif
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
#if defined(HIFIMINI)
  return PrepareHifimini(context, node);
#elif defined(FUSION_F1)
  return PrepareHifi4(context, node);
#else
  return SoftmaxPrepare(context, node);
#endif
}

TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
  TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
  TFLITE_DCHECK(node->user_data != nullptr);

#if defined(HIFIMINI)
  auto* op_data = static_cast<OpData*>(node->user_data);

  if (input->type == kTfLiteInt8 && output->type == kTfLiteInt16) {
    return SoftmaxHifimini(*op_data, tflite::micro::GetTensorShape(input),
                           tflite::micro::GetTensorData<int8_t>(input),
                           tflite::micro::GetTensorShape(output),
                           tflite::micro::GetTensorData<int16_t>(output));
  } else {
    TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
                       TfLiteTypeGetName(input->type), input->type);
    return kTfLiteError;
  }
#else  // !defined(HIFIMINI)
  switch (input->type) {
    case kTfLiteFloat32: {
      SoftmaxParams op_data = *static_cast<SoftmaxParams*>(node->user_data);
      tflite::reference_ops::Softmax(
          op_data, tflite::micro::GetTensorShape(input),
          tflite::micro::GetTensorData<float>(input),
          tflite::micro::GetTensorShape(output),
          tflite::micro::GetTensorData<float>(output));
      return kTfLiteOk;
    }
    case kTfLiteInt8: {
      if (output->type == kTfLiteInt16) {
#if defined(FUSION_F1)
        return EvalHifi4(static_cast<OpData*>(node->user_data), input, output,
                         context);
#else
        SoftmaxParams op_data = *static_cast<SoftmaxParams*>(node->user_data);
        tflite::reference_ops::Softmax(
            op_data, tflite::micro::GetTensorShape(input),
            tflite::micro::GetTensorData<int8_t>(input),
            tflite::micro::GetTensorShape(output),
            tflite::micro::GetTensorData<int16_t>(output));
#endif
      } else {
        SoftmaxParams op_data = *static_cast<SoftmaxParams*>(node->user_data);
        tflite::reference_ops::Softmax(
            op_data, tflite::micro::GetTensorShape(input),
            tflite::micro::GetTensorData<int8_t>(input),
            tflite::micro::GetTensorShape(output),
            tflite::micro::GetTensorData<int8_t>(output));
      }
      return kTfLiteOk;
    }
    default:
      TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
                         TfLiteTypeGetName(input->type), input->type);
      return kTfLiteError;
  }
#endif  // !defined(HIFIMINI)
}

}  // namespace

TfLiteRegistration Register_SOFTMAX() {
  return {/*init=*/Init,
          /*free=*/nullptr,
          /*prepare=*/Prepare,
          /*invoke=*/Eval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
          /*version=*/0};
}
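
// Usage sketch (illustrative): applications typically pick this kernel up
// through an op resolver rather than calling Register_SOFTMAX() directly,
// e.g.
//   tflite::MicroMutableOpResolver<1> resolver;
//   resolver.AddSoftmax();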

}  // namespace tflite