/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_

#include <algorithm>
#include <cstdint>
#include <limits>

#include "fixedpoint/fixedpoint.h"  // from @gemmlowp
#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

// Left-shifts `value` by `amount` bits, saturating the result to the
// int16_t range.
inline int16_t SaturatingLeftShift(int16_t value, int amount) {
  int64_t result = static_cast<int64_t>(value) * (1 << amount);
  result = std::min<int64_t>(result, std::numeric_limits<int16_t>::max());
  result = std::max<int64_t>(result, std::numeric_limits<int16_t>::min());
  return static_cast<int16_t>(result);
}
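// For illustration (not from the original source):
//   SaturatingLeftShift(1000, 3)   ==  8000   (no saturation)
//   SaturatingLeftShift(20000, 2)  ==  32767  (80000 clamped to INT16_MAX)
//   SaturatingLeftShift(-20000, 2) == -32768  (-80000 clamped to INT16_MIN)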

// Similar to ARM instruction SQDMULH.
// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except
// rounding to zero instead of to nearest (SQRDMULH).
inline std::int16_t SaturatingDoublingHighMul(std::int16_t a, std::int16_t b) {
  bool overflow = a == b && a == std::numeric_limits<std::int16_t>::min();
  std::int32_t a_32(a);
  std::int32_t b_32(b);
  std::int32_t ab_32 = a_32 * b_32;
  std::int16_t ab_x2_high16 = static_cast<std::int16_t>(ab_32 / (1 << 15));
  return overflow ? std::numeric_limits<std::int16_t>::max() : ab_x2_high16;
}
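// For illustration (not from the original source): with a = b = 128,
// ab_32 == 16384 and this function returns 16384 / 32768 == 0, truncating
// toward zero, where the rounding variant would return 1. The `overflow`
// guard covers a == b == -32768, whose doubled high product would be
// +32768, one past the int16_t maximum, so +32767 is returned instead.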

// Float reference implementation of hard-swish:
//   hard_swish(x) = x * relu6(x + 3) / 6
template <typename T>
inline void HardSwish(const RuntimeShape& input_shape, const T* input_data,
                      const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("ReferenceHardSwish/Float");
  auto matching_size = MatchingFlatSize(input_shape, output_shape);
  const T* in_end = input_data + matching_size;
  for (; input_data < in_end; input_data++, output_data++) {
    const float in = *input_data;
    *output_data =
        in * std::min(static_cast<T>(6), std::max(static_cast<T>(0), in + 3)) /
        6;
  }
}
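// Sample values of the formula above (illustrative, not from the original
// source):
//   hard_swish(-4.0f) = -4 * 0 / 6 = 0
//   hard_swish(1.0f)  =  1 * 4 / 6 ~ 0.667
//   hard_swish(4.0f)  =  4 * 6 / 6 = 4   (identity for x >= 3)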

template <typename T>
inline void HardSwish(const HardSwishParams& params,
                      const RuntimeShape& input_shape, const T* input_data,
                      const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("ReferenceHardSwish/Quantized");

  const int flat_size = MatchingFlatSize(input_shape, output_shape);

  for (int i = 0; i < flat_size; i++) {
    const int16_t input_value = input_data[i] - params.input_zero_point;
    // Left-shift as much as we can without overflow/saturation to put
    // significant bits in the high bits of our 16-bit fixedpoint values, so
    // that fixed-point approximate computations below are as accurate as
    // possible.
    const int16_t input_value_on_hires_input_scale = input_value * (1 << 7);
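    // (Illustrative note, not from the original source: for 8-bit inputs,
    // |input_value| <= 255, so the left shift by 7 reaches at most 32640 in
    // magnitude, which still fits in int16_t without overflow.)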
    // Compute the input value on essentially the output scale, just not
    // right-shifted yet. This is the value that we'll use in the (x >= +3)
    // case, and that in the general case we'll multiply against the "relu-ish"
    // fixed-point multiplier in [0, 1].
    const int16_t input_value_on_preshift_output_scale =
        gemmlowp::SaturatingRoundingDoublingHighMul(
            input_value_on_hires_input_scale,
            params.output_multiplier_fixedpoint_int16);
    // Now compute the "relu-ish multiplier". In the (-3 <= x <= +3) case, that
    // is just an affine rescaling of x from [-3, 3] to [0, 1]. In the general
    // case, it is just that plus saturation at the boundaries of [-3, 3].
    // First, we rescale from [-3, 3] to [-1, 1], saturating.
    // That is done by rescaling the input value with a fixed-point multiplier
    // (reluish_multiplier_fixedpoint) and bit-shift such that we represent
    // that input value on the scale where the real value 3.0f is represented
    // by the quantized value 32768. (+32768 is actually not representable as
    // int16_t, so this saturates at +32767, and that is seen empirically to be
    // a negligible contribution to numerical error/bias).
    //
    // This code is careful to correctly implement any magnitude of multiplier,
    // involving either a right shift or a left shift, with correct saturation
    // behavior in the left-shift case. This forces this code to be more
    // complicated, but is necessary for real applications: a partially
    // trained quantized MobileNet v3-small model that motivated this code
    // exhibits some large [min, max] range boundaries, of the order of
    // magnitude of 10 or 100 depending on layers.
    //
    // The next few lines are basically just an ordinary
    // MultiplyByQuantizedMultiplier, except that we are more careful here
    // about the fine details of saturation when left-shifting, because here
    // overflow in left-shift is a common case, not an anomaly as
    // MultiplyByQuantizedMultiplier assumes.
    int16_t reluish_value = input_value_on_hires_input_scale;
    // Shift left, saturating, as much as we can while ensuring that this
    // saturation will not contribute to the result. That is, left shift amount
    // reduced by 1.
    if (params.reluish_multiplier_exponent > 0) {
      reluish_value = SaturatingLeftShift(
          reluish_value, params.reluish_multiplier_exponent - 1);
    }
    // Apply the fixed-point multiplier, dividing the value by a divisor
    // ranging in [1, 2].
    reluish_value = gemmlowp::SaturatingRoundingDoublingHighMul(
        reluish_value, params.reluish_multiplier_fixedpoint_int16);
    // Apply the last bit of left-shift. Thus, in the left-shifting case, if
    // any saturation affects the result, it is happening here --- any
    // saturation having occurred above is overwritten here, not affecting the
    // result.
    if (params.reluish_multiplier_exponent > 0) {
      reluish_value = SaturatingLeftShift(reluish_value, 1);
    }
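    // (Worked example, not from the original source: with
    // reluish_multiplier_exponent == 2 and a fixed-point multiplier of 0.75,
    // reluish_value == 30000 first becomes SaturatingLeftShift(30000, 1) ==
    // 32767, then ~24575 after the multiply, then saturates back to 32767 in
    // the final 1-bit shift -- matching the exact result, since
    // 30000 * 4 * 0.75 == 90000 saturates anyway. Because the multiplier is
    // at least 0.5, any value saturated by the first shift would also
    // saturate in the exact computation.)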
    // Shift right, in the right-shifting case.
    if (params.reluish_multiplier_exponent < 0) {
      reluish_value = gemmlowp::RoundingDivideByPOT(
          reluish_value, -params.reluish_multiplier_exponent);
    }
    // At this point we have rescaled the value into a 16bit fixedpoint
    // reluish_value in [-1, 1].
    // We now convert that to a 16bit fixedpoint value in [0, 1].
    reluish_value = (reluish_value + (1 << 15)) >> 1;
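    // (Illustrative mapping, not from the original source: quantized values
    // -32768, 0, +32767 map to 0, 16384, 32767, i.e. real values -1, 0, +1
    // land at 0, 0.5, ~1 on the [0, 1] output scale.)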
    // Use of SaturatingDoublingHighMul here is important to cancel the biases
    // from the above SaturatingRoundingDoublingHighMul.
    //
    // On a partially trained MobileNet-v3-small,
    //
    //                                   | bias on   | ImageNet
    //                                   | quantized | Top-1
    // Operation used here               | values    | accuracy (50k)
    // ----------------------------------+-----------+---------------
    // SaturatingDoublingHighMul         | -0.0024   | 58.920
    // SaturatingRoundingDoublingHighMul | -0.0067   | 58.064
    //
    // In activations_test, this is covered by this testcase:
    // QuantizedActivationsOpTest.HardSwishBias
    //
    const int16_t preshift_output_value = SaturatingDoublingHighMul(
        reluish_value, input_value_on_preshift_output_scale);
    // We were so far operating on the pre-shift output scale. Now we finally
    // apply that output shift, arriving at the final output scale.
    int16_t output_value = gemmlowp::RoundingDivideByPOT(
        preshift_output_value, -params.output_multiplier_exponent);
    output_value += params.output_zero_point;
    output_value =
        std::min<int16_t>(output_value, std::numeric_limits<T>::max());
    output_value =
        std::max<int16_t>(output_value, std::numeric_limits<T>::min());
    output_data[i] = output_value;
  }
}
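
// Note (added commentary, not part of the original source): the
// HardSwishParams fields consumed above -- zero points, fixed-point
// multipliers, and shift exponents -- are expected to be precomputed from
// the input and output quantization scales when the kernel is prepared, so
// that the per-element loop above runs in pure integer arithmetic.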

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ACTIVATIONS_H_