/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/mean_stddev_normalization.h"

#include <algorithm>
#include <string>

#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

namespace {

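// Emits a helper that sums the four lanes of a float4 via a dot product with
// (1, 1, 1, 1), e.g. reduce_vector((float4)(1.0f, 2.0f, 3.0f, 4.0f)) == 10.0f.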
std::string GetVectorReduceCode() {
  return R"(float reduce_vector(float4 v) {
  return dot(v, INIT_FLOAT4(1.0f));
})";
}

std::string GetReduceCode(const GpuInfo& gpu_info, int reduction_size) {
  // If it is supported, use the built-in work_group_reduce_add function.
  // Otherwise, implement a reduction using __local memory.

  // In each reduction step, add the upper half of the still-to-be-summed
  // array to the lower half, taking care of odd sizes and rounding. E.g.:
  // Number of items still to be summed before: 5
  // Local memory before: [a, b, c, d, e];
  // Local memory after: [a+d, b+e, c, d, e];
  // Threads doing work: id < 2 = floor(5/2)
  // Offset to the added items: 3 = ceil(5/2)
  // Number of items still to be summed after: 3 = ceil(5/2)
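  // For illustration, starting from reduction_size == 5 the emitted while
  // loop runs through sizes 5 -> 3 -> 2 -> 1 (offsets 3, 2, 1), after which
  // tmp[0] holds the sum of all items.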
  std::string result;
  if (gpu_info.IsApiOpenCl()) {
    result += R"(
#if (__OPENCL_C_VERSION__ >= 200) && (__OPENCL_C_VERSION__ < 300) && \
  !defined(__opencl_c_work_group_collective_functions)
#define __opencl_c_work_group_collective_functions 1
#endif
)";
  }
  result += R"(
#ifdef __opencl_c_work_group_collective_functions
#define local_reduce(item, tmp, local_id) work_group_reduce_add(item)
#else  // !defined(__opencl_c_work_group_collective_functions)
float local_reduce(float item, __local float* tmp, int local_id) {
  tmp[local_id] = item;
  LOCAL_MEM_BARRIER;
  // The number of items that still need to be summed
)";
  result += "  int reduction_size = " + std::to_string(reduction_size) + ";\n";
  result += R"(  while (reduction_size > 1) {
    const int active_thread_limit = reduction_size / 2;
    const int offset = (reduction_size + 1) / 2;
    if (local_id < active_thread_limit) {
      item += tmp[local_id + offset];
      tmp[local_id] = item;
    }
    LOCAL_MEM_BARRIER;
    reduction_size = offset;
  }
  return tmp[0];
}
#endif  // defined(__opencl_c_work_group_collective_functions)
)";
  return result;
}

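// Emits a helper that zeroes out the lanes of a 4-channel slice that lie past
// the tensor's channel count, so padding channels do not contribute to the
// sums. E.g. with num_channels == 6 and slice == 1 (channels 4..7), lanes 2
// and 3 are replaced by 0.0f.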
std::string GetFilterCode() {
  return R"(
float4 filter_outside_tensor(float4 x, int num_channels, int slice) {
  return select(x, INIT_FLOAT4(0.0f), slice * 4 + INIT_INT4v4(0, 1, 2, 3) >= num_channels);
}
)";
}
}  // namespace

MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
                                                 const GpuInfo& gpu_info,
                                                 const int tensor_slices)
    : GPUOperation(definition) {
  // The kernel code does not inherently need a fixed size, but to avoid
  // hardcoding the __local array's size for the reductions, we would have to
  // pass that size to the kernel at runtime, which is currently not supported.
  // For now, fix the work group size to the largest size the device supports,
  // but no larger than the number of tensor slices.
  int desired_work_group_size =
      std::min(tensor_slices, gpu_info.GetMaxWorkGroupSizeForX());
  if (gpu_info.IsMali()) {
    // Don't use more than 64 work items per work group on ARM Mali. Mali
    // implements local memory in global memory, so larger work groups carry a
    // severe performance penalty.
    desired_work_group_size = 64;
  }
  if (gpu_info.IsAdreno()) {
    AdrenoInfo info = gpu_info.adreno_info;
    if (info.IsAdreno3xx()) {
      if (info.adreno_gpu == AdrenoGpu::kAdreno320 ||
          info.adreno_gpu == AdrenoGpu::kAdreno330) {
        desired_work_group_size = 128;
      } else {
        desired_work_group_size = 64;
      }
    } else if (info.IsAdreno4xx()) {
      if (info.adreno_gpu == AdrenoGpu::kAdreno430) {
        desired_work_group_size = 256;
      } else {
        desired_work_group_size = 128;
      }
    } else if (info.IsAdreno5xx()) {
      if (info.adreno_gpu == AdrenoGpu::kAdreno530 ||
          info.adreno_gpu == AdrenoGpu::kAdreno540) {
        desired_work_group_size = 256;
      } else {
        desired_work_group_size = 128;
      }
    }
  }
  if (gpu_info.IsPowerVR()) {
    desired_work_group_size = 64;
  }
  if (gpu_info.IsApple()) {
    desired_work_group_size = 64;
  }
  while (desired_work_group_size >= tensor_slices * 2) {
    desired_work_group_size /= 2;
  }
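  // The halving above trims a vendor-specific default back down so that fewer
  // than half of the work items sit idle. For illustration, with
  // tensor_slices == 5 and a Mali default of 64, the size is halved
  // 64 -> 32 -> 16 -> 8; each of the 8 work items then reads at most one
  // slice in the kernel's per-slice loops.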
  work_group_size_.x = desired_work_group_size;
  work_group_size_.y = 1;  // Required
  work_group_size_.z = 1;  // Required
  code_ = GetNormalizationCode(gpu_info);
  // Prefer the newest OpenCL C standard the device supports, so that the
  // built-in work_group_reduce_add path in GetReduceCode() can be used.
  if (gpu_info.IsCL30OrHigher()) {
    compiler_options_.push_back(CompilerOptions::kCl30);
  } else if (gpu_info.IsCL20OrHigher()) {
    compiler_options_.push_back(CompilerOptions::kCl20);
  }
}

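// Generates the kernel source. For each batch element b, a single work group
// cooperatively computes
//   mean     = (1/C) * sum_c src[c]
//   variance = (1/C) * sum_c (src[c] - mean)^2
//   dst[c]   = (src[c] - mean) * rsqrt(variance + 1e-8)
// where C is the number of channels (args.src_tensor.Channels()).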
std::string MeanStdDevNormalization::GetNormalizationCode(
    const GpuInfo& gpu_info) {
  AddSrcTensor("src_tensor", definition_.src_tensors[0]);
  AddDstTensor("dst_tensor", definition_.dst_tensors[0]);

  std::string c;
  c += GetVectorReduceCode();
  c += GetReduceCode(gpu_info, work_group_size_.x);
  c += GetFilterCode();
  if (gpu_info.IsApiOpenCl()) {
    c += "__attribute__((reqd_work_group_size(" +
         std::to_string(work_group_size_.x) + ", 1, 1)))\n";
  }
  if (gpu_info.IsApiMetal()) {
    c += "#define native_rsqrt(value) rsqrt(value)\n";
  }
  c += R"(MAIN_FUNCTION($0) {
#ifndef __opencl_c_work_group_collective_functions
  __local float tmp[)" +
       std::to_string(work_group_size_.x) + R"(];
#endif
  int B = GLOBAL_ID_1;
  // Calculate the total sum of the input tensor.
  // First, get a local sum of input[local_id_x + N*local_size_x] for all N.
  float4 private_sum4 = INIT_FLOAT4(0.0f);
  for (int S = LOCAL_ID_0; S < args.src_tensor.Slices(); S += GROUP_SIZE_0) {
    float4 t = args.src_tensor.Read<float>(0, 0, S, B);
    private_sum4 += filter_outside_tensor(t, args.src_tensor.Channels(), S);
  }
  // Reduce the vector to a single float and do a workgroup reduce.
  float private_sum = reduce_vector(private_sum4);
  float sum = local_reduce(private_sum, tmp, LOCAL_ID_0);
  // Calculate the mean
  float mean = sum / args.src_tensor.Channels();
  // Calculate the squared sum of the difference from the mean.
  float4 private_sum_diff_sq4 = INIT_FLOAT4(0.0f);
  for (int S = LOCAL_ID_0; S < args.src_tensor.Slices(); S += GROUP_SIZE_0) {
    float4 t = args.src_tensor.Read<float>(0, 0, S, B);
    float4 diff = filter_outside_tensor(t - mean, args.src_tensor.Channels(), S);
    private_sum_diff_sq4 += diff * diff;
  }
  // Reduce
  float private_sum_diff_sq = reduce_vector(private_sum_diff_sq4);
  float sum_diff_sq = local_reduce(private_sum_diff_sq, tmp, LOCAL_ID_0);
  // Calculate 1/stddev (with the 'regularizing constant' as in tensor_utils.cc)
  float variance = sum_diff_sq / args.src_tensor.Channels();
  float stddev_inv = native_rsqrt(variance + 1.0e-8f);
  // Calculate (t-mean)/stddev for each element
  for (int S = LOCAL_ID_0; S < args.src_tensor.Slices(); S += GROUP_SIZE_0) {
    float4 t = args.src_tensor.Read<float>(0, 0, S, B);
    FLT4 result = TO_FLT4((t - mean) * stddev_inv);
    args.dst_tensor.Write(result, 0, 0, S, B);
  }
})";
  return c;
}

int3 MeanStdDevNormalization::GetGridSize() const {
  // To avoid dealing with global reductions, we restrict the grid size to the
  // work group size in the first dimension.
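  // E.g. with work_group_size_.x == 8 and a batch of 4 the grid is (8, 4, 1),
  // i.e. one work group of 8 threads per batch element.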
  const int grid_x = work_group_size_.x;
  const int grid_y = src_[0]->Batch();
  const int grid_z = 1;
  return int3(grid_x, grid_y, grid_z);
}

MeanStdDevNormalization CreateMeanStdDevNormalization(
    const OperationDef& definition, const GpuInfo& gpu_info,
    const int tensor_slices) {
  return MeanStdDevNormalization(definition, gpu_info, tensor_slices);
}

}  // namespace gpu
}  // namespace tflite