/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/mean_stddev_normalization.h"

#include <algorithm>
#include <string>

#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

namespace {

std::string GetReduceCode(const GpuInfo& gpu_info, int reduction_size) {
  // If it is supported, use the built-in work_group_reduce_add function.
  // Otherwise, implement a reduction using __local memory.

  // In the reduction step, add the upper half of the still-to-be-summed vector
  // to the lower half, while taking care of odd sizes and rounding. E.g.:
  // Number of items still to be summed before: 5
  // Local memory before: [a, b, c, d, e];
  // Local memory after: [a+d, b+e, c, d, e];
  // Threads doing work: id < 2 = floor(5/2)
  // Offset to the added items: 3 = ceil(5/2)
  // Number of items still to be summed after: 3 = ceil(5/2)
  std::string result;
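  // OpenCL C 2.x always provides work_group_reduce_add but predates the
  // __opencl_c_work_group_collective_functions feature macro (introduced in
  // OpenCL C 3.0), so define the macro manually to take the fast path below.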
  if (gpu_info.IsApiOpenCl()) {
    result += R"(
#if (__OPENCL_C_VERSION__ >= 200) && (__OPENCL_C_VERSION__ < 300) && \
    !defined(__opencl_c_work_group_collective_functions)
  #define __opencl_c_work_group_collective_functions 1
#endif
)";
  }
  result += R"(
#ifdef __opencl_c_work_group_collective_functions
#define local_reduce(item, tmp, local_id) work_group_reduce_add(item)
#else  // !defined(__opencl_c_work_group_collective_functions)
)";
  if (gpu_info.IsGlsl()) {
    result += "float local_reduce(float item, int local_id) {\n";
  } else {
    result +=
        "float local_reduce(float item, __local float* shared_mem, int "
        "local_id) {\n";
  }
  result += R"(
  shared_mem[local_id] = item;
  LOCAL_MEM_BARRIER;
  // The number of items that still need to be summed
)";
  result += "  int reduction_size = " + std::to_string(reduction_size) + ";\n";
  result += R"(  while (reduction_size > 1) {
    int active_thread_limit = reduction_size / 2;
    int offset = (reduction_size + 1) / 2;
    if (local_id < active_thread_limit) {
      item += shared_mem[local_id + offset];
      shared_mem[local_id] = item;
    }
    LOCAL_MEM_BARRIER;
    reduction_size = offset;
  }
  return shared_mem[0];
}
#endif  // defined(__opencl_c_work_group_collective_functions)
)";
  return result;
}

std::string GetFilterCode(const GpuInfo& gpu_info) {
  if (gpu_info.IsGlsl()) {
    return R"(
vec4 filter_outside_tensor(vec4 x, int num_channels, int slice) {
  vec4 result;
  result.x = slice * 4 + 0 < num_channels ? x.x : 0.0f;
  result.y = slice * 4 + 1 < num_channels ? x.y : 0.0f;
  result.z = slice * 4 + 2 < num_channels ? x.z : 0.0f;
  result.w = slice * 4 + 3 < num_channels ? x.w : 0.0f;
  return result;
}
)";
  } else {
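    // select() zeroes out the lanes whose channel index lies outside the
    // tensor, i.e. where slice * 4 + i >= num_channels.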
97 return R"(
98 float4 filter_outside_tensor(float4 x, int num_channels, int slice) {
99 return select(x, INIT_FLOAT4(0.0f), slice * 4 + INIT_INT4v4(0, 1, 2, 3) >= num_channels);
100 }
101 )";
102 }
103 }
104 } // namespace
105
MeanStdDevNormalization(const OperationDef & definition,const GpuInfo & gpu_info,const int tensor_slices)106 MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
107 const GpuInfo& gpu_info,
108 const int tensor_slices)
109 : GPUOperation(definition) {
110 // The kernel code does not inherently need a fixed size, but in order to not
111 // hardcode the __local array's size for the reductions, we would need to pass
112 // that size to the kernel at runtime, and that is currently not supported.
113 // For now, fix workgroup size to the biggest supported by the device, but not
114 // larger than the number of tensor slices.
115 int desired_work_group_size =
116 std::min(tensor_slices, gpu_info.GetMaxWorkGroupSizeForX());
117 if (gpu_info.IsMali()) {
118 // Don't use more than 64 work items per work group on ARM Mali. They
119 // implement local memory using the global memory, larger workgroups have
120 // severe performance penalty.
121 desired_work_group_size = 64;
122 }
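  // Per-generation tuning of the work group size for Adreno GPUs.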
  if (gpu_info.IsAdreno()) {
    AdrenoInfo info = gpu_info.adreno_info;
    if (info.IsAdreno3xx()) {
      if (info.adreno_gpu == AdrenoGpu::kAdreno320 ||
          info.adreno_gpu == AdrenoGpu::kAdreno330) {
        desired_work_group_size = 128;
      } else {
        desired_work_group_size = 64;
      }
    } else if (info.IsAdreno4xx()) {
      if (info.adreno_gpu == AdrenoGpu::kAdreno430) {
        desired_work_group_size = 256;
      } else {
        desired_work_group_size = 128;
      }
    } else if (info.IsAdreno5xx()) {
      if (info.adreno_gpu == AdrenoGpu::kAdreno530 ||
          info.adreno_gpu == AdrenoGpu::kAdreno540) {
        desired_work_group_size = 256;
      } else {
        desired_work_group_size = 128;
      }
    }
  }
  if (gpu_info.IsPowerVR()) {
    desired_work_group_size = 64;
  }
  if (gpu_info.IsApple()) {
    desired_work_group_size = 64;
  }
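  // Shrink the work group while at least half of its work items would have no
  // slice to process (i.e. while it is at least twice the number of slices).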
  while (desired_work_group_size >= tensor_slices * 2) {
    desired_work_group_size /= 2;
  }
  work_group_size_.x = desired_work_group_size;
  work_group_size_.y = 1;  // Required
  work_group_size_.z = 1;  // Required
  code_ = GetNormalizationCode(gpu_info);
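  // Request the newest available OpenCL C version so that
  // work_group_reduce_add can be used in the generated code (see
  // GetReduceCode).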
  if (gpu_info.IsCL30OrHigher()) {
    compiler_options_.push_back(CompilerOptions::kCl30);
  } else if (gpu_info.IsCL20OrHigher()) {
    compiler_options_.push_back(CompilerOptions::kCl20);
  }
}

std::string MeanStdDevNormalization::GetNormalizationCode(
    const GpuInfo& gpu_info) {
  AddSrcTensor("src_tensor", definition_.src_tensors[0]);
  AddDstTensor("dst_tensor", definition_.dst_tensors[0]);

  std::string c;
  if (gpu_info.IsGlsl()) {
    c += "shared float shared_mem[" + std::to_string(work_group_size_.x) +
         "];\n";
  }
  c += GetReduceCode(gpu_info, work_group_size_.x);
  c += GetFilterCode(gpu_info);
  if (gpu_info.IsApiOpenCl()) {
    c += "__attribute__((reqd_work_group_size(" +
         std::to_string(work_group_size_.x) + ", 1, 1)))\n";
  }
  if (gpu_info.IsApiMetal()) {
    c += "#define native_rsqrt(value) rsqrt(value)\n";
  }
  if (gpu_info.IsGlsl()) {
    c += "#define native_rsqrt(value) inversesqrt(value)\n";
  }
  if (gpu_info.IsGlsl()) {
    c += "#define LOCAL_REDUCE(item, shared_mem, local_id) local_reduce(item, "
         "local_id)\n";
  } else {
    c += "#define LOCAL_REDUCE(item, shared_mem, local_id) local_reduce(item, "
         "shared_mem, local_id)\n";
  }
  c += "MAIN_FUNCTION($0) {\n";
  if (!gpu_info.IsGlsl()) {
    c += "#ifndef __opencl_c_work_group_collective_functions\n";
    c += "  __local float tmp[" + std::to_string(work_group_size_.x) + "];\n";
    c += "#endif\n";
  }
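  // The generated kernel: each work group handles one batch element; its work
  // items stride over the slices and cooperate through LOCAL_REDUCE.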
202 c += R"(
203 int B = GLOBAL_ID_1;
204 // Calculate the total sum of the input tensor.
205 // First, get a local sum of input[local_id_x + N*local_size_x] for all N.
206 float4 private_sum4 = INIT_FLOAT4(0.0f);
207 int local_id = LOCAL_ID_0;
208 for (int S = local_id; S < args.src_tensor.Slices(); S += GROUP_SIZE_0) {
209 float4 t = args.src_tensor.Read<float>(0, 0, S, B);
210 private_sum4 += filter_outside_tensor(t, args.src_tensor.Channels(), S);
211 }
212 // Reduce the vector to a single float and do a workgroup reduce.
213 float private_sum = dot(private_sum4, INIT_FLOAT4(1.0f));
214 float sum = LOCAL_REDUCE(private_sum, tmp, local_id);
215 // Calculate the mean
216 float mean = sum / INIT_FLOAT(args.src_tensor.Channels());
217 // Calculate the squared sum of the difference from the mean.
218 float4 private_sum_diff_sq4 = INIT_FLOAT4(0.0f);
219 for (int S = local_id; S < args.src_tensor.Slices(); S += GROUP_SIZE_0) {
220 float4 t = args.src_tensor.Read<float>(0, 0, S, B);
221 float4 diff = filter_outside_tensor(t - mean, args.src_tensor.Channels(), S);
222 private_sum_diff_sq4 += diff * diff;
223 }
224 // Reduce
225 float private_sum_diff_sq = dot(private_sum_diff_sq4, INIT_FLOAT4(1.0f));
226 float sum_diff_sq = LOCAL_REDUCE(private_sum_diff_sq, tmp, local_id);
227 // Calculate 1/stddev (with the 'regulazing constant' as in tensor_utils.cc)
228 float variance = sum_diff_sq / INIT_FLOAT(args.src_tensor.Channels());
229 float stddev_inv = native_rsqrt(variance + 1.0e-8f);
230 // Calculate (t-mean)/stddev for each element
231 for (int S = local_id; S < args.src_tensor.Slices(); S += GROUP_SIZE_0) {
232 float4 t = args.src_tensor.Read<float>(0, 0, S, B);
233 FLT4 result = TO_FLT4((t - mean) * stddev_inv);
234 args.dst_tensor.Write(result, 0, 0, S, B);
235 }
236 })";
237 return c;
238 }
239
GetGridSize() const240 int3 MeanStdDevNormalization::GetGridSize() const {
241 // To avoid dealing with global reductions, we restrict the grid size to the
242 // work group size in the first dimension.
243 const int grid_x = work_group_size_.x;
244 const int grid_y = src_[0]->Batch();
245 const int grid_z = 1;
246 return int3(grid_x, grid_y, grid_z);
247 }
248
CreateMeanStdDevNormalization(const OperationDef & definition,const GpuInfo & gpu_info,const int tensor_slices)249 MeanStdDevNormalization CreateMeanStdDevNormalization(
250 const OperationDef& definition, const GpuInfo& gpu_info,
251 const int tensor_slices) {
252 return MeanStdDevNormalization(definition, gpu_info, tensor_slices);
253 }
254
255 } // namespace gpu
256 } // namespace tflite
257