/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/common/tasks/mean_stddev_normalization.h"

#include <algorithm>
#include <string>

#include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"

namespace tflite {
namespace gpu {

namespace {

std::string GetReduceCode(const GpuInfo& gpu_info, int reduction_size) {
  // If it is supported, use the built-in work_group_reduce_add function.
  // Otherwise, implement a reduction using __local memory.

  // In the reduction step, add the upper half of the still-to-be-summed vector
  // to the lower half, while taking care of odd sizes and rounding. E.g.:
  // Number of items still to be summed before: 5
  // Local memory before: [a, b, c, d, e];
  // Local memory after: [a+d, b+e, c, d, e];
  // Threads doing work: id < 2 = floor(5/2)
  // Offset to the added items: 3 = ceil(5/2)
  // Number of items still to be summed after: 3 = ceil(5/2)
  std::string result;
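  // Work-group collective functions are a core feature of OpenCL C 2.x, but
  // the feature macro __opencl_c_work_group_collective_functions only exists
  // as of OpenCL C 3.0, so define it manually for 2.x; the #ifdef below then
  // selects the built-in work_group_reduce_add path.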
  if (gpu_info.IsApiOpenCl()) {
    result += R"(
#if (__OPENCL_C_VERSION__ >= 200) && (__OPENCL_C_VERSION__ < 300) && \
  !defined(__opencl_c_work_group_collective_functions)
  #define __opencl_c_work_group_collective_functions 1
#endif
)";
  }
  result += R"(
#ifdef __opencl_c_work_group_collective_functions
#define local_reduce(item, tmp, local_id) work_group_reduce_add(item)
#else  // !defined(__opencl_c_work_group_collective_functions)
)";
  if (gpu_info.IsGlsl()) {
    result += "float local_reduce(float item, int local_id) {\n";
  } else {
    result +=
        "float local_reduce(float item, __local float* shared_mem, int "
        "local_id) {\n";
  }
  result += R"(
  shared_mem[local_id] = item;
  LOCAL_MEM_BARRIER;
  // The number of items that still need to be summed
)";
  result += "  int reduction_size = " + std::to_string(reduction_size) + ";\n";
  result += R"(  while (reduction_size > 1) {
    int active_thread_limit = reduction_size / 2;
    int offset = (reduction_size + 1) / 2;
    if (local_id < active_thread_limit) {
      item += shared_mem[local_id + offset];
      shared_mem[local_id] = item;
    }
    LOCAL_MEM_BARRIER;
    reduction_size = offset;
  }
  return shared_mem[0];
}
#endif  // defined(__opencl_c_work_group_collective_functions)
)";
  return result;
}

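// Returns code for a helper that zeroes out the lanes of a four-channel slice
// whose channel index lies beyond the tensor's channel count, so padding in
// the last slice does not contribute to the reductions.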
std::string GetFilterCode(const GpuInfo& gpu_info) {
  if (gpu_info.IsGlsl()) {
    return R"(
vec4 filter_outside_tensor(vec4 x, int num_channels, int slice) {
  vec4 result;
  result.x = slice * 4 + 0 < num_channels ? x.x : 0.0f;
  result.y = slice * 4 + 1 < num_channels ? x.y : 0.0f;
  result.z = slice * 4 + 2 < num_channels ? x.z : 0.0f;
  result.w = slice * 4 + 3 < num_channels ? x.w : 0.0f;
  return result;
}
)";
  } else {
    return R"(
float4 filter_outside_tensor(float4 x, int num_channels, int slice) {
  return select(x, INIT_FLOAT4(0.0f), slice * 4 + INIT_INT4v4(0, 1, 2, 3) >= num_channels);
}
)";
  }
}
}  // namespace

MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition,
                                                 const GpuInfo& gpu_info,
                                                 const int tensor_slices)
    : GPUOperation(definition) {
  // The kernel code does not inherently need a fixed size, but in order to not
  // hardcode the __local array's size for the reductions, we would need to
  // pass that size to the kernel at runtime, which is currently not supported.
  // For now, fix the work group size to the biggest supported by the device,
  // but not larger than the number of tensor slices.
  int desired_work_group_size =
      std::min(tensor_slices, gpu_info.GetMaxWorkGroupSizeForX());
  if (gpu_info.IsMali()) {
    // Don't use more than 64 work items per work group on ARM Mali. Mali
    // implements local memory in global memory, so larger work groups incur a
    // severe performance penalty.
    desired_work_group_size = 64;
  }
  if (gpu_info.IsAdreno()) {
    AdrenoInfo info = gpu_info.adreno_info;
    if (info.IsAdreno3xx()) {
      if (info.adreno_gpu == AdrenoGpu::kAdreno320 ||
          info.adreno_gpu == AdrenoGpu::kAdreno330) {
        desired_work_group_size = 128;
      } else {
        desired_work_group_size = 64;
      }
    } else if (info.IsAdreno4xx()) {
      if (info.adreno_gpu == AdrenoGpu::kAdreno430) {
        desired_work_group_size = 256;
      } else {
        desired_work_group_size = 128;
      }
    } else if (info.IsAdreno5xx()) {
      if (info.adreno_gpu == AdrenoGpu::kAdreno530 ||
          info.adreno_gpu == AdrenoGpu::kAdreno540) {
        desired_work_group_size = 256;
      } else {
        desired_work_group_size = 128;
      }
    }
  }
  if (gpu_info.IsPowerVR()) {
    desired_work_group_size = 64;
  }
  if (gpu_info.IsApple()) {
    desired_work_group_size = 64;
  }
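  // Halve the work group while it is at least twice the number of slices, so
  // that more than half of the work items have at least one slice to process.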
  while (desired_work_group_size >= tensor_slices * 2) {
    desired_work_group_size /= 2;
  }
  work_group_size_.x = desired_work_group_size;
  work_group_size_.y = 1;  // Required
  work_group_size_.z = 1;  // Required
  code_ = GetNormalizationCode(gpu_info);
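  // Request the highest OpenCL C standard the device supports so that the
  // built-in work_group_reduce_add path emitted by GetReduceCode is available.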
  if (gpu_info.IsCL30OrHigher()) {
    compiler_options_.push_back(CompilerOptions::kCl30);
  } else if (gpu_info.IsCL20OrHigher()) {
    compiler_options_.push_back(CompilerOptions::kCl20);
  }
}

std::string MeanStdDevNormalization::GetNormalizationCode(
    const GpuInfo& gpu_info) {
  AddSrcTensor("src_tensor", definition_.src_tensors[0]);
  AddDstTensor("dst_tensor", definition_.dst_tensors[0]);

  std::string c;
  if (gpu_info.IsGlsl()) {
    c += "shared float shared_mem[" + std::to_string(work_group_size_.x) +
         "];\n";
  }
  c += GetReduceCode(gpu_info, work_group_size_.x);
  c += GetFilterCode(gpu_info);
  if (gpu_info.IsApiOpenCl()) {
    c += "__attribute__((reqd_work_group_size(" +
         std::to_string(work_group_size_.x) + ", 1, 1)))\n";
  }
  if (gpu_info.IsApiMetal()) {
    c += "#define native_rsqrt(value) rsqrt(value)\n";
  }
  if (gpu_info.IsGlsl()) {
    c += "#define native_rsqrt(value) inversesqrt(value)\n";
  }
  if (gpu_info.IsGlsl()) {
    c += "#define LOCAL_REDUCE(item, shared_mem, local_id) local_reduce(item, "
         "local_id)\n";
  } else {
    c += "#define LOCAL_REDUCE(item, shared_mem, local_id) local_reduce(item, "
         "shared_mem, local_id)\n";
  }
  c += "MAIN_FUNCTION($0) {\n";
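  // Without built-in work-group collective functions, the local_reduce
  // fallback needs a __local scratch buffer sized to the work group; the GLSL
  // path declares its shared buffer at file scope above instead.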
  if (!gpu_info.IsGlsl()) {
    c += "#ifndef __opencl_c_work_group_collective_functions\n";
    c += "  __local float tmp[" + std::to_string(work_group_size_.x) + "];\n";
    c += "#endif\n";
  }
  c += R"(
  int B = GLOBAL_ID_1;
  // Calculate the total sum of the input tensor.
  // First, get a local sum of input[local_id_x + N*local_size_x] for all N.
  float4 private_sum4 = INIT_FLOAT4(0.0f);
  int local_id = LOCAL_ID_0;
  for (int S = local_id; S < args.src_tensor.Slices(); S += GROUP_SIZE_0) {
    float4 t = args.src_tensor.Read<float>(0, 0, S, B);
    private_sum4 += filter_outside_tensor(t, args.src_tensor.Channels(), S);
  }
  // Reduce the vector to a single float and do a workgroup reduce.
  float private_sum = dot(private_sum4, INIT_FLOAT4(1.0f));
  float sum = LOCAL_REDUCE(private_sum, tmp, local_id);
  // Calculate the mean.
  float mean = sum / INIT_FLOAT(args.src_tensor.Channels());
  // Calculate the squared sum of the difference from the mean.
  float4 private_sum_diff_sq4 = INIT_FLOAT4(0.0f);
  for (int S = local_id; S < args.src_tensor.Slices(); S += GROUP_SIZE_0) {
    float4 t = args.src_tensor.Read<float>(0, 0, S, B);
    float4 diff = filter_outside_tensor(t - mean, args.src_tensor.Channels(), S);
    private_sum_diff_sq4 += diff * diff;
  }
  // Reduce.
  float private_sum_diff_sq = dot(private_sum_diff_sq4, INIT_FLOAT4(1.0f));
  float sum_diff_sq = LOCAL_REDUCE(private_sum_diff_sq, tmp, local_id);
  // Calculate 1/stddev (with the 'regularizing constant' as in tensor_utils.cc).
  float variance = sum_diff_sq / INIT_FLOAT(args.src_tensor.Channels());
  float stddev_inv = native_rsqrt(variance + 1.0e-8f);
  // Calculate (t - mean) / stddev for each element.
  for (int S = local_id; S < args.src_tensor.Slices(); S += GROUP_SIZE_0) {
    float4 t = args.src_tensor.Read<float>(0, 0, S, B);
    FLT4 result = TO_FLT4((t - mean) * stddev_inv);
    args.dst_tensor.Write(result, 0, 0, S, B);
  }
})";
  return c;
}

int3 MeanStdDevNormalization::GetGridSize() const {
  // To avoid dealing with global reductions, we restrict the grid size to the
  // work group size in the first dimension.
  const int grid_x = work_group_size_.x;
  const int grid_y = src_[0]->Batch();
  const int grid_z = 1;
  return int3(grid_x, grid_y, grid_z);
}

MeanStdDevNormalization CreateMeanStdDevNormalization(
    const OperationDef& definition, const GpuInfo& gpu_info,
    const int tensor_slices) {
  return MeanStdDevNormalization(definition, gpu_info, tensor_slices);
}

}  // namespace gpu
}  // namespace tflite