/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/kernels/mean.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <set>
#include <string>
#include <vector>

#include "absl/memory/memory.h"
#include "absl/status/status.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

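// Returns true when the subgroup-based implementation can be used: Vulkan 1.1
// or newer with subgroup arithmetic support and a subgroup size of at least
// 32.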
bool UseSubgroupBasedImpl(const GpuInfo& gpu_info) {
  return gpu_info.IsApiVulkan() &&
         (gpu_info.vulkan_info.api_version_major > 1 ||
          gpu_info.vulkan_info.api_version_minor >= 1) &&
         gpu_info.vulkan_info.subgroup_size >= 32 &&
         gpu_info.vulkan_info.supports_subgroup_arithmetic;
}

// An implementation of Mean for desktop GPUs and some phones with recent
// Vulkan drivers. It is more parallel than the trivial Mean operation, but
// still limited to using a single work group.
void GenerateSubgroupBasedMean(const NodeShader::GenerationContext& ctx,
                               GeneratedCode* generated_code) {
  int height = ctx.input_shapes[0][1];
  int width = ctx.input_shapes[0][2];
  int depth = ctx.input_shapes[0][3];
  std::vector<Variable> parameters = {
      {"input_data_0_h", height},
      {"input_data_0_w", width},
      {"output_data_0_h", 1},
      {"output_data_0_w", 1},
  };

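  // $...$ references in the shader source below are resolved by the GL shader
  // code generator: parameter references are replaced with their values or
  // backing uniforms, and object accessors such as $input_data_0[...]$ become
  // the corresponding buffer or texture reads and writes.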
  std::string source = R"(
  // Round columns and rows per invocation up, to ensure that we read the
  // entire input.
  const uint columns_per_invocation =
      ($input_data_0_w$ + (gl_WorkGroupSize.x - 1))/gl_WorkGroupSize.x;
  const uint rows_per_invocation =
      ($input_data_0_h$ + (gl_WorkGroupSize.y - 1))/gl_WorkGroupSize.y;
  const uint first_row = gl_GlobalInvocationID.y*rows_per_invocation;
  const uint first_col = gl_GlobalInvocationID.x*columns_per_invocation;
  const uint last_row_exclusive =
      min(first_row+rows_per_invocation, $input_data_0_h$);
  const uint last_column_exclusive =
      min(first_col+columns_per_invocation, $input_data_0_w$);
  vec4 value = vec4(0);
  for (uint h = first_row; h < last_row_exclusive; ++h) {
    for (uint w = first_col; w < last_column_exclusive; ++w) {
      value += $input_data_0[w, h, gid.z]$;
    }
  }
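  // Reduce the per-invocation partial sums within each subgroup; one elected
  // invocation per subgroup publishes the result to shared memory.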
  highp vec4 subgroup_sum = subgroupAdd(value);
  if(subgroupElect()) {
    subgroup_sums[gl_SubgroupID] = subgroup_sum;
  }

  memoryBarrierShared();
  barrier();
  // Do the final reduction in the first subgroup.
  if(gl_SubgroupID == 0) {
    highp vec4 subtotal = vec4(0);
    if (gl_SubgroupInvocationID < gl_NumSubgroups) {
      subtotal = subgroup_sums[gl_SubgroupInvocationID];
    }
    highp vec4 grand_total = subgroupAdd(subtotal);
    if(subgroupElect()) {
      highp vec4 result = grand_total / $input_data_0_w$ / $input_data_0_h$;
      $output_data_0[0, 0, gid.z] = result$;
    }
  }
  )";

  const uint32_t subgroup_size = ctx.gpu_info->vulkan_info.subgroup_size;
  const uint32_t max_wg_size_x = ctx.gpu_info->GetMaxWorkGroupSizeForX();
  const uint32_t max_wg_size_y = ctx.gpu_info->GetMaxWorkGroupSizeForY();
  // Due to the design of the shader, at most subgroup_size subgroups can be
  // launched. This may limit the maximum workgroup size.
  const uint32_t max_wg_size =
      std::min(static_cast<uint32_t>(ctx.gpu_info->GetMaxWorkGroupTotalSize()),
               subgroup_size * subgroup_size);
  const uint32_t max_number_of_subgroups = max_wg_size / subgroup_size;
  uint32_t wg_size_x = 0;
  uint32_t wg_size_y = 0;
  if (width * height <= max_wg_size && width <= max_wg_size_x &&
      height <= max_wg_size_y) {
    wg_size_x = width;
    wg_size_y = height;
  } else {
    // Approximately square workgroup. Also make sure to limit by driver limit
    // and input size.
    wg_size_x = std::min({static_cast<uint32_t>(std::sqrt(max_wg_size)),
                          max_wg_size_x, static_cast<uint32_t>(width)});
    wg_size_y = std::min({max_wg_size / wg_size_x, max_wg_size_y,
                          static_cast<uint32_t>(height)});
  }

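  // Shared memory holding one partial sum per subgroup; the first subgroup
  // reads it back for the final reduction.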
  std::vector<Variable> shared_variables = {
      {"subgroup_sums", std::vector<float4>(max_number_of_subgroups)},
  };

  *generated_code = {
      /*parameters=*/std::move(parameters),
      /*objects=*/{},
      /*shared_variables=*/{std::move(shared_variables)},
      // Make sure we get one dispatch of size wg_size_x*wg_size_y*1 per layer.
      /*workload=*/
      uint3(wg_size_x, wg_size_y, uint32_t(DivideRoundUp(depth, 4))),
      /*workgroup=*/uint3(wg_size_x, wg_size_y, 1u),
      /*source_code=*/std::move(source),
      /*input=*/IOStructure::ONLY_DEFINITIONS,
      /*output=*/IOStructure::ONLY_DEFINITIONS,
  };
}

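// Fallback implementation: each invocation sequentially accumulates the whole
// HxW plane for its 4-channel slice and divides by the number of elements.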
void GenerateTrivialMean(const NodeShader::GenerationContext& ctx,
                         GeneratedCode* generated_code) {
  std::vector<Variable> parameters = {
      {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
      {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}};

  std::string source = R"(
    // Shaders may be compiled with a precision hint mediump, which means that
    // the GLSL compiler may reduce float precision from 32 to 16 bits.
    // If "sum" and "size" were 16-bit floats, their value range would not be
    // large enough to produce accurate results, so their precision is forced
    // to 32 bits with the highp qualifier.

    highp vec4 sum = vec4(0.0);
    highp float size = float($input_data_0_w$ * $input_data_0_h$);
    for (int w = 0; w < $input_data_0_w$; w++) {
      for (int h = 0; h < $input_data_0_h$; h++) {
        sum += $input_data_0[w, h, gid.z]$;
      }
    }
    value_0 = sum / size;
  )";
  *generated_code = {
      /*parameters=*/std::move(parameters),
      /*objects=*/{},
      /*shared_variables=*/{},
      /*workload=*/uint3(),
      /*workgroup=*/uint3(1, 1, 4),
      /*source_code=*/std::move(source),
      /*input=*/IOStructure::ONLY_DEFINITIONS,
      /*output=*/IOStructure::AUTO,
  };
}

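// Validates the Mean attributes and shapes, then emits either the
// subgroup-based or the trivial implementation depending on GPU capabilities.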
class Mean : public NodeShader {
 public:
  absl::Status GenerateCode(const GenerationContext& ctx,
                            GeneratedCode* generated_code) const final {
    const auto& attr = absl::any_cast<const MeanAttributes&>(ctx.op_attr);
    if (attr.dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
      return absl::InvalidArgumentError(
          "Mean calculation is supported only for height and width.");
    }

    if (!(ctx.input_shapes.size() == 1 && ctx.output_shapes.size() == 1 &&
          ctx.output_shapes[0][1] == 1 && ctx.output_shapes[0][2] == 1 &&
          ctx.output_shapes[0][3] == ctx.input_shapes[0][3])) {
      return absl::InvalidArgumentError(
          "Mean calculation is supported for one input and one 1x1 output with "
          "the same channel count.");
    }

    if (UseSubgroupBasedImpl(*ctx.gpu_info)) {
      GenerateSubgroupBasedMean(ctx, generated_code);
    } else {
      GenerateTrivialMean(ctx, generated_code);
    }
    return absl::OkStatus();
  }
};

}  // namespace

std::unique_ptr<NodeShader> NewMeanNodeShader() {
  return absl::make_unique<Mean>();
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite