/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/gl/kernels/mean.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <set>
#include <string>
#include <vector>

#include "absl/memory/memory.h"
#include "absl/status/status.h"
#include "absl/types/any.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

bool UseSubgroupBasedImpl(const GpuInfo& gpu_info) {
  return gpu_info.IsApiVulkan() &&
         (gpu_info.vulkan_info.api_version_major > 1 ||
          gpu_info.vulkan_info.api_version_minor >= 1) &&
         gpu_info.vulkan_info.subgroup_size >= 32 &&
         gpu_info.vulkan_info.supports_subgroup_arithmetic;
}

// An implementation of Mean for desktop GPUs and some phones with recent
// Vulkan drivers. It is more parallel than the trivial Mean operation, but
// still limited to using a single work group.
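// Each invocation accumulates a partial sum over its tile of the input,
// subgroupAdd() reduces those partial sums within each subgroup, one sum per
// subgroup is staged in shared memory, and the first subgroup then combines
// the staged sums into the final result.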
void GenerateSubgroupBasedMean(const NodeShader::GenerationContext& ctx,
                               GeneratedCode* generated_code) {
  int height = ctx.input_shapes[0][1];
  int width = ctx.input_shapes[0][2];
  int depth = ctx.input_shapes[0][3];
  std::vector<Variable> parameters = {
      {"input_data_0_h", height},
      {"input_data_0_w", width},
      {"output_data_0_h", 1},
      {"output_data_0_w", 1},
  };

  std::string source = R"(
  // Round columns and rows per invocation up, to ensure that we read the
  // entire input.
  const uint columns_per_invocation =
      ($input_data_0_w$ + (gl_WorkGroupSize.x - 1))/gl_WorkGroupSize.x;
  const uint rows_per_invocation =
      ($input_data_0_h$ + (gl_WorkGroupSize.y - 1))/gl_WorkGroupSize.y;
  const uint first_row = gl_GlobalInvocationID.y*rows_per_invocation;
  const uint first_col = gl_GlobalInvocationID.x*columns_per_invocation;
  const uint last_row_exclusive =
      min(first_row+rows_per_invocation, $input_data_0_h$);
  const uint last_column_exclusive =
      min(first_col+columns_per_invocation, $input_data_0_w$);
  vec4 value = vec4(0);
  for (uint h = first_row; h < last_row_exclusive; ++h) {
    for (uint w = first_col; w < last_column_exclusive; ++w) {
      value += $input_data_0[w, h, gid.z]$;
    }
  }
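  // Reduce the per-invocation partial sums within this subgroup; one elected
  // invocation per subgroup publishes the subgroup's sum to shared memory.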
  highp vec4 subgroup_sum = subgroupAdd(value);
  if(subgroupElect()) {
    subgroup_sums[gl_SubgroupID] = subgroup_sum;
  }

  memoryBarrierShared();
  barrier();
  // Do the final reduction in the first subgroup.
  if(gl_SubgroupID == 0) {
    highp vec4 subtotal = vec4(0);
    if (gl_SubgroupInvocationID < gl_NumSubgroups) {
      subtotal = subgroup_sums[gl_SubgroupInvocationID];
    }
    highp vec4 grand_total = subgroupAdd(subtotal);
    if(subgroupElect()) {
      highp vec4 result = grand_total / $input_data_0_w$ / $input_data_0_h$;
      $output_data_0[0, 0, gid.z] = result$;
    }
  }
)";

  const uint32_t subgroup_size = ctx.gpu_info->vulkan_info.subgroup_size;
  const uint32_t max_wg_size_x = ctx.gpu_info->GetMaxWorkGroupSizeForX();
  const uint32_t max_wg_size_y = ctx.gpu_info->GetMaxWorkGroupSizeForY();
  // Due to the design of the shader, at most subgroup_size subgroups can be
  // launched. This may limit the maximal workgroup size.
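  // (The final reduction runs inside a single subgroup, so only subgroup_size
  // staged sums can be combined. With a subgroup size of 32 the workgroup is
  // therefore capped at 32 * 32 = 1024 invocations.)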
  const uint32_t max_wg_size =
      std::min(static_cast<uint32_t>(ctx.gpu_info->GetMaxWorkGroupTotalSize()),
               subgroup_size * subgroup_size);
  const uint32_t max_number_of_subgroups = max_wg_size / subgroup_size;
  uint32_t wg_size_x = 0;
  uint32_t wg_size_y = 0;
  if (width * height <= max_wg_size && width <= max_wg_size_x &&
      height <= max_wg_size_y) {
    wg_size_x = width;
    wg_size_y = height;
  } else {
    // Approximately square workgroup. Also make sure to limit by driver limit
    // and input size.
    wg_size_x = std::min({static_cast<uint32_t>(std::sqrt(max_wg_size)),
                          max_wg_size_x, static_cast<uint32_t>(width)});
    wg_size_y = std::min({max_wg_size / wg_size_x, max_wg_size_y,
                          static_cast<uint32_t>(height)});
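    // For example, a 100x80 input with max_wg_size = 1024 (and assuming the
    // driver limits are not the bottleneck) yields
    // wg_size_x = min(32, 100) = 32 and wg_size_y = min(1024 / 32, 80) = 32.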
  }

  std::vector<Variable> shared_variables = {
      {"subgroup_sums", std::vector<float4>(max_number_of_subgroups)},
  };

  *generated_code = {
      /*parameters=*/std::move(parameters),
      /*objects=*/{},
      /*shared_variables=*/{std::move(shared_variables)},
      // Make sure we get one dispatch of size wg_size_x*wg_size_y*1 per layer.
      /*workload=*/
      uint3(wg_size_x, wg_size_y, uint32_t(DivideRoundUp(depth, 4))),
      /*workgroup=*/uint3(wg_size_x, wg_size_y, 1u),
      /*source_code=*/std::move(source),
      /*input=*/IOStructure::ONLY_DEFINITIONS,
      /*output=*/IOStructure::ONLY_DEFINITIONS,
  };
}

void GenerateTrivialMean(const NodeShader::GenerationContext& ctx,
                         GeneratedCode* generated_code) {
  std::vector<Variable> parameters = {
      {"input_data_0_h", static_cast<int>(ctx.input_shapes[0][1])},
      {"input_data_0_w", static_cast<int>(ctx.input_shapes[0][2])}};

  std::string source = R"(
    // Shaders may be compiled with a precision hint of mediump, which lets the
    // GLSL compiler reduce float precision from 32 to 16 bits. If "sum" and
    // "size" were 16-bit floats, their value range would be too small to give
    // accurate results, so highp forces 32-bit precision for both.

    highp vec4 sum = vec4(0.0);
    highp float size = float($input_data_0_w$ * $input_data_0_h$);
    for (int w = 0; w < $input_data_0_w$; w++) {
      for (int h = 0; h < $input_data_0_h$; h++) {
        sum += $input_data_0[w, h, gid.z]$;
      }
    }
    value_0 = sum / size;
  )";
  *generated_code = {
      /*parameters=*/std::move(parameters),
      /*objects=*/{},
      /*shared_variables=*/{},
      /*workload=*/uint3(),
      /*workgroup=*/uint3(1, 1, 4),
      /*source_code=*/std::move(source),
      /*input=*/IOStructure::ONLY_DEFINITIONS,
      /*output=*/IOStructure::AUTO,
  };
}

class Mean : public NodeShader {
 public:
  absl::Status GenerateCode(const GenerationContext& ctx,
                            GeneratedCode* generated_code) const final {
    const auto& attr = absl::any_cast<const MeanAttributes&>(ctx.op_attr);
    if (attr.dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
      return absl::InvalidArgumentError(
          "Mean calculation is supported only for height and width.");
    }

    if (!(ctx.input_shapes.size() == 1 && ctx.output_shapes.size() == 1 &&
          ctx.output_shapes[0][1] == 1 && ctx.output_shapes[0][2] == 1 &&
          ctx.output_shapes[0][3] == ctx.input_shapes[0][3])) {
      return absl::InvalidArgumentError(
          "Mean calculation is supported for one input and one 1x1 output with "
          "the same channel count.");
    }

    if (UseSubgroupBasedImpl(*ctx.gpu_info)) {
      GenerateSubgroupBasedMean(ctx, generated_code);
    } else {
      GenerateTrivialMean(ctx, generated_code);
    }
    return absl::OkStatus();
  }
};

}  // namespace

std::unique_ptr<NodeShader> NewMeanNodeShader() {
  return absl::make_unique<Mean>();
}

}  // namespace gl
}  // namespace gpu
}  // namespace tflite