1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/lite/delegates/gpu/common/tasks/convolution_transposed_3x3_thin.h"
17
18 #include <string>
19 #include <utility>
20 #include <vector>
21
22 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
23
24 namespace tflite {
25 namespace gpu {
26
ConvolutionTransposed3x3Thin(const GpuInfo & gpu_info,const OperationDef & definition,const ConvolutionTransposedAttributes & attr)27 ConvolutionTransposed3x3Thin::ConvolutionTransposed3x3Thin(
28 const GpuInfo& gpu_info, const OperationDef& definition,
29 const ConvolutionTransposedAttributes& attr)
30 : GPUOperation(definition) {
31 if (gpu_info.IsApple()) {
32 weights_layout_ = WeightsLayout::kOICustomSpatialO4I4;
33 } else {
34 weights_layout_ = WeightsLayout::kOICustomSpatialI4O4;
35 }
36 code_ = GenerateConvolutionTransposedCode(
37 definition_, DivideRoundUp(attr.weights.shape.i, 4),
38 DivideRoundUp(attr.weights.shape.o, 4));
39 }
40
// Emits the kernel source for a thin 3x3 stride-2 transposed convolution.
// Each work-item covers one source pixel (X, Y); it reads the 2x2 source
// patch at (X..X+1, Y..Y+1) across all src_depth slices and produces the
// 2x2 destination patch at (2X..2X+1, 2Y..2Y+1) for every one of the
// dst_depth destination slices. Weights are read from the "weights" buffer,
// 36 FLT4 values (9 spatial taps * 4) per (src slice, dst slice) pair.
std::string ConvolutionTransposed3x3Thin::GenerateConvolutionTransposedCode(
    const OperationDef& op_def, int src_depth, int dst_depth) {
  auto src_desc = op_def.src_tensors[0];
  // Out-of-bounds reads must return zero so border pixels need no special
  // handling beyond the explicit masking below.
  src_desc.SetAddressMode(AddressMode::kZero);
  AddSrcTensor("src_tensor", src_desc);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);

  if (op_def.src_tensors.size() == 2) {
    // dynamic weights
    BufferDescriptor desc;
    desc.element_type = op_def.src_tensors[1].data_type;
    desc.element_size = 4;
    desc.memory_type = MemoryType::CONSTANT;
    AddSrcBuffer("weights", desc);
  }

  const auto src_tensor_type = op_def.src_tensors[0].storage_type;

  std::string c;

  // CONV(R, SRC, F, i) accumulates one 4-channel source vector against four
  // weight vectors F[i..i+3] into accumulator R. The form depends on the
  // weights layout chosen in the constructor.
  if (GetWeightsDescription().IsI4O4()) {
    switch (op_def.precision) {
      case CalculationsPrecision::F32:
      case CalculationsPrecision::F16:
        c += "#define CONV(R, SRC, F, i) \\\n";
        c += "  R += SRC.x * F[i + 0]; \\\n";
        c += "  R += SRC.y * F[i + 1]; \\\n";
        c += "  R += SRC.z * F[i + 2]; \\\n";
        c += "  R += SRC.w * F[i + 3];   \n";
        break;
      case CalculationsPrecision::F32_F16:
        // Mixed precision: do the multiply-adds in FP16 and convert the sum
        // once into the FP32 accumulator.
        c += "#define CONV(R, SRC, F, i) \\\n";
        c += "  R += TO_ACCUM_TYPE(SRC.x * F[i + 0] + SRC.y * F[i + 1]";
        c += "+ SRC.z * F[i + 2] + SRC.w * F[i + 3]);\n";
        break;
    }
  } else {
    // O4I4
    c += "#define CONV(R, SRC, F, i) \\\n";
    c += "  R.x += dot(SRC, F[i + 0]); \\\n";
    c += "  R.y += dot(SRC, F[i + 1]); \\\n";
    c += "  R.z += dot(SRC, F[i + 2]); \\\n";
    c += "  R.w += dot(SRC, F[i + 3]);   \n";
  }

  c += "MAIN_FUNCTION($0) {\n";
  // (X, Y) index a SOURCE pixel; with batching, batch is folded into the
  // fastest-varying global dimension.
  if (op_def.IsBatchSupported()) {
    c += "  int linear_id = GLOBAL_ID_0;\n";
    c += "  int X = linear_id / args.dst_tensor.Batch();\n";
    c += "  int B = linear_id % args.dst_tensor.Batch();\n";
    c += "  args.dst_tensor.SetBatchRef(B);\n";
    c += "  args.src_tensor.SetBatchRef(B);\n";
  } else {
    c += "  int X = GLOBAL_ID_0;\n";
  }
  c += "  int Y = GLOBAL_ID_1;\n";
  c += "  if (X >= args.src_tensor.Width() || Y >= args.src_tensor.Height()) "
       "return;\n";
  // One 2x2 accumulator per destination slice, fully unrolled over dst_depth.
  for (int d = 0; d < dst_depth; ++d) {
    const std::string layer = std::to_string(d);
    c += "  ACCUM_FLT4 r" + layer + "[2][2];\n";
    c += "  r" + layer + "[0][0] = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  r" + layer + "[0][1] = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  r" + layer + "[1][0] = INIT_ACCUM_FLT4(0.0f);\n";
    c += "  r" + layer + "[1][1] = INIT_ACCUM_FLT4(0.0f);\n";
  }
  // Running index of the 36-FLT4 weight group for each (src, dst) slice pair.
  int filters_index = 0;
  for (int s = 0; s < src_depth; ++s) {
    const std::string z = std::to_string(s);
    c += "  {\n";
    // Read the 2x2 source patch src0..src3 = (X,Y), (X+1,Y), (X,Y+1),
    // (X+1,Y+1); the right/bottom neighbors are masked to zero at borders.
    if (src_tensor_type == TensorStorageType::BUFFER) {
      // Plain buffers cannot clamp addresses, so guard each extra read.
      c += "    bool x_in = X + 1 < args.src_tensor.Width();\n";
      c += "    bool y_in = Y + 1 < args.src_tensor.Height();\n";
      c += "    FLT4 src0 = args.src_tensor.Read(X, Y, " + z + ");\n";
      c += "    FLT4 src1 = INIT_FLT4(0.0);\n";
      c += "    FLT4 src2 = INIT_FLT4(0.0);\n";
      c += "    FLT4 src3 = INIT_FLT4(0.0);\n";
      c += "    if (x_in) {\n";
      c += "      src1 = args.src_tensor.Read(X + 1, Y, " + z + ");\n";
      c += "    }\n";
      c += "    if (y_in) {\n";
      c += "      src2 = args.src_tensor.Read(X, Y + 1, " + z + ");\n";
      c += "    }\n";
      c += "    if (x_in && y_in) {\n";
      c += "      src3 = args.src_tensor.Read(X + 1, Y + 1, " + z + ");\n";
      c += "    }\n";
    } else if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
      // Image buffers: compute addresses, then select(-1, ...) redirects
      // out-of-bounds reads to the zero address instead of branching.
      c += "    args.src_tensor.GetAddress(c0, X, Y, " + z + ");\n";
      c += "    args.src_tensor.GetAddress(c1, X + 1, Y, " + z + ");\n";
      c += "    args.src_tensor.GetAddress(c2, X, Y + 1, " + z + ");\n";
      c += "    args.src_tensor.GetAddress(c3, X + 1, Y + 1, " + z + ");\n";
      c += "    bool x_in = X + 1 < args.src_tensor.Width();\n";
      c += "    bool y_in = Y + 1 < args.src_tensor.Height();\n";
      c += "    c1 = select(-1, c1, x_in);\n";
      c += "    c2 = select(-1, c2, y_in);\n";
      c += "    c3 = select(-1, c3, x_in && y_in);\n";
      c += "    FLT4 src0 = args.src_tensor.Read(c0);\n";
      c += "    FLT4 src1 = args.src_tensor.Read(c1);\n";
      c += "    FLT4 src2 = args.src_tensor.Read(c2);\n";
      c += "    FLT4 src3 = args.src_tensor.Read(c3);\n";
    } else {
      // Textures clamp/zero out-of-bounds reads in hardware (kZero address
      // mode set above), so no masking is needed.
      c += "    FLT4 src0 = args.src_tensor.Read(X, Y, " + z + ");\n";
      c += "    FLT4 src1 = args.src_tensor.Read(X + 1, Y, " + z + ");\n";
      c += "    FLT4 src2 = args.src_tensor.Read(X, Y + 1, " + z + ");\n";
      c += "    FLT4 src3 = args.src_tensor.Read(X + 1, Y + 1, " + z + ");\n";
    }
    // Accumulate this source slice into every destination slice. The 9
    // spatial taps are distributed over the 2x2 output patch; which tap
    // feeds which output is fixed by GetSpatialWeightsRemap().
    for (int d = 0; d < dst_depth; ++d) {
      const std::string layer = std::to_string(d);
      const std::string f_offset = std::to_string(filters_index);
      filters_index++;
      c += "    {\n";
      c += "    __constant FLT4* L0 = args.weights.GetPtr() + 36 * " + f_offset +
           ";\n";
      c += "    CONV(r" + layer + "[0][0], src0, L0, 0);\n";
      c += "    CONV(r" + layer + "[0][1], src0, L0, 4);\n";
      c += "    CONV(r" + layer + "[0][1], src1, L0, 8);\n";
      c += "    CONV(r" + layer + "[1][0], src0, L0, 12);\n";
      c += "    CONV(r" + layer + "[1][0], src2, L0, 16);\n";
      c += "    CONV(r" + layer + "[1][1], src0, L0, 20);\n";
      c += "    CONV(r" + layer + "[1][1], src1, L0, 24);\n";
      c += "    CONV(r" + layer + "[1][1], src2, L0, 28);\n";
      c += "    CONV(r" + layer + "[1][1], src3, L0, 32);\n";
      c += "    }\n";
    }
    c += "  }\n";
  }
  // Stride 2: source pixel (X, Y) maps to destination patch origin (2X, 2Y).
  c += "  X *= 2;\n";
  c += "  Y *= 2;\n";
  // Add per-slice bias and write out the 2x2 destination patch.
  for (int d = 0; d < dst_depth; ++d) {
    const std::string layer = std::to_string(d);
    c += "  {\n";
    c += "    FLT4 bias_val = args.biases.Read(" + layer + ");\n";
    for (int y = 0; y < 2; ++y) {
      for (int x = 0; x < 2; ++x) {
        const std::string x_coord = "X + " + std::to_string(x);
        const std::string y_coord = "Y + " + std::to_string(y);
        c += "    {\n";
        c += "      FLT4 result = TO_FLT4(r" + layer + "[" + std::to_string(y) +
             "][" + std::to_string(x) + "]) + bias_val;\n";
        c += "      args.dst_tensor.Write(result, " + x_coord + ", " + y_coord +
             ", " + layer + ");\n";
        c += "    }\n";
      }
    }
    c += "  }\n";
  }
  c += "}\n";

  return c;
}
191
GetGridSize() const192 int3 ConvolutionTransposed3x3Thin::GetGridSize() const {
193 const int grid_x = src_[0]->Width() * dst_[0]->Batch();
194 const int grid_y = src_[0]->Height();
195 const int grid_z = 1;
196 return int3(grid_x, grid_y, grid_z);
197 }
198
GetSpatialWeightsRemap() const199 std::vector<int> ConvolutionTransposed3x3Thin::GetSpatialWeightsRemap() const {
200 return std::vector<int>{4, 5, 3, 7, 1, 8, 6, 2, 0};
201 }
202
UploadWeights(const tflite::gpu::Tensor<OHWI,DataType::FLOAT32> & weights)203 void ConvolutionTransposed3x3Thin::UploadWeights(
204 const tflite::gpu::Tensor<OHWI, DataType::FLOAT32>& weights) {
205 const int flt_count =
206 GetTotalElementsCountForLayout(GetWeightsDescription(), weights.shape);
207
208 DataType weights_type = definition_.precision == CalculationsPrecision::F32
209 ? DataType::FLOAT32
210 : DataType::FLOAT16;
211
212 BufferDescriptor desc;
213 desc.element_type = weights_type;
214 desc.element_size = 4;
215 desc.memory_type = MemoryType::CONSTANT;
216 desc.size = flt_count * SizeOf(desc.element_type);
217 desc.data.resize(desc.size);
218
219 RearrangeWeights(weights, GetWeightsDescription(), weights_type,
220 absl::MakeSpan(desc.data));
221
222 args_.AddObject("weights",
223 absl::make_unique<BufferDescriptor>(std::move(desc)));
224 }
225
IsConvolutionTransposed3x3ThinSupported(const ConvolutionTransposedAttributes & attr)226 bool IsConvolutionTransposed3x3ThinSupported(
227 const ConvolutionTransposedAttributes& attr) {
228 return attr.weights.shape.o <= 8 && attr.weights.shape.w == 3 &&
229 attr.weights.shape.h == 3 && attr.stride.w == 2 &&
230 attr.stride.h == 2 && attr.padding.prepended.w == 1 &&
231 attr.padding.prepended.h == 1 && attr.padding.appended.w == 1 &&
232 attr.padding.appended.h == 1;
233 }
234
CreateConvolutionTransposed3x3Thin(const GpuInfo & gpu_info,const OperationDef & definition,const ConvolutionTransposedAttributes & attr)235 ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3Thin(
236 const GpuInfo& gpu_info, const OperationDef& definition,
237 const ConvolutionTransposedAttributes& attr) {
238 ConvolutionTransposed3x3Thin result(gpu_info, definition, attr);
239 result.UploadWeights(attr.weights);
240
241 TensorLinearDescriptor desc;
242 desc.storage_type = LinearStorageType::TEXTURE_2D;
243 desc.element_type = definition.GetDataType();
244 desc.UploadLinearData(attr.bias);
245 result.args_.AddObject(
246 "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
247 return result;
248 }
249
CreateConvolutionTransposed3x3ThinDynamicWeights(const GpuInfo & gpu_info,const OperationDef & definition,const ConvolutionTransposedAttributes & attr)250 ConvolutionTransposed3x3Thin CreateConvolutionTransposed3x3ThinDynamicWeights(
251 const GpuInfo& gpu_info, const OperationDef& definition,
252 const ConvolutionTransposedAttributes& attr) {
253 OperationDef new_def = definition;
254 new_def.src_tensors = {
255 definition.src_tensors[0]}; // leaving only src_tensor def, weights defs
256 // will be added later
257 const DataType weights_type = definition.GetDataType();
258 // add 1 src_tensor(buffer) for weights
259 new_def.src_tensors.push_back(
260 {weights_type, TensorStorageType::BUFFER, Layout::HWC});
261 ConvolutionTransposed3x3Thin result(gpu_info, new_def, attr);
262
263 TensorLinearDescriptor desc;
264 desc.storage_type = LinearStorageType::TEXTURE_2D;
265 desc.element_type = new_def.GetDataType();
266 desc.UploadLinearData(attr.bias);
267 result.args_.AddObject(
268 "biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
269 return result;
270 }
271
272 } // namespace gpu
273 } // namespace tflite
274