1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_STRIDE_H2_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_STRIDE_H2_H_
18
19 #include <vector>
20
21 #include "tensorflow/lite/delegates/gpu/common/operations.h"
22 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
23 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
24 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
25
26 namespace tflite {
27 namespace gpu {
28
29 // Depth Wise Convolution for kernel 3x3
30 // require:
31 // channels_multiplier = 1;
32 // kernel_size = 3x3;
33 // dilation.y = 1;
34 // stride.y = 2;
35 class DepthWiseConv3x3StrideH2 : public GPUOperation {
36 public:
37 DepthWiseConv3x3StrideH2() = default;
38 void GetPossibleKernelWorkGroups(
39 TuningType tuning_type, const GpuInfo& gpu_info,
40 const KernelInfo& kernel_info,
41 std::vector<int3>* work_groups) const override;
42 int3 GetGridSize() const override;
43
44 // Move only
45 DepthWiseConv3x3StrideH2(DepthWiseConv3x3StrideH2&& kernel) = default;
46 DepthWiseConv3x3StrideH2& operator=(DepthWiseConv3x3StrideH2&& kernel) =
47 default;
48 DepthWiseConv3x3StrideH2(const DepthWiseConv3x3StrideH2&) = delete;
49 DepthWiseConv3x3StrideH2& operator=(const DepthWiseConv3x3StrideH2&) = delete;
50
51 private:
DepthWiseConv3x3StrideH2(const OperationDef & definition)52 explicit DepthWiseConv3x3StrideH2(const OperationDef& definition)
53 : GPUOperation(definition) {}
54 friend DepthWiseConv3x3StrideH2 CreateDepthWiseConv3x3StrideH2(
55 const OperationDef& definition,
56 const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info);
57
58 template <DataType T>
59 void UploadWeightsAndBiases(const tflite::gpu::Tensor<OHWI, T>& weights,
60 const tflite::gpu::Tensor<Linear, T>& biases,
61 bool weights_are_buffer);
62 template <DataType S, typename T>
63 void RearrangeWeightsAndBiasesData(
64 const tflite::gpu::Tensor<OHWI, S>& weights,
65 const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst);
66
67 bool local_mem_uploads_;
68 };
69
70 template <DataType T>
UploadWeightsAndBiases(const tflite::gpu::Tensor<OHWI,T> & weights,const tflite::gpu::Tensor<Linear,T> & biases,bool weights_are_buffer)71 void DepthWiseConv3x3StrideH2::UploadWeightsAndBiases(
72 const tflite::gpu::Tensor<OHWI, T>& weights,
73 const tflite::gpu::Tensor<Linear, T>& biases, bool weights_are_buffer) {
74 const int src_depth = DivideRoundUp(weights.shape.i, 4);
75 int texture_width = 10; // 3x3 kernel + 1 bias
76 int texture_height = src_depth;
77 const int elements_count = texture_width * texture_height;
78 const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
79 const int float4_size = fp32_weights ? 16 : 8;
80
81 std::vector<uint8_t> data(float4_size * elements_count);
82 if (fp32_weights) {
83 float4* ptr = reinterpret_cast<float4*>(data.data());
84 RearrangeWeightsAndBiasesData(weights, biases,
85 absl::MakeSpan(ptr, elements_count));
86 } else {
87 half4* ptr = reinterpret_cast<half4*>(data.data());
88 RearrangeWeightsAndBiasesData(weights, biases,
89 absl::MakeSpan(ptr, elements_count));
90 }
91
92 if (weights_are_buffer) {
93 BufferDescriptor desc;
94 desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
95 desc.element_size = 4;
96 desc.size = float4_size * elements_count;
97 desc.data = std::move(data);
98 args_.AddObject("weights",
99 absl::make_unique<BufferDescriptor>(std::move(desc)));
100 } else {
101 Texture2DDescriptor desc;
102 desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
103 desc.size = int2(texture_width, texture_height);
104 desc.data = std::move(data);
105 args_.AddObject("weights",
106 absl::make_unique<Texture2DDescriptor>(std::move(desc)));
107 }
108 }
109
110 template <DataType S, typename T>
RearrangeWeightsAndBiasesData(const tflite::gpu::Tensor<OHWI,S> & weights,const tflite::gpu::Tensor<Linear,S> & biases,absl::Span<T> dst)111 void DepthWiseConv3x3StrideH2::RearrangeWeightsAndBiasesData(
112 const tflite::gpu::Tensor<OHWI, S>& weights,
113 const tflite::gpu::Tensor<Linear, S>& biases, absl::Span<T> dst) {
114 const int src_depth = DivideRoundUp(weights.shape.i, 4);
115
116 int counter = 0;
117 for (int s = 0; s < src_depth; ++s) {
118 for (int y = 0; y < 3; ++y) {
119 for (int x = 0; x < 3; ++x) {
120 T filter_val;
121 for (int i = 0; i < 4; ++i) {
122 const int s_ch = s * 4 + i;
123 if (s_ch < weights.shape.i) {
124 const int f_index = weights.shape.LinearIndex({0, y, x, s_ch});
125 filter_val[i] = weights.data[f_index];
126 } else {
127 filter_val[i] = 0.0f;
128 }
129 }
130 dst[counter++] = filter_val;
131 }
132 }
133
134 T bias_val;
135 for (int i = 0; i < 4; ++i) {
136 const int dst_ch = s * 4 + i;
137 bias_val[i] = dst_ch >= biases.shape.v ? 0.0f : biases.data[dst_ch];
138 }
139 dst[counter++] = bias_val;
140 }
141 }
142
// Creates the operation for the given definition and attributes. Callers
// should first verify the attributes with IsDepthWiseConv3x3StrideH2Supported.
DepthWiseConv3x3StrideH2 CreateDepthWiseConv3x3StrideH2(
    const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr, const GpuInfo& gpu_info);

// Returns true when attr satisfies the restrictions listed on
// DepthWiseConv3x3StrideH2 (3x3 kernel, channel multiplier 1, dilation.y == 1,
// stride.y == 2).
bool IsDepthWiseConv3x3StrideH2Supported(
    const DepthwiseConvolution2DAttributes& attr);
149
150 } // namespace gpu
151 } // namespace tflite
152
153 #endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_DEPTHWISE_CONV_3X3_STRIDE_H2_H_
154