/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_FULLY_CONNECTED_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_FULLY_CONNECTED_H_

#include <stdint.h>

#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {

template <DataType T, typename S>
void RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
                                S* dst) {
  const int src_channels = weights.shape.i;
  const int padded_src_channels = AlignByN(src_channels, 4);
  const int dst_channels = weights.shape.o;
  const int padded_dst_channels = AlignByN(dst_channels, 4);

  // Change the traversal order of the weight matrix in the following way:
  // The matrix is segmented into 4x4 blocks. If either dimension of the matrix
  // is not divisible by 4, it is padded with zeros. Each block is stored
  // contiguously. The 16 elements within a block are ordered as 4 elements of
  // the first column, 4 elements of the second, and so on. Blocks are then
  // traversed columns first, rows last. As an example, an 8x8 matrix would be
  // traversed as below.
  //
  //  |  0  4  8 12 32 36 40 44 |
  //  |  1  5  9 13 33 37 41 45 |
  //  |  2  6 10 14 34 38 42 46 |
  //  |  3  7 11 15 35 39 43 47 |
  //  | 16 20 24 28 48 52 56 60 |
  //  | 17 21 25 29 49 53 57 61 |
  //  | 18 22 26 30 50 54 58 62 |
  //  | 19 23 27 31 51 55 59 63 |
  //
  // The benefit of this ordering is that reading 16 contiguous elements yields
  // one 4x4 block of the matrix: each group of 4 consecutive elements holds
  // one column of the block, i.e. 4 consecutive output channels for a single
  // input channel. Subsequent blocks contain elements of the same 4 columns.
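  //
  // For example, the element at row y = 5, column x = 2 of the 8x8 matrix
  // above lands at destination index
  //   (x/4) * padded_dst_channels * 4 + (y/4) * 16 + (x%4) * 4 + (y%4)
  //     = 0 + 16 + 8 + 1 = 25,
  // which matches the figure (padded_dst_channels = 8 here).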

  for (int block_y = 0; 4 * block_y < padded_dst_channels; block_y++) {
    for (int y_in_block = 0; y_in_block < 4; y_in_block++) {
      for (int block_x = 0; 4 * block_x < padded_src_channels; block_x++) {
        for (int x_in_block = 0; x_in_block < 4; x_in_block++) {
          int y = 4 * block_y + y_in_block;
          int x = 4 * block_x + x_in_block;
          // Consider destination as an array with extents
          // [padded_src_channels/4][padded_dst_channels/4][4][4]
          int dst_index = block_x * padded_dst_channels * 4 + block_y * 16 +
                          x_in_block * 4 + y_in_block;
          if (x < src_channels && y < dst_channels) {
            dst[dst_index] = weights.data[src_channels * y + x];
          } else {
            dst[dst_index] = 0.0f;
          }
        }
      }
    }
  }
}

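// A minimal sketch of calling RearrangeFCWeightsToIOO4I4; the tensor `weights`
// is assumed to be provided by the caller:
//
//   std::vector<float> dst(AlignByN(weights.shape.i, 4) *
//                          AlignByN(weights.shape.o, 4));
//   RearrangeFCWeightsToIOO4I4(weights, dst.data());

// Rearranges the weights so that the destination can be viewed as an array
// with extents [dst_depth][src_depth][4][4], where each innermost group of 4
// holds 4 consecutive output channels for a single input channel. Positions
// beyond the real channel counts are zero-padded, so dst must hold
// DivideRoundUp(weights.shape.o, 4) * DivideRoundUp(weights.shape.i, 4) * 16
// elements.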
template <DataType T, typename S>
void RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
                                S* dst) {
  const int src_channels = weights.shape.i;
  const int src_depth = DivideRoundUp(src_channels, 4);
  const int dst_channels = weights.shape.o;
  const int dst_depth = DivideRoundUp(dst_channels, 4);

  int counter = 0;
  for (int d = 0; d < dst_depth; ++d) {
    for (int s = 0; s < src_depth; ++s) {
      for (int i = 0; i < 4; ++i) {
        const int src_ch = s * 4 + i;
        for (int j = 0; j < 4; ++j) {
          const int dst_ch = d * 4 + j;
          if (src_ch < src_channels && dst_ch < dst_channels) {
            dst[counter++] = weights.data[dst_ch * src_channels + src_ch];
          } else {
            dst[counter++] = 0.0f;
          }
        }
      }
    }
  }
}

class FullyConnected : public GPUOperation {
 public:
  FullyConnected() = default;
  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override {
    work_groups->push_back(work_group_size_);
  }
  int3 GetGridSize() const override;

  // Move only
  FullyConnected(FullyConnected&& kernel);
  FullyConnected& operator=(FullyConnected&& kernel);
  FullyConnected(const FullyConnected&) = delete;
  FullyConnected& operator=(const FullyConnected&) = delete;

 private:
  FullyConnected(const OperationDef& definition, const GpuInfo& gpu_info);
  friend FullyConnected CreateFullyConnected(
      const GpuInfo& gpu_info, const OperationDef& definition,
      const FullyConnectedAttributes& attr);

  template <DataType T>
  void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
                     bool weights_are_buffer);

  std::string GetFullyConnectedKernelCode(const OperationDef& op_def,
                                          const GpuInfo& gpu_info);
};

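// Packs the fully connected weights and registers them as the "weights" object
// of the operation: as a buffer in IOO4I4 order when weights_are_buffer is
// true, otherwise as a 2D texture in OIO4I4 order. Values are stored as 32-bit
// or 16-bit floats depending on the calculations precision.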
template <DataType T>
void FullyConnected::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
                                   bool weights_are_buffer) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  const int elements_count = src_depth * dst_depth * 4;
  const bool f32_weights = definition_.precision == CalculationsPrecision::F32;

  const int float4_size = f32_weights ? 16 : 8;

  if (weights_are_buffer) {
    BufferDescriptor desc;
    desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
    desc.element_size = 16;
    desc.size = float4_size * elements_count;
    desc.data.resize(desc.size);

    if (f32_weights) {
      float* ptr = reinterpret_cast<float*>(desc.data.data());
      RearrangeFCWeightsToIOO4I4(weights, ptr);
    } else {
      half* ptr = reinterpret_cast<half*>(desc.data.data());
      RearrangeFCWeightsToIOO4I4(weights, ptr);
    }

    args_.AddObject("weights",
                    absl::make_unique<BufferDescriptor>(std::move(desc)));
  } else {
    Texture2DDescriptor desc;
    desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
    desc.size = int2(src_depth * 4, dst_depth);
    desc.data.resize(float4_size * elements_count);

    if (f32_weights) {
      float* ptr = reinterpret_cast<float*>(desc.data.data());
      RearrangeFCWeightsToOIO4I4(weights, ptr);
    } else {
      half* ptr = reinterpret_cast<half*>(desc.data.data());
      RearrangeFCWeightsToOIO4I4(weights, ptr);
    }

    args_.AddObject("weights",
                    absl::make_unique<Texture2DDescriptor>(std::move(desc)));
  }
}

FullyConnected CreateFullyConnected(const GpuInfo& gpu_info,
                                    const OperationDef& definition,
                                    const FullyConnectedAttributes& attr);
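
// A minimal usage sketch (illustrative only; `gpu_info`, `op_def`, and `attr`
// are assumed to be prepared by the caller):
//
//   FullyConnected fc = CreateFullyConnected(gpu_info, op_def, attr);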

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_FULLY_CONNECTED_H_