/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_

#include <stdint.h>

#include <string>
#include <utility>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
#include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"

namespace tflite {
namespace gpu {

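// Repacks fully connected weights from OHWI layout into 4x4 blocks of
// (input channel, output channel) values, zero-padding both channel counts
// up to multiples of four. Input-channel blocks form the outermost dimension
// of the destination; this layout is used by the buffer path of
// FCFCAdd::UploadWeights below.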
template <DataType T, typename S>
void RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
                                S* dst) {
  const int src_channels = weights.shape.i;
  const int padded_src_channels = AlignByN(src_channels, 4);
  const int dst_channels = weights.shape.o;
  const int padded_dst_channels = AlignByN(dst_channels, 4);

  for (int block_y = 0; 4 * block_y < padded_dst_channels; block_y++) {
    for (int y_in_block = 0; y_in_block < 4; y_in_block++) {
      for (int block_x = 0; 4 * block_x < padded_src_channels; block_x++) {
        for (int x_in_block = 0; x_in_block < 4; x_in_block++) {
          int y = 4 * block_y + y_in_block;
          int x = 4 * block_x + x_in_block;
          int dst_index = block_x * padded_dst_channels * 4 + block_y * 16 +
                          x_in_block * 4 + y_in_block;
          if (x < src_channels && y < dst_channels) {
            dst[dst_index] = weights.data[src_channels * y + x];
          } else {
            dst[dst_index] = 0.0f;
          }
        }
      }
    }
  }
}

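// Repacks fully connected weights from OHWI layout into 4x4 blocks, this time
// with output-channel blocks as the outermost dimension; channels are again
// zero-padded to multiples of four. This layout is used by the texture path
// of FCFCAdd::UploadWeights below.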
template <DataType T, typename S>
void RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
                                S* dst) {
  const int src_channels = weights.shape.i;
  const int src_depth = DivideRoundUp(src_channels, 4);
  const int dst_channels = weights.shape.o;
  const int dst_depth = DivideRoundUp(dst_channels, 4);

  int counter = 0;
  for (int d = 0; d < dst_depth; ++d) {
    for (int s = 0; s < src_depth; ++s) {
      for (int i = 0; i < 4; ++i) {
        const int src_ch = s * 4 + i;
        for (int j = 0; j < 4; ++j) {
          const int dst_ch = d * 4 + j;
          if (src_ch < src_channels && dst_ch < dst_channels) {
            dst[counter++] = weights.data[dst_ch * src_channels + src_ch];
          } else {
            dst[counter++] = 0.0f;
          }
        }
      }
    }
  }
}

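// Fused GPU operation that evaluates two fully connected layers and adds
// their outputs in a single kernel (FC + FC + Add). Instances are move-only
// and are created through the CreateFCFCAdd factory declared below.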
class FCFCAdd : public GPUOperation {
 public:
  FCFCAdd() = default;
  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const GpuInfo& gpu_info,
      const KernelInfo& kernel_info,
      std::vector<int3>* work_groups) const override {
    work_groups->push_back(work_group_size_);
  }
  int3 GetGridSize() const override;

  // Move only
  FCFCAdd(FCFCAdd&& kernel);
  FCFCAdd& operator=(FCFCAdd&& kernel);
  FCFCAdd(const FCFCAdd&) = delete;
  FCFCAdd& operator=(const FCFCAdd&) = delete;

 private:
  FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info);
  friend FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info,
                               const OperationDef& definition,
                               const FullyConnectedAttributes& attr0,
                               const FullyConnectedAttributes& attr1);

  template <DataType T>
  void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
                     const std::string& name, bool weights_are_buffer);

  std::string GetFCFCAddKernelCode(const OperationDef& op_def,
                                   const GpuInfo& gpu_info);
};

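// Packs the fully connected weights into the layout expected by the kernel
// and registers them in args_ under the given name: a BufferDescriptor with
// the IOO4I4 layout when weights_are_buffer is true, otherwise a
// Texture2DDescriptor with the OIO4I4 layout. Storage is FP32 or FP16
// depending on the operation's calculation precision.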
template <DataType T>
void FCFCAdd::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
                            const std::string& name, bool weights_are_buffer) {
  const int src_depth = DivideRoundUp(weights.shape.i, 4);
  const int dst_depth = DivideRoundUp(weights.shape.o, 4);

  const int elements_count = src_depth * dst_depth * 4;
  const bool f32_weights = definition_.precision == CalculationsPrecision::F32;

  const int float4_size = f32_weights ? 16 : 8;

  if (weights_are_buffer) {
    BufferDescriptor desc;
    desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
    desc.element_size = 16;
    desc.size = float4_size * elements_count;
    desc.data.resize(desc.size);

    if (f32_weights) {
      float* ptr = reinterpret_cast<float*>(desc.data.data());
      RearrangeFCWeightsToIOO4I4(weights, ptr);
    } else {
      half* ptr = reinterpret_cast<half*>(desc.data.data());
      RearrangeFCWeightsToIOO4I4(weights, ptr);
    }

    args_.AddObject(name, absl::make_unique<BufferDescriptor>(std::move(desc)));
  } else {
    Texture2DDescriptor desc;
    desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
    // desc.element_type = DataType::UINT8;
    // desc.normalized = true;
    // desc.normalized_type = f32_weights ? DataType::FLOAT32 :
    // DataType::FLOAT16;
    desc.size = int2(src_depth * 4, dst_depth);
    desc.data.resize(float4_size * elements_count);

    if (f32_weights) {
      float* ptr = reinterpret_cast<float*>(desc.data.data());
      RearrangeFCWeightsToOIO4I4(weights, ptr);
    } else {
      half* ptr = reinterpret_cast<half*>(desc.data.data());
      RearrangeFCWeightsToOIO4I4(weights, ptr);
    }

    args_.AddObject(name,
                    absl::make_unique<Texture2DDescriptor>(std::move(desc)));
  }
}

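// Creates the fused operation from the attributes of the two fully connected
// layers. A minimal usage sketch (all identifiers other than CreateFCFCAdd
// and FCFCAdd are illustrative placeholders):
//
//   FCFCAdd fc_fc_add = CreateFCFCAdd(gpu_info, op_def, fc0_attr, fc1_attr);
//   auto op = absl::make_unique<FCFCAdd>(std::move(fc_fc_add));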
FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
                      const FullyConnectedAttributes& attr0,
                      const FullyConnectedAttributes& attr1);

}  // namespace gpu
}  // namespace tflite

#endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_