• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_
18 
19 #include <stdint.h>
20 
21 #include <string>
22 #include <utility>
23 #include <vector>
24 
25 #include "absl/memory/memory.h"
26 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
27 #include "tensorflow/lite/delegates/gpu/common/operations.h"
28 #include "tensorflow/lite/delegates/gpu/common/shape.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/buffer_desc.h"
30 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
31 #include "tensorflow/lite/delegates/gpu/common/task/texture2d_desc.h"
32 #include "tensorflow/lite/delegates/gpu/common/tensor.h"
33 #include "tensorflow/lite/delegates/gpu/common/types.h"
34 #include "tensorflow/lite/delegates/gpu/common/util.h"
35 
36 namespace tflite {
37 namespace gpu {
38 
39 template <DataType T, typename S>
RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor<OHWI,T> & weights,S * dst)40 void RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
41                                 S* dst) {
42   const int src_channels = weights.shape.i;
43   const int padded_src_channels = AlignByN(src_channels, 4);
44   const int dst_channels = weights.shape.o;
45   const int padded_dst_channels = AlignByN(dst_channels, 4);
46 
47   for (int block_y = 0; 4 * block_y < padded_dst_channels; block_y++) {
48     for (int y_in_block = 0; y_in_block < 4; y_in_block++) {
49       for (int block_x = 0; 4 * block_x < padded_src_channels; block_x++) {
50         for (int x_in_block = 0; x_in_block < 4; x_in_block++) {
51           int y = 4 * block_y + y_in_block;
52           int x = 4 * block_x + x_in_block;
53           int dst_index = block_x * padded_dst_channels * 4 + block_y * 16 +
54                           x_in_block * 4 + y_in_block;
55           if (x < src_channels && y < dst_channels) {
56             dst[dst_index] = weights.data[src_channels * y + x];
57           } else {
58             dst[dst_index] = 0.0f;
59           }
60         }
61       }
62     }
63   }
64 }
65 
66 template <DataType T, typename S>
RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor<OHWI,T> & weights,S * dst)67 void RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor<OHWI, T>& weights,
68                                 S* dst) {
69   const int src_channels = weights.shape.i;
70   const int src_depth = DivideRoundUp(src_channels, 4);
71   const int dst_channels = weights.shape.o;
72   const int dst_depth = DivideRoundUp(dst_channels, 4);
73 
74   int counter = 0;
75   for (int d = 0; d < dst_depth; ++d) {
76     for (int s = 0; s < src_depth; ++s) {
77       for (int i = 0; i < 4; ++i) {
78         const int src_ch = s * 4 + i;
79         for (int j = 0; j < 4; ++j) {
80           const int dst_ch = d * 4 + j;
81           if (src_ch < src_channels && dst_ch < dst_channels) {
82             dst[counter++] = weights.data[dst_ch * src_channels + src_ch];
83           } else {
84             dst[counter++] = 0.0f;
85           }
86         }
87       }
88     }
89   }
90 }
91 
92 class FCFCAdd : public GPUOperation {
93  public:
94   FCFCAdd() = default;
GetPossibleKernelWorkGroups(TuningType tuning_type,const GpuInfo & gpu_info,const KernelInfo & kernel_info,std::vector<int3> * work_groups)95   void GetPossibleKernelWorkGroups(
96       TuningType tuning_type, const GpuInfo& gpu_info,
97       const KernelInfo& kernel_info,
98       std::vector<int3>* work_groups) const override {
99     work_groups->push_back(work_group_size_);
100   }
101   int3 GetGridSize() const override;
102 
103   // Move only
104   FCFCAdd(FCFCAdd&& kernel);
105   FCFCAdd& operator=(FCFCAdd&& kernel);
106   FCFCAdd(const FCFCAdd&) = delete;
107   FCFCAdd& operator=(const FCFCAdd&) = delete;
108 
109  private:
110   FCFCAdd(const OperationDef& definition, const GpuInfo& gpu_info);
111   friend FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info,
112                                const OperationDef& definition,
113                                const FullyConnectedAttributes& attr0,
114                                const FullyConnectedAttributes& attr1);
115   friend FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info,
116                                const OperationDef& definition,
117                                const FullyConnectedInt8Attributes& attr0,
118                                const FullyConnectedInt8Attributes& attr1);
119 
120   void UploadQuantizedWeights(
121       const tflite::gpu::Tensor<OHWI, DataType::INT8>& weights, float scale,
122       float zero_point, int index);
123 
124   template <DataType T>
125   void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
126                      const std::string& name, bool weights_are_buffer);
127 
128   std::string GetFCFCAddKernelCode(const OperationDef& op_def,
129                                    const GpuInfo& gpu_info,
130                                    bool weights_are_buffer, bool quantized_0,
131                                    bool quantized_1);
132 };
133 
134 template <DataType T>
UploadWeights(const tflite::gpu::Tensor<OHWI,T> & weights,const std::string & name,bool weights_are_buffer)135 void FCFCAdd::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights,
136                             const std::string& name, bool weights_are_buffer) {
137   const int src_depth = DivideRoundUp(weights.shape.i, 4);
138   const int dst_depth = DivideRoundUp(weights.shape.o, 4);
139 
140   const int elements_count = src_depth * dst_depth * 4;
141   const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
142 
143   const int float4_size = f32_weights ? 16 : 8;
144 
145   if (weights_are_buffer) {
146     BufferDescriptor desc;
147     desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
148     desc.element_size = 16;
149     desc.size = float4_size * elements_count;
150     desc.data.resize(desc.size);
151 
152     if (f32_weights) {
153       float* ptr = reinterpret_cast<float*>(desc.data.data());
154       RearrangeFCWeightsToIOO4I4(weights, ptr);
155     } else {
156       half* ptr = reinterpret_cast<half*>(desc.data.data());
157       RearrangeFCWeightsToIOO4I4(weights, ptr);
158     }
159 
160     args_.AddObject(name, absl::make_unique<BufferDescriptor>(std::move(desc)));
161   } else {
162     Texture2DDescriptor desc;
163     desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
164     // desc.element_type = DataType::UINT8;
165     // desc.normalized = true;
166     // desc.normalized_type = f32_weights ? DataType::FLOAT32 :
167     // DataType::FLOAT16;
168     desc.size = int2(src_depth * 4, dst_depth);
169     desc.data.resize(float4_size * elements_count);
170 
171     if (f32_weights) {
172       float* ptr = reinterpret_cast<float*>(desc.data.data());
173       RearrangeFCWeightsToOIO4I4(weights, ptr);
174     } else {
175       half* ptr = reinterpret_cast<half*>(desc.data.data());
176       RearrangeFCWeightsToOIO4I4(weights, ptr);
177     }
178 
179     args_.AddObject(name,
180                     absl::make_unique<Texture2DDescriptor>(std::move(desc)));
181   }
182 }
183 
// Factory for FCFCAdd that takes two sets of float fully connected
// attributes (attr0, attr1). It is declared a friend of FCFCAdd, whose
// non-default constructor is private. The definition is not in this header.
184 FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
185                       const FullyConnectedAttributes& attr0,
186                       const FullyConnectedAttributes& attr1);
187 
// Factory for FCFCAdd that takes two sets of int8 fully connected
// attributes (attr0, attr1). It is declared a friend of FCFCAdd, whose
// non-default constructor is private. The definition is not in this header.
188 FCFCAdd CreateFCFCAdd(const GpuInfo& gpu_info, const OperationDef& definition,
189                       const FullyConnectedInt8Attributes& attr0,
190                       const FullyConnectedInt8Attributes& attr1);
191 
192 }  // namespace gpu
193 }  // namespace tflite
194 
195 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASKS_SPECIAL_FC_FC_ADD_H_
196