/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CONTRACTION_KERNEL_H_
#define TENSORFLOW_CORE_KERNELS_EIGEN_CONTRACTION_KERNEL_H_

// Depending on the build configuration, this header provides a custom kernel
// for Eigen tensor contractions (the small matrix multiplication kernel used
// to multiply together blocks of the original tensors).
//
// 1) --define tensorflow_mkldnn_contraction_kernel=1
//    Use the Mkldnn single threaded sgemm. The mkldnn kernels are generated
//    at runtime and use avx/avx2/fma/avx512 based on the cpu status registers
//    (https://en.wikipedia.org/wiki/CPUID).
//
// If you use `tensor.contract(other_tensor)` in your code, you must include
// this header to get the benefit of the custom contraction kernel:
//
//   #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
//   #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
//   #endif
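//
// For illustration only, a minimal sketch of a contraction expression that
// goes through this kernel (the tensor names and sizes below are hypothetical,
// not part of this header):
//
//   Eigen::Tensor<float, 2> lhs(256, 128);
//   Eigen::Tensor<float, 2> rhs(128, 64);
//   lhs.setRandom();
//   rhs.setRandom();
//   Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
//   Eigen::Tensor<float, 2> out = lhs.contract(rhs, dims);  // 256 x 64 result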

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
#include "mkldnn.h"
#endif

namespace Eigen {
namespace internal {

#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
// Returns `true` iff we can use custom contraction kernels. This is a runtime
// check that uses environment variables.
bool UseCustomContractionKernels();

// Pack a 2D block of a Tensor expression into a contiguous block of memory
// with col-major storage order. We do not have access to the underlying
// Tensor expression; we only have a DataMapper (TensorContractionInputMapper
// for tensor contractions, or blas_data_mapper for plain tensors) that
// provides a two-dimensional view into the Tensor expression.
//
// The default Eigen gemm_pack_rhs and gemm_pack_lhs pack blocks of tensor
// expressions into the packed format described in the "Anatomy of
// High-Performance Matrix Multiplication" paper (1).
// Eigen::internal::gebp_kernel relies on this packing format for efficient
// micro-panel multiplication.
//
// This simple packing can be used with any '?gemm' function from BLAS
// libraries that works with col-major matrices.
//
// (1) http://www.cs.utexas.edu/~flame/pubs/GotoTOMS_revision.pdf
//
// IMPORTANT: `gemm_pack_colmajor_block` always packs the block in column
// major order; DataMapperStorageOrder specifies the storage order of the
// underlying Tensor expression.
template <typename Scalar, typename IndexType, typename DataMapper,
          int DataMapperStorageOrder>
struct gemm_pack_colmajor_block;

// gemm_pack_colmajor_block for ColMajor storage order.
template <typename Scalar, typename IndexType, typename DataMapper>
struct gemm_pack_colmajor_block<Scalar, IndexType, DataMapper,
                                /*DataMapperStorageOrder*/ ColMajor> {
  typedef typename internal::packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;

  enum { PacketSize = internal::packet_traits<Scalar>::size };

  EIGEN_DONT_INLINE
  void operator()(Scalar* block, const DataMapper& data_mapper, IndexType rows,
                  IndexType cols) {
    const IndexType unrolled_rows = rows - 4 * PacketSize;
    const IndexType vectorized_rows = rows - PacketSize;

    for (IndexType col = 0; col < cols; ++col) {
      LinearMapper lm = data_mapper.getLinearMapper(0, col);

      IndexType row = 0;
      // Give the compiler a strong hint to unroll the loop (4 packets per
      // iteration).
      for (; row <= unrolled_rows; row += 4 * PacketSize) {
        for (IndexType j = 0; j < 4; ++j) {
          const Packet p = lm.template loadPacket<Packet>(row + j * PacketSize);
          internal::pstoreu(block + j * PacketSize, p);
        }
        block += 4 * PacketSize;
      }
      // Process remaining rows with packets.
      for (; row <= vectorized_rows; row += PacketSize) {
        const Packet p = lm.template loadPacket<Packet>(row);
        internal::pstoreu(block, p);
        block += PacketSize;
      }
      // Finalize with coefficients.
      for (; row < rows; ++row) {
        *block = lm(row);
        ++block;
      }
    }
  }
};
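
// For illustration only, a minimal sketch of invoking the packer directly on
// a plain col-major matrix through a blas_data_mapper (the names `src`,
// `block`, `rows` and `cols` below are hypothetical, and `block` must hold
// rows * cols scalars):
//
//   using Mapper = blas_data_mapper<float, Index, ColMajor>;
//   Mapper mapper(src.data(), /*stride=*/src.rows());
//   gemm_pack_colmajor_block<float, Index, Mapper, ColMajor>()(block, mapper,
//                                                              rows, cols);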

#endif  // TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL

// Enabled by build option: "--define tensorflow_mkldnn_contraction_kernel=1"
#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)

template <typename Scalar, typename IndexType, typename OutputMapper,
          bool ConjugateLhs = false, bool ConjugateRhs = false>
struct mkldnn_gemm_kernel;

// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm.
template <typename IndexType, typename OutputMapper, bool ConjugateLhs,
          bool ConjugateRhs>
struct mkldnn_gemm_kernel</*Scalar*/ float, IndexType, OutputMapper,
                          ConjugateLhs, ConjugateRhs> {
  static_assert(!ConjugateLhs, "MKL-DNN kernel doesn't support ConjugateLhs");
  static_assert(!ConjugateRhs, "MKL-DNN kernel doesn't support ConjugateRhs");

  EIGEN_DONT_INLINE
  void operator()(const OutputMapper& output, const float* blockA,
                  const float* blockB, const IndexType rows,
                  const IndexType depth, const IndexType cols, float alpha) {
    static const int max_index = (std::numeric_limits<int>::max)();

    eigen_assert(max_index >= rows);
    eigen_assert(max_index >= cols);
    eigen_assert(max_index >= depth);
    eigen_assert(max_index >= output.stride());

    const int m = static_cast<int>(rows);
    const int n = static_cast<int>(cols);
    const int k = static_cast<int>(depth);

    const char transposeA = 'N';
    const char transposeB = 'N';

    const int ldA = m;
    const int ldB = k;
    const int ldC = static_cast<int>(output.stride());

    const float beta = 1.0;

    // Col-major gemm on the packed blocks: C = alpha * A * B + beta * C,
    // where A is the (m x k) lhs block, B is the (k x n) rhs block, and C is
    // the (m x n) output block behind `output`. With beta == 1.0 the result
    // accumulates into the output block.
    mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
                                      &alpha, blockA, &ldA, blockB, &ldB, &beta,
                                      const_cast<float*>(output.data()), &ldC);
    eigen_assert(st == 0);

    // eigen_assert is a no-op in optimized mode, so we add these to avoid the
    // compiler's unused-variable errors.
    EIGEN_UNUSED_VARIABLE(max_index);
    EIGEN_UNUSED_VARIABLE(st);
  }
};
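
// For illustration only, a minimal sketch of applying the kernel to packed
// col-major buffers (the names `out`, `lhs_block`, `rhs_block`, `m`, `k` and
// `n` are hypothetical; because beta == 1.0 inside the kernel, `out` must be
// zero-initialized before the call):
//
//   using OutputMapper = blas_data_mapper<float, Index, ColMajor>;
//   OutputMapper out_mapper(out, /*stride=*/m);
//   mkldnn_gemm_kernel<float, Index, OutputMapper> gemm;
//   gemm(out_mapper, lhs_block, rhs_block, /*rows=*/m, /*depth=*/k,
//        /*cols=*/n, /*alpha=*/1.0f);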

// For mkldnn_sgemm, having the right dimensions (especially for small
// matrices) is more important than fitting the entire working set into the
// L1/L2 caches.
// TODO(ezhulenev): Do better heuristics.
template <typename StorageIndex, int sharding_type>
class TensorContractionBlocking<float, float, float, StorageIndex,
                                sharding_type> {
  // For now, mkldnn has only mkldnn_sgemm (gemm for floats).
  using Scalar = float;

  // Adjust the block sizes to work well with the mkldnn kernels.

  // Multiply the default choice of block size along the M and N dimensions.
  // TODO(ezhulenev): Explore if this can work in general (kScaleM=2.0 worked
  // well in some of the models).
  static constexpr float kScaleM = 1.5;
  static constexpr float kScaleN = 1.0;

  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48.
  static const StorageIndex kUnrollM = 48;

  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8.
  static const StorageIndex kUnrollN = 24;

 public:
  TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n,
                            StorageIndex num_threads = 1)
      : kc_(k), mc_(m), nc_(n) {
    // 1. Compute block sizes using default Eigen heuristics.
    if (sharding_type == ShardByCol) {
      computeProductBlockingSizes<Scalar, Scalar, 1>(kc_, mc_, nc_,
                                                     num_threads);
    } else {
      computeProductBlockingSizes<Scalar, Scalar, 1>(kc_, nc_, mc_,
                                                     num_threads);
    }

    // If the dimensions do not pass basic sanity checks, return immediately.
    if (kc_ <= 0 || mc_ <= 0 || nc_ <= 0) return;

    // If we are using the default Eigen gebp kernel, there is no need to
    // adjust the block sizes for MKL-DNN.
    if (!UseCustomContractionKernels()) return;

    // 2. Refine them to work well with mkldnn sgemm.
    mc_ = (std::min)(
        m, Eigen::divup(static_cast<StorageIndex>(mc_ * kScaleM), kUnrollM) *
               kUnrollM);
    nc_ = (std::min)(
        n, Eigen::divup(static_cast<StorageIndex>(nc_ * kScaleN), kUnrollN) *
               kUnrollN);

    // We split the K dimension into roughly equal slices.
    StorageIndex target_k_slices =
        (std::max)(StorageIndex(1), Eigen::divup(k, kc_));
    StorageIndex packet_size = internal::packet_traits<Scalar>::size;
    if (packet_size < 8) packet_size = 8;
    StorageIndex target_bk =
        Eigen::divup(k / target_k_slices, packet_size) * packet_size;
    kc_ = (std::min)(k, target_bk);
  }

  EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
  EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
  EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }

 private:
  StorageIndex kc_;
  StorageIndex mc_;
  StorageIndex nc_;
};
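
// For illustration only, a worked example with hypothetical sizes: if Eigen's
// heuristics pick mc_ = 100 for m = 1024, the refinement above rounds it up
// to a multiple of kUnrollM: min(1024, divup(100 * 1.5, 48) * 48) =
// min(1024, 4 * 48) = 192. A caller reads the final block sizes as:
//
//   TensorContractionBlocking<float, float, float, Index, ShardByCol>
//       blocking(/*k=*/512, /*m=*/1024, /*n=*/1024, /*num_threads=*/1);
//   const Index bk = blocking.kc();  // block size along the K dimension
//   const Index bm = blocking.mc();  // block size along the M dimension
//   const Index bn = blocking.nc();  // block size along the N dimension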

template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
          typename RhsMapper>
struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
                               LhsMapper, RhsMapper> {
  // For now, mkldnn has only mkldnn_sgemm (gemm for floats).
  using Scalar = float;
  using Traits = typename internal::gebp_traits<Scalar, Scalar>;

  using LhsPacker =
      gemm_pack_colmajor_block<Scalar, StorageIndex,
                               typename LhsMapper::SubMapper, ColMajor>;
  using RhsPacker =
      gemm_pack_colmajor_block<Scalar, StorageIndex,
                               typename RhsMapper::SubMapper, ColMajor>;
  using GemmKernel = mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>;

  // Fall back to the default Eigen pack and GEBP kernels if the custom
  // contraction kernels are disabled at runtime.
  using EigenLhsPacker =
      gemm_pack_lhs<Scalar, StorageIndex, typename LhsMapper::SubMapper,
                    Traits::mr, Traits::LhsProgress,
                    typename Traits::LhsPacket4Packing, ColMajor>;
  using EigenRhsPacker =
      gemm_pack_rhs<Scalar, StorageIndex, typename RhsMapper::SubMapper,
                    Traits::nr, ColMajor>;
  using GebpKernel =
      gebp_kernel<Scalar, Scalar, StorageIndex, OutputMapper, Traits::mr,
                  Traits::nr,
                  /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>;

  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void packLhs(
      Scalar* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
      const StorageIndex depth, const StorageIndex rows) {
    if (UseCustomContractionKernels()) {
      LhsPacker()(lhsBlock, data_mapper, rows, depth);
    } else {
      EigenLhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0,
                       /*offset*/ 0);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void packRhs(
      Scalar* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
      const StorageIndex depth, const StorageIndex cols) {
    if (UseCustomContractionKernels()) {
      RhsPacker()(rhsBlock, data_mapper, depth, cols);
    } else {
      EigenRhsPacker()(rhsBlock, data_mapper, depth, cols);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void invoke(
      const OutputMapper& output_mapper, const Scalar* lhsBlock,
      const Scalar* rhsBlock, const StorageIndex rows, const StorageIndex depth,
      const StorageIndex cols, const Scalar alpha) {
    if (UseCustomContractionKernels()) {
      GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
    } else {
      GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
                   /*strideA*/ -1, /*strideB*/ -1,
                   /*offsetA*/ 0, /*offsetB*/ 0);
    }
  }
};
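
// For illustration only, a rough sketch of how a contraction evaluator drives
// this kernel for one (i, j) output block (the buffer, mapper and block-size
// names are hypothetical; the real call sites live in Eigen's
// TensorContraction evaluators):
//
//   using Kernel = TensorContractionKernel<float, float, float, StorageIndex,
//                                          OutputMapper, LhsMapper, RhsMapper>;
//   Kernel::packLhs(lhs_block, lhs.getSubMapper(i, 0), /*depth=*/bk,
//                   /*rows=*/bm);
//   Kernel::packRhs(rhs_block, rhs.getSubMapper(0, j), /*depth=*/bk,
//                   /*cols=*/bn);
//   Kernel::invoke(output.getSubMapper(i, j), lhs_block, rhs_block,
//                  /*rows=*/bm, /*depth=*/bk, /*cols=*/bn, /*alpha=*/1.0f);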

#endif  // defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)

}  // namespace internal
}  // namespace Eigen

#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_CONTRACTION_KERNEL_H_