/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CONTRACTION_KERNEL_H_
#define TENSORFLOW_CORE_KERNELS_EIGEN_CONTRACTION_KERNEL_H_

// Depending on the build configuration, this header provides a custom kernel
// for Eigen tensor contractions (the small matrix multiplication kernel used
// to multiply together blocks of the original tensors).
//
// 1) --define tensorflow_mkldnn_contraction_kernel=1
//    Use the MKL-DNN single-threaded sgemm. The mkldnn kernels are generated
//    at runtime and use avx/avx2/fma/avx512 instructions based on runtime CPU
//    feature detection (https://en.wikipedia.org/wiki/CPUID).
//
// If you use `tensor.contract(other_tensor)` in your code, you must include
// this header to get the benefit of the custom contraction kernel:
//
//   #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
//   #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
//   #endif

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
#include "mkldnn.h"
#endif

namespace Eigen {
namespace internal {

#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
// Returns `true` iff we can use custom contraction kernels. This is a runtime
// check that uses environment variables.
bool UseCustomContractionKernels();

// Packs a 2D block of a Tensor expression into a contiguous block of memory
// with col-major storage order. We do not have access to the underlying Tensor
// expression; we only have a DataMapper (TensorContractionInputMapper for
// tensor contractions, or blas_data_mapper for plain tensors) that provides a
// two-dimensional view into the Tensor expression.
//
// The default Eigen gemm_pack_rhs and gemm_pack_lhs pack blocks of tensor
// expressions into the packed format described in the "Anatomy of
// High-Performance Matrix Multiplication" paper (1).
// Eigen::internal::gebp_kernel relies on this packing format for efficient
// micro-panel multiplication.
//
// This simple packing can be used with any '?gemm' function from BLAS
// libraries that works with col-major matrices.
//
// (1) http://www.cs.utexas.edu/~flame/pubs/GotoTOMS_revision.pdf
//
// IMPORTANT: `gemm_pack_colmajor_block` always packs the block in column-major
// order; DataMapperStorageOrder specifies the storage order of the underlying
// Tensor expression.
template <typename Scalar, typename IndexType, typename DataMapper,
          int DataMapperStorageOrder>
struct gemm_pack_colmajor_block;
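
// A minimal usage sketch (illustrative only; `mapper`, `packed`, `row_offset`,
// `col_offset`, `rows` and `cols` are hypothetical names for a col-major float
// data mapper, a destination buffer of at least rows * cols scalars, and the
// block coordinates/sizes):
//
//   gemm_pack_colmajor_block<float, Eigen::Index, DataMapper, ColMajor> pack;
//   pack(packed, mapper.getSubMapper(row_offset, col_offset), rows, cols);
//
// After the call, `packed` holds the rows x cols block contiguously in
// column-major order.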

// gemm_pack_colmajor_block for ColMajor storage order.
template <typename Scalar, typename IndexType, typename DataMapper>
struct gemm_pack_colmajor_block<Scalar, IndexType, DataMapper,
                                /*DataMapperStorageOrder*/ ColMajor> {
  typedef typename internal::packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;

  enum { PacketSize = internal::packet_traits<Scalar>::size };

  EIGEN_DONT_INLINE
  void operator()(Scalar* block, const DataMapper& data_mapper, IndexType rows,
                  IndexType cols) {
    const IndexType unrolled_rows = rows - 4 * PacketSize;
    const IndexType vectorized_rows = rows - PacketSize;

    for (IndexType col = 0; col < cols; ++col) {
      LinearMapper lm = data_mapper.getLinearMapper(0, col);

      IndexType row = 0;
      // Give the compiler a strong hint to unroll the loop.
      for (; row <= unrolled_rows; row += 4 * PacketSize) {
        for (IndexType j = 0; j < 4; ++j) {
          const Packet p =
              lm.template loadPacket<Packet>(row + j * PacketSize);
          internal::pstoreu(block + j * PacketSize, p);
        }
        block += 4 * PacketSize;
      }
      // Process remaining rows with packets.
      for (; row <= vectorized_rows; row += PacketSize) {
        const Packet p = lm.template loadPacket<Packet>(row);
        internal::pstoreu(block, p);
        block += PacketSize;
      }
      // Finalize with coefficients.
      for (; row < rows; ++row) {
        *block = lm(row);
        ++block;
      }
    }
  }
};

#endif  // TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL

// Enabled by build option: "--define tensorflow_mkldnn_contraction_kernel=1"
#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)

template <typename Scalar, typename IndexType, typename OutputMapper,
          bool ConjugateLhs = false, bool ConjugateRhs = false>
struct mkldnn_gemm_kernel;

// mkldnn_gemm_kernel for floats, defined as a thin layer on top of
// mkldnn_sgemm.
template <typename IndexType, typename OutputMapper, bool ConjugateLhs,
          bool ConjugateRhs>
struct mkldnn_gemm_kernel</*Scalar*/ float, IndexType, OutputMapper,
                          ConjugateLhs, ConjugateRhs> {
  static_assert(!ConjugateLhs, "MKL-DNN kernel doesn't support ConjugateLhs");
  static_assert(!ConjugateRhs, "MKL-DNN kernel doesn't support ConjugateRhs");

  EIGEN_DONT_INLINE
  void operator()(const OutputMapper& output, const float* blockA,
                  const float* blockB, const IndexType rows,
                  const IndexType depth, const IndexType cols, float alpha) {
    static const int max_index = (std::numeric_limits<int>::max)();

    eigen_assert(max_index >= rows);
    eigen_assert(max_index >= cols);
    eigen_assert(max_index >= depth);
    eigen_assert(max_index >= output.stride());

    const int m = static_cast<int>(rows);
    const int n = static_cast<int>(cols);
    const int k = static_cast<int>(depth);

    const char transposeA = 'N';
    const char transposeB = 'N';

    const int ldA = m;
    const int ldB = k;
    const int ldC = static_cast<int>(output.stride());

    const float beta = 1.0;

    mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
                                      &alpha, blockA, &ldA, blockB, &ldB,
                                      &beta, const_cast<float*>(output.data()),
                                      &ldC);
    eigen_assert(st == 0);

    // eigen_assert is a no-op in optimized mode so we add these to avoid the
    // compiler's unused-variable errors.
    EIGEN_UNUSED_VARIABLE(max_index);
    EIGEN_UNUSED_VARIABLE(st);
  }
};
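
// A minimal invocation sketch (illustrative only; `output_mapper`,
// `packed_lhs` and `packed_rhs` are hypothetical names for an OutputMapper
// over the destination block and two col-major packed blocks of shape
// rows x depth and depth x cols):
//
//   mkldnn_gemm_kernel<float, Eigen::Index, OutputMapper> gemm;
//   gemm(output_mapper, packed_lhs, packed_rhs, rows, depth, cols,
//        /*alpha=*/1.0f);
//
// Because beta is hardcoded to 1.0 above, the product alpha * A * B is
// accumulated into the existing contents of the output block.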

// For mkldnn_sgemm, having the right dimensions (especially for small
// matrices) is more important than fitting the whole working set into the
// L1/L2 caches.
// TODO(ezhulenev): Do better heuristics.
template <typename StorageIndex, int sharding_type>
class TensorContractionBlocking<float, float, float, StorageIndex,
                                sharding_type> {
  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
  using Scalar = float;

  // Adjust the block sizes to work well with mkldnn kernels.

  // Multiply the default choice of block size along the M and N dimensions.
  // TODO(ezhulenev): Explore if this can work in general (kScaleM=2.0 worked
  // well in some models).
  static constexpr float kScaleM = 1.5;
  static constexpr float kScaleN = 1.0;

  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48.
  static const StorageIndex kUnrollM = 48;

  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8.
  static const StorageIndex kUnrollN = 24;

 public:
  TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n,
                            StorageIndex num_threads = 1)
      : kc_(k), mc_(m), nc_(n) {
    // 1. Compute block sizes using default Eigen heuristics.
    if (sharding_type == ShardByCol) {
      computeProductBlockingSizes<Scalar, Scalar, 1>(kc_, mc_, nc_,
                                                     num_threads);
    } else {
      computeProductBlockingSizes<Scalar, Scalar, 1>(kc_, nc_, mc_,
                                                     num_threads);
    }

    // If the dimensions do not pass basic sanity checks, return immediately.
    if (kc_ <= 0 || mc_ <= 0 || nc_ <= 0) return;

    // If we are using the default Eigen gebp kernel, there is no need to
    // adjust the block sizes for MKL-DNN.
    if (!UseCustomContractionKernels()) return;

    // 2. Then refine them to work well with the mkldnn sgemm.
    mc_ = (std::min)(
        m, Eigen::divup(static_cast<StorageIndex>(mc_ * kScaleM), kUnrollM) *
               kUnrollM);
    nc_ = (std::min)(
        n, Eigen::divup(static_cast<StorageIndex>(nc_ * kScaleN), kUnrollN) *
               kUnrollN);

    // We split the K dimension into roughly equal slices.
    StorageIndex target_k_slices =
        (std::max)(StorageIndex(1), Eigen::divup(k, kc_));
    StorageIndex packet_size = internal::packet_traits<Scalar>::size;
    if (packet_size < 8) packet_size = 8;
    StorageIndex target_bk =
        Eigen::divup(k / target_k_slices, packet_size) * packet_size;
    kc_ = (std::min)(k, target_bk);
  }

  EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
  EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
  EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }

 private:
  StorageIndex kc_;
  StorageIndex mc_;
  StorageIndex nc_;
};
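
// A worked example of the refinement above (numbers are hypothetical): for
// m = 1000, if the default Eigen heuristic picks mc_ = 200, then with
// kScaleM = 1.5 and kUnrollM = 48:
//   mc_ = min(1000, divup(200 * 1.5, 48) * 48) = min(1000, 7 * 48) = 336.
// Similarly, for k = 1000 and an initial kc_ = 384 (with an effective packet
// size of 8): target_k_slices = divup(1000, 384) = 3, and
//   kc_ = min(1000, divup(1000 / 3, 8) * 8) = min(1000, 42 * 8) = 336.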

template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
          typename RhsMapper>
struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
                               LhsMapper, RhsMapper> {
  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
  using Scalar = float;
  using Traits = typename internal::gebp_traits<Scalar, Scalar>;

  using LhsPacker =
      gemm_pack_colmajor_block<Scalar, StorageIndex,
                               typename LhsMapper::SubMapper, ColMajor>;
  using RhsPacker =
      gemm_pack_colmajor_block<Scalar, StorageIndex,
                               typename RhsMapper::SubMapper, ColMajor>;
  using GemmKernel = mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>;

  // Fall back on the default Eigen pack and GEBP kernels if custom contraction
  // kernels are disabled at runtime.
  using EigenLhsPacker =
      gemm_pack_lhs<Scalar, StorageIndex, typename LhsMapper::SubMapper,
                    Traits::mr, Traits::LhsProgress,
                    typename Traits::LhsPacket4Packing, ColMajor>;
  using EigenRhsPacker =
      gemm_pack_rhs<Scalar, StorageIndex, typename RhsMapper::SubMapper,
                    Traits::nr, ColMajor>;
  using GebpKernel =
      gebp_kernel<Scalar, Scalar, StorageIndex, OutputMapper, Traits::mr,
                  Traits::nr,
                  /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>;

  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void packLhs(
      Scalar* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
      const StorageIndex depth, const StorageIndex rows) {
    if (UseCustomContractionKernels()) {
      LhsPacker()(lhsBlock, data_mapper, rows, depth);
    } else {
      EigenLhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0,
                       /*offset*/ 0);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void packRhs(
      Scalar* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
      const StorageIndex depth, const StorageIndex cols) {
    if (UseCustomContractionKernels()) {
      RhsPacker()(rhsBlock, data_mapper, depth, cols);
    } else {
      EigenRhsPacker()(rhsBlock, data_mapper, depth, cols);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void invoke(
      const OutputMapper& output_mapper, const Scalar* lhsBlock,
      const Scalar* rhsBlock, const StorageIndex rows, const StorageIndex depth,
      const StorageIndex cols, const Scalar alpha) {
    if (UseCustomContractionKernels()) {
      GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
    } else {
      GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
                   /*strideA*/ -1, /*strideB*/ -1,
                   /*offsetA*/ 0, /*offsetB*/ 0);
    }
  }
};

#endif  // defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)

}  // namespace internal
}  // namespace Eigen

#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_CONTRACTION_KERNEL_H_