/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
#define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

// Note: the following header is used in both TF and TFLite. In particular, it
// is used for the float TFLite Conv2D.
#include "tensorflow/core/kernels/eigen_spatial_convolutions-inl.h"

#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
#include "tensorflow/core/kernels/eigen_contraction_kernel.h"

namespace Eigen {
namespace internal {

// After all loads from the underlying tensor have been vectorized with Packet
// ops, we have to finalize the coefficients that do not fit into a packet.
template <typename Scalar, typename DataMapper, int packet_size,
          bool masked_load_store>
struct FinalizeDataMapperCoeffs {
  EIGEN_ALWAYS_INLINE static Index finalize(Scalar* block,
                                            const DataMapper& rhs,
                                            Index base_idx, Index depth,
                                            Index max_depth, bool pad = false) {
    const Index num_coeffs = max_depth - depth;
    eigen_assert(num_coeffs <= packet_size);

    for (; depth < max_depth; ++depth) {
      *block = pad ? Scalar(0) : rhs.coeffNoPadding(depth, base_idx);
      ++block;
    }

    return num_coeffs;
  }
};

template <typename Scalar, typename DataMapper, int packet_size>
struct FinalizeDataMapperCoeffs<Scalar, DataMapper, packet_size,
                                /*masked_load_store=*/true> {
  EIGEN_ALWAYS_INLINE static Index finalize(Scalar* block,
                                            const DataMapper& rhs,
                                            Index base_idx, Index depth,
                                            Index max_depth, bool pad = false) {
    Index num_coeffs = max_depth - depth;
    eigen_assert(num_coeffs <= packet_size);
    if (num_coeffs == 0) return 0;

    using Packet = typename packet_traits<Scalar>::type;
    Packet p = pad ? pset1<Packet>(Scalar(0))
                   : rhs.partialPacketNoPadding(depth, base_idx, num_coeffs);
    internal::pstoreu(block, p, mask<Packet>(0, num_coeffs));

    return num_coeffs;
  }
};

// Pack a block of the right input matrix (in our case it's always a
// "virtual matrix" constructed from extracted image patches) into a
// contiguous block in column-major storage order. Knowing the properties of
// the original patch op, we can do this more efficiently than the default
// gemm_pack_colmajor_block.
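//
// For illustration only (this mirrors how SpatialConvolution in
// eigen_spatial_convolutions-inl.h typically sets up the contraction; it is
// an assumption about the caller, not a requirement of this packer): each
// column of the virtual matrix is one extracted image patch, stored with the
// depth (channel) dimension innermost, then patch rows, then patch cols, so
// that roughly
//
//   rows ~= patch_depth * patch_rows * patch_cols  (contraction dimension)
//   cols ~= number of patch locations in this block
//
// The loops below walk patch cols -> patch rows -> depth for every block
// column and copy the depth (and, for standard patches, row) dimension with
// full Packet loads/stores whenever possible.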
template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
          typename Device, typename Scalar, typename StorageIndex,
          typename nocontract_t, typename contract_t, int packet_size,
          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
struct gemm_pack_colmajor_block<
    Scalar, StorageIndex,
    TensorContractionSubMapper<
        Scalar, StorageIndex, Rhs,
        TensorEvaluator<
            const TensorReshapingOp<
                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
            Device>,
        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
        inner_dim_reordered, Alignment>,
    ColMajor> {
  typedef TensorContractionSubMapper<
      Scalar, StorageIndex, Rhs,
      TensorEvaluator<
          const TensorReshapingOp<
              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
          Device>,
      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
      inner_dim_reordered, Alignment>
      SubMapper;

  typedef SubMapper DataMapper;
  typedef typename packet_traits<Scalar>::type Packet;

  using CoeffFinalizer = FinalizeDataMapperCoeffs<
      Scalar, DataMapper, packet_size,
      TensorEvaluatorHasPartialPacket<typename DataMapper::TensorEvaluatorT,
                                      Packet, Index>::value &&
          unpacket_traits<Packet>::masked_store_available>;

  EIGEN_DONT_INLINE
  void operator()(Scalar* block, const DataMapper rhs, StorageIndex rows,
                  StorageIndex cols) {
    const bool standard_patches = !rhs.nonStandardPatches();

    if (standard_patches && (rhs.patchDepth() % packet_size == 0)) {
      // A single packet always belongs to a single patch (row, col).
      if (rhs.hasPadding()) {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/true,
                            /*has_padding=*/true>(block, rhs, rows, cols);
      } else {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/true,
                            /*has_padding=*/false>(block, rhs, rows, cols);
      }

    } else if (standard_patches) {
      // A single packet can span multiple patch rows or columns.
      if (rhs.hasPadding()) {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/false,
                            /*has_padding=*/true>(block, rhs, rows, cols);
      } else {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/false,
                            /*has_padding=*/false>(block, rhs, rows, cols);
      }

    } else if (rhs.patchDepth() % packet_size == 0) {
      // A single packet always belongs to a single patch (row, col).
      packNonStandardPatches</*patch_depth_is_multiple_of_packet_size=*/
                             true>(block, rhs, rows, cols);

    } else {
      // A single packet can span multiple patch rows or columns.
      packNonStandardPatches</*patch_depth_is_multiple_of_packet_size=*/
                             false>(block, rhs, rows, cols);
    }
  }

 private:
  // (A) Standard image patches:
  //
  //   (1) patch_row_inflate_strides == 1 AND
  //   (2) patch_col_inflate_strides == 1
  //
  // Standard patches guarantee that the two innermost dimensions (depth and
  // rows) are contiguous in memory, so we can try to squeeze reads from them.
  //
  // (B) Non-standard image patches: in_row/in_col and patch_row/patch_col
  // strides can be different from 1, and for each [row, col] inside a patch
  // we have to do additional computations to find the corresponding row and
  // col in the input tensor. We also can no longer squeeze reads from the
  // inner dimensions.
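  //
  // For example (an illustrative reading of the conditions above, not an
  // exhaustive list): patches extracted with a non-unit in_row/in_col stride
  // (atrous/dilated extraction) or with non-unit inflate strides fall into
  // category (B); a plain forward convolution, where all of these strides are
  // equal to one, produces category (A) patches.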
  //
  // Additional parameters:
  // - patch_depth_is_multiple_of_packet_size=true: The depth dimension size
  //   is guaranteed to be a multiple of the packet size, so we can skip all
  //   non-vectorized loads and checks, because the block size is guaranteed
  //   to be a multiple of the packet size (see TensorContractionBlocking).
  //
  // - has_padding: The input tensor has non-zero padding. In this case, for
  //   each patch col and row we need to check that it doesn't correspond to
  //   the padded region of the original input.
  template <bool patch_depth_is_multiple_of_packet_size, bool has_padding>
  EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* block,
                                               const DataMapper rhs,
                                               StorageIndex rows,
                                               StorageIndex cols) {
    eigen_assert(!rhs.nonStandardPatches());

    // peeled_k is the number of rows that can be packed with full packets
    // (the quantity called vectorized_rows in the gemm_pack_rhs
    // implementations above).
    const StorageIndex peeled_k = (rows / packet_size) * packet_size;

    const StorageIndex start_col = rhs.colOffset();
    const StorageIndex max_col = rhs.maxCol(peeled_k);
    const StorageIndex rhs_depth_offset = rhs.depthOffset();

    for (StorageIndex col = 0; col < cols; ++col) {
      SubMapper lm = rhs.getLinearMapper(0, col);

      StorageIndex k = 0;
      for (Index c = start_col; c < max_col; ++c) {
        eigen_assert(k <= peeled_k);

        const StorageIndex start_row = (c == start_col) ? rhs.rowOffset() : 0;
        const StorageIndex max_row = rhs.maxRow(peeled_k, c);
        const bool pad_col = has_padding && lm.padCol(c);

        eigen_assert(has_padding || !lm.padCol(c));
        eigen_assert(has_padding || !lm.padAnyRow(start_row, max_row - 1));

        // We can squeeze reads for all rows in the [start_row, max_row) range.
        if (!has_padding ||
            (!pad_col && !lm.padAnyRow(start_row, max_row - 1))) {
          const StorageIndex start_depth =
              (c == start_col) ? rhs_depth_offset : 0;

          const StorageIndex max_depth =
              std::min<StorageIndex>(start_depth + (peeled_k - k),
                                     (max_row - start_row) * rhs.patchDepth());

          const StorageIndex base_idx = lm.baseIndex(start_row, c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If patch depth is a multiple of packet size, it's guaranteed
            // that we can process all values in the depth dimension with
            // packets.
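            // For example (illustrative numbers, not a requirement): with
            // float AVX packets (packet_size == 8) and patch_depth == 32,
            // (max_depth - start_depth) is a multiple of 8, so each iteration
            // of the 4x-unrolled loop below stores 32 coefficients and no
            // scalar tail is ever needed.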
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

            const StorageIndex unrolled_depth = max_depth - 4 * packet_size;
            for (; d <= unrolled_depth; d += 4 * packet_size) {
              eigen_assert(k < peeled_k);

              Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
              Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
              Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
              Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);

              internal::pstoreu(block + 0 * packet_size, p0);
              internal::pstoreu(block + 1 * packet_size, p1);
              internal::pstoreu(block + 2 * packet_size, p2);
              internal::pstoreu(block + 3 * packet_size, p3);

              block += 4 * packet_size;
              k += 4 * packet_size;
            }

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
              block += packet_size;
              k += packet_size;
            }

          } else {
            StorageIndex d = start_depth;

            const StorageIndex unrolled_depth = max_depth - 4 * packet_size;
            for (; d <= unrolled_depth; d += 4 * packet_size) {
              eigen_assert(k < peeled_k);

              Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
              Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
              Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
              Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);

              internal::pstoreu(block + 0 * packet_size, p0);
              internal::pstoreu(block + 1 * packet_size, p1);
              internal::pstoreu(block + 2 * packet_size, p2);
              internal::pstoreu(block + 3 * packet_size, p3);

              block += 4 * packet_size;
              k += 4 * packet_size;
            }

            const StorageIndex vectorized_depth = max_depth - packet_size;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            const Index num_coeffs =
                CoeffFinalizer::finalize(block, rhs, base_idx, d, max_depth);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }

          // Go to the next column.
          continue;
        }

        // If we are not allowed to squeeze reads along the `row` and `depth`
        // dimensions, we must process rows one by one.
        for (StorageIndex r = start_row; r < max_row; ++r) {
          eigen_assert(k <= peeled_k);

          const StorageIndex start_depth =
              ((c == start_col) && (r == start_row)) ? rhs_depth_offset : 0;
          const StorageIndex max_depth =
              rhs.maxDepth(peeled_k - k, start_depth);

          const bool pad = has_padding && (pad_col || lm.padRow(r));
          eigen_assert(has_padding || !lm.padRow(r));

          const StorageIndex base_idx = lm.baseIndex(r, c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If patch depth is a multiple of packet size, it's guaranteed
            // that we can process all values in the depth dimension with
            // packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = (has_padding && pad)
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

          } else {
            StorageIndex d = start_depth;

            const StorageIndex vectorized_depth = max_depth - packet_size;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = (has_padding && pad)
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            const Index num_coeffs = CoeffFinalizer::finalize(
                block, rhs, base_idx, d, max_depth, has_padding && pad);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }
        }
      }

      // The loop above should fill peeled_k elements.
      eigen_assert(peeled_k == k);

      // Fill remaining elements using loadCoeffStandard.
      for (; k < rows; ++k) {
        *block = lm.loadCoeffStandard(k);
        ++block;
      }
    }
  }

  template <bool patch_depth_is_multiple_of_packet_size>
  EIGEN_ALWAYS_INLINE void packNonStandardPatches(Scalar* block,
                                                  const DataMapper rhs,
                                                  StorageIndex rows,
                                                  StorageIndex cols) {
    eigen_assert(rhs.nonStandardPatches());

    // peeled_k is the number of rows that can be packed with full packets
    // (the quantity called vectorized_rows in the gemm_pack_rhs
    // implementations above).
    const StorageIndex peeled_k = (rows / packet_size) * packet_size;

    const StorageIndex start_col = rhs.colOffset();
    const StorageIndex max_col = rhs.maxCol(peeled_k);
    const StorageIndex rhs_depth_offset = rhs.depthOffset();

    // Original input column and row after applying all non-standard strides
    // and dilations. Computed by padOrSkip{Row,Col}.
    Index orig_c = 0;
    Index orig_r = 0;

    for (StorageIndex col = 0; col < cols; ++col) {
      SubMapper lm = rhs.getLinearMapper(0, col);

      StorageIndex k = 0;
      for (Index c = start_col; c < max_col; ++c) {
        eigen_assert(k <= peeled_k);

        const StorageIndex start_row = (c == start_col) ? rhs.rowOffset() : 0;
        const StorageIndex max_row = rhs.maxRow(peeled_k, c);
        const bool pad_or_skip_col = lm.padOrSkipCol(c, &orig_c);

        for (StorageIndex r = start_row; r < max_row; ++r) {
          eigen_assert(k <= peeled_k);

          const StorageIndex start_depth =
              ((c == start_col) && (r == start_row)) ? rhs_depth_offset : 0;
          const StorageIndex max_depth =
              rhs.maxDepth(peeled_k - k, start_depth);

          const bool pad_or_skip =
              pad_or_skip_col || lm.padOrSkipRow(r, &orig_r);
          const StorageIndex base_idx = lm.origBaseIndex(orig_r, orig_c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If patch depth is a multiple of packet size, it's guaranteed
            // that we can process all values in the depth dimension with
            // packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = pad_or_skip ? pset1<Packet>(Scalar(0))
                                           : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

          } else {
            const StorageIndex vectorized_depth = max_depth - packet_size;
            StorageIndex d = start_depth;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = pad_or_skip
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            const Index num_coeffs = CoeffFinalizer::finalize(
                block, rhs, base_idx, d, max_depth, pad_or_skip);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }
        }
      }

      // The loop above should fill peeled_k elements.
      eigen_assert(peeled_k == k);

      // Fill remaining elements using loadCoeff.
      for (; k < rows; ++k) {
        *block = lm(k);
        ++block;
      }
    }
  }
};
}  // namespace internal
}  // namespace Eigen
#endif  // defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_