/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
#define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
#include "tensorflow/core/kernels/eigen_contraction_kernel.h"

namespace Eigen {
namespace internal {

// Pack a block of the right input matrix (in our case it's always a
// "virtual matrix" constructed from extracted image patches) into a
// contiguous block in column-major storage order. Knowing the properties of
// the original patch op, we can do this more efficiently than the default
// gemm_pack_colmajor_block.
template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
          typename Device, typename Scalar, typename StorageIndex,
          typename nocontract_t, typename contract_t, int packet_size,
          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
struct gemm_pack_colmajor_block<
    Scalar, StorageIndex,
    TensorContractionSubMapper<
        Scalar, StorageIndex, Rhs,
        TensorEvaluator<
            const TensorReshapingOp<
                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
            Device>,
        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
        inner_dim_reordered, Alignment>,
    ColMajor> {
  typedef TensorContractionSubMapper<
      Scalar, StorageIndex, Rhs,
      TensorEvaluator<
          const TensorReshapingOp<
              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
          Device>,
      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
      inner_dim_reordered, Alignment>
      SubMapper;

  typedef SubMapper DataMapper;
  typedef typename packet_traits<Scalar>::type Packet;

  EIGEN_DONT_INLINE
  void operator()(Scalar* block, const DataMapper rhs, StorageIndex rows,
                  StorageIndex cols) {
    const bool standard_patches = !rhs.nonStandardPatches();

    if (standard_patches && (rhs.patchDepth() % packet_size == 0)) {
      // A single packet always belongs to a single patch (row, col).
      packStandardPatches</*patch_depth_is_multiple_of_packet_size*/ true>(
          block, rhs, rows, cols);

    } else if (standard_patches) {
      // A single packet can span multiple patch rows or columns.
      packStandardPatches</*patch_depth_is_multiple_of_packet_size*/ false>(
          block, rhs, rows, cols);

    } else {
      // With non-standard patches we don't do any vectorized loads.
      // TODO(ezhulenev): It doesn't look like we should completely give up
      // on packets. Make this code path faster!
      for (StorageIndex col = 0; col < cols; ++col) {
        SubMapper lm = rhs.getLinearMapper(0, col);
        for (StorageIndex i = 0; i < rows; ++i) {
          *block = lm(i);
          ++block;
        }
      }
    }
  }

 private:
  // Pack standard image patches:
  //
  // - patch_depth_is_multiple_of_packet_size=true: the depth dimension size
  //   is guaranteed to be a multiple of the packet size, so we can skip all
  //   non-vectorized loads and checks.
  //
  // - patch_depth_is_multiple_of_packet_size=false: a packet may cross the
  //   end of the depth dimension, so the tail of each region is packed with
  //   scalar loads.
  template <bool patch_depth_is_multiple_of_packet_size>
  EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* block,
                                               const DataMapper rhs,
                                               StorageIndex rows,
                                               StorageIndex cols) {
    eigen_assert(!rhs.nonStandardPatches());

    // Number of rows that can be packed with vectorized (packet) loads; the
    // name `peeled_k` matches the gemm_pack_rhs specializations.
    const StorageIndex peeled_k = (rows / packet_size) * packet_size;

    const StorageIndex start_col = rhs.colOffset();
    const StorageIndex max_col = rhs.maxCol(peeled_k);

    for (StorageIndex col = 0; col < cols; ++col) {
      SubMapper lm = rhs.getLinearMapper(0, col);

      StorageIndex k = 0;
      for (Index c = start_col; c < max_col; ++c) {
        eigen_assert(k <= peeled_k);

        const StorageIndex start_row = (c == start_col) ? rhs.rowOffset() : 0;
        const StorageIndex max_row = rhs.maxRow(peeled_k, c);
        const bool pad_col = lm.padCol(c);

        // We can squeeze reads for all rows in the [start_row, max_row) range.
        if (!pad_col && !lm.padAnyRow(start_row, max_row - 1)) {
          const StorageIndex start_depth =
              (c == start_col) ? rhs.depthOffset() : 0;

          const StorageIndex max_depth =
              std::min<StorageIndex>(start_depth + (peeled_k - k),
                                     (max_row - start_row) * rhs.patchDepth());

          const StorageIndex base_idx = lm.baseIndex(start_row, c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If patch depth is a multiple of packet size, it's guaranteed
            // that we can process all values in the depth dimension with
            // packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
              block += packet_size;
              k += packet_size;
            }

          } else {
            StorageIndex d = start_depth;
            const StorageIndex vectorized_depth = max_depth - packet_size;

            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
              block += packet_size;
              k += packet_size;
            }
            for (; d < max_depth; d++) {
              eigen_assert(k < peeled_k);
              *block = rhs.coeffNoPadding(d, base_idx);
              ++block;
              ++k;
            }
          }

          // Go to the next column.
          continue;
        }

        // If we are not allowed to squeeze reads along the `row` and `depth`
        // dimensions, we must process rows one by one.
        for (StorageIndex r = start_row; r < max_row; ++r) {
          eigen_assert(k <= peeled_k);

          const StorageIndex start_depth =
              ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
          const StorageIndex max_depth =
              rhs.maxDepth(peeled_k - k, start_depth);

          const bool pad = pad_col || lm.padRow(r);
          const StorageIndex base_idx = lm.baseIndex(r, c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If patch depth is a multiple of packet size, it's guaranteed
            // that we can process all values in the depth dimension with
            // packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = pad ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

          } else {
            const StorageIndex max_vectorized_depth = max_depth - packet_size;
            StorageIndex d = start_depth;
            for (; d < max_vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = pad ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }
            for (; d < max_depth; d++) {
              eigen_assert(k < peeled_k);
              *block = pad ? Scalar(0) : rhs.coeffNoPadding(d, base_idx);
              ++block;
              ++k;
            }
          }
        }
      }

      // The loop above should fill peeled_k elements.
      eigen_assert(peeled_k == k);

      // Fill the remaining elements using loadCoeffStandard.
      for (; k < rows; ++k) {
        *block = lm.loadCoeffStandard(k);
        ++block;
      }
    }
  }
};
}  // end namespace internal
}  // end namespace Eigen
#endif  // defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)

// Note: the following header is used in both TF and TFLite. In particular,
// it's used for the float TFLite Conv2D kernel.
#include "tensorflow/core/kernels/eigen_spatial_convolutions-inl.h"

#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_