// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H

namespace Eigen {

/** \class TensorOpCost
  * \ingroup CXX11_Tensor_Module
  *
  * \brief A cost model used to limit the number of threads used for
  * evaluating tensor expressions.
  *
  */

// Class storing the cost of evaluating a tensor expression in terms of the
// estimated number of operand bytes loaded, bytes stored, and compute cycles.
class TensorOpCost {
 public:
  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
  // model based on minimal reciprocal throughput numbers from Intel or
  // Agner Fog's tables would be better than what is there now.
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
    return internal::functor_traits<
        internal::scalar_product_op<ArgType, ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
    return internal::functor_traits<
        internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
  }
  template <typename SrcType, typename TargetType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
    return internal::functor_traits<
        internal::scalar_cast_op<SrcType, TargetType> >::Cost;
  }

  EIGEN_DEVICE_FUNC
  TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(compute_cycles) {}

  // If the evaluation is vectorized, the compute cycles are amortized over
  // the packet size, since one vector instruction produces packet_size
  // coefficients.
  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
               bool vectorized, double packet_size)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(vectorized ? compute_cycles / packet_size
                                   : compute_cycles) {
    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
  }
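  // Illustrative sketch (not part of the library): the per-coefficient cost
  // of c = a * b on floats, where each coefficient loads two 4-byte operands,
  // stores one 4-byte result, and pays one multiply. The packet size of 4 is
  // an assumption corresponding to 128-bit SIMD registers; real callers would
  // take the packet size from the evaluator's PacketSize instead.
  //
  //   TensorOpCost mul_cost(/*bytes_loaded=*/2 * sizeof(float),
  //                         /*bytes_stored=*/sizeof(float),
  //                         /*compute_cycles=*/TensorOpCost::MulCost<float>(),
  //                         /*vectorized=*/true,
  //                         /*packet_size=*/4);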
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
    return bytes_loaded_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
    return bytes_stored_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
    return compute_cycles_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
      double load_cost, double store_cost, double compute_cost) const {
    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
           compute_cost * compute_cycles_;
  }

  // Drop memory access component. Intended for cases when memory accesses are
  // sequential or are completely masked by computations.
  EIGEN_DEVICE_FUNC void dropMemoryCost() {
    bytes_loaded_ = 0;
    bytes_stored_ = 0;
  }

  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
      const TensorOpCost& rhs) const {
    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
  }

  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
      const TensorOpCost& rhs) const {
    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
      const TensorOpCost& rhs) {
    bytes_loaded_ += rhs.bytes_loaded();
    bytes_stored_ += rhs.bytes_stored();
    compute_cycles_ += rhs.compute_cycles();
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
    bytes_loaded_ *= rhs;
    bytes_stored_ *= rhs;
    compute_cycles_ *= rhs;
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
      TensorOpCost lhs, const TensorOpCost& rhs) {
    lhs += rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      TensorOpCost lhs, double rhs) {
    lhs *= rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      double lhs, TensorOpCost rhs) {
    rhs *= lhs;
    return rhs;
  }

  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
    return os << "[bytes_loaded = " << tc.bytes_loaded()
              << ", bytes_stored = " << tc.bytes_stored()
              << ", compute_cycles = " << tc.compute_cycles() << "]";
  }

 private:
  double bytes_loaded_;
  double bytes_stored_;
  double compute_cycles_;
};
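// A minimal usage sketch (assumptions, not library code): per-coefficient
// costs compose additively with operator+ and scale with operator*, so the
// cost of evaluating d = a * b + c over n float coefficients could be
// modeled as below. The operand and result byte counts are illustrative;
// the multiply stores nothing because its result stays in a register.
//
//   TensorOpCost mul(2 * sizeof(float), 0, TensorOpCost::MulCost<float>());
//   TensorOpCost add(sizeof(float), sizeof(float),
//                    TensorOpCost::AddCost<float>());
//   TensorOpCost per_coeff = mul + add;          // cost of one output coeff
//   TensorOpCost total = per_coeff * double(n);  // cost of the whole kernel
//   std::cout << total << "\n";                  // uses operator<< above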
// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of
// threads in [1:max_threads] instead of just switching multi-threading off
// for small work units.
template <typename Device>
class TensorCostModel {
 public:
  // Scaling from Eigen compute cost to device cycles.
  static const int kDeviceCyclesPerComputeCycle = 1;

  // Costs in device cycles.
  static const int kStartupCycles = 100000;
  static const int kPerThreadCycles = 100000;
  static const int kTaskSize = 40000;

  // Returns the number of threads in [1:max_threads] to use for
  // evaluating an expression with the given output size and cost per
  // coefficient.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
      double output_size, const TensorOpCost& cost_per_coeff,
      int max_threads) {
    double cost = totalCost(output_size, cost_per_coeff);
    // Adding 0.9 before the implicit truncation to int rounds the thread
    // count up unless the fractional part is below 0.1.
    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
    return numext::mini(max_threads, numext::maxi(1, threads));
  }

  // taskSize assesses parallel task size.
  // A value of 1.0 means ideal parallel task size. Values < 1.0 mean that
  // task granularity needs to be increased to mitigate parallelization
  // overheads.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
      double output_size, const TensorOpCost& cost_per_coeff) {
    return totalCost(output_size, cost_per_coeff) / kTaskSize;
  }

 private:
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
      double output_size, const TensorOpCost& cost_per_coeff) {
    // Cost of memory fetches from L2 cache. 64 is the typical cache line
    // size, and 11 is the L2 cache latency on Haswell.
    // We don't know whether data is in L1, L2 or L3. But we are most
    // interested in single-threaded computational time around 100us-10ms
    // (smaller times are too small for parallelization, larger times are not
    // interesting either because we are probably using all available threads
    // already). And for the target time range, L2 seems to be what matters.
    // A data set fitting into L1 is too small to take noticeable time, and a
    // data set fitting only into L3 presumably will take more than 10ms to
    // load and process.
    const double kLoadCycles = 1.0 / 64 * 11;
    const double kStoreCycles = 1.0 / 64 * 11;
    // Scaling from Eigen compute cost to device cycles.
    return output_size *
           cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
                                     kDeviceCyclesPerComputeCycle);
  }
};
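// A worked example (illustrative; ThreadPoolDevice is assumed as the device
// type): with output_size = 1e6 coefficients, each loading 8 bytes, storing
// 4 bytes and costing 1 compute cycle, totalCost is
// 1e6 * ((8 + 4) * 11/64 + 1) = 3.0625e6 device cycles, and numThreads
// returns min(max_threads, int((3.0625e6 - 100000) / 100000 + 0.9)) = 30.
//
//   TensorOpCost per_coeff(/*bytes_loaded=*/8, /*bytes_stored=*/4,
//                          /*compute_cycles=*/1);
//   int threads = TensorCostModel<ThreadPoolDevice>::numThreads(
//       /*output_size=*/1e6, per_coeff, /*max_threads=*/48);  // == 30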
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H