1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_ 17 #define TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_ 18 19 #include <numeric> 20 21 #include "tensorflow/core/grappler/costs/cost_estimator.h" 22 #include "tensorflow/core/grappler/costs/op_context.h" 23 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h" 24 #include "tensorflow/core/lib/core/status.h" 25 #include "tensorflow/core/util/padding.h" 26 27 namespace tensorflow { 28 namespace grappler { 29 30 bool GetTensorShapeProtoFromTensorProto(const TensorProto& tensor_proto, 31 TensorShapeProto* tensor_shape_proto); 32 TensorShapeProto MaybeGetMinimumShape(const TensorShapeProto& original_shape, 33 int rank, bool* found_unknown_shapes); 34 35 // Node costs; an intermediate structure used within op level cost estimator. 36 struct NodeCosts { 37 // If this FLAG is true, override calculated compute time with a minimum 38 // value, instead of calculating it from num_compute_ops and compute ops/sec. 39 // For example, PredictIdentity, PredictVariable, PredictMetadata set this 40 // FLAG. 41 bool minimum_cost_op = false; 42 43 // Compute ops. 44 int64 num_compute_ops = 0; 45 46 // Memory bytes accessed; note that these may be different to the size of 47 // tensors. 48 std::vector<int64> num_input_bytes_accessed; // ordered by input tensors. 49 std::vector<int64> num_output_bytes_accessed; // ordered by output ports. 50 int64 internal_read_bytes = 0; 51 int64 internal_write_bytes = 0; 52 53 // Convenience functions. num_total_input_bytesNodeCosts54 int64 num_total_input_bytes() const { 55 return std::accumulate(num_input_bytes_accessed.begin(), 56 num_input_bytes_accessed.end(), 0LL); 57 } num_total_read_bytesNodeCosts58 int64 num_total_read_bytes() const { 59 return num_total_input_bytes() + internal_read_bytes; 60 } num_total_output_bytesNodeCosts61 int64 num_total_output_bytes() const { 62 return std::accumulate(num_output_bytes_accessed.begin(), 63 num_output_bytes_accessed.end(), 0LL); 64 } num_total_write_bytesNodeCosts65 int64 num_total_write_bytes() const { 66 return num_total_output_bytes() + internal_write_bytes; 67 } num_bytes_accessedNodeCosts68 int64 num_bytes_accessed() const { 69 return num_total_read_bytes() + num_total_write_bytes(); 70 } 71 72 // Memory usage. 73 int64 max_memory = 0; 74 int64 persistent_memory = 0; 75 int64 temporary_memory = 0; 76 77 // Stats. 78 int64 num_nodes = 1; 79 int64 num_nodes_with_unknown_shapes = 0; 80 int64 num_nodes_with_unknown_op_type = 0; 81 int64 num_nodes_with_pure_memory_op = 0; 82 bool inaccurate = false; 83 84 // TODO(dyoon): this is added for compatibility; some old code is hard to 85 // migrate; hence, using these as a backup. Once we clean up, we'll delete 86 // these fields. New code should not use these. 87 bool has_costs = false; 88 Costs costs; 89 }; 90 91 class OpLevelCostEstimator { 92 public: 93 OpLevelCostEstimator(); ~OpLevelCostEstimator()94 virtual ~OpLevelCostEstimator() {} 95 96 virtual Costs PredictCosts(const OpContext& op_context) const; 97 98 // Returns basic device performance info. 99 virtual DeviceInfo GetDeviceInfo(const DeviceProperties& device) const; 100 101 protected: 102 // TODO(dyoon): Consider to remove PredictOpCountBasedCosts() with OpInfo. 103 // Naive cost estimate based on the given operations count and total 104 // input/output tensor sizes of the given op_info combined. 105 Costs PredictOpCountBasedCost(double operations, const OpInfo& op_info) const; 106 107 // Naive cost estimate based on the given operations count and the given total 108 // io size in bytes. Sizes of op_info inputs and outputs are not taken into 109 // consideration. 110 Costs PredictOpCountBasedCost(double operations, double input_io_bytes, 111 double output_io_bytes, 112 const OpInfo& op_info) const; 113 114 // Top-level method cost function (PredictCosts calls this method to get 115 // NodeCosts, and then converts it to Costs). PredictNodeCosts() calls other 116 // Predict methods depending on op types. 117 Status PredictNodeCosts(const OpContext& op_context, 118 NodeCosts* node_costs) const; 119 120 // Predict cost of an op for which no accurate estimator is defined. 121 Status PredictCostOfAnUnknownOp(const OpContext& op_context, 122 NodeCosts* node_costs) const; 123 124 // This family of routines predicts the costs to 125 // perform the specified TensorFlow Op on the 126 // device represented by a subclass. The default 127 // implementation just divides the operations to 128 // perform the op (from the "Count" routines, 129 // above) by the device peak operations per 130 // second. 131 // Implementation of costs other than 132 // execution_time is optional, depending on the 133 // device. 134 Status PredictNaryOp(const OpContext& op_context, 135 NodeCosts* node_costs) const; 136 Status PredictConv2D(const OpContext& op_context, 137 NodeCosts* node_costs) const; 138 Status PredictCwiseOp(const OpContext& op_context, 139 NodeCosts* node_costs) const; 140 Status PredictConv2DBackpropInput(const OpContext& op_context, 141 NodeCosts* node_costs) const; 142 Status PredictConv2DBackpropFilter(const OpContext& op_context, 143 NodeCosts* node_costs) const; 144 Status PredictFusedConv2DBiasActivation(const OpContext& op_context, 145 NodeCosts* node_costs) const; 146 Status PredictMatMul(const OpContext& op_context, 147 NodeCosts* node_costs) const; 148 Status PredictSparseTensorDenseMatMul(const OpContext& op_context, 149 NodeCosts* node_costs) const; 150 Status PredictNoOp(const OpContext& op_context, NodeCosts* node_costs) const; 151 Status PredictIdentity(const OpContext& op_context, 152 NodeCosts* node_costs) const; 153 Status PredictVariable(const OpContext& op_context, 154 NodeCosts* node_costs) const; 155 Status PredictBatchMatMul(const OpContext& op_context, 156 NodeCosts* node_costs) const; 157 Status PredictMetadata(const OpContext& op_context, 158 NodeCosts* node_costs) const; 159 Status PredictGatherOrSlice(const OpContext& op_context, 160 NodeCosts* node_costs) const; 161 Status PredictScatter(const OpContext& op_context, 162 NodeCosts* node_costs) const; 163 Status PredictMaxPool(const OpContext& op_context, 164 NodeCosts* node_costs) const; 165 Status PredictMaxPoolGrad(const OpContext& op_context, 166 NodeCosts* node_costs) const; 167 Status PredictAvgPool(const OpContext& op_context, 168 NodeCosts* node_costs) const; 169 Status PredictAvgPoolGrad(const OpContext& op_context, 170 NodeCosts* node_costs) const; 171 Status PredictFusedBatchNorm(const OpContext& op_context, 172 NodeCosts* node_costs) const; 173 Status PredictFusedBatchNormGrad(const OpContext& op_context, 174 NodeCosts* node_costs) const; 175 Status PredictEinsum(const OpContext& op_context, 176 NodeCosts* node_costs) const; 177 Status PredictAssignVariableOps(const OpContext& op_context, 178 NodeCosts* node_costs) const; 179 Status PredictPureMemoryOp(const OpContext& op_context, 180 NodeCosts* node_costs) const; 181 Status PredictSoftmax(const OpContext& op_context, 182 NodeCosts* node_costs) const; 183 Status PredictResizeBilinear(const OpContext& op_context, 184 NodeCosts* node_costs) const; 185 Status PredictCropAndResize(const OpContext& op_context, 186 NodeCosts* node_costs) const; 187 188 // Generic cost prediction method for fused operations. 189 Status PredictFusedOp(const OpContext& op_context, 190 const std::vector<OpContext>& fused_op_contexts, 191 NodeCosts* node_costs) const; 192 193 // Utility function for safe division. Returns 0 194 // if rhs is 0 or negative. SafeDiv(const double lhs,const double rhs)195 static double SafeDiv(const double lhs, const double rhs) { 196 if (rhs > 0) { 197 return lhs / rhs; 198 } else { 199 return 0.0; 200 } 201 } 202 203 // This family of routines counts the number of operations to perform the 204 // specified TensorFlow Op. 205 struct MatMulDimensions { 206 int m; 207 int n; 208 int k; 209 }; 210 struct BatchMatMulDimensions { 211 std::vector<int> batch_dims; 212 MatMulDimensions matmul_dims; 213 }; 214 struct ConvolutionDimensions { 215 int64 batch; // Batch size. 216 int64 ix; // Input size x. 217 int64 iy; // Input size y. 218 int64 iz; // Input depth. 219 int64 kx; // Kernel x. 220 int64 ky; // Kernel y. 221 int64 kz; // Kernel depth (in case of group convolution, this will be 222 // smaller than input depth). 223 int64 oz; // Output depth. 224 int64 ox; // Output size x. 225 int64 oy; // Output size y. 226 int64 sx; // Stride x. 227 int64 sy; // Stride y. 228 Padding padding; // SAME or VALID. 229 }; 230 static int64 CountConv2DOperations(const OpInfo& op_info, 231 bool* found_unknown_shapes); 232 static int64 CountConv2DOperations(const OpInfo& op_info, 233 ConvolutionDimensions* conv_info, 234 bool* found_unknown_shapes); 235 static int64 CountMatMulOperations(const OpInfo& op_info, 236 bool* found_unknown_shapes); 237 static int64 CountMatMulOperations(const OpInfo& op_info, 238 MatMulDimensions* mat_mul, 239 bool* found_unknown_shapes); 240 bool GenerateBatchMatmulContextFromEinsum(const OpContext& einsum_context, 241 OpContext* batch_matmul_context, 242 bool* found_unknown_shapes) const; 243 static int64 CountBatchMatMulOperations(const OpInfo& op_info, 244 bool* found_unknown_shapes); 245 static int64 CountBatchMatMulOperations(const OpInfo& op_info, 246 BatchMatMulDimensions* batch_mat_mul, 247 bool* found_unknown_shapes); 248 static int64 CountConv2DBackpropInputOperations( 249 const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims, 250 bool* found_unknown_shapes); 251 static int64 CountConv2DBackpropFilterOperations( 252 const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims, 253 bool* found_unknown_shapes); 254 255 // Calculate the element count of an input/output tensor. 256 static int64 CalculateTensorElementCount( 257 const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes); 258 259 // Calculate the total size in bytes of an input/output tensor. 260 static int64 CalculateTensorSize(const OpInfo::TensorProperties& tensor, 261 bool* found_unknown_shapes); 262 263 // Calculate the element count of the largest 264 // input of specified TensorFlow op. 265 static int64 CalculateLargestInputCount(const OpInfo& op_info, 266 bool* found_unknown_shapes); 267 268 // Calculate the total size in bytes of the all 269 // the inputs of specified TensorFlow op. 270 static int64 CalculateInputSize(const OpInfo& op_info, 271 bool* found_unknown_shapes); 272 273 // Same, but a vector format: one for each input. 274 static std::vector<int64> CalculateInputTensorSize( 275 const OpInfo& op_info, bool* found_unknown_shapes); 276 277 // Calculate the total size in bytes of the all 278 // the outputs of specified TensorFlow op. 279 static int64 CalculateOutputSize(const OpInfo& op_info, 280 bool* found_unknown_shapes); 281 282 // Same, but a vector format: one for each output. 283 static std::vector<int64> CalculateOutputTensorSize( 284 const OpInfo& op_info, bool* found_unknown_shapes); 285 286 // For convolution and its grad ops. 287 static ConvolutionDimensions ConvolutionDimensionsFromInputs( 288 const TensorShapeProto& original_image_shape, 289 const TensorShapeProto& original_filter_shape, const OpInfo& op_info, 290 bool* found_unknown_shapes); 291 292 // For Pooling, FusedBatchNorm, and their grad ops. 293 static ConvolutionDimensions OpDimensionsFromInputs( 294 const TensorShapeProto& original_image_shape, const OpInfo& op_info, 295 bool* found_unknown_shapes); 296 297 // Helper to construct child operation contexts for the component operations 298 // of fused ops. 299 static OpContext FusedChildContext( 300 const OpContext& parent, const string& op_name, 301 const OpInfo::TensorProperties& output, 302 const std::vector<OpInfo::TensorProperties>& inputs); 303 304 // Helper to construct tensor shapes. 305 static OpInfo::TensorProperties DescribeTensor( 306 DataType type, const std::vector<int64>& dims); 307 308 // Helper method for building common case NodeCosts. 309 static Status PredictDefaultNodeCosts(const int64 num_compute_ops, 310 const OpContext& op_context, 311 bool* found_unknown_shapes, 312 NodeCosts* node_costs); 313 314 protected: 315 std::map<string, int> elementwise_ops_; 316 typedef std::function<Status(const OpContext& op_context, NodeCosts*)> 317 CostImpl; 318 std::map<string, CostImpl> device_cost_impl_; 319 // If true, assume compute and memory overlap; hence, the op cost is max of 320 // compute_time and memory_time, instead of sum of those two. 321 bool compute_memory_overlap_; 322 std::set<string> persistent_ops_; 323 324 private: 325 friend class OpLevelCostEstimatorTest; 326 }; 327 328 } // end namespace grappler 329 } // end namespace tensorflow 330 331 #endif // TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_ 332