1syntax = "proto3"; 2 3package tensorflow.tfprof; 4 5import "tensorflow/core/framework/attr_value.proto"; 6import "tensorflow/core/framework/step_stats.proto"; 7 8// It specifies the Python callstack that creates an op. 9message CodeDef { 10 repeated Trace traces = 1; 11 message Trace { 12 string file = 1 [deprecated = true]; // deprecated by file_id. 13 int64 file_id = 6; 14 15 int32 lineno = 2; 16 17 string function = 3 [deprecated = true]; // deprecated by function_id. 18 int64 function_id = 7; 19 20 string line = 4 [deprecated = true]; // deprecated line_id. 21 int64 line_id = 8; 22 23 int32 func_start_line = 5; 24 } 25} 26 27message OpLogEntry { 28 // op name. 29 string name = 1; 30 // float_ops is filled by tfprof Python API when called. It requires the 31 // op has RegisterStatistics defined. Currently, Conv2D, MatMul, etc, are 32 // implemented. 33 int64 float_ops = 2; 34 // User can define extra op type information for an op. This allows the user 35 // to select a group of ops precisely using op_type as a key. 36 repeated string types = 3; 37 // Used to support tfprof "code" view. 38 CodeDef code_def = 4; 39} 40 41message OpLogProto { 42 repeated OpLogEntry log_entries = 1; 43 44 // Maps from id of CodeDef file,function,line to its string 45 // In the future can also map other id of other fields to string. 46 map<int64, string> id_to_string = 2; 47} 48 49// A proto representation of the profiler's profile. 50// It allows serialization, shipping around and deserialization of the profiles. 51// 52// Please don't depend on the internals of the profile proto. 53message ProfileProto { 54 map<int64, ProfileNode> nodes = 1; 55 // Whether or not has code traces. 56 bool has_trace = 2; 57 // Whether or not the TF device tracer fails to return accelerator 58 // information (which could lead to 0 accelerator execution time). 59 bool miss_accelerator_stream = 5; 60 // Traced steps. 61 repeated int64 steps = 3; 62 63 // Maps from id of CodeDef file,function,line to its string 64 // In the future can also map other id of other fields to string. 65 map<int64, string> id_to_string = 4; 66} 67 68message ProfileNode { 69 // graph node name. 70 string name = 1; 71 // graph operation type. 72 string op = 9; 73 // A unique id for the node. 74 int64 id = 13; 75 76 map<int32, int64> inputs = 2; 77 map<int32, Tuple> input_shapes = 16; 78 map<int32, int64> outputs = 3; 79 map<int32, Tuple> output_shapes = 15; 80 // A map from source node id to its output index to current node. 81 map<int64, int32> src_output_index = 14; 82 83 repeated int64 shape = 4; 84 repeated string op_types = 5; 85 string canonical_device = 6; 86 string host_device = 7; 87 88 int64 float_ops = 8; 89 90 CodeDef trace = 10; 91 map<string, AttrValue> attrs = 11; 92 93 map<int64, ExecProfile> execs = 12; 94} 95 96message ExecProfile { 97 // Can be larger than 1 if run multiple times in loop. 98 int64 run_count = 1; 99 // The earliest/latest time including scheduling and execution. 100 int64 all_start_micros = 2; 101 int64 latest_end_micros = 3; 102 103 // device -> vector of {op_start_micros, op_exec_micros} pairs. 104 // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros} 105 // For accelerator, vector size can be larger than 1, multiple kernel fires 106 // or in tf.while_loop. 107 map<string, ExecTime> accelerator_execs = 4; 108 // cpu_execs: cpu/gpu:id -> {op_start_micros, op_exec_micros} 109 // For cpu, vector size can be larger than 1 if in tf.while_loop. 
message ProfileNode {
  // Graph node name.
  string name = 1;
  // Graph operation type.
  string op = 9;
  // A unique id for the node.
  int64 id = 13;

  map<int32, int64> inputs = 2;
  map<int32, Tuple> input_shapes = 16;
  map<int32, int64> outputs = 3;
  map<int32, Tuple> output_shapes = 15;
  // A map from a source node's id to the source output index that feeds the
  // current node.
  map<int64, int32> src_output_index = 14;

  repeated int64 shape = 4;
  repeated string op_types = 5;
  string canonical_device = 6;
  string host_device = 7;

  int64 float_ops = 8;

  CodeDef trace = 10;
  map<string, AttrValue> attrs = 11;

  map<int64, ExecProfile> execs = 12;
}

message ExecProfile {
  // Can be larger than 1 if the op runs multiple times in a loop.
  int64 run_count = 1;
  // The earliest start and latest end times, including scheduling and
  // execution.
  int64 all_start_micros = 2;
  int64 latest_end_micros = 3;

  // device -> vector of {op_start_micros, op_exec_micros} pairs.
  // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros}
  // For accelerators, the vector size can be larger than 1 when multiple
  // kernels are launched or the op runs in tf.while_loop.
  map<string, ExecTime> accelerator_execs = 4;
  // cpu_execs: cpu/gpu:id -> {op_start_micros, op_exec_micros}
  // For CPUs, the vector size can be larger than 1 if the op runs in
  // tf.while_loop.
  map<string, ExecTime> cpu_execs = 5;

  // Each entry is the memory information of one scheduling of the node.
  // Normally, there will be multiple entries in a while_loop.
  repeated ExecMemory memory_execs = 7;
  // The allocation and deallocation times and sizes throughout execution.
  repeated AllocationRecord allocations = 11;
  // The devices related to this execution.
  repeated string devices = 6;
}

message ExecTime {
  repeated Tuple times = 1;
}

message ExecMemory {
  // The timestamp when the memory information was tracked.
  int64 memory_micros = 1;
  // NOTE: Please don't depend on the following 4 fields yet. Due to
  // TensorFlow internal tracing issues, the numbers can be quite wrong.
  // TODO(xpan): Fix the TensorFlow internal tracing.
  int64 host_temp_bytes = 2;
  int64 host_persistent_bytes = 3;
  int64 accelerator_temp_bytes = 4;
  int64 accelerator_persistent_bytes = 5;

  // Total bytes requested by the op.
  int64 requested_bytes = 6;
  // Total bytes requested by the op and released before the op ends.
  int64 peak_bytes = 7;
  // Total bytes requested by the op and not released after the op ends.
  int64 residual_bytes = 8;
  // Total bytes output by the op (not necessarily requested by the op).
  int64 output_bytes = 9;
  // The total number of bytes currently allocated by the allocator, if > 0.
  int64 allocator_bytes_in_use = 10;
  // The memory of each output of the operation.
  map<int32, Memory> output_memory = 11;
}

message Tuple {
  repeated int64 int64_values = 1;
}

message Memory {
  int64 bytes = 1;
  uint64 ptr = 2;
}
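// Illustrative example (all values assumed): an ExecProfile for an op
// scheduled twice on one GPU stream, e.g. inside tf.while_loop, could
// serialize in protobuf text format as
//
//   run_count: 2
//   accelerator_execs {
//     key: "gpu:0/stream:all"
//     value {
//       times { int64_values: 1000 int64_values: 25 }
//       times { int64_values: 1200 int64_values: 30 }
//     }
//   }
//
// where each Tuple above packs one {op_start_micros, op_exec_micros} pair.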