syntax = "proto3";

package tensorflow.profiler;

import "google/protobuf/any.proto";
import "tensorflow/core/profiler/protobuf/op_metrics.proto";

// Breakdown of step-time on generic hardware. Note that these components are
// mutually exclusive so that adding them together is equal to the step time.
// If an execution time interval has multiple types of events happening, we
// need to pick one of the event types to attribute the time interval to.
message GenericStepBreakdown {
  // Map event type to the accumulated duration in
  // picoseconds of that type.
  map<int32, uint64> type_ps = 1;
}

// Information about memory transfer to/from device memory.
message DeviceMemoryTransfer {
  // NOTE(review): presumably the number of transfers observed -- confirm
  // against the code that fills this message.
  uint64 occurrence = 1;
  // Transfer time in microseconds (per the `_us` suffix).
  // NOTE(review): whether this is a total or an average is not stated in this
  // file -- confirm against the producer.
  double time_us = 2;
  // Number of bytes transferred.
  uint64 bytes_transferred = 3;
}

// Next ID: 6
// Result proto for StepInfo.
message StepInfoResult {
  // The step number.
  uint32 step_num = 1;
  // The step name.
  string step_name = 5;
  // The step duration in picoseconds.
  uint64 duration_ps = 2;
  // The start time of this step in picoseconds.
  uint64 begin_ps = 3;
  // Breakdown of the step-time. Can be unpacked into a GenericStepBreakdown.
  google.protobuf.Any step_breakdown = 4;
}

// Result proto for metrics on flow events.
message FlowEventInfo {
  // Unique id for each send and recv pair.
  uint64 flow_id = 1;
  // Channel id generated by the XLA compiler, it is statically unique within
  // an HloModule.
  int64 channel_id = 2;
  // The name of the hlo op.
  string name = 3;
  // Category of the hlo op.
  string category = 4;
  // The start time in picoseconds of the op event.
  uint64 start_time_ps = 5;
  // The end time in picoseconds of the op event.
  uint64 end_time_ps = 6;
  // The size of the op in bytes.
  uint64 byte_size = 7;
  // The replica id of the program running the flow event.
  uint32 replica_id = 8;
}

// Result database for core-to-core flow events.
message FlowDbResult {
  // The flow events recorded in this database.
  repeated FlowEventInfo flow_info = 1;
}

// Result proto for all-reduce ops.
message AllReduceInfo {
  // Unique id for all-reduce ops.
  uint64 id = 1;
  // The name of the hlo op.
  string name = 2;
  // For all-reduce nodes from different modules, if they have the same
  // all_reduce_id, they will be 'Allreduce'd'. If empty, AllReduce will not be
  // applied across modules.
  uint64 all_reduce_id = 3;
  // The start time in picoseconds of the op event.
  uint64 start_time_ps = 4;
  // The end time in picoseconds of the op event.
  uint64 end_time_ps = 5;
  // The size of the op in bytes.
  uint64 byte_size = 6;
}

// Result database for all-reduce ops.
message AllReduceDbResult {
  // The all-reduce ops recorded in this database.
  repeated AllReduceInfo all_reduce_info = 1;
}

// Result proto for information in a step across all cores.
message PerCoreStepInfo {
  // The step number.
  uint32 step_num = 1;
  // A map from core_id to StepInfo.
  map<uint32, StepInfoResult> step_info_per_core = 2;
  // The result for the per-step HLO-metric database.
  OpMetricsDb hlo_metrics_db = 3;
  // The result for send and recv flows.
  map<uint32, FlowDbResult> flow_db_per_core = 4;
  // A map from core ID to program replica id. Replica id map could change
  // during a profile session, but should stay stable within a step.
  map<uint32, uint32> core_id_to_replica_id_map = 5;
  // A map from core_id to all-reduce ops.
  map<uint32, AllReduceDbResult> all_reduce_db_per_core = 6;
  // Information about device memory transfers, categorized by source and
  // destination. Ordered by the following categories:
  // 1. HostToDevice
  // 2. DeviceToHost
  // 3. DeviceToDevice
  repeated DeviceMemoryTransfer device_memory_transfers = 7;
}

// Result proto for a StepDatabase.
message StepDatabaseResult {
  // A sequence of PerCoreStepInfo.
  repeated PerCoreStepInfo step_sequence = 1;
  // Whether the step db uses incomplete step information.
  // This flag is set to true when:
  // 1) no step marker or annotation present.
  // 2) profiling duration is too short to cover a full step.
  // If this flag is false, we will group and breakdown the
  // profile by complete steps only and ignore incomplete steps.
  // If this flag is true, we will simply aggregate and breakdown over the total
  // profile as a single step.
  bool use_incomplete_step = 2;
  // Number of steps dropped during post processing.
  uint32 num_steps_dropped = 3;
  // If the step_sequence is empty because:
  // * there is no step profiled on any host, then empty_intersect is false.
  // * there are steps profiled on some host, but the intersection of steps
  //   over all hosts is empty, then empty_intersect is true.
  bool empty_intersect = 4;
}