syntax = "proto3";

package tensorflow;

import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/profiler/op_profile.proto";
import "tensorflow/core/protobuf/config.proto";

// The ProfilerService service retrieves performance information about
// the programs running on connected devices over a period of time.
service ProfilerService {
  // Starts a profiling session, blocks until it completes, and returns data.
  rpc Profile(ProfileRequest) returns (ProfileResponse) {}

  // Collects profiling data and returns user-friendly metrics.
  rpc Monitor(MonitorRequest) returns (MonitorResponse) {}
}

// Optional settings that control how a TF session is profiled. Carried in
// ProfileRequest.opts.
message ProfileOptions {
  // Dataset ops are not collected by default, for better trace-viewer
  // scalability. The caller can manually set this field to include them.
  bool include_dataset_ops = 1;

  // next-field: 2
}

// Per-tool output requirements. Used as the value type of
// ProfileRequest.tool_options.
//
// NOTE(review): field number 1 is not assigned in this message; confirm
// whether it was used by a since-removed field before assigning it.
message ToolRequestOptions {
  // Required format for the tool; it should be one of "json", "proto", "raw",
  // etc. If not specified (backward compatible), the default format is used,
  // i.e. most tools use the json format.
  string output_formats = 2;

  // Whether to save the result directly to the repository or pass it back to
  // the caller. Defaults to false for backward compatibility.
  bool save_to_repo = 3;
}

// Request message for ProfilerService.Profile.
message ProfileRequest {
  // In future, the caller will be able to customize when profiling starts and
  // stops. For now, it collects `duration_ms` milliseconds worth of data.
  uint64 duration_ms = 1;

  // The maximum number of events to return. By default (value 0), return all
  // events.
  uint64 max_events = 2;

  // Names of the requested profiling tools, such as "input_pipeline_analyzer".
  repeated string tools = 3;

  // Specifies the requirements for each tool.
  // NOTE(review): keys are presumably tool names as listed in `tools` --
  // confirm against the caller.
  map<string, ToolRequestOptions> tool_options = 8;

  // Optional profiling options that control how a TF session will be profiled.
  ProfileOptions opts = 4;

  // The place where we will dump profile data. We will normally use
  // MODEL_DIR/plugin/profile/ as our repository root.
  string repository_root = 5;

  // The user-provided profile session identifier.
  string session_id = 6;

  // The hostname of the system where the profile should happen.
  // We use it as an identifier in part of our output filename.
  string host_name = 7;

  // In future, the caller will indicate which TF session is being profiled, and
  // only data relating to that program will be returned. For now, we assume
  // all activity during the profiling period is relevant.
  // next-field: 9
}

// A named data payload for a single profiling tool. Returned in
// ProfileResponse.tool_data.
message ProfileToolData {
  // The file name with which this data is associated (e.g.
  // "input_pipeline.json", "cluster_xxx.memory_viewer.json").
  string name = 1;

  // The data payload (likely json) for the specific tool.
  bytes data = 2;
}

// Response message for ProfilerService.Profile.
message ProfileResponse {
  // Field 1 was a uint64 placeholder for returning something meaningful.
  reserved 1;

  // Graphs of programs executed on devices during the profiling period.
  repeated GraphDef computation_graph = 2;

  // Performance profile that can be used to annotate HLO operations in the
  // computation graph.
  RunMetadata hlo_metadata = 5;

  // Encoded Trace proto message that contains metadata about the trace captured
  // during the profiling period. Describes the devices and resources that
  // 'trace_events' refers to.
  bytes encoded_trace = 3;

  // Assembles a hierarchical performance profile based on HLOs in trace events.
  // If the trace covers multiple programs, the longest-running one is analyzed.
  // See op_profile.proto for the detailed semantics of the returned profile.
  profiler.op_profile.Profile op_profile = 4;

  // Data payload for each requested tool.
  repeated ProfileToolData tool_data = 6;

  // When we write profiling data directly to the repository directory, we need
  // a way to figure out whether the captured trace is empty (due to idle TPU).
  bool empty_trace = 7;

  // next-field: 8
}

// Request message for ProfilerService.Monitor.
message MonitorRequest {
  // Duration, in milliseconds, for which to profile between each update.
  uint64 duration_ms = 1;

  // Indicates the level at which we want to monitor. Currently, two levels are
  // supported:
  // Level 1: An ultra lightweight mode that captures only some utilization
  // metrics.
  // Level 2: More verbose than level 1. Collects utilization metrics, device
  // information, step time information, etc. Do not use this option if the TPU
  // host is being very heavily used.
  int32 monitoring_level = 2;

  // next-field: 3
}

// Response message for ProfilerService.Monitor.
message MonitorResponse {
  // Properly formatted string data that can be directly returned to the user.
  string data = 1;

  // next-field: 2
}