syntax = "proto3";

package xrt;

import "tensorflow/compiler/tf2xla/host_compute_metadata.proto";
import "tensorflow/compiler/xla/service/hlo.proto";
import "tensorflow/compiler/xla/xla.proto";
import "tensorflow/compiler/xla/xla_data.proto";

message DeviceAssignment {
  message ComputationDevice {
    message DeviceMeshCoordinates {
      // The mesh coordinates for the device. Usually (X, Y, Core), in the
      // order in which they are returned in the TopologyProto.
      //  X    = value(0)
      //  Y    = value(1)
      //  Core = value(2)
      repeated int32 value = 1;
    }
    // One entry per replica of the replicated computation.
    repeated DeviceMeshCoordinates replica_devices = 1;
  }
  // One ComputationDevice per computation (that is, one per core in a
  // replica).
  repeated ComputationDevice computation_devices = 1;
}
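
// For illustration only, a hypothetical assignment in proto text format for
// a computation with num_cores_per_replica = 1 and two replicas, placed on
// devices at mesh coordinates (0,0,0) and (1,0,0); the coordinates are
// made-up values, not taken from a real topology:
//
//   computation_devices {
//     replica_devices { value: [0, 0, 0] }
//     replica_devices { value: [1, 0, 0] }
//   }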

// Options for an XLA compilation.
message XLAComputationConfig {
  // The number of replicas the computation will be run on. If this is
  // default (0) it is interpreted as 1.
  int32 num_replicas = 1;
  // The number of "model-parallel" cores per replica. If this is
  // default (0) it is interpreted as 1.
  int32 num_cores_per_replica = 2;
  // Optional metadata about host sends and recvs.
  tensorflow.tf2xla.HostComputeMetadata host_compute_metadata = 3;

  // The arg/result shapes for the whole computation.
  xla.ProgramShapeProto program_shape = 4;
  // The arg/result shapes for each core of a model-parallel
  // computation. per_core_program_shape is optional for a
  // single-core computation.
  repeated xla.ProgramShapeProto per_core_program_shape = 5;
  // Describes how replicated computation instances should be assigned to
  // devices. There are num_cores_per_replica computations, and each one is
  // sent to, and executed on, the set of replica device numbers described
  // in the DeviceAssignment proto.
  DeviceAssignment device_assignment = 6;
  // The debugging options to be passed to the XLA compilation process.
  xla.DebugOptions debug_options = 7;

  // Everything inside Experimental is subject to change and is not covered
  // by the API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    message UpdateIndexPair {
      int32 index = 1;
      bool updated = 2;
    }

    // stateful_input_indices is only useful when using XRT-compiled
    // programs together with standard TensorFlow TPU execution ops, so it
    // should be ignored by most clients.
    //
    // Optionally the client can pass information about which inputs
    // to the computation are updates to "stateful" quantities. Each
    // element of stateful_input_indices includes an index indicating
    // which input argument it corresponds to, and a bool indicating
    // whether the value is updated or not. If the XRT computation is
    // going to be used with a TensorFlow TPU execution op, then an
    // input index must be present for each input that will correspond
    // to a resource variable in the execution op, and may not be
    // present for any other input.
    repeated UpdateIndexPair stateful_input_indices = 1;
  }

  Experimental experimental = 8;
}
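
// A minimal sketch of a config in proto text format, assuming a single-core
// computation replicated over two devices whose argument 0 is a stateful
// update; the program shapes are omitted and all values are made up for
// illustration:
//
//   num_replicas: 2
//   num_cores_per_replica: 1
//   experimental {
//     stateful_input_indices { index: 0 updated: true }
//   }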

// Options and XLA computation for a compilation.
message XLAComputation {
  XLAComputationConfig config = 1;
  xla.HloSnapshot hlo_snapshot = 2;
}

// Literal to allocate space for, and transfer to, device memory.
message XLAAllocation {
  reserved 1;
  xla.LiteralProto value = 2;
}

// Node in a tree describing a tuple constructed from input handles. A
// node is an internal node if tuples is non-empty, in which case
// input_index and release_input_handle are ignored. Otherwise a node
// is a leaf node. Each leaf XLATupleNode is the index of an input
// which corresponds to a handle that will be grafted onto the output
// tuple at that location. If release_input_handle is true, that input
// handle will be released and become invalid. Inputs may be repeated,
// in which case leaves of the output tuple will alias. If an input is
// repeated, release_input_handle must be false for every leaf where
// that input appears.
//
// For example, if input 0 has shape {} and input 1 has shape {2,3}
// then the XLATupleNode with structure {1,{0,1}} corresponds to a
// tuple with shape {{2,3},{{},{2,3}}}.
message XLATupleNode {
  int32 input_index = 1;
  bool release_input_handle = 2;
  repeated XLATupleNode tuples = 3;
}
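
// The {1,{0,1}} example above, written as the root XLATupleNode in proto
// text format (a leaf sets input_index, an internal node sets tuples):
//
//   tuples { input_index: 1 }
//   tuples {
//     tuples { input_index: 0 }
//     tuples { input_index: 1 }
//   }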

// Options for an XLA execution.
message XRTExecutionConfig {
  // Local device to run on. This is present because the execute Op
  // may be placed on a device such as CPU or TPU_SYSTEM that
  // logically manages multiple cores.
  int32 device_ordinal = 1;
  // Which model-parallel computation to run from the compiled bundle.
  int32 core_index_in_replica = 2;
  // Optional key to disambiguate between executions. This is only
  // needed if multiple host send/recvs may be outstanding
  // concurrently with executions.
  string execution_instance_key = 3;
  // If non-zero, rng_seed to reset the core with.
  uint32 rng_seed = 4;
  // If true, release allocation handles on the inputs after running.
  bool release_input_handles = 5;
  // If true, release the handle to the computation after running.
  bool release_compilation_handle = 6;
  // If set to true, and the result shape is a tuple, then instead of
  // returning a single tuple allocation the execution will return a vector
  // of allocations, one for each of the first-level elements of the result
  // tuple.
  bool return_exploded_tuple = 7;
}
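
// A minimal sketch of an execution config in proto text format, assuming a
// single-core device and a tuple-shaped result to be split into per-element
// allocations; all values are made up for illustration:
//
//   device_ordinal: 0
//   core_index_in_replica: 0
//   release_input_handles: true
//   return_exploded_tuple: true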

message XRTChainedExecuteConfig {
  // If non-zero, rng_seed to reset the core with.
  uint32 rng_seed = 1;
  // Which model-parallel computation to run from the compiled bundle.
  int32 core_index_in_replica = 2;
  // Optional key to disambiguate between executions. This is only needed if
  // multiple host send/recvs may be outstanding concurrently with executions.
  string execution_instance_key = 3;
}

// A single chained execute operation. An operation can either be a device
// data load, or the execution of an existing (as in, previously compiled
// and accessible via its int64 handle) XLA computation.
message XRTChainedExecuteOp {
  // Represents an input for this operation.
  message Input {
    // The index within the XRTChainedExecutePlan.ops post-order of the
    // source operation for this input.
    int64 op_index = 1;
    // The output index of the value generated by the operation at op_index.
    // Zero (the default value) means no index ({}); when an index is
    // required, output_index must be set to index + 1, since proto3 cannot
    // distinguish an unset field from an explicit zero.
    int64 output_index = 2;
  }
  // Represents an output of the XRTChainedExecute operation, sourced from
  // the output of this operation.
  message Output {
    // The index in the value generated by this operation, which should be
    // forwarded as XRTChainedExecute output. If output_index is zero (the
    // default value) the whole output will be used as the result. This means
    // that if the output shape is a tuple, the result will be the full
    // tuple. Otherwise the actual sub-tuple index will be output_index - 1.
    int64 output_index = 1;
    // The index in the vector of the results returned by the
    // XRTChainedExecute operation, where this output should be forwarded.
    int64 result_index = 2;
  }

  oneof op_oneof {
    // The handle to an existing XRT device data.
    int64 data_handle = 1;
    // The handle to an existing XRT compiled computation.
    int64 computation_handle = 2;
  }
  // The outputs of this XRTChainedExecuteOp operation.
  repeated Output outputs = 3;
  // The inputs of this XRTChainedExecuteOp operation. If data_handle is set,
  // there are no inputs.
  repeated Input inputs = 4;
}
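
// For illustration, an op in proto text format that runs an existing
// computation on element 0 of the tuple produced by the op at post-order
// position 2 (note the input's output_index is the element index plus one),
// and forwards its whole result as the first XRTChainedExecute result; the
// handle is a made-up value:
//
//   computation_handle: 12345
//   inputs { op_index: 2 output_index: 1 }
//   outputs { output_index: 0 result_index: 0 }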

// Execution plan for the XRTChainedExecute operation.
message XRTChainedExecutePlan {
  // The post-order sequence of XRT operations to be executed.
  repeated XRTChainedExecuteOp ops = 1;
}
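
// A minimal sketch of a plan in proto text format: op 0 loads existing
// device data, op 1 feeds it to a previously compiled computation and
// forwards the whole result as result 0; the handles are made-up values:
//
//   ops { data_handle: 111 }
//   ops {
//     computation_handle: 222
//     inputs { op_index: 0 }
//     outputs { result_index: 0 }
//   }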

// The message used to encode the options for the XRTMetricsCollect operation.
message XRTMetricsCollect {
  // A list of regular expressions to match the metric names. An empty list
  // means return all the metrics reported by the collection registry.
  repeated string metrics_regex = 1;
}
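
// For illustration, a request in proto text format returning only metrics
// whose names start with a hypothetical "xrt_compile" prefix (the prefix is
// an assumption, not a documented XRT metric name):
//
//   metrics_regex: "xrt_compile.*"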

message Percentiles {
  message Point {
    // In the [0, 100] range.
    double percentile = 1;
    double value = 2;
  }

  // The time (in nanoseconds) of the first sample within the samples buffer.
  uint64 start_nstime = 1;
  // The time (in nanoseconds) of the last sample within the samples buffer.
  uint64 end_nstime = 2;
  // The minimum value of the samples within the samples buffer.
  double min_value = 3;
  // The maximum value of the samples within the samples buffer.
  double max_value = 4;
  // The mean value of the samples within the samples buffer.
  double mean = 5;
  // The standard deviation of the samples within the samples buffer.
  double stddev = 6;
  // The number of samples within the samples buffer.
  uint64 num_samples = 7;
  // The total number of values that have been posted to this metric.
  uint64 total_samples = 8;
  // The sum of all the posted values.
  double accumulator = 9;
  // The percentile points reported by the metric.
  repeated Point points = 10;
}

message MetricValues {
  // The metric name.
  string name = 1;

  oneof values_oneof {
    Percentiles percentiles_value = 2;
    int64 int64_value = 3;
  }
}

message MetricsReport {
  repeated MetricValues metrics = 1;
}