• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_RUNNER_H_
17 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_RUNNER_H_
18 
19 #include "absl/types/optional.h"
20 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
21 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
22 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
23 #include "tensorflow/compiler/xla/status.h"
24 #include "tensorflow/compiler/xla/statusor.h"
25 #include "tensorflow/compiler/xla/types.h"
26 #include "tensorflow/compiler/xla/xla_data.pb.h"
27 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
28 
29 namespace xla {
30 namespace gpu {
31 
32 struct RunConvOptions {  // Optional per-call settings for RunGpuConv.
33   // Out-parameter that receives profiling results; may be null (the default).
34   se::dnn::ProfileResult* profile_result = nullptr;
35 
36   // If set, use this algorithm instead of the one from the instruction.
37   absl::optional<se::dnn::AlgorithmDesc> algo_override;
38 };
39 
40 // Implementation struct exposed for debugging and log analysis.
41 struct GpuConvParams {
42   // Fields specific to cuDNN fused convolutions. A fused convolution
43   // computes:
44   //   activation(conv_result_scale * conv(x, w) +
45   //       side_input_scale * side_input + broadcast(bias))
46   //
47   // The most common fused conv is a forward conv followed by relu/identity.
48   //
49   // bias_buf is a one-dimensional array whose length equals the number of
50   // output features. It is broadcast to the output shape before being added
51   // to the result.
52   //
53   // side_input_buf, if valid, must have the same shape as the output buffer.
54   struct FusionParams {
55     se::dnn::ActivationMode mode;  // the activation(...) in the formula above
56     double side_input_scale;  // multiplies side_input in the formula above
57     se::DeviceMemoryBase bias_buf;
58     se::DeviceMemoryBase side_input_buf;  // nullable
59   };
60 
61   CudnnConvKind kind;  // which kind of convolution this is
62   se::dnn::BatchDescriptor input_descriptor;
63   se::dnn::FilterDescriptor filter_descriptor;
64   se::dnn::BatchDescriptor output_descriptor;
65   se::DeviceMemoryBase input_buf;
66   se::DeviceMemoryBase filter_buf;
67   se::DeviceMemoryBase output_buf;
68   se::dnn::ConvolutionDescriptor conv_desc;
69   se::dnn::AlgorithmConfig algorithm;
70   double conv_result_scale;  // multiplies conv(x, w); no default initializer
71 
72   absl::optional<FusionParams> fusion;  // presumably set only for fused convs
73 };
74 
75 // This file contains low-level routines for running cuDNN convolutions.
76 
77 // Calls into cuDNN to run the convolution described by `conv`.
78 //
79 // There are two overloads: the first takes a preallocated scratch buffer; the
80 // second takes an allocator that is responsible for allocating scratch space.
81 // In theory the second one should not be necessary -- callers could just ask
82 // cuDNN how much scratch space a particular convolution needs.  But in
83 // practice StreamExecutor does not expose such an API, and in the name of
84 // parsimony, perhaps it is better not to add it.  Instead, the first time you
85 // run a convolution, call the overload that takes a scratch allocator and
86 // take note of how much memory is used.  The next time you run the same
87 // conv, you can pass an explicitly preallocated scratch buffer of that size,
88 // if you like.
89 Status RunGpuConv(const HloCustomCallInstruction* conv,
90                   absl::Span<se::DeviceMemoryBase> operand_buffers,
91                   se::DeviceMemoryBase result_buffer,
92                   se::DeviceMemoryBase scratch_buf, se::Stream* stream,
93                   RunConvOptions = {});  // default: no profiling, no override
94 
95 Status RunGpuConv(const HloCustomCallInstruction* conv,
96                   absl::Span<se::DeviceMemoryBase> operand_buffers,
97                   se::DeviceMemoryBase result_buffer,
98                   se::ScratchAllocator* scratch_allocator, se::Stream* stream,
99                   RunConvOptions = {});  // default: no profiling, no override
100 
101 // Returns the GpuConvParams for `conv`; exposed for debugging and log analysis.
102 StatusOr<GpuConvParams> GetGpuConvParams(
103     const HloCustomCallInstruction* conv,
104     absl::Span<se::DeviceMemoryBase> operand_buffers,
105     se::DeviceMemoryBase result_buffer);
106 
107 }  // namespace gpu
108 }  // namespace xla
109 
110 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_RUNNER_H_
111