• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_BATCHNORM_THUNK_H_
17 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_BATCHNORM_THUNK_H_
18 
19 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
20 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
21 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_runner.h"
22 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
23 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
24 #include "tensorflow/compiler/xla/types.h"
25 #include "tensorflow/core/lib/core/status.h"
26 
27 namespace xla {
28 namespace gpu {
29 
// This file contains thunks which call into cudnn to run the various flavors of
// batch normalization: BatchNormInference, BatchNormTraining, and
// BatchNormGrad, known to cudnn as BatchNormForwardInference,
// BatchNormForwardTraining, and BatchNormBackward.
//
// As an alternative to using these thunks, XLA can decompose batchnorm HLOs
// into smaller components using the BatchNormRewriter pass.  This can result in
// faster code because those individual components can fuse into their
// inputs/outputs, but it may also be slower if cudnn's batchnorm implementation
// outperforms the code XLA generates for these components.
//
// Currently these thunks require that their inputs are F32s.
//
// Note that these thunks do not take full advantage of the cudnn batchnorm
// functions.  For example, cudnn lets you bias and/or scale the input/output,
// but these thunks don't currently support that.
46 
47 class CudnnBatchNormForwardInferenceThunk : public Thunk {
48  public:
49   CudnnBatchNormForwardInferenceThunk(ThunkInfo thunk_info,
50                                       CudnnBatchNormConfig config,
51                                       const BufferAllocation::Slice& operand,
52                                       const BufferAllocation::Slice& scale,
53                                       const BufferAllocation::Slice& offset,
54                                       const BufferAllocation::Slice& mean,
55                                       const BufferAllocation::Slice& variance,
56                                       const BufferAllocation::Slice& output);
57 
58   CudnnBatchNormForwardInferenceThunk(
59       const CudnnBatchNormForwardInferenceThunk&) = delete;
60   CudnnBatchNormForwardInferenceThunk& operator=(
61       const CudnnBatchNormForwardInferenceThunk&) = delete;
62 
63   Status ExecuteOnStream(const ExecuteParams& params) override;
64 
65  private:
66   CudnnBatchNormConfig config_;
67   BufferAllocation::Slice operand_;
68   BufferAllocation::Slice scale_;
69   BufferAllocation::Slice offset_;
70   BufferAllocation::Slice mean_;
71   BufferAllocation::Slice variance_;
72   BufferAllocation::Slice output_;
73 };
74 
75 class CudnnBatchNormForwardTrainingThunk : public Thunk {
76  public:
77   CudnnBatchNormForwardTrainingThunk(
78       ThunkInfo thunk_info, CudnnBatchNormConfig config,
79       const BufferAllocation::Slice& operand,
80       const BufferAllocation::Slice& scale,
81       const BufferAllocation::Slice& offset,
82       const BufferAllocation::Slice& output_data,
83       const BufferAllocation::Slice& output_mean,
84       const BufferAllocation::Slice& output_inv_stddev);
85 
86   CudnnBatchNormForwardTrainingThunk(
87       const CudnnBatchNormForwardTrainingThunk&) = delete;
88   CudnnBatchNormForwardTrainingThunk& operator=(
89       const CudnnBatchNormForwardTrainingThunk&) = delete;
90 
91   Status ExecuteOnStream(const ExecuteParams& params) override;
92 
93  private:
94   CudnnBatchNormConfig config_;
95   BufferAllocation::Slice operand_;
96   BufferAllocation::Slice scale_;
97   BufferAllocation::Slice offset_;
98   BufferAllocation::Slice output_data_;
99   BufferAllocation::Slice output_mean_;
100   BufferAllocation::Slice output_inv_stddev_;
101 };
102 
103 class CudnnBatchNormBackwardThunk : public Thunk {
104  public:
105   CudnnBatchNormBackwardThunk(
106       ThunkInfo thunk_info, CudnnBatchNormConfig config,
107       const BufferAllocation::Slice& operand,
108       const BufferAllocation::Slice& scale, const BufferAllocation::Slice& mean,
109       const BufferAllocation::Slice& inv_stddev,
110       const BufferAllocation::Slice& grad_output,
111       const BufferAllocation::Slice& output_grad_data,
112       const BufferAllocation::Slice& output_grad_scale,
113       const BufferAllocation::Slice& output_grad_offset);
114 
115   CudnnBatchNormBackwardThunk(const CudnnBatchNormBackwardThunk&) = delete;
116   CudnnBatchNormBackwardThunk& operator=(const CudnnBatchNormBackwardThunk&) =
117       delete;
118 
119   Status ExecuteOnStream(const ExecuteParams& params) override;
120 
121  private:
122   const CudnnBatchNormConfig config_;
123   BufferAllocation::Slice operand_;
124   BufferAllocation::Slice scale_;
125   BufferAllocation::Slice mean_;
126   BufferAllocation::Slice inv_stddev_;
127   BufferAllocation::Slice grad_output_;
128   BufferAllocation::Slice output_grad_data_;
129   BufferAllocation::Slice output_grad_scale_;
130   BufferAllocation::Slice output_grad_offset_;
131 };
132 
133 }  // namespace gpu
134 }  // namespace xla
135 
136 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_BATCHNORM_THUNK_H_
137