/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/linalg_ops.cc.

#if GOOGLE_CUDA || (TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 40500)

#include <numeric>
#include <type_traits>

#define EIGEN_USE_GPU
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/cast_op.h"
#include "tensorflow/core/kernels/cwise_ops.h"
#include "tensorflow/core/kernels/transpose_functor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/gpu_solvers.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

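// GPU kernel for SelfAdjointEigV2: computes the eigenvalues (and, when the
// compute_v attribute is true, the eigenvectors) of a batch of self-adjoint
// matrices, i.e. real symmetric or complex Hermitian matrices, by calling
// the cuSolver/hipSOLVER syevd/heevd routines through GpuSolver::Heevd.
// When compute_v is false, the eigenvector output is an empty tensor.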
template <class Scalar>
class SelfAdjointEigV2OpGpu : public AsyncOpKernel {
 public:
  explicit SelfAdjointEigV2OpGpu(OpKernelConstruction* context)
      : AsyncOpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("compute_v", &compute_v_));
  }

  void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
    const Tensor& input = context->input(0);
    const int ndims = input.dims();
    OP_REQUIRES_ASYNC(
        context, ndims >= 2,
        errors::InvalidArgument("Input must have rank >= 2, got ", ndims),
        done);
    const int64_t n = input.dim_size(ndims - 1);
    OP_REQUIRES_ASYNC(
        context, input.dim_size(ndims - 2) == n,
        errors::InvalidArgument("Input matrices must be square, got ",
                                input.dim_size(ndims - 2), " != ", n),
        done);
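    // flat_inner_dims<Scalar, 3> views the input as [batch, n, n], so the
    // leading dimension is the number of matrices to decompose.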
    const int64_t batch_size =
        input.template flat_inner_dims<Scalar, 3>().dimension(0);

    // Allocate outputs.
    Tensor* eigenvalues;
    TensorShape eigenvalues_shape = input.shape();
    eigenvalues_shape.RemoveLastDims(1);
    OP_REQUIRES_OK_ASYNC(
        context, context->allocate_output(0, eigenvalues_shape, &eigenvalues),
        done);
    Tensor* eigenvectors;
    TensorShape eigenvectors_shape =
        compute_v_ ? input.shape() : TensorShape({});
    OP_REQUIRES_OK_ASYNC(
        context, context->allocate_output(1, eigenvectors_shape, &eigenvectors),
        done);

    if (input.NumElements() == 0) {
      done();
      return;
    }

    // Allocate workspace.
    // TODO(rmlarsen): Convert to std::make_unique when available.
    std::unique_ptr<GpuSolver> solver(new GpuSolver(context));
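    // Eigenvalues of a self-adjoint matrix are real, and Heevd writes them
    // with the real scalar type. For real Scalar we can write straight into
    // the output; for complex Scalar we stage them in a real-typed temporary
    // and cast into the complex-typed output after the decomposition.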
    Tensor eigenvalues_real;
    using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
    if (std::is_same<Scalar, RealScalar>::value) {
      eigenvalues_real = *eigenvalues;
    } else {
      OP_REQUIRES_OK_ASYNC(
          context,
          solver->allocate_scoped_tensor(DataTypeToEnum<RealScalar>::value,
                                         eigenvalues_shape, &eigenvalues_real),
          done);
    }

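    // Reuse the input tensor's buffer for the in-place decomposition when
    // input 0 can be forwarded; otherwise allocate a scoped temporary and
    // copy (or conjugate) the input into it below.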
    Tensor input_copy;
    OP_REQUIRES_OK_ASYNC(
        context,
        solver->forward_input_or_allocate_scoped_tensor(
            {0}, DataTypeToEnum<Scalar>::value, input.shape(), &input_copy),
        done);
    // For real symmetric matrices, row-major and column-major are the same. For
    // complex Hermitian, row-major and column-major differ by a conjugation,
    // which is still cheaper than a transpose.
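    // (Reading the row-major buffer as column-major yields A^T, which equals
    // conj(A) for Hermitian A, so conjugating the copy hands the solver A
    // itself.)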
    const GPUDevice& device = context->eigen_device<GPUDevice>();
    if (!input.SharesBufferWith(input_copy)) {
      if (Eigen::NumTraits<Scalar>::IsComplex) {
        functor::UnaryFunctor<GPUDevice, functor::conj<Scalar>> conj;
        conj(device, input_copy.flat<Scalar>() /*out*/,
             input.flat<Scalar>() /*in*/);
      } else {
        device.memcpy(input_copy.flat<Scalar>().data(),
                      input.flat<Scalar>().data(),
                      input.NumElements() * sizeof(Scalar));
      }
    } else if (Eigen::NumTraits<Scalar>::IsComplex) {
      functor::UnaryFunctor<GPUDevice, functor::conj<Scalar>> conj;
      conj(device, const_cast<Tensor*>(&input)->flat<Scalar>() /*out*/,
           input.flat<Scalar>() /*in*/);
    }

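    // Heevd only references the triangle selected by `fill` (the upper
    // triangle here); `jobz` selects whether eigenvectors are computed in
    // addition to eigenvalues.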
#if GOOGLE_CUDA
    cublasFillMode_t fill = CUBLAS_FILL_MODE_UPPER;
    cusolverEigMode_t jobz =
        compute_v_ ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
#elif TENSORFLOW_USE_ROCM
    hipsolverFillMode_t fill = HIPSOLVER_FILL_MODE_UPPER;
    hipsolverEigMode_t jobz =
        compute_v_ ? HIPSOLVER_EIG_MODE_VECTOR : HIPSOLVER_EIG_MODE_NOVECTOR;
#endif

    // Compute eigen decomposition in-place in input_copy.
    std::vector<DeviceLapackInfo> dev_info;
    dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "heevd"));
    auto input_copy_reshaped = input_copy.flat_inner_dims<Scalar, 3>();
    auto eigenvalues_real_reshaped =
        eigenvalues_real.flat_inner_dims<RealScalar, 2>();
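    // Heevd is called once per matrix in the batch; each call writes its
    // LAPACK status into the corresponding slot of dev_info, which is
    // validated asynchronously once all work has been enqueued.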
    for (int batch = 0; batch < batch_size; ++batch) {
      OP_REQUIRES_OK_ASYNC(
          context,
          solver->Heevd(jobz, fill, n, &input_copy_reshaped(batch, 0, 0), n,
                        &eigenvalues_real_reshaped(batch, 0),
                        dev_info.back().mutable_data() + batch),
          done);
    }

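    // For complex inputs, copy the real-valued eigenvalues into the
    // complex-typed output (the imaginary parts are zero).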
    if (!std::is_same<Scalar, RealScalar>::value) {
      functor::CastFunctor<GPUDevice, Scalar, RealScalar> cast;
      cast(device, eigenvalues->flat<Scalar>(),
           const_cast<const Tensor*>(&eigenvalues_real)->flat<RealScalar>());
    }

    if (compute_v_) {
      // Transpose eigenvectors now stored in input_copy in column-major form to
      // output in row-major form.
      OP_REQUIRES_OK_ASYNC(
          context, DoMatrixTranspose(device, input_copy, eigenvectors), done);
    }

    // Asynchronously check return status from cuSolver kernels.
    GpuSolver::CheckLapackInfoAndDeleteSolverAsync(std::move(solver), dev_info,
                                                   std::move(done));
  }

 private:
  bool compute_v_;

  TF_DISALLOW_COPY_AND_ASSIGN(SelfAdjointEigV2OpGpu);
};

#define REGISTER(Scalar)                                                       \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("SelfAdjointEigV2").Device(DEVICE_GPU).TypeConstraint<Scalar>("T"), \
      (SelfAdjointEigV2OpGpu<Scalar>))

REGISTER(float);
REGISTER(double);
REGISTER(complex64);
REGISTER(complex128);

#undef REGISTER

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || (TENSORFLOW_USE_ROCM && TF_ROCM_VERSION >= 40500)