1 //===--- simple_example.cu - Simple example of using Acxxel ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// This file is a simple example of using Acxxel.
10 ///
11 //===----------------------------------------------------------------------===//
12
13 /// [Example simple saxpy]
14 #include "acxxel.h"
15
16 #include <array>
17 #include <cstdio>
18 #include <cstdlib>
19
// A standard CUDA kernel computing X[I] = A * X[I] + Y[I] in place.
//
// Each thread derives a flat 1-D index from its block and thread position in
// the x dimension and updates at most one element. Threads whose index is N or
// larger do nothing, so the launch may contain more threads than elements.
//
// A: scalar multiplier.
// X: array of at least N floats; read and overwritten with the result.
// Y: array of at least N floats; only read.
// N: number of elements to process.
__global__ void saxpyKernel(float A, float *X, float *Y, int N) {
  const int GlobalIdx = (blockDim.x * blockIdx.x) + threadIdx.x;

  // Surplus threads past the end of the data have nothing to do.
  if (GlobalIdx >= N)
    return;

  X[GlobalIdx] = A * X[GlobalIdx] + Y[GlobalIdx];
}
26
// Host-side wrapper that runs saxpyKernel through Acxxel. Every Acxxel call
// made by this example lives in this function.
//
// Copies X and Y into freshly allocated device buffers, launches the kernel as
// a single block of N threads on an Acxxel stream, and copies the device copy
// of X back over the host array X. If the status taken from the stream after
// the copy-back reports an error, its message is printed to stderr and the
// process exits with EXIT_FAILURE.
//
// NOTE(review): the results of getCUDAPlatform(), createStream() and mallocD()
// are unwrapped without being inspected first - confirm that is acceptable.
template <size_t N>
void saxpy(float A, std::array<float, N> &X, const std::array<float, N> &Y) {
  // Look up the CUDA platform, then create the stream the work is issued on.
  acxxel::Platform *CudaPlatform = acxxel::getCUDAPlatform().getValue();
  acxxel::Stream WorkStream = CudaPlatform->createStream().takeValue();

  // Device-side buffers holding N floats each.
  auto DeviceX = CudaPlatform->mallocD<float>(N).takeValue();
  auto DeviceY = CudaPlatform->mallocD<float>(N).takeValue();

  // Upload both input arrays.
  WorkStream.syncCopyHToD(X, DeviceX);
  WorkStream.syncCopyHToD(Y, DeviceY);

  // Triple-chevron launch: one block, N threads, no dynamic shared memory,
  // issued on the Acxxel stream.
  saxpyKernel<<<1, N, 0, WorkStream>>>(A, DeviceX, DeviceY, N);

  // Download the result into X and take the stream's status.
  acxxel::Status CopyBackStatus =
      WorkStream.syncCopyDToH(DeviceX, X).takeStatus();

  // Success path: nothing left to report.
  if (!CopyBackStatus.isError())
    return;

  std::fprintf(stderr, "Error performing acxxel saxpy: %s\n",
               CopyBackStatus.getMessage().c_str());
  std::exit(EXIT_FAILURE);
}
54 /// [Example simple saxpy]
55
56 /// [Example CUDA simple saxpy]
// Computes X = A * X + Y on the GPU using the CUDA runtime API directly.
//
// A: scalar multiplier.
// X: host array; read as input and overwritten with the result.
// Y: host array; only read.
//
// Every runtime call is checked as it is made. On the first failure any device
// buffers allocated so far are released, the CUDA error string is printed to
// stderr and the process exits with EXIT_FAILURE. An empty array is a no-op.
template <size_t N>
void cudaSaxpy(float A, std::array<float, N> &X, std::array<float, N> &Y) {
  // Nothing to compute, and a launch with zero blocks/threads is rejected by
  // the runtime as an invalid configuration.
  if (N == 0)
    return;

  // This size is needed all over the place, so give it a name.
  constexpr size_t Size = N * sizeof(float);

  // Start from null so the cleanup below is safe even when an allocation
  // fails part-way through (cudaFree on a null pointer does nothing).
  float *DeviceX = nullptr;
  float *DeviceY = nullptr;

  // Aborts with a diagnostic when a runtime call did not succeed.
  auto Check = [&](cudaError_t Error) {
    if (Error == cudaSuccess)
      return;
    cudaFree(DeviceX);
    cudaFree(DeviceY);
    std::fprintf(stderr, "Error performing cudart saxpy: %s\n",
                 cudaGetErrorString(Error));
    std::exit(EXIT_FAILURE);
  };

  // Allocate space for device arrays.
  Check(cudaMalloc(&DeviceX, Size));
  Check(cudaMalloc(&DeviceY, Size));

  // Copy X and Y out to the device.
  Check(cudaMemcpy(DeviceX, X.data(), Size, cudaMemcpyHostToDevice));
  Check(cudaMemcpy(DeviceY, Y.data(), Size, cudaMemcpyHostToDevice));

  // Launch the kernel using triple-chevron notation. The grid is sized by
  // ceiling division so that N may exceed the per-block thread limit; the
  // kernel ignores the surplus threads in the last block.
  constexpr unsigned ThreadsPerBlock = 256;
  constexpr unsigned BlockCount =
      static_cast<unsigned>((N + ThreadsPerBlock - 1) / ThreadsPerBlock);
  saxpyKernel<<<BlockCount, ThreadsPerBlock>>>(A, DeviceX, DeviceY,
                                               static_cast<int>(N));

  // A bad launch configuration is only reported through cudaGetLastError().
  Check(cudaGetLastError());

  // Copy the results back to the host. This blocking copy also reports any
  // error raised while the kernel was executing.
  Check(cudaMemcpy(X.data(), DeviceX, Size, cudaMemcpyDeviceToHost));

  // Free resources. The pointers are cleared before the results are checked
  // so that the failure path does not try to free them a second time.
  const cudaError_t FreeXError = cudaFree(DeviceX);
  const cudaError_t FreeYError = cudaFree(DeviceY);
  DeviceX = nullptr;
  DeviceY = nullptr;
  Check(FreeXError);
  Check(FreeYError);
}
90 /// [Example CUDA simple saxpy]
91
// Runs SaxpyFunction on a fixed three-element problem and verifies the result.
//
// SaxpyFunction is called once as SaxpyFunction(A, X, Y) with A = 2,
// X = {0, 1, 2} and Y = {3, 4, 5}; afterwards X must equal {3, 6, 9}. On the
// first element that differs, a message is printed to stderr and the process
// exits with EXIT_FAILURE.
template <typename F> void testSaxpy(F &&SaxpyFunction) {
  float A = 2.f;
  std::array<float, 3> X = {{0.f, 1.f, 2.f}};
  std::array<float, 3> Y = {{3.f, 4.f, 5.f}};
  const std::array<float, 3> Expected = {{3.f, 6.f, 9.f}};

  SaxpyFunction(A, X, Y);

  for (int Idx = 0; Idx != 3; ++Idx) {
    const float Got = X[Idx];
    const float Want = Expected[Idx];

    // Matching element: move on to the next one.
    if (Got == Want)
      continue;

    std::fprintf(stderr, "Result mismatch at index %d, %f != %f\n", Idx, Got,
                 Want);
    std::exit(EXIT_FAILURE);
  }
}
105
// Entry point: runs the fixed saxpy check twice, first against the
// Acxxel-based implementation and then against the plain CUDA runtime one.
// Either check ends the process with EXIT_FAILURE if it detects a problem.
int main() {
  testSaxpy(saxpy<3>);
  testSaxpy(cudaSaxpy<3>);
  return 0;
}
110