//===--- simple_example.cu - Simple example of using Acxxel ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// This file is a simple example of using Acxxel.
///
//===----------------------------------------------------------------------===//

/// [Example simple saxpy]
#include "acxxel.h"

#include <array>
#include <cstdio>
#include <cstdlib>

// A standard CUDA kernel.
//
// Computes X[I] = A * X[I] + Y[I] in place for every I in [0, N).
//
// Launch layout: one-dimensional grid and blocks, one thread per element. Any
// launch whose total thread count is at least N is valid because threads with
// a global index >= N do nothing. No shared memory is used.
__global__ void saxpyKernel(float A, float *X, float *Y, int N) {
  int I = (blockDim.x * blockIdx.x) + threadIdx.x;
  if (I < N)
    X[I] = A * X[I] + Y[I];
}

// A host library wrapping the CUDA kernel. All Acxxel calls are in here.
//
// On return X holds A * X + Y; Y is not modified. If the final Acxxel status
// reports an error, the message is printed to stderr and the process exits
// with EXIT_FAILURE.
template <size_t N>
void saxpy(float A, std::array<float, N> &X, const std::array<float, N> &Y) {
  // Nothing to compute, and a launch with zero blocks is not a valid
  // configuration.
  if (N == 0)
    return;

  // Use fixed-size blocks and round the block count up so that any N is
  // covered, rather than requiring N to fit in a single block.
  constexpr unsigned ThreadsPerBlock = 256;
  constexpr unsigned BlockCount =
      static_cast<unsigned>((N + ThreadsPerBlock - 1) / ThreadsPerBlock);

  // Get the CUDA platform and make a CUDA stream.
  // NOTE(review): these results are unwrapped with getValue()/takeValue()
  // without a visible error check -- confirm what acxxel does when the
  // platform or stream is unavailable.
  acxxel::Platform *CUDA = acxxel::getCUDAPlatform().getValue();
  acxxel::Stream Stream = CUDA->createStream().takeValue();

  // Allocate space for device arrays.
  auto DeviceX = CUDA->mallocD<float>(N).takeValue();
  auto DeviceY = CUDA->mallocD<float>(N).takeValue();

  // Copy X and Y out to the device.
  Stream.syncCopyHToD(X, DeviceX).syncCopyHToD(Y, DeviceY);

  // Launch the kernel using triple-chevron notation.
  saxpyKernel<<<BlockCount, ThreadsPerBlock, 0, Stream>>>(
      A, DeviceX, DeviceY, static_cast<int>(N));

  // Copy the results back to the host.
  // Only the status taken here is inspected; presumably the stream carries
  // forward errors from the earlier copies -- confirm against acxxel.h.
  acxxel::Status Status = Stream.syncCopyDToH(DeviceX, X).takeStatus();

  // Check for any errors.
  if (Status.isError()) {
    std::fprintf(stderr, "Error performing acxxel saxpy: %s\n",
                 Status.getMessage().c_str());
    std::exit(EXIT_FAILURE);
  }
}
/// [Example simple saxpy]

/// [Example CUDA simple saxpy]
// The same host wrapper written directly against the CUDA runtime API.
//
// On return X holds A * X + Y; Y is only read. Any failing runtime call
// prints its error string to stderr and exits with EXIT_FAILURE.
template <size_t N>
void cudaSaxpy(float A, std::array<float, N> &X, std::array<float, N> &Y) {
  // Nothing to compute, and a launch with zero blocks is not a valid
  // configuration.
  if (N == 0)
    return;

  // This size is needed all over the place, so give it a name.
  constexpr size_t Size = N * sizeof(float);

  // Use fixed-size blocks and round the block count up so that any N is
  // covered, rather than requiring N to fit in a single block.
  constexpr unsigned ThreadsPerBlock = 256;
  constexpr unsigned BlockCount =
      static_cast<unsigned>((N + ThreadsPerBlock - 1) / ThreadsPerBlock);

  // Check every runtime call as it happens so that a failure stops the
  // function before later calls run on invalid pointers.
  auto Check = [](cudaError_t Error) {
    if (Error != cudaSuccess) {
      std::fprintf(stderr, "Error performing cudart saxpy: %s\n",
                   cudaGetErrorString(Error));
      std::exit(EXIT_FAILURE);
    }
  };

  // Allocate space for device arrays.
  float *DeviceX = nullptr;
  float *DeviceY = nullptr;
  Check(cudaMalloc(&DeviceX, Size));
  Check(cudaMalloc(&DeviceY, Size));

  // Copy X and Y out to the device.
  Check(cudaMemcpy(DeviceX, X.data(), Size, cudaMemcpyHostToDevice));
  Check(cudaMemcpy(DeviceY, Y.data(), Size, cudaMemcpyHostToDevice));

  // Launch the kernel using triple-chevron notation.
  saxpyKernel<<<BlockCount, ThreadsPerBlock>>>(A, DeviceX, DeviceY,
                                               static_cast<int>(N));

  // A launch does not return a status itself; configuration errors are
  // reported through cudaGetLastError().
  Check(cudaGetLastError());

  // Copy the results back to the host.
  Check(cudaMemcpy(X.data(), DeviceX, Size, cudaMemcpyDeviceToHost));

  // Free resources.
  Check(cudaFree(DeviceX));
  Check(cudaFree(DeviceY));
}
/// [Example CUDA simple saxpy]

// Runs SaxpyFunction on a fixed three-element input and compares the result
// element by element with the expected output. On the first mismatch it
// prints the index and both values to stderr and exits with EXIT_FAILURE.
template <typename F> void testSaxpy(F &&SaxpyFunction) {
  float A = 2.f;
  std::array<float, 3> X = {{0.f, 1.f, 2.f}};
  std::array<float, 3> Y = {{3.f, 4.f, 5.f}};
  std::array<float, 3> Expected = {{3.f, 6.f, 9.f}};
  SaxpyFunction(A, X, Y);

  // Take the bound from the array instead of repeating the literal size.
  const int Count = static_cast<int>(X.size());
  for (int I = 0; I < Count; ++I)
    if (X[I] != Expected[I]) {
      std::fprintf(stderr, "Result mismatch at index %d, %f != %f\n", I, X[I],
                   Expected[I]);
      std::exit(EXIT_FAILURE);
    }
}

// Exercises both the Acxxel and the plain CUDA runtime wrappers.
int main() {
  testSaxpy(saxpy<3>);
  testSaxpy(cudaSaxpy<3>);
}