[/
          Copyright Oliver Kowalke 2017.
 Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
          http://www.boost.org/LICENSE_1_0.txt
]

[#cuda]
[section:cuda CUDA]

[@http://developer.nvidia.com/cuda-zone/ CUDA (Compute Unified Device Architecture)]
is a platform for parallel computing on NVIDIA GPUs. Its application programming
interface gives access to the GPU's instruction set and computation resources
(the execution of compute kernels).


[heading Synchronization with CUDA streams]

CUDA operations such as compute kernels or memory transfers (between host and
device) can be grouped in and queued on CUDA streams, which execute them
asynchronously on the GPU. Boost.Fiber enables a fiber to sleep (suspend) until
a CUDA stream has completed its operations and resumes it as soon as the stream
has finished. In the meantime the application can run other fibers on the CPU
without having to spawn additional OS threads.

        __global__
        void kernel( int size, int * a, int * b, int * c) {
            int idx = threadIdx.x + blockIdx.x * blockDim.x;
            if ( idx < size) {
                int idx1 = (idx + 1) % 256;
                int idx2 = (idx + 2) % 256;
                float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
                float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
                c[idx] = (as + bs) / 2;
            }
        }

        boost::fibers::fiber f([]{
            cudaStream_t stream;
            cudaStreamCreate( & stream);
            int size = 1024 * 1024;
            int full_size = 20 * size;
            int * host_a, * host_b, * host_c;
            cudaHostAlloc( & host_a, full_size * sizeof( int), cudaHostAllocDefault);
            cudaHostAlloc( & host_b, full_size * sizeof( int), cudaHostAllocDefault);
            cudaHostAlloc( & host_c, full_size * sizeof( int), cudaHostAllocDefault);
            int * dev_a, * dev_b, * dev_c;
            cudaMalloc( & dev_a, size * sizeof( int) );
            cudaMalloc( & dev_b, size * sizeof( int) );
            cudaMalloc( & dev_c, size * sizeof( int) );
            std::minstd_rand generator;
            std::uniform_int_distribution<> distribution(1, 6);
            for ( int i = 0; i < full_size; ++i) {
                host_a[i] = distribution( generator);
                host_b[i] = distribution( generator);
            }
            for ( int i = 0; i < full_size; i += size) {
                cudaMemcpyAsync( dev_a, host_a + i, size * sizeof( int), cudaMemcpyHostToDevice, stream);
                cudaMemcpyAsync( dev_b, host_b + i, size * sizeof( int), cudaMemcpyHostToDevice, stream);
                kernel<<< size / 256, 256, 0, stream >>>( size, dev_a, dev_b, dev_c);
                cudaMemcpyAsync( host_c + i, dev_c, size * sizeof( int), cudaMemcpyDeviceToHost, stream);
            }
            auto result = boost::fibers::cuda::waitfor_all( stream); // suspend fiber until CUDA stream has finished
            BOOST_ASSERT( stream == std::get< 0 >( result) );
            BOOST_ASSERT( cudaSuccess == std::get< 1 >( result) );
            std::cout << "f1: GPU computation finished" << std::endl;
            cudaFreeHost( host_a);
            cudaFreeHost( host_b);
            cudaFreeHost( host_c);
            cudaFree( dev_a);
            cudaFree( dev_b);
            cudaFree( dev_c);
            cudaStreamDestroy( stream);
        });
        f.join();
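
While the fiber is suspended in `waitfor_all()`, the scheduler is free to run
other fibers on the same thread. The following minimal sketch (the fiber `f2`
is hypothetical, not part of the example above) shows CPU work interleaving
with the GPU synchronization:

        boost::fibers::fiber f2([]{
            // ordinary CPU work; it is scheduled whenever the GPU fiber sleeps
            for ( int i = 0; i < 10; ++i) {
                std::cout << "f2: counter == " << i << std::endl;
                boost::this_fiber::yield(); // let other fibers run
            }
        });
        f2.join();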
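
If the work is distributed over several CUDA streams, the variadic overload of
`waitfor_all()` (see the synopsis below) suspends the fiber until all passed
streams have completed. A minimal sketch, assuming two streams `stream0` and
`stream1` were created and filled with work as above:

        // suspend fiber until both CUDA streams have finished
        auto results = boost::fibers::cuda::waitfor_all( stream0, stream1);
        for ( auto & result : results) {
            BOOST_ASSERT( cudaSuccess == std::get< 1 >( result) );
        }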

[heading Synopsis]

        #include <boost/fiber/cuda/waitfor.hpp>

        namespace boost {
        namespace fibers {
        namespace cuda {

        std::tuple< cudaStream_t, cudaError_t > waitfor_all( cudaStream_t st);
        std::vector< std::tuple< cudaStream_t, cudaError_t > > waitfor_all( cudaStream_t ... st);

        }}}


[ns_function_heading cuda..waitfor]

        #include <boost/fiber/cuda/waitfor.hpp>

        namespace boost {
        namespace fibers {
        namespace cuda {

        std::tuple< cudaStream_t, cudaError_t > waitfor_all( cudaStream_t st);
        std::vector< std::tuple< cudaStream_t, cudaError_t > > waitfor_all( cudaStream_t ... st);

        }}}

[variablelist
[[Effects:] [Suspends the active fiber until the given CUDA stream (or streams) has completed its operations.]]
[[Returns:] [A tuple of the stream and its CUDA status; the variadic overload returns a vector with one such tuple per stream.]]
]


[endsect]