//---------------------------------------------------------------------------//
// Copyright (c) 2014 Benoit Dequidt <benoit.dequidt@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <sstream>

#include <boost/program_options.hpp>

#include <boost/compute/core.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/source.hpp>
21 
namespace compute = boost::compute;
namespace po = boost::program_options;

using compute::uint_;

// Tile geometry shared by the host and all OpenCL kernels (forwarded to the
// device compiler as -DTILE_DIM / -DBLOCK_ROWS): each work-group processes a
// TILE_DIM x TILE_DIM tile with TILE_DIM x BLOCK_ROWS work-items, so every
// work-item handles TILE_DIM/BLOCK_ROWS rows of its tile.
const uint_ TILE_DIM = 32;
const uint_ BLOCK_ROWS = 8;
29 
30 // generate a copy kernel program
make_copy_kernel(const compute::context & context)31 compute::kernel make_copy_kernel(const compute::context& context)
32 {
33     // source for the copy_kernel program
34     const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
35         __kernel void copy_kernel(__global const float *src, __global float *dst)
36         {
37             uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
38             uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
39 
40             uint width = get_num_groups(0) * TILE_DIM;
41 
42             for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS){
43                 dst[(y+i)*width +x] = src[(y+i)*width + x];
44             }
45         }
46     );
47 
48     // setup compilation flags for the copy program
49     std::stringstream options;
50     options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
51 
52     // create and build the copy program
53     compute::program program =
54         compute::program::build_with_source(source, context, options.str());
55 
56     // create and return the copy kernel
57     return program.create_kernel("copy_kernel");
58 }
59 
60 // generate a naive transpose kernel
make_naive_transpose_kernel(const compute::context & context)61 compute::kernel make_naive_transpose_kernel(const compute::context& context)
62 {
63     // source for the naive_transpose kernel
64     const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
65         __kernel void naive_transpose(__global const float *src, __global float *dst)
66         {
67             uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
68             uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
69 
70             uint width = get_num_groups(0) * TILE_DIM;
71 
72             for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
73                 dst[x*width + y+i] = src[(y+i)*width + x];
74             }
75         }
76     );
77 
78     // setup compilation flags for the naive_transpose program
79     std::stringstream options;
80     options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
81 
82     // create and build the naive_transpose program
83     compute::program program =
84         compute::program::build_with_source(source, context, options.str());
85 
86     // create and return the naive_transpose kernel
87     return program.create_kernel("naive_transpose");
88 }
89 
90 // generates a coalesced transpose kernel
make_coalesced_transpose_kernel(const compute::context & context)91 compute::kernel make_coalesced_transpose_kernel(const compute::context& context)
92 {
93     // source for the coalesced_transpose kernel
94     const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
95         __kernel void coalesced_transpose(__global const float *src, __global float *dst)
96         {
97             __local float tile[TILE_DIM][TILE_DIM];
98 
99             // compute indexes
100             uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
101             uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
102 
103             uint width = get_num_groups(0) * TILE_DIM;
104 
105             // load inside local memory
106             for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
107                 tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
108             }
109 
110             barrier(CLK_LOCAL_MEM_FENCE);
111 
112             // transpose indexes
113             x = get_group_id(1) * TILE_DIM + get_local_id(0);
114             y = get_group_id(0) * TILE_DIM + get_local_id(1);
115 
116             // write output from local memory
117             for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS){
118                 dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
119             }
120         }
121     );
122 
123     // setup compilation flags for the coalesced_transpose program
124     std::stringstream options;
125     options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
126 
127     // create and build the coalesced_transpose program
128     compute::program program =
129         compute::program::build_with_source(source, context, options.str());
130 
131     // create and return coalesced_transpose kernel
132     return program.create_kernel("coalesced_transpose");
133 }
134 
135 // generate a coalesced withtout bank conflicts kernel
make_coalesced_no_bank_conflicts_kernel(const compute::context & context)136 compute::kernel make_coalesced_no_bank_conflicts_kernel(const compute::context& context)
137 {
138     const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
139         __kernel void coalesced_no_bank_conflicts(__global const float *src, __global float *dst)
140         {
141             // TILE_DIM+1 is here to avoid bank conflicts in local memory
142             __local float tile[TILE_DIM][TILE_DIM+1];
143 
144             // compute indexes
145             uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
146             uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
147 
148             uint width = get_num_groups(0) * TILE_DIM;
149 
150             // load inside local memory
151             for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
152                 tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
153             }
154 
155             barrier(CLK_LOCAL_MEM_FENCE);
156 
157             // transpose indexes
158             x = get_group_id(1) * TILE_DIM + get_local_id(0);
159             y = get_group_id(0) * TILE_DIM + get_local_id(1);
160 
161             // write output from local memory
162             for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS){
163                 dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
164             }
165         }
166     );
167 
168     // setup compilation flags for the coalesced_no_bank_conflicts program
169     std::stringstream options;
170     options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
171 
172     // create and build the coalesced_no_bank_conflicts program
173     compute::program program =
174         compute::program::build_with_source(source, context, options.str());
175 
176     // create and return the coalesced_no_bank_conflicts kernel
177     return program.create_kernel("coalesced_no_bank_conflicts");
178 }
179 
180 // compare 'expectedResult' to 'transposedMatrix'. prints an error message if not equal.
check_transposition(const std::vector<float> & expectedResult,uint_ size,const std::vector<float> & transposedMatrix)181 bool check_transposition(const std::vector<float>& expectedResult,
182                          uint_ size,
183                          const std::vector<float>& transposedMatrix)
184 {
185     for(uint_ i = 0 ; i < size ; ++i){
186         if(expectedResult[i] != transposedMatrix[i]){
187             std::cout << "idx = " << i << " , expected " << expectedResult[i]
188                       << " , got " << transposedMatrix[i] << std::endl;
189             std::cout << "FAILED" << std::endl;
190             return false;
191         }
192     }
193     return true;
194 }
195 
196 // generate a matrix inside 'in' and do the tranposition inside 'out'
generate_matrix(std::vector<float> & in,std::vector<float> & out,uint_ rows,uint_ cols)197 void generate_matrix(std::vector<float>& in, std::vector<float>& out, uint_ rows, uint_ cols)
198 {
199     // generate a matrix
200     for(uint_ i = 0 ; i < rows ; ++i){
201         for(uint_ j = 0 ; j < cols ; ++j){
202             in[i*cols + j] = i*cols + j;
203         }
204     }
205 
206     // store transposed result
207     for(uint_ j = 0; j < cols ; ++j){
208         for(uint_ i = 0 ; i < rows ; ++i){
209             out[j*rows + i] = in[i*cols + j];
210         }
211     }
212 }
213 
// necessary for a 64-bit unsigned integer type on win32 toolchains that
// lack <cstdint>; later uses of 'uint64_t' then expand to the MSVC builtin
#ifdef _WIN32
#define uint64_t unsigned __int64
#endif
218 
main(int argc,char * argv[])219 int main(int argc, char *argv[])
220 {
221     // setup command line arguments
222     po::options_description options("options");
223     options.add_options()
224         ("help", "show usage instructions")
225         ("rows", po::value<uint_>()->default_value(4096), "number of matrix rows")
226         ("cols", po::value<uint_>()->default_value(4096), "number of matrix columns")
227     ;
228 
229     // parse command line
230     po::variables_map vm;
231     po::store(po::parse_command_line(argc, argv, options), vm);
232     po::notify(vm);
233 
234     // check command line arguments
235     if(vm.count("help")){
236         std::cout << options << std::endl;
237         return 0;
238     }
239 
240     // get number rows and columns for the matrix
241     const uint_ rows = vm["rows"].as<uint_>();
242     const uint_ cols = vm["cols"].as<uint_>();
243 
244     // get the default device
245     compute::device device = compute::system::default_device();
246 
247     // print out device name and matrix information
248     std::cout << "Device: " << device.name() << std::endl;
249     std::cout << "Matrix Size: " << rows << "x" << cols << std::endl;
250     std::cout << "Grid Size: " << rows/TILE_DIM << "x" << cols/TILE_DIM << " blocks" << std::endl;
251     std::cout << "Local Size: " << TILE_DIM << "x" << BLOCK_ROWS << " threads" << std::endl;
252     std::cout << std::endl;
253 
254     // On OSX this example does not work on CPU devices
255     #if defined(__APPLE__)
256     if(device.type() & compute::device::cpu) {
257         std::cout << "On OSX this example does not work on CPU devices" << std::endl;
258         return 0;
259     }
260     #endif
261 
262     const size_t global_work_size[2] = {rows, cols*BLOCK_ROWS/TILE_DIM};
263     const size_t local_work_size[2] = {TILE_DIM, BLOCK_ROWS};
264 
265     // setup input data on the host
266     const uint_ size = rows * cols;
267     std::vector<float> h_input(size);
268     std::vector<float> h_output(size);
269     std::vector<float> expectedResult(size);
270     generate_matrix(h_input, expectedResult, rows, cols);
271 
272     // create a context for the device
273     compute::context context(device);
274 
275     // device vectors
276     compute::vector<float> d_input(size, context);
277     compute::vector<float> d_output(size, context);
278 
279     // command_queue with profiling
280     compute::command_queue queue(context, device, compute::command_queue::enable_profiling);
281 
282     // copy input data
283     compute::copy(h_input.begin(), h_input.end(), d_input.begin(), queue);
284 
285     // simple copy kernel
286     std::cout << "Testing copy_kernel:" << std::endl;
287     compute::kernel kernel = make_copy_kernel(context);
288     kernel.set_arg(0, d_input);
289     kernel.set_arg(1, d_output);
290 
291     compute::event start;
292     start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
293     queue.finish();
294     uint64_t elapsed = start.duration<boost::chrono::nanoseconds>().count();
295 
296     std::cout << "  Elapsed: " << elapsed << " ns" << std::endl;
297     std::cout << "  BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
298     compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
299 
300     check_transposition(h_input, rows*cols, h_output);
301     std::cout << std::endl;
302 
303     // naive_transpose kernel
304     std::cout << "Testing naive_transpose:" << std::endl;
305     kernel = make_naive_transpose_kernel(context);
306     kernel.set_arg(0, d_input);
307     kernel.set_arg(1, d_output);
308 
309     start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
310     queue.finish();
311     elapsed = start.duration<boost::chrono::nanoseconds>().count();
312     std::cout << "  Elapsed: " << elapsed << " ns" << std::endl;
313     std::cout << "  BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
314     compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
315 
316     check_transposition(expectedResult, rows*cols, h_output);
317     std::cout << std::endl;
318 
319     // coalesced_transpose kernel
320     std::cout << "Testing coalesced_transpose:" << std::endl;
321     kernel = make_coalesced_transpose_kernel(context);
322     kernel.set_arg(0, d_input);
323     kernel.set_arg(1, d_output);
324 
325     start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
326     queue.finish();
327     elapsed = start.duration<boost::chrono::nanoseconds>().count();
328     std::cout << "  Elapsed: " << elapsed << " ns" << std::endl;
329     std::cout << "  BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
330 
331     compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
332 
333     check_transposition(expectedResult, rows*cols, h_output);
334     std::cout << std::endl;
335 
336     // coalesced_no_bank_conflicts kernel
337     std::cout << "Testing coalesced_no_bank_conflicts:" << std::endl;
338 
339     kernel = make_coalesced_no_bank_conflicts_kernel(context);
340     kernel.set_arg(0, d_input);
341     kernel.set_arg(1, d_output);
342 
343     start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
344     queue.finish();
345     elapsed = start.duration<boost::chrono::nanoseconds>().count();
346     std::cout << "  Elapsed: " << elapsed << " ns" << std::endl;
347     std::cout << "  BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
348 
349     compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
350 
351     check_transposition(expectedResult, rows*cols, h_output);
352     std::cout << std::endl;
353 
354     return 0;
355 }
356