//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP

#include <algorithm> // std::swap
#include <cmath>     // std::ceil
#include <iterator>
#include <sstream>   // std::stringstream
#include <string>

#include <boost/utility/result_of.hpp> // boost::tr1_result_of

#include <boost/compute/utility/source.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/vendor.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/detail/work_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/program_cache.hpp>

namespace boost {
namespace compute {
namespace detail {

/// \internal
/// body of the generic local (work-group) tree reduction
template<typename T, bool isNvidiaDevice>
struct ReduceBody
{
    static std::string body()
    {
        std::stringstream k;
        // local reduction: pairwise tree reduction in local memory,
        // synchronized with a barrier at each step
        k << "for(int i = 1; i < TPB; i <<= 1){\n" <<
             "   barrier(CLK_LOCAL_MEM_FENCE);\n"  <<
             "   uint mask = (i << 1) - 1;\n"      <<
             "   if((lid & mask) == 0){\n"         <<
             "       scratch[lid] += scratch[lid+i];\n" <<
             "   }\n" <<
             "}\n";
        return k.str();
    }
};
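
// A worked illustration of the tree reduction generated above (not part of
// the emitted source): with TPB = 8 the loop runs for i = 1, 2, 4.
//
//   i = 1: lanes 0,2,4,6 each add their right neighbour (scratch[lid+1])
//   i = 2: lanes 0,4 add scratch[lid+2]
//   i = 4: lane 0 adds scratch[4]
//
// so after the loop scratch[0] holds the sum of all TPB partial sums.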

/// \internal
/// body of the local reduction, specialized for NVIDIA devices,
/// where the final warp can use the "unsafe" barrier-free
/// memory optimisation
template<typename T>
struct ReduceBody<T, true>
{
    static std::string body()
    {
        std::stringstream k;
        // local reduction
        // TPB is a compile-time constant, so only the useful
        // branches below survive compilation
        // tree reduction while the active size exceeds the warp size
        k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "if(TPB >= 1024){\n" <<
            "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
        "if(TPB >= 512){\n" <<
            "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
        "if(TPB >= 256){\n" <<
            "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
        "if(TPB >= 128){\n" <<
            "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<

        // warp reduction
        "if(lid < 32){\n" <<
            // declaring the local pointer volatile forces each read to go
            // to local memory, so no barriers are needed within the warp
            "volatile __local " << type_name<T>() << " *lmem = scratch;\n" <<
            "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];}\n" <<
            "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];}\n" <<
            "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];}\n" <<
            "if(TPB >=  8) { lmem[lid] = sum = sum + lmem[lid+ 4];}\n" <<
            "if(TPB >=  4) { lmem[lid] = sum = sum + lmem[lid+ 2];}\n" <<
            "if(TPB >=  2) { lmem[lid] = sum = sum + lmem[lid+ 1];}\n" <<
        "}\n";
        return k.str();
    }
};
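
// Note on the specialization above: once the TPB-guarded branches have halved
// the data down to 32 values, all remaining work belongs to a single 32-wide
// warp, which (on the NVIDIA devices this path targets) executes in lockstep,
// so the explicit barriers can be dropped. For example, compiled with
// -DTPB=128 only the "TPB >= 128" branch and the warp section survive: one
// barrier-synchronized step halves 128 sums to 64, then the barrier-free warp
// steps finish the reduction.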

template<class InputIterator, class Function>
inline void initial_reduce(InputIterator first,
                           InputIterator last,
                           buffer result,
                           const Function &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;
    (void) reduce_kernel;

    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;

    size_t count = std::distance(first, last);
    detail::meta_kernel k("initial_reduce");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");

    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction: each work-item accumulates VPT elements
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(offset + lid + i*TPB < count){\n" <<
        "        sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n" <<

        // local reduction
        ReduceBody<T,false>::body() <<

        // write sum to output
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    const context &context = queue.get_context();
    std::stringstream options;
    options << "-DVPT=" << vpt << " -DTPB=" << tpb;
    kernel generic_reduce_kernel = k.compile(context, options.str());
    generic_reduce_kernel.set_arg(output_arg, result);

    // one work-item per VPT input elements, rounded up to a
    // whole number of TPB-sized work-groups
    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
}
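
// For reference, the meta_kernel above expands to OpenCL C of roughly the
// following shape (sketched here for T = int; the exact argument list also
// includes the buffer injected by the first[...] iterator expression, and
// VPT/TPB arrive as -D compile flags):
//
//   __kernel void initial_reduce(const uint count,
//                                __global int *output,
//                                __global const int *input)
//   {
//       const uint offset = get_group_id(0) * VPT * TPB;
//       const uint lid = get_local_id(0);
//       __local int scratch[TPB];
//       int sum = 0;
//       for(uint i = 0; i < VPT; i++){
//           if(offset + lid + i*TPB < count){
//               sum = sum + input[offset+lid+i*TPB];
//           }
//       }
//       scratch[lid] = sum;
//       // ... ReduceBody<T,false>::body() ...
//       if(lid == 0){
//           output[get_group_id(0)] = scratch[0];
//       }
//   }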

template<class T>
inline void initial_reduce(const buffer_iterator<T> &first,
                           const buffer_iterator<T> &last,
                           const buffer &result,
                           const plus<T> &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;

    size_t count = std::distance(first, last);

    reduce_kernel.set_arg(0, first.get_buffer());
    reduce_kernel.set_arg(1, uint_(first.get_index()));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result);
    reduce_kernel.set_arg(4, uint_(0));

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
}
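
// The overload above skips source generation entirely: for a raw
// buffer_iterator<T> with plus<T>, the pre-built "reduce" kernel created in
// reduce_on_gpu() below already has exactly the needed argument layout
// (input, offset, count, output, output_offset), so it is reused for the
// first pass as well.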

template<class InputIterator, class T, class Function>
inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();
    const context &context = queue.get_context();

    detail::meta_kernel k("reduce");
    k.add_arg<const T*>(memory_object::global_memory, "input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>(memory_object::global_memory, "output");
    k.add_arg<const uint_>("output_offset");

    k <<
        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<
        // private reduction: each work-item accumulates VPT elements
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(block_offset + lid + i*TPB < count){\n" <<
        "        sum = sum + block[lid+i*TPB];\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n";

    // select the local reduction body based on the device vendor
    if(is_nvidia_device(device))
        k << ReduceBody<T,true>::body();
    else
        k << ReduceBody<T,false>::body();

    k <<
        // write sum to output
        "if(lid == 0){\n" <<
        "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();

    // load tuning parameters (values-per-thread and threads-per-block)
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ vpt = parameters->get(cache_key, "vpt", 8);
    uint_ tpb = parameters->get(cache_key, "tpb", 128);

    // reduce program compiler flags
    std::stringstream options;
    options << "-DT=" << type_name<T>()
            << " -DVPT=" << vpt
            << " -DTPB=" << tpb;
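
    // e.g. with T = int and the default parameters above this produces
    // "-DT=int -DVPT=8 -DTPB=128"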

    // load program
    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);

    program reduce_program = cache->get_or_build(
        cache_key, options.str(), k.source(), context
    );

    // create reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    size_t count = std::distance(first, last);

    // first pass, reduce from input to ping
    buffer ping(context, static_cast<size_t>(std::ceil(float(count) / vpt / tpb)) * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update count after initial reduce
    count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
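
    // Each pass shrinks the element count by a factor of vpt * tpb (1024
    // with the defaults), so e.g. 2^20 inputs leave 1024 partial sums after
    // the initial pass, which the final pass handles in a single work-group;
    // the ping/pong loop below only runs when the input is larger than
    // (vpt * tpb)^2 elements.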

    // middle pass(es), reduce between ping and pong
    const buffer *input_buffer = &ping;
    buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T)));
    const buffer *output_buffer = &pong;
    while(count > vpt * tpb){
        reduce_kernel.set_arg(0, *input_buffer);
        reduce_kernel.set_arg(1, uint_(0));
        reduce_kernel.set_arg(2, uint_(count));
        reduce_kernel.set_arg(3, *output_buffer);
        reduce_kernel.set_arg(4, uint_(0));

        // round the work size up to a whole number of work-groups
        size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));
        if(work_size % tpb != 0){
            work_size += tpb - work_size % tpb;
        }
        queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);

        std::swap(input_buffer, output_buffer);
        count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
    }

    // final pass, reduce from ping/pong to result
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}
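
// A minimal usage sketch (hypothetical driver code; in practice this detail
// function is invoked by boost::compute::reduce() when the device is a GPU):
//
//   boost::compute::vector<int> data(1 << 20, context);
//   boost::compute::vector<int> result(1, context);
//   // ... fill data ...
//   boost::compute::detail::reduce_on_gpu(
//       data.begin(), data.end(), result.begin(),
//       boost::compute::plus<int>(), queue
//   );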

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP