• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10 
11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
13 
14 #include <algorithm>
15 
16 #include <boost/compute/types.hpp>
17 #include <boost/compute/command_queue.hpp>
18 #include <boost/compute/algorithm/copy.hpp>
19 #include <boost/compute/allocator/pinned_allocator.hpp>
20 #include <boost/compute/container/vector.hpp>
21 #include <boost/compute/detail/meta_kernel.hpp>
22 #include <boost/compute/detail/iterator_range_size.hpp>
23 #include <boost/compute/detail/parameter_cache.hpp>
24 #include <boost/compute/memory/local_buffer.hpp>
25 #include <boost/compute/type_traits/type_name.hpp>
26 #include <boost/compute/utility/program_cache.hpp>
27 
28 namespace boost {
29 namespace compute {
30 namespace detail {
31 
32 template<class InputIterator>
find_extrema_with_reduce_requirements_met(InputIterator first,InputIterator last,command_queue & queue)33 bool find_extrema_with_reduce_requirements_met(InputIterator first,
34                                                InputIterator last,
35                                                command_queue &queue)
36 {
37     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
38 
39     const device &device = queue.get_device();
40 
41     // device must have dedicated local memory storage
42     // otherwise reduction would be highly inefficient
43     if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL)
44     {
45         return false;
46     }
47 
48     const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
49     // local memory size in bytes (per compute unit)
50     const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();
51 
52     std::string cache_key = std::string("__boost_find_extrema_reduce_")
53         + type_name<input_type>();
54     // load parameters
55     boost::shared_ptr<parameter_cache> parameters =
56         detail::parameter_cache::get_global_cache(device);
57 
58     // Get preferred work group size
59     size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
60 
61     work_group_size = (std::min)(max_work_group_size, work_group_size);
62 
63     // local memory size needed to perform parallel reduction
64     size_t required_local_mem_size = 0;
65     // indices size
66     required_local_mem_size += sizeof(uint_) * work_group_size;
67     // values size
68     required_local_mem_size += sizeof(input_type) * work_group_size;
69 
70     // at least 4 work groups per compute unit otherwise reduction
71     // would be highly inefficient
72     return ((required_local_mem_size * 4) <= local_mem_size);
73 }
74 
75 /// \internal_
76 /// Algorithm finds the first extremum in given range, i.e., with the lowest
77 /// index.
78 ///
79 /// If \p use_input_idx is false, it's assumed that input data is ordered by
80 /// increasing index and \p input_idx is not used in the algorithm.
81 template<class InputIterator, class ResultIterator, class Compare>
find_extrema_with_reduce(InputIterator input,vector<uint_>::iterator input_idx,size_t count,ResultIterator result,vector<uint_>::iterator result_idx,size_t work_groups_no,size_t work_group_size,Compare compare,const bool find_minimum,const bool use_input_idx,command_queue & queue)82 inline void find_extrema_with_reduce(InputIterator input,
83                                      vector<uint_>::iterator input_idx,
84                                      size_t count,
85                                      ResultIterator result,
86                                      vector<uint_>::iterator result_idx,
87                                      size_t work_groups_no,
88                                      size_t work_group_size,
89                                      Compare compare,
90                                      const bool find_minimum,
91                                      const bool use_input_idx,
92                                      command_queue &queue)
93 {
94     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
95 
96     const context &context = queue.get_context();
97 
98     meta_kernel k("find_extrema_reduce");
99     size_t count_arg = k.add_arg<uint_>("count");
100     size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
101     size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");
102 
103     k <<
104         // Work item global id
105         k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
106 
107         // Index of element that will be read from input buffer
108         k.decl<uint_>("idx") << " = gid;\n" <<
109 
110         k.decl<input_type>("acc") << ";\n" <<
111         k.decl<uint_>("acc_idx") << ";\n" <<
112         "if(gid < count) {\n" <<
113             // Real index of currently best element
114             "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
115             k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
116             "#else\n" <<
117             k.var<uint_>("acc_idx") << " = idx;\n" <<
118             "#endif\n" <<
119 
120             // Init accumulator with first[get_global_id(0)]
121             "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
122             "idx += get_global_size(0);\n" <<
123         "}\n" <<
124 
125         k.decl<bool>("compare_result") << ";\n" <<
126         k.decl<bool>("equal") << ";\n\n" <<
127         "while( idx < count ){\n" <<
128             // Next element
129             k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
130             "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
131             k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
132             "#endif\n" <<
133 
134             // Comparison between currently best element (acc) and next element
135             "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
136             "compare_result = " << compare(k.var<input_type>("next"),
137                                            k.var<input_type>("acc")) << ";\n" <<
138             "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
139             "equal = !compare_result && !" <<
140                 compare(k.var<input_type>("acc"),
141                         k.var<input_type>("next")) << ";\n" <<
142             "# endif\n" <<
143             "#else\n" <<
144             "compare_result = " << compare(k.var<input_type>("acc"),
145                                            k.var<input_type>("next")) << ";\n" <<
146             "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
147             "equal = !compare_result && !" <<
148                 compare(k.var<input_type>("next"),
149                         k.var<input_type>("acc")) << ";\n" <<
150             "# endif\n" <<
151             "#endif\n" <<
152 
153             // save the winner
154             "acc = compare_result ? acc : next;\n" <<
155             "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
156             "acc_idx = compare_result ? " <<
157                 "acc_idx : " <<
158                 "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
159             "#else\n" <<
160             "acc_idx = compare_result ? acc_idx : idx;\n" <<
161             "#endif\n" <<
162             "idx += get_global_size(0);\n" <<
163         "}\n\n" <<
164 
165         // Work item local id
166         k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
167         "block[lid] = acc;\n" <<
168         "block_idx[lid] = acc_idx;\n" <<
169         "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
170 
171         k.decl<uint_>("group_offset") <<
172             " = count - (get_local_size(0) * get_group_id(0));\n\n";
173 
174     k <<
175         "#pragma unroll\n"
176         "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " <<
177              "offset = offset / 2) {\n" <<
178              "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
179                  k.decl<input_type>("mine") << " = block[lid];\n" <<
180                  k.decl<input_type>("other") << " = block[lid+offset];\n" <<
181                  "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
182                  "compare_result = " << compare(k.var<input_type>("other"),
183                                                 k.var<input_type>("mine")) << ";\n" <<
184                  "equal = !compare_result && !" <<
185                      compare(k.var<input_type>("mine"),
186                              k.var<input_type>("other")) << ";\n" <<
187                  "#else\n" <<
188                  "compare_result = " << compare(k.var<input_type>("mine"),
189                                                 k.var<input_type>("other")) << ";\n" <<
190                  "equal = !compare_result && !" <<
191                      compare(k.var<input_type>("other"),
192                              k.var<input_type>("mine")) << ";\n" <<
193                  "#endif\n" <<
194                  "block[lid] = compare_result ? mine : other;\n" <<
195                  k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
196                  k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
197                  "block_idx[lid] = compare_result ? " <<
198                      "mine_idx : " <<
199                      "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
200              "}\n"
201              "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
202         "}\n\n" <<
203 
204          // write block result to global output
205         "if(lid == 0){\n" <<
206             result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
207             result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
208         "}";
209 
210     std::string options;
211     if(!find_minimum){
212         options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
213     }
214     if(use_input_idx){
215         options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
216     }
217 
218     kernel kernel = k.compile(context, options);
219 
220     kernel.set_arg(count_arg, static_cast<uint_>(count));
221     kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
222     kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));
223 
224     queue.enqueue_1d_range_kernel(kernel,
225                                   0,
226                                   work_groups_no * work_group_size,
227                                   work_group_size);
228 }
229 
230 template<class InputIterator, class ResultIterator, class Compare>
find_extrema_with_reduce(InputIterator input,size_t count,ResultIterator result,vector<uint_>::iterator result_idx,size_t work_groups_no,size_t work_group_size,Compare compare,const bool find_minimum,command_queue & queue)231 inline void find_extrema_with_reduce(InputIterator input,
232                                      size_t count,
233                                      ResultIterator result,
234                                      vector<uint_>::iterator result_idx,
235                                      size_t work_groups_no,
236                                      size_t work_group_size,
237                                      Compare compare,
238                                      const bool find_minimum,
239                                      command_queue &queue)
240 {
241     // dummy will not be used
242     buffer_iterator<uint_> dummy = result_idx;
243     return find_extrema_with_reduce(
244         input, dummy, count, result, result_idx, work_groups_no,
245         work_group_size, compare, find_minimum, false, queue
246     );
247 }
248 
249 // Space complexity: \Omega(2 * work-group-size * work-groups-per-compute-unit)
250 template<class InputIterator, class Compare>
find_extrema_with_reduce(InputIterator first,InputIterator last,Compare compare,const bool find_minimum,command_queue & queue)251 InputIterator find_extrema_with_reduce(InputIterator first,
252                                        InputIterator last,
253                                        Compare compare,
254                                        const bool find_minimum,
255                                        command_queue &queue)
256 {
257     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
258     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
259 
260     const context &context = queue.get_context();
261     const device &device = queue.get_device();
262 
263     // Getting information about used queue and device
264     const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
265     const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
266 
267     const size_t count = detail::iterator_range_size(first, last);
268 
269     std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
270         + type_name<input_type>();
271 
272     // load parameters
273     boost::shared_ptr<parameter_cache> parameters =
274         detail::parameter_cache::get_global_cache(device);
275 
276     // get preferred work group size and preferred number
277     // of work groups per compute unit
278     size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
279     size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);
280 
281     // calculate work group size and number of work groups
282     work_group_size = (std::min)(max_work_group_size, work_group_size);
283     size_t work_groups_no = compute_units_no * work_groups_per_cu;
284     work_groups_no = (std::min)(
285         work_groups_no,
286         static_cast<size_t>(std::ceil(float(count) / work_group_size))
287     );
288 
289     // phase I: finding candidates for extremum
290 
291     // device buffors for extremum candidates and their indices
292     // each work-group computes its candidate
293     vector<input_type> candidates(work_groups_no, context);
294     vector<uint_> candidates_idx(work_groups_no, context);
295 
296     // finding candidates for first extremum and their indices
297     find_extrema_with_reduce(
298         first, count, candidates.begin(), candidates_idx.begin(),
299         work_groups_no, work_group_size, compare, find_minimum, queue
300     );
301 
302     // phase II: finding extremum from among the candidates
303 
304     // zero-copy buffers for final result (value and index)
305     vector<input_type, ::boost::compute::pinned_allocator<input_type> >
306         result(1, context);
307     vector<uint_, ::boost::compute::pinned_allocator<uint_> >
308         result_idx(1, context);
309 
310     // get extremum from among the candidates
311     find_extrema_with_reduce(
312         candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(),
313         result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue
314     );
315 
316     // mapping extremum index to host
317     uint_* result_idx_host_ptr =
318         static_cast<uint_*>(
319             queue.enqueue_map_buffer(
320                 result_idx.get_buffer(), command_queue::map_read,
321                 0, sizeof(uint_)
322             )
323         );
324 
325     return first + static_cast<difference_type>(*result_idx_host_ptr);
326 }
327 
328 template<class InputIterator>
find_extrema_with_reduce(InputIterator first,InputIterator last,::boost::compute::less<typename std::iterator_traits<InputIterator>::value_type> compare,const bool find_minimum,command_queue & queue)329 InputIterator find_extrema_with_reduce(InputIterator first,
330                                        InputIterator last,
331                                        ::boost::compute::less<
332                                            typename std::iterator_traits<
333                                                InputIterator
334                                            >::value_type
335                                        >
336                                        compare,
337                                        const bool find_minimum,
338                                        command_queue &queue)
339 {
340     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
341     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
342 
343     const context &context = queue.get_context();
344     const device &device = queue.get_device();
345 
346     // Getting information about used queue and device
347     const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
348     const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
349 
350     const size_t count = detail::iterator_range_size(first, last);
351 
352     std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
353         + type_name<input_type>();
354 
355     // load parameters
356     boost::shared_ptr<parameter_cache> parameters =
357         detail::parameter_cache::get_global_cache(device);
358 
359     // get preferred work group size and preferred number
360     // of work groups per compute unit
361     size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
362     size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);
363 
364     // calculate work group size and number of work groups
365     work_group_size = (std::min)(max_work_group_size, work_group_size);
366     size_t work_groups_no = compute_units_no * work_groups_per_cu;
367     work_groups_no = (std::min)(
368         work_groups_no,
369         static_cast<size_t>(std::ceil(float(count) / work_group_size))
370     );
371 
372     // phase I: finding candidates for extremum
373 
374     // device buffors for extremum candidates and their indices
375     // each work-group computes its candidate
376     // zero-copy buffers are used to eliminate copying data back to host
377     vector<input_type, ::boost::compute::pinned_allocator<input_type> >
378         candidates(work_groups_no, context);
379     vector<uint_, ::boost::compute::pinned_allocator <uint_> >
380         candidates_idx(work_groups_no, context);
381 
382     // finding candidates for first extremum and their indices
383     find_extrema_with_reduce(
384         first, count, candidates.begin(), candidates_idx.begin(),
385         work_groups_no, work_group_size, compare, find_minimum, queue
386     );
387 
388     // phase II: finding extremum from among the candidates
389 
390     // mapping candidates and their indices to host
391     input_type* candidates_host_ptr =
392         static_cast<input_type*>(
393             queue.enqueue_map_buffer(
394                 candidates.get_buffer(), command_queue::map_read,
395                 0, work_groups_no * sizeof(input_type)
396             )
397         );
398 
399     uint_* candidates_idx_host_ptr =
400         static_cast<uint_*>(
401             queue.enqueue_map_buffer(
402                 candidates_idx.get_buffer(), command_queue::map_read,
403                 0, work_groups_no * sizeof(uint_)
404             )
405         );
406 
407     input_type* i = candidates_host_ptr;
408     uint_* idx = candidates_idx_host_ptr;
409     uint_* extremum_idx = idx;
410     input_type extremum = *candidates_host_ptr;
411     i++; idx++;
412 
413     // find extremum (serial) from among the candidates on host
414     if(!find_minimum) {
415         while(idx != (candidates_idx_host_ptr + work_groups_no)) {
416             input_type next = *i;
417             bool compare_result =  next > extremum;
418             bool equal = next == extremum;
419             extremum = compare_result ? next : extremum;
420             extremum_idx = compare_result ? idx : extremum_idx;
421             extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
422             idx++, i++;
423         }
424     }
425     else {
426         while(idx != (candidates_idx_host_ptr + work_groups_no)) {
427             input_type next = *i;
428             bool compare_result = next < extremum;
429             bool equal = next == extremum;
430             extremum = compare_result ? next : extremum;
431             extremum_idx = compare_result ? idx : extremum_idx;
432             extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
433             idx++, i++;
434         }
435     }
436 
437     return first + static_cast<difference_type>(*extremum_idx);
438 }
439 
440 } // end detail namespace
441 } // end compute namespace
442 } // end boost namespace
443 
444 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
445