• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10 
11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
13 
14 #include <boost/compute/kernel.hpp>
15 #include <boost/compute/program.hpp>
16 #include <boost/compute/command_queue.hpp>
17 #include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
18 #include <boost/compute/container/vector.hpp>
19 #include <boost/compute/detail/meta_kernel.hpp>
20 #include <boost/compute/detail/iterator_range_size.hpp>
21 
22 namespace boost {
23 namespace compute {
24 namespace detail {
25 
26 template<class KeyIterator, class ValueIterator, class Compare>
merge_blocks(KeyIterator keys_first,ValueIterator values_first,KeyIterator keys_result,ValueIterator values_result,Compare compare,size_t count,const size_t block_size,const bool sort_by_key,command_queue & queue)27 inline void merge_blocks(KeyIterator keys_first,
28                          ValueIterator values_first,
29                          KeyIterator keys_result,
30                          ValueIterator values_result,
31                          Compare compare,
32                          size_t count,
33                          const size_t block_size,
34                          const bool sort_by_key,
35                          command_queue &queue)
36 {
37     (void) values_result;
38     (void) values_first;
39 
40     meta_kernel k("merge_sort_on_cpu_merge_blocks");
41     size_t count_arg = k.add_arg<const uint_>("count");
42     size_t block_size_arg = k.add_arg<uint_>("block_size");
43 
44     k <<
45         k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" <<
46         k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" <<
47         k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" <<
48         k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" <<
49         k.decl<uint_>("result_idx") << " = b1_start;\n" <<
50 
51         // merging block 1 and block 2 (stable)
52         "while(b1_start < b1_end && b2_start < b2_end){\n" <<
53         "    if( " << compare(keys_first[k.var<uint_>("b2_start")],
54                               keys_first[k.var<uint_>("b1_start")]) << "){\n" <<
55         "        " << keys_result[k.var<uint_>("result_idx")] <<  " = " <<
56                       keys_first[k.var<uint_>("b2_start")] << ";\n";
57     if(sort_by_key){
58         k <<
59         "        " << values_result[k.var<uint_>("result_idx")] <<  " = " <<
60                       values_first[k.var<uint_>("b2_start")] << ";\n";
61     }
62     k <<
63         "        b2_start++;\n" <<
64         "    }\n" <<
65         "    else {\n" <<
66         "        " << keys_result[k.var<uint_>("result_idx")] <<  " = " <<
67                       keys_first[k.var<uint_>("b1_start")] << ";\n";
68     if(sort_by_key){
69         k <<
70         "        " << values_result[k.var<uint_>("result_idx")] <<  " = " <<
71                       values_first[k.var<uint_>("b1_start")] << ";\n";
72     }
73     k <<
74         "        b1_start++;\n" <<
75         "    }\n" <<
76         "    result_idx++;\n" <<
77         "}\n" <<
78         "while(b1_start < b1_end){\n" <<
79         "    " << keys_result[k.var<uint_>("result_idx")] <<  " = " <<
80                  keys_first[k.var<uint_>("b1_start")] << ";\n";
81     if(sort_by_key){
82         k <<
83         "    " << values_result[k.var<uint_>("result_idx")] <<  " = " <<
84                       values_first[k.var<uint_>("b1_start")] << ";\n";
85     }
86     k <<
87         "    b1_start++;\n" <<
88         "    result_idx++;\n" <<
89         "}\n" <<
90         "while(b2_start < b2_end){\n" <<
91         "    " << keys_result[k.var<uint_>("result_idx")] <<  " = " <<
92                  keys_first[k.var<uint_>("b2_start")] << ";\n";
93     if(sort_by_key){
94         k <<
95         "    " << values_result[k.var<uint_>("result_idx")] <<  " = " <<
96                       values_first[k.var<uint_>("b2_start")] << ";\n";
97     }
98     k <<
99         "    b2_start++;\n" <<
100         "    result_idx++;\n" <<
101         "}\n";
102 
103     const context &context = queue.get_context();
104     ::boost::compute::kernel kernel = k.compile(context);
105     kernel.set_arg(count_arg, static_cast<const uint_>(count));
106     kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
107 
108     const size_t global_size = static_cast<size_t>(
109         std::ceil(float(count) / (2 * block_size))
110     );
111     queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
112 }
113 
114 template<class Iterator, class Compare>
merge_blocks(Iterator first,Iterator result,Compare compare,size_t count,const size_t block_size,const bool sort_by_key,command_queue & queue)115 inline void merge_blocks(Iterator first,
116                          Iterator result,
117                          Compare compare,
118                          size_t count,
119                          const size_t block_size,
120                          const bool sort_by_key,
121                          command_queue &queue)
122 {
123     // dummy iterator as it's not sort by key
124     Iterator dummy;
125     merge_blocks(first, dummy, result, dummy, compare, count, block_size, false, queue);
126 }
127 
128 template<class Iterator, class Compare>
dispatch_merge_blocks(Iterator first,Iterator result,Compare compare,size_t count,const size_t block_size,const size_t input_size_threshold,const size_t blocks_no_threshold,command_queue & queue)129 inline void dispatch_merge_blocks(Iterator first,
130                                   Iterator result,
131                                   Compare compare,
132                                   size_t count,
133                                   const size_t block_size,
134                                   const size_t input_size_threshold,
135                                   const size_t blocks_no_threshold,
136                                   command_queue &queue)
137 {
138     const size_t blocks_no = static_cast<size_t>(
139         std::ceil(float(count) / block_size)
140     );
141     // merge with merge path should used only for the large arrays and at the
142     // end of merging part when there are only a few big blocks left to be merged
143     if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){
144         Iterator last = first + count;
145         for(size_t i = 0; i < count; i+= 2*block_size)
146         {
147             Iterator first1 = (std::min)(first + i, last);
148             Iterator last1 = (std::min)(first1 + block_size, last);
149             Iterator first2 = last1;
150             Iterator last2 = (std::min)(first2 + block_size, last);
151             Iterator block_result = (std::min)(result + i, result + count);
152             merge_with_merge_path(first1, last1, first2, last2,
153                                   block_result, compare, queue);
154         }
155     }
156     else {
157         merge_blocks(first, result, compare, count, block_size, false, queue);
158     }
159 }
160 
161 template<class KeyIterator, class ValueIterator, class Compare>
block_insertion_sort(KeyIterator keys_first,ValueIterator values_first,Compare compare,const size_t count,const size_t block_size,const bool sort_by_key,command_queue & queue)162 inline void block_insertion_sort(KeyIterator keys_first,
163                                  ValueIterator values_first,
164                                  Compare compare,
165                                  const size_t count,
166                                  const size_t block_size,
167                                  const bool sort_by_key,
168                                  command_queue &queue)
169 {
170     (void) values_first;
171 
172     typedef typename std::iterator_traits<KeyIterator>::value_type K;
173     typedef typename std::iterator_traits<ValueIterator>::value_type T;
174 
175     meta_kernel k("merge_sort_on_cpu_block_insertion_sort");
176     size_t count_arg = k.add_arg<uint_>("count");
177     size_t block_size_arg = k.add_arg<uint_>("block_size");
178 
179     k <<
180         k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" <<
181         k.decl<uint_>("end") << " = min(count, start + block_size);\n" <<
182 
183         // block insertion sort (stable)
184         "for(uint i = start+1; i < end; i++){\n" <<
185         "    " << k.decl<const K>("key") << " = " <<
186                   keys_first[k.var<uint_>("i")] << ";\n";
187     if(sort_by_key){
188         k <<
189         "    " << k.decl<const T>("value") << " = " <<
190                   values_first[k.var<uint_>("i")] << ";\n";
191     }
192     k <<
193         "    uint pos = i;\n" <<
194         "    while(pos > start && " <<
195                    compare(k.var<const K>("key"),
196                            keys_first[k.var<uint_>("pos-1")]) << "){\n" <<
197         "        " << keys_first[k.var<uint_>("pos")] << " = " <<
198                       keys_first[k.var<uint_>("pos-1")] << ";\n";
199     if(sort_by_key){
200         k <<
201         "        " << values_first[k.var<uint_>("pos")] << " = " <<
202                       values_first[k.var<uint_>("pos-1")] << ";\n";
203     }
204     k <<
205         "        pos--;\n" <<
206         "    }\n" <<
207         "    " << keys_first[k.var<uint_>("pos")] << " = key;\n";
208     if(sort_by_key) {
209         k <<
210         "    " << values_first[k.var<uint_>("pos")] << " = value;\n";
211     }
212     k <<
213         "}\n"; // block insertion sort
214 
215     const context &context = queue.get_context();
216     ::boost::compute::kernel kernel = k.compile(context);
217     kernel.set_arg(count_arg, static_cast<uint_>(count));
218     kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
219 
220     const size_t global_size = static_cast<size_t>(std::ceil(float(count) / block_size));
221     queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
222 }
223 
224 template<class Iterator, class Compare>
block_insertion_sort(Iterator first,Compare compare,const size_t count,const size_t block_size,command_queue & queue)225 inline void block_insertion_sort(Iterator first,
226                                  Compare compare,
227                                  const size_t count,
228                                  const size_t block_size,
229                                  command_queue &queue)
230 {
231     // dummy iterator as it's not sort by key
232     Iterator dummy;
233     block_insertion_sort(first, dummy, compare, count, block_size, false, queue);
234 }
235 
236 // This sort is stable.
237 template<class Iterator, class Compare>
merge_sort_on_cpu(Iterator first,Iterator last,Compare compare,command_queue & queue)238 inline void merge_sort_on_cpu(Iterator first,
239                               Iterator last,
240                               Compare compare,
241                               command_queue &queue)
242 {
243     typedef typename std::iterator_traits<Iterator>::value_type value_type;
244 
245     size_t count = iterator_range_size(first, last);
246     if(count < 2){
247         return;
248     }
249     // for small input size only insertion sort is performed
250     else if(count <= 512){
251         block_insertion_sort(first, compare, count, count, queue);
252         return;
253     }
254 
255     const context &context = queue.get_context();
256     const device &device = queue.get_device();
257 
258     // loading parameters
259     std::string cache_key =
260         std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>();
261     boost::shared_ptr<parameter_cache> parameters =
262         detail::parameter_cache::get_global_cache(device);
263 
264     // When there is merge_with_path_blocks_no_threshold or less blocks left to
265     // merge AND input size is merge_with_merge_path_input_size_threshold or more
266     // merge_with_merge_path() algorithm is used to merge sorted blocks;
267     // otherwise merge_blocks() is used.
268     const size_t merge_with_path_blocks_no_threshold =
269         parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8);
270     const size_t merge_with_path_input_size_threshold =
271         parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152);
272 
273     const size_t block_size =
274         parameters->get(cache_key, "insertion_sort_block_size", 64);
275     block_insertion_sort(first, compare, count, block_size, queue);
276 
277     // temporary buffer for merge result
278     vector<value_type> temp(count, context);
279     bool result_in_temporary_buffer = false;
280 
281     for(size_t i = block_size; i < count; i *= 2){
282         result_in_temporary_buffer = !result_in_temporary_buffer;
283         if(result_in_temporary_buffer) {
284             dispatch_merge_blocks(first, temp.begin(), compare, count, i,
285                                   merge_with_path_input_size_threshold,
286                                   merge_with_path_blocks_no_threshold,
287                                   queue);
288         } else {
289             dispatch_merge_blocks(temp.begin(), first, compare, count, i,
290                                   merge_with_path_input_size_threshold,
291                                   merge_with_path_blocks_no_threshold,
292                                   queue);
293         }
294     }
295 
296     if(result_in_temporary_buffer) {
297         copy(temp.begin(), temp.end(), first, queue);
298     }
299 }
300 
301 // This sort is stable.
302 template<class KeyIterator, class ValueIterator, class Compare>
merge_sort_by_key_on_cpu(KeyIterator keys_first,KeyIterator keys_last,ValueIterator values_first,Compare compare,command_queue & queue)303 inline void merge_sort_by_key_on_cpu(KeyIterator keys_first,
304                                      KeyIterator keys_last,
305                                      ValueIterator values_first,
306                                      Compare compare,
307                                      command_queue &queue)
308 {
309     typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
310     typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
311 
312     size_t count = iterator_range_size(keys_first, keys_last);
313     if(count < 2){
314         return;
315     }
316     // for small input size only insertion sort is performed
317     else if(count <= 512){
318         block_insertion_sort(keys_first, values_first, compare,
319                              count, count, true, queue);
320         return;
321     }
322 
323     const context &context = queue.get_context();
324     const device &device = queue.get_device();
325 
326     // loading parameters
327     std::string cache_key =
328         std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>()
329         + "_with_" + type_name<key_type>();
330     boost::shared_ptr<parameter_cache> parameters =
331         detail::parameter_cache::get_global_cache(device);
332 
333     const size_t block_size =
334         parameters->get(cache_key, "insertion_sort_by_key_block_size", 64);
335     block_insertion_sort(keys_first, values_first, compare,
336                          count, block_size, true, queue);
337 
338     // temporary buffer for merge results
339     vector<value_type> values_temp(count, context);
340     vector<key_type> keys_temp(count, context);
341     bool result_in_temporary_buffer = false;
342 
343     for(size_t i = block_size; i < count; i *= 2){
344         result_in_temporary_buffer = !result_in_temporary_buffer;
345         if(result_in_temporary_buffer) {
346             merge_blocks(keys_first, values_first,
347                          keys_temp.begin(), values_temp.begin(),
348                          compare, count, i, true, queue);
349         } else {
350             merge_blocks(keys_temp.begin(), values_temp.begin(),
351                          keys_first, values_first,
352                          compare, count, i, true, queue);
353         }
354     }
355 
356     if(result_in_temporary_buffer) {
357         copy(keys_temp.begin(), keys_temp.end(), keys_first, queue);
358         copy(values_temp.begin(), values_temp.end(), values_first, queue);
359     }
360 }
361 
362 } // end detail namespace
363 } // end compute namespace
364 } // end boost namespace
365 
366 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
367