• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10 
11 #ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP
12 #define BOOST_COMPUTE_ALGORITHM_COPY_HPP
13 
14 #include <algorithm>
15 #include <iterator>
16 
17 #include <boost/utility/enable_if.hpp>
18 
19 #include <boost/mpl/and.hpp>
20 #include <boost/mpl/not.hpp>
21 #include <boost/mpl/or.hpp>
22 
23 #include <boost/compute/buffer.hpp>
24 #include <boost/compute/system.hpp>
25 #include <boost/compute/command_queue.hpp>
26 #include <boost/compute/algorithm/detail/copy_on_device.hpp>
27 #include <boost/compute/algorithm/detail/copy_to_device.hpp>
28 #include <boost/compute/algorithm/detail/copy_to_host.hpp>
29 #include <boost/compute/async/future.hpp>
30 #include <boost/compute/container/mapped_view.hpp>
31 #include <boost/compute/detail/device_ptr.hpp>
32 #include <boost/compute/detail/is_contiguous_iterator.hpp>
33 #include <boost/compute/detail/iterator_range_size.hpp>
34 #include <boost/compute/detail/parameter_cache.hpp>
35 #include <boost/compute/iterator/buffer_iterator.hpp>
36 #include <boost/compute/type_traits/type_name.hpp>
37 #include <boost/compute/type_traits/is_device_iterator.hpp>
38 
39 namespace boost {
40 namespace compute {
41 namespace detail {
42 
43 namespace mpl = boost::mpl;
44 
45 // meta-function returning true if copy() between InputIterator and
46 // OutputIterator can be implemented with clEnqueueCopyBuffer().
47 template<class InputIterator, class OutputIterator>
48 struct can_copy_with_copy_buffer :
49     mpl::and_<
50         mpl::or_<
51             boost::is_same<
52                 InputIterator,
53                 buffer_iterator<typename InputIterator::value_type>
54             >,
55             boost::is_same<
56                 InputIterator,
57                 detail::device_ptr<typename InputIterator::value_type>
58             >
59         >,
60         mpl::or_<
61             boost::is_same<
62                 OutputIterator,
63                 buffer_iterator<typename OutputIterator::value_type>
64             >,
65             boost::is_same<
66                 OutputIterator,
67                 detail::device_ptr<typename OutputIterator::value_type>
68             >
69         >,
70         boost::is_same<
71             typename InputIterator::value_type,
72             typename OutputIterator::value_type
73         >
74     >::type {};
75 
76 // meta-function returning true if value_types of HostIterator and
77 // DeviceIterator are same
78 template<class HostIterator, class DeviceIterator>
79 struct is_same_value_type :
80     boost::is_same<
81         typename boost::remove_cv<
82             typename std::iterator_traits<HostIterator>::value_type
83         >::type,
84         typename boost::remove_cv<
85             typename DeviceIterator::value_type
86         >::type
87     >::type {};
88 
89 // meta-function returning true if value_type of HostIterator is bool
90 template<class HostIterator>
91 struct is_bool_value_type :
92     boost::is_same<
93         typename boost::remove_cv<
94             typename std::iterator_traits<HostIterator>::value_type
95         >::type,
96         bool
97     >::type {};
98 
99 // host -> device (async)
100 template<class InputIterator, class OutputIterator>
101 inline future<OutputIterator>
dispatch_copy_async(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<mpl::not_<is_device_iterator<InputIterator>>,is_device_iterator<OutputIterator>,is_same_value_type<InputIterator,OutputIterator>>>::type * =0)102 dispatch_copy_async(InputIterator first,
103                     InputIterator last,
104                     OutputIterator result,
105                     command_queue &queue,
106                     const wait_list &events,
107                     typename boost::enable_if<
108                         mpl::and_<
109                             mpl::not_<
110                                 is_device_iterator<InputIterator>
111                             >,
112                             is_device_iterator<OutputIterator>,
113                             is_same_value_type<InputIterator, OutputIterator>
114                         >
115                     >::type* = 0)
116 {
117     BOOST_STATIC_ASSERT_MSG(
118         is_contiguous_iterator<InputIterator>::value,
119         "copy_async() is only supported for contiguous host iterators"
120     );
121 
122     return copy_to_device_async(first, last, result, queue, events);
123 }
124 
125 // host -> device (async)
126 // Type mismatch between InputIterator and OutputIterator value_types
127 template<class InputIterator, class OutputIterator>
128 inline future<OutputIterator>
dispatch_copy_async(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<mpl::not_<is_device_iterator<InputIterator>>,is_device_iterator<OutputIterator>,mpl::not_<is_same_value_type<InputIterator,OutputIterator>>>>::type * =0)129 dispatch_copy_async(InputIterator first,
130                     InputIterator last,
131                     OutputIterator result,
132                     command_queue &queue,
133                     const wait_list &events,
134                     typename boost::enable_if<
135                         mpl::and_<
136                             mpl::not_<
137                                 is_device_iterator<InputIterator>
138                             >,
139                             is_device_iterator<OutputIterator>,
140                             mpl::not_<
141                                 is_same_value_type<InputIterator, OutputIterator>
142                             >
143                         >
144                     >::type* = 0)
145 {
146     BOOST_STATIC_ASSERT_MSG(
147         is_contiguous_iterator<InputIterator>::value,
148         "copy_async() is only supported for contiguous host iterators"
149     );
150 
151     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
152 
153     const context &context = queue.get_context();
154     size_t count = iterator_range_size(first, last);
155 
156     if(count < size_t(1)) {
157         return future<OutputIterator>();
158     }
159 
160     // map [first; last) to device and run copy kernel
161     // on device for copying & casting
162     ::boost::compute::mapped_view<input_type> mapped_host(
163         // make sure it's a pointer to constant data
164         // to force read only mapping
165         const_cast<const input_type*>(
166             ::boost::addressof(*first)
167         ),
168         count,
169         context
170     );
171     return copy_on_device_async(
172         mapped_host.begin(), mapped_host.end(), result, queue, events
173     );
174 }
175 
176 // host -> device
177 // InputIterator is a contiguous iterator
178 template<class InputIterator, class OutputIterator>
179 inline OutputIterator
dispatch_copy(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<mpl::not_<is_device_iterator<InputIterator>>,is_device_iterator<OutputIterator>,is_same_value_type<InputIterator,OutputIterator>,is_contiguous_iterator<InputIterator>>>::type * =0)180 dispatch_copy(InputIterator first,
181               InputIterator last,
182               OutputIterator result,
183               command_queue &queue,
184               const wait_list &events,
185               typename boost::enable_if<
186                   mpl::and_<
187                       mpl::not_<
188                           is_device_iterator<InputIterator>
189                       >,
190                       is_device_iterator<OutputIterator>,
191                       is_same_value_type<InputIterator, OutputIterator>,
192                       is_contiguous_iterator<InputIterator>
193                   >
194               >::type* = 0)
195 {
196     return copy_to_device(first, last, result, queue, events);
197 }
198 
199 // host -> device
200 // Type mismatch between InputIterator and OutputIterator value_types
201 // InputIterator is a contiguous iterator
202 template<class InputIterator, class OutputIterator>
203 inline OutputIterator
dispatch_copy(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<mpl::not_<is_device_iterator<InputIterator>>,is_device_iterator<OutputIterator>,mpl::not_<is_same_value_type<InputIterator,OutputIterator>>,is_contiguous_iterator<InputIterator>>>::type * =0)204 dispatch_copy(InputIterator first,
205               InputIterator last,
206               OutputIterator result,
207               command_queue &queue,
208               const wait_list &events,
209               typename boost::enable_if<
210                   mpl::and_<
211                       mpl::not_<
212                           is_device_iterator<InputIterator>
213                       >,
214                       is_device_iterator<OutputIterator>,
215                       mpl::not_<
216                           is_same_value_type<InputIterator, OutputIterator>
217                       >,
218                       is_contiguous_iterator<InputIterator>
219                   >
220               >::type* = 0)
221 {
222     typedef typename OutputIterator::value_type output_type;
223     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
224 
225     const device &device = queue.get_device();
226 
227     // loading parameters
228     std::string cache_key =
229         std::string("__boost_compute_copy_to_device_")
230             + type_name<input_type>() + "_" + type_name<output_type>();
231     boost::shared_ptr<parameter_cache> parameters =
232         detail::parameter_cache::get_global_cache(device);
233 
234     uint_ map_copy_threshold;
235     uint_ direct_copy_threshold;
236 
237     // calculate default values of thresholds
238     if (device.type() & device::gpu) {
239         // GPUs
240         map_copy_threshold = 524288;  // 0.5 MB
241         direct_copy_threshold = 52428800; // 50 MB
242     }
243     else {
244         // CPUs and other devices
245         map_copy_threshold = 134217728; // 128 MB
246         direct_copy_threshold = 0; // it's never efficient for CPUs
247     }
248 
249     // load thresholds
250     map_copy_threshold =
251         parameters->get(
252             cache_key, "map_copy_threshold", map_copy_threshold
253         );
254     direct_copy_threshold =
255         parameters->get(
256             cache_key, "direct_copy_threshold", direct_copy_threshold
257         );
258 
259     // select copy method based on thresholds & input_size_bytes
260     size_t count = iterator_range_size(first, last);
261     size_t input_size_bytes = count * sizeof(input_type);
262 
263     // [0; map_copy_threshold) -> copy_to_device_map()
264     if(input_size_bytes < map_copy_threshold) {
265         return copy_to_device_map(first, last, result, queue, events);
266     }
267     // [map_copy_threshold; direct_copy_threshold) -> convert [first; last)
268     //     on host and then perform copy_to_device()
269     else if(input_size_bytes < direct_copy_threshold) {
270         std::vector<output_type> vector(first, last);
271         return copy_to_device(
272             vector.begin(), vector.end(), result, queue, events
273         );
274     }
275 
276     // [direct_copy_threshold; inf) -> map [first; last) to device and
277     //     run copy kernel on device for copying & casting
278     // At this point we are sure that count > 1 (first != last).
279 
280     // Perform async copy to device, wait for it to be finished and
281     // return the result.
282     // At this point we are sure that count > 1 (first != last), so event
283     // returned by dispatch_copy_async() must be valid.
284     return dispatch_copy_async(first, last, result, queue, events).get();
285 }
286 
287 // host -> device
288 // InputIterator is NOT a contiguous iterator
289 template<class InputIterator, class OutputIterator>
290 inline OutputIterator
dispatch_copy(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<mpl::not_<is_device_iterator<InputIterator>>,is_device_iterator<OutputIterator>,mpl::not_<is_contiguous_iterator<InputIterator>>>>::type * =0)291 dispatch_copy(InputIterator first,
292               InputIterator last,
293               OutputIterator result,
294               command_queue &queue,
295               const wait_list &events,
296               typename boost::enable_if<
297                   mpl::and_<
298                       mpl::not_<
299                           is_device_iterator<InputIterator>
300                       >,
301                       is_device_iterator<OutputIterator>,
302                       mpl::not_<
303                           is_contiguous_iterator<InputIterator>
304                       >
305                   >
306               >::type* = 0)
307 {
308     typedef typename OutputIterator::value_type output_type;
309     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
310 
311     const device &device = queue.get_device();
312 
313     // loading parameters
314     std::string cache_key =
315         std::string("__boost_compute_copy_to_device_")
316             + type_name<input_type>() + "_" + type_name<output_type>();
317     boost::shared_ptr<parameter_cache> parameters =
318         detail::parameter_cache::get_global_cache(device);
319 
320     uint_ map_copy_threshold;
321     uint_ direct_copy_threshold;
322 
323     // calculate default values of thresholds
324     if (device.type() & device::gpu) {
325         // GPUs
326         map_copy_threshold = 524288;  // 0.5 MB
327         direct_copy_threshold = 52428800; // 50 MB
328     }
329     else {
330         // CPUs and other devices
331         map_copy_threshold = 134217728; // 128 MB
332         direct_copy_threshold = 0; // it's never efficient for CPUs
333     }
334 
335     // load thresholds
336     map_copy_threshold =
337         parameters->get(
338             cache_key, "map_copy_threshold", map_copy_threshold
339         );
340     direct_copy_threshold =
341         parameters->get(
342             cache_key, "direct_copy_threshold", direct_copy_threshold
343         );
344 
345     // select copy method based on thresholds & input_size_bytes
346     size_t input_size = iterator_range_size(first, last);
347     size_t input_size_bytes = input_size * sizeof(input_type);
348 
349     // [0; map_copy_threshold) -> copy_to_device_map()
350     //
351     // if direct_copy_threshold is less than map_copy_threshold
352     // copy_to_device_map() is used for every input
353     if(input_size_bytes < map_copy_threshold
354         || direct_copy_threshold <= map_copy_threshold) {
355         return copy_to_device_map(first, last, result, queue, events);
356     }
357     // [map_copy_threshold; inf) -> convert [first; last)
358     //     on host and then perform copy_to_device()
359     std::vector<output_type> vector(first, last);
360     return copy_to_device(vector.begin(), vector.end(), result, queue, events);
361 }
362 
363 // device -> host (async)
364 template<class InputIterator, class OutputIterator>
365 inline future<OutputIterator>
dispatch_copy_async(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<is_device_iterator<InputIterator>,mpl::not_<is_device_iterator<OutputIterator>>,is_same_value_type<OutputIterator,InputIterator>>>::type * =0)366 dispatch_copy_async(InputIterator first,
367                     InputIterator last,
368                     OutputIterator result,
369                     command_queue &queue,
370                     const wait_list &events,
371                     typename boost::enable_if<
372                         mpl::and_<
373                             is_device_iterator<InputIterator>,
374                             mpl::not_<
375                                 is_device_iterator<OutputIterator>
376                             >,
377                             is_same_value_type<OutputIterator, InputIterator>
378                         >
379                     >::type* = 0)
380 {
381     BOOST_STATIC_ASSERT_MSG(
382         is_contiguous_iterator<OutputIterator>::value,
383         "copy_async() is only supported for contiguous host iterators"
384     );
385 
386     return copy_to_host_async(first, last, result, queue, events);
387 }
388 
389 // device -> host (async)
390 // Type mismatch between InputIterator and OutputIterator value_types
391 template<class InputIterator, class OutputIterator>
392 inline future<OutputIterator>
dispatch_copy_async(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<is_device_iterator<InputIterator>,mpl::not_<is_device_iterator<OutputIterator>>,mpl::not_<is_same_value_type<OutputIterator,InputIterator>>>>::type * =0)393 dispatch_copy_async(InputIterator first,
394                     InputIterator last,
395                     OutputIterator result,
396                     command_queue &queue,
397                     const wait_list &events,
398                     typename boost::enable_if<
399                         mpl::and_<
400                             is_device_iterator<InputIterator>,
401                             mpl::not_<
402                                 is_device_iterator<OutputIterator>
403                             >,
404                             mpl::not_<
405                                 is_same_value_type<OutputIterator, InputIterator>
406                             >
407                         >
408                     >::type* = 0)
409 {
410     BOOST_STATIC_ASSERT_MSG(
411         is_contiguous_iterator<OutputIterator>::value,
412         "copy_async() is only supported for contiguous host iterators"
413     );
414 
415     typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
416     const context &context = queue.get_context();
417     size_t count = iterator_range_size(first, last);
418 
419     if(count < size_t(1)) {
420         return future<OutputIterator>();
421     }
422 
423     // map host memory to device
424     buffer mapped_host(
425         context,
426         count * sizeof(output_type),
427         buffer::write_only | buffer::use_host_ptr,
428         static_cast<void*>(
429             ::boost::addressof(*result)
430         )
431     );
432     // copy async on device
433     ::boost::compute::future<buffer_iterator<output_type> > future =
434         copy_on_device_async(
435             first,
436             last,
437             make_buffer_iterator<output_type>(mapped_host),
438             queue,
439             events
440         );
441     // update host memory asynchronously by maping and unmaping memory
442     event map_event;
443     void* ptr = queue.enqueue_map_buffer_async(
444         mapped_host,
445         CL_MAP_READ,
446         0,
447         count * sizeof(output_type),
448         map_event,
449         future.get_event()
450     );
451     event unmap_event =
452         queue.enqueue_unmap_buffer(mapped_host, ptr, map_event);
453     return make_future(result + count, unmap_event);
454 }
455 
456 // device -> host
457 // OutputIterator is a contiguous iterator
458 template<class InputIterator, class OutputIterator>
459 inline OutputIterator
dispatch_copy(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<is_device_iterator<InputIterator>,mpl::not_<is_device_iterator<OutputIterator>>,is_same_value_type<OutputIterator,InputIterator>,is_contiguous_iterator<OutputIterator>,mpl::not_<is_bool_value_type<OutputIterator>>>>::type * =0)460 dispatch_copy(InputIterator first,
461               InputIterator last,
462               OutputIterator result,
463               command_queue &queue,
464               const wait_list &events,
465               typename boost::enable_if<
466                   mpl::and_<
467                       is_device_iterator<InputIterator>,
468                       mpl::not_<
469                           is_device_iterator<OutputIterator>
470                       >,
471                       is_same_value_type<OutputIterator, InputIterator>,
472                       is_contiguous_iterator<OutputIterator>,
473                       mpl::not_<
474                           is_bool_value_type<OutputIterator>
475                       >
476                   >
477               >::type* = 0)
478 {
479     return copy_to_host(first, last, result, queue, events);
480 }
481 
482 // device -> host
483 // Type mismatch between InputIterator and OutputIterator value_types
484 // OutputIterator is NOT a contiguous iterator or value_type of OutputIterator
485 // is a boolean type.
486 template<class InputIterator, class OutputIterator>
487 inline OutputIterator
dispatch_copy(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<is_device_iterator<InputIterator>,mpl::not_<is_device_iterator<OutputIterator>>,mpl::or_<mpl::not_<is_contiguous_iterator<OutputIterator>>,is_bool_value_type<OutputIterator>>>>::type * =0)488 dispatch_copy(InputIterator first,
489               InputIterator last,
490               OutputIterator result,
491               command_queue &queue,
492               const wait_list &events,
493               typename boost::enable_if<
494                   mpl::and_<
495                       is_device_iterator<InputIterator>,
496                       mpl::not_<
497                           is_device_iterator<OutputIterator>
498                       >,
499                       mpl::or_<
500                           mpl::not_<
501                               is_contiguous_iterator<OutputIterator>
502                           >,
503                           is_bool_value_type<OutputIterator>
504                       >
505                   >
506               >::type* = 0)
507 {
508     typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
509     typedef typename InputIterator::value_type input_type;
510 
511     const device &device = queue.get_device();
512 
513     // loading parameters
514     std::string cache_key =
515         std::string("__boost_compute_copy_to_host_")
516             + type_name<input_type>() + "_" + type_name<output_type>();
517     boost::shared_ptr<parameter_cache> parameters =
518         detail::parameter_cache::get_global_cache(device);
519 
520     uint_ map_copy_threshold;
521     uint_ direct_copy_threshold;
522 
523     // calculate default values of thresholds
524     if (device.type() & device::gpu) {
525         // GPUs
526         map_copy_threshold = 33554432;  // 30 MB
527         direct_copy_threshold = 0; // it's never efficient for GPUs
528     }
529     else {
530         // CPUs and other devices
531         map_copy_threshold = 134217728; // 128 MB
532         direct_copy_threshold = 0; // it's never efficient for CPUs
533     }
534 
535     // load thresholds
536     map_copy_threshold =
537         parameters->get(
538             cache_key, "map_copy_threshold", map_copy_threshold
539         );
540     direct_copy_threshold =
541         parameters->get(
542             cache_key, "direct_copy_threshold", direct_copy_threshold
543         );
544 
545     // select copy method based on thresholds & input_size_bytes
546     size_t count = iterator_range_size(first, last);
547     size_t input_size_bytes = count * sizeof(input_type);
548 
549     // [0; map_copy_threshold) -> copy_to_host_map()
550     //
551     // if direct_copy_threshold is less than map_copy_threshold
552     // copy_to_host_map() is used for every input
553     if(input_size_bytes < map_copy_threshold
554         || direct_copy_threshold <= map_copy_threshold) {
555         return copy_to_host_map(first, last, result, queue, events);
556     }
557     // [map_copy_threshold; inf) -> copy [first;last) to temporary vector
558     //     then copy (and convert) to result using std::copy()
559     std::vector<input_type> vector(count);
560     copy_to_host(first, last, vector.begin(), queue, events);
561     return std::copy(vector.begin(), vector.end(), result);
562 }
563 
564 // device -> host
565 // Type mismatch between InputIterator and OutputIterator value_types
566 // OutputIterator is a contiguous iterator
567 // value_type of OutputIterator is NOT a boolean type
568 template<class InputIterator, class OutputIterator>
569 inline OutputIterator
dispatch_copy(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<is_device_iterator<InputIterator>,mpl::not_<is_device_iterator<OutputIterator>>,mpl::not_<is_same_value_type<OutputIterator,InputIterator>>,is_contiguous_iterator<OutputIterator>,mpl::not_<is_bool_value_type<OutputIterator>>>>::type * =0)570 dispatch_copy(InputIterator first,
571               InputIterator last,
572               OutputIterator result,
573               command_queue &queue,
574               const wait_list &events,
575               typename boost::enable_if<
576                   mpl::and_<
577                       is_device_iterator<InputIterator>,
578                       mpl::not_<
579                           is_device_iterator<OutputIterator>
580                       >,
581                       mpl::not_<
582                           is_same_value_type<OutputIterator, InputIterator>
583                       >,
584                       is_contiguous_iterator<OutputIterator>,
585                       mpl::not_<
586                           is_bool_value_type<OutputIterator>
587                       >
588                   >
589               >::type* = 0)
590 {
591     typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
592     typedef typename InputIterator::value_type input_type;
593 
594     const device &device = queue.get_device();
595 
596     // loading parameters
597     std::string cache_key =
598         std::string("__boost_compute_copy_to_host_")
599             + type_name<input_type>() + "_" + type_name<output_type>();
600     boost::shared_ptr<parameter_cache> parameters =
601         detail::parameter_cache::get_global_cache(device);
602 
603     uint_ map_copy_threshold;
604     uint_ direct_copy_threshold;
605 
606     // calculate default values of thresholds
607     if (device.type() & device::gpu) {
608         // GPUs
609         map_copy_threshold = 524288;  // 0.5 MB
610         direct_copy_threshold = 52428800; // 50 MB
611     }
612     else {
613         // CPUs and other devices
614         map_copy_threshold = 134217728; // 128 MB
615         direct_copy_threshold = 0; // it's never efficient for CPUs
616     }
617 
618     // load thresholds
619     map_copy_threshold =
620         parameters->get(
621             cache_key, "map_copy_threshold", map_copy_threshold
622         );
623     direct_copy_threshold =
624         parameters->get(
625             cache_key, "direct_copy_threshold", direct_copy_threshold
626         );
627 
628     // select copy method based on thresholds & input_size_bytes
629     size_t count = iterator_range_size(first, last);
630     size_t input_size_bytes = count * sizeof(input_type);
631 
632     // [0; map_copy_threshold) -> copy_to_host_map()
633     if(input_size_bytes < map_copy_threshold) {
634         return copy_to_host_map(first, last, result, queue, events);
635     }
636     // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to
637     //     temporary vector then copy (and convert) to result using std::copy()
638     else if(input_size_bytes < direct_copy_threshold) {
639         std::vector<input_type> vector(count);
640         copy_to_host(first, last, vector.begin(), queue, events);
641         return std::copy(vector.begin(), vector.end(), result);
642     }
643 
644     // [direct_copy_threshold; inf) -> map [result; result + input_size) to
645     //     device and run copy kernel on device for copying & casting
646     // map host memory to device.
647 
648     // Perform async copy to host, wait for it to be finished and
649     // return the result.
650     // At this point we are sure that count > 1 (first != last), so event
651     // returned by dispatch_copy_async() must be valid.
652     return dispatch_copy_async(first, last, result, queue, events).get();
653 }
654 
655 // device -> device
656 template<class InputIterator, class OutputIterator>
657 inline OutputIterator
dispatch_copy(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<is_device_iterator<InputIterator>,is_device_iterator<OutputIterator>,mpl::not_<can_copy_with_copy_buffer<InputIterator,OutputIterator>>>>::type * =0)658 dispatch_copy(InputIterator first,
659               InputIterator last,
660               OutputIterator result,
661               command_queue &queue,
662               const wait_list &events,
663               typename boost::enable_if<
664                   mpl::and_<
665                       is_device_iterator<InputIterator>,
666                       is_device_iterator<OutputIterator>,
667                       mpl::not_<
668                           can_copy_with_copy_buffer<
669                               InputIterator, OutputIterator
670                           >
671                       >
672                   >
673               >::type* = 0)
674 {
675     return copy_on_device(first, last, result, queue, events);
676 }
677 
678 // device -> device (specialization for buffer iterators)
679 template<class InputIterator, class OutputIterator>
680 inline OutputIterator
dispatch_copy(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<is_device_iterator<InputIterator>,is_device_iterator<OutputIterator>,can_copy_with_copy_buffer<InputIterator,OutputIterator>>>::type * =0)681 dispatch_copy(InputIterator first,
682               InputIterator last,
683               OutputIterator result,
684               command_queue &queue,
685               const wait_list &events,
686               typename boost::enable_if<
687                   mpl::and_<
688                       is_device_iterator<InputIterator>,
689                       is_device_iterator<OutputIterator>,
690                       can_copy_with_copy_buffer<
691                           InputIterator, OutputIterator
692                       >
693                   >
694               >::type* = 0)
695 {
696     typedef typename std::iterator_traits<InputIterator>::value_type value_type;
697     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
698 
699     difference_type n = std::distance(first, last);
700     if(n < 1){
701         // nothing to copy
702         return result;
703     }
704 
705     queue.enqueue_copy_buffer(first.get_buffer(),
706                               result.get_buffer(),
707                               first.get_index() * sizeof(value_type),
708                               result.get_index() * sizeof(value_type),
709                               static_cast<size_t>(n) * sizeof(value_type),
710                               events);
711     return result + n;
712 }
713 
714 // device -> device (async)
715 template<class InputIterator, class OutputIterator>
716 inline future<OutputIterator>
dispatch_copy_async(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<is_device_iterator<InputIterator>,is_device_iterator<OutputIterator>,mpl::not_<can_copy_with_copy_buffer<InputIterator,OutputIterator>>>>::type * =0)717 dispatch_copy_async(InputIterator first,
718                     InputIterator last,
719                     OutputIterator result,
720                     command_queue &queue,
721                     const wait_list &events,
722                     typename boost::enable_if<
723                         mpl::and_<
724                             is_device_iterator<InputIterator>,
725                             is_device_iterator<OutputIterator>,
726                             mpl::not_<
727                                 can_copy_with_copy_buffer<
728                                     InputIterator, OutputIterator
729                                 >
730                             >
731                         >
732                     >::type* = 0)
733 {
734     return copy_on_device_async(first, last, result, queue, events);
735 }
736 
737 // device -> device (async, specialization for buffer iterators)
738 template<class InputIterator, class OutputIterator>
739 inline future<OutputIterator>
dispatch_copy_async(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if<mpl::and_<is_device_iterator<InputIterator>,is_device_iterator<OutputIterator>,can_copy_with_copy_buffer<InputIterator,OutputIterator>>>::type * =0)740 dispatch_copy_async(InputIterator first,
741                     InputIterator last,
742                     OutputIterator result,
743                     command_queue &queue,
744                     const wait_list &events,
745                     typename boost::enable_if<
746                         mpl::and_<
747                             is_device_iterator<InputIterator>,
748                             is_device_iterator<OutputIterator>,
749                             can_copy_with_copy_buffer<
750                                 InputIterator, OutputIterator
751                             >
752                         >
753                     >::type* = 0)
754 {
755     typedef typename std::iterator_traits<InputIterator>::value_type value_type;
756     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
757 
758     difference_type n = std::distance(first, last);
759     if(n < 1){
760         // nothing to copy
761         return make_future(result, event());
762     }
763 
764     event event_ =
765         queue.enqueue_copy_buffer(
766             first.get_buffer(),
767             result.get_buffer(),
768             first.get_index() * sizeof(value_type),
769             result.get_index() * sizeof(value_type),
770             static_cast<size_t>(n) * sizeof(value_type),
771             events
772         );
773 
774     return make_future(result + n, event_);
775 }
776 
777 // host -> host
778 template<class InputIterator, class OutputIterator>
779 inline OutputIterator
dispatch_copy(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events,typename boost::enable_if_c<!is_device_iterator<InputIterator>::value &&!is_device_iterator<OutputIterator>::value>::type * =0)780 dispatch_copy(InputIterator first,
781               InputIterator last,
782               OutputIterator result,
783               command_queue &queue,
784               const wait_list &events,
785               typename boost::enable_if_c<
786                   !is_device_iterator<InputIterator>::value &&
787                   !is_device_iterator<OutputIterator>::value
788               >::type* = 0)
789 {
790     (void) queue;
791     (void) events;
792 
793     return std::copy(first, last, result);
794 }
795 
796 } // end detail namespace
797 
798 /// Copies the values in the range [\p first, \p last) to the range
799 /// beginning at \p result.
800 ///
801 /// The generic copy() function can be used for a variety of data
802 /// transfer tasks and provides a standard interface to the following
803 /// OpenCL functions:
804 ///
805 /// \li \c clEnqueueReadBuffer()
806 /// \li \c clEnqueueWriteBuffer()
807 /// \li \c clEnqueueCopyBuffer()
808 ///
809 /// Unlike the aforementioned OpenCL functions, copy() will also work
810 /// with non-contiguous data-structures (e.g. \c std::list<T>) as
811 /// well as with "fancy" iterators (e.g. transform_iterator).
812 ///
813 /// \param first first element in the range to copy
814 /// \param last last element in the range to copy
815 /// \param result first element in the result range
816 /// \param queue command queue to perform the operation
817 ///
818 /// \return \c OutputIterator to the end of the result range
819 ///
820 /// For example, to copy an array of \c int values on the host to a vector on
821 /// the device:
822 /// \code
823 /// // array on the host
824 /// int data[] = { 1, 2, 3, 4 };
825 ///
826 /// // vector on the device
827 /// boost::compute::vector<int> vec(4, context);
828 ///
829 /// // copy values to the device vector
830 /// boost::compute::copy(data, data + 4, vec.begin(), queue);
831 /// \endcode
832 ///
833 /// The copy algorithm can also be used with standard containers such as
834 /// \c std::vector<T>:
835 /// \code
836 /// std::vector<int> host_vector = ...
837 /// boost::compute::vector<int> device_vector = ...
838 ///
839 /// // copy from the host to the device
840 /// boost::compute::copy(
841 ///     host_vector.begin(), host_vector.end(), device_vector.begin(), queue
842 /// );
843 ///
844 /// // copy from the device to the host
845 /// boost::compute::copy(
846 ///     device_vector.begin(), device_vector.end(), host_vector.begin(), queue
847 /// );
848 /// \endcode
849 ///
850 /// Space complexity: \Omega(1)
851 ///
852 /// \see copy_n(), copy_if(), copy_async()
853 template<class InputIterator, class OutputIterator>
copy(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue=system::default_queue (),const wait_list & events=wait_list ())854 inline OutputIterator copy(InputIterator first,
855                            InputIterator last,
856                            OutputIterator result,
857                            command_queue &queue = system::default_queue(),
858                            const wait_list &events = wait_list())
859 {
860     return detail::dispatch_copy(first, last, result, queue, events);
861 }
862 
863 /// Copies the values in the range [\p first, \p last) to the range
864 /// beginning at \p result. The copy is performed asynchronously.
865 ///
866 /// \see copy()
867 template<class InputIterator, class OutputIterator>
868 inline future<OutputIterator>
copy_async(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue=system::default_queue (),const wait_list & events=wait_list ())869 copy_async(InputIterator first,
870            InputIterator last,
871            OutputIterator result,
872            command_queue &queue = system::default_queue(),
873            const wait_list &events = wait_list())
874 {
875     return detail::dispatch_copy_async(first, last, result, queue, events);
876 }
877 
878 } // end compute namespace
879 } // end boost namespace
880 
881 #endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP
882