1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10
11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
13
14 #include <iterator>
15
16 #include <boost/compute/command_queue.hpp>
17 #include <boost/compute/async/future.hpp>
18 #include <boost/compute/iterator/buffer_iterator.hpp>
19 #include <boost/compute/iterator/discard_iterator.hpp>
20 #include <boost/compute/memory/svm_ptr.hpp>
21 #include <boost/compute/detail/iterator_range_size.hpp>
22 #include <boost/compute/detail/meta_kernel.hpp>
23 #include <boost/compute/detail/parameter_cache.hpp>
24 #include <boost/compute/detail/work_size.hpp>
25 #include <boost/compute/detail/vendor.hpp>
26
27 namespace boost {
28 namespace compute {
29 namespace detail {
30
31 template<class InputIterator, class OutputIterator>
copy_on_device_cpu(InputIterator first,OutputIterator result,size_t count,command_queue & queue,const wait_list & events)32 inline event copy_on_device_cpu(InputIterator first,
33 OutputIterator result,
34 size_t count,
35 command_queue &queue,
36 const wait_list &events)
37 {
38 meta_kernel k("copy");
39 const device& device = queue.get_device();
40
41 k <<
42 "uint block = " <<
43 "(uint)ceil(((float)count)/get_global_size(0));\n" <<
44 "uint index = get_global_id(0) * block;\n" <<
45 "uint end = min(count, index + block);\n" <<
46 "while(index < end){\n" <<
47 result[k.var<uint_>("index")] << '=' <<
48 first[k.var<uint_>("index")] << ";\n" <<
49 "index++;\n" <<
50 "}\n";
51
52 k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
53
54 size_t global_work_size = device.compute_units();
55 if(count <= 1024) global_work_size = 1;
56 return k.exec_1d(queue, 0, global_work_size, events);
57 }
58
59 template<class InputIterator, class OutputIterator>
copy_on_device_gpu(InputIterator first,OutputIterator result,size_t count,command_queue & queue,const wait_list & events)60 inline event copy_on_device_gpu(InputIterator first,
61 OutputIterator result,
62 size_t count,
63 command_queue &queue,
64 const wait_list &events)
65 {
66 typedef typename std::iterator_traits<InputIterator>::value_type input_type;
67
68 const device& device = queue.get_device();
69 boost::shared_ptr<parameter_cache> parameters =
70 detail::parameter_cache::get_global_cache(device);
71 std::string cache_key =
72 "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type));
73
74 uint_ vpt = parameters->get(cache_key, "vpt", 4);
75 uint_ tpb = parameters->get(cache_key, "tpb", 128);
76
77 meta_kernel k("copy");
78 k <<
79 "uint index = get_local_id(0) + " <<
80 "(" << vpt * tpb << " * get_group_id(0));\n" <<
81 "for(uint i = 0; i < " << vpt << "; i++){\n" <<
82 " if(index < count){\n" <<
83 result[k.var<uint_>("index")] << '=' <<
84 first[k.var<uint_>("index")] << ";\n" <<
85 " index += " << tpb << ";\n"
86 " }\n"
87 "}\n";
88
89 k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
90 size_t global_work_size = calculate_work_size(count, vpt, tpb);
91 return k.exec_1d(queue, 0, global_work_size, tpb, events);
92 }
93
94 template<class InputIterator, class OutputIterator>
dispatch_copy_on_device(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events)95 inline event dispatch_copy_on_device(InputIterator first,
96 InputIterator last,
97 OutputIterator result,
98 command_queue &queue,
99 const wait_list &events)
100 {
101 const size_t count = detail::iterator_range_size(first, last);
102
103 if(count == 0){
104 // nothing to do
105 return event();
106 }
107
108 const device& device = queue.get_device();
109 // copy_on_device_cpu() does not work for CPU on Apple platform
110 // due to bug in its compiler.
111 // See https://github.com/boostorg/compute/pull/626
112 if((device.type() & device::cpu) && !is_apple_platform_device(device))
113 {
114 return copy_on_device_cpu(first, result, count, queue, events);
115 }
116 return copy_on_device_gpu(first, result, count, queue, events);
117 }
118
119 template<class InputIterator, class OutputIterator>
copy_on_device(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events)120 inline OutputIterator copy_on_device(InputIterator first,
121 InputIterator last,
122 OutputIterator result,
123 command_queue &queue,
124 const wait_list &events)
125 {
126 dispatch_copy_on_device(first, last, result, queue, events);
127 return result + std::distance(first, last);
128 }
129
130 template<class InputIterator>
copy_on_device(InputIterator first,InputIterator last,discard_iterator result,command_queue & queue,const wait_list & events)131 inline discard_iterator copy_on_device(InputIterator first,
132 InputIterator last,
133 discard_iterator result,
134 command_queue &queue,
135 const wait_list &events)
136 {
137 (void) queue;
138 (void) events;
139
140 return result + std::distance(first, last);
141 }
142
143 template<class InputIterator, class OutputIterator>
copy_on_device_async(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events)144 inline future<OutputIterator> copy_on_device_async(InputIterator first,
145 InputIterator last,
146 OutputIterator result,
147 command_queue &queue,
148 const wait_list &events)
149 {
150 event event_ = dispatch_copy_on_device(first, last, result, queue, events);
151 return make_future(result + std::distance(first, last), event_);
152 }
153
154 #ifdef BOOST_COMPUTE_CL_VERSION_2_0
155 // copy_on_device() specialization for svm_ptr
156 template<class T>
copy_on_device(svm_ptr<T> first,svm_ptr<T> last,svm_ptr<T> result,command_queue & queue,const wait_list & events)157 inline svm_ptr<T> copy_on_device(svm_ptr<T> first,
158 svm_ptr<T> last,
159 svm_ptr<T> result,
160 command_queue &queue,
161 const wait_list &events)
162 {
163 size_t count = iterator_range_size(first, last);
164 if(count == 0){
165 return result;
166 }
167
168 queue.enqueue_svm_memcpy(
169 result.get(), first.get(), count * sizeof(T), events
170 );
171
172 return result + count;
173 }
174
175 template<class T>
copy_on_device_async(svm_ptr<T> first,svm_ptr<T> last,svm_ptr<T> result,command_queue & queue,const wait_list & events)176 inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first,
177 svm_ptr<T> last,
178 svm_ptr<T> result,
179 command_queue &queue,
180 const wait_list &events)
181 {
182 size_t count = iterator_range_size(first, last);
183 if(count == 0){
184 return future<svm_ptr<T> >();
185 }
186
187 event event_ = queue.enqueue_svm_memcpy_async(
188 result.get(), first.get(), count * sizeof(T), events
189 );
190
191 return make_future(result + count, event_);
192 }
193 #endif // BOOST_COMPUTE_CL_VERSION_2_0
194
195 } // end detail namespace
196 } // end compute namespace
197 } // end boost namespace
198
199 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
200