• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10 
11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
13 
14 #include <iterator>
15 
16 #include <boost/compute/command_queue.hpp>
17 #include <boost/compute/async/future.hpp>
18 #include <boost/compute/iterator/buffer_iterator.hpp>
19 #include <boost/compute/iterator/discard_iterator.hpp>
20 #include <boost/compute/memory/svm_ptr.hpp>
21 #include <boost/compute/detail/iterator_range_size.hpp>
22 #include <boost/compute/detail/meta_kernel.hpp>
23 #include <boost/compute/detail/parameter_cache.hpp>
24 #include <boost/compute/detail/work_size.hpp>
25 #include <boost/compute/detail/vendor.hpp>
26 
27 namespace boost {
28 namespace compute {
29 namespace detail {
30 
31 template<class InputIterator, class OutputIterator>
copy_on_device_cpu(InputIterator first,OutputIterator result,size_t count,command_queue & queue,const wait_list & events)32 inline event copy_on_device_cpu(InputIterator first,
33                                 OutputIterator result,
34                                 size_t count,
35                                 command_queue &queue,
36                                 const wait_list &events)
37 {
38     meta_kernel k("copy");
39     const device& device = queue.get_device();
40 
41     k <<
42         "uint block = " <<
43             "(uint)ceil(((float)count)/get_global_size(0));\n" <<
44         "uint index = get_global_id(0) * block;\n" <<
45         "uint end = min(count, index + block);\n" <<
46         "while(index < end){\n" <<
47             result[k.var<uint_>("index")] << '=' <<
48                 first[k.var<uint_>("index")] << ";\n" <<
49             "index++;\n" <<
50         "}\n";
51 
52     k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
53 
54     size_t global_work_size = device.compute_units();
55     if(count <= 1024) global_work_size = 1;
56     return k.exec_1d(queue, 0, global_work_size, events);
57 }
58 
59 template<class InputIterator, class OutputIterator>
copy_on_device_gpu(InputIterator first,OutputIterator result,size_t count,command_queue & queue,const wait_list & events)60 inline event copy_on_device_gpu(InputIterator first,
61                                 OutputIterator result,
62                                 size_t count,
63                                 command_queue &queue,
64                                 const wait_list &events)
65 {
66     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
67 
68     const device& device = queue.get_device();
69     boost::shared_ptr<parameter_cache> parameters =
70         detail::parameter_cache::get_global_cache(device);
71     std::string cache_key =
72         "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type));
73 
74     uint_ vpt = parameters->get(cache_key, "vpt", 4);
75     uint_ tpb = parameters->get(cache_key, "tpb", 128);
76 
77     meta_kernel k("copy");
78     k <<
79         "uint index = get_local_id(0) + " <<
80             "(" << vpt * tpb << " * get_group_id(0));\n" <<
81         "for(uint i = 0; i < " << vpt << "; i++){\n" <<
82         "    if(index < count){\n" <<
83                 result[k.var<uint_>("index")] << '=' <<
84                     first[k.var<uint_>("index")] << ";\n" <<
85         "       index += " << tpb << ";\n"
86         "    }\n"
87         "}\n";
88 
89     k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
90     size_t global_work_size = calculate_work_size(count, vpt, tpb);
91     return k.exec_1d(queue, 0, global_work_size, tpb, events);
92 }
93 
94 template<class InputIterator, class OutputIterator>
dispatch_copy_on_device(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events)95 inline event dispatch_copy_on_device(InputIterator first,
96                                      InputIterator last,
97                                      OutputIterator result,
98                                      command_queue &queue,
99                                      const wait_list &events)
100 {
101     const size_t count = detail::iterator_range_size(first, last);
102 
103     if(count == 0){
104         // nothing to do
105         return event();
106     }
107 
108     const device& device = queue.get_device();
109     // copy_on_device_cpu() does not work for CPU on Apple platform
110     // due to bug in its compiler.
111     // See https://github.com/boostorg/compute/pull/626
112     if((device.type() & device::cpu) && !is_apple_platform_device(device))
113     {
114         return copy_on_device_cpu(first, result, count, queue, events);
115     }
116     return copy_on_device_gpu(first, result, count, queue, events);
117 }
118 
119 template<class InputIterator, class OutputIterator>
copy_on_device(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events)120 inline OutputIterator copy_on_device(InputIterator first,
121                                      InputIterator last,
122                                      OutputIterator result,
123                                      command_queue &queue,
124                                      const wait_list &events)
125 {
126     dispatch_copy_on_device(first, last, result, queue, events);
127     return result + std::distance(first, last);
128 }
129 
130 template<class InputIterator>
copy_on_device(InputIterator first,InputIterator last,discard_iterator result,command_queue & queue,const wait_list & events)131 inline discard_iterator copy_on_device(InputIterator first,
132                                        InputIterator last,
133                                        discard_iterator result,
134                                        command_queue &queue,
135                                        const wait_list &events)
136 {
137     (void) queue;
138     (void) events;
139 
140     return result + std::distance(first, last);
141 }
142 
143 template<class InputIterator, class OutputIterator>
copy_on_device_async(InputIterator first,InputIterator last,OutputIterator result,command_queue & queue,const wait_list & events)144 inline future<OutputIterator> copy_on_device_async(InputIterator first,
145                                                    InputIterator last,
146                                                    OutputIterator result,
147                                                    command_queue &queue,
148                                                    const wait_list &events)
149 {
150     event event_ = dispatch_copy_on_device(first, last, result, queue, events);
151     return make_future(result + std::distance(first, last), event_);
152 }
153 
154 #ifdef BOOST_COMPUTE_CL_VERSION_2_0
155 // copy_on_device() specialization for svm_ptr
156 template<class T>
copy_on_device(svm_ptr<T> first,svm_ptr<T> last,svm_ptr<T> result,command_queue & queue,const wait_list & events)157 inline svm_ptr<T> copy_on_device(svm_ptr<T> first,
158                                  svm_ptr<T> last,
159                                  svm_ptr<T> result,
160                                  command_queue &queue,
161                                  const wait_list &events)
162 {
163     size_t count = iterator_range_size(first, last);
164     if(count == 0){
165         return result;
166     }
167 
168     queue.enqueue_svm_memcpy(
169         result.get(), first.get(), count * sizeof(T), events
170     );
171 
172     return result + count;
173 }
174 
175 template<class T>
copy_on_device_async(svm_ptr<T> first,svm_ptr<T> last,svm_ptr<T> result,command_queue & queue,const wait_list & events)176 inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first,
177                                                 svm_ptr<T> last,
178                                                 svm_ptr<T> result,
179                                                 command_queue &queue,
180                                                 const wait_list &events)
181 {
182     size_t count = iterator_range_size(first, last);
183     if(count == 0){
184         return future<svm_ptr<T> >();
185     }
186 
187     event event_ = queue.enqueue_svm_memcpy_async(
188         result.get(), first.get(), count * sizeof(T), events
189     );
190 
191     return make_future(result + count, event_);
192 }
193 #endif // BOOST_COMPUTE_CL_VERSION_2_0
194 
195 } // end detail namespace
196 } // end compute namespace
197 } // end boost namespace
198 
199 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
200