• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10 
11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
13 
14 #include <iterator>
15 
16 #include <boost/compute/types.hpp>
17 #include <boost/compute/functional.hpp>
18 #include <boost/compute/command_queue.hpp>
19 #include <boost/compute/container/detail/scalar.hpp>
20 #include <boost/compute/iterator/buffer_iterator.hpp>
21 #include <boost/compute/type_traits/type_name.hpp>
22 #include <boost/compute/detail/meta_kernel.hpp>
23 #include <boost/compute/detail/iterator_range_size.hpp>
24 #include <boost/compute/detail/parameter_cache.hpp>
25 
26 namespace boost {
27 namespace compute {
28 namespace detail {
29 
30 template<class InputIterator, class UnaryPredicate>
find_if_with_atomics_one_vpt(InputIterator first,InputIterator last,UnaryPredicate predicate,const size_t count,command_queue & queue)31 inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
32                                                   InputIterator last,
33                                                   UnaryPredicate predicate,
34                                                   const size_t count,
35                                                   command_queue &queue)
36 {
37     typedef typename std::iterator_traits<InputIterator>::value_type value_type;
38     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
39 
40     const context &context = queue.get_context();
41 
42     detail::meta_kernel k("find_if");
43     size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
44     atomic_min<uint_> atomic_min_uint;
45 
46     k << k.decl<const uint_>("i") << " = get_global_id(0);\n"
47       << k.decl<const value_type>("value") << "="
48       <<     first[k.var<const uint_>("i")] << ";\n"
49       << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
50       << "    " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"
51       << "}\n";
52 
53     kernel kernel = k.compile(context);
54 
55     scalar<uint_> index(context);
56     kernel.set_arg(index_arg, index.get_buffer());
57 
58     // initialize index to the last iterator's index
59     index.write(static_cast<uint_>(count), queue);
60     queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
61 
62     // read index and return iterator
63     return first + static_cast<difference_type>(index.read(queue));
64 }
65 
// Finds the first element in [first, last) satisfying `predicate` using a
// kernel in which each work-item scans `vpt` (values-per-thread) elements
// and atomically min-reduces the matching index into a single global slot.
// Two kernel variants are generated: a coalesced-read layout for GPUs and a
// contiguous per-thread chunk layout for CPUs/other devices.
// Returns an iterator to the first match, or `last` when none matches
// (the index slot is pre-initialized to `count`).
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
                                                       InputIterator last,
                                                       UnaryPredicate predicate,
                                                       const size_t count,
                                                       const size_t vpt,
                                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    detail::meta_kernel k("find_if");
    // global slot holding the current best (smallest) matching index
    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t vpt_arg = k.add_arg<const uint_>("vpt");
    atomic_min<uint_> atomic_min_uint;

    // for GPUs reads from global memory are coalesced
    if(device.type() & device::gpu) {
        k <<
            // each work-group covers lsize*vpt consecutive elements; within
            // the group, thread t reads elements t, t+lsize, t+2*lsize, ...
            // so adjacent threads touch adjacent addresses on each pass
            k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
            k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                    "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
                    "count" <<
            ");\n" <<

            // checking if the index is already found
            // (one thread snapshots *index into local memory; if a match at a
            // lower index was already recorded, the whole group can exit early)
            "__local uint local_index;\n" <<
            "if(get_local_id(0) == 0){\n" <<
            "    local_index = *index;\n " <<
            "};\n" <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "if(local_index < id){\n" <<
            "    return;\n" <<
            "}\n" <<

            // scan this thread's strided slice; first hit wins for this thread
            "while(id < end){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                      first[k.var<const uint_>("id")] << ";\n"
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n"
            "    }\n" <<
            "    id+=lsize;\n" <<
            "}\n";
    // for CPUs (and other devices) reads are ordered so the big cache is
    // efficiently used.
    } else {
        k <<
            // each work-item walks one contiguous chunk of vpt elements,
            // re-checking *index so it can stop once a lower match is known
            k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                    "id + " << k.var<uint_>("vpt") << "," <<
                    "count" <<
            ");\n" <<
            "while(id < end && (*index) > id){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                      first[k.var<const uint_>("id")] << ";\n"
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n" <<
            "    }\n" <<
            "}\n";
    }

    kernel kernel = k.compile(context);

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));

    // initialize index to the last iterator's index
    index.write(static_cast<uint_>(count), queue);

    // one work-item per vpt elements, rounded up to cover the tail
    const size_t global_wg_size = static_cast<size_t>(
        std::ceil(float(count) / vpt)
    );
    queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);

    // read index and return iterator
    return first + static_cast<difference_type>(index.read(queue));
}
155 
156 // Space complexity: O(1)
157 template<class InputIterator, class UnaryPredicate>
find_if_with_atomics(InputIterator first,InputIterator last,UnaryPredicate predicate,command_queue & queue)158 inline InputIterator find_if_with_atomics(InputIterator first,
159                                           InputIterator last,
160                                           UnaryPredicate predicate,
161                                           command_queue &queue)
162 {
163     typedef typename std::iterator_traits<InputIterator>::value_type value_type;
164 
165     size_t count = detail::iterator_range_size(first, last);
166     if(count == 0){
167         return last;
168     }
169 
170     const device &device = queue.get_device();
171 
172     // load cached parameters
173     std::string cache_key = std::string("__boost_find_if_with_atomics_")
174         + type_name<value_type>();
175     boost::shared_ptr<parameter_cache> parameters =
176         detail::parameter_cache::get_global_cache(device);
177 
178     // for relatively small inputs on GPUs kernel checking one value per thread
179     // (work-item) is more efficient than its multiple values per thread version
180     if(device.type() & device::gpu){
181         const size_t one_vpt_threshold =
182             parameters->get(cache_key, "one_vpt_threshold", 1048576);
183         if(count <= one_vpt_threshold){
184             return find_if_with_atomics_one_vpt(
185                 first, last, predicate, count, queue
186             );
187         }
188     }
189 
190     // values per thread
191     size_t vpt;
192     if(device.type() & device::gpu){
193         // get vpt parameter
194         vpt = parameters->get(cache_key, "vpt", 32);
195     } else {
196         // for CPUs work is split equally between compute units
197         const size_t max_compute_units =
198             device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
199         vpt = static_cast<size_t>(
200             std::ceil(float(count) / max_compute_units)
201         );
202     }
203 
204     return find_if_with_atomics_multiple_vpt(
205         first, last, predicate, count, vpt, queue
206     );
207 }
208 
209 } // end detail namespace
210 } // end compute namespace
211 } // end boost namespace
212 
213 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
214