• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10 
11 #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
12 #define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
13 
14 #include <numeric>
15 
16 #include <boost/compute/detail/meta_kernel.hpp>
17 #include <boost/compute/container/vector.hpp>
18 
19 namespace boost {
20 namespace compute {
21 namespace detail {
22 
23 template<class InputIterator, class Predicate>
24 class count_if_with_threads_kernel : meta_kernel
25 {
26 public:
27     typedef typename
28         std::iterator_traits<InputIterator>::value_type
29         value_type;
30 
count_if_with_threads_kernel()31     count_if_with_threads_kernel()
32         : meta_kernel("count_if_with_threads")
33     {
34     }
35 
set_args(InputIterator first,InputIterator last,Predicate predicate)36     void set_args(InputIterator first,
37                   InputIterator last,
38                   Predicate predicate)
39 
40     {
41         typedef typename std::iterator_traits<InputIterator>::value_type T;
42 
43         m_size = detail::iterator_range_size(first, last);
44 
45         m_size_arg = add_arg<const ulong_>("size");
46         m_counts_arg = add_arg<ulong_ *>(memory_object::global_memory, "counts");
47 
48         *this <<
49             // thread parameters
50             "const uint gid = get_global_id(0);\n" <<
51             "const uint block_size = size / get_global_size(0);\n" <<
52             "const uint start = block_size * gid;\n" <<
53             "uint end = 0;\n" <<
54             "if(gid == get_global_size(0) - 1)\n" <<
55             "    end = size;\n" <<
56             "else\n" <<
57             "    end = block_size * gid + block_size;\n" <<
58 
59             // count values
60             "uint count = 0;\n" <<
61             "for(uint i = start; i < end; i++){\n" <<
62                 decl<const T>("value") << "="
63                     << first[expr<uint_>("i")] << ";\n" <<
64                 if_(predicate(var<const T>("value"))) << "{\n" <<
65                     "count++;\n" <<
66                 "}\n" <<
67             "}\n" <<
68 
69             // write count
70             "counts[gid] = count;\n";
71     }
72 
exec(command_queue & queue)73     size_t exec(command_queue &queue)
74     {
75         const device &device = queue.get_device();
76         const context &context = queue.get_context();
77 
78         size_t threads = device.compute_units();
79 
80         const size_t minimum_block_size = 2048;
81         if(m_size / threads < minimum_block_size){
82             threads = static_cast<size_t>(
83                           (std::max)(
84                               std::ceil(float(m_size) / minimum_block_size),
85                               1.0f
86                           )
87                       );
88         }
89 
90         // storage for counts
91         ::boost::compute::vector<ulong_> counts(threads, context);
92 
93         // exec kernel
94         set_arg(m_size_arg, static_cast<ulong_>(m_size));
95         set_arg(m_counts_arg, counts.get_buffer());
96         exec_1d(queue, 0, threads, 1);
97 
98         // copy counts to the host
99         std::vector<ulong_> host_counts(threads);
100         ::boost::compute::copy(counts.begin(), counts.end(), host_counts.begin(), queue);
101 
102         // return sum of counts
103         return std::accumulate(host_counts.begin(), host_counts.end(), size_t(0));
104     }
105 
106 private:
107     size_t m_size;
108     size_t m_size_arg;
109     size_t m_counts_arg;
110 };
111 
112 // counts values that match the predicate using one thread per block. this is
113 // optimized for cpu-type devices with a small number of compute units.
114 template<class InputIterator, class Predicate>
count_if_with_threads(InputIterator first,InputIterator last,Predicate predicate,command_queue & queue)115 inline size_t count_if_with_threads(InputIterator first,
116                                     InputIterator last,
117                                     Predicate predicate,
118                                     command_queue &queue)
119 {
120     count_if_with_threads_kernel<InputIterator, Predicate> kernel;
121     kernel.set_args(first, last, predicate);
122     return kernel.exec(queue);
123 }
124 
125 } // end detail namespace
126 } // end compute namespace
127 } // end boost namespace
128 
129 #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
130