• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10 
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <numeric>
#include <vector>

#include <boost/program_options.hpp>

#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/accumulate.hpp>
#include <boost/compute/container/vector.hpp>

#include "perf.hpp"
23 
24 namespace po = boost::program_options;
25 namespace compute = boost::compute;
26 
// Returns a pseudo-random integer in the range [0, 25], used to fill
// the host input vector for the benchmark.
int rand_int()
{
    const double normalized = rand() / double(RAND_MAX);
    return static_cast<int>(normalized * 25.0);
}
31 
32 template<class T>
perf_accumulate(const compute::vector<T> & data,const size_t trials,compute::command_queue & queue)33 double perf_accumulate(const compute::vector<T>& data,
34                        const size_t trials,
35                        compute::command_queue& queue)
36 {
37     perf_timer t;
38     for(size_t trial = 0; trial < trials; trial++){
39         t.start();
40         compute::accumulate(data.begin(), data.end(), T(0), queue);
41         queue.finish();
42         t.stop();
43     }
44     return t.min_time();
45 }
46 
47 template<class T>
tune_accumulate(const compute::vector<T> & data,const size_t trials,compute::command_queue & queue)48 void tune_accumulate(const compute::vector<T>& data,
49                      const size_t trials,
50                      compute::command_queue& queue)
51 {
52     boost::shared_ptr<compute::detail::parameter_cache>
53         params = compute::detail::parameter_cache::get_global_cache(queue.get_device());
54 
55     const std::string cache_key =
56         std::string("__boost_reduce_on_gpu_") + compute::type_name<T>();
57 
58     const compute::uint_ tpbs[] = { 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
59     const compute::uint_ vpts[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
60 
61     double min_time = (std::numeric_limits<double>::max)();
62     compute::uint_ best_tpb = 0;
63     compute::uint_ best_vpt = 0;
64 
65     for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){
66         params->set(cache_key, "tpb", tpbs[i]);
67         for(size_t j = 0; j < sizeof(vpts) / sizeof(*vpts); j++){
68             params->set(cache_key, "vpt", vpts[j]);
69 
70             try {
71                 const double t = perf_accumulate(data, trials, queue);
72                 if(t < min_time){
73                     best_tpb = tpbs[i];
74                     best_vpt = vpts[j];
75                     min_time = t;
76                 }
77             }
78             catch(compute::opencl_error&){
79                 // invalid parameters for this device, skip
80             }
81         }
82     }
83 
84     // store optimal parameters
85     params->set(cache_key, "tpb", best_tpb);
86     params->set(cache_key, "vpt", best_vpt);
87 }
88 
main(int argc,char * argv[])89 int main(int argc, char *argv[])
90 {
91     // setup command line arguments
92     po::options_description options("options");
93     options.add_options()
94         ("help", "show usage instructions")
95         ("size", po::value<size_t>()->default_value(8192), "input size")
96         ("trials", po::value<size_t>()->default_value(3), "number of trials to run")
97         ("tune", "run tuning procedure")
98     ;
99     po::positional_options_description positional_options;
100     positional_options.add("size", 1);
101 
102     // parse command line
103     po::variables_map vm;
104     po::store(
105         po::command_line_parser(argc, argv)
106             .options(options).positional(positional_options).run(),
107         vm
108     );
109     po::notify(vm);
110 
111     const size_t size = vm["size"].as<size_t>();
112     const size_t trials = vm["trials"].as<size_t>();
113     std::cout << "size: " << size << std::endl;
114 
115     // setup context and queue for the default device
116     compute::device device = compute::system::default_device();
117     compute::context context(device);
118     compute::command_queue queue(context, device);
119     std::cout << "device: " << device.name() << std::endl;
120 
121     // create vector of random numbers on the host
122     std::vector<int> host_data(size);
123     std::generate(host_data.begin(), host_data.end(), rand_int);
124 
125     // create vector on the device and copy the data
126     compute::vector<int> device_data(
127         host_data.begin(), host_data.end(), queue
128     );
129 
130     // run tuning proceure (if requested)
131     if(vm.count("tune")){
132         tune_accumulate(device_data, trials, queue);
133     }
134 
135     // run benchmark
136     double t = perf_accumulate(device_data, trials, queue);
137     std::cout << "time: " << t / 1e6 << " ms" << std::endl;
138 
139     return 0;
140 }
141