1 /*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
#include "bandwidth.h"

#include <ctype.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <unistd.h>

#include <map>
#include <vector>
28
29
// Describes one recognized "--name value" command-line option.
struct option_t {
    // Option name without the leading "--". A NULL name terminates a table.
    const char *name;
    // true if the value is converted to an integer, false if it is kept as
    // a string.
    bool int_type;
};
34
35 option_t bandwidth_opts[] = {
36 { "size", true },
37 { "num_warm_loops", true },
38 { "num_loops", true },
39 { "type", false },
40 { NULL, false },
41 };
42
43 option_t per_core_opts[] = {
44 { "size", true },
45 { "num_warm_loops", true},
46 { "num_loops", true },
47 { "type", false },
48 { NULL, false },
49 };
50
51 option_t multithread_opts[] = {
52 { "size", true },
53 { "num_warm_loops", true},
54 { "num_loops", true },
55 { "type", false },
56 { "num_threads", true },
57 { NULL, false },
58 };
59
// Value of one parsed option. Which member is valid depends on the option's
// int_type flag: int_value for integer options, char_value for string ones.
typedef union {
    int int_value;
    const char *char_value;
} arg_value_t;

// Orders option names by their characters rather than by pointer value, so
// that a lookup such as values["size"] finds the entry stored under another
// "size" string no matter where either string lives in memory.
struct arg_name_less_t {
    bool operator()(const char *lhs, const char *rhs) const {
        return strcmp(lhs, rhs) < 0;
    }
};

// Maps an option name to its parsed value. The map stores the name pointers
// only, so the strings they point at must outlive the map.
typedef std::map<const char*, arg_value_t, arg_name_less_t> arg_t;
65
processBandwidthOptions(int argc,char ** argv,option_t options[],arg_t * values)66 bool processBandwidthOptions(int argc, char** argv, option_t options[],
67 arg_t *values) {
68 for (int i = 1; i < argc; i++) {
69 if (argv[i][0] == '-' && argv[i][1] == '-' && !isdigit(argv[i][2])) {
70 char *arg = &argv[i][2];
71
72 for (int j = 0; options[j].name != NULL; j++) {
73 if (strcmp(arg, options[j].name) == 0) {
74 const char *name = options[j].name;
75 if (i == argc - 1) {
76 printf("The option --%s requires an argument.\n", name);
77 return false;
78 }
79 if (options[j].int_type) {
80 (*values)[name].int_value = strtol(argv[++i], NULL, 0);
81 } else {
82 (*values)[name].char_value = argv[++i];
83 }
84 }
85 }
86 }
87 }
88
89 return true;
90 }
91
createBandwidthBenchmarkObject(arg_t values)92 BandwidthBenchmark *createBandwidthBenchmarkObject(arg_t values) {
93 BandwidthBenchmark *bench = NULL;
94
95 const char *name = values["type"].char_value;
96 size_t size = 0;
97 if (values.count("size") > 0) {
98 size = values["size"].int_value;
99 }
100 if (strcmp(name, "copy_ldrd_strd") == 0) {
101 bench = new CopyLdrdStrdBenchmark();
102 } else if (strcmp(name, "copy_ldmia_stmia") == 0) {
103 bench = new CopyLdmiaStmiaBenchmark();
104 } else if (strcmp(name, "copy_vld1_vst1") == 0) {
105 bench = new CopyVld1Vst1Benchmark();
106 } else if (strcmp(name, "copy_vldr_vstr") == 0) {
107 bench = new CopyVldrVstrBenchmark();
108 } else if (strcmp(name, "copy_vldmia_vstmia") == 0) {
109 bench = new CopyVldmiaVstmiaBenchmark();
110 } else if (strcmp(name, "memcpy") == 0) {
111 bench = new MemcpyBenchmark();
112 } else if (strcmp(name, "write_strd") == 0) {
113 bench = new WriteStrdBenchmark();
114 } else if (strcmp(name, "write_stmia") == 0) {
115 bench = new WriteStmiaBenchmark();
116 } else if (strcmp(name, "write_vst1") == 0) {
117 bench = new WriteVst1Benchmark();
118 } else if (strcmp(name, "write_vstr") == 0) {
119 bench = new WriteVstrBenchmark();
120 } else if (strcmp(name, "write_vstmia") == 0) {
121 bench = new WriteVstmiaBenchmark();
122 } else if (strcmp(name, "memset") == 0) {
123 bench = new MemsetBenchmark();
124 } else if (strcmp(name, "read_ldrd") == 0) {
125 bench = new ReadLdrdBenchmark();
126 } else if (strcmp(name, "read_ldmia") == 0) {
127 bench = new ReadLdmiaBenchmark();
128 } else if (strcmp(name, "read_vld1") == 0) {
129 bench = new ReadVld1Benchmark();
130 } else if (strcmp(name, "read_vldr") == 0) {
131 bench = new ReadVldrBenchmark();
132 } else if (strcmp(name, "read_vldmia") == 0) {
133 bench = new ReadVldmiaBenchmark();
134 } else {
135 printf("Unknown type name %s\n", name);
136 return NULL;
137 }
138
139 if (!bench->setSize(size)) {
140 printf("Failed to allocate buffers for benchmark.\n");
141 delete bench;
142 return NULL;
143 }
144
145 if (values.count("num_warm_loops") > 0) {
146 bench->set_num_loops(values["num_warm_loops"].int_value);
147 }
148 if (values.count("num_loops") > 0) {
149 bench->set_num_loops(values["num_loops"].int_value);
150 }
151
152 return bench;
153 }
154
// Appends to |cpu_list|, in increasing order, the index of every CPU present
// in the affinity mask that sched_getaffinity() reports for pid 0.
//
// Returns false, after reporting the error with perror(), if the mask cannot
// be read; |cpu_list| is left untouched in that case.
bool getAvailCpus(std::vector<int> *cpu_list) {
    cpu_set_t allowed;
    CPU_ZERO(&allowed);

    const int rc = sched_getaffinity(0, sizeof(allowed), &allowed);
    if (rc != 0) {
        perror("sched_getaffinity failed.");
        return false;
    }

    int cpu = 0;
    while (cpu < CPU_SETSIZE) {
        if (CPU_ISSET(cpu, &allowed)) {
            cpu_list->push_back(cpu);
        }
        ++cpu;
    }

    return true;
}
172
173 typedef struct {
174 int core;
175 BandwidthBenchmark *bench;
176 double avg_mb;
177 volatile bool *run;
178 } thread_arg_t;
179
runBandwidthThread(void * data)180 void *runBandwidthThread(void *data) {
181 thread_arg_t *arg = reinterpret_cast<thread_arg_t *>(data);
182
183 if (arg->core >= 0) {
184 cpu_set_t cpuset;
185 CPU_ZERO(&cpuset);
186 CPU_SET(arg->core, &cpuset);
187 if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
188 perror("sched_setaffinity failed");
189 return NULL;
190 }
191 }
192
193 // Spinloop waiting for the run variable to get set to true.
194 while (!*arg->run) {
195 }
196
197 double avg_mb = 0;
198 for (int run = 1; ; run++) {
199 arg->bench->run();
200 if (!*arg->run) {
201 // Throw away the last data point since it's possible not
202 // all of the threads are running at this point.
203 break;
204 }
205 avg_mb = (avg_mb/run) * (run-1) + arg->bench->mb_per_sec()/run;
206 }
207 arg->avg_mb = avg_mb;
208
209 return NULL;
210 }
211
processThreadArgs(int argc,char ** argv,option_t options[],arg_t * values)212 bool processThreadArgs(int argc, char** argv, option_t options[],
213 arg_t *values) {
214 // Use some smaller values for the number of loops.
215 (*values)["num_warm_loops"].int_value = 1000000;
216 (*values)["num_loops"].int_value = 10000000;
217
218 if (!processBandwidthOptions(argc, argv, options, values)) {
219 return false;
220 }
221 if (values->count("size") > 0 && ((*values)["size"].int_value % 64) != 0) {
222 printf("The size values must be a multiple of 64.\n");
223 return false;
224 }
225 if (values->count("type") == 0) {
226 printf("Must specify the type value.\n");
227 return false;
228 }
229
230 BandwidthBenchmark *bench = createBandwidthBenchmarkObject(*values);
231 if (!bench) {
232 return false;
233 }
234
235 if (setpriority(PRIO_PROCESS, 0, -20)) {
236 perror("Unable to raise priority of process.");
237 return false;
238 }
239
240 printf("Calculating optimum run time...\n");
241 nsecs_t t = system_time();
242 bench->run();
243 t = system_time() - t;
244 // Since this is only going to be running single threaded, assume that
245 // if the number is set to ten times this value, we should get at least
246 // a couple of samples per thread.
247 int run_time = int((t/1000000000.0)*10 + 0.5) + 5;
248
249 (*values)["run_time"].int_value = run_time;
250 (*values)["size"].int_value = bench->size();
251 (*values)["num_warm_loops"].int_value = bench->num_warm_loops();
252 (*values)["num_loops"].int_value = bench->num_loops();
253 delete bench;
254
255 return true;
256 }
257
runThreadedTest(thread_arg_t args[],int num_threads,int run_time)258 bool runThreadedTest(thread_arg_t args[], int num_threads, int run_time) {
259 pthread_t threads[num_threads];
260 volatile bool run = false;
261
262 int rc;
263 for (int i = 0; i < num_threads; i++) {
264 args[i].run = &run;
265 rc = pthread_create(&threads[i], NULL, runBandwidthThread,
266 (void*)&args[i]);
267 if (rc != 0) {
268 printf("Failed to launch thread %d\n", i);
269 return false;
270 }
271 }
272
273 // Kick start the threads.
274 run = true;
275
276 // Let the threads run.
277 sleep(run_time);
278
279 // Stop the threads.
280 run = false;
281
282 // Wait for the threads to complete.
283 for (int i = 0; i < num_threads; i++) {
284 rc = pthread_join(threads[i], NULL);
285 if (rc != 0) {
286 printf("Thread %d failed to join.\n", i);
287 return false;
288 }
289 printf("Thread %d: bandwidth using %s %0.2f MB/s\n", i,
290 args[i].bench->getName(), args[i].avg_mb);
291 }
292
293 return true;
294 }
295
per_core_bandwidth(int argc,char ** argv)296 int per_core_bandwidth(int argc, char** argv) {
297 arg_t values;
298 if (!processThreadArgs(argc, argv, per_core_opts, &values)) {
299 return -1;
300 }
301
302 std::vector<int> cpu_list;
303 if (!getAvailCpus(&cpu_list)) {
304 printf("Failed to get available cpu list.\n");
305 return -1;
306 }
307
308 thread_arg_t args[cpu_list.size()];
309
310 int i = 0;
311 for (std::vector<int>::iterator it = cpu_list.begin();
312 it != cpu_list.end(); ++it, ++i) {
313 args[i].core = *it;
314 args[i].bench = createBandwidthBenchmarkObject(values);
315 if (!args[i].bench) {
316 for (int j = 0; j < i; j++)
317 delete args[j].bench;
318 return -1;
319 }
320 }
321
322 printf("Running on %d cores\n", cpu_list.size());
323 printf(" run_time = %ds\n", values["run_time"].int_value);
324 printf(" size = %d\n", values["size"].int_value);
325 printf(" num_warm_loops = %d\n", values["num_warm_loops"].int_value);
326 printf(" num_loops = %d\n", values["num_loops"].int_value);
327 printf("\n");
328
329 if (!runThreadedTest(args, cpu_list.size(), values["run_time"].int_value)) {
330 return -1;
331 }
332
333 return 0;
334 }
335
multithread_bandwidth(int argc,char ** argv)336 int multithread_bandwidth(int argc, char** argv) {
337 arg_t values;
338 if (!processThreadArgs(argc, argv, multithread_opts, &values)) {
339 return -1;
340 }
341 if (values.count("num_threads") == 0) {
342 printf("Must specify the num_threads value.\n");
343 return -1;
344 }
345 int num_threads = values["num_threads"].int_value;
346
347 thread_arg_t args[num_threads];
348
349 for (int i = 0; i < num_threads; i++) {
350 args[i].core = -1;
351 args[i].bench = createBandwidthBenchmarkObject(values);
352 if (!args[i].bench) {
353 for (int j = 0; j < i; j++)
354 delete args[j].bench;
355 return -1;
356 }
357 }
358
359 printf("Running %d threads\n", num_threads);
360 printf(" run_time = %ds\n", values["run_time"].int_value);
361 printf(" size = %d\n", values["size"].int_value);
362 printf(" num_warm_loops = %d\n", values["num_warm_loops"].int_value);
363 printf(" num_loops = %d\n", values["num_loops"].int_value);
364 printf("\n");
365
366 if (!runThreadedTest(args, num_threads, values["run_time"].int_value)) {
367 return -1;
368 }
369
370 return 0;
371 }
372
run_bandwidth_benchmark(int argc,char ** argv,const char * name,std::vector<BandwidthBenchmark * > bench_objs)373 bool run_bandwidth_benchmark(int argc, char** argv, const char *name,
374 std::vector<BandwidthBenchmark*> bench_objs) {
375 arg_t values;
376 values["size"].int_value = 0;
377 values["num_warm_loops"].int_value = 0;
378 values["num_loops"].int_value = 0;
379 if (!processBandwidthOptions(argc, argv, bandwidth_opts, &values)) {
380 return false;
381 }
382
383 size_t size = values["size"].int_value;
384 if ((size % 64) != 0) {
385 printf("The size value must be a multiple of 64.\n");
386 return false;
387 }
388
389 if (setpriority(PRIO_PROCESS, 0, -20)) {
390 perror("Unable to raise priority of process.");
391 return false;
392 }
393
394 bool preamble_printed = false;
395 size_t num_warm_loops = values["num_warm_loops"].int_value;
396 size_t num_loops = values["num_loops"].int_value;
397 for (std::vector<BandwidthBenchmark*>::iterator it = bench_objs.begin();
398 it != bench_objs.end(); ++it) {
399 if (!(*it)->canRun()) {
400 continue;
401 }
402 if (!(*it)->setSize(values["size"].int_value)) {
403 printf("Failed creating buffer for bandwidth test.\n");
404 return false;
405 }
406 if (num_warm_loops) {
407 (*it)->set_num_warm_loops(num_warm_loops);
408 }
409 if (num_loops) {
410 (*it)->set_num_loops(num_loops);
411 }
412 if (!preamble_printed) {
413 preamble_printed = true;
414 printf("Benchmarking %s bandwidth\n", name);
415 printf(" size = %d\n", (*it)->size());
416 printf(" num_warm_loops = %d\n", (*it)->num_warm_loops());
417 printf(" num_loops = %d\n\n", (*it)->num_loops());
418 }
419 (*it)->run();
420 printf(" %s bandwidth with %s: %0.2f MB/s\n", name, (*it)->getName(),
421 (*it)->mb_per_sec());
422 }
423
424 return true;
425 }
426
copy_bandwidth(int argc,char ** argv)427 int copy_bandwidth(int argc, char** argv) {
428 std::vector<BandwidthBenchmark*> bench_objs;
429 bench_objs.push_back(new CopyLdrdStrdBenchmark());
430 bench_objs.push_back(new CopyLdmiaStmiaBenchmark());
431 bench_objs.push_back(new CopyVld1Vst1Benchmark());
432 bench_objs.push_back(new CopyVldrVstrBenchmark());
433 bench_objs.push_back(new CopyVldmiaVstmiaBenchmark());
434 bench_objs.push_back(new MemcpyBenchmark());
435
436 if (!run_bandwidth_benchmark(argc, argv, "copy", bench_objs)) {
437 return -1;
438 }
439 return 0;
440 }
441
write_bandwidth(int argc,char ** argv)442 int write_bandwidth(int argc, char** argv) {
443 std::vector<BandwidthBenchmark*> bench_objs;
444 bench_objs.push_back(new WriteStrdBenchmark());
445 bench_objs.push_back(new WriteStmiaBenchmark());
446 bench_objs.push_back(new WriteVst1Benchmark());
447 bench_objs.push_back(new WriteVstrBenchmark());
448 bench_objs.push_back(new WriteVstmiaBenchmark());
449 bench_objs.push_back(new MemsetBenchmark());
450
451 if (!run_bandwidth_benchmark(argc, argv, "write", bench_objs)) {
452 return -1;
453 }
454
455 return 0;
456 }
457
read_bandwidth(int argc,char ** argv)458 int read_bandwidth(int argc, char** argv) {
459 std::vector<BandwidthBenchmark*> bench_objs;
460 bench_objs.push_back(new ReadLdrdBenchmark());
461 bench_objs.push_back(new ReadLdmiaBenchmark());
462 bench_objs.push_back(new ReadVld1Benchmark());
463 bench_objs.push_back(new ReadVldrBenchmark());
464 bench_objs.push_back(new ReadVldmiaBenchmark());
465
466 if (!run_bandwidth_benchmark(argc, argv, "read", bench_objs)) {
467 return -1;
468 }
469 return 0;
470 }
471