1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef __BANDWIDTH_H__ 18 #define __BANDWIDTH_H__ 19 20 #include <stdlib.h> 21 #include <string.h> 22 23 #include "utils/Compat.h" 24 #include "memtest.h" 25 26 // Bandwidth Class definitions. 27 class BandwidthBenchmark { 28 public: BandwidthBenchmark()29 BandwidthBenchmark() 30 : _size(0), 31 _num_warm_loops(DEFAULT_NUM_WARM_LOOPS), 32 _num_loops(DEFAULT_NUM_LOOPS) {} ~BandwidthBenchmark()33 virtual ~BandwidthBenchmark() {} 34 run()35 bool run() { 36 if (_size == 0) { 37 return false; 38 } 39 if (!canRun()) { 40 return false; 41 } 42 43 bench(_num_warm_loops); 44 45 nsecs_t t = system_time(); 46 bench(_num_loops); 47 t = system_time() - t; 48 49 _mb_per_sec = (_size*(_num_loops/_BYTES_PER_MB))/(t/_NUM_NS_PER_SEC); 50 51 return true; 52 } 53 canRun()54 bool canRun() { return !usesNeon() || isNeonSupported(); } 55 56 virtual bool setSize(size_t size) = 0; 57 58 virtual const char *getName() = 0; 59 60 virtual bool verify() = 0; 61 usesNeon()62 virtual bool usesNeon() { return false; } 63 isNeonSupported()64 bool isNeonSupported() { 65 #if defined(__ARM_NEON__) 66 return true; 67 #else 68 return false; 69 #endif 70 } 71 72 // Accessors/mutators. mb_per_sec()73 double mb_per_sec() { return _mb_per_sec; } num_warm_loops()74 size_t num_warm_loops() { return _num_warm_loops; } num_loops()75 size_t num_loops() { return _num_loops; } size()76 size_t size() { return _size; } 77 set_num_warm_loops(size_t num_warm_loops)78 void set_num_warm_loops(size_t num_warm_loops) { 79 _num_warm_loops = num_warm_loops; 80 } set_num_loops(size_t num_loops)81 void set_num_loops(size_t num_loops) { _num_loops = num_loops; } 82 83 // Static constants 84 static const unsigned int DEFAULT_NUM_WARM_LOOPS = 1000000; 85 static const unsigned int DEFAULT_NUM_LOOPS = 20000000; 86 87 protected: 88 virtual void bench(size_t num_loops) = 0; 89 90 double _mb_per_sec; 91 size_t _size; 92 size_t _num_warm_loops; 93 size_t _num_loops; 94 95 private: 96 // Static constants 97 static const CONSTEXPR double _NUM_NS_PER_SEC = 1000000000.0; 98 static const CONSTEXPR double _BYTES_PER_MB = 1024.0* 1024.0; 99 }; 100 101 class CopyBandwidthBenchmark : public BandwidthBenchmark { 102 public: CopyBandwidthBenchmark()103 CopyBandwidthBenchmark() : BandwidthBenchmark(), _src(NULL), _dst(NULL) { } 104 setSize(size_t size)105 bool setSize(size_t size) { 106 if (_src) { 107 free(_src); 108 } 109 if (_dst) { 110 free(_dst); 111 } 112 113 if (size == 0) { 114 _size = DEFAULT_COPY_SIZE; 115 } else { 116 _size = size; 117 } 118 119 _src = reinterpret_cast<char*>(memalign(64, _size)); 120 if (!_src) { 121 perror("Failed to allocate memory for test."); 122 return false; 123 } 124 _dst = reinterpret_cast<char*>(memalign(64, _size)); 125 if (!_dst) { 126 perror("Failed to allocate memory for test."); 127 return false; 128 } 129 130 return true; 131 } ~CopyBandwidthBenchmark()132 virtual ~CopyBandwidthBenchmark() { 133 if (_src) { 134 free(_src); 135 _src = NULL; 136 } 137 if (_dst) { 138 free(_dst); 139 _dst = NULL; 140 } 141 } 142 verify()143 bool verify() { 144 memset(_src, 0x23, _size); 145 memset(_dst, 0, _size); 146 bench(1); 147 if (memcmp(_src, _dst, _size) != 0) { 148 printf("Buffers failed to compare after one loop.\n"); 149 return false; 150 } 151 152 memset(_src, 0x23, _size); 153 memset(_dst, 0, _size); 154 _num_loops = 2; 155 bench(2); 156 if (memcmp(_src, _dst, _size) != 0) { 157 printf("Buffers failed to compare after two loops.\n"); 158 return false; 159 } 160 161 return true; 162 } 163 164 protected: 165 char *_src; 166 char *_dst; 167 168 static const unsigned int DEFAULT_COPY_SIZE = 8000; 169 }; 170 171 class CopyLdrdStrdBenchmark : public CopyBandwidthBenchmark { 172 public: CopyLdrdStrdBenchmark()173 CopyLdrdStrdBenchmark() : CopyBandwidthBenchmark() { } ~CopyLdrdStrdBenchmark()174 virtual ~CopyLdrdStrdBenchmark() {} 175 getName()176 const char *getName() { return "ldrd/strd"; } 177 178 protected: 179 // Copy using ldrd/strd instructions. bench(size_t num_loops)180 void bench(size_t num_loops) { 181 asm volatile( 182 "stmfd sp!, {r0,r1,r2,r3,r4,r6,r7}\n" 183 184 "mov r0, %0\n" 185 "mov r1, %1\n" 186 "mov r2, %2\n" 187 "mov r3, %3\n" 188 189 "0:\n" 190 "mov r4, r2, lsr #6\n" 191 192 "1:\n" 193 "ldrd r6, r7, [r0]\n" 194 "strd r6, r7, [r1]\n" 195 "ldrd r6, r7, [r0, #8]\n" 196 "strd r6, r7, [r1, #8]\n" 197 "ldrd r6, r7, [r0, #16]\n" 198 "strd r6, r7, [r1, #16]\n" 199 "ldrd r6, r7, [r0, #24]\n" 200 "strd r6, r7, [r1, #24]\n" 201 "ldrd r6, r7, [r0, #32]\n" 202 "strd r6, r7, [r1, #32]\n" 203 "ldrd r6, r7, [r0, #40]\n" 204 "strd r6, r7, [r1, #40]\n" 205 "ldrd r6, r7, [r0, #48]\n" 206 "strd r6, r7, [r1, #48]\n" 207 "ldrd r6, r7, [r0, #56]\n" 208 "strd r6, r7, [r1, #56]\n" 209 210 "add r0, r0, #64\n" 211 "add r1, r1, #64\n" 212 "subs r4, r4, #1\n" 213 "bgt 1b\n" 214 215 "sub r0, r0, r2\n" 216 "sub r1, r1, r2\n" 217 "subs r3, r3, #1\n" 218 "bgt 0b\n" 219 220 "ldmfd sp!, {r0,r1,r2,r3,r4,r6,r7}\n" 221 :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3"); 222 } 223 }; 224 225 class CopyLdmiaStmiaBenchmark : public CopyBandwidthBenchmark { 226 public: CopyLdmiaStmiaBenchmark()227 CopyLdmiaStmiaBenchmark() : CopyBandwidthBenchmark() { } ~CopyLdmiaStmiaBenchmark()228 virtual ~CopyLdmiaStmiaBenchmark() {} 229 getName()230 const char *getName() { return "ldmia/stmia"; } 231 232 protected: 233 // Copy using ldmia/stmia instructions. bench(size_t num_loops)234 void bench(size_t num_loops) { 235 asm volatile( 236 "stmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12}\n" 237 238 "mov r0, %0\n" 239 "mov r1, %1\n" 240 "mov r2, %2\n" 241 "mov r3, %3\n" 242 243 "0:\n" 244 "mov r4, r2, lsr #6\n" 245 246 "1:\n" 247 "ldmia r0!, {r5, r6, r7, r8, r9, r10, r11, r12}\n" 248 "stmia r1!, {r5, r6, r7, r8, r9, r10, r11, r12}\n" 249 "subs r4, r4, #1\n" 250 "ldmia r0!, {r5, r6, r7, r8, r9, r10, r11, r12}\n" 251 "stmia r1!, {r5, r6, r7, r8, r9, r10, r11, r12}\n" 252 "bgt 1b\n" 253 254 "sub r0, r0, r2\n" 255 "sub r1, r1, r2\n" 256 "subs r3, r3, #1\n" 257 "bgt 0b\n" 258 259 "ldmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12}\n" 260 :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3"); 261 } 262 }; 263 264 class CopyVld1Vst1Benchmark : public CopyBandwidthBenchmark { 265 public: CopyVld1Vst1Benchmark()266 CopyVld1Vst1Benchmark() : CopyBandwidthBenchmark() { } ~CopyVld1Vst1Benchmark()267 virtual ~CopyVld1Vst1Benchmark() {} 268 getName()269 const char *getName() { return "vld1/vst1"; } 270 usesNeon()271 bool usesNeon() { return true; } 272 273 protected: 274 // Copy using vld1/vst1 instructions. bench(size_t num_loops)275 void bench(size_t num_loops) { 276 #if defined(__ARM_NEON__) 277 asm volatile( 278 "stmfd sp!, {r0,r1,r2,r3,r4}\n" 279 280 "mov r0, %0\n" 281 "mov r1, %1\n" 282 "mov r2, %2\n" 283 "mov r3, %3\n" 284 285 "0:\n" 286 "mov r4, r2, lsr #6\n" 287 288 "1:\n" 289 "vld1.8 {d0-d3}, [r0]!\n" 290 "vld1.8 {d4-d7}, [r0]!\n" 291 "subs r4, r4, #1\n" 292 "vst1.8 {d0-d3}, [r1:128]!\n" 293 "vst1.8 {d4-d7}, [r1:128]!\n" 294 "bgt 1b\n" 295 296 "sub r0, r0, r2\n" 297 "sub r1, r1, r2\n" 298 "subs r3, r3, #1\n" 299 "bgt 0b\n" 300 301 "ldmfd sp!, {r0,r1,r2,r3,r4}\n" 302 :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3"); 303 #endif 304 } 305 }; 306 307 class CopyVldrVstrBenchmark : public CopyBandwidthBenchmark { 308 public: CopyVldrVstrBenchmark()309 CopyVldrVstrBenchmark() : CopyBandwidthBenchmark() { } ~CopyVldrVstrBenchmark()310 virtual ~CopyVldrVstrBenchmark() {} 311 getName()312 const char *getName() { return "vldr/vstr"; } 313 usesNeon()314 bool usesNeon() { return true; } 315 316 protected: 317 // Copy using vldr/vstr instructions. bench(size_t num_loops)318 void bench(size_t num_loops) { 319 #if defined(__ARM_NEON__) 320 asm volatile( 321 "stmfd sp!, {r0,r1,r2,r3,r4}\n" 322 323 "mov r0, %0\n" 324 "mov r1, %1\n" 325 "mov r2, %2\n" 326 "mov r3, %3\n" 327 328 "0:\n" 329 "mov r4, r2, lsr #6\n" 330 331 "1:\n" 332 "vldr d0, [r0, #0]\n" 333 "subs r4, r4, #1\n" 334 "vldr d1, [r0, #8]\n" 335 "vstr d0, [r1, #0]\n" 336 "vldr d0, [r0, #16]\n" 337 "vstr d1, [r1, #8]\n" 338 "vldr d1, [r0, #24]\n" 339 "vstr d0, [r1, #16]\n" 340 "vldr d0, [r0, #32]\n" 341 "vstr d1, [r1, #24]\n" 342 "vldr d1, [r0, #40]\n" 343 "vstr d0, [r1, #32]\n" 344 "vldr d0, [r0, #48]\n" 345 "vstr d1, [r1, #40]\n" 346 "vldr d1, [r0, #56]\n" 347 "vstr d0, [r1, #48]\n" 348 "add r0, r0, #64\n" 349 "vstr d1, [r1, #56]\n" 350 "add r1, r1, #64\n" 351 "bgt 1b\n" 352 353 "sub r0, r0, r2\n" 354 "sub r1, r1, r2\n" 355 "subs r3, r3, #1\n" 356 "bgt 0b\n" 357 358 "ldmfd sp!, {r0,r1,r2,r3,r4}\n" 359 :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3"); 360 #endif 361 } 362 }; 363 364 class CopyVldmiaVstmiaBenchmark : public CopyBandwidthBenchmark { 365 public: CopyVldmiaVstmiaBenchmark()366 CopyVldmiaVstmiaBenchmark() : CopyBandwidthBenchmark() { } ~CopyVldmiaVstmiaBenchmark()367 virtual ~CopyVldmiaVstmiaBenchmark() {} 368 getName()369 const char *getName() { return "vldmia/vstmia"; } 370 usesNeon()371 bool usesNeon() { return true; } 372 373 protected: 374 // Copy using vldmia/vstmia instructions. bench(size_t num_loops)375 void bench(size_t num_loops) { 376 #if defined(__ARM_NEON__) 377 asm volatile( 378 "stmfd sp!, {r0,r1,r2,r3,r4}\n" 379 380 "mov r0, %0\n" 381 "mov r1, %1\n" 382 "mov r2, %2\n" 383 "mov r3, %3\n" 384 385 "0:\n" 386 "mov r4, r2, lsr #6\n" 387 388 "1:\n" 389 "vldmia r0!, {d0-d7}\n" 390 "subs r4, r4, #1\n" 391 "vstmia r1!, {d0-d7}\n" 392 "bgt 1b\n" 393 394 "sub r0, r0, r2\n" 395 "sub r1, r1, r2\n" 396 "subs r3, r3, #1\n" 397 "bgt 0b\n" 398 399 "ldmfd sp!, {r0,r1,r2,r3,r4}\n" 400 :: "r" (_src), "r" (_dst), "r" (_size), "r" (num_loops) : "r0", "r1", "r2", "r3"); 401 #endif 402 } 403 }; 404 405 class MemcpyBenchmark : public CopyBandwidthBenchmark { 406 public: MemcpyBenchmark()407 MemcpyBenchmark() : CopyBandwidthBenchmark() { } ~MemcpyBenchmark()408 virtual ~MemcpyBenchmark() {} 409 getName()410 const char *getName() { return "memcpy"; } 411 412 protected: bench(size_t num_loops)413 void bench(size_t num_loops) { 414 for (size_t i = 0; i < num_loops; i++) { 415 memcpy(_dst, _src, _size); 416 } 417 } 418 }; 419 420 class SingleBufferBandwidthBenchmark : public BandwidthBenchmark { 421 public: SingleBufferBandwidthBenchmark()422 SingleBufferBandwidthBenchmark() : BandwidthBenchmark(), _buffer(NULL) { } ~SingleBufferBandwidthBenchmark()423 virtual ~SingleBufferBandwidthBenchmark() { 424 if (_buffer) { 425 free(_buffer); 426 _buffer = NULL; 427 } 428 } 429 setSize(size_t size)430 bool setSize(size_t size) { 431 if (_buffer) { 432 free(_buffer); 433 _buffer = NULL; 434 } 435 436 if (_size == 0) { 437 _size = DEFAULT_SINGLE_BUFFER_SIZE; 438 } else { 439 _size = size; 440 } 441 442 _buffer = reinterpret_cast<char*>(memalign(64, _size)); 443 if (!_buffer) { 444 perror("Failed to allocate memory for test."); 445 return false; 446 } 447 memset(_buffer, 0, _size); 448 449 return true; 450 } 451 verify()452 bool verify() { return true; } 453 454 protected: 455 char *_buffer; 456 457 static const unsigned int DEFAULT_SINGLE_BUFFER_SIZE = 16000; 458 }; 459 460 class WriteBandwidthBenchmark : public SingleBufferBandwidthBenchmark { 461 public: WriteBandwidthBenchmark()462 WriteBandwidthBenchmark() : SingleBufferBandwidthBenchmark() { } ~WriteBandwidthBenchmark()463 virtual ~WriteBandwidthBenchmark() { } 464 verify()465 bool verify() { 466 memset(_buffer, 0, _size); 467 bench(1); 468 for (size_t i = 0; i < _size; i++) { 469 if (_buffer[i] != 1) { 470 printf("Buffer failed to compare after one loop.\n"); 471 return false; 472 } 473 } 474 475 memset(_buffer, 0, _size); 476 bench(2); 477 for (size_t i = 0; i < _size; i++) { 478 if (_buffer[i] != 2) { 479 printf("Buffer failed to compare after two loops.\n"); 480 return false; 481 } 482 } 483 484 return true; 485 } 486 }; 487 488 class WriteStrdBenchmark : public WriteBandwidthBenchmark { 489 public: WriteStrdBenchmark()490 WriteStrdBenchmark() : WriteBandwidthBenchmark() { } ~WriteStrdBenchmark()491 virtual ~WriteStrdBenchmark() {} 492 getName()493 const char *getName() { return "strd"; } 494 495 protected: 496 // Write a given value using strd. bench(size_t num_loops)497 void bench(size_t num_loops) { 498 asm volatile( 499 "stmfd sp!, {r0,r1,r2,r3,r4,r5}\n" 500 501 "mov r0, %0\n" 502 "mov r1, %1\n" 503 "mov r2, %2\n" 504 505 "mov r4, #0\n" 506 "mov r5, #0\n" 507 508 "0:\n" 509 "mov r3, r1, lsr #5\n" 510 511 "add r4, r4, #0x01010101\n" 512 "mov r5, r4\n" 513 514 "1:\n" 515 "subs r3, r3, #1\n" 516 "strd r4, r5, [r0]\n" 517 "strd r4, r5, [r0, #8]\n" 518 "strd r4, r5, [r0, #16]\n" 519 "strd r4, r5, [r0, #24]\n" 520 "add r0, r0, #32\n" 521 "bgt 1b\n" 522 523 "sub r0, r0, r1\n" 524 "subs r2, r2, #1\n" 525 "bgt 0b\n" 526 527 "ldmfd sp!, {r0,r1,r2,r3,r4,r5}\n" 528 :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2"); 529 } 530 }; 531 532 class WriteStmiaBenchmark : public WriteBandwidthBenchmark { 533 public: WriteStmiaBenchmark()534 WriteStmiaBenchmark() : WriteBandwidthBenchmark() { } ~WriteStmiaBenchmark()535 virtual ~WriteStmiaBenchmark() {} 536 getName()537 const char *getName() { return "stmia"; } 538 539 protected: 540 // Write a given value using stmia. bench(size_t num_loops)541 void bench(size_t num_loops) { 542 asm volatile( 543 "stmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11}\n" 544 545 "mov r0, %0\n" 546 "mov r1, %1\n" 547 "mov r2, %2\n" 548 549 "mov r4, #0\n" 550 551 "0:\n" 552 "mov r3, r1, lsr #5\n" 553 554 "add r4, r4, #0x01010101\n" 555 "mov r5, r4\n" 556 "mov r6, r4\n" 557 "mov r7, r4\n" 558 "mov r8, r4\n" 559 "mov r9, r4\n" 560 "mov r10, r4\n" 561 "mov r11, r4\n" 562 563 "1:\n" 564 "subs r3, r3, #1\n" 565 "stmia r0!, {r4, r5, r6, r7, r8, r9, r10, r11}\n" 566 "bgt 1b\n" 567 568 "sub r0, r0, r1\n" 569 "subs r2, r2, #1\n" 570 "bgt 0b\n" 571 572 "ldmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11}\n" 573 :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2"); 574 } 575 }; 576 577 class WriteVst1Benchmark : public WriteBandwidthBenchmark { 578 public: WriteVst1Benchmark()579 WriteVst1Benchmark() : WriteBandwidthBenchmark() { } ~WriteVst1Benchmark()580 virtual ~WriteVst1Benchmark() {} 581 getName()582 const char *getName() { return "vst1"; } 583 usesNeon()584 bool usesNeon() { return true; } 585 586 protected: 587 // Write a given value using vst. bench(size_t num_loops)588 void bench(size_t num_loops) { 589 #if defined(__ARM_NEON__) 590 asm volatile( 591 "stmfd sp!, {r0,r1,r2,r3,r4}\n" 592 593 "mov r0, %0\n" 594 "mov r1, %1\n" 595 "mov r2, %2\n" 596 "mov r4, #0\n" 597 598 "0:\n" 599 "mov r3, r1, lsr #5\n" 600 601 "add r4, r4, #1\n" 602 "vdup.8 d0, r4\n" 603 "vmov d1, d0\n" 604 "vmov d2, d0\n" 605 "vmov d3, d0\n" 606 607 "1:\n" 608 "subs r3, r3, #1\n" 609 "vst1.8 {d0-d3}, [r0:128]!\n" 610 "bgt 1b\n" 611 612 "sub r0, r0, r1\n" 613 "subs r2, r2, #1\n" 614 "bgt 0b\n" 615 616 "ldmfd sp!, {r0,r1,r2,r3,r4}\n" 617 :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2"); 618 #endif 619 } 620 }; 621 622 class WriteVstrBenchmark : public WriteBandwidthBenchmark { 623 public: WriteVstrBenchmark()624 WriteVstrBenchmark() : WriteBandwidthBenchmark() { } ~WriteVstrBenchmark()625 virtual ~WriteVstrBenchmark() {} 626 getName()627 const char *getName() { return "vstr"; } 628 usesNeon()629 bool usesNeon() { return true; } 630 631 protected: 632 // Write a given value using vst. bench(size_t num_loops)633 void bench(size_t num_loops) { 634 #if defined(__ARM_NEON__) 635 asm volatile( 636 "stmfd sp!, {r0,r1,r2,r3,r4}\n" 637 638 "mov r0, %0\n" 639 "mov r1, %1\n" 640 "mov r2, %2\n" 641 "mov r4, #0\n" 642 643 "0:\n" 644 "mov r3, r1, lsr #5\n" 645 646 "add r4, r4, #1\n" 647 "vdup.8 d0, r4\n" 648 "vmov d1, d0\n" 649 "vmov d2, d0\n" 650 "vmov d3, d0\n" 651 652 "1:\n" 653 "vstr d0, [r0, #0]\n" 654 "subs r3, r3, #1\n" 655 "vstr d1, [r0, #8]\n" 656 "vstr d0, [r0, #16]\n" 657 "vstr d1, [r0, #24]\n" 658 "add r0, r0, #32\n" 659 "bgt 1b\n" 660 661 "sub r0, r0, r1\n" 662 "subs r2, r2, #1\n" 663 "bgt 0b\n" 664 665 "ldmfd sp!, {r0,r1,r2,r3,r4}\n" 666 :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2"); 667 #endif 668 } 669 }; 670 671 class WriteVstmiaBenchmark : public WriteBandwidthBenchmark { 672 public: WriteVstmiaBenchmark()673 WriteVstmiaBenchmark() : WriteBandwidthBenchmark() { } ~WriteVstmiaBenchmark()674 virtual ~WriteVstmiaBenchmark() {} 675 getName()676 const char *getName() { return "vstmia"; } 677 usesNeon()678 bool usesNeon() { return true; } 679 680 protected: 681 // Write a given value using vstmia. bench(size_t num_loops)682 void bench(size_t num_loops) { 683 #if defined(__ARM_NEON__) 684 asm volatile( 685 "stmfd sp!, {r0,r1,r2,r3,r4}\n" 686 687 "mov r0, %0\n" 688 "mov r1, %1\n" 689 "mov r2, %2\n" 690 "mov r4, #0\n" 691 692 "0:\n" 693 "mov r3, r1, lsr #5\n" 694 695 "add r4, r4, #1\n" 696 "vdup.8 d0, r4\n" 697 "vmov d1, d0\n" 698 "vmov d2, d0\n" 699 "vmov d3, d0\n" 700 701 "1:\n" 702 "subs r3, r3, #1\n" 703 "vstmia r0!, {d0-d3}\n" 704 "bgt 1b\n" 705 706 "sub r0, r0, r1\n" 707 "subs r2, r2, #1\n" 708 "bgt 0b\n" 709 710 "ldmfd sp!, {r0,r1,r2,r3,r4}\n" 711 :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2"); 712 #endif 713 } 714 }; 715 716 class MemsetBenchmark : public WriteBandwidthBenchmark { 717 public: MemsetBenchmark()718 MemsetBenchmark() : WriteBandwidthBenchmark() { } ~MemsetBenchmark()719 virtual ~MemsetBenchmark() {} 720 getName()721 const char *getName() { return "memset"; } 722 723 protected: bench(size_t num_loops)724 void bench(size_t num_loops) { 725 for (size_t i = 0; i < num_loops; i++) { 726 memset(_buffer, (i % 255) + 1, _size); 727 } 728 } 729 }; 730 731 class ReadLdrdBenchmark : public SingleBufferBandwidthBenchmark { 732 public: ReadLdrdBenchmark()733 ReadLdrdBenchmark() : SingleBufferBandwidthBenchmark() { } ~ReadLdrdBenchmark()734 virtual ~ReadLdrdBenchmark() {} 735 getName()736 const char *getName() { return "ldrd"; } 737 738 protected: 739 // Write a given value using strd. bench(size_t num_loops)740 void bench(size_t num_loops) { 741 asm volatile( 742 "stmfd sp!, {r0,r1,r2,r3,r4,r5}\n" 743 744 "mov r0, %0\n" 745 "mov r1, %1\n" 746 "mov r2, %2\n" 747 748 "0:\n" 749 "mov r3, r1, lsr #5\n" 750 751 "1:\n" 752 "subs r3, r3, #1\n" 753 "ldrd r4, r5, [r0]\n" 754 "ldrd r4, r5, [r0, #8]\n" 755 "ldrd r4, r5, [r0, #16]\n" 756 "ldrd r4, r5, [r0, #24]\n" 757 "add r0, r0, #32\n" 758 "bgt 1b\n" 759 760 "sub r0, r0, r1\n" 761 "subs r2, r2, #1\n" 762 "bgt 0b\n" 763 764 "ldmfd sp!, {r0,r1,r2,r3,r4,r5}\n" 765 :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2"); 766 } 767 }; 768 769 class ReadLdmiaBenchmark : public SingleBufferBandwidthBenchmark { 770 public: ReadLdmiaBenchmark()771 ReadLdmiaBenchmark() : SingleBufferBandwidthBenchmark() { } ~ReadLdmiaBenchmark()772 virtual ~ReadLdmiaBenchmark() {} 773 getName()774 const char *getName() { return "ldmia"; } 775 776 protected: 777 // Write a given value using stmia. bench(size_t num_loops)778 void bench(size_t num_loops) { 779 asm volatile( 780 "stmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11}\n" 781 782 "mov r0, %0\n" 783 "mov r1, %1\n" 784 "mov r2, %2\n" 785 786 "0:\n" 787 "mov r3, r1, lsr #5\n" 788 789 "1:\n" 790 "subs r3, r3, #1\n" 791 "ldmia r0!, {r4, r5, r6, r7, r8, r9, r10, r11}\n" 792 "bgt 1b\n" 793 794 "sub r0, r0, r1\n" 795 "subs r2, r2, #1\n" 796 "bgt 0b\n" 797 798 "ldmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11}\n" 799 :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2"); 800 } 801 }; 802 803 class ReadVld1Benchmark : public SingleBufferBandwidthBenchmark { 804 public: ReadVld1Benchmark()805 ReadVld1Benchmark() : SingleBufferBandwidthBenchmark() { } ~ReadVld1Benchmark()806 virtual ~ReadVld1Benchmark() {} 807 getName()808 const char *getName() { return "vld1"; } 809 usesNeon()810 bool usesNeon() { return true; } 811 812 protected: 813 // Write a given value using vst. bench(size_t num_loops)814 void bench(size_t num_loops) { 815 #if defined(__ARM_NEON__) 816 asm volatile( 817 "stmfd sp!, {r0,r1,r2,r3}\n" 818 819 "mov r0, %0\n" 820 "mov r1, %1\n" 821 "mov r2, %2\n" 822 823 "0:\n" 824 "mov r3, r1, lsr #5\n" 825 826 "1:\n" 827 "subs r3, r3, #1\n" 828 "vld1.8 {d0-d3}, [r0:128]!\n" 829 "bgt 1b\n" 830 831 "sub r0, r0, r1\n" 832 "subs r2, r2, #1\n" 833 "bgt 0b\n" 834 835 "ldmfd sp!, {r0,r1,r2,r3}\n" 836 :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2"); 837 #endif 838 } 839 }; 840 841 class ReadVldrBenchmark : public SingleBufferBandwidthBenchmark { 842 public: ReadVldrBenchmark()843 ReadVldrBenchmark() : SingleBufferBandwidthBenchmark() { } ~ReadVldrBenchmark()844 virtual ~ReadVldrBenchmark() {} 845 getName()846 const char *getName() { return "vldr"; } 847 usesNeon()848 bool usesNeon() { return true; } 849 850 protected: 851 // Write a given value using vst. bench(size_t num_loops)852 void bench(size_t num_loops) { 853 #if defined(__ARM_NEON__) 854 asm volatile( 855 "stmfd sp!, {r0,r1,r2,r3}\n" 856 857 "mov r0, %0\n" 858 "mov r1, %1\n" 859 "mov r2, %2\n" 860 861 "0:\n" 862 "mov r3, r1, lsr #5\n" 863 864 "1:\n" 865 "vldr d0, [r0, #0]\n" 866 "subs r3, r3, #1\n" 867 "vldr d1, [r0, #8]\n" 868 "vldr d0, [r0, #16]\n" 869 "vldr d1, [r0, #24]\n" 870 "add r0, r0, #32\n" 871 "bgt 1b\n" 872 873 "sub r0, r0, r1\n" 874 "subs r2, r2, #1\n" 875 "bgt 0b\n" 876 877 "ldmfd sp!, {r0,r1,r2,r3}\n" 878 :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2"); 879 #endif 880 } 881 }; 882 883 884 class ReadVldmiaBenchmark : public SingleBufferBandwidthBenchmark { 885 public: ReadVldmiaBenchmark()886 ReadVldmiaBenchmark() : SingleBufferBandwidthBenchmark() { } ~ReadVldmiaBenchmark()887 virtual ~ReadVldmiaBenchmark() {} 888 getName()889 const char *getName() { return "vldmia"; } 890 usesNeon()891 bool usesNeon() { return true; } 892 893 protected: 894 // Write a given value using vstmia. bench(size_t num_loops)895 void bench(size_t num_loops) { 896 #if defined(__ARM_NEON__) 897 asm volatile( 898 "stmfd sp!, {r0,r1,r2,r3}\n" 899 900 "mov r0, %0\n" 901 "mov r1, %1\n" 902 "mov r2, %2\n" 903 904 "0:\n" 905 "mov r3, r1, lsr #5\n" 906 907 "1:\n" 908 "subs r3, r3, #1\n" 909 "vldmia r0!, {d0-d3}\n" 910 "bgt 1b\n" 911 912 "sub r0, r0, r1\n" 913 "subs r2, r2, #1\n" 914 "bgt 0b\n" 915 916 "ldmfd sp!, {r0,r1,r2,r3}\n" 917 :: "r" (_buffer), "r" (_size), "r" (num_loops) : "r0", "r1", "r2"); 918 #endif 919 } 920 }; 921 922 #endif // __BANDWIDTH_H__ 923