1 // Copyright 2006 Google Inc. All Rights Reserved.
2
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6
7 // http://www.apache.org/licenses/LICENSE-2.0
8
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // sat.cc : a stress test for stressful testing
16
17 // stressapptest (or SAT, from Stressful Application Test) is a test
18 // designed to stress the system, as well as provide a comprehensive
19 // memory interface test.
20
21 // stressapptest can be run using memory only, or using many system components.
22
23 #include <errno.h>
24 #include <pthread.h>
25 #include <signal.h>
26 #include <stdarg.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31
32 #include <sys/stat.h>
33 #include <sys/times.h>
34
35 // #define __USE_GNU
36 // #define __USE_LARGEFILE64
37 #include <fcntl.h>
38
39 #include <list>
40 #include <string>
41
42 // This file must work with autoconf on its public version,
43 // so these includes are correct.
44 #include "disk_blocks.h"
45 #include "logger.h"
46 #include "os.h"
47 #include "sat.h"
48 #include "sattypes.h"
49 #include "worker.h"
50
// Version string reported by Sat::PrintVersion().
// A PACKAGE_VERSION supplied by the build system takes precedence over the
// built-in fallback.
#ifdef PACKAGE_VERSION
static const char* kVersion = PACKAGE_VERSION;
#else
static const char* kVersion = "1.0.0";
#endif
57
58 // Global stressapptest reference, for use by signal handler.
59 // This makes Sat objects not safe for multiple instances.
60 namespace {
61 Sat *g_sat = NULL;
62
63 // Signal handler for catching break or kill.
64 //
65 // This must be installed after g_sat is assigned and while there is a single
66 // thread.
67 //
68 // This must be uninstalled while there is only a single thread, and of course
69 // before g_sat is cleared or deleted.
SatHandleBreak(int signal)70 void SatHandleBreak(int signal) {
71 g_sat->Break();
72 }
73 }
74
75 // Opens the logfile for writing if necessary
InitializeLogfile()76 bool Sat::InitializeLogfile() {
77 // Open logfile.
78 if (use_logfile_) {
79 logfile_ = open(logfilename_,
80 #if defined(O_DSYNC)
81 O_DSYNC |
82 #elif defined(O_SYNC)
83 O_SYNC |
84 #elif defined(O_FSYNC)
85 O_FSYNC |
86 #endif
87 O_WRONLY | O_CREAT,
88 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
89 if (logfile_ < 0) {
90 printf("Fatal Error: cannot open file %s for logging\n",
91 logfilename_);
92 bad_status();
93 return false;
94 }
95 // We seek to the end once instead of opening in append mode because no
96 // other processes should be writing to it while this one exists.
97 if (lseek(logfile_, 0, SEEK_END) == -1) {
98 printf("Fatal Error: cannot seek to end of logfile (%s)\n",
99 logfilename_);
100 bad_status();
101 return false;
102 }
103 Logger::GlobalLogger()->SetLogFd(logfile_);
104 }
105 return true;
106 }
107
108 // Check that the environment is known and safe to run on.
109 // Return 1 if good, 0 if unsuppported.
CheckEnvironment()110 bool Sat::CheckEnvironment() {
111 // Check that this is not a debug build. Debug builds lack
112 // enough performance to stress the system.
113 #if !defined NDEBUG
114 if (run_on_anything_) {
115 logprintf(1, "Log: Running DEBUG version of SAT, "
116 "with significantly reduced coverage.\n");
117 } else {
118 logprintf(0, "Process Error: Running DEBUG version of SAT, "
119 "with significantly reduced coverage.\n");
120 logprintf(0, "Log: Command line option '-A' bypasses this error.\n");
121 bad_status();
122 return false;
123 }
124 #elif !defined CHECKOPTS
125 #error Build system regression - COPTS disregarded.
126 #endif
127
128 // Check if the cpu frequency test is enabled and able to run.
129 if (cpu_freq_test_) {
130 if (!CpuFreqThread::CanRun()) {
131 logprintf(0, "Process Error: This platform does not support this "
132 "test.\n");
133 bad_status();
134 return false;
135 } else if (cpu_freq_threshold_ <= 0) {
136 logprintf(0, "Process Error: The cpu frequency test requires "
137 "--cpu_freq_threshold set to a value > 0\n");
138 bad_status();
139 return false;
140 } else if (cpu_freq_round_ < 0) {
141 logprintf(0, "Process Error: The --cpu_freq_round option must be greater"
142 " than or equal to zero. A value of zero means no rounding.\n");
143 bad_status();
144 return false;
145 }
146 }
147
148 // Use all CPUs if nothing is specified.
149 if (memory_threads_ == -1) {
150 memory_threads_ = os_->num_cpus();
151 logprintf(7, "Log: Defaulting to %d copy threads\n", memory_threads_);
152 }
153
154 // Use all memory if no size is specified.
155 if (size_mb_ == 0)
156 size_mb_ = os_->FindFreeMemSize() / kMegabyte;
157 size_ = static_cast<int64>(size_mb_) * kMegabyte;
158
159 // Autodetect file locations.
160 if (findfiles_ && (file_threads_ == 0)) {
161 // Get a space separated sting of disk locations.
162 list<string> locations = os_->FindFileDevices();
163
164 // Extract each one.
165 while (!locations.empty()) {
166 // Copy and remove the disk name.
167 string disk = locations.back();
168 locations.pop_back();
169
170 logprintf(12, "Log: disk at %s\n", disk.c_str());
171 file_threads_++;
172 filename_.push_back(disk + "/sat_disk.a");
173 file_threads_++;
174 filename_.push_back(disk + "/sat_disk.b");
175 }
176 }
177
178 // We'd better have some memory by this point.
179 if (size_ < 1) {
180 logprintf(0, "Process Error: No memory found to test.\n");
181 bad_status();
182 return false;
183 }
184
185 if (tag_mode_ && ((file_threads_ > 0) ||
186 (disk_threads_ > 0) ||
187 (net_threads_ > 0))) {
188 logprintf(0, "Process Error: Memory tag mode incompatible "
189 "with disk/network DMA.\n");
190 bad_status();
191 return false;
192 }
193
194 // If platform is 32 bit Xeon, floor memory size to multiple of 4.
195 if (address_mode_ == 32) {
196 size_mb_ = (size_mb_ / 4) * 4;
197 size_ = size_mb_ * kMegabyte;
198 logprintf(1, "Log: Flooring memory allocation to multiple of 4: %lldMB\n",
199 size_mb_);
200 }
201
202 // Check if this system is on the whitelist for supported systems.
203 if (!os_->IsSupported()) {
204 if (run_on_anything_) {
205 logprintf(1, "Log: Unsupported system. Running with reduced coverage.\n");
206 // This is ok, continue on.
207 } else {
208 logprintf(0, "Process Error: Unsupported system, "
209 "no error reporting available\n");
210 logprintf(0, "Log: Command line option '-A' bypasses this error.\n");
211 bad_status();
212 return false;
213 }
214 }
215
216 return true;
217 }
218
219 // Allocates memory to run the test on
AllocateMemory()220 bool Sat::AllocateMemory() {
221 // Allocate our test memory.
222 bool result = os_->AllocateTestMem(size_, paddr_base_);
223 if (!result) {
224 logprintf(0, "Process Error: failed to allocate memory\n");
225 bad_status();
226 return false;
227 }
228 return true;
229 }
230
231 // Sets up access to data patterns
InitializePatterns()232 bool Sat::InitializePatterns() {
233 // Initialize pattern data.
234 patternlist_ = new PatternList();
235 if (!patternlist_) {
236 logprintf(0, "Process Error: failed to allocate patterns\n");
237 bad_status();
238 return false;
239 }
240 if (!patternlist_->Initialize()) {
241 logprintf(0, "Process Error: failed to initialize patternlist\n");
242 bad_status();
243 return false;
244 }
245 return true;
246 }
247
248 // Get any valid page, no tag specified.
GetValid(struct page_entry * pe)249 bool Sat::GetValid(struct page_entry *pe) {
250 return GetValid(pe, kDontCareTag);
251 }
252
253
254 // Fetch and return empty and full pages into the empty and full pools.
GetValid(struct page_entry * pe,int32 tag)255 bool Sat::GetValid(struct page_entry *pe, int32 tag) {
256 bool result = false;
257 // Get valid page depending on implementation.
258 if (pe_q_implementation_ == SAT_FINELOCK)
259 result = finelock_q_->GetValid(pe, tag);
260 else if (pe_q_implementation_ == SAT_ONELOCK)
261 result = valid_->PopRandom(pe);
262
263 if (result) {
264 pe->addr = os_->PrepareTestMem(pe->offset, page_length_); // Map it.
265
266 // Tag this access and current pattern.
267 pe->ts = os_->GetTimestamp();
268 pe->lastpattern = pe->pattern;
269
270 return (pe->addr != 0); // Return success or failure.
271 }
272 return false;
273 }
274
PutValid(struct page_entry * pe)275 bool Sat::PutValid(struct page_entry *pe) {
276 if (pe->addr != 0)
277 os_->ReleaseTestMem(pe->addr, pe->offset, page_length_); // Unmap the page.
278 pe->addr = 0;
279
280 // Put valid page depending on implementation.
281 if (pe_q_implementation_ == SAT_FINELOCK)
282 return finelock_q_->PutValid(pe);
283 else if (pe_q_implementation_ == SAT_ONELOCK)
284 return valid_->Push(pe);
285 else
286 return false;
287 }
288
289 // Get an empty page with any tag.
GetEmpty(struct page_entry * pe)290 bool Sat::GetEmpty(struct page_entry *pe) {
291 return GetEmpty(pe, kDontCareTag);
292 }
293
GetEmpty(struct page_entry * pe,int32 tag)294 bool Sat::GetEmpty(struct page_entry *pe, int32 tag) {
295 bool result = false;
296 // Get empty page depending on implementation.
297 if (pe_q_implementation_ == SAT_FINELOCK)
298 result = finelock_q_->GetEmpty(pe, tag);
299 else if (pe_q_implementation_ == SAT_ONELOCK)
300 result = empty_->PopRandom(pe);
301
302 if (result) {
303 pe->addr = os_->PrepareTestMem(pe->offset, page_length_); // Map it.
304 return (pe->addr != 0); // Return success or failure.
305 }
306 return false;
307 }
308
PutEmpty(struct page_entry * pe)309 bool Sat::PutEmpty(struct page_entry *pe) {
310 if (pe->addr != 0)
311 os_->ReleaseTestMem(pe->addr, pe->offset, page_length_); // Unmap the page.
312 pe->addr = 0;
313
314 // Put empty page depending on implementation.
315 if (pe_q_implementation_ == SAT_FINELOCK)
316 return finelock_q_->PutEmpty(pe);
317 else if (pe_q_implementation_ == SAT_ONELOCK)
318 return empty_->Push(pe);
319 else
320 return false;
321 }
322
323 // Set up the bitmap of physical pages in case we want to see which pages were
324 // accessed under this run of SAT.
AddrMapInit()325 void Sat::AddrMapInit() {
326 if (!do_page_map_)
327 return;
328 // Find about how much physical mem is in the system.
329 // TODO(nsanders): Find some way to get the max
330 // and min phys addr in the system.
331 uint64 maxsize = os_->FindFreeMemSize() * 4;
332 sat_assert(maxsize != 0);
333
334 // Make a bitmask of this many pages. Assume that the memory is relatively
335 // zero based. This is true on x86, typically.
336 // This is one bit per page.
337 uint64 arraysize = maxsize / 4096 / 8;
338 unsigned char *bitmap = new unsigned char[arraysize];
339 sat_assert(bitmap);
340
341 // Mark every page as 0, not seen.
342 memset(bitmap, 0, arraysize);
343
344 page_bitmap_size_ = maxsize;
345 page_bitmap_ = bitmap;
346 }
347
348 // Add the 4k pages in this block to the array of pages SAT has seen.
AddrMapUpdate(struct page_entry * pe)349 void Sat::AddrMapUpdate(struct page_entry *pe) {
350 if (!do_page_map_)
351 return;
352
353 // Go through 4k page blocks.
354 uint64 arraysize = page_bitmap_size_ / 4096 / 8;
355
356 char *base = reinterpret_cast<char*>(pe->addr);
357 for (int i = 0; i < page_length_; i += 4096) {
358 uint64 paddr = os_->VirtualToPhysical(base + i);
359
360 uint32 offset = paddr / 4096 / 8;
361 unsigned char mask = 1 << ((paddr / 4096) % 8);
362
363 if (offset >= arraysize) {
364 logprintf(0, "Process Error: Physical address %#llx is "
365 "greater than expected %#llx.\n",
366 paddr, page_bitmap_size_);
367 sat_assert(0);
368 }
369 page_bitmap_[offset] |= mask;
370 }
371 }
372
373 // Print out the physical memory ranges that SAT has accessed.
AddrMapPrint()374 void Sat::AddrMapPrint() {
375 if (!do_page_map_)
376 return;
377
378 uint64 pages = page_bitmap_size_ / 4096;
379
380 uint64 last_page = 0;
381 bool valid_range = false;
382
383 logprintf(4, "Log: Printing tested physical ranges.\n");
384
385 for (uint64 i = 0; i < pages; i ++) {
386 int offset = i / 8;
387 unsigned char mask = 1 << (i % 8);
388
389 bool touched = page_bitmap_[offset] & mask;
390 if (touched && !valid_range) {
391 valid_range = true;
392 last_page = i * 4096;
393 } else if (!touched && valid_range) {
394 valid_range = false;
395 logprintf(4, "Log: %#016llx - %#016llx\n", last_page, (i * 4096) - 1);
396 }
397 }
398 logprintf(4, "Log: Done printing physical ranges.\n");
399 }
400
401 // Initializes page lists and fills pages with data patterns.
InitializePages()402 bool Sat::InitializePages() {
403 int result = 1;
404 // Calculate needed page totals.
405 int64 neededpages = memory_threads_ +
406 invert_threads_ +
407 check_threads_ +
408 net_threads_ +
409 file_threads_;
410
411 // Empty-valid page ratio is adjusted depending on queue implementation.
412 // since fine-grain-locked queue keeps both valid and empty entries in the
413 // same queue and randomly traverse to find pages, the empty-valid ratio
414 // should be more even.
415 if (pe_q_implementation_ == SAT_FINELOCK)
416 freepages_ = pages_ / 5 * 2; // Mark roughly 2/5 of all pages as Empty.
417 else
418 freepages_ = (pages_ / 100) + (2 * neededpages);
419
420 if (freepages_ < neededpages) {
421 logprintf(0, "Process Error: freepages < neededpages.\n");
422 logprintf(1, "Stats: Total: %lld, Needed: %lld, Marked free: %lld\n",
423 static_cast<int64>(pages_),
424 static_cast<int64>(neededpages),
425 static_cast<int64>(freepages_));
426 bad_status();
427 return false;
428 }
429
430 if (freepages_ > pages_/2) {
431 logprintf(0, "Process Error: not enough pages for IO\n");
432 logprintf(1, "Stats: Total: %lld, Needed: %lld, Available: %lld\n",
433 static_cast<int64>(pages_),
434 static_cast<int64>(freepages_),
435 static_cast<int64>(pages_/2));
436 bad_status();
437 return false;
438 }
439 logprintf(12, "Log: Allocating pages, Total: %lld Free: %lld\n",
440 pages_,
441 freepages_);
442
443 // Initialize page locations.
444 for (int64 i = 0; i < pages_; i++) {
445 struct page_entry pe;
446 init_pe(&pe);
447 pe.offset = i * page_length_;
448 result &= PutEmpty(&pe);
449 }
450
451 if (!result) {
452 logprintf(0, "Process Error: while initializing empty_ list\n");
453 bad_status();
454 return false;
455 }
456
457 // Fill valid pages with test patterns.
458 // Use fill threads to do this.
459 WorkerStatus fill_status;
460 WorkerVector fill_vector;
461
462 logprintf(12, "Starting Fill threads: %d threads, %d pages\n",
463 fill_threads_, pages_);
464 // Initialize the fill threads.
465 for (int i = 0; i < fill_threads_; i++) {
466 FillThread *thread = new FillThread();
467 thread->InitThread(i, this, os_, patternlist_, &fill_status);
468 if (i != fill_threads_ - 1) {
469 logprintf(12, "Starting Fill Threads %d: %d pages\n",
470 i, pages_ / fill_threads_);
471 thread->SetFillPages(pages_ / fill_threads_);
472 // The last thread finishes up all the leftover pages.
473 } else {
474 logprintf(12, "Starting Fill Threads %d: %d pages\n",
475 i, pages_ - pages_ / fill_threads_ * i);
476 thread->SetFillPages(pages_ - pages_ / fill_threads_ * i);
477 }
478 fill_vector.push_back(thread);
479 }
480
481 // Spawn the fill threads.
482 fill_status.Initialize();
483 for (WorkerVector::const_iterator it = fill_vector.begin();
484 it != fill_vector.end(); ++it)
485 (*it)->SpawnThread();
486
487 // Reap the finished fill threads.
488 for (WorkerVector::const_iterator it = fill_vector.begin();
489 it != fill_vector.end(); ++it) {
490 (*it)->JoinThread();
491 if ((*it)->GetStatus() != 1) {
492 logprintf(0, "Thread %d failed with status %d at %.2f seconds\n",
493 (*it)->ThreadID(), (*it)->GetStatus(),
494 (*it)->GetRunDurationUSec() * 1.0/1000000);
495 bad_status();
496 return false;
497 }
498 delete (*it);
499 }
500 fill_vector.clear();
501 fill_status.Destroy();
502 logprintf(12, "Log: Done filling pages.\n");
503 logprintf(12, "Log: Allocating pages.\n");
504
505 AddrMapInit();
506
507 // Initialize page locations.
508 for (int64 i = 0; i < pages_; i++) {
509 struct page_entry pe;
510 // Only get valid pages with uninitialized tags here.
511 if (GetValid(&pe, kInvalidTag)) {
512 int64 paddr = os_->VirtualToPhysical(pe.addr);
513 int32 region = os_->FindRegion(paddr);
514 region_[region]++;
515 pe.paddr = paddr;
516 pe.tag = 1 << region;
517 region_mask_ |= pe.tag;
518
519 // Generate a physical region map
520 AddrMapUpdate(&pe);
521
522 // Note: this does not allocate free pages among all regions
523 // fairly. However, with large enough (thousands) random number
524 // of pages being marked free in each region, the free pages
525 // count in each region end up pretty balanced.
526 if (i < freepages_) {
527 result &= PutEmpty(&pe);
528 } else {
529 result &= PutValid(&pe);
530 }
531 } else {
532 logprintf(0, "Log: didn't tag all pages. %d - %d = %d\n",
533 pages_, i, pages_ - i);
534 return false;
535 }
536 }
537 logprintf(12, "Log: Done allocating pages.\n");
538
539 AddrMapPrint();
540
541 for (int i = 0; i < 32; i++) {
542 if (region_mask_ & (1 << i)) {
543 region_count_++;
544 logprintf(12, "Log: Region %d: %d.\n", i, region_[i]);
545 }
546 }
547 logprintf(5, "Log: Region mask: 0x%x\n", region_mask_);
548
549 return true;
550 }
551
552 // Print SAT version info.
PrintVersion()553 bool Sat::PrintVersion() {
554 logprintf(1, "Stats: SAT revision %s, %d bit binary\n",
555 kVersion, address_mode_);
556 logprintf(5, "Log: %s from %s\n", Timestamp(), BuildChangelist());
557
558 return true;
559 }
560
561
562 // Initializes the resources that SAT needs to run.
563 // This needs to be called before Run(), and after ParseArgs().
564 // Returns true on success, false on error, and will exit() on help message.
Initialize()565 bool Sat::Initialize() {
566 g_sat = this;
567
568 // Initializes sync'd log file to ensure output is saved.
569 if (!InitializeLogfile())
570 return false;
571 Logger::GlobalLogger()->SetTimestampLogging(log_timestamps_);
572 Logger::GlobalLogger()->StartThread();
573
574 logprintf(5, "Log: Commandline - %s\n", cmdline_.c_str());
575 PrintVersion();
576
577 std::map<std::string, std::string> options;
578
579 GoogleOsOptions(&options);
580
581 // Initialize OS/Hardware interface.
582 os_ = OsLayerFactory(options);
583 if (!os_) {
584 bad_status();
585 return false;
586 }
587
588 if (min_hugepages_mbytes_ > 0)
589 os_->SetMinimumHugepagesSize(min_hugepages_mbytes_ * kMegabyte);
590
591 if (reserve_mb_ > 0)
592 os_->SetReserveSize(reserve_mb_);
593
594 if (channels_.size() > 0) {
595 logprintf(6, "Log: Decoding memory: %dx%d bit channels,"
596 "%d modules per channel (x%d), decoding hash 0x%x\n",
597 channels_.size(), channel_width_, channels_[0].size(),
598 channel_width_/channels_[0].size(), channel_hash_);
599 os_->SetDramMappingParams(channel_hash_, channel_width_, &channels_);
600 }
601
602 if (!os_->Initialize()) {
603 logprintf(0, "Process Error: Failed to initialize OS layer\n");
604 bad_status();
605 delete os_;
606 return false;
607 }
608
609 // Checks that OS/Build/Platform is supported.
610 if (!CheckEnvironment())
611 return false;
612
613 if (error_injection_)
614 os_->set_error_injection(true);
615
616 // Run SAT in monitor only mode, do not continue to allocate resources.
617 if (monitor_mode_) {
618 logprintf(5, "Log: Running in monitor-only mode. "
619 "Will not allocate any memory nor run any stress test. "
620 "Only polling ECC errors.\n");
621 return true;
622 }
623
624 // Allocate the memory to test.
625 if (!AllocateMemory())
626 return false;
627
628 logprintf(5, "Stats: Starting SAT, %dM, %d seconds\n",
629 static_cast<int>(size_/kMegabyte),
630 runtime_seconds_);
631
632 if (!InitializePatterns())
633 return false;
634
635 // Initialize memory allocation.
636 pages_ = size_ / page_length_;
637
638 // Allocate page queue depending on queue implementation switch.
639 if (pe_q_implementation_ == SAT_FINELOCK) {
640 finelock_q_ = new FineLockPEQueue(pages_, page_length_);
641 if (finelock_q_ == NULL)
642 return false;
643 finelock_q_->set_os(os_);
644 os_->set_err_log_callback(finelock_q_->get_err_log_callback());
645 } else if (pe_q_implementation_ == SAT_ONELOCK) {
646 empty_ = new PageEntryQueue(pages_);
647 valid_ = new PageEntryQueue(pages_);
648 if ((empty_ == NULL) || (valid_ == NULL))
649 return false;
650 }
651
652 if (!InitializePages()) {
653 logprintf(0, "Process Error: Initialize Pages failed\n");
654 return false;
655 }
656
657 return true;
658 }
659
660 // Constructor and destructor.
Sat()661 Sat::Sat() {
662 // Set defaults, command line might override these.
663 runtime_seconds_ = 20;
664 page_length_ = kSatPageSize;
665 disk_pages_ = kSatDiskPage;
666 pages_ = 0;
667 size_mb_ = 0;
668 size_ = size_mb_ * kMegabyte;
669 reserve_mb_ = 0;
670 min_hugepages_mbytes_ = 0;
671 freepages_ = 0;
672 paddr_base_ = 0;
673 channel_hash_ = kCacheLineSize;
674 channel_width_ = 64;
675
676 user_break_ = false;
677 verbosity_ = 8;
678 Logger::GlobalLogger()->SetVerbosity(verbosity_);
679 print_delay_ = 10;
680 strict_ = 1;
681 warm_ = 0;
682 run_on_anything_ = 0;
683 use_logfile_ = 0;
684 logfile_ = 0;
685 log_timestamps_ = true;
686 // Detect 32/64 bit binary.
687 void *pvoid = 0;
688 address_mode_ = sizeof(pvoid) * 8;
689 error_injection_ = false;
690 crazy_error_injection_ = false;
691 max_errorcount_ = 0; // Zero means no early exit.
692 stop_on_error_ = false;
693 error_poll_ = true;
694 findfiles_ = false;
695
696 do_page_map_ = false;
697 page_bitmap_ = 0;
698 page_bitmap_size_ = 0;
699
700 // Cache coherency data initialization.
701 cc_test_ = false; // Flag to trigger cc threads.
702 cc_cacheline_count_ = 2; // Two datastructures of cache line size.
703 cc_cacheline_size_ = 0; // Size of a cacheline (0 for auto-detect).
704 cc_inc_count_ = 1000; // Number of times to increment the shared variable.
705 cc_cacheline_data_ = 0; // Cache Line size datastructure.
706
707 // Cpu frequency data initialization.
708 cpu_freq_test_ = false; // Flag to trigger cpu frequency thread.
709 cpu_freq_threshold_ = 0; // Threshold, in MHz, at which a cpu fails.
710 cpu_freq_round_ = 10; // Round the computed frequency to this value.
711
712 sat_assert(0 == pthread_mutex_init(&worker_lock_, NULL));
713 file_threads_ = 0;
714 net_threads_ = 0;
715 listen_threads_ = 0;
716 // Default to autodetect number of cpus, and run that many threads.
717 memory_threads_ = -1;
718 invert_threads_ = 0;
719 fill_threads_ = 8;
720 check_threads_ = 0;
721 cpu_stress_threads_ = 0;
722 disk_threads_ = 0;
723 total_threads_ = 0;
724
725 region_mask_ = 0;
726 region_count_ = 0;
727 for (int i = 0; i < 32; i++) {
728 region_[i] = 0;
729 }
730 region_mode_ = 0;
731
732 errorcount_ = 0;
733 statuscount_ = 0;
734
735 valid_ = 0;
736 empty_ = 0;
737 finelock_q_ = 0;
738 // Default to use fine-grain lock for better performance.
739 pe_q_implementation_ = SAT_FINELOCK;
740
741 os_ = 0;
742 patternlist_ = 0;
743 logfilename_[0] = 0;
744
745 read_block_size_ = 512;
746 write_block_size_ = -1;
747 segment_size_ = -1;
748 cache_size_ = -1;
749 blocks_per_segment_ = -1;
750 read_threshold_ = -1;
751 write_threshold_ = -1;
752 non_destructive_ = 1;
753 monitor_mode_ = 0;
754 tag_mode_ = 0;
755 random_threads_ = 0;
756
757 pause_delay_ = 600;
758 pause_duration_ = 15;
759 }
760
761 // Destructor.
~Sat()762 Sat::~Sat() {
763 // We need to have called Cleanup() at this point.
764 // We should probably enforce this.
765 }
766
767
// Argument matching helpers for a command line parsing loop.
// Each one expects `argc`, `argv` and the loop index `i` to be in scope and
// must be expanded directly inside the loop body, because a match ends with
// `continue`.

// Flag without a value: on a match, store the fixed `value` in `variable`.
#define ARG_KVALUE(argument, variable, value)  \
  if (strcmp(argv[i], argument) == 0) {        \
    variable = value;                          \
    continue;                                  \
  }

// Flag followed by a number: on a match, consume the next argument and
// store it, parsed by strtoull() with automatic base detection. If there is
// no next argument, `variable` is left unchanged.
#define ARG_IVALUE(argument, variable)         \
  if (strcmp(argv[i], argument) == 0) {        \
    if (++i < argc)                            \
      variable = strtoull(argv[i], NULL, 0);   \
    continue;                                  \
  }

// Flag followed by a string: on a match, consume the next argument and copy
// it into the char array `variable`, truncated to fit. If there is no next
// argument, `variable` is left unchanged.
#define ARG_SVALUE(argument, variable)                       \
  if (strcmp(argv[i], argument) == 0) {                      \
    if (++i < argc)                                          \
      snprintf(variable, sizeof(variable), "%s", argv[i]);   \
    continue;                                                \
  }
789
790 // Configures SAT from command line arguments.
791 // This will call exit() given a request for
792 // self-documentation or unexpected args.
ParseArgs(int argc,char ** argv)793 bool Sat::ParseArgs(int argc, char **argv) {
794 int i;
795 uint64 filesize = page_length_ * disk_pages_;
796
797 // Parse each argument.
798 for (i = 1; i < argc; i++) {
799 // Switch to fall back to corase-grain-lock queue. (for benchmarking)
800 ARG_KVALUE("--coarse_grain_lock", pe_q_implementation_, SAT_ONELOCK);
801
802 // Set number of megabyte to use.
803 ARG_IVALUE("-M", size_mb_);
804
805 // Specify the amount of megabytes to be reserved for system.
806 ARG_IVALUE("--reserve_memory", reserve_mb_);
807
808 // Set minimum megabytes of hugepages to require.
809 ARG_IVALUE("-H", min_hugepages_mbytes_);
810
811 // Set number of seconds to run.
812 ARG_IVALUE("-s", runtime_seconds_);
813
814 // Set number of memory copy threads.
815 ARG_IVALUE("-m", memory_threads_);
816
817 // Set number of memory invert threads.
818 ARG_IVALUE("-i", invert_threads_);
819
820 // Set number of check-only threads.
821 ARG_IVALUE("-c", check_threads_);
822
823 // Set number of cache line size datastructures.
824 ARG_IVALUE("--cc_inc_count", cc_inc_count_);
825
826 // Set number of cache line size datastructures
827 ARG_IVALUE("--cc_line_count", cc_cacheline_count_);
828
829 // Override the detected or assumed cache line size.
830 ARG_IVALUE("--cc_line_size", cc_cacheline_size_);
831
832 // Flag set when cache coherency tests need to be run
833 ARG_KVALUE("--cc_test", cc_test_, true);
834
835 // Set when the cpu_frequency test needs to be run
836 ARG_KVALUE("--cpu_freq_test", cpu_freq_test_, true);
837
838 // Set the threshold in MHz at which the cpu frequency test will fail.
839 ARG_IVALUE("--cpu_freq_threshold", cpu_freq_threshold_);
840
841 // Set the rounding value for the cpu frequency test. The default is to
842 // round to the nearest 10s value.
843 ARG_IVALUE("--cpu_freq_round", cpu_freq_round_);
844
845 // Set number of CPU stress threads.
846 ARG_IVALUE("-C", cpu_stress_threads_);
847
848 // Set logfile name.
849 ARG_SVALUE("-l", logfilename_);
850
851 // Verbosity level.
852 ARG_IVALUE("-v", verbosity_);
853
854 // Chatty printout level.
855 ARG_IVALUE("--printsec", print_delay_);
856
857 // Turn off timestamps logging.
858 ARG_KVALUE("--no_timestamps", log_timestamps_, false);
859
860 // Set maximum number of errors to collect. Stop running after this many.
861 ARG_IVALUE("--max_errors", max_errorcount_);
862
863 // Set pattern block size.
864 ARG_IVALUE("-p", page_length_);
865
866 // Set pattern block size.
867 ARG_IVALUE("--filesize", filesize);
868
869 // NUMA options.
870 ARG_KVALUE("--local_numa", region_mode_, kLocalNuma);
871 ARG_KVALUE("--remote_numa", region_mode_, kRemoteNuma);
872
873 // Autodetect tempfile locations.
874 ARG_KVALUE("--findfiles", findfiles_, 1);
875
876 // Inject errors to force miscompare code paths
877 ARG_KVALUE("--force_errors", error_injection_, true);
878 ARG_KVALUE("--force_errors_like_crazy", crazy_error_injection_, true);
879 if (crazy_error_injection_)
880 error_injection_ = true;
881
882 // Stop immediately on any arror, for debugging HW problems.
883 ARG_KVALUE("--stop_on_errors", stop_on_error_, 1);
884
885 // Don't use internal error polling, allow external detection.
886 ARG_KVALUE("--no_errors", error_poll_, 0);
887
888 // Never check data as you go.
889 ARG_KVALUE("-F", strict_, 0);
890
891 // Warm the cpu as you go.
892 ARG_KVALUE("-W", warm_, 1);
893
894 // Allow runnign on unknown systems with base unimplemented OsLayer
895 ARG_KVALUE("-A", run_on_anything_, 1);
896
897 // Size of read blocks for disk test.
898 ARG_IVALUE("--read-block-size", read_block_size_);
899
900 // Size of write blocks for disk test.
901 ARG_IVALUE("--write-block-size", write_block_size_);
902
903 // Size of segment for disk test.
904 ARG_IVALUE("--segment-size", segment_size_);
905
906 // Size of disk cache size for disk test.
907 ARG_IVALUE("--cache-size", cache_size_);
908
909 // Number of blocks to test per segment.
910 ARG_IVALUE("--blocks-per-segment", blocks_per_segment_);
911
912 // Maximum time a block read should take before warning.
913 ARG_IVALUE("--read-threshold", read_threshold_);
914
915 // Maximum time a block write should take before warning.
916 ARG_IVALUE("--write-threshold", write_threshold_);
917
918 // Do not write anything to disk in the disk test.
919 ARG_KVALUE("--destructive", non_destructive_, 0);
920
921 // Run SAT in monitor mode. No test load at all.
922 ARG_KVALUE("--monitor_mode", monitor_mode_, true);
923
924 // Run SAT in address mode. Tag all cachelines by virt addr.
925 ARG_KVALUE("--tag_mode", tag_mode_, true);
926
927 // Dump range map of tested pages..
928 ARG_KVALUE("--do_page_map", do_page_map_, true);
929
930 // Specify the physical address base to test.
931 ARG_IVALUE("--paddr_base", paddr_base_);
932
933 // Specify the frequency for power spikes.
934 ARG_IVALUE("--pause_delay", pause_delay_);
935
936 // Specify the duration of each pause (for power spikes).
937 ARG_IVALUE("--pause_duration", pause_duration_);
938
939 // Disk device names
940 if (!strcmp(argv[i], "-d")) {
941 i++;
942 if (i < argc) {
943 disk_threads_++;
944 diskfilename_.push_back(string(argv[i]));
945 blocktables_.push_back(new DiskBlockTable());
946 }
947 continue;
948 }
949
950 // Set number of disk random threads for each disk write thread.
951 ARG_IVALUE("--random-threads", random_threads_);
952
953 // Set a tempfile to use in a file thread.
954 if (!strcmp(argv[i], "-f")) {
955 i++;
956 if (i < argc) {
957 file_threads_++;
958 filename_.push_back(string(argv[i]));
959 }
960 continue;
961 }
962
963 // Set a hostname to use in a network thread.
964 if (!strcmp(argv[i], "-n")) {
965 i++;
966 if (i < argc) {
967 net_threads_++;
968 ipaddrs_.push_back(string(argv[i]));
969 }
970 continue;
971 }
972
973 // Run threads that listen for incoming SAT net connections.
974 ARG_KVALUE("--listen", listen_threads_, 1);
975
976 if (CheckGoogleSpecificArgs(argc, argv, &i)) {
977 continue;
978 }
979
980 ARG_IVALUE("--channel_hash", channel_hash_);
981 ARG_IVALUE("--channel_width", channel_width_);
982
983 if (!strcmp(argv[i], "--memory_channel")) {
984 i++;
985 if (i < argc) {
986 char *channel = argv[i];
987 channels_.push_back(vector<string>());
988 while (char* next = strchr(channel, ',')) {
989 channels_.back().push_back(string(channel, next - channel));
990 channel = next + 1;
991 }
992 channels_.back().push_back(string(channel));
993 }
994 continue;
995 }
996
997 // Default:
998 PrintVersion();
999 PrintHelp();
1000 if (strcmp(argv[i], "-h") && strcmp(argv[i], "--help")) {
1001 printf("\n Unknown argument %s\n", argv[i]);
1002 bad_status();
1003 exit(1);
1004 }
1005 // Forget it, we printed the help, just bail.
1006 // We don't want to print test status, or any log parser stuff.
1007 exit(0);
1008 }
1009
1010 Logger::GlobalLogger()->SetVerbosity(verbosity_);
1011
1012 // Update relevant data members with parsed input.
1013 // Translate MB into bytes.
1014 size_ = static_cast<int64>(size_mb_) * kMegabyte;
1015
1016 // Set logfile flag.
1017 if (strcmp(logfilename_, ""))
1018 use_logfile_ = 1;
1019 // Checks valid page length.
1020 if (page_length_ &&
1021 !(page_length_ & (page_length_ - 1)) &&
1022 (page_length_ > 1023)) {
1023 // Prints if we have changed from default.
1024 if (page_length_ != kSatPageSize)
1025 logprintf(12, "Log: Updating page size to %d\n", page_length_);
1026 } else {
1027 // Revert to default page length.
1028 logprintf(6, "Process Error: "
1029 "Invalid page size %d\n", page_length_);
1030 page_length_ = kSatPageSize;
1031 return false;
1032 }
1033
1034 // Set disk_pages_ if filesize or page size changed.
1035 if (filesize != static_cast<uint64>(page_length_) *
1036 static_cast<uint64>(disk_pages_)) {
1037 disk_pages_ = filesize / page_length_;
1038 if (disk_pages_ == 0)
1039 disk_pages_ = 1;
1040 }
1041
1042 // Validate memory channel parameters if supplied
1043 if (channels_.size()) {
1044 if (channels_.size() == 1) {
1045 channel_hash_ = 0;
1046 logprintf(7, "Log: "
1047 "Only one memory channel...deactivating interleave decoding.\n");
1048 } else if (channels_.size() > 2) {
1049 logprintf(6, "Process Error: "
1050 "Triple-channel mode not yet supported... sorry.\n");
1051 bad_status();
1052 return false;
1053 }
1054 for (uint i = 0; i < channels_.size(); i++)
1055 if (channels_[i].size() != channels_[0].size()) {
1056 logprintf(6, "Process Error: "
1057 "Channels 0 and %d have a different count of dram modules.\n", i);
1058 bad_status();
1059 return false;
1060 }
1061 if (channels_[0].size() & (channels_[0].size() - 1)) {
1062 logprintf(6, "Process Error: "
1063 "Amount of modules per memory channel is not a power of 2.\n");
1064 bad_status();
1065 return false;
1066 }
1067 if (channel_width_ < 16
1068 || channel_width_ & (channel_width_ - 1)) {
1069 logprintf(6, "Process Error: "
1070 "Channel width %d is invalid.\n", channel_width_);
1071 bad_status();
1072 return false;
1073 }
1074 if (channel_width_ / channels_[0].size() < 8) {
1075 logprintf(6, "Process Error: Chip width x%d must be x8 or greater.\n",
1076 channel_width_ / channels_[0].size());
1077 bad_status();
1078 return false;
1079 }
1080 }
1081
1082
1083 // Print each argument.
1084 for (int i = 0; i < argc; i++) {
1085 if (i)
1086 cmdline_ += " ";
1087 cmdline_ += argv[i];
1088 }
1089
1090 return true;
1091 }
1092
// Print the command-line usage summary to stdout.
//
// The text is a single printf() call built from adjacent string literals,
// one entry per supported option. It contains no format specifiers, so the
// output is exactly the concatenated literals.
void Sat::PrintHelp() {
  printf("Usage: ./sat(32|64) [options]\n"
         " -M mbytes megabytes of ram to test\n"
         " --reserve-memory If not using hugepages, the amount of memory to "
         " reserve for the system\n"
         " -H mbytes minimum megabytes of hugepages to require\n"
         " -s seconds number of seconds to run\n"
         " -m threads number of memory copy threads to run\n"
         " -i threads number of memory invert threads to run\n"
         " -C threads number of memory CPU stress threads to run\n"
         " --findfiles find locations to do disk IO automatically\n"
         " -d device add a direct write disk thread with block "
         "device (or file) 'device'\n"
         " -f filename add a disk thread with "
         "tempfile 'filename'\n"
         " -l logfile log output to file 'logfile'\n"
         " --no_timestamps do not prefix timestamps to log messages\n"
         " --max_errors n exit early after finding 'n' errors\n"
         " -v level verbosity (0-20), default is 8\n"
         " --printsec secs How often to print 'seconds remaining'\n"
         " -W Use more CPU-stressful memory copy\n"
         " -A run in degraded mode on incompatible systems\n"
         " -p pagesize size in bytes of memory chunks\n"
         " --filesize size size of disk IO tempfiles\n"
         " -n ipaddr add a network thread connecting to "
         "system at 'ipaddr'\n"
         " --listen run a thread to listen for and respond "
         "to network threads.\n"
         " --no_errors run without checking for ECC or other errors\n"
         " --force_errors inject false errors to test error handling\n"
         " --force_errors_like_crazy inject a lot of false errors "
         "to test error handling\n"
         " -F don't result check each transaction\n"
         " --stop_on_errors Stop after finding the first error.\n"
         " --read-block-size size of block for reading (-d)\n"
         " --write-block-size size of block for writing (-d). If not "
         "defined, the size of block for writing will be defined as the "
         "size of block for reading\n"
         " --segment-size size of segments to split disk into (-d)\n"
         " --cache-size size of disk cache (-d)\n"
         " --blocks-per-segment number of blocks to read/write per "
         "segment per iteration (-d)\n"
         " --read-threshold maximum time (in us) a block read should "
         "take (-d)\n"
         " --write-threshold maximum time (in us) a block write "
         "should take (-d)\n"
         " --random-threads number of random threads for each disk "
         "write thread (-d)\n"
         " --destructive write/wipe disk partition (-d)\n"
         " --monitor_mode only do ECC error polling, no stress load.\n"
         " --cc_test do the cache coherency testing\n"
         " --cc_inc_count number of times to increment the "
         "cacheline's member\n"
         " --cc_line_count number of cache line sized datastructures "
         "to allocate for the cache coherency threads to operate\n"
         " --cc_line_size override the auto-detected cache line size\n"
         " --cpu_freq_test enable the cpu frequency test (requires the "
         "--cpu_freq_threshold argument to be set)\n"
         " --cpu_freq_threshold fail the cpu frequency test if the frequency "
         "goes below this value (specified in MHz)\n"
         " --cpu_freq_round round the computed frequency to this value, if set"
         " to zero, only round to the nearest MHz\n"
         " --paddr_base allocate memory starting from this address\n"
         " --pause_delay delay (in seconds) between power spikes\n"
         " --pause_duration duration (in seconds) of each pause\n"
         " --local_numa choose memory regions associated with "
         "each CPU to be tested by that CPU\n"
         " --remote_numa choose memory regions not associated with "
         "each CPU to be tested by that CPU\n"
         " --channel_hash mask of address bits XORed to determine channel. "
         "Mask 0x40 interleaves cachelines between channels\n"
         " --channel_width bits width in bits of each memory channel\n"
         " --memory_channel u1,u2 defines a comma-separated list of names "
         "for dram packages in a memory channel. Use multiple times to "
         "define multiple channels.\n");
}
1169
CheckGoogleSpecificArgs(int argc,char ** argv,int * i)1170 bool Sat::CheckGoogleSpecificArgs(int argc, char **argv, int *i) {
1171 // Do nothing, no google-specific argument on public stressapptest
1172 return false;
1173 }
1174
GoogleOsOptions(std::map<std::string,std::string> * options)1175 void Sat::GoogleOsOptions(std::map<std::string, std::string> *options) {
1176 // Do nothing, no OS-specific argument on public stressapptest
1177 }
1178
1179 // Launch the SAT task threads. Returns 0 on error.
InitializeThreads()1180 void Sat::InitializeThreads() {
1181 // Memory copy threads.
1182 AcquireWorkerLock();
1183
1184 logprintf(12, "Log: Starting worker threads\n");
1185 WorkerVector *memory_vector = new WorkerVector();
1186
1187 // Error polling thread.
1188 // This may detect ECC corrected errors, disk problems, or
1189 // any other errors normally hidden from userspace.
1190 WorkerVector *error_vector = new WorkerVector();
1191 if (error_poll_) {
1192 ErrorPollThread *thread = new ErrorPollThread();
1193 thread->InitThread(total_threads_++, this, os_, patternlist_,
1194 &continuous_status_);
1195
1196 error_vector->insert(error_vector->end(), thread);
1197 } else {
1198 logprintf(5, "Log: Skipping error poll thread due to --no_errors flag\n");
1199 }
1200 workers_map_.insert(make_pair(kErrorType, error_vector));
1201
1202 // Only start error poll threads for monitor-mode SAT,
1203 // skip all other types of worker threads.
1204 if (monitor_mode_) {
1205 ReleaseWorkerLock();
1206 return;
1207 }
1208
1209 for (int i = 0; i < memory_threads_; i++) {
1210 CopyThread *thread = new CopyThread();
1211 thread->InitThread(total_threads_++, this, os_, patternlist_,
1212 &power_spike_status_);
1213
1214 if ((region_count_ > 1) && (region_mode_)) {
1215 int32 region = region_find(i % region_count_);
1216 cpu_set_t *cpuset = os_->FindCoreMask(region);
1217 sat_assert(cpuset);
1218 if (region_mode_ == kLocalNuma) {
1219 // Choose regions associated with this CPU.
1220 thread->set_cpu_mask(cpuset);
1221 thread->set_tag(1 << region);
1222 } else if (region_mode_ == kRemoteNuma) {
1223 // Choose regions not associated with this CPU..
1224 thread->set_cpu_mask(cpuset);
1225 thread->set_tag(region_mask_ & ~(1 << region));
1226 }
1227 } else {
1228 cpu_set_t available_cpus;
1229 thread->AvailableCpus(&available_cpus);
1230 int cores = cpuset_count(&available_cpus);
1231 // Don't restrict thread location if we have more than one
1232 // thread per core. Not so good for performance.
1233 if (cpu_stress_threads_ + memory_threads_ <= cores) {
1234 // Place a thread on alternating cores first.
1235 // This assures interleaved core use with no overlap.
1236 int nthcore = i;
1237 int nthbit = (((2 * nthcore) % cores) +
1238 (((2 * nthcore) / cores) % 2)) % cores;
1239 cpu_set_t all_cores;
1240 cpuset_set_ab(&all_cores, 0, cores);
1241 if (!cpuset_isequal(&available_cpus, &all_cores)) {
1242 // We are assuming the bits are contiguous.
1243 // Complain if this is not so.
1244 logprintf(0, "Log: cores = %s, expected %s\n",
1245 cpuset_format(&available_cpus).c_str(),
1246 cpuset_format(&all_cores).c_str());
1247 }
1248
1249 // Set thread affinity.
1250 thread->set_cpu_mask_to_cpu(nthbit);
1251 }
1252 }
1253 memory_vector->insert(memory_vector->end(), thread);
1254 }
1255 workers_map_.insert(make_pair(kMemoryType, memory_vector));
1256
1257 // File IO threads.
1258 WorkerVector *fileio_vector = new WorkerVector();
1259 for (int i = 0; i < file_threads_; i++) {
1260 FileThread *thread = new FileThread();
1261 thread->InitThread(total_threads_++, this, os_, patternlist_,
1262 &power_spike_status_);
1263 thread->SetFile(filename_[i].c_str());
1264 // Set disk threads high priority. They don't take much processor time,
1265 // but blocking them will delay disk IO.
1266 thread->SetPriority(WorkerThread::High);
1267
1268 fileio_vector->insert(fileio_vector->end(), thread);
1269 }
1270 workers_map_.insert(make_pair(kFileIOType, fileio_vector));
1271
1272 // Net IO threads.
1273 WorkerVector *netio_vector = new WorkerVector();
1274 WorkerVector *netslave_vector = new WorkerVector();
1275 if (listen_threads_ > 0) {
1276 // Create a network slave thread. This listens for connections.
1277 NetworkListenThread *thread = new NetworkListenThread();
1278 thread->InitThread(total_threads_++, this, os_, patternlist_,
1279 &continuous_status_);
1280
1281 netslave_vector->insert(netslave_vector->end(), thread);
1282 }
1283 for (int i = 0; i < net_threads_; i++) {
1284 NetworkThread *thread = new NetworkThread();
1285 thread->InitThread(total_threads_++, this, os_, patternlist_,
1286 &continuous_status_);
1287 thread->SetIP(ipaddrs_[i].c_str());
1288
1289 netio_vector->insert(netio_vector->end(), thread);
1290 }
1291 workers_map_.insert(make_pair(kNetIOType, netio_vector));
1292 workers_map_.insert(make_pair(kNetSlaveType, netslave_vector));
1293
1294 // Result check threads.
1295 WorkerVector *check_vector = new WorkerVector();
1296 for (int i = 0; i < check_threads_; i++) {
1297 CheckThread *thread = new CheckThread();
1298 thread->InitThread(total_threads_++, this, os_, patternlist_,
1299 &continuous_status_);
1300
1301 check_vector->insert(check_vector->end(), thread);
1302 }
1303 workers_map_.insert(make_pair(kCheckType, check_vector));
1304
1305 // Memory invert threads.
1306 logprintf(12, "Log: Starting invert threads\n");
1307 WorkerVector *invert_vector = new WorkerVector();
1308 for (int i = 0; i < invert_threads_; i++) {
1309 InvertThread *thread = new InvertThread();
1310 thread->InitThread(total_threads_++, this, os_, patternlist_,
1311 &continuous_status_);
1312
1313 invert_vector->insert(invert_vector->end(), thread);
1314 }
1315 workers_map_.insert(make_pair(kInvertType, invert_vector));
1316
1317 // Disk stress threads.
1318 WorkerVector *disk_vector = new WorkerVector();
1319 WorkerVector *random_vector = new WorkerVector();
1320 logprintf(12, "Log: Starting disk stress threads\n");
1321 for (int i = 0; i < disk_threads_; i++) {
1322 // Creating write threads
1323 DiskThread *thread = new DiskThread(blocktables_[i]);
1324 thread->InitThread(total_threads_++, this, os_, patternlist_,
1325 &power_spike_status_);
1326 thread->SetDevice(diskfilename_[i].c_str());
1327 if (thread->SetParameters(read_block_size_, write_block_size_,
1328 segment_size_, cache_size_,
1329 blocks_per_segment_,
1330 read_threshold_, write_threshold_,
1331 non_destructive_)) {
1332 disk_vector->insert(disk_vector->end(), thread);
1333 } else {
1334 logprintf(12, "Log: DiskThread::SetParameters() failed\n");
1335 delete thread;
1336 }
1337
1338 for (int j = 0; j < random_threads_; j++) {
1339 // Creating random threads
1340 RandomDiskThread *rthread = new RandomDiskThread(blocktables_[i]);
1341 rthread->InitThread(total_threads_++, this, os_, patternlist_,
1342 &power_spike_status_);
1343 rthread->SetDevice(diskfilename_[i].c_str());
1344 if (rthread->SetParameters(read_block_size_, write_block_size_,
1345 segment_size_, cache_size_,
1346 blocks_per_segment_,
1347 read_threshold_, write_threshold_,
1348 non_destructive_)) {
1349 random_vector->insert(random_vector->end(), rthread);
1350 } else {
1351 logprintf(12, "Log: RandomDiskThread::SetParameters() failed\n");
1352 delete rthread;
1353 }
1354 }
1355 }
1356
1357 workers_map_.insert(make_pair(kDiskType, disk_vector));
1358 workers_map_.insert(make_pair(kRandomDiskType, random_vector));
1359
1360 // CPU stress threads.
1361 WorkerVector *cpu_vector = new WorkerVector();
1362 logprintf(12, "Log: Starting cpu stress threads\n");
1363 for (int i = 0; i < cpu_stress_threads_; i++) {
1364 CpuStressThread *thread = new CpuStressThread();
1365 thread->InitThread(total_threads_++, this, os_, patternlist_,
1366 &continuous_status_);
1367
1368 // Don't restrict thread location if we have more than one
1369 // thread per core. Not so good for performance.
1370 cpu_set_t available_cpus;
1371 thread->AvailableCpus(&available_cpus);
1372 int cores = cpuset_count(&available_cpus);
1373 if (cpu_stress_threads_ + memory_threads_ <= cores) {
1374 // Place a thread on alternating cores first.
1375 // Go in reverse order for CPU stress threads. This assures interleaved
1376 // core use with no overlap.
1377 int nthcore = (cores - 1) - i;
1378 int nthbit = (((2 * nthcore) % cores) +
1379 (((2 * nthcore) / cores) % 2)) % cores;
1380 cpu_set_t all_cores;
1381 cpuset_set_ab(&all_cores, 0, cores);
1382 if (!cpuset_isequal(&available_cpus, &all_cores)) {
1383 logprintf(0, "Log: cores = %s, expected %s\n",
1384 cpuset_format(&available_cpus).c_str(),
1385 cpuset_format(&all_cores).c_str());
1386 }
1387
1388 // Set thread affinity.
1389 thread->set_cpu_mask_to_cpu(nthbit);
1390 }
1391
1392
1393 cpu_vector->insert(cpu_vector->end(), thread);
1394 }
1395 workers_map_.insert(make_pair(kCPUType, cpu_vector));
1396
1397 // CPU Cache Coherency Threads - one for each core available.
1398 if (cc_test_) {
1399 WorkerVector *cc_vector = new WorkerVector();
1400 logprintf(12, "Log: Starting cpu cache coherency threads\n");
1401
1402 // Allocate the shared datastructure to be worked on by the threads.
1403 cc_cacheline_data_ = reinterpret_cast<cc_cacheline_data*>(
1404 malloc(sizeof(cc_cacheline_data) * cc_cacheline_count_));
1405 sat_assert(cc_cacheline_data_ != NULL);
1406
1407 // Initialize the strucutre.
1408 memset(cc_cacheline_data_, 0,
1409 sizeof(cc_cacheline_data) * cc_cacheline_count_);
1410
1411 int num_cpus = CpuCount();
1412 char *num;
1413 // Calculate the number of cache lines needed just to give each core
1414 // its own counter.
1415 int line_size = cc_cacheline_size_;
1416 if (line_size <= 0) {
1417 line_size = CacheLineSize();
1418 if (line_size < kCacheLineSize)
1419 line_size = kCacheLineSize;
1420 logprintf(12, "Log: Using %d as cache line size\n", line_size);
1421 }
1422 // The number of cache lines needed to hold an array of num_cpus.
1423 // "num" must be the same type as cc_cacheline_data[X].num or the memory
1424 // size calculations will fail.
1425 int needed_lines = (sizeof(*num) * num_cpus + line_size - 1) / line_size;
1426 // Allocate all the nums once so that we get a single chunk
1427 // of contiguous memory.
1428 #ifdef HAVE_POSIX_MEMALIGN
1429 int err_result = posix_memalign(
1430 reinterpret_cast<void**>(&num),
1431 line_size, line_size * needed_lines * cc_cacheline_count_);
1432 #else
1433 num = reinterpret_cast<char*>(memalign(
1434 line_size, line_size * needed_lines * cc_cacheline_count_));
1435 int err_result = (num == 0);
1436 #endif
1437 sat_assert(err_result == 0);
1438
1439 int cline;
1440 for (cline = 0; cline < cc_cacheline_count_; cline++) {
1441 memset(num, 0, sizeof(*num) * num_cpus);
1442 cc_cacheline_data_[cline].num = num;
1443 num += (line_size * needed_lines) / sizeof(*num);
1444 }
1445
1446 int tnum;
1447 for (tnum = 0; tnum < num_cpus; tnum++) {
1448 CpuCacheCoherencyThread *thread =
1449 new CpuCacheCoherencyThread(cc_cacheline_data_, cc_cacheline_count_,
1450 tnum, num_cpus, cc_inc_count_);
1451 thread->InitThread(total_threads_++, this, os_, patternlist_,
1452 &continuous_status_);
1453 // Pin the thread to a particular core.
1454 thread->set_cpu_mask_to_cpu(tnum);
1455
1456 // Insert the thread into the vector.
1457 cc_vector->insert(cc_vector->end(), thread);
1458 }
1459 workers_map_.insert(make_pair(kCCType, cc_vector));
1460 }
1461
1462 if (cpu_freq_test_) {
1463 // Create the frequency test thread.
1464 logprintf(5, "Log: Running cpu frequency test: threshold set to %dMHz.\n",
1465 cpu_freq_threshold_);
1466 CpuFreqThread *thread = new CpuFreqThread(CpuCount(), cpu_freq_threshold_,
1467 cpu_freq_round_);
1468 // This thread should be paused when other threads are paused.
1469 thread->InitThread(total_threads_++, this, os_, NULL,
1470 &power_spike_status_);
1471
1472 WorkerVector *cpu_freq_vector = new WorkerVector();
1473 cpu_freq_vector->insert(cpu_freq_vector->end(), thread);
1474 workers_map_.insert(make_pair(kCPUFreqType, cpu_freq_vector));
1475 }
1476
1477 ReleaseWorkerLock();
1478 }
1479
1480 // Return the number of cpus actually present in the machine.
CpuCount()1481 int Sat::CpuCount() {
1482 return sysconf(_SC_NPROCESSORS_CONF);
1483 }
1484
1485 // Return the worst case (largest) cache line size of the various levels of
1486 // cache actually prsent in the machine.
CacheLineSize()1487 int Sat::CacheLineSize() {
1488 int max_linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
1489 int linesize = sysconf(_SC_LEVEL2_CACHE_LINESIZE);
1490 if (linesize > max_linesize) max_linesize = linesize;
1491 linesize = sysconf(_SC_LEVEL3_CACHE_LINESIZE);
1492 if (linesize > max_linesize) max_linesize = linesize;
1493 linesize = sysconf(_SC_LEVEL4_CACHE_LINESIZE);
1494 if (linesize > max_linesize) max_linesize = linesize;
1495 return max_linesize;
1496 }
1497
1498 // Notify and reap worker threads.
JoinThreads()1499 void Sat::JoinThreads() {
1500 logprintf(12, "Log: Joining worker threads\n");
1501 power_spike_status_.StopWorkers();
1502 continuous_status_.StopWorkers();
1503
1504 AcquireWorkerLock();
1505 for (WorkerMap::const_iterator map_it = workers_map_.begin();
1506 map_it != workers_map_.end(); ++map_it) {
1507 for (WorkerVector::const_iterator it = map_it->second->begin();
1508 it != map_it->second->end(); ++it) {
1509 logprintf(12, "Log: Joining thread %d\n", (*it)->ThreadID());
1510 (*it)->JoinThread();
1511 }
1512 }
1513 ReleaseWorkerLock();
1514
1515 QueueStats();
1516
1517 // Finish up result checking.
1518 // Spawn 4 check threads to minimize check time.
1519 logprintf(12, "Log: Finished countdown, begin to result check\n");
1520 WorkerStatus reap_check_status;
1521 WorkerVector reap_check_vector;
1522
1523 // No need for check threads for monitor mode.
1524 if (!monitor_mode_) {
1525 // Initialize the check threads.
1526 for (int i = 0; i < fill_threads_; i++) {
1527 CheckThread *thread = new CheckThread();
1528 thread->InitThread(total_threads_++, this, os_, patternlist_,
1529 &reap_check_status);
1530 logprintf(12, "Log: Finished countdown, begin to result check\n");
1531 reap_check_vector.push_back(thread);
1532 }
1533 }
1534
1535 reap_check_status.Initialize();
1536 // Check threads should be marked to stop ASAP.
1537 reap_check_status.StopWorkers();
1538
1539 // Spawn the check threads.
1540 for (WorkerVector::const_iterator it = reap_check_vector.begin();
1541 it != reap_check_vector.end(); ++it) {
1542 logprintf(12, "Log: Spawning thread %d\n", (*it)->ThreadID());
1543 (*it)->SpawnThread();
1544 }
1545
1546 // Join the check threads.
1547 for (WorkerVector::const_iterator it = reap_check_vector.begin();
1548 it != reap_check_vector.end(); ++it) {
1549 logprintf(12, "Log: Joining thread %d\n", (*it)->ThreadID());
1550 (*it)->JoinThread();
1551 }
1552
1553 // Reap all children. Stopped threads should have already ended.
1554 // Result checking threads will end when they have finished
1555 // result checking.
1556 logprintf(12, "Log: Join all outstanding threads\n");
1557
1558 // Find all errors.
1559 errorcount_ = GetTotalErrorCount();
1560
1561 AcquireWorkerLock();
1562 for (WorkerMap::const_iterator map_it = workers_map_.begin();
1563 map_it != workers_map_.end(); ++map_it) {
1564 for (WorkerVector::const_iterator it = map_it->second->begin();
1565 it != map_it->second->end(); ++it) {
1566 logprintf(12, "Log: Reaping thread status %d\n", (*it)->ThreadID());
1567 if ((*it)->GetStatus() != 1) {
1568 logprintf(0, "Process Error: Thread %d failed with status %d at "
1569 "%.2f seconds\n",
1570 (*it)->ThreadID(), (*it)->GetStatus(),
1571 (*it)->GetRunDurationUSec()*1.0/1000000);
1572 bad_status();
1573 }
1574 int priority = 12;
1575 if ((*it)->GetErrorCount())
1576 priority = 5;
1577 logprintf(priority, "Log: Thread %d found %lld hardware incidents\n",
1578 (*it)->ThreadID(), (*it)->GetErrorCount());
1579 }
1580 }
1581 ReleaseWorkerLock();
1582
1583
1584 // Add in any errors from check threads.
1585 for (WorkerVector::const_iterator it = reap_check_vector.begin();
1586 it != reap_check_vector.end(); ++it) {
1587 logprintf(12, "Log: Reaping thread status %d\n", (*it)->ThreadID());
1588 if ((*it)->GetStatus() != 1) {
1589 logprintf(0, "Process Error: Thread %d failed with status %d at "
1590 "%.2f seconds\n",
1591 (*it)->ThreadID(), (*it)->GetStatus(),
1592 (*it)->GetRunDurationUSec()*1.0/1000000);
1593 bad_status();
1594 }
1595 errorcount_ += (*it)->GetErrorCount();
1596 int priority = 12;
1597 if ((*it)->GetErrorCount())
1598 priority = 5;
1599 logprintf(priority, "Log: Thread %d found %lld hardware incidents\n",
1600 (*it)->ThreadID(), (*it)->GetErrorCount());
1601 delete (*it);
1602 }
1603 reap_check_vector.clear();
1604 reap_check_status.Destroy();
1605 }
1606
1607 // Print queuing information.
QueueStats()1608 void Sat::QueueStats() {
1609 finelock_q_->QueueAnalysis();
1610 }
1611
AnalysisAllStats()1612 void Sat::AnalysisAllStats() {
1613 float max_runtime_sec = 0.;
1614 float total_data = 0.;
1615 float total_bandwidth = 0.;
1616 float thread_runtime_sec = 0.;
1617
1618 for (WorkerMap::const_iterator map_it = workers_map_.begin();
1619 map_it != workers_map_.end(); ++map_it) {
1620 for (WorkerVector::const_iterator it = map_it->second->begin();
1621 it != map_it->second->end(); ++it) {
1622 thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000.;
1623 total_data += (*it)->GetMemoryCopiedData();
1624 total_data += (*it)->GetDeviceCopiedData();
1625 if (thread_runtime_sec > max_runtime_sec) {
1626 max_runtime_sec = thread_runtime_sec;
1627 }
1628 }
1629 }
1630
1631 total_bandwidth = total_data / max_runtime_sec;
1632
1633 logprintf(0, "Stats: Completed: %.2fM in %.2fs %.2fMB/s, "
1634 "with %d hardware incidents, %d errors\n",
1635 total_data,
1636 max_runtime_sec,
1637 total_bandwidth,
1638 errorcount_,
1639 statuscount_);
1640 }
1641
MemoryStats()1642 void Sat::MemoryStats() {
1643 float memcopy_data = 0.;
1644 float memcopy_bandwidth = 0.;
1645 WorkerMap::const_iterator mem_it = workers_map_.find(
1646 static_cast<int>(kMemoryType));
1647 WorkerMap::const_iterator file_it = workers_map_.find(
1648 static_cast<int>(kFileIOType));
1649 sat_assert(mem_it != workers_map_.end());
1650 sat_assert(file_it != workers_map_.end());
1651 for (WorkerVector::const_iterator it = mem_it->second->begin();
1652 it != mem_it->second->end(); ++it) {
1653 memcopy_data += (*it)->GetMemoryCopiedData();
1654 memcopy_bandwidth += (*it)->GetMemoryBandwidth();
1655 }
1656 for (WorkerVector::const_iterator it = file_it->second->begin();
1657 it != file_it->second->end(); ++it) {
1658 memcopy_data += (*it)->GetMemoryCopiedData();
1659 memcopy_bandwidth += (*it)->GetMemoryBandwidth();
1660 }
1661 GoogleMemoryStats(&memcopy_data, &memcopy_bandwidth);
1662 logprintf(4, "Stats: Memory Copy: %.2fM at %.2fMB/s\n",
1663 memcopy_data,
1664 memcopy_bandwidth);
1665 }
1666
GoogleMemoryStats(float * memcopy_data,float * memcopy_bandwidth)1667 void Sat::GoogleMemoryStats(float *memcopy_data,
1668 float *memcopy_bandwidth) {
1669 // Do nothing, should be implemented by subclasses.
1670 }
1671
FileStats()1672 void Sat::FileStats() {
1673 float file_data = 0.;
1674 float file_bandwidth = 0.;
1675 WorkerMap::const_iterator file_it = workers_map_.find(
1676 static_cast<int>(kFileIOType));
1677 sat_assert(file_it != workers_map_.end());
1678 for (WorkerVector::const_iterator it = file_it->second->begin();
1679 it != file_it->second->end(); ++it) {
1680 file_data += (*it)->GetDeviceCopiedData();
1681 file_bandwidth += (*it)->GetDeviceBandwidth();
1682 }
1683 logprintf(4, "Stats: File Copy: %.2fM at %.2fMB/s\n",
1684 file_data,
1685 file_bandwidth);
1686 }
1687
CheckStats()1688 void Sat::CheckStats() {
1689 float check_data = 0.;
1690 float check_bandwidth = 0.;
1691 WorkerMap::const_iterator check_it = workers_map_.find(
1692 static_cast<int>(kCheckType));
1693 sat_assert(check_it != workers_map_.end());
1694 for (WorkerVector::const_iterator it = check_it->second->begin();
1695 it != check_it->second->end(); ++it) {
1696 check_data += (*it)->GetMemoryCopiedData();
1697 check_bandwidth += (*it)->GetMemoryBandwidth();
1698 }
1699 logprintf(4, "Stats: Data Check: %.2fM at %.2fMB/s\n",
1700 check_data,
1701 check_bandwidth);
1702 }
1703
NetStats()1704 void Sat::NetStats() {
1705 float net_data = 0.;
1706 float net_bandwidth = 0.;
1707 WorkerMap::const_iterator netio_it = workers_map_.find(
1708 static_cast<int>(kNetIOType));
1709 WorkerMap::const_iterator netslave_it = workers_map_.find(
1710 static_cast<int>(kNetSlaveType));
1711 sat_assert(netio_it != workers_map_.end());
1712 sat_assert(netslave_it != workers_map_.end());
1713 for (WorkerVector::const_iterator it = netio_it->second->begin();
1714 it != netio_it->second->end(); ++it) {
1715 net_data += (*it)->GetDeviceCopiedData();
1716 net_bandwidth += (*it)->GetDeviceBandwidth();
1717 }
1718 for (WorkerVector::const_iterator it = netslave_it->second->begin();
1719 it != netslave_it->second->end(); ++it) {
1720 net_data += (*it)->GetDeviceCopiedData();
1721 net_bandwidth += (*it)->GetDeviceBandwidth();
1722 }
1723 logprintf(4, "Stats: Net Copy: %.2fM at %.2fMB/s\n",
1724 net_data,
1725 net_bandwidth);
1726 }
1727
InvertStats()1728 void Sat::InvertStats() {
1729 float invert_data = 0.;
1730 float invert_bandwidth = 0.;
1731 WorkerMap::const_iterator invert_it = workers_map_.find(
1732 static_cast<int>(kInvertType));
1733 sat_assert(invert_it != workers_map_.end());
1734 for (WorkerVector::const_iterator it = invert_it->second->begin();
1735 it != invert_it->second->end(); ++it) {
1736 invert_data += (*it)->GetMemoryCopiedData();
1737 invert_bandwidth += (*it)->GetMemoryBandwidth();
1738 }
1739 logprintf(4, "Stats: Invert Data: %.2fM at %.2fMB/s\n",
1740 invert_data,
1741 invert_bandwidth);
1742 }
1743
DiskStats()1744 void Sat::DiskStats() {
1745 float disk_data = 0.;
1746 float disk_bandwidth = 0.;
1747 WorkerMap::const_iterator disk_it = workers_map_.find(
1748 static_cast<int>(kDiskType));
1749 WorkerMap::const_iterator random_it = workers_map_.find(
1750 static_cast<int>(kRandomDiskType));
1751 sat_assert(disk_it != workers_map_.end());
1752 sat_assert(random_it != workers_map_.end());
1753 for (WorkerVector::const_iterator it = disk_it->second->begin();
1754 it != disk_it->second->end(); ++it) {
1755 disk_data += (*it)->GetDeviceCopiedData();
1756 disk_bandwidth += (*it)->GetDeviceBandwidth();
1757 }
1758 for (WorkerVector::const_iterator it = random_it->second->begin();
1759 it != random_it->second->end(); ++it) {
1760 disk_data += (*it)->GetDeviceCopiedData();
1761 disk_bandwidth += (*it)->GetDeviceBandwidth();
1762 }
1763
1764 logprintf(4, "Stats: Disk: %.2fM at %.2fMB/s\n",
1765 disk_data,
1766 disk_bandwidth);
1767 }
1768
1769 // Process worker thread data for bandwidth information, and error results.
1770 // You can add more methods here just subclassing SAT.
RunAnalysis()1771 void Sat::RunAnalysis() {
1772 AnalysisAllStats();
1773 MemoryStats();
1774 FileStats();
1775 NetStats();
1776 CheckStats();
1777 InvertStats();
1778 DiskStats();
1779 }
1780
1781 // Get total error count, summing across all threads..
GetTotalErrorCount()1782 int64 Sat::GetTotalErrorCount() {
1783 int64 errors = 0;
1784
1785 AcquireWorkerLock();
1786 for (WorkerMap::const_iterator map_it = workers_map_.begin();
1787 map_it != workers_map_.end(); ++map_it) {
1788 for (WorkerVector::const_iterator it = map_it->second->begin();
1789 it != map_it->second->end(); ++it) {
1790 errors += (*it)->GetErrorCount();
1791 }
1792 }
1793 ReleaseWorkerLock();
1794 return errors;
1795 }
1796
1797
SpawnThreads()1798 void Sat::SpawnThreads() {
1799 logprintf(12, "Log: Initializing WorkerStatus objects\n");
1800 power_spike_status_.Initialize();
1801 continuous_status_.Initialize();
1802 logprintf(12, "Log: Spawning worker threads\n");
1803 for (WorkerMap::const_iterator map_it = workers_map_.begin();
1804 map_it != workers_map_.end(); ++map_it) {
1805 for (WorkerVector::const_iterator it = map_it->second->begin();
1806 it != map_it->second->end(); ++it) {
1807 logprintf(12, "Log: Spawning thread %d\n", (*it)->ThreadID());
1808 (*it)->SpawnThread();
1809 }
1810 }
1811 }
1812
1813 // Delete used worker thread objects.
DeleteThreads()1814 void Sat::DeleteThreads() {
1815 logprintf(12, "Log: Deleting worker threads\n");
1816 for (WorkerMap::const_iterator map_it = workers_map_.begin();
1817 map_it != workers_map_.end(); ++map_it) {
1818 for (WorkerVector::const_iterator it = map_it->second->begin();
1819 it != map_it->second->end(); ++it) {
1820 logprintf(12, "Log: Deleting thread %d\n", (*it)->ThreadID());
1821 delete (*it);
1822 }
1823 delete map_it->second;
1824 }
1825 workers_map_.clear();
1826 logprintf(12, "Log: Destroying WorkerStatus objects\n");
1827 power_spike_status_.Destroy();
1828 continuous_status_.Destroy();
1829 }
1830
namespace {
// Calculates the next time an action in Sat::Run() should occur, based on a
// schedule derived from a start point and a regular frequency.
//
// Using frequencies instead of intervals with their accompanying drift allows
// users to better predict when the actions will occur throughout a run.
//
// The result is the first point of the schedule start + k * frequency
// (k >= 1) that lies strictly after 'now', for now >= start.
//
// Arguments:
//   frequency: seconds
//   start: unixtime
//   now: unixtime
//
// Returns: unixtime. A non-positive frequency describes no usable schedule
// (and zero would divide by zero), so 'now' is returned in that case.
inline time_t NextOccurance(time_t frequency, time_t start, time_t now) {
  if (frequency <= 0) {
    return now;
  }
  // Number of whole periods already completed since 'start'.
  const time_t completed_periods = (now - start) / frequency;
  return start + frequency + (completed_periods * frequency);
}
}  // namespace
1848
1849 // Run the actual test.
Run()1850 bool Sat::Run() {
1851 // Install signal handlers to gracefully exit in the middle of a run.
1852 //
1853 // Why go through this whole rigmarole? It's the only standards-compliant
1854 // (C++ and POSIX) way to handle signals in a multithreaded program.
1855 // Specifically:
1856 //
1857 // 1) (C++) The value of a variable not of type "volatile sig_atomic_t" is
1858 // unspecified upon entering a signal handler and, if modified by the
1859 // handler, is unspecified after leaving the handler.
1860 //
1861 // 2) (POSIX) After the value of a variable is changed in one thread, another
1862 // thread is only guaranteed to see the new value after both threads have
1863 // acquired or released the same mutex or rwlock, synchronized to the
1864 // same barrier, or similar.
1865 //
1866 // #1 prevents the use of #2 in a signal handler, so the signal handler must
1867 // be called in the same thread that reads the "volatile sig_atomic_t"
1868 // variable it sets. We enforce that by blocking the signals in question in
1869 // the worker threads, forcing them to be handled by this thread.
1870 logprintf(12, "Log: Installing signal handlers\n");
1871 sigset_t new_blocked_signals;
1872 sigemptyset(&new_blocked_signals);
1873 sigaddset(&new_blocked_signals, SIGINT);
1874 sigaddset(&new_blocked_signals, SIGTERM);
1875 sigset_t prev_blocked_signals;
1876 pthread_sigmask(SIG_BLOCK, &new_blocked_signals, &prev_blocked_signals);
1877 sighandler_t prev_sigint_handler = signal(SIGINT, SatHandleBreak);
1878 sighandler_t prev_sigterm_handler = signal(SIGTERM, SatHandleBreak);
1879
1880 // Kick off all the worker threads.
1881 logprintf(12, "Log: Launching worker threads\n");
1882 InitializeThreads();
1883 SpawnThreads();
1884 pthread_sigmask(SIG_SETMASK, &prev_blocked_signals, NULL);
1885
1886 logprintf(12, "Log: Starting countdown with %d seconds\n", runtime_seconds_);
1887
1888 // In seconds.
1889 static const time_t kSleepFrequency = 5;
1890 // All of these are in seconds. You probably want them to be >=
1891 // kSleepFrequency and multiples of kSleepFrequency, but neither is necessary.
1892 static const time_t kInjectionFrequency = 10;
1893 // print_delay_ determines "seconds remaining" chatty update.
1894
1895 const time_t start = time(NULL);
1896 const time_t end = start + runtime_seconds_;
1897 time_t now = start;
1898 time_t next_print = start + print_delay_;
1899 time_t next_pause = start + pause_delay_;
1900 time_t next_resume = 0;
1901 time_t next_injection;
1902 if (crazy_error_injection_) {
1903 next_injection = start + kInjectionFrequency;
1904 } else {
1905 next_injection = 0;
1906 }
1907
1908 while (now < end) {
1909 // This is an int because it's for logprintf().
1910 const int seconds_remaining = end - now;
1911
1912 if (user_break_) {
1913 // Handle early exit.
1914 logprintf(0, "Log: User exiting early (%d seconds remaining)\n",
1915 seconds_remaining);
1916 break;
1917 }
1918
1919 // If we have an error limit, check it here and see if we should exit.
1920 if (max_errorcount_ != 0) {
1921 uint64 errors = GetTotalErrorCount();
1922 if (errors > max_errorcount_) {
1923 logprintf(0, "Log: Exiting early (%d seconds remaining) "
1924 "due to excessive failures (%lld)\n",
1925 seconds_remaining,
1926 errors);
1927 break;
1928 }
1929 }
1930
1931 if (now >= next_print) {
1932 // Print a count down message.
1933 logprintf(5, "Log: Seconds remaining: %d\n", seconds_remaining);
1934 next_print = NextOccurance(print_delay_, start, now);
1935 }
1936
1937 if (next_injection && now >= next_injection) {
1938 // Inject an error.
1939 logprintf(4, "Log: Injecting error (%d seconds remaining)\n",
1940 seconds_remaining);
1941 struct page_entry src;
1942 GetValid(&src);
1943 src.pattern = patternlist_->GetPattern(0);
1944 PutValid(&src);
1945 next_injection = NextOccurance(kInjectionFrequency, start, now);
1946 }
1947
1948 if (next_pause && now >= next_pause) {
1949 // Tell worker threads to pause in preparation for a power spike.
1950 logprintf(4, "Log: Pausing worker threads in preparation for power spike "
1951 "(%d seconds remaining)\n", seconds_remaining);
1952 power_spike_status_.PauseWorkers();
1953 logprintf(12, "Log: Worker threads paused\n");
1954 next_pause = 0;
1955 next_resume = now + pause_duration_;
1956 }
1957
1958 if (next_resume && now >= next_resume) {
1959 // Tell worker threads to resume in order to cause a power spike.
1960 logprintf(4, "Log: Resuming worker threads to cause a power spike (%d "
1961 "seconds remaining)\n", seconds_remaining);
1962 power_spike_status_.ResumeWorkers();
1963 logprintf(12, "Log: Worker threads resumed\n");
1964 next_pause = NextOccurance(pause_delay_, start, now);
1965 next_resume = 0;
1966 }
1967
1968 sat_sleep(NextOccurance(kSleepFrequency, start, now) - now);
1969 now = time(NULL);
1970 }
1971
1972 JoinThreads();
1973
1974 logprintf(0, "Stats: Found %lld hardware incidents\n", errorcount_);
1975
1976 if (!monitor_mode_)
1977 RunAnalysis();
1978
1979 DeleteThreads();
1980
1981 logprintf(12, "Log: Uninstalling signal handlers\n");
1982 signal(SIGINT, prev_sigint_handler);
1983 signal(SIGTERM, prev_sigterm_handler);
1984
1985 return true;
1986 }
1987
1988 // Clean up all resources.
Cleanup()1989 bool Sat::Cleanup() {
1990 g_sat = NULL;
1991 Logger::GlobalLogger()->StopThread();
1992 Logger::GlobalLogger()->SetStdoutOnly();
1993 if (logfile_) {
1994 close(logfile_);
1995 logfile_ = 0;
1996 }
1997 if (patternlist_) {
1998 patternlist_->Destroy();
1999 delete patternlist_;
2000 patternlist_ = 0;
2001 }
2002 if (os_) {
2003 os_->FreeTestMem();
2004 delete os_;
2005 os_ = 0;
2006 }
2007 if (empty_) {
2008 delete empty_;
2009 empty_ = 0;
2010 }
2011 if (valid_) {
2012 delete valid_;
2013 valid_ = 0;
2014 }
2015 if (finelock_q_) {
2016 delete finelock_q_;
2017 finelock_q_ = 0;
2018 }
2019 if (page_bitmap_) {
2020 delete[] page_bitmap_;
2021 }
2022
2023 for (size_t i = 0; i < blocktables_.size(); i++) {
2024 delete blocktables_[i];
2025 }
2026
2027 if (cc_cacheline_data_) {
2028 // The num integer arrays for all the cacheline structures are
2029 // allocated as a single chunk. The pointers in the cacheline struct
2030 // are populated accordingly. Hence calling free on the first
2031 // cacheline's num's address is going to free the entire array.
2032 // TODO(aganti): Refactor this to have a class for the cacheline
2033 // structure (currently defined in worker.h) and clean this up
2034 // in the destructor of that class.
2035 if (cc_cacheline_data_[0].num) {
2036 free(cc_cacheline_data_[0].num);
2037 }
2038 free(cc_cacheline_data_);
2039 }
2040
2041 sat_assert(0 == pthread_mutex_destroy(&worker_lock_));
2042
2043 return true;
2044 }
2045
2046
2047 // Pretty print really obvious results.
PrintResults()2048 bool Sat::PrintResults() {
2049 bool result = true;
2050
2051 logprintf(4, "\n");
2052 if (statuscount_) {
2053 logprintf(4, "Status: FAIL - test encountered procedural errors\n");
2054 result = false;
2055 } else if (errorcount_) {
2056 logprintf(4, "Status: FAIL - test discovered HW problems\n");
2057 result = false;
2058 } else {
2059 logprintf(4, "Status: PASS - please verify no corrected errors\n");
2060 }
2061 logprintf(4, "\n");
2062
2063 return result;
2064 }
2065
2066 // Helper functions.
AcquireWorkerLock()2067 void Sat::AcquireWorkerLock() {
2068 sat_assert(0 == pthread_mutex_lock(&worker_lock_));
2069 }
ReleaseWorkerLock()2070 void Sat::ReleaseWorkerLock() {
2071 sat_assert(0 == pthread_mutex_unlock(&worker_lock_));
2072 }
2073
logprintf(int priority,const char * format,...)2074 void logprintf(int priority, const char *format, ...) {
2075 va_list args;
2076 va_start(args, format);
2077 Logger::GlobalLogger()->VLogF(priority, format, args);
2078 va_end(args);
2079 }
2080
2081 // Stop the logging thread and verify any pending data is written to the log.
logstop()2082 void logstop() {
2083 Logger::GlobalLogger()->StopThread();
2084 }
2085
2086