// Copyright 2006 Google Inc. All Rights Reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//      http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// worker.cc : individual tasks that can be run in combination to
// stress the system

#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <sys/select.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/times.h>

// These are necessary, but are already on by default.
// #define __USE_GNU
// #define __USE_LARGEFILE64
#include <fcntl.h>
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <linux/unistd.h>  // for gettid

// For size of block device
#include <sys/ioctl.h>
#include <linux/fs.h>
// For asynchronous I/O
#ifdef HAVE_LIBAIO_H
#include <libaio.h>
#endif

#include <sys/syscall.h>

#include <set>
#include <string>

// This file must work with autoconf on its public version,
// so these includes are correct.
#include "error_diag.h"  // NOLINT
#include "os.h"          // NOLINT
#include "pattern.h"     // NOLINT
#include "queue.h"       // NOLINT
#include "sat.h"         // NOLINT
#include "sattypes.h"    // NOLINT
#include "worker.h"      // NOLINT

// Syscalls
// Why ubuntu, do you hate gettid so bad?
#if !defined(__NR_gettid)
  #define __NR_gettid             224
#endif

#define gettid() syscall(__NR_gettid)
#if !defined(CPU_SETSIZE)
_syscall3(int, sched_getaffinity, pid_t, pid,
          unsigned int, len, cpu_set_t*, mask)
_syscall3(int, sched_setaffinity, pid_t, pid,
          unsigned int, len, cpu_set_t*, mask)
#endif
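
// Illustrative note (not part of the original source): with the fallback
// above, gettid() expands to syscall(__NR_gettid); modern glibc (2.30+)
// ships its own gettid() wrapper, so this only matters on old toolchains.
#if 0
  pid_t tid = gettid();  // Kernel thread ID, distinct from pthread_self().
  logprintf(9, "Log: running as tid %d\n", static_cast<int>(tid));
#endif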

namespace {
  // Work around the sad fact that there are two (gnu, xsi) incompatible
  // versions of strerror_r floating around google. Awesome.
  bool sat_strerror(int err, char *buf, int len) {
    buf[0] = 0;
    char *errmsg = reinterpret_cast<char*>(strerror_r(err, buf, len));
    int retval = reinterpret_cast<int64>(errmsg);
    if (retval == 0)
      return true;
    if (retval == -1)
      return false;
    if (errmsg != buf) {
      strncpy(buf, errmsg, len);
      buf[len - 1] = 0;
    }
    return true;
  }


  inline uint64 addr_to_tag(void *address) {
    return reinterpret_cast<uint64>(address);
  }
}  // namespace
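
// Illustrative sketch (not part of the original source): how a caller is
// expected to use the sat_strerror() wrapper above. The GNU strerror_r may
// return a pointer to a static string instead of filling 'buf'; the XSI
// version returns an int status. The wrapper hides that difference.
#if 0
  void ExampleLogErrno(int err) {  // Hypothetical helper, for illustration.
    char buf[256];
    if (sat_strerror(err, buf, sizeof(buf)))
      logprintf(0, "Log: errno %d: %s\n", err, buf);
    else
      logprintf(0, "Log: errno %d: (no message available)\n", err);
  }
#endif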

#if !defined(O_DIRECT)
// Sometimes this isn't available.
// Disregard if it's not defined.
  #define O_DIRECT            0
#endif

// A struct to hold captured errors, for later reporting.
struct ErrorRecord {
  uint64 actual;  // This is the actual value read.
  uint64 reread;  // This is the actual value, reread.
  uint64 expected;  // This is what it should have been.
  uint64 *vaddr;  // This is where it was (or wasn't).
  char *vbyteaddr;  // This is byte specific where the data was (or wasn't).
  uint64 paddr;  // This is the bus address, if available.
  uint64 *tagvaddr;  // This holds the tag value if this data was tagged.
  uint64 tagpaddr;  // This holds the physical address corresponding to the tag.
};

// This is a helper function to create new threads with pthreads.
static void *ThreadSpawnerGeneric(void *ptr) {
  WorkerThread *worker = static_cast<WorkerThread*>(ptr);
  worker->StartRoutine();
  return NULL;
}

void WorkerStatus::Initialize() {
  sat_assert(0 == pthread_mutex_init(&num_workers_mutex_, NULL));
  sat_assert(0 == pthread_rwlock_init(&status_rwlock_, NULL));
#ifdef HAVE_PTHREAD_BARRIERS
  sat_assert(0 == pthread_barrier_init(&pause_barrier_, NULL,
                                       num_workers_ + 1));
#endif
}

void WorkerStatus::Destroy() {
  sat_assert(0 == pthread_mutex_destroy(&num_workers_mutex_));
  sat_assert(0 == pthread_rwlock_destroy(&status_rwlock_));
#ifdef HAVE_PTHREAD_BARRIERS
  sat_assert(0 == pthread_barrier_destroy(&pause_barrier_));
#endif
}

void WorkerStatus::PauseWorkers() {
  if (SetStatus(PAUSE) != PAUSE)
    WaitOnPauseBarrier();
}

void WorkerStatus::ResumeWorkers() {
  if (SetStatus(RUN) == PAUSE)
    WaitOnPauseBarrier();
}

void WorkerStatus::StopWorkers() {
  if (SetStatus(STOP) == PAUSE)
    WaitOnPauseBarrier();
}

bool WorkerStatus::ContinueRunning(bool *paused) {
  // This loop is an optimization.  We use it to immediately re-check the status
  // after resuming from a pause, instead of returning and waiting for the next
  // call to this function.
  if (paused) {
    *paused = false;
  }
  for (;;) {
    switch (GetStatus()) {
      case RUN:
        return true;
      case PAUSE:
        // Wait for the other workers to call this function so that
        // PauseWorkers() can return.
        WaitOnPauseBarrier();
        // Wait for ResumeWorkers() to be called.
        WaitOnPauseBarrier();
        // Indicate that a pause occurred.
        if (paused) {
          *paused = true;
        }
        break;
      case STOP:
        return false;
    }
  }
}
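
// Illustrative sketch (not part of the original source): the pause logic
// above is a two-phase barrier handshake. pause_barrier_ is initialized for
// num_workers_ + 1 parties, so the controlling thread participates too.
#if 0
  // Controller thread ('worker_status' is a hypothetical instance):
  worker_status.PauseWorkers();   // Phase 1: returns once every worker has
                                  // reached the first WaitOnPauseBarrier().
  // ... all workers are now parked inside ContinueRunning() ...
  worker_status.ResumeWorkers();  // Phase 2: releases the second barrier wait.

  // Worker thread inner loop; pauses happen transparently inside the call.
  while (worker_status.ContinueRunning(NULL)) {
    // Do one unit of work.
  }
#endif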

bool WorkerStatus::ContinueRunningNoPause() {
  return (GetStatus() != STOP);
}

void WorkerStatus::RemoveSelf() {
  // Acquire a read lock on status_rwlock_ while (status_ != PAUSE).
  for (;;) {
    AcquireStatusReadLock();
    if (status_ != PAUSE)
      break;
    // We need to obey PauseWorkers() just like ContinueRunning() would, so that
    // the other threads won't wait on pause_barrier_ forever.
    ReleaseStatusLock();
    // Wait for the other workers to call this function so that PauseWorkers()
    // can return.
    WaitOnPauseBarrier();
    // Wait for ResumeWorkers() to be called.
    WaitOnPauseBarrier();
  }

  // This lock would be unnecessary if we held a write lock instead of a read
  // lock on status_rwlock_, but that would also force all threads calling
  // ContinueRunning() to wait on this one.  Using a separate lock avoids that.
  AcquireNumWorkersLock();
  // Decrement num_workers_ and reinitialize pause_barrier_, which we know isn't
  // in use because (status != PAUSE).
#ifdef HAVE_PTHREAD_BARRIERS
  sat_assert(0 == pthread_barrier_destroy(&pause_barrier_));
  sat_assert(0 == pthread_barrier_init(&pause_barrier_, NULL, num_workers_));
#endif
  --num_workers_;
  ReleaseNumWorkersLock();

  // Release status_rwlock_.
  ReleaseStatusLock();
}


// Parent thread class.
WorkerThread::WorkerThread() {
  status_ = false;
  pages_copied_ = 0;
  errorcount_ = 0;
  runduration_usec_ = 1;
  priority_ = Normal;
  worker_status_ = NULL;
  thread_spawner_ = &ThreadSpawnerGeneric;
  tag_mode_ = false;
}

WorkerThread::~WorkerThread() {}

// Constructors. Just init some default values.
FillThread::FillThread() {
  num_pages_to_fill_ = 0;
}

// Initialize file name to empty.
FileThread::FileThread() {
  filename_ = "";
  devicename_ = "";
  pass_ = 0;
  page_io_ = true;
  crc_page_ = -1;
  local_page_ = NULL;
}

// If the file thread used a bounce buffer in memory, account for the extra
// copy in the memory bandwidth calculation.
float FileThread::GetMemoryCopiedData() {
  if (!os_->normal_mem())
    return GetCopiedData();
  else
    return 0;
}

// Initialize target hostname to be invalid.
NetworkThread::NetworkThread() {
  snprintf(ipaddr_, sizeof(ipaddr_), "Unknown");
  sock_ = 0;
}

// Nothing to initialize yet.
NetworkSlaveThread::NetworkSlaveThread() {
}

// Nothing to initialize yet.
NetworkListenThread::NetworkListenThread() {
}

// Init member variables.
void WorkerThread::InitThread(int thread_num_init,
                              class Sat *sat_init,
                              class OsLayer *os_init,
                              class PatternList *patternlist_init,
                              WorkerStatus *worker_status) {
  sat_assert(worker_status);
  worker_status->AddWorkers(1);

  thread_num_ = thread_num_init;
  sat_ = sat_init;
  os_ = os_init;
  patternlist_ = patternlist_init;
  worker_status_ = worker_status;

  AvailableCpus(&cpu_mask_);
  tag_ = 0xffffffff;

  tag_mode_ = sat_->tag_mode();
}


// Use pthreads to prioritize a system thread.
bool WorkerThread::InitPriority() {
  // This doesn't affect performance that much, and may not be too safe.

  bool ret = BindToCpus(&cpu_mask_);
  if (!ret)
    logprintf(11, "Log: Bind to %s failed.\n",
              cpuset_format(&cpu_mask_).c_str());

  logprintf(11, "Log: Thread %d running on core ID %d mask %s (%s).\n",
            thread_num_, sched_getcpu(),
            CurrentCpusFormat().c_str(),
            cpuset_format(&cpu_mask_).c_str());
#if 0
  if (priority_ == High) {
    sched_param param;
    param.sched_priority = 1;
    // Set the priority; others are unchanged.
    logprintf(0, "Log: Changing priority to SCHED_FIFO %d\n",
              param.sched_priority);
    if (sched_setscheduler(0, SCHED_FIFO, &param)) {
      char buf[256];
      sat_strerror(errno, buf, sizeof(buf));
      logprintf(0, "Process Error: sched_setscheduler "
                   "failed - error %d %s\n",
                errno, buf);
    }
  }
#endif
  return true;
}

// Use pthreads to create a system thread.
int WorkerThread::SpawnThread() {
  // Create the new thread.
  int result = pthread_create(&thread_, NULL, thread_spawner_, this);
  if (result) {
    char buf[256];
    sat_strerror(result, buf, sizeof(buf));
    logprintf(0, "Process Error: pthread_create "
                  "failed - error %d %s\n", result,
              buf);
    status_ = false;
    return false;
  }

  // 0 is pthreads success.
  return true;
}

// Kill the worker thread with SIGINT.
bool WorkerThread::KillThread() {
  return (pthread_kill(thread_, SIGINT) == 0);
}

// Block until the thread has exited.
bool WorkerThread::JoinThread() {
  int result = pthread_join(thread_, NULL);

  if (result) {
    logprintf(0, "Process Error: pthread_join failed - error %d\n", result);
    status_ = false;
  }

  // 0 is pthreads success.
  return (!result);
}


void WorkerThread::StartRoutine() {
  InitPriority();
  StartThreadTimer();
  Work();
  StopThreadTimer();
  worker_status_->RemoveSelf();
}


// Thread work loop. Execute until marked finished.
bool WorkerThread::Work() {
  do {
    logprintf(9, "Log: ...\n");
    // Sleep for 1 second.
    sat_sleep(1);
  } while (IsReadyToRun());

  return false;
}


// Returns the CPU mask of CPUs available to this process.
// Conceptually, each bit represents a logical CPU, ie:
//   mask = 3  (11b):   cpu0, 1
//   mask = 13 (1101b): cpu0, 2, 3
bool WorkerThread::AvailableCpus(cpu_set_t *cpuset) {
  CPU_ZERO(cpuset);
#ifdef HAVE_SCHED_GETAFFINITY
  return sched_getaffinity(getppid(), sizeof(*cpuset), cpuset) == 0;
#else
  return false;
#endif
}


// Returns the CPU mask of CPUs this thread is bound to.
// Conceptually, each bit represents a logical CPU, ie:
//   mask = 3  (11b):   cpu0, 1
//   mask = 13 (1101b): cpu0, 2, 3
bool WorkerThread::CurrentCpus(cpu_set_t *cpuset) {
  CPU_ZERO(cpuset);
#ifdef HAVE_SCHED_GETAFFINITY
  return sched_getaffinity(0, sizeof(*cpuset), cpuset) == 0;
#else
  return false;
#endif
}
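
// Illustrative sketch (not part of the original source): building and
// querying a cpu_set_t matching the mask examples above, e.g.
// mask = 13 (1101b) selects cpu0, cpu2 and cpu3.
#if 0
  cpu_set_t cpuset;
  CPU_ZERO(&cpuset);
  CPU_SET(0, &cpuset);   // cpu0
  CPU_SET(2, &cpuset);   // cpu2
  CPU_SET(3, &cpuset);   // cpu3
  sat_assert(CPU_ISSET(2, &cpuset));
  sat_assert(!CPU_ISSET(1, &cpuset));
#endif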


// Bind worker thread to specified CPU(s)
//   Args:
//     thread_mask: cpu_set_t representing CPUs, ie
//                  mask = 1  (01b):   cpu0
//                  mask = 3  (11b):   cpu0, 1
//                  mask = 13 (1101b): cpu0, 2, 3
//
//   Returns true on success, false otherwise.
bool WorkerThread::BindToCpus(const cpu_set_t *thread_mask) {
  cpu_set_t process_mask;
  AvailableCpus(&process_mask);
  if (cpuset_isequal(thread_mask, &process_mask))
    return true;

  logprintf(11, "Log: available CPU mask - %s\n",
            cpuset_format(&process_mask).c_str());
  if (!cpuset_issubset(thread_mask, &process_mask)) {
    // Invalid cpu_mask, ie cpu not allocated to this process or doesn't exist.
    logprintf(0, "Log: requested CPUs %s not a subset of available %s\n",
              cpuset_format(thread_mask).c_str(),
              cpuset_format(&process_mask).c_str());
    return false;
  }
#ifdef HAVE_SCHED_GETAFFINITY
  return (sched_setaffinity(gettid(), sizeof(*thread_mask), thread_mask) == 0);
#else
  return false;
#endif
}


// A worker thread can yield itself to give up CPU until it's scheduled again.
//   Returns true on success, false on error.
bool WorkerThread::YieldSelf() {
  return (sched_yield() == 0);
}


// Fill this page with its pattern.
bool WorkerThread::FillPage(struct page_entry *pe) {
  // Error check arguments.
  if (pe == 0) {
    logprintf(0, "Process Error: Fill Page entry null\n");
    return false;
  }

  // Mask is the bitmask of indexes used by the pattern.
  // It is the pattern size -1. Size is always a power of 2.
  uint64 *memwords = static_cast<uint64*>(pe->addr);
  int length = sat_->page_length();

  if (tag_mode_) {
    // Select tag or data as appropriate.
    for (int i = 0; i < length / wordsize_; i++) {
      datacast_t data;

      if ((i & 0x7) == 0) {
        data.l64 = addr_to_tag(&memwords[i]);
      } else {
        data.l32.l = pe->pattern->pattern(i << 1);
        data.l32.h = pe->pattern->pattern((i << 1) + 1);
      }
      memwords[i] = data.l64;
    }
  } else {
    // Just fill in untagged data directly.
    for (int i = 0; i < length / wordsize_; i++) {
      datacast_t data;

      data.l32.l = pe->pattern->pattern(i << 1);
      data.l32.h = pe->pattern->pattern((i << 1) + 1);
      memwords[i] = data.l64;
    }
  }

  return true;
}
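
// Illustrative sketch (not part of the original source): datacast_t is
// defined in sattypes.h; FillPage above relies on it being a union that
// aliases one 64-bit word with its two 32-bit halves, roughly:
#if 0
  typedef union {
    uint64 l64;
    struct {
      uint32 l;  // Low 32 bits.
      uint32 h;  // High 32 bits.
    } l32;
  } datacast_t;
#endif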


// Tell the thread how many pages to fill.
void FillThread::SetFillPages(int64 num_pages_to_fill_init) {
  num_pages_to_fill_ = num_pages_to_fill_init;
}

// Fill this page with a random pattern.
bool FillThread::FillPageRandom(struct page_entry *pe) {
  // Error check arguments.
  if (pe == 0) {
    logprintf(0, "Process Error: Fill Page entry null\n");
    return false;
  }
  if ((patternlist_ == 0) || (patternlist_->Size() == 0)) {
    logprintf(0, "Process Error: No data patterns available\n");
    return false;
  }

  // Choose a random pattern for this block.
  pe->pattern = patternlist_->GetRandomPattern();
  if (pe->pattern == 0) {
    logprintf(0, "Process Error: Null data pattern\n");
    return false;
  }

  // Actually fill the page.
  return FillPage(pe);
}


// Memory fill work loop. Execute until the allotted pages are filled.
bool FillThread::Work() {
  bool result = true;

  logprintf(9, "Log: Starting fill thread %d\n", thread_num_);

  // We want to fill num_pages_to_fill pages, and
  // stop when we've filled that many.
  // We also want to capture an early break.
  struct page_entry pe;
  int64 loops = 0;
  while (IsReadyToRun() && (loops < num_pages_to_fill_)) {
    result = result && sat_->GetEmpty(&pe);
    if (!result) {
      logprintf(0, "Process Error: fill_thread failed to pop pages, "
                "bailing\n");
      break;
    }

    // Fill the page with pattern
    result = result && FillPageRandom(&pe);
    if (!result) break;

    // Put the page back on the queue.
    result = result && sat_->PutValid(&pe);
    if (!result) {
      logprintf(0, "Process Error: fill_thread failed to push pages, "
                "bailing\n");
      break;
    }
    loops++;
  }

  // Fill in thread status.
  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Fill thread. Status %d, %d pages filled\n",
            thread_num_, status_, pages_copied_);
  return result;
}


// Print error information about a data miscompare.
void WorkerThread::ProcessError(struct ErrorRecord *error,
                                int priority,
                                const char *message) {
  char dimm_string[256] = "";

  int core_id = sched_getcpu();

  // Determine if this is a write or read error.
  os_->Flush(error->vaddr);
  error->reread = *(error->vaddr);

  char *good = reinterpret_cast<char*>(&(error->expected));
  char *bad = reinterpret_cast<char*>(&(error->actual));

  sat_assert(error->expected != error->actual);
  unsigned int offset = 0;
  for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
    if (good[offset] != bad[offset])
      break;
  }

  error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;

  // Find the physical address if possible.
  error->paddr = os_->VirtualToPhysical(error->vbyteaddr);

  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));

  // Report parseable error.
  if (priority < 5) {
    // Run miscompare error through diagnoser for logging and reporting.
    os_->error_diagnoser_->AddMiscompareError(dimm_string,
                                              reinterpret_cast<uint64>
                                              (error->vaddr), 1);

    logprintf(priority,
              "%s: miscompare on CPU %d(0x%s) at %p(0x%llx:%s): "
              "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n",
              message,
              core_id,
              CurrentCpusFormat().c_str(),
              error->vaddr,
              error->paddr,
              dimm_string,
              error->actual,
              error->reread,
              error->expected);
  }


  // Overwrite incorrect data with correct data to prevent
  // future miscompares when this data is reused.
  *(error->vaddr) = error->expected;
  os_->Flush(error->vaddr);
}



// Print error information about a data miscompare.
void FileThread::ProcessError(struct ErrorRecord *error,
                              int priority,
                              const char *message) {
  char dimm_string[256] = "";

  // Determine if this is a write or read error.
  os_->Flush(error->vaddr);
  error->reread = *(error->vaddr);

  char *good = reinterpret_cast<char*>(&(error->expected));
  char *bad = reinterpret_cast<char*>(&(error->actual));

  sat_assert(error->expected != error->actual);
  unsigned int offset = 0;
  for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
    if (good[offset] != bad[offset])
      break;
  }

  error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;

  // Find the physical address if possible.
  error->paddr = os_->VirtualToPhysical(error->vbyteaddr);

  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));

  // If crc_page_ is valid, ie checking content read back from file,
  // track src/dst memory addresses. Otherwise categorize as a general
  // memory miscompare for CRC checking everywhere else.
  if (crc_page_ != -1) {
    int miscompare_byteoffset = static_cast<char*>(error->vbyteaddr) -
                                static_cast<char*>(page_recs_[crc_page_].dst);
    os_->error_diagnoser_->AddHDDMiscompareError(devicename_,
                                                 crc_page_,
                                                 miscompare_byteoffset,
                                                 page_recs_[crc_page_].src,
                                                 page_recs_[crc_page_].dst);
  } else {
    os_->error_diagnoser_->AddMiscompareError(dimm_string,
                                              reinterpret_cast<uint64>
                                              (error->vaddr), 1);
  }

  logprintf(priority,
            "%s: miscompare on %s at %p(0x%llx:%s): read:0x%016llx, "
            "reread:0x%016llx expected:0x%016llx\n",
            message,
            devicename_.c_str(),
            error->vaddr,
            error->paddr,
            dimm_string,
            error->actual,
            error->reread,
            error->expected);

  // Overwrite incorrect data with correct data to prevent
  // future miscompares when this data is reused.
  *(error->vaddr) = error->expected;
  os_->Flush(error->vaddr);
}


// Do a word by word result check of a region.
// Print errors on mismatches.
int WorkerThread::CheckRegion(void *addr,
                              class Pattern *pattern,
                              int64 length,
                              int offset,
                              int64 pattern_offset) {
  uint64 *memblock = static_cast<uint64*>(addr);
  const int kErrorLimit = 128;
  int errors = 0;
  int overflowerrors = 0;  // Count of overflowed errors.
  bool page_error = false;
  string errormessage("Hardware Error");
  struct ErrorRecord
    recorded[kErrorLimit];  // Queued errors for later printing.

  // For each word in the data region.
  for (int i = 0; i < length / wordsize_; i++) {
    uint64 actual = memblock[i];
    uint64 expected;

    // Determine the value that should be there.
    datacast_t data;
    int index = 2 * i + pattern_offset;
    data.l32.l = pattern->pattern(index);
    data.l32.h = pattern->pattern(index + 1);
    expected = data.l64;
    // Check tags if necessary.
    if (tag_mode_ && ((reinterpret_cast<uint64>(&memblock[i]) & 0x3f) == 0)) {
      expected = addr_to_tag(&memblock[i]);
    }


    // If the value is incorrect, save an error record for later printing.
    if (actual != expected) {
      if (errors < kErrorLimit) {
        recorded[errors].actual = actual;
        recorded[errors].expected = expected;
        recorded[errors].vaddr = &memblock[i];
        errors++;
      } else {
        page_error = true;
        // If we have overflowed the error queue, just print the errors now.
        logprintf(10, "Log: Error record overflow, too many miscompares!\n");
        errormessage = "Page Error";
        break;
      }
    }
  }

  // Check if this is a whole-block corruption, ie the block was
  // overwritten with a different valid pattern.
  if (page_error && !tag_mode_) {
    int patsize = patternlist_->Size();
    for (int pat = 0; pat < patsize; pat++) {
      class Pattern *altpattern = patternlist_->GetPattern(pat);
      const int kGood = 0;
      const int kBad = 1;
      const int kGoodAgain = 2;
      const int kNoMatch = 3;
      int state = kGood;
      unsigned int badstart = 0;
      unsigned int badend = 0;

      // Don't match against ourself!
      if (pattern == altpattern)
        continue;

      for (int i = 0; i < length / wordsize_; i++) {
        uint64 actual = memblock[i];
        datacast_t expected;
        datacast_t possible;

        // Determine the value that should be there.
        int index = 2 * i + pattern_offset;

        expected.l32.l = pattern->pattern(index);
        expected.l32.h = pattern->pattern(index + 1);

        // The alternate pattern that might have overwritten this block.
        possible.l32.l = altpattern->pattern(index);
        possible.l32.h = altpattern->pattern(index + 1);

        if (state == kGood) {
          if (actual == expected.l64) {
            continue;
          } else if (actual == possible.l64) {
            badstart = i;
            badend = i;
            state = kBad;
            continue;
          } else {
            state = kNoMatch;
            break;
          }
        } else if (state == kBad) {
          if (actual == possible.l64) {
            badend = i;
            continue;
          } else if (actual == expected.l64) {
            state = kGoodAgain;
            continue;
          } else {
            state = kNoMatch;
            break;
          }
        } else if (state == kGoodAgain) {
          if (actual == expected.l64) {
            continue;
          } else {
            state = kNoMatch;
            break;
          }
        }
      }

      if ((state == kGoodAgain) || (state == kBad)) {
        unsigned int blockerrors = badend - badstart + 1;
        errormessage = "Block Error";
        // It's okay for the 1st entry to be corrected multiple times,
        // it will simply be reported twice. Once here and once below
        // when processing the error queue.
        ProcessError(&recorded[0], 0, errormessage.c_str());
        logprintf(0, "Block Error: (%p) pattern %s instead of %s, "
                  "%d bytes from offset 0x%x to 0x%x\n",
                  &memblock[badstart],
                  altpattern->name(), pattern->name(),
                  blockerrors * wordsize_,
                  offset + badstart * wordsize_,
                  offset + badend * wordsize_);
      }
    }
  }


  // Process the error queue after all errors have been recorded.
  for (int err = 0; err < errors; err++) {
    int priority = 5;
    if (errorcount_ + err < 30)
      priority = 0;  // Bump up the priority for the first few errors.
    ProcessError(&recorded[err], priority, errormessage.c_str());
  }

  if (page_error) {
    // For each word in the data region.
    for (int i = 0; i < length / wordsize_; i++) {
      uint64 actual = memblock[i];
      uint64 expected;
      datacast_t data;
      // Determine the value that should be there.
      int index = 2 * i + pattern_offset;

      data.l32.l = pattern->pattern(index);
      data.l32.h = pattern->pattern(index + 1);
      expected = data.l64;

      // Check tags if necessary.
      if (tag_mode_ && ((reinterpret_cast<uint64>(&memblock[i]) & 0x3f) == 0)) {
        expected = addr_to_tag(&memblock[i]);
      }

      // If the value is incorrect, save an error record for later printing.
      if (actual != expected) {
        // If we have overflowed the error queue, print the errors now.
        struct ErrorRecord er;
        er.actual = actual;
        er.expected = expected;
        er.vaddr = &memblock[i];

        // Do the error printout. This will take a long time and
        // likely change the machine state.
        ProcessError(&er, 12, errormessage.c_str());
        overflowerrors++;
      }
    }
  }

  // Keep track of observed errors.
  errorcount_ += errors + overflowerrors;
  return errors + overflowerrors;
}

float WorkerThread::GetCopiedData() {
  return pages_copied_ * sat_->page_length() / kMegabyte;
}

// Calculate the CRC of a region.
// Fall back to a full result check if the CRC mismatches.
int WorkerThread::CrcCheckPage(struct page_entry *srcpe) {
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int errors = 0;

  const AdlerChecksum *expectedcrc = srcpe->pattern->crc();
  uint64 *memblock = static_cast<uint64*>(srcpe->addr);
  int blocks = sat_->page_length() / blocksize;
  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    uint64 *memslice = memblock + currentblock * blockwords;

    AdlerChecksum crc;
    if (tag_mode_) {
      AdlerAddrCrcC(memslice, blocksize, &crc, srcpe);
    } else {
      CalculateAdlerChecksum(memslice, blocksize, &crc);
    }

    // If the CRC does not match, we'd better look closer.
    if (!crc.Equals(*expectedcrc)) {
      logprintf(11, "Log: CrcCheckPage Falling through to slow compare, "
                "CRC mismatch %s != %s\n",
                crc.ToHexString().c_str(),
                expectedcrc->ToHexString().c_str());
      int errorcount = CheckRegion(memslice,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
      if (errorcount == 0) {
        logprintf(0, "Log: CrcCheckPage CRC mismatch %s != %s, "
                     "but no miscompares found.\n",
                  crc.ToHexString().c_str(),
                  expectedcrc->ToHexString().c_str());
      }
      errors += errorcount;
    }
  }

  // Handle leftovers for odd-length transfers; with page-aligned sizes
  // we should never hit this.
  int leftovers = sat_->page_length() % blocksize;
  if (leftovers) {
    uint64 *memslice = memblock + blocks * blockwords;
    errors += CheckRegion(memslice,
                          srcpe->pattern,
                          leftovers,
                          blocks * blocksize, 0);
  }
  return errors;
}


// Print error information about a data miscompare.
void WorkerThread::ProcessTagError(struct ErrorRecord *error,
                                   int priority,
                                   const char *message) {
  char dimm_string[256] = "";
  char tag_dimm_string[256] = "";
  bool read_error = false;

  int core_id = sched_getcpu();

  // Determine if this is a write or read error.
  os_->Flush(error->vaddr);
  error->reread = *(error->vaddr);

  // Distinguish read and write errors.
  if (error->actual != error->reread) {
    read_error = true;
  }

  sat_assert(error->expected != error->actual);

  error->vbyteaddr = reinterpret_cast<char*>(error->vaddr);

  // Find the physical addresses if possible.
  error->paddr = os_->VirtualToPhysical(error->vbyteaddr);
  error->tagpaddr = os_->VirtualToPhysical(error->tagvaddr);

  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));
  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->tagpaddr, tag_dimm_string, sizeof(tag_dimm_string));

  // Report parseable error.
  if (priority < 5) {
    logprintf(priority,
              "%s: Tag from %p(0x%llx:%s) (%s) "
              "miscompare on CPU %d(0x%s) at %p(0x%llx:%s): "
              "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n",
              message,
              error->tagvaddr, error->tagpaddr,
              tag_dimm_string,
              read_error ? "read error" : "write error",
              core_id,
              CurrentCpusFormat().c_str(),
              error->vaddr,
              error->paddr,
              dimm_string,
              error->actual,
              error->reread,
              error->expected);
  }

  errorcount_ += 1;

  // Overwrite incorrect data with correct data to prevent
  // future miscompares when this data is reused.
  *(error->vaddr) = error->expected;
  os_->Flush(error->vaddr);
}


// Print out and log a tag error.
bool WorkerThread::ReportTagError(
    uint64 *mem64,
    uint64 actual,
    uint64 tag) {
  struct ErrorRecord er;
  er.actual = actual;

  er.expected = tag;
  er.vaddr = mem64;

  // Generate vaddr from tag.
  er.tagvaddr = reinterpret_cast<uint64*>(actual);

  ProcessTagError(&er, 0, "Hardware Error");
  return true;
}

// C implementation of Adler memory copy, with memory tagging.
bool WorkerThread::AdlerAddrMemcpyC(uint64 *dstmem64,
                                    uint64 *srcmem64,
                                    unsigned int size_in_bytes,
                                    AdlerChecksum *checksum,
                                    struct page_entry *pe) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  datacast_t dstdata;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 19)) {
    // Size is too large: must be at most 512K 64-bit words (4 MB).
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  class Pattern *pattern = pe->pattern;

  unsigned int i = 0;
  while (i < count) {
    // Process 64 bits at a time.
    if ((i & 0x7) == 0) {
      data.l64 = srcmem64[i];
      dstdata.l64 = dstmem64[i];
      uint64 src_tag = addr_to_tag(&srcmem64[i]);
      uint64 dst_tag = addr_to_tag(&dstmem64[i]);
      // Detect if tags have been corrupted.
      if (data.l64 != src_tag)
        ReportTagError(&srcmem64[i], data.l64, src_tag);
      if (dstdata.l64 != dst_tag)
        ReportTagError(&dstmem64[i], dstdata.l64, dst_tag);

      data.l32.l = pattern->pattern(i << 1);
      data.l32.h = pattern->pattern((i << 1) + 1);
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;

      data.l64  = dst_tag;
      dstmem64[i] = data.l64;

    } else {
      data.l64 = srcmem64[i];
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;
      dstmem64[i] = data.l64;
    }
    i++;

    data.l64 = srcmem64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    dstmem64[i] = data.l64;
    i++;
  }
  checksum->Set(a1, a2, b1, b2);
  return true;
}

// x86_64 SSE2 assembly implementation of Adler memory copy, with address
// tagging added as a second step. This is useful for debugging failures
// that only occur when SSE / nontemporal writes are used.
bool WorkerThread::AdlerAddrMemcpyWarm(uint64 *dstmem64,
                                       uint64 *srcmem64,
                                       unsigned int size_in_bytes,
                                       AdlerChecksum *checksum,
                                       struct page_entry *pe) {
  // Do ASM copy, ignore checksum.
  AdlerChecksum ignored_checksum;
  os_->AdlerMemcpyWarm(dstmem64, srcmem64, size_in_bytes, &ignored_checksum);

  // Force cache flush of both the source and destination addresses.
  //  length - length of block to flush in cachelines.
  //  mem_increment - number of dstmem/srcmem values per cacheline.
  int length = size_in_bytes / kCacheLineSize;
  int mem_increment = kCacheLineSize / sizeof(*dstmem64);
  OsLayer::FastFlushSync();
  for (int i = 0; i < length; ++i) {
    OsLayer::FastFlushHint(dstmem64 + (i * mem_increment));
    OsLayer::FastFlushHint(srcmem64 + (i * mem_increment));
  }
  OsLayer::FastFlushSync();

  // Check results.
  AdlerAddrCrcC(srcmem64, size_in_bytes, checksum, pe);
  // Patch up address tags.
  TagAddrC(dstmem64, size_in_bytes);
  return true;
}

// Retag pages: rewrite the address tag on each cacheline.
bool WorkerThread::TagAddrC(uint64 *memwords,
                            unsigned int size_in_bytes) {
  // Write an address tag at the start of each cacheline (every 8th word).
  int length = size_in_bytes / wordsize_;
  for (int i = 0; i < length; i += 8) {
    datacast_t data;
    data.l64 = addr_to_tag(&memwords[i]);
    memwords[i] = data.l64;
  }
  return true;
}
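
// Illustrative sketch (not part of the original source): in tag mode, the
// first 64-bit word of every 64-byte cacheline holds its own virtual
// address, so a miscompare there reveals where stale data was written from.
// Layout of one cacheline (8 x uint64):
//   word 0:    addr_to_tag(&word 0)  <- address tag
//   words 1-7: pattern data
#if 0
  // Hypothetical check, for illustration: verify the tags laid down above.
  void ExampleCheckTags(uint64 *memwords, unsigned int size_in_bytes) {
    for (unsigned int i = 0; i < size_in_bytes / sizeof(uint64); i += 8)
      sat_assert(memwords[i] == addr_to_tag(&memwords[i]));
  }
#endif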

// C implementation of Adler memory crc.
bool WorkerThread::AdlerAddrCrcC(uint64 *srcmem64,
                                 unsigned int size_in_bytes,
                                 AdlerChecksum *checksum,
                                 struct page_entry *pe) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 19)) {
    // Size is too large: must be at most 512K 64-bit words (4 MB).
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  class Pattern *pattern = pe->pattern;

  unsigned int i = 0;
  while (i < count) {
    // Process 64 bits at a time.
    if ((i & 0x7) == 0) {
      data.l64 = srcmem64[i];
      uint64 src_tag = addr_to_tag(&srcmem64[i]);
      // Check that tags match expected.
      if (data.l64 != src_tag)
        ReportTagError(&srcmem64[i], data.l64, src_tag);

      data.l32.l = pattern->pattern(i << 1);
      data.l32.h = pattern->pattern((i << 1) + 1);
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;
    } else {
      data.l64 = srcmem64[i];
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;
    }
    i++;

    data.l64 = srcmem64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    i++;
  }
  checksum->Set(a1, a2, b1, b2);
  return true;
}
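
// Illustrative sketch (not part of the original source): the loops above are
// a two-lane unrolling of the basic Adler update. In scalar form, each
// 32-bit chunk 'x' updates the running sums as:
#if 0
  a = a + x;  // Sum of data chunks (starts at 1).
  b = b + a;  // Sum of the running 'a' values (starts at 0).
#endif
// Odd and even 64-bit words feed separate (a1, b1) / (a2, b2) lanes so the
// two dependency chains can overlap; AdlerChecksum::Set() stores all four
// partial sums for comparison against the pattern's precomputed CRC.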

// Copy a block of memory quickly, while keeping a CRC of the data.
// Fall back to a full result check if the CRC mismatches.
int WorkerThread::CrcCopyPage(struct page_entry *dstpe,
                              struct page_entry *srcpe) {
  int errors = 0;
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy
  uint64 *targetmembase = static_cast<uint64*>(dstpe->addr);
  uint64 *sourcemembase = static_cast<uint64*>(srcpe->addr);
  // Remember the expected CRC
  const AdlerChecksum *expectedcrc = srcpe->pattern->crc();

  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    uint64 *targetmem = targetmembase + currentblock * blockwords;
    uint64 *sourcemem = sourcemembase + currentblock * blockwords;

    AdlerChecksum crc;
    if (tag_mode_) {
      AdlerAddrMemcpyC(targetmem, sourcemem, blocksize, &crc, srcpe);
    } else {
      AdlerMemcpyC(targetmem, sourcemem, blocksize, &crc);
    }

    // Investigate miscompares.
    if (!crc.Equals(*expectedcrc)) {
      logprintf(11, "Log: CrcCopyPage Falling through to slow compare, "
                "CRC mismatch %s != %s\n", crc.ToHexString().c_str(),
                expectedcrc->ToHexString().c_str());
      int errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
      if (errorcount == 0) {
        logprintf(0, "Log: CrcCopyPage CRC mismatch %s != %s, "
                     "but no miscompares found. Retrying with fresh data.\n",
                  crc.ToHexString().c_str(),
                  expectedcrc->ToHexString().c_str());
        if (!tag_mode_) {
          // Copy the data originally read from this region back again.
          // This data should contain any corruption read originally while
          // calculating the CRC.
          memcpy(sourcemem, targetmem, blocksize);
          errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
          if (errorcount == 0) {
            int core_id = sched_getcpu();
            logprintf(0, "Process Error: CPU %d(0x%s) CrcCopyPage "
                         "CRC mismatch %s != %s, "
                         "but no miscompares found on second pass.\n",
                      core_id, CurrentCpusFormat().c_str(),
                      crc.ToHexString().c_str(),
                      expectedcrc->ToHexString().c_str());
            struct ErrorRecord er;
            er.actual = sourcemem[0];
            er.expected = 0xbad00000ull << 32;
            er.vaddr = sourcemem;
            ProcessError(&er, 0, "Hardware Error");
            errors += 1;
            errorcount_++;
          }
        }
      }
      errors += errorcount;
    }
  }

  // Handle leftovers for odd-length transfers; with page-aligned sizes
  // we should never hit this.
  int leftovers = sat_->page_length() % blocksize;
  if (leftovers) {
    uint64 *targetmem = targetmembase + blocks * blockwords;
    uint64 *sourcemem = sourcemembase + blocks * blockwords;

    errors += CheckRegion(sourcemem,
                          srcpe->pattern,
                          leftovers,
                          blocks * blocksize, 0);
    int leftoverwords = leftovers / wordsize_;
    for (int i = 0; i < leftoverwords; i++) {
      targetmem[i] = sourcemem[i];
    }
  }

  // Update pattern reference to reflect new contents.
  dstpe->pattern = srcpe->pattern;

  // Clean clean clean the errors away.
  if (errors) {
    // TODO(nsanders): Maybe we should patch rather than fill? Filling may
    // cause bad data to be propagated across the page.
    FillPage(dstpe);
  }
  return errors;
}
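
// Note: the "retrying with fresh data" path above separates bad memory from
// a bad copy. The freshly copied data is checked a second time; if both
// passes are clean while the CRC still mismatched, the miscompare is
// attributed to the CPU that performed the copy and logged as a Process
// Error rather than a memory error.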



// Invert a block of memory quickly, traversing downwards.
int InvertThread::InvertPageDown(struct page_entry *srcpe) {
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy
  unsigned int *sourcemembase = static_cast<unsigned int *>(srcpe->addr);

  for (int currentblock = blocks - 1; currentblock >= 0; currentblock--) {
    unsigned int *sourcemem = sourcemembase + currentblock * blockwords;
    for (int i = blockwords - 32; i >= 0; i -= 32) {
      for (int index = i + 31; index >= i; --index) {
        unsigned int actual = sourcemem[index];
        sourcemem[index] = ~actual;
      }
      OsLayer::FastFlush(&sourcemem[i]);
    }
  }

  return 0;
}

// Invert a block of memory, traversing upwards.
int InvertThread::InvertPageUp(struct page_entry *srcpe) {
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy
  unsigned int *sourcemembase = static_cast<unsigned int *>(srcpe->addr);

  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    unsigned int *sourcemem = sourcemembase + currentblock * blockwords;
    for (int i = 0; i < blockwords; i += 32) {
      for (int index = i; index <= i + 31; ++index) {
        unsigned int actual = sourcemem[index];
        sourcemem[index] = ~actual;
      }
      OsLayer::FastFlush(&sourcemem[i]);
    }
  }
  return 0;
}
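
// Note: inverting a word twice restores it (~~x == x), so the Up/Down/Down/Up
// sequence in InvertThread::Work() below leaves the page holding its original
// pattern, and the strict-mode CRC checks before and after still apply.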

// Copy a block of memory quickly, while keeping a CRC of the data.
// Fall back to a full result check if the CRC mismatches. Warm the CPU
// while running.
int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
                                  struct page_entry *srcpe) {
  int errors = 0;
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy
  uint64 *targetmembase = static_cast<uint64*>(dstpe->addr);
  uint64 *sourcemembase = static_cast<uint64*>(srcpe->addr);
  // Remember the expected CRC
  const AdlerChecksum *expectedcrc = srcpe->pattern->crc();

  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    uint64 *targetmem = targetmembase + currentblock * blockwords;
    uint64 *sourcemem = sourcemembase + currentblock * blockwords;

    AdlerChecksum crc;
    if (tag_mode_) {
      AdlerAddrMemcpyWarm(targetmem, sourcemem, blocksize, &crc, srcpe);
    } else {
      os_->AdlerMemcpyWarm(targetmem, sourcemem, blocksize, &crc);
    }

    // Investigate miscompares.
    if (!crc.Equals(*expectedcrc)) {
      logprintf(11, "Log: CrcWarmCopyPage Falling through to slow compare, "
                "CRC mismatch %s != %s\n", crc.ToHexString().c_str(),
                expectedcrc->ToHexString().c_str());
      int errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
      if (errorcount == 0) {
        logprintf(0, "Log: CrcWarmCopyPage CRC mismatch expected: %s != "
                     "actual: %s, but no miscompares found. "
                     "Retrying with fresh data.\n",
                  expectedcrc->ToHexString().c_str(),
                  crc.ToHexString().c_str());
        if (!tag_mode_) {
          // Copy the data originally read from this region back again.
          // This data should contain any corruption read originally while
          // calculating the CRC.
          memcpy(sourcemem, targetmem, blocksize);
          errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
          if (errorcount == 0) {
            int core_id = sched_getcpu();
            logprintf(0, "Process Error: CPU %d(0x%s) CrcWarmCopyPage "
                         "CRC mismatch %s != %s, "
                         "but no miscompares found on second pass.\n",
                      core_id, CurrentCpusFormat().c_str(),
                      crc.ToHexString().c_str(),
                      expectedcrc->ToHexString().c_str());
            struct ErrorRecord er;
            er.actual = sourcemem[0];
            er.expected = 0xbad;
            er.vaddr = sourcemem;
            ProcessError(&er, 0, "Hardware Error");
            errors++;
            errorcount_++;
          }
        }
      }
      errors += errorcount;
    }
  }

  // Handle leftovers for odd-length transfers; with page-aligned sizes
  // we should never hit this.
  int leftovers = sat_->page_length() % blocksize;
  if (leftovers) {
    uint64 *targetmem = targetmembase + blocks * blockwords;
    uint64 *sourcemem = sourcemembase + blocks * blockwords;

    errors += CheckRegion(sourcemem,
                          srcpe->pattern,
                          leftovers,
                          blocks * blocksize, 0);
    int leftoverwords = leftovers / wordsize_;
    for (int i = 0; i < leftoverwords; i++) {
      targetmem[i] = sourcemem[i];
    }
  }

  // Update pattern reference to reflect new contents.
  dstpe->pattern = srcpe->pattern;

  // Clean clean clean the errors away.
  if (errors) {
    // TODO(nsanders): Maybe we should patch rather than fill? Filling may
    // cause bad data to be propagated across the page.
    FillPage(dstpe);
  }
  return errors;
}



// Memory check work loop. Execute until done, then exhaust pages.
bool CheckThread::Work() {
  struct page_entry pe;
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting Check thread %d\n", thread_num_);

  // We want to check all the pages, and
  // stop when there aren't any left.
  while (true) {
    result = result && sat_->GetValid(&pe);
    if (!result) {
      if (IsReadyToRunNoPause())
        logprintf(0, "Process Error: check_thread failed to pop pages, "
                  "bailing\n");
      else
        result = true;
      break;
    }

    // Do the result check.
    CrcCheckPage(&pe);

    // Push pages back on the valid queue if we are still going,
    // throw them out otherwise.
    if (IsReadyToRunNoPause())
      result = result && sat_->PutValid(&pe);
    else
      result = result && sat_->PutEmpty(&pe);
    if (!result) {
      logprintf(0, "Process Error: check_thread failed to push pages, "
                "bailing\n");
      break;
    }
    loops++;
  }

  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Check thread. Status %d, %d pages checked\n",
            thread_num_, status_, pages_copied_);
  return result;
}


// Memory copy work loop. Execute until marked done.
bool CopyThread::Work() {
  struct page_entry src;
  struct page_entry dst;
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting copy thread %d: cpu %s, mem %x\n",
            thread_num_, cpuset_format(&cpu_mask_).c_str(), tag_);

  while (IsReadyToRun()) {
    // Pop the needed pages.
    result = result && sat_->GetValid(&src, tag_);
    result = result && sat_->GetEmpty(&dst, tag_);
    if (!result) {
      logprintf(0, "Process Error: copy_thread failed to pop pages, "
                "bailing\n");
      break;
    }

    // Force errors for unittests.
    if (sat_->error_injection()) {
      if (loops == 8) {
        char *addr = reinterpret_cast<char*>(src.addr);
        int offset = random() % sat_->page_length();
        addr[offset] = 0xba;
      }
    }

    // We can use memcpy, or CRC check while we copy.
    if (sat_->warm()) {
      CrcWarmCopyPage(&dst, &src);
    } else if (sat_->strict()) {
      CrcCopyPage(&dst, &src);
    } else {
      memcpy(dst.addr, src.addr, sat_->page_length());
      dst.pattern = src.pattern;
    }

    result = result && sat_->PutValid(&dst);
    result = result && sat_->PutEmpty(&src);

    // Copy worker-threads yield themselves at the end of each copy loop,
    // to avoid threads from preempting each other in the middle of the inner
    // copy-loop. Cooperation between copy worker-threads results in less
    // unnecessary cache thrashing (which happens when context-switching in
    // the middle of the inner copy-loop).
    YieldSelf();

    if (!result) {
      logprintf(0, "Process Error: copy_thread failed to push pages, "
                "bailing\n");
      break;
    }
    loops++;
  }

  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Copy thread. Status %d, %d pages copied\n",
            thread_num_, status_, pages_copied_);
  return result;
}

// Memory invert work loop. Execute until marked done.
bool InvertThread::Work() {
  struct page_entry src;
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting invert thread %d\n", thread_num_);

  while (IsReadyToRun()) {
    // Pop the needed pages.
    result = result && sat_->GetValid(&src);
    if (!result) {
      logprintf(0, "Process Error: invert_thread failed to pop pages, "
                "bailing\n");
      break;
    }

    if (sat_->strict())
      CrcCheckPage(&src);

    // For the same reason CopyThread yields itself (see YieldSelf comment
    // in CopyThread::Work()), InvertThread yields itself after each invert
    // operation to improve cooperation between different worker threads
    // stressing the memory/cache.
    InvertPageUp(&src);
    YieldSelf();
    InvertPageDown(&src);
    YieldSelf();
    InvertPageDown(&src);
    YieldSelf();
    InvertPageUp(&src);
    YieldSelf();

    if (sat_->strict())
      CrcCheckPage(&src);

    result = result && sat_->PutValid(&src);
    if (!result) {
      logprintf(0, "Process Error: invert_thread failed to push pages, "
                "bailing\n");
      break;
    }
    loops++;
  }

  pages_copied_ = loops * 2;
  status_ = result;
  logprintf(9, "Log: Completed %d: Invert thread. Status %d, %d pages copied\n",
            thread_num_, status_, pages_copied_);
  return result;
}
1590 
1591 
1592 // Set file name to use for File IO.
SetFile(const char * filename_init)1593 void FileThread::SetFile(const char *filename_init) {
1594   filename_ = filename_init;
1595   devicename_ = os_->FindFileDevice(filename_);
1596 }
1597 
1598 // Open the file for access.
OpenFile(int * pfile)1599 bool FileThread::OpenFile(int *pfile) {
1600   int flags = O_RDWR | O_CREAT | O_SYNC;
1601   int fd = open(filename_.c_str(), flags | O_DIRECT, 0644);
1602   if (O_DIRECT != 0 && fd < 0 && errno == EINVAL) {
1603     fd = open(filename_.c_str(), flags, 0644);  // Try without O_DIRECT
1604     os_->ActivateFlushPageCache();  // Not using O_DIRECT fixed EINVAL
1605   }
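  // Note: EINVAL from open() here typically means the underlying filesystem
  // (tmpfs, for example) does not support O_DIRECT. Buffered IO is fine for
  // this test as long as the page cache is flushed before reads, which is
  // what ActivateFlushPageCache() arranges.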
  if (fd < 0) {
    logprintf(0, "Process Error: Failed to create file %s!!\n",
              filename_.c_str());
    pages_copied_ = 0;
    return false;
  }
  *pfile = fd;
  return true;
}

// Close the file.
bool FileThread::CloseFile(int fd) {
  close(fd);
  return true;
}

// Tag each sector of the page with identifying data.
bool FileThread::SectorTagPage(struct page_entry *src, int block) {
  int page_length = sat_->page_length();
  struct FileThread::SectorTag *tag =
    (struct FileThread::SectorTag *)(src->addr);

  // Tag each sector.
  unsigned char magic = ((0xba + thread_num_) & 0xff);
  for (int sec = 0; sec < page_length / 512; sec++) {
    tag[sec].magic = magic;
    tag[sec].block = block & 0xff;
    tag[sec].sector = sec & 0xff;
    tag[sec].pass = pass_ & 0xff;
  }
  return true;
}
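// For reference: SectorTag (declared in worker.h) is assumed to be a
// 512-byte struct whose first four bytes are the magic, block, sector, and
// pass fields, padded out to the sector size. The
// sat_assert(sizeof(*tag) == 512) in SectorValidatePage() below relies on
// exactly that layout, which lets one tag overlay each 512-byte sector.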

bool FileThread::WritePageToFile(int fd, struct page_entry *src) {
  int page_length = sat_->page_length();
  // Fill the file with our data.
  int64 size = write(fd, src->addr, page_length);

  if (size != page_length) {
    os_->ErrorReport(devicename_.c_str(), "write-error", 1);
    errorcount_++;
    logprintf(0, "Block Error: file_thread failed to write, "
              "bailing\n");
    return false;
  }
  return true;
}

// Write the data to the file.
bool FileThread::WritePages(int fd) {
  int strict = sat_->strict();

  // Start fresh at beginning of file for each batch of pages.
  lseek64(fd, 0, SEEK_SET);
  for (int i = 0; i < sat_->disk_pages(); i++) {
    struct page_entry src;
    if (!GetValidPage(&src))
      return false;
    // Save expected pattern.
    page_recs_[i].pattern = src.pattern;
    page_recs_[i].src = src.addr;

    // Check data correctness.
    if (strict)
      CrcCheckPage(&src);

    SectorTagPage(&src, i);

    bool result = WritePageToFile(fd, &src);

    if (!PutEmptyPage(&src))
      return false;

    if (!result)
      return false;
  }
  return os_->FlushPageCache();  // If O_DIRECT worked, this will be a NOP.
}

// Copy data from file into memory block.
bool FileThread::ReadPageFromFile(int fd, struct page_entry *dst) {
  int page_length = sat_->page_length();

  // Do the actual read.
  int64 size = read(fd, dst->addr, page_length);
  if (size != page_length) {
    os_->ErrorReport(devicename_.c_str(), "read-error", 1);
    logprintf(0, "Block Error: file_thread failed to read, "
              "bailing\n");
    errorcount_++;
    return false;
  }
  return true;
}

// Check sector tagging.
bool FileThread::SectorValidatePage(const struct PageRec &page,
                                    struct page_entry *dst, int block) {
  // Error injection.
  static int calls = 0;
  calls++;

  // Do sector tag compare.
  int firstsector = -1;
  int lastsector = -1;
  bool badsector = false;
  int page_length = sat_->page_length();

  // Cast data block into an array of tagged sectors.
  struct FileThread::SectorTag *tag =
    (struct FileThread::SectorTag *)(dst->addr);

  sat_assert(sizeof(*tag) == 512);

  // Error injection.
  if (sat_->error_injection()) {
    if (calls == 2) {
      for (int badsec = 8; badsec < 17; badsec++)
        tag[badsec].pass = 27;
    }
    if (calls == 18) {
      (static_cast<int32*>(dst->addr))[27] = 0xbadda7a;
    }
  }

  // Check each sector for the correct tag we added earlier,
  // then revert the tag to the normal data pattern.
  unsigned char magic = ((0xba + thread_num_) & 0xff);
  for (int sec = 0; sec < page_length / 512; sec++) {
    // Check magic tag.
    if ((tag[sec].magic != magic) ||
        (tag[sec].block != (block & 0xff)) ||
        (tag[sec].sector != (sec & 0xff)) ||
        (tag[sec].pass != (pass_ & 0xff))) {
      // Offset calculation for tag location.
      int offset = sec * sizeof(SectorTag);
      if (tag[sec].block != (block & 0xff))
        offset += 1 * sizeof(uint8);
      else if (tag[sec].sector != (sec & 0xff))
        offset += 2 * sizeof(uint8);
      else if (tag[sec].pass != (pass_ & 0xff))
        offset += 3 * sizeof(uint8);

      // Run sector tag error through diagnoser for logging and reporting.
      errorcount_ += 1;
      os_->error_diagnoser_->AddHDDSectorTagError(devicename_, tag[sec].block,
                                                  offset,
                                                  tag[sec].sector,
                                                  page.src, page.dst);

      logprintf(5, "Sector Error: Sector tag @ 0x%x, pass %d/%d. "
                "sec %x/%x, block %d/%d, magic %x/%x, File: %s \n",
                block * page_length + 512 * sec,
                (pass_ & 0xff), (unsigned int)tag[sec].pass,
                sec, (unsigned int)tag[sec].sector,
                block, (unsigned int)tag[sec].block,
                magic, (unsigned int)tag[sec].magic,
                filename_.c_str());

      // Keep track of first and last bad sector.
      if (firstsector == -1)
        firstsector = (block * page_length / 512) + sec;
      lastsector = (block * page_length / 512) + sec;
      badsector = true;
    }
    // Patch tag back to proper pattern.
    unsigned int *addr = (unsigned int *)(&tag[sec]);
    *addr = dst->pattern->pattern(512 * sec / sizeof(*addr));
  }
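  // With every tag byte reverted to the underlying pattern, the CRC check
  // performed later in ReadPages() can validate the whole page against its
  // expected pattern without tripping over the tag bytes.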

  // If we found sector errors:
  if (badsector == true) {
    logprintf(5, "Log: file sector miscompare at offset %x-%x. File: %s\n",
              firstsector * 512,
              ((lastsector + 1) * 512) - 1,
              filename_.c_str());

    // Either exit immediately, or patch the data up and continue.
    if (sat_->stop_on_error()) {
      exit(1);
    } else {
      // Patch up bad pages.
      for (int block = (firstsector * 512) / page_length;
          block <= (lastsector * 512) / page_length;
          block++) {
        unsigned int *memblock = static_cast<unsigned int *>(dst->addr);
        int length = page_length / wordsize_;
        for (int i = 0; i < length; i++) {
          memblock[i] = dst->pattern->pattern(i);
        }
      }
    }
  }
  return true;
}

// Allocate a local buffer for data transfers, if direct IO to SAT pages
// isn't possible.
bool FileThread::PagePrepare() {
  // We can only do direct IO to SAT pages if it is normal mem.
  page_io_ = os_->normal_mem();

  // Init a local buffer if we need it.
  if (!page_io_) {
#ifdef HAVE_POSIX_MEMALIGN
    int result = posix_memalign(&local_page_, 512, sat_->page_length());
#else
    local_page_ = memalign(512, sat_->page_length());
    int result = (local_page_ == 0);
#endif
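    // 512-byte alignment matches the device sector size, which is what
    // O_DIRECT transfers generally require of their buffers.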
    if (result) {
      logprintf(0, "Process Error: disk thread posix_memalign "
                   "returned %d (fail)\n",
                result);
      status_ = false;
      return false;
    }
  }
  return true;
}


// Remove memory allocated for data transfer.
bool FileThread::PageTeardown() {
  // Free a local buffer if we need to.
  if (!page_io_) {
    free(local_page_);
  }
  return true;
}



// Get memory for an incoming data transfer.
bool FileThread::GetEmptyPage(struct page_entry *dst) {
  if (page_io_) {
    if (!sat_->GetEmpty(dst))
      return false;
  } else {
    dst->addr = local_page_;
    dst->offset = 0;
    dst->pattern = 0;
  }
  return true;
}

// Get memory for an outgoing data transfer.
bool FileThread::GetValidPage(struct page_entry *src) {
  struct page_entry tmp;
  if (!sat_->GetValid(&tmp))
    return false;
  if (page_io_) {
    *src = tmp;
    return true;
  } else {
    src->addr = local_page_;
    src->offset = 0;
    CrcCopyPage(src, &tmp);
    if (!sat_->PutValid(&tmp))
      return false;
  }
  return true;
}


// Throw out a used empty page.
bool FileThread::PutEmptyPage(struct page_entry *src) {
  if (page_io_) {
    if (!sat_->PutEmpty(src))
      return false;
  }
  return true;
}

// Throw out a used, filled page.
bool FileThread::PutValidPage(struct page_entry *src) {
  if (page_io_) {
    if (!sat_->PutValid(src))
      return false;
  }
  return true;
}

// Copy data from file into memory blocks.
bool FileThread::ReadPages(int fd) {
  int page_length = sat_->page_length();
  int strict = sat_->strict();
  bool result = true;

  // Read our data back out of the file, into its new location.
  lseek64(fd, 0, SEEK_SET);
  for (int i = 0; i < sat_->disk_pages(); i++) {
    struct page_entry dst;
    if (!GetEmptyPage(&dst))
      return false;
    // Retrieve expected pattern.
    dst.pattern = page_recs_[i].pattern;
    // Update page record.
    page_recs_[i].dst = dst.addr;

    // Read from the file into destination page.
    if (!ReadPageFromFile(fd, &dst)) {
        PutEmptyPage(&dst);
        return false;
    }

    SectorValidatePage(page_recs_[i], &dst, i);

    // Ensure that the transfer ended up with correct data.
    if (strict) {
      // Record page index currently CRC checked.
      crc_page_ = i;
      int errors = CrcCheckPage(&dst);
      if (errors) {
        logprintf(5, "Log: file miscompare at block %d, "
                  "offset %x-%x. File: %s\n",
                  i, i * page_length, ((i + 1) * page_length) - 1,
                  filename_.c_str());
        result = false;
      }
      crc_page_ = -1;
      errorcount_ += errors;
    }
    if (!PutValidPage(&dst))
      return false;
  }
  return result;
}

// File IO work loop. Execute until marked done.
bool FileThread::Work() {
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting file thread %d, file %s, device %s\n",
            thread_num_,
            filename_.c_str(),
            devicename_.c_str());

  if (!PagePrepare()) {
    status_ = false;
    return false;
  }

  // Open the data IO file.
  int fd = 0;
  if (!OpenFile(&fd)) {
    status_ = false;
    return false;
  }

  pass_ = 0;

  // Load patterns into page records.
  page_recs_ = new struct PageRec[sat_->disk_pages()];
  for (int i = 0; i < sat_->disk_pages(); i++) {
    page_recs_[i].pattern = new class Pattern();
  }

  // Loop until done.
  while (IsReadyToRun()) {
    // Do the file write.
    if (!(result = result && WritePages(fd)))
      break;

    // Do the file read.
    if (!(result = result && ReadPages(fd)))
      break;

    loops++;
    pass_ = loops;
  }

  pages_copied_ = loops * sat_->disk_pages();

  // Clean up.
  CloseFile(fd);
  PageTeardown();

  logprintf(9, "Log: Completed %d: file thread status %d, %d pages copied\n",
            thread_num_, status_, pages_copied_);
  // Failure to read from the device indicates a hardware error,
  // rather than a procedural SW error.
  status_ = true;
  return true;
}

bool NetworkThread::IsNetworkStopSet() {
  return !IsReadyToRunNoPause();
}

bool NetworkSlaveThread::IsNetworkStopSet() {
  // This thread has no completion status.
  // It finishes whenever there is no more data to be
  // passed back.
  return true;
}

// Set IP address to use for network IO.
void NetworkThread::SetIP(const char *ipaddr_init) {
  strncpy(ipaddr_, ipaddr_init, 256);
}

// Create a socket.
// Return false on error.
bool NetworkThread::CreateSocket(int *psocket) {
  int sock = socket(AF_INET, SOCK_STREAM, 0);
  if (sock == -1) {
    logprintf(0, "Process Error: Cannot open socket\n");
    pages_copied_ = 0;
    status_ = false;
    return false;
  }
  *psocket = sock;
  return true;
}

// Close the socket.
bool NetworkThread::CloseSocket(int sock) {
  close(sock);
  return true;
}

// Initiate the tcp connection.
bool NetworkThread::Connect(int sock) {
  struct sockaddr_in dest_addr;
  dest_addr.sin_family = AF_INET;
  dest_addr.sin_port = htons(kNetworkPort);
  memset(&(dest_addr.sin_zero), '\0', sizeof(dest_addr.sin_zero));

  // Translate dot notation to u32.
  if (inet_aton(ipaddr_, &dest_addr.sin_addr) == 0) {
    logprintf(0, "Process Error: Cannot resolve %s\n", ipaddr_);
    pages_copied_ = 0;
    status_ = false;
    return false;
  }

  if (-1 == connect(sock, reinterpret_cast<struct sockaddr *>(&dest_addr),
                    sizeof(struct sockaddr))) {
    logprintf(0, "Process Error: Cannot connect %s\n", ipaddr_);
    pages_copied_ = 0;
    status_ = false;
    return false;
  }
  return true;
}

// Set up the listening socket.
bool NetworkListenThread::Listen() {
  struct sockaddr_in sa;

  memset(&(sa.sin_zero), '\0', sizeof(sa.sin_zero));

  sa.sin_family = AF_INET;
  sa.sin_addr.s_addr = INADDR_ANY;
  sa.sin_port = htons(kNetworkPort);

  if (-1 == ::bind(sock_, (struct sockaddr*)&sa, sizeof(struct sockaddr))) {
    char buf[256];
    sat_strerror(errno, buf, sizeof(buf));
    logprintf(0, "Process Error: Cannot bind socket: %s\n", buf);
    pages_copied_ = 0;
    status_ = false;
    return false;
  }
  listen(sock_, 3);
  return true;
}

// Wait for a connection from a network traffic generation thread.
bool NetworkListenThread::Wait() {
    fd_set rfds;
    struct timeval tv;
    int retval;

    // Watch sock_ to see when it has input.
    FD_ZERO(&rfds);
    FD_SET(sock_, &rfds);
    // Wait up to five seconds.
    tv.tv_sec = 5;
    tv.tv_usec = 0;

    retval = select(sock_ + 1, &rfds, NULL, NULL, &tv);

    return (retval > 0);
}

// Accept a connection from a network traffic generation thread.
bool NetworkListenThread::GetConnection(int *pnewsock) {
  struct sockaddr_in sa;
  socklen_t size = sizeof(struct sockaddr_in);

  int newsock = accept(sock_, reinterpret_cast<struct sockaddr *>(&sa), &size);
  if (newsock < 0)  {
    logprintf(0, "Process Error: Did not receive connection\n");
    pages_copied_ = 0;
    status_ = false;
    return false;
  }
  *pnewsock = newsock;
  return true;
}

// Send a page, return false if a page was not sent.
bool NetworkThread::SendPage(int sock, struct page_entry *src) {
  int page_length = sat_->page_length();
  char *address = static_cast<char*>(src->addr);

  // Send our data over the network.
  int size = page_length;
  while (size) {
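    // send() may accept fewer bytes than requested, so loop until the whole
    // page has been transferred.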
    int transferred = send(sock, address + (page_length - size), size, 0);
    if ((transferred == 0) || (transferred == -1)) {
      if (!IsNetworkStopSet()) {
        char buf[256] = "";
        sat_strerror(errno, buf, sizeof(buf));
        logprintf(0, "Process Error: Thread %d, "
                     "Network write failed, bailing. (%s)\n",
                  thread_num_, buf);
        status_ = false;
      }
      return false;
    }
    size = size - transferred;
  }
  return true;
}

// Receive a page. Return false if a page was not received.
bool NetworkThread::ReceivePage(int sock, struct page_entry *dst) {
  int page_length = sat_->page_length();
  char *address = static_cast<char*>(dst->addr);

  // Maybe we will get our data back again, maybe not.
  int size = page_length;
  while (size) {
    int transferred = recv(sock, address + (page_length - size), size, 0);
    if ((transferred == 0) || (transferred == -1)) {
      // Typically the network slave thread should exit as the network master
      // thread stops sending data.
      if (IsNetworkStopSet()) {
        int err = errno;
        if (transferred == 0 && err == 0) {
          // Two system setups will not sync exactly,
          // allow early exit, but log it.
          logprintf(0, "Log: Net thread did not receive any data, exiting.\n");
        } else {
          char buf[256] = "";
          sat_strerror(err, buf, sizeof(buf));
          // Print why we failed.
          logprintf(0, "Process Error: Thread %d, "
                       "Network read failed, bailing (%s).\n",
                    thread_num_, buf);
          status_ = false;
          // Print arguments and results.
          logprintf(0, "Log: recv(%d, address %x, size %x, 0) == %x, err %d\n",
                    sock, address + (page_length - size),
                    size, transferred, err);
          if ((transferred == 0) &&
              (page_length - size < 512) &&
              (page_length - size > 0)) {
            // Print null terminated data received, to see who's been
            // sending us suspicious unwanted data.
            address[page_length - size] = 0;
            logprintf(0, "Log: received %d bytes: '%s'\n",
                      page_length - size, address);
          }
        }
      }
      return false;
    }
    size = size - transferred;
  }
  return true;
}

// Network IO work loop. Execute until marked done.
// Return true if the thread ran as expected.
bool NetworkThread::Work() {
  logprintf(9, "Log: Starting network thread %d, ip %s\n",
            thread_num_,
            ipaddr_);

  // Make a socket.
  int sock = 0;
  if (!CreateSocket(&sock))
    return false;

  // The network IO loop requires the network slave thread to have already
  // been initialized. We sleep here for a while to ensure that the slave
  // thread will be listening by the time we connect.
  // Sleep for 15 seconds.
  sat_sleep(15);
  logprintf(9, "Log: Starting execution of network thread %d, ip %s\n",
            thread_num_,
            ipaddr_);


  // Connect to a slave thread.
  if (!Connect(sock))
    return false;

  // Loop until done.
  bool result = true;
  int strict = sat_->strict();
  int64 loops = 0;
  while (IsReadyToRun()) {
    struct page_entry src;
    struct page_entry dst;
    result = result && sat_->GetValid(&src);
    result = result && sat_->GetEmpty(&dst);
    if (!result) {
      logprintf(0, "Process Error: net_thread failed to pop pages, "
                "bailing\n");
      break;
    }

    // Check data correctness.
    if (strict)
      CrcCheckPage(&src);

    // Do the network write.
    if (!(result = result && SendPage(sock, &src)))
      break;

    // Update pattern reference to reflect new contents.
    dst.pattern = src.pattern;

    // Do the network read.
    if (!(result = result && ReceivePage(sock, &dst)))
      break;

    // Ensure that the transfer ended up with correct data.
    if (strict)
      CrcCheckPage(&dst);

    // Return all of our pages to the queue.
    result = result && sat_->PutValid(&dst);
    result = result && sat_->PutEmpty(&src);
    if (!result) {
      logprintf(0, "Process Error: net_thread failed to push pages, "
                "bailing\n");
      break;
    }
    loops++;
  }

  pages_copied_ = loops;
  status_ = result;

  // Clean up.
  CloseSocket(sock);

  logprintf(9, "Log: Completed %d: network thread status %d, "
               "%d pages copied\n",
            thread_num_, status_, pages_copied_);
  return result;
}

// Spawn slave threads for incoming connections.
bool NetworkListenThread::SpawnSlave(int newsock, int threadid) {
  logprintf(12, "Log: Listen thread spawning slave\n");

  // Spawn slave thread, to reflect network traffic back to sender.
  ChildWorker *child_worker = new ChildWorker;
  child_worker->thread.SetSock(newsock);
  child_worker->thread.InitThread(threadid, sat_, os_, patternlist_,
                                  &child_worker->status);
  child_worker->status.Initialize();
  child_worker->thread.SpawnThread();
  child_workers_.push_back(child_worker);

  return true;
}

// Reap slave threads.
bool NetworkListenThread::ReapSlaves() {
  bool result = true;
  // Gather status and reap threads.
  logprintf(12, "Log: Joining all outstanding threads\n");

  for (size_t i = 0; i < child_workers_.size(); i++) {
    NetworkSlaveThread& child_thread = child_workers_[i]->thread;
    logprintf(12, "Log: Joining slave thread %d\n", i);
    child_thread.JoinThread();
    if (child_thread.GetStatus() != 1) {
      logprintf(0, "Process Error: Slave Thread %d failed with status %d\n", i,
                child_thread.GetStatus());
      result = false;
    }
    errorcount_ += child_thread.GetErrorCount();
    logprintf(9, "Log: Slave Thread %d found %lld miscompares\n", i,
              child_thread.GetErrorCount());
    pages_copied_ += child_thread.GetPageCount();
  }

  return result;
}

// Network listener IO work loop. Execute until marked done.
// Return false on fatal software error.
bool NetworkListenThread::Work() {
  logprintf(9, "Log: Starting network listen thread %d\n",
            thread_num_);

  // Make a socket.
  sock_ = 0;
  if (!CreateSocket(&sock_)) {
    status_ = false;
    return false;
  }
  logprintf(9, "Log: Listen thread created sock\n");

  // Allows incoming connections to be queued up by socket library.
  int newsock = 0;
  Listen();
  logprintf(12, "Log: Listen thread waiting for incoming connections\n");

  // Wait on incoming connections, and spawn worker threads for them.
  int threadcount = 0;
  while (IsReadyToRun()) {
    // Poll for connections that we can accept().
    if (Wait()) {
      // Accept those connections.
      logprintf(12, "Log: Listen thread found incoming connection\n");
      if (GetConnection(&newsock)) {
        SpawnSlave(newsock, threadcount);
        threadcount++;
      }
    }
  }

  // Gather status and join spawned threads.
  ReapSlaves();

  // Delete the child workers.
  for (ChildVector::iterator it = child_workers_.begin();
       it != child_workers_.end(); ++it) {
    (*it)->status.Destroy();
    delete *it;
  }
  child_workers_.clear();

  CloseSocket(sock_);

  status_ = true;
  logprintf(9,
            "Log: Completed %d: network listen thread status %d, "
            "%d pages copied\n",
            thread_num_, status_, pages_copied_);
  return true;
}

// Set network reflector socket struct.
void NetworkSlaveThread::SetSock(int sock) {
  sock_ = sock;
}

// Network reflector IO work loop. Execute until marked done.
// Return false on fatal software error.
bool NetworkSlaveThread::Work() {
  logprintf(9, "Log: Starting network slave thread %d\n",
            thread_num_);

  // Verify that we have a socket.
  int sock = sock_;
  if (!sock) {
    status_ = false;
    return false;
  }

  // Loop until done.
  int64 loops = 0;
  // Init a local buffer for storing data.
  void *local_page = NULL;
#ifdef HAVE_POSIX_MEMALIGN
  int result = posix_memalign(&local_page, 512, sat_->page_length());
#else
  local_page = memalign(512, sat_->page_length());
  int result = (local_page == 0);
#endif
  if (result) {
    logprintf(0, "Process Error: net slave posix_memalign "
                 "returned %d (fail)\n",
              result);
    status_ = false;
    return false;
  }

  struct page_entry page;
  page.addr = local_page;

  // This thread will continue to run as long as the thread on the other end
  // of the socket is still sending and receiving data.
  while (1) {
    // Do the network read.
    if (!ReceivePage(sock, &page))
      break;

    // Do the network write.
    if (!SendPage(sock, &page))
      break;

    loops++;
  }

  pages_copied_ = loops;
  // No results provided from this type of thread.
  status_ = true;

  // Clean up.
  CloseSocket(sock);

  logprintf(9,
            "Log: Completed %d: network slave thread status %d, "
            "%d pages copied\n",
            thread_num_, status_, pages_copied_);
  return true;
}

// Thread work loop. Execute until marked finished.
bool ErrorPollThread::Work() {
  logprintf(9, "Log: Starting system error poll thread %d\n", thread_num_);

  // This calls a generic error polling function in the Os abstraction layer.
  do {
    errorcount_ += os_->ErrorPoll();
    os_->ErrorWait();
  } while (IsReadyToRun());

  logprintf(9, "Log: Finished system error poll thread %d: %d errors\n",
            thread_num_, errorcount_);
  status_ = true;
  return true;
}

// Worker thread to heat up CPU.
// This thread does not evaluate pass/fail or software error.
bool CpuStressThread::Work() {
  logprintf(9, "Log: Starting CPU stress thread %d\n", thread_num_);

  do {
    // Run ludloff's platform/CPU-specific assembly workload.
    os_->CpuStressWorkload();
    YieldSelf();
  } while (IsReadyToRun());

  logprintf(9, "Log: Finished CPU stress thread %d:\n",
            thread_num_);
  status_ = true;
  return true;
}

CpuCacheCoherencyThread::CpuCacheCoherencyThread(cc_cacheline_data *data,
                                                 int cacheline_count,
                                                 int thread_num,
                                                 int thread_count,
                                                 int inc_count) {
  cc_cacheline_data_ = data;
  cc_cacheline_count_ = cacheline_count;
  cc_thread_num_ = thread_num;
  cc_thread_count_ = thread_count;
  cc_inc_count_ = inc_count;
}

// A very simple pseudorandom generator.  Since the random number is based
// on only a few simple logic operations, it can be done quickly in registers
// and the compiler can inline it.
uint64 CpuCacheCoherencyThread::SimpleRandom(uint64 seed) {
  return (seed >> 1) ^ (-(seed & 1) & kRandomPolynomial);
}
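// For reference, this is a Galois LFSR stepped to the right: each call
// shifts the state down one bit and XORs in kRandomPolynomial whenever the
// bit shifted out was 1 (-(seed & 1) is all-ones in that case, all-zeros
// otherwise). With a suitable polynomial this cycles through a long sequence
// of nonzero states using only shift, AND, negate, and XOR.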

// Worker thread to test the cache coherency of the CPUs.
// Return false on fatal sw error.
bool CpuCacheCoherencyThread::Work() {
  logprintf(9, "Log: Starting the Cache Coherency thread %d\n",
            cc_thread_num_);
  uint64 time_start, time_end;
  struct timeval tv;

  // Use a slightly more robust random number for the initial
  // value, so the random sequences from the simple generator will
  // be more divergent.
#ifdef HAVE_RAND_R
  unsigned int seed = static_cast<unsigned int>(gettid());
  uint64 r = static_cast<uint64>(rand_r(&seed));
  r |= static_cast<uint64>(rand_r(&seed)) << 32;
#else
  srand(time(NULL));
  uint64 r = static_cast<uint64>(rand());  // NOLINT
  r |= static_cast<uint64>(rand()) << 32;  // NOLINT
#endif

  gettimeofday(&tv, NULL);  // Get the timestamp before increments.
  time_start = tv.tv_sec * 1000000ULL + tv.tv_usec;

  uint64 total_inc = 0;  // Total increments done by the thread.
  while (IsReadyToRun()) {
    for (int i = 0; i < cc_inc_count_; i++) {
      // Choose a cache line structure at random and increment the member
      // at this thread's offset within it (usually the thread number).
      r = SimpleRandom(r);
      int cline_num = r % cc_cacheline_count_;
      int offset;
      // Reverse the order for odd numbered threads in odd numbered cache
      // lines.  This is designed for massively multi-core systems where the
      // number of cores exceeds the bytes in a cache line, so "distant"
      // cores get a chance to exercise cache coherency between them.
      if (cline_num & cc_thread_num_ & 1)
        offset = (cc_thread_count_ & ~1) - cc_thread_num_;
      else
        offset = cc_thread_num_;
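      // Example: with cc_thread_count_ == 8, thread 7 on an odd cache line
      // uses offset (8 & ~1) - 7 = 1, so high-numbered threads hit
      // low-numbered slots on those lines.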
      // Increment the member of the randomly selected structure.
      (cc_cacheline_data_[cline_num].num[offset])++;
    }

    total_inc += cc_inc_count_;

    // Check that the local counter matches the global value
    // in all the cache line structures for this particular thread.
    int cc_global_num = 0;
    for (int cline_num = 0; cline_num < cc_cacheline_count_; cline_num++) {
      int offset;
      // Perform the same offset calculation from above.
      if (cline_num & cc_thread_num_ & 1)
        offset = (cc_thread_count_ & ~1) - cc_thread_num_;
      else
        offset = cc_thread_num_;
      cc_global_num += cc_cacheline_data_[cline_num].num[offset];
      // Reset the cacheline member's value for the next run.
      cc_cacheline_data_[cline_num].num[offset] = 0;
    }
    if (sat_->error_injection())
      cc_global_num = -1;

    // Since the count is only stored in a byte, to squeeze more into a
    // single cache line, only compare it as a byte.  In the event that there
    // is something detected, the chance that it would be missed by a single
    // thread is 1 in 256.  If it affects all cores, that makes the chance
    // of it being missed vanishingly small.  It seems unlikely any failure
    // case would be off by more than a small number.
    if ((cc_global_num & 0xff) != (cc_inc_count_ & 0xff)) {
      errorcount_++;
      logprintf(0, "Hardware Error: global(%d) and local(%d) do not match\n",
                cc_global_num, cc_inc_count_);
    }
  }
  gettimeofday(&tv, NULL);  // Get the timestamp at the end.
  time_end = tv.tv_sec * 1000000ULL + tv.tv_usec;

  uint64 us_elapsed = time_end - time_start;
  // inc_rate is the number of increments per second.
  double inc_rate = total_inc * 1e6 / us_elapsed;

  logprintf(4, "Stats: CC Thread(%d): Time=%llu us,"
            " Increments=%llu, Increments/sec = %.6lf\n",
            cc_thread_num_, us_elapsed, total_inc, inc_rate);
  logprintf(9, "Log: Finished CPU Cache Coherency thread %d:\n",
            cc_thread_num_);
  status_ = true;
  return true;
}

DiskThread::DiskThread(DiskBlockTable *block_table) {
  read_block_size_ = kSectorSize;   // default 1 sector (512 bytes)
  write_block_size_ = kSectorSize;  // this assumes read and write block size
                                    // are the same
  segment_size_ = -1;               // use the entire disk as one segment
  cache_size_ = 16 * 1024 * 1024;   // assume 16MiB cache by default
  // Use a queue such that 3/2 times as much data as the cache can hold
  // is written before it is read so that there is little chance the read
  // data is in the cache.
  queue_size_ = ((cache_size_ / write_block_size_) * 3) / 2;
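  // Worked example with the defaults: 16 MiB cache / 512-byte blocks
  // = 32768 blocks, and 32768 * 3 / 2 = 49152 queued blocks.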
  blocks_per_segment_ = 32;

  read_threshold_ = 100000;         // 100ms is a reasonable limit for
  write_threshold_ = 100000;        // reading/writing a sector

  read_timeout_ = 5000000;          // 5 seconds should be long enough for a
  write_timeout_ = 5000000;         // timeout for reading/writing

  device_sectors_ = 0;
  non_destructive_ = 0;

#ifdef HAVE_LIBAIO_H
  aio_ctx_ = 0;
#endif
  block_table_ = block_table;
  update_block_table_ = 1;

  block_buffer_ = NULL;

  blocks_written_ = 0;
  blocks_read_ = 0;
}

DiskThread::~DiskThread() {
  if (block_buffer_)
    free(block_buffer_);
}

// Set filename for device file (in /dev).
void DiskThread::SetDevice(const char *device_name) {
  device_name_ = device_name;
}

// Set various parameters that control the behaviour of the test.
// -1 is used as a sentinel value on each parameter (except non_destructive)
// to indicate that the parameter should be left at its default.
bool DiskThread::SetParameters(int read_block_size,
                               int write_block_size,
                               int64 segment_size,
                               int64 cache_size,
                               int blocks_per_segment,
                               int64 read_threshold,
                               int64 write_threshold,
                               int non_destructive) {
  if (read_block_size != -1) {
    // Blocks must be aligned to the disk's sector size.
    if (read_block_size % kSectorSize != 0) {
      logprintf(0, "Process Error: Block size must be a multiple of %d "
                "(thread %d).\n", kSectorSize, thread_num_);
      return false;
    }

    read_block_size_ = read_block_size;
  }

  if (write_block_size != -1) {
    // Write blocks must be aligned to the disk's sector size and to the
    // read block size.
    if (write_block_size % kSectorSize != 0) {
      logprintf(0, "Process Error: Write block size must be a multiple "
                "of %d (thread %d).\n", kSectorSize, thread_num_);
      return false;
    }
    if (write_block_size % read_block_size_ != 0) {
      logprintf(0, "Process Error: Write block size must be a multiple "
                "of the read block size, which is %d (thread %d).\n",
                read_block_size_, thread_num_);
      return false;
    }

    write_block_size_ = write_block_size;

  } else {
    // Make sure write_block_size_ is still valid.
    if (read_block_size_ > write_block_size_) {
      logprintf(5, "Log: Assuming write block size equal to read block size, "
                "which is %d (thread %d).\n", read_block_size_,
                thread_num_);
      write_block_size_ = read_block_size_;
    } else {
      if (write_block_size_ % read_block_size_ != 0) {
        logprintf(0, "Process Error: Write block size (defined as %d) must "
                  "be a multiple of the read block size, which is %d "
                  "(thread %d).\n", write_block_size_, read_block_size_,
                  thread_num_);
        return false;
      }
    }
  }

  if (cache_size != -1) {
    cache_size_ = cache_size;
  }

  if (blocks_per_segment != -1) {
    if (blocks_per_segment <= 0) {
      logprintf(0, "Process Error: Blocks per segment must be greater than "
                   "zero (thread %d).\n", thread_num_);
      return false;
    }

    blocks_per_segment_ = blocks_per_segment;
  }

  if (read_threshold != -1) {
    if (read_threshold <= 0) {
      logprintf(0, "Process Error: Read threshold must be greater than "
                   "zero (thread %d).\n", thread_num_);
      return false;
    }

    read_threshold_ = read_threshold;
  }

  if (write_threshold != -1) {
    if (write_threshold <= 0) {
      logprintf(0, "Process Error: Write threshold must be greater than "
                   "zero (thread %d).\n", thread_num_);
      return false;
    }

    write_threshold_ = write_threshold;
  }

  if (segment_size != -1) {
    // Segments must be aligned to the disk's sector size.
    if (segment_size % kSectorSize != 0) {
      logprintf(0, "Process Error: Segment size must be a multiple of %d"
                " (thread %d).\n", kSectorSize, thread_num_);
      return false;
    }

    segment_size_ = segment_size / kSectorSize;
  }

  non_destructive_ = non_destructive;

  // Having a queue of 150% of blocks that will fit in the disk's cache
  // should be enough to force out the oldest block before it is read,
  // ensuring the data comes from the disk and not the cache.
  queue_size_ = ((cache_size_ / write_block_size_) * 3) / 2;
  // Update the DiskBlockTable parameters.
  if (update_block_table_) {
    block_table_->SetParameters(kSectorSize, write_block_size_,
                                device_sectors_, segment_size_,
                                device_name_);
  }
  return true;
}
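// A hypothetical configuration, for illustration only: single-sector reads,
// 1 MiB writes, 64 MiB segments, a 16 MiB cache estimate, 32 blocks per
// segment, 100 ms thresholds, and destructive mode would be requested as
//   disk_thread.SetParameters(512, 1024 * 1024, 64 * 1024 * 1024,
//                             16 * 1024 * 1024, 32, 100000, 100000, 0);
// with -1 passed for any parameter that should keep its default.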

// Open a device, return false on failure.
bool DiskThread::OpenDevice(int *pfile) {
  int flags = O_RDWR | O_SYNC | O_LARGEFILE;
  int fd = open(device_name_.c_str(), flags | O_DIRECT, 0);
  if (O_DIRECT != 0 && fd < 0 && errno == EINVAL) {
    fd = open(device_name_.c_str(), flags, 0);  // Try without O_DIRECT.
    os_->ActivateFlushPageCache();
  }
  if (fd < 0) {
    logprintf(0, "Process Error: Failed to open device %s (thread %d)!!\n",
              device_name_.c_str(), thread_num_);
    return false;
  }
  *pfile = fd;

  return GetDiskSize(fd);
}

// Retrieves the size (in bytes) of the disk/file.
// Return false on failure.
bool DiskThread::GetDiskSize(int fd) {
  struct stat device_stat;
  if (fstat(fd, &device_stat) == -1) {
    logprintf(0, "Process Error: Unable to fstat disk %s (thread %d).\n",
              device_name_.c_str(), thread_num_);
    return false;
  }

  // For a block device, an ioctl is needed to get the size since the size
  // of the device file (i.e. /dev/sdb) is 0.
  if (S_ISBLK(device_stat.st_mode)) {
    uint64 block_size = 0;

    if (ioctl(fd, BLKGETSIZE64, &block_size) == -1) {
      logprintf(0, "Process Error: Unable to ioctl disk %s (thread %d).\n",
                device_name_.c_str(), thread_num_);
      return false;
    }

    // Zero size indicates a nonworking device.
    if (block_size == 0) {
      os_->ErrorReport(device_name_.c_str(), "device-size-zero", 1);
      ++errorcount_;
      status_ = true;  // Avoid a procedural error.
      return false;
    }

    device_sectors_ = block_size / kSectorSize;

  } else if (S_ISREG(device_stat.st_mode)) {
    device_sectors_ = device_stat.st_size / kSectorSize;

  } else {
    logprintf(0, "Process Error: %s is not a regular file or block "
              "device (thread %d).\n", device_name_.c_str(),
              thread_num_);
    return false;
  }

  logprintf(12, "Log: Device sectors: %lld on disk %s (thread %d).\n",
            device_sectors_, device_name_.c_str(), thread_num_);

  if (update_block_table_) {
    block_table_->SetParameters(kSectorSize, write_block_size_,
                                device_sectors_, segment_size_,
                                device_name_);
  }

  return true;
}

bool DiskThread::CloseDevice(int fd) {
  close(fd);
  return true;
}

// Return the time in microseconds.
int64 DiskThread::GetTime() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return tv.tv_sec * 1000000 + tv.tv_usec;
}

// Do randomized reads and (possibly) writes on a device.
// Return false on fatal SW error, true on SW success,
// regardless of whether HW failed.
bool DiskThread::DoWork(int fd) {
  int64 block_num = 0;
  int64 num_segments;

  if (segment_size_ == -1) {
    num_segments = 1;
  } else {
    num_segments = device_sectors_ / segment_size_;
    if (device_sectors_ % segment_size_ != 0)
      num_segments++;
  }

  // Disk size should be at least 3x cache size.  See comment later for
  // details.
  sat_assert(device_sectors_ * kSectorSize > 3 * cache_size_);

  // This disk test works by writing blocks with a certain pattern to
  // disk, then reading them back and verifying it against the pattern
  // at a later time.  A failure happens when either the block cannot
  // be written/read or when the read block is different than what was
  // written.  If a block takes too long to write/read, then a warning
  // is given instead of an error since taking too long is not
  // necessarily an error.
  //
  // To prevent the read blocks from coming from the disk cache,
  // enough blocks are written before read such that a block would
  // be ejected from the disk cache by the time it is read.
  //
  // TODO(amistry): Implement some sort of read/write throttling.  The
  //                flood of asynchronous I/O requests when a drive is
  //                unplugged is causing the application and kernel to
  //                become unresponsive.

  while (IsReadyToRun()) {
    // Write blocks to disk.
    logprintf(16, "Log: Write phase %sfor disk %s (thread %d).\n",
              non_destructive_ ? "(disabled) " : "",
              device_name_.c_str(), thread_num_);
    while (IsReadyToRunNoPause() &&
           in_flight_sectors_.size() <
               static_cast<size_t>(queue_size_ + 1)) {
      // Confine testing to a particular segment of the disk.
      int64 segment = (block_num / blocks_per_segment_) % num_segments;
      if (!non_destructive_ &&
          (block_num % blocks_per_segment_ == 0)) {
        logprintf(20, "Log: Starting to write segment %lld out of "
                  "%lld on disk %s (thread %d).\n",
                  segment, num_segments, device_name_.c_str(),
                  thread_num_);
      }
      block_num++;

      BlockData *block = block_table_->GetUnusedBlock(segment);

      // If an unused sequence of sectors could not be found, skip to the
      // next block to process.  Soon, a new segment will come and new
      // sectors will be able to be allocated.  This effectively puts a
      // minimum on the disk size of 3x the stated cache size, or 48MiB
      // if a cache size is not given (since the cache is set to 16MiB
      // by default).  Given that today's caches are in the low-MiB range
      // and drive sizes in the mid-GB range, this shouldn't pose a problem.
      // The 3x minimum comes from the following:
      //   1. In order to allocate 'y' blocks from a segment, the
      //      segment must contain at least 2y blocks or else an
      //      allocation may not succeed.
      //   2. Assume the entire disk is one segment.
      //   3. A full write phase consists of writing blocks corresponding to
      //      3/2 cache size.
      //   4. Therefore, the one segment must have 2 * 3/2 * cache
      //      size worth of blocks = 3 * cache size worth of blocks
      //      to complete.
      // In non-destructive mode, don't write anything to disk.
      if (!non_destructive_) {
        if (!WriteBlockToDisk(fd, block)) {
          block_table_->RemoveBlock(block);
          return true;
        }
        blocks_written_++;
      }

      // Block is either initialized by writing, or in the nondestructive
      // case, initialized by being added into the data structure for later
      // reading.
      block->initialized();

      in_flight_sectors_.push(block);
    }
    if (!os_->FlushPageCache())  // If O_DIRECT worked, this will be a NOP.
      return false;

    // Verify blocks on disk.
    logprintf(20, "Log: Read phase for disk %s (thread %d).\n",
              device_name_.c_str(), thread_num_);
    while (IsReadyToRunNoPause() && !in_flight_sectors_.empty()) {
      BlockData *block = in_flight_sectors_.front();
      in_flight_sectors_.pop();
      if (!ValidateBlockOnDisk(fd, block))
        return true;
      block_table_->RemoveBlock(block);
      blocks_read_++;
    }
  }

  pages_copied_ = blocks_written_ + blocks_read_;
  return true;
}

// Do an asynchronous disk I/O operation.
// Return false if the IO is not set up.
bool DiskThread::AsyncDiskIO(IoOp op, int fd, void *buf, int64 size,
                             int64 offset, int64 timeout) {
#ifdef HAVE_LIBAIO_H
  // Use the Linux native asynchronous I/O interface for reading/writing.
  // A read/write consists of three basic steps:
  //    1. create an io context.
  //    2. prepare and submit an io request to the context.
  //    3. wait for an event on the context.

  struct {
    const int opcode;
    const char *op_str;
    const char *error_str;
  } operations[2] = {
    { IO_CMD_PREAD, "read", "disk-read-error" },
    { IO_CMD_PWRITE, "write", "disk-write-error" }
  };

  struct iocb cb;
  memset(&cb, 0, sizeof(cb));

  cb.aio_fildes = fd;
  cb.aio_lio_opcode = operations[op].opcode;
  cb.u.c.buf = buf;
  cb.u.c.nbytes = size;
  cb.u.c.offset = offset;

  struct iocb *cbs[] = { &cb };
  if (io_submit(aio_ctx_, 1, cbs) != 1) {
    int error = errno;
    char buf[256];
    sat_strerror(error, buf, sizeof(buf));
    logprintf(0, "Process Error: Unable to submit async %s "
                 "on disk %s (thread %d). Error %d, %s\n",
              operations[op].op_str, device_name_.c_str(),
              thread_num_, error, buf);
    return false;
  }

  struct io_event event;
  memset(&event, 0, sizeof(event));
  struct timespec tv;
  tv.tv_sec = timeout / 1000000;
  tv.tv_nsec = (timeout % 1000000) * 1000;
  if (io_getevents(aio_ctx_, 1, 1, &event, &tv) != 1) {
    // A ctrl-c from the keyboard will cause io_getevents to fail with an
    // EINTR error code.  This is not an error and so don't treat it as such,
    // but still log it.
    int error = errno;
    if (error == EINTR) {
      logprintf(5, "Log: %s interrupted on disk %s (thread %d).\n",
                operations[op].op_str, device_name_.c_str(),
                thread_num_);
    } else {
      os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);
      errorcount_ += 1;
      logprintf(0, "Hardware Error: Timeout doing async %s to sectors "
                   "starting at %lld on disk %s (thread %d).\n",
                operations[op].op_str, offset / kSectorSize,
                device_name_.c_str(), thread_num_);
    }

    // Don't bother checking return codes since io_cancel seems to always fail.
    // Since io_cancel is always failing, destroying and recreating an I/O
    // context is a workaround for canceling an in-progress I/O operation.
    // TODO(amistry): Find out why io_cancel isn't working and make it work.
    io_cancel(aio_ctx_, &cb, &event);
    io_destroy(aio_ctx_);
    aio_ctx_ = 0;
    if (io_setup(5, &aio_ctx_)) {
      int error = errno;
      char buf[256];
      sat_strerror(error, buf, sizeof(buf));
      logprintf(0, "Process Error: Unable to create aio context on disk %s"
                " (thread %d) Error %d, %s\n",
                device_name_.c_str(), thread_num_, error, buf);
    }

    return false;
  }

  // event.res contains the number of bytes written/read, or the negated
  // error code if the operation failed.
  if (event.res != static_cast<uint64>(size)) {
    errorcount_++;
    os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);

    int64 result = static_cast<int64>(event.res);
    if (result < 0) {
      switch (result) {
        case -EIO:
          logprintf(0, "Hardware Error: Low-level I/O error while doing %s to "
                       "sectors starting at %lld on disk %s (thread %d).\n",
                    operations[op].op_str, offset / kSectorSize,
                    device_name_.c_str(), thread_num_);
          break;
        default:
          logprintf(0, "Hardware Error: Unknown error while doing %s to "
                       "sectors starting at %lld on disk %s (thread %d).\n",
                    operations[op].op_str, offset / kSectorSize,
                    device_name_.c_str(), thread_num_);
      }
    } else {
      logprintf(0, "Hardware Error: Unable to %s to sectors starting at "
                   "%lld on disk %s (thread %d).\n",
                operations[op].op_str, offset / kSectorSize,
                device_name_.c_str(), thread_num_);
    }
    return false;
  }

  return true;
#else  // !HAVE_LIBAIO_H
  return false;
#endif
}
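// Note: aio_ctx_ is assumed to have been initialized with io_setup() before
// the first AsyncDiskIO() call (the constructor only zeroes it); the error
// path above re-creates the context the same way after a timeout.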
3040 
3041 // Write a block to disk.
3042 // Return false if the block is not written.
WriteBlockToDisk(int fd,BlockData * block)3043 bool DiskThread::WriteBlockToDisk(int fd, BlockData *block) {
3044   memset(block_buffer_, 0, block->size());
3045 
3046   // Fill block buffer with a pattern
3047   struct page_entry pe;
3048   if (!sat_->GetValid(&pe)) {
3049     // Even though a valid page could not be obatined, it is not an error
3050     // since we can always fill in a pattern directly, albeit slower.
    unsigned int *memblock = static_cast<unsigned int *>(block_buffer_);
    block->set_pattern(patternlist_->GetRandomPattern());

    logprintf(11, "Log: Warning, using pattern fill fallback in "
                  "DiskThread::WriteBlockToDisk on disk %s (thread %d).\n",
              device_name_.c_str(), thread_num_);

    for (unsigned int i = 0; i < block->size()/wordsize_; i++) {
      memblock[i] = block->pattern()->pattern(i);
    }
  } else {
    memcpy(block_buffer_, pe.addr, block->size());
    block->set_pattern(pe.pattern);
    sat_->PutValid(&pe);
  }

  logprintf(12, "Log: Writing %lld sectors starting at %lld on disk %s"
            " (thread %d).\n",
            block->size()/kSectorSize, block->address(),
            device_name_.c_str(), thread_num_);

  int64 start_time = GetTime();

  if (!AsyncDiskIO(ASYNC_IO_WRITE, fd, block_buffer_, block->size(),
                   block->address() * kSectorSize, write_timeout_)) {
    return false;
  }

  int64 end_time = GetTime();
  logprintf(12, "Log: Writing time: %lld us (thread %d).\n",
            end_time - start_time, thread_num_);
  if (end_time - start_time > write_threshold_) {
    logprintf(5, "Log: Write took %lld us which is longer than threshold "
                 "%lld us on disk %s (thread %d).\n",
              end_time - start_time, write_threshold_, device_name_.c_str(),
              thread_num_);
  }

  return true;
}
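
// A standalone sketch of the pattern-fill fallback above, which writes one
// 32-bit pattern word per buffer word. It assumes only the
// Pattern::pattern(i) accessor already used in WriteBlockToDisk.
static void FillBufferWithPattern(void *buffer, int64 bytes, int wordsize,
                                  Pattern *pattern) {
  unsigned int *words = static_cast<unsigned int *>(buffer);
  for (int64 i = 0; i < bytes / wordsize; i++)
    words[i] = pattern->pattern(i);
}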

// Verify a block on disk.
// Return true if the block was read. Increments errorcount_ on data
// errors and logs reads that exceed the latency threshold.
bool DiskThread::ValidateBlockOnDisk(int fd, BlockData *block) {
  int64 blocks = block->size() / read_block_size_;
  int64 bytes_read = 0;
  int64 current_blocks;
  int64 current_bytes;
  uint64 address = block->address();

  logprintf(20, "Log: Reading sectors starting at %lld on disk %s "
            "(thread %d).\n",
            address, device_name_.c_str(), thread_num_);

  // Read block from disk and time the read.  If it takes longer than the
  // threshold, complain.
  if (lseek64(fd, address * kSectorSize, SEEK_SET) == -1) {
    logprintf(0, "Process Error: Unable to seek to sector %lld in "
              "DiskThread::ValidateBlockOnDisk on disk %s "
              "(thread %d).\n", address, device_name_.c_str(), thread_num_);
    return false;
  }
  int64 start_time = GetTime();

  // Split a large write-sized block into small read-sized blocks and
  // read them in groups of randomly-sized multiples of the read block
  // size. This ensures that all data written to disk by this block is
  // tested using a random read pattern.
  while (blocks != 0) {
    // Test all read blocks in a written block.
    current_blocks = (random() % blocks) + 1;
    current_bytes = current_blocks * read_block_size_;

    memset(block_buffer_, 0, current_bytes);

    logprintf(20, "Log: Reading %lld sectors starting at sector %lld on "
              "disk %s (thread %d)\n",
              current_bytes / kSectorSize,
              (address * kSectorSize + bytes_read) / kSectorSize,
              device_name_.c_str(), thread_num_);

    if (!AsyncDiskIO(ASYNC_IO_READ, fd, block_buffer_, current_bytes,
                     address * kSectorSize + bytes_read,
                     write_timeout_)) {
      return false;
    }

    int64 end_time = GetTime();
    logprintf(20, "Log: Reading time: %lld us (thread %d).\n",
              end_time - start_time, thread_num_);
    if (end_time - start_time > read_threshold_) {
      logprintf(5, "Log: Read took %lld us which is longer than threshold "
                "%lld us on disk %s (thread %d).\n",
                end_time - start_time, read_threshold_,
                device_name_.c_str(), thread_num_);
    }

    // In non-destructive mode, don't compare the block to the pattern since
    // the block was never written to disk in the first place.
    if (!non_destructive_) {
      if (CheckRegion(block_buffer_, block->pattern(), current_bytes,
                      0, bytes_read)) {
        os_->ErrorReport(device_name_.c_str(), "disk-pattern-error", 1);
        errorcount_ += 1;
        logprintf(0, "Hardware Error: Pattern mismatch in block starting at "
                  "sector %lld in DiskThread::ValidateBlockOnDisk on "
                  "disk %s (thread %d).\n",
                  address, device_name_.c_str(), thread_num_);
      }
    }

    bytes_read += current_blocks * read_block_size_;
    blocks -= current_blocks;
  }

  return true;
}
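
// A standalone model of the random read-splitting loop above, showing why
// it terminates and why the union of the reads is exactly the written
// block: each pass consumes between 1 and `remaining` read-sized blocks.
// The function name is illustrative only.
static int64 CountRandomChunks(int64 remaining) {
  int64 passes = 0;
  while (remaining != 0) {
    int64 chunk = (random() % remaining) + 1;  // 1 <= chunk <= remaining.
    remaining -= chunk;  // Strictly decreases, so the loop terminates.
    passes++;
  }
  return passes;
}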

// Direct device access thread.
// Return false on software error.
bool DiskThread::Work() {
  int fd;

  logprintf(9, "Log: Starting disk thread %d, disk %s\n",
            thread_num_, device_name_.c_str());

  srandom(time(NULL));

  if (!OpenDevice(&fd)) {
    status_ = false;
    return false;
  }

  // Allocate a block buffer aligned to 512 bytes since the kernel requires it
  // when using direct IO.
#ifdef HAVE_POSIX_MEMALIGN
  int memalign_result = posix_memalign(&block_buffer_, kBufferAlignment,
                                       sat_->page_length());
#else
  block_buffer_ = memalign(kBufferAlignment, sat_->page_length());
  int memalign_result = (block_buffer_ == 0);
#endif
  if (memalign_result) {
    CloseDevice(fd);
    logprintf(0, "Process Error: Unable to allocate memory for buffers "
                 "for disk %s (thread %d) posix memalign returned %d.\n",
              device_name_.c_str(), thread_num_, memalign_result);
    status_ = false;
    return false;
  }

#ifdef HAVE_LIBAIO_H
  if (io_setup(5, &aio_ctx_)) {
    CloseDevice(fd);
    logprintf(0, "Process Error: Unable to create aio context for disk %s"
              " (thread %d).\n",
              device_name_.c_str(), thread_num_);
    status_ = false;
    return false;
  }
#endif

  bool result = DoWork(fd);

  status_ = result;

#ifdef HAVE_LIBAIO_H
  io_destroy(aio_ctx_);
#endif
  CloseDevice(fd);

  logprintf(9, "Log: Completed %d (disk %s): disk thread status %d, "
               "%d pages copied\n",
            thread_num_, device_name_.c_str(), status_, pages_copied_);
  return result;
}

RandomDiskThread::RandomDiskThread(DiskBlockTable *block_table)
    : DiskThread(block_table) {
  update_block_table_ = 0;
}

RandomDiskThread::~RandomDiskThread() {
}

// Workload for random disk thread.
bool RandomDiskThread::DoWork(int fd) {
  logprintf(11, "Log: Random phase for disk %s (thread %d).\n",
            device_name_.c_str(), thread_num_);
  while (IsReadyToRun()) {
    BlockData *block = block_table_->GetRandomBlock();
    if (block == NULL) {
      logprintf(12, "Log: No block available for device %s (thread %d).\n",
                device_name_.c_str(), thread_num_);
    } else {
      ValidateBlockOnDisk(fd, block);
      block_table_->ReleaseBlock(block);
      blocks_read_++;
    }
  }
  pages_copied_ = blocks_read_;
  return true;
}

MemoryRegionThread::MemoryRegionThread() {
  error_injection_ = false;
  pages_ = NULL;
}

MemoryRegionThread::~MemoryRegionThread() {
  if (pages_ != NULL)
    delete pages_;
}

// Set a region of memory or MMIO to be tested.
// Return false if the region size is not a multiple of the SAT page size.
bool MemoryRegionThread::SetRegion(void *region, int64 size) {
  int plength = sat_->page_length();
  int npages = size / plength;
  if (size % plength) {
    logprintf(0, "Process Error: region size is not a multiple of SAT "
              "page length\n");
    return false;
  } else {
    if (pages_ != NULL)
      delete pages_;
    pages_ = new PageEntryQueue(npages);
    char *base_addr = reinterpret_cast<char*>(region);
    region_ = base_addr;
    for (int i = 0; i < npages; i++) {
      struct page_entry pe;
      init_pe(&pe);
      pe.addr = reinterpret_cast<void*>(base_addr + i * plength);
      pe.offset = i * plength;

      pages_->Push(&pe);
    }
    return true;
  }
}
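
// Note: after SetRegion() succeeds, pages_ holds one page_entry per
// SAT-page-sized slice of the region; pe.addr points into the region and
// pe.offset records the slice's byte offset from region_. Work() below
// cycles these entries through PopRandom()/Push() so every slice is
// repeatedly rewritten and rechecked.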

// More detailed error printout for hardware errors in memory or MMIO
// regions.
void MemoryRegionThread::ProcessError(struct ErrorRecord *error,
                                      int priority,
                                      const char *message) {
  int64 buffer_offset;  // 64-bit to match the %llx conversion below.
  if (phase_ == kPhaseCopy) {
    // If the error occurred during the copy phase, the source data
    // (i.e., main memory) is wrong, so just pass it to the base
    // ProcessError to report a bad-dimm error.
    WorkerThread::ProcessError(error, priority, message);
  } else if (phase_ == kPhaseCheck) {
    // An error during the check phase means the memory region under test
    // is bad. Gather more information, then report the error.
    // Determine if this is a write or read error.
    os_->Flush(error->vaddr);
    error->reread = *(error->vaddr);
    char *good = reinterpret_cast<char*>(&(error->expected));
    char *bad = reinterpret_cast<char*>(&(error->actual));
    sat_assert(error->expected != error->actual);
    unsigned int offset = 0;
    for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
      if (good[offset] != bad[offset])
        break;
    }

    error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;

    buffer_offset = error->vbyteaddr - region_;

    // Find physical address if possible.
    error->paddr = os_->VirtualToPhysical(error->vbyteaddr);
    logprintf(priority,
              "%s: miscompare on %s, CRC check at %p(0x%llx), "
              "offset %llx: read:0x%016llx, reread:0x%016llx "
              "expected:0x%016llx\n",
              message,
              identifier_.c_str(),
              error->vaddr,
              error->paddr,
              buffer_offset,
              error->actual,
              error->reread,
              error->expected);
  } else {
    logprintf(0, "Process Error: memory region thread raised an "
              "unexpected error.\n");
  }
}
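
// Note on the scan above: the loop compares only bytes 0..6 of the 8-byte
// values. If all of them match, `offset` falls through to 7, which must
// be the mismatching byte because expected != actual is asserted, so the
// last byte never needs an explicit compare.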

// Workload for testing memory or MMIO regions.
// Return false on software error.
bool MemoryRegionThread::Work() {
  struct page_entry source_pe;
  struct page_entry memregion_pe;
  bool result = true;
  int64 loops = 0;
  const uint64 error_constant = 0x00ba00000000ba00LL;

  // For error injection.
  int64 *addr = 0x0;
  int offset = 0;
  int64 data = 0;

  logprintf(9, "Log: Starting Memory Region thread %d\n", thread_num_);

  while (IsReadyToRun()) {
    // Getting pages from SAT and queue.
    phase_ = kPhaseNoPhase;
    result = result && sat_->GetValid(&source_pe);
    if (!result) {
      logprintf(0, "Process Error: memory region thread failed to pop "
                "pages from SAT, bailing\n");
      break;
    }

    result = result && pages_->PopRandom(&memregion_pe);
    if (!result) {
      logprintf(0, "Process Error: memory region thread failed to pop "
                "pages from queue, bailing\n");
      break;
    }

    // Error injection for CRC copy.
    if ((sat_->error_injection() || error_injection_) && loops == 1) {
      addr = reinterpret_cast<int64*>(source_pe.addr);
      offset = random() % (sat_->page_length() / wordsize_);
      data = addr[offset];
      addr[offset] = error_constant;
    }

    // Copying SAT page into memory region.
    phase_ = kPhaseCopy;
    CrcCopyPage(&memregion_pe, &source_pe);
    memregion_pe.pattern = source_pe.pattern;

    // Error injection for CRC Check.
    if ((sat_->error_injection() || error_injection_) && loops == 2) {
      addr = reinterpret_cast<int64*>(memregion_pe.addr);
      offset = random() % (sat_->page_length() / wordsize_);
      data = addr[offset];
      addr[offset] = error_constant;
    }

    // Checking page content in memory region.
    phase_ = kPhaseCheck;
    CrcCheckPage(&memregion_pe);

    phase_ = kPhaseNoPhase;
    // Storing pages on their proper queues.
    result = result && sat_->PutValid(&source_pe);
    if (!result) {
      logprintf(0, "Process Error: memory region thread failed to push "
                "pages into SAT, bailing\n");
      break;
    }
    result = result && pages_->Push(&memregion_pe);
    if (!result) {
      logprintf(0, "Process Error: memory region thread failed to push "
                "pages into queue, bailing\n");
      break;
    }

    if ((sat_->error_injection() || error_injection_) &&
        loops >= 1 && loops <= 2) {
      addr[offset] = data;
    }

    loops++;
    YieldSelf();
  }

  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Memory Region thread. Status %d, %d "
            "pages checked\n", thread_num_, status_, pages_copied_);
  return result;
}
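
// Note on the injection schedule above: with error injection enabled,
// pass 1 corrupts a word in the SAT source page, which the kPhaseCopy
// CrcCopyPage() should flag as a memory (bad-dimm) error; pass 2 corrupts
// a word in the region page after the copy, which the kPhaseCheck
// CrcCheckPage() should flag against the region. In both passes the saved
// word is restored before the pages are requeued.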

// The list of MSRs to read from each cpu.
const CpuFreqThread::CpuRegisterType CpuFreqThread::kCpuRegisters[] = {
  { kMsrTscAddr, "TSC" },
  { kMsrAperfAddr, "APERF" },
  { kMsrMperfAddr, "MPERF" },
};

CpuFreqThread::CpuFreqThread(int num_cpus, int freq_threshold, int round)
  : num_cpus_(num_cpus),
    freq_threshold_(freq_threshold),
    round_(round) {
  sat_assert(round >= 0);
  if (round == 0) {
    // If rounding is off, force rounding to the nearest MHz.
    round_ = 1;
    round_value_ = 0.5;
  } else {
    round_value_ = round/2.0;
  }
}
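
// A worked example of the rounding setup above: with round == 10,
// round_value_ is 5.0, so a measured 1996.4 MHz becomes
// static_cast<int>(1996.4 + 5.0) == 2001, and 2001 - (2001 % 10) == 2000,
// i.e. the frequency is rounded to the nearest 10 MHz. With round == 0,
// round_ is forced to 1 and round_value_ to 0.5, rounding to the
// nearest MHz.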

CpuFreqThread::~CpuFreqThread() {
}

// Compute the difference between the currently read MSR values and the
// previously read values and store the results in delta. If any of the
// values did not increase, or the TSC delta is too small, returns false.
// Otherwise, returns true.
bool CpuFreqThread::ComputeDelta(CpuDataType *current, CpuDataType *previous,
                                 CpuDataType *delta) {
  // Loop through the msrs.
  for (int msr = 0; msr < kMsrLast; msr++) {
    if (previous->msrs[msr] > current->msrs[msr]) {
      // Pass the register's name string (not the struct) to %s.
      logprintf(0, "Log: Register %s went backwards 0x%llx to 0x%llx, "
                "skipping interval\n", kCpuRegisters[msr].name,
                previous->msrs[msr], current->msrs[msr]);
      return false;
    } else {
      delta->msrs[msr] = current->msrs[msr] - previous->msrs[msr];
    }
  }

  // Check for TSC < 1 Mcycles over interval.
  if (delta->msrs[kMsrTsc] < (1000 * 1000)) {
    logprintf(0, "Log: Insanely slow TSC rate, TSC stops in idle?\n");
    return false;
  }
  timersub(&current->tv, &previous->tv, &delta->tv);

  return true;
}

// Compute the change in values of the MSRs between current and previous,
// and set the frequency in MHz of the cpu. If there is an error computing
// the delta, return false. Otherwise, return true.
bool CpuFreqThread::ComputeFrequency(CpuDataType *current,
                                     CpuDataType *previous, int *freq) {
  CpuDataType delta;
  if (!ComputeDelta(current, previous, &delta)) {
    return false;
  }

  double interval = delta.tv.tv_sec + delta.tv.tv_usec / 1000000.0;
  double frequency = 1.0 * delta.msrs[kMsrTsc] / 1000000
                     * delta.msrs[kMsrAperf] / delta.msrs[kMsrMperf] / interval;

  // Use the rounding value to round to the nearest multiple of round_.
  int computed = static_cast<int>(frequency + round_value_);
  *freq = computed - (computed % round_);
  return true;
}
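
// A worked example of the formula above: if the TSC advanced by 2e9
// cycles over a 1.0 s interval (a 2000 MHz reference clock) and
// APERF/MPERF is 0.5 (the cpu ran at half its reference clock, e.g.
// throttled), then frequency = 2000 * 0.5 / 1.0 = 1000 MHz.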

// This is the task function that the thread executes.
bool CpuFreqThread::Work() {
  cpu_set_t cpuset;
  if (!AvailableCpus(&cpuset)) {
    logprintf(0, "Process Error: Cannot get information about the cpus.\n");
    return false;
  }

  // Start off indicating the test is passing.
  status_ = true;

  int curr = 0;
  int prev = 1;
  uint32 num_intervals = 0;
  bool paused = false;
  bool valid;
  bool pass = true;

  vector<CpuDataType> data[2];
  data[0].resize(num_cpus_);
  data[1].resize(num_cpus_);
  while (IsReadyToRun(&paused)) {
    if (paused) {
      // Reset the intervals and restart logic after the pause.
      num_intervals = 0;
    }
    if (num_intervals == 0) {
      // If this is the first interval, then always wait a bit before
      // starting to collect data.
      sat_sleep(kStartupDelay);
    }

    // Get the per cpu counters.
    valid = true;
    for (int cpu = 0; cpu < num_cpus_; cpu++) {
      if (CPU_ISSET(cpu, &cpuset)) {
        if (!GetMsrs(cpu, &data[curr][cpu])) {
          logprintf(0, "Log: Failed to get MSRs on cpu %d.\n", cpu);
          valid = false;
          break;
        }
      }
    }
    if (!valid) {
      // Reset the number of collected intervals since something bad happened.
      num_intervals = 0;
      continue;
    }

    num_intervals++;

    // Compute deltas only after more than two samples have been collected,
    // so the first, possibly distorted, interval is skipped.
    if (num_intervals > 2) {
      for (int cpu = 0; cpu < num_cpus_; cpu++) {
        if (CPU_ISSET(cpu, &cpuset)) {
          int freq;
          if (!ComputeFrequency(&data[curr][cpu], &data[prev][cpu],
                                &freq)) {
            // Reset the number of collected intervals since an unknown
            // error occurred.
            logprintf(0, "Log: Cannot get frequency of cpu %d.\n", cpu);
            num_intervals = 0;
            break;
          }
          logprintf(15, "Log: Cpu %d Freq %d MHz\n", cpu, freq);
          if (freq < freq_threshold_) {
            errorcount_++;
            pass = false;
            logprintf(0, "Log: Cpu %d frequency is too low, frequency %d MHz "
                      "threshold %d MHz.\n", cpu, freq, freq_threshold_);
          }
        }
      }
    }

    sat_sleep(kIntervalPause);

    // Swap the values in curr and prev (these values flip between 0 and 1).
    curr ^= 1;
    prev ^= 1;
  }

  return pass;
}
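
// Note: Work() keeps two sample vectors and flips curr/prev with XOR
// rather than copying per-cpu data each interval; data[prev] always holds
// the previous interval's MSR snapshot, so each delta costs no extra
// copies. Any read failure or backwards counter resets num_intervals,
// restarting the warm-up logic.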


// Get the MSR values for this particular cpu and save them in data. If
// any error is encountered, returns false. Otherwise, returns true.
bool CpuFreqThread::GetMsrs(int cpu, CpuDataType *data) {
  for (int msr = 0; msr < kMsrLast; msr++) {
    if (!os_->ReadMSR(cpu, kCpuRegisters[msr].msr, &data->msrs[msr])) {
      return false;
    }
  }
  // Save the time at which we acquired these values.
  gettimeofday(&data->tv, NULL);

  return true;
}

// Returns true if this test can run on the current machine. Otherwise,
// returns false.
bool CpuFreqThread::CanRun() {
#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
  unsigned int eax, ebx, ecx, edx;

  // Check that the TSC and MSR features are supported: TSC is
  // CPUID.EAX=1: EDX bit 4, RDMSR support is EDX bit 5. (The original
  // test checked only bit 5 while reporting on TSC.)
  // This check is valid for both Intel and AMD.
  eax = 1;
  cpuid(&eax, &ebx, &ecx, &edx);
  if ((edx & (1 << 4)) == 0 || (edx & (1 << 5)) == 0) {
    logprintf(0, "Process Error: No TSC or MSR support.\n");
    return false;
  }

  // Check the highest extended function level supported.
  // This check is valid for both Intel and AMD.
  eax = 0x80000000;
  cpuid(&eax, &ebx, &ecx, &edx);
  if (eax < 0x80000007) {
    logprintf(0, "Process Error: No invariant TSC support.\n");
    return false;
  }

  // Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX bit 8.
  // This check is valid for both Intel and AMD.
  eax = 0x80000007;
  cpuid(&eax, &ebx, &ecx, &edx);
  if ((edx & (1 << 8)) == 0) {
    logprintf(0, "Process Error: No non-stop TSC support.\n");
    return false;
  }

  // APERF/MPERF is advertised by CPUID.EAX=0x6: ECX bit 0.
  // This check is valid for both Intel and AMD.
  eax = 0x6;
  cpuid(&eax, &ebx, &ecx, &edx);
  if ((ecx & 1) == 0) {
    logprintf(0, "Process Error: No APERF MSR support.\n");
    return false;
  }
  return true;
#else
  logprintf(0, "Process Error: "
               "cpu_freq_test is only supported on X86 processors.\n");
  return false;
#endif
}
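
// Summary of the CPUID features probed by CanRun() above:
//   TSC            CPUID.EAX=1:          EDX bit 4
//   MSR (RDMSR)    CPUID.EAX=1:          EDX bit 5
//   Invariant TSC  CPUID.EAX=0x80000007: EDX bit 8
//   APERF/MPERF    CPUID.EAX=6:          ECX bit 0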