// Copyright 2006 Google Inc. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//   http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// worker.cc : individual tasks that can be run in combination to
// stress the system

#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <sys/select.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/times.h>

// These are necessary, but on by default
// #define __USE_GNU
// #define __USE_LARGEFILE64
#include <fcntl.h>
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <linux/unistd.h>  // for gettid

// For size of block device
#include <sys/ioctl.h>
#include <linux/fs.h>
// For asynchronous I/O
#ifdef HAVE_LIBAIO_H
#include <libaio.h>
#endif

#include <sys/syscall.h>

#include <set>
#include <string>

// This file must work with autoconf on its public version,
// so these includes are correct.
#include "error_diag.h"  // NOLINT
#include "os.h"          // NOLINT
#include "pattern.h"     // NOLINT
#include "queue.h"       // NOLINT
#include "sat.h"         // NOLINT
#include "sattypes.h"    // NOLINT
#include "worker.h"      // NOLINT

// Syscalls
// Why, ubuntu, do you hate gettid so badly?
#if !defined(__NR_gettid)
#define __NR_gettid 224
#endif
#define gettid() syscall(__NR_gettid)

#if !defined(CPU_SETSIZE)
_syscall3(int, sched_getaffinity, pid_t, pid,
          unsigned int, len, cpu_set_t*, mask)
_syscall3(int, sched_setaffinity, pid_t, pid,
          unsigned int, len, cpu_set_t*, mask)
#endif

namespace {
  // Get HW core ID from cpuid instruction.
  inline int apicid(void) {
    int cpu;
#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
    __asm __volatile("cpuid" : "=b" (cpu) : "a" (1) : "cx", "dx");
#elif defined(STRESSAPPTEST_CPU_ARMV7A)
#warning "Unsupported CPU type ARMV7A: unable to determine core ID."
    cpu = 0;
#else
#warning "Unsupported CPU type: unable to determine core ID."
    cpu = 0;
#endif
    return (cpu >> 24);
  }

  // Work around the sad fact that there are two (gnu, xsi) incompatible
  // versions of strerror_r floating around google. Awesome.
  bool sat_strerror(int err, char *buf, int len) {
    buf[0] = 0;
    char *errmsg = reinterpret_cast<char*>(strerror_r(err, buf, len));
    int retval = reinterpret_cast<int64>(errmsg);
    if (retval == 0)
      return true;
    if (retval == -1)
      return false;
    if (errmsg != buf) {
      strncpy(buf, errmsg, len);
      buf[len - 1] = 0;
    }
    return true;
  }

  inline uint64 addr_to_tag(void *address) {
    return reinterpret_cast<uint64>(address);
  }
}  // namespace

#if !defined(O_DIRECT)
// Sometimes this isn't available.
// Disregard if it's not defined.
#define O_DIRECT 0
#endif

// A struct to hold captured errors, for later reporting.
struct ErrorRecord {
  uint64 actual;     // This is the actual value read.
  uint64 reread;     // This is the actual value, reread.
  uint64 expected;   // This is what it should have been.
  uint64 *vaddr;     // This is where it was (or wasn't).
  char *vbyteaddr;   // This is byte specific where the data was (or wasn't).
  uint64 paddr;      // This is the bus address, if available.
  uint64 *tagvaddr;  // This holds the tag value if this data was tagged.
  uint64 tagpaddr;   // This holds the physical address corresponding
                     // to the tag.
};

// This is a helper function to create new threads with pthreads.
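// pthread_create() can only invoke a plain C function, so this trampoline
// bridges from the void* start argument back to the C++ WorkerThread object
// and runs its StartRoutine() method.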
static void *ThreadSpawnerGeneric(void *ptr) {
  WorkerThread *worker = static_cast<WorkerThread*>(ptr);
  worker->StartRoutine();
  return NULL;
}

void WorkerStatus::Initialize() {
  sat_assert(0 == pthread_mutex_init(&num_workers_mutex_, NULL));
  sat_assert(0 == pthread_rwlock_init(&status_rwlock_, NULL));
#ifdef _POSIX_BARRIERS
  sat_assert(0 == pthread_barrier_init(&pause_barrier_, NULL,
                                       num_workers_ + 1));
#endif
}

void WorkerStatus::Destroy() {
  sat_assert(0 == pthread_mutex_destroy(&num_workers_mutex_));
  sat_assert(0 == pthread_rwlock_destroy(&status_rwlock_));
#ifdef _POSIX_BARRIERS
  sat_assert(0 == pthread_barrier_destroy(&pause_barrier_));
#endif
}

void WorkerStatus::PauseWorkers() {
  if (SetStatus(PAUSE) != PAUSE)
    WaitOnPauseBarrier();
}

void WorkerStatus::ResumeWorkers() {
  if (SetStatus(RUN) == PAUSE)
    WaitOnPauseBarrier();
}

void WorkerStatus::StopWorkers() {
  if (SetStatus(STOP) == PAUSE)
    WaitOnPauseBarrier();
}

bool WorkerStatus::ContinueRunning() {
  // This loop is an optimization. We use it to immediately re-check the
  // status after resuming from a pause, instead of returning and waiting for
  // the next call to this function.
  for (;;) {
    switch (GetStatus()) {
      case RUN:
        return true;
      case PAUSE:
        // Wait for the other workers to call this function so that
        // PauseWorkers() can return.
        WaitOnPauseBarrier();
        // Wait for ResumeWorkers() to be called.
        WaitOnPauseBarrier();
        break;
      case STOP:
        return false;
    }
  }
}

bool WorkerStatus::ContinueRunningNoPause() {
  return (GetStatus() != STOP);
}

void WorkerStatus::RemoveSelf() {
  // Acquire a read lock on status_rwlock_ while (status_ != PAUSE).
  for (;;) {
    AcquireStatusReadLock();
    if (status_ != PAUSE)
      break;
    // We need to obey PauseWorkers() just like ContinueRunning() would, so
    // that the other threads won't wait on pause_barrier_ forever.
    ReleaseStatusLock();
    // Wait for the other workers to call this function so that
    // PauseWorkers() can return.
    WaitOnPauseBarrier();
    // Wait for ResumeWorkers() to be called.
    WaitOnPauseBarrier();
  }

  // This lock would be unnecessary if we held a write lock instead of a read
  // lock on status_rwlock_, but that would also force all threads calling
  // ContinueRunning() to wait on this one. Using a separate lock avoids that.
  AcquireNumWorkersLock();
  // Decrement num_workers_ and reinitialize pause_barrier_, which we know
  // isn't in use because (status != PAUSE).
#ifdef _POSIX_BARRIERS
  sat_assert(0 == pthread_barrier_destroy(&pause_barrier_));
  sat_assert(0 == pthread_barrier_init(&pause_barrier_, NULL, num_workers_));
#endif
  --num_workers_;
  ReleaseNumWorkersLock();

  // Release status_rwlock_.
  ReleaseStatusLock();
}

// Parent thread class.
WorkerThread::WorkerThread() {
  status_ = false;
  pages_copied_ = 0;
  errorcount_ = 0;
  runduration_usec_ = 1;
  priority_ = Normal;
  worker_status_ = NULL;
  thread_spawner_ = &ThreadSpawnerGeneric;
  tag_mode_ = false;
}

WorkerThread::~WorkerThread() {}

// Constructors. Just init some default values.
FillThread::FillThread() {
  num_pages_to_fill_ = 0;
}

// Initialize file name to empty.
FileThread::FileThread() {
  filename_ = "";
  devicename_ = "";
  pass_ = 0;
  page_io_ = true;
  crc_page_ = -1;
  local_page_ = NULL;
}

// If the file thread used a bounce buffer in memory, account for the extra
// copy in the memory bandwidth calculation.
float FileThread::GetMemoryCopiedData() {
  if (!os_->normal_mem())
    return GetCopiedData();
  else
    return 0;
}

// Initialize target hostname to be invalid.
NetworkThread::NetworkThread() {
  snprintf(ipaddr_, sizeof(ipaddr_), "Unknown");
  sock_ = 0;
}

// Initialize?
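// Nothing to initialize here: the reflector socket is supplied later via
// SetSock(), and the shared members are set up by InitThread().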
NetworkSlaveThread::NetworkSlaveThread() {
}

// Initialize?
NetworkListenThread::NetworkListenThread() {
}

// Init member variables.
void WorkerThread::InitThread(int thread_num_init,
                              class Sat *sat_init,
                              class OsLayer *os_init,
                              class PatternList *patternlist_init,
                              WorkerStatus *worker_status) {
  sat_assert(worker_status);
  worker_status->AddWorkers(1);

  thread_num_ = thread_num_init;
  sat_ = sat_init;
  os_ = os_init;
  patternlist_ = patternlist_init;
  worker_status_ = worker_status;

  AvailableCpus(&cpu_mask_);
  tag_ = 0xffffffff;

  tag_mode_ = sat_->tag_mode();
}

// Use pthreads to prioritize a system thread.
bool WorkerThread::InitPriority() {
  // This doesn't affect performance that much, and may not be too safe.
  bool ret = BindToCpus(&cpu_mask_);
  if (!ret)
    logprintf(11, "Log: Bind to %s failed.\n",
              cpuset_format(&cpu_mask_).c_str());

  logprintf(11, "Log: Thread %d running on apic ID %d mask %s (%s).\n",
            thread_num_, apicid(),
            CurrentCpusFormat().c_str(),
            cpuset_format(&cpu_mask_).c_str());
#if 0
  if (priority_ == High) {
    sched_param param;
    param.sched_priority = 1;
    // Set the priority; others are unchanged.
    logprintf(0, "Log: Changing priority to SCHED_FIFO %d\n",
              param.sched_priority);
    if (sched_setscheduler(0, SCHED_FIFO, &param)) {
      char buf[256];
      sat_strerror(errno, buf, sizeof(buf));
      logprintf(0, "Process Error: sched_setscheduler "
                   "failed - error %d %s\n",
                errno, buf);
    }
  }
#endif
  return true;
}

// Use pthreads to create a system thread.
int WorkerThread::SpawnThread() {
  // Create the new thread.
  int result = pthread_create(&thread_, NULL, thread_spawner_, this);
  if (result) {
    char buf[256];
    sat_strerror(result, buf, sizeof(buf));
    logprintf(0, "Process Error: pthread_create "
                 "failed - error %d %s\n",
              result, buf);
    status_ = false;
    return false;
  }

  // 0 is pthreads success.
  return true;
}

// Kill the worker thread with SIGINT.
bool WorkerThread::KillThread() {
  return (pthread_kill(thread_, SIGINT) == 0);
}

// Block until thread has exited.
bool WorkerThread::JoinThread() {
  int result = pthread_join(thread_, NULL);
  if (result) {
    logprintf(0, "Process Error: pthread_join failed - error %d\n", result);
    status_ = false;
  }

  // 0 is pthreads success.
  return (!result);
}

void WorkerThread::StartRoutine() {
  InitPriority();
  StartThreadTimer();
  Work();
  StopThreadTimer();
  worker_status_->RemoveSelf();
}

// Thread work loop. Execute until marked finished.
bool WorkerThread::Work() {
  do {
    logprintf(9, "Log: ...\n");
    // Sleep for 1 second.
    sat_sleep(1);
  } while (IsReadyToRun());

  return false;
}

// Returns CPU mask of CPUs available to this process.
// Conceptually, each bit represents a logical CPU, ie:
//   mask = 3  (11b):   cpu0, 1
//   mask = 13 (1101b): cpu0, 2, 3
bool WorkerThread::AvailableCpus(cpu_set_t *cpuset) {
  CPU_ZERO(cpuset);
#ifdef HAVE_SCHED_GETAFFINITY
  return sched_getaffinity(getppid(), sizeof(*cpuset), cpuset) == 0;
#else
  return 0;
#endif
}

// Returns CPU mask of CPUs this thread is bound to.
// Conceptually, each bit represents a logical CPU, ie:
//   mask = 3  (11b):   cpu0, 1
//   mask = 13 (1101b): cpu0, 2, 3
bool WorkerThread::CurrentCpus(cpu_set_t *cpuset) {
  CPU_ZERO(cpuset);
#ifdef HAVE_SCHED_GETAFFINITY
  return sched_getaffinity(0, sizeof(*cpuset), cpuset) == 0;
#else
  return 0;
#endif
}

// Bind worker thread to specified CPU(s)
//   Args:
//     thread_mask: cpu_set_t representing CPUs, ie
//       mask = 1  (01b):   cpu0
//       mask = 3  (11b):   cpu0, 1
//       mask = 13 (1101b): cpu0, 2, 3
//
//   Returns true on success, false otherwise.
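//   Note: the affinity call below uses gettid(), so the binding applies to
//   the calling thread only, not the whole process.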
bool WorkerThread::BindToCpus(const cpu_set_t *thread_mask) {
  cpu_set_t process_mask;
  AvailableCpus(&process_mask);
  if (cpuset_isequal(thread_mask, &process_mask))
    return true;

  logprintf(11, "Log: available CPU mask - %s\n",
            cpuset_format(&process_mask).c_str());
  if (!cpuset_issubset(thread_mask, &process_mask)) {
    // Invalid cpu_mask, ie cpu not allocated to this process or doesn't
    // exist.
    logprintf(0, "Log: requested CPUs %s not a subset of available %s\n",
              cpuset_format(thread_mask).c_str(),
              cpuset_format(&process_mask).c_str());
    return false;
  }
#ifdef HAVE_SCHED_GETAFFINITY
  return (sched_setaffinity(gettid(), sizeof(*thread_mask), thread_mask) == 0);
#else
  return 0;
#endif
}

// A worker thread can yield itself to give up CPU until it's scheduled again.
// Returns true on success, false on error.
bool WorkerThread::YieldSelf() {
  return (sched_yield() == 0);
}

// Fill this page with its pattern.
bool WorkerThread::FillPage(struct page_entry *pe) {
  // Error check arguments.
  if (pe == 0) {
    logprintf(0, "Process Error: Fill Page entry null\n");
    return 0;
  }

  // Mask is the bitmask of indexes used by the pattern.
  // It is the pattern size -1. Size is always a power of 2.
  uint64 *memwords = static_cast<uint64*>(pe->addr);
  int length = sat_->page_length();

  if (tag_mode_) {
    // Select tag or data as appropriate.
    for (int i = 0; i < length / wordsize_; i++) {
      datacast_t data;

      if ((i & 0x7) == 0) {
        data.l64 = addr_to_tag(&memwords[i]);
      } else {
        data.l32.l = pe->pattern->pattern(i << 1);
        data.l32.h = pe->pattern->pattern((i << 1) + 1);
      }
      memwords[i] = data.l64;
    }
  } else {
    // Just fill in untagged data directly.
    for (int i = 0; i < length / wordsize_; i++) {
      datacast_t data;

      data.l32.l = pe->pattern->pattern(i << 1);
      data.l32.h = pe->pattern->pattern((i << 1) + 1);
      memwords[i] = data.l64;
    }
  }

  return 1;
}

// Tell the thread how many pages to fill.
void FillThread::SetFillPages(int64 num_pages_to_fill_init) {
  num_pages_to_fill_ = num_pages_to_fill_init;
}

// Fill this page with a random pattern.
bool FillThread::FillPageRandom(struct page_entry *pe) {
  // Error check arguments.
  if (pe == 0) {
    logprintf(0, "Process Error: Fill Page entry null\n");
    return 0;
  }
  if ((patternlist_ == 0) || (patternlist_->Size() == 0)) {
    logprintf(0, "Process Error: No data patterns available\n");
    return 0;
  }

  // Choose a random pattern for this block.
  pe->pattern = patternlist_->GetRandomPattern();
  if (pe->pattern == 0) {
    logprintf(0, "Process Error: Null data pattern\n");
    return 0;
  }

  // Actually fill the page.
  return FillPage(pe);
}

// Memory fill work loop. Execute until allotted pages are filled.
bool FillThread::Work() {
  bool result = true;

  logprintf(9, "Log: Starting fill thread %d\n", thread_num_);

  // We want to fill num_pages_to_fill pages, and
  // stop when we've filled that many.
  // We also want to capture early break.
  struct page_entry pe;
  int64 loops = 0;
  while (IsReadyToRun() && (loops < num_pages_to_fill_)) {
    result = result && sat_->GetEmpty(&pe);
    if (!result) {
      logprintf(0, "Process Error: fill_thread failed to pop pages, "
                   "bailing\n");
      break;
    }

    // Fill the page with pattern.
    result = result && FillPageRandom(&pe);
    if (!result) break;

    // Put the page back on the queue.
    result = result && sat_->PutValid(&pe);
    if (!result) {
      logprintf(0, "Process Error: fill_thread failed to push pages, "
                   "bailing\n");
      break;
    }
    loops++;
  }

  // Fill in thread status.
  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Fill thread. "
            "Status %d, %d pages filled\n",
            thread_num_, status_, pages_copied_);
  return result;
}

// Print error information about a data miscompare.
void WorkerThread::ProcessError(struct ErrorRecord *error,
                                int priority,
                                const char *message) {
  char dimm_string[256] = "";

  int apic_id = apicid();

  // Determine if this is a write or read error.
  os_->Flush(error->vaddr);
  error->reread = *(error->vaddr);

  char *good = reinterpret_cast<char*>(&(error->expected));
  char *bad = reinterpret_cast<char*>(&(error->actual));

  sat_assert(error->expected != error->actual);
  unsigned int offset = 0;
  for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
    if (good[offset] != bad[offset])
      break;
  }

  error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;

  // Find physical address if possible.
  error->paddr = os_->VirtualToPhysical(error->vbyteaddr);

  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));

  // Report parseable error.
  if (priority < 5) {
    // Run miscompare error through diagnoser for logging and reporting.
    os_->error_diagnoser_->AddMiscompareError(dimm_string,
                                              reinterpret_cast<uint64>
                                              (error->vaddr), 1);

    logprintf(priority,
              "%s: miscompare on CPU %d(0x%s) at %p(0x%llx:%s): "
              "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n",
              message,
              apic_id,
              CurrentCpusFormat().c_str(),
              error->vaddr,
              error->paddr,
              dimm_string,
              error->actual,
              error->reread,
              error->expected);
  }

  // Overwrite incorrect data with correct data to prevent
  // future miscompares when this data is reused.
  *(error->vaddr) = error->expected;
  os_->Flush(error->vaddr);
}

// Print error information about a data miscompare.
void FileThread::ProcessError(struct ErrorRecord *error,
                              int priority,
                              const char *message) {
  char dimm_string[256] = "";

  // Determine if this is a write or read error.
  os_->Flush(error->vaddr);
  error->reread = *(error->vaddr);

  char *good = reinterpret_cast<char*>(&(error->expected));
  char *bad = reinterpret_cast<char*>(&(error->actual));

  sat_assert(error->expected != error->actual);
  unsigned int offset = 0;
  for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
    if (good[offset] != bad[offset])
      break;
  }

  error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;

  // Find physical address if possible.
  error->paddr = os_->VirtualToPhysical(error->vbyteaddr);

  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));

  // If crc_page_ is valid, ie checking content read back from file,
  // track src/dst memory addresses. Otherwise categorize as general
  // memory miscompare for CRC checking everywhere else.
  if (crc_page_ != -1) {
    int miscompare_byteoffset = static_cast<char*>(error->vbyteaddr) -
                                static_cast<char*>(page_recs_[crc_page_].dst);
    os_->error_diagnoser_->AddHDDMiscompareError(devicename_,
                                                 crc_page_,
                                                 miscompare_byteoffset,
                                                 page_recs_[crc_page_].src,
                                                 page_recs_[crc_page_].dst);
  } else {
    os_->error_diagnoser_->AddMiscompareError(dimm_string,
                                              reinterpret_cast<uint64>
                                              (error->vaddr), 1);
  }

  logprintf(priority,
            "%s: miscompare on %s at %p(0x%llx:%s): read:0x%016llx, "
            "reread:0x%016llx expected:0x%016llx\n",
            message,
            devicename_.c_str(),
            error->vaddr,
            error->paddr,
            dimm_string,
            error->actual,
            error->reread,
            error->expected);

  // Overwrite incorrect data with correct data to prevent
  // future miscompares when this data is reused.
  *(error->vaddr) = error->expected;
  os_->Flush(error->vaddr);
}

// Do a word by word result check of a region.
// Print errors on mismatches.
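// Up to kErrorLimit miscompares are queued and reported individually; past
// that, the page is flagged as a whole-page error and compared against the
// other known patterns, to detect an entire block written with the wrong
// pattern (e.g. a misdirected write).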
int WorkerThread::CheckRegion(void *addr,
                              class Pattern *pattern,
                              int64 length,
                              int offset,
                              int64 pattern_offset) {
  uint64 *memblock = static_cast<uint64*>(addr);
  const int kErrorLimit = 128;
  int errors = 0;
  int overflowerrors = 0;  // Count of overflowed errors.
  bool page_error = false;
  string errormessage("Hardware Error");
  struct ErrorRecord
    recorded[kErrorLimit];  // Queued errors for later printing.

  // For each word in the data region.
  for (int i = 0; i < length / wordsize_; i++) {
    uint64 actual = memblock[i];
    uint64 expected;

    // Determine the value that should be there.
    datacast_t data;
    int index = 2 * i + pattern_offset;
    data.l32.l = pattern->pattern(index);
    data.l32.h = pattern->pattern(index + 1);
    expected = data.l64;
    // Check tags if necessary.
    if (tag_mode_ && ((reinterpret_cast<uint64>(&memblock[i]) & 0x3f) == 0)) {
      expected = addr_to_tag(&memblock[i]);
    }

    // If the value is incorrect, save an error record for later printing.
    if (actual != expected) {
      if (errors < kErrorLimit) {
        recorded[errors].actual = actual;
        recorded[errors].expected = expected;
        recorded[errors].vaddr = &memblock[i];
        errors++;
      } else {
        page_error = true;
        // If we have overflowed the error queue, just print the errors now.
        logprintf(10, "Log: Error record overflow, too many miscompares!\n");
        errormessage = "Page Error";
        break;
      }
    }
  }

  // Find if this is a whole block corruption.
  if (page_error && !tag_mode_) {
    int patsize = patternlist_->Size();
    for (int pat = 0; pat < patsize; pat++) {
      class Pattern *altpattern = patternlist_->GetPattern(pat);
      const int kGood = 0;
      const int kBad = 1;
      const int kGoodAgain = 2;
      const int kNoMatch = 3;
      int state = kGood;
      unsigned int badstart = 0;
      unsigned int badend = 0;

      // Don't match against ourself!
      if (pattern == altpattern)
        continue;

      for (int i = 0; i < length / wordsize_; i++) {
        uint64 actual = memblock[i];
        datacast_t expected;
        datacast_t possible;

        // Determine the value that should be there.
        int index = 2 * i + pattern_offset;
        expected.l32.l = pattern->pattern(index);
        expected.l32.h = pattern->pattern(index + 1);
        possible.l32.l = altpattern->pattern(index);
        possible.l32.h = altpattern->pattern(index + 1);

        if (state == kGood) {
          if (actual == expected.l64) {
            continue;
          } else if (actual == possible.l64) {
            badstart = i;
            badend = i;
            state = kBad;
            continue;
          } else {
            state = kNoMatch;
            break;
          }
        } else if (state == kBad) {
          if (actual == possible.l64) {
            badend = i;
            continue;
          } else if (actual == expected.l64) {
            state = kGoodAgain;
            continue;
          } else {
            state = kNoMatch;
            break;
          }
        } else if (state == kGoodAgain) {
          if (actual == expected.l64) {
            continue;
          } else {
            state = kNoMatch;
            break;
          }
        }
      }

      if ((state == kGoodAgain) || (state == kBad)) {
        unsigned int blockerrors = badend - badstart + 1;
        errormessage = "Block Error";
        ProcessError(&recorded[0], 0, errormessage.c_str());
        logprintf(0, "Block Error: (%p) pattern %s instead of %s, "
                     "%d bytes from offset 0x%x to 0x%x\n",
                  &memblock[badstart],
                  altpattern->name(), pattern->name(),
                  blockerrors * wordsize_,
                  offset + badstart * wordsize_,
                  offset + badend * wordsize_);
        errorcount_ += blockerrors;
        return blockerrors;
      }
    }
  }

  // Process error queue after all errors have been recorded.
  for (int err = 0; err < errors; err++) {
    int priority = 5;
    if (errorcount_ + err < 30)
      priority = 0;  // Bump up the priority for the first few errors.
    ProcessError(&recorded[err], priority, errormessage.c_str());
  }

  if (page_error) {
    // For each word in the data region.
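    // Re-walk the page to count miscompares beyond the first kErrorLimit,
    // so errorcount_ reflects the true damage rather than the queue size.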
    int error_recount = 0;
    for (int i = 0; i < length / wordsize_; i++) {
      uint64 actual = memblock[i];
      uint64 expected;
      datacast_t data;
      // Determine the value that should be there.
      int index = 2 * i + pattern_offset;

      data.l32.l = pattern->pattern(index);
      data.l32.h = pattern->pattern(index + 1);
      expected = data.l64;

      // Check tags if necessary.
      if (tag_mode_ &&
          ((reinterpret_cast<uint64>(&memblock[i]) & 0x3f) == 0)) {
        expected = addr_to_tag(&memblock[i]);
      }

      // If the value is incorrect, save an error record for later printing.
      if (actual != expected) {
        if (error_recount < kErrorLimit) {
          // We already reported these.
          error_recount++;
        } else {
          // If we have overflowed the error queue, print the errors now.
          struct ErrorRecord er;
          er.actual = actual;
          er.expected = expected;
          er.vaddr = &memblock[i];

          // Do the error printout. This will take a long time and
          // likely change the machine state.
          ProcessError(&er, 12, errormessage.c_str());
          overflowerrors++;
        }
      }
    }
  }

  // Keep track of observed errors.
  errorcount_ += errors + overflowerrors;
  return errors + overflowerrors;
}

float WorkerThread::GetCopiedData() {
  return pages_copied_ * sat_->page_length() / kMegabyte;
}

// Calculate the CRC of a region.
// Result check if the CRC mismatches.
int WorkerThread::CrcCheckPage(struct page_entry *srcpe) {
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int errors = 0;

  const AdlerChecksum *expectedcrc = srcpe->pattern->crc();
  uint64 *memblock = static_cast<uint64*>(srcpe->addr);
  int blocks = sat_->page_length() / blocksize;
  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    uint64 *memslice = memblock + currentblock * blockwords;

    AdlerChecksum crc;
    if (tag_mode_) {
      AdlerAddrCrcC(memslice, blocksize, &crc, srcpe);
    } else {
      CalculateAdlerChecksum(memslice, blocksize, &crc);
    }

    // If the CRC does not match, we'd better look closer.
    if (!crc.Equals(*expectedcrc)) {
      logprintf(11, "Log: CrcCheckPage Falling through to slow compare, "
                    "CRC mismatch %s != %s\n",
                crc.ToHexString().c_str(),
                expectedcrc->ToHexString().c_str());
      int errorcount = CheckRegion(memslice,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
      if (errorcount == 0) {
        logprintf(0, "Log: CrcCheckPage CRC mismatch %s != %s, "
                     "but no miscompares found.\n",
                  crc.ToHexString().c_str(),
                  expectedcrc->ToHexString().c_str());
      }
      errors += errorcount;
    }
  }

  // For odd length transfers, we should never hit this.
  int leftovers = sat_->page_length() % blocksize;
  if (leftovers) {
    uint64 *memslice = memblock + blocks * blockwords;
    errors += CheckRegion(memslice,
                          srcpe->pattern,
                          leftovers,
                          blocks * blocksize, 0);
  }
  return errors;
}

// Print error information about a data miscompare.
void WorkerThread::ProcessTagError(struct ErrorRecord *error,
                                   int priority,
                                   const char *message) {
  char dimm_string[256] = "";
  char tag_dimm_string[256] = "";
  bool read_error = false;

  int apic_id = apicid();

  // Determine if this is a write or read error.
  os_->Flush(error->vaddr);
  error->reread = *(error->vaddr);

  // Distinguish read and write errors.
  if (error->actual != error->reread) {
    read_error = true;
  }

  sat_assert(error->expected != error->actual);

  error->vbyteaddr = reinterpret_cast<char*>(error->vaddr);

  // Find physical address if possible.
  error->paddr = os_->VirtualToPhysical(error->vbyteaddr);
  error->tagpaddr = os_->VirtualToPhysical(error->tagvaddr);

  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));
  // Pretty print DIMM mapping if available.
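  // This second lookup maps the tag's physical address, which may fall on a
  // different DIMM than the data itself.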
  os_->FindDimm(error->tagpaddr, tag_dimm_string, sizeof(tag_dimm_string));

  // Report parseable error.
  if (priority < 5) {
    logprintf(priority,
              "%s: Tag from %p(0x%llx:%s) (%s) "
              "miscompare on CPU %d(0x%s) at %p(0x%llx:%s): "
              "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n",
              message,
              error->tagvaddr, error->tagpaddr,
              tag_dimm_string,
              read_error ? "read error" : "write error",
              apic_id,
              CurrentCpusFormat().c_str(),
              error->vaddr, error->paddr,
              dimm_string,
              error->actual,
              error->reread,
              error->expected);
  }

  errorcount_ += 1;

  // Overwrite incorrect data with correct data to prevent
  // future miscompares when this data is reused.
  *(error->vaddr) = error->expected;
  os_->Flush(error->vaddr);
}

// Print out and log a tag error.
bool WorkerThread::ReportTagError(uint64 *mem64,
                                  uint64 actual,
                                  uint64 tag) {
  struct ErrorRecord er;
  er.actual = actual;
  er.expected = tag;
  er.vaddr = mem64;

  // Generate vaddr from tag.
  er.tagvaddr = reinterpret_cast<uint64*>(actual);

  ProcessTagError(&er, 0, "Hardware Error");
  return true;
}

// C implementation of Adler memory copy, with memory tagging.
bool WorkerThread::AdlerAddrMemcpyC(uint64 *dstmem64,
                                    uint64 *srcmem64,
                                    unsigned int size_in_bytes,
                                    AdlerChecksum *checksum,
                                    struct page_entry *pe) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  datacast_t dstdata;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 19)) {
    // Size is too large, must be strictly less than 512 KB.
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  class Pattern *pattern = pe->pattern;

  unsigned int i = 0;
  while (i < count) {
    // Process 64 bits at a time.
    if ((i & 0x7) == 0) {
      data.l64 = srcmem64[i];
      dstdata.l64 = dstmem64[i];
      uint64 src_tag = addr_to_tag(&srcmem64[i]);
      uint64 dst_tag = addr_to_tag(&dstmem64[i]);
      // Detect if tags have been corrupted.
      if (data.l64 != src_tag)
        ReportTagError(&srcmem64[i], data.l64, src_tag);
      if (dstdata.l64 != dst_tag)
        ReportTagError(&dstmem64[i], dstdata.l64, dst_tag);

      data.l32.l = pattern->pattern(i << 1);
      data.l32.h = pattern->pattern((i << 1) + 1);
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;

      data.l64 = dst_tag;
      dstmem64[i] = data.l64;
    } else {
      data.l64 = srcmem64[i];
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;
      dstmem64[i] = data.l64;
    }
    i++;

    data.l64 = srcmem64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    dstmem64[i] = data.l64;
    i++;
  }
  checksum->Set(a1, a2, b1, b2);
  return true;
}

// x86_64 SSE2 assembly implementation of Adler memory copy, with address
// tagging added as a second step. This is useful for debugging failures
// that only occur when SSE / nontemporal writes are used.
bool WorkerThread::AdlerAddrMemcpyWarm(uint64 *dstmem64,
                                       uint64 *srcmem64,
                                       unsigned int size_in_bytes,
                                       AdlerChecksum *checksum,
                                       struct page_entry *pe) {
  // Do ASM copy, ignore checksum.
  AdlerChecksum ignored_checksum;
  os_->AdlerMemcpyWarm(dstmem64, srcmem64, size_in_bytes, &ignored_checksum);

  // Force cache flush.
  int length = size_in_bytes / sizeof(*dstmem64);
  for (int i = 0; i < length; i += sizeof(*dstmem64)) {
    os_->FastFlush(dstmem64 + i);
    os_->FastFlush(srcmem64 + i);
  }
  // Check results.
  AdlerAddrCrcC(srcmem64, size_in_bytes, checksum, pe);
  // Patch up address tags.
  TagAddrC(dstmem64, size_in_bytes);
  return true;
}

// Retag pages.
bool WorkerThread::TagAddrC(uint64 *memwords,
                            unsigned int size_in_bytes) {
  // Mask is the bitmask of indexes used by the pattern.
  // It is the pattern size -1.
  // Size is always a power of 2.

  // Select tag or data as appropriate.
  int length = size_in_bytes / wordsize_;
  for (int i = 0; i < length; i += 8) {
    datacast_t data;
    data.l64 = addr_to_tag(&memwords[i]);
    memwords[i] = data.l64;
  }
  return true;
}

// C implementation of Adler memory crc.
bool WorkerThread::AdlerAddrCrcC(uint64 *srcmem64,
                                 unsigned int size_in_bytes,
                                 AdlerChecksum *checksum,
                                 struct page_entry *pe) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 19)) {
    // Size is too large, must be strictly less than 512 KB.
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  class Pattern *pattern = pe->pattern;

  unsigned int i = 0;
  while (i < count) {
    // Process 64 bits at a time.
    if ((i & 0x7) == 0) {
      data.l64 = srcmem64[i];
      uint64 src_tag = addr_to_tag(&srcmem64[i]);
      // Check that tags match expected.
      if (data.l64 != src_tag)
        ReportTagError(&srcmem64[i], data.l64, src_tag);

      data.l32.l = pattern->pattern(i << 1);
      data.l32.h = pattern->pattern((i << 1) + 1);
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;
    } else {
      data.l64 = srcmem64[i];
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;
    }
    i++;

    data.l64 = srcmem64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    i++;
  }
  checksum->Set(a1, a2, b1, b2);
  return true;
}

// Copy a block of memory quickly, while keeping a CRC of the data.
// Result check if the CRC mismatches.
int WorkerThread::CrcCopyPage(struct page_entry *dstpe,
                              struct page_entry *srcpe) {
  int errors = 0;
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy.
  uint64 *targetmembase = static_cast<uint64*>(dstpe->addr);
  uint64 *sourcemembase = static_cast<uint64*>(srcpe->addr);
  // Remember the expected CRC.
  const AdlerChecksum *expectedcrc = srcpe->pattern->crc();

  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    uint64 *targetmem = targetmembase + currentblock * blockwords;
    uint64 *sourcemem = sourcemembase + currentblock * blockwords;

    AdlerChecksum crc;
    if (tag_mode_) {
      AdlerAddrMemcpyC(targetmem, sourcemem, blocksize, &crc, srcpe);
    } else {
      AdlerMemcpyC(targetmem, sourcemem, blocksize, &crc);
    }

    // Investigate miscompares.
    if (!crc.Equals(*expectedcrc)) {
      logprintf(11, "Log: CrcCopyPage Falling through to slow compare, "
                    "CRC mismatch %s != %s\n",
                crc.ToHexString().c_str(),
                expectedcrc->ToHexString().c_str());
      int errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
      if (errorcount == 0) {
        logprintf(0, "Log: CrcCopyPage CRC mismatch %s != %s, "
                     "but no miscompares found. Retrying with fresh data.\n",
                  crc.ToHexString().c_str(),
                  expectedcrc->ToHexString().c_str());
        if (!tag_mode_) {
          // Copy the data originally read from this region back again.
          // This data should have any corruption read originally while
          // calculating the CRC.
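          // If the recheck still finds nothing, the mismatch was transient
          // (bad data was seen during the CRC pass but is not in memory),
          // which is flagged below as a suspected CPU error.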
          memcpy(sourcemem, targetmem, blocksize);
          errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
          if (errorcount == 0) {
            int apic_id = apicid();
            logprintf(0, "Process Error: CPU %d(0x%s) CrcCopyPage "
                         "CRC mismatch %s != %s, "
                         "but no miscompares found on second pass.\n",
                      apic_id, CurrentCpusFormat().c_str(),
                      crc.ToHexString().c_str(),
                      expectedcrc->ToHexString().c_str());
            struct ErrorRecord er;
            er.actual = sourcemem[0];
            er.expected = 0x0;
            er.vaddr = sourcemem;
            ProcessError(&er, 0, "Hardware Error");
          }
        }
      }
      errors += errorcount;
    }
  }

  // For odd length transfers, we should never hit this.
  int leftovers = sat_->page_length() % blocksize;
  if (leftovers) {
    uint64 *targetmem = targetmembase + blocks * blockwords;
    uint64 *sourcemem = sourcemembase + blocks * blockwords;

    errors += CheckRegion(sourcemem,
                          srcpe->pattern,
                          leftovers,
                          blocks * blocksize, 0);
    int leftoverwords = leftovers / wordsize_;
    for (int i = 0; i < leftoverwords; i++) {
      targetmem[i] = sourcemem[i];
    }
  }

  // Update pattern reference to reflect new contents.
  dstpe->pattern = srcpe->pattern;

  // Clean clean clean the errors away.
  if (errors) {
    // TODO(nsanders): Maybe we should patch rather than fill? Filling may
    // cause bad data to be propagated across the page.
    FillPage(dstpe);
  }
  return errors;
}

// Invert a block of memory quickly, traversing downwards.
int InvertThread::InvertPageDown(struct page_entry *srcpe) {
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy.
  unsigned int *sourcemembase = static_cast<unsigned int *>(srcpe->addr);

  for (int currentblock = blocks - 1; currentblock >= 0; currentblock--) {
    unsigned int *sourcemem = sourcemembase + currentblock * blockwords;
    for (int i = blockwords - 32; i >= 0; i -= 32) {
      for (int index = i + 31; index >= i; --index) {
        unsigned int actual = sourcemem[index];
        sourcemem[index] = ~actual;
      }
      OsLayer::FastFlush(&sourcemem[i]);
    }
  }

  return 0;
}

// Invert a block of memory, traversing upwards.
int InvertThread::InvertPageUp(struct page_entry *srcpe) {
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy.
  unsigned int *sourcemembase = static_cast<unsigned int *>(srcpe->addr);

  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    unsigned int *sourcemem = sourcemembase + currentblock * blockwords;
    for (int i = 0; i < blockwords; i += 32) {
      for (int index = i; index <= i + 31; ++index) {
        unsigned int actual = sourcemem[index];
        sourcemem[index] = ~actual;
      }
      OsLayer::FastFlush(&sourcemem[i]);
    }
  }
  return 0;
}

// Copy a block of memory quickly, while keeping a CRC of the data.
// Result check if the CRC mismatches.
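// This "warm" copy goes through the OsLayer's SSE2 / nontemporal-store
// implementation, which loads the CPU and memory bus harder than the plain
// C copy in CrcCopyPage.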
// Warm the CPU while running.
int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
                                  struct page_entry *srcpe) {
  int errors = 0;
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy.
  uint64 *targetmembase = static_cast<uint64*>(dstpe->addr);
  uint64 *sourcemembase = static_cast<uint64*>(srcpe->addr);
  // Remember the expected CRC.
  const AdlerChecksum *expectedcrc = srcpe->pattern->crc();

  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    uint64 *targetmem = targetmembase + currentblock * blockwords;
    uint64 *sourcemem = sourcemembase + currentblock * blockwords;

    AdlerChecksum crc;
    if (tag_mode_) {
      AdlerAddrMemcpyWarm(targetmem, sourcemem, blocksize, &crc, srcpe);
    } else {
      os_->AdlerMemcpyWarm(targetmem, sourcemem, blocksize, &crc);
    }

    // Investigate miscompares.
    if (!crc.Equals(*expectedcrc)) {
      logprintf(11, "Log: CrcWarmCopyPage Falling through to slow compare, "
                    "CRC mismatch %s != %s\n",
                crc.ToHexString().c_str(),
                expectedcrc->ToHexString().c_str());
      int errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
      if (errorcount == 0) {
        logprintf(0, "Log: CrcWarmCopyPage CRC mismatch %s != %s, "
                     "but no miscompares found. Retrying with fresh data.\n",
                  crc.ToHexString().c_str(),
                  expectedcrc->ToHexString().c_str());
        if (!tag_mode_) {
          // Copy the data originally read from this region back again.
          // This data should have any corruption read originally while
          // calculating the CRC.
          memcpy(sourcemem, targetmem, blocksize);
          errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
          if (errorcount == 0) {
            int apic_id = apicid();
            logprintf(0, "Process Error: CPU %d(0x%s) CrcWarmCopyPage "
                         "CRC mismatch %s != %s, "
                         "but no miscompares found on second pass.\n",
                      apic_id, CurrentCpusFormat().c_str(),
                      crc.ToHexString().c_str(),
                      expectedcrc->ToHexString().c_str());
            struct ErrorRecord er;
            er.actual = sourcemem[0];
            er.expected = 0x0;
            er.vaddr = sourcemem;
            ProcessError(&er, 0, "Hardware Error");
          }
        }
      }
      errors += errorcount;
    }
  }

  // For odd length transfers, we should never hit this.
  int leftovers = sat_->page_length() % blocksize;
  if (leftovers) {
    uint64 *targetmem = targetmembase + blocks * blockwords;
    uint64 *sourcemem = sourcemembase + blocks * blockwords;

    errors += CheckRegion(sourcemem,
                          srcpe->pattern,
                          leftovers,
                          blocks * blocksize, 0);
    int leftoverwords = leftovers / wordsize_;
    for (int i = 0; i < leftoverwords; i++) {
      targetmem[i] = sourcemem[i];
    }
  }

  // Update pattern reference to reflect new contents.
  dstpe->pattern = srcpe->pattern;

  // Clean clean clean the errors away.
  if (errors) {
    // TODO(nsanders): Maybe we should patch rather than fill? Filling may
    // cause bad data to be propagated across the page.
    FillPage(dstpe);
  }
  return errors;
}

// Memory check work loop. Execute until done, then exhaust pages.
bool CheckThread::Work() {
  struct page_entry pe;
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting Check thread %d\n", thread_num_);

  // We want to check all the pages, and
  // stop when there aren't any left.
  while (true) {
    result = result && sat_->GetValid(&pe);
    if (!result) {
      if (IsReadyToRunNoPause())
        logprintf(0, "Process Error: check_thread failed to pop pages, "
                     "bailing\n");
      else
        result = true;
      break;
    }

    // Do the result check.
    CrcCheckPage(&pe);

    // Push pages back on the valid queue if we are still going,
    // throw them out otherwise.
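    // The NoPause variant treats a pause as "still going", so pages are
    // only discarded once a STOP has actually been requested.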
    if (IsReadyToRunNoPause())
      result = result && sat_->PutValid(&pe);
    else
      result = result && sat_->PutEmpty(&pe);
    if (!result) {
      logprintf(0, "Process Error: check_thread failed to push pages, "
                   "bailing\n");
      break;
    }
    loops++;
  }

  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Check thread. "
               "Status %d, %d pages checked\n",
            thread_num_, status_, pages_copied_);
  return result;
}

// Memory copy work loop. Execute until marked done.
bool CopyThread::Work() {
  struct page_entry src;
  struct page_entry dst;
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting copy thread %d: cpu %s, mem %x\n",
            thread_num_, cpuset_format(&cpu_mask_).c_str(), tag_);

  while (IsReadyToRun()) {
    // Pop the needed pages.
    result = result && sat_->GetValid(&src, tag_);
    result = result && sat_->GetEmpty(&dst, tag_);
    if (!result) {
      logprintf(0, "Process Error: copy_thread failed to pop pages, "
                   "bailing\n");
      break;
    }

    // Force errors for unittests.
    if (sat_->error_injection()) {
      if (loops == 8) {
        char *addr = reinterpret_cast<char*>(src.addr);
        int offset = random() % sat_->page_length();
        addr[offset] = 0xba;
      }
    }

    // We can use memcpy, or CRC check while we copy.
    if (sat_->warm()) {
      CrcWarmCopyPage(&dst, &src);
    } else if (sat_->strict()) {
      CrcCopyPage(&dst, &src);
    } else {
      memcpy(dst.addr, src.addr, sat_->page_length());
      dst.pattern = src.pattern;
    }

    result = result && sat_->PutValid(&dst);
    result = result && sat_->PutEmpty(&src);

    // Copy worker-threads yield themselves at the end of each copy loop,
    // to avoid threads from preempting each other in the middle of the inner
    // copy-loop. Cooperation between copy worker-threads results in less
    // unnecessary cache thrashing (which happens when context-switching in
    // the middle of the inner copy-loop).
    YieldSelf();

    if (!result) {
      logprintf(0, "Process Error: copy_thread failed to push pages, "
                   "bailing\n");
      break;
    }
    loops++;
  }

  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Copy thread. Status %d, %d pages copied\n",
            thread_num_, status_, pages_copied_);
  return result;
}

// Memory invert work loop. Execute until marked done.
bool InvertThread::Work() {
  struct page_entry src;
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting invert thread %d\n", thread_num_);

  while (IsReadyToRun()) {
    // Pop the needed pages.
    result = result && sat_->GetValid(&src);
    if (!result) {
      logprintf(0, "Process Error: invert_thread failed to pop pages, "
                   "bailing\n");
      break;
    }

    if (sat_->strict())
      CrcCheckPage(&src);

    // For the same reason CopyThread yields itself (see YieldSelf comment
    // in CopyThread::Work()), InvertThread yields itself after each invert
    // operation to improve cooperation between different worker threads
    // stressing the memory/cache.
    InvertPageUp(&src);
    YieldSelf();
    InvertPageDown(&src);
    YieldSelf();
    InvertPageDown(&src);
    YieldSelf();
    InvertPageUp(&src);
    YieldSelf();

    if (sat_->strict())
      CrcCheckPage(&src);

    result = result && sat_->PutValid(&src);
    if (!result) {
      logprintf(0, "Process Error: invert_thread failed to push pages, "
                   "bailing\n");
      break;
    }
    loops++;
  }

  pages_copied_ = loops * 2;
  status_ = result;
  logprintf(9, "Log: Completed %d: Invert thread. Status %d, "
               "%d pages copied\n",
            thread_num_, status_, pages_copied_);
  return result;
}

// Set file name to use for File IO.
void FileThread::SetFile(const char *filename_init) {
  filename_ = filename_init;
  devicename_ = os_->FindFileDevice(filename_);
}

// Open the file for access.
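// O_DIRECT bypasses the page cache where the filesystem supports it; if the
// open fails with EINVAL we retry without it and instead flush the page
// cache explicitly after writes.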
bool FileThread::OpenFile(int *pfile) {
  bool no_O_DIRECT = false;
  int flags = O_RDWR | O_CREAT | O_SYNC;
  int fd = open(filename_.c_str(), flags | O_DIRECT, 0644);
  if (O_DIRECT != 0 && fd < 0 && errno == EINVAL) {
    no_O_DIRECT = true;
    fd = open(filename_.c_str(), flags, 0644);  // Try without O_DIRECT.
  }
  if (fd < 0) {
    logprintf(0, "Process Error: Failed to create file %s!!\n",
              filename_.c_str());
    pages_copied_ = 0;
    return false;
  }
  if (no_O_DIRECT)
    os_->ActivateFlushPageCache();  // Not using O_DIRECT fixed EINVAL.
  *pfile = fd;
  return true;
}

// Close the file.
bool FileThread::CloseFile(int fd) {
  close(fd);
  return true;
}

// Check sector tagging.
bool FileThread::SectorTagPage(struct page_entry *src, int block) {
  int page_length = sat_->page_length();
  struct FileThread::SectorTag *tag =
    (struct FileThread::SectorTag *)(src->addr);

  // Tag each sector.
  unsigned char magic = ((0xba + thread_num_) & 0xff);
  for (int sec = 0; sec < page_length / 512; sec++) {
    tag[sec].magic = magic;
    tag[sec].block = block & 0xff;
    tag[sec].sector = sec & 0xff;
    tag[sec].pass = pass_ & 0xff;
  }
  return true;
}

bool FileThread::WritePageToFile(int fd, struct page_entry *src) {
  int page_length = sat_->page_length();
  // Fill the file with our data.
  int64 size = write(fd, src->addr, page_length);

  if (size != page_length) {
    os_->ErrorReport(devicename_.c_str(), "write-error", 1);
    errorcount_++;
    logprintf(0, "Block Error: file_thread failed to write, "
                 "bailing\n");
    return false;
  }
  return true;
}

// Write the data to the file.
bool FileThread::WritePages(int fd) {
  int strict = sat_->strict();

  // Start fresh at beginning of file for each batch of pages.
  lseek64(fd, 0, SEEK_SET);
  for (int i = 0; i < sat_->disk_pages(); i++) {
    struct page_entry src;
    if (!GetValidPage(&src))
      return false;
    // Save expected pattern.
    page_recs_[i].pattern = src.pattern;
    page_recs_[i].src = src.addr;

    // Check data correctness.
    if (strict)
      CrcCheckPage(&src);

    SectorTagPage(&src, i);

    bool result = WritePageToFile(fd, &src);

    if (!PutEmptyPage(&src))
      return false;

    if (!result)
      return false;
  }
  return os_->FlushPageCache();  // If O_DIRECT worked, this will be a NOP.
}

// Copy data from file into memory block.
bool FileThread::ReadPageFromFile(int fd, struct page_entry *dst) {
  int page_length = sat_->page_length();

  // Do the actual read.
  int64 size = read(fd, dst->addr, page_length);

  if (size != page_length) {
    os_->ErrorReport(devicename_.c_str(), "read-error", 1);
    logprintf(0, "Block Error: file_thread failed to read, "
                 "bailing\n");
    errorcount_++;
    return false;
  }
  return true;
}

// Check sector tagging.
bool FileThread::SectorValidatePage(const struct PageRec &page,
                                    struct page_entry *dst, int block) {
  // Error injection.
  static int calls = 0;
  calls++;

  // Do sector tag compare.
  int firstsector = -1;
  int lastsector = -1;
  bool badsector = false;
  int page_length = sat_->page_length();

  // Cast data block into an array of tagged sectors.
  struct FileThread::SectorTag *tag =
    (struct FileThread::SectorTag *)(dst->addr);

  sat_assert(sizeof(*tag) == 512);

  // Error injection.
  if (sat_->error_injection()) {
    if (calls == 2) {
      for (int badsec = 8; badsec < 17; badsec++)
        tag[badsec].pass = 27;
    }
    if (calls == 18) {
      (static_cast<unsigned int *>(dst->addr))[27] = 0xbadda7a;
    }
  }

  // Check each sector for the correct tag we added earlier,
  // then revert the tag to the normal data pattern.
  unsigned char magic = ((0xba + thread_num_) & 0xff);
  for (int sec = 0; sec < page_length / 512; sec++) {
    // Check magic tag.
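    // Any field mismatch pinpoints the failure: magic identifies the writer
    // thread, block and sector locate the data, and pass catches stale data
    // left over from an earlier write pass.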
    if ((tag[sec].magic != magic) ||
        (tag[sec].block != (block & 0xff)) ||
        (tag[sec].sector != (sec & 0xff)) ||
        (tag[sec].pass != (pass_ & 0xff))) {
      // Offset calculation for tag location.
      int offset = sec * sizeof(SectorTag);
      if (tag[sec].block != (block & 0xff))
        offset += 1 * sizeof(uint8);
      else if (tag[sec].sector != (sec & 0xff))
        offset += 2 * sizeof(uint8);
      else if (tag[sec].pass != (pass_ & 0xff))
        offset += 3 * sizeof(uint8);

      // Run sector tag error through diagnoser for logging and reporting.
      errorcount_ += 1;
      os_->error_diagnoser_->AddHDDSectorTagError(devicename_, tag[sec].block,
                                                  offset,
                                                  tag[sec].sector,
                                                  page.src, page.dst);

      logprintf(5, "Sector Error: Sector tag @ 0x%x, pass %d/%d. "
                   "sec %x/%x, block %d/%d, magic %x/%x, File: %s \n",
                block * page_length + 512 * sec,
                (pass_ & 0xff), (unsigned int)tag[sec].pass,
                sec, (unsigned int)tag[sec].sector,
                block, (unsigned int)tag[sec].block,
                magic, (unsigned int)tag[sec].magic,
                filename_.c_str());

      // Keep track of first and last bad sector.
      if (firstsector == -1)
        firstsector = (block * page_length / 512) + sec;
      lastsector = (block * page_length / 512) + sec;
      badsector = true;
    }
    // Patch tag back to proper pattern.
    unsigned int *addr = (unsigned int *)(&tag[sec]);
    *addr = dst->pattern->pattern(512 * sec / sizeof(*addr));
  }

  // If we found sector errors:
  if (badsector == true) {
    logprintf(5, "Log: file sector miscompare at offset %x-%x. File: %s\n",
              firstsector * 512,
              ((lastsector + 1) * 512) - 1,
              filename_.c_str());

    // Either exit immediately, or patch the data up and continue.
    if (sat_->stop_on_error()) {
      exit(1);
    } else {
      // Patch up bad pages.
      for (int block = (firstsector * 512) / page_length;
           block <= (lastsector * 512) / page_length;
           block++) {
        unsigned int *memblock = static_cast<unsigned int *>(dst->addr);
        int length = page_length / wordsize_;
        for (int i = 0; i < length; i++) {
          memblock[i] = dst->pattern->pattern(i);
        }
      }
    }
  }
  return true;
}

// Get memory for an incoming data transfer.
bool FileThread::PagePrepare() {
  // We can only do direct IO to SAT pages if it is normal mem.
  page_io_ = os_->normal_mem();

  // Init a local buffer if we need it.
  if (!page_io_) {
#ifdef HAVE_POSIX_MEMALIGN
    int result = posix_memalign(&local_page_, 512, sat_->page_length());
#else
    local_page_ = memalign(512, sat_->page_length());
    int result = (local_page_ == 0);
#endif
    if (result) {
      logprintf(0, "Process Error: disk thread posix_memalign "
                   "returned %d (fail)\n",
                result);
      status_ = false;
      return false;
    }
  }
  return true;
}

// Remove memory allocated for data transfer.
bool FileThread::PageTeardown() {
  // Free a local buffer if we need to.
  if (!page_io_) {
    free(local_page_);
  }
  return true;
}

// Get memory for an incoming data transfer.
bool FileThread::GetEmptyPage(struct page_entry *dst) {
  if (page_io_) {
    if (!sat_->GetEmpty(dst))
      return false;
  } else {
    dst->addr = local_page_;
    dst->offset = 0;
    dst->pattern = 0;
  }
  return true;
}

// Get memory for an outgoing data transfer.
bool FileThread::GetValidPage(struct page_entry *src) {
  struct page_entry tmp;
  if (!sat_->GetValid(&tmp))
    return false;
  if (page_io_) {
    *src = tmp;
    return true;
  } else {
    src->addr = local_page_;
    src->offset = 0;
    CrcCopyPage(src, &tmp);
    if (!sat_->PutValid(&tmp))
      return false;
  }
  return true;
}

// Throw out a used empty page.
bool FileThread::PutEmptyPage(struct page_entry *src) {
  if (page_io_) {
    if (!sat_->PutEmpty(src))
      return false;
  }
  return true;
}

// Throw out a used, filled page.
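// Only pages owned by SAT's queues are returned; the local bounce buffer
// is reused in place and never pushed back.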
bool FileThread::PutValidPage(struct page_entry *src) {
  if (page_io_) {
    if (!sat_->PutValid(src))
      return false;
  }
  return true;
}

// Copy data from file into memory blocks.
bool FileThread::ReadPages(int fd) {
  int page_length = sat_->page_length();
  int strict = sat_->strict();
  bool result = true;

  // Read our data back out of the file, into its new location.
  lseek64(fd, 0, SEEK_SET);
  for (int i = 0; i < sat_->disk_pages(); i++) {
    struct page_entry dst;
    if (!GetEmptyPage(&dst))
      return false;
    // Retrieve expected pattern.
    dst.pattern = page_recs_[i].pattern;
    // Update page record.
    page_recs_[i].dst = dst.addr;

    // Read from the file into destination page.
    if (!ReadPageFromFile(fd, &dst)) {
      PutEmptyPage(&dst);
      return false;
    }

    SectorValidatePage(page_recs_[i], &dst, i);

    // Ensure that the transfer ended up with correct data.
    if (strict) {
      // Record page index currently CRC checked.
      crc_page_ = i;
      int errors = CrcCheckPage(&dst);
      if (errors) {
        logprintf(5, "Log: file miscompare at block %d, "
                     "offset %x-%x. File: %s\n",
                  i, i * page_length, ((i + 1) * page_length) - 1,
                  filename_.c_str());
        result = false;
      }
      crc_page_ = -1;
      errorcount_ += errors;
    }
    if (!PutValidPage(&dst))
      return false;
  }
  return result;
}

// File IO work loop. Execute until marked done.
bool FileThread::Work() {
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting file thread %d, file %s, device %s\n",
            thread_num_,
            filename_.c_str(),
            devicename_.c_str());

  if (!PagePrepare()) {
    status_ = false;
    return false;
  }

  // Open the data IO file.
  int fd = 0;
  if (!OpenFile(&fd)) {
    status_ = false;
    return false;
  }

  pass_ = 0;

  // Load patterns into page records.
  page_recs_ = new struct PageRec[sat_->disk_pages()];
  for (int i = 0; i < sat_->disk_pages(); i++) {
    page_recs_[i].pattern = new struct Pattern();
  }

  // Loop until done.
  while (IsReadyToRun()) {
    // Do the file write.
    if (!(result = result && WritePages(fd)))
      break;

    // Do the file read.
    if (!(result = result && ReadPages(fd)))
      break;

    loops++;
    pass_ = loops;
  }

  pages_copied_ = loops * sat_->disk_pages();

  // Clean up.
  CloseFile(fd);
  PageTeardown();

  logprintf(9, "Log: Completed %d: file thread status %d, %d pages copied\n",
            thread_num_, status_, pages_copied_);
  // Failure to read from device indicates hardware,
  // rather than procedural SW error.
  status_ = true;
  return true;
}

bool NetworkThread::IsNetworkStopSet() {
  return !IsReadyToRunNoPause();
}

bool NetworkSlaveThread::IsNetworkStopSet() {
  // This thread has no completion status.
  // It finishes whenever there is no more data to be
  // passed back.
  return true;
}

// Set ip name to use for Network IO.
void NetworkThread::SetIP(const char *ipaddr_init) {
  strncpy(ipaddr_, ipaddr_init, 256);
}

// Create a socket.
// Return 0 on error.
bool NetworkThread::CreateSocket(int *psocket) {
  int sock = socket(AF_INET, SOCK_STREAM, 0);
  if (sock == -1) {
    logprintf(0, "Process Error: Cannot open socket\n");
    pages_copied_ = 0;
    status_ = false;
    return false;
  }
  *psocket = sock;
  return true;
}

// Close the socket.
bool NetworkThread::CloseSocket(int sock) {
  close(sock);
  return true;
}

// Initiate the tcp connection.
bool NetworkThread::Connect(int sock) {
  struct sockaddr_in dest_addr;
  dest_addr.sin_family = AF_INET;
  dest_addr.sin_port = htons(kNetworkPort);
  memset(&(dest_addr.sin_zero), '\0', sizeof(dest_addr.sin_zero));

  // Translate dot notation to u32.
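  // inet_aton() only parses numeric dotted-quad addresses; hostnames are
  // not resolved here, hence the "Cannot resolve" error below.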
  if (inet_aton(ipaddr_, &dest_addr.sin_addr) == 0) {
    logprintf(0, "Process Error: Cannot resolve %s\n", ipaddr_);
    pages_copied_ = 0;
    status_ = false;
    return false;
  }

  if (-1 == connect(sock, reinterpret_cast<struct sockaddr *>(&dest_addr),
                    sizeof(struct sockaddr))) {
    logprintf(0, "Process Error: Cannot connect %s\n", ipaddr_);
    pages_copied_ = 0;
    status_ = false;
    return false;
  }
  return true;
}

// Initiate the tcp connection.
bool NetworkListenThread::Listen() {
  struct sockaddr_in sa;

  memset(&(sa.sin_zero), '\0', sizeof(sa.sin_zero));

  sa.sin_family = AF_INET;
  sa.sin_addr.s_addr = INADDR_ANY;
  sa.sin_port = htons(kNetworkPort);

  if (-1 == bind(sock_, (struct sockaddr*)&sa, sizeof(struct sockaddr))) {
    char buf[256];
    sat_strerror(errno, buf, sizeof(buf));
    logprintf(0, "Process Error: Cannot bind socket: %s\n", buf);
    pages_copied_ = 0;
    status_ = false;
    return false;
  }
  listen(sock_, 3);
  return true;
}

// Wait for a connection from a network traffic generation thread.
bool NetworkListenThread::Wait() {
  fd_set rfds;
  struct timeval tv;
  int retval;

  // Watch sock_ to see when it has input.
  FD_ZERO(&rfds);
  FD_SET(sock_, &rfds);
  // Wait up to five seconds.
  tv.tv_sec = 5;
  tv.tv_usec = 0;

  retval = select(sock_ + 1, &rfds, NULL, NULL, &tv);

  return (retval > 0);
}

// Wait for a connection from a network traffic generation thread.
bool NetworkListenThread::GetConnection(int *pnewsock) {
  struct sockaddr_in sa;
  socklen_t size = sizeof(struct sockaddr_in);

  int newsock = accept(sock_, reinterpret_cast<struct sockaddr *>(&sa),
                       &size);
  if (newsock < 0) {
    logprintf(0, "Process Error: Did not receive connection\n");
    pages_copied_ = 0;
    status_ = false;
    return false;
  }
  *pnewsock = newsock;
  return true;
}

// Send a page, return false if a page was not sent.
bool NetworkThread::SendPage(int sock, struct page_entry *src) {
  int page_length = sat_->page_length();
  char *address = static_cast<char*>(src->addr);

  // Send our data over the network.
  int size = page_length;
  while (size) {
    int transferred = send(sock, address + (page_length - size), size, 0);
    if ((transferred == 0) || (transferred == -1)) {
      if (!IsNetworkStopSet()) {
        char buf[256] = "";
        sat_strerror(errno, buf, sizeof(buf));
        logprintf(0, "Process Error: Thread %d, "
                     "Network write failed, bailing. (%s)\n",
                  thread_num_, buf);
        status_ = false;
      }
      return false;
    }
    size = size - transferred;
  }
  return true;
}

// Receive a page. Return false if a page was not received.
bool NetworkThread::ReceivePage(int sock, struct page_entry *dst) {
  int page_length = sat_->page_length();
  char *address = static_cast<char*>(dst->addr);

  // Maybe we will get our data back again, maybe not.
  int size = page_length;
  while (size) {
    int transferred = recv(sock, address + (page_length - size), size, 0);
    if ((transferred == 0) || (transferred == -1)) {
      // Typically the network slave thread should exit as the network master
      // thread stops sending data.
      if (IsNetworkStopSet()) {
        int err = errno;
        if (transferred == 0 && err == 0) {
          // Two system setups will not sync exactly,
          // allow early exit, but log it.
          logprintf(0, "Log: Net thread did not receive any data, "
                       "exiting.\n");
        } else {
          char buf[256] = "";
          sat_strerror(err, buf, sizeof(buf));
          // Print why we failed.
          logprintf(0, "Process Error: Thread %d, "
                       "Network read failed, bailing (%s).\n",
                    thread_num_, buf);
          status_ = false;
          // Print arguments and results.
logprintf(0, "Log: recv(%d, address %x, size %x, 0) == %x, err %d\n", sock, address + (page_length - size), size, transferred, err); if ((transferred == 0) && (page_length - size < 512) && (page_length - size > 0)) { // Print null terminated data received, to see who's been // sending us supicious unwanted data. address[page_length - size] = 0; logprintf(0, "Log: received %d bytes: '%s'\n", page_length - size, address); } } } return false; } size = size - transferred; } return true; } // Network IO work loop. Execute until marked done. // Return true if the thread ran as expected. bool NetworkThread::Work() { logprintf(9, "Log: Starting network thread %d, ip %s\n", thread_num_, ipaddr_); // Make a socket. int sock = 0; if (!CreateSocket(&sock)) return false; // Network IO loop requires network slave thread to have already initialized. // We will sleep here for awhile to ensure that the slave thread will be // listening by the time we connect. // Sleep for 15 seconds. sat_sleep(15); logprintf(9, "Log: Starting execution of network thread %d, ip %s\n", thread_num_, ipaddr_); // Connect to a slave thread. if (!Connect(sock)) return false; // Loop until done. bool result = true; int strict = sat_->strict(); int64 loops = 0; while (IsReadyToRun()) { struct page_entry src; struct page_entry dst; result = result && sat_->GetValid(&src); result = result && sat_->GetEmpty(&dst); if (!result) { logprintf(0, "Process Error: net_thread failed to pop pages, " "bailing\n"); break; } // Check data correctness. if (strict) CrcCheckPage(&src); // Do the network write. if (!(result = result && SendPage(sock, &src))) break; // Update pattern reference to reflect new contents. dst.pattern = src.pattern; // Do the network read. if (!(result = result && ReceivePage(sock, &dst))) break; // Ensure that the transfer ended up with correct data. if (strict) CrcCheckPage(&dst); // Return all of our pages to the queue. result = result && sat_->PutValid(&dst); result = result && sat_->PutEmpty(&src); if (!result) { logprintf(0, "Process Error: net_thread failed to push pages, " "bailing\n"); break; } loops++; } pages_copied_ = loops; status_ = result; // Clean up. CloseSocket(sock); logprintf(9, "Log: Completed %d: network thread status %d, " "%d pages copied\n", thread_num_, status_, pages_copied_); return result; } // Spawn slave threads for incoming connections. bool NetworkListenThread::SpawnSlave(int newsock, int threadid) { logprintf(12, "Log: Listen thread spawning slave\n"); // Spawn slave thread, to reflect network traffic back to sender. ChildWorker *child_worker = new ChildWorker; child_worker->thread.SetSock(newsock); child_worker->thread.InitThread(threadid, sat_, os_, patternlist_, &child_worker->status); child_worker->status.Initialize(); child_worker->thread.SpawnThread(); child_workers_.push_back(child_worker); return true; } // Reap slave threads. bool NetworkListenThread::ReapSlaves() { bool result = true; // Gather status and reap threads. 
logprintf(12, "Log: Joining all outstanding threads\n"); for (size_t i = 0; i < child_workers_.size(); i++) { NetworkSlaveThread& child_thread = child_workers_[i]->thread; logprintf(12, "Log: Joining slave thread %d\n", i); child_thread.JoinThread(); if (child_thread.GetStatus() != 1) { logprintf(0, "Process Error: Slave Thread %d failed with status %d\n", i, child_thread.GetStatus()); result = false; } errorcount_ += child_thread.GetErrorCount(); logprintf(9, "Log: Slave Thread %d found %lld miscompares\n", i, child_thread.GetErrorCount()); pages_copied_ += child_thread.GetPageCount(); } return result; } // Network listener IO work loop. Execute until marked done. // Return false on fatal software error. bool NetworkListenThread::Work() { logprintf(9, "Log: Starting network listen thread %d\n", thread_num_); // Make a socket. sock_ = 0; if (!CreateSocket(&sock_)) { status_ = false; return false; } logprintf(9, "Log: Listen thread created sock\n"); // Allows incoming connections to be queued up by socket library. int newsock = 0; Listen(); logprintf(12, "Log: Listen thread waiting for incoming connections\n"); // Wait on incoming connections, and spawn worker threads for them. int threadcount = 0; while (IsReadyToRun()) { // Poll for connections that we can accept(). if (Wait()) { // Accept those connections. logprintf(12, "Log: Listen thread found incoming connection\n"); if (GetConnection(&newsock)) { SpawnSlave(newsock, threadcount); threadcount++; } } } // Gather status and join spawned threads. ReapSlaves(); // Delete the child workers. for (ChildVector::iterator it = child_workers_.begin(); it != child_workers_.end(); ++it) { (*it)->status.Destroy(); delete *it; } child_workers_.clear(); CloseSocket(sock_); status_ = true; logprintf(9, "Log: Completed %d: network listen thread status %d, " "%d pages copied\n", thread_num_, status_, pages_copied_); return true; } // Set network reflector socket struct. void NetworkSlaveThread::SetSock(int sock) { sock_ = sock; } // Network reflector IO work loop. Execute until marked done. // Return false on fatal software error. bool NetworkSlaveThread::Work() { logprintf(9, "Log: Starting network slave thread %d\n", thread_num_); // Verify that we have a socket. int sock = sock_; if (!sock) { status_ = false; return false; } // Loop until done. int64 loops = 0; // Init a local buffer for storing data. void *local_page = NULL; #ifdef HAVE_POSIX_MEMALIGN int result = posix_memalign(&local_page, 512, sat_->page_length()); #else local_page = memalign(512, sat_->page_length()); int result = (local_page == 0); #endif if (result) { logprintf(0, "Process Error: net slave posix_memalign " "returned %d (fail)\n", result); status_ = false; return false; } struct page_entry page; page.addr = local_page; // This thread will continue to run as long as the thread on the other end of // the socket is still sending and receiving data. while (1) { // Do the network read. if (!ReceivePage(sock, &page)) break; // Do the network write. if (!SendPage(sock, &page)) break; loops++; } pages_copied_ = loops; // No results provided from this type of thread. status_ = true; // Clean up. CloseSocket(sock); logprintf(9, "Log: Completed %d: network slave thread status %d, " "%d pages copied\n", thread_num_, status_, pages_copied_); return true; } // Thread work loop. Execute until marked finished. bool ErrorPollThread::Work() { logprintf(9, "Log: Starting system error poll thread %d\n", thread_num_); // This calls a generic error polling function in the Os abstraction layer. 
// Thread work loop. Execute until marked finished.
bool ErrorPollThread::Work() {
  logprintf(9, "Log: Starting system error poll thread %d\n", thread_num_);

  // This calls a generic error polling function in the Os abstraction layer.
  do {
    errorcount_ += os_->ErrorPoll();
    os_->ErrorWait();
  } while (IsReadyToRun());

  logprintf(9, "Log: Finished system error poll thread %d: %d errors\n",
            thread_num_, errorcount_);
  status_ = true;
  return true;
}

// Worker thread to heat up the CPU.
// This thread does not evaluate pass/fail or software error.
bool CpuStressThread::Work() {
  logprintf(9, "Log: Starting CPU stress thread %d\n", thread_num_);

  do {
    // Run ludloff's platform/CPU-specific assembly workload.
    os_->CpuStressWorkload();
    YieldSelf();
  } while (IsReadyToRun());

  logprintf(9, "Log: Finished CPU stress thread %d:\n",
            thread_num_);
  status_ = true;
  return true;
}

CpuCacheCoherencyThread::CpuCacheCoherencyThread(cc_cacheline_data *data,
                                                 int cacheline_count,
                                                 int thread_num,
                                                 int inc_count) {
  cc_cacheline_data_ = data;
  cc_cacheline_count_ = cacheline_count;
  cc_thread_num_ = thread_num;
  cc_inc_count_ = inc_count;
}

// Worker thread to test the cache coherency of the CPUs.
// Return false on fatal sw error.
bool CpuCacheCoherencyThread::Work() {
  logprintf(9, "Log: Starting the Cache Coherency thread %d\n",
            cc_thread_num_);
  uint64 time_start, time_end;
  struct timeval tv;

  unsigned int seed = static_cast<unsigned int>(gettid());
  gettimeofday(&tv, NULL);  // Get the timestamp before increments.
  time_start = tv.tv_sec * 1000000ULL + tv.tv_usec;

  uint64 total_inc = 0;  // Total increments done by the thread.
  while (IsReadyToRun()) {
    for (int i = 0; i < cc_inc_count_; i++) {
      // Choose a cache line structure at random and increment this thread's
      // member in it (the offset into the structure is the same as the
      // thread number).
#ifdef HAVE_RAND_R
      int r = rand_r(&seed);
#else
      int r = rand();
#endif
      // Scale r into the range [0, cc_cacheline_count_).
      r = cc_cacheline_count_ * (r / (RAND_MAX + 1.0));
      // Increment the member of the randomly selected structure.
      (cc_cacheline_data_[r].num[cc_thread_num_])++;
    }

    total_inc += cc_inc_count_;

    // Check that this thread's local increment count matches the sum of its
    // members across all the cache line structures.
    int cc_global_num = 0;
    for (int cline_num = 0; cline_num < cc_cacheline_count_; cline_num++) {
      cc_global_num += cc_cacheline_data_[cline_num].num[cc_thread_num_];
      // Reset the cache line member's value for the next run.
      cc_cacheline_data_[cline_num].num[cc_thread_num_] = 0;
    }

    if (sat_->error_injection())
      cc_global_num = -1;

    if (cc_global_num != cc_inc_count_) {
      errorcount_++;
      logprintf(0, "Hardware Error: global(%d) and local(%d) do not match\n",
                cc_global_num, cc_inc_count_);
    }
  }
  gettimeofday(&tv, NULL);  // Get the timestamp at the end.
  time_end = tv.tv_sec * 1000000ULL + tv.tv_usec;

  uint64 us_elapsed = time_end - time_start;
  // inc_rate is the number of increments per second.
  double inc_rate = total_inc * 1e6 / us_elapsed;

  logprintf(4, "Stats: CC Thread(%d): Time=%llu us,"
               " Increments=%llu, Increments/sec = %.6lf\n",
            cc_thread_num_, us_elapsed, total_inc, inc_rate);
  logprintf(9, "Log: Finished CPU Cache Coherency thread %d:\n",
            cc_thread_num_);
  status_ = true;
  return true;
}
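// Worked example of the invariant checked above (illustrative numbers):
// with cc_inc_count_ == 1000 and, say, 4 cache line structures, thread k
// scatters 1000 increments of num[k] across the 4 structures. Summing
// num[k] over all structures must give exactly 1000; if CPUs ever lose an
// increment to a stale cache line, the sum comes up short and is reported
// as a hardware error. Each thread only ever writes its own slot, so a
// miscount isolates a coherency fault rather than an ordinary data race.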
DiskThread::DiskThread(DiskBlockTable *block_table) {
  read_block_size_ = kSectorSize;   // default 1 sector (512 bytes)
  write_block_size_ = kSectorSize;  // this assumes read and write block size
                                    // are the same
  segment_size_ = -1;               // use the entire disk as one segment
  cache_size_ = 16 * 1024 * 1024;   // assume a 16MiB disk cache by default

  // Use a queue such that 3/2 times as much data as the cache can hold
  // is written before it is read, so that there is little chance the read
  // data is still in the cache.
  queue_size_ = ((cache_size_ / write_block_size_) * 3) / 2;
  blocks_per_segment_ = 32;

  read_threshold_ = 100000;   // 100ms is a reasonable limit for
  write_threshold_ = 100000;  // reading/writing a sector

  read_timeout_ = 5000000;    // 5 seconds should be a long enough timeout
  write_timeout_ = 5000000;   // for reading/writing the device

  device_sectors_ = 0;
  non_destructive_ = 0;

#ifdef HAVE_LIBAIO_H
  aio_ctx_ = 0;
#endif
  block_table_ = block_table;
  update_block_table_ = 1;

  block_buffer_ = NULL;

  blocks_written_ = 0;
  blocks_read_ = 0;
}
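// Worked example with the defaults above: a 16 MiB cache and 512-byte
// write blocks give queue_size_ = ((16777216 / 512) * 3) / 2 = 49152
// blocks, i.e. 24 MiB of writes (150% of the cache) are issued before the
// first read, so the oldest block should already have been evicted from
// the drive's cache by the time it is read back.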
DiskThread::~DiskThread() {
  if (block_buffer_)
    free(block_buffer_);
}

// Set the filename for the device file (in /dev).
void DiskThread::SetDevice(const char *device_name) {
  device_name_ = device_name;
}

// Set various parameters that control the behaviour of the test.
// -1 is used as a sentinel value on each parameter (except non_destructive)
// to indicate that the parameter should not be set.
bool DiskThread::SetParameters(int read_block_size,
                               int write_block_size,
                               int64 segment_size,
                               int64 cache_size,
                               int blocks_per_segment,
                               int64 read_threshold,
                               int64 write_threshold,
                               int non_destructive) {
  if (read_block_size != -1) {
    // Blocks must be aligned to the disk's sector size.
    if (read_block_size % kSectorSize != 0) {
      logprintf(0, "Process Error: Block size must be a multiple of %d "
                   "(thread %d).\n",
                kSectorSize, thread_num_);
      return false;
    }

    read_block_size_ = read_block_size;
  }

  if (write_block_size != -1) {
    // Write blocks must be aligned to the disk's sector size and to the
    // read block size.
    if (write_block_size % kSectorSize != 0) {
      logprintf(0, "Process Error: Write block size must be a multiple "
                   "of %d (thread %d).\n",
                kSectorSize, thread_num_);
      return false;
    }
    if (write_block_size % read_block_size_ != 0) {
      logprintf(0, "Process Error: Write block size must be a multiple "
                   "of the read block size, which is %d (thread %d).\n",
                read_block_size_, thread_num_);
      return false;
    }

    write_block_size_ = write_block_size;

  } else {
    // Make sure write_block_size_ is still valid.
    if (read_block_size_ > write_block_size_) {
      logprintf(5, "Log: Assuming write block size equal to read block size, "
                   "which is %d (thread %d).\n",
                read_block_size_, thread_num_);
      write_block_size_ = read_block_size_;
    } else {
      if (write_block_size_ % read_block_size_ != 0) {
        logprintf(0, "Process Error: Write block size (defined as %d) must "
                     "be a multiple of the read block size, which is %d "
                     "(thread %d).\n",
                  write_block_size_, read_block_size_, thread_num_);
        return false;
      }
    }
  }

  if (cache_size != -1) {
    cache_size_ = cache_size;
  }

  if (blocks_per_segment != -1) {
    if (blocks_per_segment <= 0) {
      logprintf(0, "Process Error: Blocks per segment must be greater than "
                   "zero (thread %d).\n",
                thread_num_);
      return false;
    }

    blocks_per_segment_ = blocks_per_segment;
  }

  if (read_threshold != -1) {
    if (read_threshold <= 0) {
      logprintf(0, "Process Error: Read threshold must be greater than "
                   "zero (thread %d).\n",
                thread_num_);
      return false;
    }

    read_threshold_ = read_threshold;
  }

  if (write_threshold != -1) {
    if (write_threshold <= 0) {
      logprintf(0, "Process Error: Write threshold must be greater than "
                   "zero (thread %d).\n",
                thread_num_);
      return false;
    }

    write_threshold_ = write_threshold;
  }

  if (segment_size != -1) {
    // Segments must be aligned to the disk's sector size.
    if (segment_size % kSectorSize != 0) {
      logprintf(0, "Process Error: Segment size must be a multiple of %d"
                   " (thread %d).\n",
                kSectorSize, thread_num_);
      return false;
    }

    segment_size_ = segment_size / kSectorSize;
  }

  non_destructive_ = non_destructive;

  // Having a queue of 150% of the blocks that will fit in the disk's cache
  // should be enough to force out the oldest block before it is read, hence
  // making sure the data comes from the disk and not from the cache.
  queue_size_ = ((cache_size_ / write_block_size_) * 3) / 2;

  // Update the DiskBlockTable parameters.
  if (update_block_table_) {
    block_table_->SetParameters(kSectorSize, write_block_size_,
                                device_sectors_, segment_size_,
                                device_name_);
  }

  return true;
}
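// Illustrative call (hypothetical values): test 1 MiB segments of a drive
// with a 32 MiB on-board cache, reading 4 KiB at a time and writing 64 KiB
// at a time, keeping the default latency thresholds. All sizes satisfy the
// alignment rules enforced above:
//
//   disk_thread->SetParameters(4096,              // read_block_size
//                              65536,             // write_block_size
//                              1048576,           // segment_size
//                              32 * 1024 * 1024,  // cache_size
//                              32,                // blocks_per_segment
//                              -1, -1,            // keep both thresholds
//                              0);                // destructive mode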
// Open a device, return false on failure.
bool DiskThread::OpenDevice(int *pfile) {
  bool no_O_DIRECT = false;
  int flags = O_RDWR | O_SYNC | O_LARGEFILE;
  int fd = open(device_name_.c_str(), flags | O_DIRECT, 0);
  if (O_DIRECT != 0 && fd < 0 && errno == EINVAL) {
    no_O_DIRECT = true;
    fd = open(device_name_.c_str(), flags, 0);  // Try without O_DIRECT.
  }
  if (fd < 0) {
    logprintf(0, "Process Error: Failed to open device %s (thread %d)!!\n",
              device_name_.c_str(), thread_num_);
    return false;
  }
  if (no_O_DIRECT)
    os_->ActivateFlushPageCache();
  *pfile = fd;

  return GetDiskSize(fd);
}

// Retrieve the size (in bytes) of the disk/file.
// Return false on failure.
bool DiskThread::GetDiskSize(int fd) {
  struct stat device_stat;
  if (fstat(fd, &device_stat) == -1) {
    logprintf(0, "Process Error: Unable to fstat disk %s (thread %d).\n",
              device_name_.c_str(), thread_num_);
    return false;
  }

  // For a block device, an ioctl is needed to get the size since the size
  // of the device file (i.e. /dev/sdb) is 0.
  if (S_ISBLK(device_stat.st_mode)) {
    uint64 block_size = 0;

    if (ioctl(fd, BLKGETSIZE64, &block_size) == -1) {
      logprintf(0, "Process Error: Unable to ioctl disk %s (thread %d).\n",
                device_name_.c_str(), thread_num_);
      return false;
    }

    // A size of zero indicates a nonworking device.
    if (block_size == 0) {
      os_->ErrorReport(device_name_.c_str(), "device-size-zero", 1);
      ++errorcount_;
      status_ = true;  // Avoid a procedural error.
      return false;
    }

    device_sectors_ = block_size / kSectorSize;

  } else if (S_ISREG(device_stat.st_mode)) {
    device_sectors_ = device_stat.st_size / kSectorSize;

  } else {
    logprintf(0, "Process Error: %s is not a regular file or block "
                 "device (thread %d).\n",
              device_name_.c_str(), thread_num_);
    return false;
  }

  logprintf(12, "Log: Device sectors: %lld on disk %s (thread %d).\n",
            device_sectors_, device_name_.c_str(), thread_num_);

  if (update_block_table_) {
    block_table_->SetParameters(kSectorSize, write_block_size_,
                                device_sectors_, segment_size_,
                                device_name_);
  }

  return true;
}

bool DiskThread::CloseDevice(int fd) {
  close(fd);
  return true;
}

// Return the time in microseconds.
int64 DiskThread::GetTime() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return tv.tv_sec * 1000000 + tv.tv_usec;
}

// Do randomized reads and (possibly) writes on a device.
// Return false on fatal SW error, true on SW success,
// regardless of whether the HW failed.
bool DiskThread::DoWork(int fd) {
  int64 block_num = 0;
  int64 num_segments;

  if (segment_size_ == -1) {
    num_segments = 1;
  } else {
    num_segments = device_sectors_ / segment_size_;
    if (device_sectors_ % segment_size_ != 0)
      num_segments++;
  }

  // The disk size should be at least 3x the cache size. See the comment
  // later in this function for details.
  sat_assert(device_sectors_ * kSectorSize > 3 * cache_size_);

  // This disk test works by writing blocks with a certain pattern to
  // disk, then reading them back and verifying them against the pattern
  // at a later time. A failure happens when either the block cannot
  // be written/read or when the read block is different from what was
  // written. If a block takes too long to write/read, then a warning
  // is given instead of an error, since taking too long is not
  // necessarily an error.
  //
  // To prevent the read blocks from coming from the disk cache,
  // enough blocks are written before being read back that a block will
  // have been ejected from the disk cache by the time it is read.
  //
  // TODO(amistry): Implement some sort of read/write throttling. The
  //                flood of asynchronous I/O requests when a drive is
  //                unplugged is causing the application and kernel to
  //                become unresponsive.

  while (IsReadyToRun()) {
    // Write blocks to disk.
    logprintf(16, "Log: Write phase %sfor disk %s (thread %d).\n",
              non_destructive_ ? "(disabled) " : "",
              device_name_.c_str(), thread_num_);
    while (IsReadyToRunNoPause() &&
           in_flight_sectors_.size() <
               static_cast<size_t>(queue_size_ + 1)) {
      // Confine testing to a particular segment of the disk.
      int64 segment = (block_num / blocks_per_segment_) % num_segments;
      if (!non_destructive_ &&
          (block_num % blocks_per_segment_ == 0)) {
        logprintf(20, "Log: Starting to write segment %lld out of "
                      "%lld on disk %s (thread %d).\n",
                  segment, num_segments, device_name_.c_str(),
                  thread_num_);
      }
      block_num++;

      BlockData *block = block_table_->GetUnusedBlock(segment);

      // If an unused sequence of sectors could not be found, skip to the
      // next block to process. Soon, a new segment will come and new
      // sectors will be able to be allocated. This effectively puts a
      // minimum on the disk size of 3x the stated cache size, or 48MiB
      // if a cache size is not given (since the cache is set to 16MiB
      // by default). Given that today's caches are in the low MiB range
      // and drive sizes in the mid GB range, this shouldn't pose a problem.
      // The 3x minimum comes from the following:
      //   1. In order to allocate 'y' blocks from a segment, the
      //      segment must contain at least 2y blocks or else an
      //      allocation may not succeed.
      //   2. Assume the entire disk is one segment.
      //   3. A full write phase consists of writing blocks corresponding to
      //      3/2 of the cache size.
      //   4. Therefore, the one segment must have 2 * 3/2 * cache
      //      size worth of blocks = 3 * cache size worth of blocks
      //      to complete.
      if (block == 0)
        continue;

      // In non-destructive mode, don't write anything to disk.
      if (!non_destructive_) {
        if (!WriteBlockToDisk(fd, block)) {
          block_table_->RemoveBlock(block);
          return true;
        }
        blocks_written_++;
      }

      // The block is either initialized by writing it, or, in the
      // non-destructive case, initialized by being added into the data
      // structure for later reading.
      block->SetBlockAsInitialized();

      in_flight_sectors_.push(block);
    }
    if (!os_->FlushPageCache())  // If O_DIRECT worked, this will be a NOP.
      return false;

    // Verify blocks on disk.
    logprintf(20, "Log: Read phase for disk %s (thread %d).\n",
              device_name_.c_str(), thread_num_);
    while (IsReadyToRunNoPause() && !in_flight_sectors_.empty()) {
      BlockData *block = in_flight_sectors_.front();
      in_flight_sectors_.pop();
      if (!ValidateBlockOnDisk(fd, block))
        return true;
      block_table_->RemoveBlock(block);
      blocks_read_++;
    }
  }

  pages_copied_ = blocks_written_ + blocks_read_;
  return true;
}
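// Worked example of the segment walk above (illustrative numbers): with
// the default blocks_per_segment_ of 32 and, say, a 1 GiB device split
// into 1 MiB segments (num_segments == 1024), blocks 0-31 land in segment
// 0, blocks 32-63 in segment 1, and so on, wrapping back to segment 0
// after block 32767. Each write phase therefore spreads its queue of
// blocks across many distinct regions of the disk rather than hammering
// one spot.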
// Do an asynchronous disk I/O operation.
// Return false if the IO is not set up.
bool DiskThread::AsyncDiskIO(IoOp op, int fd, void *buf, int64 size,
                             int64 offset, int64 timeout) {
#ifdef HAVE_LIBAIO_H
  // Use the Linux native asynchronous I/O interface for reading/writing.
  // A read/write consists of three basic steps:
  //    1. create an io context.
  //    2. prepare and submit an io request to the context.
  //    3. wait for an event on the context.

  struct {
    const int opcode;
    const char *op_str;
    const char *error_str;
  } operations[2] = {
    { IO_CMD_PREAD, "read", "disk-read-error" },
    { IO_CMD_PWRITE, "write", "disk-write-error" }
  };

  struct iocb cb;
  memset(&cb, 0, sizeof(cb));

  cb.aio_fildes = fd;
  cb.aio_lio_opcode = operations[op].opcode;
  cb.u.c.buf = buf;
  cb.u.c.nbytes = size;
  cb.u.c.offset = offset;

  struct iocb *cbs[] = { &cb };
  if (io_submit(aio_ctx_, 1, cbs) != 1) {
    int error = errno;
    char buf[256];
    sat_strerror(error, buf, sizeof(buf));
    logprintf(0, "Process Error: Unable to submit async %s "
                 "on disk %s (thread %d). Error %d, %s\n",
              operations[op].op_str, device_name_.c_str(),
              thread_num_, error, buf);
    return false;
  }

  struct io_event event;
  memset(&event, 0, sizeof(event));
  struct timespec tv;
  tv.tv_sec = timeout / 1000000;
  tv.tv_nsec = (timeout % 1000000) * 1000;
  if (io_getevents(aio_ctx_, 1, 1, &event, &tv) != 1) {
    // A ctrl-c from the keyboard will cause io_getevents to fail with an
    // EINTR error code. This is not an error and so don't treat it as such,
    // but still log it.
    int error = errno;
    if (error == EINTR) {
      logprintf(5, "Log: %s interrupted on disk %s (thread %d).\n",
                operations[op].op_str, device_name_.c_str(),
                thread_num_);
    } else {
      os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);
      errorcount_ += 1;
      logprintf(0, "Hardware Error: Timeout doing async %s to sectors "
                   "starting at %lld on disk %s (thread %d).\n",
                operations[op].op_str, offset / kSectorSize,
                device_name_.c_str(), thread_num_);
    }

    // Don't bother checking the return code, since io_cancel seems to always
    // fail. Because io_cancel is always failing, destroying and recreating
    // an I/O context is a workaround for canceling an in-progress I/O
    // operation.
    // TODO(amistry): Find out why io_cancel isn't working and make it work.
    io_cancel(aio_ctx_, &cb, &event);
    io_destroy(aio_ctx_);
    aio_ctx_ = 0;
    if (io_setup(5, &aio_ctx_)) {
      int error = errno;
      char buf[256];
      sat_strerror(error, buf, sizeof(buf));
      logprintf(0, "Process Error: Unable to create aio context on disk %s"
                   " (thread %d) Error %d, %s\n",
                device_name_.c_str(), thread_num_, error, buf);
    }

    return false;
  }

  // event.res contains the number of bytes written/read or
  // error if < 0, I think.
  if (event.res != static_cast<uint64>(size)) {
    errorcount_++;
    os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);

    if (event.res < 0) {
      switch (event.res) {
        case -EIO:
          logprintf(0, "Hardware Error: Low-level I/O error while doing %s to "
                       "sectors starting at %lld on disk %s (thread %d).\n",
                    operations[op].op_str, offset / kSectorSize,
                    device_name_.c_str(), thread_num_);
          break;
        default:
          logprintf(0, "Hardware Error: Unknown error while doing %s to "
                       "sectors starting at %lld on disk %s (thread %d).\n",
                    operations[op].op_str, offset / kSectorSize,
                    device_name_.c_str(), thread_num_);
      }
    } else {
      logprintf(0, "Hardware Error: Unable to %s to sectors starting at "
                   "%lld on disk %s (thread %d).\n",
                operations[op].op_str, offset / kSectorSize,
                device_name_.c_str(), thread_num_);
    }
    return false;
  }

  return true;
#else  // !HAVE_LIBAIO_H
  return false;
#endif
}
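// Note on the libaio flow above: io_submit() queues the single iocb,
// io_getevents() blocks for at most 'timeout' microseconds, and on
// completion event.res holds either the byte count transferred or a
// negated errno value (e.g. -EIO), which is why it is compared against
// 'size' and then inspected for specific negative codes.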
// Write a block to disk.
// Return false if the block is not written.
bool DiskThread::WriteBlockToDisk(int fd, BlockData *block) {
  memset(block_buffer_, 0, block->GetSize());

  // Fill the block buffer with a pattern.
  struct page_entry pe;
  if (!sat_->GetValid(&pe)) {
    // Even though a valid page could not be obtained, it is not an error
    // since we can always fill in a pattern directly, albeit slower.
    unsigned int *memblock = static_cast<unsigned int *>(block_buffer_);
    block->SetPattern(patternlist_->GetRandomPattern());

    logprintf(11, "Log: Warning, using pattern fill fallback in "
                  "DiskThread::WriteBlockToDisk on disk %s (thread %d).\n",
              device_name_.c_str(), thread_num_);

    for (int i = 0; i < block->GetSize()/wordsize_; i++) {
      memblock[i] = block->GetPattern()->pattern(i);
    }
  } else {
    memcpy(block_buffer_, pe.addr, block->GetSize());
    block->SetPattern(pe.pattern);
    sat_->PutValid(&pe);
  }

  logprintf(12, "Log: Writing %lld sectors starting at %lld on disk %s"
               " (thread %d).\n",
            block->GetSize()/kSectorSize, block->GetAddress(),
            device_name_.c_str(), thread_num_);

  int64 start_time = GetTime();

  if (!AsyncDiskIO(ASYNC_IO_WRITE, fd, block_buffer_, block->GetSize(),
                   block->GetAddress() * kSectorSize, write_timeout_)) {
    return false;
  }

  int64 end_time = GetTime();
  logprintf(12, "Log: Writing time: %lld us (thread %d).\n",
            end_time - start_time, thread_num_);
  if (end_time - start_time > write_threshold_) {
    logprintf(5, "Log: Write took %lld us which is longer than the threshold "
                 "%lld us on disk %s (thread %d).\n",
              end_time - start_time, write_threshold_,
              device_name_.c_str(), thread_num_);
  }

  return true;
}
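// Worked example of the random read split used in ValidateBlockOnDisk
// below (illustrative numbers): a 64 KiB written block with a 4 KiB read
// block size contains 16 read blocks. One validation pass might read them
// as chunks of, say, 5, 9, and 2 read blocks; every byte of the written
// block is still covered exactly once, but the I/O sizes and boundaries
// differ from pass to pass.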
// Verify a block on disk.
// Return true if the block was read; also increment errorcount
// if the block had data errors or performance problems.
bool DiskThread::ValidateBlockOnDisk(int fd, BlockData *block) {
  int64 blocks = block->GetSize() / read_block_size_;
  int64 bytes_read = 0;
  int64 current_blocks;
  int64 current_bytes;
  uint64 address = block->GetAddress();

  logprintf(20, "Log: Reading sectors starting at %lld on disk %s "
                "(thread %d).\n",
            address, device_name_.c_str(), thread_num_);

  // Read the block from disk and time the read. If it takes longer than the
  // threshold, complain.
  if (lseek64(fd, address * kSectorSize, SEEK_SET) == -1) {
    logprintf(0, "Process Error: Unable to seek to sector %lld in "
                 "DiskThread::ValidateSectorsOnDisk on disk %s "
                 "(thread %d).\n",
              address, device_name_.c_str(), thread_num_);
    return false;
  }
  int64 start_time = GetTime();

  // Split a large write-sized block into small read-sized blocks and
  // read them in groups of randomly-sized multiples of the read block size.
  // This ensures that all data written to disk by this particular block
  // will be tested using a random reading pattern.
  while (blocks != 0) {
    // Test all read blocks in a written block.
    current_blocks = (random() % blocks) + 1;
    current_bytes = current_blocks * read_block_size_;

    memset(block_buffer_, 0, current_bytes);

    logprintf(20, "Log: Reading %lld sectors starting at sector %lld on "
                  "disk %s (thread %d)\n",
              current_bytes / kSectorSize,
              (address * kSectorSize + bytes_read) / kSectorSize,
              device_name_.c_str(), thread_num_);

    if (!AsyncDiskIO(ASYNC_IO_READ, fd, block_buffer_, current_bytes,
                     address * kSectorSize + bytes_read,
                     write_timeout_)) {
      return false;
    }

    int64 end_time = GetTime();
    logprintf(20, "Log: Reading time: %lld us (thread %d).\n",
              end_time - start_time, thread_num_);
    if (end_time - start_time > read_threshold_) {
      logprintf(5, "Log: Read took %lld us which is longer than the threshold "
                   "%lld us on disk %s (thread %d).\n",
                end_time - start_time, read_threshold_,
                device_name_.c_str(), thread_num_);
    }

    // In non-destructive mode, don't compare the block to the pattern since
    // the block was never written to disk in the first place.
    if (!non_destructive_) {
      if (CheckRegion(block_buffer_, block->GetPattern(), current_bytes,
                      0, bytes_read)) {
        os_->ErrorReport(device_name_.c_str(), "disk-pattern-error", 1);
        errorcount_ += 1;
        logprintf(0, "Hardware Error: Pattern mismatch in block starting at "
                     "sector %lld in DiskThread::ValidateSectorsOnDisk on "
                     "disk %s (thread %d).\n",
                  address, device_name_.c_str(), thread_num_);
      }
    }

    bytes_read += current_blocks * read_block_size_;
    blocks -= current_blocks;
  }

  return true;
}

// Direct device access thread.
// Return false on software error.
bool DiskThread::Work() {
  int fd;

  logprintf(9, "Log: Starting disk thread %d, disk %s\n",
            thread_num_, device_name_.c_str());

  srandom(time(NULL));

  if (!OpenDevice(&fd)) {
    status_ = false;
    return false;
  }

  // Allocate a block buffer aligned to 512 bytes, since the kernel requires
  // it when using direct IO.
#ifdef HAVE_POSIX_MEMALIGN
  int memalign_result = posix_memalign(&block_buffer_, kBufferAlignment,
                                       sat_->page_length());
#else
  block_buffer_ = memalign(kBufferAlignment, sat_->page_length());
  int memalign_result = (block_buffer_ == 0);
#endif
  if (memalign_result) {
    CloseDevice(fd);
    logprintf(0, "Process Error: Unable to allocate memory for buffers "
                 "for disk %s (thread %d) posix memalign returned %d.\n",
              device_name_.c_str(), thread_num_, memalign_result);
    status_ = false;
    return false;
  }

#ifdef HAVE_LIBAIO_H
  if (io_setup(5, &aio_ctx_)) {
    CloseDevice(fd);
    logprintf(0, "Process Error: Unable to create aio context for disk %s"
                 " (thread %d).\n",
              device_name_.c_str(), thread_num_);
    status_ = false;
    return false;
  }
#endif

  bool result = DoWork(fd);

  status_ = result;

#ifdef HAVE_LIBAIO_H
  io_destroy(aio_ctx_);
#endif
  CloseDevice(fd);

  logprintf(9, "Log: Completed %d (disk %s): disk thread status %d, "
               "%d pages copied\n",
            thread_num_, device_name_.c_str(), status_, pages_copied_);
  return result;
}

RandomDiskThread::RandomDiskThread(DiskBlockTable *block_table)
    : DiskThread(block_table) {
  update_block_table_ = 0;
}

RandomDiskThread::~RandomDiskThread() {
}
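// Note: RandomDiskThread reuses the DiskThread machinery but leaves
// update_block_table_ unset, so it never reconfigures the shared
// DiskBlockTable; it only picks random already-initialized blocks and
// re-validates them, as shown in DoWork() below.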
// Workload for the random disk thread.
bool RandomDiskThread::DoWork(int fd) {
  logprintf(11, "Log: Random phase for disk %s (thread %d).\n",
            device_name_.c_str(), thread_num_);
  while (IsReadyToRun()) {
    BlockData *block = block_table_->GetRandomBlock();
    if (block == NULL) {
      logprintf(12, "Log: No block available for device %s (thread %d).\n",
                device_name_.c_str(), thread_num_);
    } else {
      ValidateBlockOnDisk(fd, block);
      block_table_->ReleaseBlock(block);
      blocks_read_++;
    }
  }
  pages_copied_ = blocks_read_;
  return true;
}

MemoryRegionThread::MemoryRegionThread() {
  error_injection_ = false;
  pages_ = NULL;
}

MemoryRegionThread::~MemoryRegionThread() {
  if (pages_ != NULL)
    delete pages_;
}

// Set a region of memory or MMIO to be tested.
// Return false if the region could not be mapped.
bool MemoryRegionThread::SetRegion(void *region, int64 size) {
  int plength = sat_->page_length();
  int npages = size / plength;
  if (size % plength) {
    logprintf(0, "Process Error: region size is not a multiple of SAT "
                 "page length\n");
    return false;
  } else {
    if (pages_ != NULL)
      delete pages_;
    pages_ = new PageEntryQueue(npages);
    char *base_addr = reinterpret_cast<char*>(region);
    region_ = base_addr;
    for (int i = 0; i < npages; i++) {
      struct page_entry pe;
      init_pe(&pe);
      pe.addr = reinterpret_cast<void*>(base_addr + i * plength);
      pe.offset = i * plength;

      pages_->Push(&pe);
    }
    return true;
  }
}
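// Illustrative use of SetRegion (hypothetical addresses and sizes): hand
// the thread a 4 MiB mmap'd window, which it then carves into
// SAT-page-sized entries for the copy/check loop in Work() below:
//
//   void *window = mmap(NULL, 4 << 20, PROT_READ | PROT_WRITE,
//                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
//   if (window != MAP_FAILED)
//     region_thread->SetRegion(window, 4 << 20);  // Size must be a multiple
//                                                 // of sat_->page_length().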
// More detailed error printout for hardware errors in memory or MMIO
// regions.
void MemoryRegionThread::ProcessError(struct ErrorRecord *error,
                                      int priority,
                                      const char *message) {
  uint32 buffer_offset;
  if (phase_ == kPhaseCopy) {
    // If the error occurred during the copy phase, it means that the source
    // data (i.e., the main memory) is wrong. So just pass it to the original
    // ProcessError to report a bad-dimm error.
    WorkerThread::ProcessError(error, priority, message);
  } else if (phase_ == kPhaseCheck) {
    // An error in the check phase means that the memory region being tested
    // has an error. Gather more information and then report the error.

    // Determine if this is a write or read error.
    os_->Flush(error->vaddr);
    error->reread = *(error->vaddr);

    char *good = reinterpret_cast<char*>(&(error->expected));
    char *bad = reinterpret_cast<char*>(&(error->actual));

    sat_assert(error->expected != error->actual);
    unsigned int offset = 0;
    for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
      if (good[offset] != bad[offset])
        break;
    }

    error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;
    buffer_offset = error->vbyteaddr - region_;

    // Find the physical address if possible.
    error->paddr = os_->VirtualToPhysical(error->vbyteaddr);

    logprintf(priority,
              "%s: miscompare on %s, CRC check at %p(0x%llx), "
              "offset %llx: read:0x%016llx, reread:0x%016llx "
              "expected:0x%016llx\n",
              message, identifier_.c_str(),
              error->vaddr, error->paddr,
              buffer_offset,
              error->actual,
              error->reread,
              error->expected);
  } else {
    logprintf(0, "Process Error: memory region thread raised an "
                 "unexpected error.");
  }
}

// Workload for testing memory or MMIO regions.
// Return false on software error.
bool MemoryRegionThread::Work() {
  struct page_entry source_pe;
  struct page_entry memregion_pe;
  bool result = true;
  int64 loops = 0;
  const uint64 error_constant = 0x00ba00000000ba00LL;

  // For error injection.
  int64 *addr = 0x0;
  int offset = 0;
  int64 data = 0;

  logprintf(9, "Log: Starting Memory Region thread %d\n", thread_num_);

  while (IsReadyToRun()) {
    // Get pages from SAT and from the region queue.
    phase_ = kPhaseNoPhase;
    result = result && sat_->GetValid(&source_pe);
    if (!result) {
      logprintf(0, "Process Error: memory region thread failed to pop "
                   "pages from SAT, bailing\n");
      break;
    }

    result = result && pages_->PopRandom(&memregion_pe);
    if (!result) {
      logprintf(0, "Process Error: memory region thread failed to pop "
                   "pages from queue, bailing\n");
      break;
    }

    // Error injection for the CRC copy.
    if ((sat_->error_injection() || error_injection_) && loops == 1) {
      addr = reinterpret_cast<int64*>(source_pe.addr);
      offset = random() % (sat_->page_length() / wordsize_);
      data = addr[offset];
      addr[offset] = error_constant;
    }

    // Copy the SAT page into the memory region.
    phase_ = kPhaseCopy;
    CrcCopyPage(&memregion_pe, &source_pe);
    memregion_pe.pattern = source_pe.pattern;

    // Error injection for the CRC check.
    if ((sat_->error_injection() || error_injection_) && loops == 2) {
      addr = reinterpret_cast<int64*>(memregion_pe.addr);
      offset = random() % (sat_->page_length() / wordsize_);
      data = addr[offset];
      addr[offset] = error_constant;
    }

    // Check the page contents in the memory region.
    phase_ = kPhaseCheck;
    CrcCheckPage(&memregion_pe);

    phase_ = kPhaseNoPhase;

    // Store the pages back on their proper queues.
    result = result && sat_->PutValid(&source_pe);
    if (!result) {
      logprintf(0, "Process Error: memory region thread failed to push "
                   "pages into SAT, bailing\n");
      break;
    }
    result = result && pages_->Push(&memregion_pe);
    if (!result) {
      logprintf(0, "Process Error: memory region thread failed to push "
                   "pages into queue, bailing\n");
      break;
    }

    if ((sat_->error_injection() || error_injection_) &&
        loops >= 1 && loops <= 2) {
      // Undo the error injection so the pages go back clean.
      addr[offset] = data;
    }

    loops++;
    YieldSelf();
  }

  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Memory Region thread. Status %d, %d "
               "pages checked\n",
            thread_num_, status_, pages_copied_);
  return result;
}