1 // Copyright 2008 Google Inc. All Rights Reserved. 2 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 7 // http://www.apache.org/licenses/LICENSE-2.0 8 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // error_diag.h: Ambiguous error diagnosis class 16 17 #ifndef STRESSAPPTEST_ERROR_DIAG_H_ 18 #define STRESSAPPTEST_ERROR_DIAG_H_ 19 20 #include <pthread.h> 21 #include <list> 22 #include <map> 23 #include <set> 24 #include <string> 25 26 // This file must work with autoconf on its public version, 27 // so these includes are correct. 28 #include "sattypes.h" 29 #include "os.h" 30 31 class ErrorInstance; 32 33 // This describes the components of the system. 34 class DeviceTree { 35 public: 36 explicit DeviceTree(string name); 37 ~DeviceTree(); 38 39 // Atomically find arbitrary device in subtree. 40 DeviceTree *FindInSubTree(string name); 41 // Find or add named device. 42 DeviceTree *FindOrAddDevice(string name); 43 // Atomically add sub device. 44 void InsertSubDevice(string name); 45 // Returns parent device. GetParent()46 DeviceTree *GetParent() { return parent_; } 47 // Pretty prints device tree. 48 void PrettyPrint(string spacer = " "); 49 // Atomically add error instance to device. 50 void AddErrorInstance(ErrorInstance *error_instance); 51 // Returns true of device is known to be bad. 52 bool KnownBad(); 53 // Returns number of direct sub devices. NumDirectSubDevices()54 int NumDirectSubDevices() { return subdevices_.size(); } 55 56 private: 57 // Unlocked version of FindInSubTree. 58 DeviceTree *UnlockedFindInSubTree(string name); 59 60 std::map<string, DeviceTree*> subdevices_; // Map of sub-devices. 61 std::list<ErrorInstance*> errors_; // Log of errors. 62 DeviceTree *parent_; // Pointer to parent device. 63 string name_; // Device name. 64 pthread_mutex_t device_tree_mutex_; // Mutex protecting device tree. 65 }; 66 67 68 // enum type for collected errors. 69 enum SATErrorType { 70 SAT_ERROR_NONE = 0, 71 SAT_ERROR_ECC, 72 SAT_ERROR_MISCOMPARE, 73 SAT_ERROR_SECTOR_TAG, 74 }; 75 76 // enum type for error severity. 77 enum SATErrorSeverity { 78 SAT_ERROR_CORRECTABLE = 0, 79 SAT_ERROR_FATAL, 80 }; 81 82 // This describes an error and it's likely causes. 83 class ErrorInstance { 84 public: ErrorInstance()85 ErrorInstance(): type_(SAT_ERROR_NONE), severity_(SAT_ERROR_CORRECTABLE) {} 86 87 SATErrorType type_; // Type of error: ECC, miscompare, sector. 88 SATErrorSeverity severity_; // Correctable, or fatal. 89 std::set<DeviceTree*> causes_; // Devices that can cause this type of error. 90 }; 91 92 // This describes ECC errors. 93 class ECCErrorInstance: public ErrorInstance { 94 public: ECCErrorInstance()95 ECCErrorInstance() { type_ = SAT_ERROR_ECC; } 96 97 uint64 addr_; // Address where error occured. 98 }; 99 100 // This describes miscompare errors. 101 class MiscompareErrorInstance: public ErrorInstance { 102 public: MiscompareErrorInstance()103 MiscompareErrorInstance() { type_ = SAT_ERROR_MISCOMPARE; } 104 105 uint64 addr_; // Address where miscompare occured. 106 }; 107 108 // This describes HDD miscompare errors. 109 class HDDMiscompareErrorInstance: public MiscompareErrorInstance { 110 public: 111 uint64 addr2_; // addr_ and addr2_ are src and dst memory addr. 112 int offset_; // offset. 113 int block_; // error block. 114 }; 115 116 // This describes HDD miscompare errors. 117 class HDDSectorTagErrorInstance: public ErrorInstance { 118 public: HDDSectorTagErrorInstance()119 HDDSectorTagErrorInstance() { type_ = SAT_ERROR_SECTOR_TAG; } 120 121 uint64 addr_; 122 uint64 addr2_; // addr_ and addr2_ are src and dst memory addr. 123 int sector_; // error sector. 124 int block_; // error block. 125 }; 126 127 // Generic error storage and sorting class. 128 class ErrorDiag { 129 public: 130 ErrorDiag(); 131 virtual ~ErrorDiag(); 132 133 // Add info about a CECC. 134 virtual int AddCeccError(string dimm_string); 135 136 // Add info about a UECC. 137 virtual int AddUeccError(string dimm_string); 138 139 // Add info about a miscompare. 140 virtual int AddMiscompareError(string dimm_string, uint64 addr, int count); 141 142 // Add info about a miscompare from a drive. 143 virtual int AddHDDMiscompareError(string devicename, int block, int offset, 144 void *src_addr, void *dst_addr); 145 146 // Add info about a sector tag miscompare from a drive. 147 virtual int AddHDDSectorTagError(string devicename, int block, int offset, 148 int sector, void *src_addr, void *dst_addr); 149 150 // Set platform specific handle and initialize device tree. 151 bool set_os(OsLayer *os); 152 153 protected: 154 // Create and initialize system device tree. 155 virtual bool InitializeDeviceTree(); 156 157 // Utility Function to translate a virtual address to DIMM number. 158 string AddressToDimmString(OsLayer *os, void *addr, int offset); 159 160 DeviceTree *system_tree_root_; // System device tree. 161 OsLayer *os_; // Platform handle. 162 163 private: 164 DISALLOW_COPY_AND_ASSIGN(ErrorDiag); 165 }; 166 167 #endif // STRESSAPPTEST_ERROR_DIAG_H_ 168