1 // Copyright 2008 Google Inc. All Rights Reserved.
2
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6
7 // http://www.apache.org/licenses/LICENSE-2.0
8
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
// error_diag.cc: Collects device errors so that later analysis can more
// accurately pinpoint the failed component.
17
#include <list>
#include <map>
#include <new>
#include <set>

// This file must work with autoconf on its public version,
// so these includes are correct.
#include "error_diag.h"
#include "sattypes.h"
26
27
28 // DeviceTree constructor.
DeviceTree(string name)29 DeviceTree::DeviceTree(string name)
30 : parent_(0), name_(name) {
31 pthread_mutex_init(&device_tree_mutex_, NULL);
32 }
33
34 // DeviceTree destructor.
~DeviceTree()35 DeviceTree::~DeviceTree() {
36 // Deallocate subtree devices.
37 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
38 itr != subdevices_.end();
39 ++itr) {
40 delete itr->second;
41 }
42 // Deallocate device errors.
43 for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
44 itr != errors_.end();
45 ++itr) {
46 delete (*itr);
47 }
48 pthread_mutex_destroy(&device_tree_mutex_);
49 }
50
51 // Atomically find named device in sub device tree.
52 // Returns 0 if not found
FindInSubTree(string name)53 DeviceTree *DeviceTree::FindInSubTree(string name) {
54 DeviceTree *ret;
55 pthread_mutex_lock(&device_tree_mutex_);
56 ret = UnlockedFindInSubTree(name);
57 pthread_mutex_unlock(&device_tree_mutex_);
58 return ret;
59 }
60
61 // Find named device in sub device tree (Non-atomic).
62 // Returns 0 if not found
UnlockedFindInSubTree(string name)63 DeviceTree *DeviceTree::UnlockedFindInSubTree(string name) {
64 std::map<string, DeviceTree*>::iterator itr = subdevices_.find(name);
65 if (itr != subdevices_.end()) {
66 return itr->second;
67 } else {
68 // Search sub-tree.
69 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
70 itr != subdevices_.end();
71 ++itr) {
72 DeviceTree *result = itr->second->UnlockedFindInSubTree(name);
73 if (result != 0)
74 return result;
75 }
76 return 0;
77 }
78 }
79
80 // Atomically add error instance to device.
AddErrorInstance(ErrorInstance * error_instance)81 void DeviceTree::AddErrorInstance(ErrorInstance *error_instance) {
82 pthread_mutex_lock(&device_tree_mutex_);
83 errors_.push_back(error_instance);
84 pthread_mutex_unlock(&device_tree_mutex_);
85 }
86
87 // Find or add queried device as necessary.
FindOrAddDevice(string name)88 DeviceTree *DeviceTree::FindOrAddDevice(string name) {
89 // Assume named device does not exist and try to insert the device anyway.
90 // No-op if named device already exists.
91 InsertSubDevice(name);
92 // Find and return sub device pointer.
93 return FindInSubTree(name);
94 }
95
96 // Pretty prints device tree.
PrettyPrint(string spacer)97 void DeviceTree::PrettyPrint(string spacer) {
98 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
99 itr != subdevices_.end();
100 ++itr) {
101 printf("%s%s\n", spacer.c_str(), itr->first.c_str());
102 itr->second->PrettyPrint(spacer+spacer);
103 }
104 }
105
106 // Atomically add sub device.
107 // No-op if named device already exists.
InsertSubDevice(string name)108 void DeviceTree::InsertSubDevice(string name) {
109 pthread_mutex_lock(&device_tree_mutex_);
110 if (UnlockedFindInSubTree(name) != 0) {
111 pthread_mutex_unlock(&device_tree_mutex_);
112 return;
113 }
114 subdevices_[name] = new DeviceTree(name);
115 subdevices_[name]->parent_ = this;
116 pthread_mutex_unlock(&device_tree_mutex_);
117 }
118
119
120 // Returns true of any error associated with this device is fatal.
KnownBad()121 bool DeviceTree::KnownBad() {
122 pthread_mutex_lock(&device_tree_mutex_);
123 for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
124 itr != errors_.end();
125 ++itr) {
126 if ((*itr)->severity_ == SAT_ERROR_FATAL) {
127 pthread_mutex_unlock(&device_tree_mutex_);
128 return true;
129 }
130 }
131 pthread_mutex_unlock(&device_tree_mutex_);
132 return false;
133 }
134
135
136 // ErrorDiag constructor.
ErrorDiag()137 ErrorDiag::ErrorDiag() {
138 os_ = 0;
139 system_tree_root_ = 0;
140 }
141
142 // ErrorDiag destructor.
~ErrorDiag()143 ErrorDiag::~ErrorDiag() {
144 if (system_tree_root_)
145 delete system_tree_root_;
146 }
147
148 // Set platform specific handle and initialize device tree.
149 // Returns false on error. true otherwise.
set_os(OsLayer * os)150 bool ErrorDiag::set_os(OsLayer *os) {
151 os_ = os;
152 return(InitializeDeviceTree());
153 }
154
155 // Create and initialize system device tree.
156 // Returns false on error. true otherwise.
InitializeDeviceTree()157 bool ErrorDiag::InitializeDeviceTree() {
158 system_tree_root_ = new DeviceTree("system_root");
159 if (!system_tree_root_)
160 return false;
161 return true;
162 }
163
164 // Logs info about a CECC.
165 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
AddCeccError(string dimm_string)166 int ErrorDiag::AddCeccError(string dimm_string) {
167 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
168 ECCErrorInstance *error = new ECCErrorInstance;
169 if (!error)
170 return -1;
171 error->severity_ = SAT_ERROR_CORRECTABLE;
172 dimm_device->AddErrorInstance(error);
173 return 0;
174 }
175
176 // Logs info about a UECC.
177 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
AddUeccError(string dimm_string)178 int ErrorDiag::AddUeccError(string dimm_string) {
179 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
180 ECCErrorInstance *error = new ECCErrorInstance;
181 if (!error)
182 return -1;
183 error->severity_ = SAT_ERROR_FATAL;
184 dimm_device->AddErrorInstance(error);
185 return 0;
186 }
187
188 // Logs info about a miscompare.
189 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
AddMiscompareError(string dimm_string,uint64 addr,int count)190 int ErrorDiag::AddMiscompareError(string dimm_string, uint64 addr, int count) {
191 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
192 MiscompareErrorInstance *error = new MiscompareErrorInstance;
193 if (!error)
194 return -1;
195 error->severity_ = SAT_ERROR_FATAL;
196 error->addr_ = addr;
197 dimm_device->AddErrorInstance(error);
198 os_->ErrorReport(dimm_string.c_str(), "miscompare", count);
199 return 1;
200 }
201
202 // Utility Function to translate a virtual address to DIMM number.
203 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
AddressToDimmString(OsLayer * os,void * addr,int offset)204 string ErrorDiag::AddressToDimmString(OsLayer *os, void *addr, int offset) {
205 char dimm_string[256] = "";
206 char *vbyteaddr = reinterpret_cast<char*>(addr) + offset;
207 uint64 paddr = os->VirtualToPhysical(vbyteaddr);
208 os->FindDimm(paddr, dimm_string, sizeof(dimm_string));
209 return string(dimm_string);
210 }
211
212 // Info about a miscompare from a drive.
213 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
AddHDDMiscompareError(string devicename,int block,int offset,void * src_addr,void * dst_addr)214 int ErrorDiag::AddHDDMiscompareError(string devicename, int block, int offset,
215 void *src_addr, void *dst_addr) {
216 bool mask_hdd_error = false;
217
218 HDDMiscompareErrorInstance *error = new HDDMiscompareErrorInstance;
219 if (!error)
220 return -1;
221
222 error->addr_ = reinterpret_cast<uint64>(src_addr);
223 error->addr2_ = reinterpret_cast<uint64>(dst_addr);
224 error->offset_ = offset;
225 error->block_ = block;
226
227 string src_dimm = AddressToDimmString(os_, src_addr, offset);
228 string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
229
230 // DIMM name look up success
231 if (src_dimm.compare("DIMM Unknown")) {
232 // Add src DIMM as possible miscompare cause.
233 DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
234 error->causes_.insert(src_dimm_dev);
235 if (src_dimm_dev->KnownBad()) {
236 mask_hdd_error = true;
237 logprintf(5, "Log: supressed %s miscompare report: "
238 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
239 }
240 }
241 if (dst_dimm.compare("DIMM Unknown")) {
242 // Add dst DIMM as possible miscompare cause.
243 DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
244 error->causes_.insert(dst_dimm_dev);
245 if (dst_dimm_dev->KnownBad()) {
246 mask_hdd_error = true;
247 logprintf(5, "Log: supressed %s miscompare report: "
248 "known bad destination: %s\n", devicename.c_str(),
249 dst_dimm.c_str());
250 }
251 }
252
253 DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
254 hdd_dev->AddErrorInstance(error);
255
256 // HDD error was not masked by bad DIMMs: report bad HDD.
257 if (!mask_hdd_error) {
258 os_->ErrorReport(devicename.c_str(), "miscompare", 1);
259 error->severity_ = SAT_ERROR_FATAL;
260 return 1;
261 }
262 return 0;
263 }
264
265 // Info about a sector tag miscompare from a drive.
266 // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
AddHDDSectorTagError(string devicename,int block,int offset,int sector,void * src_addr,void * dst_addr)267 int ErrorDiag::AddHDDSectorTagError(string devicename, int block, int offset,
268 int sector, void *src_addr,
269 void *dst_addr) {
270 bool mask_hdd_error = false;
271
272 HDDSectorTagErrorInstance *error = new HDDSectorTagErrorInstance;
273 if (!error)
274 return -1;
275
276 error->addr_ = reinterpret_cast<uint64>(src_addr);
277 error->addr2_ = reinterpret_cast<uint64>(dst_addr);
278 error->sector_ = sector;
279 error->block_ = block;
280
281 string src_dimm = AddressToDimmString(os_, src_addr, offset);
282 string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
283
284 // DIMM name look up success
285 if (src_dimm.compare("DIMM Unknown")) {
286 // Add src DIMM as possible miscompare cause.
287 DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
288 error->causes_.insert(src_dimm_dev);
289 if (src_dimm_dev->KnownBad()) {
290 mask_hdd_error = true;
291 logprintf(5, "Log: supressed %s sector tag error report: "
292 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
293 }
294 }
295 if (dst_dimm.compare("DIMM Unknown")) {
296 // Add dst DIMM as possible miscompare cause.
297 DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
298 error->causes_.insert(dst_dimm_dev);
299 if (dst_dimm_dev->KnownBad()) {
300 mask_hdd_error = true;
301 logprintf(5, "Log: supressed %s sector tag error report: "
302 "known bad destination: %s\n", devicename.c_str(),
303 dst_dimm.c_str());
304 }
305 }
306
307 DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
308 hdd_dev->AddErrorInstance(error);
309
310 // HDD error was not masked by bad DIMMs: report bad HDD.
311 if (!mask_hdd_error) {
312 os_->ErrorReport(devicename.c_str(), "sector", 1);
313 error->severity_ = SAT_ERROR_FATAL;
314 return 1;
315 }
316 return 0;
317 }
318