• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * blktrace support code for fio
3  */
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <sys/stat.h>
7 #include <dirent.h>
8 
9 #include "flist.h"
10 #include "fio.h"
11 #include "blktrace_api.h"
12 
13 #define TRACE_FIFO_SIZE	8192
14 
15 /*
16  * fifo refill frontend, to avoid reading data in trace sized bites
17  */
refill_fifo(struct thread_data * td,struct fifo * fifo,int fd)18 static int refill_fifo(struct thread_data *td, struct fifo *fifo, int fd)
19 {
20 	char buf[TRACE_FIFO_SIZE];
21 	unsigned int total;
22 	int ret;
23 
24 	total = sizeof(buf);
25 	if (total > fifo_room(fifo))
26 		total = fifo_room(fifo);
27 
28 	ret = read(fd, buf, total);
29 	if (ret < 0) {
30 		td_verror(td, errno, "read blktrace file");
31 		return -1;
32 	}
33 
34 	if (ret > 0)
35 		ret = fifo_put(fifo, buf, ret);
36 
37 	dprint(FD_BLKTRACE, "refill: filled %d bytes\n", ret);
38 	return ret;
39 }
40 
/*
 * Retrieve 'len' bytes from the fifo, refilling first when it does not
 * currently hold enough. Returns the fifo_get() result, or a negative
 * value if the refill failed.
 */
static int trace_fifo_get(struct thread_data *td, struct fifo *fifo, int fd,
			  void *buf, unsigned int len)
{
	int ret;

	if (fifo_len(fifo) >= len)
		return fifo_get(fifo, buf, len);

	ret = refill_fifo(td, fifo, fd);
	if (ret < 0)
		return ret;

	return fifo_get(fifo, buf, len);
}
56 
57 /*
58  * Just discard the pdu by seeking past it.
59  */
discard_pdu(struct thread_data * td,struct fifo * fifo,int fd,struct blk_io_trace * t)60 static int discard_pdu(struct thread_data *td, struct fifo *fifo, int fd,
61 		       struct blk_io_trace *t)
62 {
63 	if (t->pdu_len == 0)
64 		return 0;
65 
66 	dprint(FD_BLKTRACE, "discard pdu len %u\n", t->pdu_len);
67 	return trace_fifo_get(td, fifo, fd, NULL, t->pdu_len);
68 }
69 
70 /*
71  * Check if this is a blktrace binary data file. We read a single trace
72  * into memory and check for the magic signature.
73  */
is_blktrace(const char * filename,int * need_swap)74 int is_blktrace(const char *filename, int *need_swap)
75 {
76 	struct blk_io_trace t;
77 	int fd, ret;
78 
79 	fd = open(filename, O_RDONLY);
80 	if (fd < 0)
81 		return 0;
82 
83 	ret = read(fd, &t, sizeof(t));
84 	close(fd);
85 
86 	if (ret < 0) {
87 		perror("read blktrace");
88 		return 0;
89 	} else if (ret != sizeof(t)) {
90 		log_err("fio: short read on blktrace file\n");
91 		return 0;
92 	}
93 
94 	if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
95 		*need_swap = 0;
96 		return 1;
97 	}
98 
99 	/*
100 	 * Maybe it needs to be endian swapped...
101 	 */
102 	t.magic = fio_swap32(t.magic);
103 	if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
104 		*need_swap = 1;
105 		return 1;
106 	}
107 
108 	return 0;
109 }
110 
lookup_device(struct thread_data * td,char * path,unsigned int maj,unsigned int min)111 static int lookup_device(struct thread_data *td, char *path, unsigned int maj,
112 			 unsigned int min)
113 {
114 	struct dirent *dir;
115 	struct stat st;
116 	int found = 0;
117 	DIR *D;
118 
119 	D = opendir(path);
120 	if (!D)
121 		return 0;
122 
123 	while ((dir = readdir(D)) != NULL) {
124 		char full_path[256];
125 
126 		if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
127 			continue;
128 
129 		sprintf(full_path, "%s%s%s", path, FIO_OS_PATH_SEPARATOR, dir->d_name);
130 		if (lstat(full_path, &st) == -1) {
131 			perror("lstat");
132 			break;
133 		}
134 
135 		if (S_ISDIR(st.st_mode)) {
136 			found = lookup_device(td, full_path, maj, min);
137 			if (found) {
138 				strcpy(path, full_path);
139 				break;
140 			}
141 		}
142 
143 		if (!S_ISBLK(st.st_mode))
144 			continue;
145 
146 		/*
147 		 * If replay_redirect is set then always return this device
148 		 * upon lookup which overrides the device lookup based on
149 		 * major minor in the actual blktrace
150 		 */
151 		if (td->o.replay_redirect) {
152 			dprint(FD_BLKTRACE, "device lookup: %d/%d\n overridden"
153 					" with: %s\n", maj, min,
154 					td->o.replay_redirect);
155 			strcpy(path, td->o.replay_redirect);
156 			found = 1;
157 			break;
158 		}
159 
160 		if (maj == major(st.st_rdev) && min == minor(st.st_rdev)) {
161 			dprint(FD_BLKTRACE, "device lookup: %d/%d\n", maj, min);
162 			strcpy(path, full_path);
163 			found = 1;
164 			break;
165 		}
166 	}
167 
168 	closedir(D);
169 	return found;
170 }
171 
172 #define FMINORBITS	20
173 #define FMINORMASK	((1U << FMINORBITS) - 1)
174 #define FMAJOR(dev)	((unsigned int) ((dev) >> FMINORBITS))
175 #define FMINOR(dev)	((unsigned int) ((dev) & FMINORMASK))
176 
trace_add_open_close_event(struct thread_data * td,int fileno,enum file_log_act action)177 static void trace_add_open_close_event(struct thread_data *td, int fileno, enum file_log_act action)
178 {
179 	struct io_piece *ipo;
180 
181 	ipo = calloc(1, sizeof(*ipo));
182 	init_ipo(ipo);
183 
184 	ipo->ddir = DDIR_INVAL;
185 	ipo->fileno = fileno;
186 	ipo->file_action = action;
187 	flist_add_tail(&ipo->list, &td->io_log_list);
188 }
189 
/*
 * Map a blktrace device number to a fio fileno, adding the device as a
 * new file if we have not seen it before. A one-entry static cache
 * short-circuits the common case of consecutive traces hitting the
 * same device.
 */
static int trace_add_file(struct thread_data *td, __u32 device)
{
	static unsigned int last_maj, last_min, last_fileno;
	unsigned int maj = FMAJOR(device);
	unsigned int min = FMINOR(device);
	struct fio_file *f;
	char dev[256];
	unsigned int i;

	/* fast path: same device as the previous trace */
	if (maj == last_maj && min == last_min)
		return last_fileno;

	last_maj = maj;
	last_min = min;

	/* already on our file list? */
	for_each_file(td, f, i) {
		if (f->major == maj && f->minor == min) {
			last_fileno = f->fileno;
			return last_fileno;
		}
	}

	/* scan /dev for a node with this maj/min and add it */
	strcpy(dev, "/dev");
	if (lookup_device(td, dev, maj, min)) {
		int fileno;

		dprint(FD_BLKTRACE, "add devices %s\n", dev);
		fileno = add_file_exclusive(td, dev);
		td->o.open_files++;
		td->files[fileno]->major = maj;
		td->files[fileno]->minor = min;
		trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE);
		last_fileno = fileno;
	}

	/*
	 * NOTE(review): if the lookup fails, this returns the fileno cached
	 * from a previous (different) device - confirm callers tolerate that.
	 */
	return last_fileno;
}
229 
230 /*
231  * Store blk_io_trace data in an ipo for later retrieval.
232  */
store_ipo(struct thread_data * td,unsigned long long offset,unsigned int bytes,int rw,unsigned long long ttime,int fileno)233 static void store_ipo(struct thread_data *td, unsigned long long offset,
234 		      unsigned int bytes, int rw, unsigned long long ttime,
235 		      int fileno)
236 {
237 	struct io_piece *ipo = malloc(sizeof(*ipo));
238 
239 	init_ipo(ipo);
240 
241 	/*
242 	 * the 512 is wrong here, it should be the hardware sector size...
243 	 */
244 	ipo->offset = offset * 512;
245 	ipo->len = bytes;
246 	ipo->delay = ttime / 1000;
247 	if (rw)
248 		ipo->ddir = DDIR_WRITE;
249 	else
250 		ipo->ddir = DDIR_READ;
251 	ipo->fileno = fileno;
252 
253 	dprint(FD_BLKTRACE, "store ddir=%d, off=%llu, len=%lu, delay=%lu\n",
254 							ipo->ddir, ipo->offset,
255 							ipo->len, ipo->delay);
256 	queue_io_piece(td, ipo);
257 }
258 
handle_trace_notify(struct blk_io_trace * t)259 static void handle_trace_notify(struct blk_io_trace *t)
260 {
261 	switch (t->action) {
262 	case BLK_TN_PROCESS:
263 		log_info("blktrace: got process notify: %x, %d\n",
264 				t->action, t->pid);
265 		break;
266 	case BLK_TN_TIMESTAMP:
267 		log_info("blktrace: got timestamp notify: %x, %d\n",
268 				t->action, t->pid);
269 		break;
270 	case BLK_TN_MESSAGE:
271 		break;
272 	default:
273 		dprint(FD_BLKTRACE, "unknown trace act %x\n", t->action);
274 		break;
275 	}
276 }
277 
handle_trace_discard(struct thread_data * td,struct blk_io_trace * t,unsigned long long ttime,unsigned long * ios)278 static void handle_trace_discard(struct thread_data *td, struct blk_io_trace *t,
279 				 unsigned long long ttime, unsigned long *ios)
280 {
281 	struct io_piece *ipo = malloc(sizeof(*ipo));
282 	int fileno;
283 
284 	init_ipo(ipo);
285 	fileno = trace_add_file(td, t->device);
286 
287 	ios[DDIR_WRITE]++;
288 	td->o.size += t->bytes;
289 
290 	memset(ipo, 0, sizeof(*ipo));
291 	INIT_FLIST_HEAD(&ipo->list);
292 
293 	/*
294 	 * the 512 is wrong here, it should be the hardware sector size...
295 	 */
296 	ipo->offset = t->sector * 512;
297 	ipo->len = t->bytes;
298 	ipo->delay = ttime / 1000;
299 	ipo->ddir = DDIR_TRIM;
300 	ipo->fileno = fileno;
301 
302 	dprint(FD_BLKTRACE, "store discard, off=%llu, len=%lu, delay=%lu\n",
303 							ipo->offset, ipo->len,
304 							ipo->delay);
305 	queue_io_piece(td, ipo);
306 }
307 
handle_trace_fs(struct thread_data * td,struct blk_io_trace * t,unsigned long long ttime,unsigned long * ios,unsigned int * bs)308 static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
309 			    unsigned long long ttime, unsigned long *ios,
310 			    unsigned int *bs)
311 {
312 	int rw;
313 	int fileno;
314 
315 	fileno = trace_add_file(td, t->device);
316 
317 	rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
318 
319 	if (t->bytes > bs[rw])
320 		bs[rw] = t->bytes;
321 
322 	ios[rw]++;
323 	td->o.size += t->bytes;
324 	store_ipo(td, t->sector, t->bytes, rw, ttime, fileno);
325 }
326 
327 /*
328  * We only care for queue traces, most of the others are side effects
329  * due to internal workings of the block layer.
330  */
handle_trace(struct thread_data * td,struct blk_io_trace * t,unsigned long long ttime,unsigned long * ios,unsigned int * bs)331 static void handle_trace(struct thread_data *td, struct blk_io_trace *t,
332 			 unsigned long long ttime, unsigned long *ios,
333 			 unsigned int *bs)
334 {
335 	if ((t->action & 0xffff) != __BLK_TA_QUEUE)
336 		return;
337 	if (t->action & BLK_TC_ACT(BLK_TC_PC))
338 		return;
339 
340 	if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY))
341 		handle_trace_notify(t);
342 	else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
343 		handle_trace_discard(td, t, ttime, ios);
344 	else
345 		handle_trace_fs(td, t, ttime, ios, bs);
346 }
347 
byteswap_trace(struct blk_io_trace * t)348 static void byteswap_trace(struct blk_io_trace *t)
349 {
350 	t->magic = fio_swap32(t->magic);
351 	t->sequence = fio_swap32(t->sequence);
352 	t->time = fio_swap64(t->time);
353 	t->sector = fio_swap64(t->sector);
354 	t->bytes = fio_swap32(t->bytes);
355 	t->action = fio_swap32(t->action);
356 	t->pid = fio_swap32(t->pid);
357 	t->device = fio_swap32(t->device);
358 	t->cpu = fio_swap32(t->cpu);
359 	t->error = fio_swap16(t->error);
360 	t->pdu_len = fio_swap16(t->pdu_len);
361 }
362 
363 /*
364  * Load a blktrace file by reading all the blk_io_trace entries, and storing
365  * them as io_pieces like the fio text version would do.
366  */
load_blktrace(struct thread_data * td,const char * filename,int need_swap)367 int load_blktrace(struct thread_data *td, const char *filename, int need_swap)
368 {
369 	unsigned long long ttime, delay;
370 	struct blk_io_trace t;
371 	unsigned long ios[2], skipped_writes;
372 	unsigned int cpu;
373 	unsigned int rw_bs[2];
374 	struct fifo *fifo;
375 	int fd, i, old_state;
376 	struct fio_file *f;
377 	int this_depth, depth;
378 
379 	fd = open(filename, O_RDONLY);
380 	if (fd < 0) {
381 		td_verror(td, errno, "open blktrace file");
382 		return 1;
383 	}
384 
385 	fifo = fifo_alloc(TRACE_FIFO_SIZE);
386 
387 	old_state = td_bump_runstate(td, TD_SETTING_UP);
388 
389 	td->o.size = 0;
390 
391 	cpu = 0;
392 	ttime = 0;
393 	ios[0] = ios[1] = 0;
394 	rw_bs[0] = rw_bs[1] = 0;
395 	skipped_writes = 0;
396 	this_depth = depth = 0;
397 	do {
398 		int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t));
399 
400 		if (ret < 0)
401 			goto err;
402 		else if (!ret)
403 			break;
404 		else if (ret < (int) sizeof(t)) {
405 			log_err("fio: short fifo get\n");
406 			break;
407 		}
408 
409 		if (need_swap)
410 			byteswap_trace(&t);
411 
412 		if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
413 			log_err("fio: bad magic in blktrace data: %x\n",
414 								t.magic);
415 			goto err;
416 		}
417 		if ((t.magic & 0xff) != BLK_IO_TRACE_VERSION) {
418 			log_err("fio: bad blktrace version %d\n",
419 								t.magic & 0xff);
420 			goto err;
421 		}
422 		ret = discard_pdu(td, fifo, fd, &t);
423 		if (ret < 0) {
424 			td_verror(td, ret, "blktrace lseek");
425 			goto err;
426 		} else if (t.pdu_len != ret) {
427 			log_err("fio: discarded %d of %d\n", ret, t.pdu_len);
428 			goto err;
429 		}
430 		if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) {
431 			if ((t.action & 0xffff) == __BLK_TA_QUEUE)
432 				this_depth++;
433 			else if ((t.action & 0xffff) == __BLK_TA_COMPLETE) {
434 				depth = max(depth, this_depth);
435 				this_depth = 0;
436 			}
437 			if (!ttime) {
438 				ttime = t.time;
439 				cpu = t.cpu;
440 			}
441 
442 			delay = 0;
443 			if (cpu == t.cpu)
444 				delay = t.time - ttime;
445 			if ((t.action & BLK_TC_ACT(BLK_TC_WRITE)) && read_only)
446 				skipped_writes++;
447 			else {
448 				/*
449 				 * set delay to zero if no_stall enabled for
450 				 * fast replay
451 				 */
452 				if (td->o.no_stall)
453 					delay = 0;
454 
455 				handle_trace(td, &t, delay, ios, rw_bs);
456 			}
457 
458 			ttime = t.time;
459 			cpu = t.cpu;
460 		} else {
461 			delay = 0;
462 			handle_trace(td, &t, delay, ios, rw_bs);
463 		}
464 	} while (1);
465 
466 	for (i = 0; i < td->files_index; i++) {
467 		f = td->files[i];
468 		trace_add_open_close_event(td, f->fileno, FIO_LOG_CLOSE_FILE);
469 	}
470 
471 	fifo_free(fifo);
472 	close(fd);
473 
474 	td_restore_runstate(td, old_state);
475 
476 	if (!td->files_index) {
477 		log_err("fio: did not find replay device(s)\n");
478 		return 1;
479 	}
480 
481 	/*
482 	 * For stacked devices, we don't always get a COMPLETE event so
483 	 * the depth grows to insane values. Limit it to something sane(r).
484 	 */
485 	if (!depth || depth > 1024)
486 		depth = 1024;
487 
488 	if (skipped_writes)
489 		log_err("fio: %s skips replay of %lu writes due to read-only\n",
490 						td->o.name, skipped_writes);
491 
492 	if (!ios[DDIR_READ] && !ios[DDIR_WRITE]) {
493 		log_err("fio: found no ios in blktrace data\n");
494 		return 1;
495 	} else if (ios[DDIR_READ] && !ios[DDIR_READ]) {
496 		td->o.td_ddir = TD_DDIR_READ;
497 		td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
498 	} else if (!ios[DDIR_READ] && ios[DDIR_WRITE]) {
499 		td->o.td_ddir = TD_DDIR_WRITE;
500 		td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE];
501 	} else {
502 		td->o.td_ddir = TD_DDIR_RW;
503 		td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
504 		td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE];
505 	}
506 
507 	/*
508 	 * We need to do direct/raw ios to the device, to avoid getting
509 	 * read-ahead in our way.
510 	 */
511 	td->o.odirect = 1;
512 
513 	/*
514 	 * we don't know if this option was set or not. it defaults to 1,
515 	 * so we'll just guess that we should override it if it's still 1
516 	 */
517 	if (td->o.iodepth != 1)
518 		td->o.iodepth = depth;
519 
520 	return 0;
521 err:
522 	close(fd);
523 	fifo_free(fifo);
524 	return 1;
525 }
526