1 #include <errno.h>
2 #include <fcntl.h>
3 #include <linux/watchdog.h>
4 #include <signal.h>
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <sys/ioctl.h>
9 #include <sys/select.h>
10 #include <sys/poll.h>
11 #include <sys/signalfd.h>
12 #include <sys/stat.h>
13 #include <sys/time.h>
14 #include <sys/types.h>
15 #include <sys/utsname.h>
16 #include <sys/wait.h>
17 #include <time.h>
18 #include <unistd.h>
19
20 #include "igt_core.h"
21 #include "executor.h"
22 #include "output_strings.h"
23
24 static struct {
25 int *fds;
26 size_t num_dogs;
27 } watchdogs;
28
__close_watchdog(int fd)29 static void __close_watchdog(int fd)
30 {
31 ssize_t ret = write(fd, "V", 1);
32
33 if (ret == -1)
34 fprintf(stderr, "Failed to stop a watchdog: %s\n",
35 strerror(errno));
36
37 close(fd);
38 }
39
close_watchdogs(struct settings * settings)40 static void close_watchdogs(struct settings *settings)
41 {
42 size_t i;
43
44 if (settings && settings->log_level >= LOG_LEVEL_VERBOSE)
45 printf("Closing watchdogs\n");
46
47 if (settings == NULL && watchdogs.num_dogs != 0)
48 fprintf(stderr, "Closing watchdogs from exit handler!\n");
49
50 for (i = 0; i < watchdogs.num_dogs; i++) {
51 __close_watchdog(watchdogs.fds[i]);
52 }
53
54 free(watchdogs.fds);
55 watchdogs.num_dogs = 0;
56 watchdogs.fds = NULL;
57 }
58
close_watchdogs_atexit(void)59 static void close_watchdogs_atexit(void)
60 {
61 close_watchdogs(NULL);
62 }
63
init_watchdogs(struct settings * settings)64 static void init_watchdogs(struct settings *settings)
65 {
66 int i;
67 char name[32];
68 int fd;
69
70 memset(&watchdogs, 0, sizeof(watchdogs));
71
72 if (!settings->use_watchdog || settings->inactivity_timeout <= 0)
73 return;
74
75 if (settings->log_level >= LOG_LEVEL_VERBOSE) {
76 printf("Initializing watchdogs\n");
77 }
78
79 atexit(close_watchdogs_atexit);
80
81 for (i = 0; ; i++) {
82 snprintf(name, sizeof(name), "/dev/watchdog%d", i);
83 if ((fd = open(name, O_RDWR | O_CLOEXEC)) < 0)
84 break;
85
86 watchdogs.num_dogs++;
87 watchdogs.fds = realloc(watchdogs.fds, watchdogs.num_dogs * sizeof(int));
88 watchdogs.fds[i] = fd;
89
90 if (settings->log_level >= LOG_LEVEL_VERBOSE)
91 printf(" %s\n", name);
92 }
93 }
94
watchdogs_set_timeout(int timeout)95 static int watchdogs_set_timeout(int timeout)
96 {
97 size_t i;
98 int orig_timeout = timeout;
99
100 for (i = 0; i < watchdogs.num_dogs; i++) {
101 if (ioctl(watchdogs.fds[i], WDIOC_SETTIMEOUT, &timeout)) {
102 __close_watchdog(watchdogs.fds[i]);
103 watchdogs.fds[i] = -1;
104 continue;
105 }
106
107 if (timeout < orig_timeout) {
108 /*
109 * Timeout of this caliber refused. We want to
110 * use the same timeout for all devices.
111 */
112 return watchdogs_set_timeout(timeout);
113 }
114 }
115
116 return timeout;
117 }
118
ping_watchdogs(void)119 static void ping_watchdogs(void)
120 {
121 size_t i;
122 int ret;
123
124 for (i = 0; i < watchdogs.num_dogs; i++) {
125 ret = ioctl(watchdogs.fds[i], WDIOC_KEEPALIVE, NULL);
126
127 if (ret == -1)
128 fprintf(stderr, "Failed to ping a watchdog: %s\n",
129 strerror(errno));
130 }
131 }
132
handle_lockdep(void)133 static char *handle_lockdep(void)
134 {
135 const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
136 int fd = open("/proc/lockdep_stats", O_RDONLY);
137 const char *debug_locks_line = " debug_locks:";
138 char buf[4096], *p;
139 ssize_t bufsize = 0;
140 int val;
141
142 if (fd < 0)
143 return NULL;
144
145 strcpy(buf, header);
146
147 if ((bufsize = read(fd, buf + strlen(header), sizeof(buf) - strlen(header) - 1)) < 0)
148 return NULL;
149 bufsize += strlen(header);
150 buf[bufsize] = '\0';
151 close(fd);
152
153 if ((p = strstr(buf, debug_locks_line)) != NULL &&
154 sscanf(p + strlen(debug_locks_line), "%d", &val) == 1 &&
155 val != 1) {
156 return strdup(buf);
157 }
158
159 return NULL;
160 }
161
162 /* see Linux's include/linux/kernel.h */
163 static const struct {
164 unsigned long bit;
165 const char *explanation;
166 } abort_taints[] = {
167 {(1 << 5), "TAINT_BAD_PAGE: Bad page reference or an unexpected page flags."},
168 {(1 << 7), "TAINT_DIE: Kernel has died - BUG/OOPS."},
169 {(1 << 9), "TAINT_WARN: WARN_ON has happened."},
170 {0, 0}};
171
tainted(unsigned long * taints)172 static unsigned long tainted(unsigned long *taints)
173 {
174 FILE *f;
175 unsigned long bad_taints = 0;
176
177 for (typeof(*abort_taints) *taint = abort_taints; taint->bit; taint++)
178 bad_taints |= taint->bit;
179
180 *taints = 0;
181
182 f = fopen("/proc/sys/kernel/tainted", "r");
183 if (f) {
184 fscanf(f, "%lu", taints);
185 fclose(f);
186 }
187
188 return *taints & bad_taints;
189 }
190
handle_taint(void)191 static char *handle_taint(void)
192 {
193 unsigned long taints;
194 char *reason;
195
196 if (!tainted(&taints))
197 return NULL;
198
199 asprintf(&reason, "Kernel badly tainted (%#lx) (check dmesg for details):\n",
200 taints);
201
202 for (typeof(*abort_taints) *taint = abort_taints; taint->bit; taint++) {
203 if (taint->bit & taints) {
204 char *old_reason = reason;
205 asprintf(&reason, "%s\t(%#lx) %s\n",
206 old_reason,
207 taint->bit,
208 taint->explanation);
209 free(old_reason);
210 }
211 }
212
213 return reason;
214 }
215
216 static const struct {
217 int condition;
218 char *(*handler)(void);
219 } abort_handlers[] = {
220 { ABORT_LOCKDEP, handle_lockdep },
221 { ABORT_TAINT, handle_taint },
222 { 0, 0 },
223 };
224
need_to_abort(const struct settings * settings)225 static char *need_to_abort(const struct settings* settings)
226 {
227 typeof(*abort_handlers) *it;
228
229 for (it = abort_handlers; it->condition; it++) {
230 char *abort;
231
232 if (!(settings->abort_mask & it->condition))
233 continue;
234
235 abort = it->handler();
236 if (!abort)
237 continue;
238
239 if (settings->log_level >= LOG_LEVEL_NORMAL)
240 fprintf(stderr, "Aborting: %s\n", abort);
241
242 return abort;
243 }
244
245 return NULL;
246 }
247
prune_subtest(struct job_list_entry * entry,char * subtest)248 static void prune_subtest(struct job_list_entry *entry, char *subtest)
249 {
250 char *excl;
251
252 /*
253 * Subtest pruning is done by adding exclusion strings to the
254 * subtest list. The last matching item on the subtest
255 * selection command line flag decides whether to run a
256 * subtest, see igt_core.c for details. If the list is empty,
257 * the expected subtest set is unknown, so we need to add '*'
258 * first so we can start excluding.
259 */
260
261 if (entry->subtest_count == 0) {
262 entry->subtest_count++;
263 entry->subtests = realloc(entry->subtests, entry->subtest_count * sizeof(*entry->subtests));
264 entry->subtests[0] = strdup("*");
265 }
266
267 excl = malloc(strlen(subtest) + 2);
268 excl[0] = '!';
269 strcpy(excl + 1, subtest);
270
271 entry->subtest_count++;
272 entry->subtests = realloc(entry->subtests, entry->subtest_count * sizeof(*entry->subtests));
273 entry->subtests[entry->subtest_count - 1] = excl;
274 }
275
prune_from_journal(struct job_list_entry * entry,int fd)276 static bool prune_from_journal(struct job_list_entry *entry, int fd)
277 {
278 char *subtest;
279 FILE *f;
280 size_t pruned = 0;
281 size_t old_count = entry->subtest_count;
282
283 /*
284 * Each journal line is a subtest that has been started, or
285 * the line 'exit:$exitcode (time)', or 'timeout:$exitcode (time)'.
286 */
287
288 f = fdopen(fd, "r");
289 if (!f)
290 return false;
291
292 while (fscanf(f, "%ms", &subtest) == 1) {
293 if (!strncmp(subtest, EXECUTOR_EXIT, strlen(EXECUTOR_EXIT))) {
294 /* Fully done. Mark that by making the binary name invalid. */
295 fscanf(f, " (%*fs)");
296 entry->binary[0] = '\0';
297 free(subtest);
298 continue;
299 }
300
301 if (!strncmp(subtest, EXECUTOR_TIMEOUT, strlen(EXECUTOR_TIMEOUT))) {
302 fscanf(f, " (%*fs)");
303 free(subtest);
304 continue;
305 }
306
307 prune_subtest(entry, subtest);
308
309 free(subtest);
310 pruned++;
311 }
312
313 fclose(f);
314
315 /*
316 * If we know the subtests we originally wanted to run, check
317 * if we got an equal amount already.
318 */
319 if (old_count > 0 && pruned >= old_count)
320 entry->binary[0] = '\0';
321
322 return pruned > 0;
323 }
324
325 static const char *filenames[_F_LAST] = {
326 [_F_JOURNAL] = "journal.txt",
327 [_F_OUT] = "out.txt",
328 [_F_ERR] = "err.txt",
329 [_F_DMESG] = "dmesg.txt",
330 };
331
open_at_end(int dirfd,const char * name)332 static int open_at_end(int dirfd, const char *name)
333 {
334 int fd = openat(dirfd, name, O_RDWR | O_CREAT | O_CLOEXEC, 0666);
335 char last;
336
337 if (fd >= 0) {
338 if (lseek(fd, -1, SEEK_END) >= 0 &&
339 read(fd, &last, 1) == 1 &&
340 last != '\n') {
341 write(fd, "\n", 1);
342 }
343 lseek(fd, 0, SEEK_END);
344 }
345
346 return fd;
347 }
348
open_for_reading(int dirfd,const char * name)349 static int open_for_reading(int dirfd, const char *name)
350 {
351 return openat(dirfd, name, O_RDONLY);
352 }
353
open_output_files(int dirfd,int * fds,bool write)354 bool open_output_files(int dirfd, int *fds, bool write)
355 {
356 int i;
357 int (*openfunc)(int, const char*) = write ? open_at_end : open_for_reading;
358
359 for (i = 0; i < _F_LAST; i++) {
360 if ((fds[i] = openfunc(dirfd, filenames[i])) < 0) {
361 while (--i >= 0)
362 close(fds[i]);
363 return false;
364 }
365 }
366
367 return true;
368 }
369
close_outputs(int * fds)370 void close_outputs(int *fds)
371 {
372 int i;
373
374 for (i = 0; i < _F_LAST; i++) {
375 close(fds[i]);
376 }
377 }
378
dump_dmesg(int kmsgfd,int outfd)379 static void dump_dmesg(int kmsgfd, int outfd)
380 {
381 /*
382 * Write kernel messages to the log file until we reach
383 * 'now'. Unfortunately, /dev/kmsg doesn't support seeking to
384 * -1 from SEEK_END so we need to use a second fd to read a
385 * message to match against, or stop when we reach EAGAIN.
386 */
387
388 int comparefd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);
389 unsigned flags;
390 unsigned long long seq, cmpseq, usec;
391 char cont;
392 char buf[2048];
393 ssize_t r;
394
395 if (comparefd < 0)
396 return;
397 lseek(comparefd, 0, SEEK_END);
398
399 if (fcntl(kmsgfd, F_SETFL, O_NONBLOCK)) {
400 close(comparefd);
401 return;
402 }
403
404 while (1) {
405 if (comparefd >= 0) {
406 r = read(comparefd, buf, sizeof(buf) - 1);
407 if (r < 0) {
408 if (errno != EAGAIN && errno != EPIPE) {
409 close(comparefd);
410 return;
411 }
412 } else {
413 buf[r] = '\0';
414 if (sscanf(buf, "%u,%llu,%llu,%c;",
415 &flags, &cmpseq, &usec, &cont) == 4) {
416 /* Reading comparison record done. */
417 close(comparefd);
418 comparefd = -1;
419 }
420 }
421 }
422
423 r = read(kmsgfd, buf, sizeof(buf));
424 if (r <= 0) {
425 if (errno == EPIPE)
426 continue;
427
428 /*
429 * If EAGAIN, we're done. If some other error,
430 * we can't do anything anyway.
431 */
432 close(comparefd);
433 return;
434 }
435
436 write(outfd, buf, r);
437
438 if (comparefd < 0 && sscanf(buf, "%u,%llu,%llu,%c;",
439 &flags, &seq, &usec, &cont) == 4) {
440 /*
441 * Comparison record has been read, compare
442 * the sequence number to see if we have read
443 * enough.
444 */
445 if (seq >= cmpseq)
446 return;
447 }
448 }
449 }
450
kill_child(int sig,pid_t child)451 static bool kill_child(int sig, pid_t child)
452 {
453 /*
454 * Send the signal to the child directly, and to the child's
455 * process group.
456 */
457 kill(-child, sig);
458 if (kill(child, sig) && errno == ESRCH) {
459 fprintf(stderr, "Child process does not exist. This shouldn't happen.\n");
460 return false;
461 }
462
463 return true;
464 }
465
466 /*
467 * Returns:
468 * =0 - Success
469 * <0 - Failure executing
470 * >0 - Timeout happened, need to recreate from journal
471 */
monitor_output(pid_t child,int outfd,int errfd,int kmsgfd,int sigfd,int * outputs,double * time_spent,struct settings * settings)472 static int monitor_output(pid_t child,
473 int outfd, int errfd, int kmsgfd, int sigfd,
474 int *outputs,
475 double *time_spent,
476 struct settings *settings)
477 {
478 fd_set set;
479 char buf[2048];
480 char *outbuf = NULL;
481 size_t outbufsize = 0;
482 char current_subtest[256] = {};
483 struct signalfd_siginfo siginfo;
484 ssize_t s;
485 int n, status;
486 int nfds = outfd;
487 int timeout = settings->inactivity_timeout;
488 int timeout_intervals = 1, intervals_left;
489 int wd_extra = 10;
490 int killed = 0; /* 0 if not killed, signal number otherwise */
491 struct timespec time_beg, time_end;
492 unsigned long taints = 0;
493 bool aborting = false;
494
495 igt_gettime(&time_beg);
496
497 if (errfd > nfds)
498 nfds = errfd;
499 if (kmsgfd > nfds)
500 nfds = kmsgfd;
501 if (sigfd > nfds)
502 nfds = sigfd;
503 nfds++;
504
505 if (timeout > 0) {
506 /*
507 * Use original timeout plus some leeway. If we're still
508 * alive, we want to kill the test process instead of cutting
509 * power.
510 */
511 int wd_timeout = watchdogs_set_timeout(timeout + wd_extra);
512
513 if (wd_timeout < timeout + wd_extra) {
514 /* Watchdog timeout smaller, so ping it more often */
515 if (wd_timeout - wd_extra < 0)
516 wd_extra = wd_timeout / 2;
517 timeout_intervals = timeout / (wd_timeout - wd_extra);
518 timeout /= timeout_intervals;
519
520 if (settings->log_level >= LOG_LEVEL_VERBOSE) {
521 printf("Watchdog doesn't support the timeout we requested (shortened to %d seconds).\n"
522 "Using %d intervals of %d seconds.\n",
523 wd_timeout, timeout_intervals, timeout);
524 }
525 }
526 }
527
528 intervals_left = timeout_intervals;
529
530 while (outfd >= 0 || errfd >= 0 || sigfd >= 0) {
531 struct timeval tv = { .tv_sec = timeout };
532
533 FD_ZERO(&set);
534 if (outfd >= 0)
535 FD_SET(outfd, &set);
536 if (errfd >= 0)
537 FD_SET(errfd, &set);
538 if (kmsgfd >= 0)
539 FD_SET(kmsgfd, &set);
540 if (sigfd >= 0)
541 FD_SET(sigfd, &set);
542
543 n = select(nfds, &set, NULL, NULL, timeout == 0 ? NULL : &tv);
544 if (n < 0) {
545 /* TODO */
546 return -1;
547 }
548
549 if (n == 0) {
550 if (--intervals_left)
551 continue;
552
553 ping_watchdogs();
554
555 switch (killed) {
556 case 0:
557 if (settings->log_level >= LOG_LEVEL_NORMAL) {
558 printf("Timeout. Killing the current test with SIGQUIT.\n");
559 fflush(stdout);
560 }
561
562 killed = SIGQUIT;
563 if (!kill_child(killed, child))
564 return -1;
565
566 /*
567 * Now continue the loop and let the
568 * dying child be handled normally.
569 */
570 timeout = 20;
571 watchdogs_set_timeout(120);
572 intervals_left = timeout_intervals = 1;
573 break;
574 case SIGQUIT:
575 if (settings->log_level >= LOG_LEVEL_NORMAL) {
576 printf("Timeout. Killing the current test with SIGKILL.\n");
577 fflush(stdout);
578 }
579
580 killed = SIGKILL;
581 if (!kill_child(killed, child))
582 return -1;
583
584 intervals_left = timeout_intervals = 1;
585 break;
586 case SIGKILL:
587 /*
588 * If the child still exists, and the kernel
589 * hasn't oopsed, assume it is still making
590 * forward progress towards exiting (i.e. still
591 * freeing all of its resources).
592 */
593 if (kill(child, 0) == 0 && !tainted(&taints)) {
594 intervals_left = 1;
595 break;
596 }
597
598 /* Nothing that can be done, really. Let's tell the caller we want to abort. */
599 if (settings->log_level >= LOG_LEVEL_NORMAL) {
600 fprintf(stderr, "Child refuses to die, tainted %lx. Aborting.\n",
601 taints);
602 }
603 close_watchdogs(settings);
604 free(outbuf);
605 close(outfd);
606 close(errfd);
607 close(kmsgfd);
608 return -1;
609 }
610
611 continue;
612 }
613
614 intervals_left = timeout_intervals;
615 ping_watchdogs();
616
617 /* TODO: Refactor these handlers to their own functions */
618 if (outfd >= 0 && FD_ISSET(outfd, &set)) {
619 char *newline;
620
621 s = read(outfd, buf, sizeof(buf));
622 if (s <= 0) {
623 if (s < 0) {
624 fprintf(stderr, "Error reading test's stdout: %s\n",
625 strerror(errno));
626 }
627
628 close(outfd);
629 outfd = -1;
630 goto out_end;
631 }
632
633 write(outputs[_F_OUT], buf, s);
634 if (settings->sync) {
635 fdatasync(outputs[_F_OUT]);
636 }
637
638 outbuf = realloc(outbuf, outbufsize + s);
639 memcpy(outbuf + outbufsize, buf, s);
640 outbufsize += s;
641
642 while ((newline = memchr(outbuf, '\n', outbufsize)) != NULL) {
643 size_t linelen = newline - outbuf + 1;
644
645 if (linelen > strlen(STARTING_SUBTEST) &&
646 !memcmp(outbuf, STARTING_SUBTEST, strlen(STARTING_SUBTEST))) {
647 write(outputs[_F_JOURNAL], outbuf + strlen(STARTING_SUBTEST),
648 linelen - strlen(STARTING_SUBTEST));
649 memcpy(current_subtest, outbuf + strlen(STARTING_SUBTEST),
650 linelen - strlen(STARTING_SUBTEST));
651 current_subtest[linelen - strlen(STARTING_SUBTEST)] = '\0';
652
653 if (settings->log_level >= LOG_LEVEL_VERBOSE) {
654 fwrite(outbuf, 1, linelen, stdout);
655 }
656 }
657 if (linelen > strlen(SUBTEST_RESULT) &&
658 !memcmp(outbuf, SUBTEST_RESULT, strlen(SUBTEST_RESULT))) {
659 char *delim = memchr(outbuf, ':', linelen);
660
661 if (delim != NULL) {
662 size_t subtestlen = delim - outbuf - strlen(SUBTEST_RESULT);
663 if (memcmp(current_subtest, outbuf + strlen(SUBTEST_RESULT),
664 subtestlen)) {
665 /* Result for a test that didn't ever start */
666 write(outputs[_F_JOURNAL],
667 outbuf + strlen(SUBTEST_RESULT),
668 subtestlen);
669 write(outputs[_F_JOURNAL], "\n", 1);
670 if (settings->sync) {
671 fdatasync(outputs[_F_JOURNAL]);
672 }
673 current_subtest[0] = '\0';
674 }
675
676 if (settings->log_level >= LOG_LEVEL_VERBOSE) {
677 fwrite(outbuf, 1, linelen, stdout);
678 }
679 }
680 }
681
682 memmove(outbuf, newline + 1, outbufsize - linelen);
683 outbufsize -= linelen;
684 }
685 }
686 out_end:
687
688 if (errfd >= 0 && FD_ISSET(errfd, &set)) {
689 s = read(errfd, buf, sizeof(buf));
690 if (s <= 0) {
691 if (s < 0) {
692 fprintf(stderr, "Error reading test's stderr: %s\n",
693 strerror(errno));
694 }
695 close(errfd);
696 errfd = -1;
697 } else {
698 write(outputs[_F_ERR], buf, s);
699 if (settings->sync) {
700 fdatasync(outputs[_F_ERR]);
701 }
702 }
703 }
704
705 if (kmsgfd >= 0 && FD_ISSET(kmsgfd, &set)) {
706 s = read(kmsgfd, buf, sizeof(buf));
707 if (s < 0) {
708 if (errno != EPIPE && errno != EINVAL) {
709 fprintf(stderr, "Error reading from kmsg, stopping monitoring: %s\n",
710 strerror(errno));
711 close(kmsgfd);
712 kmsgfd = -1;
713 } else if (errno == EINVAL) {
714 fprintf(stderr, "Warning: Buffer too small for kernel log record, record lost.\n");
715 }
716 } else {
717 write(outputs[_F_DMESG], buf, s);
718 if (settings->sync) {
719 fdatasync(outputs[_F_DMESG]);
720 }
721 }
722 }
723
724 if (sigfd >= 0 && FD_ISSET(sigfd, &set)) {
725 double time;
726
727 s = read(sigfd, &siginfo, sizeof(siginfo));
728 if (s < 0) {
729 fprintf(stderr, "Error reading from signalfd: %s\n",
730 strerror(errno));
731 continue;
732 } else if (siginfo.ssi_signo == SIGCHLD) {
733 if (child != waitpid(child, &status, WNOHANG)) {
734 fprintf(stderr, "Failed to reap child\n");
735 status = 9999;
736 } else if (WIFEXITED(status)) {
737 status = WEXITSTATUS(status);
738 if (status >= 128) {
739 status = 128 - status;
740 }
741 } else if (WIFSIGNALED(status)) {
742 status = -WTERMSIG(status);
743 } else {
744 status = 9999;
745 }
746 } else {
747 /* We're dying, so we're taking them with us */
748 if (settings->log_level >= LOG_LEVEL_NORMAL)
749 printf("Abort requested via %s, terminating children\n",
750 strsignal(siginfo.ssi_signo));
751
752 aborting = true;
753 timeout = 2;
754 killed = SIGQUIT;
755 if (!kill_child(killed, child))
756 return -1;
757
758 continue;
759 }
760
761 igt_gettime(&time_end);
762
763 time = igt_time_elapsed(&time_beg, &time_end);
764 if (time < 0.0)
765 time = 0.0;
766
767 if (!aborting) {
768 dprintf(outputs[_F_JOURNAL], "%s%d (%.3fs)\n",
769 killed ? EXECUTOR_TIMEOUT : EXECUTOR_EXIT,
770 status, time);
771 if (settings->sync) {
772 fdatasync(outputs[_F_JOURNAL]);
773 }
774
775 if (time_spent)
776 *time_spent = time;
777 }
778
779 child = 0;
780 sigfd = -1; /* we are dying, no signal handling for now */
781 }
782 }
783
784 dump_dmesg(kmsgfd, outputs[_F_DMESG]);
785 if (settings->sync)
786 fdatasync(outputs[_F_DMESG]);
787
788 free(outbuf);
789 close(outfd);
790 close(errfd);
791 close(kmsgfd);
792
793 if (aborting)
794 return -1;
795
796 return killed;
797 }
798
799 static void __attribute__((noreturn))
execute_test_process(int outfd,int errfd,struct settings * settings,struct job_list_entry * entry)800 execute_test_process(int outfd, int errfd,
801 struct settings *settings,
802 struct job_list_entry *entry)
803 {
804 char *argv[4] = {};
805 size_t rootlen;
806
807 dup2(outfd, STDOUT_FILENO);
808 dup2(errfd, STDERR_FILENO);
809
810 setpgid(0, 0);
811
812 rootlen = strlen(settings->test_root);
813 argv[0] = malloc(rootlen + strlen(entry->binary) + 2);
814 strcpy(argv[0], settings->test_root);
815 argv[0][rootlen] = '/';
816 strcpy(argv[0] + rootlen + 1, entry->binary);
817
818 if (entry->subtest_count) {
819 size_t argsize;
820 size_t i;
821
822 argv[1] = strdup("--run-subtest");
823 argsize = strlen(entry->subtests[0]);
824 argv[2] = malloc(argsize + 1);
825 strcpy(argv[2], entry->subtests[0]);
826
827 for (i = 1; i < entry->subtest_count; i++) {
828 char *sub = entry->subtests[i];
829 size_t sublen = strlen(sub);
830
831 argv[2] = realloc(argv[2], argsize + sublen + 2);
832 argv[2][argsize] = ',';
833 strcpy(argv[2] + argsize + 1, sub);
834 argsize += sublen + 1;
835 }
836 }
837
838 execv(argv[0], argv);
839 fprintf(stderr, "Cannot execute %s\n", argv[0]);
840 exit(IGT_EXIT_INVALID);
841 }
842
digits(size_t num)843 static int digits(size_t num)
844 {
845 int ret = 0;
846 while (num) {
847 num /= 10;
848 ret++;
849 }
850
851 if (ret == 0) ret++;
852 return ret;
853 }
854
print_time_left(struct execute_state * state,struct settings * settings)855 static void print_time_left(struct execute_state *state,
856 struct settings *settings)
857 {
858 int width;
859
860 if (settings->overall_timeout <= 0)
861 return;
862
863 width = digits(settings->overall_timeout);
864 printf("(%*.0fs left) ", width, state->time_left);
865 }
866
entry_display_name(struct job_list_entry * entry)867 static char *entry_display_name(struct job_list_entry *entry)
868 {
869 size_t size = strlen(entry->binary) + 1;
870 char *ret = malloc(size);
871
872 sprintf(ret, "%s", entry->binary);
873
874 if (entry->subtest_count > 0) {
875 size_t i;
876 const char *delim = "";
877
878 size += 3; /* strlen(" (") + strlen(")") */
879 ret = realloc(ret, size);
880 strcat(ret, " (");
881
882 for (i = 0; i < entry->subtest_count; i++) {
883 size += strlen(delim) + strlen(entry->subtests[i]);
884 ret = realloc(ret, size);
885
886 strcat(ret, delim);
887 strcat(ret, entry->subtests[i]);
888
889 delim = ", ";
890 }
891 /* There's already room for this */
892 strcat(ret, ")");
893 }
894
895 return ret;
896 }
897
898 /*
899 * Returns:
900 * =0 - Success
901 * <0 - Failure executing
902 * >0 - Timeout happened, need to recreate from journal
903 */
execute_next_entry(struct execute_state * state,size_t total,double * time_spent,struct settings * settings,struct job_list_entry * entry,int testdirfd,int resdirfd,int sigfd,sigset_t * sigmask)904 static int execute_next_entry(struct execute_state *state,
905 size_t total,
906 double *time_spent,
907 struct settings *settings,
908 struct job_list_entry *entry,
909 int testdirfd, int resdirfd,
910 int sigfd, sigset_t *sigmask)
911 {
912 int dirfd;
913 int outputs[_F_LAST];
914 int kmsgfd;
915 int outpipe[2] = { -1, -1 };
916 int errpipe[2] = { -1, -1 };
917 int outfd, errfd;
918 char name[32];
919 pid_t child;
920 int result;
921 size_t idx = state->next;
922
923 snprintf(name, sizeof(name), "%zd", idx);
924 mkdirat(resdirfd, name, 0777);
925 if ((dirfd = openat(resdirfd, name, O_DIRECTORY | O_RDONLY | O_CLOEXEC)) < 0) {
926 fprintf(stderr, "Error accessing individual test result directory\n");
927 return -1;
928 }
929
930 if (!open_output_files(dirfd, outputs, true)) {
931 fprintf(stderr, "Error opening output files\n");
932 result = -1;
933 goto out_dirfd;
934 }
935
936 if (settings->sync) {
937 fsync(dirfd);
938 fsync(resdirfd);
939 }
940
941 if (pipe(outpipe) || pipe(errpipe)) {
942 fprintf(stderr, "Error creating pipes: %s\n", strerror(errno));
943 result = -1;
944 goto out_pipe;
945 }
946
947 if ((kmsgfd = open("/dev/kmsg", O_RDONLY | O_CLOEXEC)) < 0) {
948 fprintf(stderr, "Warning: Cannot open /dev/kmsg\n");
949 } else {
950 /* TODO: Checking of abort conditions in pre-execute dmesg */
951 lseek(kmsgfd, 0, SEEK_END);
952 }
953
954
955 if (settings->log_level >= LOG_LEVEL_NORMAL) {
956 char *displayname;
957 int width = digits(total);
958 printf("[%0*zd/%0*zd] ", width, idx + 1, width, total);
959
960 print_time_left(state, settings);
961
962 displayname = entry_display_name(entry);
963 printf("%s", displayname);
964 free(displayname);
965
966 printf("\n");
967 }
968
969 /*
970 * Flush outputs before forking so our (buffered) output won't
971 * end up in the test outputs.
972 */
973 fflush(stdout);
974 fflush(stderr);
975
976 child = fork();
977 if (child < 0) {
978 fprintf(stderr, "Failed to fork: %s\n", strerror(errno));
979 result = -1;
980 goto out_kmsgfd;
981 } else if (child == 0) {
982 outfd = outpipe[1];
983 errfd = errpipe[1];
984 close(outpipe[0]);
985 close(errpipe[0]);
986
987 sigprocmask(SIG_UNBLOCK, sigmask, NULL);
988
989 setenv("IGT_SENTINEL_ON_STDERR", "1", 1);
990
991 execute_test_process(outfd, errfd, settings, entry);
992 /* unreachable */
993 }
994
995 outfd = outpipe[0];
996 errfd = errpipe[0];
997 close(outpipe[1]);
998 close(errpipe[1]);
999 outpipe[1] = errpipe[1] = -1;
1000
1001 result = monitor_output(child, outfd, errfd, kmsgfd, sigfd,
1002 outputs, time_spent, settings);
1003
1004 out_kmsgfd:
1005 close(kmsgfd);
1006 out_pipe:
1007 close_outputs(outputs);
1008 close(outpipe[0]);
1009 close(outpipe[1]);
1010 close(errpipe[0]);
1011 close(errpipe[1]);
1012 close_outputs(outputs);
1013 out_dirfd:
1014 close(dirfd);
1015
1016 return result;
1017 }
1018
remove_file(int dirfd,const char * name)1019 static int remove_file(int dirfd, const char *name)
1020 {
1021 return unlinkat(dirfd, name, 0) && errno != ENOENT;
1022 }
1023
clear_test_result_directory(int dirfd)1024 static bool clear_test_result_directory(int dirfd)
1025 {
1026 int i;
1027
1028 for (i = 0; i < _F_LAST; i++) {
1029 if (remove_file(dirfd, filenames[i])) {
1030 fprintf(stderr, "Error deleting %s from test result directory: %s\n",
1031 filenames[i],
1032 strerror(errno));
1033 return false;
1034 }
1035 }
1036
1037 return true;
1038 }
1039
clear_old_results(char * path)1040 static bool clear_old_results(char *path)
1041 {
1042 int dirfd;
1043 size_t i;
1044
1045 if ((dirfd = open(path, O_DIRECTORY | O_RDONLY)) < 0) {
1046 if (errno == ENOENT) {
1047 /* Successfully cleared if it doesn't even exist */
1048 return true;
1049 }
1050
1051 fprintf(stderr, "Error clearing old results: %s\n", strerror(errno));
1052 return false;
1053 }
1054
1055 if (remove_file(dirfd, "uname.txt") ||
1056 remove_file(dirfd, "starttime.txt") ||
1057 remove_file(dirfd, "endtime.txt") ||
1058 remove_file(dirfd, "aborted.txt")) {
1059 close(dirfd);
1060 fprintf(stderr, "Error clearing old results: %s\n", strerror(errno));
1061 return false;
1062 }
1063
1064 for (i = 0; true; i++) {
1065 char name[32];
1066 int resdirfd;
1067
1068 snprintf(name, sizeof(name), "%zd", i);
1069 if ((resdirfd = openat(dirfd, name, O_DIRECTORY | O_RDONLY)) < 0)
1070 break;
1071
1072 if (!clear_test_result_directory(resdirfd)) {
1073 close(resdirfd);
1074 close(dirfd);
1075 return false;
1076 }
1077 close(resdirfd);
1078 if (unlinkat(dirfd, name, AT_REMOVEDIR)) {
1079 fprintf(stderr,
1080 "Warning: Result directory %s contains extra files\n",
1081 name);
1082 }
1083 }
1084
1085 close(dirfd);
1086
1087 return true;
1088 }
1089
timeofday_double(void)1090 static double timeofday_double(void)
1091 {
1092 struct timeval tv;
1093
1094 if (!gettimeofday(&tv, NULL))
1095 return tv.tv_sec + tv.tv_usec / 1000000.0;
1096 return 0.0;
1097 }
1098
init_time_left(struct execute_state * state,struct settings * settings)1099 static void init_time_left(struct execute_state *state,
1100 struct settings *settings)
1101 {
1102 if (settings->overall_timeout <= 0)
1103 state->time_left = -1;
1104 else
1105 state->time_left = settings->overall_timeout;
1106 }
1107
initialize_execute_state_from_resume(int dirfd,struct execute_state * state,struct settings * settings,struct job_list * list)1108 bool initialize_execute_state_from_resume(int dirfd,
1109 struct execute_state *state,
1110 struct settings *settings,
1111 struct job_list *list)
1112 {
1113 struct job_list_entry *entry;
1114 int resdirfd, fd, i;
1115
1116 free_settings(settings);
1117 free_job_list(list);
1118 memset(state, 0, sizeof(*state));
1119 state->resuming = true;
1120
1121 if (!read_settings_from_dir(settings, dirfd) ||
1122 !read_job_list(list, dirfd)) {
1123 close(dirfd);
1124 return false;
1125 }
1126
1127 init_time_left(state, settings);
1128
1129 for (i = list->size; i >= 0; i--) {
1130 char name[32];
1131
1132 snprintf(name, sizeof(name), "%d", i);
1133 if ((resdirfd = openat(dirfd, name, O_DIRECTORY | O_RDONLY)) >= 0)
1134 break;
1135 }
1136
1137 if (i < 0)
1138 /* Nothing has been executed yet, state is fine as is */
1139 goto success;
1140
1141 entry = &list->entries[i];
1142 state->next = i;
1143 if ((fd = openat(resdirfd, filenames[_F_JOURNAL], O_RDONLY)) >= 0) {
1144 if (!prune_from_journal(entry, fd)) {
1145 /*
1146 * The test does not have subtests, or
1147 * incompleted before the first subtest
1148 * began. Either way, not suitable to
1149 * re-run.
1150 */
1151 state->next = i + 1;
1152 } else if (entry->binary[0] == '\0') {
1153 /* This test is fully completed */
1154 state->next = i + 1;
1155 }
1156
1157 close(fd);
1158 }
1159
1160 success:
1161 close(resdirfd);
1162 close(dirfd);
1163
1164 return true;
1165 }
1166
initialize_execute_state(struct execute_state * state,struct settings * settings,struct job_list * job_list)1167 bool initialize_execute_state(struct execute_state *state,
1168 struct settings *settings,
1169 struct job_list *job_list)
1170 {
1171 memset(state, 0, sizeof(*state));
1172
1173 if (!validate_settings(settings))
1174 return false;
1175
1176 if (!serialize_settings(settings) ||
1177 !serialize_job_list(job_list, settings))
1178 return false;
1179
1180 if (settings->overwrite &&
1181 !clear_old_results(settings->results_path))
1182 return false;
1183
1184 init_time_left(state, settings);
1185
1186 state->dry = settings->dry_run;
1187
1188 return true;
1189 }
1190
reduce_time_left(struct settings * settings,struct execute_state * state,double time_spent)1191 static void reduce_time_left(struct settings *settings,
1192 struct execute_state *state,
1193 double time_spent)
1194 {
1195 if (state->time_left < 0)
1196 return;
1197
1198 if (time_spent > state->time_left)
1199 state->time_left = 0.0;
1200 else
1201 state->time_left -= time_spent;
1202 }
1203
overall_timeout_exceeded(struct execute_state * state)1204 static bool overall_timeout_exceeded(struct execute_state *state)
1205 {
1206 return state->time_left == 0.0;
1207 }
1208
write_abort_file(int resdirfd,const char * reason,const char * testbefore,const char * testafter)1209 static void write_abort_file(int resdirfd,
1210 const char *reason,
1211 const char *testbefore,
1212 const char *testafter)
1213 {
1214 int abortfd;
1215
1216 if ((abortfd = openat(resdirfd, "aborted.txt", O_CREAT | O_WRONLY | O_EXCL, 0666)) >= 0) {
1217 /*
1218 * Ignore failure to open, there's
1219 * already an abort probably (if this
1220 * is a resume)
1221 */
1222 dprintf(abortfd, "Aborting.\n");
1223 dprintf(abortfd, "Previous test: %s\n", testbefore);
1224 dprintf(abortfd, "Next test: %s\n\n", testafter);
1225 write(abortfd, reason, strlen(reason));
1226 close(abortfd);
1227 }
1228 }
1229
oom_immortal(void)1230 static void oom_immortal(void)
1231 {
1232 int fd;
1233 const char never_kill[] = "-1000";
1234
1235 fd = open("/proc/self/oom_score_adj", O_WRONLY);
1236 if (fd < 0) {
1237 fprintf(stderr, "Warning: Cannot adjust oom score.\n");
1238 return;
1239 }
1240 if (write(fd, never_kill, sizeof(never_kill)) != sizeof(never_kill))
1241 fprintf(stderr, "Warning: Adjusting oom score failed.\n");
1242
1243 close(fd);
1244 }
1245
should_die_because_signal(int sigfd)1246 static bool should_die_because_signal(int sigfd)
1247 {
1248 struct signalfd_siginfo siginfo;
1249 int ret;
1250 struct pollfd sigpoll = { .fd = sigfd, .events = POLLIN | POLLRDBAND };
1251
1252 ret = poll(&sigpoll, 1, 0);
1253
1254 if (ret != 0) {
1255 if (ret == -1) {
1256 fprintf(stderr, "Poll on signalfd failed with %s\n", strerror(errno));
1257 return true; /* something is wrong, let's die */
1258 }
1259
1260 ret = read(sigfd, &siginfo, sizeof(siginfo));
1261
1262 if (ret == -1) {
1263 fprintf(stderr, "Error reading from signalfd: %s\n", strerror(errno));
1264 return false; /* we may want to retry later */
1265 }
1266
1267 if (siginfo.ssi_signo == SIGCHLD) {
1268 fprintf(stderr, "Runner got stray SIGCHLD while not executing any tests.\n");
1269 } else {
1270 fprintf(stderr, "Runner is being killed by %s\n",
1271 strsignal(siginfo.ssi_signo));
1272 return true;
1273 }
1274
1275 }
1276
1277 return false;
1278 }
1279
execute(struct execute_state * state,struct settings * settings,struct job_list * job_list)1280 bool execute(struct execute_state *state,
1281 struct settings *settings,
1282 struct job_list *job_list)
1283 {
1284 struct utsname unamebuf;
1285 int resdirfd, testdirfd, unamefd, timefd;
1286 sigset_t sigmask;
1287 int sigfd;
1288 double time_spent = 0.0;
1289 bool status = true;
1290
1291 if (state->dry) {
1292 printf("Dry run, not executing. Invoke igt_resume if you want to execute.\n");
1293 return true;
1294 }
1295
1296 if ((resdirfd = open(settings->results_path, O_DIRECTORY | O_RDONLY)) < 0) {
1297 /* Initialize state should have done this */
1298 fprintf(stderr, "Error: Failure opening results path %s\n",
1299 settings->results_path);
1300 return false;
1301 }
1302
1303 if ((testdirfd = open(settings->test_root, O_DIRECTORY | O_RDONLY)) < 0) {
1304 fprintf(stderr, "Error: Failure opening test root %s\n",
1305 settings->test_root);
1306 close(resdirfd);
1307 return false;
1308 }
1309
1310 /* TODO: On resume, don't rewrite, verify that content matches current instead */
1311 if ((unamefd = openat(resdirfd, "uname.txt", O_CREAT | O_WRONLY | O_TRUNC, 0666)) < 0) {
1312 fprintf(stderr, "Error: Failure opening uname.txt: %s\n",
1313 strerror(errno));
1314 close(testdirfd);
1315 close(resdirfd);
1316 return false;
1317 }
1318
1319 if ((timefd = openat(resdirfd, "starttime.txt", O_CREAT | O_WRONLY | O_EXCL, 0666)) >= 0) {
1320 /*
1321 * Ignore failure to open. If this is a resume, we
1322 * don't want to overwrite. For other errors, we
1323 * ignore the start time.
1324 */
1325 dprintf(timefd, "%f\n", timeofday_double());
1326 close(timefd);
1327 }
1328
1329 oom_immortal();
1330
1331 sigemptyset(&sigmask);
1332 sigaddset(&sigmask, SIGCHLD);
1333 sigaddset(&sigmask, SIGINT);
1334 sigaddset(&sigmask, SIGTERM);
1335 sigaddset(&sigmask, SIGQUIT);
1336 sigaddset(&sigmask, SIGHUP);
1337 sigfd = signalfd(-1, &sigmask, O_CLOEXEC);
1338 sigprocmask(SIG_BLOCK, &sigmask, NULL);
1339
1340 if (sigfd < 0) {
1341 /* TODO: Handle better */
1342 fprintf(stderr, "Cannot mask signals\n");
1343 status = false;
1344 goto end;
1345 }
1346
1347 init_watchdogs(settings);
1348
1349 if (!uname(&unamebuf)) {
1350 dprintf(unamefd, "%s %s %s %s %s\n",
1351 unamebuf.sysname,
1352 unamebuf.nodename,
1353 unamebuf.release,
1354 unamebuf.version,
1355 unamebuf.machine);
1356 } else {
1357 dprintf(unamefd, "uname() failed\n");
1358 }
1359 close(unamefd);
1360
1361 /* Check if we're already in abort-state at bootup */
1362 if (!state->resuming) {
1363 char *reason;
1364
1365 if ((reason = need_to_abort(settings)) != NULL) {
1366 char *nexttest = entry_display_name(&job_list->entries[state->next]);
1367 write_abort_file(resdirfd, reason, "nothing", nexttest);
1368 free(reason);
1369 free(nexttest);
1370
1371 status = false;
1372
1373 goto end;
1374 }
1375 }
1376
1377 for (; state->next < job_list->size;
1378 state->next++) {
1379 char *reason;
1380 int result;
1381
1382 if (should_die_because_signal(sigfd)) {
1383 status = false;
1384 goto end;
1385 }
1386
1387 result = execute_next_entry(state,
1388 job_list->size,
1389 &time_spent,
1390 settings,
1391 &job_list->entries[state->next],
1392 testdirfd, resdirfd,
1393 sigfd, &sigmask);
1394
1395 if (result < 0) {
1396 status = false;
1397 break;
1398 }
1399
1400 reduce_time_left(settings, state, time_spent);
1401
1402 if (overall_timeout_exceeded(state)) {
1403 if (settings->log_level >= LOG_LEVEL_NORMAL) {
1404 printf("Overall timeout time exceeded, stopping.\n");
1405 }
1406
1407 break;
1408 }
1409
1410 if ((reason = need_to_abort(settings)) != NULL) {
1411 char *prev = entry_display_name(&job_list->entries[state->next]);
1412 char *next = (state->next + 1 < job_list->size ?
1413 entry_display_name(&job_list->entries[state->next + 1]) :
1414 strdup("nothing"));
1415 write_abort_file(resdirfd, reason, prev, next);
1416 free(prev);
1417 free(next);
1418 free(reason);
1419 status = false;
1420 break;
1421 }
1422
1423 if (result > 0) {
1424 double time_left = state->time_left;
1425
1426 close_watchdogs(settings);
1427 sigprocmask(SIG_UNBLOCK, &sigmask, NULL);
1428 /* make sure that we do not leave any signals unhandled */
1429 if (should_die_because_signal(sigfd)) {
1430 status = false;
1431 goto end_post_signal_restore;
1432 }
1433 close(sigfd);
1434 close(testdirfd);
1435 initialize_execute_state_from_resume(resdirfd, state, settings, job_list);
1436 state->time_left = time_left;
1437 return execute(state, settings, job_list);
1438 }
1439 }
1440
1441 if ((timefd = openat(resdirfd, "endtime.txt", O_CREAT | O_WRONLY | O_EXCL, 0666)) >= 0) {
1442 dprintf(timefd, "%f\n", timeofday_double());
1443 close(timefd);
1444 }
1445
1446 end:
1447 close_watchdogs(settings);
1448 sigprocmask(SIG_UNBLOCK, &sigmask, NULL);
1449 /* make sure that we do not leave any signals unhandled */
1450 if (should_die_because_signal(sigfd))
1451 status = false;
1452 end_post_signal_restore:
1453 close(sigfd);
1454 close(testdirfd);
1455 close(resdirfd);
1456 return status;
1457 }
1458