import os, select
import virt_utils, virt_vm, aexpect


class scheduler:
    """
    A scheduler that manages several parallel test execution pipelines on a
    single host.
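
    A rough usage sketch (the job object, test list and resource numbers are
    placeholders rather than part of this module; an autotest-style job with
    parallel() and run_test() methods is assumed):

        s = scheduler(tests, num_workers=4, total_cpus=4,
                      total_mem=4096, bindir=test_bindir)
        job.parallel([s.scheduler],
                     *[(s.worker, i, job.run_test) for i in range(4)])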
    """

    def __init__(self, tests, num_workers, total_cpus, total_mem, bindir):
        """
        Initialize the class.

        @param tests: A list of test dictionaries.
        @param num_workers: The number of workers (pipelines).
        @param total_cpus: The total number of CPUs to dedicate to tests.
        @param total_mem: The total amount of memory to dedicate to tests.
        @param bindir: The directory where environment files reside.
        """
        self.tests = tests
        self.num_workers = num_workers
        self.total_cpus = total_cpus
        self.total_mem = total_mem
        self.bindir = bindir
        # Pipes -- s stands for scheduler, w stands for worker
        self.s2w = [os.pipe() for i in range(num_workers)]
        self.w2s = [os.pipe() for i in range(num_workers)]
        self.s2w_r = [os.fdopen(r, "r", 0) for r, w in self.s2w]
        self.s2w_w = [os.fdopen(w, "w", 0) for r, w in self.s2w]
        self.w2s_r = [os.fdopen(r, "r", 0) for r, w in self.w2s]
        self.w2s_w = [os.fdopen(w, "w", 0) for r, w in self.w2s]
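        # The scheduler and its workers exchange newline-terminated text
        # messages over these pipes:
        #   scheduler -> worker: "run <test_index>", "cleanup", "terminate"
        #   worker -> scheduler: "ready", "done <test_index> <status>",
        #                        "cleanup_done"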
        # "Personal" worker dicts contain modifications that are applied
        # specifically to each worker.  For example, each worker must use a
        # different environment file and a different MAC address pool.
        self.worker_dicts = [{"env": "env%d" % i} for i in range(num_workers)]


    def worker(self, index, run_test_func):
        """
        The worker function.

        Waits for commands from the scheduler and processes them.

        @param index: The index of this worker (in the range 0..num_workers-1).
        @param run_test_func: A function to be called to run a test
                (e.g. job.run_test).
        """
        r = self.s2w_r[index]
        w = self.w2s_w[index]
        self_dict = self.worker_dicts[index]

        # Inform the scheduler this worker is ready
        w.write("ready\n")

        while True:
            cmd = r.readline().split()
            if not cmd:
                continue

            # The scheduler wants this worker to run a test
            if cmd[0] == "run":
                test_index = int(cmd[1])
                test = self.tests[test_index].copy()
                test.update(self_dict)
                test_iterations = int(test.get("iterations", 1))
                status = run_test_func("kvm", params=test,
                                       tag=test.get("shortname"),
                                       iterations=test_iterations)
                w.write("done %s %s\n" % (test_index, status))
                w.write("ready\n")

            # The scheduler wants this worker to free its used resources
            elif cmd[0] == "cleanup":
                env_filename = os.path.join(self.bindir, self_dict["env"])
                env = virt_utils.Env(env_filename)
                for obj in env.values():
                    if isinstance(obj, virt_vm.VM):
                        obj.destroy()
                    elif isinstance(obj, aexpect.Spawn):
                        obj.close()
                env.save()
                w.write("cleanup_done\n")
                w.write("ready\n")

            # There's no more work for this worker
            elif cmd[0] == "terminate":
                break


    def scheduler(self):
        """
        The scheduler function.

        Sends commands to workers, telling them to run tests, clean up or
        terminate execution.
        """
        idle_workers = []       # workers that reported "ready" and await work
        closing_workers = []    # workers currently freeing their resources
        test_status = ["waiting"] * len(self.tests)
        test_worker = [None] * len(self.tests)      # worker assigned to each test
        used_cpus = [0] * self.num_workers          # CPUs reserved by each worker
        used_mem = [0] * self.num_workers           # memory reserved by each worker

        while True:
            # Wait for a message from a worker
            r, w, x = select.select(self.w2s_r, [], [])

            someone_is_ready = False

            for pipe in r:
                worker_index = self.w2s_r.index(pipe)
                msg = pipe.readline().split()
                if not msg:
                    continue

                # A worker is ready -- add it to the idle_workers list
                if msg[0] == "ready":
                    idle_workers.append(worker_index)
                    someone_is_ready = True

                # A worker completed a test
                elif msg[0] == "done":
                    test_index = int(msg[1])
                    test = self.tests[test_index]
                    # msg[2] is the "True"/"False" status string the worker
                    # reported; turn it back into an int (1 = pass, 0 = fail)
                    status = int(eval(msg[2]))
                    test_status[test_index] = ("fail", "pass")[status]
                    # If the test failed, mark all dependent tests as "fail" too
                    if not status:
                        for i, other_test in enumerate(self.tests):
                            for dep in other_test.get("dep", []):
                                if dep in test["name"]:
                                    test_status[i] = "fail"

                # A worker is done shutting down its VMs and other processes
                elif msg[0] == "cleanup_done":
                    used_cpus[worker_index] = 0
                    used_mem[worker_index] = 0
                    closing_workers.remove(worker_index)

            if not someone_is_ready:
                continue

            for worker in idle_workers[:]:
                # Find a test for this worker
                test_found = False
                for i, test in enumerate(self.tests):
                    # We only want "waiting" tests
                    if test_status[i] != "waiting":
                        continue
                    # Make sure the test isn't assigned to another worker
                    if test_worker[i] is not None and test_worker[i] != worker:
                        continue
                    # Make sure the test's dependencies are satisfied
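                    # (dependencies are matched by substring: a dep such as
                    # "install" is satisfied only once every test whose name
                    # contains "install" has status "pass")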
                    dependencies_satisfied = True
                    for dep in test["dep"]:
                        dependencies = [j for j, t in enumerate(self.tests)
                                        if dep in t["name"]]
                        bad_status_deps = [j for j in dependencies
                                           if test_status[j] != "pass"]
                        if bad_status_deps:
                            dependencies_satisfied = False
                            break
                    if not dependencies_satisfied:
                        continue
                    # Make sure we have enough resources to run the test
                    test_used_cpus = int(test.get("used_cpus", 1))
                    test_used_mem = int(test.get("used_mem", 128))
                    # First make sure the other workers aren't using too many
                    # CPUs (not including the workers currently shutting down)
                    uc = (sum(used_cpus) - used_cpus[worker] -
                          sum(used_cpus[i] for i in closing_workers))
                    if uc and uc + test_used_cpus > self.total_cpus:
                        continue
                    # ... or too much memory
                    um = (sum(used_mem) - used_mem[worker] -
                          sum(used_mem[i] for i in closing_workers))
                    if um and um + test_used_mem > self.total_mem:
                        continue
                    # If we reached this point it means there are, or will
                    # soon be, enough resources to run the test
                    test_found = True
                    # Now check if the test can be run right now, i.e. if the
                    # other workers, including the ones currently shutting
                    # down, aren't using too many CPUs
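                    # (e.g. with total_cpus=4, a 2-CPU test passes the first
                    # check when other workers hold 3 CPUs of which 1 belongs
                    # to a closing worker (2 + 2 <= 4), but fails this one
                    # (3 + 2 > 4), so it is retried on a later pass)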
                    uc = (sum(used_cpus) - used_cpus[worker])
                    if uc and uc + test_used_cpus > self.total_cpus:
                        continue
                    # ... or too much memory
                    um = (sum(used_mem) - used_mem[worker])
                    if um and um + test_used_mem > self.total_mem:
                        continue
                    # Everything is OK -- run the test
                    test_status[i] = "running"
                    test_worker[i] = worker
                    idle_workers.remove(worker)
                    # Update used_cpus and used_mem
                    used_cpus[worker] = test_used_cpus
                    used_mem[worker] = test_used_mem
                    # Assign all related tests to this worker
                    for j, other_test in enumerate(self.tests):
                        for other_dep in other_test["dep"]:
                            # All tests that depend on this test
                            if other_dep in test["name"]:
                                test_worker[j] = worker
                                break
                            # ... and all tests that share a dependency
                            # with this test
                            for dep in test["dep"]:
                                if dep in other_dep or other_dep in dep:
                                    test_worker[j] = worker
                                    break
                    # Tell the worker to run the test
                    self.s2w_w[worker].write("run %s\n" % i)
                    break

                # If there won't be any tests for this worker to run soon, tell
                # the worker to free its used resources
                if not test_found and (used_cpus[worker] or used_mem[worker]):
                    self.s2w_w[worker].write("cleanup\n")
                    idle_workers.remove(worker)
                    closing_workers.append(worker)

            # If there are no more new tests to run, terminate the workers and
            # the scheduler
            if len(idle_workers) == self.num_workers:
                for worker in idle_workers:
                    self.s2w_w[worker].write("terminate\n")
                break