1 /*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26 /*
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32 /*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * libcfs/libcfs/workitem.c
37 *
38 * Author: Isaac Huang <isaac@clusterfs.com>
39 * Liang Zhen <zhen.liang@sun.com>
40 */
41
42 #define DEBUG_SUBSYSTEM S_LNET
43
44 #include "../../include/linux/libcfs/libcfs.h"
45
46 #define CFS_WS_NAME_LEN 16
47
/**
 * A workitem scheduler: the queues of pending workitems plus the bookkeeping
 * for the kernel threads (see cfs_wi_scheduler()) that service them.
 */
struct cfs_wi_sched {
	struct list_head ws_list; /* chain on global list (cfs_wi_data::wi_scheds) */
	/** per-scheduler lock, taken via cfs_wi_sched_lock(); held while
	 * ws_runq, ws_rerunq, ws_nscheduled and the workitem state flags
	 * are examined or changed */
	spinlock_t ws_lock;
	/** where scheduler threads sleep while there is nothing to run */
	wait_queue_head_t ws_waitq;
	/** workitems waiting to run; scheduler threads take from the head */
	struct list_head ws_runq;
	/** rescheduled running-workitems: a workitem can be rescheduled
	 * while running in wi_action(), but we don't want to execute it
	 * again until it returns from wi_action(), so we put it on
	 * ws_rerunq while rescheduling, and move it to ws_runq after it
	 * returns from wi_action() */
	struct list_head ws_rerunq;
	/** CPT-table for this scheduler (NULL means no CPT binding is done) */
	struct cfs_cpt_table *ws_cptab;
	/** CPT id for affinity */
	int ws_cpt;
	/** number of scheduled workitems, i.e. those currently linked on
	 * ws_runq or ws_rerunq */
	int ws_nscheduled;
	/** started scheduler thread, protected by cfs_wi_data::wi_glock */
	unsigned int ws_nthreads:30;
	/** shutting down, protected by cfs_wi_data::wi_glock */
	unsigned int ws_stopping:1;
	/** serialize starting thread, protected by cfs_wi_data::wi_glock;
	 * set by cfs_wi_sched_create() before spawning a thread and cleared
	 * by that thread once it is running */
	unsigned int ws_starting:1;
	/** scheduler name; also used as the prefix of its thread names */
	char ws_name[CFS_WS_NAME_LEN];
};
77
/** Module-wide workitem state; there is exactly one instance, cfs_wi_data. */
static struct cfs_workitem_data {
	/** serialize updates to wi_scheds and to the per-scheduler
	 * thread-accounting fields (ws_nthreads, ws_starting) */
	spinlock_t wi_glock;
	/** list of all schedulers */
	struct list_head wi_scheds;
	/** WI module is initialized: set by cfs_wi_startup(), cleared at the
	 * end of cfs_wi_shutdown() */
	int wi_init;
	/** shutting down the whole WI module */
	int wi_stopping;
} cfs_wi_data;
88
89 static inline void
cfs_wi_sched_lock(struct cfs_wi_sched * sched)90 cfs_wi_sched_lock(struct cfs_wi_sched *sched)
91 {
92 spin_lock(&sched->ws_lock);
93 }
94
95 static inline void
cfs_wi_sched_unlock(struct cfs_wi_sched * sched)96 cfs_wi_sched_unlock(struct cfs_wi_sched *sched)
97 {
98 spin_unlock(&sched->ws_lock);
99 }
100
101 static inline int
cfs_wi_sched_cansleep(struct cfs_wi_sched * sched)102 cfs_wi_sched_cansleep(struct cfs_wi_sched *sched)
103 {
104 cfs_wi_sched_lock(sched);
105 if (sched->ws_stopping) {
106 cfs_wi_sched_unlock(sched);
107 return 0;
108 }
109
110 if (!list_empty(&sched->ws_runq)) {
111 cfs_wi_sched_unlock(sched);
112 return 0;
113 }
114 cfs_wi_sched_unlock(sched);
115 return 1;
116 }
117
118 /* XXX:
119 * 0. it only works when called from wi->wi_action.
120 * 1. when it returns no one shall try to schedule the workitem.
121 */
122 void
cfs_wi_exit(struct cfs_wi_sched * sched,cfs_workitem_t * wi)123 cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
124 {
125 LASSERT(!in_interrupt()); /* because we use plain spinlock */
126 LASSERT(!sched->ws_stopping);
127
128 cfs_wi_sched_lock(sched);
129
130 LASSERT(wi->wi_running);
131 if (wi->wi_scheduled) { /* cancel pending schedules */
132 LASSERT(!list_empty(&wi->wi_list));
133 list_del_init(&wi->wi_list);
134
135 LASSERT(sched->ws_nscheduled > 0);
136 sched->ws_nscheduled--;
137 }
138
139 LASSERT(list_empty(&wi->wi_list));
140
141 wi->wi_scheduled = 1; /* LBUG future schedule attempts */
142 cfs_wi_sched_unlock(sched);
143
144 return;
145 }
146 EXPORT_SYMBOL(cfs_wi_exit);
147
148 /**
149 * cancel schedule request of workitem \a wi
150 */
151 int
cfs_wi_deschedule(struct cfs_wi_sched * sched,cfs_workitem_t * wi)152 cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
153 {
154 int rc;
155
156 LASSERT(!in_interrupt()); /* because we use plain spinlock */
157 LASSERT(!sched->ws_stopping);
158
159 /*
160 * return 0 if it's running already, otherwise return 1, which
161 * means the workitem will not be scheduled and will not have
162 * any race with wi_action.
163 */
164 cfs_wi_sched_lock(sched);
165
166 rc = !(wi->wi_running);
167
168 if (wi->wi_scheduled) { /* cancel pending schedules */
169 LASSERT(!list_empty(&wi->wi_list));
170 list_del_init(&wi->wi_list);
171
172 LASSERT(sched->ws_nscheduled > 0);
173 sched->ws_nscheduled--;
174
175 wi->wi_scheduled = 0;
176 }
177
178 LASSERT (list_empty(&wi->wi_list));
179
180 cfs_wi_sched_unlock(sched);
181 return rc;
182 }
183 EXPORT_SYMBOL(cfs_wi_deschedule);
184
185 /*
186 * Workitem scheduled with (serial == 1) is strictly serialised not only with
187 * itself, but also with others scheduled this way.
188 *
189 * Now there's only one static serialised queue, but in the future more might
190 * be added, and even dynamic creation of serialised queues might be supported.
191 */
192 void
cfs_wi_schedule(struct cfs_wi_sched * sched,cfs_workitem_t * wi)193 cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
194 {
195 LASSERT(!in_interrupt()); /* because we use plain spinlock */
196 LASSERT(!sched->ws_stopping);
197
198 cfs_wi_sched_lock(sched);
199
200 if (!wi->wi_scheduled) {
201 LASSERT (list_empty(&wi->wi_list));
202
203 wi->wi_scheduled = 1;
204 sched->ws_nscheduled++;
205 if (!wi->wi_running) {
206 list_add_tail(&wi->wi_list, &sched->ws_runq);
207 wake_up(&sched->ws_waitq);
208 } else {
209 list_add(&wi->wi_list, &sched->ws_rerunq);
210 }
211 }
212
213 LASSERT (!list_empty(&wi->wi_list));
214 cfs_wi_sched_unlock(sched);
215 return;
216 }
217 EXPORT_SYMBOL(cfs_wi_schedule);
218
/**
 * Main function of a scheduler thread (started by cfs_wi_sched_create()
 * through kthread_run()).
 *
 * Repeatedly takes workitems from the head of the run queue and calls their
 * wi_action with the scheduler lock dropped, until the scheduler is marked
 * as stopping.
 *
 * \param arg	the struct cfs_wi_sched this thread serves
 *
 * \retval 0	always, once ws_stopping has been observed
 */
static int
cfs_wi_scheduler (void *arg)
{
	struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg;

	/* NOTE(review): presumably blocks all signals for this thread --
	 * helper is defined outside this file, confirm */
	cfs_block_allsigs();

	/* CPT affinity scheduler? */
	if (sched->ws_cptab != NULL)
		cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt);

	spin_lock(&cfs_wi_data.wi_glock);

	/* register this thread; clearing ws_starting lets
	 * cfs_wi_sched_create() go on to start the next one */
	LASSERT(sched->ws_starting == 1);
	sched->ws_starting--;
	sched->ws_nthreads++;

	spin_unlock(&cfs_wi_data.wi_glock);

	cfs_wi_sched_lock(sched);

	/* the scheduler lock is held at the top of every iteration */
	while (!sched->ws_stopping) {
		int nloops = 0;
		int rc;
		cfs_workitem_t *wi;

		/* run at most CFS_WI_RESCHED workitems back to back */
		while (!list_empty(&sched->ws_runq) &&
		       nloops < CFS_WI_RESCHED) {
			wi = list_entry(sched->ws_runq.next,
					cfs_workitem_t, wi_list);
			LASSERT(wi->wi_scheduled && !wi->wi_running);

			list_del_init(&wi->wi_list);

			LASSERT(sched->ws_nscheduled > 0);
			sched->ws_nscheduled--;

			/* from here on a cfs_wi_schedule() of this item goes
			 * to ws_rerunq instead of ws_runq */
			wi->wi_running = 1;
			wi->wi_scheduled = 0;

			/* the action runs without the scheduler lock */
			cfs_wi_sched_unlock(sched);
			nloops++;

			rc = (*wi->wi_action) (wi);

			cfs_wi_sched_lock(sched);
			/* non-zero return: wi must not be touched any more */
			if (rc != 0) /* WI should be dead, even be freed! */
				continue;

			wi->wi_running = 0;
			/* not rescheduled while it was running: done */
			if (list_empty(&wi->wi_list))
				continue;

			LASSERT(wi->wi_scheduled);
			/* wi is rescheduled, should be on rerunq now, we
			 * move it to runq so it can run action now */
			list_move_tail(&wi->wi_list, &sched->ws_runq);
		}

		if (!list_empty(&sched->ws_runq)) {
			cfs_wi_sched_unlock(sched);
			/* don't sleep because some workitems still
			 * expect me to come back soon */
			cond_resched();
			cfs_wi_sched_lock(sched);
			continue;
		}

		/* nothing runnable: sleep until cfs_wi_sched_cansleep()
		 * reports 0, i.e. the scheduler is stopping or the run queue
		 * is no longer empty.  rc is assigned but not examined. */
		cfs_wi_sched_unlock(sched);
		rc = wait_event_interruptible_exclusive(sched->ws_waitq,
				!cfs_wi_sched_cansleep(sched));
		cfs_wi_sched_lock(sched);
	}

	cfs_wi_sched_unlock(sched);

	/* deregister; cfs_wi_sched_destroy() and cfs_wi_shutdown() poll
	 * ws_nthreads until it reaches zero */
	spin_lock(&cfs_wi_data.wi_glock);
	sched->ws_nthreads--;
	spin_unlock(&cfs_wi_data.wi_glock);

	return 0;
}
301
302 void
cfs_wi_sched_destroy(struct cfs_wi_sched * sched)303 cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
304 {
305 int i;
306
307 LASSERT(cfs_wi_data.wi_init);
308 LASSERT(!cfs_wi_data.wi_stopping);
309
310 spin_lock(&cfs_wi_data.wi_glock);
311 if (sched->ws_stopping) {
312 CDEBUG(D_INFO, "%s is in progress of stopping\n",
313 sched->ws_name);
314 spin_unlock(&cfs_wi_data.wi_glock);
315 return;
316 }
317
318 LASSERT(!list_empty(&sched->ws_list));
319 sched->ws_stopping = 1;
320
321 spin_unlock(&cfs_wi_data.wi_glock);
322
323 i = 2;
324 wake_up_all(&sched->ws_waitq);
325
326 spin_lock(&cfs_wi_data.wi_glock);
327 while (sched->ws_nthreads > 0) {
328 CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
329 "waiting for %d threads of WI sched[%s] to terminate\n",
330 sched->ws_nthreads, sched->ws_name);
331
332 spin_unlock(&cfs_wi_data.wi_glock);
333 set_current_state(TASK_UNINTERRUPTIBLE);
334 schedule_timeout(cfs_time_seconds(1) / 20);
335 spin_lock(&cfs_wi_data.wi_glock);
336 }
337
338 list_del(&sched->ws_list);
339
340 spin_unlock(&cfs_wi_data.wi_glock);
341 LASSERT(sched->ws_nscheduled == 0);
342
343 LIBCFS_FREE(sched, sizeof(*sched));
344 }
345 EXPORT_SYMBOL(cfs_wi_sched_destroy);
346
347 int
cfs_wi_sched_create(char * name,struct cfs_cpt_table * cptab,int cpt,int nthrs,struct cfs_wi_sched ** sched_pp)348 cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
349 int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
350 {
351 struct cfs_wi_sched *sched;
352 int rc;
353
354 LASSERT(cfs_wi_data.wi_init);
355 LASSERT(!cfs_wi_data.wi_stopping);
356 LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
357 (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
358
359 LIBCFS_ALLOC(sched, sizeof(*sched));
360 if (sched == NULL)
361 return -ENOMEM;
362
363 strlcpy(sched->ws_name, name, CFS_WS_NAME_LEN);
364
365 sched->ws_cptab = cptab;
366 sched->ws_cpt = cpt;
367
368 spin_lock_init(&sched->ws_lock);
369 init_waitqueue_head(&sched->ws_waitq);
370 INIT_LIST_HEAD(&sched->ws_runq);
371 INIT_LIST_HEAD(&sched->ws_rerunq);
372 INIT_LIST_HEAD(&sched->ws_list);
373
374 rc = 0;
375 while (nthrs > 0) {
376 char name[16];
377 struct task_struct *task;
378
379 spin_lock(&cfs_wi_data.wi_glock);
380 while (sched->ws_starting > 0) {
381 spin_unlock(&cfs_wi_data.wi_glock);
382 schedule();
383 spin_lock(&cfs_wi_data.wi_glock);
384 }
385
386 sched->ws_starting++;
387 spin_unlock(&cfs_wi_data.wi_glock);
388
389 if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
390 snprintf(name, sizeof(name), "%s_%02d_%02u",
391 sched->ws_name, sched->ws_cpt,
392 sched->ws_nthreads);
393 } else {
394 snprintf(name, sizeof(name), "%s_%02u",
395 sched->ws_name, sched->ws_nthreads);
396 }
397
398 task = kthread_run(cfs_wi_scheduler, sched, "%s", name);
399 if (!IS_ERR(task)) {
400 nthrs--;
401 continue;
402 }
403 rc = PTR_ERR(task);
404
405 CERROR("Failed to create thread for WI scheduler %s: %d\n",
406 name, rc);
407
408 spin_lock(&cfs_wi_data.wi_glock);
409
410 /* make up for cfs_wi_sched_destroy */
411 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
412 sched->ws_starting--;
413
414 spin_unlock(&cfs_wi_data.wi_glock);
415
416 cfs_wi_sched_destroy(sched);
417 return rc;
418 }
419 spin_lock(&cfs_wi_data.wi_glock);
420 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
421 spin_unlock(&cfs_wi_data.wi_glock);
422
423 *sched_pp = sched;
424 return 0;
425 }
426 EXPORT_SYMBOL(cfs_wi_sched_create);
427
428 int
cfs_wi_startup(void)429 cfs_wi_startup(void)
430 {
431 memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));
432
433 spin_lock_init(&cfs_wi_data.wi_glock);
434 INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
435 cfs_wi_data.wi_init = 1;
436
437 return 0;
438 }
439
440 void
cfs_wi_shutdown(void)441 cfs_wi_shutdown(void)
442 {
443 struct cfs_wi_sched *sched;
444
445 spin_lock(&cfs_wi_data.wi_glock);
446 cfs_wi_data.wi_stopping = 1;
447 spin_unlock(&cfs_wi_data.wi_glock);
448
449 /* nobody should contend on this list */
450 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
451 sched->ws_stopping = 1;
452 wake_up_all(&sched->ws_waitq);
453 }
454
455 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
456 spin_lock(&cfs_wi_data.wi_glock);
457
458 while (sched->ws_nthreads != 0) {
459 spin_unlock(&cfs_wi_data.wi_glock);
460 set_current_state(TASK_UNINTERRUPTIBLE);
461 schedule_timeout(cfs_time_seconds(1) / 20);
462 spin_lock(&cfs_wi_data.wi_glock);
463 }
464 spin_unlock(&cfs_wi_data.wi_glock);
465 }
466 while (!list_empty(&cfs_wi_data.wi_scheds)) {
467 sched = list_entry(cfs_wi_data.wi_scheds.next,
468 struct cfs_wi_sched, ws_list);
469 list_del(&sched->ws_list);
470 LIBCFS_FREE(sched, sizeof(*sched));
471 }
472
473 cfs_wi_data.wi_stopping = 0;
474 cfs_wi_data.wi_init = 0;
475 }
476