/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * libcfs/libcfs/workitem.c
 *
 * Author: Isaac Huang <isaac@clusterfs.com>
 *	   Liang Zhen <zhen.liang@sun.com>
 */

#define DEBUG_SUBSYSTEM S_LNET

#include "../../include/linux/libcfs/libcfs.h"

#define CFS_WS_NAME_LEN	16

typedef struct cfs_wi_sched {
	struct list_head	ws_list;	/* chain on global list */
	/** serialised workitems */
	spinlock_t		ws_lock;
	/** where schedulers sleep */
	wait_queue_head_t	ws_waitq;
	/** concurrent workitems */
	struct list_head	ws_runq;
	/** rescheduled running-workitems: a workitem can be rescheduled
	 * while running in wi_action(), but we don't want to execute it
	 * again until it returns from wi_action(), so we put it on
	 * ws_rerunq while rescheduling, and move it back to the runq
	 * after it returns from wi_action() */
	struct list_head	ws_rerunq;
	/** CPT-table for this scheduler */
	struct cfs_cpt_table	*ws_cptab;
	/** CPT id for affinity */
	int			ws_cpt;
	/** number of scheduled workitems */
	int			ws_nscheduled;
	/** started scheduler threads, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_nthreads:30;
	/** shutting down, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_stopping:1;
	/** serialize starting thread, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_starting:1;
	/** scheduler name */
	char			ws_name[CFS_WS_NAME_LEN];
} cfs_wi_sched_t;

static struct cfs_workitem_data {
	/** serialize */
	spinlock_t		wi_glock;
	/** list of all schedulers */
	struct list_head	wi_scheds;
	/** WI module is initialized */
	int			wi_init;
	/** shutting down the whole WI module */
	int			wi_stopping;
} cfs_wi_data;

static inline void
cfs_wi_sched_lock(cfs_wi_sched_t *sched)
{
	spin_lock(&sched->ws_lock);
}

static inline void
cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
{
	spin_unlock(&sched->ws_lock);
}

static inline int
cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
{
	cfs_wi_sched_lock(sched);
	if (sched->ws_stopping) {
		cfs_wi_sched_unlock(sched);
		return 0;
	}

	if (!list_empty(&sched->ws_runq)) {
		cfs_wi_sched_unlock(sched);
		return 0;
	}
	cfs_wi_sched_unlock(sched);
	return 1;
}

/* XXX:
 * 0. this only works when called from within wi->wi_action();
 * 1. once it returns, no one may try to schedule the workitem again.
 */
void
cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	cfs_wi_sched_lock(sched);

	LASSERT(wi->wi_running);
	if (wi->wi_scheduled) { /* cancel pending schedules */
		LASSERT(!list_empty(&wi->wi_list));
		list_del_init(&wi->wi_list);

		LASSERT(sched->ws_nscheduled > 0);
		sched->ws_nscheduled--;
	}

	LASSERT(list_empty(&wi->wi_list));

	/* leave wi_scheduled set so any future schedule attempt LBUGs */
	wi->wi_scheduled = 1;
	cfs_wi_sched_unlock(sched);
}
EXPORT_SYMBOL(cfs_wi_exit);
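
/*
 * Usage sketch (hypothetical, not compiled into this module): cfs_wi_exit()
 * may only be called from inside the workitem's own wi_action() callback.
 * The "example_" names below are assumptions for illustration only.
 */
#if 0	/* illustrative only */
static struct cfs_wi_sched *example_sched;	/* assumed set up elsewhere */

static int
example_final_action(cfs_workitem_t *wi)
{
	/* detach wi from its scheduler: any pending reschedule is
	 * cancelled, and scheduling wi afterwards is a bug (LBUG) */
	cfs_wi_exit(example_sched, wi);
	/* non-zero tells cfs_wi_scheduler() the workitem is dead and
	 * must not be touched again */
	return 1;
}
#endif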

/**
 * cancel schedule request of workitem \a wi
 */
int
cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
	int	rc;

	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	/*
	 * return 0 if it is already running; otherwise return 1, which
	 * means the workitem will not be scheduled again and cannot race
	 * with wi_action.
	 */
	cfs_wi_sched_lock(sched);

	rc = !(wi->wi_running);

	if (wi->wi_scheduled) { /* cancel pending schedules */
		LASSERT(!list_empty(&wi->wi_list));
		list_del_init(&wi->wi_list);

		LASSERT(sched->ws_nscheduled > 0);
		sched->ws_nscheduled--;

		wi->wi_scheduled = 0;
	}

	LASSERT(list_empty(&wi->wi_list));

	cfs_wi_sched_unlock(sched);
	return rc;
}
EXPORT_SYMBOL(cfs_wi_deschedule);
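
/*
 * Usage sketch (hypothetical): the return value of cfs_wi_deschedule()
 * tells the caller whether it now owns the workitem exclusively.
 */
#if 0	/* illustrative only */
static void
example_cancel(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
	if (cfs_wi_deschedule(sched, wi)) {
		/* wi was not running and is no longer queued: it is
		 * safe to tear down its container right here */
	} else {
		/* wi_action() is running or about to run; the caller
		 * must synchronize with it (e.g. the action calls
		 * cfs_wi_exit() and signals completion) */
	}
}
#endif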

/*
 * Workitem scheduled with (serial == 1) is strictly serialised not only with
 * itself, but also with others scheduled this way.
 *
 * Now there's only one static serialised queue, but in the future more might
 * be added, and even dynamic creation of serialised queues might be supported.
 */
void
cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	cfs_wi_sched_lock(sched);

	if (!wi->wi_scheduled) {
		LASSERT(list_empty(&wi->wi_list));

		wi->wi_scheduled = 1;
		sched->ws_nscheduled++;
		if (!wi->wi_running) {
			list_add_tail(&wi->wi_list, &sched->ws_runq);
			wake_up(&sched->ws_waitq);
		} else {
			list_add(&wi->wi_list, &sched->ws_rerunq);
		}
	}

	LASSERT(!list_empty(&wi->wi_list));
	cfs_wi_sched_unlock(sched);
}
EXPORT_SYMBOL(cfs_wi_schedule);
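
/*
 * Usage sketch (hypothetical): a caller embeds a cfs_workitem_t in its own
 * state, initializes it with an action callback, and schedules it.  The
 * cfs_wi_init() helper is assumed from the libcfs workitem header; the
 * "example_" names are assumptions for illustration only.
 */
#if 0	/* illustrative only */
struct example_task {
	cfs_workitem_t	et_wi;		/* embedded workitem */
	int		et_progress;	/* private state for the action */
};

static int
example_action(cfs_workitem_t *wi)
{
	struct example_task *task = container_of(wi, struct example_task,
						 et_wi);

	task->et_progress++;
	/* return 0: the workitem stays alive and may be rescheduled;
	 * non-zero would mean it is dead and must not be touched again */
	return 0;
}

static void
example_submit(struct cfs_wi_sched *sched, struct example_task *task)
{
	cfs_wi_init(&task->et_wi, task, example_action);
	cfs_wi_schedule(sched, &task->et_wi);
}
#endif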

static int
cfs_wi_scheduler(void *arg)
{
	struct cfs_wi_sched *sched = (cfs_wi_sched_t *)arg;

	cfs_block_allsigs();

	/* CPT-affinity scheduler? */
	if (sched->ws_cptab != NULL)
		cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt);

	spin_lock(&cfs_wi_data.wi_glock);

	LASSERT(sched->ws_starting == 1);
	sched->ws_starting--;
	sched->ws_nthreads++;

	spin_unlock(&cfs_wi_data.wi_glock);

	cfs_wi_sched_lock(sched);

	while (!sched->ws_stopping) {
		int		nloops = 0;
		int		rc;
		cfs_workitem_t	*wi;

		while (!list_empty(&sched->ws_runq) &&
		       nloops < CFS_WI_RESCHED) {
			wi = list_entry(sched->ws_runq.next,
					cfs_workitem_t, wi_list);
			LASSERT(wi->wi_scheduled && !wi->wi_running);

			list_del_init(&wi->wi_list);

			LASSERT(sched->ws_nscheduled > 0);
			sched->ws_nscheduled--;

			wi->wi_running   = 1;
			wi->wi_scheduled = 0;

			cfs_wi_sched_unlock(sched);
			nloops++;

			rc = (*wi->wi_action)(wi);

			cfs_wi_sched_lock(sched);
			if (rc != 0) /* WI should be dead, may even be freed! */
				continue;

			wi->wi_running = 0;
			if (list_empty(&wi->wi_list))
				continue;

			LASSERT(wi->wi_scheduled);
			/* wi was rescheduled while running, so it is on the
			 * rerunq now; move it to the runq so its action can
			 * run again */
			list_move_tail(&wi->wi_list, &sched->ws_runq);
		}

		if (!list_empty(&sched->ws_runq)) {
			cfs_wi_sched_unlock(sched);
			/* don't sleep because some workitems still
			 * expect me to come back soon */
			cond_resched();
			cfs_wi_sched_lock(sched);
			continue;
		}

		cfs_wi_sched_unlock(sched);
		rc = wait_event_interruptible_exclusive(sched->ws_waitq,
						!cfs_wi_sched_cansleep(sched));
		cfs_wi_sched_lock(sched);
	}

	cfs_wi_sched_unlock(sched);

	spin_lock(&cfs_wi_data.wi_glock);
	sched->ws_nthreads--;
	spin_unlock(&cfs_wi_data.wi_glock);

	return 0;
}

void
cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
{
	int	i;

	LASSERT(cfs_wi_data.wi_init);
	LASSERT(!cfs_wi_data.wi_stopping);

	spin_lock(&cfs_wi_data.wi_glock);
	if (sched->ws_stopping) {
		CDEBUG(D_INFO, "%s is already stopping\n",
		       sched->ws_name);
		spin_unlock(&cfs_wi_data.wi_glock);
		return;
	}

	LASSERT(!list_empty(&sched->ws_list));
	sched->ws_stopping = 1;

	spin_unlock(&cfs_wi_data.wi_glock);

	i = 2;
	wake_up_all(&sched->ws_waitq);

	spin_lock(&cfs_wi_data.wi_glock);
	while (sched->ws_nthreads > 0) {
		CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
		       "waiting for %d threads of WI sched[%s] to terminate\n",
		       sched->ws_nthreads, sched->ws_name);

		spin_unlock(&cfs_wi_data.wi_glock);
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(cfs_time_seconds(1) / 20);
		spin_lock(&cfs_wi_data.wi_glock);
	}

	list_del(&sched->ws_list);

	spin_unlock(&cfs_wi_data.wi_glock);
	LASSERT(sched->ws_nscheduled == 0);

	LIBCFS_FREE(sched, sizeof(*sched));
}
EXPORT_SYMBOL(cfs_wi_sched_destroy);

int
cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
		    int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
{
	struct cfs_wi_sched	*sched;
	int			rc;

	LASSERT(cfs_wi_data.wi_init);
	LASSERT(!cfs_wi_data.wi_stopping);
	LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
		(cpt >= 0 && cpt < cfs_cpt_number(cptab)));

	LIBCFS_ALLOC(sched, sizeof(*sched));
	if (sched == NULL)
		return -ENOMEM;

	strncpy(sched->ws_name, name, CFS_WS_NAME_LEN);
	sched->ws_name[CFS_WS_NAME_LEN - 1] = '\0';
	sched->ws_cptab = cptab;
	sched->ws_cpt = cpt;

	spin_lock_init(&sched->ws_lock);
	init_waitqueue_head(&sched->ws_waitq);
	INIT_LIST_HEAD(&sched->ws_runq);
	INIT_LIST_HEAD(&sched->ws_rerunq);
	INIT_LIST_HEAD(&sched->ws_list);

	rc = 0;
	while (nthrs > 0) {
		char	name[16];
		struct task_struct *task;

		spin_lock(&cfs_wi_data.wi_glock);
		while (sched->ws_starting > 0) {
			spin_unlock(&cfs_wi_data.wi_glock);
			schedule();
			spin_lock(&cfs_wi_data.wi_glock);
		}

		sched->ws_starting++;
		spin_unlock(&cfs_wi_data.wi_glock);

		if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
			snprintf(name, sizeof(name), "%s_%02d_%02u",
				 sched->ws_name, sched->ws_cpt,
				 sched->ws_nthreads);
		} else {
			snprintf(name, sizeof(name), "%s_%02u",
				 sched->ws_name, sched->ws_nthreads);
		}

		task = kthread_run(cfs_wi_scheduler, sched, "%s", name);
		if (!IS_ERR(task)) {
			nthrs--;
			continue;
		}
		rc = PTR_ERR(task);

		CERROR("Failed to create thread for WI scheduler %s: %d\n",
		       name, rc);

		spin_lock(&cfs_wi_data.wi_glock);

		/* add to the global list so cfs_wi_sched_destroy() below
		 * can find and free the scheduler */
		list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
		sched->ws_starting--;

		spin_unlock(&cfs_wi_data.wi_glock);

		cfs_wi_sched_destroy(sched);
		return rc;
	}
	spin_lock(&cfs_wi_data.wi_glock);
	list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
	spin_unlock(&cfs_wi_data.wi_glock);

	*sched_pp = sched;
	return 0;
}
EXPORT_SYMBOL(cfs_wi_sched_create);
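
/*
 * Usage sketch (hypothetical): create a CPT-unaware scheduler with two
 * service threads at module init and tear it down on exit.  The
 * "example_" names are assumptions for illustration only.
 */
#if 0	/* illustrative only */
static struct cfs_wi_sched *example_sched;

static int
example_module_init(void)
{
	/* NULL cptab + CFS_CPT_ANY means no CPT affinity */
	return cfs_wi_sched_create("example", NULL, CFS_CPT_ANY, 2,
				   &example_sched);
}

static void
example_module_exit(void)
{
	/* blocks until all scheduler threads have terminated */
	cfs_wi_sched_destroy(example_sched);
}
#endif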

int
cfs_wi_startup(void)
{
	memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));

	spin_lock_init(&cfs_wi_data.wi_glock);
	INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
	cfs_wi_data.wi_init = 1;

	return 0;
}

void
cfs_wi_shutdown(void)
{
	struct cfs_wi_sched	*sched;

	spin_lock(&cfs_wi_data.wi_glock);
	cfs_wi_data.wi_stopping = 1;
	spin_unlock(&cfs_wi_data.wi_glock);

	/* nobody should contend on this list */
	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		sched->ws_stopping = 1;
		wake_up_all(&sched->ws_waitq);
	}

	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		spin_lock(&cfs_wi_data.wi_glock);

		while (sched->ws_nthreads != 0) {
			spin_unlock(&cfs_wi_data.wi_glock);
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(cfs_time_seconds(1) / 20);
			spin_lock(&cfs_wi_data.wi_glock);
		}
		spin_unlock(&cfs_wi_data.wi_glock);
	}
	while (!list_empty(&cfs_wi_data.wi_scheds)) {
		sched = list_entry(cfs_wi_data.wi_scheds.next,
				   struct cfs_wi_sched, ws_list);
		list_del(&sched->ws_list);
		LIBCFS_FREE(sched, sizeof(*sched));
	}

	cfs_wi_data.wi_stopping = 0;
	cfs_wi_data.wi_init = 0;
}