/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * libcfs/libcfs/workitem.c
 *
 * Author: Isaac Huang <isaac@clusterfs.com>
 *         Liang Zhen <zhen.liang@sun.com>
 */

#define DEBUG_SUBSYSTEM S_LNET

#include "../../include/linux/libcfs/libcfs.h"

#define CFS_WS_NAME_LEN		16

struct cfs_wi_sched {
	struct list_head	ws_list;	/* chain on global list */
	/** serialised workitems */
	spinlock_t		ws_lock;
	/** where schedulers sleep */
	wait_queue_head_t	ws_waitq;
	/** concurrent workitems */
	struct list_head	ws_runq;
	/** rescheduled running workitems: a workitem can be rescheduled
	 * while it is running in wi_action(), but we don't want to execute
	 * it again until it has returned from wi_action(), so we put it on
	 * ws_rerunq while it is being rescheduled and move it to the runq
	 * once wi_action() returns */
	struct list_head	ws_rerunq;
	/** CPT-table for this scheduler */
	struct cfs_cpt_table	*ws_cptab;
	/** CPT id for affinity */
	int			ws_cpt;
	/** number of scheduled workitems */
	int			ws_nscheduled;
	/** number of started scheduler threads, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_nthreads:30;
	/** shutting down, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_stopping:1;
	/** serialize starting thread, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_starting:1;
	/** scheduler name */
	char			ws_name[CFS_WS_NAME_LEN];
};

static struct cfs_workitem_data {
	/** serialize */
	spinlock_t		wi_glock;
	/** list of all schedulers */
	struct list_head	wi_scheds;
	/** WI module is initialized */
	int			wi_init;
	/** shutting down the whole WI module */
	int			wi_stopping;
} cfs_wi_data;

static inline void
cfs_wi_sched_lock(struct cfs_wi_sched *sched)
{
	spin_lock(&sched->ws_lock);
}

static inline void
cfs_wi_sched_unlock(struct cfs_wi_sched *sched)
{
	spin_unlock(&sched->ws_lock);
}

static inline int
cfs_wi_sched_cansleep(struct cfs_wi_sched *sched)
{
	cfs_wi_sched_lock(sched);
	if (sched->ws_stopping) {
		cfs_wi_sched_unlock(sched);
		return 0;
	}

	if (!list_empty(&sched->ws_runq)) {
		cfs_wi_sched_unlock(sched);
		return 0;
	}
	cfs_wi_sched_unlock(sched);
	return 1;
}

/* XXX:
 * 0. it only works when called from wi->wi_action().
 * 1. when it returns, no one shall try to schedule the workitem.
 */
void
cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	cfs_wi_sched_lock(sched);

	LASSERT(wi->wi_running);
	if (wi->wi_scheduled) { /* cancel pending schedules */
		LASSERT(!list_empty(&wi->wi_list));
		list_del_init(&wi->wi_list);

		LASSERT(sched->ws_nscheduled > 0);
		sched->ws_nscheduled--;
	}

	LASSERT(list_empty(&wi->wi_list));

	wi->wi_scheduled = 1; /* LBUG future schedule attempts */
	cfs_wi_sched_unlock(sched);
}
EXPORT_SYMBOL(cfs_wi_exit);
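
/*
 * An illustrative sketch (not part of the original file) of the calling
 * pattern cfs_wi_exit() expects: it is invoked from inside the workitem's
 * own wi_action() on the final run, and the action then returns non-zero
 * so the scheduler never touches the workitem again.  Stashing the owning
 * scheduler in wi_data is a hypothetical convention of this example, not
 * a libcfs rule.
 */
static int __maybe_unused
example_final_action(cfs_workitem_t *wi)
{
	/* assumes the owner put the scheduler pointer in wi_data */
	struct cfs_wi_sched *sched = wi->wi_data;

	/* ... perform the last piece of work ... */

	cfs_wi_exit(sched, wi);
	return 1;	/* non-zero: scheduler must not touch wi again */
}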

/**
 * cancel schedule request of workitem \a wi
 */
int
cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
	int	rc;

	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	/*
	 * return 0 if it's running already, otherwise return 1, which
	 * means the workitem will not be scheduled and will not have
	 * any race with wi_action.
	 */
	cfs_wi_sched_lock(sched);

	rc = !(wi->wi_running);

	if (wi->wi_scheduled) { /* cancel pending schedules */
		LASSERT(!list_empty(&wi->wi_list));
		list_del_init(&wi->wi_list);

		LASSERT(sched->ws_nscheduled > 0);
		sched->ws_nscheduled--;

		wi->wi_scheduled = 0;
	}

	LASSERT(list_empty(&wi->wi_list));

	cfs_wi_sched_unlock(sched);
	return rc;
}
EXPORT_SYMBOL(cfs_wi_deschedule);
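
/*
 * Illustrative sketch (not part of the original file) of how a caller
 * might act on the return value: only a return of 1 guarantees the
 * workitem is neither queued nor racing with wi_action(), so only then
 * may its embedding object be released immediately.
 */
static void __maybe_unused
example_cancel(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
	if (cfs_wi_deschedule(sched, wi)) {
		/* neither queued nor running: safe to free the object
		 * embedding wi here */
	} else {
		/* wi_action() is still running; it must clean up itself,
		 * e.g. by calling cfs_wi_exit() on its final run */
	}
}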

/*
 * Workitem scheduled with (serial == 1) is strictly serialised not only with
 * itself, but also with others scheduled this way.
 *
 * Now there's only one static serialised queue, but in the future more might
 * be added, and even dynamic creation of serialised queues might be supported.
 */
void
cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	cfs_wi_sched_lock(sched);

	if (!wi->wi_scheduled) {
		LASSERT(list_empty(&wi->wi_list));

		wi->wi_scheduled = 1;
		sched->ws_nscheduled++;
		if (!wi->wi_running) {
			list_add_tail(&wi->wi_list, &sched->ws_runq);
			wake_up(&sched->ws_waitq);
		} else {
			list_add(&wi->wi_list, &sched->ws_rerunq);
		}
	}

	LASSERT(!list_empty(&wi->wi_list));
	cfs_wi_sched_unlock(sched);
}
EXPORT_SYMBOL(cfs_wi_schedule);
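
/*
 * A minimal submission sketch (not part of the original file), assuming
 * the cfs_wi_init() helper and the cfs_workitem_t layout from
 * libcfs_workitem.h; all example_* names are hypothetical.  Scheduling a
 * workitem whose action is currently running parks it on ws_rerunq, as
 * described above.
 */
static int __maybe_unused
example_action(cfs_workitem_t *wi)
{
	/* do one batch of work; returning 0 keeps the workitem alive so
	 * it can be rescheduled later */
	return 0;
}

static void __maybe_unused
example_submit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
{
	cfs_wi_init(wi, sched, example_action);	/* stash sched in wi_data */
	cfs_wi_schedule(sched, wi);	/* queue wi and wake a scheduler */
}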

static int
cfs_wi_scheduler(void *arg)
{
	struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg;

	cfs_block_allsigs();

	/* CPT affinity scheduler? */
	if (sched->ws_cptab != NULL)
		cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt);

	spin_lock(&cfs_wi_data.wi_glock);

	LASSERT(sched->ws_starting == 1);
	sched->ws_starting--;
	sched->ws_nthreads++;

	spin_unlock(&cfs_wi_data.wi_glock);

	cfs_wi_sched_lock(sched);

	while (!sched->ws_stopping) {
		int		nloops = 0;
		int		rc;
		cfs_workitem_t *wi;

		while (!list_empty(&sched->ws_runq) &&
		       nloops < CFS_WI_RESCHED) {
			wi = list_entry(sched->ws_runq.next,
					cfs_workitem_t, wi_list);
			LASSERT(wi->wi_scheduled && !wi->wi_running);

			list_del_init(&wi->wi_list);

			LASSERT(sched->ws_nscheduled > 0);
			sched->ws_nscheduled--;

			wi->wi_running   = 1;
			wi->wi_scheduled = 0;

			cfs_wi_sched_unlock(sched);
			nloops++;

			rc = (*wi->wi_action)(wi);

			cfs_wi_sched_lock(sched);
			if (rc != 0) /* WI should be dead, even be freed! */
				continue;

			wi->wi_running = 0;
			if (list_empty(&wi->wi_list))
				continue;

			LASSERT(wi->wi_scheduled);
			/* wi was rescheduled while it was running, so it is
			 * on the rerunq now; move it to the runq so its
			 * action can run again */
			list_move_tail(&wi->wi_list, &sched->ws_runq);
		}

		if (!list_empty(&sched->ws_runq)) {
			cfs_wi_sched_unlock(sched);
			/* don't sleep because some workitems still
			 * expect me to come back soon */
			cond_resched();
			cfs_wi_sched_lock(sched);
			continue;
		}

		cfs_wi_sched_unlock(sched);
		rc = wait_event_interruptible_exclusive(sched->ws_waitq,
						!cfs_wi_sched_cansleep(sched));
		cfs_wi_sched_lock(sched);
	}

	cfs_wi_sched_unlock(sched);

	spin_lock(&cfs_wi_data.wi_glock);
	sched->ws_nthreads--;
	spin_unlock(&cfs_wi_data.wi_glock);

	return 0;
}

void
cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
{
	int	i;

	LASSERT(cfs_wi_data.wi_init);
	LASSERT(!cfs_wi_data.wi_stopping);

	spin_lock(&cfs_wi_data.wi_glock);
	if (sched->ws_stopping) {
		CDEBUG(D_INFO, "%s is in the process of stopping\n",
		       sched->ws_name);
		spin_unlock(&cfs_wi_data.wi_glock);
		return;
	}

	LASSERT(!list_empty(&sched->ws_list));
	sched->ws_stopping = 1;

	spin_unlock(&cfs_wi_data.wi_glock);

	i = 2;
	wake_up_all(&sched->ws_waitq);

	spin_lock(&cfs_wi_data.wi_glock);
	while (sched->ws_nthreads > 0) {
		CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
		       "waiting for %d threads of WI sched[%s] to terminate\n",
		       sched->ws_nthreads, sched->ws_name);

		spin_unlock(&cfs_wi_data.wi_glock);
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(cfs_time_seconds(1) / 20);
		spin_lock(&cfs_wi_data.wi_glock);
	}

	list_del(&sched->ws_list);

	spin_unlock(&cfs_wi_data.wi_glock);
	LASSERT(sched->ws_nscheduled == 0);

	LIBCFS_FREE(sched, sizeof(*sched));
}
EXPORT_SYMBOL(cfs_wi_sched_destroy);

int
cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
		    int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
{
	struct cfs_wi_sched	*sched;
	int			rc;

	LASSERT(cfs_wi_data.wi_init);
	LASSERT(!cfs_wi_data.wi_stopping);
	LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
		(cpt >= 0 && cpt < cfs_cpt_number(cptab)));

	LIBCFS_ALLOC(sched, sizeof(*sched));
	if (sched == NULL)
		return -ENOMEM;

	strlcpy(sched->ws_name, name, CFS_WS_NAME_LEN);

	sched->ws_cptab = cptab;
	sched->ws_cpt = cpt;

	spin_lock_init(&sched->ws_lock);
	init_waitqueue_head(&sched->ws_waitq);
	INIT_LIST_HEAD(&sched->ws_runq);
	INIT_LIST_HEAD(&sched->ws_rerunq);
	INIT_LIST_HEAD(&sched->ws_list);

	rc = 0;
	while (nthrs > 0) {
		char	name[16];
		struct task_struct *task;

		spin_lock(&cfs_wi_data.wi_glock);
		while (sched->ws_starting > 0) {
			spin_unlock(&cfs_wi_data.wi_glock);
			schedule();
			spin_lock(&cfs_wi_data.wi_glock);
		}

		sched->ws_starting++;
		spin_unlock(&cfs_wi_data.wi_glock);

		if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
			snprintf(name, sizeof(name), "%s_%02d_%02u",
				 sched->ws_name, sched->ws_cpt,
				 sched->ws_nthreads);
		} else {
			snprintf(name, sizeof(name), "%s_%02u",
				 sched->ws_name, sched->ws_nthreads);
		}

		task = kthread_run(cfs_wi_scheduler, sched, "%s", name);
		if (!IS_ERR(task)) {
			nthrs--;
			continue;
		}
		rc = PTR_ERR(task);

		CERROR("Failed to create thread for WI scheduler %s: %d\n",
		       name, rc);

		spin_lock(&cfs_wi_data.wi_glock);

		/* add to the global list so that cfs_wi_sched_destroy()
		 * below can find and free this scheduler */
		list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
		sched->ws_starting--;

		spin_unlock(&cfs_wi_data.wi_glock);

		cfs_wi_sched_destroy(sched);
		return rc;
	}
	spin_lock(&cfs_wi_data.wi_glock);
	list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
	spin_unlock(&cfs_wi_data.wi_glock);

	*sched_pp = sched;
	return 0;
}
EXPORT_SYMBOL(cfs_wi_sched_create);
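
/*
 * A minimal setup/teardown sketch (not part of the original file): create
 * a scheduler with one thread and no CPT affinity, use it, then destroy
 * it.  The "example" name and thread count are arbitrary example values.
 */
static int __maybe_unused
example_sched_setup(void)
{
	struct cfs_wi_sched *sched;
	int rc;

	rc = cfs_wi_sched_create("example", NULL, CFS_CPT_ANY, 1, &sched);
	if (rc != 0)
		return rc;

	/* ... cfs_wi_schedule() workitems on sched ... */

	cfs_wi_sched_destroy(sched);	/* blocks until all threads exit */
	return 0;
}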

int
cfs_wi_startup(void)
{
	memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));

	spin_lock_init(&cfs_wi_data.wi_glock);
	INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
	cfs_wi_data.wi_init = 1;

	return 0;
}

void
cfs_wi_shutdown(void)
{
	struct cfs_wi_sched *sched;

	spin_lock(&cfs_wi_data.wi_glock);
	cfs_wi_data.wi_stopping = 1;
	spin_unlock(&cfs_wi_data.wi_glock);

	/* nobody should contend on this list */
	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		sched->ws_stopping = 1;
		wake_up_all(&sched->ws_waitq);
	}

	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		spin_lock(&cfs_wi_data.wi_glock);

		while (sched->ws_nthreads != 0) {
			spin_unlock(&cfs_wi_data.wi_glock);
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(cfs_time_seconds(1) / 20);
			spin_lock(&cfs_wi_data.wi_glock);
		}
		spin_unlock(&cfs_wi_data.wi_glock);
	}
	while (!list_empty(&cfs_wi_data.wi_scheds)) {
		sched = list_entry(cfs_wi_data.wi_scheds.next,
				   struct cfs_wi_sched, ws_list);
		list_del(&sched->ws_list);
		LIBCFS_FREE(sched, sizeof(*sched));
	}

	cfs_wi_data.wi_stopping = 0;
	cfs_wi_data.wi_init = 0;
}