/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * libcfs/libcfs/workitem.c
 *
 * Author: Isaac Huang <isaac@clusterfs.com>
 *         Liang Zhen <zhen.liang@sun.com>
 */

#define DEBUG_SUBSYSTEM S_LNET

#include "../../include/linux/libcfs/libcfs.h"

#define CFS_WS_NAME_LEN		16

struct cfs_wi_sched {
	/* chain on global list */
	struct list_head	ws_list;
	/** serialised workitems */
	spinlock_t		ws_lock;
	/** where schedulers sleep */
	wait_queue_head_t	ws_waitq;
	/** concurrent workitems */
	struct list_head	ws_runq;
	/**
	 * rescheduled running-workitems: a workitem can be rescheduled
	 * while running in wi_action(), but we don't want to execute it
	 * again until it returns from wi_action(), so we put it on
	 * ws_rerunq while rescheduling, and move it back to ws_runq
	 * after wi_action() returns.
	 */
	struct list_head	ws_rerunq;
	/** CPT-table for this scheduler */
	struct cfs_cpt_table	*ws_cptab;
	/** CPT id for affinity */
	int			ws_cpt;
	/** number of scheduled workitems */
	int			ws_nscheduled;
	/** started scheduler thread, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_nthreads:30;
	/** shutting down, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_stopping:1;
	/** serialize starting thread, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_starting:1;
	/** scheduler name */
	char			ws_name[CFS_WS_NAME_LEN];
};

static struct cfs_workitem_data {
	/** serialize */
	spinlock_t		wi_glock;
	/** list of all schedulers */
	struct list_head	wi_scheds;
	/** WI module is initialized */
	int			wi_init;
	/** shutting down the whole WI module */
	int			wi_stopping;
} cfs_wi_data;

static inline int
cfs_wi_sched_cansleep(struct cfs_wi_sched *sched)
{
	spin_lock(&sched->ws_lock);
	if (sched->ws_stopping) {
		spin_unlock(&sched->ws_lock);
		return 0;
	}

	if (!list_empty(&sched->ws_runq)) {
		spin_unlock(&sched->ws_lock);
		return 0;
	}
	spin_unlock(&sched->ws_lock);
	return 1;
}

/* XXX:
 * 0. it only works when called from inside wi->wi_action.
 * 1. once it returns, no one may try to schedule the workitem again.
 */
void
cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
{
	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	spin_lock(&sched->ws_lock);

	LASSERT(wi->wi_running);
	if (wi->wi_scheduled) { /* cancel pending schedules */
		LASSERT(!list_empty(&wi->wi_list));
		list_del_init(&wi->wi_list);

		LASSERT(sched->ws_nscheduled > 0);
		sched->ws_nscheduled--;
	}

	LASSERT(list_empty(&wi->wi_list));

	wi->wi_scheduled = 1; /* LBUG future schedule attempts */
	spin_unlock(&sched->ws_lock);
}
EXPORT_SYMBOL(cfs_wi_exit);
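
/*
 * Illustrative sketch (not part of this file): per the XXX note above,
 * cfs_wi_exit() must be called from inside wi_action(), and the action
 * must then return non-zero so the scheduler never touches the (possibly
 * freed) workitem again.  All example_* names and the container layout
 * are assumptions made up for the example.
 *
 *	static int example_wi_action(struct cfs_workitem *wi)
 *	{
 *		struct example_obj *obj;
 *
 *		obj = container_of(wi, struct example_obj, eo_wi);
 *		if (example_obj_done(obj)) {
 *			cfs_wi_exit(obj->eo_sched, wi);
 *			example_obj_put(obj);	// drop the workitem's ref
 *			return 1;		// workitem is dead now
 *		}
 *		example_obj_step(obj);		// do one unit of work
 *		return 0;			// may be rescheduled later
 *	}
 */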

/**
 * cancel schedule request of workitem \a wi
 */
int
cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
{
	int	rc;

	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	/*
	 * return 0 if the workitem is currently running; otherwise return
	 * 1, which means the workitem will not be scheduled again and
	 * cannot race with wi_action().
	 */
	spin_lock(&sched->ws_lock);

	rc = !(wi->wi_running);

	if (wi->wi_scheduled) { /* cancel pending schedules */
		LASSERT(!list_empty(&wi->wi_list));
		list_del_init(&wi->wi_list);

		LASSERT(sched->ws_nscheduled > 0);
		sched->ws_nscheduled--;

		wi->wi_scheduled = 0;
	}

	LASSERT(list_empty(&wi->wi_list));

	spin_unlock(&sched->ws_lock);
	return rc;
}
EXPORT_SYMBOL(cfs_wi_deschedule);
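
/*
 * Illustrative sketch (assumed caller, not from this file): because
 * cfs_wi_deschedule() returns 0 while wi_action() is still running, a
 * teardown path typically retries until the workitem is quiescent before
 * freeing its container.  struct example_obj and its fields are
 * hypothetical.
 *
 *	static void example_obj_fini(struct example_obj *obj)
 *	{
 *		// retry until the workitem is neither queued nor running;
 *		// only then is it safe to free obj
 *		while (!cfs_wi_deschedule(obj->eo_sched, &obj->eo_wi))
 *			schedule_timeout_uninterruptible(cfs_time_seconds(1) / 20);
 *
 *		LIBCFS_FREE(obj, sizeof(*obj));
 *	}
 */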

/*
 * A workitem scheduled with (serial == 1) is strictly serialised not only
 * with itself, but also with others scheduled this way.
 *
 * Now there's only one static serialised queue, but in the future more
 * might be added, and even dynamic creation of serialised queues might be
 * supported.
 */
void
cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
{
	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	spin_lock(&sched->ws_lock);

	if (!wi->wi_scheduled) {
		LASSERT(list_empty(&wi->wi_list));

		wi->wi_scheduled = 1;
		sched->ws_nscheduled++;
		if (!wi->wi_running) {
			list_add_tail(&wi->wi_list, &sched->ws_runq);
			wake_up(&sched->ws_waitq);
		} else {
			list_add(&wi->wi_list, &sched->ws_rerunq);
		}
	}

	LASSERT(!list_empty(&wi->wi_list));
	spin_unlock(&sched->ws_lock);
}
EXPORT_SYMBOL(cfs_wi_schedule);
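
/*
 * Illustrative sketch: initialising a workitem and queueing it on a
 * scheduler, assuming the cfs_wi_init() inline helper from the workitem
 * header; everything named example_* is a hypothetical caller.  Note the
 * rerun semantics in cfs_wi_schedule() above: scheduling a workitem that
 * is currently running parks it on ws_rerunq, so wi_action() is never
 * re-entered for the same workitem.
 *
 *	struct example_obj {
 *		struct cfs_workitem	eo_wi;
 *		struct cfs_wi_sched	*eo_sched;
 *	};
 *
 *	static void example_obj_start(struct example_obj *obj)
 *	{
 *		cfs_wi_init(&obj->eo_wi, obj, example_wi_action);
 *		cfs_wi_schedule(obj->eo_sched, &obj->eo_wi);
 *	}
 */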
202 
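/*
 * Scheduler thread body: each thread drains ws_runq in batches of up to
 * CFS_WI_RESCHED actions, drops ws_lock around every wi_action() call,
 * yields with cond_resched() between batches, and sleeps on ws_waitq
 * once the queue is empty and the scheduler is not stopping.
 */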
static int cfs_wi_scheduler(void *arg)
{
	struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg;

	cfs_block_allsigs();

	/* CPT affinity scheduler? */
	if (sched->ws_cptab)
		if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0)
			CWARN("Failed to bind %s on CPT %d\n",
			      sched->ws_name, sched->ws_cpt);

	spin_lock(&cfs_wi_data.wi_glock);

	LASSERT(sched->ws_starting == 1);
	sched->ws_starting--;
	sched->ws_nthreads++;

	spin_unlock(&cfs_wi_data.wi_glock);

	spin_lock(&sched->ws_lock);

	while (!sched->ws_stopping) {
		int nloops = 0;
		int rc;
		struct cfs_workitem *wi;

		while (!list_empty(&sched->ws_runq) &&
		       nloops < CFS_WI_RESCHED) {
			wi = list_entry(sched->ws_runq.next,
					struct cfs_workitem, wi_list);
			LASSERT(wi->wi_scheduled && !wi->wi_running);

			list_del_init(&wi->wi_list);

			LASSERT(sched->ws_nscheduled > 0);
			sched->ws_nscheduled--;

			wi->wi_running   = 1;
			wi->wi_scheduled = 0;

			spin_unlock(&sched->ws_lock);
			nloops++;

			rc = (*wi->wi_action)(wi);

			spin_lock(&sched->ws_lock);
			if (rc != 0) /* WI is dead, and may even be freed! */
				continue;

			wi->wi_running = 0;
			if (list_empty(&wi->wi_list))
				continue;

			LASSERT(wi->wi_scheduled);
			/* wi was rescheduled while it ran, so it is on
			 * ws_rerunq now; move it to ws_runq so its action
			 * can run again
			 */
			list_move_tail(&wi->wi_list, &sched->ws_runq);
		}

		if (!list_empty(&sched->ws_runq)) {
			spin_unlock(&sched->ws_lock);
			/* don't sleep because some workitems still
			 * expect me to come back soon
			 */
			cond_resched();
			spin_lock(&sched->ws_lock);
			continue;
		}

		spin_unlock(&sched->ws_lock);
		rc = wait_event_interruptible_exclusive(sched->ws_waitq,
						!cfs_wi_sched_cansleep(sched));
		spin_lock(&sched->ws_lock);
	}

	spin_unlock(&sched->ws_lock);

	spin_lock(&cfs_wi_data.wi_glock);
	sched->ws_nthreads--;
	spin_unlock(&cfs_wi_data.wi_glock);

	return 0;
}

void
cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
{
	int i;

	LASSERT(cfs_wi_data.wi_init);
	LASSERT(!cfs_wi_data.wi_stopping);

	spin_lock(&cfs_wi_data.wi_glock);
	if (sched->ws_stopping) {
		CDEBUG(D_INFO, "%s is already being stopped\n",
		       sched->ws_name);
		spin_unlock(&cfs_wi_data.wi_glock);
		return;
	}

	LASSERT(!list_empty(&sched->ws_list));
	sched->ws_stopping = 1;

	spin_unlock(&cfs_wi_data.wi_glock);

	i = 2;
	wake_up_all(&sched->ws_waitq);

	spin_lock(&cfs_wi_data.wi_glock);
	while (sched->ws_nthreads > 0) {
		CDEBUG(is_power_of_2(++i) ? D_WARNING : D_NET,
		       "waiting for %d threads of WI sched[%s] to terminate\n",
		       sched->ws_nthreads, sched->ws_name);

		spin_unlock(&cfs_wi_data.wi_glock);
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(cfs_time_seconds(1) / 20);
		spin_lock(&cfs_wi_data.wi_glock);
	}

	list_del(&sched->ws_list);

	spin_unlock(&cfs_wi_data.wi_glock);
	LASSERT(sched->ws_nscheduled == 0);

	LIBCFS_FREE(sched, sizeof(*sched));
}
EXPORT_SYMBOL(cfs_wi_sched_destroy);

int
cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
		    int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
{
	struct cfs_wi_sched *sched;
	int rc;

	LASSERT(cfs_wi_data.wi_init);
	LASSERT(!cfs_wi_data.wi_stopping);
	LASSERT(!cptab || cpt == CFS_CPT_ANY ||
		(cpt >= 0 && cpt < cfs_cpt_number(cptab)));

	LIBCFS_ALLOC(sched, sizeof(*sched));
	if (!sched)
		return -ENOMEM;

	if (strlen(name) > sizeof(sched->ws_name) - 1) {
		LIBCFS_FREE(sched, sizeof(*sched));
		return -E2BIG;
	}
	strncpy(sched->ws_name, name, sizeof(sched->ws_name));

	sched->ws_cptab = cptab;
	sched->ws_cpt = cpt;

	spin_lock_init(&sched->ws_lock);
	init_waitqueue_head(&sched->ws_waitq);
	INIT_LIST_HEAD(&sched->ws_runq);
	INIT_LIST_HEAD(&sched->ws_rerunq);
	INIT_LIST_HEAD(&sched->ws_list);

	rc = 0;
	while (nthrs > 0) {
		char name[16];
		struct task_struct *task;

		spin_lock(&cfs_wi_data.wi_glock);
		while (sched->ws_starting > 0) {
			spin_unlock(&cfs_wi_data.wi_glock);
			schedule();
			spin_lock(&cfs_wi_data.wi_glock);
		}

		sched->ws_starting++;
		spin_unlock(&cfs_wi_data.wi_glock);

		if (sched->ws_cptab && sched->ws_cpt >= 0) {
			snprintf(name, sizeof(name), "%s_%02d_%02u",
				 sched->ws_name, sched->ws_cpt,
				 sched->ws_nthreads);
		} else {
			snprintf(name, sizeof(name), "%s_%02u",
				 sched->ws_name, sched->ws_nthreads);
		}

		task = kthread_run(cfs_wi_scheduler, sched, "%s", name);
		if (!IS_ERR(task)) {
			nthrs--;
			continue;
		}
		rc = PTR_ERR(task);

		CERROR("Failed to create thread for WI scheduler %s: %d\n",
		       name, rc);

		spin_lock(&cfs_wi_data.wi_glock);

		/* add to the global list so cfs_wi_sched_destroy() can
		 * find and free the scheduler
		 */
		list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
		sched->ws_starting--;

		spin_unlock(&cfs_wi_data.wi_glock);

		cfs_wi_sched_destroy(sched);
		return rc;
	}
	spin_lock(&cfs_wi_data.wi_glock);
	list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
	spin_unlock(&cfs_wi_data.wi_glock);

	*sched_pp = sched;
	return 0;
}
EXPORT_SYMBOL(cfs_wi_sched_create);
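
/*
 * Illustrative sketch: end-to-end use of the scheduler API by a
 * hypothetical module.  The name "example_wi" and the two-thread choice
 * are arbitrary; passing a NULL cptab with CFS_CPT_ANY asks for a
 * scheduler with no CPT affinity.
 *
 *	static struct cfs_wi_sched *example_sched;
 *
 *	static int example_init(void)
 *	{
 *		int rc;
 *
 *		rc = cfs_wi_sched_create("example_wi", NULL, CFS_CPT_ANY,
 *					 2, &example_sched);
 *		if (rc != 0)
 *			return rc;
 *		// workitems may now be queued with cfs_wi_schedule()
 *		return 0;
 *	}
 *
 *	static void example_fini(void)
 *	{
 *		// all workitems must be descheduled or exited first
 *		cfs_wi_sched_destroy(example_sched);
 *	}
 */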

int
cfs_wi_startup(void)
{
	memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));

	spin_lock_init(&cfs_wi_data.wi_glock);
	INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
	cfs_wi_data.wi_init = 1;

	return 0;
}

void
cfs_wi_shutdown(void)
{
	struct cfs_wi_sched *sched;
	struct cfs_wi_sched *temp;

	spin_lock(&cfs_wi_data.wi_glock);
	cfs_wi_data.wi_stopping = 1;
	spin_unlock(&cfs_wi_data.wi_glock);

	/* nobody should contend on this list */
	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		sched->ws_stopping = 1;
		wake_up_all(&sched->ws_waitq);
	}

	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		spin_lock(&cfs_wi_data.wi_glock);

		while (sched->ws_nthreads != 0) {
			spin_unlock(&cfs_wi_data.wi_glock);
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(cfs_time_seconds(1) / 20);
			spin_lock(&cfs_wi_data.wi_glock);
		}
		spin_unlock(&cfs_wi_data.wi_glock);
	}
	list_for_each_entry_safe(sched, temp, &cfs_wi_data.wi_scheds, ws_list) {
		list_del(&sched->ws_list);
		LIBCFS_FREE(sched, sizeof(*sched));
	}

	cfs_wi_data.wi_stopping = 0;
	cfs_wi_data.wi_init = 0;
}