1 /*
2  *  Generic process-grouping system.
3  *
4  *  Based originally on the cpuset system, extracted by Paul Menage
5  *  Copyright (C) 2006 Google, Inc
6  *
7  *  Notifications support
8  *  Copyright (C) 2009 Nokia Corporation
9  *  Author: Kirill A. Shutemov
10  *
11  *  Copyright notices from the original cpuset code:
12  *  --------------------------------------------------
13  *  Copyright (C) 2003 BULL SA.
14  *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
15  *
16  *  Portions derived from Patrick Mochel's sysfs code.
17  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
18  *
19  *  2003-10-10 Written by Simon Derr.
20  *  2003-10-22 Updates by Stephen Hemminger.
21  *  2004 May-July Rework by Paul Jackson.
22  *  ---------------------------------------------------
23  *
24  *  This file is subject to the terms and conditions of the GNU General Public
25  *  License.  See the file COPYING in the main directory of the Linux
26  *  distribution for more details.
27  */
28 
29 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30 
31 #include <linux/cgroup.h>
32 #include <linux/cred.h>
33 #include <linux/ctype.h>
34 #include <linux/errno.h>
35 #include <linux/init_task.h>
36 #include <linux/kernel.h>
37 #include <linux/list.h>
38 #include <linux/magic.h>
39 #include <linux/mm.h>
40 #include <linux/mutex.h>
41 #include <linux/mount.h>
42 #include <linux/pagemap.h>
43 #include <linux/proc_fs.h>
44 #include <linux/rcupdate.h>
45 #include <linux/sched.h>
46 #include <linux/slab.h>
47 #include <linux/spinlock.h>
48 #include <linux/percpu-rwsem.h>
49 #include <linux/string.h>
50 #include <linux/sort.h>
51 #include <linux/kmod.h>
52 #include <linux/delayacct.h>
53 #include <linux/cgroupstats.h>
54 #include <linux/hashtable.h>
55 #include <linux/pid_namespace.h>
56 #include <linux/idr.h>
57 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58 #include <linux/kthread.h>
59 #include <linux/delay.h>
60 #include <linux/atomic.h>
61 #include <linux/cpuset.h>
62 #include <linux/proc_ns.h>
63 #include <linux/nsproxy.h>
64 #include <linux/file.h>
65 #include <net/sock.h>
66 
67 #define CREATE_TRACE_POINTS
68 #include <trace/events/cgroup.h>
69 
70 /*
71  * pidlists linger the following amount before being destroyed.  The goal
72  * is avoiding frequent destruction in the middle of consecutive read calls.
73  * Expiring in the middle is a performance problem, not a correctness one.
74  * 1 sec should be enough.
75  */
76 #define CGROUP_PIDLIST_DESTROY_DELAY	HZ
77 
78 #define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
79 					 MAX_CFTYPE_NAME + 2)
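/*
 * Illustrative note (added for clarity, not from the original source): the
 * "+ 2" leaves room for the '.' separator and the trailing NUL produced by
 * cgroup_file_name() below, e.g. "memory" + "." + "limit_in_bytes" + '\0'.
 */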
80 
81 /*
82  * cgroup_mutex is the master lock.  Any modification to cgroup or its
83  * hierarchy must be performed while holding it.
84  *
85  * css_set_lock protects task->cgroups pointer, the list of css_set
86  * objects, and the chain of tasks off each css_set.
87  *
88  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
89  * cgroup.h can use them for lockdep annotations.
90  */
91 #ifdef CONFIG_PROVE_RCU
92 DEFINE_MUTEX(cgroup_mutex);
93 DEFINE_SPINLOCK(css_set_lock);
94 EXPORT_SYMBOL_GPL(cgroup_mutex);
95 EXPORT_SYMBOL_GPL(css_set_lock);
96 #else
97 static DEFINE_MUTEX(cgroup_mutex);
98 static DEFINE_SPINLOCK(css_set_lock);
99 #endif
100 
101 /*
102  * Protects cgroup_idr and css_idr so that IDs can be released without
103  * grabbing cgroup_mutex.
104  */
105 static DEFINE_SPINLOCK(cgroup_idr_lock);
106 
107 /*
108  * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
109  * against file removal/re-creation across css hiding.
110  */
111 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
112 
113 /*
114  * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
115  * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
116  */
117 static DEFINE_SPINLOCK(release_agent_path_lock);
118 
119 struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
120 
121 #define cgroup_assert_mutex_or_rcu_locked()				\
122 	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
123 			   !lockdep_is_held(&cgroup_mutex),		\
124 			   "cgroup_mutex or RCU read lock required");
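/*
 * Illustrative usage (a sketch, not taken from this file): a helper that
 * walks cgroup state under either cgroup_mutex or an RCU read-side critical
 * section asserts its locking requirement up front, e.g.:
 *
 *	struct cgroup_subsys_state *css;
 *
 *	cgroup_assert_mutex_or_rcu_locked();
 *	css = rcu_dereference_check(cgrp->subsys[ssid],
 *				    lockdep_is_held(&cgroup_mutex));
 */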
125 
126 /*
127  * cgroup destruction makes heavy use of work items and there can be a lot
128  * of concurrent destructions.  Use a separate workqueue so that cgroup
129  * destruction work items don't end up filling up max_active of system_wq
130  * which may lead to deadlock.
131  */
132 static struct workqueue_struct *cgroup_destroy_wq;
133 
134 /*
135  * pidlist destructions need to be flushed on cgroup destruction.  Use a
136  * separate workqueue as flush domain.
137  */
138 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
139 
140 /* generate an array of cgroup subsystem pointers */
141 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
142 static struct cgroup_subsys *cgroup_subsys[] = {
143 #include <linux/cgroup_subsys.h>
144 };
145 #undef SUBSYS
146 
147 /* array of cgroup subsystem names */
148 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
149 static const char *cgroup_subsys_name[] = {
150 #include <linux/cgroup_subsys.h>
151 };
152 #undef SUBSYS
153 
154 /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
155 #define SUBSYS(_x)								\
156 	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);			\
157 	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);			\
158 	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);			\
159 	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
160 #include <linux/cgroup_subsys.h>
161 #undef SUBSYS
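/*
 * For illustration (assuming a built-in controller named "cpu"), the
 * SUBSYS(cpu) line pulled in above expands to:
 *
 *	DEFINE_STATIC_KEY_TRUE(cpu_cgrp_subsys_enabled_key);
 *	DEFINE_STATIC_KEY_TRUE(cpu_cgrp_subsys_on_dfl_key);
 *	EXPORT_SYMBOL_GPL(cpu_cgrp_subsys_enabled_key);
 *	EXPORT_SYMBOL_GPL(cpu_cgrp_subsys_on_dfl_key);
 */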
162 
163 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
164 static struct static_key_true *cgroup_subsys_enabled_key[] = {
165 #include <linux/cgroup_subsys.h>
166 };
167 #undef SUBSYS
168 
169 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
170 static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
171 #include <linux/cgroup_subsys.h>
172 };
173 #undef SUBSYS
174 
175 /*
176  * The default hierarchy, reserved for the subsystems that are otherwise
177  * unattached - it never has more than a single cgroup, and all tasks are
178  * part of that cgroup.
179  */
180 struct cgroup_root cgrp_dfl_root;
181 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
182 
183 /*
184  * The default hierarchy always exists but is hidden until mounted for the
185  * first time.  This is for backward compatibility.
186  */
187 static bool cgrp_dfl_visible;
188 
189 /* Controllers blocked by the commandline in v1 */
190 static u16 cgroup_no_v1_mask;
191 
192 /* some controllers are not supported in the default hierarchy */
193 static u16 cgrp_dfl_inhibit_ss_mask;
194 
195 /* some controllers are implicitly enabled on the default hierarchy */
196 static unsigned long cgrp_dfl_implicit_ss_mask;
197 
198 /* The list of hierarchy roots */
199 
200 static LIST_HEAD(cgroup_roots);
201 static int cgroup_root_count;
202 
203 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
204 static DEFINE_IDR(cgroup_hierarchy_idr);
205 
206 /*
207  * Assign a monotonically increasing serial number to csses.  It guarantees
208  * cgroups with bigger numbers are newer than those with smaller numbers.
209  * Also, as csses are always appended to the parent's ->children list, it
210  * guarantees that sibling csses are always sorted in the ascending serial
211  * number order on the list.  Protected by cgroup_mutex.
212  */
213 static u64 css_serial_nr_next = 1;
214 
215 /*
216  * These bitmask flags indicate whether tasks in the fork and exit paths have
217  * fork/exit handlers to call. This avoids us having to do extra work in the
218  * fork/exit path to check which subsystems have fork/exit callbacks.
219  */
220 static u16 have_fork_callback __read_mostly;
221 static u16 have_exit_callback __read_mostly;
222 static u16 have_free_callback __read_mostly;
223 
224 /* cgroup namespace for init task */
225 struct cgroup_namespace init_cgroup_ns = {
226 	.count		= { .counter = 2, },
227 	.user_ns	= &init_user_ns,
228 	.ns.ops		= &cgroupns_operations,
229 	.ns.inum	= PROC_CGROUP_INIT_INO,
230 	.root_cset	= &init_css_set,
231 };
232 
233 /* Ditto for the can_fork callback. */
234 static u16 have_canfork_callback __read_mostly;
235 
236 static struct file_system_type cgroup2_fs_type;
237 static struct cftype cgroup_dfl_base_files[];
238 static struct cftype cgroup_legacy_base_files[];
239 
240 static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
241 static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
242 static int cgroup_apply_control(struct cgroup *cgrp);
243 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
244 static void css_task_iter_advance(struct css_task_iter *it);
245 static int cgroup_destroy_locked(struct cgroup *cgrp);
246 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
247 					      struct cgroup_subsys *ss);
248 static void css_release(struct percpu_ref *ref);
249 static void kill_css(struct cgroup_subsys_state *css);
250 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
251 			      struct cgroup *cgrp, struct cftype cfts[],
252 			      bool is_add);
253 
254 /**
255  * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
256  * @ssid: subsys ID of interest
257  *
258  * cgroup_subsys_enabled() can only be used with literal subsys names, which
259  * is fine for individual subsystems but unsuitable for cgroup core.  This
260  * is a slower static_key_enabled() based test indexed by @ssid.
261  */
262 static bool cgroup_ssid_enabled(int ssid)
263 {
264 	if (CGROUP_SUBSYS_COUNT == 0)
265 		return false;
266 
267 	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
268 }
269 
270 static bool cgroup_ssid_no_v1(int ssid)
271 {
272 	return cgroup_no_v1_mask & (1 << ssid);
273 }
274 
275 /**
276  * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
277  * @cgrp: the cgroup of interest
278  *
279  * The default hierarchy is the v2 interface of cgroup and this function
280  * can be used to test whether a cgroup is on the default hierarchy for
281  * cases where a subsystem should behave differently depending on the
282  * interface version.
283  *
284  * The set of behaviors which change on the default hierarchy are still
285  * being determined and the mount option is prefixed with __DEVEL__.
286  *
287  * List of changed behaviors:
288  *
289  * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
290  *   and "name" are disallowed.
291  *
292  * - When mounting an existing superblock, mount options should match.
293  *
294  * - Remount is disallowed.
295  *
296  * - rename(2) is disallowed.
297  *
298  * - "tasks" is removed.  Everything should be at process granularity.  Use
299  *   "cgroup.procs" instead.
300  *
301  * - "cgroup.procs" is not sorted.  pids will be unique unless they got
302  *   recycled in between reads.
303  *
304  * - "release_agent" and "notify_on_release" are removed.  Replacement
305  *   notification mechanism will be implemented.
306  *
307  * - "cgroup.clone_children" is removed.
308  *
309  * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
310  *   and its descendants contain no task; otherwise, 1.  The file also
311  *   generates kernfs notification which can be monitored through poll and
312  *   [di]notify when the value of the file changes.
313  *
314  * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
315  *   take masks of ancestors with non-empty cpus/mems, instead of being
316  *   moved to an ancestor.
317  *
318  * - cpuset: a task can be moved into an empty cpuset, and again it takes
319  *   masks of ancestors.
320  *
321  * - memcg: use_hierarchy is on by default and the cgroup file for the flag
322  *   is not created.
323  *
324  * - blkcg: blk-throttle becomes properly hierarchical.
325  *
326  * - debug: disallowed on the default hierarchy.
327  */
328 static bool cgroup_on_dfl(const struct cgroup *cgrp)
329 {
330 	return cgrp->root == &cgrp_dfl_root;
331 }
332 
333 /* IDR wrappers which synchronize using cgroup_idr_lock */
334 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
335 			    gfp_t gfp_mask)
336 {
337 	int ret;
338 
339 	idr_preload(gfp_mask);
340 	spin_lock_bh(&cgroup_idr_lock);
341 	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
342 	spin_unlock_bh(&cgroup_idr_lock);
343 	idr_preload_end();
344 	return ret;
345 }
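/*
 * Note on the gfp mask in cgroup_idr_alloc() above (added for clarity):
 * direct reclaim is masked off because idr_alloc() runs under the
 * BH-disabled cgroup_idr_lock; idr_preload() has already performed any
 * allocation that might need to sleep.
 */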
346 
347 static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
348 {
349 	void *ret;
350 
351 	spin_lock_bh(&cgroup_idr_lock);
352 	ret = idr_replace(idr, ptr, id);
353 	spin_unlock_bh(&cgroup_idr_lock);
354 	return ret;
355 }
356 
357 static void cgroup_idr_remove(struct idr *idr, int id)
358 {
359 	spin_lock_bh(&cgroup_idr_lock);
360 	idr_remove(idr, id);
361 	spin_unlock_bh(&cgroup_idr_lock);
362 }
363 
364 static struct cgroup *cgroup_parent(struct cgroup *cgrp)
365 {
366 	struct cgroup_subsys_state *parent_css = cgrp->self.parent;
367 
368 	if (parent_css)
369 		return container_of(parent_css, struct cgroup, self);
370 	return NULL;
371 }
372 
373 /* subsystems visibly enabled on a cgroup */
374 static u16 cgroup_control(struct cgroup *cgrp)
375 {
376 	struct cgroup *parent = cgroup_parent(cgrp);
377 	u16 root_ss_mask = cgrp->root->subsys_mask;
378 
379 	if (parent)
380 		return parent->subtree_control;
381 
382 	if (cgroup_on_dfl(cgrp))
383 		root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
384 				  cgrp_dfl_implicit_ss_mask);
385 	return root_ss_mask;
386 }
387 
388 /* subsystems enabled on a cgroup */
389 static u16 cgroup_ss_mask(struct cgroup *cgrp)
390 {
391 	struct cgroup *parent = cgroup_parent(cgrp);
392 
393 	if (parent)
394 		return parent->subtree_ss_mask;
395 
396 	return cgrp->root->subsys_mask;
397 }
398 
399 /**
400  * cgroup_css - obtain a cgroup's css for the specified subsystem
401  * @cgrp: the cgroup of interest
402  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
403  *
404  * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
405  * function must be called either under cgroup_mutex or rcu_read_lock() and
406  * the caller is responsible for pinning the returned css if it wants to
407  * keep accessing it outside the said locks.  This function may return
408  * %NULL if @cgrp doesn't have @subsys_id enabled.
409  */
410 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
411 					      struct cgroup_subsys *ss)
412 {
413 	if (ss)
414 		return rcu_dereference_check(cgrp->subsys[ss->id],
415 					lockdep_is_held(&cgroup_mutex));
416 	else
417 		return &cgrp->self;
418 }
419 
420 /**
421  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
422  * @cgrp: the cgroup of interest
423  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
424  *
425  * Similar to cgroup_css() but returns the effective css, which is defined
426  * as the matching css of the nearest ancestor including self which has @ss
427  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
428  * function is guaranteed to return non-NULL css.
429  */
430 static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
431 						struct cgroup_subsys *ss)
432 {
433 	lockdep_assert_held(&cgroup_mutex);
434 
435 	if (!ss)
436 		return &cgrp->self;
437 
438 	/*
439 	 * This function is used while updating css associations and thus
440 	 * can't test the csses directly.  Test ss_mask.
441 	 */
442 	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
443 		cgrp = cgroup_parent(cgrp);
444 		if (!cgrp)
445 			return NULL;
446 	}
447 
448 	return cgroup_css(cgrp, ss);
449 }
450 
451 /**
452  * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
453  * @cgrp: the cgroup of interest
454  * @ss: the subsystem of interest
455  *
456  * Find and get the effective css of @cgrp for @ss.  The effective css is
457  * defined as the matching css of the nearest ancestor including self which
458  * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
459  * the root css is returned, so this function always returns a valid css.
460  * The returned css must be put using css_put().
461  */
462 struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
463 					     struct cgroup_subsys *ss)
464 {
465 	struct cgroup_subsys_state *css;
466 
467 	rcu_read_lock();
468 
469 	do {
470 		css = cgroup_css(cgrp, ss);
471 
472 		if (css && css_tryget_online(css))
473 			goto out_unlock;
474 		cgrp = cgroup_parent(cgrp);
475 	} while (cgrp);
476 
477 	css = init_css_set.subsys[ss->id];
478 	css_get(css);
479 out_unlock:
480 	rcu_read_unlock();
481 	return css;
482 }
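/*
 * Illustrative get/put pairing (a sketch, not part of the original file):
 * a caller that needs the effective css beyond the current locking scope
 * pins it here and drops the reference when done, e.g.:
 *
 *	struct cgroup_subsys_state *css;
 *
 *	css = cgroup_get_e_css(cgrp, ss);
 *	... use css ...
 *	css_put(css);
 */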
483 
484 /* convenient tests for these bits */
485 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
486 {
487 	return !(cgrp->self.flags & CSS_ONLINE);
488 }
489 
490 static void cgroup_get(struct cgroup *cgrp)
491 {
492 	WARN_ON_ONCE(cgroup_is_dead(cgrp));
493 	css_get(&cgrp->self);
494 }
495 
496 static bool cgroup_tryget(struct cgroup *cgrp)
497 {
498 	return css_tryget(&cgrp->self);
499 }
500 
501 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
502 {
503 	struct cgroup *cgrp = of->kn->parent->priv;
504 	struct cftype *cft = of_cft(of);
505 
506 	/*
507 	 * This is open and unprotected implementation of cgroup_css().
508 	 * seq_css() is only called from a kernfs file operation which has
509 	 * an active reference on the file.  Because all the subsystem
510 	 * files are drained before a css is disassociated with a cgroup,
511 	 * the matching css from the cgroup's subsys table is guaranteed to
512 	 * be and stay valid until the enclosing operation is complete.
513 	 */
514 	if (cft->ss)
515 		return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
516 	else
517 		return &cgrp->self;
518 }
519 EXPORT_SYMBOL_GPL(of_css);
520 
521 static int notify_on_release(const struct cgroup *cgrp)
522 {
523 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
524 }
525 
526 /**
527  * for_each_css - iterate all css's of a cgroup
528  * @css: the iteration cursor
529  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
530  * @cgrp: the target cgroup to iterate css's of
531  *
532  * Should be called under cgroup_[tree_]mutex.
533  */
534 #define for_each_css(css, ssid, cgrp)					\
535 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
536 		if (!((css) = rcu_dereference_check(			\
537 				(cgrp)->subsys[(ssid)],			\
538 				lockdep_is_held(&cgroup_mutex)))) { }	\
539 		else
540 
541 /**
542  * for_each_e_css - iterate all effective css's of a cgroup
543  * @css: the iteration cursor
544  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
545  * @cgrp: the target cgroup to iterate css's of
546  *
547  * Should be called under cgroup_[tree_]mutex.
548  */
549 #define for_each_e_css(css, ssid, cgrp)					\
550 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
551 		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
552 			;						\
553 		else
554 
555 /**
556  * for_each_subsys - iterate all enabled cgroup subsystems
557  * @ss: the iteration cursor
558  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
559  */
560 #define for_each_subsys(ss, ssid)					\
561 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
562 	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
563 
564 /**
565  * do_each_subsys_mask - filter for_each_subsys with a bitmask
566  * @ss: the iteration cursor
567  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
568  * @ss_mask: the bitmask
569  *
570  * The block will only run for cases where the ssid-th bit (1 << ssid) of
571  * @ss_mask is set.
572  */
573 #define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
574 	unsigned long __ss_mask = (ss_mask);				\
575 	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */	\
576 		(ssid) = 0;						\
577 		break;							\
578 	}								\
579 	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
580 		(ss) = cgroup_subsys[ssid];				\
581 		{
582 
583 #define while_each_subsys_mask()					\
584 		}							\
585 	}								\
586 } while (false)
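/*
 * Illustrative usage of the pair above (a sketch mirroring how this file
 * uses it further down, e.g. in cgroup_calc_subtree_ss_mask()):
 *
 *	struct cgroup_subsys *ss;
 *	int ssid;
 *
 *	do_each_subsys_mask(ss, ssid, ss_mask) {
 *		pr_info("%s enabled\n", ss->name);
 *	} while_each_subsys_mask();
 */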
587 
588 /* iterate across the hierarchies */
589 #define for_each_root(root)						\
590 	list_for_each_entry((root), &cgroup_roots, root_list)
591 
592 /* iterate over child cgrps, lock should be held throughout iteration */
593 #define cgroup_for_each_live_child(child, cgrp)				\
594 	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
595 		if (({ lockdep_assert_held(&cgroup_mutex);		\
596 		       cgroup_is_dead(child); }))			\
597 			;						\
598 		else
599 
600 /* walk live descendants in preorder */
601 #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
602 	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
603 		if (({ lockdep_assert_held(&cgroup_mutex);		\
604 		       (dsct) = (d_css)->cgroup;			\
605 		       cgroup_is_dead(dsct); }))			\
606 			;						\
607 		else
608 
609 /* walk live descendants in postorder */
610 #define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)		\
611 	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
612 		if (({ lockdep_assert_held(&cgroup_mutex);		\
613 		       (dsct) = (d_css)->cgroup;			\
614 		       cgroup_is_dead(dsct); }))			\
615 			;						\
616 		else
617 
618 static void cgroup_release_agent(struct work_struct *work);
619 static void check_for_release(struct cgroup *cgrp);
620 
621 /*
622  * A cgroup can be associated with multiple css_sets as different tasks may
623  * belong to different cgroups on different hierarchies.  In the other
624  * direction, a css_set is naturally associated with multiple cgroups.
625  * This M:N relationship is represented by the following link structure
626  * which exists for each association and allows traversing the associations
627  * from both sides.
628  */
629 struct cgrp_cset_link {
630 	/* the cgroup and css_set this link associates */
631 	struct cgroup		*cgrp;
632 	struct css_set		*cset;
633 
634 	/* list of cgrp_cset_links anchored at cgrp->cset_links */
635 	struct list_head	cset_link;
636 
637 	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
638 	struct list_head	cgrp_link;
639 };
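/*
 * Illustrative picture of the M:N association (added for clarity, not part
 * of the original comment): with two hierarchies A and B, a task in A:/foo
 * and B:/bar has a single css_set linked to both cgroups, while each of
 * those cgroups may in turn be linked to many css_sets:
 *
 *	cgroup A:/foo <--link--> css_set <--link--> cgroup B:/bar
 */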
640 
641 /*
642  * The default css_set - used by init and its children prior to any
643  * hierarchies being mounted. It contains a pointer to the root state
644  * for each subsystem. Also used to anchor the list of css_sets. Not
645  * reference-counted, to improve performance when child cgroups
646  * haven't been created.
647  */
648 struct css_set init_css_set = {
649 	.refcount		= ATOMIC_INIT(1),
650 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
651 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
652 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
653 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
654 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
655 	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
656 };
657 
658 static int css_set_count	= 1;	/* 1 for init_css_set */
659 
660 /**
661  * css_set_populated - does a css_set contain any tasks?
662  * @cset: target css_set
663  */
664 static bool css_set_populated(struct css_set *cset)
665 {
666 	lockdep_assert_held(&css_set_lock);
667 
668 	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
669 }
670 
671 /**
672  * cgroup_update_populated - update the populated count of a cgroup
673  * @cgrp: the target cgroup
674  * @populated: inc or dec populated count
675  *
676  * One of the css_sets associated with @cgrp is either getting its first
677  * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
678  * count is propagated towards root so that a given cgroup's populated_cnt
679  * is zero iff the cgroup and all its descendants don't contain any tasks.
680  *
681  * @cgrp's interface file "cgroup.populated" is zero if
682  * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
683  * changes from or to zero, userland is notified that the content of the
684  * interface file has changed.  This can be used to detect when @cgrp and
685  * its descendants become populated or empty.
686  */
687 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
688 {
689 	lockdep_assert_held(&css_set_lock);
690 
691 	do {
692 		bool trigger;
693 
694 		if (populated)
695 			trigger = !cgrp->populated_cnt++;
696 		else
697 			trigger = !--cgrp->populated_cnt;
698 
699 		if (!trigger)
700 			break;
701 
702 		check_for_release(cgrp);
703 		cgroup_file_notify(&cgrp->events_file);
704 
705 		cgrp = cgroup_parent(cgrp);
706 	} while (cgrp);
707 }
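/*
 * Worked example (added for clarity): when the first task enters an empty
 * /a/b, b's populated_cnt goes 0->1, "cgroup.populated" watchers on b are
 * notified, and the increment propagates to /a and beyond only while those
 * counts were also zero; an ancestor that was already populated sees its
 * count change but no new notification, since nothing crossed zero.
 */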
708 
709 /**
710  * css_set_update_populated - update populated state of a css_set
711  * @cset: target css_set
712  * @populated: whether @cset is populated or depopulated
713  *
714  * @cset is either getting the first task or losing the last.  Update the
715  * ->populated_cnt of all associated cgroups accordingly.
716  */
717 static void css_set_update_populated(struct css_set *cset, bool populated)
718 {
719 	struct cgrp_cset_link *link;
720 
721 	lockdep_assert_held(&css_set_lock);
722 
723 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
724 		cgroup_update_populated(link->cgrp, populated);
725 }
726 
727 /**
728  * css_set_move_task - move a task from one css_set to another
729  * @task: task being moved
730  * @from_cset: css_set @task currently belongs to (may be NULL)
731  * @to_cset: new css_set @task is being moved to (may be NULL)
732  * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
733  *
734  * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
735  * css_set, @from_cset can be NULL.  If @task is being disassociated
736  * instead of moved, @to_cset can be NULL.
737  *
738  * This function automatically handles populated_cnt updates and
739  * css_task_iter adjustments but the caller is responsible for managing
740  * @from_cset and @to_cset's reference counts.
741  */
742 static void css_set_move_task(struct task_struct *task,
743 			      struct css_set *from_cset, struct css_set *to_cset,
744 			      bool use_mg_tasks)
745 {
746 	lockdep_assert_held(&css_set_lock);
747 
748 	if (to_cset && !css_set_populated(to_cset))
749 		css_set_update_populated(to_cset, true);
750 
751 	if (from_cset) {
752 		struct css_task_iter *it, *pos;
753 
754 		WARN_ON_ONCE(list_empty(&task->cg_list));
755 
756 		/*
757 		 * @task is leaving, advance task iterators which are
758 		 * pointing to it so that they can resume at the next
759 		 * position.  Advancing an iterator might remove it from
760 		 * the list, use safe walk.  See css_task_iter_advance*()
761 		 * for details.
762 		 */
763 		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
764 					 iters_node)
765 			if (it->task_pos == &task->cg_list)
766 				css_task_iter_advance(it);
767 
768 		list_del_init(&task->cg_list);
769 		if (!css_set_populated(from_cset))
770 			css_set_update_populated(from_cset, false);
771 	} else {
772 		WARN_ON_ONCE(!list_empty(&task->cg_list));
773 	}
774 
775 	if (to_cset) {
776 		/*
777 		 * We are synchronized through cgroup_threadgroup_rwsem
778 		 * against PF_EXITING setting such that we can't race
779 		 * against cgroup_exit() changing the css_set to
780 		 * init_css_set and dropping the old one.
781 		 */
782 		WARN_ON_ONCE(task->flags & PF_EXITING);
783 
784 		rcu_assign_pointer(task->cgroups, to_cset);
785 		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
786 							     &to_cset->tasks);
787 	}
788 }
789 
790 /*
791  * hash table for css_sets.  This improves the performance of finding
792  * an existing css_set. This hash doesn't (currently) take into
793  * account cgroups in empty hierarchies.
794  */
795 #define CSS_SET_HASH_BITS	7
796 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
797 
798 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
799 {
800 	unsigned long key = 0UL;
801 	struct cgroup_subsys *ss;
802 	int i;
803 
804 	for_each_subsys(ss, i)
805 		key += (unsigned long)css[i];
806 	key = (key >> 16) ^ key;
807 
808 	return key;
809 }
810 
811 static void put_css_set_locked(struct css_set *cset)
812 {
813 	struct cgrp_cset_link *link, *tmp_link;
814 	struct cgroup_subsys *ss;
815 	int ssid;
816 
817 	lockdep_assert_held(&css_set_lock);
818 
819 	if (!atomic_dec_and_test(&cset->refcount))
820 		return;
821 
822 	/* This css_set is dead. unlink it and release cgroup and css refs */
823 	for_each_subsys(ss, ssid) {
824 		list_del(&cset->e_cset_node[ssid]);
825 		css_put(cset->subsys[ssid]);
826 	}
827 	hash_del(&cset->hlist);
828 	css_set_count--;
829 
830 	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
831 		list_del(&link->cset_link);
832 		list_del(&link->cgrp_link);
833 		if (cgroup_parent(link->cgrp))
834 			cgroup_put(link->cgrp);
835 		kfree(link);
836 	}
837 
838 	kfree_rcu(cset, rcu_head);
839 }
840 
841 static void put_css_set(struct css_set *cset)
842 {
843 	unsigned long flags;
844 
845 	/*
846 	 * Ensure that the refcount doesn't hit zero while any readers
847 	 * can see it. Similar to atomic_dec_and_lock(), but for an
848 	 * rwlock
849 	 */
850 	if (atomic_add_unless(&cset->refcount, -1, 1))
851 		return;
852 
853 	spin_lock_irqsave(&css_set_lock, flags);
854 	put_css_set_locked(cset);
855 	spin_unlock_irqrestore(&css_set_lock, flags);
856 }
857 
858 /*
859  * refcounted get/put for css_set objects
860  */
861 static inline void get_css_set(struct css_set *cset)
862 {
863 	atomic_inc(&cset->refcount);
864 }
865 
866 /**
867  * compare_css_sets - helper function for find_existing_css_set().
868  * @cset: candidate css_set being tested
869  * @old_cset: existing css_set for a task
870  * @new_cgrp: cgroup that's being entered by the task
871  * @template: desired set of css pointers in css_set (pre-calculated)
872  *
873  * Returns true if "cset" matches "old_cset" except for the hierarchy
874  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
875  */
876 static bool compare_css_sets(struct css_set *cset,
877 			     struct css_set *old_cset,
878 			     struct cgroup *new_cgrp,
879 			     struct cgroup_subsys_state *template[])
880 {
881 	struct list_head *l1, *l2;
882 
883 	/*
884 	 * On the default hierarchy, there can be csets which are
885 	 * associated with the same set of cgroups but different csses.
886 	 * Let's first ensure that csses match.
887 	 */
888 	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
889 		return false;
890 
891 	/*
892 	 * Compare cgroup pointers in order to distinguish between
893 	 * different cgroups in hierarchies.  As different cgroups may
894 	 * share the same effective css, this comparison is always
895 	 * necessary.
896 	 */
897 	l1 = &cset->cgrp_links;
898 	l2 = &old_cset->cgrp_links;
899 	while (1) {
900 		struct cgrp_cset_link *link1, *link2;
901 		struct cgroup *cgrp1, *cgrp2;
902 
903 		l1 = l1->next;
904 		l2 = l2->next;
905 		/* See if we reached the end - both lists are equal length. */
906 		if (l1 == &cset->cgrp_links) {
907 			BUG_ON(l2 != &old_cset->cgrp_links);
908 			break;
909 		} else {
910 			BUG_ON(l2 == &old_cset->cgrp_links);
911 		}
912 		/* Locate the cgroups associated with these links. */
913 		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
914 		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
915 		cgrp1 = link1->cgrp;
916 		cgrp2 = link2->cgrp;
917 		/* Hierarchies should be linked in the same order. */
918 		BUG_ON(cgrp1->root != cgrp2->root);
919 
920 		/*
921 		 * If this hierarchy is the hierarchy of the cgroup
922 		 * that's changing, then we need to check that this
923 		 * css_set points to the new cgroup; if it's any other
924 		 * hierarchy, then this css_set should point to the
925 		 * same cgroup as the old css_set.
926 		 */
927 		if (cgrp1->root == new_cgrp->root) {
928 			if (cgrp1 != new_cgrp)
929 				return false;
930 		} else {
931 			if (cgrp1 != cgrp2)
932 				return false;
933 		}
934 	}
935 	return true;
936 }
937 
938 /**
939  * find_existing_css_set - init css array and find the matching css_set
940  * @old_cset: the css_set that we're using before the cgroup transition
941  * @cgrp: the cgroup that we're moving into
942  * @template: out param for the new set of csses, should be clear on entry
943  */
944 static struct css_set *find_existing_css_set(struct css_set *old_cset,
945 					struct cgroup *cgrp,
946 					struct cgroup_subsys_state *template[])
947 {
948 	struct cgroup_root *root = cgrp->root;
949 	struct cgroup_subsys *ss;
950 	struct css_set *cset;
951 	unsigned long key;
952 	int i;
953 
954 	/*
955 	 * Build the set of subsystem state objects that we want to see in the
956 	 * new css_set.  While subsystems can change globally, the entries here
957 	 * won't change, so no need for locking.
958 	 */
959 	for_each_subsys(ss, i) {
960 		if (root->subsys_mask & (1UL << i)) {
961 			/*
962 			 * @ss is in this hierarchy, so we want the
963 			 * effective css from @cgrp.
964 			 */
965 			template[i] = cgroup_e_css(cgrp, ss);
966 		} else {
967 			/*
968 			 * @ss is not in this hierarchy, so we don't want
969 			 * to change the css.
970 			 */
971 			template[i] = old_cset->subsys[i];
972 		}
973 	}
974 
975 	key = css_set_hash(template);
976 	hash_for_each_possible(css_set_table, cset, hlist, key) {
977 		if (!compare_css_sets(cset, old_cset, cgrp, template))
978 			continue;
979 
980 		/* This css_set matches what we need */
981 		return cset;
982 	}
983 
984 	/* No existing css_set matched */
985 	return NULL;
986 }
987 
988 static void free_cgrp_cset_links(struct list_head *links_to_free)
989 {
990 	struct cgrp_cset_link *link, *tmp_link;
991 
992 	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
993 		list_del(&link->cset_link);
994 		kfree(link);
995 	}
996 }
997 
998 /**
999  * allocate_cgrp_cset_links - allocate cgrp_cset_links
1000  * @count: the number of links to allocate
1001  * @tmp_links: list_head the allocated links are put on
1002  *
1003  * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
1004  * through ->cset_link.  Returns 0 on success or -errno.
1005  */
1006 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
1007 {
1008 	struct cgrp_cset_link *link;
1009 	int i;
1010 
1011 	INIT_LIST_HEAD(tmp_links);
1012 
1013 	for (i = 0; i < count; i++) {
1014 		link = kzalloc(sizeof(*link), GFP_KERNEL);
1015 		if (!link) {
1016 			free_cgrp_cset_links(tmp_links);
1017 			return -ENOMEM;
1018 		}
1019 		list_add(&link->cset_link, tmp_links);
1020 	}
1021 	return 0;
1022 }
1023 
1024 /**
1025  * link_css_set - a helper function to link a css_set to a cgroup
1026  * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
1027  * @cset: the css_set to be linked
1028  * @cgrp: the destination cgroup
1029  */
1030 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
1031 			 struct cgroup *cgrp)
1032 {
1033 	struct cgrp_cset_link *link;
1034 
1035 	BUG_ON(list_empty(tmp_links));
1036 
1037 	if (cgroup_on_dfl(cgrp))
1038 		cset->dfl_cgrp = cgrp;
1039 
1040 	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
1041 	link->cset = cset;
1042 	link->cgrp = cgrp;
1043 
1044 	/*
1045 	 * Always add links to the tail of the lists so that the lists are
1046 	 * in chronological order.
1047 	 */
1048 	list_move_tail(&link->cset_link, &cgrp->cset_links);
1049 	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
1050 
1051 	if (cgroup_parent(cgrp))
1052 		cgroup_get(cgrp);
1053 }
1054 
1055 /**
1056  * find_css_set - return a new css_set with one cgroup updated
1057  * @old_cset: the baseline css_set
1058  * @cgrp: the cgroup to be updated
1059  *
1060  * Return a new css_set that's equivalent to @old_cset, but with @cgrp
1061  * substituted into the appropriate hierarchy.
1062  */
1063 static struct css_set *find_css_set(struct css_set *old_cset,
1064 				    struct cgroup *cgrp)
1065 {
1066 	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
1067 	struct css_set *cset;
1068 	struct list_head tmp_links;
1069 	struct cgrp_cset_link *link;
1070 	struct cgroup_subsys *ss;
1071 	unsigned long key;
1072 	int ssid;
1073 
1074 	lockdep_assert_held(&cgroup_mutex);
1075 
1076 	/* First see if we already have a cgroup group that matches
1077 	 * the desired set */
1078 	spin_lock_irq(&css_set_lock);
1079 	cset = find_existing_css_set(old_cset, cgrp, template);
1080 	if (cset)
1081 		get_css_set(cset);
1082 	spin_unlock_irq(&css_set_lock);
1083 
1084 	if (cset)
1085 		return cset;
1086 
1087 	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
1088 	if (!cset)
1089 		return NULL;
1090 
1091 	/* Allocate all the cgrp_cset_link objects that we'll need */
1092 	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
1093 		kfree(cset);
1094 		return NULL;
1095 	}
1096 
1097 	atomic_set(&cset->refcount, 1);
1098 	INIT_LIST_HEAD(&cset->cgrp_links);
1099 	INIT_LIST_HEAD(&cset->tasks);
1100 	INIT_LIST_HEAD(&cset->mg_tasks);
1101 	INIT_LIST_HEAD(&cset->mg_preload_node);
1102 	INIT_LIST_HEAD(&cset->mg_node);
1103 	INIT_LIST_HEAD(&cset->task_iters);
1104 	INIT_HLIST_NODE(&cset->hlist);
1105 
1106 	/* Copy the set of subsystem state objects generated in
1107 	 * find_existing_css_set() */
1108 	memcpy(cset->subsys, template, sizeof(cset->subsys));
1109 
1110 	spin_lock_irq(&css_set_lock);
1111 	/* Add reference counts and links from the new css_set. */
1112 	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
1113 		struct cgroup *c = link->cgrp;
1114 
1115 		if (c->root == cgrp->root)
1116 			c = cgrp;
1117 		link_css_set(&tmp_links, cset, c);
1118 	}
1119 
1120 	BUG_ON(!list_empty(&tmp_links));
1121 
1122 	css_set_count++;
1123 
1124 	/* Add @cset to the hash table */
1125 	key = css_set_hash(cset->subsys);
1126 	hash_add(css_set_table, &cset->hlist, key);
1127 
1128 	for_each_subsys(ss, ssid) {
1129 		struct cgroup_subsys_state *css = cset->subsys[ssid];
1130 
1131 		list_add_tail(&cset->e_cset_node[ssid],
1132 			      &css->cgroup->e_csets[ssid]);
1133 		css_get(css);
1134 	}
1135 
1136 	spin_unlock_irq(&css_set_lock);
1137 
1138 	return cset;
1139 }
1140 
1141 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
1142 {
1143 	struct cgroup *root_cgrp = kf_root->kn->priv;
1144 
1145 	return root_cgrp->root;
1146 }
1147 
1148 static int cgroup_init_root_id(struct cgroup_root *root)
1149 {
1150 	int id;
1151 
1152 	lockdep_assert_held(&cgroup_mutex);
1153 
1154 	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
1155 	if (id < 0)
1156 		return id;
1157 
1158 	root->hierarchy_id = id;
1159 	return 0;
1160 }
1161 
1162 static void cgroup_exit_root_id(struct cgroup_root *root)
1163 {
1164 	lockdep_assert_held(&cgroup_mutex);
1165 
1166 	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1167 }
1168 
1169 static void cgroup_free_root(struct cgroup_root *root)
1170 {
1171 	if (root) {
1172 		idr_destroy(&root->cgroup_idr);
1173 		kfree(root);
1174 	}
1175 }
1176 
1177 static void cgroup_destroy_root(struct cgroup_root *root)
1178 {
1179 	struct cgroup *cgrp = &root->cgrp;
1180 	struct cgrp_cset_link *link, *tmp_link;
1181 
1182 	trace_cgroup_destroy_root(root);
1183 
1184 	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1185 
1186 	BUG_ON(atomic_read(&root->nr_cgrps));
1187 	BUG_ON(!list_empty(&cgrp->self.children));
1188 
1189 	/* Rebind all subsystems back to the default hierarchy */
1190 	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
1191 
1192 	/*
1193 	 * Release all the links from cset_links to this hierarchy's
1194 	 * root cgroup
1195 	 */
1196 	spin_lock_irq(&css_set_lock);
1197 
1198 	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1199 		list_del(&link->cset_link);
1200 		list_del(&link->cgrp_link);
1201 		kfree(link);
1202 	}
1203 
1204 	spin_unlock_irq(&css_set_lock);
1205 
1206 	if (!list_empty(&root->root_list)) {
1207 		list_del(&root->root_list);
1208 		cgroup_root_count--;
1209 	}
1210 
1211 	cgroup_exit_root_id(root);
1212 
1213 	mutex_unlock(&cgroup_mutex);
1214 
1215 	kernfs_destroy_root(root->kf_root);
1216 	cgroup_free_root(root);
1217 }
1218 
1219 /*
1220  * look up cgroup associated with current task's cgroup namespace on the
1221  * specified hierarchy
1222  */
1223 static struct cgroup *
1224 current_cgns_cgroup_from_root(struct cgroup_root *root)
1225 {
1226 	struct cgroup *res = NULL;
1227 	struct css_set *cset;
1228 
1229 	lockdep_assert_held(&css_set_lock);
1230 
1231 	rcu_read_lock();
1232 
1233 	cset = current->nsproxy->cgroup_ns->root_cset;
1234 	if (cset == &init_css_set) {
1235 		res = &root->cgrp;
1236 	} else {
1237 		struct cgrp_cset_link *link;
1238 
1239 		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1240 			struct cgroup *c = link->cgrp;
1241 
1242 			if (c->root == root) {
1243 				res = c;
1244 				break;
1245 			}
1246 		}
1247 	}
1248 	rcu_read_unlock();
1249 
1250 	BUG_ON(!res);
1251 	return res;
1252 }
1253 
1254 /* look up cgroup associated with given css_set on the specified hierarchy */
1255 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1256 					    struct cgroup_root *root)
1257 {
1258 	struct cgroup *res = NULL;
1259 
1260 	lockdep_assert_held(&cgroup_mutex);
1261 	lockdep_assert_held(&css_set_lock);
1262 
1263 	if (cset == &init_css_set) {
1264 		res = &root->cgrp;
1265 	} else {
1266 		struct cgrp_cset_link *link;
1267 
1268 		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1269 			struct cgroup *c = link->cgrp;
1270 
1271 			if (c->root == root) {
1272 				res = c;
1273 				break;
1274 			}
1275 		}
1276 	}
1277 
1278 	BUG_ON(!res);
1279 	return res;
1280 }
1281 
1282 /*
1283  * Return the cgroup for "task" from the given hierarchy. Must be
1284  * called with cgroup_mutex and css_set_lock held.
1285  */
1286 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
1287 					    struct cgroup_root *root)
1288 {
1289 	/*
1290 	 * No need to lock the task - since we hold cgroup_mutex the
1291 	 * task can't change groups, so the only thing that can happen
1292 	 * is that it exits and its css is set back to init_css_set.
1293 	 */
1294 	return cset_cgroup_from_root(task_css_set(task), root);
1295 }
1296 
1297 /*
1298  * A task must hold cgroup_mutex to modify cgroups.
1299  *
1300  * Any task can increment and decrement the count field without lock.
1301  * So in general, code holding cgroup_mutex can't rely on the count
1302  * field not changing.  However, if the count goes to zero, then only
1303  * cgroup_attach_task() can increment it again.  Because a count of zero
1304  * means that no tasks are currently attached, therefore there is no
1305  * way a task attached to that cgroup can fork (the other way to
1306  * increment the count).  So code holding cgroup_mutex can safely
1307  * assume that if the count is zero, it will stay zero. Similarly, if
1308  * a task holds cgroup_mutex on a cgroup with zero count, it
1309  * knows that the cgroup won't be removed, as cgroup_rmdir()
1310  * needs that mutex.
1311  *
1312  * A cgroup can only be deleted if both its 'count' of using tasks
1313  * is zero, and its list of 'children' cgroups is empty.  Since all
1314  * tasks in the system use _some_ cgroup, and since there is always at
1315  * least one task in the system (init, pid == 1), therefore, root cgroup
1316  * always has either children cgroups and/or using tasks.  So we don't
1317  * need a special hack to ensure that root cgroup cannot be deleted.
1318  *
1319  * P.S.  One more locking exception.  RCU is used to guard the
1320  * update of a tasks cgroup pointer by cgroup_attach_task()
1321  */
1322 
1323 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1324 static const struct file_operations proc_cgroupstats_operations;
1325 
1326 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1327 			      char *buf)
1328 {
1329 	struct cgroup_subsys *ss = cft->ss;
1330 
1331 	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1332 	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1333 		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1334 			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1335 			 cft->name);
1336 	else
1337 		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1338 	return buf;
1339 }
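/*
 * Illustrative result (added for clarity, not from the original source):
 * for a subsystem file such as the memory controller's "limit_in_bytes" on
 * a v1 hierarchy mounted without "noprefix", the buffer ends up holding
 * "memory.limit_in_bytes"; with CGRP_ROOT_NOPREFIX, or for cgroup core
 * files (cft->ss == NULL), it is just the bare cft->name.
 */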
1340 
1341 /**
1342  * cgroup_file_mode - deduce file mode of a control file
1343  * @cft: the control file in question
1344  *
1345  * S_IRUGO for read, S_IWUSR for write.
1346  */
1347 static umode_t cgroup_file_mode(const struct cftype *cft)
1348 {
1349 	umode_t mode = 0;
1350 
1351 	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1352 		mode |= S_IRUGO;
1353 
1354 	if (cft->write_u64 || cft->write_s64 || cft->write) {
1355 		if (cft->flags & CFTYPE_WORLD_WRITABLE)
1356 			mode |= S_IWUGO;
1357 		else
1358 			mode |= S_IWUSR;
1359 	}
1360 
1361 	return mode;
1362 }
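/*
 * Illustrative outcomes (added for clarity): a read-only counter with only
 * ->read_u64 set yields 0444; a writable knob with ->write set yields 0644,
 * or 0666 when the cftype is marked CFTYPE_WORLD_WRITABLE.
 */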
1363 
1364 /**
1365  * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
1366  * @subtree_control: the new subtree_control mask to consider
1367  * @this_ss_mask: available subsystems
1368  *
1369  * On the default hierarchy, a subsystem may request other subsystems to be
1370  * enabled together through its ->depends_on mask.  In such cases, more
1371  * subsystems than specified in "cgroup.subtree_control" may be enabled.
1372  *
1373  * This function calculates which subsystems need to be enabled if
1374  * @subtree_control is to be applied while restricted to @this_ss_mask.
1375  */
1376 static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
1377 {
1378 	u16 cur_ss_mask = subtree_control;
1379 	struct cgroup_subsys *ss;
1380 	int ssid;
1381 
1382 	lockdep_assert_held(&cgroup_mutex);
1383 
1384 	cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
1385 
1386 	while (true) {
1387 		u16 new_ss_mask = cur_ss_mask;
1388 
1389 		do_each_subsys_mask(ss, ssid, cur_ss_mask) {
1390 			new_ss_mask |= ss->depends_on;
1391 		} while_each_subsys_mask();
1392 
1393 		/*
1394 		 * Mask out subsystems which aren't available.  This can
1395 		 * happen only if some depended-upon subsystems were bound
1396 		 * to non-default hierarchies.
1397 		 */
1398 		new_ss_mask &= this_ss_mask;
1399 
1400 		if (new_ss_mask == cur_ss_mask)
1401 			break;
1402 		cur_ss_mask = new_ss_mask;
1403 	}
1404 
1405 	return cur_ss_mask;
1406 }
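/*
 * Worked example with hypothetical controllers (not taken from the source):
 * if "A" is listed in @subtree_control and A->depends_on includes "B", the
 * first pass adds B; if B in turn depends on "C", the next pass adds C, and
 * the loop stops once a pass adds nothing new.  Any controller not present
 * in @this_ss_mask is masked back out before the result is returned.
 */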
1407 
1408 /**
1409  * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1410  * @kn: the kernfs_node being serviced
1411  *
1412  * This helper undoes cgroup_kn_lock_live() and should be invoked before
1413  * the method finishes if locking succeeded.  Note that once this function
1414  * returns the cgroup returned by cgroup_kn_lock_live() may become
1415  * inaccessible any time.  If the caller intends to continue to access the
1416  * cgroup, it should pin it before invoking this function.
1417  */
1418 static void cgroup_kn_unlock(struct kernfs_node *kn)
1419 {
1420 	struct cgroup *cgrp;
1421 
1422 	if (kernfs_type(kn) == KERNFS_DIR)
1423 		cgrp = kn->priv;
1424 	else
1425 		cgrp = kn->parent->priv;
1426 
1427 	mutex_unlock(&cgroup_mutex);
1428 
1429 	kernfs_unbreak_active_protection(kn);
1430 	cgroup_put(cgrp);
1431 }
1432 
1433 /**
1434  * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1435  * @kn: the kernfs_node being serviced
1436  * @drain_offline: perform offline draining on the cgroup
1437  *
1438  * This helper is to be used by a cgroup kernfs method currently servicing
1439  * @kn.  It breaks the active protection, performs cgroup locking and
1440  * verifies that the associated cgroup is alive.  Returns the cgroup if
1441  * alive; otherwise, %NULL.  A successful return should be undone by a
1442  * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true, the
1443  * cgroup is drained of offlining csses before return.
1444  *
1445  * Any cgroup kernfs method implementation which requires locking the
1446  * associated cgroup should use this helper.  It avoids nesting cgroup
1447  * locking under kernfs active protection and allows all kernfs operations
1448  * including self-removal.
1449  */
1450 static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
1451 					  bool drain_offline)
1452 {
1453 	struct cgroup *cgrp;
1454 
1455 	if (kernfs_type(kn) == KERNFS_DIR)
1456 		cgrp = kn->priv;
1457 	else
1458 		cgrp = kn->parent->priv;
1459 
1460 	/*
1461 	 * We're gonna grab cgroup_mutex which nests outside kernfs
1462 	 * active_ref.  cgroup liveness check alone provides enough
1463 	 * protection against removal.  Ensure @cgrp stays accessible and
1464 	 * break the active_ref protection.
1465 	 */
1466 	if (!cgroup_tryget(cgrp))
1467 		return NULL;
1468 	kernfs_break_active_protection(kn);
1469 
1470 	if (drain_offline)
1471 		cgroup_lock_and_drain_offline(cgrp);
1472 	else
1473 		mutex_lock(&cgroup_mutex);
1474 
1475 	if (!cgroup_is_dead(cgrp))
1476 		return cgrp;
1477 
1478 	cgroup_kn_unlock(kn);
1479 	return NULL;
1480 }
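/*
 * Illustrative calling pattern (a sketch of how kernfs method
 * implementations elsewhere in this file use the pair above):
 *
 *	cgrp = cgroup_kn_lock_live(of->kn, false);
 *	if (!cgrp)
 *		return -ENODEV;
 *	... operate on cgrp under cgroup_mutex ...
 *	cgroup_kn_unlock(of->kn);
 */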
1481 
1482 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1483 {
1484 	char name[CGROUP_FILE_NAME_MAX];
1485 
1486 	lockdep_assert_held(&cgroup_mutex);
1487 
1488 	if (cft->file_offset) {
1489 		struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
1490 		struct cgroup_file *cfile = (void *)css + cft->file_offset;
1491 
1492 		spin_lock_irq(&cgroup_file_kn_lock);
1493 		cfile->kn = NULL;
1494 		spin_unlock_irq(&cgroup_file_kn_lock);
1495 	}
1496 
1497 	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1498 }
1499 
1500 /**
1501  * css_clear_dir - remove subsys files in a cgroup directory
1502  * @css: target css
1503  */
1504 static void css_clear_dir(struct cgroup_subsys_state *css)
1505 {
1506 	struct cgroup *cgrp = css->cgroup;
1507 	struct cftype *cfts;
1508 
1509 	if (!(css->flags & CSS_VISIBLE))
1510 		return;
1511 
1512 	css->flags &= ~CSS_VISIBLE;
1513 
1514 	list_for_each_entry(cfts, &css->ss->cfts, node)
1515 		cgroup_addrm_files(css, cgrp, cfts, false);
1516 }
1517 
1518 /**
1519  * css_populate_dir - create subsys files in a cgroup directory
1520  * @css: target css
1521  *
1522  * On failure, no file is added.
1523  */
1524 static int css_populate_dir(struct cgroup_subsys_state *css)
1525 {
1526 	struct cgroup *cgrp = css->cgroup;
1527 	struct cftype *cfts, *failed_cfts;
1528 	int ret;
1529 
1530 	if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
1531 		return 0;
1532 
1533 	if (!css->ss) {
1534 		if (cgroup_on_dfl(cgrp))
1535 			cfts = cgroup_dfl_base_files;
1536 		else
1537 			cfts = cgroup_legacy_base_files;
1538 
1539 		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1540 	}
1541 
1542 	list_for_each_entry(cfts, &css->ss->cfts, node) {
1543 		ret = cgroup_addrm_files(css, cgrp, cfts, true);
1544 		if (ret < 0) {
1545 			failed_cfts = cfts;
1546 			goto err;
1547 		}
1548 	}
1549 
1550 	css->flags |= CSS_VISIBLE;
1551 
1552 	return 0;
1553 err:
1554 	list_for_each_entry(cfts, &css->ss->cfts, node) {
1555 		if (cfts == failed_cfts)
1556 			break;
1557 		cgroup_addrm_files(css, cgrp, cfts, false);
1558 	}
1559 	return ret;
1560 }
1561 
1562 static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
1563 {
1564 	struct cgroup *dcgrp = &dst_root->cgrp;
1565 	struct cgroup_subsys *ss;
1566 	int ssid, i, ret;
1567 
1568 	lockdep_assert_held(&cgroup_mutex);
1569 
1570 	do_each_subsys_mask(ss, ssid, ss_mask) {
1571 		/*
1572 		 * If @ss has non-root csses attached to it, can't move.
1573 		 * If @ss is an implicit controller, it is exempt from this
1574 		 * rule and can be stolen.
1575 		 */
1576 		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
1577 		    !ss->implicit_on_dfl)
1578 			return -EBUSY;
1579 
1580 		/* can't move between two non-dummy roots either */
1581 		if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1582 			return -EBUSY;
1583 	} while_each_subsys_mask();
1584 
1585 	do_each_subsys_mask(ss, ssid, ss_mask) {
1586 		struct cgroup_root *src_root = ss->root;
1587 		struct cgroup *scgrp = &src_root->cgrp;
1588 		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
1589 		struct css_set *cset;
1590 
1591 		WARN_ON(!css || cgroup_css(dcgrp, ss));
1592 
1593 		/* disable from the source */
1594 		src_root->subsys_mask &= ~(1 << ssid);
1595 		WARN_ON(cgroup_apply_control(scgrp));
1596 		cgroup_finalize_control(scgrp, 0);
1597 
1598 		/* rebind */
1599 		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
1600 		rcu_assign_pointer(dcgrp->subsys[ssid], css);
1601 		ss->root = dst_root;
1602 		css->cgroup = dcgrp;
1603 
1604 		spin_lock_irq(&css_set_lock);
1605 		hash_for_each(css_set_table, i, cset, hlist)
1606 			list_move_tail(&cset->e_cset_node[ss->id],
1607 				       &dcgrp->e_csets[ss->id]);
1608 		spin_unlock_irq(&css_set_lock);
1609 
1610 		/* default hierarchy doesn't enable controllers by default */
1611 		dst_root->subsys_mask |= 1 << ssid;
1612 		if (dst_root == &cgrp_dfl_root) {
1613 			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
1614 		} else {
1615 			dcgrp->subtree_control |= 1 << ssid;
1616 			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
1617 		}
1618 
1619 		ret = cgroup_apply_control(dcgrp);
1620 		if (ret)
1621 			pr_warn("partial failure to rebind %s controller (err=%d)\n",
1622 				ss->name, ret);
1623 
1624 		if (ss->bind)
1625 			ss->bind(css);
1626 	} while_each_subsys_mask();
1627 
1628 	kernfs_activate(dcgrp->kn);
1629 	return 0;
1630 }
1631 
1632 static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1633 			    struct kernfs_root *kf_root)
1634 {
1635 	int len = 0;
1636 	char *buf = NULL;
1637 	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
1638 	struct cgroup *ns_cgroup;
1639 
1640 	buf = kmalloc(PATH_MAX, GFP_KERNEL);
1641 	if (!buf)
1642 		return -ENOMEM;
1643 
1644 	spin_lock_irq(&css_set_lock);
1645 	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
1646 	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
1647 	spin_unlock_irq(&css_set_lock);
1648 
1649 	if (len >= PATH_MAX)
1650 		len = -ERANGE;
1651 	else if (len > 0) {
1652 		seq_escape(sf, buf, " \t\n\\");
1653 		len = 0;
1654 	}
1655 	kfree(buf);
1656 	return len;
1657 }
1658 
1659 static int cgroup_show_options(struct seq_file *seq,
1660 			       struct kernfs_root *kf_root)
1661 {
1662 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1663 	struct cgroup_subsys *ss;
1664 	int ssid;
1665 
1666 	if (root != &cgrp_dfl_root)
1667 		for_each_subsys(ss, ssid)
1668 			if (root->subsys_mask & (1 << ssid))
1669 				seq_show_option(seq, ss->legacy_name, NULL);
1670 	if (root->flags & CGRP_ROOT_NOPREFIX)
1671 		seq_puts(seq, ",noprefix");
1672 	if (root->flags & CGRP_ROOT_XATTR)
1673 		seq_puts(seq, ",xattr");
1674 
1675 	spin_lock(&release_agent_path_lock);
1676 	if (strlen(root->release_agent_path))
1677 		seq_show_option(seq, "release_agent",
1678 				root->release_agent_path);
1679 	spin_unlock(&release_agent_path_lock);
1680 
1681 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1682 		seq_puts(seq, ",clone_children");
1683 	if (strlen(root->name))
1684 		seq_show_option(seq, "name", root->name);
1685 	return 0;
1686 }
1687 
1688 struct cgroup_sb_opts {
1689 	u16 subsys_mask;
1690 	unsigned int flags;
1691 	char *release_agent;
1692 	bool cpuset_clone_children;
1693 	char *name;
1694 	/* User explicitly requested empty subsystem */
1695 	bool none;
1696 };
1697 
1698 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1699 {
1700 	char *token, *o = data;
1701 	bool all_ss = false, one_ss = false;
1702 	u16 mask = U16_MAX;
1703 	struct cgroup_subsys *ss;
1704 	int nr_opts = 0;
1705 	int i;
1706 
1707 #ifdef CONFIG_CPUSETS
1708 	mask = ~((u16)1 << cpuset_cgrp_id);
1709 #endif
1710 
1711 	memset(opts, 0, sizeof(*opts));
1712 
1713 	while ((token = strsep(&o, ",")) != NULL) {
1714 		nr_opts++;
1715 
1716 		if (!*token)
1717 			return -EINVAL;
1718 		if (!strcmp(token, "none")) {
1719 			/* Explicitly have no subsystems */
1720 			opts->none = true;
1721 			continue;
1722 		}
1723 		if (!strcmp(token, "all")) {
1724 			/* Mutually exclusive option 'all' + subsystem name */
1725 			if (one_ss)
1726 				return -EINVAL;
1727 			all_ss = true;
1728 			continue;
1729 		}
1730 		if (!strcmp(token, "noprefix")) {
1731 			opts->flags |= CGRP_ROOT_NOPREFIX;
1732 			continue;
1733 		}
1734 		if (!strcmp(token, "clone_children")) {
1735 			opts->cpuset_clone_children = true;
1736 			continue;
1737 		}
1738 		if (!strcmp(token, "xattr")) {
1739 			opts->flags |= CGRP_ROOT_XATTR;
1740 			continue;
1741 		}
1742 		if (!strncmp(token, "release_agent=", 14)) {
1743 			/* Specifying two release agents is forbidden */
1744 			if (opts->release_agent)
1745 				return -EINVAL;
1746 			opts->release_agent =
1747 				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1748 			if (!opts->release_agent)
1749 				return -ENOMEM;
1750 			continue;
1751 		}
1752 		if (!strncmp(token, "name=", 5)) {
1753 			const char *name = token + 5;
1754 			/* Can't specify an empty name */
1755 			if (!strlen(name))
1756 				return -EINVAL;
1757 			/* Must match [\w.-]+ */
1758 			for (i = 0; i < strlen(name); i++) {
1759 				char c = name[i];
1760 				if (isalnum(c))
1761 					continue;
1762 				if ((c == '.') || (c == '-') || (c == '_'))
1763 					continue;
1764 				return -EINVAL;
1765 			}
1766 			/* Specifying two names is forbidden */
1767 			if (opts->name)
1768 				return -EINVAL;
1769 			opts->name = kstrndup(name,
1770 					      MAX_CGROUP_ROOT_NAMELEN - 1,
1771 					      GFP_KERNEL);
1772 			if (!opts->name)
1773 				return -ENOMEM;
1774 
1775 			continue;
1776 		}
1777 
1778 		for_each_subsys(ss, i) {
1779 			if (strcmp(token, ss->legacy_name))
1780 				continue;
1781 			if (!cgroup_ssid_enabled(i))
1782 				continue;
1783 			if (cgroup_ssid_no_v1(i))
1784 				continue;
1785 
1786 			/* Mutually exclusive option 'all' + subsystem name */
1787 			if (all_ss)
1788 				return -EINVAL;
1789 			opts->subsys_mask |= (1 << i);
1790 			one_ss = true;
1791 
1792 			break;
1793 		}
1794 		if (i == CGROUP_SUBSYS_COUNT)
1795 			return -ENOENT;
1796 	}
1797 
1798 	/*
1799 	 * If the 'all' option was specified, select all the subsystems.
1800 	 * Otherwise, if none of 'none', 'name=' or a subsystem name was
1801 	 * specified, default to 'all'.
1802 	 */
1803 	if (all_ss || (!one_ss && !opts->none && !opts->name))
1804 		for_each_subsys(ss, i)
1805 			if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
1806 				opts->subsys_mask |= (1 << i);
1807 
1808 	/*
1809 	 * We either have to specify by name or by subsystems. (So all
1810 	 * empty hierarchies must have a name).
1811 	 */
1812 	if (!opts->subsys_mask && !opts->name)
1813 		return -EINVAL;
1814 
1815 	/*
1816 	 * Option noprefix was introduced just for backward compatibility
1817 	 * with the old cpuset, so we allow noprefix only if mounting just
1818 	 * the cpuset subsystem.
1819 	 */
1820 	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1821 		return -EINVAL;
1822 
1823 	/* Can't specify "none" and some subsystems */
1824 	if (opts->subsys_mask && opts->none)
1825 		return -EINVAL;
1826 
1827 	return 0;
1828 }
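
/*
 * Illustrative sketch (not part of the original source): a v1 mount such as
 *
 *	mount -t cgroup -o noprefix,cpuset,name=mygrp none /mnt
 *
 * hands the string "noprefix,cpuset,name=mygrp" to parse_cgroupfs_options(),
 * which would set CGRP_ROOT_NOPREFIX in opts->flags, the cpuset bit in
 * opts->subsys_mask and opts->name to "mygrp".  The mount point and the
 * name "mygrp" are made up for the example.
 */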
1829 
1830 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1831 {
1832 	int ret = 0;
1833 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1834 	struct cgroup_sb_opts opts;
1835 	u16 added_mask, removed_mask;
1836 
1837 	if (root == &cgrp_dfl_root) {
1838 		pr_err("remount is not allowed\n");
1839 		return -EINVAL;
1840 	}
1841 
1842 	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1843 
1844 	/* See what subsystems are wanted */
1845 	ret = parse_cgroupfs_options(data, &opts);
1846 	if (ret)
1847 		goto out_unlock;
1848 
1849 	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1850 		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1851 			task_tgid_nr(current), current->comm);
1852 
1853 	added_mask = opts.subsys_mask & ~root->subsys_mask;
1854 	removed_mask = root->subsys_mask & ~opts.subsys_mask;
1855 
1856 	/* Don't allow flags or name to change at remount */
1857 	if ((opts.flags ^ root->flags) ||
1858 	    (opts.name && strcmp(opts.name, root->name))) {
1859 		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1860 		       opts.flags, opts.name ?: "", root->flags, root->name);
1861 		ret = -EINVAL;
1862 		goto out_unlock;
1863 	}
1864 
1865 	/* remounting is not allowed for populated hierarchies */
1866 	if (!list_empty(&root->cgrp.self.children)) {
1867 		ret = -EBUSY;
1868 		goto out_unlock;
1869 	}
1870 
1871 	ret = rebind_subsystems(root, added_mask);
1872 	if (ret)
1873 		goto out_unlock;
1874 
1875 	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
1876 
1877 	if (opts.release_agent) {
1878 		spin_lock(&release_agent_path_lock);
1879 		strcpy(root->release_agent_path, opts.release_agent);
1880 		spin_unlock(&release_agent_path_lock);
1881 	}
1882 
1883 	trace_cgroup_remount(root);
1884 
1885  out_unlock:
1886 	kfree(opts.release_agent);
1887 	kfree(opts.name);
1888 	mutex_unlock(&cgroup_mutex);
1889 	return ret;
1890 }
1891 
1892 /*
1893  * To reduce the fork() overhead for systems that are not actually using
1894  * their cgroups capability, we don't maintain the lists running through
1895  * each css_set to its tasks until we see the list actually used - in other
1896  * words after the first mount.
1897  */
1898 static bool use_task_css_set_links __read_mostly;
1899 
1900 static void cgroup_enable_task_cg_lists(void)
1901 {
1902 	struct task_struct *p, *g;
1903 
1904 	spin_lock_irq(&css_set_lock);
1905 
1906 	if (use_task_css_set_links)
1907 		goto out_unlock;
1908 
1909 	use_task_css_set_links = true;
1910 
1911 	/*
1912 	 * We need tasklist_lock because RCU is not safe against
1913 	 * while_each_thread(). Besides, a forking task that has passed
1914 	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
1915 	 * is not guaranteed to have its child immediately visible in the
1916 	 * tasklist if we walk through it with RCU.
1917 	 */
1918 	read_lock(&tasklist_lock);
1919 	do_each_thread(g, p) {
1920 		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1921 			     task_css_set(p) != &init_css_set);
1922 
1923 		/*
1924 		 * We must check whether the process is exiting; otherwise
1925 		 * we race with cgroup_exit() and the list entry won't be
1926 		 * deleted even though the process has exited.
1927 		 * Do it while holding siglock so that we don't end up
1928 		 * racing against cgroup_exit().
1929 		 *
1930 		 * Interrupts were already disabled while acquiring
1931 		 * the css_set_lock, so we do not need to disable it
1932 		 * again when acquiring the sighand->siglock here.
1933 		 */
1934 		spin_lock(&p->sighand->siglock);
1935 		if (!(p->flags & PF_EXITING)) {
1936 			struct css_set *cset = task_css_set(p);
1937 
1938 			if (!css_set_populated(cset))
1939 				css_set_update_populated(cset, true);
1940 			list_add_tail(&p->cg_list, &cset->tasks);
1941 			get_css_set(cset);
1942 		}
1943 		spin_unlock(&p->sighand->siglock);
1944 	} while_each_thread(g, p);
1945 	read_unlock(&tasklist_lock);
1946 out_unlock:
1947 	spin_unlock_irq(&css_set_lock);
1948 }
1949 
1950 static void init_cgroup_housekeeping(struct cgroup *cgrp)
1951 {
1952 	struct cgroup_subsys *ss;
1953 	int ssid;
1954 
1955 	INIT_LIST_HEAD(&cgrp->self.sibling);
1956 	INIT_LIST_HEAD(&cgrp->self.children);
1957 	INIT_LIST_HEAD(&cgrp->cset_links);
1958 	INIT_LIST_HEAD(&cgrp->pidlists);
1959 	mutex_init(&cgrp->pidlist_mutex);
1960 	cgrp->self.cgroup = cgrp;
1961 	cgrp->self.flags |= CSS_ONLINE;
1962 
1963 	for_each_subsys(ss, ssid)
1964 		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1965 
1966 	init_waitqueue_head(&cgrp->offline_waitq);
1967 	INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
1968 }
1969 
1970 static void init_cgroup_root(struct cgroup_root *root,
1971 			     struct cgroup_sb_opts *opts)
1972 {
1973 	struct cgroup *cgrp = &root->cgrp;
1974 
1975 	INIT_LIST_HEAD(&root->root_list);
1976 	atomic_set(&root->nr_cgrps, 1);
1977 	cgrp->root = root;
1978 	init_cgroup_housekeeping(cgrp);
1979 	idr_init(&root->cgroup_idr);
1980 
1981 	root->flags = opts->flags;
1982 	if (opts->release_agent)
1983 		strcpy(root->release_agent_path, opts->release_agent);
1984 	if (opts->name)
1985 		strcpy(root->name, opts->name);
1986 	if (opts->cpuset_clone_children)
1987 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1988 }
1989 
1990 static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
1991 {
1992 	LIST_HEAD(tmp_links);
1993 	struct cgroup *root_cgrp = &root->cgrp;
1994 	struct css_set *cset;
1995 	int i, ret;
1996 
1997 	lockdep_assert_held(&cgroup_mutex);
1998 
1999 	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
2000 	if (ret < 0)
2001 		goto out;
2002 	root_cgrp->id = ret;
2003 	root_cgrp->ancestor_ids[0] = ret;
2004 
2005 	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
2006 			      GFP_KERNEL);
2007 	if (ret)
2008 		goto out;
2009 
2010 	/*
2011 	 * We're accessing css_set_count without locking css_set_lock here,
2012 	 * but that's OK - it can only be increased by someone holding
2013 	 * cgroup_lock, and that's us.  Later rebinding may disable
2014 	 * controllers on the default hierarchy and thus create new csets,
2015 	 * which can't be more than the existing ones.  Allocate 2x.
2016 	 */
2017 	ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
2018 	if (ret)
2019 		goto cancel_ref;
2020 
2021 	ret = cgroup_init_root_id(root);
2022 	if (ret)
2023 		goto cancel_ref;
2024 
2025 	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
2026 					   KERNFS_ROOT_CREATE_DEACTIVATED,
2027 					   root_cgrp);
2028 	if (IS_ERR(root->kf_root)) {
2029 		ret = PTR_ERR(root->kf_root);
2030 		goto exit_root_id;
2031 	}
2032 	root_cgrp->kn = root->kf_root->kn;
2033 
2034 	ret = css_populate_dir(&root_cgrp->self);
2035 	if (ret)
2036 		goto destroy_root;
2037 
2038 	ret = rebind_subsystems(root, ss_mask);
2039 	if (ret)
2040 		goto destroy_root;
2041 
2042 	trace_cgroup_setup_root(root);
2043 
2044 	/*
2045 	 * There must be no failure case after here, since rebinding takes
2046 	 * care of subsystems' refcounts, which are explicitly dropped in
2047 	 * the failure exit path.
2048 	 */
2049 	list_add(&root->root_list, &cgroup_roots);
2050 	cgroup_root_count++;
2051 
2052 	/*
2053 	 * Link the root cgroup in this hierarchy into all the css_set
2054 	 * objects.
2055 	 */
2056 	spin_lock_irq(&css_set_lock);
2057 	hash_for_each(css_set_table, i, cset, hlist) {
2058 		link_css_set(&tmp_links, cset, root_cgrp);
2059 		if (css_set_populated(cset))
2060 			cgroup_update_populated(root_cgrp, true);
2061 	}
2062 	spin_unlock_irq(&css_set_lock);
2063 
2064 	BUG_ON(!list_empty(&root_cgrp->self.children));
2065 	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
2066 
2067 	kernfs_activate(root_cgrp->kn);
2068 	ret = 0;
2069 	goto out;
2070 
2071 destroy_root:
2072 	kernfs_destroy_root(root->kf_root);
2073 	root->kf_root = NULL;
2074 exit_root_id:
2075 	cgroup_exit_root_id(root);
2076 cancel_ref:
2077 	percpu_ref_exit(&root_cgrp->self.refcnt);
2078 out:
2079 	free_cgrp_cset_links(&tmp_links);
2080 	return ret;
2081 }
2082 
2083 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
2084 			 int flags, const char *unused_dev_name,
2085 			 void *data)
2086 {
2087 	bool is_v2 = fs_type == &cgroup2_fs_type;
2088 	struct super_block *pinned_sb = NULL;
2089 	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
2090 	struct cgroup_subsys *ss;
2091 	struct cgroup_root *root;
2092 	struct cgroup_sb_opts opts;
2093 	struct dentry *dentry;
2094 	int ret;
2095 	int i;
2096 	bool new_sb;
2097 
2098 	get_cgroup_ns(ns);
2099 
2100 	/* Check if the caller has permission to mount. */
2101 	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
2102 		put_cgroup_ns(ns);
2103 		return ERR_PTR(-EPERM);
2104 	}
2105 
2106 	/*
2107 	 * The first time anyone tries to mount a cgroup, enable the list
2108 	 * linking each css_set to its tasks and fix up all existing tasks.
2109 	 */
2110 	if (!use_task_css_set_links)
2111 		cgroup_enable_task_cg_lists();
2112 
2113 	if (is_v2) {
2114 		if (data) {
2115 			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
2116 			put_cgroup_ns(ns);
2117 			return ERR_PTR(-EINVAL);
2118 		}
2119 		cgrp_dfl_visible = true;
2120 		root = &cgrp_dfl_root;
2121 		cgroup_get(&root->cgrp);
2122 		goto out_mount;
2123 	}
2124 
2125 	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
2126 
2127 	/* First find the desired set of subsystems */
2128 	ret = parse_cgroupfs_options(data, &opts);
2129 	if (ret)
2130 		goto out_unlock;
2131 
2132 	/*
2133 	 * Destruction of cgroup root is asynchronous, so subsystems may
2134 	 * still be dying after the previous unmount.  Let's drain the
2135 	 * dying subsystems.  We just need to ensure that the ones
2136 	 * unmounted previously finish dying and don't care about new ones
2137 	 * starting.  Testing ref liveness is good enough.
2138 	 */
2139 	for_each_subsys(ss, i) {
2140 		if (!(opts.subsys_mask & (1 << i)) ||
2141 		    ss->root == &cgrp_dfl_root)
2142 			continue;
2143 
2144 		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
2145 			mutex_unlock(&cgroup_mutex);
2146 			msleep(10);
2147 			ret = restart_syscall();
2148 			goto out_free;
2149 		}
2150 		cgroup_put(&ss->root->cgrp);
2151 	}
2152 
2153 	for_each_root(root) {
2154 		bool name_match = false;
2155 
2156 		if (root == &cgrp_dfl_root)
2157 			continue;
2158 
2159 		/*
2160 		 * If we asked for a name then it must match.  Also, if
2161 		 * name matches but subsys_mask doesn't, we should fail.
2162 		 * Remember whether name matched.
2163 		 */
2164 		if (opts.name) {
2165 			if (strcmp(opts.name, root->name))
2166 				continue;
2167 			name_match = true;
2168 		}
2169 
2170 		/*
2171 		 * If we asked for subsystems (or explicitly for no
2172 		 * subsystems) then they must match.
2173 		 */
2174 		if ((opts.subsys_mask || opts.none) &&
2175 		    (opts.subsys_mask != root->subsys_mask)) {
2176 			if (!name_match)
2177 				continue;
2178 			ret = -EBUSY;
2179 			goto out_unlock;
2180 		}
2181 
2182 		if (root->flags ^ opts.flags)
2183 			pr_warn("new mount options do not match the existing superblock, will be ignored\n");
2184 
2185 		/*
2186 		 * We want to reuse @root whose lifetime is governed by its
2187 		 * ->cgrp.  Let's check whether @root is alive and keep it
2188 		 * that way.  As cgroup_kill_sb() can happen anytime, we
2189 		 * want to block it by pinning the sb so that @root doesn't
2190 		 * get killed before mount is complete.
2191 		 *
2192 		 * With the sb pinned, tryget_live can reliably indicate
2193 		 * whether @root can be reused.  If it's being killed,
2194 		 * drain it.  We can use wait_queue for the wait but this
2195 		 * path is super cold.  Let's just sleep a bit and retry.
2196 		 */
2197 		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
2198 		if (IS_ERR(pinned_sb) ||
2199 		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
2200 			mutex_unlock(&cgroup_mutex);
2201 			if (!IS_ERR_OR_NULL(pinned_sb))
2202 				deactivate_super(pinned_sb);
2203 			msleep(10);
2204 			ret = restart_syscall();
2205 			goto out_free;
2206 		}
2207 
2208 		ret = 0;
2209 		goto out_unlock;
2210 	}
2211 
2212 	/*
2213 	 * No such thing, create a new one.  name= matching without subsys
2214 	 * specification is allowed for already existing hierarchies but we
2215 	 * can't create a new one without subsys specification.
2216 	 */
2217 	if (!opts.subsys_mask && !opts.none) {
2218 		ret = -EINVAL;
2219 		goto out_unlock;
2220 	}
2221 
2222 	/* Hierarchies may only be created in the initial cgroup namespace. */
2223 	if (ns != &init_cgroup_ns) {
2224 		ret = -EPERM;
2225 		goto out_unlock;
2226 	}
2227 
2228 	root = kzalloc(sizeof(*root), GFP_KERNEL);
2229 	if (!root) {
2230 		ret = -ENOMEM;
2231 		goto out_unlock;
2232 	}
2233 
2234 	init_cgroup_root(root, &opts);
2235 
2236 	ret = cgroup_setup_root(root, opts.subsys_mask);
2237 	if (ret)
2238 		cgroup_free_root(root);
2239 
2240 out_unlock:
2241 	mutex_unlock(&cgroup_mutex);
2242 out_free:
2243 	kfree(opts.release_agent);
2244 	kfree(opts.name);
2245 
2246 	if (ret) {
2247 		put_cgroup_ns(ns);
2248 		return ERR_PTR(ret);
2249 	}
2250 out_mount:
2251 	dentry = kernfs_mount(fs_type, flags, root->kf_root,
2252 			      is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
2253 			      &new_sb);
2254 
2255 	/*
2256 	 * In non-init cgroup namespace, instead of root cgroup's
2257 	 * dentry, we return the dentry corresponding to the
2258 	 * cgroupns->root_cgrp.
2259 	 */
2260 	if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
2261 		struct dentry *nsdentry;
2262 		struct cgroup *cgrp;
2263 
2264 		mutex_lock(&cgroup_mutex);
2265 		spin_lock_irq(&css_set_lock);
2266 
2267 		cgrp = cset_cgroup_from_root(ns->root_cset, root);
2268 
2269 		spin_unlock_irq(&css_set_lock);
2270 		mutex_unlock(&cgroup_mutex);
2271 
2272 		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
2273 		dput(dentry);
2274 		dentry = nsdentry;
2275 	}
2276 
2277 	if (IS_ERR(dentry) || !new_sb)
2278 		cgroup_put(&root->cgrp);
2279 
2280 	/*
2281 	 * If @pinned_sb, we're reusing an existing root and holding an
2282 	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
2283 	 */
2284 	if (pinned_sb) {
2285 		WARN_ON(new_sb);
2286 		deactivate_super(pinned_sb);
2287 	}
2288 
2289 	put_cgroup_ns(ns);
2290 	return dentry;
2291 }
2292 
2293 static void cgroup_kill_sb(struct super_block *sb)
2294 {
2295 	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
2296 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
2297 
2298 	/*
2299 	 * If @root doesn't have any mounts or children, start killing it.
2300 	 * This prevents new mounts by disabling percpu_ref_tryget_live().
2301 	 * cgroup_mount() may wait for @root's release.
2302 	 *
2303 	 * And don't kill the default root.
2304 	 */
2305 	if (!list_empty(&root->cgrp.self.children) ||
2306 	    root == &cgrp_dfl_root)
2307 		cgroup_put(&root->cgrp);
2308 	else
2309 		percpu_ref_kill(&root->cgrp.self.refcnt);
2310 
2311 	kernfs_kill_sb(sb);
2312 }
2313 
2314 static struct file_system_type cgroup_fs_type = {
2315 	.name = "cgroup",
2316 	.mount = cgroup_mount,
2317 	.kill_sb = cgroup_kill_sb,
2318 	.fs_flags = FS_USERNS_MOUNT,
2319 };
2320 
2321 static struct file_system_type cgroup2_fs_type = {
2322 	.name = "cgroup2",
2323 	.mount = cgroup_mount,
2324 	.kill_sb = cgroup_kill_sb,
2325 	.fs_flags = FS_USERNS_MOUNT,
2326 };
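
/*
 * Usage sketch (assumption, not from the original source): these
 * file_system_types back the usual mount commands, e.g.
 *
 *	mount -t cgroup -o cpuset none /mnt/legacy
 *	mount -t cgroup2 none /mnt/unified
 *
 * cgroup2 rejects any mount option string, while the cgroup (v1) type goes
 * through parse_cgroupfs_options() above.  The mount points are made up
 * for the example.
 */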
2327 
2328 static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
2329 				 struct cgroup_namespace *ns)
2330 {
2331 	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
2332 
2333 	return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
2334 }
2335 
2336 int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
2337 		   struct cgroup_namespace *ns)
2338 {
2339 	int ret;
2340 
2341 	mutex_lock(&cgroup_mutex);
2342 	spin_lock_irq(&css_set_lock);
2343 
2344 	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
2345 
2346 	spin_unlock_irq(&css_set_lock);
2347 	mutex_unlock(&cgroup_mutex);
2348 
2349 	return ret;
2350 }
2351 EXPORT_SYMBOL_GPL(cgroup_path_ns);
2352 
2353 /**
2354  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
2355  * @task: target task
2356  * @buf: the buffer to write the path into
2357  * @buflen: the length of the buffer
2358  *
2359  * Determine @task's cgroup on the first (the one with the lowest non-zero
2360  * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
2361  * function grabs cgroup_mutex and shouldn't be used inside locks used by
2362  * cgroup controller callbacks.
2363  *
2364  * Return value is the same as kernfs_path().
2365  */
2366 int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
2367 {
2368 	struct cgroup_root *root;
2369 	struct cgroup *cgrp;
2370 	int hierarchy_id = 1;
2371 	int ret;
2372 
2373 	mutex_lock(&cgroup_mutex);
2374 	spin_lock_irq(&css_set_lock);
2375 
2376 	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
2377 
2378 	if (root) {
2379 		cgrp = task_cgroup_from_root(task, root);
2380 		ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
2381 	} else {
2382 		/* if no hierarchy exists, everyone is in "/" */
2383 		ret = strlcpy(buf, "/", buflen);
2384 	}
2385 
2386 	spin_unlock_irq(&css_set_lock);
2387 	mutex_unlock(&cgroup_mutex);
2388 	return ret;
2389 }
2390 EXPORT_SYMBOL_GPL(task_cgroup_path);
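
/*
 * Minimal usage sketch for task_cgroup_path() (illustrative only; the
 * buffer size and error handling are assumptions):
 *
 *	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
 *	int len;
 *
 *	if (buf) {
 *		len = task_cgroup_path(task, buf, PATH_MAX);
 *		if (len > 0 && len < PATH_MAX)
 *			pr_info("task is in %s\n", buf);
 *		kfree(buf);
 *	}
 */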
2391 
2392 /* used to track tasks and other necessary states during migration */
2393 struct cgroup_taskset {
2394 	/* the src and dst cset list running through cset->mg_node */
2395 	struct list_head	src_csets;
2396 	struct list_head	dst_csets;
2397 
2398 	/* the subsys currently being processed */
2399 	int			ssid;
2400 
2401 	/*
2402 	 * Fields for cgroup_taskset_*() iteration.
2403 	 *
2404 	 * Before migration is committed, the target migration tasks are on
2405 	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
2406 	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
2407 	 * or ->dst_csets depending on whether migration is committed.
2408 	 *
2409 	 * ->cur_csets and ->cur_task point to the current task position
2410 	 * during iteration.
2411 	 */
2412 	struct list_head	*csets;
2413 	struct css_set		*cur_cset;
2414 	struct task_struct	*cur_task;
2415 };
2416 
2417 #define CGROUP_TASKSET_INIT(tset)	(struct cgroup_taskset){	\
2418 	.src_csets		= LIST_HEAD_INIT(tset.src_csets),	\
2419 	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),	\
2420 	.csets			= &tset.src_csets,			\
2421 }
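
/*
 * Illustrative sketch of taskset iteration as done from a controller's
 * ->attach() or ->can_attach() callback (the callback body and the helper
 * are assumptions, not taken from this file):
 *
 *	struct cgroup_subsys_state *dst_css;
 *	struct task_struct *task;
 *
 *	for (task = cgroup_taskset_first(tset, &dst_css); task;
 *	     task = cgroup_taskset_next(tset, &dst_css))
 *		attach_task_to_dst(task, dst_css);	(hypothetical helper)
 */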
2422 
2423 /**
2424  * cgroup_taskset_add - try to add a migration target task to a taskset
2425  * @task: target task
2426  * @tset: target taskset
2427  *
2428  * Add @task, which is a migration target, to @tset.  This function becomes
2429  * a noop if @task doesn't need to be migrated.  @task's css_set should have
2430  * been added as a migration source and @task->cg_list will be moved from
2431  * the css_set's tasks list to the mg_tasks one.
2432  */
2433 static void cgroup_taskset_add(struct task_struct *task,
2434 			       struct cgroup_taskset *tset)
2435 {
2436 	struct css_set *cset;
2437 
2438 	lockdep_assert_held(&css_set_lock);
2439 
2440 	/* @task either already exited or can't exit until the end */
2441 	if (task->flags & PF_EXITING)
2442 		return;
2443 
2444 	/* leave @task alone if post_fork() hasn't linked it yet */
2445 	if (list_empty(&task->cg_list))
2446 		return;
2447 
2448 	cset = task_css_set(task);
2449 	if (!cset->mg_src_cgrp)
2450 		return;
2451 
2452 	list_move_tail(&task->cg_list, &cset->mg_tasks);
2453 	if (list_empty(&cset->mg_node))
2454 		list_add_tail(&cset->mg_node, &tset->src_csets);
2455 	if (list_empty(&cset->mg_dst_cset->mg_node))
2456 		list_move_tail(&cset->mg_dst_cset->mg_node,
2457 			       &tset->dst_csets);
2458 }
2459 
2460 /**
2461  * cgroup_taskset_first - reset taskset and return the first task
2462  * @tset: taskset of interest
2463  * @dst_cssp: output variable for the destination css
2464  *
2465  * @tset iteration is initialized and the first task is returned.
2466  */
2467 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
2468 					 struct cgroup_subsys_state **dst_cssp)
2469 {
2470 	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
2471 	tset->cur_task = NULL;
2472 
2473 	return cgroup_taskset_next(tset, dst_cssp);
2474 }
2475 
2476 /**
2477  * cgroup_taskset_next - iterate to the next task in taskset
2478  * @tset: taskset of interest
2479  * @dst_cssp: output variable for the destination css
2480  *
2481  * Return the next task in @tset.  Iteration must have been initialized
2482  * with cgroup_taskset_first().
2483  */
2484 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
2485 					struct cgroup_subsys_state **dst_cssp)
2486 {
2487 	struct css_set *cset = tset->cur_cset;
2488 	struct task_struct *task = tset->cur_task;
2489 
2490 	while (&cset->mg_node != tset->csets) {
2491 		if (!task)
2492 			task = list_first_entry(&cset->mg_tasks,
2493 						struct task_struct, cg_list);
2494 		else
2495 			task = list_next_entry(task, cg_list);
2496 
2497 		if (&task->cg_list != &cset->mg_tasks) {
2498 			tset->cur_cset = cset;
2499 			tset->cur_task = task;
2500 
2501 			/*
2502 			 * This function may be called both before and
2503 			 * after cgroup_taskset_migrate().  The two cases
2504 			 * can be distinguished by looking at whether @cset
2505 			 * has its ->mg_dst_cset set.
2506 			 */
2507 			if (cset->mg_dst_cset)
2508 				*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
2509 			else
2510 				*dst_cssp = cset->subsys[tset->ssid];
2511 
2512 			return task;
2513 		}
2514 
2515 		cset = list_next_entry(cset, mg_node);
2516 		task = NULL;
2517 	}
2518 
2519 	return NULL;
2520 }
2521 
2522 /**
2523  * cgroup_taskset_migrate - migrate a taskset
2524  * @tset: target taskset
2525  * @root: cgroup root the migration is taking place on
2526  *
2527  * Migrate tasks in @tset as setup by migration preparation functions.
2528  * This function fails iff one of the ->can_attach callbacks fails and
2529  * guarantees that either all or none of the tasks in @tset are migrated.
2530  * @tset is consumed regardless of success.
2531  */
2532 static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
2533 				  struct cgroup_root *root)
2534 {
2535 	struct cgroup_subsys *ss;
2536 	struct task_struct *task, *tmp_task;
2537 	struct css_set *cset, *tmp_cset;
2538 	int ssid, failed_ssid, ret;
2539 
2540 	/* methods shouldn't be called if no task is actually migrating */
2541 	if (list_empty(&tset->src_csets))
2542 		return 0;
2543 
2544 	/* check that we can legitimately attach to the cgroup */
2545 	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
2546 		if (ss->can_attach) {
2547 			tset->ssid = ssid;
2548 			ret = ss->can_attach(tset);
2549 			if (ret) {
2550 				failed_ssid = ssid;
2551 				goto out_cancel_attach;
2552 			}
2553 		}
2554 	} while_each_subsys_mask();
2555 
2556 	/*
2557 	 * Now that we're guaranteed success, proceed to move all tasks to
2558 	 * the new cgroup.  There are no failure cases after here, so this
2559 	 * is the commit point.
2560 	 */
2561 	spin_lock_irq(&css_set_lock);
2562 	list_for_each_entry(cset, &tset->src_csets, mg_node) {
2563 		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2564 			struct css_set *from_cset = task_css_set(task);
2565 			struct css_set *to_cset = cset->mg_dst_cset;
2566 
2567 			get_css_set(to_cset);
2568 			css_set_move_task(task, from_cset, to_cset, true);
2569 			put_css_set_locked(from_cset);
2570 		}
2571 	}
2572 	spin_unlock_irq(&css_set_lock);
2573 
2574 	/*
2575 	 * Migration is committed, all target tasks are now on dst_csets.
2576 	 * Nothing is sensitive to fork() after this point.  Notify
2577 	 * controllers that migration is complete.
2578 	 */
2579 	tset->csets = &tset->dst_csets;
2580 
2581 	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
2582 		if (ss->attach) {
2583 			tset->ssid = ssid;
2584 			ss->attach(tset);
2585 		}
2586 	} while_each_subsys_mask();
2587 
2588 	ret = 0;
2589 	goto out_release_tset;
2590 
2591 out_cancel_attach:
2592 	do_each_subsys_mask(ss, ssid, root->subsys_mask) {
2593 		if (ssid == failed_ssid)
2594 			break;
2595 		if (ss->cancel_attach) {
2596 			tset->ssid = ssid;
2597 			ss->cancel_attach(tset);
2598 		}
2599 	} while_each_subsys_mask();
2600 out_release_tset:
2601 	spin_lock_irq(&css_set_lock);
2602 	list_splice_init(&tset->dst_csets, &tset->src_csets);
2603 	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2604 		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2605 		list_del_init(&cset->mg_node);
2606 	}
2607 	spin_unlock_irq(&css_set_lock);
2608 	return ret;
2609 }
2610 
2611 /**
2612  * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
2613  * @dst_cgrp: destination cgroup to test
2614  *
2615  * On the default hierarchy, except for the root, subtree_control must be
2616  * zero for migration destination cgroups with tasks so that child cgroups
2617  * don't compete against tasks.
2618  */
2619 static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
2620 {
2621 	return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
2622 		!dst_cgrp->subtree_control;
2623 }
2624 
2625 /**
2626  * cgroup_migrate_finish - cleanup after attach
2627  * @preloaded_csets: list of preloaded css_sets
2628  *
2629  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
2630  * those functions for details.
2631  */
2632 static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2633 {
2634 	struct css_set *cset, *tmp_cset;
2635 
2636 	lockdep_assert_held(&cgroup_mutex);
2637 
2638 	spin_lock_irq(&css_set_lock);
2639 	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
2640 		cset->mg_src_cgrp = NULL;
2641 		cset->mg_dst_cgrp = NULL;
2642 		cset->mg_dst_cset = NULL;
2643 		list_del_init(&cset->mg_preload_node);
2644 		put_css_set_locked(cset);
2645 	}
2646 	spin_unlock_irq(&css_set_lock);
2647 }
2648 
2649 /**
2650  * cgroup_migrate_add_src - add a migration source css_set
2651  * @src_cset: the source css_set to add
2652  * @dst_cgrp: the destination cgroup
2653  * @preloaded_csets: list of preloaded css_sets
2654  *
2655  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
2656  * @src_cset and add it to @preloaded_csets, which should later be cleaned
2657  * up by cgroup_migrate_finish().
2658  *
2659  * This function may be called without holding cgroup_threadgroup_rwsem
2660  * even if the target is a process.  Threads may be created and destroyed
2661  * but as long as cgroup_mutex is not dropped, no new css_set can be put
2662  * into play and the preloaded css_sets are guaranteed to cover all
2663  * migrations.
2664  */
2665 static void cgroup_migrate_add_src(struct css_set *src_cset,
2666 				   struct cgroup *dst_cgrp,
2667 				   struct list_head *preloaded_csets)
2668 {
2669 	struct cgroup *src_cgrp;
2670 
2671 	lockdep_assert_held(&cgroup_mutex);
2672 	lockdep_assert_held(&css_set_lock);
2673 
2674 	/*
2675 	 * If ->dead, @src_set is associated with one or more dead cgroups
2676 	 * and doesn't contain any migratable tasks.  Ignore it early so
2677 	 * that the rest of migration path doesn't get confused by it.
2678 	 */
2679 	if (src_cset->dead)
2680 		return;
2681 
2682 	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2683 
2684 	if (!list_empty(&src_cset->mg_preload_node))
2685 		return;
2686 
2687 	WARN_ON(src_cset->mg_src_cgrp);
2688 	WARN_ON(src_cset->mg_dst_cgrp);
2689 	WARN_ON(!list_empty(&src_cset->mg_tasks));
2690 	WARN_ON(!list_empty(&src_cset->mg_node));
2691 
2692 	src_cset->mg_src_cgrp = src_cgrp;
2693 	src_cset->mg_dst_cgrp = dst_cgrp;
2694 	get_css_set(src_cset);
2695 	list_add(&src_cset->mg_preload_node, preloaded_csets);
2696 }
2697 
2698 /**
2699  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
2700  * @preloaded_csets: list of preloaded source css_sets
2701  *
2702  * Tasks are about to be moved and all the source css_sets have been
2703  * preloaded to @preloaded_csets.  This function looks up and pins all
2704  * destination css_sets, links each to its source, and appends them to
2705  * @preloaded_csets.
2706  *
2707  * This function must be called after cgroup_migrate_add_src() has been
2708  * called on each migration source css_set.  After migration is performed
2709  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
2710  * @preloaded_csets.
2711  */
2712 static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
2713 {
2714 	LIST_HEAD(csets);
2715 	struct css_set *src_cset, *tmp_cset;
2716 
2717 	lockdep_assert_held(&cgroup_mutex);
2718 
2719 	/* look up the dst cset for each src cset and link it to src */
2720 	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
2721 		struct css_set *dst_cset;
2722 
2723 		dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
2724 		if (!dst_cset)
2725 			goto err;
2726 
2727 		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2728 
2729 		/*
2730 		 * If src cset equals dst, it's noop.  Drop the src.
2731 		 * cgroup_migrate() will skip the cset too.  Note that we
2732 		 * can't handle src == dst as some nodes are used by both.
2733 		 */
2734 		if (src_cset == dst_cset) {
2735 			src_cset->mg_src_cgrp = NULL;
2736 			src_cset->mg_dst_cgrp = NULL;
2737 			list_del_init(&src_cset->mg_preload_node);
2738 			put_css_set(src_cset);
2739 			put_css_set(dst_cset);
2740 			continue;
2741 		}
2742 
2743 		src_cset->mg_dst_cset = dst_cset;
2744 
2745 		if (list_empty(&dst_cset->mg_preload_node))
2746 			list_add(&dst_cset->mg_preload_node, &csets);
2747 		else
2748 			put_css_set(dst_cset);
2749 	}
2750 
2751 	list_splice_tail(&csets, preloaded_csets);
2752 	return 0;
2753 err:
2754 	cgroup_migrate_finish(&csets);
2755 	return -ENOMEM;
2756 }
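
/*
 * The overall migration protocol, as a hedged sketch; cgroup_attach_task()
 * below is the canonical in-tree user, and "task" and "dst_cgrp" are
 * placeholders:
 *
 *	LIST_HEAD(preloaded_csets);
 *
 *	spin_lock_irq(&css_set_lock);
 *	cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &preloaded_csets);
 *	spin_unlock_irq(&css_set_lock);
 *
 *	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
 *	if (!ret)
 *		ret = cgroup_migrate(task, false, dst_cgrp->root);
 *	cgroup_migrate_finish(&preloaded_csets);
 */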
2757 
2758 /**
2759  * cgroup_migrate - migrate a process or task to a cgroup
2760  * @leader: the leader of the process or the task to migrate
2761  * @threadgroup: whether @leader points to the whole process or a single task
2762  * @root: cgroup root migration is taking place on
2763  *
2764  * Migrate a process or task denoted by @leader.  If migrating a process,
2765  * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
2766  * responsible for invoking cgroup_migrate_add_src() and
2767  * cgroup_migrate_prepare_dst() on the targets before invoking this
2768  * function and following up with cgroup_migrate_finish().
2769  *
2770  * As long as a controller's ->can_attach() doesn't fail, this function is
2771  * guaranteed to succeed.  This means that, excluding ->can_attach()
2772  * failure, when migrating multiple targets, the success or failure can be
2773  * decided for all targets by invoking cgroup_migrate_prepare_dst() before
2774  * actually starting migrating.
2775  */
2776 static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2777 			  struct cgroup_root *root)
2778 {
2779 	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
2780 	struct task_struct *task;
2781 
2782 	/*
2783 	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2784 	 * already PF_EXITING could be freed from underneath us unless we
2785 	 * take an rcu_read_lock.
2786 	 */
2787 	spin_lock_irq(&css_set_lock);
2788 	rcu_read_lock();
2789 	task = leader;
2790 	do {
2791 		cgroup_taskset_add(task, &tset);
2792 		if (!threadgroup)
2793 			break;
2794 	} while_each_thread(leader, task);
2795 	rcu_read_unlock();
2796 	spin_unlock_irq(&css_set_lock);
2797 
2798 	return cgroup_taskset_migrate(&tset, root);
2799 }
2800 
2801 /**
2802  * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2803  * @dst_cgrp: the cgroup to attach to
2804  * @leader: the task or the leader of the threadgroup to be attached
2805  * @threadgroup: attach the whole threadgroup?
2806  *
2807  * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
2808  */
2809 static int cgroup_attach_task(struct cgroup *dst_cgrp,
2810 			      struct task_struct *leader, bool threadgroup)
2811 {
2812 	LIST_HEAD(preloaded_csets);
2813 	struct task_struct *task;
2814 	int ret;
2815 
2816 	if (!cgroup_may_migrate_to(dst_cgrp))
2817 		return -EBUSY;
2818 
2819 	/* look up all src csets */
2820 	spin_lock_irq(&css_set_lock);
2821 	rcu_read_lock();
2822 	task = leader;
2823 	do {
2824 		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2825 				       &preloaded_csets);
2826 		if (!threadgroup)
2827 			break;
2828 	} while_each_thread(leader, task);
2829 	rcu_read_unlock();
2830 	spin_unlock_irq(&css_set_lock);
2831 
2832 	/* prepare dst csets and commit */
2833 	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
2834 	if (!ret)
2835 		ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
2836 
2837 	cgroup_migrate_finish(&preloaded_csets);
2838 
2839 	if (!ret)
2840 		trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
2841 
2842 	return ret;
2843 }
2844 
2845 static int cgroup_procs_write_permission(struct task_struct *task,
2846 					 struct cgroup *dst_cgrp,
2847 					 struct kernfs_open_file *of)
2848 {
2849 	const struct cred *cred = current_cred();
2850 	const struct cred *tcred = get_task_cred(task);
2851 	int ret = 0;
2852 
2853 	/*
2854 	 * even if we're attaching all tasks in the thread group, we only
2855 	 * need to check permissions on one of them.
2856 	 */
2857 	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2858 	    !uid_eq(cred->euid, tcred->uid) &&
2859 	    !uid_eq(cred->euid, tcred->suid) &&
2860 	    !ns_capable(tcred->user_ns, CAP_SYS_NICE))
2861 		ret = -EACCES;
2862 
2863 	if (!ret && cgroup_on_dfl(dst_cgrp)) {
2864 		struct super_block *sb = of->file->f_path.dentry->d_sb;
2865 		struct cgroup *cgrp;
2866 		struct inode *inode;
2867 
2868 		spin_lock_irq(&css_set_lock);
2869 		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2870 		spin_unlock_irq(&css_set_lock);
2871 
2872 		while (!cgroup_is_descendant(dst_cgrp, cgrp))
2873 			cgrp = cgroup_parent(cgrp);
2874 
2875 		ret = -ENOMEM;
2876 		inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
2877 		if (inode) {
2878 			ret = inode_permission(inode, MAY_WRITE);
2879 			iput(inode);
2880 		}
2881 	}
2882 
2883 	put_cred(tcred);
2884 	return ret;
2885 }
2886 
2887 /*
2888  * Find the task_struct of the task to attach by vpid and pass it along to the
2889  * function to attach either it or all tasks in its threadgroup. Will lock
2890  * cgroup_mutex and threadgroup.
2891  */
2892 static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2893 				    size_t nbytes, loff_t off, bool threadgroup)
2894 {
2895 	struct task_struct *tsk;
2896 	struct cgroup_subsys *ss;
2897 	struct cgroup *cgrp;
2898 	pid_t pid;
2899 	int ssid, ret;
2900 
2901 	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2902 		return -EINVAL;
2903 
2904 	cgrp = cgroup_kn_lock_live(of->kn, false);
2905 	if (!cgrp)
2906 		return -ENODEV;
2907 
2908 	percpu_down_write(&cgroup_threadgroup_rwsem);
2909 	rcu_read_lock();
2910 	if (pid) {
2911 		tsk = find_task_by_vpid(pid);
2912 		if (!tsk) {
2913 			ret = -ESRCH;
2914 			goto out_unlock_rcu;
2915 		}
2916 	} else {
2917 		tsk = current;
2918 	}
2919 
2920 	if (threadgroup)
2921 		tsk = tsk->group_leader;
2922 
2923 	/*
2924 	 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
2925 	 * If userland migrates such a kthread to a non-root cgroup, it can
2926 	 * become trapped in a cpuset, or an RT kthread may be born in a
2927 	 * cgroup with no rt_runtime allocated.  Just say no.
2928 	 */
2929 	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2930 		ret = -EINVAL;
2931 		goto out_unlock_rcu;
2932 	}
2933 
2934 	get_task_struct(tsk);
2935 	rcu_read_unlock();
2936 
2937 	ret = cgroup_procs_write_permission(tsk, cgrp, of);
2938 	if (!ret)
2939 		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2940 
2941 	put_task_struct(tsk);
2942 	goto out_unlock_threadgroup;
2943 
2944 out_unlock_rcu:
2945 	rcu_read_unlock();
2946 out_unlock_threadgroup:
2947 	percpu_up_write(&cgroup_threadgroup_rwsem);
2948 	for_each_subsys(ss, ssid)
2949 		if (ss->post_attach)
2950 			ss->post_attach();
2951 	cgroup_kn_unlock(of->kn);
2952 	return ret ?: nbytes;
2953 }
2954 
2955 /**
2956  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2957  * @from: attach to all cgroups of a given task
2958  * @tsk: the task to be attached
2959  */
2960 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2961 {
2962 	struct cgroup_root *root;
2963 	int retval = 0;
2964 
2965 	mutex_lock(&cgroup_mutex);
2966 	percpu_down_write(&cgroup_threadgroup_rwsem);
2967 	for_each_root(root) {
2968 		struct cgroup *from_cgrp;
2969 
2970 		if (root == &cgrp_dfl_root)
2971 			continue;
2972 
2973 		spin_lock_irq(&css_set_lock);
2974 		from_cgrp = task_cgroup_from_root(from, root);
2975 		spin_unlock_irq(&css_set_lock);
2976 
2977 		retval = cgroup_attach_task(from_cgrp, tsk, false);
2978 		if (retval)
2979 			break;
2980 	}
2981 	percpu_up_write(&cgroup_threadgroup_rwsem);
2982 	mutex_unlock(&cgroup_mutex);
2983 
2984 	return retval;
2985 }
2986 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2987 
2988 static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2989 				  char *buf, size_t nbytes, loff_t off)
2990 {
2991 	return __cgroup_procs_write(of, buf, nbytes, off, false);
2992 }
2993 
2994 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2995 				  char *buf, size_t nbytes, loff_t off)
2996 {
2997 	return __cgroup_procs_write(of, buf, nbytes, off, true);
2998 }
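
/*
 * From userland these handlers back the "tasks" and "cgroup.procs" files;
 * e.g. (illustrative shell, path made up for the example):
 *
 *	echo $PID > /sys/fs/cgroup/.../cgroup.procs
 *
 * writes a single PID which __cgroup_procs_write() resolves and attaches
 * together with its whole thread group.
 */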
2999 
3000 static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
3001 					  char *buf, size_t nbytes, loff_t off)
3002 {
3003 	struct cgroup *cgrp;
3004 
3005 	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
3006 
3007 	cgrp = cgroup_kn_lock_live(of->kn, false);
3008 	if (!cgrp)
3009 		return -ENODEV;
3010 	spin_lock(&release_agent_path_lock);
3011 	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
3012 		sizeof(cgrp->root->release_agent_path));
3013 	spin_unlock(&release_agent_path_lock);
3014 	cgroup_kn_unlock(of->kn);
3015 	return nbytes;
3016 }
3017 
3018 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
3019 {
3020 	struct cgroup *cgrp = seq_css(seq)->cgroup;
3021 
3022 	spin_lock(&release_agent_path_lock);
3023 	seq_puts(seq, cgrp->root->release_agent_path);
3024 	spin_unlock(&release_agent_path_lock);
3025 	seq_putc(seq, '\n');
3026 	return 0;
3027 }
3028 
3029 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
3030 {
3031 	seq_puts(seq, "0\n");
3032 	return 0;
3033 }
3034 
3035 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
3036 {
3037 	struct cgroup_subsys *ss;
3038 	bool printed = false;
3039 	int ssid;
3040 
3041 	do_each_subsys_mask(ss, ssid, ss_mask) {
3042 		if (printed)
3043 			seq_putc(seq, ' ');
3044 		seq_printf(seq, "%s", ss->name);
3045 		printed = true;
3046 	} while_each_subsys_mask();
3047 	if (printed)
3048 		seq_putc(seq, '\n');
3049 }
3050 
3051 /* show controllers which are enabled from the parent */
3052 static int cgroup_controllers_show(struct seq_file *seq, void *v)
3053 {
3054 	struct cgroup *cgrp = seq_css(seq)->cgroup;
3055 
3056 	cgroup_print_ss_mask(seq, cgroup_control(cgrp));
3057 	return 0;
3058 }
3059 
3060 /* show controllers which are enabled for a given cgroup's children */
3061 static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
3062 {
3063 	struct cgroup *cgrp = seq_css(seq)->cgroup;
3064 
3065 	cgroup_print_ss_mask(seq, cgrp->subtree_control);
3066 	return 0;
3067 }
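
/*
 * On the default hierarchy these two handlers back the "cgroup.controllers"
 * and "cgroup.subtree_control" files.  A hedged usage example (paths are
 * illustrative):
 *
 *	cat /sys/fs/cgroup/mygrp/cgroup.controllers
 *	echo "+memory -io" > /sys/fs/cgroup/mygrp/cgroup.subtree_control
 *
 * The write is parsed by cgroup_subtree_control_write() below as a space
 * separated list of controller names prefixed with '+' or '-'.
 */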
3068 
3069 /**
3070  * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
3071  * @cgrp: root of the subtree to update csses for
3072  *
3073  * @cgrp's control masks have changed and its subtree's css associations
3074  * need to be updated accordingly.  This function looks up all css_sets
3075  * which are attached to the subtree, creates the matching updated css_sets
3076  * and migrates the tasks to the new ones.
3077  */
3078 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
3079 {
3080 	LIST_HEAD(preloaded_csets);
3081 	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
3082 	struct cgroup_subsys_state *d_css;
3083 	struct cgroup *dsct;
3084 	struct css_set *src_cset;
3085 	int ret;
3086 
3087 	lockdep_assert_held(&cgroup_mutex);
3088 
3089 	percpu_down_write(&cgroup_threadgroup_rwsem);
3090 
3091 	/* look up all csses currently attached to @cgrp's subtree */
3092 	spin_lock_irq(&css_set_lock);
3093 	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3094 		struct cgrp_cset_link *link;
3095 
3096 		list_for_each_entry(link, &dsct->cset_links, cset_link)
3097 			cgroup_migrate_add_src(link->cset, dsct,
3098 					       &preloaded_csets);
3099 	}
3100 	spin_unlock_irq(&css_set_lock);
3101 
3102 	/* NULL dst indicates self on default hierarchy */
3103 	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
3104 	if (ret)
3105 		goto out_finish;
3106 
3107 	spin_lock_irq(&css_set_lock);
3108 	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
3109 		struct task_struct *task, *ntask;
3110 
3111 		/* src_csets precede dst_csets, break on the first dst_cset */
3112 		if (!src_cset->mg_src_cgrp)
3113 			break;
3114 
3115 		/* all tasks in src_csets need to be migrated */
3116 		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
3117 			cgroup_taskset_add(task, &tset);
3118 	}
3119 	spin_unlock_irq(&css_set_lock);
3120 
3121 	ret = cgroup_taskset_migrate(&tset, cgrp->root);
3122 out_finish:
3123 	cgroup_migrate_finish(&preloaded_csets);
3124 	percpu_up_write(&cgroup_threadgroup_rwsem);
3125 	return ret;
3126 }
3127 
3128 /**
3129  * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
3130  * @cgrp: root of the target subtree
3131  *
3132  * Because css offlining is asynchronous, userland may try to re-enable a
3133  * controller while the previous css is still around.  This function grabs
3134  * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
3135  */
3136 static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
3137 	__acquires(&cgroup_mutex)
3138 {
3139 	struct cgroup *dsct;
3140 	struct cgroup_subsys_state *d_css;
3141 	struct cgroup_subsys *ss;
3142 	int ssid;
3143 
3144 restart:
3145 	mutex_lock(&cgroup_mutex);
3146 
3147 	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3148 		for_each_subsys(ss, ssid) {
3149 			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3150 			DEFINE_WAIT(wait);
3151 
3152 			if (!css || !percpu_ref_is_dying(&css->refcnt))
3153 				continue;
3154 
3155 			cgroup_get(dsct);
3156 			prepare_to_wait(&dsct->offline_waitq, &wait,
3157 					TASK_UNINTERRUPTIBLE);
3158 
3159 			mutex_unlock(&cgroup_mutex);
3160 			schedule();
3161 			finish_wait(&dsct->offline_waitq, &wait);
3162 
3163 			cgroup_put(dsct);
3164 			goto restart;
3165 		}
3166 	}
3167 }
3168 
3169 /**
3170  * cgroup_save_control - save control masks of a subtree
3171  * @cgrp: root of the target subtree
3172  *
3173  * Save ->subtree_control and ->subtree_ss_mask to the respective old_
3174  * prefixed fields for @cgrp's subtree including @cgrp itself.
3175  */
3176 static void cgroup_save_control(struct cgroup *cgrp)
3177 {
3178 	struct cgroup *dsct;
3179 	struct cgroup_subsys_state *d_css;
3180 
3181 	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3182 		dsct->old_subtree_control = dsct->subtree_control;
3183 		dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
3184 	}
3185 }
3186 
3187 /**
3188  * cgroup_propagate_control - refresh control masks of a subtree
3189  * @cgrp: root of the target subtree
3190  *
3191  * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
3192  * ->subtree_control and propagate controller availability through the
3193  * subtree so that descendants don't have unavailable controllers enabled.
3194  */
3195 static void cgroup_propagate_control(struct cgroup *cgrp)
3196 {
3197 	struct cgroup *dsct;
3198 	struct cgroup_subsys_state *d_css;
3199 
3200 	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3201 		dsct->subtree_control &= cgroup_control(dsct);
3202 		dsct->subtree_ss_mask =
3203 			cgroup_calc_subtree_ss_mask(dsct->subtree_control,
3204 						    cgroup_ss_mask(dsct));
3205 	}
3206 }
3207 
3208 /**
3209  * cgroup_restore_control - restore control masks of a subtree
3210  * @cgrp: root of the target subtree
3211  *
3212  * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
3213  * prefixed fields for @cgrp's subtree including @cgrp itself.
3214  */
3215 static void cgroup_restore_control(struct cgroup *cgrp)
3216 {
3217 	struct cgroup *dsct;
3218 	struct cgroup_subsys_state *d_css;
3219 
3220 	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3221 		dsct->subtree_control = dsct->old_subtree_control;
3222 		dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
3223 	}
3224 }
3225 
3226 static bool css_visible(struct cgroup_subsys_state *css)
3227 {
3228 	struct cgroup_subsys *ss = css->ss;
3229 	struct cgroup *cgrp = css->cgroup;
3230 
3231 	if (cgroup_control(cgrp) & (1 << ss->id))
3232 		return true;
3233 	if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
3234 		return false;
3235 	return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
3236 }
3237 
3238 /**
3239  * cgroup_apply_control_enable - enable or show csses according to control
3240  * @cgrp: root of the target subtree
3241  *
3242  * Walk @cgrp's subtree and create new csses or make the existing ones
3243  * visible.  A css is created invisible if it's being implicitly enabled
3244  * through dependency.  An invisible css is made visible when the userland
3245  * explicitly enables it.
3246  *
3247  * Returns 0 on success, -errno on failure.  On failure, csses which have
3248  * been processed already aren't cleaned up.  The caller is responsible for
3249  * cleaning up with cgroup_apply_control_disable().
3250  */
3251 static int cgroup_apply_control_enable(struct cgroup *cgrp)
3252 {
3253 	struct cgroup *dsct;
3254 	struct cgroup_subsys_state *d_css;
3255 	struct cgroup_subsys *ss;
3256 	int ssid, ret;
3257 
3258 	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3259 		for_each_subsys(ss, ssid) {
3260 			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3261 
3262 			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
3263 
3264 			if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
3265 				continue;
3266 
3267 			if (!css) {
3268 				css = css_create(dsct, ss);
3269 				if (IS_ERR(css))
3270 					return PTR_ERR(css);
3271 			}
3272 
3273 			if (css_visible(css)) {
3274 				ret = css_populate_dir(css);
3275 				if (ret)
3276 					return ret;
3277 			}
3278 		}
3279 	}
3280 
3281 	return 0;
3282 }
3283 
3284 /**
3285  * cgroup_apply_control_disable - kill or hide csses according to control
3286  * @cgrp: root of the target subtree
3287  *
3288  * Walk @cgrp's subtree and kill and hide csses so that they match
3289  * cgroup_ss_mask() and cgroup_visible_mask().
3290  *
3291  * A css is hidden when the userland requests it to be disabled while other
3292  * subsystems are still depending on it.  The css must not actively control
3293  * resources and be in the vanilla state if it's made visible again later.
3294  * Controllers which may be depended upon should provide ->css_reset() for
3295  * this purpose.
3296  */
3297 static void cgroup_apply_control_disable(struct cgroup *cgrp)
3298 {
3299 	struct cgroup *dsct;
3300 	struct cgroup_subsys_state *d_css;
3301 	struct cgroup_subsys *ss;
3302 	int ssid;
3303 
3304 	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3305 		for_each_subsys(ss, ssid) {
3306 			struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3307 
3308 			WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
3309 
3310 			if (!css)
3311 				continue;
3312 
3313 			if (css->parent &&
3314 			    !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
3315 				kill_css(css);
3316 			} else if (!css_visible(css)) {
3317 				css_clear_dir(css);
3318 				if (ss->css_reset)
3319 					ss->css_reset(css);
3320 			}
3321 		}
3322 	}
3323 }
3324 
3325 /**
3326  * cgroup_apply_control - apply control mask updates to the subtree
3327  * @cgrp: root of the target subtree
3328  *
3329  * subsystems can be enabled and disabled in a subtree using the following
3330  * steps.
3331  *
3332  * 1. Call cgroup_save_control() to stash the current state.
3333  * 2. Update ->subtree_control masks in the subtree as desired.
3334  * 3. Call cgroup_apply_control() to apply the changes.
3335  * 4. Optionally perform other related operations.
3336  * 5. Call cgroup_finalize_control() to finish up.
3337  *
3338  * This function implements step 3 and propagates the mask changes
3339  * throughout @cgrp's subtree, updates csses accordingly and performs
3340  * process migrations.
3341  */
3342 static int cgroup_apply_control(struct cgroup *cgrp)
3343 {
3344 	int ret;
3345 
3346 	cgroup_propagate_control(cgrp);
3347 
3348 	ret = cgroup_apply_control_enable(cgrp);
3349 	if (ret)
3350 		return ret;
3351 
3352 	/*
3353 	 * At this point, cgroup_e_css() results reflect the new csses
3354 	 * making the following cgroup_update_dfl_csses() properly update
3355 	 * css associations of all tasks in the subtree.
3356 	 */
3357 	ret = cgroup_update_dfl_csses(cgrp);
3358 	if (ret)
3359 		return ret;
3360 
3361 	return 0;
3362 }
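/*
 * Illustrative sketch (not part of the original source): the
 * save/apply/finalize sequence from the comment above, as a caller such
 * as cgroup_subtree_control_write() below uses it.  enable_mask and
 * disable_mask are assumed names.
 *
 *	cgroup_save_control(cgrp);
 *
 *	cgrp->subtree_control |= enable_mask;
 *	cgrp->subtree_control &= ~disable_mask;
 *
 *	ret = cgroup_apply_control(cgrp);
 *	cgroup_finalize_control(cgrp, ret);	(restores the saved masks on failure)
 */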
3363 
3364 /**
3365  * cgroup_finalize_control - finalize control mask update
3366  * @cgrp: root of the target subtree
3367  * @ret: the result of the update
3368  *
3369  * Finalize control mask update.  See cgroup_apply_control() for more info.
3370  */
3371 static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
3372 {
3373 	if (ret) {
3374 		cgroup_restore_control(cgrp);
3375 		cgroup_propagate_control(cgrp);
3376 	}
3377 
3378 	cgroup_apply_control_disable(cgrp);
3379 }
3380 
3381 /* change the enabled child controllers for a cgroup in the default hierarchy */
3382 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
3383 					    char *buf, size_t nbytes,
3384 					    loff_t off)
3385 {
3386 	u16 enable = 0, disable = 0;
3387 	struct cgroup *cgrp, *child;
3388 	struct cgroup_subsys *ss;
3389 	char *tok;
3390 	int ssid, ret;
3391 
3392 	/*
3393 	 * Parse input - space separated list of subsystem names prefixed
3394 	 * with either + or -.
3395 	 */
3396 	buf = strstrip(buf);
3397 	while ((tok = strsep(&buf, " "))) {
3398 		if (tok[0] == '\0')
3399 			continue;
3400 		do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
3401 			if (!cgroup_ssid_enabled(ssid) ||
3402 			    strcmp(tok + 1, ss->name))
3403 				continue;
3404 
3405 			if (*tok == '+') {
3406 				enable |= 1 << ssid;
3407 				disable &= ~(1 << ssid);
3408 			} else if (*tok == '-') {
3409 				disable |= 1 << ssid;
3410 				enable &= ~(1 << ssid);
3411 			} else {
3412 				return -EINVAL;
3413 			}
3414 			break;
3415 		} while_each_subsys_mask();
3416 		if (ssid == CGROUP_SUBSYS_COUNT)
3417 			return -EINVAL;
3418 	}
3419 
3420 	cgrp = cgroup_kn_lock_live(of->kn, true);
3421 	if (!cgrp)
3422 		return -ENODEV;
3423 
3424 	for_each_subsys(ss, ssid) {
3425 		if (enable & (1 << ssid)) {
3426 			if (cgrp->subtree_control & (1 << ssid)) {
3427 				enable &= ~(1 << ssid);
3428 				continue;
3429 			}
3430 
3431 			if (!(cgroup_control(cgrp) & (1 << ssid))) {
3432 				ret = -ENOENT;
3433 				goto out_unlock;
3434 			}
3435 		} else if (disable & (1 << ssid)) {
3436 			if (!(cgrp->subtree_control & (1 << ssid))) {
3437 				disable &= ~(1 << ssid);
3438 				continue;
3439 			}
3440 
3441 			/* a child has it enabled? */
3442 			cgroup_for_each_live_child(child, cgrp) {
3443 				if (child->subtree_control & (1 << ssid)) {
3444 					ret = -EBUSY;
3445 					goto out_unlock;
3446 				}
3447 			}
3448 		}
3449 	}
3450 
3451 	if (!enable && !disable) {
3452 		ret = 0;
3453 		goto out_unlock;
3454 	}
3455 
3456 	/*
3457 	 * Except for the root, subtree_control must be zero for a cgroup
3458 	 * with tasks so that child cgroups don't compete against tasks.
3459 	 */
3460 	if (enable && cgroup_parent(cgrp)) {
3461 		struct cgrp_cset_link *link;
3462 
3463 		/*
3464 		 * Because namespaces pin csets too, @cgrp->cset_links
3465 		 * might not be empty even when @cgrp is empty.  Walk and
3466 		 * verify each cset.
3467 		 */
3468 		spin_lock_irq(&css_set_lock);
3469 
3470 		ret = 0;
3471 		list_for_each_entry(link, &cgrp->cset_links, cset_link) {
3472 			if (css_set_populated(link->cset)) {
3473 				ret = -EBUSY;
3474 				break;
3475 			}
3476 		}
3477 
3478 		spin_unlock_irq(&css_set_lock);
3479 
3480 		if (ret)
3481 			goto out_unlock;
3482 	}
3483 
3484 	/* save and update control masks and prepare csses */
3485 	cgroup_save_control(cgrp);
3486 
3487 	cgrp->subtree_control |= enable;
3488 	cgrp->subtree_control &= ~disable;
3489 
3490 	ret = cgroup_apply_control(cgrp);
3491 	cgroup_finalize_control(cgrp, ret);
3492 	if (ret)
3493 		goto out_unlock;
3494 
3495 	kernfs_activate(cgrp->kn);
3496 out_unlock:
3497 	cgroup_kn_unlock(of->kn);
3498 	return ret ?: nbytes;
3499 }
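/*
 * Hedged userspace sketch (not part of the original source): the handler
 * above parses strings such as "+memory -pids" written to
 * cgroup.subtree_control on the default hierarchy.  The mount point and
 * group name below are assumptions.
 *
 *	int fd = open("/sys/fs/cgroup/mygroup/cgroup.subtree_control", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "+memory -pids", strlen("+memory -pids"));
 *		close(fd);
 *	}
 */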
3500 
3501 static int cgroup_events_show(struct seq_file *seq, void *v)
3502 {
3503 	seq_printf(seq, "populated %d\n",
3504 		   cgroup_is_populated(seq_css(seq)->cgroup));
3505 	return 0;
3506 }
3507 
3508 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
3509 				 size_t nbytes, loff_t off)
3510 {
3511 	struct cgroup *cgrp = of->kn->parent->priv;
3512 	struct cftype *cft = of->kn->priv;
3513 	struct cgroup_subsys_state *css;
3514 	int ret;
3515 
3516 	if (cft->write)
3517 		return cft->write(of, buf, nbytes, off);
3518 
3519 	/*
3520 	 * kernfs guarantees that a file isn't deleted with operations in
3521 	 * flight, which means that the matching css is and stays alive and
3522 	 * doesn't need to be pinned.  The RCU locking is not necessary
3523 	 * either.  It's just for the convenience of using cgroup_css().
3524 	 */
3525 	rcu_read_lock();
3526 	css = cgroup_css(cgrp, cft->ss);
3527 	rcu_read_unlock();
3528 
3529 	if (cft->write_u64) {
3530 		unsigned long long v;
3531 		ret = kstrtoull(buf, 0, &v);
3532 		if (!ret)
3533 			ret = cft->write_u64(css, cft, v);
3534 	} else if (cft->write_s64) {
3535 		long long v;
3536 		ret = kstrtoll(buf, 0, &v);
3537 		if (!ret)
3538 			ret = cft->write_s64(css, cft, v);
3539 	} else {
3540 		ret = -EINVAL;
3541 	}
3542 
3543 	return ret ?: nbytes;
3544 }
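/*
 * Illustrative sketch (not part of the original source): a cftype that
 * relies on the ->write_u64 path above instead of providing a raw
 * ->write handler.  The "example.*" names and handlers are assumptions.
 *
 *	static struct cftype example_files[] = {
 *		{
 *			.name = "example.limit",
 *			.read_u64 = example_limit_read,
 *			.write_u64 = example_limit_write,
 *		},
 *		{ }	(terminating entry)
 *	};
 */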
3545 
3546 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
3547 {
3548 	return seq_cft(seq)->seq_start(seq, ppos);
3549 }
3550 
3551 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
3552 {
3553 	return seq_cft(seq)->seq_next(seq, v, ppos);
3554 }
3555 
3556 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
3557 {
3558 	seq_cft(seq)->seq_stop(seq, v);
3559 }
3560 
3561 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
3562 {
3563 	struct cftype *cft = seq_cft(m);
3564 	struct cgroup_subsys_state *css = seq_css(m);
3565 
3566 	if (cft->seq_show)
3567 		return cft->seq_show(m, arg);
3568 
3569 	if (cft->read_u64)
3570 		seq_printf(m, "%llu\n", cft->read_u64(css, cft));
3571 	else if (cft->read_s64)
3572 		seq_printf(m, "%lld\n", cft->read_s64(css, cft));
3573 	else
3574 		return -EINVAL;
3575 	return 0;
3576 }
3577 
3578 static struct kernfs_ops cgroup_kf_single_ops = {
3579 	.atomic_write_len	= PAGE_SIZE,
3580 	.write			= cgroup_file_write,
3581 	.seq_show		= cgroup_seqfile_show,
3582 };
3583 
3584 static struct kernfs_ops cgroup_kf_ops = {
3585 	.atomic_write_len	= PAGE_SIZE,
3586 	.write			= cgroup_file_write,
3587 	.seq_start		= cgroup_seqfile_start,
3588 	.seq_next		= cgroup_seqfile_next,
3589 	.seq_stop		= cgroup_seqfile_stop,
3590 	.seq_show		= cgroup_seqfile_show,
3591 };
3592 
3593 /*
3594  * cgroup_rename - Only allow simple rename of directories in place.
3595  */
3596 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
3597 			 const char *new_name_str)
3598 {
3599 	struct cgroup *cgrp = kn->priv;
3600 	int ret;
3601 
3602 	if (kernfs_type(kn) != KERNFS_DIR)
3603 		return -ENOTDIR;
3604 	if (kn->parent != new_parent)
3605 		return -EIO;
3606 
3607 	/*
3608 	 * This isn't a proper migration and its usefulness is very
3609 	 * limited.  Disallow on the default hierarchy.
3610 	 */
3611 	if (cgroup_on_dfl(cgrp))
3612 		return -EPERM;
3613 
3614 	/*
3615 	 * We're gonna grab cgroup_mutex which nests outside kernfs
3616 	 * active_ref.  kernfs_rename() doesn't require active_ref
3617 	 * protection.  Break them before grabbing cgroup_mutex.
3618 	 */
3619 	kernfs_break_active_protection(new_parent);
3620 	kernfs_break_active_protection(kn);
3621 
3622 	mutex_lock(&cgroup_mutex);
3623 
3624 	ret = kernfs_rename(kn, new_parent, new_name_str);
3625 	if (!ret)
3626 		trace_cgroup_rename(cgrp);
3627 
3628 	mutex_unlock(&cgroup_mutex);
3629 
3630 	kernfs_unbreak_active_protection(kn);
3631 	kernfs_unbreak_active_protection(new_parent);
3632 	return ret;
3633 }
3634 
3635 /* set uid and gid of cgroup dirs and files to that of the creator */
3636 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3637 {
3638 	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
3639 			       .ia_uid = current_fsuid(),
3640 			       .ia_gid = current_fsgid(), };
3641 
3642 	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
3643 	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
3644 		return 0;
3645 
3646 	return kernfs_setattr(kn, &iattr);
3647 }
3648 
3649 static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3650 			   struct cftype *cft)
3651 {
3652 	char name[CGROUP_FILE_NAME_MAX];
3653 	struct kernfs_node *kn;
3654 	struct lock_class_key *key = NULL;
3655 	int ret;
3656 
3657 #ifdef CONFIG_DEBUG_LOCK_ALLOC
3658 	key = &cft->lockdep_key;
3659 #endif
3660 	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
3661 				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
3662 				  NULL, key);
3663 	if (IS_ERR(kn))
3664 		return PTR_ERR(kn);
3665 
3666 	ret = cgroup_kn_set_ugid(kn);
3667 	if (ret) {
3668 		kernfs_remove(kn);
3669 		return ret;
3670 	}
3671 
3672 	if (cft->file_offset) {
3673 		struct cgroup_file *cfile = (void *)css + cft->file_offset;
3674 
3675 		spin_lock_irq(&cgroup_file_kn_lock);
3676 		cfile->kn = kn;
3677 		spin_unlock_irq(&cgroup_file_kn_lock);
3678 	}
3679 
3680 	return 0;
3681 }
3682 
3683 /**
3684  * cgroup_addrm_files - add or remove files to a cgroup directory
3685  * @css: the target css
3686  * @cgrp: the target cgroup (usually css->cgroup)
3687  * @cfts: array of cftypes to be added
3688  * @is_add: whether to add or remove
3689  *
3690  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
3691  * For removals, this function never fails.
3692  */
3693 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3694 			      struct cgroup *cgrp, struct cftype cfts[],
3695 			      bool is_add)
3696 {
3697 	struct cftype *cft, *cft_end = NULL;
3698 	int ret = 0;
3699 
3700 	lockdep_assert_held(&cgroup_mutex);
3701 
3702 restart:
3703 	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3704 		/* does cft->flags tell us to skip this file on @cgrp? */
3705 		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3706 			continue;
3707 		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
3708 			continue;
3709 		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
3710 			continue;
3711 		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
3712 			continue;
3713 
3714 		if (is_add) {
3715 			ret = cgroup_add_file(css, cgrp, cft);
3716 			if (ret) {
3717 				pr_warn("%s: failed to add %s, err=%d\n",
3718 					__func__, cft->name, ret);
3719 				cft_end = cft;
3720 				is_add = false;
3721 				goto restart;
3722 			}
3723 		} else {
3724 			cgroup_rm_file(cgrp, cft);
3725 		}
3726 	}
3727 	return ret;
3728 }
3729 
3730 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3731 {
3732 	LIST_HEAD(pending);
3733 	struct cgroup_subsys *ss = cfts[0].ss;
3734 	struct cgroup *root = &ss->root->cgrp;
3735 	struct cgroup_subsys_state *css;
3736 	int ret = 0;
3737 
3738 	lockdep_assert_held(&cgroup_mutex);
3739 
3740 	/* add/rm files for all cgroups created before */
3741 	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
3742 		struct cgroup *cgrp = css->cgroup;
3743 
3744 		if (!(css->flags & CSS_VISIBLE))
3745 			continue;
3746 
3747 		ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3748 		if (ret)
3749 			break;
3750 	}
3751 
3752 	if (is_add && !ret)
3753 		kernfs_activate(root->kn);
3754 	return ret;
3755 }
3756 
3757 static void cgroup_exit_cftypes(struct cftype *cfts)
3758 {
3759 	struct cftype *cft;
3760 
3761 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
3762 		/* free copy for custom atomic_write_len, see init_cftypes() */
3763 		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
3764 			kfree(cft->kf_ops);
3765 		cft->kf_ops = NULL;
3766 		cft->ss = NULL;
3767 
3768 		/* revert flags set by cgroup core while adding @cfts */
3769 		cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3770 	}
3771 }
3772 
3773 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3774 {
3775 	struct cftype *cft;
3776 
3777 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
3778 		struct kernfs_ops *kf_ops;
3779 
3780 		WARN_ON(cft->ss || cft->kf_ops);
3781 
3782 		if (cft->seq_start)
3783 			kf_ops = &cgroup_kf_ops;
3784 		else
3785 			kf_ops = &cgroup_kf_single_ops;
3786 
3787 		/*
3788 		 * Ugh... if @cft wants a custom max_write_len, we need to
3789 		 * make a copy of kf_ops to set its atomic_write_len.
3790 		 */
3791 		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3792 			kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3793 			if (!kf_ops) {
3794 				cgroup_exit_cftypes(cfts);
3795 				return -ENOMEM;
3796 			}
3797 			kf_ops->atomic_write_len = cft->max_write_len;
3798 		}
3799 
3800 		cft->kf_ops = kf_ops;
3801 		cft->ss = ss;
3802 	}
3803 
3804 	return 0;
3805 }
3806 
3807 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3808 {
3809 	lockdep_assert_held(&cgroup_mutex);
3810 
3811 	if (!cfts || !cfts[0].ss)
3812 		return -ENOENT;
3813 
3814 	list_del(&cfts->node);
3815 	cgroup_apply_cftypes(cfts, false);
3816 	cgroup_exit_cftypes(cfts);
3817 	return 0;
3818 }
3819 
3820 /**
3821  * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
3822  * @cfts: zero-length name terminated array of cftypes
3823  *
3824  * Unregister @cfts.  Files described by @cfts are removed from all
3825  * existing cgroups and all future cgroups won't have them either.  This
3826  * function can be called anytime whether @cfts' subsys is attached or not.
3827  *
3828  * Returns 0 on successful unregistration, -ENOENT if @cfts is not
3829  * registered.
3830  */
3831 int cgroup_rm_cftypes(struct cftype *cfts)
3832 {
3833 	int ret;
3834 
3835 	mutex_lock(&cgroup_mutex);
3836 	ret = cgroup_rm_cftypes_locked(cfts);
3837 	mutex_unlock(&cgroup_mutex);
3838 	return ret;
3839 }
3840 
3841 /**
3842  * cgroup_add_cftypes - add an array of cftypes to a subsystem
3843  * @ss: target cgroup subsystem
3844  * @cfts: zero-length name terminated array of cftypes
3845  *
3846  * Register @cfts to @ss.  Files described by @cfts are created for all
3847  * existing cgroups to which @ss is attached and all future cgroups will
3848  * have them too.  This function can be called anytime whether @ss is
3849  * attached or not.
3850  *
3851  * Returns 0 on successful registration, -errno on failure.  Note that this
3852  * function currently returns 0 as long as @cfts registration is successful
3853  * even if some file creation attempts on existing cgroups fail.
3854  */
3855 static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3856 {
3857 	int ret;
3858 
3859 	if (!cgroup_ssid_enabled(ss->id))
3860 		return 0;
3861 
3862 	if (!cfts || cfts[0].name[0] == '\0')
3863 		return 0;
3864 
3865 	ret = cgroup_init_cftypes(ss, cfts);
3866 	if (ret)
3867 		return ret;
3868 
3869 	mutex_lock(&cgroup_mutex);
3870 
3871 	list_add_tail(&cfts->node, &ss->cfts);
3872 	ret = cgroup_apply_cftypes(cfts, true);
3873 	if (ret)
3874 		cgroup_rm_cftypes_locked(cfts);
3875 
3876 	mutex_unlock(&cgroup_mutex);
3877 	return ret;
3878 }
3879 
3880 /**
3881  * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
3882  * @ss: target cgroup subsystem
3883  * @cfts: zero-length name terminated array of cftypes
3884  *
3885  * Similar to cgroup_add_cftypes() but the added files are only used for
3886  * the default hierarchy.
3887  */
3888 int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3889 {
3890 	struct cftype *cft;
3891 
3892 	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3893 		cft->flags |= __CFTYPE_ONLY_ON_DFL;
3894 	return cgroup_add_cftypes(ss, cfts);
3895 }
3896 
3897 /**
3898  * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
3899  * @ss: target cgroup subsystem
3900  * @cfts: zero-length name terminated array of cftypes
3901  *
3902  * Similar to cgroup_add_cftypes() but the added files are only used for
3903  * the legacy hierarchies.
3904  */
3905 int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3906 {
3907 	struct cftype *cft;
3908 
3909 	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3910 		cft->flags |= __CFTYPE_NOT_ON_DFL;
3911 	return cgroup_add_cftypes(ss, cfts);
3912 }
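/*
 * Illustrative sketch (not part of the original source): a subsystem
 * usually registers its interface files once at init time, picking the
 * default-hierarchy and/or legacy variant.  example_ss and the cftype
 * arrays are assumed names.
 *
 *	ret = cgroup_add_dfl_cftypes(&example_ss, example_dfl_files);
 *	if (!ret)
 *		ret = cgroup_add_legacy_cftypes(&example_ss, example_legacy_files);
 */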
3913 
3914 /**
3915  * cgroup_file_notify - generate a file modified event for a cgroup_file
3916  * @cfile: target cgroup_file
3917  *
3918  * @cfile must have been obtained by setting cftype->file_offset.
3919  */
3920 void cgroup_file_notify(struct cgroup_file *cfile)
3921 {
3922 	unsigned long flags;
3923 
3924 	spin_lock_irqsave(&cgroup_file_kn_lock, flags);
3925 	if (cfile->kn)
3926 		kernfs_notify(cfile->kn);
3927 	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
3928 }
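/*
 * Illustrative sketch (not part of the original source): to use
 * cgroup_file_notify(), a controller embeds a struct cgroup_file in the
 * structure containing its css, points a cftype's ->file_offset at it
 * (the offset is taken relative to the css pointer, see cgroup_add_file()
 * above), and kicks poll/notify waiters when the reported state changes.
 * All "example_*" names are assumptions.
 *
 *	struct example_css {
 *		struct cgroup_subsys_state css;
 *		struct cgroup_file events_file;
 *	};
 *
 *	static struct cftype example_files[] = {
 *		{
 *			.name = "example.events",
 *			.file_offset = offsetof(struct example_css, events_file),
 *			.seq_show = example_events_show,
 *		},
 *		{ }
 *	};
 *
 *	On a state change:  cgroup_file_notify(&ex_css->events_file);
 */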
3929 
3930 /**
3931  * cgroup_task_count - count the number of tasks in a cgroup.
3932  * @cgrp: the cgroup in question
3933  *
3934  * Return the number of tasks in the cgroup.  The returned number can be
3935  * higher than the actual number of tasks due to css_set references from
3936  * namespace roots and temporary usages.
3937  */
3938 static int cgroup_task_count(const struct cgroup *cgrp)
3939 {
3940 	int count = 0;
3941 	struct cgrp_cset_link *link;
3942 
3943 	spin_lock_irq(&css_set_lock);
3944 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
3945 		count += atomic_read(&link->cset->refcount);
3946 	spin_unlock_irq(&css_set_lock);
3947 	return count;
3948 }
3949 
3950 /**
3951  * css_next_child - find the next child of a given css
3952  * @pos: the current position (%NULL to initiate traversal)
3953  * @parent: css whose children to walk
3954  *
3955  * This function returns the next child of @parent and should be called
3956  * under either cgroup_mutex or RCU read lock.  The only requirement is
3957  * that @parent and @pos are accessible.  The next sibling is guaranteed to
3958  * be returned regardless of their states.
3959  *
3960  * If a subsystem synchronizes ->css_online() and the start of iteration, a
3961  * css which finished ->css_online() is guaranteed to be visible in the
3962  * future iterations and will stay visible until the last reference is put.
3963  * A css which hasn't finished ->css_online() or already finished
3964  * ->css_offline() may show up during traversal.  It's each subsystem's
3965  * responsibility to synchronize against on/offlining.
3966  */
3967 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
3968 					   struct cgroup_subsys_state *parent)
3969 {
3970 	struct cgroup_subsys_state *next;
3971 
3972 	cgroup_assert_mutex_or_rcu_locked();
3973 
3974 	/*
3975 	 * @pos could already have been unlinked from the sibling list.
3976 	 * Once a cgroup is removed, its ->sibling.next is no longer
3977 	 * updated when its next sibling changes.  CSS_RELEASED is set when
3978 	 * @pos is taken off list, at which time its next pointer is valid,
3979 	 * and, as releases are serialized, the one pointed to by the next
3980 	 * pointer is guaranteed to not have started release yet.  This
3981 	 * implies that if we observe !CSS_RELEASED on @pos in this RCU
3982 	 * critical section, the one pointed to by its next pointer is
3983 	 * guaranteed to not have finished its RCU grace period even if we
3984  * have dropped rcu_read_lock() in between iterations.
3985 	 *
3986 	 * If @pos has CSS_RELEASED set, its next pointer can't be
3987 	 * dereferenced; however, as each css is given a monotonically
3988 	 * increasing unique serial number and always appended to the
3989 	 * sibling list, the next one can be found by walking the parent's
3990 	 * children until the first css with higher serial number than
3991 	 * @pos's.  While this path can be slower, it happens iff iteration
3992 	 * races against release and the race window is very small.
3993 	 */
3994 	if (!pos) {
3995 		next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
3996 	} else if (likely(!(pos->flags & CSS_RELEASED))) {
3997 		next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
3998 	} else {
3999 		list_for_each_entry_rcu(next, &parent->children, sibling)
4000 			if (next->serial_nr > pos->serial_nr)
4001 				break;
4002 	}
4003 
4004 	/*
4005 	 * @next, if not pointing to the head, can be dereferenced and is
4006 	 * the next sibling.
4007 	 */
4008 	if (&next->sibling != &parent->children)
4009 		return next;
4010 	return NULL;
4011 }
4012 
4013 /**
4014  * css_next_descendant_pre - find the next descendant for pre-order walk
4015  * @pos: the current position (%NULL to initiate traversal)
4016  * @root: css whose descendants to walk
4017  *
4018  * To be used by css_for_each_descendant_pre().  Find the next descendant
4019  * to visit for pre-order traversal of @root's descendants.  @root is
4020  * included in the iteration and the first node to be visited.
4021  *
4022  * While this function requires cgroup_mutex or RCU read locking, it
4023  * doesn't require the whole traversal to be contained in a single critical
4024  * section.  This function will return the correct next descendant as long
4025  * as both @pos and @root are accessible and @pos is a descendant of @root.
4026  *
4027  * If a subsystem synchronizes ->css_online() and the start of iteration, a
4028  * css which finished ->css_online() is guaranteed to be visible in the
4029  * future iterations and will stay visible until the last reference is put.
4030  * A css which hasn't finished ->css_online() or already finished
4031  * ->css_offline() may show up during traversal.  It's each subsystem's
4032  * responsibility to synchronize against on/offlining.
4033  */
4034 struct cgroup_subsys_state *
4035 css_next_descendant_pre(struct cgroup_subsys_state *pos,
4036 			struct cgroup_subsys_state *root)
4037 {
4038 	struct cgroup_subsys_state *next;
4039 
4040 	cgroup_assert_mutex_or_rcu_locked();
4041 
4042 	/* if first iteration, visit @root */
4043 	if (!pos)
4044 		return root;
4045 
4046 	/* visit the first child if exists */
4047 	next = css_next_child(NULL, pos);
4048 	if (next)
4049 		return next;
4050 
4051 	/* no child, visit my or the closest ancestor's next sibling */
4052 	while (pos != root) {
4053 		next = css_next_child(pos, pos->parent);
4054 		if (next)
4055 			return next;
4056 		pos = pos->parent;
4057 	}
4058 
4059 	return NULL;
4060 }
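/*
 * Illustrative usage (a sketch, not part of the original source):
 * pre-order walks are normally driven through css_for_each_descendant_pre()
 * under rcu_read_lock() or cgroup_mutex.  root_css and process() are
 * assumed names.
 *
 *	struct cgroup_subsys_state *pos;
 *
 *	rcu_read_lock();
 *	css_for_each_descendant_pre(pos, root_css)
 *		process(pos);	(@root_css itself is visited first)
 *	rcu_read_unlock();
 */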
4061 
4062 /**
4063  * css_rightmost_descendant - return the rightmost descendant of a css
4064  * @pos: css of interest
4065  *
4066  * Return the rightmost descendant of @pos.  If there's no descendant, @pos
4067  * is returned.  This can be used during pre-order traversal to skip
4068  * subtree of @pos.
4069  *
4070  * While this function requires cgroup_mutex or RCU read locking, it
4071  * doesn't require the whole traversal to be contained in a single critical
4072  * section.  This function will return the correct rightmost descendant as
4073  * long as @pos is accessible.
4074  */
4075 struct cgroup_subsys_state *
4076 css_rightmost_descendant(struct cgroup_subsys_state *pos)
4077 {
4078 	struct cgroup_subsys_state *last, *tmp;
4079 
4080 	cgroup_assert_mutex_or_rcu_locked();
4081 
4082 	do {
4083 		last = pos;
4084 		/* ->prev isn't RCU safe, walk ->next till the end */
4085 		pos = NULL;
4086 		css_for_each_child(tmp, last)
4087 			pos = tmp;
4088 	} while (pos);
4089 
4090 	return last;
4091 }
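/*
 * Illustrative sketch (not part of the original source): skipping an
 * entire subtree during a pre-order walk by jumping to its rightmost
 * descendant so that the next iteration continues with the following
 * sibling.  should_skip() and process() are assumed helpers.
 *
 *	css_for_each_descendant_pre(pos, root_css) {
 *		if (should_skip(pos))
 *			pos = css_rightmost_descendant(pos);
 *		else
 *			process(pos);
 *	}
 */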
4092 
4093 static struct cgroup_subsys_state *
4094 css_leftmost_descendant(struct cgroup_subsys_state *pos)
4095 {
4096 	struct cgroup_subsys_state *last;
4097 
4098 	do {
4099 		last = pos;
4100 		pos = css_next_child(NULL, pos);
4101 	} while (pos);
4102 
4103 	return last;
4104 }
4105 
4106 /**
4107  * css_next_descendant_post - find the next descendant for post-order walk
4108  * @pos: the current position (%NULL to initiate traversal)
4109  * @root: css whose descendants to walk
4110  *
4111  * To be used by css_for_each_descendant_post().  Find the next descendant
4112  * to visit for post-order traversal of @root's descendants.  @root is
4113  * included in the iteration and the last node to be visited.
4114  *
4115  * While this function requires cgroup_mutex or RCU read locking, it
4116  * doesn't require the whole traversal to be contained in a single critical
4117  * section.  This function will return the correct next descendant as long
4118  * as both @pos and @root are accessible and @pos is a descendant of
4119  * @root.
4120  *
4121  * If a subsystem synchronizes ->css_online() and the start of iteration, a
4122  * css which finished ->css_online() is guaranteed to be visible in the
4123  * future iterations and will stay visible until the last reference is put.
4124  * A css which hasn't finished ->css_online() or already finished
4125  * ->css_offline() may show up during traversal.  It's each subsystem's
4126  * responsibility to synchronize against on/offlining.
4127  */
4128 struct cgroup_subsys_state *
4129 css_next_descendant_post(struct cgroup_subsys_state *pos,
4130 			 struct cgroup_subsys_state *root)
4131 {
4132 	struct cgroup_subsys_state *next;
4133 
4134 	cgroup_assert_mutex_or_rcu_locked();
4135 
4136 	/* if first iteration, visit leftmost descendant which may be @root */
4137 	if (!pos)
4138 		return css_leftmost_descendant(root);
4139 
4140 	/* if we visited @root, we're done */
4141 	if (pos == root)
4142 		return NULL;
4143 
4144 	/* if there's an unvisited sibling, visit its leftmost descendant */
4145 	next = css_next_child(pos, pos->parent);
4146 	if (next)
4147 		return css_leftmost_descendant(next);
4148 
4149 	/* no sibling left, visit parent */
4150 	return pos->parent;
4151 }
4152 
4153 /**
4154  * css_has_online_children - does a css have online children
4155  * @css: the target css
4156  *
4157  * Returns %true if @css has any online children; otherwise, %false.  This
4158  * function can be called from any context but the caller is responsible
4159  * for synchronizing against on/offlining as necessary.
4160  */
4161 bool css_has_online_children(struct cgroup_subsys_state *css)
4162 {
4163 	struct cgroup_subsys_state *child;
4164 	bool ret = false;
4165 
4166 	rcu_read_lock();
4167 	css_for_each_child(child, css) {
4168 		if (child->flags & CSS_ONLINE) {
4169 			ret = true;
4170 			break;
4171 		}
4172 	}
4173 	rcu_read_unlock();
4174 	return ret;
4175 }
4176 
4177 /**
4178  * css_task_iter_advance_css_set - advance a task iterator to the next css_set
4179  * @it: the iterator to advance
4180  *
4181  * Advance @it to the next css_set to walk.
4182  */
4183 static void css_task_iter_advance_css_set(struct css_task_iter *it)
4184 {
4185 	struct list_head *l = it->cset_pos;
4186 	struct cgrp_cset_link *link;
4187 	struct css_set *cset;
4188 
4189 	lockdep_assert_held(&css_set_lock);
4190 
4191 	/* Advance to the next non-empty css_set */
4192 	do {
4193 		l = l->next;
4194 		if (l == it->cset_head) {
4195 			it->cset_pos = NULL;
4196 			it->task_pos = NULL;
4197 			return;
4198 		}
4199 
4200 		if (it->ss) {
4201 			cset = container_of(l, struct css_set,
4202 					    e_cset_node[it->ss->id]);
4203 		} else {
4204 			link = list_entry(l, struct cgrp_cset_link, cset_link);
4205 			cset = link->cset;
4206 		}
4207 	} while (!css_set_populated(cset));
4208 
4209 	it->cset_pos = l;
4210 
4211 	if (!list_empty(&cset->tasks))
4212 		it->task_pos = cset->tasks.next;
4213 	else
4214 		it->task_pos = cset->mg_tasks.next;
4215 
4216 	it->tasks_head = &cset->tasks;
4217 	it->mg_tasks_head = &cset->mg_tasks;
4218 
4219 	/*
4220 	 * We don't keep css_sets locked across iteration steps and thus
4221 	 * need to take steps to ensure that iteration can be resumed after
4222 	 * the lock is re-acquired.  Iteration is performed at two levels -
4223 	 * css_sets and tasks in them.
4224 	 *
4225 	 * Once created, a css_set never leaves its cgroup lists, so a
4226 	 * pinned css_set is guaranteed to stay put and we can resume
4227 	 * iteration afterwards.
4228 	 *
4229 	 * Tasks may leave @cset across iteration steps.  This is resolved
4230 	 * by registering each iterator with the css_set currently being
4231 	 * walked and making css_set_move_task() advance iterators whose
4232 	 * next task is leaving.
4233 	 */
4234 	if (it->cur_cset) {
4235 		list_del(&it->iters_node);
4236 		put_css_set_locked(it->cur_cset);
4237 	}
4238 	get_css_set(cset);
4239 	it->cur_cset = cset;
4240 	list_add(&it->iters_node, &cset->task_iters);
4241 }
4242 
4243 static void css_task_iter_advance(struct css_task_iter *it)
4244 {
4245 	struct list_head *l = it->task_pos;
4246 
4247 	lockdep_assert_held(&css_set_lock);
4248 	WARN_ON_ONCE(!l);
4249 
4250 	/*
4251 	 * Advance iterator to find next entry.  cset->tasks is consumed
4252 	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
4253 	 * next cset.
4254 	 */
4255 	l = l->next;
4256 
4257 	if (l == it->tasks_head)
4258 		l = it->mg_tasks_head->next;
4259 
4260 	if (l == it->mg_tasks_head)
4261 		css_task_iter_advance_css_set(it);
4262 	else
4263 		it->task_pos = l;
4264 }
4265 
4266 /**
4267  * css_task_iter_start - initiate task iteration
4268  * @css: the css to walk tasks of
4269  * @it: the task iterator to use
4270  *
4271  * Initiate iteration through the tasks of @css.  The caller can call
4272  * css_task_iter_next() to walk through the tasks until the function
4273  * returns NULL.  On completion of iteration, css_task_iter_end() must be
4274  * called.
4275  */
4276 void css_task_iter_start(struct cgroup_subsys_state *css,
4277 			 struct css_task_iter *it)
4278 {
4279 	/* no one should try to iterate before mounting cgroups */
4280 	WARN_ON_ONCE(!use_task_css_set_links);
4281 
4282 	memset(it, 0, sizeof(*it));
4283 
4284 	spin_lock_irq(&css_set_lock);
4285 
4286 	it->ss = css->ss;
4287 
4288 	if (it->ss)
4289 		it->cset_pos = &css->cgroup->e_csets[css->ss->id];
4290 	else
4291 		it->cset_pos = &css->cgroup->cset_links;
4292 
4293 	it->cset_head = it->cset_pos;
4294 
4295 	css_task_iter_advance_css_set(it);
4296 
4297 	spin_unlock_irq(&css_set_lock);
4298 }
4299 
4300 /**
4301  * css_task_iter_next - return the next task for the iterator
4302  * @it: the task iterator being iterated
4303  *
4304  * The "next" function for task iteration.  @it should have been
4305  * initialized via css_task_iter_start().  Returns NULL when the iteration
4306  * reaches the end.
4307  */
4308 struct task_struct *css_task_iter_next(struct css_task_iter *it)
4309 {
4310 	if (it->cur_task) {
4311 		put_task_struct(it->cur_task);
4312 		it->cur_task = NULL;
4313 	}
4314 
4315 	spin_lock_irq(&css_set_lock);
4316 
4317 	if (it->task_pos) {
4318 		it->cur_task = list_entry(it->task_pos, struct task_struct,
4319 					  cg_list);
4320 		get_task_struct(it->cur_task);
4321 		css_task_iter_advance(it);
4322 	}
4323 
4324 	spin_unlock_irq(&css_set_lock);
4325 
4326 	return it->cur_task;
4327 }
4328 
4329 /**
4330  * css_task_iter_end - finish task iteration
4331  * @it: the task iterator to finish
4332  *
4333  * Finish task iteration started by css_task_iter_start().
4334  */
4335 void css_task_iter_end(struct css_task_iter *it)
4336 {
4337 	if (it->cur_cset) {
4338 		spin_lock_irq(&css_set_lock);
4339 		list_del(&it->iters_node);
4340 		put_css_set_locked(it->cur_cset);
4341 		spin_unlock_irq(&css_set_lock);
4342 	}
4343 
4344 	if (it->cur_task)
4345 		put_task_struct(it->cur_task);
4346 }
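/*
 * Illustrative usage (a sketch, not part of the original source): the
 * canonical start/next/end pattern for walking every task attached to a
 * css, the same pattern pidlist_array_load() and cgroupstats_build()
 * use below.  handle_task() is an assumed helper.
 *
 *	struct css_task_iter it;
 *	struct task_struct *task;
 *
 *	css_task_iter_start(css, &it);
 *	while ((task = css_task_iter_next(&it)))
 *		handle_task(task);
 *	css_task_iter_end(&it);
 */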
4347 
4348 /**
4349  * cgroup_transfer_tasks - move tasks from one cgroup to another
4350  * @to: cgroup to which the tasks will be moved
4351  * @from: cgroup in which the tasks currently reside
4352  *
4353  * Locking rules between cgroup_post_fork() and the migration path
4354  * guarantee that, if a task is forking while being migrated, the new child
4355  * is guaranteed to be either visible in the source cgroup after the
4356  * parent's migration is complete or put into the target cgroup.  No task
4357  * can slip out of migration through forking.
4358  */
4359 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
4360 {
4361 	LIST_HEAD(preloaded_csets);
4362 	struct cgrp_cset_link *link;
4363 	struct css_task_iter it;
4364 	struct task_struct *task;
4365 	int ret;
4366 
4367 	if (!cgroup_may_migrate_to(to))
4368 		return -EBUSY;
4369 
4370 	mutex_lock(&cgroup_mutex);
4371 
4372 	percpu_down_write(&cgroup_threadgroup_rwsem);
4373 
4374 	/* all tasks in @from are being moved, all csets are source */
4375 	spin_lock_irq(&css_set_lock);
4376 	list_for_each_entry(link, &from->cset_links, cset_link)
4377 		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
4378 	spin_unlock_irq(&css_set_lock);
4379 
4380 	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
4381 	if (ret)
4382 		goto out_err;
4383 
4384 	/*
4385 	 * Migrate tasks one-by-one until @from is empty.  This fails iff
4386 	 * ->can_attach() fails.
4387 	 */
4388 	do {
4389 		css_task_iter_start(&from->self, &it);
4390 		task = css_task_iter_next(&it);
4391 		if (task)
4392 			get_task_struct(task);
4393 		css_task_iter_end(&it);
4394 
4395 		if (task) {
4396 			ret = cgroup_migrate(task, false, to->root);
4397 			if (!ret)
4398 				trace_cgroup_transfer_tasks(to, task, false);
4399 			put_task_struct(task);
4400 		}
4401 	} while (task && !ret);
4402 out_err:
4403 	cgroup_migrate_finish(&preloaded_csets);
4404 	percpu_up_write(&cgroup_threadgroup_rwsem);
4405 	mutex_unlock(&cgroup_mutex);
4406 	return ret;
4407 }
4408 
4409 /*
4410  * Stuff for reading the 'tasks'/'procs' files.
4411  *
4412  * Reading this file can return large amounts of data if a cgroup has
4413  * *lots* of attached tasks. So it may need several calls to read(),
4414  * but we cannot guarantee that the information we produce is correct
4415  * unless we produce it entirely atomically.
4416  *
4417  */
4418 
4419 /* which pidlist file are we talking about? */
4420 enum cgroup_filetype {
4421 	CGROUP_FILE_PROCS,
4422 	CGROUP_FILE_TASKS,
4423 };
4424 
4425 /*
4426  * A pidlist is a list of pids that virtually represents the contents of one
4427  * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
4428  * a pair (one each for procs, tasks) for each pid namespace that's relevant
4429  * to the cgroup.
4430  */
4431 struct cgroup_pidlist {
4432 	/*
4433 	 * used to find which pidlist is wanted. doesn't change as long as
4434 	 * this particular list stays in the list.
4435 	*/
4436 	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
4437 	/* array of xids */
4438 	pid_t *list;
4439 	/* how many elements the above list has */
4440 	int length;
4441 	/* each of these stored in a list by its cgroup */
4442 	struct list_head links;
4443 	/* pointer to the cgroup we belong to, for list removal purposes */
4444 	struct cgroup *owner;
4445 	/* for delayed destruction */
4446 	struct delayed_work destroy_dwork;
4447 };
4448 
4449 /*
4450  * The following two functions "fix" the issue where there are more pids
4451  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
4452  * TODO: replace with a kernel-wide solution to this problem
4453  */
4454 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
4455 static void *pidlist_allocate(int count)
4456 {
4457 	if (PIDLIST_TOO_LARGE(count))
4458 		return vmalloc(count * sizeof(pid_t));
4459 	else
4460 		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
4461 }
4462 
4463 static void pidlist_free(void *p)
4464 {
4465 	kvfree(p);
4466 }
4467 
4468 /*
4469  * Used to destroy all pidlists lingering waiting for destroy timer.  None
4470  * should be left afterwards.
4471  */
4472 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
4473 {
4474 	struct cgroup_pidlist *l, *tmp_l;
4475 
4476 	mutex_lock(&cgrp->pidlist_mutex);
4477 	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
4478 		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
4479 	mutex_unlock(&cgrp->pidlist_mutex);
4480 
4481 	flush_workqueue(cgroup_pidlist_destroy_wq);
4482 	BUG_ON(!list_empty(&cgrp->pidlists));
4483 }
4484 
4485 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
4486 {
4487 	struct delayed_work *dwork = to_delayed_work(work);
4488 	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
4489 						destroy_dwork);
4490 	struct cgroup_pidlist *tofree = NULL;
4491 
4492 	mutex_lock(&l->owner->pidlist_mutex);
4493 
4494 	/*
4495 	 * Destroy iff we didn't get queued again.  The state won't change
4496 	 * as destroy_dwork can only be queued while locked.
4497 	 */
4498 	if (!delayed_work_pending(dwork)) {
4499 		list_del(&l->links);
4500 		pidlist_free(l->list);
4501 		put_pid_ns(l->key.ns);
4502 		tofree = l;
4503 	}
4504 
4505 	mutex_unlock(&l->owner->pidlist_mutex);
4506 	kfree(tofree);
4507 }
4508 
4509 /*
4510  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
4511  * Returns the number of unique elements.
4512  */
4513 static int pidlist_uniq(pid_t *list, int length)
4514 {
4515 	int src, dest = 1;
4516 
4517 	/*
4518  * we presume the 0th element is unique, so src starts at 1. trivial
4519 	 * edge cases first; no work needs to be done for either
4520 	 */
4521 	if (length == 0 || length == 1)
4522 		return length;
4523 	/* src and dest walk down the list; dest counts unique elements */
4524 	for (src = 1; src < length; src++) {
4525 		/* find next unique element */
4526 		while (list[src] == list[src-1]) {
4527 			src++;
4528 			if (src == length)
4529 				goto after;
4530 		}
4531 		/* dest always points to where the next unique element goes */
4532 		list[dest] = list[src];
4533 		dest++;
4534 	}
4535 after:
4536 	return dest;
4537 }
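/*
 * Worked example (not part of the original source): on the sorted input
 * {3, 3, 5, 7, 7, 7} with length 6, the loop compacts the array so its
 * first three slots hold {3, 5, 7} and pidlist_uniq() returns 3 as the
 * new length.
 */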
4538 
4539 /*
4540  * The two pid files - tasks and cgroup.procs - guaranteed that the result
4541  * is sorted, which forced this whole pidlist fiasco.  As pid order is
4542  * different per namespace, each namespace needs differently sorted list,
4543  * making it impossible to use, for example, single rbtree of member tasks
4544  * sorted by task pointer.  As pidlists can be fairly large, allocating one
4545  * per open file is dangerous, so cgroup had to implement shared pool of
4546  * pidlists keyed by cgroup and namespace.
4547  *
4548  * All this extra complexity was caused by the original implementation
4549  * committing to an entirely unnecessary property.  In the long term, we
4550  * want to do away with it.  Explicitly scramble sort order if on the
4551  * default hierarchy so that no such expectation exists in the new
4552  * interface.
4553  *
4554  * Scrambling is done by swapping every two consecutive bits, which is
4555  * a non-identity one-to-one mapping which disturbs sort order sufficiently.
4556  */
4557 static pid_t pid_fry(pid_t pid)
4558 {
4559 	unsigned a = pid & 0x55555555;
4560 	unsigned b = pid & 0xAAAAAAAA;
4561 
4562 	return (a << 1) | (b >> 1);
4563 }
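/*
 * Worked example (not part of the original source): for pid 6 (0b0110),
 * a = 0b0100 and b = 0b0010, so pid_fry() returns (a << 1) | (b >> 1) =
 * 0b1000 | 0b0001 = 9.  Swapping adjacent bit pairs is invertible, so
 * the mapping stays one-to-one while scrambling numeric order.
 */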
4564 
4565 static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
4566 {
4567 	if (cgroup_on_dfl(cgrp))
4568 		return pid_fry(pid);
4569 	else
4570 		return pid;
4571 }
4572 
4573 static int cmppid(const void *a, const void *b)
4574 {
4575 	return *(pid_t *)a - *(pid_t *)b;
4576 }
4577 
4578 static int fried_cmppid(const void *a, const void *b)
4579 {
4580 	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
4581 }
4582 
4583 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
4584 						  enum cgroup_filetype type)
4585 {
4586 	struct cgroup_pidlist *l;
4587 	/* don't need task_nsproxy() if we're looking at ourself */
4588 	struct pid_namespace *ns = task_active_pid_ns(current);
4589 
4590 	lockdep_assert_held(&cgrp->pidlist_mutex);
4591 
4592 	list_for_each_entry(l, &cgrp->pidlists, links)
4593 		if (l->key.type == type && l->key.ns == ns)
4594 			return l;
4595 	return NULL;
4596 }
4597 
4598 /*
4599  * find the appropriate pidlist for our purpose (given procs vs tasks)
4600  * returns with the lock on that pidlist already held, and takes care
4601  * of the use count, or returns NULL with no locks held if we're out of
4602  * memory.
4603  */
4604 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
4605 						enum cgroup_filetype type)
4606 {
4607 	struct cgroup_pidlist *l;
4608 
4609 	lockdep_assert_held(&cgrp->pidlist_mutex);
4610 
4611 	l = cgroup_pidlist_find(cgrp, type);
4612 	if (l)
4613 		return l;
4614 
4615 	/* entry not found; create a new one */
4616 	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
4617 	if (!l)
4618 		return l;
4619 
4620 	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
4621 	l->key.type = type;
4622 	/* don't need task_nsproxy() if we're looking at ourself */
4623 	l->key.ns = get_pid_ns(task_active_pid_ns(current));
4624 	l->owner = cgrp;
4625 	list_add(&l->links, &cgrp->pidlists);
4626 	return l;
4627 }
4628 
4629 /*
4630  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
4631  */
4632 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
4633 			      struct cgroup_pidlist **lp)
4634 {
4635 	pid_t *array;
4636 	int length;
4637 	int pid, n = 0; /* used for populating the array */
4638 	struct css_task_iter it;
4639 	struct task_struct *tsk;
4640 	struct cgroup_pidlist *l;
4641 
4642 	lockdep_assert_held(&cgrp->pidlist_mutex);
4643 
4644 	/*
4645 	 * If cgroup gets more users after we read count, we won't have
4646 	 * enough space - tough.  This race is indistinguishable to the
4647 	 * caller from the case that the additional cgroup users didn't
4648 	 * show up until sometime later on.
4649 	 */
4650 	length = cgroup_task_count(cgrp);
4651 	array = pidlist_allocate(length);
4652 	if (!array)
4653 		return -ENOMEM;
4654 	/* now, populate the array */
4655 	css_task_iter_start(&cgrp->self, &it);
4656 	while ((tsk = css_task_iter_next(&it))) {
4657 		if (unlikely(n == length))
4658 			break;
4659 		/* get tgid or pid for procs or tasks file respectively */
4660 		if (type == CGROUP_FILE_PROCS)
4661 			pid = task_tgid_vnr(tsk);
4662 		else
4663 			pid = task_pid_vnr(tsk);
4664 		if (pid > 0) /* make sure to only use valid results */
4665 			array[n++] = pid;
4666 	}
4667 	css_task_iter_end(&it);
4668 	length = n;
4669 	/* now sort & (if procs) strip out duplicates */
4670 	if (cgroup_on_dfl(cgrp))
4671 		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
4672 	else
4673 		sort(array, length, sizeof(pid_t), cmppid, NULL);
4674 	if (type == CGROUP_FILE_PROCS)
4675 		length = pidlist_uniq(array, length);
4676 
4677 	l = cgroup_pidlist_find_create(cgrp, type);
4678 	if (!l) {
4679 		pidlist_free(array);
4680 		return -ENOMEM;
4681 	}
4682 
4683 	/* store array, freeing old if necessary */
4684 	pidlist_free(l->list);
4685 	l->list = array;
4686 	l->length = length;
4687 	*lp = l;
4688 	return 0;
4689 }
4690 
4691 /**
4692  * cgroupstats_build - build and fill cgroupstats
4693  * @stats: cgroupstats to fill information into
4694  * @dentry: A dentry entry belonging to the cgroup for which stats have
4695  * been requested.
4696  *
4697  * Build and fill cgroupstats so that taskstats can export it to user
4698  * space.
4699  */
4700 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
4701 {
4702 	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4703 	struct cgroup *cgrp;
4704 	struct css_task_iter it;
4705 	struct task_struct *tsk;
4706 
4707 	/* it should be a kernfs_node belonging to cgroupfs and a directory */
4708 	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
4709 	    kernfs_type(kn) != KERNFS_DIR)
4710 		return -EINVAL;
4711 
4712 	mutex_lock(&cgroup_mutex);
4713 
4714 	/*
4715 	 * We aren't being called from kernfs and there's no guarantee on
4716 	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
4717 	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
4718 	 */
4719 	rcu_read_lock();
4720 	cgrp = rcu_dereference(kn->priv);
4721 	if (!cgrp || cgroup_is_dead(cgrp)) {
4722 		rcu_read_unlock();
4723 		mutex_unlock(&cgroup_mutex);
4724 		return -ENOENT;
4725 	}
4726 	rcu_read_unlock();
4727 
4728 	css_task_iter_start(&cgrp->self, &it);
4729 	while ((tsk = css_task_iter_next(&it))) {
4730 		switch (tsk->state) {
4731 		case TASK_RUNNING:
4732 			stats->nr_running++;
4733 			break;
4734 		case TASK_INTERRUPTIBLE:
4735 			stats->nr_sleeping++;
4736 			break;
4737 		case TASK_UNINTERRUPTIBLE:
4738 			stats->nr_uninterruptible++;
4739 			break;
4740 		case TASK_STOPPED:
4741 			stats->nr_stopped++;
4742 			break;
4743 		default:
4744 			if (delayacct_is_task_waiting_on_io(tsk))
4745 				stats->nr_io_wait++;
4746 			break;
4747 		}
4748 	}
4749 	css_task_iter_end(&it);
4750 
4751 	mutex_unlock(&cgroup_mutex);
4752 	return 0;
4753 }
4754 
4755 
4756 /*
4757  * seq_file methods for the tasks/procs files. The seq_file position is the
4758  * next pid to display; the seq_file iterator is a pointer to the pid
4759  * in the cgroup->l->list array.
4760  */
4761 
4762 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
4763 {
4764 	/*
4765 	 * Initially we receive a position value that corresponds to
4766 	 * one more than the last pid shown (or 0 on the first call or
4767 	 * after a seek to the start). Use a binary-search to find the
4768 	 * next pid to display, if any
4769 	 */
4770 	struct kernfs_open_file *of = s->private;
4771 	struct cgroup *cgrp = seq_css(s)->cgroup;
4772 	struct cgroup_pidlist *l;
4773 	enum cgroup_filetype type = seq_cft(s)->private;
4774 	int index = 0, pid = *pos;
4775 	int *iter, ret;
4776 
4777 	mutex_lock(&cgrp->pidlist_mutex);
4778 
4779 	/*
4780 	 * !NULL @of->priv indicates that this isn't the first start()
4781 	 * after open.  If the matching pidlist is around, we can use that.
4782 	 * Look for it.  Note that @of->priv can't be used directly.  It
4783 	 * could already have been destroyed.
4784 	 */
4785 	if (of->priv)
4786 		of->priv = cgroup_pidlist_find(cgrp, type);
4787 
4788 	/*
4789 	 * Either this is the first start() after open or the matching
4790 	 * pidlist has been destroyed in between.  Create a new one.
4791 	 */
4792 	if (!of->priv) {
4793 		ret = pidlist_array_load(cgrp, type,
4794 					 (struct cgroup_pidlist **)&of->priv);
4795 		if (ret)
4796 			return ERR_PTR(ret);
4797 	}
4798 	l = of->priv;
4799 
4800 	if (pid) {
4801 		int end = l->length;
4802 
4803 		while (index < end) {
4804 			int mid = (index + end) / 2;
4805 			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
4806 				index = mid;
4807 				break;
4808 			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
4809 				index = mid + 1;
4810 			else
4811 				end = mid;
4812 		}
4813 	}
4814 	/* If we're off the end of the array, we're done */
4815 	if (index >= l->length)
4816 		return NULL;
4817 	/* Update the abstract position to be the actual pid that we found */
4818 	iter = l->list + index;
4819 	*pos = cgroup_pid_fry(cgrp, *iter);
4820 	return iter;
4821 }
4822 
4823 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
4824 {
4825 	struct kernfs_open_file *of = s->private;
4826 	struct cgroup_pidlist *l = of->priv;
4827 
4828 	if (l)
4829 		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
4830 				 CGROUP_PIDLIST_DESTROY_DELAY);
4831 	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
4832 }
4833 
4834 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
4835 {
4836 	struct kernfs_open_file *of = s->private;
4837 	struct cgroup_pidlist *l = of->priv;
4838 	pid_t *p = v;
4839 	pid_t *end = l->list + l->length;
4840 	/*
4841 	 * Advance to the next pid in the array. If this goes off the
4842 	 * end, we're done
4843 	 */
4844 	p++;
4845 	if (p >= end) {
4846 		return NULL;
4847 	} else {
4848 		*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
4849 		return p;
4850 	}
4851 }
4852 
4853 static int cgroup_pidlist_show(struct seq_file *s, void *v)
4854 {
4855 	seq_printf(s, "%d\n", *(int *)v);
4856 
4857 	return 0;
4858 }
4859 
4860 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
4861 					 struct cftype *cft)
4862 {
4863 	return notify_on_release(css->cgroup);
4864 }
4865 
4866 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
4867 					  struct cftype *cft, u64 val)
4868 {
4869 	if (val)
4870 		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4871 	else
4872 		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4873 	return 0;
4874 }
4875 
4876 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4877 				      struct cftype *cft)
4878 {
4879 	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4880 }
4881 
4882 static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4883 				       struct cftype *cft, u64 val)
4884 {
4885 	if (val)
4886 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4887 	else
4888 		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4889 	return 0;
4890 }
4891 
4892 /* cgroup core interface files for the default hierarchy */
4893 static struct cftype cgroup_dfl_base_files[] = {
4894 	{
4895 		.name = "cgroup.procs",
4896 		.file_offset = offsetof(struct cgroup, procs_file),
4897 		.seq_start = cgroup_pidlist_start,
4898 		.seq_next = cgroup_pidlist_next,
4899 		.seq_stop = cgroup_pidlist_stop,
4900 		.seq_show = cgroup_pidlist_show,
4901 		.private = CGROUP_FILE_PROCS,
4902 		.write = cgroup_procs_write,
4903 	},
4904 	{
4905 		.name = "cgroup.controllers",
4906 		.seq_show = cgroup_controllers_show,
4907 	},
4908 	{
4909 		.name = "cgroup.subtree_control",
4910 		.seq_show = cgroup_subtree_control_show,
4911 		.write = cgroup_subtree_control_write,
4912 	},
4913 	{
4914 		.name = "cgroup.events",
4915 		.flags = CFTYPE_NOT_ON_ROOT,
4916 		.file_offset = offsetof(struct cgroup, events_file),
4917 		.seq_show = cgroup_events_show,
4918 	},
4919 	{ }	/* terminate */
4920 };
4921 
4922 /* cgroup core interface files for the legacy hierarchies */
4923 static struct cftype cgroup_legacy_base_files[] = {
4924 	{
4925 		.name = "cgroup.procs",
4926 		.seq_start = cgroup_pidlist_start,
4927 		.seq_next = cgroup_pidlist_next,
4928 		.seq_stop = cgroup_pidlist_stop,
4929 		.seq_show = cgroup_pidlist_show,
4930 		.private = CGROUP_FILE_PROCS,
4931 		.write = cgroup_procs_write,
4932 	},
4933 	{
4934 		.name = "cgroup.clone_children",
4935 		.read_u64 = cgroup_clone_children_read,
4936 		.write_u64 = cgroup_clone_children_write,
4937 	},
4938 	{
4939 		.name = "cgroup.sane_behavior",
4940 		.flags = CFTYPE_ONLY_ON_ROOT,
4941 		.seq_show = cgroup_sane_behavior_show,
4942 	},
4943 	{
4944 		.name = "tasks",
4945 		.seq_start = cgroup_pidlist_start,
4946 		.seq_next = cgroup_pidlist_next,
4947 		.seq_stop = cgroup_pidlist_stop,
4948 		.seq_show = cgroup_pidlist_show,
4949 		.private = CGROUP_FILE_TASKS,
4950 		.write = cgroup_tasks_write,
4951 	},
4952 	{
4953 		.name = "notify_on_release",
4954 		.read_u64 = cgroup_read_notify_on_release,
4955 		.write_u64 = cgroup_write_notify_on_release,
4956 	},
4957 	{
4958 		.name = "release_agent",
4959 		.flags = CFTYPE_ONLY_ON_ROOT,
4960 		.seq_show = cgroup_release_agent_show,
4961 		.write = cgroup_release_agent_write,
4962 		.max_write_len = PATH_MAX - 1,
4963 	},
4964 	{ }	/* terminate */
4965 };
4966 
4967 /*
4968  * css destruction is four-stage process.
4969  *
4970  * 1. Destruction starts.  Killing of the percpu_ref is initiated.
4971  *    Implemented in kill_css().
4972  *
4973  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4974  *    and thus css_tryget_online() is guaranteed to fail, the css can be
4975  *    offlined by invoking offline_css().  After offlining, the base ref is
4976  *    put.  Implemented in css_killed_work_fn().
4977  *
4978  * 3. When the percpu_ref reaches zero, the only possible remaining
4979  *    accessors are inside RCU read sections.  css_release() schedules the
4980  *    RCU callback.
4981  *
4982  * 4. After the grace period, the css can be freed.  Implemented in
4983  *    css_free_work_fn().
4984  *
4985  * It is actually hairier because both step 2 and 4 require process context
4986  * and thus involve punting to css->destroy_work adding two additional
4987  * steps to the already complex sequence.
4988  */
4989 static void css_free_work_fn(struct work_struct *work)
4990 {
4991 	struct cgroup_subsys_state *css =
4992 		container_of(work, struct cgroup_subsys_state, destroy_work);
4993 	struct cgroup_subsys *ss = css->ss;
4994 	struct cgroup *cgrp = css->cgroup;
4995 
4996 	percpu_ref_exit(&css->refcnt);
4997 
4998 	if (ss) {
4999 		/* css free path */
5000 		struct cgroup_subsys_state *parent = css->parent;
5001 		int id = css->id;
5002 
5003 		ss->css_free(css);
5004 		cgroup_idr_remove(&ss->css_idr, id);
5005 		cgroup_put(cgrp);
5006 
5007 		if (parent)
5008 			css_put(parent);
5009 	} else {
5010 		/* cgroup free path */
5011 		atomic_dec(&cgrp->root->nr_cgrps);
5012 		cgroup_pidlist_destroy_all(cgrp);
5013 		cancel_work_sync(&cgrp->release_agent_work);
5014 
5015 		if (cgroup_parent(cgrp)) {
5016 			/*
5017 			 * We get a ref to the parent, and put the ref when
5018 			 * this cgroup is being freed, so it's guaranteed
5019 			 * that the parent won't be destroyed before its
5020 			 * children.
5021 			 */
5022 			cgroup_put(cgroup_parent(cgrp));
5023 			kernfs_put(cgrp->kn);
5024 			kfree(cgrp);
5025 		} else {
5026 			/*
5027 			 * This is root cgroup's refcnt reaching zero,
5028 			 * which indicates that the root should be
5029 			 * released.
5030 			 */
5031 			cgroup_destroy_root(cgrp->root);
5032 		}
5033 	}
5034 }
5035 
5036 static void css_free_rcu_fn(struct rcu_head *rcu_head)
5037 {
5038 	struct cgroup_subsys_state *css =
5039 		container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
5040 
5041 	INIT_WORK(&css->destroy_work, css_free_work_fn);
5042 	queue_work(cgroup_destroy_wq, &css->destroy_work);
5043 }
5044 
5045 static void css_release_work_fn(struct work_struct *work)
5046 {
5047 	struct cgroup_subsys_state *css =
5048 		container_of(work, struct cgroup_subsys_state, destroy_work);
5049 	struct cgroup_subsys *ss = css->ss;
5050 	struct cgroup *cgrp = css->cgroup;
5051 
5052 	mutex_lock(&cgroup_mutex);
5053 
5054 	css->flags |= CSS_RELEASED;
5055 	list_del_rcu(&css->sibling);
5056 
5057 	if (ss) {
5058 		/* css release path */
5059 		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
5060 		if (ss->css_released)
5061 			ss->css_released(css);
5062 	} else {
5063 		/* cgroup release path */
5064 		trace_cgroup_release(cgrp);
5065 
5066 		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
5067 		cgrp->id = -1;
5068 
5069 		/*
5070 		 * There are two control paths which try to determine
5071 		 * cgroup from dentry without going through kernfs -
5072 		 * cgroupstats_build() and css_tryget_online_from_dir().
5073 		 * Those are supported by RCU protecting clearing of
5074 		 * cgrp->kn->priv backpointer.
5075 		 */
5076 		if (cgrp->kn)
5077 			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
5078 					 NULL);
5079 
5080 		cgroup_bpf_put(cgrp);
5081 	}
5082 
5083 	mutex_unlock(&cgroup_mutex);
5084 
5085 	call_rcu(&css->rcu_head, css_free_rcu_fn);
5086 }
5087 
5088 static void css_release(struct percpu_ref *ref)
5089 {
5090 	struct cgroup_subsys_state *css =
5091 		container_of(ref, struct cgroup_subsys_state, refcnt);
5092 
5093 	INIT_WORK(&css->destroy_work, css_release_work_fn);
5094 	queue_work(cgroup_destroy_wq, &css->destroy_work);
5095 }
5096 
5097 static void init_and_link_css(struct cgroup_subsys_state *css,
5098 			      struct cgroup_subsys *ss, struct cgroup *cgrp)
5099 {
5100 	lockdep_assert_held(&cgroup_mutex);
5101 
5102 	cgroup_get(cgrp);
5103 
5104 	memset(css, 0, sizeof(*css));
5105 	css->cgroup = cgrp;
5106 	css->ss = ss;
5107 	css->id = -1;
5108 	INIT_LIST_HEAD(&css->sibling);
5109 	INIT_LIST_HEAD(&css->children);
5110 	css->serial_nr = css_serial_nr_next++;
5111 	atomic_set(&css->online_cnt, 0);
5112 
5113 	if (cgroup_parent(cgrp)) {
5114 		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
5115 		css_get(css->parent);
5116 	}
5117 
5118 	BUG_ON(cgroup_css(cgrp, ss));
5119 }
5120 
5121 /* invoke ->css_online() on a new CSS and mark it online if successful */
5122 static int online_css(struct cgroup_subsys_state *css)
5123 {
5124 	struct cgroup_subsys *ss = css->ss;
5125 	int ret = 0;
5126 
5127 	lockdep_assert_held(&cgroup_mutex);
5128 
5129 	if (ss->css_online)
5130 		ret = ss->css_online(css);
5131 	if (!ret) {
5132 		css->flags |= CSS_ONLINE;
5133 		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
5134 
5135 		atomic_inc(&css->online_cnt);
5136 		if (css->parent)
5137 			atomic_inc(&css->parent->online_cnt);
5138 	}
5139 	return ret;
5140 }
5141 
5142 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
5143 static void offline_css(struct cgroup_subsys_state *css)
5144 {
5145 	struct cgroup_subsys *ss = css->ss;
5146 
5147 	lockdep_assert_held(&cgroup_mutex);
5148 
5149 	if (!(css->flags & CSS_ONLINE))
5150 		return;
5151 
5152 	if (ss->css_reset)
5153 		ss->css_reset(css);
5154 
5155 	if (ss->css_offline)
5156 		ss->css_offline(css);
5157 
5158 	css->flags &= ~CSS_ONLINE;
5159 	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
5160 
5161 	wake_up_all(&css->cgroup->offline_waitq);
5162 }
5163 
5164 /**
5165  * css_create - create a cgroup_subsys_state
5166  * @cgrp: the cgroup new css will be associated with
5167  * @ss: the subsys of new css
5168  *
5169  * Create a new css associated with @cgrp - @ss pair.  On success, the new
5170  * css is online and installed in @cgrp.  This function doesn't create the
5171  * interface files.  Returns the new css on success or ERR_PTR(-errno) on failure.
5172  */
5173 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5174 					      struct cgroup_subsys *ss)
5175 {
5176 	struct cgroup *parent = cgroup_parent(cgrp);
5177 	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
5178 	struct cgroup_subsys_state *css;
5179 	int err;
5180 
5181 	lockdep_assert_held(&cgroup_mutex);
5182 
5183 	css = ss->css_alloc(parent_css);
5184 	if (!css)
5185 		css = ERR_PTR(-ENOMEM);
5186 	if (IS_ERR(css))
5187 		return css;
5188 
5189 	init_and_link_css(css, ss, cgrp);
5190 
5191 	err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
5192 	if (err)
5193 		goto err_free_css;
5194 
5195 	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
5196 	if (err < 0)
5197 		goto err_free_css;
5198 	css->id = err;
5199 
5200 	/* @css is ready to be brought online now, make it visible */
5201 	list_add_tail_rcu(&css->sibling, &parent_css->children);
5202 	cgroup_idr_replace(&ss->css_idr, css, css->id);
5203 
5204 	err = online_css(css);
5205 	if (err)
5206 		goto err_list_del;
5207 
5208 	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
5209 	    cgroup_parent(parent)) {
5210 		pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
5211 			current->comm, current->pid, ss->name);
5212 		if (!strcmp(ss->name, "memory"))
5213 			pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
5214 		ss->warned_broken_hierarchy = true;
5215 	}
5216 
5217 	return css;
5218 
5219 err_list_del:
5220 	list_del_rcu(&css->sibling);
5221 err_free_css:
5222 	call_rcu(&css->rcu_head, css_free_rcu_fn);
5223 	return ERR_PTR(err);
5224 }
5225 
5226 /*
5227  * The returned cgroup is fully initialized including its control mask, but
5228  * it isn't associated with its kernfs_node and doesn't have the control
5229  * mask applied.
5230  */
5231 static struct cgroup *cgroup_create(struct cgroup *parent)
5232 {
5233 	struct cgroup_root *root = parent->root;
5234 	struct cgroup *cgrp, *tcgrp;
5235 	int level = parent->level + 1;
5236 	int ret;
5237 
5238 	/* allocate the cgroup and its ID, 0 is reserved for the root */
5239 	cgrp = kzalloc(sizeof(*cgrp) +
5240 		       sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
5241 	if (!cgrp)
5242 		return ERR_PTR(-ENOMEM);
5243 
5244 	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
5245 	if (ret)
5246 		goto out_free_cgrp;
5247 
5248 	/*
5249 	 * Temporarily set the pointer to NULL, so idr_find() won't return
5250 	 * a half-baked cgroup.
5251 	 */
5252 	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
5253 	if (cgrp->id < 0) {
5254 		ret = -ENOMEM;
5255 		goto out_cancel_ref;
5256 	}
5257 
5258 	init_cgroup_housekeeping(cgrp);
5259 
5260 	cgrp->self.parent = &parent->self;
5261 	cgrp->root = root;
5262 	cgrp->level = level;
5263 
5264 	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
5265 		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
5266 
5267 	if (notify_on_release(parent))
5268 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
5269 
5270 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
5271 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
5272 
5273 	cgrp->self.serial_nr = css_serial_nr_next++;
5274 
5275 	/* allocation complete, commit to creation */
5276 	list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
5277 	atomic_inc(&root->nr_cgrps);
5278 	cgroup_get(parent);
5279 
5280 	/*
5281 	 * @cgrp is now fully operational.  If something fails after this
5282 	 * point, it'll be released via the normal destruction path.
5283 	 */
5284 	cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
5285 
5286 	/*
5287 	 * On the default hierarchy, a child doesn't automatically inherit
5288 	 * subtree_control from the parent.  Each is configured manually.
5289 	 */
5290 	if (!cgroup_on_dfl(cgrp))
5291 		cgrp->subtree_control = cgroup_control(cgrp);
5292 
5293 	if (parent)
5294 		cgroup_bpf_inherit(cgrp, parent);
5295 
5296 	cgroup_propagate_control(cgrp);
5297 
5298 	return cgrp;
5299 
5300 out_cancel_ref:
5301 	percpu_ref_exit(&cgrp->self.refcnt);
5302 out_free_cgrp:
5303 	kfree(cgrp);
5304 	return ERR_PTR(ret);
5305 }
5306 
5307 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
5308 			umode_t mode)
5309 {
5310 	struct cgroup *parent, *cgrp;
5311 	struct kernfs_node *kn;
5312 	int ret;
5313 
5314 	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
5315 	if (strchr(name, '\n'))
5316 		return -EINVAL;
5317 
5318 	parent = cgroup_kn_lock_live(parent_kn, false);
5319 	if (!parent)
5320 		return -ENODEV;
5321 
5322 	cgrp = cgroup_create(parent);
5323 	if (IS_ERR(cgrp)) {
5324 		ret = PTR_ERR(cgrp);
5325 		goto out_unlock;
5326 	}
5327 
5328 	/* create the directory */
5329 	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5330 	if (IS_ERR(kn)) {
5331 		ret = PTR_ERR(kn);
5332 		goto out_destroy;
5333 	}
5334 	cgrp->kn = kn;
5335 
5336 	/*
5337 	 * This extra ref will be put in css_free_work_fn() and guarantees
5338 	 * that @cgrp->kn is always accessible.
5339 	 */
5340 	kernfs_get(kn);
5341 
5342 	ret = cgroup_kn_set_ugid(kn);
5343 	if (ret)
5344 		goto out_destroy;
5345 
5346 	ret = css_populate_dir(&cgrp->self);
5347 	if (ret)
5348 		goto out_destroy;
5349 
5350 	ret = cgroup_apply_control_enable(cgrp);
5351 	if (ret)
5352 		goto out_destroy;
5353 
5354 	trace_cgroup_mkdir(cgrp);
5355 
5356 	/* let's create and online css's */
5357 	kernfs_activate(kn);
5358 
5359 	ret = 0;
5360 	goto out_unlock;
5361 
5362 out_destroy:
5363 	cgroup_destroy_locked(cgrp);
5364 out_unlock:
5365 	cgroup_kn_unlock(parent_kn);
5366 	return ret;
5367 }
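/*
 * How the above is reached (illustrative, mount point may differ):
 * kernfs routes a mkdir(2) on a cgroup mount, e.g.
 *
 *	mkdir /sys/fs/cgroup/unified/mygroup
 *
 * to cgroup_kf_syscall_ops.mkdir, i.e. cgroup_mkdir() above, which
 * creates the cgroup, its directory and the core interface files.
 */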
5368 
5369 /*
5370  * This is called when the refcnt of a css is confirmed to be killed.
5371  * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
5372  * initiate destruction and put the css ref from kill_css().
5373  */
5374 static void css_killed_work_fn(struct work_struct *work)
5375 {
5376 	struct cgroup_subsys_state *css =
5377 		container_of(work, struct cgroup_subsys_state, destroy_work);
5378 
5379 	mutex_lock(&cgroup_mutex);
5380 
5381 	do {
5382 		offline_css(css);
5383 		css_put(css);
5384 		/* @css can't go away while we're holding cgroup_mutex */
5385 		css = css->parent;
5386 	} while (css && atomic_dec_and_test(&css->online_cnt));
5387 
5388 	mutex_unlock(&cgroup_mutex);
5389 }
5390 
5391 /* css kill confirmation processing requires process context, bounce */
5392 static void css_killed_ref_fn(struct percpu_ref *ref)
5393 {
5394 	struct cgroup_subsys_state *css =
5395 		container_of(ref, struct cgroup_subsys_state, refcnt);
5396 
5397 	if (atomic_dec_and_test(&css->online_cnt)) {
5398 		INIT_WORK(&css->destroy_work, css_killed_work_fn);
5399 		queue_work(cgroup_destroy_wq, &css->destroy_work);
5400 	}
5401 }
5402 
5403 /**
5404  * kill_css - destroy a css
5405  * @css: css to destroy
5406  *
5407  * This function initiates destruction of @css by removing cgroup interface
5408  * files and putting its base reference.  ->css_offline() will be invoked
5409  * asynchronously once css_tryget_online() is guaranteed to fail and when
5410  * the reference count reaches zero, @css will be released.
5411  */
5412 static void kill_css(struct cgroup_subsys_state *css)
5413 {
5414 	lockdep_assert_held(&cgroup_mutex);
5415 
5416 	if (css->flags & CSS_DYING)
5417 		return;
5418 
5419 	css->flags |= CSS_DYING;
5420 
5421 	/*
5422 	 * This must happen before css is disassociated with its cgroup.
5423 	 * See seq_css() for details.
5424 	 */
5425 	css_clear_dir(css);
5426 
5427 	/*
5428 	 * Killing would put the base ref, but we need to keep it alive
5429 	 * until after ->css_offline().
5430 	 */
5431 	css_get(css);
5432 
5433 	/*
5434 	 * cgroup core guarantees that, by the time ->css_offline() is
5435 	 * invoked, no new css reference will be given out via
5436 	 * css_tryget_online().  We can't simply call percpu_ref_kill() and
5437 	 * proceed to offlining css's because percpu_ref_kill() doesn't
5438 	 * guarantee that the ref is seen as killed on all CPUs on return.
5439 	 *
5440 	 * Use percpu_ref_kill_and_confirm() to get notifications as each
5441 	 * css is confirmed to be seen as killed on all CPUs.
5442 	 */
5443 	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
5444 }
5445 
5446 /**
5447  * cgroup_destroy_locked - the first stage of cgroup destruction
5448  * @cgrp: cgroup to be destroyed
5449  *
5450  * css's make use of percpu refcnts whose killing latency shouldn't be
5451  * exposed to userland and are RCU protected.  Also, cgroup core needs to
5452  * guarantee that css_tryget_online() won't succeed by the time
5453  * ->css_offline() is invoked.  To satisfy all the requirements,
5454  * destruction is implemented in the following two steps.
5455  *
5456  * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
5457  *     userland visible parts and start killing the percpu refcnts of
5458  *     css's.  Set up so that the next stage will be kicked off once all
5459  *     the percpu refcnts are confirmed to be killed.
5460  *
5461  * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
5462  *     rest of destruction.  Once all cgroup references are gone, the
5463  *     cgroup is RCU-freed.
5464  *
5465  * This function implements s1.  After this step, @cgrp is gone as far as
5466  * the userland is concerned and a new cgroup with the same name may be
5467  * created.  As cgroup doesn't care about the names internally, this
5468  * doesn't cause any problem.
5469  */
5470 static int cgroup_destroy_locked(struct cgroup *cgrp)
5471 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
5472 {
5473 	struct cgroup_subsys_state *css;
5474 	struct cgrp_cset_link *link;
5475 	int ssid;
5476 
5477 	lockdep_assert_held(&cgroup_mutex);
5478 
5479 	/*
5480 	 * Only migration can raise populated from zero and we're already
5481 	 * holding cgroup_mutex.
5482 	 */
5483 	if (cgroup_is_populated(cgrp))
5484 		return -EBUSY;
5485 
5486 	/*
5487 	 * Make sure there are no live children.  We can't test emptiness of
5488 	 * ->self.children as dead children linger on it while being
5489 	 * drained; otherwise, "rmdir parent/child parent" may fail.
5490 	 */
5491 	if (css_has_online_children(&cgrp->self))
5492 		return -EBUSY;
5493 
5494 	/*
5495 	 * Mark @cgrp and the associated csets dead.  The former prevents
5496 	 * further task migration and child creation by disabling
5497 	 * cgroup_lock_live_group().  The latter makes the csets ignored by
5498 	 * the migration path.
5499 	 */
5500 	cgrp->self.flags &= ~CSS_ONLINE;
5501 
5502 	spin_lock_irq(&css_set_lock);
5503 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
5504 		link->cset->dead = true;
5505 	spin_unlock_irq(&css_set_lock);
5506 
5507 	/* initiate massacre of all css's */
5508 	for_each_css(css, ssid, cgrp)
5509 		kill_css(css);
5510 
5511 	/*
5512 	 * Remove @cgrp directory along with the base files.  @cgrp has an
5513 	 * extra ref on its kn.
5514 	 */
5515 	kernfs_remove(cgrp->kn);
5516 
5517 	check_for_release(cgroup_parent(cgrp));
5518 
5519 	/* put the base reference */
5520 	percpu_ref_kill(&cgrp->self.refcnt);
5521 
5522 	return 0;
5523 };
5524 
5525 static int cgroup_rmdir(struct kernfs_node *kn)
5526 {
5527 	struct cgroup *cgrp;
5528 	int ret = 0;
5529 
5530 	cgrp = cgroup_kn_lock_live(kn, false);
5531 	if (!cgrp)
5532 		return 0;
5533 
5534 	ret = cgroup_destroy_locked(cgrp);
5535 
5536 	if (!ret)
5537 		trace_cgroup_rmdir(cgrp);
5538 
5539 	cgroup_kn_unlock(kn);
5540 	return ret;
5541 }
5542 
5543 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
5544 	.remount_fs		= cgroup_remount,
5545 	.show_options		= cgroup_show_options,
5546 	.mkdir			= cgroup_mkdir,
5547 	.rmdir			= cgroup_rmdir,
5548 	.rename			= cgroup_rename,
5549 	.show_path		= cgroup_show_path,
5550 };
5551 
5552 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
5553 {
5554 	struct cgroup_subsys_state *css;
5555 
5556 	pr_debug("Initializing cgroup subsys %s\n", ss->name);
5557 
5558 	mutex_lock(&cgroup_mutex);
5559 
5560 	idr_init(&ss->css_idr);
5561 	INIT_LIST_HEAD(&ss->cfts);
5562 
5563 	/* Create the root cgroup state for this subsystem */
5564 	ss->root = &cgrp_dfl_root;
5565 	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
5566 	/* We don't handle early failures gracefully */
5567 	BUG_ON(IS_ERR(css));
5568 	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
5569 
5570 	/*
5571 	 * Root csses are never destroyed and we can't initialize
5572 	 * percpu_ref during early init.  Disable refcnting.
5573 	 */
5574 	css->flags |= CSS_NO_REF;
5575 
5576 	if (early) {
5577 		/* allocation can't be done safely during early init */
5578 		css->id = 1;
5579 	} else {
5580 		css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
5581 		BUG_ON(css->id < 0);
5582 	}
5583 
5584 	/* Update the init_css_set to contain a subsys
5585 	 * pointer to this state - since the subsystem is
5586 	 * newly registered, all tasks, and hence the
5587 	 * init_css_set, are in the subsystem's root cgroup. */
5588 	init_css_set.subsys[ss->id] = css;
5589 
5590 	have_fork_callback |= (bool)ss->fork << ss->id;
5591 	have_exit_callback |= (bool)ss->exit << ss->id;
5592 	have_free_callback |= (bool)ss->free << ss->id;
5593 	have_canfork_callback |= (bool)ss->can_fork << ss->id;
5594 
5595 	/* At system boot, before all subsystems have been
5596 	 * registered, no tasks have been forked, so we don't
5597 	 * need to invoke fork callbacks here. */
5598 	BUG_ON(!list_empty(&init_task.tasks));
5599 
5600 	BUG_ON(online_css(css));
5601 
5602 	mutex_unlock(&cgroup_mutex);
5603 }
5604 
5605 /**
5606  * cgroup_init_early - cgroup initialization at system boot
5607  *
5608  * Initialize cgroups at system boot, and initialize any
5609  * subsystems that request early init.
5610  */
5611 int __init cgroup_init_early(void)
5612 {
5613 	static struct cgroup_sb_opts __initdata opts;
5614 	struct cgroup_subsys *ss;
5615 	int i;
5616 
5617 	init_cgroup_root(&cgrp_dfl_root, &opts);
5618 	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
5619 
5620 	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
5621 
5622 	for_each_subsys(ss, i) {
5623 		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
5624 		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
5625 		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
5626 		     ss->id, ss->name);
5627 		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
5628 		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
5629 
5630 		ss->id = i;
5631 		ss->name = cgroup_subsys_name[i];
5632 		if (!ss->legacy_name)
5633 			ss->legacy_name = cgroup_subsys_name[i];
5634 
5635 		if (ss->early_init)
5636 			cgroup_init_subsys(ss, true);
5637 	}
5638 	return 0;
5639 }
5640 
5641 static u16 cgroup_disable_mask __initdata;
5642 
5643 /**
5644  * cgroup_init - cgroup initialization
5645  *
5646  * Register cgroup filesystem and /proc file, and initialize
5647  * any subsystems that didn't request early init.
5648  */
5649 int __init cgroup_init(void)
5650 {
5651 	struct cgroup_subsys *ss;
5652 	int ssid;
5653 
5654 	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5655 	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
5656 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
5657 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
5658 
5659 	/*
5660 	 * The latency of the synchronize_sched() is too high for cgroups,
5661 	 * avoid it at the cost of forcing all readers into the slow path.
5662 	 */
5663 	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
5664 
5665 	get_user_ns(init_cgroup_ns.user_ns);
5666 
5667 	mutex_lock(&cgroup_mutex);
5668 
5669 	/*
5670 	 * Add init_css_set to the hash table so that dfl_root can link to
5671 	 * it during init.
5672 	 */
5673 	hash_add(css_set_table, &init_css_set.hlist,
5674 		 css_set_hash(init_css_set.subsys));
5675 
5676 	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
5677 
5678 	mutex_unlock(&cgroup_mutex);
5679 
5680 	for_each_subsys(ss, ssid) {
5681 		if (ss->early_init) {
5682 			struct cgroup_subsys_state *css =
5683 				init_css_set.subsys[ss->id];
5684 
5685 			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
5686 						   GFP_KERNEL);
5687 			BUG_ON(css->id < 0);
5688 		} else {
5689 			cgroup_init_subsys(ss, false);
5690 		}
5691 
5692 		list_add_tail(&init_css_set.e_cset_node[ssid],
5693 			      &cgrp_dfl_root.cgrp.e_csets[ssid]);
5694 
5695 		/*
5696 		 * Setting dfl_root subsys_mask needs to consider the
5697 		 * disabled flag and cftype registration needs kmalloc,
5698 		 * both of which aren't available during early_init.
5699 		 */
5700 		if (cgroup_disable_mask & (1 << ssid)) {
5701 			static_branch_disable(cgroup_subsys_enabled_key[ssid]);
5702 			printk(KERN_INFO "Disabling %s control group subsystem\n",
5703 			       ss->name);
5704 			continue;
5705 		}
5706 
5707 		if (cgroup_ssid_no_v1(ssid))
5708 			printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
5709 			       ss->name);
5710 
5711 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
5712 
5713 		if (ss->implicit_on_dfl)
5714 			cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
5715 		else if (!ss->dfl_cftypes)
5716 			cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
5717 
5718 		if (ss->dfl_cftypes == ss->legacy_cftypes) {
5719 			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
5720 		} else {
5721 			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
5722 			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
5723 		}
5724 
5725 		if (ss->bind)
5726 			ss->bind(init_css_set.subsys[ssid]);
5727 
5728 		mutex_lock(&cgroup_mutex);
5729 		css_populate_dir(init_css_set.subsys[ssid]);
5730 		mutex_unlock(&cgroup_mutex);
5731 	}
5732 
5733 	/* init_css_set.subsys[] has been updated, re-hash */
5734 	hash_del(&init_css_set.hlist);
5735 	hash_add(css_set_table, &init_css_set.hlist,
5736 		 css_set_hash(init_css_set.subsys));
5737 
5738 	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
5739 	WARN_ON(register_filesystem(&cgroup_fs_type));
5740 	WARN_ON(register_filesystem(&cgroup2_fs_type));
5741 	WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
5742 
5743 	return 0;
5744 }
5745 
5746 static int __init cgroup_wq_init(void)
5747 {
5748 	/*
5749 	 * There isn't much point in executing the destruction path in
5750 	 * parallel.  A good chunk of it is serialized with cgroup_mutex anyway.
5751 	 * Use 1 for @max_active.
5752 	 *
5753 	 * We would prefer to do this in cgroup_init() above, but that
5754 	 * is called before init_workqueues(): so leave this until after.
5755 	 */
5756 	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
5757 	BUG_ON(!cgroup_destroy_wq);
5758 
5759 	/*
5760 	 * Used to destroy pidlists and separate to serve as flush domain.
5761 	 * Cap @max_active to 1 too.
5762 	 */
5763 	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
5764 						    0, 1);
5765 	BUG_ON(!cgroup_pidlist_destroy_wq);
5766 
5767 	return 0;
5768 }
5769 core_initcall(cgroup_wq_init);
5770 
5771 /*
5772  * proc_cgroup_show()
5773  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
5774  *  - Used for /proc/<pid>/cgroup.
5775  */
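/*
 * Example output (values illustrative): one
 * "hierarchy-ID:controller-list:path" line per hierarchy, e.g.
 *
 *	8:memory:/user.slice
 *	1:name=systemd:/user.slice/user-1000.slice/session-2.scope
 *	0::/user.slice/user-1000.slice/session-2.scope
 *
 * The controller list is empty for the default (v2) hierarchy and a
 * "name=" tag is shown for named legacy hierarchies.
 */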
5776 int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5777 		     struct pid *pid, struct task_struct *tsk)
5778 {
5779 	char *buf;
5780 	int retval;
5781 	struct cgroup_root *root;
5782 
5783 	retval = -ENOMEM;
5784 	buf = kmalloc(PATH_MAX, GFP_KERNEL);
5785 	if (!buf)
5786 		goto out;
5787 
5788 	mutex_lock(&cgroup_mutex);
5789 	spin_lock_irq(&css_set_lock);
5790 
5791 	for_each_root(root) {
5792 		struct cgroup_subsys *ss;
5793 		struct cgroup *cgrp;
5794 		int ssid, count = 0;
5795 
5796 		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
5797 			continue;
5798 
5799 		seq_printf(m, "%d:", root->hierarchy_id);
5800 		if (root != &cgrp_dfl_root)
5801 			for_each_subsys(ss, ssid)
5802 				if (root->subsys_mask & (1 << ssid))
5803 					seq_printf(m, "%s%s", count++ ? "," : "",
5804 						   ss->legacy_name);
5805 		if (strlen(root->name))
5806 			seq_printf(m, "%sname=%s", count ? "," : "",
5807 				   root->name);
5808 		seq_putc(m, ':');
5809 
5810 		cgrp = task_cgroup_from_root(tsk, root);
5811 
5812 		/*
5813 		 * On traditional hierarchies, all zombie tasks show up as
5814 		 * belonging to the root cgroup.  On the default hierarchy,
5815 		 * while a zombie doesn't show up in "cgroup.procs" and
5816 		 * thus can't be migrated, its /proc/PID/cgroup keeps
5817 		 * reporting the cgroup it belonged to before exiting.  If
5818 		 * the cgroup is removed before the zombie is reaped,
5819 		 * " (deleted)" is appended to the cgroup path.
5820 		 */
5821 		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
5822 			retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
5823 						current->nsproxy->cgroup_ns);
5824 			if (retval >= PATH_MAX)
5825 				retval = -ENAMETOOLONG;
5826 			if (retval < 0)
5827 				goto out_unlock;
5828 
5829 			seq_puts(m, buf);
5830 		} else {
5831 			seq_puts(m, "/");
5832 		}
5833 
5834 		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
5835 			seq_puts(m, " (deleted)\n");
5836 		else
5837 			seq_putc(m, '\n');
5838 	}
5839 
5840 	retval = 0;
5841 out_unlock:
5842 	spin_unlock_irq(&css_set_lock);
5843 	mutex_unlock(&cgroup_mutex);
5844 	kfree(buf);
5845 out:
5846 	return retval;
5847 }
5848 
5849 /* Display information about each subsystem and each hierarchy */
5850 static int proc_cgroupstats_show(struct seq_file *m, void *v)
5851 {
5852 	struct cgroup_subsys *ss;
5853 	int i;
5854 
5855 	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
5856 	/*
5857 	 * ideally we don't want subsystems moving around while we do this.
5858 	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
5859 	 * subsys/hierarchy state.
5860 	 */
5861 	mutex_lock(&cgroup_mutex);
5862 
5863 	for_each_subsys(ss, i)
5864 		seq_printf(m, "%s\t%d\t%d\t%d\n",
5865 			   ss->legacy_name, ss->root->hierarchy_id,
5866 			   atomic_read(&ss->root->nr_cgrps),
5867 			   cgroup_ssid_enabled(i));
5868 
5869 	mutex_unlock(&cgroup_mutex);
5870 	return 0;
5871 }
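/*
 * Example /proc/cgroups output produced by the above (counts
 * illustrative):
 *
 *	#subsys_name	hierarchy	num_cgroups	enabled
 *	cpuset	6	1	1
 *	memory	8	104	1
 */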
5872 
5873 static int cgroupstats_open(struct inode *inode, struct file *file)
5874 {
5875 	return single_open(file, proc_cgroupstats_show, NULL);
5876 }
5877 
5878 static const struct file_operations proc_cgroupstats_operations = {
5879 	.open = cgroupstats_open,
5880 	.read = seq_read,
5881 	.llseek = seq_lseek,
5882 	.release = single_release,
5883 };
5884 
5885 /**
5886  * cgroup_fork - initialize cgroup related fields during copy_process()
5887  * @child: pointer to task_struct of the child being forked.
5888  *
5889  * A task is associated with the init_css_set until cgroup_post_fork()
5890  * attaches it to the parent's css_set.  Empty cg_list indicates that
5891  * @child isn't holding a reference to its css_set.
5892  */
5893 void cgroup_fork(struct task_struct *child)
5894 {
5895 	RCU_INIT_POINTER(child->cgroups, &init_css_set);
5896 	INIT_LIST_HEAD(&child->cg_list);
5897 }
5898 
5899 /**
5900  * cgroup_can_fork - called on a new task before the process is exposed
5901  * @child: the task in question.
5902  *
5903  * This calls the subsystem can_fork() callbacks. If the can_fork() callback
5904  * returns an error, the fork aborts with that error code. This allows
5905  * a cgroup subsystem to conditionally allow or deny new forks.
5906  */
5907 int cgroup_can_fork(struct task_struct *child)
5908 {
5909 	struct cgroup_subsys *ss;
5910 	int i, j, ret;
5911 
5912 	do_each_subsys_mask(ss, i, have_canfork_callback) {
5913 		ret = ss->can_fork(child);
5914 		if (ret)
5915 			goto out_revert;
5916 	} while_each_subsys_mask();
5917 
5918 	return 0;
5919 
5920 out_revert:
5921 	for_each_subsys(ss, j) {
5922 		if (j >= i)
5923 			break;
5924 		if (ss->cancel_fork)
5925 			ss->cancel_fork(child);
5926 	}
5927 
5928 	return ret;
5929 }
5930 
5931 /**
5932  * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
5933  * @child: the task in question
5934  *
5935  * This calls the cancel_fork() callbacks if a fork failed *after*
5936  * cgroup_can_fork() succeeded.
5937  */
5938 void cgroup_cancel_fork(struct task_struct *child)
5939 {
5940 	struct cgroup_subsys *ss;
5941 	int i;
5942 
5943 	for_each_subsys(ss, i)
5944 		if (ss->cancel_fork)
5945 			ss->cancel_fork(child);
5946 }
5947 
5948 /**
5949  * cgroup_post_fork - called on a new task after adding it to the task list
5950  * @child: the task in question
5951  *
5952  * Adds the task to the list running through its css_set if necessary and
5953  * calls the subsystem fork() callbacks.  Has to be after the task is
5954  * visible on the task list in case we race with the first call to
5955  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5956  * list.
5957  */
5958 void cgroup_post_fork(struct task_struct *child)
5959 {
5960 	struct cgroup_subsys *ss;
5961 	int i;
5962 
5963 	/*
5964 	 * This may race against cgroup_enable_task_cg_lists().  As that
5965 	 * function sets use_task_css_set_links before grabbing
5966 	 * tasklist_lock and we just went through tasklist_lock to add
5967 	 * @child, it's guaranteed that either we see the set
5968 	 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
5969 	 * @child during its iteration.
5970 	 *
5971 	 * If we won the race, @child is associated with %current's
5972 	 * css_set.  Grabbing css_set_lock guarantees both that the
5973 	 * association is stable, and, on completion of the parent's
5974 	 * migration, @child is visible in the source of migration or
5975 	 * already in the destination cgroup.  This guarantee is necessary
5976 	 * when implementing operations which need to migrate all tasks of
5977 	 * a cgroup to another.
5978 	 *
5979 	 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
5980 	 * will remain in init_css_set.  This is safe because all tasks are
5981 	 * in the init_css_set before cg_links is enabled and there's no
5982 	 * operation which transfers all tasks out of init_css_set.
5983 	 */
5984 	if (use_task_css_set_links) {
5985 		struct css_set *cset;
5986 
5987 		spin_lock_irq(&css_set_lock);
5988 		cset = task_css_set(current);
5989 		if (list_empty(&child->cg_list)) {
5990 			get_css_set(cset);
5991 			css_set_move_task(child, NULL, cset, false);
5992 		}
5993 		spin_unlock_irq(&css_set_lock);
5994 	}
5995 
5996 	/*
5997 	 * Call ss->fork().  This must happen after @child is linked on
5998 	 * css_set; otherwise, @child might change state between ->fork()
5999 	 * and addition to css_set.
6000 	 */
6001 	do_each_subsys_mask(ss, i, have_fork_callback) {
6002 		ss->fork(child);
6003 	} while_each_subsys_mask();
6004 }
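/*
 * For orientation, copy_process() in kernel/fork.c invokes the fork
 * hooks above roughly in this order (simplified sketch):
 *
 *	cgroup_fork(child);
 *	...
 *	retval = cgroup_can_fork(child);   (abort the fork on error)
 *	...
 *	(child becomes visible on the task list)
 *	cgroup_post_fork(child);
 *
 * cgroup_cancel_fork(child) is called instead of cgroup_post_fork()
 * when the fork fails after cgroup_can_fork() has succeeded.
 */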
6005 
6006 /**
6007  * cgroup_exit - detach cgroup from exiting task
6008  * @tsk: pointer to task_struct of exiting process
6009  *
6010  * Description: Detach cgroup from @tsk and release it.
6011  *
6012  * Note that cgroups marked notify_on_release force every task in
6013  * them to take the global cgroup_mutex mutex when exiting.
6014  * This could impact scaling on very large systems.  Be reluctant to
6015  * use notify_on_release cgroups where very high task exit scaling
6016  * is required on large systems.
6017  *
6018  * We set the exiting task's cgroup to the root cgroup (top_cgroup).  We
6019  * call cgroup_exit() while the task is still competent to handle
6020  * notify_on_release(), then leave the task attached to the root cgroup in
6021  * each hierarchy for the remainder of its exit.  No need to bother with
6022  * init_css_set refcnting.  init_css_set never goes away and we can't race
6023  * with migration path - PF_EXITING is visible to migration path.
6024  */
6025 void cgroup_exit(struct task_struct *tsk)
6026 {
6027 	struct cgroup_subsys *ss;
6028 	struct css_set *cset;
6029 	int i;
6030 
6031 	/*
6032 	 * Unlink @tsk from its css_set.  As the migration path can't race
6033 	 * with us, we can check css_set and cg_list without synchronization.
6034 	 */
6035 	cset = task_css_set(tsk);
6036 
6037 	if (!list_empty(&tsk->cg_list)) {
6038 		spin_lock_irq(&css_set_lock);
6039 		css_set_move_task(tsk, cset, NULL, false);
6040 		spin_unlock_irq(&css_set_lock);
6041 	} else {
6042 		get_css_set(cset);
6043 	}
6044 
6045 	/* see cgroup_post_fork() for details */
6046 	do_each_subsys_mask(ss, i, have_exit_callback) {
6047 		ss->exit(tsk);
6048 	} while_each_subsys_mask();
6049 }
6050 
6051 void cgroup_free(struct task_struct *task)
6052 {
6053 	struct css_set *cset = task_css_set(task);
6054 	struct cgroup_subsys *ss;
6055 	int ssid;
6056 
6057 	do_each_subsys_mask(ss, ssid, have_free_callback) {
6058 		ss->free(task);
6059 	} while_each_subsys_mask();
6060 
6061 	put_css_set(cset);
6062 }
6063 
6064 static void check_for_release(struct cgroup *cgrp)
6065 {
6066 	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
6067 	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
6068 		schedule_work(&cgrp->release_agent_work);
6069 }
6070 
6071 /*
6072  * Notify userspace when a cgroup is released, by running the
6073  * configured release agent with the name of the cgroup (path
6074  * relative to the root of cgroup file system) as the argument.
6075  *
6076  * Most likely, this user command will try to rmdir this cgroup.
6077  *
6078  * This races with the possibility that some other task will be
6079  * attached to this cgroup before it is removed, or that some other
6080  * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
6081  * The presumed 'rmdir' will fail quietly if this cgroup is no longer
6082  * unused, and this cgroup will be reprieved from its death sentence,
6083  * to continue to serve a useful existence.  Next time it's released,
6084  * we will get notified again, if it still has 'notify_on_release' set.
6085  *
6086  * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
6087  * means only wait until the task is successfully execve()'d.  The
6088  * separate release agent task is forked by call_usermodehelper(),
6089  * then control in this thread returns here, without waiting for the
6090  * release agent task.  We don't bother to wait because the caller of
6091  * this routine has no use for the exit status of the release agent
6092  * task, so no sense holding our caller up for that.
6093  */
6094 static void cgroup_release_agent(struct work_struct *work)
6095 {
6096 	struct cgroup *cgrp =
6097 		container_of(work, struct cgroup, release_agent_work);
6098 	char *pathbuf = NULL, *agentbuf = NULL;
6099 	char *argv[3], *envp[3];
6100 	int ret;
6101 
6102 	mutex_lock(&cgroup_mutex);
6103 
6104 	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
6105 	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
6106 	if (!pathbuf || !agentbuf)
6107 		goto out;
6108 
6109 	spin_lock_irq(&css_set_lock);
6110 	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
6111 	spin_unlock_irq(&css_set_lock);
6112 	if (ret < 0 || ret >= PATH_MAX)
6113 		goto out;
6114 
6115 	argv[0] = agentbuf;
6116 	argv[1] = pathbuf;
6117 	argv[2] = NULL;
6118 
6119 	/* minimal command environment */
6120 	envp[0] = "HOME=/";
6121 	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
6122 	envp[2] = NULL;
6123 
6124 	mutex_unlock(&cgroup_mutex);
6125 	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
6126 	goto out_free;
6127 out:
6128 	mutex_unlock(&cgroup_mutex);
6129 out_free:
6130 	kfree(agentbuf);
6131 	kfree(pathbuf);
6132 }
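/*
 * Userspace configuration sketch for the notification above (v1
 * hierarchies only, paths illustrative):
 *
 *	echo /usr/local/sbin/cgroup-cleanup > /sys/fs/cgroup/memory/release_agent
 *	echo 1 > /sys/fs/cgroup/memory/mygroup/notify_on_release
 *
 * When "mygroup" runs empty, the agent is executed with the cgroup's
 * path relative to the hierarchy root (here "/mygroup") as its only
 * argument, matching the argv[] constructed above.
 */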
6133 
6134 static int __init cgroup_disable(char *str)
6135 {
6136 	struct cgroup_subsys *ss;
6137 	char *token;
6138 	int i;
6139 
6140 	while ((token = strsep(&str, ",")) != NULL) {
6141 		if (!*token)
6142 			continue;
6143 
6144 		for_each_subsys(ss, i) {
6145 			if (strcmp(token, ss->name) &&
6146 			    strcmp(token, ss->legacy_name))
6147 				continue;
6148 			cgroup_disable_mask |= 1 << i;
6149 		}
6150 	}
6151 	return 1;
6152 }
6153 __setup("cgroup_disable=", cgroup_disable);
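/*
 * Boot-time usage example: "cgroup_disable=memory,cpuset" on the
 * kernel command line sets the corresponding bits in
 * cgroup_disable_mask; cgroup_init() then disables those subsystems'
 * static enable keys so they stay off on every hierarchy.
 */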
6154 
6155 static int __init cgroup_no_v1(char *str)
6156 {
6157 	struct cgroup_subsys *ss;
6158 	char *token;
6159 	int i;
6160 
6161 	while ((token = strsep(&str, ",")) != NULL) {
6162 		if (!*token)
6163 			continue;
6164 
6165 		if (!strcmp(token, "all")) {
6166 			cgroup_no_v1_mask = U16_MAX;
6167 			break;
6168 		}
6169 
6170 		for_each_subsys(ss, i) {
6171 			if (strcmp(token, ss->name) &&
6172 			    strcmp(token, ss->legacy_name))
6173 				continue;
6174 
6175 			cgroup_no_v1_mask |= 1 << i;
6176 		}
6177 	}
6178 	return 1;
6179 }
6180 __setup("cgroup_no_v1=", cgroup_no_v1);
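/*
 * Boot-time usage example: "cgroup_no_v1=memory" (or "cgroup_no_v1=all")
 * prevents the named controllers from being bound to legacy (v1)
 * hierarchies while leaving them usable on the default hierarchy.
 */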
6181 
6182 /**
6183  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
6184  * @dentry: directory dentry of interest
6185  * @ss: subsystem of interest
6186  *
6187  * If @dentry is a directory for a cgroup which has @ss enabled on it, try
6188  * to get the corresponding css and return it.  If such css doesn't exist
6189  * or can't be pinned, an ERR_PTR value is returned.
6190  */
6191 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
6192 						       struct cgroup_subsys *ss)
6193 {
6194 	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
6195 	struct file_system_type *s_type = dentry->d_sb->s_type;
6196 	struct cgroup_subsys_state *css = NULL;
6197 	struct cgroup *cgrp;
6198 
6199 	/* is @dentry a cgroup dir? */
6200 	if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
6201 	    !kn || kernfs_type(kn) != KERNFS_DIR)
6202 		return ERR_PTR(-EBADF);
6203 
6204 	rcu_read_lock();
6205 
6206 	/*
6207 	 * This path doesn't originate from kernfs and @kn could already
6208 	 * have been or be removed at any point.  @kn->priv is RCU
6209 	 * protected for this access.  See css_release_work_fn() for details.
6210 	 */
6211 	cgrp = rcu_dereference(kn->priv);
6212 	if (cgrp)
6213 		css = cgroup_css(cgrp, ss);
6214 
6215 	if (!css || !css_tryget_online(css))
6216 		css = ERR_PTR(-ENOENT);
6217 
6218 	rcu_read_unlock();
6219 	return css;
6220 }
6221 
6222 /**
6223  * css_from_id - lookup css by id
6224  * @id: the cgroup id
6225  * @ss: cgroup subsys to be looked into
6226  *
6227  * Returns the css if there's valid one with @id, otherwise returns NULL.
6228  * Should be called under rcu_read_lock().
6229  */
6230 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
6231 {
6232 	WARN_ON_ONCE(!rcu_read_lock_held());
6233 	return idr_find(&ss->css_idr, id);
6234 }
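/*
 * Hedged usage sketch (illustrative): a typical caller looks the css
 * up under RCU and pins it before use, e.g. with the memory
 * controller's subsys object:
 *
 *	rcu_read_lock();
 *	css = css_from_id(id, &memory_cgrp_subsys);
 *	if (css && css_tryget_online(css))
 *		use the css, then drop it with css_put();
 *	rcu_read_unlock();
 */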
6235 
6236 /**
6237  * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
6238  * @path: path on the default hierarchy
6239  *
6240  * Find the cgroup at @path on the default hierarchy, increment its
6241  * reference count and return it.  Returns pointer to the found cgroup on
6242  * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
6243  * if @path points to a non-directory.
6244  */
6245 struct cgroup *cgroup_get_from_path(const char *path)
6246 {
6247 	struct kernfs_node *kn;
6248 	struct cgroup *cgrp;
6249 
6250 	mutex_lock(&cgroup_mutex);
6251 
6252 	kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
6253 	if (kn) {
6254 		if (kernfs_type(kn) == KERNFS_DIR) {
6255 			cgrp = kn->priv;
6256 			cgroup_get(cgrp);
6257 		} else {
6258 			cgrp = ERR_PTR(-ENOTDIR);
6259 		}
6260 		kernfs_put(kn);
6261 	} else {
6262 		cgrp = ERR_PTR(-ENOENT);
6263 	}
6264 
6265 	mutex_unlock(&cgroup_mutex);
6266 	return cgrp;
6267 }
6268 EXPORT_SYMBOL_GPL(cgroup_get_from_path);
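/*
 * Hedged usage sketch for in-kernel callers (path illustrative):
 *
 *	struct cgroup *cgrp;
 *
 *	cgrp = cgroup_get_from_path("/my.slice/my.service");
 *	if (IS_ERR(cgrp))
 *		return PTR_ERR(cgrp);
 *	...
 *	cgroup_put(cgrp);	(drop the reference taken above)
 */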
6269 
6270 /**
6271  * cgroup_get_from_fd - get a cgroup pointer from a fd
6272  * @fd: fd obtained by open(cgroup2_dir)
6273  *
6274  * Find the cgroup from a fd which should be obtained
6275  * by opening a cgroup directory.  Returns a pointer to the
6276  * cgroup on success. ERR_PTR is returned if the cgroup
6277  * cannot be found.
6278  */
6279 struct cgroup *cgroup_get_from_fd(int fd)
6280 {
6281 	struct cgroup_subsys_state *css;
6282 	struct cgroup *cgrp;
6283 	struct file *f;
6284 
6285 	f = fget_raw(fd);
6286 	if (!f)
6287 		return ERR_PTR(-EBADF);
6288 
6289 	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6290 	fput(f);
6291 	if (IS_ERR(css))
6292 		return ERR_CAST(css);
6293 
6294 	cgrp = css->cgroup;
6295 	if (!cgroup_on_dfl(cgrp)) {
6296 		cgroup_put(cgrp);
6297 		return ERR_PTR(-EBADF);
6298 	}
6299 
6300 	return cgrp;
6301 }
6302 EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
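/*
 * Typical use (illustrative): userspace opens a cgroup2 directory and
 * hands the fd to the kernel, e.g. as the target of a bpf(2)
 * BPF_PROG_ATTACH command; the kernel side resolves it with
 * cgroup_get_from_fd() and drops the reference with cgroup_put() when
 * it is done with the cgroup.
 */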
6303 
6304 /*
6305  * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
6306  * definition in cgroup-defs.h.
6307  */
6308 #ifdef CONFIG_SOCK_CGROUP_DATA
6309 
6310 #if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
6311 
6312 DEFINE_SPINLOCK(cgroup_sk_update_lock);
6313 static bool cgroup_sk_alloc_disabled __read_mostly;
6314 
6315 void cgroup_sk_alloc_disable(void)
6316 {
6317 	if (cgroup_sk_alloc_disabled)
6318 		return;
6319 	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
6320 	cgroup_sk_alloc_disabled = true;
6321 }
6322 
6323 #else
6324 
6325 #define cgroup_sk_alloc_disabled	false
6326 
6327 #endif
6328 
6329 void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6330 {
6331 	if (cgroup_sk_alloc_disabled)
6332 		return;
6333 
6334 	/* Socket clone path */
6335 	if (skcd->val) {
6336 		cgroup_get(sock_cgroup_ptr(skcd));
6337 		return;
6338 	}
6339 
6340 	rcu_read_lock();
6341 
6342 	while (true) {
6343 		struct css_set *cset;
6344 
6345 		cset = task_css_set(current);
6346 		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
6347 			skcd->val = (unsigned long)cset->dfl_cgrp;
6348 			break;
6349 		}
6350 		cpu_relax();
6351 	}
6352 
6353 	rcu_read_unlock();
6354 }
6355 
6356 void cgroup_sk_free(struct sock_cgroup_data *skcd)
6357 {
6358 	cgroup_put(sock_cgroup_ptr(skcd));
6359 }
6360 
6361 #endif	/* CONFIG_SOCK_CGROUP_DATA */
6362 
6363 /* cgroup namespaces */
6364 
6365 static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
6366 {
6367 	return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
6368 }
6369 
6370 static void dec_cgroup_namespaces(struct ucounts *ucounts)
6371 {
6372 	dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
6373 }
6374 
6375 static struct cgroup_namespace *alloc_cgroup_ns(void)
6376 {
6377 	struct cgroup_namespace *new_ns;
6378 	int ret;
6379 
6380 	new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
6381 	if (!new_ns)
6382 		return ERR_PTR(-ENOMEM);
6383 	ret = ns_alloc_inum(&new_ns->ns);
6384 	if (ret) {
6385 		kfree(new_ns);
6386 		return ERR_PTR(ret);
6387 	}
6388 	atomic_set(&new_ns->count, 1);
6389 	new_ns->ns.ops = &cgroupns_operations;
6390 	return new_ns;
6391 }
6392 
6393 void free_cgroup_ns(struct cgroup_namespace *ns)
6394 {
6395 	put_css_set(ns->root_cset);
6396 	dec_cgroup_namespaces(ns->ucounts);
6397 	put_user_ns(ns->user_ns);
6398 	ns_free_inum(&ns->ns);
6399 	kfree(ns);
6400 }
6401 EXPORT_SYMBOL(free_cgroup_ns);
6402 
6403 struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
6404 					struct user_namespace *user_ns,
6405 					struct cgroup_namespace *old_ns)
6406 {
6407 	struct cgroup_namespace *new_ns;
6408 	struct ucounts *ucounts;
6409 	struct css_set *cset;
6410 
6411 	BUG_ON(!old_ns);
6412 
6413 	if (!(flags & CLONE_NEWCGROUP)) {
6414 		get_cgroup_ns(old_ns);
6415 		return old_ns;
6416 	}
6417 
6418 	/* Allow only sysadmin to create cgroup namespace. */
6419 	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
6420 		return ERR_PTR(-EPERM);
6421 
6422 	ucounts = inc_cgroup_namespaces(user_ns);
6423 	if (!ucounts)
6424 		return ERR_PTR(-ENOSPC);
6425 
6426 	/* It is not safe to take cgroup_mutex here */
6427 	spin_lock_irq(&css_set_lock);
6428 	cset = task_css_set(current);
6429 	get_css_set(cset);
6430 	spin_unlock_irq(&css_set_lock);
6431 
6432 	new_ns = alloc_cgroup_ns();
6433 	if (IS_ERR(new_ns)) {
6434 		put_css_set(cset);
6435 		dec_cgroup_namespaces(ucounts);
6436 		return new_ns;
6437 	}
6438 
6439 	new_ns->user_ns = get_user_ns(user_ns);
6440 	new_ns->ucounts = ucounts;
6441 	new_ns->root_cset = cset;
6442 
6443 	return new_ns;
6444 }
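/*
 * Illustrative trigger: this is reached from copy_namespaces() when a
 * task calls unshare(2) or clone(2) with CLONE_NEWCGROUP.  The new
 * namespace is rooted at the caller's current css_set, so cgroup
 * paths reported inside it start at the caller's cgroup.
 */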
6445 
6446 static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
6447 {
6448 	return container_of(ns, struct cgroup_namespace, ns);
6449 }
6450 
6451 static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
6452 {
6453 	struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
6454 
6455 	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
6456 	    !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
6457 		return -EPERM;
6458 
6459 	/* Don't need to do anything if we are attaching to our own cgroupns. */
6460 	if (cgroup_ns == nsproxy->cgroup_ns)
6461 		return 0;
6462 
6463 	get_cgroup_ns(cgroup_ns);
6464 	put_cgroup_ns(nsproxy->cgroup_ns);
6465 	nsproxy->cgroup_ns = cgroup_ns;
6466 
6467 	return 0;
6468 }
6469 
6470 static struct ns_common *cgroupns_get(struct task_struct *task)
6471 {
6472 	struct cgroup_namespace *ns = NULL;
6473 	struct nsproxy *nsproxy;
6474 
6475 	task_lock(task);
6476 	nsproxy = task->nsproxy;
6477 	if (nsproxy) {
6478 		ns = nsproxy->cgroup_ns;
6479 		get_cgroup_ns(ns);
6480 	}
6481 	task_unlock(task);
6482 
6483 	return ns ? &ns->ns : NULL;
6484 }
6485 
6486 static void cgroupns_put(struct ns_common *ns)
6487 {
6488 	put_cgroup_ns(to_cg_ns(ns));
6489 }
6490 
6491 static struct user_namespace *cgroupns_owner(struct ns_common *ns)
6492 {
6493 	return to_cg_ns(ns)->user_ns;
6494 }
6495 
6496 const struct proc_ns_operations cgroupns_operations = {
6497 	.name		= "cgroup",
6498 	.type		= CLONE_NEWCGROUP,
6499 	.get		= cgroupns_get,
6500 	.put		= cgroupns_put,
6501 	.install	= cgroupns_install,
6502 	.owner		= cgroupns_owner,
6503 };
6504 
6505 static __init int cgroup_namespaces_init(void)
6506 {
6507 	return 0;
6508 }
6509 subsys_initcall(cgroup_namespaces_init);
6510 
6511 #ifdef CONFIG_CGROUP_BPF
6512 int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
6513 		      enum bpf_attach_type type, bool overridable)
6514 {
6515 	struct cgroup *parent = cgroup_parent(cgrp);
6516 	int ret;
6517 
6518 	mutex_lock(&cgroup_mutex);
6519 	ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
6520 	mutex_unlock(&cgroup_mutex);
6521 	return ret;
6522 }
6523 #endif /* CONFIG_CGROUP_BPF */
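/*
 * cgroup_bpf_update() above is the hook used by the BPF_PROG_ATTACH
 * and BPF_PROG_DETACH commands of the bpf(2) syscall (see
 * kernel/bpf/syscall.c); it only serializes the update on
 * cgroup_mutex around __cgroup_bpf_update().
 */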
6524 
6525 #ifdef CONFIG_CGROUP_DEBUG
6526 static struct cgroup_subsys_state *
6527 debug_css_alloc(struct cgroup_subsys_state *parent_css)
6528 {
6529 	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
6530 
6531 	if (!css)
6532 		return ERR_PTR(-ENOMEM);
6533 
6534 	return css;
6535 }
6536 
6537 static void debug_css_free(struct cgroup_subsys_state *css)
6538 {
6539 	kfree(css);
6540 }
6541 
6542 static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
6543 				struct cftype *cft)
6544 {
6545 	return cgroup_task_count(css->cgroup);
6546 }
6547 
6548 static u64 current_css_set_read(struct cgroup_subsys_state *css,
6549 				struct cftype *cft)
6550 {
6551 	return (u64)(unsigned long)current->cgroups;
6552 }
6553 
6554 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
6555 					 struct cftype *cft)
6556 {
6557 	u64 count;
6558 
6559 	rcu_read_lock();
6560 	count = atomic_read(&task_css_set(current)->refcount);
6561 	rcu_read_unlock();
6562 	return count;
6563 }
6564 
6565 static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
6566 {
6567 	struct cgrp_cset_link *link;
6568 	struct css_set *cset;
6569 	char *name_buf;
6570 
6571 	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
6572 	if (!name_buf)
6573 		return -ENOMEM;
6574 
6575 	spin_lock_irq(&css_set_lock);
6576 	rcu_read_lock();
6577 	cset = rcu_dereference(current->cgroups);
6578 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
6579 		struct cgroup *c = link->cgrp;
6580 
6581 		cgroup_name(c, name_buf, NAME_MAX + 1);
6582 		seq_printf(seq, "Root %d group %s\n",
6583 			   c->root->hierarchy_id, name_buf);
6584 	}
6585 	rcu_read_unlock();
6586 	spin_unlock_irq(&css_set_lock);
6587 	kfree(name_buf);
6588 	return 0;
6589 }
6590 
6591 #define MAX_TASKS_SHOWN_PER_CSS 25
6592 static int cgroup_css_links_read(struct seq_file *seq, void *v)
6593 {
6594 	struct cgroup_subsys_state *css = seq_css(seq);
6595 	struct cgrp_cset_link *link;
6596 
6597 	spin_lock_irq(&css_set_lock);
6598 	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
6599 		struct css_set *cset = link->cset;
6600 		struct task_struct *task;
6601 		int count = 0;
6602 
6603 		seq_printf(seq, "css_set %p\n", cset);
6604 
6605 		list_for_each_entry(task, &cset->tasks, cg_list) {
6606 			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
6607 				goto overflow;
6608 			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
6609 		}
6610 
6611 		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
6612 			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
6613 				goto overflow;
6614 			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
6615 		}
6616 		continue;
6617 	overflow:
6618 		seq_puts(seq, "  ...\n");
6619 	}
6620 	spin_unlock_irq(&css_set_lock);
6621 	return 0;
6622 }
6623 
6624 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
6625 {
6626 	return (!cgroup_is_populated(css->cgroup) &&
6627 		!css_has_online_children(&css->cgroup->self));
6628 }
6629 
6630 static struct cftype debug_files[] =  {
6631 	{
6632 		.name = "taskcount",
6633 		.read_u64 = debug_taskcount_read,
6634 	},
6635 
6636 	{
6637 		.name = "current_css_set",
6638 		.read_u64 = current_css_set_read,
6639 	},
6640 
6641 	{
6642 		.name = "current_css_set_refcount",
6643 		.read_u64 = current_css_set_refcount_read,
6644 	},
6645 
6646 	{
6647 		.name = "current_css_set_cg_links",
6648 		.seq_show = current_css_set_cg_links_read,
6649 	},
6650 
6651 	{
6652 		.name = "cgroup_css_links",
6653 		.seq_show = cgroup_css_links_read,
6654 	},
6655 
6656 	{
6657 		.name = "releasable",
6658 		.read_u64 = releasable_read,
6659 	},
6660 
6661 	{ }	/* terminate */
6662 };
6663 
6664 struct cgroup_subsys debug_cgrp_subsys = {
6665 	.css_alloc = debug_css_alloc,
6666 	.css_free = debug_css_free,
6667 	.legacy_cftypes = debug_files,
6668 };
6669 #endif /* CONFIG_CGROUP_DEBUG */
6670