1diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
2index 209e6567c..d47c0212e 100644
3--- a/kernel/bpf/syscall.c
4+++ b/kernel/bpf/syscall.c
5@@ -128,21 +128,6 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
6 	return map;
7 }
8
9-static void bpf_map_write_active_inc(struct bpf_map *map)
10-{
11-	atomic64_inc(&map->writecnt);
12-}
13-
14-static void bpf_map_write_active_dec(struct bpf_map *map)
15-{
16-	atomic64_dec(&map->writecnt);
17-}
18-
19-bool bpf_map_write_active(const struct bpf_map *map)
20-{
21-	return atomic64_read(&map->writecnt) != 0;
22-}
23-
24 static u32 bpf_map_value_size(struct bpf_map *map)
25 {
26 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
27@@ -604,8 +589,11 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma)
28 {
29 	struct bpf_map *map = vma->vm_file->private_data;
30
31-	if (vma->vm_flags & VM_MAYWRITE)
32-		bpf_map_write_active_inc(map);
33+	if (vma->vm_flags & VM_MAYWRITE) {
34+		mutex_lock(&map->freeze_mutex);
35+		map->writecnt++;
36+		mutex_unlock(&map->freeze_mutex);
37+	}
38 }
39
40 /* called for all unmapped memory region (including initial) */
41@@ -613,8 +601,11 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma)
42 {
43 	struct bpf_map *map = vma->vm_file->private_data;
44
45-	if (vma->vm_flags & VM_MAYWRITE)
46-		bpf_map_write_active_dec(map);
47+	if (vma->vm_flags & VM_MAYWRITE) {
48+		mutex_lock(&map->freeze_mutex);
49+		map->writecnt--;
50+		mutex_unlock(&map->freeze_mutex);
51+	}
52 }
53
54 static const struct vm_operations_struct bpf_map_default_vmops = {
55@@ -664,7 +655,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
56 		goto out;
57
58 	if (vma->vm_flags & VM_MAYWRITE)
59-		bpf_map_write_active_inc(map);
60+		map->writecnt++;
61 out:
62 	mutex_unlock(&map->freeze_mutex);
63 	return err;
64@@ -1096,7 +1087,6 @@ static int map_update_elem(union bpf_attr *attr)
65 	map = __bpf_map_get(f);
66 	if (IS_ERR(map))
67 		return PTR_ERR(map);
68-	bpf_map_write_active_inc(map);
69 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
70 		err = -EPERM;
71 		goto err_put;
72@@ -1138,7 +1128,6 @@ static int map_update_elem(union bpf_attr *attr)
73 free_key:
74 	kfree(key);
75 err_put:
76-	bpf_map_write_active_dec(map);
77 	fdput(f);
78 	return err;
79 }
80@@ -1161,7 +1150,6 @@ static int map_delete_elem(union bpf_attr *attr)
81 	map = __bpf_map_get(f);
82 	if (IS_ERR(map))
83 		return PTR_ERR(map);
84-	bpf_map_write_active_inc(map);
85 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
86 		err = -EPERM;
87 		goto err_put;
88@@ -1192,7 +1180,6 @@ static int map_delete_elem(union bpf_attr *attr)
89 out:
90 	kfree(key);
91 err_put:
92-	bpf_map_write_active_dec(map);
93 	fdput(f);
94 	return err;
95 }
96@@ -1497,7 +1484,6 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
97 	map = __bpf_map_get(f);
98 	if (IS_ERR(map))
99 		return PTR_ERR(map);
100-	bpf_map_write_active_inc(map);
101 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
102 	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
103 		err = -EPERM;
104@@ -1539,7 +1525,6 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
105 free_key:
106 	kfree(key);
107 err_put:
108-	bpf_map_write_active_dec(map);
109 	fdput(f);
110 	return err;
111 }
112@@ -1566,7 +1551,8 @@ static int map_freeze(const union bpf_attr *attr)
113 	}
114
115 	mutex_lock(&map->freeze_mutex);
116-	if (bpf_map_write_active(map)) {
117+
118+	if (map->writecnt) {
119 		err = -EBUSY;
120 		goto err_put;
121 	}
122@@ -3991,9 +3977,6 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
123 			    union bpf_attr __user *uattr,
124 			    int cmd)
125 {
126-	bool has_read  = cmd == BPF_MAP_LOOKUP_BATCH ||
127-			 cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
128-	bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
129 	struct bpf_map *map;
130 	int err, ufd;
131 	struct fd f;
132@@ -4006,13 +3989,16 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
133 	map = __bpf_map_get(f);
134 	if (IS_ERR(map))
135 		return PTR_ERR(map);
136-	if (has_write)
137-		bpf_map_write_active_inc(map);
138-	if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
139+
140+	if ((cmd == BPF_MAP_LOOKUP_BATCH ||
141+	     cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
142+	    !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
143 		err = -EPERM;
144 		goto err_put;
145 	}
146-	if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
147+
148+	if (cmd != BPF_MAP_LOOKUP_BATCH &&
149+	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
150 		err = -EPERM;
151 		goto err_put;
152 	}
153@@ -4025,9 +4011,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
154 		BPF_DO_BATCH(map->ops->map_update_batch);
155 	else
156 		BPF_DO_BATCH(map->ops->map_delete_batch);
157+
158 err_put:
159-	if (has_write)
160-		bpf_map_write_active_dec(map);
161 	fdput(f);
162 	return err;
163 }
164diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
165index 8de769745..3e854b91f 100644
166--- a/kernel/bpf/verifier.c
167+++ b/kernel/bpf/verifier.c
168@@ -3492,22 +3492,7 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
169
170 static bool bpf_map_is_rdonly(const struct bpf_map *map)
171 {
172-	/* A map is considered read-only if the following condition are true:
173-	 *
174-	 * 1) BPF program side cannot change any of the map content. The
175-	 *    BPF_F_RDONLY_PROG flag is throughout the lifetime of a map
176-	 *    and was set at map creation time.
177-	 * 2) The map value(s) have been initialized from user space by a
178-	 *    loader and then "frozen", such that no new map update/delete
179-	 *    operations from syscall side are possible for the rest of
180-	 *    the map's lifetime from that point onwards.
181-	 * 3) Any parallel/pending map update/delete operations from syscall
182-	 *    side have been completed. Only after that point, it's safe to
183-	 *    assume that map value(s) are immutable.
184-	 */
185-	return (map->map_flags & BPF_F_RDONLY_PROG) &&
186-	       READ_ONCE(map->frozen) &&
187-	       !bpf_map_write_active(map);
188+	return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen;
189 }
190
191 static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
192diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
193index d9f8a464b..cddc908bc 100644
194--- a/kernel/cgroup/cgroup-v1.c
195+++ b/kernel/cgroup/cgroup-v1.c
196@@ -518,7 +518,8 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
197 	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
198 #endif
199 	    !uid_eq(cred->euid, tcred->uid) &&
200-	    !uid_eq(cred->euid, tcred->suid))
201+	    !uid_eq(cred->euid, tcred->suid) &&
202+	    !ns_capable(tcred->user_ns, CAP_SYS_NICE))
203 		ret = -EACCES;
204 	put_cred(tcred);
205 	if (ret)
206diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
207index 3173fe473..f4d318733 100644
208--- a/kernel/cgroup/cpuset.c
209+++ b/kernel/cgroup/cpuset.c
210@@ -334,6 +334,8 @@ static struct cpuset top_cpuset = {
211  * guidelines for accessing subsystem state in kernel/cgroup.c
212  */
213
214+static DEFINE_MUTEX(cpuset_mutex);
215+
216 DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
217
218 void cpuset_read_lock(void)
219@@ -351,9 +353,9 @@ static DEFINE_SPINLOCK(callback_lock);
220 static struct workqueue_struct *cpuset_migrate_mm_wq;
221
222 /*
223- * CPU / memory hotplug is handled asynchronously.
224+ * CPU / memory hotplug is handled asynchronously
225+ * for hotplug, synchronously for resume_cpus
226  */
227-static void cpuset_hotplug_workfn(struct work_struct *work);
228 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
229
230 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
231@@ -373,18 +375,29 @@ static inline bool is_in_v2_mode(void)
232 }
233
234 /*
235- * Return in pmask the portion of a cpusets's cpus_allowed that
236- * are online.  If none are online, walk up the cpuset hierarchy
237- * until we find one that does have some online cpus.
238+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
239+ * are online and are capable of running the task.  If none are found,
240+ * walk up the cpuset hierarchy until we find one that does have some
241+ * appropriate cpus.
242  *
243  * One way or another, we guarantee to return some non-empty subset
244- * of cpu_online_mask.
245+ * of cpu_active_mask.
246  *
247  * Call with callback_lock or cpuset_mutex held.
248  */
249-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
250+static void guarantee_online_cpus(struct task_struct *tsk,
251+				  struct cpumask *pmask)
252 {
253-	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
254+	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
255+	struct cpuset *cs;
256+
257+	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
258+		cpumask_copy(pmask, cpu_active_mask);
259+
260+	rcu_read_lock();
261+	cs = task_cs(tsk);
262+
263+	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
264 		cs = parent_cs(cs);
265 		if (unlikely(!cs)) {
266 			/*
267@@ -394,11 +407,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
268 			 * cpuset's effective_cpus is on its way to be
269 			 * identical to cpu_online_mask.
270 			 */
271-			cpumask_copy(pmask, cpu_online_mask);
272-			return;
273+			goto out_unlock;
274 		}
275 	}
276-	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
277+	cpumask_and(pmask, pmask, cs->effective_cpus);
278+
279+out_unlock:
280+	rcu_read_unlock();
281 }
282
283 /*
284@@ -489,6 +504,9 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
285 	if (cs && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
286 		goto free_three;
287
288+	if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
289+		goto free_three;
290+
291 	return 0;
292
293 free_three:
294@@ -939,7 +957,7 @@ static void rebuild_root_domains(void)
295 	struct cpuset *cs = NULL;
296 	struct cgroup_subsys_state *pos_css;
297
298-	percpu_rwsem_assert_held(&cpuset_rwsem);
299+	lockdep_assert_held(&cpuset_mutex);
300 	lockdep_assert_cpus_held();
301 	lockdep_assert_held(&sched_domains_mutex);
302
303@@ -999,8 +1017,7 @@ static void rebuild_sched_domains_locked(void)
304 	struct cpuset *cs;
305 	int ndoms;
306
307-	lockdep_assert_cpus_held();
308-	percpu_rwsem_assert_held(&cpuset_rwsem);
309+	lockdep_assert_held(&cpuset_mutex);
310
311 	/*
312 	 * If we have raced with CPU hotplug, return early to avoid
313@@ -1051,12 +1068,18 @@ static void rebuild_sched_domains_locked(void)
314 void rebuild_sched_domains(void)
315 {
316 	get_online_cpus();
317-	percpu_down_write(&cpuset_rwsem);
318+	mutex_lock(&cpuset_mutex);
319 	rebuild_sched_domains_locked();
320-	percpu_up_write(&cpuset_rwsem);
321+	mutex_unlock(&cpuset_mutex);
322 	put_online_cpus();
323 }
324
325+static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p,
326+				const struct cpumask *new_mask)
327+{
328+	return set_cpus_allowed_ptr(p, new_mask);
329+}
330+
331 /**
332  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
333  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
334@@ -1072,7 +1095,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
335
336 	css_task_iter_start(&cs->css, 0, &it);
337 	while ((task = css_task_iter_next(&it)))
338-		set_cpus_allowed_ptr(task, cs->effective_cpus);
339+		update_cpus_allowed(cs, task, cs->effective_cpus);
340 	css_task_iter_end(&it);
341 }
342
343@@ -1096,8 +1119,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus,
344 		cpumask_and(new_cpus, new_cpus, cs->cpus_requested);
345 		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
346 	} else {
347-		cpumask_and(new_cpus, cs->cpus_requested,
348-			    parent->effective_cpus);
349+		cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
350 	}
351 }
352
353@@ -1162,7 +1184,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
354 	int new_prs;
355 	bool part_error = false;	/* Partition error? */
356
357-	percpu_rwsem_assert_held(&cpuset_rwsem);
358+	lockdep_assert_held(&cpuset_mutex);
359
360 	/*
361 	 * The parent must be a partition root.
362@@ -2158,7 +2180,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
363 	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
364 	cs = css_cs(css);
365
366-	percpu_down_write(&cpuset_rwsem);
367+	mutex_lock(&cpuset_mutex);
368
369 	/* allow moving tasks into an empty cpuset if on default hierarchy */
370 	ret = -ENOSPC;
371@@ -2182,7 +2204,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
372 	cs->attach_in_progress++;
373 	ret = 0;
374 out_unlock:
375-	percpu_up_write(&cpuset_rwsem);
376+	mutex_unlock(&cpuset_mutex);
377 	return ret;
378 }
379
380@@ -2192,9 +2214,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
381
382 	cgroup_taskset_first(tset, &css);
383
384-	percpu_down_write(&cpuset_rwsem);
385+	mutex_lock(&cpuset_mutex);
386 	css_cs(css)->attach_in_progress--;
387-	percpu_up_write(&cpuset_rwsem);
388+	mutex_unlock(&cpuset_mutex);
389 }
390
391 /*
392@@ -2217,22 +2239,20 @@ static void cpuset_attach(struct cgroup_taskset *tset)
393 	cgroup_taskset_first(tset, &css);
394 	cs = css_cs(css);
395
396-	percpu_down_write(&cpuset_rwsem);
397-
398-	/* prepare for attach */
399-	if (cs == &top_cpuset)
400-		cpumask_copy(cpus_attach, cpu_possible_mask);
401-	else
402-		guarantee_online_cpus(cs, cpus_attach);
403+	mutex_lock(&cpuset_mutex);
404
405 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
406
407 	cgroup_taskset_for_each(task, css, tset) {
408+		if (cs != &top_cpuset)
409+			guarantee_online_cpus(task, cpus_attach);
410+		else
411+			cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
412 		/*
413 		 * can_attach beforehand should guarantee that this doesn't
414 		 * fail.  TODO: have a better way to handle failure here
415 		 */
416-		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
417+		WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach));
418
419 		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
420 		cpuset_update_task_spread_flag(cs, task);
421@@ -2271,7 +2291,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
422 	if (!cs->attach_in_progress)
423 		wake_up(&cpuset_attach_wq);
424
425-	percpu_up_write(&cpuset_rwsem);
426+	mutex_unlock(&cpuset_mutex);
427 }
428
429 /* The various types of files and directories in a cpuset file system */
430@@ -2303,7 +2323,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
431 	int retval = 0;
432
433 	get_online_cpus();
434-	percpu_down_write(&cpuset_rwsem);
435+	mutex_lock(&cpuset_mutex);
436 	if (!is_cpuset_online(cs)) {
437 		retval = -ENODEV;
438 		goto out_unlock;
439@@ -2339,7 +2359,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
440 		break;
441 	}
442 out_unlock:
443-	percpu_up_write(&cpuset_rwsem);
444+	mutex_unlock(&cpuset_mutex);
445 	put_online_cpus();
446 	return retval;
447 }
448@@ -2352,7 +2372,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
449 	int retval = -ENODEV;
450
451 	get_online_cpus();
452-	percpu_down_write(&cpuset_rwsem);
453+	mutex_lock(&cpuset_mutex);
454 	if (!is_cpuset_online(cs))
455 		goto out_unlock;
456
457@@ -2365,7 +2385,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
458 		break;
459 	}
460 out_unlock:
461-	percpu_up_write(&cpuset_rwsem);
462+	mutex_unlock(&cpuset_mutex);
463 	put_online_cpus();
464 	return retval;
465 }
466@@ -2406,7 +2426,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
467 	flush_work(&cpuset_hotplug_work);
468
469 	get_online_cpus();
470-	percpu_down_write(&cpuset_rwsem);
471+	mutex_lock(&cpuset_mutex);
472 	if (!is_cpuset_online(cs))
473 		goto out_unlock;
474
475@@ -2430,7 +2450,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
476
477 	free_cpuset(trialcs);
478 out_unlock:
479-	percpu_up_write(&cpuset_rwsem);
480+	mutex_unlock(&cpuset_mutex);
481 	put_online_cpus();
482 	kernfs_unbreak_active_protection(of->kn);
483 	css_put(&cs->css);
484@@ -2563,13 +2583,13 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
485
486 	css_get(&cs->css);
487 	get_online_cpus();
488-	percpu_down_write(&cpuset_rwsem);
489+	mutex_lock(&cpuset_mutex);
490 	if (!is_cpuset_online(cs))
491 		goto out_unlock;
492
493 	retval = update_prstate(cs, val);
494 out_unlock:
495-	percpu_up_write(&cpuset_rwsem);
496+	mutex_unlock(&cpuset_mutex);
497 	put_online_cpus();
498 	css_put(&cs->css);
499 	return retval ?: nbytes;
500@@ -2777,7 +2797,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
501 		return 0;
502
503 	get_online_cpus();
504-	percpu_down_write(&cpuset_rwsem);
505+	mutex_lock(&cpuset_mutex);
506
507 	set_bit(CS_ONLINE, &cs->flags);
508 	if (is_spread_page(parent))
509@@ -2829,7 +2849,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
510 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
511 	spin_unlock_irq(&callback_lock);
512 out_unlock:
513-	percpu_up_write(&cpuset_rwsem);
514+	mutex_unlock(&cpuset_mutex);
515 	put_online_cpus();
516 	return 0;
517 }
518@@ -2850,7 +2870,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
519 	struct cpuset *cs = css_cs(css);
520
521 	get_online_cpus();
522-	percpu_down_write(&cpuset_rwsem);
523+	mutex_lock(&cpuset_mutex);
524
525 	if (is_partition_root(cs))
526 		update_prstate(cs, 0);
527@@ -2869,7 +2889,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
528 	cpuset_dec();
529 	clear_bit(CS_ONLINE, &cs->flags);
530
531-	percpu_up_write(&cpuset_rwsem);
532+	mutex_unlock(&cpuset_mutex);
533 	put_online_cpus();
534 }
535
536@@ -2882,7 +2902,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
537
538 static void cpuset_bind(struct cgroup_subsys_state *root_css)
539 {
540-	percpu_down_write(&cpuset_rwsem);
541+	mutex_lock(&cpuset_mutex);
542 	spin_lock_irq(&callback_lock);
543
544 	if (is_in_v2_mode()) {
545@@ -2895,7 +2915,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
546 	}
547
548 	spin_unlock_irq(&callback_lock);
549-	percpu_up_write(&cpuset_rwsem);
550+	mutex_unlock(&cpuset_mutex);
551 }
552
553 /*
554@@ -2908,7 +2928,6 @@ static void cpuset_fork(struct task_struct *task)
555 	if (task_css_is_root(task, cpuset_cgrp_id))
556 		return;
557
558-	set_cpus_allowed_ptr(task, current->cpus_ptr);
559 	task->mems_allowed = current->mems_allowed;
560 }
561
562@@ -2937,7 +2956,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
563
564 int __init cpuset_init(void)
565 {
566-	BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
567
568 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
569 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL));
570@@ -3012,7 +3030,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
571 	is_empty = cpumask_empty(cs->cpus_allowed) ||
572 		   nodes_empty(cs->mems_allowed);
573
574-	percpu_up_write(&cpuset_rwsem);
575+	mutex_unlock(&cpuset_mutex);
576
577 	/*
578 	 * Move tasks to the nearest ancestor with execution resources,
579@@ -3022,7 +3040,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
580 	if (is_empty)
581 		remove_tasks_in_empty_cpuset(cs);
582
583-	percpu_down_write(&cpuset_rwsem);
584+	mutex_lock(&cpuset_mutex);
585 }
586
587 static void
588@@ -3072,14 +3090,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
589 retry:
590 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
591
592-	percpu_down_write(&cpuset_rwsem);
593+	mutex_lock(&cpuset_mutex);
594
595 	/*
596 	 * We have raced with task attaching. We wait until attaching
597 	 * is finished, so we won't attach a task to an empty cpuset.
598 	 */
599 	if (cs->attach_in_progress) {
600-		percpu_up_write(&cpuset_rwsem);
601+		mutex_unlock(&cpuset_mutex);
602 		goto retry;
603 	}
604
605@@ -3151,7 +3169,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
606 		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
607 					    cpus_updated, mems_updated);
608
609-	percpu_up_write(&cpuset_rwsem);
610+	mutex_unlock(&cpuset_mutex);
611 }
612
613 /**
614@@ -3170,7 +3188,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
615  * Note that CPU offlining during suspend is ignored.  We don't modify
616  * cpusets across suspend/resume cycles at all.
617  */
618-static void cpuset_hotplug_workfn(struct work_struct *work)
619+void cpuset_hotplug_workfn(struct work_struct *work)
620 {
621 	static cpumask_t new_cpus;
622 	static nodemask_t new_mems;
623@@ -3181,7 +3199,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
624 	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
625 		ptmp = &tmp;
626
627-	percpu_down_write(&cpuset_rwsem);
628+	mutex_lock(&cpuset_mutex);
629
630 	/* fetch the available cpus/mems and find out which changed how */
631 	cpumask_copy(&new_cpus, cpu_active_mask);
632@@ -3238,7 +3256,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
633 		update_tasks_nodemask(&top_cpuset);
634 	}
635
636-	percpu_up_write(&cpuset_rwsem);
637+	mutex_unlock(&cpuset_mutex);
638
639 	/* if cpus or mems changed, we need to propagate to descendants */
640 	if (cpus_updated || mems_updated) {
641@@ -3282,6 +3300,7 @@ void cpuset_wait_for_hotplug(void)
642 {
643 	flush_work(&cpuset_hotplug_work);
644 }
645+EXPORT_SYMBOL_GPL(cpuset_wait_for_hotplug);
646
647 /*
648  * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
649@@ -3337,11 +3356,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
650
651 	spin_lock_irqsave(&callback_lock, flags);
652 	rcu_read_lock();
653-	guarantee_online_cpus(task_cs(tsk), pmask);
654+	guarantee_online_cpus(tsk, pmask);
655 	rcu_read_unlock();
656 	spin_unlock_irqrestore(&callback_lock, flags);
657 }
658-
659+EXPORT_SYMBOL_GPL(cpuset_cpus_allowed);
660 /**
661  * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
662  * @tsk: pointer to task_struct with which the scheduler is struggling
663@@ -3356,9 +3375,17 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
664
665 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
666 {
667+	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
668+	const struct cpumask *cs_mask;
669+
670 	rcu_read_lock();
671-	do_set_cpus_allowed(tsk, is_in_v2_mode() ?
672-		task_cs(tsk)->cpus_allowed : cpu_possible_mask);
673+	cs_mask = task_cs(tsk)->cpus_allowed;
674+
675+	if (!is_in_v2_mode() || !cpumask_subset(cs_mask, possible_mask))
676+		goto unlock; /* select_fallback_rq will try harder */
677+
678+	do_set_cpus_allowed(tsk, cs_mask);
679+unlock:
680 	rcu_read_unlock();
681
682 	/*
683diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
684index 08236798d..081d026f1 100644
685--- a/kernel/cgroup/legacy_freezer.c
686+++ b/kernel/cgroup/legacy_freezer.c
687@@ -479,3 +479,4 @@ struct cgroup_subsys freezer_cgrp_subsys = {
688 	.fork		= freezer_fork,
689 	.legacy_cftypes	= files,
690 };
691+EXPORT_SYMBOL_GPL(freezer_cgrp_subsys);
692diff --git a/kernel/cpu.c b/kernel/cpu.c
693index 4b27158d3..b076ccd1b 100644
694--- a/kernel/cpu.c
695+++ b/kernel/cpu.c
696@@ -39,6 +39,8 @@
697 #define CREATE_TRACE_POINTS
698 #include <trace/events/cpuhp.h>
699
700+#undef CREATE_TRACE_POINTS
701+
702 #include "smpboot.h"
703
704 /**
705@@ -274,11 +276,13 @@ void cpu_maps_update_begin(void)
706 {
707 	mutex_lock(&cpu_add_remove_lock);
708 }
709+EXPORT_SYMBOL_GPL(cpu_maps_update_begin);
710
711 void cpu_maps_update_done(void)
712 {
713 	mutex_unlock(&cpu_add_remove_lock);
714 }
715+EXPORT_SYMBOL_GPL(cpu_maps_update_done);
716
717 /*
718  * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
719@@ -1053,7 +1057,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
720 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
721 	int prev_state, ret = 0;
722
723-	if (num_online_cpus() == 1)
724+	if (num_active_cpus() == 1 && cpu_active(cpu))
725 		return -EBUSY;
726
727 	if (!cpu_present(cpu))
728diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
729index e2999a070..79cb6d063 100644
730--- a/kernel/irq/generic-chip.c
731+++ b/kernel/irq/generic-chip.c
732@@ -200,6 +200,7 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
733 	irq_gc_unlock(gc);
734 	return 0;
735 }
736+EXPORT_SYMBOL_GPL(irq_gc_set_wake);
737
738 static u32 irq_readl_be(void __iomem *addr)
739 {
740diff --git a/kernel/power/Makefile b/kernel/power/Makefile
741index 5899260a8..466eaa74f 100644
742--- a/kernel/power/Makefile
743+++ b/kernel/power/Makefile
744@@ -1,5 +1,17 @@
745 # SPDX-License-Identifier: GPL-2.0
746
747+CURRENT_DIR := $(abspath $(dir $(realpath $(lastword $(MAKEFILE_LIST)))))
748+
749+ifeq ($(PRODUCT_PATH),)
750+$(error PRODUCT_PATH is not set)
751+endif
752+
753+WEAKUP_DIR := ../../../../../../$(PRODUCT_PATH)/kernel_core/kernel/power
754+ifeq ($(wildcard $(CURRENT_DIR)/$(WEAKUP_DIR)),)
755+HCS_ABS_DIR := $(abspath $(CURRENT_DIR)/$(WEAKUP_DIR))
756+$(error missing $(HCS_ABS_DIR) for standard system)
757+endif
758+
759 ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG
760
761 KASAN_SANITIZE_snapshot.o	:= n
762@@ -17,4 +29,5 @@ obj-$(CONFIG_PM_WAKELOCKS)	+= wakelock.o
763
764 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
765
766+obj-$(CONFIG_SUSPEND)		+= $(WEAKUP_DIR)/
767 obj-$(CONFIG_ENERGY_MODEL)	+= energy_model.o
768diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
769index 119b929dc..41430128d 100644
770--- a/kernel/power/energy_model.c
771+++ b/kernel/power/energy_model.c
772@@ -52,6 +52,17 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
773 }
774 DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
775
776+static int em_debug_units_show(struct seq_file *s, void *unused)
777+{
778+	struct em_perf_domain *pd = s->private;
779+	char *units = pd->milliwatts ? "milliWatts" : "bogoWatts";
780+
781+	seq_printf(s, "%s\n", units);
782+
783+	return 0;
784+}
785+DEFINE_SHOW_ATTRIBUTE(em_debug_units);
786+
787 static void em_debug_create_pd(struct device *dev)
788 {
789 	struct dentry *d;
790@@ -64,6 +75,8 @@ static void em_debug_create_pd(struct device *dev)
791 		debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
792 				    &em_debug_cpus_fops);
793
794+	debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
795+
796 	/* Create a sub-directory for each performance state */
797 	for (i = 0; i < dev->em_pd->nr_perf_states; i++)
798 		em_debug_create_ps(&dev->em_pd->table[i], d);
799@@ -245,17 +258,24 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
800  * @cpus	: Pointer to cpumask_t, which in case of a CPU device is
801  *		obligatory. It can be taken from i.e. 'policy->cpus'. For other
802  *		type of devices this should be set to NULL.
803+ * @milliwatts	: Flag indicating that the power values are in milliWatts or
804+ *		in some other scale. It must be set properly.
805  *
806  * Create Energy Model tables for a performance domain using the callbacks
807  * defined in cb.
808  *
809+ * The @milliwatts is important to set with correct value. Some kernel
810+ * sub-systems might rely on this flag and check if all devices in the EM are
811+ * using the same scale.
812+ *
813  * If multiple clients register the same performance domain, all but the first
814  * registration will be ignored.
815  *
816  * Return 0 on success
817  */
818 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
819-				struct em_data_callback *cb, cpumask_t *cpus)
820+				struct em_data_callback *cb, cpumask_t *cpus,
821+				bool milliwatts)
822 {
823 	unsigned long cap, prev_cap = 0;
824 	int cpu, ret;
825@@ -308,6 +328,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
826 	if (ret)
827 		goto unlock;
828
829+	dev->em_pd->milliwatts = milliwatts;
830+
831 	em_debug_create_pd(dev);
832 	dev_info(dev, "EM: created perf domain\n");
833
834diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
835index bf640fd61..b13fe337f 100644
836--- a/kernel/power/hibernate.c
837+++ b/kernel/power/hibernate.c
838@@ -326,7 +326,7 @@ static int create_image(int platform_mode)
839
840 	if (!in_suspend) {
841 		events_check_enabled = false;
842-		clear_free_pages();
843+		clear_or_poison_free_pages();
844 	}
845
846 	platform_leave(platform_mode);
847diff --git a/kernel/power/power.h b/kernel/power/power.h
848index 24f12d534..778bf431e 100644
849--- a/kernel/power/power.h
850+++ b/kernel/power/power.h
851@@ -106,7 +106,7 @@ extern int create_basic_memory_bitmaps(void);
852 extern void free_basic_memory_bitmaps(void);
853 extern int hibernate_preallocate_memory(void);
854
855-extern void clear_free_pages(void);
856+extern void clear_or_poison_free_pages(void);
857
858 /**
859  *	Auxiliary structure used for reading the snapshot image data and
860diff --git a/kernel/power/process.c b/kernel/power/process.c
861index 45b054b7b..cc0623080 100644
862--- a/kernel/power/process.c
863+++ b/kernel/power/process.c
864@@ -85,18 +85,21 @@ static int try_to_freeze_tasks(bool user_only)
865 	elapsed = ktime_sub(end, start);
866 	elapsed_msecs = ktime_to_ms(elapsed);
867
868-	if (todo) {
869+	if (wakeup) {
870 		pr_cont("\n");
871-		pr_err("Freezing of tasks %s after %d.%03d seconds "
872-		       "(%d tasks refusing to freeze, wq_busy=%d):\n",
873-		       wakeup ? "aborted" : "failed",
874+		pr_err("Freezing of tasks aborted after %d.%03d seconds",
875+		       elapsed_msecs / 1000, elapsed_msecs % 1000);
876+	} else if (todo) {
877+		pr_cont("\n");
878+		pr_err("Freezing of tasks failed after %d.%03d seconds"
879+		       " (%d tasks refusing to freeze, wq_busy=%d):\n",
880 		       elapsed_msecs / 1000, elapsed_msecs % 1000,
881 		       todo - wq_busy, wq_busy);
882
883 		if (wq_busy)
884 			show_workqueue_state();
885
886-		if (!wakeup || pm_debug_messages_on) {
887+		if (pm_debug_messages_on) {
888 			read_lock(&tasklist_lock);
889 			for_each_process_thread(g, p) {
890 				if (p != current && !freezer_should_skip(p)
891diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
892index 46b1804c1..a3491b29c 100644
893--- a/kernel/power/snapshot.c
894+++ b/kernel/power/snapshot.c
895@@ -1144,7 +1144,15 @@ void free_basic_memory_bitmaps(void)
896 	pr_debug("Basic memory bitmaps freed\n");
897 }
898
899-void clear_free_pages(void)
900+static void clear_or_poison_free_page(struct page *page)
901+{
902+	if (page_poisoning_enabled_static())
903+		__kernel_poison_pages(page, 1);
904+	else if (want_init_on_free())
905+		clear_highpage(page);
906+}
907+
908+void clear_or_poison_free_pages(void)
909 {
910 	struct memory_bitmap *bm = free_pages_map;
911 	unsigned long pfn;
912@@ -1152,12 +1160,12 @@ void clear_free_pages(void)
913 	if (WARN_ON(!(free_pages_map)))
914 		return;
915
916-	if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) {
917+	if (page_poisoning_enabled() || want_init_on_free()) {
918 		memory_bm_position_reset(bm);
919 		pfn = memory_bm_next_pfn(bm);
920 		while (pfn != BM_END_OF_MAP) {
921 			if (pfn_valid(pfn))
922-				clear_highpage(pfn_to_page(pfn));
923+				clear_or_poison_free_page(pfn_to_page(pfn));
924
925 			pfn = memory_bm_next_pfn(bm);
926 		}
927diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
928index 32391acc8..545958377 100644
929--- a/kernel/power/suspend.c
930+++ b/kernel/power/suspend.c
931@@ -30,6 +30,7 @@
932 #include <trace/events/power.h>
933 #include <linux/compiler.h>
934 #include <linux/moduleparam.h>
935+#include <linux/wakeup_reason.h>
936
937 #include "power.h"
938
939@@ -139,6 +140,7 @@ static void s2idle_loop(void)
940 		}
941
942 		pm_wakeup_clear(false);
943+		clear_wakeup_reasons();
944
945 		s2idle_enter();
946 	}
947@@ -359,6 +361,7 @@ static int suspend_prepare(suspend_state_t state)
948 	if (!error)
949 		return 0;
950
951+	log_suspend_abort_reason("One or more tasks refusing to freeze");
952 	suspend_stats.failed_freeze++;
953 	dpm_save_failed_step(SUSPEND_FREEZE);
954 	pm_notifier_call_chain(PM_POST_SUSPEND);
955@@ -388,7 +391,7 @@ void __weak arch_suspend_enable_irqs(void)
956  */
957 static int suspend_enter(suspend_state_t state, bool *wakeup)
958 {
959-	int error;
960+	int error, last_dev;
961
962 	error = platform_suspend_prepare(state);
963 	if (error)
964@@ -396,7 +399,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
965
966 	error = dpm_suspend_late(PMSG_SUSPEND);
967 	if (error) {
968+		last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
969+		last_dev %= REC_FAILED_NUM;
970 		pr_err("late suspend of devices failed\n");
971+		log_suspend_abort_reason("late suspend of %s device failed",
972+					 suspend_stats.failed_devs[last_dev]);
973 		goto Platform_finish;
974 	}
975 	error = platform_suspend_prepare_late(state);
976@@ -405,7 +412,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
977
978 	error = dpm_suspend_noirq(PMSG_SUSPEND);
979 	if (error) {
980+		last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
981+		last_dev %= REC_FAILED_NUM;
982 		pr_err("noirq suspend of devices failed\n");
983+		log_suspend_abort_reason("noirq suspend of %s device failed",
984+					 suspend_stats.failed_devs[last_dev]);
985 		goto Platform_early_resume;
986 	}
987 	error = platform_suspend_prepare_noirq(state);
988@@ -421,8 +432,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
989 	}
990
991 	error = suspend_disable_secondary_cpus();
992-	if (error || suspend_test(TEST_CPUS))
993+	if (error || suspend_test(TEST_CPUS)) {
994+		log_suspend_abort_reason("Disabling non-boot cpus failed");
995 		goto Enable_cpus;
996+	}
997
998 	arch_suspend_disable_irqs();
999 	BUG_ON(!irqs_disabled());
1000@@ -493,6 +506,8 @@ int suspend_devices_and_enter(suspend_state_t state)
1001 	error = dpm_suspend_start(PMSG_SUSPEND);
1002 	if (error) {
1003 		pr_err("Some devices failed to suspend, or early wake event detected\n");
1004+		log_suspend_abort_reason(
1005+				"Some devices failed to suspend, or early wake event detected");
1006 		goto Recover_platform;
1007 	}
1008 	suspend_test_finish("suspend devices");
1009diff --git a/kernel/reboot.c b/kernel/reboot.c
1010index af6f23d8b..bce629531 100644
1011--- a/kernel/reboot.c
1012+++ b/kernel/reboot.c
1013@@ -215,6 +215,27 @@ void do_kernel_restart(char *cmd)
1014 	atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
1015 }
1016
1017+#ifdef CONFIG_NO_GKI
1018+static ATOMIC_NOTIFIER_HEAD(pre_restart_handler_list);
1019+
1020+int register_pre_restart_handler(struct notifier_block *nb)
1021+{
1022+	return atomic_notifier_chain_register(&pre_restart_handler_list, nb);
1023+}
1024+EXPORT_SYMBOL(register_pre_restart_handler);
1025+
1026+int unregister_pre_restart_handler(struct notifier_block *nb)
1027+{
1028+	return atomic_notifier_chain_unregister(&pre_restart_handler_list, nb);
1029+}
1030+EXPORT_SYMBOL(unregister_pre_restart_handler);
1031+
1032+void do_kernel_pre_restart(char *cmd)
1033+{
1034+	atomic_notifier_call_chain(&pre_restart_handler_list, reboot_mode, cmd);
1035+}
1036+#endif
1037+
1038 void migrate_to_reboot_cpu(void)
1039 {
1040 	/* The boot cpu is always logical cpu 0 */
1041diff --git a/kernel/sched/core.c b/kernel/sched/core.c
1042index e2f00be4b..750da3e7c 100644
1043--- a/kernel/sched/core.c
1044+++ b/kernel/sched/core.c
1045@@ -47,6 +47,13 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
1046 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
1047 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
1048 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
1049+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
1050+#ifdef CONFIG_SCHEDSTATS
1051+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
1052+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
1053+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
1054+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
1055+#endif
1056
1057 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
1058
1059@@ -660,7 +667,7 @@ int get_nohz_timer_target(void)
1060 	int i, cpu = smp_processor_id(), default_cpu = -1;
1061 	struct sched_domain *sd;
1062
1063-	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
1064+	if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
1065 		if (!idle_cpu(cpu))
1066 			return cpu;
1067 		default_cpu = cpu;
1068@@ -680,8 +687,25 @@ int get_nohz_timer_target(void)
1069 		}
1070 	}
1071
1072-	if (default_cpu == -1)
1073-		default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
1074+	if (default_cpu == -1) {
1075+		for_each_cpu_and(i, cpu_active_mask,
1076+				 housekeeping_cpumask(HK_FLAG_TIMER)) {
1077+			if (cpu == i)
1078+				continue;
1079+
1080+			if (!idle_cpu(i)) {
1081+				cpu = i;
1082+				goto unlock;
1083+			}
1084+		}
1085+
1086+		/* no active, not-idle, housekeeping CPU found. */
1087+		default_cpu = cpumask_any(cpu_active_mask);
1088+
1089+		if (unlikely(default_cpu >= nr_cpu_ids))
1090+			goto unlock;
1091+	}
1092+
1093 	cpu = default_cpu;
1094 unlock:
1095 	rcu_read_unlock();
1096@@ -1770,7 +1794,10 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
1097 	if (is_per_cpu_kthread(p))
1098 		return cpu_online(cpu);
1099
1100-	return cpu_active(cpu);
1101+	if (!cpu_active(cpu))
1102+		return false;
1103+
1104+	return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
1105 }
1106
1107 /*
1108@@ -2433,10 +2460,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
1109 			}
1110 			fallthrough;
1111 		case possible:
1112-			do_set_cpus_allowed(p, cpu_possible_mask);
1113+			do_set_cpus_allowed(p, task_cpu_possible_mask(p));
1114 			state = fail;
1115 			break;
1116-
1117 		case fail:
1118 #ifdef CONFIG_CPU_ISOLATION_OPT
1119 			allow_iso = true;
1120@@ -2627,6 +2653,9 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1121 {
1122 	int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
1123
1124+	if (wake_flags & WF_SYNC)
1125+		en_flags |= ENQUEUE_WAKEUP_SYNC;
1126+
1127 	lockdep_assert_held(&rq->lock);
1128
1129 	if (p->sched_contributes_to_load)
1130@@ -3019,6 +3048,19 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1131 	if (!(p->state & state))
1132 		goto unlock;
1133
1134+#ifdef CONFIG_FREEZER
1135+	/*
1136+	 * If we're going to wake up a thread which may be frozen, then
1137+	 * we can only do so if we have an active CPU which is capable of
1138+	 * running it. This may not be the case when resuming from suspend,
1139+	 * as the secondary CPUs may not yet be back online. See __thaw_task()
1140+	 * for the actual wakeup.
1141+	 */
1142+	if (unlikely(frozen_or_skipped(p)) &&
1143+	    !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
1144+		goto unlock;
1145+#endif
1146+
1147 	trace_sched_waking(p);
1148
1149 	/* We're going to change ->state: */
1150@@ -5004,7 +5046,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
1151 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
1152 			  void *key)
1153 {
1154-	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
1155+	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC));
1156 	return try_to_wake_up(curr->private, mode, wake_flags);
1157 }
1158 EXPORT_SYMBOL(default_wake_function);
1159@@ -5713,16 +5755,19 @@ int sched_setscheduler(struct task_struct *p, int policy,
1160 {
1161 	return _sched_setscheduler(p, policy, param, true);
1162 }
1163+EXPORT_SYMBOL_GPL(sched_setscheduler);
1164
1165 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
1166 {
1167 	return __sched_setscheduler(p, attr, true, true);
1168 }
1169+EXPORT_SYMBOL_GPL(sched_setattr);
1170
1171 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
1172 {
1173 	return __sched_setscheduler(p, attr, false, true);
1174 }
1175+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
1176
1177 /**
1178  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
1179@@ -5742,6 +5787,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
1180 {
1181 	return _sched_setscheduler(p, policy, param, false);
1182 }
1183+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
1184
1185 /*
1186  * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
1187@@ -7044,6 +7090,11 @@ void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf,
1188 	 */
1189 	update_rq_clock(rq);
1190
1191+#ifdef CONFIG_SCHED_DEBUG
1192+	/* note the clock update in orf */
1193+	orf.clock_update_flags |= RQCF_UPDATED;
1194+#endif
1195+
1196 	for (;;) {
1197 		/*
1198 		 * There's this thread running, bail when that's the only
1199diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
1200index 81e43a56d..4df7f4e68 100644
1201--- a/kernel/sched/fair.c
1202+++ b/kernel/sched/fair.c
1203@@ -86,6 +86,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L
1204  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
1205  */
1206 unsigned int sysctl_sched_min_granularity			= 750000ULL;
1207+EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
1208 static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
1209
1210 /*
1211@@ -10686,9 +10687,20 @@ void nohz_balance_enter_idle(int cpu)
1212
1213 	SCHED_WARN_ON(cpu != smp_processor_id());
1214
1215-	/* If this CPU is going down, then nothing needs to be done: */
1216-	if (!cpu_active(cpu))
1217+	if (!cpu_active(cpu)) {
1218+		/*
1219+		 * A CPU can be paused while it is idle with its tick
1220+		 * stopped. nohz_balance_exit_idle() should be called
1221+		 * from the local CPU, so it can't be called during
1222+		 * pause. This results in paused CPU participating in
1223+		 * the nohz idle balance, which should be avoided.
1224+		 *
1225+		 * When the paused CPU exits idle and enters again,
1226+		 * exempt the paused CPU from nohz_balance_exit_idle.
1227+		 */
1228+		nohz_balance_exit_idle(rq);
1229 		return;
1230+	}
1231
1232 	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
1233 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
1234diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
1235index 2593a733c..69afd8d1e 100644
1236--- a/kernel/sched/idle.c
1237+++ b/kernel/sched/idle.c
1238@@ -450,6 +450,7 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
1239 {
1240 	raw_spin_unlock_irq(&rq->lock);
1241 	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
1242+
1243 	dump_stack();
1244 	raw_spin_lock_irq(&rq->lock);
1245 }
1246diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
1247index d2a655643..b5837e277 100644
1248--- a/kernel/sched/loadavg.c
1249+++ b/kernel/sched/loadavg.c
1250@@ -75,6 +75,7 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
1251 	loads[1] = (avenrun[1] + offset) << shift;
1252 	loads[2] = (avenrun[2] + offset) << shift;
1253 }
1254+EXPORT_SYMBOL_GPL(get_avenrun);
1255
1256 long calc_load_fold_active(struct rq *this_rq, long adjust)
1257 {
1258diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
1259index 2c613e1cf..e2890b677 100644
1260--- a/kernel/sched/pelt.c
1261+++ b/kernel/sched/pelt.c
1262@@ -28,6 +28,42 @@
1263 #include "sched.h"
1264 #include "pelt.h"
1265
1266+int pelt_load_avg_period = PELT32_LOAD_AVG_PERIOD;
1267+int pelt_load_avg_max = PELT32_LOAD_AVG_MAX;
1268+const u32 *pelt_runnable_avg_yN_inv = pelt32_runnable_avg_yN_inv;
1269+
1270+static int __init set_pelt(char *str)
1271+{
1272+	int rc, num;
1273+
1274+	rc = kstrtoint(str, 0, &num);
1275+	if (rc) {
1276+		pr_err("%s: kstrtoint failed. rc=%d\n", __func__, rc);
1277+		return 0;
1278+	}
1279+
1280+	switch (num) {
1281+	case PELT8_LOAD_AVG_PERIOD:
1282+		pelt_load_avg_period = PELT8_LOAD_AVG_PERIOD;
1283+		pelt_load_avg_max = PELT8_LOAD_AVG_MAX;
1284+		pelt_runnable_avg_yN_inv = pelt8_runnable_avg_yN_inv;
1285+		pr_info("PELT half life is set to %dms\n", num);
1286+		break;
1287+	case PELT32_LOAD_AVG_PERIOD:
1288+		pelt_load_avg_period = PELT32_LOAD_AVG_PERIOD;
1289+		pelt_load_avg_max = PELT32_LOAD_AVG_MAX;
1290+		pelt_runnable_avg_yN_inv = pelt32_runnable_avg_yN_inv;
1291+		pr_info("PELT half life is set to %dms\n", num);
1292+		break;
1293+	default:
1294+		pr_err("Default PELT half life is 32ms\n");
1295+	}
1296+
1297+	return 0;
1298+}
1299+
1300+early_param("pelt", set_pelt);
1301+
1302 /*
1303  * Approximate:
1304  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
1305@@ -54,7 +90,7 @@ static u64 decay_load(u64 val, u64 n)
1306 		local_n %= LOAD_AVG_PERIOD;
1307 	}
1308
1309-	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
1310+	val = mul_u64_u32_shr(val, pelt_runnable_avg_yN_inv[local_n], 32);
1311 	return val;
1312 }
1313
1314diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
1315index b7f38f3ad..b0e6c438f 100644
1316--- a/kernel/sched/psi.c
1317+++ b/kernel/sched/psi.c
1318@@ -550,7 +550,7 @@ static u64 update_triggers(struct psi_group *group, u64 now)
1319 	return now + group->poll_min_period;
1320 }
1321
1322-/* Schedule polling if it's not already scheduled. */
1323+/* Schedule polling if it's not already scheduled or forced. */
1324 static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
1325 {
1326 	struct task_struct *task;
1327diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
1328index d5c00fa02..689cc1a63 100644
1329--- a/kernel/sched/rt.c
1330+++ b/kernel/sched/rt.c
1331@@ -1390,6 +1390,27 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1332 	enqueue_top_rt_rq(&rq->rt);
1333 }
1334
1335+#ifdef CONFIG_SMP
1336+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
1337+					bool sync)
1338+{
1339+	/*
1340+	 * If the waker is CFS, then an RT sync wakeup would preempt the waker
1341+	 * and force it to run for a likely small time after the RT wakee is
1342+	 * done. So, only honor RT sync wakeups from RT wakers.
1343+	 */
1344+	return sync && task_has_rt_policy(rq->curr) &&
1345+		p->prio <= rq->rt.highest_prio.next &&
1346+		rq->rt.rt_nr_running <= 2;
1347+}
1348+#else
1349+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
1350+					bool sync)
1351+{
1352+	return 0;
1353+}
1354+#endif
1355+
1356 /*
1357  * Adding/removing a task to/from a priority array:
1358  */
1359@@ -1397,6 +1418,7 @@ static void
1360 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1361 {
1362 	struct sched_rt_entity *rt_se = &p->rt;
1363+	bool sync = !!(flags & ENQUEUE_WAKEUP_SYNC);
1364
1365 	if (flags & ENQUEUE_WAKEUP)
1366 		rt_se->timeout = 0;
1367@@ -1404,7 +1426,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1368 	enqueue_rt_entity(rt_se, flags);
1369 	walt_inc_cumulative_runnable_avg(rq, p);
1370
1371-	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1372+	if (!task_current(rq, p) && p->nr_cpus_allowed > 1 &&
1373+	    !should_honor_rt_sync(rq, p, sync))
1374 		enqueue_pushable_task(rq, p);
1375 }
1376
1377@@ -1461,7 +1484,10 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1378 {
1379 	struct task_struct *curr;
1380 	struct rq *rq;
1381+	struct rq *this_cpu_rq;
1382 	bool test;
1383+	bool sync = !!(flags & WF_SYNC);
1384+	int this_cpu;
1385
1386 	/* For anything but wake ups, just return the task_cpu */
1387 	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1388@@ -1471,6 +1497,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1389
1390 	rcu_read_lock();
1391 	curr = READ_ONCE(rq->curr); /* unlocked access */
1392+	this_cpu = smp_processor_id();
1393+	this_cpu_rq = cpu_rq(this_cpu);
1394
1395 	/*
1396 	 * If the current task on @p's runqueue is an RT task, then
1397@@ -1502,6 +1530,15 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1398 	       unlikely(rt_task(curr)) &&
1399 	       (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
1400
1401+	/*
1402+	 * Respect the sync flag as long as the task can run on this CPU.
1403+	 */
1404+	if (should_honor_rt_sync(this_cpu_rq, p, sync) &&
1405+	    cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
1406+		cpu = this_cpu;
1407+		goto out_unlock;
1408+	}
1409+
1410 	if (test || !rt_task_fits_capacity(p, cpu)) {
1411 		int target = find_lowest_rq(p);
1412
1413diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
1414index c529706be..92a6875bc 100644
1415--- a/kernel/sched/sched-pelt.h
1416+++ b/kernel/sched/sched-pelt.h
1417@@ -1,7 +1,7 @@
1418 /* SPDX-License-Identifier: GPL-2.0 */
1419 /* Generated by Documentation/scheduler/sched-pelt; do not modify. */
1420
1421-static const u32 runnable_avg_yN_inv[] __maybe_unused = {
1422+static const u32 pelt32_runnable_avg_yN_inv[] __maybe_unused = {
1423 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1424 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1425 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1426@@ -10,5 +10,20 @@ static const u32 runnable_avg_yN_inv[] __maybe_unused = {
1427 	0x85aac367, 0x82cd8698,
1428 };
1429
1430-#define LOAD_AVG_PERIOD 32
1431-#define LOAD_AVG_MAX 47742
1432+#define PELT32_LOAD_AVG_PERIOD 32
1433+#define PELT32_LOAD_AVG_MAX 47742
1434+
1435+static const u32 pelt8_runnable_avg_yN_inv[] __maybe_unused = {
1436+	0xffffffff, 0xeac0c6e6, 0xd744fcc9, 0xc5672a10,
1437+	0xb504f333, 0xa5fed6a9, 0x9837f050, 0x8b95c1e3,
1438+};
1439+
1440+#define PELT8_LOAD_AVG_PERIOD 8
1441+#define PELT8_LOAD_AVG_MAX 12336
1442+
1443+extern const u32 *pelt_runnable_avg_yN_inv;
1444+extern int pelt_load_avg_period;
1445+extern int pelt_load_avg_max;
1446+
1447+#define LOAD_AVG_PERIOD pelt_load_avg_period
1448+#define LOAD_AVG_MAX pelt_load_avg_max
1449diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
1450index 592c8653c..7c02fed0a 100644
1451--- a/kernel/sched/sched.h
1452+++ b/kernel/sched/sched.h
1453@@ -1913,6 +1913,8 @@ extern const int		sched_latency_to_weight[40];
1454 #define ENQUEUE_MIGRATED	0x00
1455 #endif
1456
1457+#define ENQUEUE_WAKEUP_SYNC	0x80
1458+
1459 #define RETRY_TASK		((void *)-1UL)
1460
1461 struct sched_class {
1462diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
1463index 9191e5daa..58d840c62 100644
1464--- a/kernel/sched/topology.c
1465+++ b/kernel/sched/topology.c
1466@@ -5,6 +5,9 @@
1467 #include "sched.h"
1468
1469 DEFINE_MUTEX(sched_domains_mutex);
1470+#ifdef CONFIG_LOCKDEP
1471+EXPORT_SYMBOL_GPL(sched_domains_mutex);
1472+#endif
1473
1474 /* Protected by sched_domains_mutex: */
1475 static cpumask_var_t sched_domains_tmpmask;
1476diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
1477index a55642aa3..6911bbca0 100644
1478--- a/kernel/sched/wait.c
1479+++ b/kernel/sched/wait.c
1480@@ -396,7 +396,8 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en
1481 }
1482 EXPORT_SYMBOL(finish_wait);
1483
1484-int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
1485+__sched int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
1486+				     int sync, void *key)
1487 {
1488 	int ret = default_wake_function(wq_entry, mode, sync, key);
1489
1490@@ -432,7 +433,7 @@ static inline bool is_kthread_should_stop(void)
1491  * }						smp_mb(); // C
1492  * remove_wait_queue(&wq_head, &wait);		wq_entry->flags |= WQ_FLAG_WOKEN;
1493  */
1494-long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
1495+__sched long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, long timeout)
1496 {
1497 	/*
1498 	 * The below executes an smp_mb(), which matches with the full barrier
1499@@ -457,7 +458,8 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
1500 }
1501 EXPORT_SYMBOL(wait_woken);
1502
1503-int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
1504+__sched int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
1505+				int sync, void *key)
1506 {
1507 	/* Pairs with the smp_store_mb() in wait_woken(). */
1508 	smp_mb(); /* C */
1509diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
1510index 3e6740207..033fa94f3 100644
1511--- a/kernel/stop_machine.c
1512+++ b/kernel/stop_machine.c
1513@@ -27,6 +27,7 @@
1514  * Structure to determine completion condition and record errors.  May
1515  * be shared by works on different cpus.
1516  */
1517+
1518 struct cpu_stop_done {
1519 	atomic_t		nr_todo;	/* nr left to execute */
1520 	int			ret;		/* collected return value */
1521