1diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
2index 209e6567c..d47c0212e 100644
3--- a/kernel/bpf/syscall.c
4+++ b/kernel/bpf/syscall.c
5@@ -128,21 +128,6 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
6 	return map;
7 }
8
9-static void bpf_map_write_active_inc(struct bpf_map *map)
10-{
11-	atomic64_inc(&map->writecnt);
12-}
13-
14-static void bpf_map_write_active_dec(struct bpf_map *map)
15-{
16-	atomic64_dec(&map->writecnt);
17-}
18-
19-bool bpf_map_write_active(const struct bpf_map *map)
20-{
21-	return atomic64_read(&map->writecnt) != 0;
22-}
23-
24 static u32 bpf_map_value_size(struct bpf_map *map)
25 {
26 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
27@@ -604,8 +589,11 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma)
28 {
29 	struct bpf_map *map = vma->vm_file->private_data;
30
31-	if (vma->vm_flags & VM_MAYWRITE)
32-		bpf_map_write_active_inc(map);
33+	if (vma->vm_flags & VM_MAYWRITE) {
34+		mutex_lock(&map->freeze_mutex);
35+		map->writecnt++;
36+		mutex_unlock(&map->freeze_mutex);
37+	}
38 }
39
40 /* called for all unmapped memory region (including initial) */
41@@ -613,8 +601,11 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma)
42 {
43 	struct bpf_map *map = vma->vm_file->private_data;
44
45-	if (vma->vm_flags & VM_MAYWRITE)
46-		bpf_map_write_active_dec(map);
47+	if (vma->vm_flags & VM_MAYWRITE) {
48+		mutex_lock(&map->freeze_mutex);
49+		map->writecnt--;
50+		mutex_unlock(&map->freeze_mutex);
51+	}
52 }
53
54 static const struct vm_operations_struct bpf_map_default_vmops = {
55@@ -664,7 +655,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
56 		goto out;
57
58 	if (vma->vm_flags & VM_MAYWRITE)
59-		bpf_map_write_active_inc(map);
60+		map->writecnt++;
61 out:
62 	mutex_unlock(&map->freeze_mutex);
63 	return err;
64@@ -1096,7 +1087,6 @@ static int map_update_elem(union bpf_attr *attr)
65 	map = __bpf_map_get(f);
66 	if (IS_ERR(map))
67 		return PTR_ERR(map);
68-	bpf_map_write_active_inc(map);
69 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
70 		err = -EPERM;
71 		goto err_put;
72@@ -1138,7 +1128,6 @@ static int map_update_elem(union bpf_attr *attr)
73 free_key:
74 	kfree(key);
75 err_put:
76-	bpf_map_write_active_dec(map);
77 	fdput(f);
78 	return err;
79 }
80@@ -1161,7 +1150,6 @@ static int map_delete_elem(union bpf_attr *attr)
81 	map = __bpf_map_get(f);
82 	if (IS_ERR(map))
83 		return PTR_ERR(map);
84-	bpf_map_write_active_inc(map);
85 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
86 		err = -EPERM;
87 		goto err_put;
88@@ -1192,7 +1180,6 @@ static int map_delete_elem(union bpf_attr *attr)
89 out:
90 	kfree(key);
91 err_put:
92-	bpf_map_write_active_dec(map);
93 	fdput(f);
94 	return err;
95 }
96@@ -1497,7 +1484,6 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
97 	map = __bpf_map_get(f);
98 	if (IS_ERR(map))
99 		return PTR_ERR(map);
100-	bpf_map_write_active_inc(map);
101 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
102 	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
103 		err = -EPERM;
104@@ -1539,7 +1525,6 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
105 free_key:
106 	kfree(key);
107 err_put:
108-	bpf_map_write_active_dec(map);
109 	fdput(f);
110 	return err;
111 }
112@@ -1566,7 +1551,8 @@ static int map_freeze(const union bpf_attr *attr)
113 	}
114
115 	mutex_lock(&map->freeze_mutex);
116-	if (bpf_map_write_active(map)) {
117+
118+	if (map->writecnt) {
119 		err = -EBUSY;
120 		goto err_put;
121 	}
122@@ -3991,9 +3977,6 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
123 			    union bpf_attr __user *uattr,
124 			    int cmd)
125 {
126-	bool has_read  = cmd == BPF_MAP_LOOKUP_BATCH ||
127-			 cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
128-	bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
129 	struct bpf_map *map;
130 	int err, ufd;
131 	struct fd f;
132@@ -4006,13 +3989,16 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
133 	map = __bpf_map_get(f);
134 	if (IS_ERR(map))
135 		return PTR_ERR(map);
136-	if (has_write)
137-		bpf_map_write_active_inc(map);
138-	if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
139+
140+	if ((cmd == BPF_MAP_LOOKUP_BATCH ||
141+	     cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
142+	    !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
143 		err = -EPERM;
144 		goto err_put;
145 	}
146-	if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
147+
148+	if (cmd != BPF_MAP_LOOKUP_BATCH &&
149+	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
150 		err = -EPERM;
151 		goto err_put;
152 	}
153@@ -4025,9 +4011,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
154 		BPF_DO_BATCH(map->ops->map_update_batch);
155 	else
156 		BPF_DO_BATCH(map->ops->map_delete_batch);
157+
158 err_put:
159-	if (has_write)
160-		bpf_map_write_active_dec(map);
161 	fdput(f);
162 	return err;
163 }
164diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
165index 8de769745..3e854b91f 100644
166--- a/kernel/bpf/verifier.c
167+++ b/kernel/bpf/verifier.c
168@@ -3492,22 +3492,7 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
169
170 static bool bpf_map_is_rdonly(const struct bpf_map *map)
171 {
172-	/* A map is considered read-only if the following condition are true:
173-	 *
174-	 * 1) BPF program side cannot change any of the map content. The
175-	 *    BPF_F_RDONLY_PROG flag is throughout the lifetime of a map
176-	 *    and was set at map creation time.
177-	 * 2) The map value(s) have been initialized from user space by a
178-	 *    loader and then "frozen", such that no new map update/delete
179-	 *    operations from syscall side are possible for the rest of
180-	 *    the map's lifetime from that point onwards.
181-	 * 3) Any parallel/pending map update/delete operations from syscall
182-	 *    side have been completed. Only after that point, it's safe to
183-	 *    assume that map value(s) are immutable.
184-	 */
185-	return (map->map_flags & BPF_F_RDONLY_PROG) &&
186-	       READ_ONCE(map->frozen) &&
187-	       !bpf_map_write_active(map);
188+	return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen;
189 }
190
191 static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
192diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
193index d9f8a464b..cddc908bc 100644
194--- a/kernel/cgroup/cgroup-v1.c
195+++ b/kernel/cgroup/cgroup-v1.c
196@@ -518,7 +518,8 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
197 	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
198 #endif
199 	    !uid_eq(cred->euid, tcred->uid) &&
200-	    !uid_eq(cred->euid, tcred->suid))
201+	    !uid_eq(cred->euid, tcred->suid) &&
202+	    !ns_capable(tcred->user_ns, CAP_SYS_NICE))
203 		ret = -EACCES;
204 	put_cred(tcred);
205 	if (ret)
206diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
207index 3173fe473..f4d318733 100644
208--- a/kernel/cgroup/cpuset.c
209+++ b/kernel/cgroup/cpuset.c
210@@ -335,6 +335,8 @@ static struct cpuset top_cpuset = {
211  * guidelines for accessing subsystem state in kernel/cgroup.c
212  */
213
214+static DEFINE_MUTEX(cpuset_mutex);
215+
216 DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
217
218 void cpuset_read_lock(void)
219@@ -352,9 +354,9 @@ static DEFINE_SPINLOCK(callback_lock);
220 static struct workqueue_struct *cpuset_migrate_mm_wq;
221
222 /*
223- * CPU / memory hotplug is handled asynchronously.
224+ * CPU / memory hotplug is handled asynchronously in the
225+ * hotplug path and synchronously for resume_cpus.
226  */
227-static void cpuset_hotplug_workfn(struct work_struct *work);
228 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
229
230 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
231@@ -374,18 +376,29 @@ static inline bool is_in_v2_mode(void)
232 }
233
234 /*
235- * Return in pmask the portion of a cpusets's cpus_allowed that
236- * are online.  If none are online, walk up the cpuset hierarchy
237- * until we find one that does have some online cpus.
238+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
239+ * are online and are capable of running the task.  If none are found,
240+ * walk up the cpuset hierarchy until we find one that does have some
241+ * appropriate cpus.
242  *
243  * One way or another, we guarantee to return some non-empty subset
244- * of cpu_online_mask.
245+ * of cpu_active_mask.
246  *
247  * Call with callback_lock or cpuset_mutex held.
248  */
249-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
250+static void guarantee_online_cpus(struct task_struct *tsk,
251+				  struct cpumask *pmask)
252 {
253-	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
254+	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
255+	struct cpuset *cs;
256+
257+	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
258+		cpumask_copy(pmask, cpu_active_mask);
259+
260+	rcu_read_lock();
261+	cs = task_cs(tsk);
262+
263+	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
264 		cs = parent_cs(cs);
265 		if (unlikely(!cs)) {
266 			/*
267@@ -395,11 +408,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
268 			 * cpuset's effective_cpus is on its way to be
269 			 * identical to cpu_online_mask.
270 			 */
271-			cpumask_copy(pmask, cpu_online_mask);
272-			return;
273+			goto out_unlock;
274 		}
275 	}
276-	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
277+	cpumask_and(pmask, pmask, cs->effective_cpus);
278+
279+out_unlock:
280+	rcu_read_unlock();
281 }
282
283 /*
284@@ -490,6 +505,9 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
285 	if (cs && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
286 		goto free_three;
287
288+	if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
289+		goto free_three;
290+
291 	return 0;
292
293 free_three:
294@@ -940,7 +958,7 @@ static void rebuild_root_domains(void)
295 	struct cpuset *cs = NULL;
296 	struct cgroup_subsys_state *pos_css;
297
298-	percpu_rwsem_assert_held(&cpuset_rwsem);
299+	lockdep_assert_held(&cpuset_mutex);
300 	lockdep_assert_cpus_held();
301 	lockdep_assert_held(&sched_domains_mutex);
302
303@@ -1000,8 +1018,7 @@ static void rebuild_sched_domains_locked(void)
304 	struct cpuset *cs;
305 	int ndoms;
306
307-	lockdep_assert_cpus_held();
308-	percpu_rwsem_assert_held(&cpuset_rwsem);
309+	lockdep_assert_held(&cpuset_mutex);
310
311 	/*
312 	 * If we have raced with CPU hotplug, return early to avoid
313@@ -1052,12 +1069,18 @@ static void rebuild_sched_domains_locked(void)
314 void rebuild_sched_domains(void)
315 {
316 	get_online_cpus();
317-	percpu_down_write(&cpuset_rwsem);
318+	mutex_lock(&cpuset_mutex);
319 	rebuild_sched_domains_locked();
320-	percpu_up_write(&cpuset_rwsem);
321+	mutex_unlock(&cpuset_mutex);
322 	put_online_cpus();
323 }
324
325+static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p,
326+				const struct cpumask *new_mask)
327+{
328+	return set_cpus_allowed_ptr(p, new_mask);
329+}
330+
331 /**
332  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
333  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
334@@ -1080,7 +1103,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
335 		if (top_cs && (task->flags & PF_KTHREAD) &&
336 		    kthread_is_per_cpu(task))
337 			continue;
338-		set_cpus_allowed_ptr(task, cs->effective_cpus);
339+		update_cpus_allowed(cs, task, cs->effective_cpus);
340 	}
341 	css_task_iter_end(&it);
342 }
343@@ -1105,8 +1128,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus,
344 		cpumask_and(new_cpus, new_cpus, cs->cpus_requested);
345 		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
346 	} else {
347-		cpumask_and(new_cpus, cs->cpus_requested,
348-			    parent->effective_cpus);
349+		cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
350 	}
351 }
352
353@@ -1171,7 +1193,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
354 	int new_prs;
355 	bool part_error = false;	/* Partition error? */
356
357-	percpu_rwsem_assert_held(&cpuset_rwsem);
358+	lockdep_assert_held(&cpuset_mutex);
359
360 	/*
361 	 * The parent must be a partition root.
362@@ -2171,7 +2193,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
363 	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
364 	cs = css_cs(css);
365
366-	percpu_down_write(&cpuset_rwsem);
367+	mutex_lock(&cpuset_mutex);
368
369 	/* allow moving tasks into an empty cpuset if on default hierarchy */
370 	ret = -ENOSPC;
371@@ -2195,7 +2217,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
372 	cs->attach_in_progress++;
373 	ret = 0;
374 out_unlock:
375-	percpu_up_write(&cpuset_rwsem);
376+	mutex_unlock(&cpuset_mutex);
377 	return ret;
378 }
379
380@@ -2205,9 +2227,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
381
382 	cgroup_taskset_first(tset, &css);
383
384-	percpu_down_write(&cpuset_rwsem);
385+	mutex_lock(&cpuset_mutex);
386 	css_cs(css)->attach_in_progress--;
387-	percpu_up_write(&cpuset_rwsem);
388+	mutex_unlock(&cpuset_mutex);
389 }
390
391 /*
392@@ -2231,22 +2253,20 @@ static void cpuset_attach(struct cgroup_taskset *tset)
393 	cs = css_cs(css);
394
395 	lockdep_assert_cpus_held();	/* see cgroup_attach_lock() */
396-	percpu_down_write(&cpuset_rwsem);
397-
398-	/* prepare for attach */
399-	if (cs == &top_cpuset)
400-		cpumask_copy(cpus_attach, cpu_possible_mask);
401-	else
402-		guarantee_online_cpus(cs, cpus_attach);
403+	mutex_lock(&cpuset_mutex);
404
405 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
406
407 	cgroup_taskset_for_each(task, css, tset) {
408+		if (cs != &top_cpuset)
409+			guarantee_online_cpus(task, cpus_attach);
410+		else
411+			cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
412 		/*
413 		 * can_attach beforehand should guarantee that this doesn't
414 		 * fail.  TODO: have a better way to handle failure here
415 		 */
416-		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
417+		WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach));
418
419 		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
420 		cpuset_update_task_spread_flag(cs, task);
421@@ -2285,7 +2305,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
422 	if (!cs->attach_in_progress)
423 		wake_up(&cpuset_attach_wq);
424
425-	percpu_up_write(&cpuset_rwsem);
426+	mutex_unlock(&cpuset_mutex);
427 }
428
429 /* The various types of files and directories in a cpuset file system */
430@@ -2317,7 +2337,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
431 	int retval = 0;
432
433 	get_online_cpus();
434-	percpu_down_write(&cpuset_rwsem);
435+	mutex_lock(&cpuset_mutex);
436 	if (!is_cpuset_online(cs)) {
437 		retval = -ENODEV;
438 		goto out_unlock;
439@@ -2353,7 +2373,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
440 		break;
441 	}
442 out_unlock:
443-	percpu_up_write(&cpuset_rwsem);
444+	mutex_unlock(&cpuset_mutex);
445 	put_online_cpus();
446 	return retval;
447 }
448@@ -2366,7 +2386,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
449 	int retval = -ENODEV;
450
451 	get_online_cpus();
452-	percpu_down_write(&cpuset_rwsem);
453+	mutex_lock(&cpuset_mutex);
454 	if (!is_cpuset_online(cs))
455 		goto out_unlock;
456
457@@ -2379,7 +2399,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
458 		break;
459 	}
460 out_unlock:
461-	percpu_up_write(&cpuset_rwsem);
462+	mutex_unlock(&cpuset_mutex);
463 	put_online_cpus();
464 	return retval;
465 }
466@@ -2420,7 +2440,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
467 	flush_work(&cpuset_hotplug_work);
468
469 	get_online_cpus();
470-	percpu_down_write(&cpuset_rwsem);
471+	mutex_lock(&cpuset_mutex);
472 	if (!is_cpuset_online(cs))
473 		goto out_unlock;
474
475@@ -2444,7 +2464,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
476
477 	free_cpuset(trialcs);
478 out_unlock:
479-	percpu_up_write(&cpuset_rwsem);
480+	mutex_unlock(&cpuset_mutex);
481 	put_online_cpus();
482 	kernfs_unbreak_active_protection(of->kn);
483 	css_put(&cs->css);
484@@ -2577,13 +2597,13 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
485
486 	css_get(&cs->css);
487 	get_online_cpus();
488-	percpu_down_write(&cpuset_rwsem);
489+	mutex_lock(&cpuset_mutex);
490 	if (!is_cpuset_online(cs))
491 		goto out_unlock;
492
493 	retval = update_prstate(cs, val);
494 out_unlock:
495-	percpu_up_write(&cpuset_rwsem);
496+	mutex_unlock(&cpuset_mutex);
497 	put_online_cpus();
498 	css_put(&cs->css);
499 	return retval ?: nbytes;
500@@ -2791,7 +2811,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
501 		return 0;
502
503 	get_online_cpus();
504-	percpu_down_write(&cpuset_rwsem);
505+	mutex_lock(&cpuset_mutex);
506
507 	set_bit(CS_ONLINE, &cs->flags);
508 	if (is_spread_page(parent))
509@@ -2843,7 +2863,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
510 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
511 	spin_unlock_irq(&callback_lock);
512 out_unlock:
513-	percpu_up_write(&cpuset_rwsem);
514+	mutex_unlock(&cpuset_mutex);
515 	put_online_cpus();
516 	return 0;
517 }
518@@ -2864,7 +2884,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
519 	struct cpuset *cs = css_cs(css);
520
521 	get_online_cpus();
522-	percpu_down_write(&cpuset_rwsem);
523+	mutex_lock(&cpuset_mutex);
524
525 	if (is_partition_root(cs))
526 		update_prstate(cs, 0);
527@@ -2883,7 +2903,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
528 	cpuset_dec();
529 	clear_bit(CS_ONLINE, &cs->flags);
530
531-	percpu_up_write(&cpuset_rwsem);
532+	mutex_unlock(&cpuset_mutex);
533 	put_online_cpus();
534 }
535
536@@ -2896,7 +2916,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
537
538 static void cpuset_bind(struct cgroup_subsys_state *root_css)
539 {
540-	percpu_down_write(&cpuset_rwsem);
541+	mutex_lock(&cpuset_mutex);
542 	spin_lock_irq(&callback_lock);
543
544 	if (is_in_v2_mode()) {
545@@ -2909,7 +2929,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
546 	}
547
548 	spin_unlock_irq(&callback_lock);
549-	percpu_up_write(&cpuset_rwsem);
550+	mutex_unlock(&cpuset_mutex);
551 }
552
553 /*
554@@ -2919,10 +2939,10 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
555  */
556 static void cpuset_fork(struct task_struct *task)
557 {
558+	int inherit_cpus = 0;
559 	if (task_css_is_root(task, cpuset_cgrp_id))
560 		return;
561
562-	set_cpus_allowed_ptr(task, current->cpus_ptr);
563 	task->mems_allowed = current->mems_allowed;
564 }
565
566@@ -2951,7 +2971,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
567
568 int __init cpuset_init(void)
569 {
570-	BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
571
572 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
573 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL));
574@@ -3026,7 +3045,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
575 	is_empty = cpumask_empty(cs->cpus_allowed) ||
576 		   nodes_empty(cs->mems_allowed);
577
578-	percpu_up_write(&cpuset_rwsem);
579+	mutex_unlock(&cpuset_mutex);
580
581 	/*
582 	 * Move tasks to the nearest ancestor with execution resources,
583@@ -3036,7 +3055,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
584 	if (is_empty)
585 		remove_tasks_in_empty_cpuset(cs);
586
587-	percpu_down_write(&cpuset_rwsem);
588+	mutex_lock(&cpuset_mutex);
589 }
590
591 static void
592@@ -3086,14 +3105,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
593 retry:
594 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
595
596-	percpu_down_write(&cpuset_rwsem);
597+	mutex_lock(&cpuset_mutex);
598
599 	/*
600 	 * We have raced with task attaching. We wait until attaching
601 	 * is finished, so we won't attach a task to an empty cpuset.
602 	 */
603 	if (cs->attach_in_progress) {
604-		percpu_up_write(&cpuset_rwsem);
605+		mutex_unlock(&cpuset_mutex);
606 		goto retry;
607 	}
608
609@@ -3165,7 +3184,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
610 		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
611 					    cpus_updated, mems_updated);
612
613-	percpu_up_write(&cpuset_rwsem);
614+	mutex_unlock(&cpuset_mutex);
615 }
616
617 /**
618@@ -3184,7 +3203,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
619  * Note that CPU offlining during suspend is ignored.  We don't modify
620  * cpusets across suspend/resume cycles at all.
621  */
622-static void cpuset_hotplug_workfn(struct work_struct *work)
623+void cpuset_hotplug_workfn(struct work_struct *work)
624 {
625 	static cpumask_t new_cpus;
626 	static nodemask_t new_mems;
627@@ -3195,7 +3214,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
628 	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
629 		ptmp = &tmp;
630
631-	percpu_down_write(&cpuset_rwsem);
632+	mutex_lock(&cpuset_mutex);
633
634 	/* fetch the available cpus/mems and find out which changed how */
635 	cpumask_copy(&new_cpus, cpu_active_mask);
636@@ -3252,7 +3271,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
637 		update_tasks_nodemask(&top_cpuset);
638 	}
639
640-	percpu_up_write(&cpuset_rwsem);
641+	mutex_unlock(&cpuset_mutex);
642
643 	/* if cpus or mems changed, we need to propagate to descendants */
644 	if (cpus_updated || mems_updated) {
645@@ -3296,6 +3315,7 @@ void cpuset_wait_for_hotplug(void)
646 {
647 	flush_work(&cpuset_hotplug_work);
648 }
649+EXPORT_SYMBOL_GPL(cpuset_wait_for_hotplug);
650
651 /*
652  * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
653@@ -3354,11 +3374,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
654
655 	spin_lock_irqsave(&callback_lock, flags);
656 	rcu_read_lock();
657-	guarantee_online_cpus(task_cs(tsk), pmask);
658+	guarantee_online_cpus(tsk, pmask);
659 	rcu_read_unlock();
660 	spin_unlock_irqrestore(&callback_lock, flags);
661 }
662-
663+EXPORT_SYMBOL_GPL(cpuset_cpus_allowed);
664 /**
665  * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
666  * @tsk: pointer to task_struct with which the scheduler is struggling
667@@ -3373,9 +3393,17 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
668
669 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
670 {
671+	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
672+	const struct cpumask *cs_mask;
673+
674 	rcu_read_lock();
675-	do_set_cpus_allowed(tsk, is_in_v2_mode() ?
676-		task_cs(tsk)->cpus_allowed : cpu_possible_mask);
677+	cs_mask = task_cs(tsk)->cpus_allowed;
678+
679+	if (!is_in_v2_mode() || !cpumask_subset(cs_mask, possible_mask))
680+		goto unlock; /* select_fallback_rq will try harder */
681+
682+	do_set_cpus_allowed(tsk, cs_mask);
683+unlock:
684 	rcu_read_unlock();
685
686 	/*
687
688diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
689index 08236798d..081d026f1 100644
690--- a/kernel/cgroup/legacy_freezer.c
691+++ b/kernel/cgroup/legacy_freezer.c
692@@ -479,3 +479,4 @@ struct cgroup_subsys freezer_cgrp_subsys = {
693 	.fork		= freezer_fork,
694 	.legacy_cftypes	= files,
695 };
696+EXPORT_SYMBOL_GPL(freezer_cgrp_subsys);
697diff --git a/kernel/cpu.c b/kernel/cpu.c
698index 4b27158d3..b076ccd1b 100644
699--- a/kernel/cpu.c
700+++ b/kernel/cpu.c
701@@ -39,6 +39,8 @@
702 #define CREATE_TRACE_POINTS
703 #include <trace/events/cpuhp.h>
704
705+#undef CREATE_TRACE_POINTS
706+
707 #include "smpboot.h"
708
709 /**
710@@ -274,11 +276,13 @@ void cpu_maps_update_begin(void)
711 {
712 	mutex_lock(&cpu_add_remove_lock);
713 }
714+EXPORT_SYMBOL_GPL(cpu_maps_update_begin);
715
716 void cpu_maps_update_done(void)
717 {
718 	mutex_unlock(&cpu_add_remove_lock);
719 }
720+EXPORT_SYMBOL_GPL(cpu_maps_update_done);
721
722 /*
723  * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
724@@ -1053,7 +1057,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
725 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
726 	int prev_state, ret = 0;
727
728-	if (num_online_cpus() == 1)
729+	if (num_active_cpus() == 1 && cpu_active(cpu))
730 		return -EBUSY;
731
732 	if (!cpu_present(cpu))
733diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
734index e2999a070..79cb6d063 100644
735--- a/kernel/irq/generic-chip.c
736+++ b/kernel/irq/generic-chip.c
737@@ -200,6 +200,7 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
738 	irq_gc_unlock(gc);
739 	return 0;
740 }
741+EXPORT_SYMBOL_GPL(irq_gc_set_wake);
742
743 static u32 irq_readl_be(void __iomem *addr)
744 {
745diff --git a/kernel/power/Makefile b/kernel/power/Makefile
746index 5899260a8..466eaa74f 100644
747--- a/kernel/power/Makefile
748+++ b/kernel/power/Makefile
749@@ -1,5 +1,17 @@
750 # SPDX-License-Identifier: GPL-2.0
751
752+CURRENT_DIR := $(abspath $(dir $(realpath $(lastword $(MAKEFILE_LIST)))))
753+
754+ifeq ($(PRODUCT_PATH),)
755+$(error PRODUCT_PATH is not set)
756+endif
757+
758+WEAKUP_DIR := ../../../../../../$(PRODUCT_PATH)/kernel_core/kernel/power
759+ifeq ($(wildcard $(CURRENT_DIR)/$(WEAKUP_DIR)),)
760+HCS_ABS_DIR := $(abspath $(CURRENT_DIR)/$(WEAKUP_DIR))
761+$(error missing $(HCS_ABS_DIR) for standard system)
762+endif
763+
764 ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG
765
766 KASAN_SANITIZE_snapshot.o	:= n
767@@ -17,4 +29,5 @@ obj-$(CONFIG_PM_WAKELOCKS)	+= wakelock.o
768
769 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
770
771+obj-$(CONFIG_SUSPEND)		+= $(WEAKUP_DIR)/
772 obj-$(CONFIG_ENERGY_MODEL)	+= energy_model.o
773diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
774index 119b929dc..41430128d 100644
775--- a/kernel/power/energy_model.c
776+++ b/kernel/power/energy_model.c
777@@ -52,6 +52,17 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
778 }
779 DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
780
781+static int em_debug_units_show(struct seq_file *s, void *unused)
782+{
783+	struct em_perf_domain *pd = s->private;
784+	char *units = pd->milliwatts ? "milliWatts" : "bogoWatts";
785+
786+	seq_printf(s, "%s\n", units);
787+
788+	return 0;
789+}
790+DEFINE_SHOW_ATTRIBUTE(em_debug_units);
791+
792 static void em_debug_create_pd(struct device *dev)
793 {
794 	struct dentry *d;
795@@ -64,6 +75,8 @@ static void em_debug_create_pd(struct device *dev)
796 		debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
797 				    &em_debug_cpus_fops);
798
799+	debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
800+
801 	/* Create a sub-directory for each performance state */
802 	for (i = 0; i < dev->em_pd->nr_perf_states; i++)
803 		em_debug_create_ps(&dev->em_pd->table[i], d);
804@@ -245,17 +258,24 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
805  * @cpus	: Pointer to cpumask_t, which in case of a CPU device is
806  *		obligatory. It can be taken from i.e. 'policy->cpus'. For other
807  *		type of devices this should be set to NULL.
808+ * @milliwatts	: Flag indicating that the power values are in milliWatts or
809+ *		in some other scale. It must be set properly.
810  *
811  * Create Energy Model tables for a performance domain using the callbacks
812  * defined in cb.
813  *
818+ * It is important to set @milliwatts to the correct value. Some kernel
815+ * sub-systems might rely on this flag and check if all devices in the EM are
816+ * using the same scale.
817+ *
818  * If multiple clients register the same performance domain, all but the first
819  * registration will be ignored.
820  *
821  * Return 0 on success
822  */
823 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
824-				struct em_data_callback *cb, cpumask_t *cpus)
825+				struct em_data_callback *cb, cpumask_t *cpus,
826+				bool milliwatts)
827 {
828 	unsigned long cap, prev_cap = 0;
829 	int cpu, ret;
830@@ -308,6 +328,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
831 	if (ret)
832 		goto unlock;
833
834+	dev->em_pd->milliwatts = milliwatts;
835+
836 	em_debug_create_pd(dev);
837 	dev_info(dev, "EM: created perf domain\n");
838
839diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
840index bf640fd61..b13fe337f 100644
841--- a/kernel/power/hibernate.c
842+++ b/kernel/power/hibernate.c
843@@ -326,7 +326,7 @@ static int create_image(int platform_mode)
844
845 	if (!in_suspend) {
846 		events_check_enabled = false;
847-		clear_free_pages();
848+		clear_or_poison_free_pages();
849 	}
850
851 	platform_leave(platform_mode);
852diff --git a/kernel/power/power.h b/kernel/power/power.h
853index 24f12d534..778bf431e 100644
854--- a/kernel/power/power.h
855+++ b/kernel/power/power.h
856@@ -106,7 +106,7 @@ extern int create_basic_memory_bitmaps(void);
857 extern void free_basic_memory_bitmaps(void);
858 extern int hibernate_preallocate_memory(void);
859
860-extern void clear_free_pages(void);
861+extern void clear_or_poison_free_pages(void);
862
863 /**
864  *	Auxiliary structure used for reading the snapshot image data and
865diff --git a/kernel/power/process.c b/kernel/power/process.c
866index 45b054b7b..cc0623080 100644
867--- a/kernel/power/process.c
868+++ b/kernel/power/process.c
869@@ -85,18 +85,21 @@ static int try_to_freeze_tasks(bool user_only)
870 	elapsed = ktime_sub(end, start);
871 	elapsed_msecs = ktime_to_ms(elapsed);
872
873-	if (todo) {
874+	if (wakeup) {
875 		pr_cont("\n");
876-		pr_err("Freezing of tasks %s after %d.%03d seconds "
877-		       "(%d tasks refusing to freeze, wq_busy=%d):\n",
878-		       wakeup ? "aborted" : "failed",
879+		pr_err("Freezing of tasks aborted after %d.%03d seconds",
880+		       elapsed_msecs / 1000, elapsed_msecs % 1000);
881+	} else if (todo) {
882+		pr_cont("\n");
883+		pr_err("Freezing of tasks failed after %d.%03d seconds"
884+		       " (%d tasks refusing to freeze, wq_busy=%d):\n",
885 		       elapsed_msecs / 1000, elapsed_msecs % 1000,
886 		       todo - wq_busy, wq_busy);
887
888 		if (wq_busy)
889 			show_workqueue_state();
890
891-		if (!wakeup || pm_debug_messages_on) {
892+		if (pm_debug_messages_on) {
893 			read_lock(&tasklist_lock);
894 			for_each_process_thread(g, p) {
895 				if (p != current && !freezer_should_skip(p)
896diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
897index 46b1804c1..a3491b29c 100644
898--- a/kernel/power/snapshot.c
899+++ b/kernel/power/snapshot.c
900@@ -1144,7 +1144,15 @@ void free_basic_memory_bitmaps(void)
901 	pr_debug("Basic memory bitmaps freed\n");
902 }
903
904-void clear_free_pages(void)
905+static void clear_or_poison_free_page(struct page *page)
906+{
907+	if (page_poisoning_enabled_static())
908+		__kernel_poison_pages(page, 1);
909+	else if (want_init_on_free())
910+		clear_highpage(page);
911+}
912+
913+void clear_or_poison_free_pages(void)
914 {
915 	struct memory_bitmap *bm = free_pages_map;
916 	unsigned long pfn;
917@@ -1152,12 +1160,12 @@ void clear_free_pages(void)
918 	if (WARN_ON(!(free_pages_map)))
919 		return;
920
921-	if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) {
922+	if (page_poisoning_enabled() || want_init_on_free()) {
923 		memory_bm_position_reset(bm);
924 		pfn = memory_bm_next_pfn(bm);
925 		while (pfn != BM_END_OF_MAP) {
926 			if (pfn_valid(pfn))
927-				clear_highpage(pfn_to_page(pfn));
928+				clear_or_poison_free_page(pfn_to_page(pfn));
929
930 			pfn = memory_bm_next_pfn(bm);
931 		}
932diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
933index 32391acc8..545958377 100644
934--- a/kernel/power/suspend.c
935+++ b/kernel/power/suspend.c
936@@ -30,6 +30,7 @@
937 #include <trace/events/power.h>
938 #include <linux/compiler.h>
939 #include <linux/moduleparam.h>
940+#include <linux/wakeup_reason.h>
941
942 #include "power.h"
943
944@@ -138,6 +139,7 @@ static void s2idle_loop(void)
945 			break;
946 		}
947
948+		clear_wakeup_reasons();
949 		s2idle_enter();
950 	}
951
952@@ -357,6 +359,7 @@ static int suspend_prepare(suspend_state_t state)
953 	if (!error)
954 		return 0;
955
956+	log_suspend_abort_reason("One or more tasks refusing to freeze");
957 	suspend_stats.failed_freeze++;
958 	dpm_save_failed_step(SUSPEND_FREEZE);
959 	pm_notifier_call_chain(PM_POST_SUSPEND);
960@@ -386,7 +389,7 @@ void __weak arch_suspend_enable_irqs(void)
961  */
962 static int suspend_enter(suspend_state_t state, bool *wakeup)
963 {
964-	int error;
965+	int error, last_dev;
966
967 	error = platform_suspend_prepare(state);
968 	if (error)
969@@ -394,7 +397,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
970
971 	error = dpm_suspend_late(PMSG_SUSPEND);
972 	if (error) {
973+		last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
974+		last_dev %= REC_FAILED_NUM;
975 		pr_err("late suspend of devices failed\n");
976+		log_suspend_abort_reason("late suspend of %s device failed",
977+					 suspend_stats.failed_devs[last_dev]);
978 		goto Platform_finish;
979 	}
980 	error = platform_suspend_prepare_late(state);
981@@ -403,7 +410,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
982
983 	error = dpm_suspend_noirq(PMSG_SUSPEND);
984 	if (error) {
985+		last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
986+		last_dev %= REC_FAILED_NUM;
987 		pr_err("noirq suspend of devices failed\n");
988+		log_suspend_abort_reason("noirq suspend of %s device failed",
989+					 suspend_stats.failed_devs[last_dev]);
990 		goto Platform_early_resume;
991 	}
992 	error = platform_suspend_prepare_noirq(state);
993@@ -419,8 +430,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
994 	}
995
996 	error = suspend_disable_secondary_cpus();
997-	if (error || suspend_test(TEST_CPUS))
998+	if (error || suspend_test(TEST_CPUS)) {
999+		log_suspend_abort_reason("Disabling non-boot cpus failed");
1000 		goto Enable_cpus;
1001+	}
1002
1003 	arch_suspend_disable_irqs();
1004 	BUG_ON(!irqs_disabled());
1005@@ -491,6 +504,8 @@ int suspend_devices_and_enter(suspend_state_t state)
1006 	error = dpm_suspend_start(PMSG_SUSPEND);
1007 	if (error) {
1008 		pr_err("Some devices failed to suspend, or early wake event detected\n");
1009+		log_suspend_abort_reason(
1010+				"Some devices failed to suspend, or early wake event detected");
1011 		goto Recover_platform;
1012 	}
1013 	suspend_test_finish("suspend devices");
1014diff --git a/kernel/reboot.c b/kernel/reboot.c
1015index af6f23d8b..bce629531 100644
1016--- a/kernel/reboot.c
1017+++ b/kernel/reboot.c
1018@@ -215,6 +215,27 @@ void do_kernel_restart(char *cmd)
1019 	atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
1020 }
1021
1022+#ifdef CONFIG_NO_GKI
1023+static ATOMIC_NOTIFIER_HEAD(pre_restart_handler_list);
1024+
1025+int register_pre_restart_handler(struct notifier_block *nb)
1026+{
1027+	return atomic_notifier_chain_register(&pre_restart_handler_list, nb);
1028+}
1029+EXPORT_SYMBOL(register_pre_restart_handler);
1030+
1031+int unregister_pre_restart_handler(struct notifier_block *nb)
1032+{
1033+	return atomic_notifier_chain_unregister(&pre_restart_handler_list, nb);
1034+}
1035+EXPORT_SYMBOL(unregister_pre_restart_handler);
1036+
1037+void do_kernel_pre_restart(char *cmd)
1038+{
1039+	atomic_notifier_call_chain(&pre_restart_handler_list, reboot_mode, cmd);
1040+}
1041+#endif
1042+
1043 void migrate_to_reboot_cpu(void)
1044 {
1045 	/* The boot cpu is always logical cpu 0 */
1046diff --git a/kernel/sched/core.c b/kernel/sched/core.c
1047index e2f00be4b..750da3e7c 100644
1048--- a/kernel/sched/core.c
1049+++ b/kernel/sched/core.c
1050@@ -47,6 +47,13 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
1051 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
1052 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
1053 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
1054+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
1055+#ifdef CONFIG_SCHEDSTATS
1056+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
1057+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
1058+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
1059+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
1060+#endif
1061
1062 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
1063
1064@@ -660,7 +667,7 @@ int get_nohz_timer_target(void)
1065 	int i, cpu = smp_processor_id(), default_cpu = -1;
1066 	struct sched_domain *sd;
1067
1068-	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
1069+	if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
1070 		if (!idle_cpu(cpu))
1071 			return cpu;
1072 		default_cpu = cpu;
1073@@ -680,8 +687,25 @@ int get_nohz_timer_target(void)
1074 		}
1075 	}
1076
1077-	if (default_cpu == -1)
1078-		default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
1079+	if (default_cpu == -1) {
1080+		for_each_cpu_and(i, cpu_active_mask,
1081+				 housekeeping_cpumask(HK_FLAG_TIMER)) {
1082+			if (cpu == i)
1083+				continue;
1084+
1085+			if (!idle_cpu(i)) {
1086+				cpu = i;
1087+				goto unlock;
1088+			}
1089+		}
1090+
1091+		/* no active, not-idle, housekeeping CPU found. */
1092+		default_cpu = cpumask_any(cpu_active_mask);
1093+
1094+		if (unlikely(default_cpu >= nr_cpu_ids))
1095+			goto unlock;
1096+	}
1097+
1098 	cpu = default_cpu;
1099 unlock:
1100 	rcu_read_unlock();
1101@@ -1770,7 +1794,10 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
1102 	if (is_per_cpu_kthread(p))
1103 		return cpu_online(cpu);
1104
1105-	return cpu_active(cpu);
1106+	if (!cpu_active(cpu))
1107+		return false;
1108+
1109+	return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
1110 }
1111
1112 /*
1113@@ -2433,10 +2460,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
1114 			}
1115 			fallthrough;
1116 		case possible:
1117-			do_set_cpus_allowed(p, cpu_possible_mask);
1118+			do_set_cpus_allowed(p, task_cpu_possible_mask(p));
1119 			state = fail;
1120 			break;
1121-
1122 		case fail:
1123 #ifdef CONFIG_CPU_ISOLATION_OPT
1124 			allow_iso = true;
1125@@ -2627,6 +2653,9 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1126 {
1127 	int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
1128
1129+	if (wake_flags & WF_SYNC)
1130+		en_flags |= ENQUEUE_WAKEUP_SYNC;
1131+
1132 	lockdep_assert_held(&rq->lock);
1133
1134 	if (p->sched_contributes_to_load)
1135@@ -3019,6 +3048,19 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1136 	if (!(p->state & state))
1137 		goto unlock;
1138
1139+#ifdef CONFIG_FREEZER
1140+	/*
1141+	 * If we're going to wake up a thread which may be frozen, then
1142+	 * we can only do so if we have an active CPU which is capable of
1143+	 * running it. This may not be the case when resuming from suspend,
1144+	 * as the secondary CPUs may not yet be back online. See __thaw_task()
1145+	 * for the actual wakeup.
1146+	 */
1147+	if (unlikely(frozen_or_skipped(p)) &&
1148+	    !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
1149+		goto unlock;
1150+#endif
1151+
1152 	trace_sched_waking(p);
1153
1154 	/* We're going to change ->state: */
1155@@ -5004,7 +5046,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
1156 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
1157 			  void *key)
1158 {
1159-	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
1160+	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC));
1161 	return try_to_wake_up(curr->private, mode, wake_flags);
1162 }
1163 EXPORT_SYMBOL(default_wake_function);
1164@@ -5713,16 +5755,19 @@ int sched_setscheduler(struct task_struct *p, int policy,
1165 {
1166 	return _sched_setscheduler(p, policy, param, true);
1167 }
1168+EXPORT_SYMBOL_GPL(sched_setscheduler);
1169
1170 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
1171 {
1172 	return __sched_setscheduler(p, attr, true, true);
1173 }
1174+EXPORT_SYMBOL_GPL(sched_setattr);
1175
1176 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
1177 {
1178 	return __sched_setscheduler(p, attr, false, true);
1179 }
1180+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
1181
1182 /**
1183  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
1184@@ -5742,6 +5787,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
1185 {
1186 	return _sched_setscheduler(p, policy, param, false);
1187 }
1188+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
1189
1190 /*
1191  * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
1192@@ -7044,6 +7090,11 @@ void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf,
1193 	 */
1194 	update_rq_clock(rq);
1195
1196+#ifdef CONFIG_SCHED_DEBUG
1197+	/* note the clock update in orf */
1198+	orf.clock_update_flags |= RQCF_UPDATED;
1199+#endif
1200+
1201 	for (;;) {
1202 		/*
1203 		 * There's this thread running, bail when that's the only
1204diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
1205index 81e43a56d..4df7f4e68 100644
1206--- a/kernel/sched/fair.c
1207+++ b/kernel/sched/fair.c
1208@@ -86,6 +86,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L
1209  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
1210  */
1211 unsigned int sysctl_sched_min_granularity			= 750000ULL;
1212+EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
1213 static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
1214
1215 /*
1216@@ -10686,9 +10687,20 @@ void nohz_balance_enter_idle(int cpu)
1217
1218 	SCHED_WARN_ON(cpu != smp_processor_id());
1219
1220-	/* If this CPU is going down, then nothing needs to be done: */
1221-	if (!cpu_active(cpu))
1222+	if (!cpu_active(cpu)) {
1223+		/*
1224+		 * A CPU can be paused while it is idle with its tick
1225+		 * stopped. nohz_balance_exit_idle() should be called
1226+		 * from the local CPU, so it can't be called during
1227+		 * pause. This results in paused CPU participating in
1228+		 * the nohz idle balance, which should be avoided.
1229+		 *
1230+		 * When the paused CPU exits idle and enters again,
1231+		 * exempt the paused CPU from nohz_balance_exit_idle.
1232+		 */
1233+		nohz_balance_exit_idle(rq);
1234 		return;
1235+	}
1236
1237 	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
1238 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
1239diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
1240index 2593a733c..69afd8d1e 100644
1241--- a/kernel/sched/idle.c
1242+++ b/kernel/sched/idle.c
1243@@ -450,6 +450,7 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
1244 {
1245 	raw_spin_unlock_irq(&rq->lock);
1246 	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
1247+
1248 	dump_stack();
1249 	raw_spin_lock_irq(&rq->lock);
1250 }
1251diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
1252index d2a655643..b5837e277 100644
1253--- a/kernel/sched/loadavg.c
1254+++ b/kernel/sched/loadavg.c
1255@@ -75,6 +75,7 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
1256 	loads[1] = (avenrun[1] + offset) << shift;
1257 	loads[2] = (avenrun[2] + offset) << shift;
1258 }
1259+EXPORT_SYMBOL_GPL(get_avenrun);
1260
1261 long calc_load_fold_active(struct rq *this_rq, long adjust)
1262 {
1263diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
1264index 2c613e1cf..e2890b677 100644
1265--- a/kernel/sched/pelt.c
1266+++ b/kernel/sched/pelt.c
1267@@ -28,6 +28,42 @@
1268 #include "sched.h"
1269 #include "pelt.h"
1270
1271+int pelt_load_avg_period = PELT32_LOAD_AVG_PERIOD;
1272+int pelt_load_avg_max = PELT32_LOAD_AVG_MAX;
1273+const u32 *pelt_runnable_avg_yN_inv = pelt32_runnable_avg_yN_inv;
1274+
1275+static int __init set_pelt(char *str)
1276+{
1277+	int rc, num;
1278+
1279+	rc = kstrtoint(str, 0, &num);
1280+	if (rc) {
1281+		pr_err("%s: kstrtoint failed. rc=%d\n", __func__, rc);
1282+		return 0;
1283+	}
1284+
1285+	switch (num) {
1286+	case PELT8_LOAD_AVG_PERIOD:
1287+		pelt_load_avg_period = PELT8_LOAD_AVG_PERIOD;
1288+		pelt_load_avg_max = PELT8_LOAD_AVG_MAX;
1289+		pelt_runnable_avg_yN_inv = pelt8_runnable_avg_yN_inv;
1290+		pr_info("PELT half life is set to %dms\n", num);
1291+		break;
1292+	case PELT32_LOAD_AVG_PERIOD:
1293+		pelt_load_avg_period = PELT32_LOAD_AVG_PERIOD;
1294+		pelt_load_avg_max = PELT32_LOAD_AVG_MAX;
1295+		pelt_runnable_avg_yN_inv = pelt32_runnable_avg_yN_inv;
1296+		pr_info("PELT half life is set to %dms\n", num);
1297+		break;
1298+	default:
1299+		pr_err("Default PELT half life is 32ms\n");
1300+	}
1301+
1302+	return 0;
1303+}
1304+
1305+early_param("pelt", set_pelt);
1306+
1307 /*
1308  * Approximate:
1309  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
1310@@ -54,7 +90,7 @@ static u64 decay_load(u64 val, u64 n)
1311 		local_n %= LOAD_AVG_PERIOD;
1312 	}
1313
1314-	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
1315+	val = mul_u64_u32_shr(val, pelt_runnable_avg_yN_inv[local_n], 32);
1316 	return val;
1317 }
1318
1319diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
1320index b7f38f3ad..b0e6c438f 100644
1321--- a/kernel/sched/psi.c
1322+++ b/kernel/sched/psi.c
1323@@ -550,7 +550,7 @@ static u64 update_triggers(struct psi_group *group, u64 now)
1324 	return now + group->poll_min_period;
1325 }
1326
1327-/* Schedule polling if it's not already scheduled. */
1328+/* Schedule polling if it's not already scheduled or forced. */
1329 static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
1330 {
1331 	struct task_struct *task;
1332diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
1333index d5c00fa02..689cc1a63 100644
1334--- a/kernel/sched/rt.c
1335+++ b/kernel/sched/rt.c
1336@@ -1393,6 +1393,27 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1337 	enqueue_top_rt_rq(&rq->rt);
1338 }
1339
1340+#ifdef CONFIG_SMP
1341+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
1342+					bool sync)
1343+{
1344+	/*
1345+	 * If the waker is CFS, then an RT sync wakeup would preempt the waker
1346+	 * and force it to run for a likely small time after the RT wakee is
1347+	 * done. So, only honor RT sync wakeups from RT wakers.
1348+	 */
1349+	return sync && task_has_rt_policy(rq->curr) &&
1350+		p->prio <= rq->rt.highest_prio.next &&
1351+		rq->rt.rt_nr_running <= 2;
1352+}
1353+#else
1354+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
1355+					bool sync)
1356+{
1357+	return 0;
1358+}
1359+#endif
1360+
1361 /*
1362  * Adding/removing a task to/from a priority array:
1363  */
1364@@ -1400,6 +1421,7 @@ static void
1365 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1366 {
1367 	struct sched_rt_entity *rt_se = &p->rt;
1368+	bool sync = !!(flags & ENQUEUE_WAKEUP_SYNC);
1369
1370 	if (flags & ENQUEUE_WAKEUP)
1371 		rt_se->timeout = 0;
1372@@ -1407,7 +1429,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1373 	enqueue_rt_entity(rt_se, flags);
1374 	walt_inc_cumulative_runnable_avg(rq, p);
1375
1376-	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1377+	if (!task_current(rq, p) && p->nr_cpus_allowed > 1 &&
1378+	    !should_honor_rt_sync(rq, p, sync))
1379 		enqueue_pushable_task(rq, p);
1380 }
1381
1382@@ -1464,7 +1487,10 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1383 {
1384 	struct task_struct *curr;
1385 	struct rq *rq;
1386+	struct rq *this_cpu_rq;
1387 	bool test;
1388+	bool sync = !!(flags & WF_SYNC);
1389+	int this_cpu;
1390
1391 	/* For anything but wake ups, just return the task_cpu */
1392 	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1393@@ -1474,6 +1500,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1394
1395 	rcu_read_lock();
1396 	curr = READ_ONCE(rq->curr); /* unlocked access */
1397+	this_cpu = smp_processor_id();
1398+	this_cpu_rq = cpu_rq(this_cpu);
1399
1400 	/*
1401 	 * If the current task on @p's runqueue is an RT task, then
1402@@ -1508,6 +1536,15 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1403 	test |= sysctl_sched_enable_rt_cas;
1404 #endif
1405
1406+	/*
1407+	 * Respect the sync flag as long as the task can run on this CPU.
1408+	 */
1409+	if (should_honor_rt_sync(this_cpu_rq, p, sync) &&
1410+	    cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
1411+		cpu = this_cpu;
1412+		goto out_unlock;
1413+	}
1414+
1415 	if (test || !rt_task_fits_capacity(p, cpu)) {
1416 		int target = find_lowest_rq(p);
1417
1418diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
1419index c529706be..92a6875bc 100644
1420--- a/kernel/sched/sched-pelt.h
1421+++ b/kernel/sched/sched-pelt.h
1422@@ -1,7 +1,7 @@
1423 /* SPDX-License-Identifier: GPL-2.0 */
1424 /* Generated by Documentation/scheduler/sched-pelt; do not modify. */
1425
1426-static const u32 runnable_avg_yN_inv[] __maybe_unused = {
1427+static const u32 pelt32_runnable_avg_yN_inv[] __maybe_unused = {
1428 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1429 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1430 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1431@@ -10,5 +10,20 @@ static const u32 runnable_avg_yN_inv[] __maybe_unused = {
1432 	0x85aac367, 0x82cd8698,
1433 };
1434
1435-#define LOAD_AVG_PERIOD 32
1436-#define LOAD_AVG_MAX 47742
1437+#define PELT32_LOAD_AVG_PERIOD 32
1438+#define PELT32_LOAD_AVG_MAX 47742
1439+
1440+static const u32 pelt8_runnable_avg_yN_inv[] __maybe_unused = {
1441+	0xffffffff, 0xeac0c6e6, 0xd744fcc9, 0xc5672a10,
1442+	0xb504f333, 0xa5fed6a9, 0x9837f050, 0x8b95c1e3,
1443+};
1444+
1445+#define PELT8_LOAD_AVG_PERIOD 8
1446+#define PELT8_LOAD_AVG_MAX 12336
1447+
1448+extern const u32 *pelt_runnable_avg_yN_inv;
1449+extern int pelt_load_avg_period;
1450+extern int pelt_load_avg_max;
1451+
1452+#define LOAD_AVG_PERIOD pelt_load_avg_period
1453+#define LOAD_AVG_MAX pelt_load_avg_max
1454diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
1455index 592c8653c..7c02fed0a 100644
1456--- a/kernel/sched/sched.h
1457+++ b/kernel/sched/sched.h
1458@@ -1913,6 +1913,8 @@ extern const int		sched_latency_to_weight[40];
1459 #define ENQUEUE_MIGRATED	0x00
1460 #endif
1461
1462+#define ENQUEUE_WAKEUP_SYNC	0x80
1463+
1464 #define RETRY_TASK		((void *)-1UL)
1465
1466 struct sched_class {
1467diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
1468index 9191e5daa..58d840c62 100644
1469--- a/kernel/sched/topology.c
1470+++ b/kernel/sched/topology.c
1471@@ -5,6 +5,9 @@
1472 #include "sched.h"
1473
1474 DEFINE_MUTEX(sched_domains_mutex);
1475+#ifdef CONFIG_LOCKDEP
1476+EXPORT_SYMBOL_GPL(sched_domains_mutex);
1477+#endif
1478
1479 /* Protected by sched_domains_mutex: */
1480 static cpumask_var_t sched_domains_tmpmask;
1481diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
1482index a55642aa3..6911bbca0 100644
1483--- a/kernel/sched/wait.c
1484+++ b/kernel/sched/wait.c
1485@@ -396,7 +396,8 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en
1486 }
1487 EXPORT_SYMBOL(finish_wait);
1488
1489-int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
1490+__sched int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
1491+				     int sync, void *key)
1492 {
1493 	int ret = default_wake_function(wq_entry, mode, sync, key);
1494
1495@@ -432,7 +433,7 @@ static inline bool is_kthread_should_stop(void)
1496  * }						smp_mb(); // C
1497  * remove_wait_queue(&wq_head, &wait);		wq_entry->flags |= WQ_FLAG_WOKEN;
1498  */
1499-long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
1500+__sched long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, long timeout)
1501 {
1502 	/*
1503 	 * The below executes an smp_mb(), which matches with the full barrier
1504@@ -457,7 +458,8 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
1505 }
1506 EXPORT_SYMBOL(wait_woken);
1507
1508-int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
1509+__sched int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
1510+				int sync, void *key)
1511 {
1512 	/* Pairs with the smp_store_mb() in wait_woken(). */
1513 	smp_mb(); /* C */
1514diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
1515index 3e6740207..033fa94f3 100644
1516--- a/kernel/stop_machine.c
1517+++ b/kernel/stop_machine.c
1518@@ -27,6 +27,7 @@
1519  * Structure to determine completion condition and record errors.  May
1520  * be shared by works on different cpus.
1521  */
1522+
1523 struct cpu_stop_done {
1524 	atomic_t		nr_todo;	/* nr left to execute */
1525 	int			ret;		/* collected return value */
1526