diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 209e6567c..d47c0212e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -128,21 +128,6 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 	return map;
 }
 
-static void bpf_map_write_active_inc(struct bpf_map *map)
-{
-	atomic64_inc(&map->writecnt);
-}
-
-static void bpf_map_write_active_dec(struct bpf_map *map)
-{
-	atomic64_dec(&map->writecnt);
-}
-
-bool bpf_map_write_active(const struct bpf_map *map)
-{
-	return atomic64_read(&map->writecnt) != 0;
-}
-
 static u32 bpf_map_value_size(struct bpf_map *map)
 {
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -604,8 +589,11 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma)
 {
 	struct bpf_map *map = vma->vm_file->private_data;
 
-	if (vma->vm_flags & VM_MAYWRITE)
-		bpf_map_write_active_inc(map);
+	if (vma->vm_flags & VM_MAYWRITE) {
+		mutex_lock(&map->freeze_mutex);
+		map->writecnt++;
+		mutex_unlock(&map->freeze_mutex);
+	}
 }
 
 /* called for all unmapped memory region (including initial) */
@@ -613,8 +601,11 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma)
 {
 	struct bpf_map *map = vma->vm_file->private_data;
 
-	if (vma->vm_flags & VM_MAYWRITE)
-		bpf_map_write_active_dec(map);
+	if (vma->vm_flags & VM_MAYWRITE) {
+		mutex_lock(&map->freeze_mutex);
+		map->writecnt--;
+		mutex_unlock(&map->freeze_mutex);
+	}
 }
 
 static const struct vm_operations_struct bpf_map_default_vmops = {
@@ -664,7 +655,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
 		goto out;
 
 	if (vma->vm_flags & VM_MAYWRITE)
-		bpf_map_write_active_inc(map);
+		map->writecnt++;
 out:
 	mutex_unlock(&map->freeze_mutex);
 	return err;
@@ -1096,7 +1087,6 @@ static int map_update_elem(union bpf_attr *attr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	bpf_map_write_active_inc(map);
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
 		goto err_put;
@@ -1138,7 +1128,6 @@ static int map_update_elem(union bpf_attr *attr)
 free_key:
 	kfree(key);
 err_put:
-	bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
@@ -1161,7 +1150,6 @@ static int map_delete_elem(union bpf_attr *attr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	bpf_map_write_active_inc(map);
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
 		goto err_put;
@@ -1192,7 +1180,6 @@ static int map_delete_elem(union bpf_attr *attr)
 out:
 	kfree(key);
 err_put:
-	bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
@@ -1497,7 +1484,6 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	bpf_map_write_active_inc(map);
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
 	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
@@ -1539,7 +1525,6 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 free_key:
 	kfree(key);
 err_put:
-	bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
@@ -1566,7 +1551,8 @@ static int map_freeze(const union bpf_attr *attr)
 	}
 
 	mutex_lock(&map->freeze_mutex);
-	if (bpf_map_write_active(map)) {
+
+	if (map->writecnt) {
 		err = -EBUSY;
 		goto err_put;
 	}
@@ -3991,9 +3977,6 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 				    union bpf_attr __user *uattr,
 				    int cmd)
 {
-	bool has_read  = cmd == BPF_MAP_LOOKUP_BATCH ||
-			 cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
-	bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
 	struct bpf_map *map;
 	int err, ufd;
 	struct fd f;
@@ -4006,13 +3989,16 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	if (has_write)
-		bpf_map_write_active_inc(map);
-	if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+
+	if ((cmd == BPF_MAP_LOOKUP_BATCH ||
+	     cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
+	    !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
 		err = -EPERM;
 		goto err_put;
 	}
-	if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+
+	if (cmd != BPF_MAP_LOOKUP_BATCH &&
+	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
 		goto err_put;
 	}
@@ -4025,9 +4011,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 		BPF_DO_BATCH(map->ops->map_update_batch);
 	else
 		BPF_DO_BATCH(map->ops->map_delete_batch);
+
 err_put:
-	if (has_write)
-		bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8de769745..3e854b91f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3492,22 +3492,7 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
 
 static bool bpf_map_is_rdonly(const struct bpf_map *map)
 {
-	/* A map is considered read-only if the following condition are true:
-	 *
-	 * 1) BPF program side cannot change any of the map content. The
-	 *    BPF_F_RDONLY_PROG flag is throughout the lifetime of a map
-	 *    and was set at map creation time.
-	 * 2) The map value(s) have been initialized from user space by a
-	 *    loader and then "frozen", such that no new map update/delete
-	 *    operations from syscall side are possible for the rest of
-	 *    the map's lifetime from that point onwards.
-	 * 3) Any parallel/pending map update/delete operations from syscall
-	 *    side have been completed. Only after that point, it's safe to
-	 *    assume that map value(s) are immutable.
-	 */
-	return (map->map_flags & BPF_F_RDONLY_PROG) &&
-	       READ_ONCE(map->frozen) &&
-	       !bpf_map_write_active(map);
+	return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen;
 }
 
 static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index d9f8a464b..cddc908bc 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -518,7 +518,8 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
 	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
 #endif
 	    !uid_eq(cred->euid, tcred->uid) &&
-	    !uid_eq(cred->euid, tcred->suid))
+	    !uid_eq(cred->euid, tcred->suid) &&
+	    !ns_capable(tcred->user_ns, CAP_SYS_NICE))
 		ret = -EACCES;
 	put_cred(tcred);
 	if (ret)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 3173fe473..f4d318733 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -335,6 +335,8 @@ static struct cpuset top_cpuset = {
  * guidelines for accessing subsystem state in kernel/cgroup.c
  */
 
+static DEFINE_MUTEX(cpuset_mutex);
+
 DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
 
 void cpuset_read_lock(void)
@@ -352,9 +354,9 @@ static DEFINE_SPINLOCK(callback_lock);
 static struct workqueue_struct *cpuset_migrate_mm_wq;
 
 /*
- * CPU / memory hotplug is handled asynchronously.
+ * CPU / memory hotplug is handled asynchronously
+ * for hotplug, synchronously for resume_cpus
  */
-static void cpuset_hotplug_workfn(struct work_struct *work);
 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
 
 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
@@ -374,18 +376,29 @@ static inline bool is_in_v2_mode(void)
 }
 
 /*
- * Return in pmask the portion of a cpusets's cpus_allowed that
- * are online.  If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.
+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
+ * are online and are capable of running the task.  If none are found,
+ * walk up the cpuset hierarchy until we find one that does have some
+ * appropriate cpus.
  *
  * One way or another, we guarantee to return some non-empty subset
- * of cpu_online_mask.
+ * of cpu_active_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
+static void guarantee_online_cpus(struct task_struct *tsk,
+				  struct cpumask *pmask)
 {
-	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
+	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+	struct cpuset *cs;
+
+	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
+		cpumask_copy(pmask, cpu_active_mask);
+
+	rcu_read_lock();
+	cs = task_cs(tsk);
+
+	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
 		cs = parent_cs(cs);
 		if (unlikely(!cs)) {
 			/*
@@ -395,11 +408,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 			 * cpuset's effective_cpus is on its way to be
 			 * identical to cpu_online_mask.
 			 */
-			cpumask_copy(pmask, cpu_online_mask);
-			return;
+			goto out_unlock;
 		}
 	}
-	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
+	cpumask_and(pmask, pmask, cs->effective_cpus);
+
+out_unlock:
+	rcu_read_unlock();
 }
 
 /*
@@ -490,6 +505,9 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 	if (cs && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
 		goto free_three;
 
+	if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
+		goto free_three;
+
 	return 0;
 
 free_three:
@@ -940,7 +958,7 @@ static void rebuild_root_domains(void)
 	struct cpuset *cs = NULL;
 	struct cgroup_subsys_state *pos_css;
 
-	percpu_rwsem_assert_held(&cpuset_rwsem);
+	lockdep_assert_held(&cpuset_mutex);
 	lockdep_assert_cpus_held();
 	lockdep_assert_held(&sched_domains_mutex);
 
@@ -1000,8 +1018,7 @@ static void rebuild_sched_domains_locked(void)
 	struct cpuset *cs;
 	int ndoms;
 
-	lockdep_assert_cpus_held();
-	percpu_rwsem_assert_held(&cpuset_rwsem);
+	lockdep_assert_held(&cpuset_mutex);
 
 	/*
 	 * If we have raced with CPU hotplug, return early to avoid
@@ -1052,12 +1069,18 @@ static void rebuild_sched_domains_locked(void)
 void rebuild_sched_domains(void)
 {
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	rebuild_sched_domains_locked();
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 }
 
+static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p,
+			       const struct cpumask *new_mask)
+{
+	return set_cpus_allowed_ptr(p, new_mask);
+}
+
 /**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -1080,7 +1103,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
 		if (top_cs && (task->flags & PF_KTHREAD) &&
 		    kthread_is_per_cpu(task))
 			continue;
-		set_cpus_allowed_ptr(task, cs->effective_cpus);
+		update_cpus_allowed(cs, task, cs->effective_cpus);
 	}
 	css_task_iter_end(&it);
 }
@@ -1105,8 +1128,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus,
 		cpumask_and(new_cpus, new_cpus, cs->cpus_requested);
 		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
 	} else {
-		cpumask_and(new_cpus, cs->cpus_requested,
-			    parent->effective_cpus);
+		cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
 	}
 }
 
@@ -1171,7 +1193,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 	int new_prs;
 	bool part_error = false;	/* Partition error? */
 
-	percpu_rwsem_assert_held(&cpuset_rwsem);
+	lockdep_assert_held(&cpuset_mutex);
 
 	/*
 	 * The parent must be a partition root.
@@ -2171,7 +2193,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
 	cs = css_cs(css);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	/* allow moving tasks into an empty cpuset if on default hierarchy */
 	ret = -ENOSPC;
@@ -2195,7 +2217,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	cs->attach_in_progress++;
 	ret = 0;
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	return ret;
 }
 
@@ -2205,9 +2227,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 
 	cgroup_taskset_first(tset, &css);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	css_cs(css)->attach_in_progress--;
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 }
 
 /*
@@ -2231,22 +2253,20 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	cs = css_cs(css);
 
 	lockdep_assert_cpus_held();	/* see cgroup_attach_lock() */
-	percpu_down_write(&cpuset_rwsem);
-
-	/* prepare for attach */
-	if (cs == &top_cpuset)
-		cpumask_copy(cpus_attach, cpu_possible_mask);
-	else
-		guarantee_online_cpus(cs, cpus_attach);
+	mutex_lock(&cpuset_mutex);
 
 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
 
 	cgroup_taskset_for_each(task, css, tset) {
+		if (cs != &top_cpuset)
+			guarantee_online_cpus(task, cpus_attach);
+		else
+			cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
 		/*
 		 * can_attach beforehand should guarantee that this doesn't
 		 * fail.  TODO: have a better way to handle failure here
 		 */
-		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+		WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach));
 
 		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
 		cpuset_update_task_spread_flag(cs, task);
@@ -2285,7 +2305,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	if (!cs->attach_in_progress)
 		wake_up(&cpuset_attach_wq);
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -2317,7 +2337,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	int retval = 0;
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs)) {
 		retval = -ENODEV;
 		goto out_unlock;
@@ -2353,7 +2373,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	return retval;
 }
@@ -2366,7 +2386,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	int retval = -ENODEV;
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
@@ -2379,7 +2399,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	return retval;
 }
@@ -2420,7 +2440,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	flush_work(&cpuset_hotplug_work);
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
@@ -2444,7 +2464,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 
 	free_cpuset(trialcs);
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
@@ -2577,13 +2597,13 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
 
 	css_get(&cs->css);
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
 	retval = update_prstate(cs, val);
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	css_put(&cs->css);
 	return retval ?: nbytes;
@@ -2791,7 +2811,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		return 0;
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	set_bit(CS_ONLINE, &cs->flags);
 	if (is_spread_page(parent))
@@ -2843,7 +2863,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
 	spin_unlock_irq(&callback_lock);
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	return 0;
 }
@@ -2864,7 +2884,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	struct cpuset *cs = css_cs(css);
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	if (is_partition_root(cs))
 		update_prstate(cs, 0);
@@ -2883,7 +2903,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	cpuset_dec();
 	clear_bit(CS_ONLINE, &cs->flags);
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 }
 
@@ -2896,7 +2916,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 
 static void cpuset_bind(struct cgroup_subsys_state *root_css)
 {
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	spin_lock_irq(&callback_lock);
 
 	if (is_in_v2_mode()) {
@@ -2909,7 +2929,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 	}
 
 	spin_unlock_irq(&callback_lock);
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 }
 
 /*
@@ -2919,10 +2939,10 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 */
 static void cpuset_fork(struct task_struct *task)
 {
+	int inherit_cpus = 0;
 	if (task_css_is_root(task, cpuset_cgrp_id))
 		return;
 
-	set_cpus_allowed_ptr(task, current->cpus_ptr);
 	task->mems_allowed = current->mems_allowed;
 }
 
@@ -2951,7 +2971,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 
 int __init cpuset_init(void)
 {
-	BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
 
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL));
@@ -3026,7 +3045,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 	is_empty = cpumask_empty(cs->cpus_allowed) ||
 		   nodes_empty(cs->mems_allowed);
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 
 	/*
 	 * Move tasks to the nearest ancestor with execution resources,
@@ -3036,7 +3055,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 	if (is_empty)
 		remove_tasks_in_empty_cpuset(cs);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 }
 
 static void
@@ -3086,14 +3105,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 retry:
 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	/*
 	 * We have raced with task attaching. We wait until attaching
 	 * is finished, so we won't attach a task to an empty cpuset.
 	 */
 	if (cs->attach_in_progress) {
-		percpu_up_write(&cpuset_rwsem);
+		mutex_unlock(&cpuset_mutex);
 		goto retry;
 	}
 
@@ -3165,7 +3184,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
 					    cpus_updated, mems_updated);
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 }
 
 /**
@@ -3184,7 +3203,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */
-static void cpuset_hotplug_workfn(struct work_struct *work)
+void cpuset_hotplug_workfn(struct work_struct *work)
 {
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
@@ -3195,7 +3214,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
 		ptmp = &tmp;
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	/* fetch the available cpus/mems and find out which changed how */
 	cpumask_copy(&new_cpus, cpu_active_mask);
@@ -3252,7 +3271,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		update_tasks_nodemask(&top_cpuset);
 	}
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 
 	/* if cpus or mems changed, we need to propagate to descendants */
 	if (cpus_updated || mems_updated) {
@@ -3296,6 +3315,7 @@ void cpuset_wait_for_hotplug(void)
 {
 	flush_work(&cpuset_hotplug_work);
 }
+EXPORT_SYMBOL_GPL(cpuset_wait_for_hotplug);
 
 /*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
@@ -3354,11 +3374,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 
 	spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
-	guarantee_online_cpus(task_cs(tsk), pmask);
+	guarantee_online_cpus(tsk, pmask);
 	rcu_read_unlock();
 	spin_unlock_irqrestore(&callback_lock, flags);
 }
-
+EXPORT_SYMBOL_GPL(cpuset_cpus_allowed);
 /**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
@@ -3373,9 +3393,17 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
+	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+	const struct cpumask *cs_mask;
+
 	rcu_read_lock();
-	do_set_cpus_allowed(tsk, is_in_v2_mode() ?
-		task_cs(tsk)->cpus_allowed : cpu_possible_mask);
+	cs_mask = task_cs(tsk)->cpus_allowed;
+
+	if (!is_in_v2_mode() || !cpumask_subset(cs_mask, possible_mask))
+		goto unlock; /* select_fallback_rq will try harder */
+
+	do_set_cpus_allowed(tsk, cs_mask);
+unlock:
 	rcu_read_unlock();
 
 	/*
 
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 08236798d..081d026f1 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -479,3 +479,4 @@ struct cgroup_subsys freezer_cgrp_subsys = {
 	.fork		= freezer_fork,
 	.legacy_cftypes	= files,
 };
+EXPORT_SYMBOL_GPL(freezer_cgrp_subsys);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 4b27158d3..b076ccd1b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -39,6 +39,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/cpuhp.h>
 
+#undef CREATE_TRACE_POINTS
+
 #include "smpboot.h"
 
 /**
@@ -274,11 +276,13 @@ void cpu_maps_update_begin(void)
 {
 	mutex_lock(&cpu_add_remove_lock);
 }
+EXPORT_SYMBOL_GPL(cpu_maps_update_begin);
 
 void cpu_maps_update_done(void)
 {
 	mutex_unlock(&cpu_add_remove_lock);
 }
+EXPORT_SYMBOL_GPL(cpu_maps_update_done);
 
 /*
 * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
@@ -1053,7 +1057,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	int prev_state, ret = 0;
 
-	if (num_online_cpus() == 1)
+	if (num_active_cpus() == 1 && cpu_active(cpu))
 		return -EBUSY;
 
 	if (!cpu_present(cpu))
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index e2999a070..79cb6d063 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -200,6 +200,7 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
 	irq_gc_unlock(gc);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(irq_gc_set_wake);
 
 static u32 irq_readl_be(void __iomem *addr)
 {
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 5899260a8..466eaa74f 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,5 +1,17 @@
 # SPDX-License-Identifier: GPL-2.0
 
+CURRENT_DIR := $(abspath $(dir $(realpath $(lastword $(MAKEFILE_LIST)))))
+
+ifeq ($(PRODUCT_PATH),)
+$(error PRODUCT_PATH is not set)
+endif
+
+WEAKUP_DIR := ../../../../../../$(PRODUCT_PATH)/kernel_core/kernel/power
+ifeq ($(wildcard $(CURRENT_DIR)/$(WEAKUP_DIR)),)
+HCS_ABS_DIR := $(abspath $(CURRENT_DIR)/$(WEAKUP_DIR))
+$(error missing $(HCS_ABS_DIR) for standard system)
+endif
+
 ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG
 
 KASAN_SANITIZE_snapshot.o	:= n
@@ -17,4 +29,5 @@ obj-$(CONFIG_PM_WAKELOCKS)	+= wakelock.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
 
+obj-$(CONFIG_SUSPEND)	+= $(WEAKUP_DIR)/
 obj-$(CONFIG_ENERGY_MODEL)	+= energy_model.o
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 119b929dc..41430128d 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -52,6 +52,17 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
 }
 DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
 
+static int em_debug_units_show(struct seq_file *s, void *unused)
+{
+	struct em_perf_domain *pd = s->private;
+	char *units = pd->milliwatts ? "milliWatts" : "bogoWatts";
+
+	seq_printf(s, "%s\n", units);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_units);
+
 static void em_debug_create_pd(struct device *dev)
 {
 	struct dentry *d;
@@ -64,6 +75,8 @@ static void em_debug_create_pd(struct device *dev)
 	debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
 			    &em_debug_cpus_fops);
 
+	debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
+
 	/* Create a sub-directory for each performance state */
 	for (i = 0; i < dev->em_pd->nr_perf_states; i++)
 		em_debug_create_ps(&dev->em_pd->table[i], d);
@@ -245,17 +258,24 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
 * @cpus	: Pointer to cpumask_t, which in case of a CPU device is
 *		obligatory. It can be taken from i.e. 'policy->cpus'. For other
 *		type of devices this should be set to NULL.
+ * @milliwatts	: Flag indicating that the power values are in milliWatts or
+ *		in some other scale. It must be set properly.
 *
 * Create Energy Model tables for a performance domain using the callbacks
 * defined in cb.
 *
+ * The @milliwatts is important to set with correct value. Some kernel
+ * sub-systems might rely on this flag and check if all devices in the EM are
+ * using the same scale.
+ *
 * If multiple clients register the same performance domain, all but the first
 * registration will be ignored.
 *
 * Return 0 on success
 */
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
-				struct em_data_callback *cb, cpumask_t *cpus)
+				struct em_data_callback *cb, cpumask_t *cpus,
+				bool milliwatts)
 {
 	unsigned long cap, prev_cap = 0;
 	int cpu, ret;
@@ -308,6 +328,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 	if (ret)
 		goto unlock;
 
+	dev->em_pd->milliwatts = milliwatts;
+
 	em_debug_create_pd(dev);
 	dev_info(dev, "EM: created perf domain\n");
 
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bf640fd61..b13fe337f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -326,7 +326,7 @@ static int create_image(int platform_mode)
 
 	if (!in_suspend) {
 		events_check_enabled = false;
-		clear_free_pages();
+		clear_or_poison_free_pages();
 	}
 
 	platform_leave(platform_mode);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 24f12d534..778bf431e 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -106,7 +106,7 @@ extern int create_basic_memory_bitmaps(void);
 extern void free_basic_memory_bitmaps(void);
 extern int hibernate_preallocate_memory(void);
 
-extern void clear_free_pages(void);
+extern void clear_or_poison_free_pages(void);
 
 /**
 * Auxiliary structure used for reading the snapshot image data and
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 45b054b7b..cc0623080 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -85,18 +85,21 @@ static int try_to_freeze_tasks(bool user_only)
 	elapsed = ktime_sub(end, start);
 	elapsed_msecs = ktime_to_ms(elapsed);
 
-	if (todo) {
+	if (wakeup) {
 		pr_cont("\n");
-		pr_err("Freezing of tasks %s after %d.%03d seconds "
-		       "(%d tasks refusing to freeze, wq_busy=%d):\n",
-		       wakeup ? "aborted" : "failed",
+		pr_err("Freezing of tasks aborted after %d.%03d seconds",
+		       elapsed_msecs / 1000, elapsed_msecs % 1000);
+	} else if (todo) {
+		pr_cont("\n");
+		pr_err("Freezing of tasks failed after %d.%03d seconds"
+		       " (%d tasks refusing to freeze, wq_busy=%d):\n",
 		       elapsed_msecs / 1000, elapsed_msecs % 1000,
 		       todo - wq_busy, wq_busy);
 
 		if (wq_busy)
 			show_workqueue_state();
 
-		if (!wakeup || pm_debug_messages_on) {
+		if (pm_debug_messages_on) {
 			read_lock(&tasklist_lock);
 			for_each_process_thread(g, p) {
 				if (p != current && !freezer_should_skip(p)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 46b1804c1..a3491b29c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1144,7 +1144,15 @@ void free_basic_memory_bitmaps(void)
 	pr_debug("Basic memory bitmaps freed\n");
 }
 
-void clear_free_pages(void)
+static void clear_or_poison_free_page(struct page *page)
+{
+	if (page_poisoning_enabled_static())
+		__kernel_poison_pages(page, 1);
+	else if (want_init_on_free())
+		clear_highpage(page);
+}
+
+void clear_or_poison_free_pages(void)
 {
 	struct memory_bitmap *bm = free_pages_map;
 	unsigned long pfn;
@@ -1152,12 +1160,12 @@ void clear_free_pages(void)
 	if (WARN_ON(!(free_pages_map)))
 		return;
 
-	if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) {
+	if (page_poisoning_enabled() || want_init_on_free()) {
 		memory_bm_position_reset(bm);
 		pfn = memory_bm_next_pfn(bm);
 		while (pfn != BM_END_OF_MAP) {
 			if (pfn_valid(pfn))
-				clear_highpage(pfn_to_page(pfn));
+				clear_or_poison_free_page(pfn_to_page(pfn));
 
 			pfn = memory_bm_next_pfn(bm);
 		}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 32391acc8..545958377 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,6 +30,7 @@
 #include <trace/events/power.h>
 #include <linux/compiler.h>
 #include <linux/moduleparam.h>
+#include <linux/wakeup_reason.h>
 
 #include "power.h"
 
@@ -138,6 +139,7 @@ static void s2idle_loop(void)
 			break;
 		}
 
+		clear_wakeup_reasons();
 		s2idle_enter();
 	}
 
@@ -357,6 +359,7 @@ static int suspend_prepare(suspend_state_t state)
 	if (!error)
 		return 0;
 
+	log_suspend_abort_reason("One or more tasks refusing to freeze");
 	suspend_stats.failed_freeze++;
 	dpm_save_failed_step(SUSPEND_FREEZE);
 	pm_notifier_call_chain(PM_POST_SUSPEND);
@@ -386,7 +389,7 @@ void __weak arch_suspend_enable_irqs(void)
 */
 static int suspend_enter(suspend_state_t state, bool *wakeup)
 {
-	int error;
+	int error, last_dev;
 
 	error = platform_suspend_prepare(state);
 	if (error)
@@ -394,7 +397,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 
 	error = dpm_suspend_late(PMSG_SUSPEND);
 	if (error) {
+		last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+		last_dev %= REC_FAILED_NUM;
 		pr_err("late suspend of devices failed\n");
+		log_suspend_abort_reason("late suspend of %s device failed",
+					 suspend_stats.failed_devs[last_dev]);
 		goto Platform_finish;
 	}
 	error = platform_suspend_prepare_late(state);
@@ -403,7 +410,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 
 	error = dpm_suspend_noirq(PMSG_SUSPEND);
 	if (error) {
+		last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+		last_dev %= REC_FAILED_NUM;
 		pr_err("noirq suspend of devices failed\n");
+		log_suspend_abort_reason("noirq suspend of %s device failed",
+					 suspend_stats.failed_devs[last_dev]);
 		goto Platform_early_resume;
 	}
 	error = platform_suspend_prepare_noirq(state);
@@ -419,8 +430,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 	}
 
 	error = suspend_disable_secondary_cpus();
-	if (error || suspend_test(TEST_CPUS))
+	if (error || suspend_test(TEST_CPUS)) {
+		log_suspend_abort_reason("Disabling non-boot cpus failed");
 		goto Enable_cpus;
+	}
 
 	arch_suspend_disable_irqs();
 	BUG_ON(!irqs_disabled());
@@ -491,6 +504,8 @@ int suspend_devices_and_enter(suspend_state_t state)
 	error = dpm_suspend_start(PMSG_SUSPEND);
 	if (error) {
 		pr_err("Some devices failed to suspend, or early wake event detected\n");
+		log_suspend_abort_reason(
+				"Some devices failed to suspend, or early wake event detected");
 		goto Recover_platform;
 	}
 	suspend_test_finish("suspend devices");
diff --git a/kernel/reboot.c b/kernel/reboot.c
index af6f23d8b..bce629531 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -215,6 +215,27 @@ void do_kernel_restart(char *cmd)
 	atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
 }
 
+#ifdef CONFIG_NO_GKI
+static ATOMIC_NOTIFIER_HEAD(pre_restart_handler_list);
+
+int register_pre_restart_handler(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&pre_restart_handler_list, nb);
+}
+EXPORT_SYMBOL(register_pre_restart_handler);
+
+int unregister_pre_restart_handler(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&pre_restart_handler_list, nb);
+}
+EXPORT_SYMBOL(unregister_pre_restart_handler);
+
+void do_kernel_pre_restart(char *cmd)
+{
+	atomic_notifier_call_chain(&pre_restart_handler_list, reboot_mode, cmd);
+}
+#endif
+
 void migrate_to_reboot_cpu(void)
 {
 	/* The boot cpu is always logical cpu 0 */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e2f00be4b..750da3e7c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -47,6 +47,13 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
+#ifdef CONFIG_SCHEDSTATS
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
+#endif
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -660,7 +667,7 @@ int get_nohz_timer_target(void)
 	int i, cpu = smp_processor_id(), default_cpu = -1;
 	struct sched_domain *sd;
 
-	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+	if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
 		if (!idle_cpu(cpu))
 			return cpu;
 		default_cpu = cpu;
@@ -680,8 +687,25 @@ int get_nohz_timer_target(void)
 		}
 	}
 
-	if (default_cpu == -1)
-		default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+	if (default_cpu == -1) {
+		for_each_cpu_and(i, cpu_active_mask,
+				 housekeeping_cpumask(HK_FLAG_TIMER)) {
+			if (cpu == i)
+				continue;
+
+			if (!idle_cpu(i)) {
+				cpu = i;
+				goto unlock;
+			}
+		}
+
+		/* no active, not-idle, housekeeping CPU found. */
+		default_cpu = cpumask_any(cpu_active_mask);
+
+		if (unlikely(default_cpu >= nr_cpu_ids))
+			goto unlock;
+	}
+
 	cpu = default_cpu;
 unlock:
 	rcu_read_unlock();
@@ -1770,7 +1794,10 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
 	if (is_per_cpu_kthread(p))
 		return cpu_online(cpu);
 
-	return cpu_active(cpu);
+	if (!cpu_active(cpu))
+		return false;
+
+	return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
 }
 
 /*
@@ -2433,10 +2460,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		}
 		fallthrough;
 	case possible:
-		do_set_cpus_allowed(p, cpu_possible_mask);
+		do_set_cpus_allowed(p, task_cpu_possible_mask(p));
 		state = fail;
 		break;
-
 	case fail:
 #ifdef CONFIG_CPU_ISOLATION_OPT
 		allow_iso = true;
@@ -2627,6 +2653,9 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 {
 	int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
 
+	if (wake_flags & WF_SYNC)
+		en_flags |= ENQUEUE_WAKEUP_SYNC;
+
 	lockdep_assert_held(&rq->lock);
 
 	if (p->sched_contributes_to_load)
@@ -3019,6 +3048,19 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	if (!(p->state & state))
 		goto unlock;
 
+#ifdef CONFIG_FREEZER
+	/*
+	 * If we're going to wake up a thread which may be frozen, then
+	 * we can only do so if we have an active CPU which is capable of
+	 * running it. This may not be the case when resuming from suspend,
+	 * as the secondary CPUs may not yet be back online. See __thaw_task()
+	 * for the actual wakeup.
+	 */
+	if (unlikely(frozen_or_skipped(p)) &&
+	    !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
+		goto unlock;
+#endif
+
 	trace_sched_waking(p);
 
 	/* We're going to change ->state: */
@@ -5004,7 +5046,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC));
 	return try_to_wake_up(curr->private, mode, wake_flags);
 }
 EXPORT_SYMBOL(default_wake_function);
@@ -5713,16 +5755,19 @@ int sched_setscheduler(struct task_struct *p, int policy,
 {
 	return _sched_setscheduler(p, policy, param, true);
 }
+EXPORT_SYMBOL_GPL(sched_setscheduler);
 
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
 	return __sched_setscheduler(p, attr, true, true);
 }
+EXPORT_SYMBOL_GPL(sched_setattr);
 
 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
 {
 	return __sched_setscheduler(p, attr, false, true);
 }
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
 
 /**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
@@ -5742,6 +5787,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 {
 	return _sched_setscheduler(p, policy, param, false);
 }
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
 
 /*
 * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
@@ -7044,6 +7090,11 @@ void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf,
 	 */
 	update_rq_clock(rq);
 
+#ifdef CONFIG_SCHED_DEBUG
+	/* note the clock update in orf */
+	orf.clock_update_flags |= RQCF_UPDATED;
+#endif
+
 	for (;;) {
 		/*
 		 * There's this thread running, bail when that's the only
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 81e43a56d..4df7f4e68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -86,6 +86,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
 unsigned int sysctl_sched_min_granularity			= 750000ULL;
+EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
 static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
 
 /*
@@ -10686,9 +10687,20 @@ void nohz_balance_enter_idle(int cpu)
 
 	SCHED_WARN_ON(cpu != smp_processor_id());
 
-	/* If this CPU is going down, then nothing needs to be done: */
-	if (!cpu_active(cpu))
+	if (!cpu_active(cpu)) {
+		/*
+		 * A CPU can be paused while it is idle with its tick
+		 * stopped. nohz_balance_exit_idle() should be called
+		 * from the local CPU, so it can't be called during
+		 * pause. This results in paused CPU participating in
+		 * the nohz idle balance, which should be avoided.
+		 *
+		 * When the paused CPU exits idle and enters again,
+		 * exempt the paused CPU from nohz_balance_exit_idle.
+		 */
+		nohz_balance_exit_idle(rq);
 		return;
+	}
 
 	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2593a733c..69afd8d1e 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -450,6 +450,7 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
 {
 	raw_spin_unlock_irq(&rq->lock);
 	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+
 	dump_stack();
 	raw_spin_lock_irq(&rq->lock);
 }
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index d2a655643..b5837e277 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -75,6 +75,7 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 	loads[1] = (avenrun[1] + offset) << shift;
 	loads[2] = (avenrun[2] + offset) << shift;
 }
+EXPORT_SYMBOL_GPL(get_avenrun);
 
 long calc_load_fold_active(struct rq *this_rq, long adjust)
 {
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 2c613e1cf..e2890b677 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,6 +28,42 @@
 #include "sched.h"
 #include "pelt.h"
 
+int pelt_load_avg_period = PELT32_LOAD_AVG_PERIOD;
+int pelt_load_avg_max = PELT32_LOAD_AVG_MAX;
+const u32 *pelt_runnable_avg_yN_inv = pelt32_runnable_avg_yN_inv;
+
+static int __init set_pelt(char *str)
+{
+	int rc, num;
+
+	rc = kstrtoint(str, 0, &num);
+	if (rc) {
+		pr_err("%s: kstrtoint failed. rc=%d\n", __func__, rc);
+		return 0;
+	}
+
+	switch (num) {
+	case PELT8_LOAD_AVG_PERIOD:
+		pelt_load_avg_period = PELT8_LOAD_AVG_PERIOD;
+		pelt_load_avg_max = PELT8_LOAD_AVG_MAX;
+		pelt_runnable_avg_yN_inv = pelt8_runnable_avg_yN_inv;
+		pr_info("PELT half life is set to %dms\n", num);
+		break;
+	case PELT32_LOAD_AVG_PERIOD:
+		pelt_load_avg_period = PELT32_LOAD_AVG_PERIOD;
+		pelt_load_avg_max = PELT32_LOAD_AVG_MAX;
+		pelt_runnable_avg_yN_inv = pelt32_runnable_avg_yN_inv;
+		pr_info("PELT half life is set to %dms\n", num);
+		break;
+	default:
+		pr_err("Default PELT half life is 32ms\n");
+	}
+
+	return 0;
+}
+
+early_param("pelt", set_pelt);
+
 /*
 * Approximate:
 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
@@ -54,7 +90,7 @@ static u64 decay_load(u64 val, u64 n)
 		local_n %= LOAD_AVG_PERIOD;
 	}
 
-	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+	val = mul_u64_u32_shr(val, pelt_runnable_avg_yN_inv[local_n], 32);
 	return val;
 }
 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index b7f38f3ad..b0e6c438f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -550,7 +550,7 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	return now + group->poll_min_period;
 }
 
-/* Schedule polling if it's not already scheduled. */
+/* Schedule polling if it's not already scheduled or forced. */
 static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
 {
 	struct task_struct *task;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d5c00fa02..689cc1a63 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1393,6 +1393,27 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 	enqueue_top_rt_rq(&rq->rt);
 }
 
+#ifdef CONFIG_SMP
+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
+					bool sync)
+{
+	/*
+	 * If the waker is CFS, then an RT sync wakeup would preempt the waker
+	 * and force it to run for a likely small time after the RT wakee is
+	 * done. So, only honor RT sync wakeups from RT wakers.
+	 */
+	return sync && task_has_rt_policy(rq->curr) &&
+		p->prio <= rq->rt.highest_prio.next &&
+		rq->rt.rt_nr_running <= 2;
+}
+#else
+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
+					bool sync)
+{
+	return 0;
+}
+#endif
+
 /*
 * Adding/removing a task to/from a priority array:
 */
@@ -1400,6 +1421,7 @@ static void
 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
+	bool sync = !!(flags & ENQUEUE_WAKEUP_SYNC);
 
 	if (flags & ENQUEUE_WAKEUP)
 		rt_se->timeout = 0;
@@ -1407,7 +1429,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 	enqueue_rt_entity(rt_se, flags);
 	walt_inc_cumulative_runnable_avg(rq, p);
 
-	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+	if (!task_current(rq, p) && p->nr_cpus_allowed > 1 &&
+	    !should_honor_rt_sync(rq, p, sync))
 		enqueue_pushable_task(rq, p);
 }
 
@@ -1464,7 +1487,10 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
 	struct task_struct *curr;
 	struct rq *rq;
+	struct rq *this_cpu_rq;
 	bool test;
+	bool sync = !!(flags & WF_SYNC);
+	int this_cpu;
 
 	/* For anything but wake ups, just return the task_cpu */
 	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1474,6 +1500,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 
 	rcu_read_lock();
 	curr = READ_ONCE(rq->curr); /* unlocked access */
+	this_cpu = smp_processor_id();
+	this_cpu_rq = cpu_rq(this_cpu);
 
 	/*
 	 * If the current task on @p's runqueue is an RT task, then
@@ -1508,6 +1536,15 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 	test |= sysctl_sched_enable_rt_cas;
 #endif
 
+	/*
+	 * Respect the sync flag as long as the task can run on this CPU.
+	 */
+	if (should_honor_rt_sync(this_cpu_rq, p, sync) &&
+	    cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
+		cpu = this_cpu;
+		goto out_unlock;
+	}
+
 	if (test || !rt_task_fits_capacity(p, cpu)) {
 		int target = find_lowest_rq(p);
 
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index c529706be..92a6875bc 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Generated by Documentation/scheduler/sched-pelt; do not modify. */
 
-static const u32 runnable_avg_yN_inv[] __maybe_unused = {
+static const u32 pelt32_runnable_avg_yN_inv[] __maybe_unused = {
 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
@@ -10,5 +10,20 @@ static const u32 runnable_avg_yN_inv[] __maybe_unused = {
 	0x85aac367, 0x82cd8698,
 };
 
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742
+#define PELT32_LOAD_AVG_PERIOD 32
+#define PELT32_LOAD_AVG_MAX 47742
+
+static const u32 pelt8_runnable_avg_yN_inv[] __maybe_unused = {
+	0xffffffff, 0xeac0c6e6, 0xd744fcc9, 0xc5672a10,
+	0xb504f333, 0xa5fed6a9, 0x9837f050, 0x8b95c1e3,
+};
+
+#define PELT8_LOAD_AVG_PERIOD 8
+#define PELT8_LOAD_AVG_MAX 12336
+
+extern const u32 *pelt_runnable_avg_yN_inv;
+extern int pelt_load_avg_period;
+extern int pelt_load_avg_max;
+
+#define LOAD_AVG_PERIOD pelt_load_avg_period
+#define LOAD_AVG_MAX pelt_load_avg_max
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 592c8653c..7c02fed0a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1913,6 +1913,8 @@ extern const int sched_latency_to_weight[40];
 #define ENQUEUE_MIGRATED	0x00
 #endif
 
+#define ENQUEUE_WAKEUP_SYNC	0x80
+
 #define RETRY_TASK		((void *)-1UL)
 
 struct sched_class {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9191e5daa..58d840c62 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -5,6 +5,9 @@
 #include "sched.h"
 
 DEFINE_MUTEX(sched_domains_mutex);
+#ifdef CONFIG_LOCKDEP
+EXPORT_SYMBOL_GPL(sched_domains_mutex);
+#endif
 
 /* Protected by sched_domains_mutex: */
 static cpumask_var_t sched_domains_tmpmask;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index a55642aa3..6911bbca0 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -396,7 +396,8 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en
 }
 EXPORT_SYMBOL(finish_wait);
 
-int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
+__sched int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
+				     int sync, void *key)
 {
 	int ret = default_wake_function(wq_entry, mode, sync, key);
 
@@ -432,7 +433,7 @@ static inline bool is_kthread_should_stop(void)
 *     }                                             smp_mb(); // C
 *     remove_wait_queue(&wq_head, &wait);           wq_entry->flags |= WQ_FLAG_WOKEN;
 */
-long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
+__sched long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, long timeout)
 {
 	/*
 	 * The below executes an smp_mb(), which matches with the full barrier
@@ -457,7 +458,8 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
 }
 EXPORT_SYMBOL(wait_woken);
 
-int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
+__sched int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
+				int sync, void *key)
 {
 	/* Pairs with the smp_store_mb() in wait_woken(). */
 	smp_mb(); /* C */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 3e6740207..033fa94f3 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -27,6 +27,7 @@
 * Structure to determine completion condition and record errors.  May
 * be shared by works on different cpus.
 */
+
 struct cpu_stop_done {
 	atomic_t		nr_todo;	/* nr left to execute */
 	int			ret;		/* collected return value */
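
Editor's note on the energy_model.c hunk above: em_dev_register_perf_domain() gains a trailing "milliwatts" argument, so every existing caller needs to pass it. A minimal caller sketch follows; the driver, callback and nr_states values are made up for illustration, and the active_power() callback signature is assumed to be the v5.10-era (power, freq, dev) form.

	/* Hypothetical caller -- not part of this patch. */
	#include <linux/energy_model.h>

	static int my_active_power(unsigned long *mW, unsigned long *KHz,
				   struct device *dev)
	{
		/* fill in *mW and *KHz for one performance state */
		return 0;
	}

	static int my_register_em(struct device *dev, cpumask_t *cpus)
	{
		struct em_data_callback em_cb = EM_DATA_CB(my_active_power);

		/* last argument is the new 'milliwatts' scale flag */
		return em_dev_register_perf_domain(dev, 4, &em_cb, cpus, true);
	}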
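
Editor's note on the kernel/reboot.c hunk: the CONFIG_NO_GKI block adds a pre-restart notifier chain that mirrors the existing restart handler chain. A minimal client sketch, assuming a hypothetical driver that wants to quiesce hardware before the real restart handlers run:

	/* Hypothetical client of the pre-restart chain -- not part of this patch. */
	#include <linux/notifier.h>
	#include <linux/reboot.h>

	static int my_pre_restart(struct notifier_block *nb,
				  unsigned long mode, void *cmd)
	{
		/* quiesce device state before do_kernel_restart() runs */
		return NOTIFY_DONE;
	}

	static struct notifier_block my_pre_restart_nb = {
		.notifier_call = my_pre_restart,
	};

	static int __init my_pre_restart_init(void)
	{
		return register_pre_restart_handler(&my_pre_restart_nb);
	}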
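
Editor's note on the pelt.c/sched-pelt.h hunks: set_pelt() is wired up as an early_param, so the PELT half-life is chosen on the kernel command line; per the switch statement only 8 and 32 are accepted, anything else keeps the 32 ms default. For example:

	pelt=8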