diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 209e6567c..d47c0212e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -128,21 +128,6 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 	return map;
 }
 
-static void bpf_map_write_active_inc(struct bpf_map *map)
-{
-	atomic64_inc(&map->writecnt);
-}
-
-static void bpf_map_write_active_dec(struct bpf_map *map)
-{
-	atomic64_dec(&map->writecnt);
-}
-
-bool bpf_map_write_active(const struct bpf_map *map)
-{
-	return atomic64_read(&map->writecnt) != 0;
-}
-
 static u32 bpf_map_value_size(struct bpf_map *map)
 {
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -604,8 +589,11 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma)
 {
 	struct bpf_map *map = vma->vm_file->private_data;
 
-	if (vma->vm_flags & VM_MAYWRITE)
-		bpf_map_write_active_inc(map);
+	if (vma->vm_flags & VM_MAYWRITE) {
+		mutex_lock(&map->freeze_mutex);
+		map->writecnt++;
+		mutex_unlock(&map->freeze_mutex);
+	}
 }
 
 /* called for all unmapped memory region (including initial) */
@@ -613,8 +601,11 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma)
 {
 	struct bpf_map *map = vma->vm_file->private_data;
 
-	if (vma->vm_flags & VM_MAYWRITE)
-		bpf_map_write_active_dec(map);
+	if (vma->vm_flags & VM_MAYWRITE) {
+		mutex_lock(&map->freeze_mutex);
+		map->writecnt--;
+		mutex_unlock(&map->freeze_mutex);
+	}
 }
 
 static const struct vm_operations_struct bpf_map_default_vmops = {
@@ -664,7 +655,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
 		goto out;
 
 	if (vma->vm_flags & VM_MAYWRITE)
-		bpf_map_write_active_inc(map);
+		map->writecnt++;
 out:
 	mutex_unlock(&map->freeze_mutex);
 	return err;
@@ -1096,7 +1087,6 @@ static int map_update_elem(union bpf_attr *attr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	bpf_map_write_active_inc(map);
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
 		goto err_put;
@@ -1138,7 +1128,6 @@ static int map_update_elem(union bpf_attr *attr)
 free_key:
 	kfree(key);
 err_put:
-	bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
@@ -1161,7 +1150,6 @@ static int map_delete_elem(union bpf_attr *attr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	bpf_map_write_active_inc(map);
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
 		goto err_put;
@@ -1192,7 +1180,6 @@ static int map_delete_elem(union bpf_attr *attr)
 out:
 	kfree(key);
 err_put:
-	bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
@@ -1497,7 +1484,6 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	bpf_map_write_active_inc(map);
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
 	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
@@ -1539,7 +1525,6 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 free_key:
 	kfree(key);
 err_put:
-	bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
@@ -1566,7 +1551,8 @@ static int map_freeze(const union bpf_attr *attr)
 	}
 
 	mutex_lock(&map->freeze_mutex);
-	if (bpf_map_write_active(map)) {
+
+	if (map->writecnt) {
 		err = -EBUSY;
 		goto err_put;
 	}
@@ -3991,9 +3977,6 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 				    union bpf_attr __user *uattr,
 				    int cmd)
 {
-	bool has_read  = cmd == BPF_MAP_LOOKUP_BATCH ||
-			 cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
-	bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
 	struct bpf_map *map;
 	int err, ufd;
 	struct fd f;
@@ -4006,13 +3989,16 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	if (has_write)
-		bpf_map_write_active_inc(map);
-	if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+
+	if ((cmd == BPF_MAP_LOOKUP_BATCH ||
+	     cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
+	    !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
 		err = -EPERM;
 		goto err_put;
 	}
-	if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+
+	if (cmd != BPF_MAP_LOOKUP_BATCH &&
+	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
 		goto err_put;
 	}
@@ -4025,9 +4011,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 		BPF_DO_BATCH(map->ops->map_update_batch);
 	else
 		BPF_DO_BATCH(map->ops->map_delete_batch);
+
 err_put:
-	if (has_write)
-		bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8de769745..3e854b91f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3492,22 +3492,7 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
 
 static bool bpf_map_is_rdonly(const struct bpf_map *map)
 {
-	/* A map is considered read-only if the following condition are true:
-	 *
-	 * 1) BPF program side cannot change any of the map content. The
-	 *    BPF_F_RDONLY_PROG flag is throughout the lifetime of a map
-	 *    and was set at map creation time.
-	 * 2) The map value(s) have been initialized from user space by a
-	 *    loader and then "frozen", such that no new map update/delete
-	 *    operations from syscall side are possible for the rest of
-	 *    the map's lifetime from that point onwards.
-	 * 3) Any parallel/pending map update/delete operations from syscall
-	 *    side have been completed. Only after that point, it's safe to
-	 *    assume that map value(s) are immutable.
-	 */
-	return (map->map_flags & BPF_F_RDONLY_PROG) &&
-	       READ_ONCE(map->frozen) &&
-	       !bpf_map_write_active(map);
+	return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen;
 }
 
 static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index d9f8a464b..cddc908bc 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -518,7 +518,8 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
 	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
 #endif
 	    !uid_eq(cred->euid, tcred->uid) &&
-	    !uid_eq(cred->euid, tcred->suid))
+	    !uid_eq(cred->euid, tcred->suid) &&
+	    !ns_capable(tcred->user_ns, CAP_SYS_NICE))
 		ret = -EACCES;
 	put_cred(tcred);
 	if (ret)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 3173fe473..f4d318733 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -335,6 +335,8 @@ static struct cpuset top_cpuset = {
  * guidelines for accessing subsystem state in kernel/cgroup.c
  */
 
+static DEFINE_MUTEX(cpuset_mutex);
+
 DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
 
 void cpuset_read_lock(void)
@@ -352,9 +354,9 @@ static DEFINE_SPINLOCK(callback_lock);
 static struct workqueue_struct *cpuset_migrate_mm_wq;
 
 /*
- * CPU / memory hotplug is handled asynchronously.
+ * CPU / memory hotplug is handled asynchronously
+ * for hotplug, synchronously for resume_cpus
  */
-static void cpuset_hotplug_workfn(struct work_struct *work);
 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
 
 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
@@ -374,18 +376,29 @@ static inline bool is_in_v2_mode(void)
 }
 
 /*
- * Return in pmask the portion of a cpusets's cpus_allowed that
- * are online.  If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.
+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
+ * are online and are capable of running the task.  If none are found,
+ * walk up the cpuset hierarchy until we find one that does have some
+ * appropriate cpus.
  *
  * One way or another, we guarantee to return some non-empty subset
- * of cpu_online_mask.
+ * of cpu_active_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
+static void guarantee_online_cpus(struct task_struct *tsk,
+				  struct cpumask *pmask)
 {
-	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
+	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+	struct cpuset *cs;
+
+	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
+		cpumask_copy(pmask, cpu_active_mask);
+
+	rcu_read_lock();
+	cs = task_cs(tsk);
+
+	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
 		cs = parent_cs(cs);
 		if (unlikely(!cs)) {
 			/*
@@ -395,11 +408,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 			 * cpuset's effective_cpus is on its way to be
 			 * identical to cpu_online_mask.
 			 */
-			cpumask_copy(pmask, cpu_online_mask);
-			return;
+			goto out_unlock;
 		}
 	}
-	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
+	cpumask_and(pmask, pmask, cs->effective_cpus);
+
+out_unlock:
+	rcu_read_unlock();
 }
 
 /*
@@ -490,6 +505,9 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 	if (cs && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
 		goto free_three;
 
+	if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
+		goto free_three;
+
 	return 0;
 
 free_three:
@@ -940,7 +958,7 @@ static void rebuild_root_domains(void)
 	struct cpuset *cs = NULL;
 	struct cgroup_subsys_state *pos_css;
 
-	percpu_rwsem_assert_held(&cpuset_rwsem);
+	lockdep_assert_held(&cpuset_mutex);
 	lockdep_assert_cpus_held();
 	lockdep_assert_held(&sched_domains_mutex);
 
@@ -1000,8 +1018,7 @@ static void rebuild_sched_domains_locked(void)
 	struct cpuset *cs;
 	int ndoms;
 
-	lockdep_assert_cpus_held();
-	percpu_rwsem_assert_held(&cpuset_rwsem);
+	lockdep_assert_held(&cpuset_mutex);
 
 	/*
 	 * If we have raced with CPU hotplug, return early to avoid
@@ -1052,12 +1069,18 @@ static void rebuild_sched_domains_locked(void)
 void rebuild_sched_domains(void)
 {
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	rebuild_sched_domains_locked();
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 }
 
+static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p,
+			       const struct cpumask *new_mask)
+{
+	return set_cpus_allowed_ptr(p, new_mask);
+}
+
 /**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -1080,7 +1103,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
 		if (top_cs && (task->flags & PF_KTHREAD) &&
 		    kthread_is_per_cpu(task))
 			continue;
-		set_cpus_allowed_ptr(task, cs->effective_cpus);
+		update_cpus_allowed(cs, task, cs->effective_cpus);
 	}
 	css_task_iter_end(&it);
 }
@@ -1105,8 +1128,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus,
 		cpumask_and(new_cpus, new_cpus, cs->cpus_requested);
 		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
 	} else {
-		cpumask_and(new_cpus, cs->cpus_requested,
-			    parent->effective_cpus);
+		cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
 	}
 }
 
@@ -1171,7 +1193,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 	int new_prs;
 	bool part_error = false;	/* Partition error? */
 
-	percpu_rwsem_assert_held(&cpuset_rwsem);
+	lockdep_assert_held(&cpuset_mutex);
 
 	/*
 	 * The parent must be a partition root.
@@ -2171,7 +2193,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
 	cs = css_cs(css);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	/* allow moving tasks into an empty cpuset if on default hierarchy */
 	ret = -ENOSPC;
@@ -2195,7 +2217,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	cs->attach_in_progress++;
 	ret = 0;
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	return ret;
 }
 
@@ -2205,9 +2227,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 
 	cgroup_taskset_first(tset, &css);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	css_cs(css)->attach_in_progress--;
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 }
 
 /*
@@ -2231,22 +2253,20 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	cs = css_cs(css);
 
 	lockdep_assert_cpus_held();	/* see cgroup_attach_lock() */
-	percpu_down_write(&cpuset_rwsem);
-
-	/* prepare for attach */
-	if (cs == &top_cpuset)
-		cpumask_copy(cpus_attach, cpu_possible_mask);
-	else
-		guarantee_online_cpus(cs, cpus_attach);
+	mutex_lock(&cpuset_mutex);
 
 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
 
 	cgroup_taskset_for_each(task, css, tset) {
+		if (cs != &top_cpuset)
+			guarantee_online_cpus(task, cpus_attach);
+		else
+			cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
 		/*
 		 * can_attach beforehand should guarantee that this doesn't
 		 * fail.  TODO: have a better way to handle failure here
 		 */
-		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+		WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach));
 
 		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
 		cpuset_update_task_spread_flag(cs, task);
@@ -2285,7 +2305,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	if (!cs->attach_in_progress)
 		wake_up(&cpuset_attach_wq);
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -2317,7 +2337,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	int retval = 0;
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs)) {
 		retval = -ENODEV;
 		goto out_unlock;
@@ -2353,7 +2373,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	return retval;
 }
@@ -2366,7 +2386,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	int retval = -ENODEV;
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
@@ -2379,7 +2399,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	return retval;
 }
@@ -2420,7 +2440,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	flush_work(&cpuset_hotplug_work);
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
@@ -2444,7 +2464,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 
 	free_cpuset(trialcs);
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
@@ -2577,13 +2597,13 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
 
 	css_get(&cs->css);
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
 	retval = update_prstate(cs, val);
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	css_put(&cs->css);
 	return retval ?: nbytes;
@@ -2791,7 +2811,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		return 0;
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	set_bit(CS_ONLINE, &cs->flags);
 	if (is_spread_page(parent))
@@ -2843,7 +2863,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
 	spin_unlock_irq(&callback_lock);
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	return 0;
 }
@@ -2864,7 +2884,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	struct cpuset *cs = css_cs(css);
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	if (is_partition_root(cs))
 		update_prstate(cs, 0);
@@ -2883,7 +2903,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	cpuset_dec();
 	clear_bit(CS_ONLINE, &cs->flags);
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 }
 
@@ -2896,7 +2916,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 
 static void cpuset_bind(struct cgroup_subsys_state *root_css)
 {
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	spin_lock_irq(&callback_lock);
 
 	if (is_in_v2_mode()) {
@@ -2909,7 +2929,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 	}
 
 	spin_unlock_irq(&callback_lock);
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 }
 
 /*
@@ -2919,10 +2939,10 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 */
 static void cpuset_fork(struct task_struct *task)
 {
+	int inherit_cpus = 0;
 	if (task_css_is_root(task, cpuset_cgrp_id))
 		return;
 
-	set_cpus_allowed_ptr(task, current->cpus_ptr);
 	task->mems_allowed = current->mems_allowed;
 }
 
@@ -2951,7 +2971,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 
 int __init cpuset_init(void)
 {
-	BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
 
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL));
@@ -3026,7 +3045,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 	is_empty = cpumask_empty(cs->cpus_allowed) ||
 		   nodes_empty(cs->mems_allowed);
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 
 	/*
 	 * Move tasks to the nearest ancestor with execution resources,
@@ -3036,7 +3055,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 	if (is_empty)
 		remove_tasks_in_empty_cpuset(cs);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 }
 
 static void
@@ -3086,14 +3105,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 retry:
 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	/*
 	 * We have raced with task attaching. We wait until attaching
 	 * is finished, so we won't attach a task to an empty cpuset.
 	 */
 	if (cs->attach_in_progress) {
-		percpu_up_write(&cpuset_rwsem);
+		mutex_unlock(&cpuset_mutex);
 		goto retry;
 	}
 
@@ -3165,7 +3184,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
 					    cpus_updated, mems_updated);
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 }
 
 /**
@@ -3184,7 +3203,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */
-static void cpuset_hotplug_workfn(struct work_struct *work)
+void cpuset_hotplug_workfn(struct work_struct *work)
 {
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
@@ -3195,7 +3214,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
 		ptmp = &tmp;
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	/* fetch the available cpus/mems and find out which changed how */
 	cpumask_copy(&new_cpus, cpu_active_mask);
@@ -3252,7 +3271,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		update_tasks_nodemask(&top_cpuset);
 	}
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 
 	/* if cpus or mems changed, we need to propagate to descendants */
 	if (cpus_updated || mems_updated) {
@@ -3296,6 +3315,7 @@ void cpuset_wait_for_hotplug(void)
 {
 	flush_work(&cpuset_hotplug_work);
 }
+EXPORT_SYMBOL_GPL(cpuset_wait_for_hotplug);
 
 /*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
@@ -3354,11 +3374,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 
 	spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
-	guarantee_online_cpus(task_cs(tsk), pmask);
+	guarantee_online_cpus(tsk, pmask);
 	rcu_read_unlock();
 	spin_unlock_irqrestore(&callback_lock, flags);
 }
-
+EXPORT_SYMBOL_GPL(cpuset_cpus_allowed);
 /**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
@@ -3373,9 +3393,17 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
+	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+	const struct cpumask *cs_mask;
+
 	rcu_read_lock();
-	do_set_cpus_allowed(tsk, is_in_v2_mode() ?
-		task_cs(tsk)->cpus_allowed : cpu_possible_mask);
+	cs_mask = task_cs(tsk)->cpus_allowed;
+
+	if (!is_in_v2_mode() || !cpumask_subset(cs_mask, possible_mask))
+		goto unlock; /* select_fallback_rq will try harder */
+
+	do_set_cpus_allowed(tsk, cs_mask);
+unlock:
 	rcu_read_unlock();
 
 	/*
 
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 08236798d..081d026f1 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -479,3 +479,4 @@ struct cgroup_subsys freezer_cgrp_subsys = {
 	.fork		= freezer_fork,
 	.legacy_cftypes	= files,
 };
+EXPORT_SYMBOL_GPL(freezer_cgrp_subsys);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 4b27158d3..b076ccd1b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -39,6 +39,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/cpuhp.h>
 
+#undef CREATE_TRACE_POINTS
+
 #include "smpboot.h"
 
 /**
@@ -274,11 +276,13 @@ void cpu_maps_update_begin(void)
 {
 	mutex_lock(&cpu_add_remove_lock);
 }
+EXPORT_SYMBOL_GPL(cpu_maps_update_begin);
 
 void cpu_maps_update_done(void)
 {
 	mutex_unlock(&cpu_add_remove_lock);
 }
+EXPORT_SYMBOL_GPL(cpu_maps_update_done);
 
 /*
 * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
@@ -1053,7 +1057,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	int prev_state, ret = 0;
 
-	if (num_online_cpus() == 1)
+	if (num_active_cpus() == 1 && cpu_active(cpu))
 		return -EBUSY;
 
 	if (!cpu_present(cpu))
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index e2999a070..79cb6d063 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -200,6 +200,7 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
 	irq_gc_unlock(gc);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(irq_gc_set_wake);
 
 static u32 irq_readl_be(void __iomem *addr)
 {
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 5899260a8..466eaa74f 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,5 +1,17 @@
 # SPDX-License-Identifier: GPL-2.0
 
+CURRENT_DIR := $(abspath $(dir $(realpath $(lastword $(MAKEFILE_LIST)))))
+
+ifeq ($(PRODUCT_PATH),)
+$(error PRODUCT_PATH is not set)
+endif
+
+WEAKUP_DIR := ../../../../../../$(PRODUCT_PATH)/kernel_core/kernel/power
+ifeq ($(wildcard $(CURRENT_DIR)/$(WEAKUP_DIR)),)
+HCS_ABS_DIR := $(abspath $(CURRENT_DIR)/$(WEAKUP_DIR))
+$(error missing $(HCS_ABS_DIR) for standard system)
+endif
+
 ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG
 
 KASAN_SANITIZE_snapshot.o	:= n
@@ -17,4 +29,5 @@ obj-$(CONFIG_PM_WAKELOCKS)	+= wakelock.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
 
+obj-$(CONFIG_SUSPEND)	+= $(WEAKUP_DIR)/
 obj-$(CONFIG_ENERGY_MODEL)	+= energy_model.o
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 119b929dc..41430128d 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -52,6 +52,17 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
 }
 DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
 
+static int em_debug_units_show(struct seq_file *s, void *unused)
+{
+	struct em_perf_domain *pd = s->private;
+	char *units = pd->milliwatts ? "milliWatts" : "bogoWatts";
+
+	seq_printf(s, "%s\n", units);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_units);
+
 static void em_debug_create_pd(struct device *dev)
 {
 	struct dentry *d;
@@ -64,6 +75,8 @@ static void em_debug_create_pd(struct device *dev)
 	debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
 			    &em_debug_cpus_fops);
 
+	debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
+
 	/* Create a sub-directory for each performance state */
 	for (i = 0; i < dev->em_pd->nr_perf_states; i++)
 		em_debug_create_ps(&dev->em_pd->table[i], d);
@@ -245,17 +258,24 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
 * @cpus	: Pointer to cpumask_t, which in case of a CPU device is
 *		obligatory. It can be taken from i.e. 'policy->cpus'. For other
 *		type of devices this should be set to NULL.
+ * @milliwatts	: Flag indicating that the power values are in milliWatts or
+ *		in some other scale. It must be set properly.
 *
 * Create Energy Model tables for a performance domain using the callbacks
 * defined in cb.
 *
+ * The @milliwatts is important to set with correct value. Some kernel
+ * sub-systems might rely on this flag and check if all devices in the EM are
+ * using the same scale.
+ *
 * If multiple clients register the same performance domain, all but the first
 * registration will be ignored.
 *
 * Return 0 on success
 */
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
-				struct em_data_callback *cb, cpumask_t *cpus)
+				struct em_data_callback *cb, cpumask_t *cpus,
+				bool milliwatts)
 {
 	unsigned long cap, prev_cap = 0;
 	int cpu, ret;
@@ -308,6 +328,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 	if (ret)
 		goto unlock;
 
+	dev->em_pd->milliwatts = milliwatts;
+
 	em_debug_create_pd(dev);
 	dev_info(dev, "EM: created perf domain\n");
 
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bf640fd61..b13fe337f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -326,7 +326,7 @@ static int create_image(int platform_mode)
 
 	if (!in_suspend) {
 		events_check_enabled = false;
-		clear_free_pages();
+		clear_or_poison_free_pages();
 	}
 
 	platform_leave(platform_mode);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 24f12d534..778bf431e 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -106,7 +106,7 @@ extern int create_basic_memory_bitmaps(void);
 extern void free_basic_memory_bitmaps(void);
 extern int hibernate_preallocate_memory(void);
 
-extern void clear_free_pages(void);
+extern void clear_or_poison_free_pages(void);
 
 /**
 * Auxiliary structure used for reading the snapshot image data and
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 45b054b7b..cc0623080 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -85,18 +85,21 @@ static int try_to_freeze_tasks(bool user_only)
 	elapsed = ktime_sub(end, start);
 	elapsed_msecs = ktime_to_ms(elapsed);
 
-	if (todo) {
+	if (wakeup) {
 		pr_cont("\n");
-		pr_err("Freezing of tasks %s after %d.%03d seconds "
-		       "(%d tasks refusing to freeze, wq_busy=%d):\n",
-		       wakeup ? "aborted" : "failed",
+		pr_err("Freezing of tasks aborted after %d.%03d seconds",
+		       elapsed_msecs / 1000, elapsed_msecs % 1000);
+	} else if (todo) {
+		pr_cont("\n");
+		pr_err("Freezing of tasks failed after %d.%03d seconds"
+		       " (%d tasks refusing to freeze, wq_busy=%d):\n",
 		       elapsed_msecs / 1000, elapsed_msecs % 1000,
 		       todo - wq_busy, wq_busy);
 
 		if (wq_busy)
 			show_workqueue_state();
 
-		if (!wakeup || pm_debug_messages_on) {
+		if (pm_debug_messages_on) {
 			read_lock(&tasklist_lock);
 			for_each_process_thread(g, p) {
 				if (p != current && !freezer_should_skip(p)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 46b1804c1..a3491b29c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1144,7 +1144,15 @@ void free_basic_memory_bitmaps(void)
 	pr_debug("Basic memory bitmaps freed\n");
 }
 
-void clear_free_pages(void)
+static void clear_or_poison_free_page(struct page *page)
+{
+	if (page_poisoning_enabled_static())
+		__kernel_poison_pages(page, 1);
+	else if (want_init_on_free())
+		clear_highpage(page);
+}
+
+void clear_or_poison_free_pages(void)
 {
 	struct memory_bitmap *bm = free_pages_map;
 	unsigned long pfn;
@@ -1152,12 +1160,12 @@ void clear_free_pages(void)
 	if (WARN_ON(!(free_pages_map)))
 		return;
 
-	if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) {
+	if (page_poisoning_enabled() || want_init_on_free()) {
 		memory_bm_position_reset(bm);
 		pfn = memory_bm_next_pfn(bm);
 		while (pfn != BM_END_OF_MAP) {
 			if (pfn_valid(pfn))
-				clear_highpage(pfn_to_page(pfn));
+				clear_or_poison_free_page(pfn_to_page(pfn));
 
 			pfn = memory_bm_next_pfn(bm);
 		}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 32391acc8..545958377 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,6 +30,7 @@
 #include <trace/events/power.h>
 #include <linux/compiler.h>
 #include <linux/moduleparam.h>
+#include <linux/wakeup_reason.h>
 
 #include "power.h"
 
@@ -138,6 +139,7 @@ static void s2idle_loop(void)
 			break;
 		}
 
+		clear_wakeup_reasons();
 		s2idle_enter();
 	}
 
@@ -357,6 +359,7 @@ static int suspend_prepare(suspend_state_t state)
 	if (!error)
 		return 0;
 
+	log_suspend_abort_reason("One or more tasks refusing to freeze");
 	suspend_stats.failed_freeze++;
 	dpm_save_failed_step(SUSPEND_FREEZE);
 	pm_notifier_call_chain(PM_POST_SUSPEND);
@@ -386,7 +389,7 @@ void __weak arch_suspend_enable_irqs(void)
 */
 static int suspend_enter(suspend_state_t state, bool *wakeup)
 {
-	int error;
+	int error, last_dev;
 
 	error = platform_suspend_prepare(state);
 	if (error)
@@ -394,7 +397,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 
 	error = dpm_suspend_late(PMSG_SUSPEND);
 	if (error) {
+		last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+		last_dev %= REC_FAILED_NUM;
 		pr_err("late suspend of devices failed\n");
+		log_suspend_abort_reason("late suspend of %s device failed",
+					 suspend_stats.failed_devs[last_dev]);
 		goto Platform_finish;
 	}
 	error = platform_suspend_prepare_late(state);
@@ -403,7 +410,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 
 	error = dpm_suspend_noirq(PMSG_SUSPEND);
 	if (error) {
+		last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+		last_dev %= REC_FAILED_NUM;
 		pr_err("noirq suspend of devices failed\n");
+		log_suspend_abort_reason("noirq suspend of %s device failed",
+					 suspend_stats.failed_devs[last_dev]);
 		goto Platform_early_resume;
 	}
 	error = platform_suspend_prepare_noirq(state);
@@ -419,8 +430,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 	}
 
 	error = suspend_disable_secondary_cpus();
-	if (error || suspend_test(TEST_CPUS))
+	if (error || suspend_test(TEST_CPUS)) {
+		log_suspend_abort_reason("Disabling non-boot cpus failed");
 		goto Enable_cpus;
+	}
 
 	arch_suspend_disable_irqs();
 	BUG_ON(!irqs_disabled());
@@ -491,6 +504,8 @@ int suspend_devices_and_enter(suspend_state_t state)
 	error = dpm_suspend_start(PMSG_SUSPEND);
 	if (error) {
 		pr_err("Some devices failed to suspend, or early wake event detected\n");
+		log_suspend_abort_reason(
+				"Some devices failed to suspend, or early wake event detected");
 		goto Recover_platform;
 	}
 	suspend_test_finish("suspend devices");
diff --git a/kernel/reboot.c b/kernel/reboot.c
index af6f23d8b..bce629531 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -215,6 +215,27 @@ void do_kernel_restart(char *cmd)
 	atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
 }
 
+#ifdef CONFIG_NO_GKI
+static ATOMIC_NOTIFIER_HEAD(pre_restart_handler_list);
+
+int register_pre_restart_handler(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&pre_restart_handler_list, nb);
+}
+EXPORT_SYMBOL(register_pre_restart_handler);
+
+int unregister_pre_restart_handler(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&pre_restart_handler_list, nb);
+}
+EXPORT_SYMBOL(unregister_pre_restart_handler);
+
+void do_kernel_pre_restart(char *cmd)
+{
+	atomic_notifier_call_chain(&pre_restart_handler_list, reboot_mode, cmd);
+}
+#endif
+
 void migrate_to_reboot_cpu(void)
 {
 	/* The boot cpu is always logical cpu 0 */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e2f00be4b..750da3e7c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -47,6 +47,13 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
+#ifdef CONFIG_SCHEDSTATS
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
+#endif
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -660,7 +667,7 @@ int get_nohz_timer_target(void)
 	int i, cpu = smp_processor_id(), default_cpu = -1;
 	struct sched_domain *sd;
 
-	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+	if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
 		if (!idle_cpu(cpu))
 			return cpu;
 		default_cpu = cpu;
@@ -680,8 +687,25 @@ int get_nohz_timer_target(void)
 		}
 	}
 
-	if (default_cpu == -1)
-		default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+	if (default_cpu == -1) {
+		for_each_cpu_and(i, cpu_active_mask,
+				 housekeeping_cpumask(HK_FLAG_TIMER)) {
+			if (cpu == i)
+				continue;
+
+			if (!idle_cpu(i)) {
+				cpu = i;
+				goto unlock;
+			}
+		}
+
+		/* no active, not-idle, housekeeping CPU found. */
+		default_cpu = cpumask_any(cpu_active_mask);
+
+		if (unlikely(default_cpu >= nr_cpu_ids))
+			goto unlock;
+	}
+
 	cpu = default_cpu;
 unlock:
 	rcu_read_unlock();
@@ -1770,7 +1794,10 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
 	if (is_per_cpu_kthread(p))
 		return cpu_online(cpu);
 
-	return cpu_active(cpu);
+	if (!cpu_active(cpu))
+		return false;
+
+	return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
 }
 
 /*
@@ -2433,10 +2460,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		}
 		fallthrough;
 	case possible:
-		do_set_cpus_allowed(p, cpu_possible_mask);
+		do_set_cpus_allowed(p, task_cpu_possible_mask(p));
 		state = fail;
 		break;
-
 	case fail:
 #ifdef CONFIG_CPU_ISOLATION_OPT
 		allow_iso = true;
@@ -2627,6 +2653,9 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 {
 	int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
 
+	if (wake_flags & WF_SYNC)
+		en_flags |= ENQUEUE_WAKEUP_SYNC;
+
 	lockdep_assert_held(&rq->lock);
 
 	if (p->sched_contributes_to_load)
@@ -3019,6 +3048,19 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	if (!(p->state & state))
 		goto unlock;
 
+#ifdef CONFIG_FREEZER
+	/*
+	 * If we're going to wake up a thread which may be frozen, then
+	 * we can only do so if we have an active CPU which is capable of
+	 * running it. This may not be the case when resuming from suspend,
+	 * as the secondary CPUs may not yet be back online. See __thaw_task()
+	 * for the actual wakeup.
+	 */
+	if (unlikely(frozen_or_skipped(p)) &&
+	    !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
+		goto unlock;
+#endif
+
 	trace_sched_waking(p);
 
 	/* We're going to change ->state: */
@@ -5004,7 +5046,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC));
 	return try_to_wake_up(curr->private, mode, wake_flags);
 }
 EXPORT_SYMBOL(default_wake_function);
@@ -5713,16 +5755,19 @@ int sched_setscheduler(struct task_struct *p, int policy,
 {
 	return _sched_setscheduler(p, policy, param, true);
 }
+EXPORT_SYMBOL_GPL(sched_setscheduler);
 
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
 	return __sched_setscheduler(p, attr, true, true);
 }
+EXPORT_SYMBOL_GPL(sched_setattr);
 
 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
 {
 	return __sched_setscheduler(p, attr, false, true);
 }
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
 
 /**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
@@ -5742,6 +5787,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 {
 	return _sched_setscheduler(p, policy, param, false);
 }
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
 
 /*
 * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
@@ -7044,6 +7090,11 @@ void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf,
 	 */
 	update_rq_clock(rq);
 
+#ifdef CONFIG_SCHED_DEBUG
+	/* note the clock update in orf */
+	orf.clock_update_flags |= RQCF_UPDATED;
+#endif
+
 	for (;;) {
 		/*
 		 * There's this thread running, bail when that's the only
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 81e43a56d..4df7f4e68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -86,6 +86,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
 unsigned int sysctl_sched_min_granularity			= 750000ULL;
+EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
 static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
 
 /*
@@ -10686,9 +10687,20 @@ void nohz_balance_enter_idle(int cpu)
 
 	SCHED_WARN_ON(cpu != smp_processor_id());
 
-	/* If this CPU is going down, then nothing needs to be done: */
-	if (!cpu_active(cpu))
+	if (!cpu_active(cpu)) {
+		/*
+		 * A CPU can be paused while it is idle with its tick
+		 * stopped. nohz_balance_exit_idle() should be called
+		 * from the local CPU, so it can't be called during
+		 * pause. This results in paused CPU participating in
+		 * the nohz idle balance, which should be avoided.
+		 *
+		 * When the paused CPU exits idle and enters again,
+		 * exempt the paused CPU from nohz_balance_exit_idle.
+		 */
+		nohz_balance_exit_idle(rq);
 		return;
+	}
 
 	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2593a733c..69afd8d1e 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -450,6 +450,7 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
 {
 	raw_spin_unlock_irq(&rq->lock);
 	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+
 	dump_stack();
 	raw_spin_lock_irq(&rq->lock);
 }
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index d2a655643..b5837e277 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -75,6 +75,7 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 	loads[1] = (avenrun[1] + offset) << shift;
 	loads[2] = (avenrun[2] + offset) << shift;
 }
+EXPORT_SYMBOL_GPL(get_avenrun);
 
 long calc_load_fold_active(struct rq *this_rq, long adjust)
 {
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 2c613e1cf..e2890b677 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,6 +28,42 @@
 #include "sched.h"
 #include "pelt.h"
 
+int pelt_load_avg_period = PELT32_LOAD_AVG_PERIOD;
+int pelt_load_avg_max = PELT32_LOAD_AVG_MAX;
+const u32 *pelt_runnable_avg_yN_inv = pelt32_runnable_avg_yN_inv;
+
+static int __init set_pelt(char *str)
+{
+	int rc, num;
+
+	rc = kstrtoint(str, 0, &num);
+	if (rc) {
+		pr_err("%s: kstrtoint failed. rc=%d\n", __func__, rc);
+		return 0;
+	}
+
+	switch (num) {
+	case PELT8_LOAD_AVG_PERIOD:
+		pelt_load_avg_period = PELT8_LOAD_AVG_PERIOD;
+		pelt_load_avg_max = PELT8_LOAD_AVG_MAX;
+		pelt_runnable_avg_yN_inv = pelt8_runnable_avg_yN_inv;
+		pr_info("PELT half life is set to %dms\n", num);
+		break;
+	case PELT32_LOAD_AVG_PERIOD:
+		pelt_load_avg_period = PELT32_LOAD_AVG_PERIOD;
+		pelt_load_avg_max = PELT32_LOAD_AVG_MAX;
+		pelt_runnable_avg_yN_inv = pelt32_runnable_avg_yN_inv;
+		pr_info("PELT half life is set to %dms\n", num);
+		break;
+	default:
+		pr_err("Default PELT half life is 32ms\n");
+	}
+
+	return 0;
+}
+
+early_param("pelt", set_pelt);
+
 /*
 * Approximate:
 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
@@ -54,7 +90,7 @@ static u64 decay_load(u64 val, u64 n)
 		local_n %= LOAD_AVG_PERIOD;
 	}
 
-	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+	val = mul_u64_u32_shr(val, pelt_runnable_avg_yN_inv[local_n], 32);
 	return val;
 }
 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index b7f38f3ad..b0e6c438f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -550,7 +550,7 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	return now + group->poll_min_period;
 }
 
-/* Schedule polling if it's not already scheduled. */
+/* Schedule polling if it's not already scheduled or forced. */
 static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
 {
 	struct task_struct *task;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d5c00fa02..689cc1a63 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1393,6 +1393,27 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 	enqueue_top_rt_rq(&rq->rt);
 }
 
+#ifdef CONFIG_SMP
+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
+					bool sync)
+{
+	/*
+	 * If the waker is CFS, then an RT sync wakeup would preempt the waker
+	 * and force it to run for a likely small time after the RT wakee is
+	 * done. So, only honor RT sync wakeups from RT wakers.
+	 */
+	return sync && task_has_rt_policy(rq->curr) &&
+		p->prio <= rq->rt.highest_prio.next &&
+		rq->rt.rt_nr_running <= 2;
+}
+#else
+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
+					bool sync)
+{
+	return 0;
+}
+#endif
+
 /*
 * Adding/removing a task to/from a priority array:
 */
@@ -1400,6 +1421,7 @@ static void
 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
+	bool sync = !!(flags & ENQUEUE_WAKEUP_SYNC);
 
 	if (flags & ENQUEUE_WAKEUP)
 		rt_se->timeout = 0;
@@ -1407,7 +1429,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 	enqueue_rt_entity(rt_se, flags);
 	walt_inc_cumulative_runnable_avg(rq, p);
 
-	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+	if (!task_current(rq, p) && p->nr_cpus_allowed > 1 &&
+	    !should_honor_rt_sync(rq, p, sync))
 		enqueue_pushable_task(rq, p);
 }
 
@@ -1464,7 +1487,10 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
 	struct task_struct *curr;
 	struct rq *rq;
+	struct rq *this_cpu_rq;
 	bool test;
+	bool sync = !!(flags & WF_SYNC);
+	int this_cpu;
 
 	/* For anything but wake ups, just return the task_cpu */
 	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1474,6 +1500,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 
 	rcu_read_lock();
 	curr = READ_ONCE(rq->curr); /* unlocked access */
+	this_cpu = smp_processor_id();
+	this_cpu_rq = cpu_rq(this_cpu);
 
 	/*
 	 * If the current task on @p's runqueue is an RT task, then
@@ -1508,6 +1536,15 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 	test |= sysctl_sched_enable_rt_cas;
 #endif
 
+	/*
+	 * Respect the sync flag as long as the task can run on this CPU.
+	 */
+	if (should_honor_rt_sync(this_cpu_rq, p, sync) &&
+	    cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
+		cpu = this_cpu;
+		goto out_unlock;
+	}
+
 	if (test || !rt_task_fits_capacity(p, cpu)) {
 		int target = find_lowest_rq(p);
 
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index c529706be..92a6875bc 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Generated by Documentation/scheduler/sched-pelt; do not modify. */
 
-static const u32 runnable_avg_yN_inv[] __maybe_unused = {
+static const u32 pelt32_runnable_avg_yN_inv[] __maybe_unused = {
 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
@@ -10,5 +10,20 @@ static const u32 runnable_avg_yN_inv[] __maybe_unused = {
 	0x85aac367, 0x82cd8698,
 };
 
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742
+#define PELT32_LOAD_AVG_PERIOD 32
+#define PELT32_LOAD_AVG_MAX 47742
+
+static const u32 pelt8_runnable_avg_yN_inv[] __maybe_unused = {
+	0xffffffff, 0xeac0c6e6, 0xd744fcc9, 0xc5672a10,
+	0xb504f333, 0xa5fed6a9, 0x9837f050, 0x8b95c1e3,
+};
+
+#define PELT8_LOAD_AVG_PERIOD 8
+#define PELT8_LOAD_AVG_MAX 12336
+
+extern const u32 *pelt_runnable_avg_yN_inv;
+extern int pelt_load_avg_period;
+extern int pelt_load_avg_max;
+
+#define LOAD_AVG_PERIOD pelt_load_avg_period
+#define LOAD_AVG_MAX pelt_load_avg_max
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 592c8653c..7c02fed0a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1913,6 +1913,8 @@ extern const int sched_latency_to_weight[40];
 #define ENQUEUE_MIGRATED	0x00
 #endif
 
+#define ENQUEUE_WAKEUP_SYNC	0x80
+
 #define RETRY_TASK		((void *)-1UL)
 
 struct sched_class {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9191e5daa..58d840c62 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -5,6 +5,9 @@
 #include "sched.h"
 
 DEFINE_MUTEX(sched_domains_mutex);
+#ifdef CONFIG_LOCKDEP
+EXPORT_SYMBOL_GPL(sched_domains_mutex);
+#endif
 
 /* Protected by sched_domains_mutex: */
 static cpumask_var_t sched_domains_tmpmask;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index a55642aa3..6911bbca0 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -396,7 +396,8 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en
 }
 EXPORT_SYMBOL(finish_wait);
 
-int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
+__sched int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
+				     int sync, void *key)
 {
 	int ret = default_wake_function(wq_entry, mode, sync, key);
 
@@ -432,7 +433,7 @@ static inline bool is_kthread_should_stop(void)
 *     }                                             smp_mb(); // C
 *     remove_wait_queue(&wq_head, &wait);           wq_entry->flags |= WQ_FLAG_WOKEN;
 */
-long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
+__sched long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, long timeout)
 {
 	/*
 	 * The below executes an smp_mb(), which matches with the full barrier
@@ -457,7 +458,8 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
 }
 EXPORT_SYMBOL(wait_woken);
 
-int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
+__sched int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
+				int sync, void *key)
 {
 	/* Pairs with the smp_store_mb() in wait_woken(). */
 	smp_mb(); /* C */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 3e6740207..033fa94f3 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -27,6 +27,7 @@
 * Structure to determine completion condition and record errors.  May
 * be shared by works on different cpus.
 */
+
 struct cpu_stop_done {
 	atomic_t		nr_todo;	/* nr left to execute */
 	int			ret;		/* collected return value */
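
Editor's note on the energy_model.c hunk above: em_dev_register_perf_domain() gains a trailing "milliwatts" argument, so every existing caller needs to pass it. A minimal caller sketch follows; the driver, callback and nr_states values are made up for illustration, and the active_power() callback signature is assumed to be the v5.10-era (power, freq, dev) form.

	/* Hypothetical caller -- not part of this patch. */
	#include <linux/energy_model.h>

	static int my_active_power(unsigned long *mW, unsigned long *KHz,
				   struct device *dev)
	{
		/* fill in *mW and *KHz for one performance state */
		return 0;
	}

	static int my_register_em(struct device *dev, cpumask_t *cpus)
	{
		struct em_data_callback em_cb = EM_DATA_CB(my_active_power);

		/* last argument is the new 'milliwatts' scale flag */
		return em_dev_register_perf_domain(dev, 4, &em_cb, cpus, true);
	}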
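
Editor's note on the kernel/reboot.c hunk: the CONFIG_NO_GKI block adds a pre-restart notifier chain that mirrors the existing restart handler chain. A minimal client sketch, assuming a hypothetical driver that wants to quiesce hardware before the real restart handlers run:

	/* Hypothetical client of the pre-restart chain -- not part of this patch. */
	#include <linux/notifier.h>
	#include <linux/reboot.h>

	static int my_pre_restart(struct notifier_block *nb,
				  unsigned long mode, void *cmd)
	{
		/* quiesce device state before do_kernel_restart() runs */
		return NOTIFY_DONE;
	}

	static struct notifier_block my_pre_restart_nb = {
		.notifier_call = my_pre_restart,
	};

	static int __init my_pre_restart_init(void)
	{
		return register_pre_restart_handler(&my_pre_restart_nb);
	}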
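
Editor's note on the pelt.c/sched-pelt.h hunks: set_pelt() is wired up as an early_param, so the PELT half-life is chosen on the kernel command line; per the switch statement only 8 and 32 are accepted, anything else keeps the 32 ms default. For example:

	pelt=8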