diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 209e6567c..d47c0212e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -128,21 +128,6 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 	return map;
 }
 
-static void bpf_map_write_active_inc(struct bpf_map *map)
-{
-	atomic64_inc(&map->writecnt);
-}
-
-static void bpf_map_write_active_dec(struct bpf_map *map)
-{
-	atomic64_dec(&map->writecnt);
-}
-
-bool bpf_map_write_active(const struct bpf_map *map)
-{
-	return atomic64_read(&map->writecnt) != 0;
-}
-
 static u32 bpf_map_value_size(struct bpf_map *map)
 {
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -604,8 +589,11 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma)
 {
 	struct bpf_map *map = vma->vm_file->private_data;
 
-	if (vma->vm_flags & VM_MAYWRITE)
-		bpf_map_write_active_inc(map);
+	if (vma->vm_flags & VM_MAYWRITE) {
+		mutex_lock(&map->freeze_mutex);
+		map->writecnt++;
+		mutex_unlock(&map->freeze_mutex);
+	}
 }
 
 /* called for all unmapped memory region (including initial) */
@@ -613,8 +601,11 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma)
 {
 	struct bpf_map *map = vma->vm_file->private_data;
 
-	if (vma->vm_flags & VM_MAYWRITE)
-		bpf_map_write_active_dec(map);
+	if (vma->vm_flags & VM_MAYWRITE) {
+		mutex_lock(&map->freeze_mutex);
+		map->writecnt--;
+		mutex_unlock(&map->freeze_mutex);
+	}
 }
 
 static const struct vm_operations_struct bpf_map_default_vmops = {
@@ -664,7 +655,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
 		goto out;
 
 	if (vma->vm_flags & VM_MAYWRITE)
-		bpf_map_write_active_inc(map);
+		map->writecnt++;
 out:
 	mutex_unlock(&map->freeze_mutex);
 	return err;
@@ -1096,7 +1087,6 @@ static int map_update_elem(union bpf_attr *attr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	bpf_map_write_active_inc(map);
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
 		goto err_put;
@@ -1138,7 +1128,6 @@ static int map_update_elem(union bpf_attr *attr)
 free_key:
 	kfree(key);
 err_put:
-	bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
@@ -1161,7 +1150,6 @@ static int map_delete_elem(union bpf_attr *attr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	bpf_map_write_active_inc(map);
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
 		goto err_put;
@@ -1192,7 +1180,6 @@ static int map_delete_elem(union bpf_attr *attr)
 out:
 	kfree(key);
 err_put:
-	bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
@@ -1497,7 +1484,6 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	bpf_map_write_active_inc(map);
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
 	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
@@ -1539,7 +1525,6 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 free_key:
 	kfree(key);
 err_put:
-	bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
@@ -1566,7 +1551,8 @@ static int map_freeze(const union bpf_attr *attr)
 	}
 
 	mutex_lock(&map->freeze_mutex);
-	if (bpf_map_write_active(map)) {
+
+	if (map->writecnt) {
 		err = -EBUSY;
 		goto err_put;
 	}
@@ -3991,9 +3977,6 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 				    union bpf_attr __user *uattr,
 				    int cmd)
 {
-	bool has_read  = cmd == BPF_MAP_LOOKUP_BATCH ||
-			 cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
-	bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
 	struct bpf_map *map;
 	int err, ufd;
 	struct fd f;
@@ -4006,13 +3989,16 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	if (has_write)
-		bpf_map_write_active_inc(map);
-	if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+
+	if ((cmd == BPF_MAP_LOOKUP_BATCH ||
+	     cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
+	    !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
 		err = -EPERM;
 		goto err_put;
 	}
-	if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+
+	if (cmd != BPF_MAP_LOOKUP_BATCH &&
+	    !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
 		err = -EPERM;
 		goto err_put;
 	}
@@ -4025,9 +4011,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 		BPF_DO_BATCH(map->ops->map_update_batch);
 	else
 		BPF_DO_BATCH(map->ops->map_delete_batch);
+
 err_put:
-	if (has_write)
-		bpf_map_write_active_dec(map);
 	fdput(f);
 	return err;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8de769745..3e854b91f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3492,22 +3492,7 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
 
 static bool bpf_map_is_rdonly(const struct bpf_map *map)
 {
-	/* A map is considered read-only if the following condition are true:
-	 *
-	 * 1) BPF program side cannot change any of the map content. The
-	 *    BPF_F_RDONLY_PROG flag is throughout the lifetime of a map
-	 *    and was set at map creation time.
-	 * 2) The map value(s) have been initialized from user space by a
-	 *    loader and then "frozen", such that no new map update/delete
-	 *    operations from syscall side are possible for the rest of
-	 *    the map's lifetime from that point onwards.
-	 * 3) Any parallel/pending map update/delete operations from syscall
-	 *    side have been completed. Only after that point, it's safe to
-	 *    assume that map value(s) are immutable.
-	 */
-	return (map->map_flags & BPF_F_RDONLY_PROG) &&
-	       READ_ONCE(map->frozen) &&
-	       !bpf_map_write_active(map);
+	return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen;
 }
 
 static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
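The two files above revert the atomic64_t write counter: writable mmap()s of a BPF map are again counted in a plain map->writecnt that is only touched under map->freeze_mutex, BPF_MAP_FREEZE fails with -EBUSY while such mappings exist, and the verifier can then treat a frozen BPF_F_RDONLY_PROG map as read-only without the extra bpf_map_write_active() test. A minimal user-space sketch of that counter-under-mutex pattern follows; struct map_state, map_freeze() and friends here are illustrative stand-ins, not kernel API.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

/* Illustrative stand-in for struct bpf_map's freeze_mutex/writecnt/frozen. */
struct map_state {
	pthread_mutex_t freeze_mutex;
	int writecnt;	/* number of writable mappings */
	int frozen;
};

/* Mirrors bpf_map_mmap_open(): account a new writable mapping. */
static void map_mmap_open(struct map_state *m)
{
	pthread_mutex_lock(&m->freeze_mutex);
	m->writecnt++;
	pthread_mutex_unlock(&m->freeze_mutex);
}

/* Mirrors bpf_map_mmap_close(): drop a writable mapping. */
static void map_mmap_close(struct map_state *m)
{
	pthread_mutex_lock(&m->freeze_mutex);
	m->writecnt--;
	pthread_mutex_unlock(&m->freeze_mutex);
}

/* Mirrors map_freeze(): refuse to freeze while writable mappings exist. */
static int map_freeze(struct map_state *m)
{
	int err = 0;

	pthread_mutex_lock(&m->freeze_mutex);
	if (m->writecnt)
		err = -EBUSY;
	else
		m->frozen = 1;
	pthread_mutex_unlock(&m->freeze_mutex);
	return err;
}

int main(void)
{
	struct map_state m = { .freeze_mutex = PTHREAD_MUTEX_INITIALIZER };

	map_mmap_open(&m);
	printf("freeze with writer mapped: %d\n", map_freeze(&m));	/* -EBUSY */
	map_mmap_close(&m);
	printf("freeze after unmap: %d\n", map_freeze(&m));		/* 0 */
	return 0;
}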
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index d9f8a464b..cddc908bc 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -518,7 +518,8 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
 		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
 #endif
 		    !uid_eq(cred->euid, tcred->uid) &&
-		    !uid_eq(cred->euid, tcred->suid))
+		    !uid_eq(cred->euid, tcred->suid) &&
+		    !ns_capable(tcred->user_ns, CAP_SYS_NICE))
 			ret = -EACCES;
 		put_cred(tcred);
 		if (ret)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 3173fe473..f4d318733 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -334,6 +334,8 @@ static struct cpuset top_cpuset = {
  * guidelines for accessing subsystem state in kernel/cgroup.c
  */
 
+static DEFINE_MUTEX(cpuset_mutex);
+
 DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
 
 void cpuset_read_lock(void)
@@ -351,9 +353,9 @@ static DEFINE_SPINLOCK(callback_lock);
 static struct workqueue_struct *cpuset_migrate_mm_wq;
 
 /*
- * CPU / memory hotplug is handled asynchronously.
+ * CPU / memory hotplug is handled asynchronously
+ * for hotplug, synchronously for resume_cpus
  */
-static void cpuset_hotplug_workfn(struct work_struct *work);
 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
 
 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
@@ -373,18 +375,29 @@ static inline bool is_in_v2_mode(void)
 }
 
 /*
- * Return in pmask the portion of a cpusets's cpus_allowed that
- * are online.  If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.
+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
+ * are online and are capable of running the task.  If none are found,
+ * walk up the cpuset hierarchy until we find one that does have some
+ * appropriate cpus.
  *
  * One way or another, we guarantee to return some non-empty subset
- * of cpu_online_mask.
+ * of cpu_active_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
+static void guarantee_online_cpus(struct task_struct *tsk,
+				  struct cpumask *pmask)
 {
-	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
+	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+	struct cpuset *cs;
+
+	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
+		cpumask_copy(pmask, cpu_active_mask);
+
+	rcu_read_lock();
+	cs = task_cs(tsk);
+
+	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
 		cs = parent_cs(cs);
 		if (unlikely(!cs)) {
 			/*
@@ -394,11 +407,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 			 * cpuset's effective_cpus is on its way to be
 			 * identical to cpu_online_mask.
 			 */
-			cpumask_copy(pmask, cpu_online_mask);
-			return;
+			goto out_unlock;
 		}
 	}
-	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
+	cpumask_and(pmask, pmask, cs->effective_cpus);
+
+out_unlock:
+	rcu_read_unlock();
 }
 
 /*
@@ -489,6 +504,9 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 	if (cs && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
 		goto free_three;
 
+	if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
+		goto free_three;
+
 	return 0;
 
 free_three:
@@ -939,7 +957,7 @@ static void rebuild_root_domains(void)
 	struct cpuset *cs = NULL;
 	struct cgroup_subsys_state *pos_css;
 
-	percpu_rwsem_assert_held(&cpuset_rwsem);
+	lockdep_assert_held(&cpuset_mutex);
 	lockdep_assert_cpus_held();
 	lockdep_assert_held(&sched_domains_mutex);
 
@@ -999,8 +1017,7 @@ static void rebuild_sched_domains_locked(void)
 	struct cpuset *cs;
 	int ndoms;
 
-	lockdep_assert_cpus_held();
-	percpu_rwsem_assert_held(&cpuset_rwsem);
+	lockdep_assert_held(&cpuset_mutex);
 
 	/*
 	 * If we have raced with CPU hotplug, return early to avoid
@@ -1051,12 +1068,18 @@ static void rebuild_sched_domains_locked(void)
 void rebuild_sched_domains(void)
 {
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	rebuild_sched_domains_locked();
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 }
 
+static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p,
+				const struct cpumask *new_mask)
+{
+	return set_cpus_allowed_ptr(p, new_mask);
+}
+
 /**
  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -1072,7 +1095,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
 
 	css_task_iter_start(&cs->css, 0, &it);
 	while ((task = css_task_iter_next(&it)))
-		set_cpus_allowed_ptr(task, cs->effective_cpus);
+		update_cpus_allowed(cs, task, cs->effective_cpus);
 	css_task_iter_end(&it);
 }
 
@@ -1096,8 +1119,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus,
 		cpumask_and(new_cpus, new_cpus, cs->cpus_requested);
 		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
 	} else {
-		cpumask_and(new_cpus, cs->cpus_requested,
-			    parent->effective_cpus);
+		cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
 	}
 }
 
@@ -1162,7 +1184,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 	int new_prs;
 	bool part_error = false;	/* Partition error? */
 
-	percpu_rwsem_assert_held(&cpuset_rwsem);
+	lockdep_assert_held(&cpuset_mutex);
 
 	/*
	 * The parent must be a partition root.
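The reworked guarantee_online_cpus() earlier in this file starts from the intersection of the task's possible CPUs (task_cpu_possible_mask()) with cpu_active_mask, then walks up the cpuset hierarchy until an ancestor's effective_cpus overlaps that set. The same walk is sketched below in plain C, with 64-bit masks standing in for cpumasks; struct cs, guarantee_cpus() and the mask values are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Illustrative cpuset node: effective CPUs plus a parent pointer. */
struct cs {
	uint64_t effective_cpus;
	struct cs *parent;
};

/*
 * Model of the reworked guarantee_online_cpus(): constrain the result to
 * possible & active, then walk up the hierarchy until an ancestor's
 * effective_cpus intersects that set; fall back to the active mask if the
 * possible mask contains no active CPU at all.
 */
static uint64_t guarantee_cpus(const struct cs *node,
			       uint64_t possible, uint64_t active)
{
	uint64_t pmask = possible & active;

	if (!pmask)		/* the WARN_ON() case in the kernel code */
		pmask = active;

	while (node && !(node->effective_cpus & pmask))
		node = node->parent;

	return node ? (pmask & node->effective_cpus) : pmask;
}

int main(void)
{
	struct cs root  = { 0xff, NULL };	/* CPUs 0-7 */
	struct cs child = { 0x03, &root };	/* CPUs 0-1 */

	/* A task limited to CPUs 4-7: the child cpuset has no overlap,
	 * so the walk ends up using the root's CPUs. */
	printf("%#llx\n", (unsigned long long)
	       guarantee_cpus(&child, 0xf0, 0xff));	/* prints 0xf0 */
	return 0;
}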
@@ -2158,7 +2180,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
 	cs = css_cs(css);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	/* allow moving tasks into an empty cpuset if on default hierarchy */
 	ret = -ENOSPC;
@@ -2182,7 +2204,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	cs->attach_in_progress++;
 	ret = 0;
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	return ret;
 }
 
@@ -2192,9 +2214,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 
 	cgroup_taskset_first(tset, &css);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	css_cs(css)->attach_in_progress--;
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 }
 
 /*
@@ -2217,22 +2239,20 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	cgroup_taskset_first(tset, &css);
 	cs = css_cs(css);
 
-	percpu_down_write(&cpuset_rwsem);
-
-	/* prepare for attach */
-	if (cs == &top_cpuset)
-		cpumask_copy(cpus_attach, cpu_possible_mask);
-	else
-		guarantee_online_cpus(cs, cpus_attach);
+	mutex_lock(&cpuset_mutex);
 
 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
 
 	cgroup_taskset_for_each(task, css, tset) {
+		if (cs != &top_cpuset)
+			guarantee_online_cpus(task, cpus_attach);
+		else
+			cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
 		/*
 		 * can_attach beforehand should guarantee that this doesn't
 		 * fail.  TODO: have a better way to handle failure here
 		 */
-		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+		WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach));
 
 		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
 		cpuset_update_task_spread_flag(cs, task);
@@ -2271,7 +2291,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	if (!cs->attach_in_progress)
 		wake_up(&cpuset_attach_wq);
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -2303,7 +2323,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	int retval = 0;
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs)) {
 		retval = -ENODEV;
 		goto out_unlock;
@@ -2339,7 +2359,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	return retval;
 }
@@ -2352,7 +2372,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	int retval = -ENODEV;
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
@@ -2365,7 +2385,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	return retval;
 }
@@ -2406,7 +2426,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	flush_work(&cpuset_hotplug_work);
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
@@ -2430,7 +2450,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 
 	free_cpuset(trialcs);
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
@@ -2563,13 +2583,13 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
 
 	css_get(&cs->css);
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
 	retval = update_prstate(cs, val);
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	css_put(&cs->css);
 	return retval ?: nbytes;
@@ -2777,7 +2797,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		return 0;
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	set_bit(CS_ONLINE, &cs->flags);
 	if (is_spread_page(parent))
@@ -2829,7 +2849,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
 	spin_unlock_irq(&callback_lock);
 out_unlock:
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 	return 0;
 }
@@ -2850,7 +2870,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	struct cpuset *cs = css_cs(css);
 
 	get_online_cpus();
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	if (is_partition_root(cs))
 		update_prstate(cs, 0);
@@ -2869,7 +2889,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	cpuset_dec();
 	clear_bit(CS_ONLINE, &cs->flags);
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 	put_online_cpus();
 }
 
@@ -2882,7 +2902,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 
 static void cpuset_bind(struct cgroup_subsys_state *root_css)
 {
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 	spin_lock_irq(&callback_lock);
 
 	if (is_in_v2_mode()) {
@@ -2895,7 +2915,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 	}
 
 	spin_unlock_irq(&callback_lock);
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 }
 
 /*
@@ -2908,7 +2928,6 @@ static void cpuset_fork(struct task_struct *task)
 	if (task_css_is_root(task, cpuset_cgrp_id))
 		return;
 
-	set_cpus_allowed_ptr(task, current->cpus_ptr);
 	task->mems_allowed = current->mems_allowed;
 }
 
@@ -2937,7 +2956,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 
 int __init cpuset_init(void)
 {
-	BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
 
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL));
@@ -3012,7 +3030,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 	is_empty = cpumask_empty(cs->cpus_allowed) ||
 		   nodes_empty(cs->mems_allowed);
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 
 	/*
	 * Move tasks to the nearest ancestor with execution resources,
@@ -3022,7 +3040,7 @@
 	if (is_empty)
 		remove_tasks_in_empty_cpuset(cs);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 }
 
 static void
@@ -3072,14 +3090,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 retry:
 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	/*
 	 * We have raced with task attaching. We wait until attaching
 	 * is finished, so we won't attach a task to an empty cpuset.
 	 */
 	if (cs->attach_in_progress) {
-		percpu_up_write(&cpuset_rwsem);
+		mutex_unlock(&cpuset_mutex);
 		goto retry;
 	}
 
@@ -3151,7 +3169,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
 					    cpus_updated, mems_updated);
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 }
 
 /**
@@ -3170,7 +3188,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */
-static void cpuset_hotplug_workfn(struct work_struct *work)
+void cpuset_hotplug_workfn(struct work_struct *work)
 {
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
@@ -3181,7 +3199,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
 		ptmp = &tmp;
 
-	percpu_down_write(&cpuset_rwsem);
+	mutex_lock(&cpuset_mutex);
 
 	/* fetch the available cpus/mems and find out which changed how */
 	cpumask_copy(&new_cpus, cpu_active_mask);
@@ -3238,7 +3256,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		update_tasks_nodemask(&top_cpuset);
 	}
 
-	percpu_up_write(&cpuset_rwsem);
+	mutex_unlock(&cpuset_mutex);
 
 	/* if cpus or mems changed, we need to propagate to descendants */
 	if (cpus_updated || mems_updated) {
@@ -3282,6 +3300,7 @@ void cpuset_wait_for_hotplug(void)
 {
 	flush_work(&cpuset_hotplug_work);
 }
+EXPORT_SYMBOL_GPL(cpuset_wait_for_hotplug);
 
 /*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
@@ -3337,11 +3356,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 
 	spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
-	guarantee_online_cpus(task_cs(tsk), pmask);
+	guarantee_online_cpus(tsk, pmask);
 	rcu_read_unlock();
 	spin_unlock_irqrestore(&callback_lock, flags);
 }
-
+EXPORT_SYMBOL_GPL(cpuset_cpus_allowed);
 /**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
@@ -3356,9 +3375,17 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
+	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+	const struct cpumask *cs_mask;
+
 	rcu_read_lock();
-	do_set_cpus_allowed(tsk, is_in_v2_mode() ?
-			    task_cs(tsk)->cpus_allowed : cpu_possible_mask);
+	cs_mask = task_cs(tsk)->cpus_allowed;
+
+	if (!is_in_v2_mode() || !cpumask_subset(cs_mask, possible_mask))
+		goto unlock; /* select_fallback_rq will try harder */
+
+	do_set_cpus_allowed(tsk, cs_mask);
+unlock:
 	rcu_read_unlock();
 
 	/*
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 08236798d..081d026f1 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -479,3 +479,4 @@ struct cgroup_subsys freezer_cgrp_subsys = {
 	.fork		= freezer_fork,
 	.legacy_cftypes	= files,
 };
+EXPORT_SYMBOL_GPL(freezer_cgrp_subsys);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 4b27158d3..b076ccd1b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -39,6 +39,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/cpuhp.h>
 
+#undef CREATE_TRACE_POINTS
+
 #include "smpboot.h"
 
 /**
@@ -274,11 +276,13 @@ void cpu_maps_update_begin(void)
 {
 	mutex_lock(&cpu_add_remove_lock);
 }
+EXPORT_SYMBOL_GPL(cpu_maps_update_begin);
 
 void cpu_maps_update_done(void)
 {
 	mutex_unlock(&cpu_add_remove_lock);
 }
+EXPORT_SYMBOL_GPL(cpu_maps_update_done);
 
 /*
 * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
@@ -1053,7 +1057,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	int prev_state, ret = 0;
 
-	if (num_online_cpus() == 1)
+	if (num_active_cpus() == 1 && cpu_active(cpu))
 		return -EBUSY;
 
 	if (!cpu_present(cpu))
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index e2999a070..79cb6d063 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -200,6 +200,7 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
 	irq_gc_unlock(gc);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(irq_gc_set_wake);
 
 static u32 irq_readl_be(void __iomem *addr)
 {
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 5899260a8..466eaa74f 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,5 +1,17 @@
 # SPDX-License-Identifier: GPL-2.0
 
+CURRENT_DIR := $(abspath $(dir $(realpath $(lastword $(MAKEFILE_LIST)))))
+
+ifeq ($(PRODUCT_PATH),)
+$(error PRODUCT_PATH is not set)
+endif
+
+WEAKUP_DIR := ../../../../../../$(PRODUCT_PATH)/kernel_core/kernel/power
+ifeq ($(wildcard $(CURRENT_DIR)/$(WEAKUP_DIR)),)
+HCS_ABS_DIR := $(abspath $(CURRENT_DIR)/$(WEAKUP_DIR))
+$(error missing $(HCS_ABS_DIR) for standard system)
+endif
+
 ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG
 
 KASAN_SANITIZE_snapshot.o	:= n
@@ -17,4 +29,5 @@ obj-$(CONFIG_PM_WAKELOCKS)	+= wakelock.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
 
+obj-$(CONFIG_SUSPEND)	+= $(WEAKUP_DIR)/
 obj-$(CONFIG_ENERGY_MODEL)	+= energy_model.o
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 119b929dc..41430128d 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -52,6 +52,17 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused)
 }
 DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
 
+static int em_debug_units_show(struct seq_file *s, void *unused)
+{
+	struct em_perf_domain *pd = s->private;
+	char *units = pd->milliwatts ? "milliWatts" : "bogoWatts";
+
+	seq_printf(s, "%s\n", units);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_units);
+
 static void em_debug_create_pd(struct device *dev)
 {
 	struct dentry *d;
@@ -64,6 +75,8 @@ static void em_debug_create_pd(struct device *dev)
 	debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
 			    &em_debug_cpus_fops);
 
+	debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
+
 	/* Create a sub-directory for each performance state */
 	for (i = 0; i < dev->em_pd->nr_perf_states; i++)
 		em_debug_create_ps(&dev->em_pd->table[i], d);
@@ -245,17 +258,24 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
 * @cpus	: Pointer to cpumask_t, which in case of a CPU device is
 *		obligatory. It can be taken from i.e. 'policy->cpus'. For other
 *		type of devices this should be set to NULL.
+ * @milliwatts	: Flag indicating that the power values are in milliWatts or
+ *		in some other scale. It must be set properly.
 *
 * Create Energy Model tables for a performance domain using the callbacks
 * defined in cb.
 *
+ * The @milliwatts is important to set with correct value. Some kernel
+ * sub-systems might rely on this flag and check if all devices in the EM are
+ * using the same scale.
+ *
 * If multiple clients register the same performance domain, all but the first
 * registration will be ignored.
 *
 * Return 0 on success
 */
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
-				struct em_data_callback *cb, cpumask_t *cpus)
+				struct em_data_callback *cb, cpumask_t *cpus,
+				bool milliwatts)
 {
 	unsigned long cap, prev_cap = 0;
 	int cpu, ret;
@@ -308,6 +328,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 	if (ret)
 		goto unlock;
 
+	dev->em_pd->milliwatts = milliwatts;
+
 	em_debug_create_pd(dev);
 	dev_info(dev, "EM: created perf domain\n");
 
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bf640fd61..b13fe337f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -326,7 +326,7 @@ static int create_image(int platform_mode)
 
 	if (!in_suspend) {
 		events_check_enabled = false;
-		clear_free_pages();
+		clear_or_poison_free_pages();
 	}
 
 	platform_leave(platform_mode);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 24f12d534..778bf431e 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -106,7 +106,7 @@ extern int create_basic_memory_bitmaps(void);
 extern void free_basic_memory_bitmaps(void);
 extern int hibernate_preallocate_memory(void);
 
-extern void clear_free_pages(void);
+extern void clear_or_poison_free_pages(void);
 
 /**
 * Auxiliary structure used for reading the snapshot image data and
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 45b054b7b..cc0623080 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -85,18 +85,21 @@ static int try_to_freeze_tasks(bool user_only)
 	elapsed = ktime_sub(end, start);
 	elapsed_msecs = ktime_to_ms(elapsed);
 
-	if (todo) {
+	if (wakeup) {
 		pr_cont("\n");
-		pr_err("Freezing of tasks %s after %d.%03d seconds "
-		       "(%d tasks refusing to freeze, wq_busy=%d):\n",
-		       wakeup ? "aborted" : "failed",
+		pr_err("Freezing of tasks aborted after %d.%03d seconds",
+		       elapsed_msecs / 1000, elapsed_msecs % 1000);
+	} else if (todo) {
+		pr_cont("\n");
+		pr_err("Freezing of tasks failed after %d.%03d seconds"
+		       " (%d tasks refusing to freeze, wq_busy=%d):\n",
 		       elapsed_msecs / 1000, elapsed_msecs % 1000,
 		       todo - wq_busy, wq_busy);
 
 		if (wq_busy)
 			show_workqueue_state();
 
-		if (!wakeup || pm_debug_messages_on) {
+		if (pm_debug_messages_on) {
 			read_lock(&tasklist_lock);
 			for_each_process_thread(g, p) {
 				if (p != current && !freezer_should_skip(p)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 46b1804c1..a3491b29c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1144,7 +1144,15 @@ void free_basic_memory_bitmaps(void)
 	pr_debug("Basic memory bitmaps freed\n");
 }
 
-void clear_free_pages(void)
+static void clear_or_poison_free_page(struct page *page)
+{
+	if (page_poisoning_enabled_static())
+		__kernel_poison_pages(page, 1);
+	else if (want_init_on_free())
+		clear_highpage(page);
+}
+
+void clear_or_poison_free_pages(void)
 {
 	struct memory_bitmap *bm = free_pages_map;
 	unsigned long pfn;
@@ -1152,12 +1160,12 @@ void clear_free_pages(void)
 	if (WARN_ON(!(free_pages_map)))
 		return;
 
-	if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) {
+	if (page_poisoning_enabled() || want_init_on_free()) {
 		memory_bm_position_reset(bm);
 		pfn = memory_bm_next_pfn(bm);
 		while (pfn != BM_END_OF_MAP) {
 			if (pfn_valid(pfn))
-				clear_highpage(pfn_to_page(pfn));
+				clear_or_poison_free_page(pfn_to_page(pfn));
 
 			pfn = memory_bm_next_pfn(bm);
 		}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 32391acc8..545958377 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,6 +30,7 @@
 #include <trace/events/power.h>
 #include <linux/compiler.h>
 #include <linux/moduleparam.h>
+#include <linux/wakeup_reason.h>
 
 #include "power.h"
 
@@ -139,6 +140,7 @@ static void s2idle_loop(void)
 		}
 
 		pm_wakeup_clear(false);
+		clear_wakeup_reasons();
 
 		s2idle_enter();
 	}
@@ -359,6 +361,7 @@ static int suspend_prepare(suspend_state_t state)
 	if (!error)
 		return 0;
 
+	log_suspend_abort_reason("One or more tasks refusing to freeze");
 	suspend_stats.failed_freeze++;
 	dpm_save_failed_step(SUSPEND_FREEZE);
 	pm_notifier_call_chain(PM_POST_SUSPEND);
@@ -388,7 +391,7 @@ void __weak arch_suspend_enable_irqs(void)
 */
 static int suspend_enter(suspend_state_t state, bool *wakeup)
 {
-	int error;
+	int error, last_dev;
 
 	error = platform_suspend_prepare(state);
 	if (error)
@@ -396,7 +399,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 
 	error = dpm_suspend_late(PMSG_SUSPEND);
 	if (error) {
+		last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+		last_dev %= REC_FAILED_NUM;
 		pr_err("late suspend of devices failed\n");
+		log_suspend_abort_reason("late suspend of %s device failed",
+			suspend_stats.failed_devs[last_dev]);
 		goto Platform_finish;
 	}
 	error = platform_suspend_prepare_late(state);
@@ -405,7 +412,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 
 	error = dpm_suspend_noirq(PMSG_SUSPEND);
 	if (error) {
+		last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+		last_dev %= REC_FAILED_NUM;
 		pr_err("noirq suspend of devices failed\n");
+		log_suspend_abort_reason("noirq suspend of %s device failed",
+			suspend_stats.failed_devs[last_dev]);
 		goto Platform_early_resume;
 	}
 	error = platform_suspend_prepare_noirq(state);
@@ -421,8 +432,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 	}
 
 	error = suspend_disable_secondary_cpus();
-	if (error || suspend_test(TEST_CPUS))
+	if (error || suspend_test(TEST_CPUS)) {
+		log_suspend_abort_reason("Disabling non-boot cpus failed");
 		goto Enable_cpus;
+	}
 
 	arch_suspend_disable_irqs();
 	BUG_ON(!irqs_disabled());
@@ -493,6 +506,8 @@ int suspend_devices_and_enter(suspend_state_t state)
 	error = dpm_suspend_start(PMSG_SUSPEND);
 	if (error) {
 		pr_err("Some devices failed to suspend, or early wake event detected\n");
+		log_suspend_abort_reason(
+				"Some devices failed to suspend, or early wake event detected");
 		goto Recover_platform;
 	}
 	suspend_test_finish("suspend devices");
diff --git a/kernel/reboot.c b/kernel/reboot.c
index af6f23d8b..bce629531 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -215,6 +215,27 @@ void do_kernel_restart(char *cmd)
 	atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
 }
 
+#ifdef CONFIG_NO_GKI
+static ATOMIC_NOTIFIER_HEAD(pre_restart_handler_list);
+
+int register_pre_restart_handler(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&pre_restart_handler_list, nb);
+}
+EXPORT_SYMBOL(register_pre_restart_handler);
+
+int unregister_pre_restart_handler(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&pre_restart_handler_list, nb);
+}
+EXPORT_SYMBOL(unregister_pre_restart_handler);
+
+void do_kernel_pre_restart(char *cmd)
+{
+	atomic_notifier_call_chain(&pre_restart_handler_list, reboot_mode, cmd);
+}
+#endif
+
 void migrate_to_reboot_cpu(void)
 {
 	/* The boot cpu is always logical cpu 0 */
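The kernel/reboot.c addition above introduces a second atomic notifier chain that runs before the normal restart handlers. The general shape — register callbacks on a list, then invoke them in order with a mode and a command string — is modeled in plain C below; the types and function names are illustrative only, not the kernel's notifier API (which additionally orders callbacks by priority and is safe against concurrent callers).

#include <stdio.h>

/* Illustrative notifier block: a callback plus a link to the next one. */
struct notifier {
	int (*call)(int mode, const char *cmd);
	struct notifier *next;
};

static struct notifier *pre_restart_chain;

/* Mirrors register_pre_restart_handler(): push onto the chain. */
static void chain_register(struct notifier *nb)
{
	nb->next = pre_restart_chain;
	pre_restart_chain = nb;
}

/* Mirrors do_kernel_pre_restart(): call every handler in turn. */
static void chain_call(int mode, const char *cmd)
{
	for (struct notifier *nb = pre_restart_chain; nb; nb = nb->next)
		nb->call(mode, cmd);
}

static int flush_logs(int mode, const char *cmd)
{
	printf("flush logs before restart (mode=%d, cmd=%s)\n",
	       mode, cmd ? cmd : "");
	return 0;
}

int main(void)
{
	struct notifier nb = { flush_logs, NULL };

	chain_register(&nb);
	chain_call(0, "reboot");
	return 0;
}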
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e2f00be4b..750da3e7c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -47,6 +47,13 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
+#ifdef CONFIG_SCHEDSTATS
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
+#endif
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -660,7 +667,7 @@ int get_nohz_timer_target(void)
 	int i, cpu = smp_processor_id(), default_cpu = -1;
 	struct sched_domain *sd;
 
-	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+	if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
 		if (!idle_cpu(cpu))
 			return cpu;
 		default_cpu = cpu;
@@ -680,8 +687,25 @@ int get_nohz_timer_target(void)
 		}
 	}
 
-	if (default_cpu == -1)
-		default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+	if (default_cpu == -1) {
+		for_each_cpu_and(i, cpu_active_mask,
+				 housekeeping_cpumask(HK_FLAG_TIMER)) {
+			if (cpu == i)
+				continue;
+
+			if (!idle_cpu(i)) {
+				cpu = i;
+				goto unlock;
+			}
+		}
+
+		/* no active, not-idle, housekeeping CPU found. */
+		default_cpu = cpumask_any(cpu_active_mask);
+
+		if (unlikely(default_cpu >= nr_cpu_ids))
+			goto unlock;
+	}
+
 	cpu = default_cpu;
 unlock:
 	rcu_read_unlock();
@@ -1770,7 +1794,10 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
 	if (is_per_cpu_kthread(p))
 		return cpu_online(cpu);
 
-	return cpu_active(cpu);
+	if (!cpu_active(cpu))
+		return false;
+
+	return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
 }
 
 /*
@@ -2433,10 +2460,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		}
 		fallthrough;
 	case possible:
-		do_set_cpus_allowed(p, cpu_possible_mask);
+		do_set_cpus_allowed(p, task_cpu_possible_mask(p));
 		state = fail;
 		break;
-
 	case fail:
 #ifdef CONFIG_CPU_ISOLATION_OPT
 		allow_iso = true;
@@ -2627,6 +2653,9 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 {
 	int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
 
+	if (wake_flags & WF_SYNC)
+		en_flags |= ENQUEUE_WAKEUP_SYNC;
+
 	lockdep_assert_held(&rq->lock);
 
 	if (p->sched_contributes_to_load)
@@ -3019,6 +3048,19 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	if (!(p->state & state))
 		goto unlock;
 
+#ifdef CONFIG_FREEZER
+	/*
+	 * If we're going to wake up a thread which may be frozen, then
+	 * we can only do so if we have an active CPU which is capable of
+	 * running it. This may not be the case when resuming from suspend,
+	 * as the secondary CPUs may not yet be back online. See __thaw_task()
+	 * for the actual wakeup.
+	 */
+	if (unlikely(frozen_or_skipped(p)) &&
+	    !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p)))
+		goto unlock;
+#endif
+
 	trace_sched_waking(p);
 
 	/* We're going to change ->state: */
@@ -5004,7 +5046,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
-	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC));
 	return try_to_wake_up(curr->private, mode, wake_flags);
 }
 EXPORT_SYMBOL(default_wake_function);
@@ -5713,16 +5755,19 @@ int sched_setscheduler(struct task_struct *p, int policy,
 {
 	return _sched_setscheduler(p, policy, param, true);
 }
+EXPORT_SYMBOL_GPL(sched_setscheduler);
 
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
 	return __sched_setscheduler(p, attr, true, true);
 }
+EXPORT_SYMBOL_GPL(sched_setattr);
 
 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
 {
 	return __sched_setscheduler(p, attr, false, true);
 }
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
 
 /**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
@@ -5742,6 +5787,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 {
 	return _sched_setscheduler(p, policy, param, false);
 }
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
 
 /*
 * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
@@ -7044,6 +7090,11 @@ void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf,
 	 */
 	update_rq_clock(rq);
 
+#ifdef CONFIG_SCHED_DEBUG
+	/* note the clock update in orf */
+	orf.clock_update_flags |= RQCF_UPDATED;
+#endif
+
 	for (;;) {
 		/*
 		 * There's this thread running, bail when that's the only
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 81e43a56d..4df7f4e68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -86,6 +86,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
 unsigned int sysctl_sched_min_granularity			= 750000ULL;
+EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
 static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
 
 /*
@@ -10686,9 +10687,20 @@ void nohz_balance_enter_idle(int cpu)
 
 	SCHED_WARN_ON(cpu != smp_processor_id());
 
-	/* If this CPU is going down, then nothing needs to be done: */
-	if (!cpu_active(cpu))
+	if (!cpu_active(cpu)) {
+		/*
+		 * A CPU can be paused while it is idle with its tick
+		 * stopped. nohz_balance_exit_idle() should be called
+		 * from the local CPU, so it can't be called during
+		 * pause. This results in paused CPU participating in
+		 * the nohz idle balance, which should be avoided.
+		 *
+		 * When the paused CPU exits idle and enters again,
+		 * exempt the paused CPU from nohz_balance_exit_idle.
+		 */
+		nohz_balance_exit_idle(rq);
 		return;
+	}
 
 	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2593a733c..69afd8d1e 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -450,6 +450,7 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
 {
 	raw_spin_unlock_irq(&rq->lock);
 	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+
 	dump_stack();
 	raw_spin_lock_irq(&rq->lock);
 }
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index d2a655643..b5837e277 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -75,6 +75,7 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 	loads[1] = (avenrun[1] + offset) << shift;
 	loads[2] = (avenrun[2] + offset) << shift;
 }
+EXPORT_SYMBOL_GPL(get_avenrun);
 
 long calc_load_fold_active(struct rq *this_rq, long adjust)
 {
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 2c613e1cf..e2890b677 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,6 +28,42 @@
 #include "sched.h"
 #include "pelt.h"
 
+int pelt_load_avg_period = PELT32_LOAD_AVG_PERIOD;
+int pelt_load_avg_max = PELT32_LOAD_AVG_MAX;
+const u32 *pelt_runnable_avg_yN_inv = pelt32_runnable_avg_yN_inv;
+
+static int __init set_pelt(char *str)
+{
+	int rc, num;
+
+	rc = kstrtoint(str, 0, &num);
+	if (rc) {
+		pr_err("%s: kstrtoint failed. rc=%d\n", __func__, rc);
+		return 0;
+	}
+
+	switch (num) {
+	case PELT8_LOAD_AVG_PERIOD:
+		pelt_load_avg_period = PELT8_LOAD_AVG_PERIOD;
+		pelt_load_avg_max = PELT8_LOAD_AVG_MAX;
+		pelt_runnable_avg_yN_inv = pelt8_runnable_avg_yN_inv;
+		pr_info("PELT half life is set to %dms\n", num);
+		break;
+	case PELT32_LOAD_AVG_PERIOD:
+		pelt_load_avg_period = PELT32_LOAD_AVG_PERIOD;
+		pelt_load_avg_max = PELT32_LOAD_AVG_MAX;
+		pelt_runnable_avg_yN_inv = pelt32_runnable_avg_yN_inv;
+		pr_info("PELT half life is set to %dms\n", num);
+		break;
+	default:
+		pr_err("Default PELT half life is 32ms\n");
+	}
+
+	return 0;
+}
+
+early_param("pelt", set_pelt);
+
 /*
 * Approximate:
 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
@@ -54,7 +90,7 @@ static u64 decay_load(u64 val, u64 n)
 		local_n %= LOAD_AVG_PERIOD;
 	}
 
-	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+	val = mul_u64_u32_shr(val, pelt_runnable_avg_yN_inv[local_n], 32);
 	return val;
 }
 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index b7f38f3ad..b0e6c438f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -550,7 +550,7 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	return now + group->poll_min_period;
 }
 
-/* Schedule polling if it's not already scheduled. */
+/* Schedule polling if it's not already scheduled or forced. */
 static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
 {
 	struct task_struct *task;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d5c00fa02..689cc1a63 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1390,6 +1390,27 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 	enqueue_top_rt_rq(&rq->rt);
 }
 
+#ifdef CONFIG_SMP
+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
+					bool sync)
+{
+	/*
+	 * If the waker is CFS, then an RT sync wakeup would preempt the waker
+	 * and force it to run for a likely small time after the RT wakee is
+	 * done. So, only honor RT sync wakeups from RT wakers.
+	 */
+	return sync && task_has_rt_policy(rq->curr) &&
+		p->prio <= rq->rt.highest_prio.next &&
+		rq->rt.rt_nr_running <= 2;
+}
+#else
+static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
+					bool sync)
+{
+	return 0;
+}
+#endif
+
 /*
 * Adding/removing a task to/from a priority array:
 */
@@ -1397,6 +1418,7 @@ static void
 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
+	bool sync = !!(flags & ENQUEUE_WAKEUP_SYNC);
 
 	if (flags & ENQUEUE_WAKEUP)
 		rt_se->timeout = 0;
@@ -1404,7 +1426,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 	enqueue_rt_entity(rt_se, flags);
 	walt_inc_cumulative_runnable_avg(rq, p);
 
-	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+	if (!task_current(rq, p) && p->nr_cpus_allowed > 1 &&
+	    !should_honor_rt_sync(rq, p, sync))
 		enqueue_pushable_task(rq, p);
 }
 
@@ -1461,7 +1484,10 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 {
 	struct task_struct *curr;
 	struct rq *rq;
+	struct rq *this_cpu_rq;
 	bool test;
+	bool sync = !!(flags & WF_SYNC);
+	int this_cpu;
 
 	/* For anything but wake ups, just return the task_cpu */
 	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
@@ -1471,6 +1497,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 
 	rcu_read_lock();
 	curr = READ_ONCE(rq->curr); /* unlocked access */
+	this_cpu = smp_processor_id();
+	this_cpu_rq = cpu_rq(this_cpu);
 
 	/*
 	 * If the current task on @p's runqueue is an RT task, then
@@ -1502,6 +1530,15 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 		 unlikely(rt_task(curr)) &&
 		 (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
 
+	/*
+	 * Respect the sync flag as long as the task can run on this CPU.
+	 */
+	if (should_honor_rt_sync(this_cpu_rq, p, sync) &&
+	    cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
+		cpu = this_cpu;
+		goto out_unlock;
+	}
+
 	if (test || !rt_task_fits_capacity(p, cpu)) {
 		int target = find_lowest_rq(p);
 
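The pelt= boot parameter added in kernel/sched/pelt.c above selects between the default 32 ms tables and the 8 ms tables added to sched-pelt.h just below. The practical effect is plain arithmetic: a PELT signal decays by the factor 0.5^(n/half_life) after n milliseconds, which decay_load() approximates with the precomputed y^n inverse tables. The sketch below only evaluates that formula for the two half-lives (build with -lm); it illustrates the maths, not the kernel's fixed-point implementation.

#include <math.h>
#include <stdio.h>

/* Remaining fraction of a PELT signal after n_ms, for a given half-life. */
static double pelt_decay(double n_ms, double half_life_ms)
{
	return pow(0.5, n_ms / half_life_ms);
}

int main(void)
{
	/* After 32 ms: the 32 ms table keeps half the signal,
	 * the 8 ms table keeps only 1/16 of it. */
	printf("32ms half-life: %.4f\n", pelt_decay(32.0, 32.0));	/* 0.5000 */
	printf(" 8ms half-life: %.4f\n", pelt_decay(32.0, 8.0));	/* 0.0625 */
	return 0;
}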
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index c529706be..92a6875bc 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Generated by Documentation/scheduler/sched-pelt; do not modify. */
 
-static const u32 runnable_avg_yN_inv[] __maybe_unused = {
+static const u32 pelt32_runnable_avg_yN_inv[] __maybe_unused = {
 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
@@ -10,5 +10,20 @@ static const u32 runnable_avg_yN_inv[] __maybe_unused = {
 	0x85aac367, 0x82cd8698,
 };
 
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742
+#define PELT32_LOAD_AVG_PERIOD 32
+#define PELT32_LOAD_AVG_MAX 47742
+
+static const u32 pelt8_runnable_avg_yN_inv[] __maybe_unused = {
+	0xffffffff, 0xeac0c6e6, 0xd744fcc9, 0xc5672a10,
+	0xb504f333, 0xa5fed6a9, 0x9837f050, 0x8b95c1e3,
+};
+
+#define PELT8_LOAD_AVG_PERIOD 8
+#define PELT8_LOAD_AVG_MAX 12336
+
+extern const u32 *pelt_runnable_avg_yN_inv;
+extern int pelt_load_avg_period;
+extern int pelt_load_avg_max;
+
+#define LOAD_AVG_PERIOD pelt_load_avg_period
+#define LOAD_AVG_MAX pelt_load_avg_max
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 592c8653c..7c02fed0a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1913,6 +1913,8 @@ extern const int sched_latency_to_weight[40];
 #define ENQUEUE_MIGRATED	0x00
 #endif
 
+#define ENQUEUE_WAKEUP_SYNC	0x80
+
 #define RETRY_TASK		((void *)-1UL)
 
 struct sched_class {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9191e5daa..58d840c62 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -5,6 +5,9 @@
 #include "sched.h"
 
 DEFINE_MUTEX(sched_domains_mutex);
+#ifdef CONFIG_LOCKDEP
+EXPORT_SYMBOL_GPL(sched_domains_mutex);
+#endif
 
 /* Protected by sched_domains_mutex: */
 static cpumask_var_t sched_domains_tmpmask;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index a55642aa3..6911bbca0 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -396,7 +396,8 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en
 }
 EXPORT_SYMBOL(finish_wait);
 
-int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
+__sched int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
+				     int sync, void *key)
 {
 	int ret = default_wake_function(wq_entry, mode, sync, key);
 
@@ -432,7 +433,7 @@ static inline bool is_kthread_should_stop(void)
 *     } smp_mb(); // C
 *     remove_wait_queue(&wq_head, &wait);  wq_entry->flags |= WQ_FLAG_WOKEN;
 */
-long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
+__sched long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, long timeout)
 {
 	/*
 	 * The below executes an smp_mb(), which matches with the full barrier
@@ -457,7 +458,8 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
 }
 EXPORT_SYMBOL(wait_woken);
 
-int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
+__sched int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
+				int sync, void *key)
 {
 	/* Pairs with the smp_store_mb() in wait_woken(). */
 	smp_mb(); /* C */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 3e6740207..033fa94f3 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -27,6 +27,7 @@
 * Structure to determine completion condition and record errors.  May
 * be shared by works on different cpus.
 */
+
 struct cpu_stop_done {
 	atomic_t		nr_todo;	/* nr left to execute */
 	int			ret;		/* collected return value */