diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 209e6567c..d47c0212e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -128,21 +128,6 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -static void bpf_map_write_active_inc(struct bpf_map *map) -{ - atomic64_inc(&map->writecnt); -} - -static void bpf_map_write_active_dec(struct bpf_map *map) -{ - atomic64_dec(&map->writecnt); -} - -bool bpf_map_write_active(const struct bpf_map *map) -{ - return atomic64_read(&map->writecnt) != 0; -} - static u32 bpf_map_value_size(struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || @@ -604,8 +589,11 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - if (vma->vm_flags & VM_MAYWRITE) - bpf_map_write_active_inc(map); + if (vma->vm_flags & VM_MAYWRITE) { + mutex_lock(&map->freeze_mutex); + map->writecnt++; + mutex_unlock(&map->freeze_mutex); + } } /* called for all unmapped memory region (including initial) */ @@ -613,8 +601,11 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - if (vma->vm_flags & VM_MAYWRITE) - bpf_map_write_active_dec(map); + if (vma->vm_flags & VM_MAYWRITE) { + mutex_lock(&map->freeze_mutex); + map->writecnt--; + mutex_unlock(&map->freeze_mutex); + } } static const struct vm_operations_struct bpf_map_default_vmops = { @@ -664,7 +655,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) goto out; if (vma->vm_flags & VM_MAYWRITE) - bpf_map_write_active_inc(map); + map->writecnt++; out: mutex_unlock(&map->freeze_mutex); return err; @@ -1096,7 +1087,6 @@ static int map_update_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; @@ -1138,7 +1128,6 @@ static int map_update_elem(union bpf_attr *attr) free_key: kfree(key); err_put: - bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1161,7 +1150,6 @@ static int map_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; @@ -1192,7 +1180,6 @@ static int map_delete_elem(union bpf_attr *attr) out: kfree(key); err_put: - bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1497,7 +1484,6 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; @@ -1539,7 +1525,6 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) free_key: kfree(key); err_put: - bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1566,7 +1551,8 @@ static int map_freeze(const union bpf_attr *attr) } mutex_lock(&map->freeze_mutex); - if (bpf_map_write_active(map)) { + + if (map->writecnt) { err = -EBUSY; goto err_put; } @@ -3991,9 +3977,6 @@ static int bpf_map_do_batch(const union bpf_attr *attr, union bpf_attr __user *uattr, int cmd) { - bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || - cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; - bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; struct bpf_map *map; int err, ufd; struct fd f; @@ -4006,13 +3989,16 @@ static int bpf_map_do_batch(const union bpf_attr *attr, map = 
__bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (has_write) - bpf_map_write_active_inc(map); - if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { + + if ((cmd == BPF_MAP_LOOKUP_BATCH || + cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) && + !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } - if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + + if (cmd != BPF_MAP_LOOKUP_BATCH && + !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -4025,9 +4011,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr, BPF_DO_BATCH(map->ops->map_update_batch); else BPF_DO_BATCH(map->ops->map_delete_batch); + err_put: - if (has_write) - bpf_map_write_active_dec(map); fdput(f); return err; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8de769745..3e854b91f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3492,22 +3492,7 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) static bool bpf_map_is_rdonly(const struct bpf_map *map) { - /* A map is considered read-only if the following condition are true: - * - * 1) BPF program side cannot change any of the map content. The - * BPF_F_RDONLY_PROG flag is throughout the lifetime of a map - * and was set at map creation time. - * 2) The map value(s) have been initialized from user space by a - * loader and then "frozen", such that no new map update/delete - * operations from syscall side are possible for the rest of - * the map's lifetime from that point onwards. - * 3) Any parallel/pending map update/delete operations from syscall - * side have been completed. Only after that point, it's safe to - * assume that map value(s) are immutable. - */ - return (map->map_flags & BPF_F_RDONLY_PROG) && - READ_ONCE(map->frozen) && - !bpf_map_write_active(map); + return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen; } static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index d9f8a464b..cddc908bc 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -518,7 +518,8 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && #endif !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + !uid_eq(cred->euid, tcred->suid) && + !ns_capable(tcred->user_ns, CAP_SYS_NICE)) ret = -EACCES; put_cred(tcred); if (ret) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 3173fe473..f4d318733 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -334,6 +334,8 @@ static struct cpuset top_cpuset = { * guidelines for accessing subsystem state in kernel/cgroup.c */ +static DEFINE_MUTEX(cpuset_mutex); + DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); void cpuset_read_lock(void) @@ -351,9 +353,9 @@ static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; /* - * CPU / memory hotplug is handled asynchronously. + * CPU / memory hotplug is handled asynchronously + * for hotplug, synchronously for resume_cpus */ -static void cpuset_hotplug_workfn(struct work_struct *work); static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); @@ -373,18 +375,29 @@ static inline bool is_in_v2_mode(void) } /* - * Return in pmask the portion of a cpusets's cpus_allowed that - * are online. 
If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. + * Return in pmask the portion of a task's cpusets's cpus_allowed that + * are online and are capable of running the task. If none are found, + * walk up the cpuset hierarchy until we find one that does have some + * appropriate cpus. * * One way or another, we guarantee to return some non-empty subset - * of cpu_online_mask. + * of cpu_active_mask. * * Call with callback_lock or cpuset_mutex held. */ -static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) +static void guarantee_online_cpus(struct task_struct *tsk, + struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + struct cpuset *cs; + + if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask))) + cpumask_copy(pmask, cpu_active_mask); + + rcu_read_lock(); + cs = task_cs(tsk); + + while (!cpumask_intersects(cs->effective_cpus, pmask)) { cs = parent_cs(cs); if (unlikely(!cs)) { /* @@ -394,11 +407,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ - cpumask_copy(pmask, cpu_online_mask); - return; + goto out_unlock; } } - cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); + cpumask_and(pmask, pmask, cs->effective_cpus); + +out_unlock: + rcu_read_unlock(); } /* @@ -489,6 +504,9 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (cs && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) goto free_three; + if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) + goto free_three; + return 0; free_three: @@ -939,7 +957,7 @@ static void rebuild_root_domains(void) struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); @@ -999,8 +1017,7 @@ static void rebuild_sched_domains_locked(void) struct cpuset *cs; int ndoms; - lockdep_assert_cpus_held(); - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); /* * If we have raced with CPU hotplug, return early to avoid @@ -1051,12 +1068,18 @@ static void rebuild_sched_domains_locked(void) void rebuild_sched_domains(void) { get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); } +static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p, + const struct cpumask *new_mask) +{ + return set_cpus_allowed_ptr(p, new_mask); +} + /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -1072,7 +1095,7 @@ static void update_tasks_cpumask(struct cpuset *cs) css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) - set_cpus_allowed_ptr(task, cs->effective_cpus); + update_cpus_allowed(cs, task, cs->effective_cpus); css_task_iter_end(&it); } @@ -1096,8 +1119,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus, cpumask_and(new_cpus, new_cpus, cs->cpus_requested); cpumask_and(new_cpus, new_cpus, cpu_active_mask); } else { - cpumask_and(new_cpus, cs->cpus_requested, - parent->effective_cpus); + cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); } } @@ -1162,7 +1184,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, int new_prs; bool part_error = false; /* Partition error? */ - percpu_rwsem_assert_held(&cpuset_rwsem); + lockdep_assert_held(&cpuset_mutex); /* * The parent must be a partition root. @@ -2158,7 +2180,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); cs = css_cs(css); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; @@ -2182,7 +2204,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) cs->attach_in_progress++; ret = 0; out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); return ret; } @@ -2192,9 +2214,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); css_cs(css)->attach_in_progress--; - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* @@ -2217,22 +2239,20 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - percpu_down_write(&cpuset_rwsem); - - /* prepare for attach */ - if (cs == &top_cpuset) - cpumask_copy(cpus_attach, cpu_possible_mask); - else - guarantee_online_cpus(cs, cpus_attach); + mutex_lock(&cpuset_mutex); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cgroup_taskset_for_each(task, css, tset) { + if (cs != &top_cpuset) + guarantee_online_cpus(task, cpus_attach); + else + cpumask_copy(cpus_attach, task_cpu_possible_mask(task)); /* * can_attach beforehand should guarantee that this doesn't * fail. 
TODO: have a better way to handle failure here */ - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); @@ -2271,7 +2291,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) if (!cs->attach_in_progress) wake_up(&cpuset_attach_wq); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* The various types of files and directories in a cpuset file system */ @@ -2303,7 +2323,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, int retval = 0; get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -2339,7 +2359,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); return retval; } @@ -2352,7 +2372,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, int retval = -ENODEV; get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2365,7 +2385,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); return retval; } @@ -2406,7 +2426,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, flush_work(&cpuset_hotplug_work); get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2430,7 +2450,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_cpuset(trialcs); out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); @@ -2563,13 +2583,13 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, css_get(&cs->css); get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; retval = update_prstate(cs, val); out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); css_put(&cs->css); return retval ?: nbytes; @@ -2777,7 +2797,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) return 0; get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) @@ -2829,7 +2849,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); return 0; } @@ -2850,7 +2870,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) struct cpuset *cs = css_cs(css); get_online_cpus(); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); if (is_partition_root(cs)) update_prstate(cs, 0); @@ -2869,7 +2889,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); put_online_cpus(); } @@ -2882,7 +2902,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) static void 
cpuset_bind(struct cgroup_subsys_state *root_css) { - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { @@ -2895,7 +2915,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) } spin_unlock_irq(&callback_lock); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /* @@ -2908,7 +2928,6 @@ static void cpuset_fork(struct task_struct *task) if (task_css_is_root(task, cpuset_cgrp_id)) return; - set_cpus_allowed_ptr(task, current->cpus_ptr); task->mems_allowed = current->mems_allowed; } @@ -2937,7 +2956,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = { int __init cpuset_init(void) { - BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)); @@ -3012,7 +3030,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); /* * Move tasks to the nearest ancestor with execution resources, @@ -3022,7 +3040,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, if (is_empty) remove_tasks_in_empty_cpuset(cs); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); } static void @@ -3072,14 +3090,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* * We have raced with task attaching. We wait until attaching * is finished, so we won't attach a task to an empty cpuset. */ if (cs->attach_in_progress) { - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); goto retry; } @@ -3151,7 +3169,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); } /** @@ -3170,7 +3188,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +void cpuset_hotplug_workfn(struct work_struct *work) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -3181,7 +3199,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; - percpu_down_write(&cpuset_rwsem); + mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ cpumask_copy(&new_cpus, cpu_active_mask); @@ -3238,7 +3256,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) update_tasks_nodemask(&top_cpuset); } - percpu_up_write(&cpuset_rwsem); + mutex_unlock(&cpuset_mutex); /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { @@ -3282,6 +3300,7 @@ void cpuset_wait_for_hotplug(void) { flush_work(&cpuset_hotplug_work); } +EXPORT_SYMBOL_GPL(cpuset_wait_for_hotplug); /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. 
@@ -3337,11 +3356,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); - guarantee_online_cpus(task_cs(tsk), pmask); + guarantee_online_cpus(tsk, pmask); rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); } - +EXPORT_SYMBOL_GPL(cpuset_cpus_allowed); /** * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. * @tsk: pointer to task_struct with which the scheduler is struggling @@ -3356,9 +3375,17 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + const struct cpumask *cs_mask; + rcu_read_lock(); - do_set_cpus_allowed(tsk, is_in_v2_mode() ? - task_cs(tsk)->cpus_allowed : cpu_possible_mask); + cs_mask = task_cs(tsk)->cpus_allowed; + + if (!is_in_v2_mode() || !cpumask_subset(cs_mask, possible_mask)) + goto unlock; /* select_fallback_rq will try harder */ + + do_set_cpus_allowed(tsk, cs_mask); +unlock: rcu_read_unlock(); /* diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 08236798d..081d026f1 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -479,3 +479,4 @@ struct cgroup_subsys freezer_cgrp_subsys = { .fork = freezer_fork, .legacy_cftypes = files, }; +EXPORT_SYMBOL_GPL(freezer_cgrp_subsys); diff --git a/kernel/cpu.c b/kernel/cpu.c index 4b27158d3..b076ccd1b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -39,6 +39,8 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS + #include "smpboot.h" /** @@ -274,11 +276,13 @@ void cpu_maps_update_begin(void) { mutex_lock(&cpu_add_remove_lock); } +EXPORT_SYMBOL_GPL(cpu_maps_update_begin); void cpu_maps_update_done(void) { mutex_unlock(&cpu_add_remove_lock); } +EXPORT_SYMBOL_GPL(cpu_maps_update_done); /* * If set, cpu_up and cpu_down will return -EBUSY and do nothing. 
@@ -1053,7 +1057,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int prev_state, ret = 0; - if (num_online_cpus() == 1) + if (num_active_cpus() == 1 && cpu_active(cpu)) return -EBUSY; if (!cpu_present(cpu)) diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index e2999a070..79cb6d063 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -200,6 +200,7 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) irq_gc_unlock(gc); return 0; } +EXPORT_SYMBOL_GPL(irq_gc_set_wake); static u32 irq_readl_be(void __iomem *addr) { diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 5899260a8..466eaa74f 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,5 +1,17 @@ # SPDX-License-Identifier: GPL-2.0 +CURRENT_DIR := $(abspath $(dir $(realpath $(lastword $(MAKEFILE_LIST))))) + +ifeq ($(PRODUCT_PATH),) +$(error PRODUCT_PATH is not set) +endif + +WEAKUP_DIR := ../../../../../../$(PRODUCT_PATH)/kernel_core/kernel/power +ifeq ($(wildcard $(CURRENT_DIR)/$(WEAKUP_DIR)),) +HCS_ABS_DIR := $(abspath $(CURRENT_DIR)/$(WEAKUP_DIR)) +$(error missing $(HCS_ABS_DIR) for standard system) +endif + ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG KASAN_SANITIZE_snapshot.o := n @@ -17,4 +29,5 @@ obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o +obj-$(CONFIG_SUSPEND) += $(WEAKUP_DIR)/ obj-$(CONFIG_ENERGY_MODEL) += energy_model.o diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 119b929dc..41430128d 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -52,6 +52,17 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); +static int em_debug_units_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + char *units = pd->milliwatts ? "milliWatts" : "bogoWatts"; + + seq_printf(s, "%s\n", units); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_units); + static void em_debug_create_pd(struct device *dev) { struct dentry *d; @@ -64,6 +75,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, &em_debug_cpus_fops); + debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); + /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) em_debug_create_ps(&dev->em_pd->table[i], d); @@ -245,17 +258,24 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * @cpus : Pointer to cpumask_t, which in case of a CPU device is * obligatory. It can be taken from i.e. 'policy->cpus'. For other * type of devices this should be set to NULL. + * @milliwatts : Flag indicating whether the power values are in milliWatts or + * in some other scale. It must be set correctly. * * Create Energy Model tables for a performance domain using the callbacks * defined in cb. + * + * It is important to set @milliwatts to the correct value. Some kernel + * sub-systems might rely on this flag to check that all devices in the EM + * use the same scale. + * * If multiple clients register the same performance domain, all but the first * registration will be ignored. 
* * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *cpus) + struct em_data_callback *cb, cpumask_t *cpus, + bool milliwatts) { unsigned long cap, prev_cap = 0; int cpu, ret; @@ -308,6 +328,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, if (ret) goto unlock; + dev->em_pd->milliwatts = milliwatts; + em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index bf640fd61..b13fe337f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -326,7 +326,7 @@ static int create_image(int platform_mode) if (!in_suspend) { events_check_enabled = false; - clear_free_pages(); + clear_or_poison_free_pages(); } platform_leave(platform_mode); diff --git a/kernel/power/power.h b/kernel/power/power.h index 24f12d534..778bf431e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -106,7 +106,7 @@ extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); -extern void clear_free_pages(void); +extern void clear_or_poison_free_pages(void); /** * Auxiliary structure used for reading the snapshot image data and diff --git a/kernel/power/process.c b/kernel/power/process.c index 45b054b7b..cc0623080 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -85,18 +85,21 @@ static int try_to_freeze_tasks(bool user_only) elapsed = ktime_sub(end, start); elapsed_msecs = ktime_to_ms(elapsed); - if (todo) { + if (wakeup) { pr_cont("\n"); - pr_err("Freezing of tasks %s after %d.%03d seconds " - "(%d tasks refusing to freeze, wq_busy=%d):\n", - wakeup ? "aborted" : "failed", + pr_err("Freezing of tasks aborted after %d.%03d seconds", + elapsed_msecs / 1000, elapsed_msecs % 1000); + } else if (todo) { + pr_cont("\n"); + pr_err("Freezing of tasks failed after %d.%03d seconds" + " (%d tasks refusing to freeze, wq_busy=%d):\n", elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); if (wq_busy) show_workqueue_state(); - if (!wakeup || pm_debug_messages_on) { + if (pm_debug_messages_on) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 46b1804c1..a3491b29c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1144,7 +1144,15 @@ void free_basic_memory_bitmaps(void) pr_debug("Basic memory bitmaps freed\n"); } -void clear_free_pages(void) +static void clear_or_poison_free_page(struct page *page) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, 1); + else if (want_init_on_free()) + clear_highpage(page); +} + +void clear_or_poison_free_pages(void) { struct memory_bitmap *bm = free_pages_map; unsigned long pfn; @@ -1152,12 +1160,12 @@ void clear_free_pages(void) if (WARN_ON(!(free_pages_map))) return; - if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) { + if (page_poisoning_enabled() || want_init_on_free()) { memory_bm_position_reset(bm); pfn = memory_bm_next_pfn(bm); while (pfn != BM_END_OF_MAP) { if (pfn_valid(pfn)) - clear_highpage(pfn_to_page(pfn)); + clear_or_poison_free_page(pfn_to_page(pfn)); pfn = memory_bm_next_pfn(bm); } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 32391acc8..545958377 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,6 +30,7 @@ #include #include #include 
+#include #include "power.h" @@ -139,6 +140,7 @@ static void s2idle_loop(void) } pm_wakeup_clear(false); + clear_wakeup_reasons(); s2idle_enter(); } @@ -359,6 +361,7 @@ static int suspend_prepare(suspend_state_t state) if (!error) return 0; + log_suspend_abort_reason("One or more tasks refusing to freeze"); suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); pm_notifier_call_chain(PM_POST_SUSPEND); @@ -388,7 +391,7 @@ void __weak arch_suspend_enable_irqs(void) */ static int suspend_enter(suspend_state_t state, bool *wakeup) { - int error; + int error, last_dev; error = platform_suspend_prepare(state); if (error) @@ -396,7 +399,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; pr_err("late suspend of devices failed\n"); + log_suspend_abort_reason("late suspend of %s device failed", + suspend_stats.failed_devs[last_dev]); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -405,7 +412,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; pr_err("noirq suspend of devices failed\n"); + log_suspend_abort_reason("noirq suspend of %s device failed", + suspend_stats.failed_devs[last_dev]); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); @@ -421,8 +432,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) } error = suspend_disable_secondary_cpus(); - if (error || suspend_test(TEST_CPUS)) + if (error || suspend_test(TEST_CPUS)) { + log_suspend_abort_reason("Disabling non-boot cpus failed"); goto Enable_cpus; + } arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); @@ -493,6 +506,8 @@ int suspend_devices_and_enter(suspend_state_t state) error = dpm_suspend_start(PMSG_SUSPEND); if (error) { pr_err("Some devices failed to suspend, or early wake event detected\n"); + log_suspend_abort_reason( + "Some devices failed to suspend, or early wake event detected"); goto Recover_platform; } suspend_test_finish("suspend devices"); diff --git a/kernel/reboot.c b/kernel/reboot.c index af6f23d8b..bce629531 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -215,6 +215,27 @@ void do_kernel_restart(char *cmd) atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd); } +#ifdef CONFIG_NO_GKI +static ATOMIC_NOTIFIER_HEAD(pre_restart_handler_list); + +int register_pre_restart_handler(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&pre_restart_handler_list, nb); +} +EXPORT_SYMBOL(register_pre_restart_handler); + +int unregister_pre_restart_handler(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&pre_restart_handler_list, nb); +} +EXPORT_SYMBOL(unregister_pre_restart_handler); + +void do_kernel_pre_restart(char *cmd) +{ + atomic_notifier_call_chain(&pre_restart_handler_list, reboot_mode, cmd); +} +#endif + void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e2f00be4b..750da3e7c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -47,6 +47,13 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); 
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking); +#ifdef CONFIG_SCHEDSTATS +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked); +#endif DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -660,7 +667,7 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; - if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; @@ -680,8 +687,25 @@ int get_nohz_timer_target(void) } } - if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + if (default_cpu == -1) { + for_each_cpu_and(i, cpu_active_mask, + housekeeping_cpumask(HK_FLAG_TIMER)) { + if (cpu == i) + continue; + + if (!idle_cpu(i)) { + cpu = i; + goto unlock; + } + } + + /* no active, not-idle, housekeeping CPU found. */ + default_cpu = cpumask_any(cpu_active_mask); + + if (unlikely(default_cpu >= nr_cpu_ids)) + goto unlock; + } + cpu = default_cpu; unlock: rcu_read_unlock(); @@ -1770,7 +1794,10 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) if (is_per_cpu_kthread(p)) return cpu_online(cpu); - return cpu_active(cpu); + if (!cpu_active(cpu)) + return false; + + return cpumask_test_cpu(cpu, task_cpu_possible_mask(p)); } /* @@ -2433,10 +2460,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: - do_set_cpus_allowed(p, cpu_possible_mask); + do_set_cpus_allowed(p, task_cpu_possible_mask(p)); state = fail; break; - case fail: #ifdef CONFIG_CPU_ISOLATION_OPT allow_iso = true; @@ -2627,6 +2653,9 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, { int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; + if (wake_flags & WF_SYNC) + en_flags |= ENQUEUE_WAKEUP_SYNC; + lockdep_assert_held(&rq->lock); if (p->sched_contributes_to_load) @@ -3019,6 +3048,19 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (!(p->state & state)) goto unlock; +#ifdef CONFIG_FREEZER + /* + * If we're going to wake up a thread which may be frozen, then + * we can only do so if we have an active CPU which is capable of + * running it. This may not be the case when resuming from suspend, + * as the secondary CPUs may not yet be back online. See __thaw_task() + * for the actual wakeup. 
+ */ + if (unlikely(frozen_or_skipped(p)) && + !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p))) + goto unlock; +#endif + trace_sched_waking(p); /* We're going to change ->state: */ @@ -5004,7 +5046,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -5713,16 +5755,19 @@ int sched_setscheduler(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, true); } +EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setattr(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, true, true); } +EXPORT_SYMBOL_GPL(sched_setattr); int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. @@ -5742,6 +5787,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); /* * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally @@ -7044,6 +7090,11 @@ void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, */ update_rq_clock(rq); +#ifdef CONFIG_SCHED_DEBUG + /* note the clock update in orf */ + orf.clock_update_flags |= RQCF_UPDATED; +#endif + for (;;) { /* * There's this thread running, bail when that's the only diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 81e43a56d..4df7f4e68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -86,6 +86,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 750000ULL; +EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity); static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* @@ -10686,9 +10687,20 @@ void nohz_balance_enter_idle(int cpu) SCHED_WARN_ON(cpu != smp_processor_id()); - /* If this CPU is going down, then nothing needs to be done: */ - if (!cpu_active(cpu)) + if (!cpu_active(cpu)) { + /* + * A CPU can be paused while it is idle with its tick + * stopped. nohz_balance_exit_idle() should be called + * from the local CPU, so it can't be called during + * pause. This results in the paused CPU participating in + * the nohz idle balance, which should be avoided. + * + * When the paused CPU exits idle and enters again, + * exempt the paused CPU from nohz_balance_exit_idle. 
+ */ + nohz_balance_exit_idle(rq); return; + } /* Spare idle load balancing on CPUs that don't want to be disturbed: */ if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2593a733c..69afd8d1e 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -450,6 +450,7 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) { raw_spin_unlock_irq(&rq->lock); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); raw_spin_lock_irq(&rq->lock); } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index d2a655643..b5837e277 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -75,6 +75,7 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) loads[1] = (avenrun[1] + offset) << shift; loads[2] = (avenrun[2] + offset) << shift; } +EXPORT_SYMBOL_GPL(get_avenrun); long calc_load_fold_active(struct rq *this_rq, long adjust) { diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 2c613e1cf..e2890b677 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -28,6 +28,42 @@ #include "sched.h" #include "pelt.h" +int pelt_load_avg_period = PELT32_LOAD_AVG_PERIOD; +int pelt_load_avg_max = PELT32_LOAD_AVG_MAX; +const u32 *pelt_runnable_avg_yN_inv = pelt32_runnable_avg_yN_inv; + +static int __init set_pelt(char *str) +{ + int rc, num; + + rc = kstrtoint(str, 0, &num); + if (rc) { + pr_err("%s: kstrtoint failed. rc=%d\n", __func__, rc); + return 0; + } + + switch (num) { + case PELT8_LOAD_AVG_PERIOD: + pelt_load_avg_period = PELT8_LOAD_AVG_PERIOD; + pelt_load_avg_max = PELT8_LOAD_AVG_MAX; + pelt_runnable_avg_yN_inv = pelt8_runnable_avg_yN_inv; + pr_info("PELT half life is set to %dms\n", num); + break; + case PELT32_LOAD_AVG_PERIOD: + pelt_load_avg_period = PELT32_LOAD_AVG_PERIOD; + pelt_load_avg_max = PELT32_LOAD_AVG_MAX; + pelt_runnable_avg_yN_inv = pelt32_runnable_avg_yN_inv; + pr_info("PELT half life is set to %dms\n", num); + break; + default: + pr_err("Default PELT half life is 32ms\n"); + } + + return 0; +} + +early_param("pelt", set_pelt); + /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) @@ -54,7 +90,7 @@ static u64 decay_load(u64 val, u64 n) local_n %= LOAD_AVG_PERIOD; } - val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); + val = mul_u64_u32_shr(val, pelt_runnable_avg_yN_inv[local_n], 32); return val; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index b7f38f3ad..b0e6c438f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -550,7 +550,7 @@ static u64 update_triggers(struct psi_group *group, u64 now) return now + group->poll_min_period; } -/* Schedule polling if it's not already scheduled. */ +/* Schedule polling if it's not already scheduled or forced. */ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) { struct task_struct *task; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d5c00fa02..689cc1a63 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1390,6 +1390,27 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) enqueue_top_rt_rq(&rq->rt); } +#ifdef CONFIG_SMP +static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p, + bool sync) +{ + /* + * If the waker is CFS, then an RT sync wakeup would preempt the waker + * and force it to run for a likely small time after the RT wakee is + * done. So, only honor RT sync wakeups from RT wakers. 
+ */ + return sync && task_has_rt_policy(rq->curr) && + p->prio <= rq->rt.highest_prio.next && + rq->rt.rt_nr_running <= 2; +} +#else +static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p, + bool sync) +{ + return 0; +} +#endif + /* * Adding/removing a task to/from a priority array: */ @@ -1397,6 +1418,7 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; + bool sync = !!(flags & ENQUEUE_WAKEUP_SYNC); if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; @@ -1404,7 +1426,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags); walt_inc_cumulative_runnable_avg(rq, p); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1 && + !should_honor_rt_sync(rq, p, sync)) enqueue_pushable_task(rq, p); } @@ -1461,7 +1484,10 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + struct rq *this_cpu_rq; bool test; + bool sync = !!(flags & WF_SYNC); + int this_cpu; /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1471,6 +1497,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + this_cpu = smp_processor_id(); + this_cpu_rq = cpu_rq(this_cpu); /* * If the current task on @p's runqueue is an RT task, then @@ -1502,6 +1530,15 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + /* + * Respect the sync flag as long as the task can run on this CPU. + */ + if (should_honor_rt_sync(this_cpu_rq, p, sync) && + cpumask_test_cpu(this_cpu, p->cpus_ptr)) { + cpu = this_cpu; + goto out_unlock; + } + if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index c529706be..92a6875bc 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by Documentation/scheduler/sched-pelt; do not modify. 
*/ -static const u32 runnable_avg_yN_inv[] __maybe_unused = { +static const u32 pelt32_runnable_avg_yN_inv[] __maybe_unused = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, @@ -10,5 +10,20 @@ static const u32 runnable_avg_yN_inv[] __maybe_unused = { 0x85aac367, 0x82cd8698, }; -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 +#define PELT32_LOAD_AVG_PERIOD 32 +#define PELT32_LOAD_AVG_MAX 47742 + +static const u32 pelt8_runnable_avg_yN_inv[] __maybe_unused = { + 0xffffffff, 0xeac0c6e6, 0xd744fcc9, 0xc5672a10, + 0xb504f333, 0xa5fed6a9, 0x9837f050, 0x8b95c1e3, +}; + +#define PELT8_LOAD_AVG_PERIOD 8 +#define PELT8_LOAD_AVG_MAX 12336 + +extern const u32 *pelt_runnable_avg_yN_inv; +extern int pelt_load_avg_period; +extern int pelt_load_avg_max; + +#define LOAD_AVG_PERIOD pelt_load_avg_period +#define LOAD_AVG_MAX pelt_load_avg_max diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 592c8653c..7c02fed0a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1913,6 +1913,8 @@ extern const int sched_latency_to_weight[40]; #define ENQUEUE_MIGRATED 0x00 #endif +#define ENQUEUE_WAKEUP_SYNC 0x80 + #define RETRY_TASK ((void *)-1UL) struct sched_class { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9191e5daa..58d840c62 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -5,6 +5,9 @@ #include "sched.h" DEFINE_MUTEX(sched_domains_mutex); +#ifdef CONFIG_LOCKDEP +EXPORT_SYMBOL_GPL(sched_domains_mutex); +#endif /* Protected by sched_domains_mutex: */ static cpumask_var_t sched_domains_tmpmask; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index a55642aa3..6911bbca0 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -396,7 +396,8 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en } EXPORT_SYMBOL(finish_wait); -int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) +__sched int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, + int sync, void *key) { int ret = default_wake_function(wq_entry, mode, sync, key); @@ -432,7 +433,7 @@ static inline bool is_kthread_should_stop(void) * } smp_mb(); // C * remove_wait_queue(&wq_head, &wait); wq_entry->flags |= WQ_FLAG_WOKEN; */ -long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) +__sched long wait_woken(struct wait_queue_entry *wq_entry, unsigned int mode, long timeout) { /* * The below executes an smp_mb(), which matches with the full barrier @@ -457,7 +458,8 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) } EXPORT_SYMBOL(wait_woken); -int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) +__sched int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, + int sync, void *key) { /* Pairs with the smp_store_mb() in wait_woken(). */ smp_mb(); /* C */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 3e6740207..033fa94f3 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -27,6 +27,7 @@ * Structure to determine completion condition and record errors. May * be shared by works on different cpus. */ + struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ int ret; /* collected return value */
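Note (not part of the patch): the kernel/bpf/syscall.c hunks above go back to counting writable mmaps in map->writecnt under freeze_mutex, and map_freeze() refuses to freeze while that count is non-zero. A minimal userspace sketch of the behaviour being preserved, using only raw bpf(2) UAPI calls (error handling trimmed; BPF_MAP_FREEZE is expected to return EBUSY while the writable mapping is alive):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/bpf.h>

static long sys_bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int main(void)
{
	union bpf_attr attr;
	void *mem;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_ARRAY;
	attr.key_size = 4;
	attr.value_size = 8;
	attr.max_entries = 1;
	attr.map_flags = BPF_F_MMAPABLE;
	fd = sys_bpf(BPF_MAP_CREATE, &attr);

	/* Writable mapping: bpf_map_mmap() bumps map->writecnt. */
	mem = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = fd;
	if (sys_bpf(BPF_MAP_FREEZE, &attr) < 0)
		printf("freeze while mapped: %s\n", strerror(errno));

	munmap(mem, 4096);	/* vm close drops writecnt back to zero */

	if (sys_bpf(BPF_MAP_FREEZE, &attr) == 0)
		printf("freeze after munmap: ok\n");
	return 0;
}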
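Note (not part of the patch): em_dev_register_perf_domain() now takes the extra 'milliwatts' argument documented above. The sketch below is only an illustration of a caller passing it, assuming the v5.10-era struct em_data_callback layout; est_power(), my_register_em(), the state count and the power numbers are all invented for the example.

#include <linux/energy_model.h>

/* Fake power model: derive a made-up mW figure from the requested frequency.
 * A real driver would clamp *freq to a supported OPP and look up its power. */
static int est_power(unsigned long *power, unsigned long *freq,
		     struct device *dev)
{
	*power = *freq / 1000 + 10;
	return 0;
}

static struct em_data_callback em_cb = EM_DATA_CB(est_power);

static int my_register_em(struct device *dev, cpumask_t *cpus)
{
	/* Last argument is the new flag: true because est_power() claims to
	 * report real milliWatts rather than bogoWatts. */
	return em_dev_register_perf_domain(dev, 4, &em_cb, cpus, true);
}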
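Note (not part of the patch): kernel/reboot.c gains a pre-restart notifier chain behind CONFIG_NO_GKI. A sketch of a hypothetical module hooking it, assuming the register/unregister declarations end up in <linux/reboot.h> (the header change is not shown in this diff):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_pre_restart(struct notifier_block *nb, unsigned long mode,
			  void *cmd)
{
	/* 'mode' is reboot_mode, 'cmd' is the optional restart command. */
	pr_info("about to restart (cmd=%s)\n", cmd ? (char *)cmd : "none");
	return NOTIFY_DONE;
}

static struct notifier_block my_pre_restart_nb = {
	.notifier_call = my_pre_restart,
};

static int __init my_init(void)
{
	return register_pre_restart_handler(&my_pre_restart_nb);
}

static void __exit my_exit(void)
{
	unregister_pre_restart_handler(&my_pre_restart_nb);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");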