1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Kernel-based Virtual Machine (KVM) Hypervisor
4 *
5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
7 *
8 * Authors:
9 * Avi Kivity <avi@qumranet.com>
10 * Yaniv Kamay <yaniv@qumranet.com>
11 */
12
13 #include <kvm/iodev.h>
14
15 #include <linux/kvm_host.h>
16 #include <linux/kvm.h>
17 #include <linux/module.h>
18 #include <linux/errno.h>
19 #include <linux/percpu.h>
20 #include <linux/mm.h>
21 #include <linux/miscdevice.h>
22 #include <linux/vmalloc.h>
23 #include <linux/reboot.h>
24 #include <linux/debugfs.h>
25 #include <linux/highmem.h>
26 #include <linux/file.h>
27 #include <linux/syscore_ops.h>
28 #include <linux/cpu.h>
29 #include <linux/sched/signal.h>
30 #include <linux/sched/mm.h>
31 #include <linux/sched/stat.h>
32 #include <linux/cpumask.h>
33 #include <linux/smp.h>
34 #include <linux/anon_inodes.h>
35 #include <linux/profile.h>
36 #include <linux/kvm_para.h>
37 #include <linux/pagemap.h>
38 #include <linux/mman.h>
39 #include <linux/swap.h>
40 #include <linux/bitops.h>
41 #include <linux/spinlock.h>
42 #include <linux/compat.h>
43 #include <linux/srcu.h>
44 #include <linux/hugetlb.h>
45 #include <linux/slab.h>
46 #include <linux/sort.h>
47 #include <linux/bsearch.h>
48 #include <linux/io.h>
49 #include <linux/lockdep.h>
50 #include <linux/kthread.h>
51 #include <linux/suspend.h>
52
53 #include <asm/processor.h>
54 #include <asm/ioctl.h>
55 #include <linux/uaccess.h>
56
57 #include "coalesced_mmio.h"
58 #include "async_pf.h"
59 #include "kvm_mm.h"
60 #include "vfio.h"
61
62 #include <trace/events/ipi.h>
63
64 #define CREATE_TRACE_POINTS
65 #include <trace/events/kvm.h>
66
67 #include <linux/kvm_dirty_ring.h>
68
69
70 /* Worst case buffer size needed for holding an integer. */
71 #define ITOA_MAX_LEN 12
72
73 MODULE_AUTHOR("Qumranet");
74 MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
75 MODULE_LICENSE("GPL");
76
77 /* Architectures should define their poll value according to the halt latency */
78 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
79 module_param(halt_poll_ns, uint, 0644);
80 EXPORT_SYMBOL_GPL(halt_poll_ns);
81
82 /* Default doubles per-vcpu halt_poll_ns. */
83 unsigned int halt_poll_ns_grow = 2;
84 module_param(halt_poll_ns_grow, uint, 0644);
85 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
86
87 /* The start value to grow halt_poll_ns from */
88 unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
89 module_param(halt_poll_ns_grow_start, uint, 0644);
90 EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
91
92 /* Default halves per-vcpu halt_poll_ns. */
93 unsigned int halt_poll_ns_shrink = 2;
94 module_param(halt_poll_ns_shrink, uint, 0644);
95 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
96
97 /*
98 * Ordering of locks:
99 *
100 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
101 */
102
103 DEFINE_MUTEX(kvm_lock);
104 LIST_HEAD(vm_list);
105
106 static struct kmem_cache *kvm_vcpu_cache;
107
108 static __read_mostly struct preempt_ops kvm_preempt_ops;
109 static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
110
111 static struct dentry *kvm_debugfs_dir;
112
113 static const struct file_operations stat_fops_per_vm;
114
115 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
116 unsigned long arg);
117 #ifdef CONFIG_KVM_COMPAT
118 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
119 unsigned long arg);
120 #define KVM_COMPAT(c) .compat_ioctl = (c)
121 #else
122 /*
123 * For architectures that don't implement a compat infrastructure,
124 * adopt a double line of defense:
125 * - Prevent a compat task from opening /dev/kvm
126 * - If the open has been done by a 64bit task, and the KVM fd
127 * passed to a compat task, let the ioctls fail.
128 */
kvm_no_compat_ioctl(struct file * file,unsigned int ioctl,unsigned long arg)129 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
130 unsigned long arg) { return -EINVAL; }
131
kvm_no_compat_open(struct inode * inode,struct file * file)132 static int kvm_no_compat_open(struct inode *inode, struct file *file)
133 {
134 return is_compat_task() ? -ENODEV : 0;
135 }
136 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
137 .open = kvm_no_compat_open
138 #endif
139 static int kvm_enable_virtualization(void);
140 static void kvm_disable_virtualization(void);
141
142 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
143
144 #define KVM_EVENT_CREATE_VM 0
145 #define KVM_EVENT_DESTROY_VM 1
146 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
147 static unsigned long long kvm_createvm_count;
148 static unsigned long long kvm_active_vms;
149
150 static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
151
kvm_arch_guest_memory_reclaimed(struct kvm * kvm)152 __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
153 {
154 }
155
kvm_is_zone_device_page(struct page * page)156 bool kvm_is_zone_device_page(struct page *page)
157 {
158 /*
159 * The metadata used by is_zone_device_page() to determine whether or
160 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
161 * the device has been pinned, e.g. by get_user_pages(). WARN if the
162 * page_count() is zero to help detect bad usage of this helper.
163 */
164 if (WARN_ON_ONCE(!page_count(page)))
165 return false;
166
167 return is_zone_device_page(page);
168 }
169
170 /*
171 * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
172 * page, NULL otherwise. Note, the list of refcounted PG_reserved page types
173 * is likely incomplete, it has been compiled purely through people wanting to
174 * back guest with a certain type of memory and encountering issues.
175 */
kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)176 struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
177 {
178 struct page *page;
179
180 if (!pfn_valid(pfn))
181 return NULL;
182
183 page = pfn_to_page(pfn);
184 if (!PageReserved(page))
185 return page;
186
187 /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
188 if (is_zero_pfn(pfn))
189 return page;
190
191 /*
192 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
193 * perspective they are "normal" pages, albeit with slightly different
194 * usage rules.
195 */
196 if (kvm_is_zone_device_page(page))
197 return page;
198
199 return NULL;
200 }
201
202 /*
203 * Switches to specified vcpu, until a matching vcpu_put()
204 */
vcpu_load(struct kvm_vcpu * vcpu)205 void vcpu_load(struct kvm_vcpu *vcpu)
206 {
207 int cpu = get_cpu();
208
209 __this_cpu_write(kvm_running_vcpu, vcpu);
210 preempt_notifier_register(&vcpu->preempt_notifier);
211 kvm_arch_vcpu_load(vcpu, cpu);
212 put_cpu();
213 }
214 EXPORT_SYMBOL_GPL(vcpu_load);
215
vcpu_put(struct kvm_vcpu * vcpu)216 void vcpu_put(struct kvm_vcpu *vcpu)
217 {
218 preempt_disable();
219 kvm_arch_vcpu_put(vcpu);
220 preempt_notifier_unregister(&vcpu->preempt_notifier);
221 __this_cpu_write(kvm_running_vcpu, NULL);
222 preempt_enable();
223 }
224 EXPORT_SYMBOL_GPL(vcpu_put);
225
226 /* TODO: merge with kvm_arch_vcpu_should_kick */
kvm_request_needs_ipi(struct kvm_vcpu * vcpu,unsigned req)227 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
228 {
229 int mode = kvm_vcpu_exiting_guest_mode(vcpu);
230
231 /*
232 * We need to wait for the VCPU to reenable interrupts and get out of
233 * READING_SHADOW_PAGE_TABLES mode.
234 */
235 if (req & KVM_REQUEST_WAIT)
236 return mode != OUTSIDE_GUEST_MODE;
237
238 /*
239 * Need to kick a running VCPU, but otherwise there is nothing to do.
240 */
241 return mode == IN_GUEST_MODE;
242 }
243
ack_kick(void * _completed)244 static void ack_kick(void *_completed)
245 {
246 }
247
kvm_kick_many_cpus(struct cpumask * cpus,bool wait)248 static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
249 {
250 if (cpumask_empty(cpus))
251 return false;
252
253 smp_call_function_many(cpus, ack_kick, NULL, wait);
254 return true;
255 }
256
kvm_make_vcpu_request(struct kvm_vcpu * vcpu,unsigned int req,struct cpumask * tmp,int current_cpu)257 static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
258 struct cpumask *tmp, int current_cpu)
259 {
260 int cpu;
261
262 if (likely(!(req & KVM_REQUEST_NO_ACTION)))
263 __kvm_make_request(req, vcpu);
264
265 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
266 return;
267
268 /*
269 * Note, the vCPU could get migrated to a different pCPU at any point
270 * after kvm_request_needs_ipi(), which could result in sending an IPI
271 * to the previous pCPU. But, that's OK because the purpose of the IPI
272 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
273 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
274 * after this point is also OK, as the requirement is only that KVM wait
275 * for vCPUs that were reading SPTEs _before_ any changes were
276 * finalized. See kvm_vcpu_kick() for more details on handling requests.
277 */
278 if (kvm_request_needs_ipi(vcpu, req)) {
279 cpu = READ_ONCE(vcpu->cpu);
280 if (cpu != -1 && cpu != current_cpu)
281 __cpumask_set_cpu(cpu, tmp);
282 }
283 }
284
kvm_make_vcpus_request_mask(struct kvm * kvm,unsigned int req,unsigned long * vcpu_bitmap)285 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
286 unsigned long *vcpu_bitmap)
287 {
288 struct kvm_vcpu *vcpu;
289 struct cpumask *cpus;
290 int i, me;
291 bool called;
292
293 me = get_cpu();
294
295 cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
296 cpumask_clear(cpus);
297
298 for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
299 vcpu = kvm_get_vcpu(kvm, i);
300 if (!vcpu)
301 continue;
302 kvm_make_vcpu_request(vcpu, req, cpus, me);
303 }
304
305 called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
306 put_cpu();
307
308 return called;
309 }
310
kvm_make_all_cpus_request(struct kvm * kvm,unsigned int req)311 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
312 {
313 struct kvm_vcpu *vcpu;
314 struct cpumask *cpus;
315 unsigned long i;
316 bool called;
317 int me;
318
319 me = get_cpu();
320
321 cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
322 cpumask_clear(cpus);
323
324 kvm_for_each_vcpu(i, vcpu, kvm)
325 kvm_make_vcpu_request(vcpu, req, cpus, me);
326
327 called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
328 put_cpu();
329
330 return called;
331 }
332 EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
333
kvm_flush_remote_tlbs(struct kvm * kvm)334 void kvm_flush_remote_tlbs(struct kvm *kvm)
335 {
336 ++kvm->stat.generic.remote_tlb_flush_requests;
337
338 /*
339 * We want to publish modifications to the page tables before reading
340 * mode. Pairs with a memory barrier in arch-specific code.
341 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
342 * and smp_mb in walk_shadow_page_lockless_begin/end.
343 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
344 *
345 * There is already an smp_mb__after_atomic() before
346 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
347 * barrier here.
348 */
349 if (!kvm_arch_flush_remote_tlbs(kvm)
350 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
351 ++kvm->stat.generic.remote_tlb_flush;
352 }
353 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
354
kvm_flush_remote_tlbs_range(struct kvm * kvm,gfn_t gfn,u64 nr_pages)355 void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
356 {
357 if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
358 return;
359
360 /*
361 * Fall back to a flushing entire TLBs if the architecture range-based
362 * TLB invalidation is unsupported or can't be performed for whatever
363 * reason.
364 */
365 kvm_flush_remote_tlbs(kvm);
366 }
367
kvm_flush_remote_tlbs_memslot(struct kvm * kvm,const struct kvm_memory_slot * memslot)368 void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
369 const struct kvm_memory_slot *memslot)
370 {
371 /*
372 * All current use cases for flushing the TLBs for a specific memslot
373 * are related to dirty logging, and many do the TLB flush out of
374 * mmu_lock. The interaction between the various operations on memslot
375 * must be serialized by slots_locks to ensure the TLB flush from one
376 * operation is observed by any other operation on the same memslot.
377 */
378 lockdep_assert_held(&kvm->slots_lock);
379 kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
380 }
381
kvm_flush_shadow_all(struct kvm * kvm)382 static void kvm_flush_shadow_all(struct kvm *kvm)
383 {
384 kvm_arch_flush_shadow_all(kvm);
385 kvm_arch_guest_memory_reclaimed(kvm);
386 }
387
388 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache * mc,gfp_t gfp_flags)389 static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
390 gfp_t gfp_flags)
391 {
392 void *page;
393
394 gfp_flags |= mc->gfp_zero;
395
396 if (mc->kmem_cache)
397 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
398
399 page = (void *)__get_free_page(gfp_flags);
400 if (page && mc->init_value)
401 memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64));
402 return page;
403 }
404
__kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache * mc,int capacity,int min)405 int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
406 {
407 gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
408 void *obj;
409
410 if (mc->nobjs >= min)
411 return 0;
412
413 if (unlikely(!mc->objects)) {
414 if (WARN_ON_ONCE(!capacity))
415 return -EIO;
416
417 /*
418 * Custom init values can be used only for page allocations,
419 * and obviously conflict with __GFP_ZERO.
420 */
421 if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
422 return -EIO;
423
424 mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
425 if (!mc->objects)
426 return -ENOMEM;
427
428 mc->capacity = capacity;
429 }
430
431 /* It is illegal to request a different capacity across topups. */
432 if (WARN_ON_ONCE(mc->capacity != capacity))
433 return -EIO;
434
435 while (mc->nobjs < mc->capacity) {
436 obj = mmu_memory_cache_alloc_obj(mc, gfp);
437 if (!obj)
438 return mc->nobjs >= min ? 0 : -ENOMEM;
439 mc->objects[mc->nobjs++] = obj;
440 }
441 return 0;
442 }
443
kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache * mc,int min)444 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
445 {
446 return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
447 }
448
kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache * mc)449 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
450 {
451 return mc->nobjs;
452 }
453
kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache * mc)454 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
455 {
456 while (mc->nobjs) {
457 if (mc->kmem_cache)
458 kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
459 else
460 free_page((unsigned long)mc->objects[--mc->nobjs]);
461 }
462
463 kvfree(mc->objects);
464
465 mc->objects = NULL;
466 mc->capacity = 0;
467 }
468
kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache * mc)469 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
470 {
471 void *p;
472
473 if (WARN_ON(!mc->nobjs))
474 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
475 else
476 p = mc->objects[--mc->nobjs];
477 BUG_ON(!p);
478 return p;
479 }
480 #endif
481
kvm_vcpu_init(struct kvm_vcpu * vcpu,struct kvm * kvm,unsigned id)482 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
483 {
484 mutex_init(&vcpu->mutex);
485 vcpu->cpu = -1;
486 vcpu->kvm = kvm;
487 vcpu->vcpu_id = id;
488 vcpu->pid = NULL;
489 #ifndef __KVM_HAVE_ARCH_WQP
490 rcuwait_init(&vcpu->wait);
491 #endif
492 kvm_async_pf_vcpu_init(vcpu);
493
494 kvm_vcpu_set_in_spin_loop(vcpu, false);
495 kvm_vcpu_set_dy_eligible(vcpu, false);
496 vcpu->preempted = false;
497 vcpu->ready = false;
498 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
499 vcpu->last_used_slot = NULL;
500
501 /* Fill the stats id string for the vcpu */
502 snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
503 task_pid_nr(current), id);
504 }
505
kvm_vcpu_destroy(struct kvm_vcpu * vcpu)506 static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
507 {
508 kvm_arch_vcpu_destroy(vcpu);
509 kvm_dirty_ring_free(&vcpu->dirty_ring);
510
511 /*
512 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
513 * the vcpu->pid pointer, and at destruction time all file descriptors
514 * are already gone.
515 */
516 put_pid(rcu_dereference_protected(vcpu->pid, 1));
517
518 free_page((unsigned long)vcpu->run);
519 kmem_cache_free(kvm_vcpu_cache, vcpu);
520 }
521
kvm_destroy_vcpus(struct kvm * kvm)522 void kvm_destroy_vcpus(struct kvm *kvm)
523 {
524 unsigned long i;
525 struct kvm_vcpu *vcpu;
526
527 kvm_for_each_vcpu(i, vcpu, kvm) {
528 kvm_vcpu_destroy(vcpu);
529 xa_erase(&kvm->vcpu_array, i);
530 }
531
532 atomic_set(&kvm->online_vcpus, 0);
533 }
534 EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
535
536 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
mmu_notifier_to_kvm(struct mmu_notifier * mn)537 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
538 {
539 return container_of(mn, struct kvm, mmu_notifier);
540 }
541
542 typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
543
544 typedef void (*on_lock_fn_t)(struct kvm *kvm);
545
546 struct kvm_mmu_notifier_range {
547 /*
548 * 64-bit addresses, as KVM notifiers can operate on host virtual
549 * addresses (unsigned long) and guest physical addresses (64-bit).
550 */
551 u64 start;
552 u64 end;
553 union kvm_mmu_notifier_arg arg;
554 gfn_handler_t handler;
555 on_lock_fn_t on_lock;
556 bool flush_on_ret;
557 bool may_block;
558 };
559
560 /*
561 * The inner-most helper returns a tuple containing the return value from the
562 * arch- and action-specific handler, plus a flag indicating whether or not at
563 * least one memslot was found, i.e. if the handler found guest memory.
564 *
565 * Note, most notifiers are averse to booleans, so even though KVM tracks the
566 * return from arch code as a bool, outer helpers will cast it to an int. :-(
567 */
568 typedef struct kvm_mmu_notifier_return {
569 bool ret;
570 bool found_memslot;
571 } kvm_mn_ret_t;
572
573 /*
574 * Use a dedicated stub instead of NULL to indicate that there is no callback
575 * function/handler. The compiler technically can't guarantee that a real
576 * function will have a non-zero address, and so it will generate code to
577 * check for !NULL, whereas comparing against a stub will be elided at compile
578 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
579 */
kvm_null_fn(void)580 static void kvm_null_fn(void)
581 {
582
583 }
584 #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
585
586 /* Iterate over each memslot intersecting [start, last] (inclusive) range */
587 #define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \
588 for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
589 node; \
590 node = interval_tree_iter_next(node, start, last)) \
591
__kvm_handle_hva_range(struct kvm * kvm,const struct kvm_mmu_notifier_range * range)592 static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
593 const struct kvm_mmu_notifier_range *range)
594 {
595 struct kvm_mmu_notifier_return r = {
596 .ret = false,
597 .found_memslot = false,
598 };
599 struct kvm_gfn_range gfn_range;
600 struct kvm_memory_slot *slot;
601 struct kvm_memslots *slots;
602 int i, idx;
603
604 if (WARN_ON_ONCE(range->end <= range->start))
605 return r;
606
607 /* A null handler is allowed if and only if on_lock() is provided. */
608 if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
609 IS_KVM_NULL_FN(range->handler)))
610 return r;
611
612 idx = srcu_read_lock(&kvm->srcu);
613
614 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
615 struct interval_tree_node *node;
616
617 slots = __kvm_memslots(kvm, i);
618 kvm_for_each_memslot_in_hva_range(node, slots,
619 range->start, range->end - 1) {
620 unsigned long hva_start, hva_end;
621
622 slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
623 hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
624 hva_end = min_t(unsigned long, range->end,
625 slot->userspace_addr + (slot->npages << PAGE_SHIFT));
626
627 /*
628 * To optimize for the likely case where the address
629 * range is covered by zero or one memslots, don't
630 * bother making these conditional (to avoid writes on
631 * the second or later invocation of the handler).
632 */
633 gfn_range.arg = range->arg;
634 gfn_range.may_block = range->may_block;
635 /*
636 * HVA-based notifications aren't relevant to private
637 * mappings as they don't have a userspace mapping.
638 */
639 gfn_range.attr_filter = KVM_FILTER_SHARED;
640
641 /*
642 * {gfn(page) | page intersects with [hva_start, hva_end)} =
643 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
644 */
645 gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
646 gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
647 gfn_range.slot = slot;
648
649 if (!r.found_memslot) {
650 r.found_memslot = true;
651 KVM_MMU_LOCK(kvm);
652 if (!IS_KVM_NULL_FN(range->on_lock))
653 range->on_lock(kvm);
654
655 if (IS_KVM_NULL_FN(range->handler))
656 goto mmu_unlock;
657 }
658 r.ret |= range->handler(kvm, &gfn_range);
659 }
660 }
661
662 if (range->flush_on_ret && r.ret)
663 kvm_flush_remote_tlbs(kvm);
664
665 mmu_unlock:
666 if (r.found_memslot)
667 KVM_MMU_UNLOCK(kvm);
668
669 srcu_read_unlock(&kvm->srcu, idx);
670
671 return r;
672 }
673
kvm_handle_hva_range(struct mmu_notifier * mn,unsigned long start,unsigned long end,gfn_handler_t handler)674 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
675 unsigned long start,
676 unsigned long end,
677 gfn_handler_t handler)
678 {
679 struct kvm *kvm = mmu_notifier_to_kvm(mn);
680 const struct kvm_mmu_notifier_range range = {
681 .start = start,
682 .end = end,
683 .handler = handler,
684 .on_lock = (void *)kvm_null_fn,
685 .flush_on_ret = true,
686 .may_block = false,
687 };
688
689 return __kvm_handle_hva_range(kvm, &range).ret;
690 }
691
kvm_handle_hva_range_no_flush(struct mmu_notifier * mn,unsigned long start,unsigned long end,gfn_handler_t handler)692 static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
693 unsigned long start,
694 unsigned long end,
695 gfn_handler_t handler)
696 {
697 struct kvm *kvm = mmu_notifier_to_kvm(mn);
698 const struct kvm_mmu_notifier_range range = {
699 .start = start,
700 .end = end,
701 .handler = handler,
702 .on_lock = (void *)kvm_null_fn,
703 .flush_on_ret = false,
704 .may_block = false,
705 };
706
707 return __kvm_handle_hva_range(kvm, &range).ret;
708 }
709
kvm_mmu_invalidate_begin(struct kvm * kvm)710 void kvm_mmu_invalidate_begin(struct kvm *kvm)
711 {
712 lockdep_assert_held_write(&kvm->mmu_lock);
713 /*
714 * The count increase must become visible at unlock time as no
715 * spte can be established without taking the mmu_lock and
716 * count is also read inside the mmu_lock critical section.
717 */
718 kvm->mmu_invalidate_in_progress++;
719
720 if (likely(kvm->mmu_invalidate_in_progress == 1)) {
721 kvm->mmu_invalidate_range_start = INVALID_GPA;
722 kvm->mmu_invalidate_range_end = INVALID_GPA;
723 }
724 }
725
kvm_mmu_invalidate_range_add(struct kvm * kvm,gfn_t start,gfn_t end)726 void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
727 {
728 lockdep_assert_held_write(&kvm->mmu_lock);
729
730 WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
731
732 if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
733 kvm->mmu_invalidate_range_start = start;
734 kvm->mmu_invalidate_range_end = end;
735 } else {
736 /*
737 * Fully tracking multiple concurrent ranges has diminishing
738 * returns. Keep things simple and just find the minimal range
739 * which includes the current and new ranges. As there won't be
740 * enough information to subtract a range after its invalidate
741 * completes, any ranges invalidated concurrently will
742 * accumulate and persist until all outstanding invalidates
743 * complete.
744 */
745 kvm->mmu_invalidate_range_start =
746 min(kvm->mmu_invalidate_range_start, start);
747 kvm->mmu_invalidate_range_end =
748 max(kvm->mmu_invalidate_range_end, end);
749 }
750 }
751
kvm_mmu_unmap_gfn_range(struct kvm * kvm,struct kvm_gfn_range * range)752 bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
753 {
754 kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
755 return kvm_unmap_gfn_range(kvm, range);
756 }
757
kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier * mn,const struct mmu_notifier_range * range)758 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
759 const struct mmu_notifier_range *range)
760 {
761 struct kvm *kvm = mmu_notifier_to_kvm(mn);
762 const struct kvm_mmu_notifier_range hva_range = {
763 .start = range->start,
764 .end = range->end,
765 .handler = kvm_mmu_unmap_gfn_range,
766 .on_lock = kvm_mmu_invalidate_begin,
767 .flush_on_ret = true,
768 .may_block = mmu_notifier_range_blockable(range),
769 };
770
771 trace_kvm_unmap_hva_range(range->start, range->end);
772
773 /*
774 * Prevent memslot modification between range_start() and range_end()
775 * so that conditionally locking provides the same result in both
776 * functions. Without that guarantee, the mmu_invalidate_in_progress
777 * adjustments will be imbalanced.
778 *
779 * Pairs with the decrement in range_end().
780 */
781 spin_lock(&kvm->mn_invalidate_lock);
782 kvm->mn_active_invalidate_count++;
783 spin_unlock(&kvm->mn_invalidate_lock);
784
785 /*
786 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
787 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
788 * each cache's lock. There are relatively few caches in existence at
789 * any given time, and the caches themselves can check for hva overlap,
790 * i.e. don't need to rely on memslot overlap checks for performance.
791 * Because this runs without holding mmu_lock, the pfn caches must use
792 * mn_active_invalidate_count (see above) instead of
793 * mmu_invalidate_in_progress.
794 */
795 gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
796
797 /*
798 * If one or more memslots were found and thus zapped, notify arch code
799 * that guest memory has been reclaimed. This needs to be done *after*
800 * dropping mmu_lock, as x86's reclaim path is slooooow.
801 */
802 if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot)
803 kvm_arch_guest_memory_reclaimed(kvm);
804
805 return 0;
806 }
807
kvm_mmu_invalidate_end(struct kvm * kvm)808 void kvm_mmu_invalidate_end(struct kvm *kvm)
809 {
810 lockdep_assert_held_write(&kvm->mmu_lock);
811
812 /*
813 * This sequence increase will notify the kvm page fault that
814 * the page that is going to be mapped in the spte could have
815 * been freed.
816 */
817 kvm->mmu_invalidate_seq++;
818 smp_wmb();
819 /*
820 * The above sequence increase must be visible before the
821 * below count decrease, which is ensured by the smp_wmb above
822 * in conjunction with the smp_rmb in mmu_invalidate_retry().
823 */
824 kvm->mmu_invalidate_in_progress--;
825 KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);
826
827 /*
828 * Assert that at least one range was added between start() and end().
829 * Not adding a range isn't fatal, but it is a KVM bug.
830 */
831 WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
832 }
833
kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier * mn,const struct mmu_notifier_range * range)834 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
835 const struct mmu_notifier_range *range)
836 {
837 struct kvm *kvm = mmu_notifier_to_kvm(mn);
838 const struct kvm_mmu_notifier_range hva_range = {
839 .start = range->start,
840 .end = range->end,
841 .handler = (void *)kvm_null_fn,
842 .on_lock = kvm_mmu_invalidate_end,
843 .flush_on_ret = false,
844 .may_block = mmu_notifier_range_blockable(range),
845 };
846 bool wake;
847
848 __kvm_handle_hva_range(kvm, &hva_range);
849
850 /* Pairs with the increment in range_start(). */
851 spin_lock(&kvm->mn_invalidate_lock);
852 if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
853 --kvm->mn_active_invalidate_count;
854 wake = !kvm->mn_active_invalidate_count;
855 spin_unlock(&kvm->mn_invalidate_lock);
856
857 /*
858 * There can only be one waiter, since the wait happens under
859 * slots_lock.
860 */
861 if (wake)
862 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
863 }
864
kvm_mmu_notifier_clear_flush_young(struct mmu_notifier * mn,struct mm_struct * mm,unsigned long start,unsigned long end)865 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
866 struct mm_struct *mm,
867 unsigned long start,
868 unsigned long end)
869 {
870 trace_kvm_age_hva(start, end);
871
872 return kvm_handle_hva_range(mn, start, end, kvm_age_gfn);
873 }
874
kvm_mmu_notifier_clear_young(struct mmu_notifier * mn,struct mm_struct * mm,unsigned long start,unsigned long end)875 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
876 struct mm_struct *mm,
877 unsigned long start,
878 unsigned long end)
879 {
880 trace_kvm_age_hva(start, end);
881
882 /*
883 * Even though we do not flush TLB, this will still adversely
884 * affect performance on pre-Haswell Intel EPT, where there is
885 * no EPT Access Bit to clear so that we have to tear down EPT
886 * tables instead. If we find this unacceptable, we can always
887 * add a parameter to kvm_age_hva so that it effectively doesn't
888 * do anything on clear_young.
889 *
890 * Also note that currently we never issue secondary TLB flushes
891 * from clear_young, leaving this job up to the regular system
892 * cadence. If we find this inaccurate, we might come up with a
893 * more sophisticated heuristic later.
894 */
895 return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
896 }
897
kvm_mmu_notifier_test_young(struct mmu_notifier * mn,struct mm_struct * mm,unsigned long address)898 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
899 struct mm_struct *mm,
900 unsigned long address)
901 {
902 trace_kvm_test_age_hva(address);
903
904 return kvm_handle_hva_range_no_flush(mn, address, address + 1,
905 kvm_test_age_gfn);
906 }
907
kvm_mmu_notifier_release(struct mmu_notifier * mn,struct mm_struct * mm)908 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
909 struct mm_struct *mm)
910 {
911 struct kvm *kvm = mmu_notifier_to_kvm(mn);
912 int idx;
913
914 idx = srcu_read_lock(&kvm->srcu);
915 kvm_flush_shadow_all(kvm);
916 srcu_read_unlock(&kvm->srcu, idx);
917 }
918
919 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
920 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
921 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
922 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
923 .clear_young = kvm_mmu_notifier_clear_young,
924 .test_young = kvm_mmu_notifier_test_young,
925 .release = kvm_mmu_notifier_release,
926 };
927
kvm_init_mmu_notifier(struct kvm * kvm)928 static int kvm_init_mmu_notifier(struct kvm *kvm)
929 {
930 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
931 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
932 }
933
934 #else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */
935
kvm_init_mmu_notifier(struct kvm * kvm)936 static int kvm_init_mmu_notifier(struct kvm *kvm)
937 {
938 return 0;
939 }
940
941 #endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */
942
943 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
kvm_pm_notifier_call(struct notifier_block * bl,unsigned long state,void * unused)944 static int kvm_pm_notifier_call(struct notifier_block *bl,
945 unsigned long state,
946 void *unused)
947 {
948 struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
949
950 return kvm_arch_pm_notifier(kvm, state);
951 }
952
kvm_init_pm_notifier(struct kvm * kvm)953 static void kvm_init_pm_notifier(struct kvm *kvm)
954 {
955 kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
956 /* Suspend KVM before we suspend ftrace, RCU, etc. */
957 kvm->pm_notifier.priority = INT_MAX;
958 register_pm_notifier(&kvm->pm_notifier);
959 }
960
kvm_destroy_pm_notifier(struct kvm * kvm)961 static void kvm_destroy_pm_notifier(struct kvm *kvm)
962 {
963 unregister_pm_notifier(&kvm->pm_notifier);
964 }
965 #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
kvm_init_pm_notifier(struct kvm * kvm)966 static void kvm_init_pm_notifier(struct kvm *kvm)
967 {
968 }
969
kvm_destroy_pm_notifier(struct kvm * kvm)970 static void kvm_destroy_pm_notifier(struct kvm *kvm)
971 {
972 }
973 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
974
kvm_destroy_dirty_bitmap(struct kvm_memory_slot * memslot)975 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
976 {
977 if (!memslot->dirty_bitmap)
978 return;
979
980 vfree(memslot->dirty_bitmap);
981 memslot->dirty_bitmap = NULL;
982 }
983
984 /* This does not remove the slot from struct kvm_memslots data structures */
kvm_free_memslot(struct kvm * kvm,struct kvm_memory_slot * slot)985 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
986 {
987 if (slot->flags & KVM_MEM_GUEST_MEMFD)
988 kvm_gmem_unbind(slot);
989
990 kvm_destroy_dirty_bitmap(slot);
991
992 kvm_arch_free_memslot(kvm, slot);
993
994 kfree(slot);
995 }
996
kvm_free_memslots(struct kvm * kvm,struct kvm_memslots * slots)997 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
998 {
999 struct hlist_node *idnode;
1000 struct kvm_memory_slot *memslot;
1001 int bkt;
1002
1003 /*
1004 * The same memslot objects live in both active and inactive sets,
1005 * arbitrarily free using index '1' so the second invocation of this
1006 * function isn't operating over a structure with dangling pointers
1007 * (even though this function isn't actually touching them).
1008 */
1009 if (!slots->node_idx)
1010 return;
1011
1012 hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
1013 kvm_free_memslot(kvm, memslot);
1014 }
1015
kvm_stats_debugfs_mode(const struct _kvm_stats_desc * pdesc)1016 static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
1017 {
1018 switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
1019 case KVM_STATS_TYPE_INSTANT:
1020 return 0444;
1021 case KVM_STATS_TYPE_CUMULATIVE:
1022 case KVM_STATS_TYPE_PEAK:
1023 default:
1024 return 0644;
1025 }
1026 }
1027
1028
kvm_destroy_vm_debugfs(struct kvm * kvm)1029 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
1030 {
1031 int i;
1032 int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1033 kvm_vcpu_stats_header.num_desc;
1034
1035 if (IS_ERR(kvm->debugfs_dentry))
1036 return;
1037
1038 debugfs_remove_recursive(kvm->debugfs_dentry);
1039
1040 if (kvm->debugfs_stat_data) {
1041 for (i = 0; i < kvm_debugfs_num_entries; i++)
1042 kfree(kvm->debugfs_stat_data[i]);
1043 kfree(kvm->debugfs_stat_data);
1044 }
1045 }
1046
kvm_create_vm_debugfs(struct kvm * kvm,const char * fdname)1047 static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
1048 {
1049 static DEFINE_MUTEX(kvm_debugfs_lock);
1050 struct dentry *dent;
1051 char dir_name[ITOA_MAX_LEN * 2];
1052 struct kvm_stat_data *stat_data;
1053 const struct _kvm_stats_desc *pdesc;
1054 int i, ret = -ENOMEM;
1055 int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1056 kvm_vcpu_stats_header.num_desc;
1057
1058 if (!debugfs_initialized())
1059 return 0;
1060
1061 snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
1062 mutex_lock(&kvm_debugfs_lock);
1063 dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
1064 if (dent) {
1065 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
1066 dput(dent);
1067 mutex_unlock(&kvm_debugfs_lock);
1068 return 0;
1069 }
1070 dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
1071 mutex_unlock(&kvm_debugfs_lock);
1072 if (IS_ERR(dent))
1073 return 0;
1074
1075 kvm->debugfs_dentry = dent;
1076 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
1077 sizeof(*kvm->debugfs_stat_data),
1078 GFP_KERNEL_ACCOUNT);
1079 if (!kvm->debugfs_stat_data)
1080 goto out_err;
1081
1082 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
1083 pdesc = &kvm_vm_stats_desc[i];
1084 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1085 if (!stat_data)
1086 goto out_err;
1087
1088 stat_data->kvm = kvm;
1089 stat_data->desc = pdesc;
1090 stat_data->kind = KVM_STAT_VM;
1091 kvm->debugfs_stat_data[i] = stat_data;
1092 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1093 kvm->debugfs_dentry, stat_data,
1094 &stat_fops_per_vm);
1095 }
1096
1097 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
1098 pdesc = &kvm_vcpu_stats_desc[i];
1099 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1100 if (!stat_data)
1101 goto out_err;
1102
1103 stat_data->kvm = kvm;
1104 stat_data->desc = pdesc;
1105 stat_data->kind = KVM_STAT_VCPU;
1106 kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1107 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1108 kvm->debugfs_dentry, stat_data,
1109 &stat_fops_per_vm);
1110 }
1111
1112 kvm_arch_create_vm_debugfs(kvm);
1113 return 0;
1114 out_err:
1115 kvm_destroy_vm_debugfs(kvm);
1116 return ret;
1117 }
1118
1119 /*
1120 * Called after the VM is otherwise initialized, but just before adding it to
1121 * the vm_list.
1122 */
kvm_arch_post_init_vm(struct kvm * kvm)1123 int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1124 {
1125 return 0;
1126 }
1127
1128 /*
1129 * Called just after removing the VM from the vm_list, but before doing any
1130 * other destruction.
1131 */
kvm_arch_pre_destroy_vm(struct kvm * kvm)1132 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1133 {
1134 }
1135
1136 /*
1137 * Called after per-vm debugfs created. When called kvm->debugfs_dentry should
1138 * be setup already, so we can create arch-specific debugfs entries under it.
1139 * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
1140 * a per-arch destroy interface is not needed.
1141 */
kvm_arch_create_vm_debugfs(struct kvm * kvm)1142 void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1143 {
1144 }
1145
kvm_create_vm(unsigned long type,const char * fdname)1146 static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
1147 {
1148 struct kvm *kvm = kvm_arch_alloc_vm();
1149 struct kvm_memslots *slots;
1150 int r, i, j;
1151
1152 if (!kvm)
1153 return ERR_PTR(-ENOMEM);
1154
1155 KVM_MMU_LOCK_INIT(kvm);
1156 mmgrab(current->mm);
1157 kvm->mm = current->mm;
1158 kvm_eventfd_init(kvm);
1159 mutex_init(&kvm->lock);
1160 mutex_init(&kvm->irq_lock);
1161 mutex_init(&kvm->slots_lock);
1162 mutex_init(&kvm->slots_arch_lock);
1163 spin_lock_init(&kvm->mn_invalidate_lock);
1164 rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1165 xa_init(&kvm->vcpu_array);
1166 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
1167 xa_init(&kvm->mem_attr_array);
1168 #endif
1169
1170 INIT_LIST_HEAD(&kvm->gpc_list);
1171 spin_lock_init(&kvm->gpc_lock);
1172
1173 INIT_LIST_HEAD(&kvm->devices);
1174 kvm->max_vcpus = KVM_MAX_VCPUS;
1175
1176 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1177
1178 /*
1179 * Force subsequent debugfs file creations to fail if the VM directory
1180 * is not created (by kvm_create_vm_debugfs()).
1181 */
1182 kvm->debugfs_dentry = ERR_PTR(-ENOENT);
1183
1184 snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
1185 task_pid_nr(current));
1186
1187 r = -ENOMEM;
1188 if (init_srcu_struct(&kvm->srcu))
1189 goto out_err_no_srcu;
1190 if (init_srcu_struct(&kvm->irq_srcu))
1191 goto out_err_no_irq_srcu;
1192
1193 r = kvm_init_irq_routing(kvm);
1194 if (r)
1195 goto out_err_no_irq_routing;
1196
1197 refcount_set(&kvm->users_count, 1);
1198
1199 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
1200 for (j = 0; j < 2; j++) {
1201 slots = &kvm->__memslots[i][j];
1202
1203 atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1204 slots->hva_tree = RB_ROOT_CACHED;
1205 slots->gfn_tree = RB_ROOT;
1206 hash_init(slots->id_hash);
1207 slots->node_idx = j;
1208
1209 /* Generations must be different for each address space. */
1210 slots->generation = i;
1211 }
1212
1213 rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
1214 }
1215
1216 r = -ENOMEM;
1217 for (i = 0; i < KVM_NR_BUSES; i++) {
1218 rcu_assign_pointer(kvm->buses[i],
1219 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1220 if (!kvm->buses[i])
1221 goto out_err_no_arch_destroy_vm;
1222 }
1223
1224 r = kvm_arch_init_vm(kvm, type);
1225 if (r)
1226 goto out_err_no_arch_destroy_vm;
1227
1228 r = kvm_enable_virtualization();
1229 if (r)
1230 goto out_err_no_disable;
1231
1232 #ifdef CONFIG_HAVE_KVM_IRQCHIP
1233 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1234 #endif
1235
1236 r = kvm_init_mmu_notifier(kvm);
1237 if (r)
1238 goto out_err_no_mmu_notifier;
1239
1240 r = kvm_coalesced_mmio_init(kvm);
1241 if (r < 0)
1242 goto out_no_coalesced_mmio;
1243
1244 r = kvm_create_vm_debugfs(kvm, fdname);
1245 if (r)
1246 goto out_err_no_debugfs;
1247
1248 r = kvm_arch_post_init_vm(kvm);
1249 if (r)
1250 goto out_err;
1251
1252 mutex_lock(&kvm_lock);
1253 list_add(&kvm->vm_list, &vm_list);
1254 mutex_unlock(&kvm_lock);
1255
1256 preempt_notifier_inc();
1257 kvm_init_pm_notifier(kvm);
1258
1259 return kvm;
1260
1261 out_err:
1262 kvm_destroy_vm_debugfs(kvm);
1263 out_err_no_debugfs:
1264 kvm_coalesced_mmio_free(kvm);
1265 out_no_coalesced_mmio:
1266 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
1267 if (kvm->mmu_notifier.ops)
1268 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1269 #endif
1270 out_err_no_mmu_notifier:
1271 kvm_disable_virtualization();
1272 out_err_no_disable:
1273 kvm_arch_destroy_vm(kvm);
1274 out_err_no_arch_destroy_vm:
1275 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1276 for (i = 0; i < KVM_NR_BUSES; i++)
1277 kfree(kvm_get_bus(kvm, i));
1278 kvm_free_irq_routing(kvm);
1279 out_err_no_irq_routing:
1280 cleanup_srcu_struct(&kvm->irq_srcu);
1281 out_err_no_irq_srcu:
1282 cleanup_srcu_struct(&kvm->srcu);
1283 out_err_no_srcu:
1284 kvm_arch_free_vm(kvm);
1285 mmdrop(current->mm);
1286 return ERR_PTR(r);
1287 }
1288
kvm_destroy_devices(struct kvm * kvm)1289 static void kvm_destroy_devices(struct kvm *kvm)
1290 {
1291 struct kvm_device *dev, *tmp;
1292
1293 /*
1294 * We do not need to take the kvm->lock here, because nobody else
1295 * has a reference to the struct kvm at this point and therefore
1296 * cannot access the devices list anyhow.
1297 *
1298 * The device list is generally managed as an rculist, but list_del()
1299 * is used intentionally here. If a bug in KVM introduced a reader that
1300 * was not backed by a reference on the kvm struct, the hope is that
1301 * it'd consume the poisoned forward pointer instead of suffering a
1302 * use-after-free, even though this cannot be guaranteed.
1303 */
1304 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1305 list_del(&dev->vm_node);
1306 dev->ops->destroy(dev);
1307 }
1308 }
1309
kvm_destroy_vm(struct kvm * kvm)1310 static void kvm_destroy_vm(struct kvm *kvm)
1311 {
1312 int i;
1313 struct mm_struct *mm = kvm->mm;
1314
1315 kvm_destroy_pm_notifier(kvm);
1316 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1317 kvm_destroy_vm_debugfs(kvm);
1318 kvm_arch_sync_events(kvm);
1319 mutex_lock(&kvm_lock);
1320 list_del(&kvm->vm_list);
1321 mutex_unlock(&kvm_lock);
1322 kvm_arch_pre_destroy_vm(kvm);
1323
1324 kvm_free_irq_routing(kvm);
1325 for (i = 0; i < KVM_NR_BUSES; i++) {
1326 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1327
1328 if (bus)
1329 kvm_io_bus_destroy(bus);
1330 kvm->buses[i] = NULL;
1331 }
1332 kvm_coalesced_mmio_free(kvm);
1333 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
1334 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1335 /*
1336 * At this point, pending calls to invalidate_range_start()
1337 * have completed but no more MMU notifiers will run, so
1338 * mn_active_invalidate_count may remain unbalanced.
1339 * No threads can be waiting in kvm_swap_active_memslots() as the
1340 * last reference on KVM has been dropped, but freeing
1341 * memslots would deadlock without this manual intervention.
1342 *
1343 * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
1344 * notifier between a start() and end(), then there shouldn't be any
1345 * in-progress invalidations.
1346 */
1347 WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1348 if (kvm->mn_active_invalidate_count)
1349 kvm->mn_active_invalidate_count = 0;
1350 else
1351 WARN_ON(kvm->mmu_invalidate_in_progress);
1352 #else
1353 kvm_flush_shadow_all(kvm);
1354 #endif
1355 kvm_arch_destroy_vm(kvm);
1356 kvm_destroy_devices(kvm);
1357 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
1358 kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1359 kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1360 }
1361 cleanup_srcu_struct(&kvm->irq_srcu);
1362 srcu_barrier(&kvm->srcu);
1363 cleanup_srcu_struct(&kvm->srcu);
1364 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
1365 xa_destroy(&kvm->mem_attr_array);
1366 #endif
1367 kvm_arch_free_vm(kvm);
1368 preempt_notifier_dec();
1369 kvm_disable_virtualization();
1370 mmdrop(mm);
1371 }
1372
kvm_get_kvm(struct kvm * kvm)1373 void kvm_get_kvm(struct kvm *kvm)
1374 {
1375 refcount_inc(&kvm->users_count);
1376 }
1377 EXPORT_SYMBOL_GPL(kvm_get_kvm);
1378
1379 /*
1380 * Make sure the vm is not during destruction, which is a safe version of
1381 * kvm_get_kvm(). Return true if kvm referenced successfully, false otherwise.
1382 */
kvm_get_kvm_safe(struct kvm * kvm)1383 bool kvm_get_kvm_safe(struct kvm *kvm)
1384 {
1385 return refcount_inc_not_zero(&kvm->users_count);
1386 }
1387 EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1388
kvm_put_kvm(struct kvm * kvm)1389 void kvm_put_kvm(struct kvm *kvm)
1390 {
1391 if (refcount_dec_and_test(&kvm->users_count))
1392 kvm_destroy_vm(kvm);
1393 }
1394 EXPORT_SYMBOL_GPL(kvm_put_kvm);
1395
1396 /*
1397 * Used to put a reference that was taken on behalf of an object associated
1398 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1399 * of the new file descriptor fails and the reference cannot be transferred to
1400 * its final owner. In such cases, the caller is still actively using @kvm and
1401 * will fail miserably if the refcount unexpectedly hits zero.
1402 */
kvm_put_kvm_no_destroy(struct kvm * kvm)1403 void kvm_put_kvm_no_destroy(struct kvm *kvm)
1404 {
1405 WARN_ON(refcount_dec_and_test(&kvm->users_count));
1406 }
1407 EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1408
kvm_vm_release(struct inode * inode,struct file * filp)1409 static int kvm_vm_release(struct inode *inode, struct file *filp)
1410 {
1411 struct kvm *kvm = filp->private_data;
1412
1413 kvm_irqfd_release(kvm);
1414
1415 kvm_put_kvm(kvm);
1416 return 0;
1417 }
1418
1419 /*
1420 * Allocation size is twice as large as the actual dirty bitmap size.
1421 * See kvm_vm_ioctl_get_dirty_log() why this is needed.
1422 */
kvm_alloc_dirty_bitmap(struct kvm_memory_slot * memslot)1423 static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1424 {
1425 unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1426
1427 memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1428 if (!memslot->dirty_bitmap)
1429 return -ENOMEM;
1430
1431 return 0;
1432 }
1433
kvm_get_inactive_memslots(struct kvm * kvm,int as_id)1434 static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1435 {
1436 struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1437 int node_idx_inactive = active->node_idx ^ 1;
1438
1439 return &kvm->__memslots[as_id][node_idx_inactive];
1440 }
1441
1442 /*
1443 * Helper to get the address space ID when one of memslot pointers may be NULL.
1444 * This also serves as a sanity that at least one of the pointers is non-NULL,
1445 * and that their address space IDs don't diverge.
1446 */
kvm_memslots_get_as_id(struct kvm_memory_slot * a,struct kvm_memory_slot * b)1447 static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1448 struct kvm_memory_slot *b)
1449 {
1450 if (WARN_ON_ONCE(!a && !b))
1451 return 0;
1452
1453 if (!a)
1454 return b->as_id;
1455 if (!b)
1456 return a->as_id;
1457
1458 WARN_ON_ONCE(a->as_id != b->as_id);
1459 return a->as_id;
1460 }
1461
kvm_insert_gfn_node(struct kvm_memslots * slots,struct kvm_memory_slot * slot)1462 static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1463 struct kvm_memory_slot *slot)
1464 {
1465 struct rb_root *gfn_tree = &slots->gfn_tree;
1466 struct rb_node **node, *parent;
1467 int idx = slots->node_idx;
1468
1469 parent = NULL;
1470 for (node = &gfn_tree->rb_node; *node; ) {
1471 struct kvm_memory_slot *tmp;
1472
1473 tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1474 parent = *node;
1475 if (slot->base_gfn < tmp->base_gfn)
1476 node = &(*node)->rb_left;
1477 else if (slot->base_gfn > tmp->base_gfn)
1478 node = &(*node)->rb_right;
1479 else
1480 BUG();
1481 }
1482
1483 rb_link_node(&slot->gfn_node[idx], parent, node);
1484 rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1485 }
1486
kvm_erase_gfn_node(struct kvm_memslots * slots,struct kvm_memory_slot * slot)1487 static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1488 struct kvm_memory_slot *slot)
1489 {
1490 rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1491 }
1492
kvm_replace_gfn_node(struct kvm_memslots * slots,struct kvm_memory_slot * old,struct kvm_memory_slot * new)1493 static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1494 struct kvm_memory_slot *old,
1495 struct kvm_memory_slot *new)
1496 {
1497 int idx = slots->node_idx;
1498
1499 WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1500
1501 rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1502 &slots->gfn_tree);
1503 }
1504
1505 /*
1506 * Replace @old with @new in the inactive memslots.
1507 *
1508 * With NULL @old this simply adds @new.
1509 * With NULL @new this simply removes @old.
1510 *
1511 * If @new is non-NULL its hva_node[slots_idx] range has to be set
1512 * appropriately.
1513 */
kvm_replace_memslot(struct kvm * kvm,struct kvm_memory_slot * old,struct kvm_memory_slot * new)1514 static void kvm_replace_memslot(struct kvm *kvm,
1515 struct kvm_memory_slot *old,
1516 struct kvm_memory_slot *new)
1517 {
1518 int as_id = kvm_memslots_get_as_id(old, new);
1519 struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1520 int idx = slots->node_idx;
1521
1522 if (old) {
1523 hash_del(&old->id_node[idx]);
1524 interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
1525
1526 if ((long)old == atomic_long_read(&slots->last_used_slot))
1527 atomic_long_set(&slots->last_used_slot, (long)new);
1528
1529 if (!new) {
1530 kvm_erase_gfn_node(slots, old);
1531 return;
1532 }
1533 }
1534
1535 /*
1536 * Initialize @new's hva range. Do this even when replacing an @old
1537 * slot, kvm_copy_memslot() deliberately does not touch node data.
1538 */
1539 new->hva_node[idx].start = new->userspace_addr;
1540 new->hva_node[idx].last = new->userspace_addr +
1541 (new->npages << PAGE_SHIFT) - 1;
1542
1543 /*
1544 * (Re)Add the new memslot. There is no O(1) interval_tree_replace(),
1545 * hva_node needs to be swapped with remove+insert even though hva can't
1546 * change when replacing an existing slot.
1547 */
1548 hash_add(slots->id_hash, &new->id_node[idx], new->id);
1549 interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
1550
1551 /*
1552 * If the memslot gfn is unchanged, rb_replace_node() can be used to
1553 * switch the node in the gfn tree instead of removing the old and
1554 * inserting the new as two separate operations. Replacement is a
1555 * single O(1) operation versus two O(log(n)) operations for
1556 * remove+insert.
1557 */
1558 if (old && old->base_gfn == new->base_gfn) {
1559 kvm_replace_gfn_node(slots, old, new);
1560 } else {
1561 if (old)
1562 kvm_erase_gfn_node(slots, old);
1563 kvm_insert_gfn_node(slots, new);
1564 }
1565 }
1566
1567 /*
1568 * Flags that do not access any of the extra space of struct
1569 * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS
1570 * only allows these.
1571 */
1572 #define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
1573 (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
1574
check_memory_region_flags(struct kvm * kvm,const struct kvm_userspace_memory_region2 * mem)1575 static int check_memory_region_flags(struct kvm *kvm,
1576 const struct kvm_userspace_memory_region2 *mem)
1577 {
1578 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1579
1580 if (kvm_arch_has_private_mem(kvm))
1581 valid_flags |= KVM_MEM_GUEST_MEMFD;
1582
1583 /* Dirty logging private memory is not currently supported. */
1584 if (mem->flags & KVM_MEM_GUEST_MEMFD)
1585 valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
1586
1587 /*
1588 * GUEST_MEMFD is incompatible with read-only memslots, as writes to
1589 * read-only memslots have emulated MMIO, not page fault, semantics,
1590 * and KVM doesn't allow emulated MMIO for private memory.
1591 */
1592 if (kvm_arch_has_readonly_mem(kvm) &&
1593 !(mem->flags & KVM_MEM_GUEST_MEMFD))
1594 valid_flags |= KVM_MEM_READONLY;
1595
1596 if (mem->flags & ~valid_flags)
1597 return -EINVAL;
1598
1599 return 0;
1600 }
1601
kvm_swap_active_memslots(struct kvm * kvm,int as_id)1602 static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
1603 {
1604 struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1605
1606 /* Grab the generation from the activate memslots. */
1607 u64 gen = __kvm_memslots(kvm, as_id)->generation;
1608
1609 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1610 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1611
1612 /*
1613 * Do not store the new memslots while there are invalidations in
1614 * progress, otherwise the locking in invalidate_range_start and
1615 * invalidate_range_end will be unbalanced.
1616 */
1617 spin_lock(&kvm->mn_invalidate_lock);
1618 prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1619 while (kvm->mn_active_invalidate_count) {
1620 set_current_state(TASK_UNINTERRUPTIBLE);
1621 spin_unlock(&kvm->mn_invalidate_lock);
1622 schedule();
1623 spin_lock(&kvm->mn_invalidate_lock);
1624 }
1625 finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1626 rcu_assign_pointer(kvm->memslots[as_id], slots);
1627 spin_unlock(&kvm->mn_invalidate_lock);
1628
1629 /*
1630 * Acquired in kvm_set_memslot. Must be released before synchronize
1631 * SRCU below in order to avoid deadlock with another thread
1632 * acquiring the slots_arch_lock in an srcu critical section.
1633 */
1634 mutex_unlock(&kvm->slots_arch_lock);
1635
1636 synchronize_srcu_expedited(&kvm->srcu);
1637
1638 /*
1639 * Increment the new memslot generation a second time, dropping the
1640 * update in-progress flag and incrementing the generation based on
1641 * the number of address spaces. This provides a unique and easily
1642 * identifiable generation number while the memslots are in flux.
1643 */
1644 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1645
1646 /*
1647 * Generations must be unique even across address spaces. We do not need
1648 * a global counter for that, instead the generation space is evenly split
1649 * across address spaces. For example, with two address spaces, address
1650 * space 0 will use generations 0, 2, 4, ... while address space 1 will
1651 * use generations 1, 3, 5, ...
1652 */
1653 gen += kvm_arch_nr_memslot_as_ids(kvm);
1654
1655 kvm_arch_memslots_updated(kvm, gen);
1656
1657 slots->generation = gen;
1658 }
1659
kvm_prepare_memory_region(struct kvm * kvm,const struct kvm_memory_slot * old,struct kvm_memory_slot * new,enum kvm_mr_change change)1660 static int kvm_prepare_memory_region(struct kvm *kvm,
1661 const struct kvm_memory_slot *old,
1662 struct kvm_memory_slot *new,
1663 enum kvm_mr_change change)
1664 {
1665 int r;
1666
1667 /*
1668 * If dirty logging is disabled, nullify the bitmap; the old bitmap
1669 * will be freed on "commit". If logging is enabled in both old and
1670 * new, reuse the existing bitmap. If logging is enabled only in the
1671 * new and KVM isn't using a ring buffer, allocate and initialize a
1672 * new bitmap.
1673 */
1674 if (change != KVM_MR_DELETE) {
1675 if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1676 new->dirty_bitmap = NULL;
1677 else if (old && old->dirty_bitmap)
1678 new->dirty_bitmap = old->dirty_bitmap;
1679 else if (kvm_use_dirty_bitmap(kvm)) {
1680 r = kvm_alloc_dirty_bitmap(new);
1681 if (r)
1682 return r;
1683
1684 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1685 bitmap_set(new->dirty_bitmap, 0, new->npages);
1686 }
1687 }
1688
1689 r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1690
1691 /* Free the bitmap on failure if it was allocated above. */
1692 if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
1693 kvm_destroy_dirty_bitmap(new);
1694
1695 return r;
1696 }
1697
1698 static void kvm_commit_memory_region(struct kvm *kvm,
1699 struct kvm_memory_slot *old,
1700 const struct kvm_memory_slot *new,
1701 enum kvm_mr_change change)
1702 {
1703 int old_flags = old ? old->flags : 0;
1704 int new_flags = new ? new->flags : 0;
1705 /*
1706 * Update the total number of memslot pages before calling the arch
1707 * hook so that architectures can consume the result directly.
1708 */
1709 if (change == KVM_MR_DELETE)
1710 kvm->nr_memslot_pages -= old->npages;
1711 else if (change == KVM_MR_CREATE)
1712 kvm->nr_memslot_pages += new->npages;
1713
1714 if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1715 int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
1716 atomic_set(&kvm->nr_memslots_dirty_logging,
1717 atomic_read(&kvm->nr_memslots_dirty_logging) + change);
1718 }
1719
1720 kvm_arch_commit_memory_region(kvm, old, new, change);
1721
1722 switch (change) {
1723 case KVM_MR_CREATE:
1724 /* Nothing more to do. */
1725 break;
1726 case KVM_MR_DELETE:
1727 /* Free the old memslot and all its metadata. */
1728 kvm_free_memslot(kvm, old);
1729 break;
1730 case KVM_MR_MOVE:
1731 case KVM_MR_FLAGS_ONLY:
1732 /*
1733 * Free the dirty bitmap as needed; the below check encompasses
1734 * both the flags and whether a ring buffer is being used.
1735 */
1736 if (old->dirty_bitmap && !new->dirty_bitmap)
1737 kvm_destroy_dirty_bitmap(old);
1738
1739 /*
1740 * The final quirk. Free the detached, old slot, but only its
1741 * memory, not any metadata. Metadata, including arch specific
1742 * data, may be reused by @new.
1743 */
1744 kfree(old);
1745 break;
1746 default:
1747 BUG();
1748 }
1749 }
1750
1751 /*
1752 * Activate @new, which must be installed in the inactive slots by the caller,
1753 * by swapping the active slots and then propagating @new to @old once @old is
1754 * unreachable and can be safely modified.
1755 *
1756 * With NULL @old this simply adds @new to @active (while swapping the sets).
1757 * With NULL @new this simply removes @old from @active and frees it
1758 * (while also swapping the sets).
1759 */
1760 static void kvm_activate_memslot(struct kvm *kvm,
1761 struct kvm_memory_slot *old,
1762 struct kvm_memory_slot *new)
1763 {
1764 int as_id = kvm_memslots_get_as_id(old, new);
1765
1766 kvm_swap_active_memslots(kvm, as_id);
1767
1768 /* Propagate the new memslot to the now inactive memslots. */
1769 kvm_replace_memslot(kvm, old, new);
1770 }
1771
1772 static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1773 const struct kvm_memory_slot *src)
1774 {
1775 dest->base_gfn = src->base_gfn;
1776 dest->npages = src->npages;
1777 dest->dirty_bitmap = src->dirty_bitmap;
1778 dest->arch = src->arch;
1779 dest->userspace_addr = src->userspace_addr;
1780 dest->flags = src->flags;
1781 dest->id = src->id;
1782 dest->as_id = src->as_id;
1783 }
1784
1785 static void kvm_invalidate_memslot(struct kvm *kvm,
1786 struct kvm_memory_slot *old,
1787 struct kvm_memory_slot *invalid_slot)
1788 {
1789 /*
1790 * Mark the current slot INVALID. As with all memslot modifications,
1791 * this must be done on an unreachable slot to avoid modifying the
1792 * current slot in the active tree.
1793 */
1794 kvm_copy_memslot(invalid_slot, old);
1795 invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1796 kvm_replace_memslot(kvm, old, invalid_slot);
1797
1798 /*
1799 * Activate the slot that is now marked INVALID, but don't propagate
1800 * the slot to the now inactive slots. The slot is either going to be
1801 * deleted or recreated as a new slot.
1802 */
1803 kvm_swap_active_memslots(kvm, old->as_id);
1804
1805 /*
1806 * From this point no new shadow pages pointing to a deleted, or moved,
1807 * memslot will be created. Validation of sp->gfn happens in:
1808 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1809 * - kvm_is_visible_gfn (mmu_check_root)
1810 */
1811 kvm_arch_flush_shadow_memslot(kvm, old);
1812 kvm_arch_guest_memory_reclaimed(kvm);
1813
1814 /* Was released by kvm_swap_active_memslots(), reacquire. */
1815 mutex_lock(&kvm->slots_arch_lock);
1816
1817 /*
1818 * Copy the arch-specific field of the newly-installed slot back to the
1819 * old slot as the arch data could have changed between releasing
1820 * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1821 * above. Writers are required to retrieve memslots *after* acquiring
1822 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1823 */
1824 old->arch = invalid_slot->arch;
1825 }
1826
1827 static void kvm_create_memslot(struct kvm *kvm,
1828 struct kvm_memory_slot *new)
1829 {
1830 /* Add the new memslot to the inactive set and activate. */
1831 kvm_replace_memslot(kvm, NULL, new);
1832 kvm_activate_memslot(kvm, NULL, new);
1833 }
1834
1835 static void kvm_delete_memslot(struct kvm *kvm,
1836 struct kvm_memory_slot *old,
1837 struct kvm_memory_slot *invalid_slot)
1838 {
1839 /*
1840 * Remove the old memslot (in the inactive memslots) by passing NULL as
1841 * the "new" slot, and for the invalid version in the active slots.
1842 */
1843 kvm_replace_memslot(kvm, old, NULL);
1844 kvm_activate_memslot(kvm, invalid_slot, NULL);
1845 }
1846
1847 static void kvm_move_memslot(struct kvm *kvm,
1848 struct kvm_memory_slot *old,
1849 struct kvm_memory_slot *new,
1850 struct kvm_memory_slot *invalid_slot)
1851 {
1852 /*
1853 * Replace the old memslot in the inactive slots, and then swap slots
1854 * and replace the current INVALID with the new as well.
1855 */
1856 kvm_replace_memslot(kvm, old, new);
1857 kvm_activate_memslot(kvm, invalid_slot, new);
1858 }
1859
1860 static void kvm_update_flags_memslot(struct kvm *kvm,
1861 struct kvm_memory_slot *old,
1862 struct kvm_memory_slot *new)
1863 {
1864 /*
1865 * Similar to the MOVE case, but the slot doesn't need to be zapped as
1866 * an intermediate step. Instead, the old memslot is simply replaced
1867 * with a new, updated copy in both memslot sets.
1868 */
1869 kvm_replace_memslot(kvm, old, new);
1870 kvm_activate_memslot(kvm, old, new);
1871 }
1872
1873 static int kvm_set_memslot(struct kvm *kvm,
1874 struct kvm_memory_slot *old,
1875 struct kvm_memory_slot *new,
1876 enum kvm_mr_change change)
1877 {
1878 struct kvm_memory_slot *invalid_slot;
1879 int r;
1880
1881 /*
1882 * Released in kvm_swap_active_memslots().
1883 *
1884 * Must be held from before the current memslots are copied until after
1885 * the new memslots are installed with rcu_assign_pointer, then
1886 * released before the synchronize srcu in kvm_swap_active_memslots().
1887 *
1888 * When modifying memslots outside of the slots_lock, must be held
1889 * before reading the pointer to the current memslots until after all
1890 * changes to those memslots are complete.
1891 *
1892 * These rules ensure that installing new memslots does not lose
1893 * changes made to the previous memslots.
1894 */
1895 mutex_lock(&kvm->slots_arch_lock);
1896
1897 /*
1898 * Invalidate the old slot if it's being deleted or moved. This is
1899 * done prior to actually deleting/moving the memslot to allow vCPUs to
1900 * continue running by ensuring there are no mappings or shadow pages
1901 * for the memslot when it is deleted/moved. Without pre-invalidation
1902 * (and without a lock), a window would exist between effecting the
1903 * delete/move and committing the changes in arch code where KVM or a
1904 * guest could access a non-existent memslot.
1905 *
1906 * Modifications are done on a temporary, unreachable slot. The old
1907 * slot needs to be preserved in case a later step fails and the
1908 * invalidation needs to be reverted.
1909 */
1910 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1911 invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1912 if (!invalid_slot) {
1913 mutex_unlock(&kvm->slots_arch_lock);
1914 return -ENOMEM;
1915 }
1916 kvm_invalidate_memslot(kvm, old, invalid_slot);
1917 }
1918
1919 r = kvm_prepare_memory_region(kvm, old, new, change);
1920 if (r) {
1921 /*
1922 * For DELETE/MOVE, revert the above INVALID change. No
1923 * modifications required since the original slot was preserved
1924 * in the inactive slots. Changing the active memslots also
1925 * releases slots_arch_lock.
1926 */
1927 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1928 kvm_activate_memslot(kvm, invalid_slot, old);
1929 kfree(invalid_slot);
1930 } else {
1931 mutex_unlock(&kvm->slots_arch_lock);
1932 }
1933 return r;
1934 }
1935
1936 /*
1937 * For DELETE and MOVE, the working slot is now active as the INVALID
1938 * version of the old slot. MOVE is particularly special as it reuses
1939 * the old slot and returns a copy of the old slot (in invalid_slot).
1940 * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the
1941 * old slot is detached but otherwise preserved.
1942 */
1943 if (change == KVM_MR_CREATE)
1944 kvm_create_memslot(kvm, new);
1945 else if (change == KVM_MR_DELETE)
1946 kvm_delete_memslot(kvm, old, invalid_slot);
1947 else if (change == KVM_MR_MOVE)
1948 kvm_move_memslot(kvm, old, new, invalid_slot);
1949 else if (change == KVM_MR_FLAGS_ONLY)
1950 kvm_update_flags_memslot(kvm, old, new);
1951 else
1952 BUG();
1953
1954 /* Free the temporary INVALID slot used for DELETE and MOVE. */
1955 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1956 kfree(invalid_slot);
1957
1958 /*
1959 * No need to refresh new->arch, changes after dropping slots_arch_lock
1960 * will directly hit the final, active memslot. Architectures are
1961 * responsible for knowing that new->arch may be stale.
1962 */
1963 kvm_commit_memory_region(kvm, old, new, change);
1964
1965 return 0;
1966 }
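
/*
 * To make the flow above concrete, a DELETE (requested by userspace with
 * memory_size == 0, see __kvm_set_memory_region() below) reduces to the
 * following sequence of the helpers in this file; this is a simplified
 * sketch, error handling and the arch flush inside kvm_invalidate_memslot()
 * are omitted:
 *
 *	mutex_lock(&kvm->slots_arch_lock);
 *	kvm_invalidate_memslot(kvm, old, invalid_slot);   <- swap #1, INVALID copy visible
 *	kvm_prepare_memory_region(kvm, old, NULL, KVM_MR_DELETE);
 *	kvm_delete_memslot(kvm, old, invalid_slot);       <- swap #2, slot gone
 *	kvm_commit_memory_region(kvm, old, NULL, KVM_MR_DELETE);   <- frees @old
 *
 * Each swap drops slots_arch_lock and waits for an SRCU grace period, so
 * readers only ever observe the INVALID copy or no slot at all, never a
 * half-deleted slot.
 */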
1967
1968 static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1969 gfn_t start, gfn_t end)
1970 {
1971 struct kvm_memslot_iter iter;
1972
1973 kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1974 if (iter.slot->id != id)
1975 return true;
1976 }
1977
1978 return false;
1979 }
1980
1981 /*
1982 * Allocate some memory and give it an address in the guest physical address
1983 * space.
1984 *
1985 * Discontiguous memory is allowed, mostly for framebuffers.
1986 *
1987 * Must be called holding kvm->slots_lock for write.
1988 */
1989 int __kvm_set_memory_region(struct kvm *kvm,
1990 const struct kvm_userspace_memory_region2 *mem)
1991 {
1992 struct kvm_memory_slot *old, *new;
1993 struct kvm_memslots *slots;
1994 enum kvm_mr_change change;
1995 unsigned long npages;
1996 gfn_t base_gfn;
1997 int as_id, id;
1998 int r;
1999
2000 r = check_memory_region_flags(kvm, mem);
2001 if (r)
2002 return r;
2003
2004 as_id = mem->slot >> 16;
2005 id = (u16)mem->slot;
2006
2007 /* General sanity checks */
2008 if ((mem->memory_size & (PAGE_SIZE - 1)) ||
2009 (mem->memory_size != (unsigned long)mem->memory_size))
2010 return -EINVAL;
2011 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
2012 return -EINVAL;
2013 /* We can read the guest memory with __xxx_user() later on. */
2014 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
2015 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
2016 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
2017 mem->memory_size))
2018 return -EINVAL;
2019 if (mem->flags & KVM_MEM_GUEST_MEMFD &&
2020 (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
2021 mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
2022 return -EINVAL;
2023 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
2024 return -EINVAL;
2025 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
2026 return -EINVAL;
2027 if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
2028 return -EINVAL;
2029
2030 slots = __kvm_memslots(kvm, as_id);
2031
2032 /*
2033 * Note, the old memslot (and the pointer itself!) may be invalidated
2034 * and/or destroyed by kvm_set_memslot().
2035 */
2036 old = id_to_memslot(slots, id);
2037
2038 if (!mem->memory_size) {
2039 if (!old || !old->npages)
2040 return -EINVAL;
2041
2042 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
2043 return -EIO;
2044
2045 return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
2046 }
2047
2048 base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
2049 npages = (mem->memory_size >> PAGE_SHIFT);
2050
2051 if (!old || !old->npages) {
2052 change = KVM_MR_CREATE;
2053
2054 /*
2055 * To simplify KVM internals, the total number of pages across
2056 * all memslots must fit in an unsigned long.
2057 */
2058 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
2059 return -EINVAL;
2060 } else { /* Modify an existing slot. */
2061 /* Private memslots are immutable, they can only be deleted. */
2062 if (mem->flags & KVM_MEM_GUEST_MEMFD)
2063 return -EINVAL;
2064 if ((mem->userspace_addr != old->userspace_addr) ||
2065 (npages != old->npages) ||
2066 ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
2067 return -EINVAL;
2068
2069 if (base_gfn != old->base_gfn)
2070 change = KVM_MR_MOVE;
2071 else if (mem->flags != old->flags)
2072 change = KVM_MR_FLAGS_ONLY;
2073 else /* Nothing to change. */
2074 return 0;
2075 }
2076
2077 if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
2078 kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
2079 return -EEXIST;
2080
2081 /* Allocate a slot that will persist in the memslots. */
2082 new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2083 if (!new)
2084 return -ENOMEM;
2085
2086 new->as_id = as_id;
2087 new->id = id;
2088 new->base_gfn = base_gfn;
2089 new->npages = npages;
2090 new->flags = mem->flags;
2091 new->userspace_addr = mem->userspace_addr;
2092 if (mem->flags & KVM_MEM_GUEST_MEMFD) {
2093 r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
2094 if (r)
2095 goto out;
2096 }
2097
2098 r = kvm_set_memslot(kvm, old, new, change);
2099 if (r)
2100 goto out_unbind;
2101
2102 return 0;
2103
2104 out_unbind:
2105 if (mem->flags & KVM_MEM_GUEST_MEMFD)
2106 kvm_gmem_unbind(new);
2107 out:
2108 kfree(new);
2109 return r;
2110 }
2111 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
2112
2113 int kvm_set_memory_region(struct kvm *kvm,
2114 const struct kvm_userspace_memory_region2 *mem)
2115 {
2116 int r;
2117
2118 mutex_lock(&kvm->slots_lock);
2119 r = __kvm_set_memory_region(kvm, mem);
2120 mutex_unlock(&kvm->slots_lock);
2121 return r;
2122 }
2123 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
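
/*
 * Minimal sketch of a caller; the values are assumptions for the example and
 * not taken from any architecture.  The upper 16 bits of @slot select the
 * address space and the lower 16 bits the slot id, matching the decoding in
 * __kvm_set_memory_region().  guest_phys_addr, memory_size and userspace_addr
 * must all be page aligned, and a zero memory_size deletes the slot:
 *
 *	struct kvm_userspace_memory_region2 region = {
 *		.slot		 = (as_id << 16) | id,
 *		.guest_phys_addr = gpa,
 *		.memory_size	 = size,
 *		.userspace_addr	 = hva,
 *	};
 *
 *	r = kvm_set_memory_region(kvm, &region);
 */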
2124
2125 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
2126 struct kvm_userspace_memory_region2 *mem)
2127 {
2128 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
2129 return -EINVAL;
2130
2131 return kvm_set_memory_region(kvm, mem);
2132 }
2133
2134 #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2135 /**
2136 * kvm_get_dirty_log - get a snapshot of dirty pages
2137 * @kvm: pointer to kvm instance
2138 * @log: slot id and address to which we copy the log
2139 * @is_dirty: set to '1' if any dirty pages were found
2140 * @memslot: set to the associated memslot, always valid on success
2141 */
2142 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
2143 int *is_dirty, struct kvm_memory_slot **memslot)
2144 {
2145 struct kvm_memslots *slots;
2146 int i, as_id, id;
2147 unsigned long n;
2148 unsigned long any = 0;
2149
2150 /* Dirty ring tracking may be exclusive to dirty log tracking */
2151 if (!kvm_use_dirty_bitmap(kvm))
2152 return -ENXIO;
2153
2154 *memslot = NULL;
2155 *is_dirty = 0;
2156
2157 as_id = log->slot >> 16;
2158 id = (u16)log->slot;
2159 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2160 return -EINVAL;
2161
2162 slots = __kvm_memslots(kvm, as_id);
2163 *memslot = id_to_memslot(slots, id);
2164 if (!(*memslot) || !(*memslot)->dirty_bitmap)
2165 return -ENOENT;
2166
2167 kvm_arch_sync_dirty_log(kvm, *memslot);
2168
2169 n = kvm_dirty_bitmap_bytes(*memslot);
2170
2171 for (i = 0; !any && i < n/sizeof(long); ++i)
2172 any = (*memslot)->dirty_bitmap[i];
2173
2174 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
2175 return -EFAULT;
2176
2177 if (any)
2178 *is_dirty = 1;
2179 return 0;
2180 }
2181 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
2182
2183 #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2184 /**
2185 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
2186 * and reenable dirty page tracking for the corresponding pages.
2187 * @kvm: pointer to kvm instance
2188 * @log: slot id and address to which we copy the log
2189 *
2190 * We need to keep in mind that VCPU threads can write to the bitmap
2191 * concurrently. So, to avoid losing track of dirty pages we keep the
2192 * following order:
2193 *
2194 * 1. Take a snapshot of the bit and clear it if needed.
2195 * 2. Write protect the corresponding page.
2196 * 3. Copy the snapshot to the userspace.
2197 * 4. Upon return caller flushes TLB's if needed.
2198 *
2199 * Between 2 and 4, the guest may write to the page using the remaining TLB
2200 * entry. This is not a problem because the page is reported dirty using
2201 * the snapshot taken before and step 4 ensures that writes done after
2202 * exiting to userspace will be logged for the next call.
2203 *
2204 */
2205 static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2206 {
2207 struct kvm_memslots *slots;
2208 struct kvm_memory_slot *memslot;
2209 int i, as_id, id;
2210 unsigned long n;
2211 unsigned long *dirty_bitmap;
2212 unsigned long *dirty_bitmap_buffer;
2213 bool flush;
2214
2215 /* Dirty ring tracking may be exclusive to dirty log tracking */
2216 if (!kvm_use_dirty_bitmap(kvm))
2217 return -ENXIO;
2218
2219 as_id = log->slot >> 16;
2220 id = (u16)log->slot;
2221 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2222 return -EINVAL;
2223
2224 slots = __kvm_memslots(kvm, as_id);
2225 memslot = id_to_memslot(slots, id);
2226 if (!memslot || !memslot->dirty_bitmap)
2227 return -ENOENT;
2228
2229 dirty_bitmap = memslot->dirty_bitmap;
2230
2231 kvm_arch_sync_dirty_log(kvm, memslot);
2232
2233 n = kvm_dirty_bitmap_bytes(memslot);
2234 flush = false;
2235 if (kvm->manual_dirty_log_protect) {
2236 /*
2237 * Unlike kvm_get_dirty_log, we always return false in *flush,
2238 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
2239 * is some code duplication between this function and
2240 * kvm_get_dirty_log, but hopefully all architectures will
2241 * transition to kvm_get_dirty_log_protect so that kvm_get_dirty_log
2242 * can be eliminated.
2243 */
2244 dirty_bitmap_buffer = dirty_bitmap;
2245 } else {
2246 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2247 memset(dirty_bitmap_buffer, 0, n);
2248
2249 KVM_MMU_LOCK(kvm);
2250 for (i = 0; i < n / sizeof(long); i++) {
2251 unsigned long mask;
2252 gfn_t offset;
2253
2254 if (!dirty_bitmap[i])
2255 continue;
2256
2257 flush = true;
2258 mask = xchg(&dirty_bitmap[i], 0);
2259 dirty_bitmap_buffer[i] = mask;
2260
2261 offset = i * BITS_PER_LONG;
2262 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2263 offset, mask);
2264 }
2265 KVM_MMU_UNLOCK(kvm);
2266 }
2267
2268 if (flush)
2269 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2270
2271 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2272 return -EFAULT;
2273 return 0;
2274 }
2275
2276
2277 /**
2278 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2279 * @kvm: kvm instance
2280 * @log: slot id and address to which we copy the log
2281 *
2282 * Steps 1-4 below provide general overview of dirty page logging. See
2283 * kvm_get_dirty_log_protect() function description for additional details.
2284 *
2285 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
2286 * always flush the TLB (step 4) even if a previous step failed and the dirty
2287 * bitmap may be corrupt. Regardless of the previous outcome, the KVM logging
2288 * API does not preclude a subsequent dirty log read by user space. Flushing
2289 * the TLB ensures writes will be marked dirty for the next log read.
2290 *
2291 * 1. Take a snapshot of the bit and clear it if needed.
2292 * 2. Write protect the corresponding page.
2293 * 3. Copy the snapshot to the userspace.
2294 * 4. Flush TLB's if needed.
2295 */
2296 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2297 struct kvm_dirty_log *log)
2298 {
2299 int r;
2300
2301 mutex_lock(&kvm->slots_lock);
2302
2303 r = kvm_get_dirty_log_protect(kvm, log);
2304
2305 mutex_unlock(&kvm->slots_lock);
2306 return r;
2307 }
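
/*
 * Userspace-side illustration of the ordering documented above; vm_fd, as_id,
 * slot and npages are assumptions for the example.  The buffer must cover one
 * bit per page of the memslot, rounded up to a multiple of 64 bits, and any
 * write that races with the ioctl is reported by the *next* call:
 *
 *	void *bitmap = calloc(1, (npages + 63) / 64 * 8);
 *	struct kvm_dirty_log log = {
 *		.slot	      = (as_id << 16) | slot,
 *		.dirty_bitmap = bitmap,
 *	};
 *
 *	ret = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
 */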
2308
2309 /**
2310 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2311 * and reenable dirty page tracking for the corresponding pages.
2312 * @kvm: pointer to kvm instance
2313 * @log: slot id and address from which to fetch the bitmap of dirty pages
2314 */
2315 static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2316 struct kvm_clear_dirty_log *log)
2317 {
2318 struct kvm_memslots *slots;
2319 struct kvm_memory_slot *memslot;
2320 int as_id, id;
2321 gfn_t offset;
2322 unsigned long i, n;
2323 unsigned long *dirty_bitmap;
2324 unsigned long *dirty_bitmap_buffer;
2325 bool flush;
2326
2327 /* Dirty ring tracking may be exclusive to dirty log tracking */
2328 if (!kvm_use_dirty_bitmap(kvm))
2329 return -ENXIO;
2330
2331 as_id = log->slot >> 16;
2332 id = (u16)log->slot;
2333 if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2334 return -EINVAL;
2335
2336 if (log->first_page & 63)
2337 return -EINVAL;
2338
2339 slots = __kvm_memslots(kvm, as_id);
2340 memslot = id_to_memslot(slots, id);
2341 if (!memslot || !memslot->dirty_bitmap)
2342 return -ENOENT;
2343
2344 dirty_bitmap = memslot->dirty_bitmap;
2345
2346 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2347
2348 if (log->first_page > memslot->npages ||
2349 log->num_pages > memslot->npages - log->first_page ||
2350 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2351 return -EINVAL;
2352
2353 kvm_arch_sync_dirty_log(kvm, memslot);
2354
2355 flush = false;
2356 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2357 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2358 return -EFAULT;
2359
2360 KVM_MMU_LOCK(kvm);
2361 for (offset = log->first_page, i = offset / BITS_PER_LONG,
2362 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2363 i++, offset += BITS_PER_LONG) {
2364 unsigned long mask = *dirty_bitmap_buffer++;
2365 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2366 if (!mask)
2367 continue;
2368
2369 mask &= atomic_long_fetch_andnot(mask, p);
2370
2371 /*
2372 * mask contains the bits that really have been cleared. This
2373 * never includes any bits beyond the length of the memslot (if
2374 * the length is not aligned to 64 pages), therefore it is not
2375 * a problem if userspace sets them in log->dirty_bitmap.
2376 */
2377 if (mask) {
2378 flush = true;
2379 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2380 offset, mask);
2381 }
2382 }
2383 KVM_MMU_UNLOCK(kvm);
2384
2385 if (flush)
2386 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2387
2388 return 0;
2389 }
2390
2391 static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2392 struct kvm_clear_dirty_log *log)
2393 {
2394 int r;
2395
2396 mutex_lock(&kvm->slots_lock);
2397
2398 r = kvm_clear_dirty_log_protect(kvm, log);
2399
2400 mutex_unlock(&kvm->slots_lock);
2401 return r;
2402 }
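
/*
 * Userspace-side illustration of the alignment rules enforced above; vm_fd,
 * as_id, slot and the bitmap are assumptions for the example.  first_page
 * must be a multiple of 64, and num_pages must be a multiple of 64 unless the
 * range extends to the end of the memslot.  Bit 0 of the supplied bitmap
 * corresponds to first_page:
 *
 *	struct kvm_clear_dirty_log clear = {
 *		.slot	      = (as_id << 16) | slot,
 *		.first_page   = 64,
 *		.num_pages    = 128,
 *		.dirty_bitmap = bitmap,
 *	};
 *
 *	ret = ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
 */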
2403 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2404
2405 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
2406 static u64 kvm_supported_mem_attributes(struct kvm *kvm)
2407 {
2408 if (!kvm || kvm_arch_has_private_mem(kvm))
2409 return KVM_MEMORY_ATTRIBUTE_PRIVATE;
2410
2411 return 0;
2412 }
2413
2414 /*
2415 * Returns true if _all_ gfns in the range [@start, @end) have attributes
2416 * such that the bits in @mask match @attrs.
2417 */
2418 bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2419 unsigned long mask, unsigned long attrs)
2420 {
2421 XA_STATE(xas, &kvm->mem_attr_array, start);
2422 unsigned long index;
2423 void *entry;
2424
2425 mask &= kvm_supported_mem_attributes(kvm);
2426 if (attrs & ~mask)
2427 return false;
2428
2429 if (end == start + 1)
2430 return (kvm_get_memory_attributes(kvm, start) & mask) == attrs;
2431
2432 guard(rcu)();
2433 if (!attrs)
2434 return !xas_find(&xas, end - 1);
2435
2436 for (index = start; index < end; index++) {
2437 do {
2438 entry = xas_next(&xas);
2439 } while (xas_retry(&xas, entry));
2440
2441 if (xas.xa_index != index ||
2442 (xa_to_value(entry) & mask) != attrs)
2443 return false;
2444 }
2445
2446 return true;
2447 }
2448
2449 static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
2450 struct kvm_mmu_notifier_range *range)
2451 {
2452 struct kvm_gfn_range gfn_range;
2453 struct kvm_memory_slot *slot;
2454 struct kvm_memslots *slots;
2455 struct kvm_memslot_iter iter;
2456 bool found_memslot = false;
2457 bool ret = false;
2458 int i;
2459
2460 gfn_range.arg = range->arg;
2461 gfn_range.may_block = range->may_block;
2462
2463 /*
2464 * If/when KVM supports more attributes beyond private vs. shared, this
2465 * _could_ set KVM_FILTER_{SHARED,PRIVATE} appropriately if the entire target
2466 * range already has the desired private vs. shared state (it's unclear
2467 * if that is a net win). For now, KVM reaches this point if and only
2468 * if the private flag is being toggled, i.e. all mappings are in play.
2469 */
2470
2471 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
2472 slots = __kvm_memslots(kvm, i);
2473
2474 kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
2475 slot = iter.slot;
2476 gfn_range.slot = slot;
2477
2478 gfn_range.start = max(range->start, slot->base_gfn);
2479 gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
2480 if (gfn_range.start >= gfn_range.end)
2481 continue;
2482
2483 if (!found_memslot) {
2484 found_memslot = true;
2485 KVM_MMU_LOCK(kvm);
2486 if (!IS_KVM_NULL_FN(range->on_lock))
2487 range->on_lock(kvm);
2488 }
2489
2490 ret |= range->handler(kvm, &gfn_range);
2491 }
2492 }
2493
2494 if (range->flush_on_ret && ret)
2495 kvm_flush_remote_tlbs(kvm);
2496
2497 if (found_memslot)
2498 KVM_MMU_UNLOCK(kvm);
2499 }
2500
2501 static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
2502 struct kvm_gfn_range *range)
2503 {
2504 /*
2505 * Unconditionally add the range to the invalidation set, regardless of
2506 * whether or not the arch callback actually needs to zap SPTEs. E.g.
2507 * if KVM supports RWX attributes in the future and the attributes are
2508 * going from R=>RW, zapping isn't strictly necessary. Unconditionally
2509 * adding the range allows KVM to require that MMU invalidations add at
2510 * least one range between begin() and end(), e.g. allows KVM to detect
2511 * bugs where the add() is missed. Relaxing the rule *might* be safe,
2512 * but it's not obvious that allowing new mappings while the attributes
2513 * are in flux is desirable or worth the complexity.
2514 */
2515 kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
2516
2517 return kvm_arch_pre_set_memory_attributes(kvm, range);
2518 }
2519
2520 /* Set @attributes for the gfn range [@start, @end). */
2521 static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2522 unsigned long attributes)
2523 {
2524 struct kvm_mmu_notifier_range pre_set_range = {
2525 .start = start,
2526 .end = end,
2527 .arg.attributes = attributes,
2528 .handler = kvm_pre_set_memory_attributes,
2529 .on_lock = kvm_mmu_invalidate_begin,
2530 .flush_on_ret = true,
2531 .may_block = true,
2532 };
2533 struct kvm_mmu_notifier_range post_set_range = {
2534 .start = start,
2535 .end = end,
2536 .arg.attributes = attributes,
2537 .handler = kvm_arch_post_set_memory_attributes,
2538 .on_lock = kvm_mmu_invalidate_end,
2539 .may_block = true,
2540 };
2541 unsigned long i;
2542 void *entry;
2543 int r = 0;
2544
2545 entry = attributes ? xa_mk_value(attributes) : NULL;
2546
2547 mutex_lock(&kvm->slots_lock);
2548
2549 /* Nothing to do if the entire range already has the desired attributes. */
2550 if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
2551 goto out_unlock;
2552
2553 /*
2554 * Reserve memory ahead of time to avoid having to deal with failures
2555 * partway through setting the new attributes.
2556 */
2557 for (i = start; i < end; i++) {
2558 r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
2559 if (r)
2560 goto out_unlock;
2561
2562 cond_resched();
2563 }
2564
2565 kvm_handle_gfn_range(kvm, &pre_set_range);
2566
2567 for (i = start; i < end; i++) {
2568 r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
2569 GFP_KERNEL_ACCOUNT));
2570 KVM_BUG_ON(r, kvm);
2571 cond_resched();
2572 }
2573
2574 kvm_handle_gfn_range(kvm, &post_set_range);
2575
2576 out_unlock:
2577 mutex_unlock(&kvm->slots_lock);
2578
2579 return r;
2580 }
2581 static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
2582 struct kvm_memory_attributes *attrs)
2583 {
2584 gfn_t start, end;
2585
2586 /* flags is currently not used. */
2587 if (attrs->flags)
2588 return -EINVAL;
2589 if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
2590 return -EINVAL;
2591 if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
2592 return -EINVAL;
2593 if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
2594 return -EINVAL;
2595
2596 start = attrs->address >> PAGE_SHIFT;
2597 end = (attrs->address + attrs->size) >> PAGE_SHIFT;
2598
2599 /*
2600 * xarray tracks data using "unsigned long", and as a result so does
2601 * KVM. For simplicity, generic attributes are supported only on 64-bit
2602 * architectures.
2603 */
2604 BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
2605
2606 return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
2607 }
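
/*
 * Userspace-side illustration; vm_fd and gpa are assumptions for the example.
 * Converting a page-aligned 2MiB range to private reaches
 * kvm_vm_set_mem_attributes() with start = gpa >> PAGE_SHIFT and
 * end = (gpa + 2MiB) >> PAGE_SHIFT:
 *
 *	struct kvm_memory_attributes attr = {
 *		.address    = gpa,
 *		.size	    = 2 * 1024 * 1024,
 *		.attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,
 *	};
 *
 *	ret = ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attr);
 */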
2608 #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
2609
2610 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2611 {
2612 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2613 }
2614 EXPORT_SYMBOL_GPL(gfn_to_memslot);
2615
2616 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2617 {
2618 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2619 u64 gen = slots->generation;
2620 struct kvm_memory_slot *slot;
2621
2622 /*
2623 * This also protects against using a memslot from a different address space,
2624 * since different address spaces have different generation numbers.
2625 */
2626 if (unlikely(gen != vcpu->last_used_slot_gen)) {
2627 vcpu->last_used_slot = NULL;
2628 vcpu->last_used_slot_gen = gen;
2629 }
2630
2631 slot = try_get_memslot(vcpu->last_used_slot, gfn);
2632 if (slot)
2633 return slot;
2634
2635 /*
2636 * Fall back to searching all memslots. We purposely use
2637 * search_memslots() instead of __gfn_to_memslot() to avoid
2638 * thrashing the VM-wide last_used_slot in kvm_memslots.
2639 */
2640 slot = search_memslots(slots, gfn, false);
2641 if (slot) {
2642 vcpu->last_used_slot = slot;
2643 return slot;
2644 }
2645
2646 return NULL;
2647 }
2648
2649 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2650 {
2651 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2652
2653 return kvm_is_visible_memslot(memslot);
2654 }
2655 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2656
2657 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2658 {
2659 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2660
2661 return kvm_is_visible_memslot(memslot);
2662 }
2663 EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2664
2665 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2666 {
2667 struct vm_area_struct *vma;
2668 unsigned long addr, size;
2669
2670 size = PAGE_SIZE;
2671
2672 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2673 if (kvm_is_error_hva(addr))
2674 return PAGE_SIZE;
2675
2676 mmap_read_lock(current->mm);
2677 vma = find_vma(current->mm, addr);
2678 if (!vma)
2679 goto out;
2680
2681 size = vma_kernel_pagesize(vma);
2682
2683 out:
2684 mmap_read_unlock(current->mm);
2685
2686 return size;
2687 }
2688
2689 static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
2690 {
2691 return slot->flags & KVM_MEM_READONLY;
2692 }
2693
2694 /*
2695 * Return the memslot of a @gfn and the R/W attribute if slot is valid, or NULL
2696 * if slot is not valid.
2697 *
2698 * @slot: the kvm_memory_slot which contains @gfn
2699 * @gfn: the gfn to be translated
2700 * @writable: used to return the read/write attribute of the @slot if the hva
2701 * is valid and @writable is not NULL
2702 */
2703 struct kvm_memory_slot *gfn_to_memslot_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2704 {
2705 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2706
2707 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2708 return NULL;
2709
2710 if (writable)
2711 *writable = !memslot_is_readonly(slot);
2712
2713 return slot;
2714 }
2715
2716 static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
2717 gfn_t *nr_pages, bool write)
2718 {
2719 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2720 return KVM_HVA_ERR_BAD;
2721
2722 if (memslot_is_readonly(slot) && write)
2723 return KVM_HVA_ERR_RO_BAD;
2724
2725 if (nr_pages)
2726 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2727
2728 return __gfn_to_hva_memslot(slot, gfn);
2729 }
2730
2731 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2732 gfn_t *nr_pages)
2733 {
2734 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2735 }
2736
2737 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2738 gfn_t gfn)
2739 {
2740 return gfn_to_hva_many(slot, gfn, NULL);
2741 }
2742 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2743
2744 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2745 {
2746 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2747 }
2748 EXPORT_SYMBOL_GPL(gfn_to_hva);
2749
2750 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2751 {
2752 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2753 }
2754 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
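
/*
 * For reference, the translation done by __gfn_to_hva_memslot() (a static
 * inline in kvm_host.h) is, apart from a speculation clamp on the offset,
 * plain linear arithmetic within the slot:
 *
 *	hva = slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
 *
 * The helpers above only differ in how the slot is looked up and in how
 * read-only or invalid slots are reported.
 */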
2755
2756 /*
2757 * Return the hva of a @gfn and the R/W attribute if possible.
2758 *
2759 * @slot: the kvm_memory_slot which contains @gfn
2760 * @gfn: the gfn to be translated
2761 * @writable: used to return the read/write attribute of the @slot if the hva
2762 * is valid and @writable is not NULL
2763 */
2764 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2765 gfn_t gfn, bool *writable)
2766 {
2767 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2768
2769 if (!kvm_is_error_hva(hva) && writable)
2770 *writable = !memslot_is_readonly(slot);
2771
2772 return hva;
2773 }
2774
2775 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2776 {
2777 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2778
2779 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2780 }
2781
2782 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2783 {
2784 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2785
2786 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2787 }
2788
2789 static inline int check_user_page_hwpoison(unsigned long addr)
2790 {
2791 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2792
2793 rc = get_user_pages(addr, 1, flags, NULL);
2794 return rc == -EHWPOISON;
2795 }
2796
2797 /*
2798 * The fast path to get the writable pfn which will be stored in @pfn;
2799 * true indicates success, otherwise false is returned. It's also the
2800 * only part that can run in atomic context.
2801 */
2802 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2803 bool *writable, kvm_pfn_t *pfn)
2804 {
2805 struct page *page[1];
2806
2807 /*
2808 * Fast pin a writable pfn only if it is a write fault request
2809 * or the caller allows to map a writable pfn for a read fault
2810 * request.
2811 */
2812 if (!(write_fault || writable))
2813 return false;
2814
2815 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2816 *pfn = page_to_pfn(page[0]);
2817
2818 if (writable)
2819 *writable = true;
2820 return true;
2821 }
2822
2823 return false;
2824 }
2825
2826 /*
2827 * The slow path to get the pfn of the specified host virtual address,
2828 * 1 indicates success, -errno is returned if error is detected.
2829 */
2830 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2831 bool interruptible, bool *writable, kvm_pfn_t *pfn)
2832 {
2833 /*
2834 * When a VCPU accesses a page that is not mapped into the secondary
2835 * MMU, we lookup the page using GUP to map it, so the guest VCPU can
2836 * make progress. We always want to honor NUMA hinting faults in that
2837 * case, because GUP usage corresponds to memory accesses from the VCPU.
2838 * Otherwise, we'd not trigger NUMA hinting faults once a page is
2839 * mapped into the secondary MMU and gets accessed by a VCPU.
2840 *
2841 * Note that get_user_page_fast_only() and FOLL_WRITE for now
2842 * implicitly honor NUMA hinting faults and don't need this flag.
2843 */
2844 unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT;
2845 struct page *page;
2846 int npages;
2847
2848 might_sleep();
2849
2850 if (writable)
2851 *writable = write_fault;
2852
2853 if (write_fault)
2854 flags |= FOLL_WRITE;
2855 if (async)
2856 flags |= FOLL_NOWAIT;
2857 if (interruptible)
2858 flags |= FOLL_INTERRUPTIBLE;
2859
2860 npages = get_user_pages_unlocked(addr, 1, &page, flags);
2861 if (npages != 1)
2862 return npages;
2863
2864 /* map read fault as writable if possible */
2865 if (unlikely(!write_fault) && writable) {
2866 struct page *wpage;
2867
2868 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2869 *writable = true;
2870 put_page(page);
2871 page = wpage;
2872 }
2873 }
2874 *pfn = page_to_pfn(page);
2875 return npages;
2876 }
2877
2878 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2879 {
2880 if (unlikely(!(vma->vm_flags & VM_READ)))
2881 return false;
2882
2883 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2884 return false;
2885
2886 return true;
2887 }
2888
2889 static int kvm_try_get_pfn(kvm_pfn_t pfn)
2890 {
2891 struct page *page = kvm_pfn_to_refcounted_page(pfn);
2892
2893 if (!page)
2894 return 1;
2895
2896 return get_page_unless_zero(page);
2897 }
2898
2899 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2900 unsigned long addr, bool write_fault,
2901 bool *writable, kvm_pfn_t *p_pfn)
2902 {
2903 struct follow_pfnmap_args args = { .vma = vma, .address = addr };
2904 kvm_pfn_t pfn;
2905 int r;
2906
2907 r = follow_pfnmap_start(&args);
2908 if (r) {
2909 /*
2910 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2911 * not call the fault handler, so do it here.
2912 */
2913 bool unlocked = false;
2914 r = fixup_user_fault(current->mm, addr,
2915 (write_fault ? FAULT_FLAG_WRITE : 0),
2916 &unlocked);
2917 if (unlocked)
2918 return -EAGAIN;
2919 if (r)
2920 return r;
2921
2922 r = follow_pfnmap_start(&args);
2923 if (r)
2924 return r;
2925 }
2926
2927 if (write_fault && !args.writable) {
2928 pfn = KVM_PFN_ERR_RO_FAULT;
2929 goto out;
2930 }
2931
2932 if (writable)
2933 *writable = args.writable;
2934 pfn = args.pfn;
2935
2936 /*
2937 * Get a reference here because callers of *hva_to_pfn* and
2938 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2939 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
2940 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2941 * simply do nothing for reserved pfns.
2942 *
2943 * Whoever called remap_pfn_range is also going to call e.g.
2944 * unmap_mapping_range before the underlying pages are freed,
2945 * causing a call to our MMU notifier.
2946 *
2947 * Certain IO or PFNMAP mappings can be backed with valid
2948 * struct pages, but be allocated without refcounting e.g.,
2949 * tail pages of non-compound higher order allocations, which
2950 * would then underflow the refcount when the caller does the
2951 * required put_page. Don't allow those pages here.
2952 */
2953 if (!kvm_try_get_pfn(pfn))
2954 r = -EFAULT;
2955 out:
2956 follow_pfnmap_end(&args);
2957 *p_pfn = pfn;
2958
2959 return r;
2960 }
2961
2962 /*
2963 * Pin guest page in memory and return its pfn.
2964 * @addr: host virtual address which maps memory to the guest
2965 * @atomic: whether this function is forbidden from sleeping
2966 * @interruptible: whether the process can be interrupted by non-fatal signals
2967 * @async: whether this function needs to wait for IO to complete if the
2968 * host page is not in memory
2969 * @write_fault: whether we should get a writable host page
2970 * @writable: whether to allow mapping a writable host page for !@write_fault
2971 *
2972 * The function will map a writable host page for these two cases:
2973 * 1): @write_fault = true
2974 * 2): @write_fault = false && @writable, @writable will tell the caller
2975 * whether the mapping is writable.
2976 */
2977 kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
2978 bool *async, bool write_fault, bool *writable)
2979 {
2980 struct vm_area_struct *vma;
2981 kvm_pfn_t pfn;
2982 int npages, r;
2983
2984 /* we can do it either atomically or asynchronously, not both */
2985 BUG_ON(atomic && async);
2986
2987 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2988 return pfn;
2989
2990 if (atomic)
2991 return KVM_PFN_ERR_FAULT;
2992
2993 npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
2994 writable, &pfn);
2995 if (npages == 1)
2996 return pfn;
2997 if (npages == -EINTR)
2998 return KVM_PFN_ERR_SIGPENDING;
2999
3000 mmap_read_lock(current->mm);
3001 if (npages == -EHWPOISON ||
3002 (!async && check_user_page_hwpoison(addr))) {
3003 pfn = KVM_PFN_ERR_HWPOISON;
3004 goto exit;
3005 }
3006
3007 retry:
3008 vma = vma_lookup(current->mm, addr);
3009
3010 if (vma == NULL)
3011 pfn = KVM_PFN_ERR_FAULT;
3012 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
3013 r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
3014 if (r == -EAGAIN)
3015 goto retry;
3016 if (r < 0)
3017 pfn = KVM_PFN_ERR_FAULT;
3018 } else {
3019 if (async && vma_is_valid(vma, write_fault))
3020 *async = true;
3021 pfn = KVM_PFN_ERR_FAULT;
3022 }
3023 exit:
3024 mmap_read_unlock(current->mm);
3025 return pfn;
3026 }
3027
3028 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
3029 bool atomic, bool interruptible, bool *async,
3030 bool write_fault, bool *writable, hva_t *hva)
3031 {
3032 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
3033
3034 if (hva)
3035 *hva = addr;
3036
3037 if (kvm_is_error_hva(addr)) {
3038 if (writable)
3039 *writable = false;
3040
3041 return addr == KVM_HVA_ERR_RO_BAD ? KVM_PFN_ERR_RO_FAULT :
3042 KVM_PFN_NOSLOT;
3043 }
3044
3045 /* Do not map writable pfn in the readonly memslot. */
3046 if (writable && memslot_is_readonly(slot)) {
3047 *writable = false;
3048 writable = NULL;
3049 }
3050
3051 return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
3052 writable);
3053 }
3054 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
3055
3056 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
3057 bool *writable)
3058 {
3059 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
3060 NULL, write_fault, writable, NULL);
3061 }
3062 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
3063
3064 kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
3065 {
3066 return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
3067 NULL, NULL);
3068 }
3069 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
3070
3071 kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
3072 {
3073 return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
3074 NULL, NULL);
3075 }
3076 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
3077
3078 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
3079 {
3080 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
3081 }
3082 EXPORT_SYMBOL_GPL(gfn_to_pfn);
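
/*
 * Sketch of the expected pairing for the gfn_to_pfn() family; the surrounding
 * code is hypothetical.  Every successfully resolved pfn must eventually be
 * released via kvm_release_pfn_clean() or kvm_release_pfn_dirty(), defined
 * later in this file:
 *
 *	kvm_pfn_t pfn = gfn_to_pfn(kvm, gfn);
 *
 *	if (!is_error_noslot_pfn(pfn)) {
 *		... access the backing page ...
 *		kvm_release_pfn_clean(pfn);
 *	}
 */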
3083
3084 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3085 struct page **pages, int nr_pages)
3086 {
3087 unsigned long addr;
3088 gfn_t entry = 0;
3089
3090 addr = gfn_to_hva_many(slot, gfn, &entry);
3091 if (kvm_is_error_hva(addr))
3092 return -1;
3093
3094 if (entry < nr_pages)
3095 return 0;
3096
3097 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
3098 }
3099 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
3100
3101 /*
3102 * Do not use this helper unless you are absolutely certain the gfn _must_ be
3103 * backed by 'struct page'. A valid example is if the backing memslot is
3104 * controlled by KVM. Note, if the returned page is valid, its refcount has
3105 * been elevated by gfn_to_pfn().
3106 */
3107 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
3108 {
3109 struct page *page;
3110 kvm_pfn_t pfn;
3111
3112 pfn = gfn_to_pfn(kvm, gfn);
3113
3114 if (is_error_noslot_pfn(pfn))
3115 return KVM_ERR_PTR_BAD_PAGE;
3116
3117 page = kvm_pfn_to_refcounted_page(pfn);
3118 if (!page)
3119 return KVM_ERR_PTR_BAD_PAGE;
3120
3121 return page;
3122 }
3123 EXPORT_SYMBOL_GPL(gfn_to_page);
3124
3125 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
3126 {
3127 if (dirty)
3128 kvm_release_pfn_dirty(pfn);
3129 else
3130 kvm_release_pfn_clean(pfn);
3131 }
3132
3133 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
3134 {
3135 kvm_pfn_t pfn;
3136 void *hva = NULL;
3137 struct page *page = KVM_UNMAPPED_PAGE;
3138
3139 if (!map)
3140 return -EINVAL;
3141
3142 pfn = gfn_to_pfn(vcpu->kvm, gfn);
3143 if (is_error_noslot_pfn(pfn))
3144 return -EINVAL;
3145
3146 if (pfn_valid(pfn)) {
3147 page = pfn_to_page(pfn);
3148 hva = kmap(page);
3149 #ifdef CONFIG_HAS_IOMEM
3150 } else {
3151 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
3152 #endif
3153 }
3154
3155 if (!hva)
3156 return -EFAULT;
3157
3158 map->page = page;
3159 map->hva = hva;
3160 map->pfn = pfn;
3161 map->gfn = gfn;
3162
3163 return 0;
3164 }
3165 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
3166
3167 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
3168 {
3169 if (!map)
3170 return;
3171
3172 if (!map->hva)
3173 return;
3174
3175 if (map->page != KVM_UNMAPPED_PAGE)
3176 kunmap(map->page);
3177 #ifdef CONFIG_HAS_IOMEM
3178 else
3179 memunmap(map->hva);
3180 #endif
3181
3182 if (dirty)
3183 kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
3184
3185 kvm_release_pfn(map->pfn, dirty);
3186
3187 map->hva = NULL;
3188 map->page = NULL;
3189 }
3190 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
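
/*
 * Typical usage of the map/unmap pair above; zeroing a guest page is only an
 * example and not taken from any caller:
 *
 *	struct kvm_host_map map;
 *
 *	if (!kvm_vcpu_map(vcpu, gfn, &map)) {
 *		memset(map.hva, 0, PAGE_SIZE);
 *		kvm_vcpu_unmap(vcpu, &map, true);
 *	}
 *
 * Passing true to kvm_vcpu_unmap() marks the gfn dirty via
 * kvm_vcpu_mark_page_dirty() before the pfn is released.
 */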
3191
3192 static bool kvm_is_ad_tracked_page(struct page *page)
3193 {
3194 /*
3195 * Per page-flags.h, pages tagged PG_reserved "should in general not be
3196 * touched (e.g. set dirty) except by its owner".
3197 */
3198 return !PageReserved(page);
3199 }
3200
3201 static void kvm_set_page_dirty(struct page *page)
3202 {
3203 if (kvm_is_ad_tracked_page(page))
3204 SetPageDirty(page);
3205 }
3206
3207 static void kvm_set_page_accessed(struct page *page)
3208 {
3209 if (kvm_is_ad_tracked_page(page))
3210 mark_page_accessed(page);
3211 }
3212
3213 void kvm_release_page_clean(struct page *page)
3214 {
3215 WARN_ON(is_error_page(page));
3216
3217 kvm_set_page_accessed(page);
3218 put_page(page);
3219 }
3220 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
3221
3222 void kvm_release_pfn_clean(kvm_pfn_t pfn)
3223 {
3224 struct page *page;
3225
3226 if (is_error_noslot_pfn(pfn))
3227 return;
3228
3229 page = kvm_pfn_to_refcounted_page(pfn);
3230 if (!page)
3231 return;
3232
3233 kvm_release_page_clean(page);
3234 }
3235 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
3236
3237 void kvm_release_page_dirty(struct page *page)
3238 {
3239 WARN_ON(is_error_page(page));
3240
3241 kvm_set_page_dirty(page);
3242 kvm_release_page_clean(page);
3243 }
3244 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
3245
3246 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
3247 {
3248 struct page *page;
3249
3250 if (is_error_noslot_pfn(pfn))
3251 return;
3252
3253 page = kvm_pfn_to_refcounted_page(pfn);
3254 if (!page)
3255 return;
3256
3257 kvm_release_page_dirty(page);
3258 }
3259 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
3260
3261 /*
3262 * Note, checking for an error/noslot pfn is the caller's responsibility when
3263 * directly marking a page dirty/accessed. Unlike the "release" helpers, the
3264 * "set" helpers are not to be used when the pfn might point at garbage.
3265 */
3266 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
3267 {
3268 if (WARN_ON(is_error_noslot_pfn(pfn)))
3269 return;
3270
3271 if (pfn_valid(pfn))
3272 kvm_set_page_dirty(pfn_to_page(pfn));
3273 }
3274 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
3275
3276 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
3277 {
3278 if (WARN_ON(is_error_noslot_pfn(pfn)))
3279 return;
3280
3281 if (pfn_valid(pfn))
3282 kvm_set_page_accessed(pfn_to_page(pfn));
3283 }
3284 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
3285
3286 static int next_segment(unsigned long len, int offset)
3287 {
3288 if (len > PAGE_SIZE - offset)
3289 return PAGE_SIZE - offset;
3290 else
3291 return len;
3292 }
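
/*
 * Example, assuming PAGE_SIZE == 4096: a 5000 byte access starting at offset
 * 3000 within the first page is split by next_segment() into 1096 bytes for
 * the first page and 3904 bytes for the second; this is how the loops below
 * walk a gpa range page by page.
 */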
3293
3294 /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
3295 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
3296 void *data, int offset, int len)
3297 {
3298 int r;
3299 unsigned long addr;
3300
3301 if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
3302 return -EFAULT;
3303
3304 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3305 if (kvm_is_error_hva(addr))
3306 return -EFAULT;
3307 r = __copy_from_user(data, (void __user *)addr + offset, len);
3308 if (r)
3309 return -EFAULT;
3310 return 0;
3311 }
3312
3313 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
3314 int len)
3315 {
3316 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3317
3318 return __kvm_read_guest_page(slot, gfn, data, offset, len);
3319 }
3320 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
3321
3322 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
3323 int offset, int len)
3324 {
3325 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3326
3327 return __kvm_read_guest_page(slot, gfn, data, offset, len);
3328 }
3329 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
3330
3331 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
3332 {
3333 gfn_t gfn = gpa >> PAGE_SHIFT;
3334 int seg;
3335 int offset = offset_in_page(gpa);
3336 int ret;
3337
3338 while ((seg = next_segment(len, offset)) != 0) {
3339 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3340 if (ret < 0)
3341 return ret;
3342 offset = 0;
3343 len -= seg;
3344 data += seg;
3345 ++gfn;
3346 }
3347 return 0;
3348 }
3349 EXPORT_SYMBOL_GPL(kvm_read_guest);
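
/*
 * Minimal usage sketch; the structure and gpa are hypothetical.  Because
 * kvm_read_guest() iterates with next_segment() and looks up the memslot per
 * gfn, the copy may legitimately cross page and memslot boundaries:
 *
 *	struct some_guest_desc desc;
 *
 *	if (kvm_read_guest(kvm, gpa, &desc, sizeof(desc)))
 *		return -EFAULT;
 */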
3350
3351 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
3352 {
3353 gfn_t gfn = gpa >> PAGE_SHIFT;
3354 int seg;
3355 int offset = offset_in_page(gpa);
3356 int ret;
3357
3358 while ((seg = next_segment(len, offset)) != 0) {
3359 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3360 if (ret < 0)
3361 return ret;
3362 offset = 0;
3363 len -= seg;
3364 data += seg;
3365 ++gfn;
3366 }
3367 return 0;
3368 }
3369 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
3370
3371 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3372 void *data, int offset, unsigned long len)
3373 {
3374 int r;
3375 unsigned long addr;
3376
3377 if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
3378 return -EFAULT;
3379
3380 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3381 if (kvm_is_error_hva(addr))
3382 return -EFAULT;
3383 pagefault_disable();
3384 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
3385 pagefault_enable();
3386 if (r)
3387 return -EFAULT;
3388 return 0;
3389 }
3390
3391 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3392 void *data, unsigned long len)
3393 {
3394 gfn_t gfn = gpa >> PAGE_SHIFT;
3395 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3396 int offset = offset_in_page(gpa);
3397
3398 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3399 }
3400 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
3401
3402 /* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */
3403 static int __kvm_write_guest_page(struct kvm *kvm,
3404 struct kvm_memory_slot *memslot, gfn_t gfn,
3405 const void *data, int offset, int len)
3406 {
3407 int r;
3408 unsigned long addr;
3409
3410 if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
3411 return -EFAULT;
3412
3413 addr = gfn_to_hva_memslot(memslot, gfn);
3414 if (kvm_is_error_hva(addr))
3415 return -EFAULT;
3416 r = __copy_to_user((void __user *)addr + offset, data, len);
3417 if (r)
3418 return -EFAULT;
3419 mark_page_dirty_in_slot(kvm, memslot, gfn);
3420 return 0;
3421 }
3422
3423 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3424 const void *data, int offset, int len)
3425 {
3426 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3427
3428 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
3429 }
3430 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
3431
3432 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3433 const void *data, int offset, int len)
3434 {
3435 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3436
3437 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
3438 }
3439 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
3440
3441 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
3442 unsigned long len)
3443 {
3444 gfn_t gfn = gpa >> PAGE_SHIFT;
3445 int seg;
3446 int offset = offset_in_page(gpa);
3447 int ret;
3448
3449 while ((seg = next_segment(len, offset)) != 0) {
3450 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3451 if (ret < 0)
3452 return ret;
3453 offset = 0;
3454 len -= seg;
3455 data += seg;
3456 ++gfn;
3457 }
3458 return 0;
3459 }
3460 EXPORT_SYMBOL_GPL(kvm_write_guest);
3461
3462 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
3463 unsigned long len)
3464 {
3465 gfn_t gfn = gpa >> PAGE_SHIFT;
3466 int seg;
3467 int offset = offset_in_page(gpa);
3468 int ret;
3469
3470 while ((seg = next_segment(len, offset)) != 0) {
3471 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3472 if (ret < 0)
3473 return ret;
3474 offset = 0;
3475 len -= seg;
3476 data += seg;
3477 ++gfn;
3478 }
3479 return 0;
3480 }
3481 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3482
3483 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3484 struct gfn_to_hva_cache *ghc,
3485 gpa_t gpa, unsigned long len)
3486 {
3487 int offset = offset_in_page(gpa);
3488 gfn_t start_gfn = gpa >> PAGE_SHIFT;
3489 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3490 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3491 gfn_t nr_pages_avail;
3492
3493 /* Update ghc->generation before performing any error checks. */
3494 ghc->generation = slots->generation;
3495
3496 if (start_gfn > end_gfn) {
3497 ghc->hva = KVM_HVA_ERR_BAD;
3498 return -EINVAL;
3499 }
3500
3501 /*
3502 * If the requested region crosses two memslots, we still
3503 * verify that the entire region is valid here.
3504 */
3505 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
3506 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3507 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3508 &nr_pages_avail);
3509 if (kvm_is_error_hva(ghc->hva))
3510 return -EFAULT;
3511 }
3512
3513 /* Use the slow path for cross page reads and writes. */
3514 if (nr_pages_needed == 1)
3515 ghc->hva += offset;
3516 else
3517 ghc->memslot = NULL;
3518
3519 ghc->gpa = gpa;
3520 ghc->len = len;
3521 return 0;
3522 }
3523
3524 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3525 gpa_t gpa, unsigned long len)
3526 {
3527 struct kvm_memslots *slots = kvm_memslots(kvm);
3528 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3529 }
3530 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
3531
3532 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3533 void *data, unsigned int offset,
3534 unsigned long len)
3535 {
3536 struct kvm_memslots *slots = kvm_memslots(kvm);
3537 int r;
3538 gpa_t gpa = ghc->gpa + offset;
3539
3540 if (WARN_ON_ONCE(len + offset > ghc->len))
3541 return -EINVAL;
3542
3543 if (slots->generation != ghc->generation) {
3544 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3545 return -EFAULT;
3546 }
3547
3548 if (kvm_is_error_hva(ghc->hva))
3549 return -EFAULT;
3550
3551 if (unlikely(!ghc->memslot))
3552 return kvm_write_guest(kvm, gpa, data, len);
3553
3554 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3555 if (r)
3556 return -EFAULT;
3557 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3558
3559 return 0;
3560 }
3561 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3562
3563 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3564 void *data, unsigned long len)
3565 {
3566 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
3567 }
3568 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3569
3570 int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3571 void *data, unsigned int offset,
3572 unsigned long len)
3573 {
3574 struct kvm_memslots *slots = kvm_memslots(kvm);
3575 int r;
3576 gpa_t gpa = ghc->gpa + offset;
3577
3578 if (WARN_ON_ONCE(len + offset > ghc->len))
3579 return -EINVAL;
3580
3581 if (slots->generation != ghc->generation) {
3582 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3583 return -EFAULT;
3584 }
3585
3586 if (kvm_is_error_hva(ghc->hva))
3587 return -EFAULT;
3588
3589 if (unlikely(!ghc->memslot))
3590 return kvm_read_guest(kvm, gpa, data, len);
3591
3592 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3593 if (r)
3594 return -EFAULT;
3595
3596 return 0;
3597 }
3598 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3599
3600 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3601 void *data, unsigned long len)
3602 {
3603 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3604 }
3605 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
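/*
 * Illustrative use of the cached accessors (kvm, gpa and val are
 * placeholders, not taken from an in-tree caller): the cache is
 * initialized once against a fixed guest address and then reused, being
 * revalidated transparently whenever the memslot generation changes.
 *
 *	struct gfn_to_hva_cache ghc;
 *	u64 val = 0;
 *
 *	if (kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val)))
 *		return -EFAULT;
 *	if (kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val)))
 *		return -EFAULT;
 */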
3606
3607 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3608 {
3609 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3610 gfn_t gfn = gpa >> PAGE_SHIFT;
3611 int seg;
3612 int offset = offset_in_page(gpa);
3613 int ret;
3614
3615 while ((seg = next_segment(len, offset)) != 0) {
3616 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
3617 if (ret < 0)
3618 return ret;
3619 offset = 0;
3620 len -= seg;
3621 ++gfn;
3622 }
3623 return 0;
3624 }
3625 EXPORT_SYMBOL_GPL(kvm_clear_guest);
3626
3627 void mark_page_dirty_in_slot(struct kvm *kvm,
3628 const struct kvm_memory_slot *memslot,
3629 gfn_t gfn)
3630 {
3631 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3632
3633 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3634 if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
3635 return;
3636
3637 WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
3638 #endif
3639
3640 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3641 unsigned long rel_gfn = gfn - memslot->base_gfn;
3642 u32 slot = (memslot->as_id << 16) | memslot->id;
3643
3644 if (kvm->dirty_ring_size && vcpu)
3645 kvm_dirty_ring_push(vcpu, slot, rel_gfn);
3646 else if (memslot->dirty_bitmap)
3647 set_bit_le(rel_gfn, memslot->dirty_bitmap);
3648 }
3649 }
3650 EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3651
3652 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3653 {
3654 struct kvm_memory_slot *memslot;
3655
3656 memslot = gfn_to_memslot(kvm, gfn);
3657 mark_page_dirty_in_slot(kvm, memslot, gfn);
3658 }
3659 EXPORT_SYMBOL_GPL(mark_page_dirty);
3660
3661 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3662 {
3663 struct kvm_memory_slot *memslot;
3664
3665 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3666 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3667 }
3668 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3669
3670 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3671 {
3672 if (!vcpu->sigset_active)
3673 return;
3674
3675 /*
3676 * This does a lockless modification of ->real_blocked, which is fine
3677 * because only current can change ->real_blocked and all readers of
3678 * ->real_blocked don't care as long as ->real_blocked is always a subset
3679 * of ->blocked.
3680 */
3681 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3682 }
3683
3684 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3685 {
3686 if (!vcpu->sigset_active)
3687 return;
3688
3689 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3690 sigemptyset(&current->real_blocked);
3691 }
3692
3693 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3694 {
3695 unsigned int old, val, grow, grow_start;
3696
3697 old = val = vcpu->halt_poll_ns;
3698 grow_start = READ_ONCE(halt_poll_ns_grow_start);
3699 grow = READ_ONCE(halt_poll_ns_grow);
3700 if (!grow)
3701 goto out;
3702
3703 val *= grow;
3704 if (val < grow_start)
3705 val = grow_start;
3706
3707 vcpu->halt_poll_ns = val;
3708 out:
3709 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3710 }
3711
3712 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3713 {
3714 unsigned int old, val, shrink, grow_start;
3715
3716 old = val = vcpu->halt_poll_ns;
3717 shrink = READ_ONCE(halt_poll_ns_shrink);
3718 grow_start = READ_ONCE(halt_poll_ns_grow_start);
3719 if (shrink == 0)
3720 val = 0;
3721 else
3722 val /= shrink;
3723
3724 if (val < grow_start)
3725 val = 0;
3726
3727 vcpu->halt_poll_ns = val;
3728 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3729 }
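/*
 * Worked example: with a grow factor of 2, a grow start of 10us and a
 * shrink divisor of 2, a vCPU's poll window grows 0 -> 10us -> 20us ->
 * 40us ... on successive short halts, is halved on a long block, and is
 * reset to 0 once halving would drop it below the grow start.
 */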
3730
3731 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3732 {
3733 int ret = -EINTR;
3734 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3735
3736 if (kvm_arch_vcpu_runnable(vcpu))
3737 goto out;
3738 if (kvm_cpu_has_pending_timer(vcpu))
3739 goto out;
3740 if (signal_pending(current))
3741 goto out;
3742 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3743 goto out;
3744
3745 ret = 0;
3746 out:
3747 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3748 return ret;
3749 }
3750
3751 /*
3752 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3753 * pending. This is mostly used when halting a vCPU, but may also be used
3754 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3755 */
3756 bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
3757 {
3758 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3759 bool waited = false;
3760
3761 vcpu->stat.generic.blocking = 1;
3762
3763 preempt_disable();
3764 kvm_arch_vcpu_blocking(vcpu);
3765 prepare_to_rcuwait(wait);
3766 preempt_enable();
3767
3768 for (;;) {
3769 set_current_state(TASK_INTERRUPTIBLE);
3770
3771 if (kvm_vcpu_check_block(vcpu) < 0)
3772 break;
3773
3774 waited = true;
3775 schedule();
3776 }
3777
3778 preempt_disable();
3779 finish_rcuwait(wait);
3780 kvm_arch_vcpu_unblocking(vcpu);
3781 preempt_enable();
3782
3783 vcpu->stat.generic.blocking = 0;
3784
3785 return waited;
3786 }
3787
3788 static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3789 ktime_t end, bool success)
3790 {
3791 struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
3792 u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3793
3794 ++vcpu->stat.generic.halt_attempted_poll;
3795
3796 if (success) {
3797 ++vcpu->stat.generic.halt_successful_poll;
3798
3799 if (!vcpu_valid_wakeup(vcpu))
3800 ++vcpu->stat.generic.halt_poll_invalid;
3801
3802 stats->halt_poll_success_ns += poll_ns;
3803 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3804 } else {
3805 stats->halt_poll_fail_ns += poll_ns;
3806 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3807 }
3808 }
3809
3810 static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3811 {
3812 struct kvm *kvm = vcpu->kvm;
3813
3814 if (kvm->override_halt_poll_ns) {
3815 /*
3816 * Ensure kvm->max_halt_poll_ns is not read before
3817 * kvm->override_halt_poll_ns.
3818 *
3819 * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3820 */
3821 smp_rmb();
3822 return READ_ONCE(kvm->max_halt_poll_ns);
3823 }
3824
3825 return READ_ONCE(halt_poll_ns);
3826 }
3827
3828 /*
3829 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt
3830 * polling is enabled, busy wait for a short time before blocking to avoid the
3831 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
3832 * is halted.
3833 */
3834 void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
3835 {
3836 unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3837 bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
3838 ktime_t start, cur, poll_end;
3839 bool waited = false;
3840 bool do_halt_poll;
3841 u64 halt_ns;
3842
3843 if (vcpu->halt_poll_ns > max_halt_poll_ns)
3844 vcpu->halt_poll_ns = max_halt_poll_ns;
3845
3846 do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
3847
3848 start = cur = poll_end = ktime_get();
3849 if (do_halt_poll) {
3850 ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
3851
3852 do {
3853 if (kvm_vcpu_check_block(vcpu) < 0)
3854 goto out;
3855 cpu_relax();
3856 poll_end = cur = ktime_get();
3857 } while (kvm_vcpu_can_poll(cur, stop));
3858 }
3859
3860 waited = kvm_vcpu_block(vcpu);
3861
3862 cur = ktime_get();
3863 if (waited) {
3864 vcpu->stat.generic.halt_wait_ns +=
3865 ktime_to_ns(cur) - ktime_to_ns(poll_end);
3866 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3867 ktime_to_ns(cur) - ktime_to_ns(poll_end));
3868 }
3869 out:
3870 /* The total time the vCPU was "halted", including polling time. */
3871 halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3872
3873 /*
3874 * Note, halt-polling is considered successful so long as the vCPU was
3875 * never actually scheduled out, i.e. even if the wake event arrived
3876 * after the end of the halt-polling loop itself, but before the full wait.
3877 */
3878 if (do_halt_poll)
3879 update_halt_poll_stats(vcpu, start, poll_end, !waited);
3880
3881 if (halt_poll_allowed) {
3882 /* Recompute the max halt poll time in case it changed. */
3883 max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3884
3885 if (!vcpu_valid_wakeup(vcpu)) {
3886 shrink_halt_poll_ns(vcpu);
3887 } else if (max_halt_poll_ns) {
3888 if (halt_ns <= vcpu->halt_poll_ns)
3889 ;
3890 /* we had a long block, shrink polling */
3891 else if (vcpu->halt_poll_ns &&
3892 halt_ns > max_halt_poll_ns)
3893 shrink_halt_poll_ns(vcpu);
3894 /* we had a short halt and our poll time is too small */
3895 else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
3896 halt_ns < max_halt_poll_ns)
3897 grow_halt_poll_ns(vcpu);
3898 } else {
3899 vcpu->halt_poll_ns = 0;
3900 }
3901 }
3902
3903 trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
3904 }
3905 EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
3906
3907 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3908 {
3909 if (__kvm_vcpu_wake_up(vcpu)) {
3910 WRITE_ONCE(vcpu->ready, true);
3911 ++vcpu->stat.generic.halt_wakeup;
3912 return true;
3913 }
3914
3915 return false;
3916 }
3917 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3918
3919 #ifndef CONFIG_S390
3920 /*
3921 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3922 */
3923 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3924 {
3925 int me, cpu;
3926
3927 if (kvm_vcpu_wake_up(vcpu))
3928 return;
3929
3930 me = get_cpu();
3931 /*
3932 * The only state change done outside the vcpu mutex is IN_GUEST_MODE
3933 * to EXITING_GUEST_MODE. Therefore the moderately expensive "should
3934 * kick" check does not need atomic operations if kvm_vcpu_kick is used
3935 * within the vCPU thread itself.
3936 */
3937 if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3938 if (vcpu->mode == IN_GUEST_MODE)
3939 WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3940 goto out;
3941 }
3942
3943 /*
3944 * Note, the vCPU could get migrated to a different pCPU at any point
3945 * after kvm_arch_vcpu_should_kick(), which could result in sending an
3946 * IPI to the previous pCPU. But, that's ok because the purpose of the
3947 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3948 * vCPU also requires it to leave IN_GUEST_MODE.
3949 */
3950 if (kvm_arch_vcpu_should_kick(vcpu)) {
3951 cpu = READ_ONCE(vcpu->cpu);
3952 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3953 smp_send_reschedule(cpu);
3954 }
3955 out:
3956 put_cpu();
3957 }
3958 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3959 #endif /* !CONFIG_S390 */
3960
3961 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3962 {
3963 struct pid *pid;
3964 struct task_struct *task = NULL;
3965 int ret = 0;
3966
3967 rcu_read_lock();
3968 pid = rcu_dereference(target->pid);
3969 if (pid)
3970 task = get_pid_task(pid, PIDTYPE_PID);
3971 rcu_read_unlock();
3972 if (!task)
3973 return ret;
3974 ret = yield_to(task, 1);
3975 put_task_struct(task);
3976
3977 return ret;
3978 }
3979 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3980
3981 /*
3982 * Helper that checks whether a VCPU is eligible for directed yield.
3983 * The most eligible candidate to yield to is decided by the following heuristics:
3984 *
3985 * (a) A VCPU which has not done a PLE exit or had cpu relax intercepted
3986 * recently (a preempted lock holder), indicated by @in_spin_loop.
3987 * This is set at the beginning and cleared at the end of the interception/PLE handler.
3988 *
3989 * (b) A VCPU which has done a PLE exit/cpu relax intercept but did not get a
3990 * chance last time (it has most likely become eligible now, since we probably
3991 * yielded to the lock holder in the last iteration).  This is done by toggling
3992 * @dy_eligible each time a VCPU is checked for eligibility.
3993 *
3994 * Yielding to a recently PLE-exited/cpu relax intercepted VCPU before yielding
3995 * to a preempted lock holder could result in wrong VCPU selection and CPU
3996 * burning.  Giving priority to a potential lock holder increases lock
3997 * progress.
3998 *
3999 * Since the algorithm is based on heuristics, accessing another VCPU's data
4000 * without locking does no harm.  It may result in trying to yield to the same
4001 * VCPU, failing, and continuing with the next VCPU, and so on.
4002 */
4003 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
4004 {
4005 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
4006 bool eligible;
4007
4008 eligible = !vcpu->spin_loop.in_spin_loop ||
4009 vcpu->spin_loop.dy_eligible;
4010
4011 if (vcpu->spin_loop.in_spin_loop)
4012 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
4013
4014 return eligible;
4015 #else
4016 return true;
4017 #endif
4018 }
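/*
 * Concretely: a vCPU that is itself in a spin loop (@in_spin_loop set) is
 * skipped on its first eligibility check if @dy_eligible is false, but the
 * check toggles @dy_eligible, so the same vCPU becomes a valid yield target
 * on the next pass.  vCPUs that are not in a spin loop are always eligible.
 */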
4019
4020 /*
4021 * Unlike kvm_arch_vcpu_runnable, this function is called outside
4022 * a vcpu_load/vcpu_put pair. However, for most architectures
4023 * kvm_arch_vcpu_runnable does not require vcpu_load.
4024 */
4025 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
4026 {
4027 return kvm_arch_vcpu_runnable(vcpu);
4028 }
4029
4030 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
4031 {
4032 if (kvm_arch_dy_runnable(vcpu))
4033 return true;
4034
4035 #ifdef CONFIG_KVM_ASYNC_PF
4036 if (!list_empty_careful(&vcpu->async_pf.done))
4037 return true;
4038 #endif
4039
4040 return false;
4041 }
4042
4043 /*
4044 * By default, simply query the target vCPU's current mode when checking if a
4045 * vCPU was preempted in kernel mode. All architectures except x86 (or more
4046 * specifically, except VMX) allow querying whether or not a vCPU is in kernel
4047 * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
4048 * directly for cross-vCPU checks is functionally correct and accurate.
4049 */
4050 bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
4051 {
4052 return kvm_arch_vcpu_in_kernel(vcpu);
4053 }
4054
4055 bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
4056 {
4057 return false;
4058 }
4059
4060 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
4061 {
4062 struct kvm *kvm = me->kvm;
4063 struct kvm_vcpu *vcpu;
4064 int last_boosted_vcpu;
4065 unsigned long i;
4066 int yielded = 0;
4067 int try = 3;
4068 int pass;
4069
4070 last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
4071 kvm_vcpu_set_in_spin_loop(me, true);
4072 /*
4073 * We boost the priority of a VCPU that is runnable but not
4074 * currently running, because it got preempted by something
4075 * else and called schedule in __vcpu_run. Hopefully that
4076 * VCPU is holding the lock that we need and will release it.
4077 * We approximate round-robin by starting at the last boosted VCPU.
4078 */
4079 for (pass = 0; pass < 2 && !yielded && try; pass++) {
4080 kvm_for_each_vcpu(i, vcpu, kvm) {
4081 if (!pass && i <= last_boosted_vcpu) {
4082 i = last_boosted_vcpu;
4083 continue;
4084 } else if (pass && i > last_boosted_vcpu)
4085 break;
4086 if (!READ_ONCE(vcpu->ready))
4087 continue;
4088 if (vcpu == me)
4089 continue;
4090 if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
4091 continue;
4092
4093 /*
4094 * Treat the target vCPU as being in-kernel if it has a
4095 * pending interrupt, as the vCPU trying to yield may
4096 * be spinning waiting on IPI delivery, i.e. the target
4097 * vCPU is in-kernel for the purposes of directed yield.
4098 */
4099 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
4100 !kvm_arch_dy_has_pending_interrupt(vcpu) &&
4101 !kvm_arch_vcpu_preempted_in_kernel(vcpu))
4102 continue;
4103 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
4104 continue;
4105
4106 yielded = kvm_vcpu_yield_to(vcpu);
4107 if (yielded > 0) {
4108 WRITE_ONCE(kvm->last_boosted_vcpu, i);
4109 break;
4110 } else if (yielded < 0) {
4111 try--;
4112 if (!try)
4113 break;
4114 }
4115 }
4116 }
4117 kvm_vcpu_set_in_spin_loop(me, false);
4118
4119 /* Ensure vcpu is not eligible during next spinloop */
4120 kvm_vcpu_set_dy_eligible(me, false);
4121 }
4122 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
4123
4124 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
4125 {
4126 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
4127 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
4128 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
4129 kvm->dirty_ring_size / PAGE_SIZE);
4130 #else
4131 return false;
4132 #endif
4133 }
4134
4135 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
4136 {
4137 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
4138 struct page *page;
4139
4140 if (vmf->pgoff == 0)
4141 page = virt_to_page(vcpu->run);
4142 #ifdef CONFIG_X86
4143 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
4144 page = virt_to_page(vcpu->arch.pio_data);
4145 #endif
4146 #ifdef CONFIG_KVM_MMIO
4147 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
4148 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
4149 #endif
4150 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
4151 page = kvm_dirty_ring_get_page(
4152 &vcpu->dirty_ring,
4153 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
4154 else
4155 return kvm_arch_vcpu_fault(vcpu, vmf);
4156 get_page(page);
4157 vmf->page = page;
4158 return 0;
4159 }
4160
4161 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
4162 .fault = kvm_vcpu_fault,
4163 };
4164
4165 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
4166 {
4167 struct kvm_vcpu *vcpu = file->private_data;
4168 unsigned long pages = vma_pages(vma);
4169
4170 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
4171 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
4172 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
4173 return -EINVAL;
4174
4175 vma->vm_ops = &kvm_vcpu_vm_ops;
4176 return 0;
4177 }
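/*
 * Sketch of the matching userspace side (vcpu_fd and mmap_size are
 * placeholders; the size would normally come from KVM_GET_VCPU_MMAP_SIZE):
 *
 *	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *
 * Page offset 0 maps the kvm_run structure served by kvm_vcpu_fault()
 * above; dirty-ring pages, when present, must be mapped shared and
 * non-executable or kvm_vcpu_mmap() rejects the mapping.
 */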
4178
4179 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
4180 {
4181 struct kvm_vcpu *vcpu = filp->private_data;
4182
4183 kvm_put_kvm(vcpu->kvm);
4184 return 0;
4185 }
4186
4187 static struct file_operations kvm_vcpu_fops = {
4188 .release = kvm_vcpu_release,
4189 .unlocked_ioctl = kvm_vcpu_ioctl,
4190 .mmap = kvm_vcpu_mmap,
4191 .llseek = noop_llseek,
4192 KVM_COMPAT(kvm_vcpu_compat_ioctl),
4193 };
4194
4195 /*
4196 * Allocates an inode for the vcpu.
4197 */
4198 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
4199 {
4200 char name[8 + 1 + ITOA_MAX_LEN + 1];
4201
4202 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
4203 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
4204 }
4205
4206 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
4207 static int vcpu_get_pid(void *data, u64 *val)
4208 {
4209 struct kvm_vcpu *vcpu = data;
4210
4211 rcu_read_lock();
4212 *val = pid_nr(rcu_dereference(vcpu->pid));
4213 rcu_read_unlock();
4214 return 0;
4215 }
4216
4217 DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
4218
4219 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
4220 {
4221 struct dentry *debugfs_dentry;
4222 char dir_name[ITOA_MAX_LEN * 2];
4223
4224 if (!debugfs_initialized())
4225 return;
4226
4227 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
4228 debugfs_dentry = debugfs_create_dir(dir_name,
4229 vcpu->kvm->debugfs_dentry);
4230 debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
4231 &vcpu_get_pid_fops);
4232
4233 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
4234 }
4235 #endif
4236
4237 /*
4238 * Creates some virtual cpus. Good luck creating more than one.
4239 */
4240 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
4241 {
4242 int r;
4243 struct kvm_vcpu *vcpu;
4244 struct page *page;
4245
4246 /*
4247 * KVM tracks vCPU IDs as 'int', be kind to userspace and reject
4248 * too-large values instead of silently truncating.
4249 *
4250 * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first
4251 * changing the storage type (at the very least, IDs should be tracked
4252 * as unsigned ints).
4253 */
4254 BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX);
4255 if (id >= KVM_MAX_VCPU_IDS)
4256 return -EINVAL;
4257
4258 mutex_lock(&kvm->lock);
4259 if (kvm->created_vcpus >= kvm->max_vcpus) {
4260 mutex_unlock(&kvm->lock);
4261 return -EINVAL;
4262 }
4263
4264 r = kvm_arch_vcpu_precreate(kvm, id);
4265 if (r) {
4266 mutex_unlock(&kvm->lock);
4267 return r;
4268 }
4269
4270 kvm->created_vcpus++;
4271 mutex_unlock(&kvm->lock);
4272
4273 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
4274 if (!vcpu) {
4275 r = -ENOMEM;
4276 goto vcpu_decrement;
4277 }
4278
4279 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
4280 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
4281 if (!page) {
4282 r = -ENOMEM;
4283 goto vcpu_free;
4284 }
4285 vcpu->run = page_address(page);
4286
4287 kvm_vcpu_init(vcpu, kvm, id);
4288
4289 r = kvm_arch_vcpu_create(vcpu);
4290 if (r)
4291 goto vcpu_free_run_page;
4292
4293 if (kvm->dirty_ring_size) {
4294 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
4295 id, kvm->dirty_ring_size);
4296 if (r)
4297 goto arch_vcpu_destroy;
4298 }
4299
4300 mutex_lock(&kvm->lock);
4301
4302 #ifdef CONFIG_LOCKDEP
4303 /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
4304 mutex_lock(&vcpu->mutex);
4305 mutex_unlock(&vcpu->mutex);
4306 #endif
4307
4308 if (kvm_get_vcpu_by_id(kvm, id)) {
4309 r = -EEXIST;
4310 goto unlock_vcpu_destroy;
4311 }
4312
4313 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
4314 r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
4315 if (r)
4316 goto unlock_vcpu_destroy;
4317
4318 /* Now it's all set up, let userspace reach it */
4319 kvm_get_kvm(kvm);
4320 r = create_vcpu_fd(vcpu);
4321 if (r < 0)
4322 goto kvm_put_xa_release;
4323
4324 if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
4325 r = -EINVAL;
4326 goto kvm_put_xa_release;
4327 }
4328
4329 /*
4330 * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
4331 * pointer before incrementing kvm->online_vcpus.
4332 */
4333 smp_wmb();
4334 atomic_inc(&kvm->online_vcpus);
4335
4336 mutex_unlock(&kvm->lock);
4337 kvm_arch_vcpu_postcreate(vcpu);
4338 kvm_create_vcpu_debugfs(vcpu);
4339 return r;
4340
4341 kvm_put_xa_release:
4342 kvm_put_kvm_no_destroy(kvm);
4343 xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
4344 unlock_vcpu_destroy:
4345 mutex_unlock(&kvm->lock);
4346 kvm_dirty_ring_free(&vcpu->dirty_ring);
4347 arch_vcpu_destroy:
4348 kvm_arch_vcpu_destroy(vcpu);
4349 vcpu_free_run_page:
4350 free_page((unsigned long)vcpu->run);
4351 vcpu_free:
4352 kmem_cache_free(kvm_vcpu_cache, vcpu);
4353 vcpu_decrement:
4354 mutex_lock(&kvm->lock);
4355 kvm->created_vcpus--;
4356 mutex_unlock(&kvm->lock);
4357 return r;
4358 }
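/*
 * Minimal userspace sketch for this ioctl (error handling omitted; vm_fd
 * is assumed to come from KVM_CREATE_VM):
 *
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *
 * The returned descriptor is the anonymous inode installed by
 * create_vcpu_fd() and accepts the per-vCPU ioctls handled below.
 */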
4359
4360 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
4361 {
4362 if (sigset) {
4363 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
4364 vcpu->sigset_active = 1;
4365 vcpu->sigset = *sigset;
4366 } else
4367 vcpu->sigset_active = 0;
4368 return 0;
4369 }
4370
4371 static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4372 size_t size, loff_t *offset)
4373 {
4374 struct kvm_vcpu *vcpu = file->private_data;
4375
4376 return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4377 &kvm_vcpu_stats_desc[0], &vcpu->stat,
4378 sizeof(vcpu->stat), user_buffer, size, offset);
4379 }
4380
4381 static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
4382 {
4383 struct kvm_vcpu *vcpu = file->private_data;
4384
4385 kvm_put_kvm(vcpu->kvm);
4386 return 0;
4387 }
4388
4389 static const struct file_operations kvm_vcpu_stats_fops = {
4390 .owner = THIS_MODULE,
4391 .read = kvm_vcpu_stats_read,
4392 .release = kvm_vcpu_stats_release,
4393 .llseek = noop_llseek,
4394 };
4395
4396 static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4397 {
4398 int fd;
4399 struct file *file;
4400 char name[15 + ITOA_MAX_LEN + 1];
4401
4402 snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4403
4404 fd = get_unused_fd_flags(O_CLOEXEC);
4405 if (fd < 0)
4406 return fd;
4407
4408 file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4409 if (IS_ERR(file)) {
4410 put_unused_fd(fd);
4411 return PTR_ERR(file);
4412 }
4413
4414 kvm_get_kvm(vcpu->kvm);
4415
4416 file->f_mode |= FMODE_PREAD;
4417 fd_install(fd, file);
4418
4419 return fd;
4420 }
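/*
 * A possible consumer (hypothetical, not an in-tree example): userspace
 * obtains the descriptor via ioctl(vcpu_fd, KVM_GET_STATS_FD, NULL),
 * pread()s a struct kvm_stats_header from offset 0, and then uses the
 * offsets reported in the header to pread() the descriptor block and the
 * statistics data.  The descriptor is read-only; positioned reads are
 * allowed via the FMODE_PREAD flag set above.
 */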
4421
4422 #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
4423 static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
4424 struct kvm_pre_fault_memory *range)
4425 {
4426 int idx;
4427 long r;
4428 u64 full_size;
4429
4430 if (range->flags)
4431 return -EINVAL;
4432
4433 if (!PAGE_ALIGNED(range->gpa) ||
4434 !PAGE_ALIGNED(range->size) ||
4435 range->gpa + range->size <= range->gpa)
4436 return -EINVAL;
4437
4438 vcpu_load(vcpu);
4439 idx = srcu_read_lock(&vcpu->kvm->srcu);
4440
4441 full_size = range->size;
4442 do {
4443 if (signal_pending(current)) {
4444 r = -EINTR;
4445 break;
4446 }
4447
4448 r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
4449 if (WARN_ON_ONCE(r == 0 || r == -EIO))
4450 break;
4451
4452 if (r < 0)
4453 break;
4454
4455 range->size -= r;
4456 range->gpa += r;
4457 cond_resched();
4458 } while (range->size);
4459
4460 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4461 vcpu_put(vcpu);
4462
4463 /* Return success if at least one page was mapped successfully. */
4464 return full_size == range->size ? r : 0;
4465 }
4466 #endif
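/*
 * Hedged userspace sketch for the handler above (gpa and size are
 * placeholders and must both be page aligned; flags must be zero):
 *
 *	struct kvm_pre_fault_memory range = {
 *		.gpa = gpa,
 *		.size = size,
 *	};
 *
 *	ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range);
 *
 * On partial progress the kernel advances range.gpa and shrinks
 * range.size, and kvm_vcpu_ioctl() copies the struct back so the caller
 * can retry with the leftover range.
 */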
4467
4468 static long kvm_vcpu_ioctl(struct file *filp,
4469 unsigned int ioctl, unsigned long arg)
4470 {
4471 struct kvm_vcpu *vcpu = filp->private_data;
4472 void __user *argp = (void __user *)arg;
4473 int r;
4474 struct kvm_fpu *fpu = NULL;
4475 struct kvm_sregs *kvm_sregs = NULL;
4476
4477 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4478 return -EIO;
4479
4480 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
4481 return -EINVAL;
4482
4483 /*
4484 * Some architectures have vcpu ioctls that are asynchronous to vcpu
4485 * execution; mutex_lock() would break them.
4486 */
4487 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
4488 if (r != -ENOIOCTLCMD)
4489 return r;
4490
4491 if (mutex_lock_killable(&vcpu->mutex))
4492 return -EINTR;
4493 switch (ioctl) {
4494 case KVM_RUN: {
4495 struct pid *oldpid;
4496 r = -EINVAL;
4497 if (arg)
4498 goto out;
4499 oldpid = rcu_access_pointer(vcpu->pid);
4500 if (unlikely(oldpid != task_pid(current))) {
4501 /* The thread running this VCPU changed. */
4502 struct pid *newpid;
4503
4504 r = kvm_arch_vcpu_run_pid_change(vcpu);
4505 if (r)
4506 break;
4507
4508 newpid = get_task_pid(current, PIDTYPE_PID);
4509 rcu_assign_pointer(vcpu->pid, newpid);
4510 if (oldpid)
4511 synchronize_rcu();
4512 put_pid(oldpid);
4513 }
4514 vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
4515 r = kvm_arch_vcpu_ioctl_run(vcpu);
4516 vcpu->wants_to_run = false;
4517
4518 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
4519 break;
4520 }
4521 case KVM_GET_REGS: {
4522 struct kvm_regs *kvm_regs;
4523
4524 r = -ENOMEM;
4525 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
4526 if (!kvm_regs)
4527 goto out;
4528 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
4529 if (r)
4530 goto out_free1;
4531 r = -EFAULT;
4532 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
4533 goto out_free1;
4534 r = 0;
4535 out_free1:
4536 kfree(kvm_regs);
4537 break;
4538 }
4539 case KVM_SET_REGS: {
4540 struct kvm_regs *kvm_regs;
4541
4542 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4543 if (IS_ERR(kvm_regs)) {
4544 r = PTR_ERR(kvm_regs);
4545 goto out;
4546 }
4547 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
4548 kfree(kvm_regs);
4549 break;
4550 }
4551 case KVM_GET_SREGS: {
4552 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
4553 r = -ENOMEM;
4554 if (!kvm_sregs)
4555 goto out;
4556 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
4557 if (r)
4558 goto out;
4559 r = -EFAULT;
4560 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
4561 goto out;
4562 r = 0;
4563 break;
4564 }
4565 case KVM_SET_SREGS: {
4566 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4567 if (IS_ERR(kvm_sregs)) {
4568 r = PTR_ERR(kvm_sregs);
4569 kvm_sregs = NULL;
4570 goto out;
4571 }
4572 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
4573 break;
4574 }
4575 case KVM_GET_MP_STATE: {
4576 struct kvm_mp_state mp_state;
4577
4578 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
4579 if (r)
4580 goto out;
4581 r = -EFAULT;
4582 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
4583 goto out;
4584 r = 0;
4585 break;
4586 }
4587 case KVM_SET_MP_STATE: {
4588 struct kvm_mp_state mp_state;
4589
4590 r = -EFAULT;
4591 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
4592 goto out;
4593 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
4594 break;
4595 }
4596 case KVM_TRANSLATE: {
4597 struct kvm_translation tr;
4598
4599 r = -EFAULT;
4600 if (copy_from_user(&tr, argp, sizeof(tr)))
4601 goto out;
4602 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
4603 if (r)
4604 goto out;
4605 r = -EFAULT;
4606 if (copy_to_user(argp, &tr, sizeof(tr)))
4607 goto out;
4608 r = 0;
4609 break;
4610 }
4611 case KVM_SET_GUEST_DEBUG: {
4612 struct kvm_guest_debug dbg;
4613
4614 r = -EFAULT;
4615 if (copy_from_user(&dbg, argp, sizeof(dbg)))
4616 goto out;
4617 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
4618 break;
4619 }
4620 case KVM_SET_SIGNAL_MASK: {
4621 struct kvm_signal_mask __user *sigmask_arg = argp;
4622 struct kvm_signal_mask kvm_sigmask;
4623 sigset_t sigset, *p;
4624
4625 p = NULL;
4626 if (argp) {
4627 r = -EFAULT;
4628 if (copy_from_user(&kvm_sigmask, argp,
4629 sizeof(kvm_sigmask)))
4630 goto out;
4631 r = -EINVAL;
4632 if (kvm_sigmask.len != sizeof(sigset))
4633 goto out;
4634 r = -EFAULT;
4635 if (copy_from_user(&sigset, sigmask_arg->sigset,
4636 sizeof(sigset)))
4637 goto out;
4638 p = &sigset;
4639 }
4640 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
4641 break;
4642 }
4643 case KVM_GET_FPU: {
4644 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
4645 r = -ENOMEM;
4646 if (!fpu)
4647 goto out;
4648 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
4649 if (r)
4650 goto out;
4651 r = -EFAULT;
4652 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
4653 goto out;
4654 r = 0;
4655 break;
4656 }
4657 case KVM_SET_FPU: {
4658 fpu = memdup_user(argp, sizeof(*fpu));
4659 if (IS_ERR(fpu)) {
4660 r = PTR_ERR(fpu);
4661 fpu = NULL;
4662 goto out;
4663 }
4664 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
4665 break;
4666 }
4667 case KVM_GET_STATS_FD: {
4668 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4669 break;
4670 }
4671 #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
4672 case KVM_PRE_FAULT_MEMORY: {
4673 struct kvm_pre_fault_memory range;
4674
4675 r = -EFAULT;
4676 if (copy_from_user(&range, argp, sizeof(range)))
4677 break;
4678 r = kvm_vcpu_pre_fault_memory(vcpu, &range);
4679 /* Pass back leftover range. */
4680 if (copy_to_user(argp, &range, sizeof(range)))
4681 r = -EFAULT;
4682 break;
4683 }
4684 #endif
4685 default:
4686 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
4687 }
4688 out:
4689 mutex_unlock(&vcpu->mutex);
4690 kfree(fpu);
4691 kfree(kvm_sregs);
4692 return r;
4693 }
4694
4695 #ifdef CONFIG_KVM_COMPAT
4696 static long kvm_vcpu_compat_ioctl(struct file *filp,
4697 unsigned int ioctl, unsigned long arg)
4698 {
4699 struct kvm_vcpu *vcpu = filp->private_data;
4700 void __user *argp = compat_ptr(arg);
4701 int r;
4702
4703 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4704 return -EIO;
4705
4706 switch (ioctl) {
4707 case KVM_SET_SIGNAL_MASK: {
4708 struct kvm_signal_mask __user *sigmask_arg = argp;
4709 struct kvm_signal_mask kvm_sigmask;
4710 sigset_t sigset;
4711
4712 if (argp) {
4713 r = -EFAULT;
4714 if (copy_from_user(&kvm_sigmask, argp,
4715 sizeof(kvm_sigmask)))
4716 goto out;
4717 r = -EINVAL;
4718 if (kvm_sigmask.len != sizeof(compat_sigset_t))
4719 goto out;
4720 r = -EFAULT;
4721 if (get_compat_sigset(&sigset,
4722 (compat_sigset_t __user *)sigmask_arg->sigset))
4723 goto out;
4724 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4725 } else
4726 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4727 break;
4728 }
4729 default:
4730 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4731 }
4732
4733 out:
4734 return r;
4735 }
4736 #endif
4737
4738 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4739 {
4740 struct kvm_device *dev = filp->private_data;
4741
4742 if (dev->ops->mmap)
4743 return dev->ops->mmap(dev, vma);
4744
4745 return -ENODEV;
4746 }
4747
4748 static int kvm_device_ioctl_attr(struct kvm_device *dev,
4749 int (*accessor)(struct kvm_device *dev,
4750 struct kvm_device_attr *attr),
4751 unsigned long arg)
4752 {
4753 struct kvm_device_attr attr;
4754
4755 if (!accessor)
4756 return -EPERM;
4757
4758 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4759 return -EFAULT;
4760
4761 return accessor(dev, &attr);
4762 }
4763
4764 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4765 unsigned long arg)
4766 {
4767 struct kvm_device *dev = filp->private_data;
4768
4769 if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4770 return -EIO;
4771
4772 switch (ioctl) {
4773 case KVM_SET_DEVICE_ATTR:
4774 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4775 case KVM_GET_DEVICE_ATTR:
4776 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4777 case KVM_HAS_DEVICE_ATTR:
4778 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4779 default:
4780 if (dev->ops->ioctl)
4781 return dev->ops->ioctl(dev, ioctl, arg);
4782
4783 return -ENOTTY;
4784 }
4785 }
4786
4787 static int kvm_device_release(struct inode *inode, struct file *filp)
4788 {
4789 struct kvm_device *dev = filp->private_data;
4790 struct kvm *kvm = dev->kvm;
4791
4792 if (dev->ops->release) {
4793 mutex_lock(&kvm->lock);
4794 list_del_rcu(&dev->vm_node);
4795 synchronize_rcu();
4796 dev->ops->release(dev);
4797 mutex_unlock(&kvm->lock);
4798 }
4799
4800 kvm_put_kvm(kvm);
4801 return 0;
4802 }
4803
4804 static struct file_operations kvm_device_fops = {
4805 .unlocked_ioctl = kvm_device_ioctl,
4806 .release = kvm_device_release,
4807 KVM_COMPAT(kvm_device_ioctl),
4808 .mmap = kvm_device_mmap,
4809 };
4810
4811 struct kvm_device *kvm_device_from_filp(struct file *filp)
4812 {
4813 if (filp->f_op != &kvm_device_fops)
4814 return NULL;
4815
4816 return filp->private_data;
4817 }
4818
4819 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4820 #ifdef CONFIG_KVM_MPIC
4821 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4822 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
4823 #endif
4824 };
4825
4826 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4827 {
4828 if (type >= ARRAY_SIZE(kvm_device_ops_table))
4829 return -ENOSPC;
4830
4831 if (kvm_device_ops_table[type] != NULL)
4832 return -EEXIST;
4833
4834 kvm_device_ops_table[type] = ops;
4835 return 0;
4836 }
4837
4838 void kvm_unregister_device_ops(u32 type)
4839 {
4840 if (kvm_device_ops_table[type] != NULL)
4841 kvm_device_ops_table[type] = NULL;
4842 }
4843
4844 static int kvm_ioctl_create_device(struct kvm *kvm,
4845 struct kvm_create_device *cd)
4846 {
4847 const struct kvm_device_ops *ops;
4848 struct kvm_device *dev;
4849 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4850 int type;
4851 int ret;
4852
4853 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4854 return -ENODEV;
4855
4856 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4857 ops = kvm_device_ops_table[type];
4858 if (ops == NULL)
4859 return -ENODEV;
4860
4861 if (test)
4862 return 0;
4863
4864 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4865 if (!dev)
4866 return -ENOMEM;
4867
4868 dev->ops = ops;
4869 dev->kvm = kvm;
4870
4871 mutex_lock(&kvm->lock);
4872 ret = ops->create(dev, type);
4873 if (ret < 0) {
4874 mutex_unlock(&kvm->lock);
4875 kfree(dev);
4876 return ret;
4877 }
4878 list_add_rcu(&dev->vm_node, &kvm->devices);
4879 mutex_unlock(&kvm->lock);
4880
4881 if (ops->init)
4882 ops->init(dev);
4883
4884 kvm_get_kvm(kvm);
4885 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4886 if (ret < 0) {
4887 kvm_put_kvm_no_destroy(kvm);
4888 mutex_lock(&kvm->lock);
4889 list_del_rcu(&dev->vm_node);
4890 synchronize_rcu();
4891 if (ops->release)
4892 ops->release(dev);
4893 mutex_unlock(&kvm->lock);
4894 if (ops->destroy)
4895 ops->destroy(dev);
4896 return ret;
4897 }
4898
4899 cd->fd = ret;
4900 return 0;
4901 }
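/*
 * Userspace view (illustrative; the device type must have been registered
 * via kvm_register_device_ops() or be built into the table above):
 *
 *	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };
 *
 *	ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);
 *
 * On success cd.fd refers to the new device and accepts the
 * KVM_SET/GET/HAS_DEVICE_ATTR ioctls handled by kvm_device_ioctl().
 * Setting KVM_CREATE_DEVICE_TEST in cd.flags only probes for support.
 */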
4902
4903 static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4904 {
4905 switch (arg) {
4906 case KVM_CAP_USER_MEMORY:
4907 case KVM_CAP_USER_MEMORY2:
4908 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4909 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4910 case KVM_CAP_INTERNAL_ERROR_DATA:
4911 #ifdef CONFIG_HAVE_KVM_MSI
4912 case KVM_CAP_SIGNAL_MSI:
4913 #endif
4914 #ifdef CONFIG_HAVE_KVM_IRQCHIP
4915 case KVM_CAP_IRQFD:
4916 #endif
4917 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4918 case KVM_CAP_CHECK_EXTENSION_VM:
4919 case KVM_CAP_ENABLE_CAP_VM:
4920 case KVM_CAP_HALT_POLL:
4921 return 1;
4922 #ifdef CONFIG_KVM_MMIO
4923 case KVM_CAP_COALESCED_MMIO:
4924 return KVM_COALESCED_MMIO_PAGE_OFFSET;
4925 case KVM_CAP_COALESCED_PIO:
4926 return 1;
4927 #endif
4928 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4929 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4930 return KVM_DIRTY_LOG_MANUAL_CAPS;
4931 #endif
4932 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4933 case KVM_CAP_IRQ_ROUTING:
4934 return KVM_MAX_IRQ_ROUTES;
4935 #endif
4936 #if KVM_MAX_NR_ADDRESS_SPACES > 1
4937 case KVM_CAP_MULTI_ADDRESS_SPACE:
4938 if (kvm)
4939 return kvm_arch_nr_memslot_as_ids(kvm);
4940 return KVM_MAX_NR_ADDRESS_SPACES;
4941 #endif
4942 case KVM_CAP_NR_MEMSLOTS:
4943 return KVM_USER_MEM_SLOTS;
4944 case KVM_CAP_DIRTY_LOG_RING:
4945 #ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
4946 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4947 #else
4948 return 0;
4949 #endif
4950 case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4951 #ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
4952 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4953 #else
4954 return 0;
4955 #endif
4956 #ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
4957 case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
4958 #endif
4959 case KVM_CAP_BINARY_STATS_FD:
4960 case KVM_CAP_SYSTEM_EVENT_DATA:
4961 case KVM_CAP_DEVICE_CTRL:
4962 return 1;
4963 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
4964 case KVM_CAP_MEMORY_ATTRIBUTES:
4965 return kvm_supported_mem_attributes(kvm);
4966 #endif
4967 #ifdef CONFIG_KVM_PRIVATE_MEM
4968 case KVM_CAP_GUEST_MEMFD:
4969 return !kvm || kvm_arch_has_private_mem(kvm);
4970 #endif
4971 default:
4972 break;
4973 }
4974 return kvm_vm_ioctl_check_extension(kvm, arg);
4975 }
4976
4977 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4978 {
4979 int r;
4980
4981 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4982 return -EINVAL;
4983
4984 /* the size should be a power of 2 */
4985 if (!size || (size & (size - 1)))
4986 return -EINVAL;
4987
4988 /* The size must be able to hold the reserved entries and be at least one page */
4989 if (size < kvm_dirty_ring_get_rsvd_entries() *
4990 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4991 return -EINVAL;
4992
4993 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4994 sizeof(struct kvm_dirty_gfn))
4995 return -E2BIG;
4996
4997 /* We only allow the ring size to be set once */
4998 if (kvm->dirty_ring_size)
4999 return -EINVAL;
5000
5001 mutex_lock(&kvm->lock);
5002
5003 if (kvm->created_vcpus) {
5004 /* We don't allow changing this value after vCPUs are created */
5005 r = -EINVAL;
5006 } else {
5007 kvm->dirty_ring_size = size;
5008 r = 0;
5009 }
5010
5011 mutex_unlock(&kvm->lock);
5012 return r;
5013 }
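/*
 * The ring size is supplied through KVM_ENABLE_CAP on the VM fd, e.g.
 * (illustrative, and only valid before any vCPU has been created):
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_DIRTY_LOG_RING_ACQ_REL,
 *		.args[0] = 65536,
 *	};
 *
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 *
 * args[0] is the per-vCPU ring size in bytes: a power of two, at least one
 * page, large enough for the reserved entries, and no larger than
 * KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn).
 */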
5014
5015 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
5016 {
5017 unsigned long i;
5018 struct kvm_vcpu *vcpu;
5019 int cleared = 0;
5020
5021 if (!kvm->dirty_ring_size)
5022 return -EINVAL;
5023
5024 mutex_lock(&kvm->slots_lock);
5025
5026 kvm_for_each_vcpu(i, vcpu, kvm)
5027 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
5028
5029 mutex_unlock(&kvm->slots_lock);
5030
5031 if (cleared)
5032 kvm_flush_remote_tlbs(kvm);
5033
5034 return cleared;
5035 }
5036
5037 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
5038 struct kvm_enable_cap *cap)
5039 {
5040 return -EINVAL;
5041 }
5042
5043 bool kvm_are_all_memslots_empty(struct kvm *kvm)
5044 {
5045 int i;
5046
5047 lockdep_assert_held(&kvm->slots_lock);
5048
5049 for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
5050 if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
5051 return false;
5052 }
5053
5054 return true;
5055 }
5056 EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
5057
5058 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
5059 struct kvm_enable_cap *cap)
5060 {
5061 switch (cap->cap) {
5062 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5063 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
5064 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
5065
5066 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
5067 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
5068
5069 if (cap->flags || (cap->args[0] & ~allowed_options))
5070 return -EINVAL;
5071 kvm->manual_dirty_log_protect = cap->args[0];
5072 return 0;
5073 }
5074 #endif
5075 case KVM_CAP_HALT_POLL: {
5076 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
5077 return -EINVAL;
5078
5079 kvm->max_halt_poll_ns = cap->args[0];
5080
5081 /*
5082 * Ensure kvm->override_halt_poll_ns does not become visible
5083 * before kvm->max_halt_poll_ns.
5084 *
5085 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
5086 */
5087 smp_wmb();
5088 kvm->override_halt_poll_ns = true;
5089
5090 return 0;
5091 }
5092 case KVM_CAP_DIRTY_LOG_RING:
5093 case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
5094 if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
5095 return -EINVAL;
5096
5097 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
5098 case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
5099 int r = -EINVAL;
5100
5101 if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
5102 !kvm->dirty_ring_size || cap->flags)
5103 return r;
5104
5105 mutex_lock(&kvm->slots_lock);
5106
5107 /*
5108 * For simplicity, allow enabling ring+bitmap if and only if
5109 * there are no memslots, e.g. to ensure all memslots allocate
5110 * a bitmap after the capability is enabled.
5111 */
5112 if (kvm_are_all_memslots_empty(kvm)) {
5113 kvm->dirty_ring_with_bitmap = true;
5114 r = 0;
5115 }
5116
5117 mutex_unlock(&kvm->slots_lock);
5118
5119 return r;
5120 }
5121 default:
5122 return kvm_vm_ioctl_enable_cap(kvm, cap);
5123 }
5124 }
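/*
 * Similarly, KVM_CAP_HALT_POLL is enabled through KVM_ENABLE_CAP on the
 * VM fd: args[0] becomes the per-VM maximum halt-polling time in
 * nanoseconds (it must fit in an unsigned int), and a value of 0
 * effectively disables halt polling for the VM's vCPUs.
 */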
5125
5126 static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
5127 size_t size, loff_t *offset)
5128 {
5129 struct kvm *kvm = file->private_data;
5130
5131 return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
5132 &kvm_vm_stats_desc[0], &kvm->stat,
5133 sizeof(kvm->stat), user_buffer, size, offset);
5134 }
5135
5136 static int kvm_vm_stats_release(struct inode *inode, struct file *file)
5137 {
5138 struct kvm *kvm = file->private_data;
5139
5140 kvm_put_kvm(kvm);
5141 return 0;
5142 }
5143
5144 static const struct file_operations kvm_vm_stats_fops = {
5145 .owner = THIS_MODULE,
5146 .read = kvm_vm_stats_read,
5147 .release = kvm_vm_stats_release,
5148 .llseek = noop_llseek,
5149 };
5150
5151 static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
5152 {
5153 int fd;
5154 struct file *file;
5155
5156 fd = get_unused_fd_flags(O_CLOEXEC);
5157 if (fd < 0)
5158 return fd;
5159
5160 file = anon_inode_getfile("kvm-vm-stats",
5161 &kvm_vm_stats_fops, kvm, O_RDONLY);
5162 if (IS_ERR(file)) {
5163 put_unused_fd(fd);
5164 return PTR_ERR(file);
5165 }
5166
5167 kvm_get_kvm(kvm);
5168
5169 file->f_mode |= FMODE_PREAD;
5170 fd_install(fd, file);
5171
5172 return fd;
5173 }
5174
5175 #define SANITY_CHECK_MEM_REGION_FIELD(field) \
5176 do { \
5177 BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \
5178 offsetof(struct kvm_userspace_memory_region2, field)); \
5179 BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \
5180 sizeof_field(struct kvm_userspace_memory_region2, field)); \
5181 } while (0)
5182
5183 static long kvm_vm_ioctl(struct file *filp,
5184 unsigned int ioctl, unsigned long arg)
5185 {
5186 struct kvm *kvm = filp->private_data;
5187 void __user *argp = (void __user *)arg;
5188 int r;
5189
5190 if (kvm->mm != current->mm || kvm->vm_dead)
5191 return -EIO;
5192 switch (ioctl) {
5193 case KVM_CREATE_VCPU:
5194 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
5195 break;
5196 case KVM_ENABLE_CAP: {
5197 struct kvm_enable_cap cap;
5198
5199 r = -EFAULT;
5200 if (copy_from_user(&cap, argp, sizeof(cap)))
5201 goto out;
5202 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
5203 break;
5204 }
5205 case KVM_SET_USER_MEMORY_REGION2:
5206 case KVM_SET_USER_MEMORY_REGION: {
5207 struct kvm_userspace_memory_region2 mem;
5208 unsigned long size;
5209
5210 if (ioctl == KVM_SET_USER_MEMORY_REGION) {
5211 /*
5212 * Fields beyond struct kvm_userspace_memory_region shouldn't be
5213 * accessed, but avoid leaking kernel memory in case of a bug.
5214 */
5215 memset(&mem, 0, sizeof(mem));
5216 size = sizeof(struct kvm_userspace_memory_region);
5217 } else {
5218 size = sizeof(struct kvm_userspace_memory_region2);
5219 }
5220
5221 /* Ensure the common parts of the two structs are identical. */
5222 SANITY_CHECK_MEM_REGION_FIELD(slot);
5223 SANITY_CHECK_MEM_REGION_FIELD(flags);
5224 SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
5225 SANITY_CHECK_MEM_REGION_FIELD(memory_size);
5226 SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
5227
5228 r = -EFAULT;
5229 if (copy_from_user(&mem, argp, size))
5230 goto out;
5231
5232 r = -EINVAL;
5233 if (ioctl == KVM_SET_USER_MEMORY_REGION &&
5234 (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
5235 goto out;
5236
5237 r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
5238 break;
5239 }
5240 case KVM_GET_DIRTY_LOG: {
5241 struct kvm_dirty_log log;
5242
5243 r = -EFAULT;
5244 if (copy_from_user(&log, argp, sizeof(log)))
5245 goto out;
5246 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5247 break;
5248 }
5249 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5250 case KVM_CLEAR_DIRTY_LOG: {
5251 struct kvm_clear_dirty_log log;
5252
5253 r = -EFAULT;
5254 if (copy_from_user(&log, argp, sizeof(log)))
5255 goto out;
5256 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5257 break;
5258 }
5259 #endif
5260 #ifdef CONFIG_KVM_MMIO
5261 case KVM_REGISTER_COALESCED_MMIO: {
5262 struct kvm_coalesced_mmio_zone zone;
5263
5264 r = -EFAULT;
5265 if (copy_from_user(&zone, argp, sizeof(zone)))
5266 goto out;
5267 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
5268 break;
5269 }
5270 case KVM_UNREGISTER_COALESCED_MMIO: {
5271 struct kvm_coalesced_mmio_zone zone;
5272
5273 r = -EFAULT;
5274 if (copy_from_user(&zone, argp, sizeof(zone)))
5275 goto out;
5276 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
5277 break;
5278 }
5279 #endif
5280 case KVM_IRQFD: {
5281 struct kvm_irqfd data;
5282
5283 r = -EFAULT;
5284 if (copy_from_user(&data, argp, sizeof(data)))
5285 goto out;
5286 r = kvm_irqfd(kvm, &data);
5287 break;
5288 }
5289 case KVM_IOEVENTFD: {
5290 struct kvm_ioeventfd data;
5291
5292 r = -EFAULT;
5293 if (copy_from_user(&data, argp, sizeof(data)))
5294 goto out;
5295 r = kvm_ioeventfd(kvm, &data);
5296 break;
5297 }
5298 #ifdef CONFIG_HAVE_KVM_MSI
5299 case KVM_SIGNAL_MSI: {
5300 struct kvm_msi msi;
5301
5302 r = -EFAULT;
5303 if (copy_from_user(&msi, argp, sizeof(msi)))
5304 goto out;
5305 r = kvm_send_userspace_msi(kvm, &msi);
5306 break;
5307 }
5308 #endif
5309 #ifdef __KVM_HAVE_IRQ_LINE
5310 case KVM_IRQ_LINE_STATUS:
5311 case KVM_IRQ_LINE: {
5312 struct kvm_irq_level irq_event;
5313
5314 r = -EFAULT;
5315 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
5316 goto out;
5317
5318 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
5319 ioctl == KVM_IRQ_LINE_STATUS);
5320 if (r)
5321 goto out;
5322
5323 r = -EFAULT;
5324 if (ioctl == KVM_IRQ_LINE_STATUS) {
5325 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
5326 goto out;
5327 }
5328
5329 r = 0;
5330 break;
5331 }
5332 #endif
5333 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
5334 case KVM_SET_GSI_ROUTING: {
5335 struct kvm_irq_routing routing;
5336 struct kvm_irq_routing __user *urouting;
5337 struct kvm_irq_routing_entry *entries = NULL;
5338
5339 r = -EFAULT;
5340 if (copy_from_user(&routing, argp, sizeof(routing)))
5341 goto out;
5342 r = -EINVAL;
5343 if (!kvm_arch_can_set_irq_routing(kvm))
5344 goto out;
5345 if (routing.nr > KVM_MAX_IRQ_ROUTES)
5346 goto out;
5347 if (routing.flags)
5348 goto out;
5349 if (routing.nr) {
5350 urouting = argp;
5351 entries = vmemdup_array_user(urouting->entries,
5352 routing.nr, sizeof(*entries));
5353 if (IS_ERR(entries)) {
5354 r = PTR_ERR(entries);
5355 goto out;
5356 }
5357 }
5358 r = kvm_set_irq_routing(kvm, entries, routing.nr,
5359 routing.flags);
5360 kvfree(entries);
5361 break;
5362 }
5363 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
5364 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
5365 case KVM_SET_MEMORY_ATTRIBUTES: {
5366 struct kvm_memory_attributes attrs;
5367
5368 r = -EFAULT;
5369 if (copy_from_user(&attrs, argp, sizeof(attrs)))
5370 goto out;
5371
5372 r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
5373 break;
5374 }
5375 #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
5376 case KVM_CREATE_DEVICE: {
5377 struct kvm_create_device cd;
5378
5379 r = -EFAULT;
5380 if (copy_from_user(&cd, argp, sizeof(cd)))
5381 goto out;
5382
5383 r = kvm_ioctl_create_device(kvm, &cd);
5384 if (r)
5385 goto out;
5386
5387 r = -EFAULT;
5388 if (copy_to_user(argp, &cd, sizeof(cd)))
5389 goto out;
5390
5391 r = 0;
5392 break;
5393 }
5394 case KVM_CHECK_EXTENSION:
5395 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
5396 break;
5397 case KVM_RESET_DIRTY_RINGS:
5398 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
5399 break;
5400 case KVM_GET_STATS_FD:
5401 r = kvm_vm_ioctl_get_stats_fd(kvm);
5402 break;
5403 #ifdef CONFIG_KVM_PRIVATE_MEM
5404 case KVM_CREATE_GUEST_MEMFD: {
5405 struct kvm_create_guest_memfd guest_memfd;
5406
5407 r = -EFAULT;
5408 if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
5409 goto out;
5410
5411 r = kvm_gmem_create(kvm, &guest_memfd);
5412 break;
5413 }
5414 #endif
5415 default:
5416 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
5417 }
5418 out:
5419 return r;
5420 }
5421
5422 #ifdef CONFIG_KVM_COMPAT
5423 struct compat_kvm_dirty_log {
5424 __u32 slot;
5425 __u32 padding1;
5426 union {
5427 compat_uptr_t dirty_bitmap; /* one bit per page */
5428 __u64 padding2;
5429 };
5430 };
5431
5432 struct compat_kvm_clear_dirty_log {
5433 __u32 slot;
5434 __u32 num_pages;
5435 __u64 first_page;
5436 union {
5437 compat_uptr_t dirty_bitmap; /* one bit per page */
5438 __u64 padding2;
5439 };
5440 };
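/*
 * On a 64-bit kernel the compat variants above match the native structs in
 * overall size; the difference is that 32-bit userspace stores dirty_bitmap
 * as a 32-bit compat_uptr_t, so the handlers below widen it with
 * compat_ptr() before handing the request to the common dirty-log code.
 */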
5441
5442 long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5443 unsigned long arg)
5444 {
5445 return -ENOTTY;
5446 }
5447
5448 static long kvm_vm_compat_ioctl(struct file *filp,
5449 unsigned int ioctl, unsigned long arg)
5450 {
5451 struct kvm *kvm = filp->private_data;
5452 int r;
5453
5454 if (kvm->mm != current->mm || kvm->vm_dead)
5455 return -EIO;
5456
5457 r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5458 if (r != -ENOTTY)
5459 return r;
5460
5461 switch (ioctl) {
5462 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5463 case KVM_CLEAR_DIRTY_LOG: {
5464 struct compat_kvm_clear_dirty_log compat_log;
5465 struct kvm_clear_dirty_log log;
5466
5467 if (copy_from_user(&compat_log, (void __user *)arg,
5468 sizeof(compat_log)))
5469 return -EFAULT;
5470 log.slot = compat_log.slot;
5471 log.num_pages = compat_log.num_pages;
5472 log.first_page = compat_log.first_page;
5473 log.padding2 = compat_log.padding2;
5474 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5475
5476 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5477 break;
5478 }
5479 #endif
5480 case KVM_GET_DIRTY_LOG: {
5481 struct compat_kvm_dirty_log compat_log;
5482 struct kvm_dirty_log log;
5483
5484 if (copy_from_user(&compat_log, (void __user *)arg,
5485 sizeof(compat_log)))
5486 return -EFAULT;
5487 log.slot = compat_log.slot;
5488 log.padding1 = compat_log.padding1;
5489 log.padding2 = compat_log.padding2;
5490 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5491
5492 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5493 break;
5494 }
5495 default:
5496 r = kvm_vm_ioctl(filp, ioctl, arg);
5497 }
5498 return r;
5499 }
5500 #endif
5501
5502 static struct file_operations kvm_vm_fops = {
5503 .release = kvm_vm_release,
5504 .unlocked_ioctl = kvm_vm_ioctl,
5505 .llseek = noop_llseek,
5506 KVM_COMPAT(kvm_vm_compat_ioctl),
5507 };
5508
5509 bool file_is_kvm(struct file *file)
5510 {
5511 return file && file->f_op == &kvm_vm_fops;
5512 }
5513 EXPORT_SYMBOL_GPL(file_is_kvm);
5514
5515 static int kvm_dev_ioctl_create_vm(unsigned long type)
5516 {
5517 char fdname[ITOA_MAX_LEN + 1];
5518 int r, fd;
5519 struct kvm *kvm;
5520 struct file *file;
5521
5522 fd = get_unused_fd_flags(O_CLOEXEC);
5523 if (fd < 0)
5524 return fd;
5525
5526 snprintf(fdname, sizeof(fdname), "%d", fd);
5527
5528 kvm = kvm_create_vm(type, fdname);
5529 if (IS_ERR(kvm)) {
5530 r = PTR_ERR(kvm);
5531 goto put_fd;
5532 }
5533
5534 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5535 if (IS_ERR(file)) {
5536 r = PTR_ERR(file);
5537 goto put_kvm;
5538 }
5539
5540 /*
5541 * Don't call kvm_put_kvm() from this point on; file->f_op is
5542 * already set, with ->release() being kvm_vm_release(). In error
5543 * cases kvm_vm_release() will be invoked by the final fput(file)
5544 * and will take care of doing kvm_put_kvm(kvm).
5545 */
5546 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
5547
5548 fd_install(fd, file);
5549 return fd;
5550
5551 put_kvm:
5552 kvm_put_kvm(kvm);
5553 put_fd:
5554 put_unused_fd(fd);
5555 return r;
5556 }
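/*
 * Illustrative sketch, not part of the build: from userspace, the fd
 * returned above is obtained by issuing KVM_CREATE_VM on /dev/kvm, with
 * the machine type usually 0 for the default VM type:
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 */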
5557
5558 static long kvm_dev_ioctl(struct file *filp,
5559 unsigned int ioctl, unsigned long arg)
5560 {
5561 int r = -EINVAL;
5562
5563 switch (ioctl) {
5564 case KVM_GET_API_VERSION:
5565 if (arg)
5566 goto out;
5567 r = KVM_API_VERSION;
5568 break;
5569 case KVM_CREATE_VM:
5570 r = kvm_dev_ioctl_create_vm(arg);
5571 break;
5572 case KVM_CHECK_EXTENSION:
5573 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5574 break;
5575 case KVM_GET_VCPU_MMAP_SIZE:
5576 if (arg)
5577 goto out;
5578 r = PAGE_SIZE; /* struct kvm_run */
5579 #ifdef CONFIG_X86
5580 r += PAGE_SIZE; /* pio data page */
5581 #endif
5582 #ifdef CONFIG_KVM_MMIO
5583 r += PAGE_SIZE; /* coalesced mmio ring page */
5584 #endif
5585 break;
5586 default:
5587 return kvm_arch_dev_ioctl(filp, ioctl, arg);
5588 }
5589 out:
5590 return r;
5591 }
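/*
 * Illustrative sketch, not part of the build: userspace uses the value
 * returned for KVM_GET_VCPU_MMAP_SIZE to map the shared kvm_run area of a
 * vCPU fd; kvm_fd and vcpu_fd are assumptions local to this example:
 *
 *	long sz = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
 *	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 */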
5592
5593 static struct file_operations kvm_chardev_ops = {
5594 .unlocked_ioctl = kvm_dev_ioctl,
5595 .llseek = noop_llseek,
5596 KVM_COMPAT(kvm_dev_ioctl),
5597 };
5598
5599 static struct miscdevice kvm_dev = {
5600 KVM_MINOR,
5601 "kvm",
5602 &kvm_chardev_ops,
5603 };
5604
5605 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
5606 static bool enable_virt_at_load = true;
5607 module_param(enable_virt_at_load, bool, 0444);
5608
5609 __visible bool kvm_rebooting;
5610 EXPORT_SYMBOL_GPL(kvm_rebooting);
5611
5612 static DEFINE_PER_CPU(bool, virtualization_enabled);
5613 static DEFINE_MUTEX(kvm_usage_lock);
5614 static int kvm_usage_count;
5615
5616 __weak void kvm_arch_enable_virtualization(void)
5617 {
5618
5619 }
5620
5621 __weak void kvm_arch_disable_virtualization(void)
5622 {
5623
5624 }
5625
5626 static int kvm_enable_virtualization_cpu(void)
5627 {
5628 if (__this_cpu_read(virtualization_enabled))
5629 return 0;
5630
5631 if (kvm_arch_enable_virtualization_cpu()) {
5632 pr_info("kvm: enabling virtualization on CPU%d failed\n",
5633 raw_smp_processor_id());
5634 return -EIO;
5635 }
5636
5637 __this_cpu_write(virtualization_enabled, true);
5638 return 0;
5639 }
5640
5641 static int kvm_online_cpu(unsigned int cpu)
5642 {
5643 /*
5644 * Abort the CPU online process if hardware virtualization cannot
5645 * be enabled. Otherwise running VMs would encounter unrecoverable
5646 * errors when scheduled to this CPU.
5647 */
5648 return kvm_enable_virtualization_cpu();
5649 }
5650
5651 static void kvm_disable_virtualization_cpu(void *ign)
5652 {
5653 if (!__this_cpu_read(virtualization_enabled))
5654 return;
5655
5656 kvm_arch_disable_virtualization_cpu();
5657
5658 __this_cpu_write(virtualization_enabled, false);
5659 }
5660
5661 static int kvm_offline_cpu(unsigned int cpu)
5662 {
5663 kvm_disable_virtualization_cpu(NULL);
5664 return 0;
5665 }
5666
5667 static void kvm_shutdown(void)
5668 {
5669 /*
5670 * Disable hardware virtualization and set kvm_rebooting to indicate
5671 * that KVM has asynchronously disabled hardware virtualization, i.e.
5672 * that relevant errors and exceptions aren't entirely unexpected.
5673 * Some flavors of hardware virtualization need to be disabled before
5674 * transferring control to firmware (to perform shutdown/reboot), e.g.
5675 * on x86, virtualization can block INIT interrupts, which are used by
5676 * firmware to pull APs back under firmware control. Note, this path
5677 * is used for both shutdown and reboot scenarios, i.e. neither name is
5678 * 100% comprehensive.
5679 */
5680 pr_info("kvm: exiting hardware virtualization\n");
5681 kvm_rebooting = true;
5682 on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
5683 }
5684
5685 static int kvm_suspend(void)
5686 {
5687 /*
5688 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
5689 * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
5690 * count is stable. Assert that kvm_usage_lock is not held to ensure
5691 * the system isn't suspended while KVM is enabling hardware. Hardware
5692 * enabling can be preempted, but the task cannot be frozen until it has
5693 * dropped all locks (userspace tasks are frozen via a fake signal).
5694 */
5695 lockdep_assert_not_held(&kvm_usage_lock);
5696 lockdep_assert_irqs_disabled();
5697
5698 kvm_disable_virtualization_cpu(NULL);
5699 return 0;
5700 }
5701
5702 static void kvm_resume(void)
5703 {
5704 lockdep_assert_not_held(&kvm_usage_lock);
5705 lockdep_assert_irqs_disabled();
5706
5707 WARN_ON_ONCE(kvm_enable_virtualization_cpu());
5708 }
5709
5710 static struct syscore_ops kvm_syscore_ops = {
5711 .suspend = kvm_suspend,
5712 .resume = kvm_resume,
5713 .shutdown = kvm_shutdown,
5714 };
5715
5716 static int kvm_enable_virtualization(void)
5717 {
5718 int r;
5719
5720 guard(mutex)(&kvm_usage_lock);
5721
5722 if (kvm_usage_count++)
5723 return 0;
5724
5725 kvm_arch_enable_virtualization();
5726
5727 r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
5728 kvm_online_cpu, kvm_offline_cpu);
5729 if (r)
5730 goto err_cpuhp;
5731
5732 register_syscore_ops(&kvm_syscore_ops);
5733
5734 /*
5735 * Undo virtualization enabling and bail if the system is going down.
5736 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
5737 * possible for an in-flight operation to enable virtualization after
5738 * syscore_shutdown() is called, i.e. without kvm_shutdown() being
5739 * invoked. Note, this relies on system_state being set _before_
5740 * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
5741 * or this CPU observes the impending shutdown. Which is why KVM uses
5742 * a syscore ops hook instead of registering a dedicated reboot
5743 * notifier (the latter runs before system_state is updated).
5744 */
5745 if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
5746 system_state == SYSTEM_RESTART) {
5747 r = -EBUSY;
5748 goto err_rebooting;
5749 }
5750
5751 return 0;
5752
5753 err_rebooting:
5754 unregister_syscore_ops(&kvm_syscore_ops);
5755 cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
5756 err_cpuhp:
5757 kvm_arch_disable_virtualization();
5758 --kvm_usage_count;
5759 return r;
5760 }
5761
5762 static void kvm_disable_virtualization(void)
5763 {
5764 guard(mutex)(&kvm_usage_lock);
5765
5766 if (--kvm_usage_count)
5767 return;
5768
5769 unregister_syscore_ops(&kvm_syscore_ops);
5770 cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
5771 kvm_arch_disable_virtualization();
5772 }
5773
5774 static int kvm_init_virtualization(void)
5775 {
5776 if (enable_virt_at_load)
5777 return kvm_enable_virtualization();
5778
5779 return 0;
5780 }
5781
5782 static void kvm_uninit_virtualization(void)
5783 {
5784 if (enable_virt_at_load)
5785 kvm_disable_virtualization();
5786 }
5787 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5788 static int kvm_enable_virtualization(void)
5789 {
5790 return 0;
5791 }
5792
5793 static int kvm_init_virtualization(void)
5794 {
5795 return 0;
5796 }
5797
5798 static void kvm_disable_virtualization(void)
5799 {
5800
5801 }
5802
5803 static void kvm_uninit_virtualization(void)
5804 {
5805
5806 }
5807 #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5808
5809 static void kvm_iodevice_destructor(struct kvm_io_device *dev)
5810 {
5811 if (dev->ops->destructor)
5812 dev->ops->destructor(dev);
5813 }
5814
5815 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
5816 {
5817 int i;
5818
5819 for (i = 0; i < bus->dev_count; i++) {
5820 struct kvm_io_device *pos = bus->range[i].dev;
5821
5822 kvm_iodevice_destructor(pos);
5823 }
5824 kfree(bus);
5825 }
5826
5827 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
5828 const struct kvm_io_range *r2)
5829 {
5830 gpa_t addr1 = r1->addr;
5831 gpa_t addr2 = r2->addr;
5832
5833 if (addr1 < addr2)
5834 return -1;
5835
5836 /* If r2->len == 0, match the exact address. If r2->len != 0,
5837 * accept any access that lies within the registered range. Any
5838 * order is acceptable for such matching ranges, because
5839 * kvm_io_bus_get_first_dev ensures we process all of them.
5840 */
5841 if (r2->len) {
5842 addr1 += r1->len;
5843 addr2 += r2->len;
5844 }
5845
5846 if (addr1 > addr2)
5847 return 1;
5848
5849 return 0;
5850 }
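/*
 * Worked example of the arithmetic above: with a registered range
 * { .addr = 0x100, .len = 8 } as r2, an access { .addr = 0x104, .len = 4 }
 * compares equal (it lies entirely within the range), while
 * { .addr = 0x106, .len = 4 } compares greater because it runs past the
 * end. A registered range with len == 0 (e.g. an ioeventfd registered to
 * match any length) only compares equal to an access at exactly the same
 * address.
 */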
5851
5852 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
5853 {
5854 return kvm_io_bus_cmp(p1, p2);
5855 }
5856
5857 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
5858 gpa_t addr, int len)
5859 {
5860 struct kvm_io_range *range, key;
5861 int off;
5862
5863 key = (struct kvm_io_range) {
5864 .addr = addr,
5865 .len = len,
5866 };
5867
5868 range = bsearch(&key, bus->range, bus->dev_count,
5869 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5870 if (range == NULL)
5871 return -ENOENT;
5872
5873 off = range - bus->range;
5874
5875 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
5876 off--;
5877
5878 return off;
5879 }
5880
5881 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5882 struct kvm_io_range *range, const void *val)
5883 {
5884 int idx;
5885
5886 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5887 if (idx < 0)
5888 return -EOPNOTSUPP;
5889
5890 while (idx < bus->dev_count &&
5891 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5892 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
5893 range->len, val))
5894 return idx;
5895 idx++;
5896 }
5897
5898 return -EOPNOTSUPP;
5899 }
5900
5901 /* kvm_io_bus_write - called under kvm->slots_lock */
5902 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5903 int len, const void *val)
5904 {
5905 struct kvm_io_bus *bus;
5906 struct kvm_io_range range;
5907 int r;
5908
5909 range = (struct kvm_io_range) {
5910 .addr = addr,
5911 .len = len,
5912 };
5913
5914 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5915 if (!bus)
5916 return -ENOMEM;
5917 r = __kvm_io_bus_write(vcpu, bus, &range, val);
5918 return r < 0 ? r : 0;
5919 }
5920 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5921
5922 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
5923 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5924 gpa_t addr, int len, const void *val, long cookie)
5925 {
5926 struct kvm_io_bus *bus;
5927 struct kvm_io_range range;
5928
5929 range = (struct kvm_io_range) {
5930 .addr = addr,
5931 .len = len,
5932 };
5933
5934 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5935 if (!bus)
5936 return -ENOMEM;
5937
5938 /* First try the device referenced by cookie. */
5939 if ((cookie >= 0) && (cookie < bus->dev_count) &&
5940 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
5941 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
5942 val))
5943 return cookie;
5944
5945 /*
5946 * cookie contained garbage; fall back to search and return the
5947 * correct cookie value.
5948 */
5949 return __kvm_io_bus_write(vcpu, bus, &range, val);
5950 }
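/*
 * The cookie is simply the index of the device on the bus, as returned by
 * a previous call to kvm_io_bus_write_cookie() itself. Callers on hot
 * paths can cache it to skip the binary search; a stale or bogus cookie is
 * harmless because the code above falls back to the full lookup.
 */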
5951
5952 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5953 struct kvm_io_range *range, void *val)
5954 {
5955 int idx;
5956
5957 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5958 if (idx < 0)
5959 return -EOPNOTSUPP;
5960
5961 while (idx < bus->dev_count &&
5962 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5963 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
5964 range->len, val))
5965 return idx;
5966 idx++;
5967 }
5968
5969 return -EOPNOTSUPP;
5970 }
5971
5972 /* kvm_io_bus_read - called under kvm->slots_lock */
5973 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5974 int len, void *val)
5975 {
5976 struct kvm_io_bus *bus;
5977 struct kvm_io_range range;
5978 int r;
5979
5980 range = (struct kvm_io_range) {
5981 .addr = addr,
5982 .len = len,
5983 };
5984
5985 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5986 if (!bus)
5987 return -ENOMEM;
5988 r = __kvm_io_bus_read(vcpu, bus, &range, val);
5989 return r < 0 ? r : 0;
5990 }
5991
5992 static void __free_bus(struct rcu_head *rcu)
5993 {
5994 struct kvm_io_bus *bus = container_of(rcu, struct kvm_io_bus, rcu);
5995
5996 kfree(bus);
5997 }
5998
5999 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
6000 int len, struct kvm_io_device *dev)
6001 {
6002 int i;
6003 struct kvm_io_bus *new_bus, *bus;
6004 struct kvm_io_range range;
6005
6006 lockdep_assert_held(&kvm->slots_lock);
6007
6008 bus = kvm_get_bus(kvm, bus_idx);
6009 if (!bus)
6010 return -ENOMEM;
6011
6012 /* exclude ioeventfd which is limited by maximum fd */
6013 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
6014 return -ENOSPC;
6015
6016 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
6017 GFP_KERNEL_ACCOUNT);
6018 if (!new_bus)
6019 return -ENOMEM;
6020
6021 range = (struct kvm_io_range) {
6022 .addr = addr,
6023 .len = len,
6024 .dev = dev,
6025 };
6026
6027 for (i = 0; i < bus->dev_count; i++)
6028 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
6029 break;
6030
6031 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
6032 new_bus->dev_count++;
6033 new_bus->range[i] = range;
6034 memcpy(new_bus->range + i + 1, bus->range + i,
6035 (bus->dev_count - i) * sizeof(struct kvm_io_range));
6036 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
6037 call_srcu(&kvm->srcu, &bus->rcu, __free_bus);
6038
6039 return 0;
6040 }
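/*
 * Note the copy-then-publish pattern above: the live bus array is never
 * modified in place. A new array with room for one more range is built,
 * the new entry is spliced in at the index that keeps the array sorted by
 * kvm_io_bus_cmp(), the pointer is published with rcu_assign_pointer(),
 * and the old array is freed only after an SRCU grace period, so readers
 * walking the bus under srcu_read_lock() always see a consistent view.
 */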
6041
6042 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
6043 struct kvm_io_device *dev)
6044 {
6045 int i;
6046 struct kvm_io_bus *new_bus, *bus;
6047
6048 lockdep_assert_held(&kvm->slots_lock);
6049
6050 bus = kvm_get_bus(kvm, bus_idx);
6051 if (!bus)
6052 return 0;
6053
6054 for (i = 0; i < bus->dev_count; i++) {
6055 if (bus->range[i].dev == dev) {
6056 break;
6057 }
6058 }
6059
6060 if (i == bus->dev_count)
6061 return 0;
6062
6063 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
6064 GFP_KERNEL_ACCOUNT);
6065 if (new_bus) {
6066 memcpy(new_bus, bus, struct_size(bus, range, i));
6067 new_bus->dev_count--;
6068 memcpy(new_bus->range + i, bus->range + i + 1,
6069 flex_array_size(new_bus, range, new_bus->dev_count - i));
6070 }
6071
6072 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
6073 synchronize_srcu_expedited(&kvm->srcu);
6074
6075 /*
6076 * If a NULL bus is installed, destroy the old bus, including all the
6077 * attached devices. Otherwise, destroy the caller's device only.
6078 */
6079 if (!new_bus) {
6080 pr_err("kvm: failed to shrink bus, removing it completely\n");
6081 kvm_io_bus_destroy(bus);
6082 return -ENOMEM;
6083 }
6084
6085 kvm_iodevice_destructor(dev);
6086 kfree(bus);
6087 return 0;
6088 }
6089
6090 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
6091 gpa_t addr)
6092 {
6093 struct kvm_io_bus *bus;
6094 int dev_idx, srcu_idx;
6095 struct kvm_io_device *iodev = NULL;
6096
6097 srcu_idx = srcu_read_lock(&kvm->srcu);
6098
6099 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
6100 if (!bus)
6101 goto out_unlock;
6102
6103 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
6104 if (dev_idx < 0)
6105 goto out_unlock;
6106
6107 iodev = bus->range[dev_idx].dev;
6108
6109 out_unlock:
6110 srcu_read_unlock(&kvm->srcu, srcu_idx);
6111
6112 return iodev;
6113 }
6114 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
6115
6116 static int kvm_debugfs_open(struct inode *inode, struct file *file,
6117 int (*get)(void *, u64 *), int (*set)(void *, u64),
6118 const char *fmt)
6119 {
6120 int ret;
6121 struct kvm_stat_data *stat_data = inode->i_private;
6122
6123 /*
6124 * The debugfs files point at the kvm struct, which is still valid
6125 * when kvm_destroy_vm is called. kvm_get_kvm_safe() avoids the race
6126 * between open and the removal of the debugfs directory.
6127 */
6128 if (!kvm_get_kvm_safe(stat_data->kvm))
6129 return -ENOENT;
6130
6131 ret = simple_attr_open(inode, file, get,
6132 kvm_stats_debugfs_mode(stat_data->desc) & 0222
6133 ? set : NULL, fmt);
6134 if (ret)
6135 kvm_put_kvm(stat_data->kvm);
6136
6137 return ret;
6138 }
6139
6140 static int kvm_debugfs_release(struct inode *inode, struct file *file)
6141 {
6142 struct kvm_stat_data *stat_data = inode->i_private;
6143
6144 simple_attr_release(inode, file);
6145 kvm_put_kvm(stat_data->kvm);
6146
6147 return 0;
6148 }
6149
6150 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
6151 {
6152 *val = *(u64 *)((void *)(&kvm->stat) + offset);
6153
6154 return 0;
6155 }
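/*
 * The offset used above comes from the stats descriptor tables, i.e. it is
 * effectively offsetof(struct kvm_vm_stat, <field>), so the helper simply
 * reads (or, below, clears) the corresponding u64 counter inside kvm->stat.
 */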
6156
6157 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
6158 {
6159 *(u64 *)((void *)(&kvm->stat) + offset) = 0;
6160
6161 return 0;
6162 }
6163
6164 static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
6165 {
6166 unsigned long i;
6167 struct kvm_vcpu *vcpu;
6168
6169 *val = 0;
6170
6171 kvm_for_each_vcpu(i, vcpu, kvm)
6172 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
6173
6174 return 0;
6175 }
6176
6177 static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
6178 {
6179 unsigned long i;
6180 struct kvm_vcpu *vcpu;
6181
6182 kvm_for_each_vcpu(i, vcpu, kvm)
6183 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
6184
6185 return 0;
6186 }
6187
6188 static int kvm_stat_data_get(void *data, u64 *val)
6189 {
6190 int r = -EFAULT;
6191 struct kvm_stat_data *stat_data = data;
6192
6193 switch (stat_data->kind) {
6194 case KVM_STAT_VM:
6195 r = kvm_get_stat_per_vm(stat_data->kvm,
6196 stat_data->desc->desc.offset, val);
6197 break;
6198 case KVM_STAT_VCPU:
6199 r = kvm_get_stat_per_vcpu(stat_data->kvm,
6200 stat_data->desc->desc.offset, val);
6201 break;
6202 }
6203
6204 return r;
6205 }
6206
6207 static int kvm_stat_data_clear(void *data, u64 val)
6208 {
6209 int r = -EFAULT;
6210 struct kvm_stat_data *stat_data = data;
6211
6212 if (val)
6213 return -EINVAL;
6214
6215 switch (stat_data->kind) {
6216 case KVM_STAT_VM:
6217 r = kvm_clear_stat_per_vm(stat_data->kvm,
6218 stat_data->desc->desc.offset);
6219 break;
6220 case KVM_STAT_VCPU:
6221 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
6222 stat_data->desc->desc.offset);
6223 break;
6224 }
6225
6226 return r;
6227 }
6228
6229 static int kvm_stat_data_open(struct inode *inode, struct file *file)
6230 {
6231 __simple_attr_check_format("%llu\n", 0ull);
6232 return kvm_debugfs_open(inode, file, kvm_stat_data_get,
6233 kvm_stat_data_clear, "%llu\n");
6234 }
6235
6236 static const struct file_operations stat_fops_per_vm = {
6237 .owner = THIS_MODULE,
6238 .open = kvm_stat_data_open,
6239 .release = kvm_debugfs_release,
6240 .read = simple_attr_read,
6241 .write = simple_attr_write,
6242 };
6243
6244 static int vm_stat_get(void *_offset, u64 *val)
6245 {
6246 unsigned offset = (long)_offset;
6247 struct kvm *kvm;
6248 u64 tmp_val;
6249
6250 *val = 0;
6251 mutex_lock(&kvm_lock);
6252 list_for_each_entry(kvm, &vm_list, vm_list) {
6253 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
6254 *val += tmp_val;
6255 }
6256 mutex_unlock(&kvm_lock);
6257 return 0;
6258 }
6259
6260 static int vm_stat_clear(void *_offset, u64 val)
6261 {
6262 unsigned offset = (long)_offset;
6263 struct kvm *kvm;
6264
6265 if (val)
6266 return -EINVAL;
6267
6268 mutex_lock(&kvm_lock);
6269 list_for_each_entry(kvm, &vm_list, vm_list) {
6270 kvm_clear_stat_per_vm(kvm, offset);
6271 }
6272 mutex_unlock(&kvm_lock);
6273
6274 return 0;
6275 }
6276
6277 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
6278 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
6279
6280 static int vcpu_stat_get(void *_offset, u64 *val)
6281 {
6282 unsigned offset = (long)_offset;
6283 struct kvm *kvm;
6284 u64 tmp_val;
6285
6286 *val = 0;
6287 mutex_lock(&kvm_lock);
6288 list_for_each_entry(kvm, &vm_list, vm_list) {
6289 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
6290 *val += tmp_val;
6291 }
6292 mutex_unlock(&kvm_lock);
6293 return 0;
6294 }
6295
6296 static int vcpu_stat_clear(void *_offset, u64 val)
6297 {
6298 unsigned offset = (long)_offset;
6299 struct kvm *kvm;
6300
6301 if (val)
6302 return -EINVAL;
6303
6304 mutex_lock(&kvm_lock);
6305 list_for_each_entry(kvm, &vm_list, vm_list) {
6306 kvm_clear_stat_per_vcpu(kvm, offset);
6307 }
6308 mutex_unlock(&kvm_lock);
6309
6310 return 0;
6311 }
6312
6313 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
6314 "%llu\n");
6315 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
6316
6317 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
6318 {
6319 struct kobj_uevent_env *env;
6320 unsigned long long created, active;
6321
6322 if (!kvm_dev.this_device || !kvm)
6323 return;
6324
6325 mutex_lock(&kvm_lock);
6326 if (type == KVM_EVENT_CREATE_VM) {
6327 kvm_createvm_count++;
6328 kvm_active_vms++;
6329 } else if (type == KVM_EVENT_DESTROY_VM) {
6330 kvm_active_vms--;
6331 }
6332 created = kvm_createvm_count;
6333 active = kvm_active_vms;
6334 mutex_unlock(&kvm_lock);
6335
6336 env = kzalloc(sizeof(*env), GFP_KERNEL);
6337 if (!env)
6338 return;
6339
6340 add_uevent_var(env, "CREATED=%llu", created);
6341 add_uevent_var(env, "COUNT=%llu", active);
6342
6343 if (type == KVM_EVENT_CREATE_VM) {
6344 add_uevent_var(env, "EVENT=create");
6345 kvm->userspace_pid = task_pid_nr(current);
6346 } else if (type == KVM_EVENT_DESTROY_VM) {
6347 add_uevent_var(env, "EVENT=destroy");
6348 }
6349 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
6350
6351 if (!IS_ERR(kvm->debugfs_dentry)) {
6352 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
6353
6354 if (p) {
6355 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
6356 if (!IS_ERR(tmp))
6357 add_uevent_var(env, "STATS_PATH=%s", tmp);
6358 kfree(p);
6359 }
6360 }
6361 /* no need for checks, since we add at most 5 keys */
6362 env->envp[env->envp_idx++] = NULL;
6363 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
6364 kfree(env);
6365 }
6366
6367 static void kvm_init_debug(void)
6368 {
6369 const struct file_operations *fops;
6370 const struct _kvm_stats_desc *pdesc;
6371 int i;
6372
6373 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
6374
6375 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
6376 pdesc = &kvm_vm_stats_desc[i];
6377 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6378 fops = &vm_stat_fops;
6379 else
6380 fops = &vm_stat_readonly_fops;
6381 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6382 kvm_debugfs_dir,
6383 (void *)(long)pdesc->desc.offset, fops);
6384 }
6385
6386 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
6387 pdesc = &kvm_vcpu_stats_desc[i];
6388 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6389 fops = &vcpu_stat_fops;
6390 else
6391 fops = &vcpu_stat_readonly_fops;
6392 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6393 kvm_debugfs_dir,
6394 (void *)(long)pdesc->desc.offset, fops);
6395 }
6396 }
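/*
 * The files created above land under <debugfs>/kvm/, typically
 * /sys/kernel/debug/kvm/. Reading a file yields the aggregate value across
 * all VMs; for a stat exported writable, writing 0 resets it, e.g.
 * (hypothetical stat name):
 *
 *	echo 0 > /sys/kernel/debug/kvm/some_counter
 *
 * Any non-zero write is rejected with -EINVAL by the clear handlers above.
 */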
6397
6398 static inline
6399 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
6400 {
6401 return container_of(pn, struct kvm_vcpu, preempt_notifier);
6402 }
6403
6404 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
6405 {
6406 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
6407
6408 WRITE_ONCE(vcpu->preempted, false);
6409 WRITE_ONCE(vcpu->ready, false);
6410
6411 __this_cpu_write(kvm_running_vcpu, vcpu);
6412 kvm_arch_vcpu_load(vcpu, cpu);
6413
6414 WRITE_ONCE(vcpu->scheduled_out, false);
6415 }
6416
6417 static void kvm_sched_out(struct preempt_notifier *pn,
6418 struct task_struct *next)
6419 {
6420 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
6421
6422 WRITE_ONCE(vcpu->scheduled_out, true);
6423
6424 if (task_is_runnable(current) && vcpu->wants_to_run) {
6425 WRITE_ONCE(vcpu->preempted, true);
6426 WRITE_ONCE(vcpu->ready, true);
6427 }
6428 kvm_arch_vcpu_put(vcpu);
6429 __this_cpu_write(kvm_running_vcpu, NULL);
6430 }
6431
6432 /**
6433 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
6434 *
6435 * We can disable preemption locally around accessing the per-CPU variable,
6436 * and use the resolved vcpu pointer after enabling preemption again,
6437 * because even if the current thread is migrated to another CPU, reading
6438 * the per-CPU value later will give us the same value, since we update the
6439 * per-CPU variable in the preempt notifier handlers.
6440 */
6441 struct kvm_vcpu *kvm_get_running_vcpu(void)
6442 {
6443 struct kvm_vcpu *vcpu;
6444
6445 preempt_disable();
6446 vcpu = __this_cpu_read(kvm_running_vcpu);
6447 preempt_enable();
6448
6449 return vcpu;
6450 }
6451 EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
6452
6453 /**
6454 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
6455 */
6456 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
6457 {
6458 return &kvm_running_vcpu;
6459 }
6460
6461 #ifdef CONFIG_GUEST_PERF_EVENTS
6462 static unsigned int kvm_guest_state(void)
6463 {
6464 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6465 unsigned int state;
6466
6467 if (!kvm_arch_pmi_in_guest(vcpu))
6468 return 0;
6469
6470 state = PERF_GUEST_ACTIVE;
6471 if (!kvm_arch_vcpu_in_kernel(vcpu))
6472 state |= PERF_GUEST_USER;
6473
6474 return state;
6475 }
6476
6477 static unsigned long kvm_guest_get_ip(void)
6478 {
6479 struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6480
6481 /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
6482 if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
6483 return 0;
6484
6485 return kvm_arch_vcpu_get_ip(vcpu);
6486 }
6487
6488 static struct perf_guest_info_callbacks kvm_guest_cbs = {
6489 .state = kvm_guest_state,
6490 .get_ip = kvm_guest_get_ip,
6491 .handle_intel_pt_intr = NULL,
6492 };
6493
6494 void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
6495 {
6496 kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
6497 perf_register_guest_info_callbacks(&kvm_guest_cbs);
6498 }
6499 void kvm_unregister_perf_callbacks(void)
6500 {
6501 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
6502 }
6503 #endif
6504
6505 int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
6506 {
6507 int r;
6508 int cpu;
6509
6510 /* A kmem cache lets us meet the alignment requirements of fx_save. */
6511 if (!vcpu_align)
6512 vcpu_align = __alignof__(struct kvm_vcpu);
6513 kvm_vcpu_cache =
6514 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
6515 SLAB_ACCOUNT,
6516 offsetof(struct kvm_vcpu, arch),
6517 offsetofend(struct kvm_vcpu, stats_id)
6518 - offsetof(struct kvm_vcpu, arch),
6519 NULL);
6520 if (!kvm_vcpu_cache)
6521 return -ENOMEM;
6522
6523 for_each_possible_cpu(cpu) {
6524 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
6525 GFP_KERNEL, cpu_to_node(cpu))) {
6526 r = -ENOMEM;
6527 goto err_cpu_kick_mask;
6528 }
6529 }
6530
6531 r = kvm_irqfd_init();
6532 if (r)
6533 goto err_irqfd;
6534
6535 r = kvm_async_pf_init();
6536 if (r)
6537 goto err_async_pf;
6538
6539 kvm_chardev_ops.owner = module;
6540 kvm_vm_fops.owner = module;
6541 kvm_vcpu_fops.owner = module;
6542 kvm_device_fops.owner = module;
6543
6544 kvm_preempt_ops.sched_in = kvm_sched_in;
6545 kvm_preempt_ops.sched_out = kvm_sched_out;
6546
6547 kvm_init_debug();
6548
6549 r = kvm_vfio_ops_init();
6550 if (WARN_ON_ONCE(r))
6551 goto err_vfio;
6552
6553 kvm_gmem_init(module);
6554
6555 r = kvm_init_virtualization();
6556 if (r)
6557 goto err_virt;
6558
6559 /*
6560 * Registration _must_ be the very last thing done, as this exposes
6561 * /dev/kvm to userspace, i.e. all infrastructure must be setup!
6562 */
6563 r = misc_register(&kvm_dev);
6564 if (r) {
6565 pr_err("kvm: misc device register failed\n");
6566 goto err_register;
6567 }
6568
6569 return 0;
6570
6571 err_register:
6572 kvm_uninit_virtualization();
6573 err_virt:
6574 kvm_vfio_ops_exit();
6575 err_vfio:
6576 kvm_async_pf_deinit();
6577 err_async_pf:
6578 kvm_irqfd_exit();
6579 err_irqfd:
6580 err_cpu_kick_mask:
6581 for_each_possible_cpu(cpu)
6582 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6583 kmem_cache_destroy(kvm_vcpu_cache);
6584 return r;
6585 }
6586 EXPORT_SYMBOL_GPL(kvm_init);
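/*
 * Architecture modules call kvm_init() from their own init path once all
 * arch setup is done, passing the size and alignment of their vCPU
 * container so the kmem cache above can hold it; on x86, for instance, the
 * vendor modules pass sizeof()/__alignof__() of their vcpu wrapper struct
 * along with THIS_MODULE.
 */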
6587
6588 void kvm_exit(void)
6589 {
6590 int cpu;
6591
6592 /*
6593 * Note, unregistering /dev/kvm doesn't strictly need to come first, as
6594 * fops_get(), a.k.a. try_module_get(), prevents acquiring references
6595 * to KVM while the module is being stopped.
6596 */
6597 misc_deregister(&kvm_dev);
6598
6599 kvm_uninit_virtualization();
6600
6601 debugfs_remove_recursive(kvm_debugfs_dir);
6602 for_each_possible_cpu(cpu)
6603 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6604 kmem_cache_destroy(kvm_vcpu_cache);
6605 kvm_vfio_ops_exit();
6606 kvm_async_pf_deinit();
6607 kvm_irqfd_exit();
6608 }
6609 EXPORT_SYMBOL_GPL(kvm_exit);
6610