1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * pSeries_lpar.c
4  * Copyright (C) 2001 Todd Inglett, IBM Corporation
5  *
6  * pSeries LPAR support.
7  */
8 
9 /* Enables debugging of low-level hash table routines - careful! */
10 #undef DEBUG
11 #define pr_fmt(fmt) "lpar: " fmt
12 
13 #include <linux/kernel.h>
14 #include <linux/dma-mapping.h>
15 #include <linux/console.h>
16 #include <linux/export.h>
17 #include <linux/jump_label.h>
18 #include <linux/delay.h>
19 #include <linux/stop_machine.h>
20 #include <linux/spinlock.h>
21 #include <linux/cpuhotplug.h>
22 #include <linux/workqueue.h>
23 #include <linux/proc_fs.h>
24 #include <asm/processor.h>
25 #include <asm/mmu.h>
26 #include <asm/page.h>
27 #include <asm/pgtable.h>
28 #include <asm/machdep.h>
29 #include <asm/mmu_context.h>
30 #include <asm/iommu.h>
31 #include <asm/tlb.h>
32 #include <asm/prom.h>
33 #include <asm/cputable.h>
34 #include <asm/udbg.h>
35 #include <asm/smp.h>
36 #include <asm/trace.h>
37 #include <asm/firmware.h>
38 #include <asm/plpar_wrappers.h>
39 #include <asm/kexec.h>
40 #include <asm/fadump.h>
41 #include <asm/asm-prototypes.h>
42 #include <asm/debugfs.h>
43 
44 #include "pseries.h"
45 
46 /* Flag bits for H_BULK_REMOVE */
47 #define HBR_REQUEST	0x4000000000000000UL
48 #define HBR_RESPONSE	0x8000000000000000UL
49 #define HBR_END		0xc000000000000000UL
50 #define HBR_AVPN	0x0200000000000000UL
51 #define HBR_ANDCOND	0x0100000000000000UL
52 
53 
54 /* in hvCall.S */
55 EXPORT_SYMBOL(plpar_hcall);
56 EXPORT_SYMBOL(plpar_hcall9);
57 EXPORT_SYMBOL(plpar_hcall_norets);
58 
59 /*
60  * H_BLOCK_REMOVE supported block size for this page size, in a segment whose
61  * base page size is that page size.
62  *
63  * The first index is the segment base page size, the second one is the actual
64  * page size.
65  */
66 static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init;
67 
68 /*
69  * Due to the involved complexity, and because the current hypervisor only
70  * returns this value or 0, we limit the supported H_BLOCK_REMOVE block
71  * size to 8.
72  */
73 #define HBLKRM_SUPPORTED_BLOCK_SIZE 8
74 
75 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
76 static u8 dtl_mask = DTL_LOG_PREEMPT;
77 #else
78 static u8 dtl_mask;
79 #endif
80 
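/*
 * Allocate a dispatch trace log buffer for each possible CPU that does not
 * already have one, rescheduling periodically if a time limit was supplied.
 */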
81 void alloc_dtl_buffers(unsigned long *time_limit)
82 {
83 	int cpu;
84 	struct paca_struct *pp;
85 	struct dtl_entry *dtl;
86 
87 	for_each_possible_cpu(cpu) {
88 		pp = paca_ptrs[cpu];
89 		if (pp->dispatch_log)
90 			continue;
91 		dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
92 		if (!dtl) {
93 			pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
94 				cpu);
95 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
96 			pr_warn("Stolen time statistics will be unreliable\n");
97 #endif
98 			break;
99 		}
100 
101 		pp->dtl_ridx = 0;
102 		pp->dispatch_log = dtl;
103 		pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
104 		pp->dtl_curr = dtl;
105 
106 		if (time_limit && time_after(jiffies, *time_limit)) {
107 			cond_resched();
108 			*time_limit = jiffies + HZ;
109 		}
110 	}
111 }
112 
113 void register_dtl_buffer(int cpu)
114 {
115 	long ret;
116 	struct paca_struct *pp;
117 	struct dtl_entry *dtl;
118 	int hwcpu = get_hard_smp_processor_id(cpu);
119 
120 	pp = paca_ptrs[cpu];
121 	dtl = pp->dispatch_log;
122 	if (dtl && dtl_mask) {
123 		pp->dtl_ridx = 0;
124 		pp->dtl_curr = dtl;
125 		lppaca_of(cpu).dtl_idx = 0;
126 
127 		/* hypervisor reads buffer length from this field */
128 		dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
129 		ret = register_dtl(hwcpu, __pa(dtl));
130 		if (ret)
131 			pr_err("WARNING: DTL registration of cpu %d (hw %d) failed with %ld\n",
132 			       cpu, hwcpu, ret);
133 
134 		lppaca_of(cpu).dtl_enable_mask = dtl_mask;
135 	}
136 }
137 
138 #ifdef CONFIG_PPC_SPLPAR
139 struct dtl_worker {
140 	struct delayed_work work;
141 	int cpu;
142 };
143 
144 struct vcpu_dispatch_data {
145 	int last_disp_cpu;
146 
147 	int total_disp;
148 
149 	int same_cpu_disp;
150 	int same_chip_disp;
151 	int diff_chip_disp;
152 	int far_chip_disp;
153 
154 	int numa_home_disp;
155 	int numa_remote_disp;
156 	int numa_far_disp;
157 };
158 
159 /*
160  * This represents the number of cpus in the hypervisor. Since there is no
161  * architected way to discover the number of processors in the host, we
162  * provision for dealing with NR_CPUS. This is currently 2048 by default, and
163  * is sufficient for our purposes. This will need to be tweaked if
164  * CONFIG_NR_CPUS is changed.
165  */
166 #define NR_CPUS_H	NR_CPUS
167 
168 DEFINE_RWLOCK(dtl_access_lock);
169 static DEFINE_PER_CPU(struct vcpu_dispatch_data, vcpu_disp_data);
170 static DEFINE_PER_CPU(u64, dtl_entry_ridx);
171 static DEFINE_PER_CPU(struct dtl_worker, dtl_workers);
172 static enum cpuhp_state dtl_worker_state;
173 static DEFINE_MUTEX(dtl_enable_mutex);
174 static int vcpudispatch_stats_on __read_mostly;
175 static int vcpudispatch_stats_freq = 50;
176 static __be32 *vcpu_associativity, *pcpu_associativity;
177 
178 
179 static void free_dtl_buffers(unsigned long *time_limit)
180 {
181 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
182 	int cpu;
183 	struct paca_struct *pp;
184 
185 	for_each_possible_cpu(cpu) {
186 		pp = paca_ptrs[cpu];
187 		if (!pp->dispatch_log)
188 			continue;
189 		kmem_cache_free(dtl_cache, pp->dispatch_log);
190 		pp->dtl_ridx = 0;
191 		pp->dispatch_log = 0;
192 		pp->dispatch_log_end = 0;
193 		pp->dtl_curr = 0;
194 
195 		if (time_limit && time_after(jiffies, *time_limit)) {
196 			cond_resched();
197 			*time_limit = jiffies + HZ;
198 		}
199 	}
200 #endif
201 }
202 
203 static int init_cpu_associativity(void)
204 {
205 	vcpu_associativity = kcalloc(num_possible_cpus() / threads_per_core,
206 			VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
207 	pcpu_associativity = kcalloc(NR_CPUS_H / threads_per_core,
208 			VPHN_ASSOC_BUFSIZE * sizeof(__be32), GFP_KERNEL);
209 
210 	if (!vcpu_associativity || !pcpu_associativity) {
211 		pr_err("error allocating memory for associativity information\n");
212 		return -ENOMEM;
213 	}
214 
215 	return 0;
216 }
217 
218 static void destroy_cpu_associativity(void)
219 {
220 	kfree(vcpu_associativity);
221 	kfree(pcpu_associativity);
222 	vcpu_associativity = pcpu_associativity = 0;
223 }
224 
225 static __be32 *__get_cpu_associativity(int cpu, __be32 *cpu_assoc, int flag)
226 {
227 	__be32 *assoc;
228 	int rc = 0;
229 
230 	assoc = &cpu_assoc[(int)(cpu / threads_per_core) * VPHN_ASSOC_BUFSIZE];
231 	if (!assoc[0]) {
232 		rc = hcall_vphn(cpu, flag, &assoc[0]);
233 		if (rc)
234 			return NULL;
235 	}
236 
237 	return assoc;
238 }
239 
240 static __be32 *get_pcpu_associativity(int cpu)
241 {
242 	return __get_cpu_associativity(cpu, pcpu_associativity, VPHN_FLAG_PCPU);
243 }
244 
245 static __be32 *get_vcpu_associativity(int cpu)
246 {
247 	return __get_cpu_associativity(cpu, vcpu_associativity, VPHN_FLAG_VCPU);
248 }
249 
250 static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu)
251 {
252 	__be32 *last_disp_cpu_assoc, *cur_disp_cpu_assoc;
253 
254 	if (last_disp_cpu >= NR_CPUS_H || cur_disp_cpu >= NR_CPUS_H)
255 		return -EINVAL;
256 
257 	last_disp_cpu_assoc = get_pcpu_associativity(last_disp_cpu);
258 	cur_disp_cpu_assoc = get_pcpu_associativity(cur_disp_cpu);
259 
260 	if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
261 		return -EIO;
262 
263 	return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
264 }
265 
266 static int cpu_home_node_dispatch_distance(int disp_cpu)
267 {
268 	__be32 *disp_cpu_assoc, *vcpu_assoc;
269 	int vcpu_id = smp_processor_id();
270 
271 	if (disp_cpu >= NR_CPUS_H) {
272 		pr_debug_ratelimited("vcpu dispatch cpu %d > %d\n",
273 						disp_cpu, NR_CPUS_H);
274 		return -EINVAL;
275 	}
276 
277 	disp_cpu_assoc = get_pcpu_associativity(disp_cpu);
278 	vcpu_assoc = get_vcpu_associativity(vcpu_id);
279 
280 	if (!disp_cpu_assoc || !vcpu_assoc)
281 		return -EIO;
282 
283 	return cpu_distance(disp_cpu_assoc, vcpu_assoc);
284 }
285 
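/*
 * Classify this dispatch against the previous physical CPU (same core, same
 * chip, different chip, far chip) and against the vCPU's home node, and
 * update the per-CPU counters shown in /proc/powerpc/vcpudispatch_stats.
 */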
286 static void update_vcpu_disp_stat(int disp_cpu)
287 {
288 	struct vcpu_dispatch_data *disp;
289 	int distance;
290 
291 	disp = this_cpu_ptr(&vcpu_disp_data);
292 	if (disp->last_disp_cpu == -1) {
293 		disp->last_disp_cpu = disp_cpu;
294 		return;
295 	}
296 
297 	disp->total_disp++;
298 
299 	if (disp->last_disp_cpu == disp_cpu ||
300 		(cpu_first_thread_sibling(disp->last_disp_cpu) ==
301 					cpu_first_thread_sibling(disp_cpu)))
302 		disp->same_cpu_disp++;
303 	else {
304 		distance = cpu_relative_dispatch_distance(disp->last_disp_cpu,
305 								disp_cpu);
306 		if (distance < 0)
307 			pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
308 					smp_processor_id());
309 		else {
310 			switch (distance) {
311 			case 0:
312 				disp->same_chip_disp++;
313 				break;
314 			case 1:
315 				disp->diff_chip_disp++;
316 				break;
317 			case 2:
318 				disp->far_chip_disp++;
319 				break;
320 			default:
321 				pr_debug_ratelimited("vcpudispatch_stats: cpu %d (%d -> %d): unexpected relative dispatch distance %d\n",
322 						 smp_processor_id(),
323 						 disp->last_disp_cpu,
324 						 disp_cpu,
325 						 distance);
326 			}
327 		}
328 	}
329 
330 	distance = cpu_home_node_dispatch_distance(disp_cpu);
331 	if (distance < 0)
332 		pr_debug_ratelimited("vcpudispatch_stats: cpu %d: error determining associativity\n",
333 				smp_processor_id());
334 	else {
335 		switch (distance) {
336 		case 0:
337 			disp->numa_home_disp++;
338 			break;
339 		case 1:
340 			disp->numa_remote_disp++;
341 			break;
342 		case 2:
343 			disp->numa_far_disp++;
344 			break;
345 		default:
346 			pr_debug_ratelimited("vcpudispatch_stats: cpu %d on %d: unexpected numa dispatch distance %d\n",
347 						 smp_processor_id(),
348 						 disp_cpu,
349 						 distance);
350 		}
351 	}
352 
353 	disp->last_disp_cpu = disp_cpu;
354 }
355 
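/*
 * Delayed-work handler: drain any new dispatch trace log entries for this
 * CPU, account for buffer overflow, feed each entry to
 * update_vcpu_disp_stat(), then re-arm at the configured sampling frequency.
 */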
356 static void process_dtl_buffer(struct work_struct *work)
357 {
358 	struct dtl_entry dtle;
359 	u64 i = __this_cpu_read(dtl_entry_ridx);
360 	struct dtl_entry *dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
361 	struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
362 	struct lppaca *vpa = local_paca->lppaca_ptr;
363 	struct dtl_worker *d = container_of(work, struct dtl_worker, work.work);
364 
365 	if (!local_paca->dispatch_log)
366 		return;
367 
368 	/* if we have been migrated away, we cancel ourselves */
369 	if (d->cpu != smp_processor_id()) {
370 		pr_debug("vcpudispatch_stats: cpu %d worker migrated -- canceling worker\n",
371 						smp_processor_id());
372 		return;
373 	}
374 
375 	if (i == be64_to_cpu(vpa->dtl_idx))
376 		goto out;
377 
378 	while (i < be64_to_cpu(vpa->dtl_idx)) {
379 		dtle = *dtl;
380 		barrier();
381 		if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
382 			/* buffer has overflowed */
383 			pr_debug_ratelimited("vcpudispatch_stats: cpu %d lost %lld DTL samples\n",
384 				d->cpu,
385 				be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG - i);
386 			i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
387 			dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
388 			continue;
389 		}
390 		update_vcpu_disp_stat(be16_to_cpu(dtle.processor_id));
391 		++i;
392 		++dtl;
393 		if (dtl == dtl_end)
394 			dtl = local_paca->dispatch_log;
395 	}
396 
397 	__this_cpu_write(dtl_entry_ridx, i);
398 
399 out:
400 	schedule_delayed_work_on(d->cpu, to_delayed_work(work),
401 					HZ / vcpudispatch_stats_freq);
402 }
403 
404 static int dtl_worker_online(unsigned int cpu)
405 {
406 	struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
407 
408 	memset(d, 0, sizeof(*d));
409 	INIT_DELAYED_WORK(&d->work, process_dtl_buffer);
410 	d->cpu = cpu;
411 
412 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
413 	per_cpu(dtl_entry_ridx, cpu) = 0;
414 	register_dtl_buffer(cpu);
415 #else
416 	per_cpu(dtl_entry_ridx, cpu) = be64_to_cpu(lppaca_of(cpu).dtl_idx);
417 #endif
418 
419 	schedule_delayed_work_on(cpu, &d->work, HZ / vcpudispatch_stats_freq);
420 	return 0;
421 }
422 
423 static int dtl_worker_offline(unsigned int cpu)
424 {
425 	struct dtl_worker *d = &per_cpu(dtl_workers, cpu);
426 
427 	cancel_delayed_work_sync(&d->work);
428 
429 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
430 	unregister_dtl(get_hard_smp_processor_id(cpu));
431 #endif
432 
433 	return 0;
434 }
435 
436 static void set_global_dtl_mask(u8 mask)
437 {
438 	int cpu;
439 
440 	dtl_mask = mask;
441 	for_each_present_cpu(cpu)
442 		lppaca_of(cpu).dtl_enable_mask = dtl_mask;
443 }
444 
445 static void reset_global_dtl_mask(void)
446 {
447 	int cpu;
448 
449 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
450 	dtl_mask = DTL_LOG_PREEMPT;
451 #else
452 	dtl_mask = 0;
453 #endif
454 	for_each_present_cpu(cpu)
455 		lppaca_of(cpu).dtl_enable_mask = dtl_mask;
456 }
457 
458 static int dtl_worker_enable(unsigned long *time_limit)
459 {
460 	int rc = 0, state;
461 
462 	if (!write_trylock(&dtl_access_lock)) {
463 		rc = -EBUSY;
464 		goto out;
465 	}
466 
467 	set_global_dtl_mask(DTL_LOG_ALL);
468 
469 	/* Set up the DTL buffers and register them */
470 	alloc_dtl_buffers(time_limit);
471 
472 	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/dtl:online",
473 					dtl_worker_online, dtl_worker_offline);
474 	if (state < 0) {
475 		pr_err("vcpudispatch_stats: unable to setup workqueue for DTL processing\n");
476 		free_dtl_buffers(time_limit);
477 		reset_global_dtl_mask();
478 		write_unlock(&dtl_access_lock);
479 		rc = -EINVAL;
480 		goto out;
481 	}
482 	dtl_worker_state = state;
483 
484 out:
485 	return rc;
486 }
487 
488 static void dtl_worker_disable(unsigned long *time_limit)
489 {
490 	cpuhp_remove_state(dtl_worker_state);
491 	free_dtl_buffers(time_limit);
492 	reset_global_dtl_mask();
493 	write_unlock(&dtl_access_lock);
494 }
495 
496 static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p,
497 		size_t count, loff_t *ppos)
498 {
499 	unsigned long time_limit = jiffies + HZ;
500 	struct vcpu_dispatch_data *disp;
501 	int rc, cmd, cpu;
502 	char buf[16];
503 
504 	if (count > 15)
505 		return -EINVAL;
506 
507 	if (copy_from_user(buf, p, count))
508 		return -EFAULT;
509 
510 	buf[count] = 0;
511 	rc = kstrtoint(buf, 0, &cmd);
512 	if (rc || cmd < 0 || cmd > 1) {
513 		pr_err("vcpudispatch_stats: please use 0 to disable or 1 to enable dispatch statistics\n");
514 		return rc ? rc : -EINVAL;
515 	}
516 
517 	mutex_lock(&dtl_enable_mutex);
518 
519 	if ((cmd == 0 && !vcpudispatch_stats_on) ||
520 			(cmd == 1 && vcpudispatch_stats_on))
521 		goto out;
522 
523 	if (cmd) {
524 		rc = init_cpu_associativity();
525 		if (rc) {
526 			destroy_cpu_associativity();
527 			goto out;
528 		}
529 
530 		for_each_possible_cpu(cpu) {
531 			disp = per_cpu_ptr(&vcpu_disp_data, cpu);
532 			memset(disp, 0, sizeof(*disp));
533 			disp->last_disp_cpu = -1;
534 		}
535 
536 		rc = dtl_worker_enable(&time_limit);
537 		if (rc) {
538 			destroy_cpu_associativity();
539 			goto out;
540 		}
541 	} else {
542 		dtl_worker_disable(&time_limit);
543 		destroy_cpu_associativity();
544 	}
545 
546 	vcpudispatch_stats_on = cmd;
547 
548 out:
549 	mutex_unlock(&dtl_enable_mutex);
550 	if (rc)
551 		return rc;
552 	return count;
553 }
554 
555 static int vcpudispatch_stats_display(struct seq_file *p, void *v)
556 {
557 	int cpu;
558 	struct vcpu_dispatch_data *disp;
559 
560 	if (!vcpudispatch_stats_on) {
561 		seq_puts(p, "off\n");
562 		return 0;
563 	}
564 
565 	for_each_online_cpu(cpu) {
566 		disp = per_cpu_ptr(&vcpu_disp_data, cpu);
567 		seq_printf(p, "cpu%d", cpu);
568 		seq_put_decimal_ull(p, " ", disp->total_disp);
569 		seq_put_decimal_ull(p, " ", disp->same_cpu_disp);
570 		seq_put_decimal_ull(p, " ", disp->same_chip_disp);
571 		seq_put_decimal_ull(p, " ", disp->diff_chip_disp);
572 		seq_put_decimal_ull(p, " ", disp->far_chip_disp);
573 		seq_put_decimal_ull(p, " ", disp->numa_home_disp);
574 		seq_put_decimal_ull(p, " ", disp->numa_remote_disp);
575 		seq_put_decimal_ull(p, " ", disp->numa_far_disp);
576 		seq_puts(p, "\n");
577 	}
578 
579 	return 0;
580 }
581 
582 static int vcpudispatch_stats_open(struct inode *inode, struct file *file)
583 {
584 	return single_open(file, vcpudispatch_stats_display, NULL);
585 }
586 
587 static const struct file_operations vcpudispatch_stats_proc_ops = {
588 	.open		= vcpudispatch_stats_open,
589 	.read		= seq_read,
590 	.write		= vcpudispatch_stats_write,
591 	.llseek		= seq_lseek,
592 	.release	= single_release,
593 };
594 
595 static ssize_t vcpudispatch_stats_freq_write(struct file *file,
596 		const char __user *p, size_t count, loff_t *ppos)
597 {
598 	int rc, freq;
599 	char buf[16];
600 
601 	if (count > 15)
602 		return -EINVAL;
603 
604 	if (copy_from_user(buf, p, count))
605 		return -EFAULT;
606 
607 	buf[count] = 0;
608 	rc = kstrtoint(buf, 0, &freq);
609 	if (rc || freq < 1 || freq > HZ) {
610 		pr_err("vcpudispatch_stats_freq: please specify a frequency between 1 and %d\n",
611 				HZ);
612 		return rc ? rc : -EINVAL;
613 	}
614 
615 	vcpudispatch_stats_freq = freq;
616 
617 	return count;
618 }
619 
620 static int vcpudispatch_stats_freq_display(struct seq_file *p, void *v)
621 {
622 	seq_printf(p, "%d\n", vcpudispatch_stats_freq);
623 	return 0;
624 }
625 
626 static int vcpudispatch_stats_freq_open(struct inode *inode, struct file *file)
627 {
628 	return single_open(file, vcpudispatch_stats_freq_display, NULL);
629 }
630 
631 static const struct file_operations vcpudispatch_stats_freq_proc_ops = {
632 	.open		= vcpudispatch_stats_freq_open,
633 	.read		= seq_read,
634 	.write		= vcpudispatch_stats_freq_write,
635 	.llseek		= seq_lseek,
636 	.release	= single_release,
637 };
638 
639 static int __init vcpudispatch_stats_procfs_init(void)
640 {
641 	if (!lppaca_shared_proc(get_lppaca()))
642 		return 0;
643 
644 	if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL,
645 					&vcpudispatch_stats_proc_ops))
646 		pr_err("vcpudispatch_stats: error creating procfs file\n");
647 	else if (!proc_create("powerpc/vcpudispatch_stats_freq", 0600, NULL,
648 					&vcpudispatch_stats_freq_proc_ops))
649 		pr_err("vcpudispatch_stats_freq: error creating procfs file\n");
650 
651 	return 0;
652 }
653 
654 machine_device_initcall(pseries, vcpudispatch_stats_procfs_init);
655 #endif /* CONFIG_PPC_SPLPAR */
656 
657 void vpa_init(int cpu)
658 {
659 	int hwcpu = get_hard_smp_processor_id(cpu);
660 	unsigned long addr;
661 	long ret;
662 
663 	/*
664 	 * The spec says it "may be problematic" if CPU x registers the VPA of
665 	 * CPU y. We should never do that, but wail if we ever do.
666 	 */
667 	WARN_ON(cpu != smp_processor_id());
668 
669 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
670 		lppaca_of(cpu).vmxregs_in_use = 1;
671 
672 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
673 		lppaca_of(cpu).ebb_regs_in_use = 1;
674 
675 	addr = __pa(&lppaca_of(cpu));
676 	ret = register_vpa(hwcpu, addr);
677 
678 	if (ret) {
679 		pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
680 		       "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
681 		return;
682 	}
683 
684 #ifdef CONFIG_PPC_BOOK3S_64
685 	/*
686 	 * PAPR says this feature is SLB-Buffer but firmware never
687 	 * reports that.  All SPLPARs support the SLB shadow buffer.
688 	 */
689 	if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) {
690 		addr = __pa(paca_ptrs[cpu]->slb_shadow_ptr);
691 		ret = register_slb_shadow(hwcpu, addr);
692 		if (ret)
693 			pr_err("WARNING: SLB shadow buffer registration for "
694 			       "cpu %d (hw %d) of area %lx failed with %ld\n",
695 			       cpu, hwcpu, addr, ret);
696 	}
697 #endif /* CONFIG_PPC_BOOK3S_64 */
698 
699 	/*
700 	 * Register dispatch trace log, if one has been allocated.
701 	 */
702 	register_dtl_buffer(cpu);
703 }
704 
705 #ifdef CONFIG_PPC_BOOK3S_64
706 
707 static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
708 				     unsigned long vpn, unsigned long pa,
709 				     unsigned long rflags, unsigned long vflags,
710 				     int psize, int apsize, int ssize)
711 {
712 	unsigned long lpar_rc;
713 	unsigned long flags;
714 	unsigned long slot;
715 	unsigned long hpte_v, hpte_r;
716 
717 	if (!(vflags & HPTE_V_BOLTED))
718 		pr_devel("hpte_insert(group=%lx, vpn=%016lx, "
719 			 "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n",
720 			 hpte_group, vpn,  pa, rflags, vflags, psize);
721 
722 	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
723 	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
724 
725 	if (!(vflags & HPTE_V_BOLTED))
726 		pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
727 
728 	/* Now fill in the actual HPTE */
729 	/* Set CEC cookie to 0         */
730 	/* Zero page = 0               */
731 	/* I-cache Invalidate = 0      */
732 	/* I-cache synchronize = 0     */
733 	/* Exact = 0                   */
734 	flags = 0;
735 
736 	if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
737 		flags |= H_COALESCE_CAND;
738 
739 	lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot);
740 	if (unlikely(lpar_rc == H_PTEG_FULL)) {
741 		pr_devel("Hash table group is full\n");
742 		return -1;
743 	}
744 
745 	/*
746 	 * Since we try and ioremap PHBs we don't own, the pte insert
747 	 * will fail. However we must catch the failure in hash_page
748 	 * or we will loop forever, so return -2 in this case.
749 	 */
750 	if (unlikely(lpar_rc != H_SUCCESS)) {
751 		pr_err("Failed hash pte insert with error %ld\n", lpar_rc);
752 		return -2;
753 	}
754 	if (!(vflags & HPTE_V_BOLTED))
755 		pr_devel(" -> slot: %lu\n", slot & 7);
756 
757 	/* Because of iSeries, we have to pass down the secondary
758 	 * bucket bit here as well
759 	 */
760 	return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
761 }
762 
763 static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
764 
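/*
 * Remove a non-bolted entry from the given HPTE group, starting the search at
 * a pseudo-random slot. Returns the slot index that was freed, or -1 if every
 * entry in the group is bolted.
 */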
765 static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
766 {
767 	unsigned long slot_offset;
768 	unsigned long lpar_rc;
769 	int i;
770 	unsigned long dummy1, dummy2;
771 
772 	/* pick a random slot to start at */
773 	slot_offset = mftb() & 0x7;
774 
775 	for (i = 0; i < HPTES_PER_GROUP; i++) {
776 
777 		/* don't remove a bolted entry */
778 		lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
779 					   (0x1UL << 4), &dummy1, &dummy2);
780 		if (lpar_rc == H_SUCCESS)
781 			return i;
782 
783 		/*
784 		 * The test for adjunct partition is performed before the
785 		 * ANDCOND test.  H_RESOURCE may be returned, so we need to
786 		 * check for that as well.
787 		 */
788 		BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE);
789 
790 		slot_offset++;
791 		slot_offset &= 0x7;
792 	}
793 
794 	return -1;
795 }
796 
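/*
 * Fallback used when H_CLEAR_HPT is not available: read the hash page table
 * in batches of four entries and invalidate every valid entry that is not
 * part of the VRMA.
 */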
797 static void manual_hpte_clear_all(void)
798 {
799 	unsigned long size_bytes = 1UL << ppc64_pft_size;
800 	unsigned long hpte_count = size_bytes >> 4;
801 	struct {
802 		unsigned long pteh;
803 		unsigned long ptel;
804 	} ptes[4];
805 	long lpar_rc;
806 	unsigned long i, j;
807 
808 	/* Read in batches of 4,
809 	 * invalidate only valid entries not in the VRMA.
810 	 * hpte_count will be a multiple of 4.
811 	 */
812 	for (i = 0; i < hpte_count; i += 4) {
813 		lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes);
814 		if (lpar_rc != H_SUCCESS) {
815 			pr_info("Failed to read hash page table at %ld err %ld\n",
816 				i, lpar_rc);
817 			continue;
818 		}
819 		for (j = 0; j < 4; j++){
820 			if ((ptes[j].pteh & HPTE_V_VRMA_MASK) ==
821 				HPTE_V_VRMA_MASK)
822 				continue;
823 			if (ptes[j].pteh & HPTE_V_VALID)
824 				plpar_pte_remove_raw(0, i + j, 0,
825 					&(ptes[j].pteh), &(ptes[j].ptel));
826 		}
827 	}
828 }
829 
830 static int hcall_hpte_clear_all(void)
831 {
832 	int rc;
833 
834 	do {
835 		rc = plpar_hcall_norets(H_CLEAR_HPT);
836 	} while (rc == H_CONTINUE);
837 
838 	return rc;
839 }
840 
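/*
 * Clear the entire hash page table, preferring the H_CLEAR_HPT hcall and
 * falling back to the manual sweep above if the hypervisor rejects it.
 */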
841 static void pseries_hpte_clear_all(void)
842 {
843 	int rc;
844 
845 	rc = hcall_hpte_clear_all();
846 	if (rc != H_SUCCESS)
847 		manual_hpte_clear_all();
848 
849 #ifdef __LITTLE_ENDIAN__
850 	/*
851 	 * Reset exceptions to big endian.
852 	 *
853 	 * FIXME this is a hack for kexec, we need to reset the exception
854 	 * endian before starting the new kernel and this is a convenient place
855 	 * to do it.
856 	 *
857 	 * This is also called on boot when a fadump happens. In that case we
858 	 * must not change the exception endian mode.
859 	 */
860 	if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active())
861 		pseries_big_endian_exceptions();
862 #endif
863 }
864 
865 /*
866  * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
867  * the low 3 bits of flags happen to line up.  So no transform is needed.
868  * We can probably optimize here and assume the high bits of newpp are
869  * already zero.  For now I am paranoid.
870  */
871 static long pSeries_lpar_hpte_updatepp(unsigned long slot,
872 				       unsigned long newpp,
873 				       unsigned long vpn,
874 				       int psize, int apsize,
875 				       int ssize, unsigned long inv_flags)
876 {
877 	unsigned long lpar_rc;
878 	unsigned long flags;
879 	unsigned long want_v;
880 
881 	want_v = hpte_encode_avpn(vpn, psize, ssize);
882 
883 	flags = (newpp & 7) | H_AVPN;
884 	if (mmu_has_feature(MMU_FTR_KERNEL_RO))
885 		/* Move pp0 into bit 8 (IBM 55) */
886 		flags |= (newpp & HPTE_R_PP0) >> 55;
887 
888 	pr_devel("    update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
889 		 want_v, slot, flags, psize);
890 
891 	lpar_rc = plpar_pte_protect(flags, slot, want_v);
892 
893 	if (lpar_rc == H_NOT_FOUND) {
894 		pr_devel("not found !\n");
895 		return -1;
896 	}
897 
898 	pr_devel("ok\n");
899 
900 	BUG_ON(lpar_rc != H_SUCCESS);
901 
902 	return 0;
903 }
904 
905 static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_group)
906 {
907 	long lpar_rc;
908 	unsigned long i, j;
909 	struct {
910 		unsigned long pteh;
911 		unsigned long ptel;
912 	} ptes[4];
913 
914 	for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
915 
916 		lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
917 		if (lpar_rc != H_SUCCESS) {
918 			pr_info("Failed to read hash page table at %ld err %ld\n",
919 				hpte_group, lpar_rc);
920 			continue;
921 		}
922 
923 		for (j = 0; j < 4; j++) {
924 			if (HPTE_V_COMPARE(ptes[j].pteh, want_v) &&
925 			    (ptes[j].pteh & HPTE_V_VALID))
926 				return i + j;
927 		}
928 	}
929 
930 	return -1;
931 }
932 
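/*
 * Find the HPT slot of a bolted mapping for @vpn. Only the primary hash
 * group is searched, as bolted entries are always inserted there.
 */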
933 static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
934 {
935 	long slot;
936 	unsigned long hash;
937 	unsigned long want_v;
938 	unsigned long hpte_group;
939 
940 	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
941 	want_v = hpte_encode_avpn(vpn, psize, ssize);
942 
943 	/* Bolted entries are always in the primary group */
944 	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
945 	slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
946 	if (slot < 0)
947 		return -1;
948 	return hpte_group + slot;
949 }
950 
951 static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
952 					     unsigned long ea,
953 					     int psize, int ssize)
954 {
955 	unsigned long vpn;
956 	unsigned long lpar_rc, slot, vsid, flags;
957 
958 	vsid = get_kernel_vsid(ea, ssize);
959 	vpn = hpt_vpn(ea, vsid, ssize);
960 
961 	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
962 	BUG_ON(slot == -1);
963 
964 	flags = newpp & 7;
965 	if (mmu_has_feature(MMU_FTR_KERNEL_RO))
966 		/* Move pp0 into bit 8 (IBM 55) */
967 		flags |= (newpp & HPTE_R_PP0) >> 55;
968 
969 	lpar_rc = plpar_pte_protect(flags, slot, 0);
970 
971 	BUG_ON(lpar_rc != H_SUCCESS);
972 }
973 
974 static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
975 					 int psize, int apsize,
976 					 int ssize, int local)
977 {
978 	unsigned long want_v;
979 	unsigned long lpar_rc;
980 	unsigned long dummy1, dummy2;
981 
982 	pr_devel("    inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n",
983 		 slot, vpn, psize, local);
984 
985 	want_v = hpte_encode_avpn(vpn, psize, ssize);
986 	lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2);
987 	if (lpar_rc == H_NOT_FOUND)
988 		return;
989 
990 	BUG_ON(lpar_rc != H_SUCCESS);
991 }
992 
993 
994 /*
995  * As defined in PAPR section 14.5.4.1.8, the control mask doesn't
996  * include the returned reference and change bits from the
997  * processed PTE.
998  */
999 #define HBLKR_AVPN		0x0100000000000000UL
1000 #define HBLKR_CTRL_MASK		0xf800000000000000UL
1001 #define HBLKR_CTRL_SUCCESS	0x8000000000000000UL
1002 #define HBLKR_CTRL_ERRNOTFOUND	0x8800000000000000UL
1003 #define HBLKR_CTRL_ERRBUSY	0xa000000000000000UL
1004 
1005 /*
1006  * Return true if this block size is supported for the specified segment
1007  * base page size and actual page size.
1008  *
1009  * Currently, only a block size of 8 is supported.
1010  */
1011 static inline bool is_supported_hlbkrm(int bpsize, int psize)
1012 {
1013 	return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE);
1014 }
1015 
1016 /**
1017  * H_BLOCK_REMOVE caller.
1018  * @idx should point to the latest @param entry set with a PTEX.
1019  * If a PTE cannot be processed because another CPU has already locked its
1020  * group, those entries are put back in @param starting at index 1.
1021  * If entries have to be retried and @retry_busy is set to true, these entries
1022  * are retried until they succeed. If @retry_busy is set to false, the return
1023  * value is the number of entries yet to be processed.
1024  */
1025 static unsigned long call_block_remove(unsigned long idx, unsigned long *param,
1026 				       bool retry_busy)
1027 {
1028 	unsigned long i, rc, new_idx;
1029 	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
1030 
1031 	if (idx < 2) {
1032 		pr_warn("Unexpected empty call to H_BLOCK_REMOVE");
1033 		return 0;
1034 	}
1035 again:
1036 	new_idx = 0;
1037 	if (idx > PLPAR_HCALL9_BUFSIZE) {
1038 		pr_err("Too many PTEs (%lu) for H_BLOCK_REMOVE", idx);
1039 		idx = PLPAR_HCALL9_BUFSIZE;
1040 	} else if (idx < PLPAR_HCALL9_BUFSIZE)
1041 		param[idx] = HBR_END;
1042 
1043 	rc = plpar_hcall9(H_BLOCK_REMOVE, retbuf,
1044 			  param[0], /* AVA */
1045 			  param[1],  param[2],  param[3],  param[4], /* TS0-7 */
1046 			  param[5],  param[6],  param[7],  param[8]);
1047 	if (rc == H_SUCCESS)
1048 		return 0;
1049 
1050 	BUG_ON(rc != H_PARTIAL);
1051 
1052 	/* Check that the unprocessed entries were 'not found' or 'busy' */
1053 	for (i = 0; i < idx-1; i++) {
1054 		unsigned long ctrl = retbuf[i] & HBLKR_CTRL_MASK;
1055 
1056 		if (ctrl == HBLKR_CTRL_ERRBUSY) {
1057 			param[++new_idx] = param[i+1];
1058 			continue;
1059 		}
1060 
1061 		BUG_ON(ctrl != HBLKR_CTRL_SUCCESS
1062 		       && ctrl != HBLKR_CTRL_ERRNOTFOUND);
1063 	}
1064 
1065 	/*
1066 	 * If there were entries found busy, retry these entries if requested,
1067 	 * or if all the entries have to be retried.
1068 	 */
1069 	if (new_idx && (retry_busy || new_idx == (PLPAR_HCALL9_BUFSIZE-1))) {
1070 		idx = new_idx + 1;
1071 		goto again;
1072 	}
1073 
1074 	return new_idx;
1075 }
1076 
1077 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1078 /*
1079  * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
1080  * to make sure that we avoid bouncing the hypervisor tlbie lock.
1081  */
1082 #define PPC64_HUGE_HPTE_BATCH 12
1083 
1084 static void hugepage_block_invalidate(unsigned long *slot, unsigned long *vpn,
1085 				      int count, int psize, int ssize)
1086 {
1087 	unsigned long param[PLPAR_HCALL9_BUFSIZE];
1088 	unsigned long shift, current_vpgb, vpgb;
1089 	int i, pix = 0;
1090 
1091 	shift = mmu_psize_defs[psize].shift;
1092 
1093 	for (i = 0; i < count; i++) {
1094 		/*
1095 		 * Shift 3 more bits to the right to get an
1096 		 * 8-page aligned virtual address.
1097 		 */
1098 		vpgb = (vpn[i] >> (shift - VPN_SHIFT + 3));
1099 		if (!pix || vpgb != current_vpgb) {
1100 			/*
1101 			 * Need to start a new 8-page block, flush
1102 			 * the current one if needed.
1103 			 */
1104 			if (pix)
1105 				(void)call_block_remove(pix, param, true);
1106 			current_vpgb = vpgb;
1107 			param[0] = hpte_encode_avpn(vpn[i], psize, ssize);
1108 			pix = 1;
1109 		}
1110 
1111 		param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot[i];
1112 		if (pix == PLPAR_HCALL9_BUFSIZE) {
1113 			pix = call_block_remove(pix, param, false);
1114 			/*
1115 			 * pix = 0 means that all the entries were
1116 			 * removed, so we can start a new block.
1117 			 * Otherwise, this means that there are entries
1118 			 * to retry, and pix points to the latest one, so
1119 			 * we should increment it and try to continue
1120 			 * the same block.
1121 			 */
1122 			if (pix)
1123 				pix++;
1124 		}
1125 	}
1126 	if (pix)
1127 		(void)call_block_remove(pix, param, true);
1128 }
1129 
1130 static void hugepage_bulk_invalidate(unsigned long *slot, unsigned long *vpn,
1131 				     int count, int psize, int ssize)
1132 {
1133 	unsigned long param[PLPAR_HCALL9_BUFSIZE];
1134 	int i = 0, pix = 0, rc;
1135 
1136 	for (i = 0; i < count; i++) {
1137 
1138 		if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
1139 			pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
1140 						     ssize, 0);
1141 		} else {
1142 			param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
1143 			param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
1144 			pix += 2;
1145 			if (pix == 8) {
1146 				rc = plpar_hcall9(H_BULK_REMOVE, param,
1147 						  param[0], param[1], param[2],
1148 						  param[3], param[4], param[5],
1149 						  param[6], param[7]);
1150 				BUG_ON(rc != H_SUCCESS);
1151 				pix = 0;
1152 			}
1153 		}
1154 	}
1155 	if (pix) {
1156 		param[pix] = HBR_END;
1157 		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
1158 				  param[2], param[3], param[4], param[5],
1159 				  param[6], param[7]);
1160 		BUG_ON(rc != H_SUCCESS);
1161 	}
1162 }
1163 
1164 static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
1165 						      unsigned long *vpn,
1166 						      int count, int psize,
1167 						      int ssize)
1168 {
1169 	unsigned long flags = 0;
1170 	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
1171 
1172 	if (lock_tlbie)
1173 		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
1174 
1175 	/* Assuming THP size is 16M */
1176 	if (is_supported_hlbkrm(psize, MMU_PAGE_16M))
1177 		hugepage_block_invalidate(slot, vpn, count, psize, ssize);
1178 	else
1179 		hugepage_bulk_invalidate(slot, vpn, count, psize, ssize);
1180 
1181 	if (lock_tlbie)
1182 		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
1183 }
1184 
1185 static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
1186 					     unsigned long addr,
1187 					     unsigned char *hpte_slot_array,
1188 					     int psize, int ssize, int local)
1189 {
1190 	int i, index = 0;
1191 	unsigned long s_addr = addr;
1192 	unsigned int max_hpte_count, valid;
1193 	unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
1194 	unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
1195 	unsigned long shift, hidx, vpn = 0, hash, slot;
1196 
1197 	shift = mmu_psize_defs[psize].shift;
1198 	max_hpte_count = 1U << (PMD_SHIFT - shift);
1199 
1200 	for (i = 0; i < max_hpte_count; i++) {
1201 		valid = hpte_valid(hpte_slot_array, i);
1202 		if (!valid)
1203 			continue;
1204 		hidx =  hpte_hash_index(hpte_slot_array, i);
1205 
1206 		/* get the vpn */
1207 		addr = s_addr + (i * (1ul << shift));
1208 		vpn = hpt_vpn(addr, vsid, ssize);
1209 		hash = hpt_hash(vpn, shift, ssize);
1210 		if (hidx & _PTEIDX_SECONDARY)
1211 			hash = ~hash;
1212 
1213 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
1214 		slot += hidx & _PTEIDX_GROUP_IX;
1215 
1216 		slot_array[index] = slot;
1217 		vpn_array[index] = vpn;
1218 		if (index == PPC64_HUGE_HPTE_BATCH - 1) {
1219 			/*
1220 			 * Now do a bulk invalidate
1221 			 */
1222 			__pSeries_lpar_hugepage_invalidate(slot_array,
1223 							   vpn_array,
1224 							   PPC64_HUGE_HPTE_BATCH,
1225 							   psize, ssize);
1226 			index = 0;
1227 		} else
1228 			index++;
1229 	}
1230 	if (index)
1231 		__pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
1232 						   index, psize, ssize);
1233 }
1234 #else
1235 static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
1236 					     unsigned long addr,
1237 					     unsigned char *hpte_slot_array,
1238 					     int psize, int ssize, int local)
1239 {
1240 	WARN(1, "%s called without THP support\n", __func__);
1241 }
1242 #endif
1243 
1244 static int pSeries_lpar_hpte_removebolted(unsigned long ea,
1245 					  int psize, int ssize)
1246 {
1247 	unsigned long vpn;
1248 	unsigned long slot, vsid;
1249 
1250 	vsid = get_kernel_vsid(ea, ssize);
1251 	vpn = hpt_vpn(ea, vsid, ssize);
1252 
1253 	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
1254 	if (slot == -1)
1255 		return -ENOENT;
1256 
1257 	/*
1258 	 * lpar doesn't use the passed actual page size
1259 	 */
1260 	pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
1261 	return 0;
1262 }
1263 
1264 
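/*
 * Compute the global HPT slot number for the given vpn/subpage index,
 * accounting for entries hashed into the secondary group.
 */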
1265 static inline unsigned long compute_slot(real_pte_t pte,
1266 					 unsigned long vpn,
1267 					 unsigned long index,
1268 					 unsigned long shift,
1269 					 int ssize)
1270 {
1271 	unsigned long slot, hash, hidx;
1272 
1273 	hash = hpt_hash(vpn, shift, ssize);
1274 	hidx = __rpte_to_hidx(pte, index);
1275 	if (hidx & _PTEIDX_SECONDARY)
1276 		hash = ~hash;
1277 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
1278 	slot += hidx & _PTEIDX_GROUP_IX;
1279 	return slot;
1280 }
1281 
1282 /**
1283  * The hcall H_BLOCK_REMOVE implies that the virtual pages to be processed are
1284  * "all within the same naturally aligned 8 page virtual address block".
1285  */
1286 static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch,
1287 			    unsigned long *param)
1288 {
1289 	unsigned long vpn;
1290 	unsigned long i, pix = 0;
1291 	unsigned long index, shift, slot, current_vpgb, vpgb;
1292 	real_pte_t pte;
1293 	int psize, ssize;
1294 
1295 	psize = batch->psize;
1296 	ssize = batch->ssize;
1297 
1298 	for (i = 0; i < number; i++) {
1299 		vpn = batch->vpn[i];
1300 		pte = batch->pte[i];
1301 		pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
1302 			/*
1303 			 * Shift 3 more bits to the right to get an
1304 			 * 8-page aligned virtual address.
1305 			 */
1306 			vpgb = (vpn >> (shift - VPN_SHIFT + 3));
1307 			if (!pix || vpgb != current_vpgb) {
1308 				/*
1309 				 * Need to start a new 8-page block, flush
1310 				 * the current one if needed.
1311 				 */
1312 				if (pix)
1313 					(void)call_block_remove(pix, param,
1314 								true);
1315 				current_vpgb = vpgb;
1316 				param[0] = hpte_encode_avpn(vpn, psize,
1317 							    ssize);
1318 				pix = 1;
1319 			}
1320 
1321 			slot = compute_slot(pte, vpn, index, shift, ssize);
1322 			param[pix++] = HBR_REQUEST | HBLKR_AVPN | slot;
1323 
1324 			if (pix == PLPAR_HCALL9_BUFSIZE) {
1325 				pix = call_block_remove(pix, param, false);
1326 				/*
1327 				 * removed, so we can start a new block.
1328 				 * Otherwise, this means that there are entries
1329 				 * to retry, and pix points to the latest one, so
1330 				 * to retry, and pix points to latest one, so
1331 				 * we should increment it and try to continue
1332 				 * the same block.
1333 				 */
1334 				if (pix)
1335 					pix++;
1336 			}
1337 		} pte_iterate_hashed_end();
1338 	}
1339 
1340 	if (pix)
1341 		(void)call_block_remove(pix, param, true);
1342 }
1343 
1344 /*
1345  * TLB Block Invalidate Characteristics
1346  *
1347  * These characteristics define the size of the block the hcall H_BLOCK_REMOVE
1348  * is able to process for each pair of segment base page size and actual page size.
1349  *
1350  * The ibm,get-system-parameter RTAS call returns a buffer with the
1351  * following layout:
1352  *
1353  * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ]
1354  * -----------------
1355  * TLB Block Invalidate Specifiers:
1356  * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ]
1357  * [ 1 byte Number of page sizes (N) that are supported for the specified
1358  *          TLB invalidate block size ]
1359  * [ 1 byte Encoded segment base page size and actual page size
1360  *          MSB=0 means 4k segment base page size and actual page size
1361  *          MSB=1 the penc value in mmu_psize_def ]
1362  * ...
1363  * -----------------
1364  * Next TLB Block Invalidate Specifiers...
1365  * -----------------
1366  * [ 0 ]
1367  */
1368 static inline void set_hblkrm_bloc_size(int bpsize, int psize,
1369 					unsigned int block_size)
1370 {
1371 	if (block_size > hblkrm_size[bpsize][psize])
1372 		hblkrm_size[bpsize][psize] = block_size;
1373 }
1374 
1375 /*
1376  * Decode the Encoded segment base page size and actual page size.
1377  * PAPR specifies:
1378  *   - bit 7 is the L bit
1379  *   - bits 0-5 are the penc value
1380  * If the L bit is 0, this means 4K segment base page size and actual page size;
1381  * otherwise the penc value should be read.
1382  */
1383 #define HBLKRM_L_MASK		0x80
1384 #define HBLKRM_PENC_MASK	0x3f
1385 static inline void __init check_lp_set_hblkrm(unsigned int lp,
1386 					      unsigned int block_size)
1387 {
1388 	unsigned int bpsize, psize;
1389 
1390 	/* First, check the L bit, if not set, this means 4K */
1391 	if ((lp & HBLKRM_L_MASK) == 0) {
1392 		set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size);
1393 		return;
1394 	}
1395 
1396 	lp &= HBLKRM_PENC_MASK;
1397 	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) {
1398 		struct mmu_psize_def *def = &mmu_psize_defs[bpsize];
1399 
1400 		for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
1401 			if (def->penc[psize] == lp) {
1402 				set_hblkrm_bloc_size(bpsize, psize, block_size);
1403 				return;
1404 			}
1405 		}
1406 	}
1407 }
1408 
1409 #define SPLPAR_TLB_BIC_TOKEN		50
1410 
1411 /*
1412  * The size of the TLB Block Invalidate Characteristics is variable. But at the
1413  * maximum it will be the number of possible page sizes *2 + 10 bytes.
1414  * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size
1415  * (128 bytes) for the buffer to get plenty of space.
1416  */
1417 #define SPLPAR_TLB_BIC_MAXLENGTH	128
1418 
1419 void __init pseries_lpar_read_hblkrm_characteristics(void)
1420 {
1421 	const s32 token = rtas_token("ibm,get-system-parameter");
1422 	unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH];
1423 	int call_status, len, idx, bpsize;
1424 
1425 	if (!firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
1426 		return;
1427 
1428 	do {
1429 		spin_lock(&rtas_data_buf_lock);
1430 		memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
1431 		call_status = rtas_call(token, 3, 1, NULL, SPLPAR_TLB_BIC_TOKEN,
1432 					__pa(rtas_data_buf), RTAS_DATA_BUF_SIZE);
1433 		memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH);
1434 		local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0';
1435 		spin_unlock(&rtas_data_buf_lock);
1436 	} while (rtas_busy_delay(call_status));
1437 
1438 	if (call_status != 0) {
1439 		pr_warn("%s %s Error calling get-system-parameter (0x%x)\n",
1440 			__FILE__, __func__, call_status);
1441 		return;
1442 	}
1443 
1444 	/*
1445 	 * The first two (2) bytes of the data in the buffer are the length of
1446 	 * the returned data, not counting these first two (2) bytes.
1447 	 */
1448 	len = be16_to_cpu(*((u16 *)local_buffer)) + 2;
1449 	if (len > SPLPAR_TLB_BIC_MAXLENGTH) {
1450 		pr_warn("%s too large returned buffer %d", __func__, len);
1451 		return;
1452 	}
1453 
1454 	idx = 2;
1455 	while (idx < len) {
1456 		u8 block_shift = local_buffer[idx++];
1457 		u32 block_size;
1458 		unsigned int npsize;
1459 
1460 		if (!block_shift)
1461 			break;
1462 
1463 		block_size = 1 << block_shift;
1464 
1465 		for (npsize = local_buffer[idx++];
1466 		     npsize > 0 && idx < len; npsize--)
1467 			check_lp_set_hblkrm((unsigned int) local_buffer[idx++],
1468 					    block_size);
1469 	}
1470 
1471 	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
1472 		for (idx = 0; idx < MMU_PAGE_COUNT; idx++)
1473 			if (hblkrm_size[bpsize][idx])
1474 				pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d",
1475 					bpsize, idx, hblkrm_size[bpsize][idx]);
1476 }
1477 
1478 /*
1479  * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
1480  * lock.
1481  */
1482 static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
1483 {
1484 	unsigned long vpn;
1485 	unsigned long i, pix, rc;
1486 	unsigned long flags = 0;
1487 	struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
1488 	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
1489 	unsigned long param[PLPAR_HCALL9_BUFSIZE];
1490 	unsigned long index, shift, slot;
1491 	real_pte_t pte;
1492 	int psize, ssize;
1493 
1494 	if (lock_tlbie)
1495 		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
1496 
1497 	if (is_supported_hlbkrm(batch->psize, batch->psize)) {
1498 		do_block_remove(number, batch, param);
1499 		goto out;
1500 	}
1501 
1502 	psize = batch->psize;
1503 	ssize = batch->ssize;
1504 	pix = 0;
1505 	for (i = 0; i < number; i++) {
1506 		vpn = batch->vpn[i];
1507 		pte = batch->pte[i];
1508 		pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
1509 			slot = compute_slot(pte, vpn, index, shift, ssize);
1510 			if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
1511 				/*
1512 				 * lpar doesn't use the passed actual page size
1513 				 */
1514 				pSeries_lpar_hpte_invalidate(slot, vpn, psize,
1515 							     0, ssize, local);
1516 			} else {
1517 				param[pix] = HBR_REQUEST | HBR_AVPN | slot;
1518 				param[pix+1] = hpte_encode_avpn(vpn, psize,
1519 								ssize);
1520 				pix += 2;
1521 				if (pix == 8) {
1522 					rc = plpar_hcall9(H_BULK_REMOVE, param,
1523 						param[0], param[1], param[2],
1524 						param[3], param[4], param[5],
1525 						param[6], param[7]);
1526 					BUG_ON(rc != H_SUCCESS);
1527 					pix = 0;
1528 				}
1529 			}
1530 		} pte_iterate_hashed_end();
1531 	}
1532 	if (pix) {
1533 		param[pix] = HBR_END;
1534 		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
1535 				  param[2], param[3], param[4], param[5],
1536 				  param[6], param[7]);
1537 		BUG_ON(rc != H_SUCCESS);
1538 	}
1539 
1540 out:
1541 	if (lock_tlbie)
1542 		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
1543 }
1544 
1545 static int __init disable_bulk_remove(char *str)
1546 {
1547 	if (strcmp(str, "off") == 0 &&
1548 	    firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
1549 		pr_info("Disabling BULK_REMOVE firmware feature");
1550 		powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE;
1551 	}
1552 	return 1;
1553 }
1554 
1555 __setup("bulk_remove=", disable_bulk_remove);
1556 
1557 #define HPT_RESIZE_TIMEOUT	10000 /* ms */
1558 
1559 struct hpt_resize_state {
1560 	unsigned long shift;
1561 	int commit_rc;
1562 };
1563 
1564 static int pseries_lpar_resize_hpt_commit(void *data)
1565 {
1566 	struct hpt_resize_state *state = data;
1567 
1568 	state->commit_rc = plpar_resize_hpt_commit(0, state->shift);
1569 	if (state->commit_rc != H_SUCCESS)
1570 		return -EIO;
1571 
1572 	/* Hypervisor has transitioned the HTAB, update our globals */
1573 	ppc64_pft_size = state->shift;
1574 	htab_size_bytes = 1UL << ppc64_pft_size;
1575 	htab_hash_mask = (htab_size_bytes >> 7) - 1;
1576 
1577 	return 0;
1578 }
1579 
1580 /*
1581  * Must be called in process context. The caller must hold the
1582  * cpus_lock.
1583  */
1584 static int pseries_lpar_resize_hpt(unsigned long shift)
1585 {
1586 	struct hpt_resize_state state = {
1587 		.shift = shift,
1588 		.commit_rc = H_FUNCTION,
1589 	};
1590 	unsigned int delay, total_delay = 0;
1591 	int rc;
1592 	ktime_t t0, t1, t2;
1593 
1594 	might_sleep();
1595 
1596 	if (!firmware_has_feature(FW_FEATURE_HPT_RESIZE))
1597 		return -ENODEV;
1598 
1599 	pr_info("Attempting to resize HPT to shift %lu\n", shift);
1600 
1601 	t0 = ktime_get();
1602 
1603 	rc = plpar_resize_hpt_prepare(0, shift);
1604 	while (H_IS_LONG_BUSY(rc)) {
1605 		delay = get_longbusy_msecs(rc);
1606 		total_delay += delay;
1607 		if (total_delay > HPT_RESIZE_TIMEOUT) {
1608 			/* prepare with shift==0 cancels an in-progress resize */
1609 			rc = plpar_resize_hpt_prepare(0, 0);
1610 			if (rc != H_SUCCESS)
1611 				pr_warn("Unexpected error %d cancelling timed out HPT resize\n",
1612 				       rc);
1613 			return -ETIMEDOUT;
1614 		}
1615 		msleep(delay);
1616 		rc = plpar_resize_hpt_prepare(0, shift);
1617 	};
1618 
1619 	switch (rc) {
1620 	case H_SUCCESS:
1621 		/* Continue on */
1622 		break;
1623 
1624 	case H_PARAMETER:
1625 		pr_warn("Invalid argument from H_RESIZE_HPT_PREPARE\n");
1626 		return -EINVAL;
1627 	case H_RESOURCE:
1628 		pr_warn("Operation not permitted from H_RESIZE_HPT_PREPARE\n");
1629 		return -EPERM;
1630 	default:
1631 		pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc);
1632 		return -EIO;
1633 	}
1634 
1635 	t1 = ktime_get();
1636 
1637 	rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
1638 				     &state, NULL);
1639 
1640 	t2 = ktime_get();
1641 
1642 	if (rc != 0) {
1643 		switch (state.commit_rc) {
1644 		case H_PTEG_FULL:
1645 			return -ENOSPC;
1646 
1647 		default:
1648 			pr_warn("Unexpected error %d from H_RESIZE_HPT_COMMIT\n",
1649 				state.commit_rc);
1650 			return -EIO;
1651 		};
1652 	}
1653 
1654 	pr_info("HPT resize to shift %lu complete (%lld ms / %lld ms)\n",
1655 		shift, (long long) ktime_ms_delta(t1, t0),
1656 		(long long) ktime_ms_delta(t2, t1));
1657 
1658 	return 0;
1659 }
1660 
1661 static int pseries_lpar_register_process_table(unsigned long base,
1662 			unsigned long page_size, unsigned long table_size)
1663 {
1664 	long rc;
1665 	unsigned long flags = 0;
1666 
1667 	if (table_size)
1668 		flags |= PROC_TABLE_NEW;
1669 	if (radix_enabled())
1670 		flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE;
1671 	else
1672 		flags |= PROC_TABLE_HPT_SLB;
1673 	for (;;) {
1674 		rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
1675 					page_size, table_size);
1676 		if (!H_IS_LONG_BUSY(rc))
1677 			break;
1678 		mdelay(get_longbusy_msecs(rc));
1679 	}
1680 	if (rc != H_SUCCESS) {
1681 		pr_err("Failed to register process table (rc=%ld)\n", rc);
1682 		BUG();
1683 	}
1684 	return rc;
1685 }
1686 
1687 void __init hpte_init_pseries(void)
1688 {
1689 	mmu_hash_ops.hpte_invalidate	 = pSeries_lpar_hpte_invalidate;
1690 	mmu_hash_ops.hpte_updatepp	 = pSeries_lpar_hpte_updatepp;
1691 	mmu_hash_ops.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
1692 	mmu_hash_ops.hpte_insert	 = pSeries_lpar_hpte_insert;
1693 	mmu_hash_ops.hpte_remove	 = pSeries_lpar_hpte_remove;
1694 	mmu_hash_ops.hpte_removebolted   = pSeries_lpar_hpte_removebolted;
1695 	mmu_hash_ops.flush_hash_range	 = pSeries_lpar_flush_hash_range;
1696 	mmu_hash_ops.hpte_clear_all      = pseries_hpte_clear_all;
1697 	mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
1698 
1699 	if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
1700 		mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
1701 
1702 	/*
1703 	 * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
1704 	 * to inform the hypervisor that we wish to use the HPT.
1705 	 */
1706 	if (cpu_has_feature(CPU_FTR_ARCH_300))
1707 		pseries_lpar_register_process_table(0, 0, 0);
1708 }
1709 
1710 void radix_init_pseries(void)
1711 {
1712 	pr_info("Using radix MMU under hypervisor\n");
1713 
1714 	pseries_lpar_register_process_table(__pa(process_tb),
1715 						0, PRTB_SIZE_SHIFT - 12);
1716 }
1717 
1718 #ifdef CONFIG_PPC_SMLPAR
1719 #define CMO_FREE_HINT_DEFAULT 1
1720 static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
1721 
1722 static int __init cmo_free_hint(char *str)
1723 {
1724 	char *parm;
1725 	parm = strstrip(str);
1726 
1727 	if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) {
1728 		pr_info("%s: CMO free page hinting is not active.\n", __func__);
1729 		cmo_free_hint_flag = 0;
1730 		return 1;
1731 	}
1732 
1733 	cmo_free_hint_flag = 1;
1734 	pr_info("%s: CMO free page hinting is active.\n", __func__);
1735 
1736 	if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0)
1737 		return 1;
1738 
1739 	return 0;
1740 }
1741 
1742 __setup("cmo_free_hint=", cmo_free_hint);
1743 
1744 static void pSeries_set_page_state(struct page *page, int order,
1745 				   unsigned long state)
1746 {
1747 	int i, j;
1748 	unsigned long cmo_page_sz, addr;
1749 
1750 	cmo_page_sz = cmo_get_page_size();
1751 	addr = __pa((unsigned long)page_address(page));
1752 
1753 	for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) {
1754 		for (j = 0; j < PAGE_SIZE; j += cmo_page_sz)
1755 			plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0);
1756 	}
1757 }
1758 
1759 void arch_free_page(struct page *page, int order)
1760 {
1761 	if (radix_enabled())
1762 		return;
1763 	if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
1764 		return;
1765 
1766 	pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED);
1767 }
1768 EXPORT_SYMBOL(arch_free_page);
1769 
1770 #endif /* CONFIG_PPC_SMLPAR */
1771 #endif /* CONFIG_PPC_BOOK3S_64 */
1772 
1773 #ifdef CONFIG_TRACEPOINTS
1774 #ifdef CONFIG_JUMP_LABEL
1775 struct static_key hcall_tracepoint_key = STATIC_KEY_INIT;
1776 
1777 int hcall_tracepoint_regfunc(void)
1778 {
1779 	static_key_slow_inc(&hcall_tracepoint_key);
1780 	return 0;
1781 }
1782 
1783 void hcall_tracepoint_unregfunc(void)
1784 {
1785 	static_key_slow_dec(&hcall_tracepoint_key);
1786 }
1787 #else
1788 /*
1789  * We optimise our hcall path by placing hcall_tracepoint_refcount
1790  * directly in the TOC so we can check if the hcall tracepoints are
1791  * enabled via a single load.
1792  */
1793 
1794 /* NB: reg/unreg are called while guarded with the tracepoints_mutex */
1795 extern long hcall_tracepoint_refcount;
1796 
1797 int hcall_tracepoint_regfunc(void)
1798 {
1799 	hcall_tracepoint_refcount++;
1800 	return 0;
1801 }
1802 
1803 void hcall_tracepoint_unregfunc(void)
1804 {
1805 	hcall_tracepoint_refcount--;
1806 }
1807 #endif
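/*
 * Either way, the hcall entry path (hvCall.S) is expected to test the static
 * key or hcall_tracepoint_refcount and only branch into
 * __trace_hcall_entry()/__trace_hcall_exit() below when tracing is enabled.
 */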
1808 
1809 /*
1810  * Since the tracing code might execute hcalls, we need to guard against
1811  * recursion. One example of this is spinlocks calling H_YIELD on
1812  * shared processor partitions.
1813  */
1814 static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
1815 
1816 
1817 void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
1818 {
1819 	unsigned long flags;
1820 	unsigned int *depth;
1821 
1822 	/*
1823 	 * We cannot call tracepoints inside RCU idle regions, which
1824 	 * means we must not trace H_CEDE.
1825 	 */
1826 	if (opcode == H_CEDE)
1827 		return;
1828 
1829 	local_irq_save(flags);
1830 
1831 	depth = this_cpu_ptr(&hcall_trace_depth);
1832 
1833 	if (*depth)
1834 		goto out;
1835 
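	/*
	 * Raise the per-cpu depth before firing the tracepoint so that any
	 * hcall made by the tracing code itself takes the early exit above
	 * instead of recursing.
	 */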
1836 	(*depth)++;
1837 	preempt_disable();
1838 	trace_hcall_entry(opcode, args);
1839 	(*depth)--;
1840 
1841 out:
1842 	local_irq_restore(flags);
1843 }
1844 
1845 void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf)
1846 {
1847 	unsigned long flags;
1848 	unsigned int *depth;
1849 
1850 	if (opcode == H_CEDE)
1851 		return;
1852 
1853 	local_irq_save(flags);
1854 
1855 	depth = this_cpu_ptr(&hcall_trace_depth);
1856 
1857 	if (*depth)
1858 		goto out;
1859 
1860 	(*depth)++;
1861 	trace_hcall_exit(opcode, retval, retbuf);
1862 	preempt_enable();
1863 	(*depth)--;
1864 
1865 out:
1866 	local_irq_restore(flags);
1867 }
1868 #endif
1869 
1870 /**
1871  * h_get_mpp() - H_GET_MPP hcall returns info in 7 output parameters,
1872  *               which are unpacked into @mpp_data.
1873  */
1874 int h_get_mpp(struct hvcall_mpp_data *mpp_data)
1875 {
1876 	int rc;
1877 	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
1878 
1879 	rc = plpar_hcall9(H_GET_MPP, retbuf);
1880 
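	/*
	 * H_GET_MPP returns its data in retbuf[0..6]; retbuf[2] and retbuf[3]
	 * pack several fields each, which are shifted and masked apart below.
	 */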
1881 	mpp_data->entitled_mem = retbuf[0];
1882 	mpp_data->mapped_mem = retbuf[1];
1883 
1884 	mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
1885 	mpp_data->pool_num = retbuf[2] & 0xffff;
1886 
1887 	mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
1888 	mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
1889 	mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL;
1890 
1891 	mpp_data->pool_size = retbuf[4];
1892 	mpp_data->loan_request = retbuf[5];
1893 	mpp_data->backing_mem = retbuf[6];
1894 
1895 	return rc;
1896 }
1897 EXPORT_SYMBOL(h_get_mpp);
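/*
 * Minimal usage sketch (hypothetical caller, not part of this file):
 *
 *	struct hvcall_mpp_data mpp;
 *
 *	if (h_get_mpp(&mpp) == H_SUCCESS)
 *		pr_info("entitled_mem=%lu backing_mem=%lu\n",
 *			mpp.entitled_mem, mpp.backing_mem);
 */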
1898 
1899 int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
1900 {
1901 	int rc;
1902 	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };
1903 
1904 	rc = plpar_hcall9(H_GET_MPP_X, retbuf);
1905 
1906 	mpp_x_data->coalesced_bytes = retbuf[0];
1907 	mpp_x_data->pool_coalesced_bytes = retbuf[1];
1908 	mpp_x_data->pool_purr_cycles = retbuf[2];
1909 	mpp_x_data->pool_spurr_cycles = retbuf[3];
1910 
1911 	return rc;
1912 }
1913 
1914 static unsigned long vsid_unscramble(unsigned long vsid, int ssize)
1915 {
1916 	unsigned long protovsid;
1917 	unsigned long va_bits = VA_BITS;
1918 	unsigned long modinv, vsid_modulus;
1919 	unsigned long max_mod_inv, tmp_modinv;
1920 
1921 	if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
1922 		va_bits = 65;
1923 
1924 	if (ssize == MMU_SEGSIZE_256M) {
1925 		modinv = VSID_MULINV_256M;
1926 		vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1);
1927 	} else {
1928 		modinv = VSID_MULINV_1T;
1929 		vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1);
1930 	}
1931 
1932 	/*
1933 	 * A vsid outside our range cannot be unscrambled; return 0.
1934 	 */
1935 	if (vsid >= vsid_modulus)
1936 		return 0;
1937 
1938 	/*
1939 	 * If modinv is the modular multiplicative inverse of (x % vsid_modulus)
1940 	 * and vsid = (protovsid * x) % vsid_modulus, then we say:
1941 	 *   protovsid = (vsid * modinv) % vsid_modulus
1942 	 */
1943 
1944 	/* Check if (vsid * modinv) would overflow 63 bits */
1945 	max_mod_inv = 0x7fffffffffffffffull / vsid;
1946 	if (modinv < max_mod_inv)
1947 		return (vsid * modinv) % vsid_modulus;
1948 
1949 	tmp_modinv = modinv/max_mod_inv;
1950 	modinv %= max_mod_inv;
1951 
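	/*
	 * modinv was too large for vsid * modinv to fit in 63 bits, so it was
	 * split as tmp_modinv * max_mod_inv + modinv (the remainder is now in
	 * modinv) and each partial product is reduced modulo vsid_modulus:
	 *
	 *   protovsid = (((vsid * max_mod_inv) % M) * tmp_modinv
	 *                + vsid * modinv) % M,  where M = vsid_modulus
	 */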
1952 	protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus;
1953 	protovsid = (protovsid + vsid * modinv) % vsid_modulus;
1954 
1955 	return protovsid;
1956 }
1957 
1958 static int __init reserve_vrma_context_id(void)
1959 {
1960 	unsigned long protovsid;
1961 
1962 	/*
1963 	 * Reserve context ids which map to reserved virtual addresses. For now
1964 	 * we only reserve the context id which maps to the VRMA VSID. We ignore
1965 	 * the addresses in "ibm,adjunct-virtual-addresses" because we don't
1966 	 * enable adjunct support via the "ibm,client-architecture-support"
1967 	 * interface.
1968 	 */
1969 	protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T);
1970 	hash__reserve_context_id(protovsid >> ESID_BITS_1T);
1971 	return 0;
1972 }
1973 machine_device_initcall(pseries, reserve_vrma_context_id);
1974 
1975 #ifdef CONFIG_DEBUG_FS
1976 /* debugfs file interface for vpa data */
1977 static ssize_t vpa_file_read(struct file *filp, char __user *buf, size_t len,
1978 			      loff_t *pos)
1979 {
1980 	int cpu = (long)filp->private_data;
1981 	struct lppaca *lppaca = &lppaca_of(cpu);
1982 
1983 	return simple_read_from_buffer(buf, len, pos, lppaca,
1984 				sizeof(struct lppaca));
1985 }
1986 
1987 static const struct file_operations vpa_fops = {
1988 	.open		= simple_open,
1989 	.read		= vpa_file_read,
1990 	.llseek		= default_llseek,
1991 };
1992 
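/*
 * With debugfs mounted at /sys/kernel/debug and powerpc_debugfs_root naming
 * its "powerpc" directory (as assumed here), each file created below dumps
 * the raw lppaca of one CPU, e.g.:
 *
 *	hexdump -C /sys/kernel/debug/powerpc/vpa/cpu-0
 */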
1993 static int __init vpa_debugfs_init(void)
1994 {
1995 	char name[16];
1996 	long i;
1997 	struct dentry *vpa_dir;
1998 
1999 	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
2000 		return 0;
2001 
2002 	vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root);
2003 	if (!vpa_dir) {
2004 		pr_warn("%s: can't create vpa root dir\n", __func__);
2005 		return -ENOMEM;
2006 	}
2007 
2008 	/* Set up the per-cpu vpa files. */
2009 	for_each_possible_cpu(i) {
2010 		struct dentry *d;
2011 
2012 		sprintf(name, "cpu-%ld", i);
2013 
2014 		d = debugfs_create_file(name, 0400, vpa_dir, (void *)i,
2015 					&vpa_fops);
2016 		if (!d) {
2017 			pr_warn("%s: can't create per-cpu vpa file\n",
2018 					__func__);
2019 			return -ENOMEM;
2020 		}
2021 	}
2022 
2023 	return 0;
2024 }
2025 machine_arch_initcall(pseries, vpa_debugfs_init);
2026 #endif /* CONFIG_DEBUG_FS */
2027