/*
 * SN Platform GRU Driver
 *
 *              FAULT HANDLER FOR GRU DETECTED TLB MISSES
 *
 * This file contains code that handles TLB misses within the GRU.
 * These misses are reported either via interrupts or user polling of
 * the user CB.
 *
 *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/device.h>
#include <linux/io.h>
#include <linux/uaccess.h>
#include <linux/security.h>
#include <linux/prefetch.h>
#include <asm/pgtable.h>
#include "gru.h"
#include "grutables.h"
#include "grulib.h"
#include "gru_instructions.h"
#include <asm/uv/uv_hub.h>

/* Return codes for vtop functions */
#define VTOP_SUCCESS               0
#define VTOP_INVALID               -1
#define VTOP_RETRY                 -2


/*
 * Test if a physical address is a valid GRU GSEG address
 */
static inline int is_gru_paddr(unsigned long paddr)
{
	return paddr >= gru_start_paddr && paddr < gru_end_paddr;
}

/*
 * Find the vma of a GRU segment. Caller must hold mmap_sem.
 */
struct vm_area_struct *gru_find_vma(unsigned long vaddr)
{
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, vaddr);
	if (vma && vma->vm_start <= vaddr && vma->vm_ops == &gru_vm_ops)
		return vma;
	return NULL;
}

/*
 * Find and lock the gts that contains the specified user vaddr.
 *
 * Returns:
 *	- *gts with the mmap_sem locked for read and the GTS locked.
 *	- NULL if vaddr invalid OR is not a valid GSEG vaddr.
 */

static struct gru_thread_state *gru_find_lock_gts(unsigned long vaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct gru_thread_state *gts = NULL;

	down_read(&mm->mmap_sem);
	vma = gru_find_vma(vaddr);
	if (vma)
		gts = gru_find_thread_state(vma, TSID(vaddr, vma));
	if (gts)
		mutex_lock(&gts->ts_ctxlock);
	else
		up_read(&mm->mmap_sem);
	return gts;
}

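/*
 * Find or allocate and lock the gts for the specified user vaddr.
 * On success, returns with the mmap_sem downgraded to a read lock and
 * the gts ts_ctxlock mutex held. Returns an ERR_PTR on failure.
 */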
static struct gru_thread_state *gru_alloc_locked_gts(unsigned long vaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct gru_thread_state *gts = ERR_PTR(-EINVAL);

	down_write(&mm->mmap_sem);
	vma = gru_find_vma(vaddr);
	if (!vma)
		goto err;

	gts = gru_alloc_thread_state(vma, TSID(vaddr, vma));
	if (IS_ERR(gts))
		goto err;
	mutex_lock(&gts->ts_ctxlock);
	downgrade_write(&mm->mmap_sem);
	return gts;

err:
	up_write(&mm->mmap_sem);
	return gts;
}

/*
 * Unlock a GTS that was previously locked with gru_find_lock_gts().
 */
static void gru_unlock_gts(struct gru_thread_state *gts)
{
	mutex_unlock(&gts->ts_ctxlock);
	up_read(&current->mm->mmap_sem);
}

/*
 * Set a CB.istatus to active using a user virtual address. This must be done
 * just prior to a TFH RESTART. The new cb.istatus is an in-cache status ONLY.
 * If the line is evicted, the status may be lost. The in-cache update
 * is necessary to prevent the user from seeing a stale cb.istatus that will
 * change as soon as the TFH restart is complete. Races may cause an
 * occasional failure to clear the cb.istatus, but that is ok.
 */
static void gru_cb_set_istatus_active(struct gru_instruction_bits *cbk)
{
	if (cbk) {
		cbk->istatus = CBS_ACTIVE;
	}
}

/*
 * Read & clear a TFM
 *
 * The GRU has an array of fault maps. A map is private to a cpu.
 * Only one cpu will be accessing a cpu's fault map.
 *
 * This function scans the cpu-private fault map & clears all bits that
 * are set. The function returns a bitmap that indicates the bits that
 * were cleared. Note that since the maps may be updated asynchronously by
 * the GRU, atomic operations must be used to clear bits.
 */
static void get_clear_fault_map(struct gru_state *gru,
				struct gru_tlb_fault_map *imap,
				struct gru_tlb_fault_map *dmap)
{
	unsigned long i, k;
	struct gru_tlb_fault_map *tfm;

	tfm = get_tfm_for_cpu(gru, gru_cpu_fault_map_id());
	prefetchw(tfm);		/* Helps on hardware, required for emulator */
	for (i = 0; i < BITS_TO_LONGS(GRU_NUM_CBE); i++) {
		k = tfm->fault_bits[i];
		if (k)
			k = xchg(&tfm->fault_bits[i], 0UL);
		imap->fault_bits[i] = k;
		k = tfm->done_bits[i];
		if (k)
			k = xchg(&tfm->done_bits[i], 0UL);
		dmap->fault_bits[i] = k;
	}

	/*
	 * Not functionally required but helps performance. (Required
	 * on emulator)
	 */
	gru_flush_cache(tfm);
}

/*
 * Atomic (interrupt context) & non-atomic (user context) functions to
 * convert a vaddr into a physical address. The size of the page
 * is returned in pageshift.
 *	returns:
 *		  0 - successful
 *		< 0 - error code
 *		  1 - (atomic only) try again in non-atomic context
 */
static int non_atomic_pte_lookup(struct vm_area_struct *vma,
				 unsigned long vaddr, int write,
				 unsigned long *paddr, int *pageshift)
{
	struct page *page;

#ifdef CONFIG_HUGETLB_PAGE
	*pageshift = is_vm_hugetlb_page(vma) ? HPAGE_SHIFT : PAGE_SHIFT;
#else
	*pageshift = PAGE_SHIFT;
#endif
	if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page, NULL) <= 0)
		return -EFAULT;
	*paddr = page_to_phys(page);
	put_page(page);
	return 0;
}

/*
 * atomic_pte_lookup
 *
 * Convert a user virtual address to a physical address
 * Only supports Intel large pages (2MB only) on x86_64.
 *	ZZZ - hugepage support is incomplete
 *
 * NOTE: mmap_sem is already held on entry to this function. This
 * guarantees existence of the page tables.
 */
static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr,
	int write, unsigned long *paddr, int *pageshift)
{
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t pte;

	pgdp = pgd_offset(vma->vm_mm, vaddr);
	if (unlikely(pgd_none(*pgdp)))
		goto err;

	p4dp = p4d_offset(pgdp, vaddr);
	if (unlikely(p4d_none(*p4dp)))
		goto err;

	pudp = pud_offset(p4dp, vaddr);
	if (unlikely(pud_none(*pudp)))
		goto err;

	pmdp = pmd_offset(pudp, vaddr);
	if (unlikely(pmd_none(*pmdp)))
		goto err;
#ifdef CONFIG_X86_64
	if (unlikely(pmd_large(*pmdp)))
		pte = *(pte_t *) pmdp;
	else
#endif
		pte = *pte_offset_kernel(pmdp, vaddr);

	if (unlikely(!pte_present(pte) ||
		     (write && (!pte_write(pte) || !pte_dirty(pte)))))
		return 1;

	*paddr = pte_pfn(pte) << PAGE_SHIFT;
#ifdef CONFIG_HUGETLB_PAGE
	*pageshift = is_vm_hugetlb_page(vma) ? HPAGE_SHIFT : PAGE_SHIFT;
#else
	*pageshift = PAGE_SHIFT;
#endif
	return 0;

err:
	return 1;
}

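/*
 * Convert a user vaddr into a GRU global physical address (gpa) and
 * pageshift. Tries the fast atomic lookup first; if that fails and the
 * caller is not in atomic context, falls back to the non-atomic lookup.
 */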
static int gru_vtop(struct gru_thread_state *gts, unsigned long vaddr,
		    int write, int atomic, unsigned long *gpa, int *pageshift)
{
	struct mm_struct *mm = gts->ts_mm;
	struct vm_area_struct *vma;
	unsigned long paddr;
	int ret, ps;

	vma = find_vma(mm, vaddr);
	if (!vma)
		goto inval;

	/*
	 * Atomic lookup is faster & usually works even if called in non-atomic
	 * context.
	 */
	rmb();	/* Must check ms_range_active before loading PTEs */
	ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &ps);
	if (ret) {
		if (atomic)
			goto upm;
		if (non_atomic_pte_lookup(vma, vaddr, write, &paddr, &ps))
			goto inval;
	}
	if (is_gru_paddr(paddr))
		goto inval;
	paddr = paddr & ~((1UL << ps) - 1);
	*gpa = uv_soc_phys_ram_to_gpa(paddr);
	*pageshift = ps;
	return VTOP_SUCCESS;

inval:
	return VTOP_INVALID;
upm:
	return VTOP_RETRY;
}


/*
 * Flush a CBE from cache. The CBE is clean in the cache. Dirty the
 * CBE cacheline so that the line will be written back to the home agent.
 * Otherwise the line may be silently dropped. This has no impact
 * except on performance.
 */
static void gru_flush_cache_cbe(struct gru_control_block_extended *cbe)
{
	if (unlikely(cbe)) {
		cbe->cbrexecstatus = 0;         /* make CL dirty */
		gru_flush_cache(cbe);
	}
}

/*
 * Preload the TLB with entries that may be required. Currently, preloading
 * is implemented only for BCOPY. Preload <tlb_preload_count> pages OR to
 * the end of the bcopy transfer, whichever is smaller.
 */
static void gru_preload_tlb(struct gru_state *gru,
			struct gru_thread_state *gts, int atomic,
			unsigned long fault_vaddr, int asid, int write,
			unsigned char tlb_preload_count,
			struct gru_tlb_fault_handle *tfh,
			struct gru_control_block_extended *cbe)
{
	unsigned long vaddr = 0, gpa;
	int ret, pageshift;

	if (cbe->opccpy != OP_BCOPY)
		return;

	if (fault_vaddr == cbe->cbe_baddr0)
		vaddr = fault_vaddr + GRU_CACHE_LINE_BYTES * cbe->cbe_src_cl - 1;
	else if (fault_vaddr == cbe->cbe_baddr1)
		vaddr = fault_vaddr + (1 << cbe->xtypecpy) * cbe->cbe_nelemcur - 1;

	fault_vaddr &= PAGE_MASK;
	vaddr &= PAGE_MASK;
	vaddr = min(vaddr, fault_vaddr + tlb_preload_count * PAGE_SIZE);

	while (vaddr > fault_vaddr) {
		ret = gru_vtop(gts, vaddr, write, atomic, &gpa, &pageshift);
		if (ret || tfh_write_only(tfh, gpa, GAA_RAM, vaddr, asid, write,
					  GRU_PAGESIZE(pageshift)))
			return;
		gru_dbg(grudev,
			"%s: gid %d, gts 0x%p, tfh 0x%p, vaddr 0x%lx, asid 0x%x, rw %d, ps %d, gpa 0x%lx\n",
			atomic ? "atomic" : "non-atomic", gru->gs_gid, gts, tfh,
			vaddr, asid, write, pageshift, gpa);
		vaddr -= PAGE_SIZE;
		STAT(tlb_preload_page);
	}
}

/*
 * Drop a TLB entry into the GRU. The fault is described by info in a TFH.
 *	Input:
 *		cb    Address of user CBR. Null if not running in user context
 *	Return:
 *		  0 = dropin, exception, or switch to UPM successful
 *		  1 = range invalidate active
 *		< 0 = error code
 *
 */
static int gru_try_dropin(struct gru_state *gru,
			  struct gru_thread_state *gts,
			  struct gru_tlb_fault_handle *tfh,
			  struct gru_instruction_bits *cbk)
{
	struct gru_control_block_extended *cbe = NULL;
	unsigned char tlb_preload_count = gts->ts_tlb_preload_count;
	int pageshift = 0, asid, write, ret, atomic = !cbk, indexway;
	unsigned long gpa = 0, vaddr = 0;

	/*
	 * NOTE: The GRU contains magic hardware that eliminates races between
	 * TLB invalidates and TLB dropins. If an invalidate occurs
	 * in the window between reading the TFH and the subsequent TLB dropin,
	 * the dropin is ignored. This eliminates the need for additional locks.
	 */

	/*
	 * Prefetch the CBE if doing TLB preloading
	 */
	if (unlikely(tlb_preload_count)) {
		cbe = gru_tfh_to_cbe(tfh);
		prefetchw(cbe);
	}

	/*
	 * Error if TFH state is IDLE or FMM mode & the user is issuing a UPM
	 * call. Might be a hardware race OR a stupid user. Ignore FMM because
	 * FMM is a transient state.
	 */
	if (tfh->status != TFHSTATUS_EXCEPTION) {
		gru_flush_cache(tfh);
		sync_core();
		if (tfh->status != TFHSTATUS_EXCEPTION)
			goto failnoexception;
		STAT(tfh_stale_on_fault);
	}
	if (tfh->state == TFHSTATE_IDLE)
		goto failidle;
	if (tfh->state == TFHSTATE_MISS_FMM && cbk)
		goto failfmm;

	write = (tfh->cause & TFHCAUSE_TLB_MOD) != 0;
	vaddr = tfh->missvaddr;
	asid = tfh->missasid;
	indexway = tfh->indexway;
	if (asid == 0)
		goto failnoasid;

	rmb();	/* TFH must be cache resident before reading ms_range_active */

	/*
	 * TFH is cache resident - at least briefly. Fail the dropin
	 * if a range invalidate is active.
	 */
	if (atomic_read(&gts->ts_gms->ms_range_active))
		goto failactive;

	ret = gru_vtop(gts, vaddr, write, atomic, &gpa, &pageshift);
	if (ret == VTOP_INVALID)
		goto failinval;
	if (ret == VTOP_RETRY)
		goto failupm;

	if (!(gts->ts_sizeavail & GRU_SIZEAVAIL(pageshift))) {
		gts->ts_sizeavail |= GRU_SIZEAVAIL(pageshift);
		if (atomic || !gru_update_cch(gts)) {
			gts->ts_force_cch_reload = 1;
			goto failupm;
		}
	}

	if (unlikely(cbe) && pageshift == PAGE_SHIFT) {
		gru_preload_tlb(gru, gts, atomic, vaddr, asid, write, tlb_preload_count, tfh, cbe);
		gru_flush_cache_cbe(cbe);
	}

	gru_cb_set_istatus_active(cbk);
	gts->ustats.tlbdropin++;
	tfh_write_restart(tfh, gpa, GAA_RAM, vaddr, asid, write,
			  GRU_PAGESIZE(pageshift));
	gru_dbg(grudev,
		"%s: gid %d, gts 0x%p, tfh 0x%p, vaddr 0x%lx, asid 0x%x, indexway 0x%x,"
		" rw %d, ps %d, gpa 0x%lx\n",
		atomic ? "atomic" : "non-atomic", gru->gs_gid, gts, tfh, vaddr, asid,
		indexway, write, pageshift, gpa);
	STAT(tlb_dropin);
	return 0;

failnoasid:
	/* No asid (delayed unload). */
	STAT(tlb_dropin_fail_no_asid);
	gru_dbg(grudev, "FAILED no_asid tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
	if (!cbk)
		tfh_user_polling_mode(tfh);
	else
		gru_flush_cache(tfh);
	gru_flush_cache_cbe(cbe);
	return -EAGAIN;

failupm:
	/* Atomic failure switch CBR to UPM */
	tfh_user_polling_mode(tfh);
	gru_flush_cache_cbe(cbe);
	STAT(tlb_dropin_fail_upm);
	gru_dbg(grudev, "FAILED upm tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
	return 1;

failfmm:
	/* FMM state on UPM call */
	gru_flush_cache(tfh);
	gru_flush_cache_cbe(cbe);
	STAT(tlb_dropin_fail_fmm);
	gru_dbg(grudev, "FAILED fmm tfh: 0x%p, state %d\n", tfh, tfh->state);
	return 0;

failnoexception:
	/* TFH status did not show exception pending */
	gru_flush_cache(tfh);
	gru_flush_cache_cbe(cbe);
	if (cbk)
		gru_flush_cache(cbk);
	STAT(tlb_dropin_fail_no_exception);
	gru_dbg(grudev, "FAILED non-exception tfh: 0x%p, status %d, state %d\n",
		tfh, tfh->status, tfh->state);
	return 0;

failidle:
	/* TFH state was idle - no miss pending */
	gru_flush_cache(tfh);
	gru_flush_cache_cbe(cbe);
	if (cbk)
		gru_flush_cache(cbk);
	STAT(tlb_dropin_fail_idle);
	gru_dbg(grudev, "FAILED idle tfh: 0x%p, state %d\n", tfh, tfh->state);
	return 0;

failinval:
	/* All errors (atomic & non-atomic) switch CBR to EXCEPTION state */
	tfh_exception(tfh);
	gru_flush_cache_cbe(cbe);
	STAT(tlb_dropin_fail_invalid);
	gru_dbg(grudev, "FAILED inval tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
	return -EFAULT;

failactive:
	/* Range invalidate active. Switch to UPM iff atomic */
	if (!cbk)
		tfh_user_polling_mode(tfh);
	else
		gru_flush_cache(tfh);
	gru_flush_cache_cbe(cbe);
	STAT(tlb_dropin_fail_range_active);
	gru_dbg(grudev, "FAILED range active: tfh 0x%p, vaddr 0x%lx\n",
		tfh, vaddr);
	return 1;
}

/*
 * Process an external interrupt from the GRU. This interrupt is
 * caused by a TLB miss.
 * Note that this is the handler that is registered with the Linux
 * interrupt subsystem.
 */
static irqreturn_t gru_intr(int chiplet, int blade)
{
	struct gru_state *gru;
	struct gru_tlb_fault_map imap, dmap;
	struct gru_thread_state *gts;
	struct gru_tlb_fault_handle *tfh = NULL;
	struct completion *cmp;
	int cbrnum, ctxnum;

	STAT(intr);

	gru = &gru_base[blade]->bs_grus[chiplet];
	if (!gru) {
		dev_err(grudev, "GRU: invalid interrupt: cpu %d, chiplet %d\n",
			raw_smp_processor_id(), chiplet);
		return IRQ_NONE;
	}
	get_clear_fault_map(gru, &imap, &dmap);
	gru_dbg(grudev,
		"cpu %d, chiplet %d, gid %d, imap %016lx %016lx, dmap %016lx %016lx\n",
		smp_processor_id(), chiplet, gru->gs_gid,
		imap.fault_bits[0], imap.fault_bits[1],
		dmap.fault_bits[0], dmap.fault_bits[1]);

	for_each_cbr_in_tfm(cbrnum, dmap.fault_bits) {
		STAT(intr_cbr);
		cmp = gru->gs_blade->bs_async_wq;
		if (cmp)
			complete(cmp);
		gru_dbg(grudev, "gid %d, cbr_done %d, done %d\n",
			gru->gs_gid, cbrnum, cmp ? cmp->done : -1);
	}

	for_each_cbr_in_tfm(cbrnum, imap.fault_bits) {
		STAT(intr_tfh);
		tfh = get_tfh_by_index(gru, cbrnum);
		prefetchw(tfh);	/* Helps on hdw, required for emulator */

		/*
		 * When hardware sets a bit in the faultmap, it implicitly
		 * locks the GRU context so that it cannot be unloaded.
		 * The gts cannot change until a TFH start/writestart command
		 * is issued.
		 */
		ctxnum = tfh->ctxnum;
		gts = gru->gs_gts[ctxnum];

		/* Spurious interrupts can cause this. Ignore. */
		if (!gts) {
			STAT(intr_spurious);
			continue;
		}

		/*
		 * This is running in interrupt context. Trylock the mmap_sem.
		 * If it fails, retry the fault in user context.
		 */
		gts->ustats.fmm_tlbmiss++;
		if (!gts->ts_force_cch_reload &&
					down_read_trylock(&gts->ts_mm->mmap_sem)) {
			gru_try_dropin(gru, gts, tfh, NULL);
			up_read(&gts->ts_mm->mmap_sem);
		} else {
			tfh_user_polling_mode(tfh);
			STAT(intr_mm_lock_failed);
		}
	}
	return IRQ_HANDLED;
}

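/*
 * Per-chiplet interrupt handlers for the GRUs on the local blade.
 */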
irqreturn_t gru0_intr(int irq, void *dev_id)
{
	return gru_intr(0, uv_numa_blade_id());
}

irqreturn_t gru1_intr(int irq, void *dev_id)
{
	return gru_intr(1, uv_numa_blade_id());
}

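/*
 * Interrupt handler for GRU chiplets on blades that have no cpus of
 * their own. TLB faults for those blades are serviced here.
 */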
irqreturn_t gru_intr_mblade(int irq, void *dev_id)
{
	int blade;

	for_each_possible_blade(blade) {
		if (uv_blade_nr_possible_cpus(blade))
			continue;
		gru_intr(0, blade);
		gru_intr(1, blade);
	}
	return IRQ_HANDLED;
}


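/*
 * Repeatedly attempt a TLB dropin on behalf of a user "call OS" request.
 * Waits for any active range invalidates to finish before each attempt.
 * Returns 0 on success or a negative error code.
 */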
static int gru_user_dropin(struct gru_thread_state *gts,
			   struct gru_tlb_fault_handle *tfh,
			   void *cb)
{
	struct gru_mm_struct *gms = gts->ts_gms;
	int ret;

	gts->ustats.upm_tlbmiss++;
	while (1) {
		wait_event(gms->ms_wait_queue,
			   atomic_read(&gms->ms_range_active) == 0);
		prefetchw(tfh);	/* Helps on hdw, required for emulator */
		ret = gru_try_dropin(gts->ts_gru, gts, tfh, cb);
		if (ret <= 0)
			return ret;
		STAT(call_os_wait_queue);
	}
}

/*
 * This interface is called as a result of a user detecting a "call OS" bit
 * in a user CB. This normally means that a TLB fault has occurred.
 *	cb - user virtual address of the CB
 */
int gru_handle_user_call_os(unsigned long cb)
{
	struct gru_tlb_fault_handle *tfh;
	struct gru_thread_state *gts;
	void *cbk;
	int ucbnum, cbrnum, ret = -EINVAL;

	STAT(call_os);

	/* sanity check the cb pointer */
	ucbnum = get_cb_number((void *)cb);
	if ((cb & (GRU_HANDLE_STRIDE - 1)) || ucbnum >= GRU_NUM_CB)
		return -EINVAL;

	gts = gru_find_lock_gts(cb);
	if (!gts)
		return -EINVAL;
	gru_dbg(grudev, "address 0x%lx, gid %d, gts 0x%p\n", cb, gts->ts_gru ? gts->ts_gru->gs_gid : -1, gts);

	if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE)
		goto exit;

	gru_check_context_placement(gts);

	/*
	 * CCH may contain stale data if ts_force_cch_reload is set.
	 */
	if (gts->ts_gru && gts->ts_force_cch_reload) {
		gts->ts_force_cch_reload = 0;
		gru_update_cch(gts);
	}

	ret = -EAGAIN;
	cbrnum = thread_cbr_number(gts, ucbnum);
	if (gts->ts_gru) {
		tfh = get_tfh_by_index(gts->ts_gru, cbrnum);
		cbk = get_gseg_base_address_cb(gts->ts_gru->gs_gru_base_vaddr,
				gts->ts_ctxnum, ucbnum);
		ret = gru_user_dropin(gts, tfh, cbk);
	}
exit:
	gru_unlock_gts(gts);
	return ret;
}

/*
 * Fetch the exception detail information for a CB that terminated with
 * an exception.
 */
int gru_get_exception_detail(unsigned long arg)
{
	struct control_block_extended_exc_detail excdet;
	struct gru_control_block_extended *cbe;
	struct gru_thread_state *gts;
	int ucbnum, cbrnum, ret;

	STAT(user_exception);
	if (copy_from_user(&excdet, (void __user *)arg, sizeof(excdet)))
		return -EFAULT;

	gts = gru_find_lock_gts(excdet.cb);
	if (!gts)
		return -EINVAL;

	gru_dbg(grudev, "address 0x%lx, gid %d, gts 0x%p\n", excdet.cb, gts->ts_gru ? gts->ts_gru->gs_gid : -1, gts);
	ucbnum = get_cb_number((void *)excdet.cb);
	if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) {
		ret = -EINVAL;
	} else if (gts->ts_gru) {
		cbrnum = thread_cbr_number(gts, ucbnum);
		cbe = get_cbe_by_index(gts->ts_gru, cbrnum);
		gru_flush_cache(cbe);	/* CBE not coherent */
		sync_core();		/* make sure we have current data */
		excdet.opc = cbe->opccpy;
		excdet.exopc = cbe->exopccpy;
		excdet.ecause = cbe->ecause;
		excdet.exceptdet0 = cbe->idef1upd;
		excdet.exceptdet1 = cbe->idef3upd;
		excdet.cbrstate = cbe->cbrstate;
		excdet.cbrexecstatus = cbe->cbrexecstatus;
		gru_flush_cache_cbe(cbe);
		ret = 0;
	} else {
		ret = -EAGAIN;
	}
	gru_unlock_gts(gts);

	gru_dbg(grudev,
		"cb 0x%lx, op %d, exopc %d, cbrstate %d, cbrexecstatus 0x%x, ecause 0x%x, "
		"exdet0 0x%lx, exdet1 0x%x\n",
		excdet.cb, excdet.opc, excdet.exopc, excdet.cbrstate, excdet.cbrexecstatus,
		excdet.ecause, excdet.exceptdet0, excdet.exceptdet1);
	if (!ret && copy_to_user((void __user *)arg, &excdet, sizeof(excdet)))
		ret = -EFAULT;
	return ret;
}

/*
 * User request to unload a context. Content is saved for possible reload.
 */
static int gru_unload_all_contexts(void)
{
	struct gru_thread_state *gts;
	struct gru_state *gru;
	int gid, ctxnum;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	foreach_gid(gid) {
		gru = GID_TO_GRU(gid);
		spin_lock(&gru->gs_lock);
		for (ctxnum = 0; ctxnum < GRU_NUM_CCH; ctxnum++) {
			gts = gru->gs_gts[ctxnum];
			if (gts && mutex_trylock(&gts->ts_ctxlock)) {
				spin_unlock(&gru->gs_lock);
				gru_unload_context(gts, 1);
				mutex_unlock(&gts->ts_ctxlock);
				spin_lock(&gru->gs_lock);
			}
		}
		spin_unlock(&gru->gs_lock);
	}
	return 0;
}

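/*
 * Unload the context for a single GSEG, or all contexts if req.gseg is 0.
 */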
int gru_user_unload_context(unsigned long arg)
{
	struct gru_thread_state *gts;
	struct gru_unload_context_req req;

	STAT(user_unload_context);
	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
		return -EFAULT;

	gru_dbg(grudev, "gseg 0x%lx\n", req.gseg);

	if (!req.gseg)
		return gru_unload_all_contexts();

	gts = gru_find_lock_gts(req.gseg);
	if (!gts)
		return -EINVAL;

	if (gts->ts_gru)
		gru_unload_context(gts, 1);
	gru_unlock_gts(gts);

	return 0;
}

/*
 * User request to flush a range of virtual addresses from the GRU TLB
 * (Mainly for testing).
 */
int gru_user_flush_tlb(unsigned long arg)
{
	struct gru_thread_state *gts;
	struct gru_flush_tlb_req req;
	struct gru_mm_struct *gms;

	STAT(user_flush_tlb);
	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
		return -EFAULT;

	gru_dbg(grudev, "gseg 0x%lx, vaddr 0x%lx, len 0x%lx\n", req.gseg,
		req.vaddr, req.len);

	gts = gru_find_lock_gts(req.gseg);
	if (!gts)
		return -EINVAL;

	gms = gts->ts_gms;
	gru_unlock_gts(gts);
	gru_flush_tlb_range(gms, req.vaddr, req.len);

	return 0;
}

/*
 * Fetch GSEG statistics
 */
long gru_get_gseg_statistics(unsigned long arg)
{
	struct gru_thread_state *gts;
	struct gru_get_gseg_statistics_req req;

	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
		return -EFAULT;

	/*
	 * The library creates arrays of contexts for threaded programs.
	 * If no gts exists in the array, the context has never been used & all
	 * statistics are implicitly 0.
	 */
	gts = gru_find_lock_gts(req.gseg);
	if (gts) {
		memcpy(&req.stats, &gts->ustats, sizeof(gts->ustats));
		gru_unlock_gts(gts);
	} else {
		memset(&req.stats, 0, sizeof(gts->ustats));
	}

	if (copy_to_user((void __user *)arg, &req, sizeof(req)))
		return -EFAULT;

	return 0;
}

/*
 * Register the current task as the user of the GSEG slice.
 * Needed for TLB fault interrupt targeting.
 */
int gru_set_context_option(unsigned long arg)
{
	struct gru_thread_state *gts;
	struct gru_set_context_option_req req;
	int ret = 0;

	STAT(set_context_option);
	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
		return -EFAULT;
	gru_dbg(grudev, "op %d, gseg 0x%lx, value1 0x%lx\n", req.op, req.gseg, req.val1);

	gts = gru_find_lock_gts(req.gseg);
	if (!gts) {
		gts = gru_alloc_locked_gts(req.gseg);
		if (IS_ERR(gts))
			return PTR_ERR(gts);
	}

	switch (req.op) {
	case sco_blade_chiplet:
		/* Select blade/chiplet for GRU context */
		if (req.val0 < -1 || req.val0 >= GRU_CHIPLETS_PER_HUB ||
		    req.val1 < -1 || req.val1 >= GRU_MAX_BLADES ||
		    (req.val1 >= 0 && !gru_base[req.val1])) {
			ret = -EINVAL;
		} else {
			gts->ts_user_blade_id = req.val1;
			gts->ts_user_chiplet_id = req.val0;
			gru_check_context_placement(gts);
		}
		break;
	case sco_gseg_owner:
		/* Register the current task as the GSEG owner */
		gts->ts_tgid_owner = current->tgid;
		break;
	case sco_cch_req_slice:
		/* Set the CCH slice option */
		gts->ts_cch_req_slice = req.val1 & 3;
		break;
	default:
		ret = -EINVAL;
	}
	gru_unlock_gts(gts);

	return ret;
}