/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 *
 * Support routines for atomic operations. Each function takes:
 *
 * r0: address to manipulate
 * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)
 * r2: new value to write, or for cmpxchg/add_unless, value to compare against
 * r3: (cmpxchg/xchg_add_unless) new value to write or add;
 *     (atomic64 ops) high word of value to write
 * r4/r5: (cmpxchg64/add_unless64) new value to write or add
 *
 * The 32-bit routines return a "struct __get_user" so that the futex code
 * has an opportunity to return -EFAULT to the user if needed.
 * The 64-bit routines just return a "long long" with the value,
 * since they are only used from kernel space and don't expect to fault.
 * Support for 16-bit ops is included in the framework but we don't provide
 * any (x86_64 has an atomic_inc_short(), so we might want to some day).
 *
 * Note that the caller is advised to issue a suitable L1 or L2
 * prefetch on the address being manipulated to avoid extra stalls.
 * In addition, the hot path is on two icache lines, and we start with
 * a jump to the second line to make sure they are both in cache so
 * that we never stall waiting on icache fill while holding the lock.
 * (This doesn't work out with most 64-bit ops, since they consume
 * too many bundles, so may take an extra i-cache stall.)
 *
 * These routines set the INTERRUPT_CRITICAL_SECTION bit, just
 * like sys_cmpxchg(), so that NMIs like PERF_COUNT will not interrupt
 * the code, just page faults.
 *
 * If the load or store faults in a way that can be directly fixed in
 * the do_page_fault_ics() handler (e.g. a vmalloc reference) we fix it
 * directly, return to the instruction that faulted, and retry it.
 *
 * If the load or store faults in a way that potentially requires us
 * to release the atomic lock, then retry (e.g. a migrating PTE), we
 * reset the PC in do_page_fault_ics() to the "tns" instruction so
 * that on return we will reacquire the lock and restart the op.  We
 * are somewhat overloading the exception_table_entry notion by doing
 * this, since those entries are not normally used for migrating PTEs.
 *
 * If the main page fault handler discovers a bad address, it will see
 * the PC pointing to the "tns" instruction (due to the earlier
 * exception_table_entry processing in do_page_fault_ics), and
 * re-reset the PC to the fault handler, atomic_bad_address(), which
 * effectively takes over from the atomic op and can either return a
 * bad "struct __get_user" (for user addresses) or can just panic (for
 * bad kernel addresses).
 *
 * Note that if the value we would store is the same as what we
 * loaded, we bypass the store.  Other platforms with true atomics can
 * make the guarantee that a non-atomic __clear_bit(), for example,
 * can safely race with an atomic test_and_set_bit(); this example is
 * from bit_spinlock.h in slub_lock() / slub_unlock().  We can't do
 * that on Tile since the "atomic" op is really just a
 * read/modify/write, and can race with the non-atomic
 * read/modify/write.  However, if we can short-circuit the write when
 * it is not needed, in the atomic case, we avoid the race.
70 */ 71 72#include <linux/linkage.h> 73#include <asm/atomic_32.h> 74#include <asm/page.h> 75#include <asm/processor.h> 76 77 .section .text.atomic,"ax" 78ENTRY(__start_atomic_asm_code) 79 80 .macro atomic_op, name, bitwidth, body 81 .align 64 82STD_ENTRY_SECTION(__atomic\name, .text.atomic) 83 { 84 movei r24, 1 85 j 4f /* branch to second cache line */ 86 } 871: { 88 .ifc \bitwidth,16 89 lh r22, r0 90 .else 91 lw r22, r0 92 addi r28, r0, 4 93 .endif 94 } 95 .ifc \bitwidth,64 96 lw r23, r28 97 .endif 98 \body /* set r24, and r25 if 64-bit */ 99 { 100 seq r26, r22, r24 101 seq r27, r23, r25 102 } 103 .ifc \bitwidth,64 104 bbnst r27, 2f 105 .endif 106 bbs r26, 3f /* skip write-back if it's the same value */ 1072: { 108 .ifc \bitwidth,16 109 sh r0, r24 110 .else 111 sw r0, r24 112 .endif 113 } 114 .ifc \bitwidth,64 115 sw r28, r25 116 .endif 117 mf 1183: { 119 move r0, r22 120 .ifc \bitwidth,64 121 move r1, r23 122 .else 123 move r1, zero 124 .endif 125 sw ATOMIC_LOCK_REG_NAME, zero 126 } 127 mtspr INTERRUPT_CRITICAL_SECTION, zero 128 jrp lr 1294: { 130 move ATOMIC_LOCK_REG_NAME, r1 131 mtspr INTERRUPT_CRITICAL_SECTION, r24 132 } 133#ifndef CONFIG_SMP 134 j 1b /* no atomic locks */ 135#else 136 { 137 tns r21, ATOMIC_LOCK_REG_NAME 138 moveli r23, 2048 /* maximum backoff time in cycles */ 139 } 140 { 141 bzt r21, 1b /* branch if lock acquired */ 142 moveli r25, 32 /* starting backoff time in cycles */ 143 } 1445: mtspr INTERRUPT_CRITICAL_SECTION, zero 145 mfspr r26, CYCLE_LOW /* get start point for this backoff */ 1466: mfspr r22, CYCLE_LOW /* test to see if we've backed off enough */ 147 sub r22, r22, r26 148 slt r22, r22, r25 149 bbst r22, 6b 150 { 151 mtspr INTERRUPT_CRITICAL_SECTION, r24 152 shli r25, r25, 1 /* double the backoff; retry the tns */ 153 } 154 { 155 tns r21, ATOMIC_LOCK_REG_NAME 156 slt r26, r23, r25 /* is the proposed backoff too big? 
*/ 157 } 158 { 159 bzt r21, 1b /* branch if lock acquired */ 160 mvnz r25, r26, r23 161 } 162 j 5b 163#endif 164 STD_ENDPROC(__atomic\name) 165 .ifc \bitwidth,32 166 .pushsection __ex_table,"a" 167 .align 4 168 .word 1b, __atomic\name 169 .word 2b, __atomic\name 170 .word __atomic\name, __atomic_bad_address 171 .popsection 172 .endif 173 .endm 174 175 176/* 177 * Use __atomic32 prefix to avoid collisions with GCC builtin __atomic functions. 178 */ 179 180atomic_op 32_cmpxchg, 32, "seq r26, r22, r2; { bbns r26, 3f; move r24, r3 }" 181atomic_op 32_xchg, 32, "move r24, r2" 182atomic_op 32_xchg_add, 32, "add r24, r22, r2" 183atomic_op 32_xchg_add_unless, 32, \ 184 "sne r26, r22, r2; { bbns r26, 3f; add r24, r22, r3 }" 185atomic_op 32_fetch_or, 32, "or r24, r22, r2" 186atomic_op 32_fetch_and, 32, "and r24, r22, r2" 187atomic_op 32_fetch_andn, 32, "nor r2, r2, zero; and r24, r22, r2" 188atomic_op 32_fetch_xor, 32, "xor r24, r22, r2" 189 190atomic_op 64_cmpxchg, 64, "{ seq r26, r22, r2; seq r27, r23, r3 }; \ 191 { bbns r26, 3f; move r24, r4 }; { bbns r27, 3f; move r25, r5 }" 192atomic_op 64_xchg, 64, "{ move r24, r2; move r25, r3 }" 193atomic_op 64_xchg_add, 64, "{ add r24, r22, r2; add r25, r23, r3 }; \ 194 slt_u r26, r24, r22; add r25, r25, r26" 195atomic_op 64_xchg_add_unless, 64, \ 196 "{ sne r26, r22, r2; sne r27, r23, r3 }; \ 197 { bbns r26, 3f; add r24, r22, r4 }; \ 198 { bbns r27, 3f; add r25, r23, r5 }; \ 199 slt_u r26, r24, r22; add r25, r25, r26" 200atomic_op 64_fetch_or, 64, "{ or r24, r22, r2; or r25, r23, r3 }" 201atomic_op 64_fetch_and, 64, "{ and r24, r22, r2; and r25, r23, r3 }" 202atomic_op 64_fetch_xor, 64, "{ xor r24, r22, r2; xor r25, r23, r3 }" 203 204 jrp lr /* happy backtracer */ 205 206ENTRY(__end_atomic_asm_code) 207