// SPDX-License-Identifier: Apache-2.0 OR MIT

/*
128-bit atomic implementation on x86_64 using CMPXCHG16B (DWCAS).

Note: On Miri and ThreadSanitizer, which do not support inline assembly, we don't use
this module and use intrinsics.rs instead.

Refs:
- x86 and amd64 instruction reference https://www.felixcloutier.com/x86
- atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit

Generated asm:
- x86_64 (+cmpxchg16b) https://godbolt.org/z/r5x9M8PdK
*/

// TODO: use core::arch::x86_64::cmpxchg16b where it is available and more efficient than asm

include!("macros.rs");

#[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
#[path = "../fallback/outline_atomics.rs"]
mod fallback;

#[cfg(not(portable_atomic_no_outline_atomics))]
#[cfg(not(target_env = "sgx"))]
#[cfg_attr(
    not(target_feature = "sse"),
    cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))
)]
#[path = "../detect/x86_64.rs"]
mod detect;

#[cfg(not(portable_atomic_no_asm))]
use core::arch::asm;
use core::sync::atomic::Ordering;

use crate::utils::{Pair, U128};

// Asserts that the function is called in the correct context.
macro_rules! debug_assert_cmpxchg16b {
    () => {
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            debug_assert!(detect::detect().has_cmpxchg16b());
        }
    };
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
macro_rules! debug_assert_vmovdqa_atomic {
    () => {{
        debug_assert_cmpxchg16b!();
        debug_assert!(detect::detect().has_vmovdqa_atomic());
    }};
}

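// Note on the ":e" operand modifier produced below: it renders the 32-bit form of a
// pointer register operand (e.g. `{src:e}` becomes `esi` instead of `rsi`). This matters
// on x86_64 targets with 32-bit pointers (e.g., the x32 ABI), where addresses must be
// formed from 32-bit registers; with 64-bit pointers no modifier is needed.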
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "32")]
macro_rules! ptr_modifier {
    () => {
        ":e"
    };
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "64")]
macro_rules! ptr_modifier {
    () => {
        ""
    };
}

// Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
// requirements for the currently enabled target features. In the first place,
// there is no option in x86 assembly for such a case, like Arm's .arch_extension,
// RISC-V's .option arch, PowerPC's .machine, etc.
// However, we set target_feature(enable) when available (Rust 1.69+) in case a
// new codegen backend is added that checks for it in the future, or an option
// is added to the assembler to check for it.
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned (required by CMPXCHG16B), that there are no
    // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B.
    //
    // If the value at `dst` (destination operand) and rdx:rax are equal, the
    // 128-bit value in rcx:rbx is stored in `dst`; otherwise the value at
    // `dst` is loaded into rdx:rax.
    //
    // The ZF flag is set if the value at `dst` and rdx:rax are equal,
    // otherwise it is cleared. Other flags are unaffected.
    //
    // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
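    //
    // Illustrative pseudocode of the instruction's effect (this is not the
    // implementation; the real operation is the single `lock cmpxchg16b` below):
    //
    //     if *dst == old { *dst = new; (old, true) } else { (*dst, false) }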
    unsafe {
        // cmpxchg16b is always SeqCst.
        let r: u8;
        let old = U128 { whole: old };
        let new = U128 { whole: new };
        let (prev_lo, prev_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "sete cl",
                    "mov rbx, {rbx_tmp}", // restore rbx
                    rbx_tmp = inout(reg) new.pair.lo => _,
                    in("rcx") new.pair.hi,
                    inout("rax") old.pair.lo => prev_lo,
                    inout("rdx") old.pair.hi => prev_hi,
                    in($rdi) dst,
                    lateout("cl") r,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        crate::utils::assert_unchecked(r == 0 || r == 1); // needed to remove extra test
        (U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != 0)
    }
}

// VMOVDQA is atomic on Intel, AMD, and Zhaoxin CPUs with AVX.
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
//
// Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
//
// Use cfg(target_feature = "sse") here -- SSE is included in the x86_64
// baseline and is always available, but the SSE target feature is disabled for
// use cases such as kernels and firmware that should not use vector registers.
// So, do not use vector registers unless the SSE target feature is enabled.
// See also https://github.com/rust-lang/rust/blob/1.80.0/src/doc/rustc/src/platform-support/x86_64-unknown-none.md.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[target_feature(enable = "avx")]
#[inline]
unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    debug_assert_vmovdqa_atomic!();

    // SAFETY: the caller must uphold the safety contract.
    //
    // atomic load by vmovdqa is always SeqCst.
    unsafe {
        let out: core::arch::x86_64::__m128i;
        asm!(
            concat!("vmovdqa {out}, xmmword ptr [{src", ptr_modifier!(), "}]"),
            src = in(reg) src,
            out = out(xmm_reg) out,
            options(nostack, preserves_flags),
        );
        core::mem::transmute(out)
    }
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[target_feature(enable = "avx")]
#[inline]
unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_vmovdqa_atomic!();

    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        let val: core::arch::x86_64::__m128i = core::mem::transmute(val);
        match order {
            // Relaxed and Release stores are equivalent.
            Ordering::Relaxed | Ordering::Release => {
                asm!(
                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                    dst = in(reg) dst,
                    val = in(xmm_reg) val,
                    options(nostack, preserves_flags),
                );
            }
            Ordering::SeqCst => {
                let p = core::cell::UnsafeCell::new(core::mem::MaybeUninit::<u64>::uninit());
                asm!(
                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                    // Equivalent to mfence, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H at least in simple cases.
                    // - https://github.com/taiki-e/portable-atomic/pull/156
                    // - LLVM uses lock or for x86_32 64-bit atomic SeqCst store using SSE https://godbolt.org/z/9sKEr8YWc
                    // - Windows uses xchg for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                    // - MSVC STL uses lock inc https://github.com/microsoft/STL/pull/740
                    // - boost uses lock or https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                    concat!("xchg qword ptr [{p", ptr_modifier!(), "}], {tmp}"),
                    dst = in(reg) dst,
                    val = in(xmm_reg) val,
                    p = inout(reg) p.get() => _,
                    tmp = lateout(reg) _,
                    options(nostack, preserves_flags),
                );
            }
            _ => unreachable!(),
        }
    }
}

#[cfg(not(all(
    any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
    any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
)))]
macro_rules! load_store_detect {
    (
        vmovdqa = $vmovdqa:ident
        cmpxchg16b = $cmpxchg16b:ident
        fallback = $fallback:ident
    ) => {{
        let cpuid = detect::detect();
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access.
            if cpuid.has_cmpxchg16b() {
                // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
                #[cfg(target_feature = "sse")]
                {
                    if cpuid.has_vmovdqa_atomic() {
                        $vmovdqa
                    } else {
                        $cmpxchg16b
                    }
                }
                #[cfg(not(target_feature = "sse"))]
                {
                    $cmpxchg16b
                }
            } else {
                fallback::$fallback
            }
        }
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        {
            if cpuid.has_vmovdqa_atomic() {
                $vmovdqa
            } else {
                $cmpxchg16b
            }
        }
    }};
}

#[inline]
unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        atomic_load_cmpxchg16b(src)
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        ifunc!(unsafe fn(src: *mut u128) -> u128 {
            load_store_detect! {
                vmovdqa = atomic_load_vmovdqa
                cmpxchg16b = atomic_load_cmpxchg16b
                // Use SeqCst because cmpxchg16b and atomic load by vmovdqa are always SeqCst.
                fallback = atomic_load_seqcst
            }
        })
    }
}
// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `src` is valid for both writes and
    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
    // cfg guarantees that the CPU supports CMPXCHG16B.
    //
    // See the cmpxchg16b function for more.
    //
    // We could use a CAS loop by atomic_compare_exchange here, but using inline assembly allows
    // omitting the storing of condition flags and avoids the use of xchg to handle rbx.
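    //
    // Passing old = new = 0 turns cmpxchg16b into a plain load: if *src happens to be 0,
    // the CAS "succeeds" and rewrites the same 0; otherwise it fails and loads the current
    // value into rdx:rax. Either way, rdx:rax ends up holding the value of *src.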
    unsafe {
        // cmpxchg16b is always SeqCst.
        let (out_lo, out_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    "xor rbx, rbx", // zero rbx
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "mov rbx, {rbx_tmp}", // restore rbx
                    // set old/new args of cmpxchg16b to 0 (rbx is zeroed after being saved to rbx_tmp, to avoid xchg)
                    rbx_tmp = out(reg) _,
                    in("rcx") 0_u64,
                    inout("rax") 0_u64 => out_lo,
                    inout("rdx") 0_u64 => out_hi,
                    in($rdi) src,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole
    }
}

#[inline]
unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let _ = order;
        atomic_store_cmpxchg16b(dst, val);
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        #[cfg(target_feature = "sse")]
        fn_alias! {
            #[target_feature(enable = "avx")]
            unsafe fn(dst: *mut u128, val: u128);
            // atomic store by vmovdqa has at least release semantics.
            atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release);
            atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst);
        }
        match order {
            // Relaxed and Release stores are equivalent in all implementations
            // that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback).
            // core::arch's cmpxchg16b will never be called here.
            Ordering::Relaxed | Ordering::Release => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_non_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_non_seqcst
                    }
                });
            }
            Ordering::SeqCst => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_seqcst
                    }
                });
            }
            _ => unreachable!(),
        }
    }
}
// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) {
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        // cmpxchg16b is always SeqCst.
        atomic_swap_cmpxchg16b(dst, val, Ordering::SeqCst);
    }
}

#[inline]
unsafe fn atomic_compare_exchange(
    dst: *mut u128,
    old: u128,
    new: u128,
    _success: Ordering,
    _failure: Ordering,
) -> Result<u128, u128> {
    #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, that there are no concurrent non-atomic operations,
    // and cfg guarantees that CMPXCHG16B is available at compile-time.
    let (prev, ok) = unsafe { cmpxchg16b(dst, old, new) };
    #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, and that there are no different kinds of concurrent accesses.
    let (prev, ok) = unsafe {
        ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
            if detect::detect().has_cmpxchg16b() {
                cmpxchg16b
            } else {
                // Use SeqCst because cmpxchg16b is always SeqCst.
                fallback::atomic_compare_exchange_seqcst
            }
        })
    };
    if ok {
        Ok(prev)
    } else {
        Err(prev)
    }
}

// cmpxchg16b is always strong.
use atomic_compare_exchange as atomic_compare_exchange_weak;

// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
    // cfg guarantees that the CPU supports CMPXCHG16B.
    //
    // See the cmpxchg16b function for more.
    //
    // We could use a CAS loop by atomic_compare_exchange here, but using inline assembly allows
    // omitting the storing/comparing of condition flags and reduces uses of xchg/mov to handle rbx.
    //
    // Do not use atomic_rmw_cas_3 because it needs an extra MOV to implement swap.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let val = U128 { whole: val };
        let (mut prev_lo, mut prev_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    // These are not single-copy atomic reads, but that is ok because the
                    // subsequent CAS will check for consistency.
                    //
                    // This is based on the code generated for the first load in DW RMWs by LLVM.
                    //
                    // Note that the C++20 memory model does not allow mixed-sized atomic access,
                    // so we must use inline assembly to implement this.
                    // (i.e., byte-wise atomic based on the standard library's atomic types
                    // cannot be used here).
                    concat!("mov rax, qword ptr [", $rdi, "]"),
                    concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                    "2:",
                        concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                        "jne 2b",
                    "mov rbx, {rbx_tmp}", // restore rbx
                    rbx_tmp = inout(reg) val.pair.lo => _,
                    in("rcx") val.pair.hi,
                    out("rax") prev_lo,
                    out("rdx") prev_hi,
                    in($rdi) dst,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
    }
}

/// Atomic RMW by CAS loop (3 arguments)
/// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;`
///
/// `$op` can use the following registers:
/// - rsi/r8 pair: val argument (read-only for `$op`)
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could use a CAS loop by atomic_compare_exchange here, but using inline assembly allows
// omitting the storing/comparing of condition flags and reduces uses of xchg/mov to handle rbx.
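//
// Rust-level sketch of the loop each generated function executes (illustrative only;
// `nonatomic_load` and `op` are placeholders for the initial mov pair and the `$op`
// instruction sequence, not real functions):
//
//     let mut prev = nonatomic_load(dst);          // mov rax/rdx from [dst]
//     loop {
//         let new = op(prev, val);                 // $op computes rbx:rcx
//         match cmpxchg16b(dst, prev, new) {       // lock cmpxchg16b
//             (_, true) => return prev,            // ZF set: `new` was stored
//             (current, false) => prev = current,  // ZF clear: retry ("jne 2b")
//         }
//     }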
macro_rules! atomic_rmw_cas_3 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See the cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let val = U128 { whole: val };
                let (mut prev_lo, mut prev_hi);
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // These are not single-copy atomic reads, but that is ok because the
                            // subsequent CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                                $($op)*
                                concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            in("rsi") val.pair.lo,
                            in("r8") val.pair.hi,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}
/// Atomic RMW by CAS loop (2 arguments)
/// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;`
///
/// `$op` can use the following registers:
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could use a CAS loop by atomic_compare_exchange here, but using inline assembly allows
// omitting the storing of condition flags and avoids the use of xchg to handle rbx.
macro_rules! atomic_rmw_cas_2 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See the cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let (mut prev_lo, mut prev_hi);
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // These are not single-copy atomic reads, but that is ok because the
                            // subsequent CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                                $($op)*
                                concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}

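// 128-bit arithmetic and bitwise RMW bodies. add/sub propagate the carry/borrow from the
// low 64-bit half into the high half via adc/sbb; the bitwise operations simply apply the
// same operation to each half independently.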
atomic_rmw_cas_3! {
    atomic_add_cmpxchg16b,
    "mov rbx, rax",
    "add rbx, rsi",
    "mov rcx, rdx",
    "adc rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_sub_cmpxchg16b,
    "mov rbx, rax",
    "sub rbx, rsi",
    "mov rcx, rdx",
    "sbb rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_and_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "mov rcx, rdx",
    "and rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_nand_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "not rbx",
    "mov rcx, rdx",
    "and rcx, r8",
    "not rcx",
}
atomic_rmw_cas_3! {
    atomic_or_cmpxchg16b,
    "mov rbx, rax",
    "or rbx, rsi",
    "mov rcx, rdx",
    "or rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_xor_cmpxchg16b,
    "mov rbx, rax",
    "xor rbx, rsi",
    "mov rcx, rdx",
    "xor rcx, r8",
}

atomic_rmw_cas_2! {
    atomic_not_cmpxchg16b,
    "mov rbx, rax",
    "not rbx",
    "mov rcx, rdx",
    "not rcx",
}
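// 128-bit negation: negate the low half with neg (which sets CF when rax != 0), then
// compute the high half as 0 - rdx - CF (mov does not affect flags), i.e. two's
// complement carried across the two 64-bit halves.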
atomic_rmw_cas_2! {
    atomic_neg_cmpxchg16b,
    "mov rbx, rax",
    "neg rbx",
    "mov rcx, 0",
    "sbb rcx, rdx",
}

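// 128-bit max/min: compare val (rsi:r8) with the loaded previous value (rax:rdx) by
// computing val - prev with cmp + sbb; the sbb result is discarded (rcx is reloaded from
// r8 right after) and only the flags are kept. cmov then selects, half by half, either
// val or prev as the value to store, using the signed (l/ge) or unsigned (b/ae)
// condition as appropriate.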
atomic_rmw_cas_3! {
    atomic_max_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovl rcx, rdx",
    "mov rbx, rsi",
    "cmovl rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_umax_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovb rcx, rdx",
    "mov rbx, rsi",
    "cmovb rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_min_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovge rcx, rdx",
    "mov rbx, rsi",
    "cmovge rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_umin_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovae rcx, rdx",
    "mov rbx, rsi",
    "cmovae rbx, rax",
}

macro_rules! select_atomic_rmw {
    (
        unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?;
        cmpxchg16b = $cmpxchg16b_fn:ident;
        fallback = $seqcst_fallback_fn:ident;
    ) => {
        // If cmpxchg16b is available at compile-time, we can always use cmpxchg16b_fn.
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        use $cmpxchg16b_fn as $name;
        // Otherwise, we need to do run-time detection and can use cmpxchg16b_fn only if cmpxchg16b is available.
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        #[inline]
        unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? {
            fn_alias! {
                // See cmpxchg16b() for target_feature(enable).
                #[cfg_attr(
                    not(portable_atomic_no_cmpxchg16b_target_feature),
                    target_feature(enable = "cmpxchg16b")
                )]
                unsafe fn($($arg)*) $(-> $ret_ty)?;
                // cmpxchg16b is always SeqCst.
                cmpxchg16b_seqcst_fn = $cmpxchg16b_fn(Ordering::SeqCst);
            }
            // SAFETY: the caller must uphold the safety contract.
            // We only call cmpxchg16b_fn if cmpxchg16b is available.
            unsafe {
                ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? {
                    if detect::detect().has_cmpxchg16b() {
                        cmpxchg16b_seqcst_fn
                    } else {
                        // Use SeqCst because cmpxchg16b is always SeqCst.
                        fallback::$seqcst_fallback_fn
                    }
                })
            }
        }
    };
}

select_atomic_rmw! {
    unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_swap_cmpxchg16b;
    fallback = atomic_swap_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_add_cmpxchg16b;
    fallback = atomic_add_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_sub_cmpxchg16b;
    fallback = atomic_sub_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_and_cmpxchg16b;
    fallback = atomic_and_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_nand_cmpxchg16b;
    fallback = atomic_nand_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_or_cmpxchg16b;
    fallback = atomic_or_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_xor_cmpxchg16b;
    fallback = atomic_xor_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_max_cmpxchg16b;
    fallback = atomic_max_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umax_cmpxchg16b;
    fallback = atomic_umax_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_min_cmpxchg16b;
    fallback = atomic_min_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umin_cmpxchg16b;
    fallback = atomic_umin_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_not(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_not_cmpxchg16b;
    fallback = atomic_not_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_neg(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_neg_cmpxchg16b;
    fallback = atomic_neg_seqcst;
}

#[inline]
fn is_lock_free() -> bool {
    #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
    {
        // CMPXCHG16B is available at compile-time.
        true
    }
    #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
    {
        detect::detect().has_cmpxchg16b()
    }
}
const IS_ALWAYS_LOCK_FREE: bool =
    cfg!(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"));

atomic128!(AtomicI128, i128, atomic_max, atomic_min);
atomic128!(AtomicU128, u128, atomic_umax, atomic_umin);

#[allow(clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)]
#[cfg(test)]
mod tests {
    use super::*;

    test_atomic_int!(i128);
    test_atomic_int!(u128);

    // load/store/swap implementation is not affected by signedness, so it is
    // enough to test only unsigned types.
    stress_test!(u128);
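
    // An extra, minimal sanity check added for illustration (a sketch; the macro-based
    // tests above are the real test suite). It exercises the raw functions defined in
    // this file directly; a `repr(align(16))` local provides the 16-byte alignment
    // required by their safety contract.
    #[test]
    fn raw_load_store_cas_sketch() {
        #[repr(align(16))]
        struct Aligned(u128);
        let mut x = Aligned(1);
        let p = &mut x.0 as *mut u128;
        // SAFETY: `p` is 16-byte aligned, valid for reads and writes, and there is no
        // concurrent access to it.
        unsafe {
            assert_eq!(atomic_load(p, Ordering::SeqCst), 1);
            atomic_store(p, 2, Ordering::SeqCst);
            assert_eq!(
                atomic_compare_exchange(p, 2, 3, Ordering::SeqCst, Ordering::SeqCst),
                Ok(2)
            );
            assert_eq!(
                atomic_compare_exchange(p, 2, 4, Ordering::SeqCst, Ordering::SeqCst),
                Err(3)
            );
            assert_eq!(atomic_load(p, Ordering::SeqCst), 3);
        }
    }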
}