; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC

; This file checks that atomic (non-seq_cst) stores of immediate values are
; done in a single mov instruction rather than two. More precisely, it makes
; sure that the immediate is not first copied uselessly into a register.

; Similarly, it checks that a binary operation between an immediate and an
; atomic variable, whose result is stored back into that variable, is done as
; a single instruction.
; For example: x.store(42 + x.load(memory_order_acquire), memory_order_release)
; should be just an add instruction, instead of loading x into a register,
; doing an add, and storing the result back.
; The binary operations currently supported are add, and, or, xor.
; sub is not supported because it is translated into an addition of the
; negated immediate.
;
; We also check the same patterns:
; - For inc/dec.
; - For register instead of immediate operands.
; - For floating-point operations.

; seq_cst stores are left as (lock) xchgl, but we try to check every other
; attribute at least once.

; Please note that these operations do not require the lock prefix: only
; sequentially consistent stores require this kind of protection on X86.
; And even for seq_cst operations, LLVM uses the xchg instruction, which has
; an implicit lock prefix, so making it explicit is not required.

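; As a rough illustration only (not part of the checked test input, and with
; illustrative function names), the kind of C++ source that gives rise to
; these patterns looks like the following, assuming '#include <atomic>' and a
; std::atomic<int> named x:
;
;   void store_imm(std::atomic<int> &x) {
;     x.store(42, std::memory_order_release);          // expect one movl
;   }
;   void add_imm(std::atomic<int> &x) {
;     x.store(2 + x.load(std::memory_order_acquire),
;             std::memory_order_release);              // expect one addl
;   }
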
define void @store_atomic_imm_8(i8* %p) {
; X64-LABEL: store_atomic_imm_8:
; X64: movb
; X64-NOT: movb
; X32-LABEL: store_atomic_imm_8:
; X32: movb
; X32-NOT: movb
  store atomic i8 42, i8* %p release, align 1
  ret void
}

define void @store_atomic_imm_16(i16* %p) {
; X64-LABEL: store_atomic_imm_16:
; X64: movw
; X64-NOT: movw
; X32-LABEL: store_atomic_imm_16:
; X32: movw
; X32-NOT: movw
  store atomic i16 42, i16* %p monotonic, align 2
  ret void
}

define void @store_atomic_imm_32(i32* %p) {
; X64-LABEL: store_atomic_imm_32:
; X64: movl
; X64-NOT: movl
;   On 32-bit targets, there is an extra movl for each of these functions
;   (probably for alignment reasons).
; X32-LABEL: store_atomic_imm_32:
; X32: movl 4(%esp), %eax
; X32: movl
; X32-NOT: movl
  store atomic i32 42, i32* %p release, align 4
  ret void
}

define void @store_atomic_imm_64(i64* %p) {
; X64-LABEL: store_atomic_imm_64:
; X64: movq
; X64-NOT: movq
;   These are implemented with a CAS loop on 32-bit architectures, and thus
;   cannot be optimized in the same way as the others.
; X32-LABEL: store_atomic_imm_64:
; X32: cmpxchg8b
  store atomic i64 42, i64* %p release, align 8
  ret void
}

; If an immediate is too big to fit in 32 bits, it cannot be stored in one
; mov: even on X64, one must use movabsq, which can only target a register.
define void @store_atomic_imm_64_big(i64* %p) {
; X64-LABEL: store_atomic_imm_64_big:
; X64: movabsq
; X64: movq
  store atomic i64 100000000000, i64* %p monotonic, align 8
  ret void
}

; It would be incorrect to replace a lock xchgl with a plain movl.
define void @store_atomic_imm_32_seq_cst(i32* %p) {
; X64-LABEL: store_atomic_imm_32_seq_cst:
; X64: xchgl
; X32-LABEL: store_atomic_imm_32_seq_cst:
; X32: xchgl
  store atomic i32 42, i32* %p seq_cst, align 4
  ret void
}

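; Illustration only: in C++ terms the case above is roughly the following,
; assuming '#include <atomic>'. A sequentially consistent store needs a full
; fence, so it cannot be lowered to a plain mov; an xchg, with its implicit
; lock prefix, is used instead.
;
;   void store_seq_cst(std::atomic<int> &x) {
;     x.store(42, std::memory_order_seq_cst);          // lowered to xchgl
;   }
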
; ----- ADD -----

define void @add_8i(i8* %p) {
; X64-LABEL: add_8i:
; X64-NOT: lock
; X64: addb
; X64-NOT: movb
; X32-LABEL: add_8i:
; X32-NOT: lock
; X32: addb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p seq_cst, align 1
  %2 = add i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @add_8r(i8* %p, i8 %v) {
; X64-LABEL: add_8r:
; X64-NOT: lock
; X64: addb
; X64-NOT: movb
; X32-LABEL: add_8r:
; X32-NOT: lock
; X32: addb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p seq_cst, align 1
  %2 = add i8 %1, %v
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @add_16i(i16* %p) {
;   Currently the transformation is not done on 16-bit accesses, as the
;   backend treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: add_16i:
; X64-NOT: addw
; X32-LABEL: add_16i:
; X32-NOT: addw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = add i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @add_16r(i16* %p, i16 %v) {
;   Currently the transformation is not done on 16-bit accesses, as the
;   backend treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: add_16r:
; X64-NOT: addw
; X32-LABEL: add_16r:
; X32-NOT: addw [.*], (
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = add i16 %1, %v
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @add_32i(i32* %p) {
; X64-LABEL: add_32i:
; X64-NOT: lock
; X64: addl
; X64-NOT: movl
; X32-LABEL: add_32i:
; X32-NOT: lock
; X32: addl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, 2
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @add_32r(i32* %p, i32 %v) {
; X64-LABEL: add_32r:
; X64-NOT: lock
; X64: addl
; X64-NOT: movl
; X32-LABEL: add_32r:
; X32-NOT: lock
; X32: addl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, %v
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

; The following is a corner case where the load is added to itself. The pattern
; matching should not fold this. We only test with 32-bit add, but the same
; applies to other sizes and operations.
define void @add_32r_self(i32* %p) {
; X64-LABEL: add_32r_self:
; X64-NOT: lock
; X64: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]]
; X64: addl %[[R]], %[[R]]
; X64: movl %[[R]], (%[[M]])
; X32-LABEL: add_32r_self:
; X32-NOT: lock
; X32: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]]
; X32: addl %[[R]], %[[R]]
; X32: movl %[[R]], (%[[M]])
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, %1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

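; Illustration only: in C++ terms, the add_32r_self corner case above is
; roughly
;
;   void add_self(std::atomic<int> &x) {
;     int t = x.load(std::memory_order_acquire);
;     x.store(t + t, std::memory_order_relaxed);       // both operands come from one load
;   }
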
; The following is a corner case where the load's result is returned. The
; optimizer isn't allowed to duplicate the load because it's atomic.
define i32 @add_32r_ret_load(i32* %p, i32 %v) {
; X64-LABEL: add_32r_ret_load:
; X64-NOT: lock
; X64:      movl (%rdi), %eax
; X64-NEXT: addl %eax, %esi
; X64-NEXT: movl %esi, (%rdi)
; X64-NEXT: retq
; X32-LABEL: add_32r_ret_load:
; X32-NOT: lock
; X32:      movl 4(%esp), %[[P:[a-z]+]]
; X32-NEXT: movl (%[[P]]),
; X32-NOT: %[[P]]
; More code here; we just don't want it to load from P.
; X32: movl %{{.*}}, (%[[P]])
; X32-NEXT: retl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, %v
  store atomic i32 %2, i32* %p monotonic, align 4
  ret i32 %1
}

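; Illustration only: the add_32r_ret_load case above roughly corresponds to
;
;   int add_and_return(std::atomic<int> &x, int v) {
;     int old = x.load(std::memory_order_acquire);
;     x.store(old + v, std::memory_order_relaxed);
;     return old;                                      // the loaded value is still needed
;   }
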
define void @add_64i(i64* %p) {
; X64-LABEL: add_64i:
; X64-NOT: lock
; X64: addq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'addq'.
; X32-LABEL: add_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = add i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @add_64r(i64* %p, i64 %v) {
; X64-LABEL: add_64r:
; X64-NOT: lock
; X64: addq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'addq'.
; X32-LABEL: add_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = add i64 %1, %v
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @add_32i_seq_cst(i32* %p) {
; X64-LABEL: add_32i_seq_cst:
; X64: xchgl
; X32-LABEL: add_32i_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = add i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

define void @add_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: add_32r_seq_cst:
; X64: xchgl
; X32-LABEL: add_32r_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = add i32 %1, %v
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- AND -----

define void @and_8i(i8* %p) {
; X64-LABEL: and_8i:
; X64-NOT: lock
; X64: andb
; X64-NOT: movb
; X32-LABEL: and_8i:
; X32-NOT: lock
; X32: andb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p monotonic, align 1
  %2 = and i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @and_8r(i8* %p, i8 %v) {
; X64-LABEL: and_8r:
; X64-NOT: lock
; X64: andb
; X64-NOT: movb
; X32-LABEL: and_8r:
; X32-NOT: lock
; X32: andb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p monotonic, align 1
  %2 = and i8 %1, %v
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @and_16i(i16* %p) {
;   Currently the transformation is not done on 16-bit accesses, as the
;   backend treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: and_16i:
; X64-NOT: andw
; X32-LABEL: and_16i:
; X32-NOT: andw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = and i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @and_16r(i16* %p, i16 %v) {
;   Currently the transformation is not done on 16-bit accesses, as the
;   backend treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: and_16r:
; X64-NOT: andw
; X32-LABEL: and_16r:
; X32-NOT: andw [.*], (
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = and i16 %1, %v
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @and_32i(i32* %p) {
; X64-LABEL: and_32i:
; X64-NOT: lock
; X64: andl
; X64-NOT: movl
; X32-LABEL: and_32i:
; X32-NOT: lock
; X32: andl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = and i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @and_32r(i32* %p, i32 %v) {
; X64-LABEL: and_32r:
; X64-NOT: lock
; X64: andl
; X64-NOT: movl
; X32-LABEL: and_32r:
; X32-NOT: lock
; X32: andl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = and i32 %1, %v
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @and_64i(i64* %p) {
; X64-LABEL: and_64i:
; X64-NOT: lock
; X64: andq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'andq'.
; X32-LABEL: and_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = and i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @and_64r(i64* %p, i64 %v) {
; X64-LABEL: and_64r:
; X64-NOT: lock
; X64: andq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'andq'.
; X32-LABEL: and_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = and i64 %1, %v
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @and_32i_seq_cst(i32* %p) {
; X64-LABEL: and_32i_seq_cst:
; X64: xchgl
; X32-LABEL: and_32i_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = and i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

define void @and_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: and_32r_seq_cst:
; X64: xchgl
; X32-LABEL: and_32r_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = and i32 %1, %v
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- OR -----

define void @or_8i(i8* %p) {
; X64-LABEL: or_8i:
; X64-NOT: lock
; X64: orb
; X64-NOT: movb
; X32-LABEL: or_8i:
; X32-NOT: lock
; X32: orb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p acquire, align 1
  %2 = or i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @or_8r(i8* %p, i8 %v) {
; X64-LABEL: or_8r:
; X64-NOT: lock
; X64: orb
; X64-NOT: movb
; X32-LABEL: or_8r:
; X32-NOT: lock
; X32: orb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p acquire, align 1
  %2 = or i8 %1, %v
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @or_16i(i16* %p) {
; X64-LABEL: or_16i:
; X64-NOT: orw
; X32-LABEL: or_16i:
; X32-NOT: orw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = or i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @or_16r(i16* %p, i16 %v) {
; X64-LABEL: or_16r:
; X64-NOT: orw
; X32-LABEL: or_16r:
; X32-NOT: orw [.*], (
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = or i16 %1, %v
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @or_32i(i32* %p) {
; X64-LABEL: or_32i:
; X64-NOT: lock
; X64: orl
; X64-NOT: movl
; X32-LABEL: or_32i:
; X32-NOT: lock
; X32: orl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = or i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @or_32r(i32* %p, i32 %v) {
; X64-LABEL: or_32r:
; X64-NOT: lock
; X64: orl
; X64-NOT: movl
; X32-LABEL: or_32r:
; X32-NOT: lock
; X32: orl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = or i32 %1, %v
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @or_64i(i64* %p) {
; X64-LABEL: or_64i:
; X64-NOT: lock
; X64: orq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'orq'.
; X32-LABEL: or_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = or i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @or_64r(i64* %p, i64 %v) {
; X64-LABEL: or_64r:
; X64-NOT: lock
; X64: orq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'orq'.
; X32-LABEL: or_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = or i64 %1, %v
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @or_32i_seq_cst(i32* %p) {
; X64-LABEL: or_32i_seq_cst:
; X64: xchgl
; X32-LABEL: or_32i_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = or i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

define void @or_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: or_32r_seq_cst:
; X64: xchgl
; X32-LABEL: or_32r_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = or i32 %1, %v
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- XOR -----

define void @xor_8i(i8* %p) {
; X64-LABEL: xor_8i:
; X64-NOT: lock
; X64: xorb
; X64-NOT: movb
; X32-LABEL: xor_8i:
; X32-NOT: lock
; X32: xorb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p acquire, align 1
  %2 = xor i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @xor_8r(i8* %p, i8 %v) {
; X64-LABEL: xor_8r:
; X64-NOT: lock
; X64: xorb
; X64-NOT: movb
; X32-LABEL: xor_8r:
; X32-NOT: lock
; X32: xorb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p acquire, align 1
  %2 = xor i8 %1, %v
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @xor_16i(i16* %p) {
; X64-LABEL: xor_16i:
; X64-NOT: xorw
; X32-LABEL: xor_16i:
; X32-NOT: xorw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = xor i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @xor_16r(i16* %p, i16 %v) {
; X64-LABEL: xor_16r:
; X64-NOT: xorw
; X32-LABEL: xor_16r:
; X32-NOT: xorw [.*], (
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = xor i16 %1, %v
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @xor_32i(i32* %p) {
; X64-LABEL: xor_32i:
; X64-NOT: lock
; X64: xorl
; X64-NOT: movl
; X32-LABEL: xor_32i:
; X32-NOT: lock
; X32: xorl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = xor i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @xor_32r(i32* %p, i32 %v) {
; X64-LABEL: xor_32r:
; X64-NOT: lock
; X64: xorl
; X64-NOT: movl
; X32-LABEL: xor_32r:
; X32-NOT: lock
; X32: xorl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = xor i32 %1, %v
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @xor_64i(i64* %p) {
; X64-LABEL: xor_64i:
; X64-NOT: lock
; X64: xorq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'xorq'.
; X32-LABEL: xor_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = xor i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @xor_64r(i64* %p, i64 %v) {
; X64-LABEL: xor_64r:
; X64-NOT: lock
; X64: xorq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'xorq'.
; X32-LABEL: xor_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = xor i64 %1, %v
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @xor_32i_seq_cst(i32* %p) {
; X64-LABEL: xor_32i_seq_cst:
; X64: xchgl
; X32-LABEL: xor_32i_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = xor i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

define void @xor_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: xor_32r_seq_cst:
; X64: xchgl
; X32-LABEL: xor_32r_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = xor i32 %1, %v
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- INC -----

define void @inc_8(i8* %p) {
; X64-LABEL: inc_8:
; X64-NOT: lock
; X64: incb
; X64-NOT: movb
; X32-LABEL: inc_8:
; X32-NOT: lock
; X32: incb
; X32-NOT: movb
; SLOW_INC-LABEL: inc_8:
; SLOW_INC-NOT: incb
; SLOW_INC-NOT: movb
  %1 = load atomic i8, i8* %p seq_cst, align 1
  %2 = add i8 %1, 1
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @inc_16(i16* %p) {
;   Currently the transformation is not done on 16-bit accesses, as the
;   backend treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: inc_16:
; X64-NOT: incw
; X32-LABEL: inc_16:
; X32-NOT: incw
; SLOW_INC-LABEL: inc_16:
; SLOW_INC-NOT: incw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = add i16 %1, 1
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @inc_32(i32* %p) {
; X64-LABEL: inc_32:
; X64-NOT: lock
; X64: incl
; X64-NOT: movl
; X32-LABEL: inc_32:
; X32-NOT: lock
; X32: incl
; X32-NOT: movl
; SLOW_INC-LABEL: inc_32:
; SLOW_INC-NOT: incl
; SLOW_INC-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, 1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @inc_64(i64* %p) {
; X64-LABEL: inc_64:
; X64-NOT: lock
; X64: incq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'incq'.
; X32-LABEL: inc_64:
; SLOW_INC-LABEL: inc_64:
; SLOW_INC-NOT: incq
; SLOW_INC-NOT: movq
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = add i64 %1, 1
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @inc_32_seq_cst(i32* %p) {
; X64-LABEL: inc_32_seq_cst:
; X64: xchgl
; X32-LABEL: inc_32_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = add i32 %1, 1
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- DEC -----

define void @dec_8(i8* %p) {
; X64-LABEL: dec_8:
; X64-NOT: lock
; X64: decb
; X64-NOT: movb
; X32-LABEL: dec_8:
; X32-NOT: lock
; X32: decb
; X32-NOT: movb
; SLOW_INC-LABEL: dec_8:
; SLOW_INC-NOT: decb
; SLOW_INC-NOT: movb
  %1 = load atomic i8, i8* %p seq_cst, align 1
  %2 = sub i8 %1, 1
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @dec_16(i16* %p) {
;   Currently the transformation is not done on 16-bit accesses, as the
;   backend treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: dec_16:
; X64-NOT: decw
; X32-LABEL: dec_16:
; X32-NOT: decw
; SLOW_INC-LABEL: dec_16:
; SLOW_INC-NOT: decw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = sub i16 %1, 1
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @dec_32(i32* %p) {
; X64-LABEL: dec_32:
; X64-NOT: lock
; X64: decl
; X64-NOT: movl
; X32-LABEL: dec_32:
; X32-NOT: lock
; X32: decl
; X32-NOT: movl
; SLOW_INC-LABEL: dec_32:
; SLOW_INC-NOT: decl
; SLOW_INC-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = sub i32 %1, 1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @dec_64(i64* %p) {
; X64-LABEL: dec_64:
; X64-NOT: lock
; X64: decq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'decq'.
; X32-LABEL: dec_64:
; SLOW_INC-LABEL: dec_64:
; SLOW_INC-NOT: decq
; SLOW_INC-NOT: movq
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = sub i64 %1, 1
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @dec_32_seq_cst(i32* %p) {
; X64-LABEL: dec_32_seq_cst:
; X64: xchgl
; X32-LABEL: dec_32_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = sub i32 %1, 1
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- FADD -----

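; Illustration only: the floating-point cases below model the pattern a
; frontend emits for atomic float/double accesses, where the value travels
; through a same-width integer load/store with bitcasts around the fadd.
; In C++ terms (assuming '#include <atomic>'), roughly:
;
;   void fadd_val(std::atomic<float> &x, float v) {
;     float old = x.load(std::memory_order_seq_cst);   // i32 load + bitcast
;     x.store(old + v, std::memory_order_release);     // bitcast + i32 store
;   }
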
define void @fadd_32r(float* %loc, float %val) {
; X64-LABEL: fadd_32r:
; X64-NOT: lock
; X64-NOT: mov
; X64: addss (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]]
; X64-NEXT: movss %[[XMM]], (%[[M]])
; X32-LABEL: fadd_32r:
; Don't check x86-32.
; LLVM's SSE handling is conservative on x86-32 even without using atomics.
  %floc = bitcast float* %loc to i32*
  %1 = load atomic i32, i32* %floc seq_cst, align 4
  %2 = bitcast i32 %1 to float
  %add = fadd float %2, %val
  %3 = bitcast float %add to i32
  store atomic i32 %3, i32* %floc release, align 4
  ret void
}

define void @fadd_64r(double* %loc, double %val) {
; X64-LABEL: fadd_64r:
; X64-NOT: lock
; X64-NOT: mov
; X64: addsd (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]]
; X64-NEXT: movsd %[[XMM]], (%[[M]])
; X32-LABEL: fadd_64r:
; Don't check x86-32 (see comment above).
  %floc = bitcast double* %loc to i64*
  %1 = load atomic i64, i64* %floc seq_cst, align 8
  %2 = bitcast i64 %1 to double
  %add = fadd double %2, %val
  %3 = bitcast double %add to i64
  store atomic i64 %3, i64* %floc release, align 8
  ret void
}

@glob32 = global float 0.000000e+00, align 4
@glob64 = global double 0.000000e+00, align 8

; Floating-point add to a global using an immediate.
define void @fadd_32g() {
; X64-LABEL: fadd_32g:
; X64-NOT: lock
; X64:      movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss glob32(%rip), %[[XMM]]
; X64-NEXT: movss %[[XMM]], glob32(%rip)
; X32-LABEL: fadd_32g:
; Don't check x86-32 (see comment above).
  %i = load atomic i32, i32* bitcast (float* @glob32 to i32*) monotonic, align 4
  %f = bitcast i32 %i to float
  %add = fadd float %f, 1.000000e+00
  %s = bitcast float %add to i32
  store atomic i32 %s, i32* bitcast (float* @glob32 to i32*) monotonic, align 4
  ret void
}

define void @fadd_64g() {
; X64-LABEL: fadd_64g:
; X64-NOT: lock
; X64:      movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd glob64(%rip), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], glob64(%rip)
; X32-LABEL: fadd_64g:
; Don't check x86-32 (see comment above).
  %i = load atomic i64, i64* bitcast (double* @glob64 to i64*) monotonic, align 8
  %f = bitcast i64 %i to double
  %add = fadd double %f, 1.000000e+00
  %s = bitcast double %add to i64
  store atomic i64 %s, i64* bitcast (double* @glob64 to i64*) monotonic, align 8
  ret void
}

; Floating-point add to a hard-coded immediate location using an immediate.
define void @fadd_32imm() {
; X64-LABEL: fadd_32imm:
; X64-NOT: lock
; X64:      movl $3735928559, %e[[M:[a-z]+]]
; X64:      movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss (%r[[M]]), %[[XMM]]
; X64-NEXT: movss %[[XMM]], (%r[[M]])
; X32-LABEL: fadd_32imm:
; Don't check x86-32 (see comment above).
  %i = load atomic i32, i32* inttoptr (i32 3735928559 to i32*) monotonic, align 4
  %f = bitcast i32 %i to float
  %add = fadd float %f, 1.000000e+00
  %s = bitcast float %add to i32
  store atomic i32 %s, i32* inttoptr (i32 3735928559 to i32*) monotonic, align 4
  ret void
}

define void @fadd_64imm() {
; X64-LABEL: fadd_64imm:
; X64-NOT: lock
; X64:      movl $3735928559, %e[[M:[a-z]+]]
; X64:      movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd (%r[[M]]), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], (%r[[M]])
; X32-LABEL: fadd_64imm:
; Don't check x86-32 (see comment above).
  %i = load atomic i64, i64* inttoptr (i64 3735928559 to i64*) monotonic, align 8
  %f = bitcast i64 %i to double
  %add = fadd double %f, 1.000000e+00
  %s = bitcast double %add to i64
  store atomic i64 %s, i64* inttoptr (i64 3735928559 to i64*) monotonic, align 8
  ret void
}

; Floating-point add to a stack location.
define void @fadd_32stack() {
; X64-LABEL: fadd_32stack:
; X64-NOT: lock
; X64:      movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss [[STACKOFF:-?[0-9]+]](%rsp), %[[XMM]]
; X64-NEXT: movss %[[XMM]], [[STACKOFF]](%rsp)
; X32-LABEL: fadd_32stack:
; Don't check x86-32 (see comment above).
  %ptr = alloca i32, align 4
  %bc3 = bitcast i32* %ptr to float*
  %load = load atomic i32, i32* %ptr acquire, align 4
  %bc0 = bitcast i32 %load to float
  %fadd = fadd float 1.000000e+00, %bc0
  %bc1 = bitcast float %fadd to i32
  store atomic i32 %bc1, i32* %ptr release, align 4
  ret void
}

define void @fadd_64stack() {
; X64-LABEL: fadd_64stack:
; X64-NOT: lock
; X64:      movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd [[STACKOFF:-?[0-9]+]](%rsp), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], [[STACKOFF]](%rsp)
; X32-LABEL: fadd_64stack:
; Don't check x86-32 (see comment above).
  %ptr = alloca i64, align 8
  %bc3 = bitcast i64* %ptr to double*
  %load = load atomic i64, i64* %ptr acquire, align 8
  %bc0 = bitcast i64 %load to double
  %fadd = fadd double 1.000000e+00, %bc0
  %bc1 = bitcast double %fadd to i64
  store atomic i64 %bc1, i64* %ptr release, align 8
  ret void
}

define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) {
; X64-LABEL: fadd_array:
; X64-NOT: lock
; X64: addsd ([[ADDR:%r..,%r..,8]]), %[[XMM:xmm[0-9]+]]
; X64-NEXT: movsd %[[XMM]], ([[ADDR]])
; X32-LABEL: fadd_array:
; Don't check x86-32 (see comment above).
bb:
  %tmp4 = getelementptr inbounds i64, i64* %arg, i64 %arg2
  %tmp6 = load atomic i64, i64* %tmp4 monotonic, align 8
  %tmp7 = bitcast i64 %tmp6 to double
  %tmp8 = fadd double %tmp7, %arg1
  %tmp9 = bitcast double %tmp8 to i64
  store atomic i64 %tmp9, i64* %tmp4 monotonic, align 8
  ret void
}