; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC

; This file checks that atomic (non-seq_cst) stores of immediate values are
; done in one mov instruction and not two. More precisely, it makes sure that
; the immediate is not first copied uselessly into a register.

; Similarly, it checks that a binary operation between an immediate and an
; atomic variable, with the result stored back into that variable, is done as
; a single instruction.
; For example: x.store(42 + x.load(memory_order_acquire), memory_order_release)
; should be just an add instruction, instead of loading x into a register,
; doing an add and storing the result back.
; The binary operations currently supported are add, and, or, and xor.
; sub is not supported because it is translated into an addition of the
; negated immediate.
;
; We also check the same patterns:
; - For inc/dec.
; - For register instead of immediate operands.
; - For floating point operations.

; seq_cst stores are left as (lock) xchgl, but we try to check every other
; attribute at least once.

; Please note that these operations do not require the lock prefix: only
; sequentially consistent stores require this kind of protection on X86.
; And even for seq_cst operations, LLVM uses the xchg instruction, which has
; an implicit lock prefix, so making it explicit is not required.

define void @store_atomic_imm_8(i8* %p) {
; X64-LABEL: store_atomic_imm_8:
; X64: movb
; X64-NOT: movb
; X32-LABEL: store_atomic_imm_8:
; X32: movb
; X32-NOT: movb
  store atomic i8 42, i8* %p release, align 1
  ret void
}

define void @store_atomic_imm_16(i16* %p) {
; X64-LABEL: store_atomic_imm_16:
; X64: movw
; X64-NOT: movw
; X32-LABEL: store_atomic_imm_16:
; X32: movw
; X32-NOT: movw
  store atomic i16 42, i16* %p monotonic, align 2
  ret void
}

define void @store_atomic_imm_32(i32* %p) {
; X64-LABEL: store_atomic_imm_32:
; X64: movl
; X64-NOT: movl
; On 32-bit targets, there is an extra movl for each of those functions,
; to load the pointer argument from the stack.
; X32-LABEL: store_atomic_imm_32:
; X32: movl 4(%esp), %eax
; X32: movl
; X32-NOT: movl
  store atomic i32 42, i32* %p release, align 4
  ret void
}

define void @store_atomic_imm_64(i64* %p) {
; X64-LABEL: store_atomic_imm_64:
; X64: movq
; X64-NOT: movq
; These are implemented with a CAS loop on 32-bit architectures, and thus
; cannot be optimized in the same way as the others.
; X32-LABEL: store_atomic_imm_64:
; X32: cmpxchg8b
  store atomic i64 42, i64* %p release, align 8
  ret void
}

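; As an illustrative reference for the immediate-store tests above (not
; checked by FileCheck, and assuming the x86-64 SysV calling convention where
; %rdi holds %p), the desired codegen is a single immediate-to-memory mov:
;   movl $42, (%rdi)
; rather than first materializing the immediate in a register:
;   movl $42, %eax
;   movl %eax, (%rdi)
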
; If an immediate is too big to fit in 32 bits, it cannot be stored in one mov:
; even on X64, one must use movabsq, which can only target a register.
define void @store_atomic_imm_64_big(i64* %p) {
; X64-LABEL: store_atomic_imm_64_big:
; X64: movabsq
; X64: movq
  store atomic i64 100000000000, i64* %p monotonic, align 8
  ret void
}

; It would be incorrect to replace a lock xchgl with a movl.
define void @store_atomic_imm_32_seq_cst(i32* %p) {
; X64-LABEL: store_atomic_imm_32_seq_cst:
; X64: xchgl
; X32-LABEL: store_atomic_imm_32_seq_cst:
; X32: xchgl
  store atomic i32 42, i32* %p seq_cst, align 4
  ret void
}

; ----- ADD -----

define void @add_8i(i8* %p) {
; X64-LABEL: add_8i:
; X64-NOT: lock
; X64: addb
; X64-NOT: movb
; X32-LABEL: add_8i:
; X32-NOT: lock
; X32: addb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p seq_cst, align 1
  %2 = add i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @add_8r(i8* %p, i8 %v) {
; X64-LABEL: add_8r:
; X64-NOT: lock
; X64: addb
; X64-NOT: movb
; X32-LABEL: add_8r:
; X32-NOT: lock
; X32: addb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p seq_cst, align 1
  %2 = add i8 %1, %v
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @add_16i(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: add_16i:
; X64-NOT: addw
; X32-LABEL: add_16i:
; X32-NOT: addw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = add i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @add_16r(i16* %p, i16 %v) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: add_16r:
; X64-NOT: addw
; X32-LABEL: add_16r:
; X32-NOT: addw [.*], (
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = add i16 %1, %v
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @add_32i(i32* %p) {
; X64-LABEL: add_32i:
; X64-NOT: lock
; X64: addl
; X64-NOT: movl
; X32-LABEL: add_32i:
; X32-NOT: lock
; X32: addl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, 2
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @add_32r(i32* %p, i32 %v) {
; X64-LABEL: add_32r:
; X64-NOT: lock
; X64: addl
; X64-NOT: movl
; X32-LABEL: add_32r:
; X32-NOT: lock
; X32: addl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, %v
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

; The following is a corner case where the load is added to itself. The pattern
; matching should not fold this. We only test with 32-bit add, but the same
; applies to other sizes and operations.
define void @add_32r_self(i32* %p) {
; X64-LABEL: add_32r_self:
; X64-NOT: lock
; X64: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]]
; X64: addl %[[R]], %[[R]]
; X64: movl %[[R]], (%[[M]])
; X32-LABEL: add_32r_self:
; X32-NOT: lock
; X32: movl (%[[M:[a-z]+]]), %[[R:[a-z]+]]
; X32: addl %[[R]], %[[R]]
; X32: movl %[[R]], (%[[M]])
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, %1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

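; For the *i/*r read-modify-write tests in this file, the intent (shown here
; only as an illustrative sketch, not checked by FileCheck) is a single
; memory-destination instruction without a lock prefix, e.g. on x86-64:
;   addl $2, (%rdi)
; instead of the three-instruction load / add / store sequence through a
; register. The corner cases around this point (add_32r_self above and
; add_32r_ret_load below) are the exceptions where this folding must not
; happen.
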
; The following is a corner case where the load's result is returned. The
; optimizer isn't allowed to duplicate the load because it's atomic.
define i32 @add_32r_ret_load(i32* %p, i32 %v) {
; X64-LABEL: add_32r_ret_load:
; X64-NOT: lock
; X64: movl (%rdi), %eax
; X64-NEXT: addl %eax, %esi
; X64-NEXT: movl %esi, (%rdi)
; X64-NEXT: retq
; X32-LABEL: add_32r_ret_load:
; X32-NOT: lock
; X32: movl 4(%esp), %[[P:[a-z]+]]
; X32-NEXT: movl (%[[P]]),
; X32-NOT: %[[P]]
; More code here; we just don't want it to load from P.
; X32: movl %{{.*}}, (%[[P]])
; X32-NEXT: retl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, %v
  store atomic i32 %2, i32* %p monotonic, align 4
  ret i32 %1
}

define void @add_64i(i64* %p) {
; X64-LABEL: add_64i:
; X64-NOT: lock
; X64: addq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'addq'.
; X32-LABEL: add_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = add i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @add_64r(i64* %p, i64 %v) {
; X64-LABEL: add_64r:
; X64-NOT: lock
; X64: addq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'addq'.
; X32-LABEL: add_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = add i64 %1, %v
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @add_32i_seq_cst(i32* %p) {
; X64-LABEL: add_32i_seq_cst:
; X64: xchgl
; X32-LABEL: add_32i_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = add i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

define void @add_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: add_32r_seq_cst:
; X64: xchgl
; X32-LABEL: add_32r_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = add i32 %1, %v
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- AND -----

define void @and_8i(i8* %p) {
; X64-LABEL: and_8i:
; X64-NOT: lock
; X64: andb
; X64-NOT: movb
; X32-LABEL: and_8i:
; X32-NOT: lock
; X32: andb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p monotonic, align 1
  %2 = and i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @and_8r(i8* %p, i8 %v) {
; X64-LABEL: and_8r:
; X64-NOT: lock
; X64: andb
; X64-NOT: movb
; X32-LABEL: and_8r:
; X32-NOT: lock
; X32: andb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p monotonic, align 1
  %2 = and i8 %1, %v
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @and_16i(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: and_16i:
; X64-NOT: andw
; X32-LABEL: and_16i:
; X32-NOT: andw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = and i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

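; Note: the 16-bit tests in this file are negative checks only; the backend
; deliberately avoids forming 16-bit memory-destination arithmetic, most
; likely because 16-bit immediate operations carry an operand-size prefix
; associated with length-changing-prefix stalls on common Intel
; micro-architectures (this rationale is a summary, not something the test
; verifies).
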
define void @and_16r(i16* %p, i16 %v) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: and_16r:
; X64-NOT: andw
; X32-LABEL: and_16r:
; X32-NOT: andw [.*], (
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = and i16 %1, %v
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @and_32i(i32* %p) {
; X64-LABEL: and_32i:
; X64-NOT: lock
; X64: andl
; X64-NOT: movl
; X32-LABEL: and_32i:
; X32-NOT: lock
; X32: andl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = and i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @and_32r(i32* %p, i32 %v) {
; X64-LABEL: and_32r:
; X64-NOT: lock
; X64: andl
; X64-NOT: movl
; X32-LABEL: and_32r:
; X32-NOT: lock
; X32: andl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = and i32 %1, %v
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @and_64i(i64* %p) {
; X64-LABEL: and_64i:
; X64-NOT: lock
; X64: andq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'andq'.
; X32-LABEL: and_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = and i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @and_64r(i64* %p, i64 %v) {
; X64-LABEL: and_64r:
; X64-NOT: lock
; X64: andq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'andq'.
; X32-LABEL: and_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = and i64 %1, %v
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @and_32i_seq_cst(i32* %p) {
; X64-LABEL: and_32i_seq_cst:
; X64: xchgl
; X32-LABEL: and_32i_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = and i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

define void @and_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: and_32r_seq_cst:
; X64: xchgl
; X32-LABEL: and_32r_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = and i32 %1, %v
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- OR -----

define void @or_8i(i8* %p) {
; X64-LABEL: or_8i:
; X64-NOT: lock
; X64: orb
; X64-NOT: movb
; X32-LABEL: or_8i:
; X32-NOT: lock
; X32: orb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p acquire, align 1
  %2 = or i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @or_8r(i8* %p, i8 %v) {
; X64-LABEL: or_8r:
; X64-NOT: lock
; X64: orb
; X64-NOT: movb
; X32-LABEL: or_8r:
; X32-NOT: lock
; X32: orb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p acquire, align 1
  %2 = or i8 %1, %v
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @or_16i(i16* %p) {
; X64-LABEL: or_16i:
; X64-NOT: orw
; X32-LABEL: or_16i:
; X32-NOT: orw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = or i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @or_16r(i16* %p, i16 %v) {
; X64-LABEL: or_16r:
; X64-NOT: orw
; X32-LABEL: or_16r:
; X32-NOT: orw [.*], (
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = or i16 %1, %v
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @or_32i(i32* %p) {
; X64-LABEL: or_32i:
; X64-NOT: lock
; X64: orl
; X64-NOT: movl
; X32-LABEL: or_32i:
; X32-NOT: lock
; X32: orl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = or i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @or_32r(i32* %p, i32 %v) {
; X64-LABEL: or_32r:
; X64-NOT: lock
; X64: orl
; X64-NOT: movl
; X32-LABEL: or_32r:
; X32-NOT: lock
; X32: orl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = or i32 %1, %v
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @or_64i(i64* %p) {
; X64-LABEL: or_64i:
; X64-NOT: lock
; X64: orq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'orq'.
; X32-LABEL: or_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = or i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @or_64r(i64* %p, i64 %v) {
; X64-LABEL: or_64r:
; X64-NOT: lock
; X64: orq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'orq'.
; X32-LABEL: or_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = or i64 %1, %v
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @or_32i_seq_cst(i32* %p) {
; X64-LABEL: or_32i_seq_cst:
; X64: xchgl
; X32-LABEL: or_32i_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = or i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

define void @or_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: or_32r_seq_cst:
; X64: xchgl
; X32-LABEL: or_32r_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = or i32 %1, %v
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- XOR -----

define void @xor_8i(i8* %p) {
; X64-LABEL: xor_8i:
; X64-NOT: lock
; X64: xorb
; X64-NOT: movb
; X32-LABEL: xor_8i:
; X32-NOT: lock
; X32: xorb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p acquire, align 1
  %2 = xor i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @xor_8r(i8* %p, i8 %v) {
; X64-LABEL: xor_8r:
; X64-NOT: lock
; X64: xorb
; X64-NOT: movb
; X32-LABEL: xor_8r:
; X32-NOT: lock
; X32: xorb
; X32-NOT: movb
  %1 = load atomic i8, i8* %p acquire, align 1
  %2 = xor i8 %1, %v
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @xor_16i(i16* %p) {
; X64-LABEL: xor_16i:
; X64-NOT: xorw
; X32-LABEL: xor_16i:
; X32-NOT: xorw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = xor i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @xor_16r(i16* %p, i16 %v) {
; X64-LABEL: xor_16r:
; X64-NOT: xorw
; X32-LABEL: xor_16r:
; X32-NOT: xorw [.*], (
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = xor i16 %1, %v
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @xor_32i(i32* %p) {
; X64-LABEL: xor_32i:
; X64-NOT: lock
; X64: xorl
; X64-NOT: movl
; X32-LABEL: xor_32i:
; X32-NOT: lock
; X32: xorl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = xor i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @xor_32r(i32* %p, i32 %v) {
; X64-LABEL: xor_32r:
; X64-NOT: lock
; X64: xorl
; X64-NOT: movl
; X32-LABEL: xor_32r:
; X32-NOT: lock
; X32: xorl
; X32-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = xor i32 %1, %v
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @xor_64i(i64* %p) {
; X64-LABEL: xor_64i:
; X64-NOT: lock
; X64: xorq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'xorq'.
; X32-LABEL: xor_64i:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = xor i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @xor_64r(i64* %p, i64 %v) {
; X64-LABEL: xor_64r:
; X64-NOT: lock
; X64: xorq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'xorq'.
; X32-LABEL: xor_64r:
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = xor i64 %1, %v
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @xor_32i_seq_cst(i32* %p) {
; X64-LABEL: xor_32i_seq_cst:
; X64: xchgl
; X32-LABEL: xor_32i_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = xor i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

define void @xor_32r_seq_cst(i32* %p, i32 %v) {
; X64-LABEL: xor_32r_seq_cst:
; X64: xchgl
; X32-LABEL: xor_32r_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = xor i32 %1, %v
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- INC -----

define void @inc_8(i8* %p) {
; X64-LABEL: inc_8:
; X64-NOT: lock
; X64: incb
; X64-NOT: movb
; X32-LABEL: inc_8:
; X32-NOT: lock
; X32: incb
; X32-NOT: movb
; SLOW_INC-LABEL: inc_8:
; SLOW_INC-NOT: incb
; SLOW_INC-NOT: movb
  %1 = load atomic i8, i8* %p seq_cst, align 1
  %2 = add i8 %1, 1
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @inc_16(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: inc_16:
; X64-NOT: incw
; X32-LABEL: inc_16:
; X32-NOT: incw
; SLOW_INC-LABEL: inc_16:
; SLOW_INC-NOT: incw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = add i16 %1, 1
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @inc_32(i32* %p) {
; X64-LABEL: inc_32:
; X64-NOT: lock
; X64: incl
; X64-NOT: movl
; X32-LABEL: inc_32:
; X32-NOT: lock
; X32: incl
; X32-NOT: movl
; SLOW_INC-LABEL: inc_32:
; SLOW_INC-NOT: incl
; SLOW_INC-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = add i32 %1, 1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

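; With -mattr=slow-incdec (the SLOW_INC run line), the expectation is that the
; backend selects an add/sub of 1 instead of inc/dec; the SLOW_INC checks only
; assert that inc*/dec* and a superfluous mov* are absent. An illustrative
; (unchecked) lowering for inc_32 in that mode, with %p in %rdi, would be:
;   addl $1, (%rdi)
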
define void @inc_64(i64* %p) {
; X64-LABEL: inc_64:
; X64-NOT: lock
; X64: incq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'incq'.
; X32-LABEL: inc_64:
; SLOW_INC-LABEL: inc_64:
; SLOW_INC-NOT: incq
; SLOW_INC-NOT: movq
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = add i64 %1, 1
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @inc_32_seq_cst(i32* %p) {
; X64-LABEL: inc_32_seq_cst:
; X64: xchgl
; X32-LABEL: inc_32_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = add i32 %1, 1
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- DEC -----

define void @dec_8(i8* %p) {
; X64-LABEL: dec_8:
; X64-NOT: lock
; X64: decb
; X64-NOT: movb
; X32-LABEL: dec_8:
; X32-NOT: lock
; X32: decb
; X32-NOT: movb
; SLOW_INC-LABEL: dec_8:
; SLOW_INC-NOT: decb
; SLOW_INC-NOT: movb
  %1 = load atomic i8, i8* %p seq_cst, align 1
  %2 = sub i8 %1, 1
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @dec_16(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: dec_16:
; X64-NOT: decw
; X32-LABEL: dec_16:
; X32-NOT: decw
; SLOW_INC-LABEL: dec_16:
; SLOW_INC-NOT: decw
  %1 = load atomic i16, i16* %p acquire, align 2
  %2 = sub i16 %1, 1
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @dec_32(i32* %p) {
; X64-LABEL: dec_32:
; X64-NOT: lock
; X64: decl
; X64-NOT: movl
; X32-LABEL: dec_32:
; X32-NOT: lock
; X32: decl
; X32-NOT: movl
; SLOW_INC-LABEL: dec_32:
; SLOW_INC-NOT: decl
; SLOW_INC-NOT: movl
  %1 = load atomic i32, i32* %p acquire, align 4
  %2 = sub i32 %1, 1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @dec_64(i64* %p) {
; X64-LABEL: dec_64:
; X64-NOT: lock
; X64: decq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'decq'.
; X32-LABEL: dec_64:
; SLOW_INC-LABEL: dec_64:
; SLOW_INC-NOT: decq
; SLOW_INC-NOT: movq
  %1 = load atomic i64, i64* %p acquire, align 8
  %2 = sub i64 %1, 1
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @dec_32_seq_cst(i32* %p) {
; X64-LABEL: dec_32_seq_cst:
; X64: xchgl
; X32-LABEL: dec_32_seq_cst:
; X32: xchgl
  %1 = load atomic i32, i32* %p monotonic, align 4
  %2 = sub i32 %1, 1
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- FADD -----

define void @fadd_32r(float* %loc, float %val) {
; X64-LABEL: fadd_32r:
; X64-NOT: lock
; X64-NOT: mov
; X64: addss (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]]
; X64-NEXT: movss %[[XMM]], (%[[M]])
; X32-LABEL: fadd_32r:
; Don't check x86-32.
; LLVM's SSE handling is conservative on x86-32 even without using atomics.
  %floc = bitcast float* %loc to i32*
  %1 = load atomic i32, i32* %floc seq_cst, align 4
  %2 = bitcast i32 %1 to float
  %add = fadd float %2, %val
  %3 = bitcast float %add to i32
  store atomic i32 %3, i32* %floc release, align 4
  ret void
}

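; In the fadd tests, the float/double value is round-tripped through integer
; bitcasts so that the atomic load and store themselves are integer-typed; the
; property being checked is that the fadd still folds its memory operand
; (addss/addsd from memory followed by a plain movss/movsd store), with no
; lock prefix and no detour through a general-purpose register.
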
define void @fadd_64r(double* %loc, double %val) {
; X64-LABEL: fadd_64r:
; X64-NOT: lock
; X64-NOT: mov
; X64: addsd (%[[M:[a-z]+]]), %[[XMM:xmm[0-9]+]]
; X64-NEXT: movsd %[[XMM]], (%[[M]])
; X32-LABEL: fadd_64r:
; Don't check x86-32 (see comment above).
  %floc = bitcast double* %loc to i64*
  %1 = load atomic i64, i64* %floc seq_cst, align 8
  %2 = bitcast i64 %1 to double
  %add = fadd double %2, %val
  %3 = bitcast double %add to i64
  store atomic i64 %3, i64* %floc release, align 8
  ret void
}

@glob32 = global float 0.000000e+00, align 4
@glob64 = global double 0.000000e+00, align 8

; Floating-point add to a global using an immediate.
define void @fadd_32g() {
; X64-LABEL: fadd_32g:
; X64-NOT: lock
; X64: movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss glob32(%rip), %[[XMM]]
; X64-NEXT: movss %[[XMM]], glob32(%rip)
; X32-LABEL: fadd_32g:
; Don't check x86-32 (see comment above).
  %i = load atomic i32, i32* bitcast (float* @glob32 to i32*) monotonic, align 4
  %f = bitcast i32 %i to float
  %add = fadd float %f, 1.000000e+00
  %s = bitcast float %add to i32
  store atomic i32 %s, i32* bitcast (float* @glob32 to i32*) monotonic, align 4
  ret void
}

define void @fadd_64g() {
; X64-LABEL: fadd_64g:
; X64-NOT: lock
; X64: movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd glob64(%rip), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], glob64(%rip)
; X32-LABEL: fadd_64g:
; Don't check x86-32 (see comment above).
  %i = load atomic i64, i64* bitcast (double* @glob64 to i64*) monotonic, align 8
  %f = bitcast i64 %i to double
  %add = fadd double %f, 1.000000e+00
  %s = bitcast double %add to i64
  store atomic i64 %s, i64* bitcast (double* @glob64 to i64*) monotonic, align 8
  ret void
}

; Floating-point add to a hard-coded immediate location using an immediate.
define void @fadd_32imm() {
; X64-LABEL: fadd_32imm:
; X64-NOT: lock
; X64: movl $3735928559, %e[[M:[a-z]+]]
; X64: movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss (%r[[M]]), %[[XMM]]
; X64-NEXT: movss %[[XMM]], (%r[[M]])
; X32-LABEL: fadd_32imm:
; Don't check x86-32 (see comment above).
  %i = load atomic i32, i32* inttoptr (i32 3735928559 to i32*) monotonic, align 4
  %f = bitcast i32 %i to float
  %add = fadd float %f, 1.000000e+00
  %s = bitcast float %add to i32
  store atomic i32 %s, i32* inttoptr (i32 3735928559 to i32*) monotonic, align 4
  ret void
}

define void @fadd_64imm() {
; X64-LABEL: fadd_64imm:
; X64-NOT: lock
; X64: movl $3735928559, %e[[M:[a-z]+]]
; X64: movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd (%r[[M]]), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], (%r[[M]])
; X32-LABEL: fadd_64imm:
; Don't check x86-32 (see comment above).
  %i = load atomic i64, i64* inttoptr (i64 3735928559 to i64*) monotonic, align 8
  %f = bitcast i64 %i to double
  %add = fadd double %f, 1.000000e+00
  %s = bitcast double %add to i64
  store atomic i64 %s, i64* inttoptr (i64 3735928559 to i64*) monotonic, align 8
  ret void
}

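; Note on fadd_32imm/fadd_64imm above: 3735928559 is 0xdeadbeef, which fits in
; 32 bits, so the checks expect the address to be materialized with a 32-bit
; movl into %e[[M]] and then dereferenced through the corresponding 64-bit
; register (%r[[M]]).
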
; Floating-point add to a stack location.
define void @fadd_32stack() {
; X64-LABEL: fadd_32stack:
; X64-NOT: lock
; X64: movss .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addss [[STACKOFF:-?[0-9]+]](%rsp), %[[XMM]]
; X64-NEXT: movss %[[XMM]], [[STACKOFF]](%rsp)
; X32-LABEL: fadd_32stack:
; Don't check x86-32 (see comment above).
  %ptr = alloca i32, align 4
  %bc3 = bitcast i32* %ptr to float*
  %load = load atomic i32, i32* %ptr acquire, align 4
  %bc0 = bitcast i32 %load to float
  %fadd = fadd float 1.000000e+00, %bc0
  %bc1 = bitcast float %fadd to i32
  store atomic i32 %bc1, i32* %ptr release, align 4
  ret void
}

define void @fadd_64stack() {
; X64-LABEL: fadd_64stack:
; X64-NOT: lock
; X64: movsd .{{[A-Z0-9_]+}}(%rip), %[[XMM:xmm[0-9]+]]
; X64-NEXT: addsd [[STACKOFF:-?[0-9]+]](%rsp), %[[XMM]]
; X64-NEXT: movsd %[[XMM]], [[STACKOFF]](%rsp)
; X32-LABEL: fadd_64stack:
; Don't check x86-32 (see comment above).
  %ptr = alloca i64, align 8
  %bc3 = bitcast i64* %ptr to double*
  %load = load atomic i64, i64* %ptr acquire, align 8
  %bc0 = bitcast i64 %load to double
  %fadd = fadd double 1.000000e+00, %bc0
  %bc1 = bitcast double %fadd to i64
  store atomic i64 %bc1, i64* %ptr release, align 8
  ret void
}