; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+aes,+pclmul < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

; The inline asm clobbers every xmm register except xmm0/xmm1 (plus EFLAGS), so
; register allocation has no spare register for %a1 across the asm; the CHECK
; lines verify the reload of %a1 is folded directly into the aesdec memory operand.
define <2 x i64> @stack_fold_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_aesdec:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: aesdec {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone

; Same pattern as above for the final-round decrypt instruction.
define <2 x i64> @stack_fold_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_aesdeclast:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: aesdeclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone

define <2 x i64> @stack_fold_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_aesenc:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: aesenc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone

define <2 x i64> @stack_fold_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_aesenclast:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: aesenclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone

; Unary op: only %a0 is live, so the asm also clobbers xmm1 to force the spill.
define <2 x i64> @stack_fold_aesimc(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_aesimc:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: aesimc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone

define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_aeskeygenassist:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: aeskeygenassist $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone

; CRC32 variants operate on GPRs, so here the asm clobbers every GPR except
; rax/rsp; the callee-saved push/pop + CFI sequence in the CHECK lines is a
; consequence of those clobbers.
define i32 @stack_fold_crc32_32_8(i32 %a0, i8 %a1) {
; CHECK-LABEL: stack_fold_crc32_32_8:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: crc32b {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind

define i32 @stack_fold_crc32_32_16(i32 %a0, i16 %a1) {
; CHECK-LABEL: stack_fold_crc32_32_16:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: crc32w {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind

define i32 @stack_fold_crc32_32_32(i32 %a0, i32 %a1) {
; CHECK-LABEL: stack_fold_crc32_32_32:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: crc32l {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind

define i64 @stack_fold_crc32_64_64(i64 %a0, i64 %a1) {
; CHECK-LABEL: stack_fold_crc32_64_64:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: crc32q {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
  ret i64 %2
}
declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind

; GPR -> xmm move: the asm clobbers every GPR (including rax) so %a0 must be
; spilled to the stack and the reload folded into movd.
define <4 x i32> @stack_fold_movd_load(i32 %a0) {
; CHECK-LABEL: stack_fold_movd_load:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: psubd %xmm1, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
  ; add forces execution domain
  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %3
}

define i32 @stack_fold_movd_store(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_movd_store:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: paddd %xmm1, %xmm0
; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  ; add forces execution domain
  %1 = add <4 x i32> %a0, %a1
  %2 = extractelement <4 x i32> %1, i32 0
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i32 %2
}

define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_movq_load:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: psubq %xmm1, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
  ; add forces execution domain
  %3 = add <2 x i64> %2, <i64 1, i64 1>
  ret <2 x i64> %3
}

define i64 @stack_fold_movq_store(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_movq_store:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: paddq %xmm1, %xmm0
; CHECK-NEXT: movq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: popq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: popq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: popq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  ; add forces execution domain
  %1 = add <2 x i64> %a0, %a1
  %2 = extractelement <2 x i64> %1, i32 0
  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  ret i64 %2
}

define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_mpsadbw:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: mpsadbw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone

; pabs* are tested via the icmp/sub/select absolute-value pattern that the
; backend recognizes and lowers to the SSSE3 pabs instructions.
define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pabsb:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pabsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <16 x i8> %a0, zeroinitializer
  %3 = sub <16 x i8> zeroinitializer, %a0
  %4 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %3
  ret <16 x i8> %4
}

define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pabsd:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <4 x i32> %a0, zeroinitializer
  %3 = sub <4 x i32> zeroinitializer, %a0
  %4 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %3
  ret <4 x i32> %4
}

define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pabsw:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pabsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i16> %a0, zeroinitializer
  %3 = sub <8 x i16> zeroinitializer, %a0
  %4 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %3
  ret <8 x i16> %4
}

define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_packssdw:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone

define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_packsswb:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: packsswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_packusdw:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: packusdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone

define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_packuswb:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddb:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: paddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <16 x i8> %a0, %a1
  ret <16 x i8> %2
}

define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_paddd:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <4 x i32> %a0, %a1
  ret <4 x i32> %2
}

define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_paddq:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: paddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <2 x i64> %a0, %a1
  ret <2 x i64> %2
}

; Saturating adds use the generic llvm.sadd.sat/uadd.sat intrinsics, which the
; x86 backend lowers to padds*/paddus*.
define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddsb:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: paddsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddsw:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: paddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddusb:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: paddusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddusw:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: paddusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddw:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: paddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <8 x i16> %a0, %a1
  ret <8 x i16> %2
}

; The shufflevector mask <1..16> selects bytes 1-15 of %a1 followed by byte 0
; of %a0, which the backend matches as palignr $1.
define <16 x i8> @stack_fold_palignr(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_palignr:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: palignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a1, <16 x i8> %a0, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  ret <16 x i8> %2
}

; The pcmpeqd/psubb pair in the CHECK lines is the lowering of the +1 add that
; pins the result to the integer execution domain.
define <16 x i8> @stack_fold_pand(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pand:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pand {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: psubb %xmm1, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = and <16 x i8> %a0, %a1
  ; add forces execution domain
  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %3
}

define <16 x i8> @stack_fold_pandn(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pandn:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: psubb %xmm1, %xmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = xor <16 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %3 = and <16 x i8> %2, %a1
  ; add forces execution domain
  %4 = add <16 x i8> %3, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %4
}

; pavg* are tested via the widened rounding-average pattern
; trunc(((zext a + zext b) + 1) >> 1), which lowers to pavgb/pavgw.
define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pavgb:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pavgb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = zext <16 x i8> %a0 to <16 x i16>
  %3 = zext <16 x i8> %a1 to <16 x i16>
  %4 = add <16 x i16> %2, %3
  %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %7 = trunc <16 x i16> %6 to <16 x i8>
  ret <16 x i8> %7
}

define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pavgw:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pavgw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop",
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 768 %2 = zext <8 x i16> %a0 to <8 x i32> 769 %3 = zext <8 x i16> %a1 to <8 x i32> 770 %4 = add <8 x i32> %2, %3 771 %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 772 %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 773 %7 = trunc <8 x i32> %6 to <8 x i16> 774 ret <8 x i16> %7 775} 776 777define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) { 778; CHECK-LABEL: stack_fold_pblendvb: 779; CHECK: # %bb.0: 780; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 781; CHECK-NEXT: movdqa %xmm1, %xmm2 782; CHECK-NEXT: #APP 783; CHECK-NEXT: nop 784; CHECK-NEXT: #NO_APP 785; CHECK-NEXT: pblendvb %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 786; CHECK-NEXT: movdqa %xmm2, %xmm0 787; CHECK-NEXT: retq 788 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 789 %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0) 790 ret <16 x i8> %2 791} 792declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone 793 794define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) { 795; CHECK-LABEL: stack_fold_pblendw: 796; CHECK: # %bb.0: 797; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 798; CHECK-NEXT: #APP 799; CHECK-NEXT: nop 800; CHECK-NEXT: #NO_APP 801; CHECK-NEXT: pblendw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 802; CHECK-NEXT: # xmm0 = mem[0,1,2],xmm0[3,4,5,6,7] 803; CHECK-NEXT: retq 804 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 805 %2 = shufflevector <8 x 
i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7> 806 ret <8 x i16> %2 807} 808 809define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) { 810; CHECK-LABEL: stack_fold_pclmulqdq: 811; CHECK: # %bb.0: 812; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 813; CHECK-NEXT: #APP 814; CHECK-NEXT: nop 815; CHECK-NEXT: #NO_APP 816; CHECK-NEXT: pclmulqdq $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 817; CHECK-NEXT: retq 818 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 819 %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0) 820 ret <2 x i64> %2 821} 822declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone 823 824define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) { 825; CHECK-LABEL: stack_fold_pcmpeqb: 826; CHECK: # %bb.0: 827; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 828; CHECK-NEXT: #APP 829; CHECK-NEXT: nop 830; CHECK-NEXT: #NO_APP 831; CHECK-NEXT: pcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 832; CHECK-NEXT: retq 833 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 834 %2 = icmp eq <16 x i8> %a0, %a1 835 %3 = sext <16 x i1> %2 to <16 x i8> 836 ret <16 x i8> %3 837} 838 839define <4 x i32> @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) { 840; CHECK-LABEL: stack_fold_pcmpeqd: 841; CHECK: # %bb.0: 842; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 843; CHECK-NEXT: #APP 844; CHECK-NEXT: nop 845; CHECK-NEXT: #NO_APP 846; CHECK-NEXT: pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 847; CHECK-NEXT: retq 848 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 849 %2 = icmp eq <4 x i32> %a0, %a1 850 %3 = sext <4 x i1> %2 to <4 x i32> 851 ret <4 x i32> %3 852} 853 854define <2 x i64> @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) { 855; CHECK-LABEL: stack_fold_pcmpeqq: 856; CHECK: # %bb.0: 857; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 858; CHECK-NEXT: #APP 859; CHECK-NEXT: nop 860; CHECK-NEXT: #NO_APP 861; CHECK-NEXT: pcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 862; CHECK-NEXT: retq 863 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 864 %2 = icmp eq <2 x i64> %a0, %a1 865 %3 = sext <2 x i1> %2 to <2 x i64> 866 ret <2 x i64> %3 867} 868 869define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) { 870; CHECK-LABEL: stack_fold_pcmpeqw: 871; CHECK: # %bb.0: 872; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 873; CHECK-NEXT: #APP 874; CHECK-NEXT: nop 875; CHECK-NEXT: #NO_APP 876; CHECK-NEXT: pcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 877; CHECK-NEXT: retq 878 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 879 %2 = icmp eq <8 x i16> %a0, %a1 880 %3 = sext <8 x i1> %2 to <8 x i16> 881 ret <8 x i16> %3 882} 883 884define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) { 885; CHECK-LABEL: stack_fold_pcmpestri: 886; CHECK: # %bb.0: 887; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 888; CHECK-NEXT: #APP 889; CHECK-NEXT: nop 890; CHECK-NEXT: #NO_APP 891; CHECK-NEXT: movl $7, %eax 892; CHECK-NEXT: movl $7, %edx 893; CHECK-NEXT: pcmpestri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 894; 
CHECK-NEXT: movl %ecx, %eax 895; CHECK-NEXT: retq 896 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"() 897 %2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) 898 ret i32 %2 899} 900declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone 901 902define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) { 903; CHECK-LABEL: stack_fold_pcmpestrm: 904; CHECK: # %bb.0: 905; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 906; CHECK-NEXT: #APP 907; CHECK-NEXT: nop 908; CHECK-NEXT: #NO_APP 909; CHECK-NEXT: movl $7, %eax 910; CHECK-NEXT: movl $7, %edx 911; CHECK-NEXT: pcmpestrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 912; CHECK-NEXT: retq 913 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"() 914 %2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7) 915 ret <16 x i8> %2 916} 917declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone 918 919define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) { 920; CHECK-LABEL: stack_fold_pcmpgtb: 921; CHECK: # %bb.0: 922; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 923; CHECK-NEXT: #APP 924; CHECK-NEXT: nop 925; CHECK-NEXT: #NO_APP 926; CHECK-NEXT: pcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 927; CHECK-NEXT: retq 928 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 929 %2 = icmp sgt <16 x i8> %a0, %a1 930 %3 = sext <16 x i1> %2 to <16 x i8> 931 ret <16 x i8> %3 932} 

; Signed-greater-than compares: the icmp sgt + sext pair lowers to a single
; PCMPGT* instruction. The inline asm clobbers xmm2-xmm15, forcing %a1 to be
; spilled; the reload must then fold into pcmpgt* as a memory operand.
define <4 x i32> @stack_fold_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <4 x i32> %a0, %a1
  %3 = sext <4 x i1> %2 to <4 x i32>
  ret <4 x i32> %3
}

; Same pattern for 64-bit elements (PCMPGTQ requires SSE4.2, enabled in RUN line).
define <2 x i64> @stack_fold_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pcmpgtq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <2 x i64> %a0, %a1
  %3 = sext <2 x i1> %2 to <2 x i64>
  ret <2 x i64> %3
}

; Same pattern for 16-bit elements (PCMPGTW).
define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i16> %a0, %a1
  %3 = sext <8 x i1> %2 to <8 x i16>
  ret <8 x i16> %3
}

; Implicit-length string compare (SSE4.2): the spilled %a1 must be folded into
; pcmpistri as a memory operand. The CHECK lines also pin the copy of the
; index result out of %ecx into the return register.
define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpistri:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pcmpistri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    movl %ecx, %eax
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
  ret i32 %2
}
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone

; Mask-producing variant: same fold check for pcmpistrm (result comes back as
; a vector, so no GPR move is expected here).
define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpistrm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pcmpistrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone

; TODO stack_fold_pextrb

; We can't naively fold pextrw as it only writes to a 16-bit memory location
; even though it can store to a 32-bit register.
1014define i16 @stack_fold_pextrw(<8 x i16> %a0) { 1015; CHECK-LABEL: stack_fold_pextrw: 1016; CHECK: # %bb.0: # %entry 1017; CHECK-NEXT: pushq %rbp 1018; CHECK-NEXT: .cfi_def_cfa_offset 16 1019; CHECK-NEXT: pushq %r15 1020; CHECK-NEXT: .cfi_def_cfa_offset 24 1021; CHECK-NEXT: pushq %r14 1022; CHECK-NEXT: .cfi_def_cfa_offset 32 1023; CHECK-NEXT: pushq %r13 1024; CHECK-NEXT: .cfi_def_cfa_offset 40 1025; CHECK-NEXT: pushq %r12 1026; CHECK-NEXT: .cfi_def_cfa_offset 48 1027; CHECK-NEXT: pushq %rbx 1028; CHECK-NEXT: .cfi_def_cfa_offset 56 1029; CHECK-NEXT: .cfi_offset %rbx, -56 1030; CHECK-NEXT: .cfi_offset %r12, -48 1031; CHECK-NEXT: .cfi_offset %r13, -40 1032; CHECK-NEXT: .cfi_offset %r14, -32 1033; CHECK-NEXT: .cfi_offset %r15, -24 1034; CHECK-NEXT: .cfi_offset %rbp, -16 1035; CHECK-NEXT: pextrw $1, %xmm0, %eax 1036; CHECK-NEXT: addl $2, %eax 1037; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1038; CHECK-NEXT: #APP 1039; CHECK-NEXT: nop 1040; CHECK-NEXT: #NO_APP 1041; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload 1042; CHECK-NEXT: # kill: def $ax killed $ax killed $eax 1043; CHECK-NEXT: popq %rbx 1044; CHECK-NEXT: .cfi_def_cfa_offset 48 1045; CHECK-NEXT: popq %r12 1046; CHECK-NEXT: .cfi_def_cfa_offset 40 1047; CHECK-NEXT: popq %r13 1048; CHECK-NEXT: .cfi_def_cfa_offset 32 1049; CHECK-NEXT: popq %r14 1050; CHECK-NEXT: .cfi_def_cfa_offset 24 1051; CHECK-NEXT: popq %r15 1052; CHECK-NEXT: .cfi_def_cfa_offset 16 1053; CHECK-NEXT: popq %rbp 1054; CHECK-NEXT: .cfi_def_cfa_offset 8 1055; CHECK-NEXT: retq 1056entry: 1057; add forces execution domain 1058 %add = add <8 x i16> %a0, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8> 1059 %extract = extractelement <8 x i16> %add, i32 1 1060 %asm = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1061 ret i16 %extract 1062} 1063 1064define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> 
%a1) { 1065; CHECK-LABEL: stack_fold_pextrd: 1066; CHECK: # %bb.0: 1067; CHECK-NEXT: pushq %rbp 1068; CHECK-NEXT: .cfi_def_cfa_offset 16 1069; CHECK-NEXT: pushq %r15 1070; CHECK-NEXT: .cfi_def_cfa_offset 24 1071; CHECK-NEXT: pushq %r14 1072; CHECK-NEXT: .cfi_def_cfa_offset 32 1073; CHECK-NEXT: pushq %r13 1074; CHECK-NEXT: .cfi_def_cfa_offset 40 1075; CHECK-NEXT: pushq %r12 1076; CHECK-NEXT: .cfi_def_cfa_offset 48 1077; CHECK-NEXT: pushq %rbx 1078; CHECK-NEXT: .cfi_def_cfa_offset 56 1079; CHECK-NEXT: .cfi_offset %rbx, -56 1080; CHECK-NEXT: .cfi_offset %r12, -48 1081; CHECK-NEXT: .cfi_offset %r13, -40 1082; CHECK-NEXT: .cfi_offset %r14, -32 1083; CHECK-NEXT: .cfi_offset %r15, -24 1084; CHECK-NEXT: .cfi_offset %rbp, -16 1085; CHECK-NEXT: paddd %xmm1, %xmm0 1086; CHECK-NEXT: pextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 1087; CHECK-NEXT: #APP 1088; CHECK-NEXT: nop 1089; CHECK-NEXT: #NO_APP 1090; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload 1091; CHECK-NEXT: popq %rbx 1092; CHECK-NEXT: .cfi_def_cfa_offset 48 1093; CHECK-NEXT: popq %r12 1094; CHECK-NEXT: .cfi_def_cfa_offset 40 1095; CHECK-NEXT: popq %r13 1096; CHECK-NEXT: .cfi_def_cfa_offset 32 1097; CHECK-NEXT: popq %r14 1098; CHECK-NEXT: .cfi_def_cfa_offset 24 1099; CHECK-NEXT: popq %r15 1100; CHECK-NEXT: .cfi_def_cfa_offset 16 1101; CHECK-NEXT: popq %rbp 1102; CHECK-NEXT: .cfi_def_cfa_offset 8 1103; CHECK-NEXT: retq 1104 ; add forces execution domain 1105 %1 = add <4 x i32> %a0, %a1 1106 %2 = extractelement <4 x i32> %1, i32 1 1107 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1108 ret i32 %2 1109} 1110 1111define i64 @stack_fold_pextrq(<2 x i64> %a0) { 1112; CHECK-LABEL: stack_fold_pextrq: 1113; CHECK: # %bb.0: 1114; CHECK-NEXT: pushq %rbp 1115; CHECK-NEXT: .cfi_def_cfa_offset 16 1116; CHECK-NEXT: pushq %r15 1117; CHECK-NEXT: .cfi_def_cfa_offset 24 1118; CHECK-NEXT: 
pushq %r14 1119; CHECK-NEXT: .cfi_def_cfa_offset 32 1120; CHECK-NEXT: pushq %r13 1121; CHECK-NEXT: .cfi_def_cfa_offset 40 1122; CHECK-NEXT: pushq %r12 1123; CHECK-NEXT: .cfi_def_cfa_offset 48 1124; CHECK-NEXT: pushq %rbx 1125; CHECK-NEXT: .cfi_def_cfa_offset 56 1126; CHECK-NEXT: .cfi_offset %rbx, -56 1127; CHECK-NEXT: .cfi_offset %r12, -48 1128; CHECK-NEXT: .cfi_offset %r13, -40 1129; CHECK-NEXT: .cfi_offset %r14, -32 1130; CHECK-NEXT: .cfi_offset %r15, -24 1131; CHECK-NEXT: .cfi_offset %rbp, -16 1132; CHECK-NEXT: pextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill 1133; CHECK-NEXT: #APP 1134; CHECK-NEXT: nop 1135; CHECK-NEXT: #NO_APP 1136; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload 1137; CHECK-NEXT: popq %rbx 1138; CHECK-NEXT: .cfi_def_cfa_offset 48 1139; CHECK-NEXT: popq %r12 1140; CHECK-NEXT: .cfi_def_cfa_offset 40 1141; CHECK-NEXT: popq %r13 1142; CHECK-NEXT: .cfi_def_cfa_offset 32 1143; CHECK-NEXT: popq %r14 1144; CHECK-NEXT: .cfi_def_cfa_offset 24 1145; CHECK-NEXT: popq %r15 1146; CHECK-NEXT: .cfi_def_cfa_offset 16 1147; CHECK-NEXT: popq %rbp 1148; CHECK-NEXT: .cfi_def_cfa_offset 8 1149; CHECK-NEXT: retq 1150 %1 = extractelement <2 x i64> %a0, i32 1 1151 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1152 ret i64 %1 1153} 1154 1155define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) { 1156; CHECK-LABEL: stack_fold_phaddd: 1157; CHECK: # %bb.0: 1158; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1159; CHECK-NEXT: #APP 1160; CHECK-NEXT: nop 1161; CHECK-NEXT: #NO_APP 1162; CHECK-NEXT: phaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1163; CHECK-NEXT: retq 1164 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1165 %2 = call <4 x i32> 
@llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) 1166 ret <4 x i32> %2 1167} 1168declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone 1169 1170define <8 x i16> @stack_fold_phaddsw(<8 x i16> %a0, <8 x i16> %a1) { 1171; CHECK-LABEL: stack_fold_phaddsw: 1172; CHECK: # %bb.0: 1173; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1174; CHECK-NEXT: #APP 1175; CHECK-NEXT: nop 1176; CHECK-NEXT: #NO_APP 1177; CHECK-NEXT: phaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1178; CHECK-NEXT: retq 1179 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1180 %2 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) 1181 ret <8 x i16> %2 1182} 1183declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone 1184 1185define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) { 1186; CHECK-LABEL: stack_fold_phaddw: 1187; CHECK: # %bb.0: 1188; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1189; CHECK-NEXT: #APP 1190; CHECK-NEXT: nop 1191; CHECK-NEXT: #NO_APP 1192; CHECK-NEXT: phaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1193; CHECK-NEXT: retq 1194 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1195 %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) 1196 ret <8 x i16> %2 1197} 1198declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone 1199 1200define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) { 1201; CHECK-LABEL: stack_fold_phminposuw: 1202; CHECK: # %bb.0: 1203; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1204; CHECK-NEXT: #APP 1205; CHECK-NEXT: nop 1206; CHECK-NEXT: #NO_APP 
1207; CHECK-NEXT: phminposuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1208; CHECK-NEXT: retq 1209 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1210 %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0) 1211 ret <8 x i16> %2 1212} 1213declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone 1214 1215define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) { 1216; CHECK-LABEL: stack_fold_phsubd: 1217; CHECK: # %bb.0: 1218; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1219; CHECK-NEXT: #APP 1220; CHECK-NEXT: nop 1221; CHECK-NEXT: #NO_APP 1222; CHECK-NEXT: phsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1223; CHECK-NEXT: retq 1224 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1225 %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) 1226 ret <4 x i32> %2 1227} 1228declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone 1229 1230define <8 x i16> @stack_fold_phsubsw(<8 x i16> %a0, <8 x i16> %a1) { 1231; CHECK-LABEL: stack_fold_phsubsw: 1232; CHECK: # %bb.0: 1233; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1234; CHECK-NEXT: #APP 1235; CHECK-NEXT: nop 1236; CHECK-NEXT: #NO_APP 1237; CHECK-NEXT: phsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1238; CHECK-NEXT: retq 1239 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1240 %2 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) 1241 ret <8 x i16> %2 1242} 1243declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x 
i16>) nounwind readnone 1244 1245define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) { 1246; CHECK-LABEL: stack_fold_phsubw: 1247; CHECK: # %bb.0: 1248; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1249; CHECK-NEXT: #APP 1250; CHECK-NEXT: nop 1251; CHECK-NEXT: #NO_APP 1252; CHECK-NEXT: phsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1253; CHECK-NEXT: retq 1254 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1255 %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) 1256 ret <8 x i16> %2 1257} 1258declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone 1259 1260define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) { 1261; CHECK-LABEL: stack_fold_pinsrb: 1262; CHECK: # %bb.0: 1263; CHECK-NEXT: pushq %rbp 1264; CHECK-NEXT: .cfi_def_cfa_offset 16 1265; CHECK-NEXT: pushq %r15 1266; CHECK-NEXT: .cfi_def_cfa_offset 24 1267; CHECK-NEXT: pushq %r14 1268; CHECK-NEXT: .cfi_def_cfa_offset 32 1269; CHECK-NEXT: pushq %r13 1270; CHECK-NEXT: .cfi_def_cfa_offset 40 1271; CHECK-NEXT: pushq %r12 1272; CHECK-NEXT: .cfi_def_cfa_offset 48 1273; CHECK-NEXT: pushq %rbx 1274; CHECK-NEXT: .cfi_def_cfa_offset 56 1275; CHECK-NEXT: .cfi_offset %rbx, -56 1276; CHECK-NEXT: .cfi_offset %r12, -48 1277; CHECK-NEXT: .cfi_offset %r13, -40 1278; CHECK-NEXT: .cfi_offset %r14, -32 1279; CHECK-NEXT: .cfi_offset %r15, -24 1280; CHECK-NEXT: .cfi_offset %rbp, -16 1281; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1282; CHECK-NEXT: #APP 1283; CHECK-NEXT: nop 1284; CHECK-NEXT: #NO_APP 1285; CHECK-NEXT: pinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload 1286; CHECK-NEXT: popq %rbx 1287; CHECK-NEXT: .cfi_def_cfa_offset 48 1288; CHECK-NEXT: popq %r12 1289; CHECK-NEXT: .cfi_def_cfa_offset 40 1290; CHECK-NEXT: popq %r13 1291; CHECK-NEXT: 
.cfi_def_cfa_offset 32 1292; CHECK-NEXT: popq %r14 1293; CHECK-NEXT: .cfi_def_cfa_offset 24 1294; CHECK-NEXT: popq %r15 1295; CHECK-NEXT: .cfi_def_cfa_offset 16 1296; CHECK-NEXT: popq %rbp 1297; CHECK-NEXT: .cfi_def_cfa_offset 8 1298; CHECK-NEXT: retq 1299 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1300 %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1 1301 ret <16 x i8> %2 1302} 1303 1304define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) { 1305; CHECK-LABEL: stack_fold_pinsrd: 1306; CHECK: # %bb.0: 1307; CHECK-NEXT: pushq %rbp 1308; CHECK-NEXT: .cfi_def_cfa_offset 16 1309; CHECK-NEXT: pushq %r15 1310; CHECK-NEXT: .cfi_def_cfa_offset 24 1311; CHECK-NEXT: pushq %r14 1312; CHECK-NEXT: .cfi_def_cfa_offset 32 1313; CHECK-NEXT: pushq %r13 1314; CHECK-NEXT: .cfi_def_cfa_offset 40 1315; CHECK-NEXT: pushq %r12 1316; CHECK-NEXT: .cfi_def_cfa_offset 48 1317; CHECK-NEXT: pushq %rbx 1318; CHECK-NEXT: .cfi_def_cfa_offset 56 1319; CHECK-NEXT: .cfi_offset %rbx, -56 1320; CHECK-NEXT: .cfi_offset %r12, -48 1321; CHECK-NEXT: .cfi_offset %r13, -40 1322; CHECK-NEXT: .cfi_offset %r14, -32 1323; CHECK-NEXT: .cfi_offset %r15, -24 1324; CHECK-NEXT: .cfi_offset %rbp, -16 1325; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1326; CHECK-NEXT: #APP 1327; CHECK-NEXT: nop 1328; CHECK-NEXT: #NO_APP 1329; CHECK-NEXT: pinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload 1330; CHECK-NEXT: popq %rbx 1331; CHECK-NEXT: .cfi_def_cfa_offset 48 1332; CHECK-NEXT: popq %r12 1333; CHECK-NEXT: .cfi_def_cfa_offset 40 1334; CHECK-NEXT: popq %r13 1335; CHECK-NEXT: .cfi_def_cfa_offset 32 1336; CHECK-NEXT: popq %r14 1337; CHECK-NEXT: .cfi_def_cfa_offset 24 1338; CHECK-NEXT: popq %r15 1339; CHECK-NEXT: .cfi_def_cfa_offset 16 1340; CHECK-NEXT: popq %rbp 1341; CHECK-NEXT: .cfi_def_cfa_offset 8 1342; CHECK-NEXT: retq 1343 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1344 %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1 1345 ret <4 x i32> %2 1346} 1347 1348define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) { 1349; CHECK-LABEL: stack_fold_pinsrq: 1350; CHECK: # %bb.0: 1351; CHECK-NEXT: pushq %rbp 1352; CHECK-NEXT: .cfi_def_cfa_offset 16 1353; CHECK-NEXT: pushq %r15 1354; CHECK-NEXT: .cfi_def_cfa_offset 24 1355; CHECK-NEXT: pushq %r14 1356; CHECK-NEXT: .cfi_def_cfa_offset 32 1357; CHECK-NEXT: pushq %r13 1358; CHECK-NEXT: .cfi_def_cfa_offset 40 1359; CHECK-NEXT: pushq %r12 1360; CHECK-NEXT: .cfi_def_cfa_offset 48 1361; CHECK-NEXT: pushq %rbx 1362; CHECK-NEXT: .cfi_def_cfa_offset 56 1363; CHECK-NEXT: .cfi_offset %rbx, -56 1364; CHECK-NEXT: .cfi_offset %r12, -48 1365; CHECK-NEXT: .cfi_offset %r13, -40 1366; CHECK-NEXT: .cfi_offset %r14, -32 1367; CHECK-NEXT: .cfi_offset %r15, -24 1368; CHECK-NEXT: .cfi_offset %rbp, -16 1369; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1370; CHECK-NEXT: #APP 1371; CHECK-NEXT: nop 1372; CHECK-NEXT: #NO_APP 1373; CHECK-NEXT: pinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload 1374; CHECK-NEXT: popq %rbx 1375; CHECK-NEXT: .cfi_def_cfa_offset 48 1376; CHECK-NEXT: popq %r12 1377; CHECK-NEXT: .cfi_def_cfa_offset 40 1378; CHECK-NEXT: popq %r13 1379; CHECK-NEXT: .cfi_def_cfa_offset 32 1380; CHECK-NEXT: popq %r14 1381; CHECK-NEXT: .cfi_def_cfa_offset 24 1382; CHECK-NEXT: popq %r15 1383; CHECK-NEXT: .cfi_def_cfa_offset 16 1384; CHECK-NEXT: popq %rbp 1385; CHECK-NEXT: .cfi_def_cfa_offset 8 1386; CHECK-NEXT: retq 1387 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1388 %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1 1389 ret <2 x i64> %2 1390} 1391 1392define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) { 1393; CHECK-LABEL: stack_fold_pinsrw: 
1394; CHECK: # %bb.0: 1395; CHECK-NEXT: pushq %rbp 1396; CHECK-NEXT: .cfi_def_cfa_offset 16 1397; CHECK-NEXT: pushq %r15 1398; CHECK-NEXT: .cfi_def_cfa_offset 24 1399; CHECK-NEXT: pushq %r14 1400; CHECK-NEXT: .cfi_def_cfa_offset 32 1401; CHECK-NEXT: pushq %r13 1402; CHECK-NEXT: .cfi_def_cfa_offset 40 1403; CHECK-NEXT: pushq %r12 1404; CHECK-NEXT: .cfi_def_cfa_offset 48 1405; CHECK-NEXT: pushq %rbx 1406; CHECK-NEXT: .cfi_def_cfa_offset 56 1407; CHECK-NEXT: .cfi_offset %rbx, -56 1408; CHECK-NEXT: .cfi_offset %r12, -48 1409; CHECK-NEXT: .cfi_offset %r13, -40 1410; CHECK-NEXT: .cfi_offset %r14, -32 1411; CHECK-NEXT: .cfi_offset %r15, -24 1412; CHECK-NEXT: .cfi_offset %rbp, -16 1413; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1414; CHECK-NEXT: #APP 1415; CHECK-NEXT: nop 1416; CHECK-NEXT: #NO_APP 1417; CHECK-NEXT: pinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload 1418; CHECK-NEXT: popq %rbx 1419; CHECK-NEXT: .cfi_def_cfa_offset 48 1420; CHECK-NEXT: popq %r12 1421; CHECK-NEXT: .cfi_def_cfa_offset 40 1422; CHECK-NEXT: popq %r13 1423; CHECK-NEXT: .cfi_def_cfa_offset 32 1424; CHECK-NEXT: popq %r14 1425; CHECK-NEXT: .cfi_def_cfa_offset 24 1426; CHECK-NEXT: popq %r15 1427; CHECK-NEXT: .cfi_def_cfa_offset 16 1428; CHECK-NEXT: popq %rbp 1429; CHECK-NEXT: .cfi_def_cfa_offset 8 1430; CHECK-NEXT: retq 1431 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() 1432 %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1 1433 ret <8 x i16> %2 1434} 1435 1436define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) { 1437; CHECK-LABEL: stack_fold_pmaddubsw: 1438; CHECK: # %bb.0: 1439; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1440; CHECK-NEXT: #APP 1441; CHECK-NEXT: nop 1442; CHECK-NEXT: #NO_APP 1443; CHECK-NEXT: pmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1444; CHECK-NEXT: retq 1445 %1 = tail call 
<2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1446 %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) 1447 ret <8 x i16> %2 1448} 1449declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone 1450 1451define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) { 1452; CHECK-LABEL: stack_fold_pmaddwd: 1453; CHECK: # %bb.0: 1454; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1455; CHECK-NEXT: #APP 1456; CHECK-NEXT: nop 1457; CHECK-NEXT: #NO_APP 1458; CHECK-NEXT: pmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1459; CHECK-NEXT: retq 1460 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1461 %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) 1462 ret <4 x i32> %2 1463} 1464declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone 1465 1466define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) { 1467; CHECK-LABEL: stack_fold_pmaxsb: 1468; CHECK: # %bb.0: 1469; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1470; CHECK-NEXT: #APP 1471; CHECK-NEXT: nop 1472; CHECK-NEXT: #NO_APP 1473; CHECK-NEXT: pmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1474; CHECK-NEXT: retq 1475 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1476 %2 = icmp sgt <16 x i8> %a0, %a1 1477 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 1478 ret <16 x i8> %3 1479} 1480 1481define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) { 1482; CHECK-LABEL: stack_fold_pmaxsd: 1483; CHECK: # %bb.0: 1484; CHECK-NEXT: 
movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1485; CHECK-NEXT: #APP 1486; CHECK-NEXT: nop 1487; CHECK-NEXT: #NO_APP 1488; CHECK-NEXT: pmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1489; CHECK-NEXT: retq 1490 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1491 %2 = icmp sgt <4 x i32> %a0, %a1 1492 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 1493 ret <4 x i32> %3 1494} 1495 1496define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) { 1497; CHECK-LABEL: stack_fold_pmaxsw: 1498; CHECK: # %bb.0: 1499; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1500; CHECK-NEXT: #APP 1501; CHECK-NEXT: nop 1502; CHECK-NEXT: #NO_APP 1503; CHECK-NEXT: pmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1504; CHECK-NEXT: retq 1505 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1506 %2 = icmp sgt <8 x i16> %a0, %a1 1507 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 1508 ret <8 x i16> %3 1509} 1510 1511define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) { 1512; CHECK-LABEL: stack_fold_pmaxub: 1513; CHECK: # %bb.0: 1514; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1515; CHECK-NEXT: #APP 1516; CHECK-NEXT: nop 1517; CHECK-NEXT: #NO_APP 1518; CHECK-NEXT: pmaxub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1519; CHECK-NEXT: retq 1520 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1521 %2 = icmp ugt <16 x i8> %a0, %a1 1522 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 1523 ret <16 x i8> %3 1524} 1525 1526define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x 
i32> %a1) { 1527; CHECK-LABEL: stack_fold_pmaxud: 1528; CHECK: # %bb.0: 1529; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1530; CHECK-NEXT: #APP 1531; CHECK-NEXT: nop 1532; CHECK-NEXT: #NO_APP 1533; CHECK-NEXT: pmaxud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1534; CHECK-NEXT: retq 1535 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1536 %2 = icmp ugt <4 x i32> %a0, %a1 1537 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 1538 ret <4 x i32> %3 1539} 1540 1541define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) { 1542; CHECK-LABEL: stack_fold_pmaxuw: 1543; CHECK: # %bb.0: 1544; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1545; CHECK-NEXT: #APP 1546; CHECK-NEXT: nop 1547; CHECK-NEXT: #NO_APP 1548; CHECK-NEXT: pmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1549; CHECK-NEXT: retq 1550 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1551 %2 = icmp ugt <8 x i16> %a0, %a1 1552 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 1553 ret <8 x i16> %3 1554} 1555 1556define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) { 1557; CHECK-LABEL: stack_fold_pminsb: 1558; CHECK: # %bb.0: 1559; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1560; CHECK-NEXT: #APP 1561; CHECK-NEXT: nop 1562; CHECK-NEXT: #NO_APP 1563; CHECK-NEXT: pminsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1564; CHECK-NEXT: retq 1565 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1566 %2 = icmp slt <16 x i8> %a0, %a1 1567 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 
1568 ret <16 x i8> %3 1569} 1570 1571define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) { 1572; CHECK-LABEL: stack_fold_pminsd: 1573; CHECK: # %bb.0: 1574; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1575; CHECK-NEXT: #APP 1576; CHECK-NEXT: nop 1577; CHECK-NEXT: #NO_APP 1578; CHECK-NEXT: pminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1579; CHECK-NEXT: retq 1580 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1581 %2 = icmp slt <4 x i32> %a0, %a1 1582 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 1583 ret <4 x i32> %3 1584} 1585 1586define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) { 1587; CHECK-LABEL: stack_fold_pminsw: 1588; CHECK: # %bb.0: 1589; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1590; CHECK-NEXT: #APP 1591; CHECK-NEXT: nop 1592; CHECK-NEXT: #NO_APP 1593; CHECK-NEXT: pminsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1594; CHECK-NEXT: retq 1595 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1596 %2 = icmp slt <8 x i16> %a0, %a1 1597 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 1598 ret <8 x i16> %3 1599} 1600 1601define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) { 1602; CHECK-LABEL: stack_fold_pminub: 1603; CHECK: # %bb.0: 1604; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1605; CHECK-NEXT: #APP 1606; CHECK-NEXT: nop 1607; CHECK-NEXT: #NO_APP 1608; CHECK-NEXT: pminub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1609; CHECK-NEXT: retq 1610 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1611 %2 
= icmp ult <16 x i8> %a0, %a1 1612 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 1613 ret <16 x i8> %3 1614} 1615 1616define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) { 1617; CHECK-LABEL: stack_fold_pminud: 1618; CHECK: # %bb.0: 1619; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1620; CHECK-NEXT: #APP 1621; CHECK-NEXT: nop 1622; CHECK-NEXT: #NO_APP 1623; CHECK-NEXT: pminud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1624; CHECK-NEXT: retq 1625 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1626 %2 = icmp ult <4 x i32> %a0, %a1 1627 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 1628 ret <4 x i32> %3 1629} 1630 1631define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) { 1632; CHECK-LABEL: stack_fold_pminuw: 1633; CHECK: # %bb.0: 1634; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1635; CHECK-NEXT: #APP 1636; CHECK-NEXT: nop 1637; CHECK-NEXT: #NO_APP 1638; CHECK-NEXT: pminuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1639; CHECK-NEXT: retq 1640 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1641 %2 = icmp ult <8 x i16> %a0, %a1 1642 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 1643 ret <8 x i16> %3 1644} 1645 1646define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) { 1647; CHECK-LABEL: stack_fold_pmovsxbd: 1648; CHECK: # %bb.0: 1649; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1650; CHECK-NEXT: #APP 1651; CHECK-NEXT: nop 1652; CHECK-NEXT: #NO_APP 1653; CHECK-NEXT: pmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1654; CHECK-NEXT: retq 1655 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1656 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1657 %3 = sext <4 x i8> %2 to <4 x i32> 1658 ret <4 x i32> %3 1659} 1660 1661define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) { 1662; CHECK-LABEL: stack_fold_pmovsxbq: 1663; CHECK: # %bb.0: 1664; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1665; CHECK-NEXT: #APP 1666; CHECK-NEXT: nop 1667; CHECK-NEXT: #NO_APP 1668; CHECK-NEXT: pmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1669; CHECK-NEXT: retq 1670 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1671 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1> 1672 %3 = sext <2 x i8> %2 to <2 x i64> 1673 ret <2 x i64> %3 1674} 1675 1676define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) { 1677; CHECK-LABEL: stack_fold_pmovsxbw: 1678; CHECK: # %bb.0: 1679; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1680; CHECK-NEXT: #APP 1681; CHECK-NEXT: nop 1682; CHECK-NEXT: #NO_APP 1683; CHECK-NEXT: pmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1684; CHECK-NEXT: retq 1685 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1686 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1687 %3 = sext <8 x i8> %2 to <8 x i16> 1688 ret <8 x i16> %3 1689} 1690 1691define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) { 1692; CHECK-LABEL: stack_fold_pmovsxdq: 1693; CHECK: # %bb.0: 1694; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1695; 
CHECK-NEXT: #APP 1696; CHECK-NEXT: nop 1697; CHECK-NEXT: #NO_APP 1698; CHECK-NEXT: pmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1699; CHECK-NEXT: retq 1700 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1701 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 1702 %3 = sext <2 x i32> %2 to <2 x i64> 1703 ret <2 x i64> %3 1704} 1705 1706define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) { 1707; CHECK-LABEL: stack_fold_pmovsxwd: 1708; CHECK: # %bb.0: 1709; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1710; CHECK-NEXT: #APP 1711; CHECK-NEXT: nop 1712; CHECK-NEXT: #NO_APP 1713; CHECK-NEXT: pmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1714; CHECK-NEXT: retq 1715 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1716 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1717 %3 = sext <4 x i16> %2 to <4 x i32> 1718 ret <4 x i32> %3 1719} 1720 1721define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) { 1722; CHECK-LABEL: stack_fold_pmovsxwq: 1723; CHECK: # %bb.0: 1724; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1725; CHECK-NEXT: #APP 1726; CHECK-NEXT: nop 1727; CHECK-NEXT: #NO_APP 1728; CHECK-NEXT: pmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1729; CHECK-NEXT: retq 1730 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1731 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> 1732 %3 = sext <2 x i16> %2 to <2 x i64> 1733 ret <2 x i64> %3 1734} 1735 1736define <4 x i32> 
@stack_fold_pmovzxbd(<16 x i8> %a0) { 1737; CHECK-LABEL: stack_fold_pmovzxbd: 1738; CHECK: # %bb.0: 1739; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1740; CHECK-NEXT: #APP 1741; CHECK-NEXT: nop 1742; CHECK-NEXT: #NO_APP 1743; CHECK-NEXT: pmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1744; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1745; CHECK-NEXT: retq 1746 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1747 %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 1, i32 19, i32 20, i32 21, i32 2, i32 22, i32 23, i32 24, i32 3, i32 25, i32 26, i32 27> 1748 %3 = bitcast <16 x i8> %2 to <4 x i32> 1749 ret <4 x i32> %3 1750} 1751 1752define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) { 1753; CHECK-LABEL: stack_fold_pmovzxbq: 1754; CHECK: # %bb.0: 1755; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1756; CHECK-NEXT: #APP 1757; CHECK-NEXT: nop 1758; CHECK-NEXT: #NO_APP 1759; CHECK-NEXT: pmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1760; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero 1761; CHECK-NEXT: retq 1762 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1763 %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28> 1764 %3 = bitcast <16 x i8> %2 to <2 x i64> 1765 ret <2 x i64> %3 1766} 1767 1768define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) { 1769; CHECK-LABEL: stack_fold_pmovzxbw: 1770; 
CHECK: # %bb.0: 1771; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1772; CHECK-NEXT: #APP 1773; CHECK-NEXT: nop 1774; CHECK-NEXT: #NO_APP 1775; CHECK-NEXT: pmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1776; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1777; CHECK-NEXT: retq 1778 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1779 %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> 1780 %3 = bitcast <16 x i8> %2 to <8 x i16> 1781 ret <8 x i16> %3 1782} 1783 1784define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) { 1785; CHECK-LABEL: stack_fold_pmovzxdq: 1786; CHECK: # %bb.0: 1787; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1788; CHECK-NEXT: #APP 1789; CHECK-NEXT: nop 1790; CHECK-NEXT: #NO_APP 1791; CHECK-NEXT: pmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1792; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero 1793; CHECK-NEXT: retq 1794 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1795 %2 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 1796 %3 = bitcast <4 x i32> %2 to <2 x i64> 1797 ret <2 x i64> %3 1798} 1799 1800define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) { 1801; CHECK-LABEL: stack_fold_pmovzxwd: 1802; CHECK: # %bb.0: 1803; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1804; CHECK-NEXT: #APP 1805; CHECK-NEXT: nop 1806; CHECK-NEXT: #NO_APP 1807; CHECK-NEXT: pmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded 
Reload 1808; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1809; CHECK-NEXT: retq 1810 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1811 %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> 1812 %3 = bitcast <8 x i16> %2 to <4 x i32> 1813 ret <4 x i32> %3 1814} 1815 1816define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) { 1817; CHECK-LABEL: stack_fold_pmovzxwq: 1818; CHECK: # %bb.0: 1819; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1820; CHECK-NEXT: #APP 1821; CHECK-NEXT: nop 1822; CHECK-NEXT: #NO_APP 1823; CHECK-NEXT: pmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1824; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero 1825; CHECK-NEXT: retq 1826 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1827 %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 1, i32 11, i32 12, i32 13> 1828 %3 = bitcast <8 x i16> %2 to <2 x i64> 1829 ret <2 x i64> %3 1830} 1831 1832define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) { 1833; CHECK-LABEL: stack_fold_pmuldq: 1834; CHECK: # %bb.0: 1835; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1836; CHECK-NEXT: #APP 1837; CHECK-NEXT: nop 1838; CHECK-NEXT: #NO_APP 1839; CHECK-NEXT: pmuldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1840; CHECK-NEXT: retq 1841 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1842 %2 = bitcast <4 x i32> %a0 to <2 x i64> 1843 %3 = bitcast <4 x i32> 
%a1 to <2 x i64> 1844 %4 = shl <2 x i64> %2, <i64 32, i64 32> 1845 %5 = ashr <2 x i64> %4, <i64 32, i64 32> 1846 %6 = shl <2 x i64> %3, <i64 32, i64 32> 1847 %7 = ashr <2 x i64> %6, <i64 32, i64 32> 1848 %8 = mul <2 x i64> %5, %7 1849 ret <2 x i64> %8 1850} 1851 1852define <8 x i16> @stack_fold_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1) { 1853; CHECK-LABEL: stack_fold_pmulhrsw: 1854; CHECK: # %bb.0: 1855; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1856; CHECK-NEXT: #APP 1857; CHECK-NEXT: nop 1858; CHECK-NEXT: #NO_APP 1859; CHECK-NEXT: pmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1860; CHECK-NEXT: retq 1861 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1862 %2 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) 1863 ret <8 x i16> %2 1864} 1865declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone 1866 1867define <8 x i16> @stack_fold_pmulhuw(<8 x i16> %a0, <8 x i16> %a1) { 1868; CHECK-LABEL: stack_fold_pmulhuw: 1869; CHECK: # %bb.0: 1870; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1871; CHECK-NEXT: #APP 1872; CHECK-NEXT: nop 1873; CHECK-NEXT: #NO_APP 1874; CHECK-NEXT: pmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1875; CHECK-NEXT: retq 1876 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1877 %2 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) 1878 ret <8 x i16> %2 1879} 1880declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone 1881 1882define <8 x i16> @stack_fold_pmulhw(<8 x i16> %a0, <8 x i16> %a1) { 1883; CHECK-LABEL: stack_fold_pmulhw: 1884; CHECK: # %bb.0: 1885; CHECK-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1886; CHECK-NEXT: #APP 1887; CHECK-NEXT: nop 1888; CHECK-NEXT: #NO_APP 1889; CHECK-NEXT: pmulhw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1890; CHECK-NEXT: retq 1891 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1892 %2 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) 1893 ret <8 x i16> %2 1894} 1895declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone 1896 1897define <4 x i32> @stack_fold_pmulld(<4 x i32> %a0, <4 x i32> %a1) { 1898; CHECK-LABEL: stack_fold_pmulld: 1899; CHECK: # %bb.0: 1900; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1901; CHECK-NEXT: #APP 1902; CHECK-NEXT: nop 1903; CHECK-NEXT: #NO_APP 1904; CHECK-NEXT: pmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1905; CHECK-NEXT: retq 1906 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1907 %2 = mul <4 x i32> %a0, %a1 1908 ret <4 x i32> %2 1909} 1910 1911define <8 x i16> @stack_fold_pmullw(<8 x i16> %a0, <8 x i16> %a1) { 1912; CHECK-LABEL: stack_fold_pmullw: 1913; CHECK: # %bb.0: 1914; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1915; CHECK-NEXT: #APP 1916; CHECK-NEXT: nop 1917; CHECK-NEXT: #NO_APP 1918; CHECK-NEXT: pmullw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1919; CHECK-NEXT: retq 1920 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1921 %2 = mul <8 x i16> %a0, %a1 1922 ret <8 x i16> %2 1923} 1924 1925define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) { 1926; CHECK-LABEL: stack_fold_pmuludq: 1927; CHECK: # %bb.0: 
1928; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1929; CHECK-NEXT: #APP 1930; CHECK-NEXT: nop 1931; CHECK-NEXT: #NO_APP 1932; CHECK-NEXT: pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1933; CHECK-NEXT: retq 1934 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1935 %2 = bitcast <4 x i32> %a0 to <2 x i64> 1936 %3 = bitcast <4 x i32> %a1 to <2 x i64> 1937 %4 = and <2 x i64> %2, <i64 4294967295, i64 4294967295> 1938 %5 = and <2 x i64> %3, <i64 4294967295, i64 4294967295> 1939 %6 = mul <2 x i64> %4, %5 1940 ret <2 x i64> %6 1941} 1942 1943define <16 x i8> @stack_fold_por(<16 x i8> %a0, <16 x i8> %a1) { 1944; CHECK-LABEL: stack_fold_por: 1945; CHECK: # %bb.0: 1946; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1947; CHECK-NEXT: #APP 1948; CHECK-NEXT: nop 1949; CHECK-NEXT: #NO_APP 1950; CHECK-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1951; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 1952; CHECK-NEXT: psubb %xmm1, %xmm0 1953; CHECK-NEXT: retq 1954 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1955 %2 = or <16 x i8> %a0, %a1 1956 ; add forces execution domain 1957 %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 1958 ret <16 x i8> %3 1959} 1960 1961define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) { 1962; CHECK-LABEL: stack_fold_psadbw: 1963; CHECK: # %bb.0: 1964; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1965; CHECK-NEXT: #APP 1966; CHECK-NEXT: nop 1967; CHECK-NEXT: #NO_APP 1968; CHECK-NEXT: psadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1969; CHECK-NEXT: retq 1970 %1 = tail call <2 x i64> asm sideeffect 
"nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1971 %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) 1972 ret <2 x i64> %2 1973} 1974declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone 1975 1976define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) { 1977; CHECK-LABEL: stack_fold_pshufb: 1978; CHECK: # %bb.0: 1979; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1980; CHECK-NEXT: #APP 1981; CHECK-NEXT: nop 1982; CHECK-NEXT: #NO_APP 1983; CHECK-NEXT: pshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1984; CHECK-NEXT: retq 1985 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1986 %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) 1987 ret <16 x i8> %2 1988} 1989declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone 1990 1991define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) { 1992; CHECK-LABEL: stack_fold_pshufd: 1993; CHECK: # %bb.0: 1994; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1995; CHECK-NEXT: #APP 1996; CHECK-NEXT: nop 1997; CHECK-NEXT: #NO_APP 1998; CHECK-NEXT: pshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1999; CHECK-NEXT: # xmm0 = mem[3,2,1,0] 2000; CHECK-NEXT: retq 2001 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2002 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 2003 ret <4 x i32> %2 2004} 2005 2006define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) { 2007; CHECK-LABEL: stack_fold_pshufhw: 2008; CHECK: # %bb.0: 2009; CHECK-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2010; CHECK-NEXT: #APP 2011; CHECK-NEXT: nop 2012; CHECK-NEXT: #NO_APP 2013; CHECK-NEXT: pshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2014; CHECK-NEXT: # xmm0 = mem[0,1,2,3,7,6,4,4] 2015; CHECK-NEXT: retq 2016 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2017 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4> 2018 ret <8 x i16> %2 2019} 2020 2021define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) { 2022; CHECK-LABEL: stack_fold_pshuflw: 2023; CHECK: # %bb.0: 2024; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2025; CHECK-NEXT: #APP 2026; CHECK-NEXT: nop 2027; CHECK-NEXT: #NO_APP 2028; CHECK-NEXT: pshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2029; CHECK-NEXT: # xmm0 = mem[3,2,1,0,4,5,6,7] 2030; CHECK-NEXT: retq 2031 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2032 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> 2033 ret <8 x i16> %2 2034} 2035 2036define <16 x i8> @stack_fold_psignb(<16 x i8> %a0, <16 x i8> %a1) { 2037; CHECK-LABEL: stack_fold_psignb: 2038; CHECK: # %bb.0: 2039; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2040; CHECK-NEXT: #APP 2041; CHECK-NEXT: nop 2042; CHECK-NEXT: #NO_APP 2043; CHECK-NEXT: psignb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2044; CHECK-NEXT: retq 2045 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2046 %2 = call <16 x i8> 
@llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) 2047 ret <16 x i8> %2 2048} 2049declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone 2050 2051define <4 x i32> @stack_fold_psignd(<4 x i32> %a0, <4 x i32> %a1) { 2052; CHECK-LABEL: stack_fold_psignd: 2053; CHECK: # %bb.0: 2054; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2055; CHECK-NEXT: #APP 2056; CHECK-NEXT: nop 2057; CHECK-NEXT: #NO_APP 2058; CHECK-NEXT: psignd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2059; CHECK-NEXT: retq 2060 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2061 %2 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) 2062 ret <4 x i32> %2 2063} 2064declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone 2065 2066define <8 x i16> @stack_fold_psignw(<8 x i16> %a0, <8 x i16> %a1) { 2067; CHECK-LABEL: stack_fold_psignw: 2068; CHECK: # %bb.0: 2069; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2070; CHECK-NEXT: #APP 2071; CHECK-NEXT: nop 2072; CHECK-NEXT: #NO_APP 2073; CHECK-NEXT: psignw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2074; CHECK-NEXT: retq 2075 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2076 %2 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) 2077 ret <8 x i16> %2 2078} 2079declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone 2080 2081define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) { 2082; CHECK-LABEL: stack_fold_pslld: 2083; CHECK: # %bb.0: 2084; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2085; CHECK-NEXT: #APP 2086; CHECK-NEXT: nop 2087; CHECK-NEXT: #NO_APP 
2088; CHECK-NEXT: pslld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2089; CHECK-NEXT: retq 2090 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2091 %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) 2092 ret <4 x i32> %2 2093} 2094declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone 2095 2096define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) { 2097; CHECK-LABEL: stack_fold_psllq: 2098; CHECK: # %bb.0: 2099; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2100; CHECK-NEXT: #APP 2101; CHECK-NEXT: nop 2102; CHECK-NEXT: #NO_APP 2103; CHECK-NEXT: psllq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2104; CHECK-NEXT: retq 2105 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2106 %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) 2107 ret <2 x i64> %2 2108} 2109declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone 2110 2111define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) { 2112; CHECK-LABEL: stack_fold_psllw: 2113; CHECK: # %bb.0: 2114; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2115; CHECK-NEXT: #APP 2116; CHECK-NEXT: nop 2117; CHECK-NEXT: #NO_APP 2118; CHECK-NEXT: psllw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2119; CHECK-NEXT: retq 2120 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2121 %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) 2122 ret <8 x i16> %2 2123} 2124declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone 2125 
2126define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) { 2127; CHECK-LABEL: stack_fold_psrad: 2128; CHECK: # %bb.0: 2129; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2130; CHECK-NEXT: #APP 2131; CHECK-NEXT: nop 2132; CHECK-NEXT: #NO_APP 2133; CHECK-NEXT: psrad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2134; CHECK-NEXT: retq 2135 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2136 %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) 2137 ret <4 x i32> %2 2138} 2139declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone 2140 2141define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) { 2142; CHECK-LABEL: stack_fold_psraw: 2143; CHECK: # %bb.0: 2144; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2145; CHECK-NEXT: #APP 2146; CHECK-NEXT: nop 2147; CHECK-NEXT: #NO_APP 2148; CHECK-NEXT: psraw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2149; CHECK-NEXT: retq 2150 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2151 %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) 2152 ret <8 x i16> %2 2153} 2154declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone 2155 2156define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) { 2157; CHECK-LABEL: stack_fold_psrld: 2158; CHECK: # %bb.0: 2159; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2160; CHECK-NEXT: #APP 2161; CHECK-NEXT: nop 2162; CHECK-NEXT: #NO_APP 2163; CHECK-NEXT: psrld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2164; CHECK-NEXT: retq 2165 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2166 %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) 2167 ret <4 x i32> %2 2168} 2169declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone 2170 2171define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) { 2172; CHECK-LABEL: stack_fold_psrlq: 2173; CHECK: # %bb.0: 2174; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2175; CHECK-NEXT: #APP 2176; CHECK-NEXT: nop 2177; CHECK-NEXT: #NO_APP 2178; CHECK-NEXT: psrlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2179; CHECK-NEXT: retq 2180 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2181 %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) 2182 ret <2 x i64> %2 2183} 2184declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone 2185 2186define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) { 2187; CHECK-LABEL: stack_fold_psrlw: 2188; CHECK: # %bb.0: 2189; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2190; CHECK-NEXT: #APP 2191; CHECK-NEXT: nop 2192; CHECK-NEXT: #NO_APP 2193; CHECK-NEXT: psrlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2194; CHECK-NEXT: retq 2195 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 2196 %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) 2197 ret <8 x i16> %2 2198} 2199declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone 2200 2201define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) { 2202; CHECK-LABEL: stack_fold_psubb: 2203; CHECK: # %bb.0: 2204; CHECK-NEXT: movaps 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <16 x i8> %a0, %a1
  ret <16 x i8> %2
}

; Wrapping vector subtracts: the spilled second operand must be folded as the
; memory operand of psub[bdqw] rather than reloaded into a register first.
define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psubd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <4 x i32> %a0, %a1
  ret <4 x i32> %2
}

define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psubq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <2 x i64> %a0, %a1
  ret <2 x i64> %2
}

; Saturating subtracts: the ssub.sat/usub.sat intrinsics lower to
; psubs[bw]/psubus[bw] and must still fold their reloaded operand.
define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubusb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubusw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    psubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <8 x i16> %a0, %a1
  ret <8 x i16> %2
}

; ptest folds its reloaded memory operand; the ptestc (carry) result is
; materialized with xor + setb per the CHECK lines below.
define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_ptest:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    ptest {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %2
}
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone

; punpck* interleaves: the shufflevector masks select the interleaved
; high/low halves; the dq/qdq variants append an add to force the integer
; execution domain.
define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_punpckhbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i8> %2
}

define <4 x i32> @stack_fold_punpckhdq(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_punpckhdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-NEXT:    psubd %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; add forces execution domain
  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %3
}

define <2 x i64> @stack_fold_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_punpckhqdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[1],mem[1]
; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-NEXT:    psubq %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
  ; add forces execution domain
  %3 = add <2 x i64> %2, <i64 1, i64 1>
  ret <2 x i64> %3
}

define <8 x i16> @stack_fold_punpckhwd(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_punpckhwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <8 x i16> %2
}

define <16 x i8> @stack_fold_punpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_punpcklbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  ret <16 x i8> %2
}

define <4 x i32> @stack_fold_punpckldq(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_punpckldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-NEXT:    psubd %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ; add forces execution domain
  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %3
}

define <2 x i64> @stack_fold_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_punpcklqdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-NEXT:    psubq %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
  ; add forces execution domain
  %3 = add <2 x i64> %2, <i64 1, i64 1>
  ret <2 x i64> %3
}

define <8 x i16> @stack_fold_punpcklwd(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_punpcklwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i16> %2
}

; pxor folds its reloaded operand; the trailing add forces the integer domain.
define <16 x i8> @stack_fold_pxor(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_pxor:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-NEXT:    psubb %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = xor <16 x i8> %a0, %a1
  ; add forces execution domain
  %3 = add <16 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <16 x i8> %3
}