; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

define <2 x double> @stack_fold_vfrczpd(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_vfrczpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfrczpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double>) nounwind readnone

define <4 x double> @stack_fold_vfrczpd_ymm(<4 x double> %a0) {
; CHECK-LABEL: stack_fold_vfrczpd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfrczpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %a0)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double>) nounwind readnone

define <4 x float> @stack_fold_vfrczps(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_vfrczps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfrczps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float>) nounwind readnone

define <8 x float> @stack_fold_vfrczps_ymm(<8 x float> %a0) {
; CHECK-LABEL: stack_fold_vfrczps_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfrczps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone

define <2 x double> @stack_fold_vfrczsd(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_vfrczsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfrczsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone

define <4 x float> @stack_fold_vfrczss(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_vfrczss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfrczss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone

define <2 x i64> @stack_fold_vpcmov_rm(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpcmov_rm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmov {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2)
  ret <2 x i64> %2
}
define <2 x i64> @stack_fold_vpcmov_mr(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpcmov_mr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmov %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a2, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @stack_fold_vpcmov_rm_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpcmov_rm_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmov {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2)
  ret <4 x i64> %2
}
define <4 x i64> @stack_fold_vpcmov_mr_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpcmov_mr_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmov %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a2, <4 x i64> %a1)
  ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone

define <16 x i8> @stack_fold_vpcomb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_vpcomb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcomltb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %a0, <16 x i8> %a1, i8 0)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone

define <4 x i32> @stack_fold_vpcomd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_vpcomd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcomltd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %a0, <4 x i32> %a1, i8 0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone

define <2 x i64> @stack_fold_vpcomq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_vpcomq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcomltq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone

define <16 x i8> @stack_fold_vpcomub(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_vpcomub:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcomltub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %a0, <16 x i8> %a1, i8 0)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone

define <4 x i32> @stack_fold_vpcomud(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_vpcomud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcomltud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %a0, <4 x i32> %a1, i8 0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone

define <2 x i64> @stack_fold_vpcomuq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_vpcomuq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcomltuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone

define <8 x i16> @stack_fold_vpcomuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_vpcomuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcomltuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %a0, <8 x i16> %a1, i8 0)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone

define <8 x i16> @stack_fold_vpcomw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_vpcomw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcomltw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %a0, <8 x i16> %a1, i8 0)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone

define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpermil2pd_rm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermil2pd $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 0)
  ret <2 x double> %2
}
define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x i64> %a1, <2 x double> %a2) {
; CHECK-LABEL: stack_fold_vpermil2pd_mr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermil2pd $0, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x i64> %a1, i8 0)
  ret <2 x double> %2
}
declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone

define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpermil2pd_rm_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermil2pd $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 0)
  ret <4 x double> %2
}
define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x double> %a2) {
; CHECK-LABEL: stack_fold_vpermil2pd_mr_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermil2pd $0, %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x i64> %a1, i8 0)
  ret <4 x double> %2
}
declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone

define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpermil2ps_rm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermil2ps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 0)
  ret <4 x float> %2
}
define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x i32> %a1, <4 x float> %a2) {
; CHECK-LABEL: stack_fold_vpermil2ps_mr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermil2ps $0, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x i32> %a1, i8 0)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone

define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpermil2ps_rm_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermil2ps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 0)
  ret <8 x float> %2
}
define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x float> %a2) {
; CHECK-LABEL: stack_fold_vpermil2ps_mr_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermil2ps $0, %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x i32> %a1, i8 0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone

define <4 x i32> @stack_fold_vphaddbd(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_vphaddbd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8>) nounwind readnone

define <2 x i64> @stack_fold_vphaddbq(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_vphaddbq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8> %a0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_vphaddbw(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_vphaddbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8> %a0)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8>) nounwind readnone

define <2 x i64> @stack_fold_vphadddq(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_vphadddq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphadddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32> %a0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32>) nounwind readnone

define <4 x i32> @stack_fold_vphaddubd(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_vphaddubd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8>) nounwind readnone

define <2 x i64> @stack_fold_vphaddubq(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_vphaddubq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8> %a0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8>) nounwind readnone

define <8 x i16> @stack_fold_vphaddubw(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_vphaddubw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8> %a0)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8>) nounwind readnone

define <2 x i64> @stack_fold_vphaddudq(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_vphaddudq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddudq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32> %a0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32>) nounwind readnone

define <4 x i32> @stack_fold_vphadduwd(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_vphadduwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphadduwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16>) nounwind readnone

define <2 x i64> @stack_fold_vphadduwq(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_vphadduwq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphadduwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16> %a0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16>) nounwind readnone

define <4 x i32> @stack_fold_vphaddwd(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_vphaddwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16>) nounwind readnone

define <2 x i64> @stack_fold_vphaddwq(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_vphaddwq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16> %a0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16>) nounwind readnone

define <8 x i16> @stack_fold_vphsubbw(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_vphsubbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphsubbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop",
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 580 %2 = call <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8> %a0) 581 ret <8 x i16> %2 582} 583declare <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8>) nounwind readnone 584 585define <2 x i64> @stack_fold_vphsubdq(<4 x i32> %a0) { 586; CHECK-LABEL: stack_fold_vphsubdq: 587; CHECK: # %bb.0: 588; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 589; CHECK-NEXT: #APP 590; CHECK-NEXT: nop 591; CHECK-NEXT: #NO_APP 592; CHECK-NEXT: vphsubdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 593; CHECK-NEXT: retq 594 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 595 %2 = call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %a0) 596 ret <2 x i64> %2 597} 598declare <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32>) nounwind readnone 599 600define <4 x i32> @stack_fold_vphsubwd(<8 x i16> %a0) { 601; CHECK-LABEL: stack_fold_vphsubwd: 602; CHECK: # %bb.0: 603; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 604; CHECK-NEXT: #APP 605; CHECK-NEXT: nop 606; CHECK-NEXT: #NO_APP 607; CHECK-NEXT: vphsubwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 608; CHECK-NEXT: retq 609 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 610 %2 = call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %a0) 611 ret <4 x i32> %2 612} 613declare <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16>) nounwind readnone 614 615define <4 x i32> @stack_fold_vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { 616; CHECK-LABEL: stack_fold_vpmacsdd: 617; CHECK: # %bb.0: 618; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 619; CHECK-NEXT: vmovaps 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 620; CHECK-NEXT: #APP 621; CHECK-NEXT: nop 622; CHECK-NEXT: #NO_APP 623; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 624; CHECK-NEXT: vpmacsdd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 625; CHECK-NEXT: retq 626 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 627 %2 = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) 628 ret <4 x i32> %2 629} 630declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone 631 632define <2 x i64> @stack_fold_vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) { 633; CHECK-LABEL: stack_fold_vpmacsdqh: 634; CHECK: # %bb.0: 635; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 636; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 637; CHECK-NEXT: #APP 638; CHECK-NEXT: nop 639; CHECK-NEXT: #NO_APP 640; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 641; CHECK-NEXT: vpmacsdqh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 642; CHECK-NEXT: retq 643 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 644 %2 = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) 645 ret <2 x i64> %2 646} 647declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone 648 649define <2 x i64> @stack_fold_vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) { 650; CHECK-LABEL: stack_fold_vpmacsdql: 651; CHECK: # %bb.0: 652; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 653; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 654; CHECK-NEXT: #APP 655; 
CHECK-NEXT: nop 656; CHECK-NEXT: #NO_APP 657; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 658; CHECK-NEXT: vpmacsdql %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 659; CHECK-NEXT: retq 660 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 661 %2 = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) 662 ret <2 x i64> %2 663} 664declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone 665 666define <4 x i32> @stack_fold_vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { 667; CHECK-LABEL: stack_fold_vpmacssdd: 668; CHECK: # %bb.0: 669; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 670; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 671; CHECK-NEXT: #APP 672; CHECK-NEXT: nop 673; CHECK-NEXT: #NO_APP 674; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 675; CHECK-NEXT: vpmacssdd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 676; CHECK-NEXT: retq 677 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 678 %2 = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) 679 ret <4 x i32> %2 680} 681declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone 682 683define <2 x i64> @stack_fold_vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) { 684; CHECK-LABEL: stack_fold_vpmacssdqh: 685; CHECK: # %bb.0: 686; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 687; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 688; CHECK-NEXT: #APP 689; CHECK-NEXT: nop 690; CHECK-NEXT: #NO_APP 691; CHECK-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 692; CHECK-NEXT: vpmacssdqh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 693; CHECK-NEXT: retq 694 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 695 %2 = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) 696 ret <2 x i64> %2 697} 698declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone 699 700define <2 x i64> @stack_fold_vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) { 701; CHECK-LABEL: stack_fold_vpmacssdql: 702; CHECK: # %bb.0: 703; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 704; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 705; CHECK-NEXT: #APP 706; CHECK-NEXT: nop 707; CHECK-NEXT: #NO_APP 708; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 709; CHECK-NEXT: vpmacssdql %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 710; CHECK-NEXT: retq 711 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 712 %2 = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) 713 ret <2 x i64> %2 714} 715declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone 716 717define <4 x i32> @stack_fold_vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) { 718; CHECK-LABEL: stack_fold_vpmacsswd: 719; CHECK: # %bb.0: 720; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 721; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 722; CHECK-NEXT: #APP 723; CHECK-NEXT: nop 724; CHECK-NEXT: #NO_APP 725; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 726; CHECK-NEXT: 
vpmacsswd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 727; CHECK-NEXT: retq 728 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 729 %2 = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) 730 ret <4 x i32> %2 731} 732declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone 733 734define <8 x i16> @stack_fold_vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) { 735; CHECK-LABEL: stack_fold_vpmacssww: 736; CHECK: # %bb.0: 737; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 738; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 739; CHECK-NEXT: #APP 740; CHECK-NEXT: nop 741; CHECK-NEXT: #NO_APP 742; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 743; CHECK-NEXT: vpmacssww %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 744; CHECK-NEXT: retq 745 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 746 %2 = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) 747 ret <8 x i16> %2 748} 749declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone 750 751define <4 x i32> @stack_fold_vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) { 752; CHECK-LABEL: stack_fold_vpmacswd: 753; CHECK: # %bb.0: 754; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 755; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 756; CHECK-NEXT: #APP 757; CHECK-NEXT: nop 758; CHECK-NEXT: #NO_APP 759; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 760; CHECK-NEXT: vpmacswd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 
761; CHECK-NEXT: retq 762 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 763 %2 = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) 764 ret <4 x i32> %2 765} 766declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone 767 768define <8 x i16> @stack_fold_vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) { 769; CHECK-LABEL: stack_fold_vpmacsww: 770; CHECK: # %bb.0: 771; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 772; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 773; CHECK-NEXT: #APP 774; CHECK-NEXT: nop 775; CHECK-NEXT: #NO_APP 776; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 777; CHECK-NEXT: vpmacsww %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 778; CHECK-NEXT: retq 779 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 780 %2 = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) 781 ret <8 x i16> %2 782} 783declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone 784 785define <4 x i32> @stack_fold_vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) { 786; CHECK-LABEL: stack_fold_vpmadcsswd: 787; CHECK: # %bb.0: 788; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 789; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 790; CHECK-NEXT: #APP 791; CHECK-NEXT: nop 792; CHECK-NEXT: #NO_APP 793; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 794; CHECK-NEXT: vpmadcsswd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 795; CHECK-NEXT: retq 796 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 797 %2 = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) 798 ret <4 x i32> %2 799} 800declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone 801 802define <4 x i32> @stack_fold_vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) { 803; CHECK-LABEL: stack_fold_vpmadcswd: 804; CHECK: # %bb.0: 805; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 806; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 807; CHECK-NEXT: #APP 808; CHECK-NEXT: nop 809; CHECK-NEXT: #NO_APP 810; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 811; CHECK-NEXT: vpmadcswd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 812; CHECK-NEXT: retq 813 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 814 %2 = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) 815 ret <4 x i32> %2 816} 817declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone 818 819define <16 x i8> @stack_fold_vpperm_rm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) { 820; CHECK-LABEL: stack_fold_vpperm_rm: 821; CHECK: # %bb.0: 822; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 823; CHECK-NEXT: #APP 824; CHECK-NEXT: nop 825; CHECK-NEXT: #NO_APP 826; CHECK-NEXT: vpperm {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0, %xmm0 # 16-byte Folded Reload 827; CHECK-NEXT: retq 828 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 829 %2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) 830 ret 
<16 x i8> %2 831} 832define <16 x i8> @stack_fold_vpperm_mr(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) { 833; CHECK-LABEL: stack_fold_vpperm_mr: 834; CHECK: # %bb.0: 835; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 836; CHECK-NEXT: #APP 837; CHECK-NEXT: nop 838; CHECK-NEXT: #NO_APP 839; CHECK-NEXT: vpperm %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 840; CHECK-NEXT: retq 841 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 842 %2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a2, <16 x i8> %a1) 843 ret <16 x i8> %2 844} 845declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone 846 847define <16 x i8> @stack_fold_vprotb(<16 x i8> %a0) { 848; CHECK-LABEL: stack_fold_vprotb: 849; CHECK: # %bb.0: 850; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 851; CHECK-NEXT: #APP 852; CHECK-NEXT: nop 853; CHECK-NEXT: #NO_APP 854; CHECK-NEXT: vprotb $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 855; CHECK-NEXT: retq 856 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 857 %2 = call <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8> %a0, i8 7) 858 ret <16 x i8> %2 859} 860declare <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8>, i8) nounwind readnone 861 862define <16 x i8> @stack_fold_vprotb_rm(<16 x i8> %a0, <16 x i8> %a1) { 863; CHECK-LABEL: stack_fold_vprotb_rm: 864; CHECK: # %bb.0: 865; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 866; CHECK-NEXT: #APP 867; CHECK-NEXT: nop 868; CHECK-NEXT: #NO_APP 869; CHECK-NEXT: vprotb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 870; CHECK-NEXT: retq 871 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 872 %2 = call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %a0, <16 x i8> %a1) 873 ret <16 x i8> %2 874} 875define <16 x i8> @stack_fold_vprotb_mr(<16 x i8> %a0, <16 x i8> %a1) { 876; CHECK-LABEL: stack_fold_vprotb_mr: 877; CHECK: # %bb.0: 878; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 879; CHECK-NEXT: #APP 880; CHECK-NEXT: nop 881; CHECK-NEXT: #NO_APP 882; CHECK-NEXT: vprotb %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 883; CHECK-NEXT: retq 884 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 885 %2 = call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %a1, <16 x i8> %a0) 886 ret <16 x i8> %2 887} 888declare <16 x i8> @llvm.x86.xop.vprotb(<16 x i8>, <16 x i8>) nounwind readnone 889 890define <4 x i32> @stack_fold_vprotd(<4 x i32> %a0) { 891; CHECK-LABEL: stack_fold_vprotd: 892; CHECK: # %bb.0: 893; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 894; CHECK-NEXT: #APP 895; CHECK-NEXT: nop 896; CHECK-NEXT: #NO_APP 897; CHECK-NEXT: vprotd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 898; CHECK-NEXT: retq 899 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 900 %2 = call <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32> %a0, i8 7) 901 ret <4 x i32> %2 902} 903declare <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32>, i8) nounwind readnone 904 905define <4 x i32> @stack_fold_vprotd_rm(<4 x i32> %a0, <4 x i32> %a1) { 906; CHECK-LABEL: stack_fold_vprotd_rm: 907; CHECK: # %bb.0: 908; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 909; CHECK-NEXT: #APP 910; CHECK-NEXT: nop 911; CHECK-NEXT: #NO_APP 912; 
CHECK-NEXT: vprotd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 913; CHECK-NEXT: retq 914 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 915 %2 = call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %a0, <4 x i32> %a1) 916 ret <4 x i32> %2 917} 918define <4 x i32> @stack_fold_vprotd_mr(<4 x i32> %a0, <4 x i32> %a1) { 919; CHECK-LABEL: stack_fold_vprotd_mr: 920; CHECK: # %bb.0: 921; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 922; CHECK-NEXT: #APP 923; CHECK-NEXT: nop 924; CHECK-NEXT: #NO_APP 925; CHECK-NEXT: vprotd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 926; CHECK-NEXT: retq 927 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 928 %2 = call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %a1, <4 x i32> %a0) 929 ret <4 x i32> %2 930} 931declare <4 x i32> @llvm.x86.xop.vprotd(<4 x i32>, <4 x i32>) nounwind readnone 932 933define <2 x i64> @stack_fold_vprotq(<2 x i64> %a0) { 934; CHECK-LABEL: stack_fold_vprotq: 935; CHECK: # %bb.0: 936; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 937; CHECK-NEXT: #APP 938; CHECK-NEXT: nop 939; CHECK-NEXT: #NO_APP 940; CHECK-NEXT: vprotq $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 941; CHECK-NEXT: retq 942 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 943 %2 = call <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64> %a0, i8 7) 944 ret <2 x i64> %2 945} 946declare <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64>, i8) nounwind readnone 947 948define <2 x i64> @stack_fold_vprotq_rm(<2 x i64> %a0, <2 x i64> %a1) { 949; CHECK-LABEL: stack_fold_vprotq_rm: 950; CHECK: 
# %bb.0: 951; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 952; CHECK-NEXT: #APP 953; CHECK-NEXT: nop 954; CHECK-NEXT: #NO_APP 955; CHECK-NEXT: vprotq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 956; CHECK-NEXT: retq 957 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 958 %2 = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a0, <2 x i64> %a1) 959 ret <2 x i64> %2 960} 961define <2 x i64> @stack_fold_vprotq_mr(<2 x i64> %a0, <2 x i64> %a1) { 962; CHECK-LABEL: stack_fold_vprotq_mr: 963; CHECK: # %bb.0: 964; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 965; CHECK-NEXT: #APP 966; CHECK-NEXT: nop 967; CHECK-NEXT: #NO_APP 968; CHECK-NEXT: vprotq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 969; CHECK-NEXT: retq 970 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 971 %2 = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a1, <2 x i64> %a0) 972 ret <2 x i64> %2 973} 974declare <2 x i64> @llvm.x86.xop.vprotq(<2 x i64>, <2 x i64>) nounwind readnone 975 976define <8 x i16> @stack_fold_vprotw(<8 x i16> %a0) { 977; CHECK-LABEL: stack_fold_vprotw: 978; CHECK: # %bb.0: 979; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 980; CHECK-NEXT: #APP 981; CHECK-NEXT: nop 982; CHECK-NEXT: #NO_APP 983; CHECK-NEXT: vprotw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 984; CHECK-NEXT: retq 985 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 986 %2 = call <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16> %a0, i8 7) 987 ret <8 x i16> %2 988} 989declare <8 x i16> @llvm.x86.xop.vprotwi(<8 x 
i16>, i8) nounwind readnone 990 991define <8 x i16> @stack_fold_vprotw_rm(<8 x i16> %a0, <8 x i16> %a1) { 992; CHECK-LABEL: stack_fold_vprotw_rm: 993; CHECK: # %bb.0: 994; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 995; CHECK-NEXT: #APP 996; CHECK-NEXT: nop 997; CHECK-NEXT: #NO_APP 998; CHECK-NEXT: vprotw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 999; CHECK-NEXT: retq 1000 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1001 %2 = call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %a0, <8 x i16> %a1) 1002 ret <8 x i16> %2 1003} 1004define <8 x i16> @stack_fold_vprotw_mr(<8 x i16> %a0, <8 x i16> %a1) { 1005; CHECK-LABEL: stack_fold_vprotw_mr: 1006; CHECK: # %bb.0: 1007; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1008; CHECK-NEXT: #APP 1009; CHECK-NEXT: nop 1010; CHECK-NEXT: #NO_APP 1011; CHECK-NEXT: vprotw %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1012; CHECK-NEXT: retq 1013 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1014 %2 = call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %a1, <8 x i16> %a0) 1015 ret <8 x i16> %2 1016} 1017declare <8 x i16> @llvm.x86.xop.vprotw(<8 x i16>, <8 x i16>) nounwind readnone 1018 1019define <16 x i8> @stack_fold_vpshab_rm(<16 x i8> %a0, <16 x i8> %a1) { 1020; CHECK-LABEL: stack_fold_vpshab_rm: 1021; CHECK: # %bb.0: 1022; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1023; CHECK-NEXT: #APP 1024; CHECK-NEXT: nop 1025; CHECK-NEXT: #NO_APP 1026; CHECK-NEXT: vpshab {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1027; CHECK-NEXT: retq 1028 %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1029 %2 = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %a0, <16 x i8> %a1) 1030 ret <16 x i8> %2 1031} 1032define <16 x i8> @stack_fold_vpshab_mr(<16 x i8> %a0, <16 x i8> %a1) { 1033; CHECK-LABEL: stack_fold_vpshab_mr: 1034; CHECK: # %bb.0: 1035; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1036; CHECK-NEXT: #APP 1037; CHECK-NEXT: nop 1038; CHECK-NEXT: #NO_APP 1039; CHECK-NEXT: vpshab %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1040; CHECK-NEXT: retq 1041 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1042 %2 = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %a1, <16 x i8> %a0) 1043 ret <16 x i8> %2 1044} 1045declare <16 x i8> @llvm.x86.xop.vpshab(<16 x i8>, <16 x i8>) nounwind readnone 1046 1047define <4 x i32> @stack_fold_vpshad_rm(<4 x i32> %a0, <4 x i32> %a1) { 1048; CHECK-LABEL: stack_fold_vpshad_rm: 1049; CHECK: # %bb.0: 1050; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1051; CHECK-NEXT: #APP 1052; CHECK-NEXT: nop 1053; CHECK-NEXT: #NO_APP 1054; CHECK-NEXT: vpshad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1055; CHECK-NEXT: retq 1056 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1057 %2 = call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %a0, <4 x i32> %a1) 1058 ret <4 x i32> %2 1059} 1060define <4 x i32> @stack_fold_vpshad_mr(<4 x i32> %a0, <4 x i32> %a1) { 1061; CHECK-LABEL: stack_fold_vpshad_mr: 1062; CHECK: # %bb.0: 1063; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1064; CHECK-NEXT: #APP 1065; CHECK-NEXT: nop 1066; CHECK-NEXT: #NO_APP 1067; CHECK-NEXT: 
vpshad %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %a1, <4 x i32> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpshad(<4 x i32>, <4 x i32>) nounwind readnone

; Each instruction below is tested in two forms:
;  _rm: the intrinsic is called as (%a0, %a1); the inline asm clobbers force %a1
;       (spilled from %xmm1) to be reloaded, and the reload is expected to fold
;       into the instruction's first (memory) source operand.
;  _mr: the intrinsic arguments are swapped to (%a1, %a0), so the spilled value
;       lands in the other operand position; the CHECK line expects the
;       commuted encoding with the memory operand second.
; Note the asm clobber list spares %xmm0/%xmm1 so the argument registers survive
; until the spill/reload sequence under test.

define <2 x i64> @stack_fold_vpshaq_rm(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_vpshaq_rm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshaq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
define <2 x i64> @stack_fold_vpshaq_mr(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_vpshaq_mr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshaq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a1, <2 x i64> %a0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64>, <2 x i64>) nounwind readnone

define <8 x i16> @stack_fold_vpshaw_rm(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_vpshaw_rm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshaw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
define <8 x i16> @stack_fold_vpshaw_mr(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_vpshaw_mr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshaw %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %a1, <8 x i16> %a0)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @stack_fold_vpshlb_rm(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_vpshlb_rm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshlb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %a0, <16 x i8> %a1)
  ret <16 x i8> %2
}
define <16 x i8> @stack_fold_vpshlb_mr(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: stack_fold_vpshlb_mr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshlb %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %a1, <16 x i8> %a0)
  ret <16 x i8> %2
}
declare <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8>, <16 x i8>) nounwind readnone

define <4 x i32> @stack_fold_vpshld_rm(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_vpshld_rm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
define <4 x i32> @stack_fold_vpshld_mr(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_vpshld_mr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshld %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %a1, <4 x i32> %a0)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.xop.vpshld(<4 x i32>, <4 x i32>) nounwind readnone

define <2 x i64> @stack_fold_vpshlq_rm(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_vpshlq_rm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
define <2 x i64> @stack_fold_vpshlq_mr(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_vpshlq_mr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshlq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a1, <2 x i64> %a0)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64>, <2 x i64>) nounwind readnone

define <8 x i16> @stack_fold_vpshlw_rm(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_vpshlw_rm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %a0, <8 x i16> %a1)
  ret <8 x i16> %2
}
define <8 x i16> @stack_fold_vpshlw_mr(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_vpshlw_mr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshlw %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %a1, <8 x i16> %a0)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone