1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq < %s | FileCheck %s 3 4target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" 5target triple = "x86_64-unknown-unknown" 6 7; Stack reload folding tests. 8; 9; By including a nop call with sideeffects we can force a partial register spill of the 10; relevant registers and check that the reload is correctly folded into the instruction. 11 12define <8 x i32> @stack_fold_valignd_ymm(<8 x i32> %a, <8 x i32> %b) { 13; CHECK-LABEL: stack_fold_valignd_ymm: 14; CHECK: # %bb.0: 15; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 16; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 17; CHECK-NEXT: #APP 18; CHECK-NEXT: nop 19; CHECK-NEXT: #NO_APP 20; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 21; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 22; CHECK-NEXT: # ymm0 = mem[1,2,3,4,5,6,7],ymm0[0] 23; CHECK-NEXT: retq 24 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 25 %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 26 ret <8 x i32> %2 27} 28 29define <8 x i32> @stack_fold_valignd_ymm_mask(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %passthru, i8 %mask) { 30; CHECK-LABEL: stack_fold_valignd_ymm_mask: 31; CHECK: # %bb.0: 32; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 33; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 34; CHECK-NEXT: #APP 35; CHECK-NEXT: nop 36; CHECK-NEXT: #NO_APP 37; CHECK-NEXT: kmovd %esi, %k1 38; CHECK-NEXT: vmovdqa (%rdi), %ymm1 39; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 40; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload 41; CHECK-NEXT: # ymm1 {%k1} = mem[1,2,3,4,5,6,7],ymm0[0] 42; CHECK-NEXT: vmovdqa %ymm1, %ymm0 43; CHECK-NEXT: retq 44 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 45 %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 46 %3 = bitcast i8 %mask to <8 x i1> 47 %4 = load <8 x i32>, <8 x i32>* %passthru 48 %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4 49 ret <8 x i32> %5 50} 51 52define <8 x i32> @stack_fold_valignd_ymm_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) { 53; CHECK-LABEL: stack_fold_valignd_ymm_maskz: 54; CHECK: # %bb.0: 55; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 56; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 57; CHECK-NEXT: #APP 58; CHECK-NEXT: nop 59; CHECK-NEXT: #NO_APP 60; CHECK-NEXT: kmovd %edi, %k1 61; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 62; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 63; CHECK-NEXT: # ymm0 {%k1} {z} = mem[1,2,3,4,5,6,7],ymm0[0] 64; CHECK-NEXT: retq 65 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 66 %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 67 %3 = bitcast i8 %mask to <8 x i1> 68 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer 69 ret <8 x i32> %4 70} 71 72define <4 x i64> @stack_fold_valignq_ymm(<4 x i64> %a, <4 x i64> %b) { 73; CHECK-LABEL: stack_fold_valignq_ymm: 74; CHECK: # %bb.0: 75; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 76; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 77; CHECK-NEXT: #APP 78; CHECK-NEXT: nop 79; CHECK-NEXT: #NO_APP 80; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 81; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 82; CHECK-NEXT: # ymm0 = mem[1,2,3],ymm0[0] 83; CHECK-NEXT: retq 84 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 85 %2 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4> 86 ret <4 x i64> %2 87} 88 89define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) { 90; CHECK-LABEL: stack_fold_pavgb: 91; CHECK: # %bb.0: 92; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 93; CHECK-NEXT: #APP 94; CHECK-NEXT: nop 95; CHECK-NEXT: #NO_APP 96; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 97; CHECK-NEXT: retq 98 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 99 %2 = zext <16 x i8> %a0 to <16 x i16> 100 %3 = zext <16 x i8> %a1 to <16 x i16> 101 %4 = add <16 x i16> %2, %3 102 %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 103 %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 104 %7 = trunc <16 x i16> %6 to <16 x i8> 105 ret <16 x i8> %7 106} 107 108define <32 x i8> @stack_fold_pavgb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 109; CHECK-LABEL: stack_fold_pavgb_ymm: 110; CHECK: # %bb.0: 111; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 112; CHECK-NEXT: #APP 113; CHECK-NEXT: nop 114; CHECK-NEXT: #NO_APP 115; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 116; CHECK-NEXT: retq 117 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 118 %2 = zext <32 x i8> %a0 to <32 x i16> 119 %3 = zext <32 x i8> %a1 to <32 x i16> 120 %4 = add <32 x i16> %2, %3 121 %5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 122 %6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 123 %7 = trunc <32 x i16> %6 to <32 x i8> 124 ret <32 x i8> %7 125} 126 127define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) { 128; CHECK-LABEL: stack_fold_pavgw: 129; CHECK: # %bb.0: 130; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 131; CHECK-NEXT: #APP 132; CHECK-NEXT: nop 133; CHECK-NEXT: #NO_APP 134; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 135; CHECK-NEXT: retq 136 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 137 %2 = zext <8 x i16> %a0 to <8 x i32> 138 %3 = zext <8 x i16> %a1 to <8 x i32> 139 %4 = add <8 x i32> %2, %3 140 %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 141 %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 142 %7 = trunc <8 x i32> %6 to <8 x i16> 143 ret <8 x i16> %7 144} 145 146define <16 x i16> @stack_fold_pavgw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 147; CHECK-LABEL: stack_fold_pavgw_ymm: 148; CHECK: # %bb.0: 149; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 150; CHECK-NEXT: #APP 151; CHECK-NEXT: nop 152; CHECK-NEXT: #NO_APP 153; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 154; CHECK-NEXT: retq 155 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 156 %2 = zext <16 x i16> %a0 to <16 x i32> 157 %3 = zext <16 x i16> %a1 to <16 x i32> 158 %4 = add <16 x i32> %2, %3 159 %5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 160 %6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 161 %7 = trunc <16 x i32> %6 to <16 x i16> 162 ret <16 x i16> %7 163} 164 165define <4 x i32> @stack_fold_vpconflictd(<4 x i32> %a0) { 166; CHECK-LABEL: stack_fold_vpconflictd: 167; CHECK: # %bb.0: 168; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 169; CHECK-NEXT: #APP 170; CHECK-NEXT: nop 171; CHECK-NEXT: #NO_APP 172; CHECK-NEXT: vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 173; CHECK-NEXT: retq 174 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 175 %2 = call <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32> %a0) 176 ret <4 x i32> %2 177} 178 179define <8 x i32> @stack_fold_vpconflictd_ymm(<8 x i32> %a0) { 180; CHECK-LABEL: stack_fold_vpconflictd_ymm: 181; CHECK: # %bb.0: 182; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 183; CHECK-NEXT: #APP 184; CHECK-NEXT: nop 185; CHECK-NEXT: #NO_APP 186; CHECK-NEXT: vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 187; CHECK-NEXT: retq 188 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 189 %2 = call <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32> %a0) 190 ret <8 x i32> %2 191} 192 193define <2 x i64> @stack_fold_vpconflictq(<2 x i64> %a0) { 194; CHECK-LABEL: stack_fold_vpconflictq: 195; CHECK: # %bb.0: 196; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 197; CHECK-NEXT: #APP 198; CHECK-NEXT: nop 199; CHECK-NEXT: #NO_APP 200; CHECK-NEXT: vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 201; CHECK-NEXT: retq 202 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 203 %2 = call <2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64> %a0) 204 ret <2 x i64> %2 205} 206 207define <4 x i64> @stack_fold_vpconflictq_ymm(<4 x i64> %a0) { 208; CHECK-LABEL: stack_fold_vpconflictq_ymm: 209; CHECK: # %bb.0: 210; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 211; CHECK-NEXT: #APP 212; CHECK-NEXT: nop 213; CHECK-NEXT: #NO_APP 214; CHECK-NEXT: vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 215; CHECK-NEXT: retq 216 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 217 %2 = call <4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64> %a0) 218 ret <4 x i64> %2 219} 220 221define <4 x i32> @stack_fold_extracti32x4(<8 x i16> %a0, <8 x i32> %a1) { 222; CHECK-LABEL: stack_fold_extracti32x4: 223; CHECK: # %bb.0: 224; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 225; CHECK-NEXT: vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 226; CHECK-NEXT: #APP 227; CHECK-NEXT: nop 228; CHECK-NEXT: #NO_APP 229; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 230; CHECK-NEXT: vzeroupper 231; CHECK-NEXT: retq 232 ; zext forces execution domain 233 %1 = zext <8 x i16> %a0 to <8 x i32> 234 %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 235 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 236 ret <4 x i32> %2 237} 238 239define <2 x i64> @stack_fold_extracti64x2(<4 x i32> %a0, <4 x i64> %a1) { 240; CHECK-LABEL: stack_fold_extracti64x2: 241; CHECK: # %bb.0: 242; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 243; CHECK-NEXT: vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 244; CHECK-NEXT: #APP 245; CHECK-NEXT: nop 246; CHECK-NEXT: #NO_APP 247; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 248; CHECK-NEXT: vzeroupper 249; CHECK-NEXT: retq 250 ; zext forces execution domain 251 %1 = zext <4 x i32> %a0 to <4 x i64> 252 %2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32> <i32 2, i32 3> 253 %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 254 ret <2 x i64> %2 255} 256 257define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) { 258; CHECK-LABEL: stack_fold_inserti32x4: 259; CHECK: # %bb.0: 260; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 261; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 262; CHECK-NEXT: #APP 263; CHECK-NEXT: nop 264; CHECK-NEXT: #NO_APP 265; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 266; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 267; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 268; CHECK-NEXT: retq 269 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 270 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 271 ; add forces execution domain 272 %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 273 ret <8 x i32> %3 274} 275 276define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) { 277; CHECK-LABEL: stack_fold_inserti64x2: 278; CHECK: # %bb.0: 279; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 280; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 281; CHECK-NEXT: #APP 282; CHECK-NEXT: nop 283; CHECK-NEXT: #NO_APP 284; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 285; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 286; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0 287; CHECK-NEXT: retq 288 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 289 %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 290 ; add forces execution domain 291 %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1> 292 ret <4 x i64> %3 293} 294 295define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) { 296; CHECK-LABEL: stack_fold_pabsb: 297; CHECK: # %bb.0: 298; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 299; CHECK-NEXT: #APP 300; CHECK-NEXT: nop 301; CHECK-NEXT: #NO_APP 302; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 303; CHECK-NEXT: retq 304 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 305 %2 = icmp sgt <16 x i8> %a0, zeroinitializer 306 %3 = sub <16 x i8> zeroinitializer, %a0 307 %4 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %3 308 ret <16 x i8> %4 309} 310 311define <32 x i8> @stack_fold_pabsb_ymm(<32 x i8> %a0) { 312; CHECK-LABEL: stack_fold_pabsb_ymm: 313; CHECK: # %bb.0: 314; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 315; CHECK-NEXT: #APP 316; CHECK-NEXT: nop 317; CHECK-NEXT: #NO_APP 318; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 319; CHECK-NEXT: retq 320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 321 %2 = icmp sgt <32 x i8> %a0, zeroinitializer 322 %3 = sub <32 x i8> zeroinitializer, %a0 323 %4 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %3 324 ret <32 x i8> %4 325} 326 327define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) { 328; CHECK-LABEL: stack_fold_pabsd: 329; CHECK: # %bb.0: 330; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 331; CHECK-NEXT: #APP 332; CHECK-NEXT: nop 333; CHECK-NEXT: #NO_APP 334; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 335; CHECK-NEXT: retq 336 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 337 %2 = icmp sgt <4 x i32> %a0, zeroinitializer 338 %3 = sub <4 x i32> zeroinitializer, %a0 339 %4 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %3 340 ret <4 x i32> %4 341} 342 343define <8 x i32> @stack_fold_pabsd_ymm(<8 x i32> %a0) { 344; CHECK-LABEL: stack_fold_pabsd_ymm: 345; CHECK: # %bb.0: 346; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 347; CHECK-NEXT: #APP 348; CHECK-NEXT: nop 349; CHECK-NEXT: #NO_APP 350; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 351; CHECK-NEXT: retq 352 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 353 %2 = icmp sgt <8 x i32> %a0, zeroinitializer 354 %3 = sub <8 x i32> zeroinitializer, %a0 355 %4 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %3 356 ret <8 x i32> %4 357} 358 359define <2 x i64> @stack_fold_pabsq(<2 x i64> %a0) { 360; CHECK-LABEL: stack_fold_pabsq: 361; CHECK: # %bb.0: 362; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 363; CHECK-NEXT: #APP 364; CHECK-NEXT: nop 365; CHECK-NEXT: #NO_APP 366; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 367; CHECK-NEXT: retq 368 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 369 %2 = icmp sgt <2 x i64> %a0, zeroinitializer 370 %3 = sub <2 x i64> zeroinitializer, %a0 371 %4 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %3 372 ret <2 x i64> %4 373} 374 375define <4 x i64> @stack_fold_pabsq_ymm(<4 x i64> %a0) { 376; CHECK-LABEL: stack_fold_pabsq_ymm: 377; CHECK: # %bb.0: 378; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 379; CHECK-NEXT: #APP 380; CHECK-NEXT: nop 381; CHECK-NEXT: #NO_APP 382; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 383; CHECK-NEXT: retq 384 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 385 %2 = icmp sgt <4 x i64> %a0, zeroinitializer 386 %3 = sub <4 x i64> zeroinitializer, %a0 387 %4 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %3 388 ret <4 x i64> %4 389} 390 391define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) { 392; CHECK-LABEL: stack_fold_pabsw: 393; CHECK: # %bb.0: 394; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 395; CHECK-NEXT: #APP 396; CHECK-NEXT: nop 397; CHECK-NEXT: #NO_APP 398; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 399; CHECK-NEXT: retq 400 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 401 %2 = icmp sgt <8 x i16> %a0, zeroinitializer 402 %3 = sub <8 x i16> zeroinitializer, %a0 403 %4 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %3 404 ret <8 x i16> %4 405} 406 407define <16 x i16> @stack_fold_pabsw_ymm(<16 x i16> %a0) { 408; CHECK-LABEL: stack_fold_pabsw_ymm: 409; CHECK: # %bb.0: 410; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 411; CHECK-NEXT: #APP 412; CHECK-NEXT: nop 413; CHECK-NEXT: #NO_APP 414; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 415; CHECK-NEXT: retq 416 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 417 %2 = icmp sgt <16 x i16> %a0, zeroinitializer 418 %3 = sub <16 x i16> zeroinitializer, %a0 419 %4 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %3 420 ret <16 x i16> %4 421} 422 423define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) { 424; CHECK-LABEL: stack_fold_packssdw: 425; CHECK: # %bb.0: 426; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 427; CHECK-NEXT: #APP 428; CHECK-NEXT: nop 429; CHECK-NEXT: #NO_APP 430; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 431; CHECK-NEXT: retq 432 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 433 %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) 434 ret <8 x i16> %2 435} 436declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone 437 438define <16 x i16> @stack_fold_packssdw_ymm(<8 x i32> %a0, <8 x i32> %a1) { 439; CHECK-LABEL: stack_fold_packssdw_ymm: 440; CHECK: # %bb.0: 441; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 442; CHECK-NEXT: #APP 443; CHECK-NEXT: nop 444; CHECK-NEXT: #NO_APP 445; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 446; CHECK-NEXT: retq 447 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 448 %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) 449 ret <16 x i16> %2 450} 451declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone 452 453define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) { 454; CHECK-LABEL: stack_fold_packsswb: 455; CHECK: # %bb.0: 456; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 457; CHECK-NEXT: #APP 458; CHECK-NEXT: nop 459; CHECK-NEXT: #NO_APP 460; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 461; CHECK-NEXT: retq 462 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 463 %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) 464 ret <16 x i8> %2 465} 466declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone 467 468define <32 x i8> @stack_fold_packsswb_ymm(<16 x i16> %a0, <16 x i16> %a1) { 469; CHECK-LABEL: stack_fold_packsswb_ymm: 470; CHECK: # %bb.0: 471; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 472; CHECK-NEXT: #APP 473; CHECK-NEXT: nop 474; CHECK-NEXT: #NO_APP 475; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 476; CHECK-NEXT: retq 477 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 478 %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) 479 ret <32 x i8> %2 480} 481declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone 482 483define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) { 484; CHECK-LABEL: stack_fold_packusdw: 485; CHECK: # %bb.0: 486; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 487; CHECK-NEXT: #APP 488; CHECK-NEXT: nop 489; CHECK-NEXT: #NO_APP 490; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 491; CHECK-NEXT: retq 492 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 493 %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) 494 ret <8 x i16> %2 495} 496declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone 497 498define <16 x i16> @stack_fold_packusdw_ymm(<8 x i32> %a0, <8 x i32> %a1) { 499; CHECK-LABEL: stack_fold_packusdw_ymm: 500; CHECK: # %bb.0: 501; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 502; CHECK-NEXT: #APP 503; CHECK-NEXT: nop 504; CHECK-NEXT: #NO_APP 505; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 506; CHECK-NEXT: retq 507 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 508 %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) 509 ret <16 x i16> %2 510} 511declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone 512 513define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) { 514; CHECK-LABEL: stack_fold_packuswb: 515; CHECK: # %bb.0: 516; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 517; CHECK-NEXT: #APP 518; CHECK-NEXT: nop 519; CHECK-NEXT: #NO_APP 520; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 521; CHECK-NEXT: retq 522 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 523 %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) 524 ret <16 x i8> %2 525} 526declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone 527 528define <32 x i8> @stack_fold_packuswb_ymm(<16 x i16> %a0, <16 x i16> %a1) { 529; CHECK-LABEL: stack_fold_packuswb_ymm: 530; CHECK: # %bb.0: 531; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 532; CHECK-NEXT: #APP 533; CHECK-NEXT: nop 534; CHECK-NEXT: #NO_APP 535; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 536; CHECK-NEXT: retq 537 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 538 %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) 539 ret <32 x i8> %2 540} 541declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone 542 543define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) { 544; CHECK-LABEL: stack_fold_paddb: 545; CHECK: # %bb.0: 546; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 547; CHECK-NEXT: #APP 548; CHECK-NEXT: nop 549; CHECK-NEXT: #NO_APP 550; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 551; CHECK-NEXT: retq 552 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 553 %2 = add <16 x i8> %a0, %a1 554 ret <16 x i8> %2 555} 556 557define <16 x i8> @stack_fold_paddb_mask(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %a2, i16 %mask) { 558; CHECK-LABEL: stack_fold_paddb_mask: 559; CHECK: # %bb.0: 560; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 561; CHECK-NEXT: #APP 562; CHECK-NEXT: nop 563; CHECK-NEXT: #NO_APP 564; CHECK-NEXT: kmovd %esi, %k1 565; CHECK-NEXT: vmovdqa (%rdi), %xmm2 566; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 567; CHECK-NEXT: vmovdqa %xmm2, %xmm0 568; CHECK-NEXT: retq 569 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 570 %2 = add <16 x i8> %a0, %a1 571 %3 = bitcast i16 %mask to <16 x i1> 572 ; load needed to keep the operation from being scheduled about the asm block 573 %4 = load <16 x i8>, <16 x i8>* %a2 574 %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4 575 ret <16 x i8> %5 576} 577 578define <16 x i8> @stack_fold_paddb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { 579; CHECK-LABEL: stack_fold_paddb_maskz: 580; CHECK: # %bb.0: 581; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 582; CHECK-NEXT: #APP 583; CHECK-NEXT: nop 584; CHECK-NEXT: #NO_APP 585; CHECK-NEXT: kmovd %edi, %k1 586; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload 587; CHECK-NEXT: retq 588 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 589 %2 = add <16 x i8> %a0, %a1 590 %3 = bitcast i16 %mask to <16 x i1> 591 %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer 592 ret <16 x i8> %4 593} 594 595define <32 x i8> @stack_fold_paddb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 596; CHECK-LABEL: stack_fold_paddb_ymm: 597; CHECK: # %bb.0: 598; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 599; CHECK-NEXT: #APP 600; CHECK-NEXT: nop 601; CHECK-NEXT: #NO_APP 602; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 603; CHECK-NEXT: retq 604 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 605 %2 = add <32 x i8> %a0, %a1 606 ret <32 x i8> %2 607} 608 609define <32 x i8> @stack_fold_paddb_mask_ymm(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %a2, i32 %mask) { 610; CHECK-LABEL: stack_fold_paddb_mask_ymm: 611; CHECK: # %bb.0: 612; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 613; CHECK-NEXT: #APP 614; CHECK-NEXT: nop 615; CHECK-NEXT: #NO_APP 616; CHECK-NEXT: kmovd %esi, %k1 617; CHECK-NEXT: vmovdqa (%rdi), %ymm2 618; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 619; CHECK-NEXT: vmovdqa %ymm2, %ymm0 620; CHECK-NEXT: retq 621 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 622 %2 = add <32 x i8> %a0, %a1 623 %3 = bitcast i32 %mask to <32 x i1> 624 ; load needed to keep the operation from being scheduled about the asm block 625 %4 = load <32 x i8>, <32 x i8>* %a2 626 %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4 627 ret <32 x i8> %5 628} 629 630define <32 x i8> @stack_fold_paddb_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { 631; CHECK-LABEL: stack_fold_paddb_maskz_ymm: 632; CHECK: # %bb.0: 633; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 634; CHECK-NEXT: #APP 635; CHECK-NEXT: nop 636; CHECK-NEXT: #NO_APP 637; CHECK-NEXT: kmovd %edi, %k1 638; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 639; CHECK-NEXT: retq 640 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 641 %2 = add <32 x i8> %a0, %a1 642 %3 = bitcast i32 %mask to <32 x i1> 643 %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer 644 ret <32 x i8> %4 645} 646 647define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) { 648; CHECK-LABEL: stack_fold_paddd: 649; CHECK: # %bb.0: 650; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 651; CHECK-NEXT: #APP 652; CHECK-NEXT: nop 653; CHECK-NEXT: #NO_APP 654; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 655; CHECK-NEXT: retq 656 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 657 %2 = add <4 x i32> %a0, %a1 658 ret <4 x i32> %2 659} 660 661define <8 x i32> @stack_fold_paddd_ymm(<8 x i32> %a0, <8 x i32> %a1) { 662; CHECK-LABEL: stack_fold_paddd_ymm: 663; CHECK: # %bb.0: 664; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 665; CHECK-NEXT: #APP 666; CHECK-NEXT: nop 667; CHECK-NEXT: #NO_APP 668; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 669; CHECK-NEXT: retq 670 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 671 %2 = add <8 x i32> %a0, %a1 672 ret <8 x i32> %2 673} 674 675define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) { 676; CHECK-LABEL: stack_fold_paddq: 677; CHECK: # %bb.0: 678; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 679; CHECK-NEXT: #APP 680; CHECK-NEXT: nop 681; CHECK-NEXT: #NO_APP 682; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 683; CHECK-NEXT: retq 684 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 685 %2 = add <2 x i64> %a0, %a1 686 ret <2 x i64> %2 687} 688 689define <4 x i64> @stack_fold_paddq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 690; CHECK-LABEL: stack_fold_paddq_ymm: 691; CHECK: # %bb.0: 692; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 693; CHECK-NEXT: #APP 694; CHECK-NEXT: nop 695; CHECK-NEXT: #NO_APP 696; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 697; CHECK-NEXT: retq 698 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 699 %2 = add <4 x i64> %a0, %a1 700 ret <4 x i64> %2 701} 702 703define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) { 704; CHECK-LABEL: stack_fold_paddsb: 705; CHECK: # %bb.0: 706; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 707; CHECK-NEXT: #APP 708; CHECK-NEXT: nop 709; CHECK-NEXT: #NO_APP 710; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 711; CHECK-NEXT: retq 712 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 713 %2 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) 714 ret <16 x i8> %2 715} 716declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 717 718define <32 x i8> @stack_fold_paddsb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 719; CHECK-LABEL: stack_fold_paddsb_ymm: 720; CHECK: # %bb.0: 721; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 722; CHECK-NEXT: #APP 723; CHECK-NEXT: nop 724; CHECK-NEXT: #NO_APP 725; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 726; CHECK-NEXT: retq 727 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 728 %2 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) 729 ret <32 x i8> %2 730} 731declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone 732 733define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) { 734; CHECK-LABEL: stack_fold_paddsw: 735; CHECK: # %bb.0: 736; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 737; CHECK-NEXT: #APP 738; CHECK-NEXT: nop 739; CHECK-NEXT: #NO_APP 740; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 741; CHECK-NEXT: retq 742 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 743 %2 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) 744 ret <8 x i16> %2 745} 746declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 747 748define <16 x i16> @stack_fold_paddsw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 749; CHECK-LABEL: stack_fold_paddsw_ymm: 750; CHECK: # %bb.0: 751; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 752; CHECK-NEXT: #APP 753; CHECK-NEXT: nop 754; CHECK-NEXT: #NO_APP 755; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 756; CHECK-NEXT: retq 757 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 758 %2 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) 759 ret <16 x i16> %2 760} 761declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone 762 763define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) { 764; CHECK-LABEL: stack_fold_paddusb: 765; CHECK: # %bb.0: 766; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 767; CHECK-NEXT: #APP 768; CHECK-NEXT: nop 769; CHECK-NEXT: #NO_APP 770; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 771; CHECK-NEXT: retq 772 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 773 %2 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) 774 ret <16 x i8> %2 775} 776declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 777 778define <32 x i8> @stack_fold_paddusb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 779; CHECK-LABEL: stack_fold_paddusb_ymm: 780; CHECK: # %bb.0: 781; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 782; CHECK-NEXT: #APP 783; CHECK-NEXT: nop 784; CHECK-NEXT: #NO_APP 785; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 786; CHECK-NEXT: retq 787 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 788 %2 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) 789 ret <32 x i8> %2 790} 791declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone 792 793define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) { 794; CHECK-LABEL: stack_fold_paddusw: 795; CHECK: # %bb.0: 796; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 797; CHECK-NEXT: #APP 798; CHECK-NEXT: nop 799; CHECK-NEXT: #NO_APP 800; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 801; CHECK-NEXT: retq 802 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 803 %2 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) 804 ret <8 x i16> %2 805} 806declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 807 808define <16 x i16> @stack_fold_paddusw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 809; CHECK-LABEL: stack_fold_paddusw_ymm: 810; CHECK: # %bb.0: 811; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 812; CHECK-NEXT: #APP 813; CHECK-NEXT: nop 814; CHECK-NEXT: #NO_APP 815; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 816; CHECK-NEXT: retq 817 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 818 %2 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) 819 ret <16 x i16> %2 820} 821declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone 822 823define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) { 824; CHECK-LABEL: stack_fold_paddw: 825; CHECK: # %bb.0: 826; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 827; CHECK-NEXT: #APP 828; CHECK-NEXT: nop 829; CHECK-NEXT: #NO_APP 830; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 831; CHECK-NEXT: retq 832 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 833 %2 = add <8 x i16> %a0, %a1 834 ret <8 x i16> %2 835} 836 837define <16 x i16> @stack_fold_paddw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 838; CHECK-LABEL: stack_fold_paddw_ymm: 839; CHECK: # %bb.0: 840; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 841; CHECK-NEXT: #APP 842; CHECK-NEXT: nop 843; CHECK-NEXT: #NO_APP 844; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 845; CHECK-NEXT: retq 846 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 847 %2 = add <16 x i16> %a0, %a1 848 ret <16 x i16> %2 849} 850 851define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) { 852; CHECK-LABEL: stack_fold_palignr: 853; CHECK: # %bb.0: 854; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 855; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 856; CHECK-NEXT: #APP 857; CHECK-NEXT: nop 858; CHECK-NEXT: #NO_APP 859; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 860; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 861; CHECK-NEXT: # ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] 862; CHECK-NEXT: retq 863 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 864 %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48> 865 ret <32 x i8> %2 866} 867 868define <32 x i8> @stack_fold_palignr_mask(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %passthru, i32 %mask) { 869; CHECK-LABEL: stack_fold_palignr_mask: 870; CHECK: # %bb.0: 871; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 872; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 873; CHECK-NEXT: #APP 874; CHECK-NEXT: nop 875; CHECK-NEXT: #NO_APP 876; CHECK-NEXT: kmovd %esi, %k1 877; CHECK-NEXT: vmovdqa (%rdi), %ymm1 878; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 879; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload 880; CHECK-NEXT: # ymm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] 881; CHECK-NEXT: vmovdqa %ymm1, %ymm0 882; CHECK-NEXT: retq 883 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 884 %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48> 885 %3 = bitcast i32 %mask to <32 x i1> 886 %4 = load <32 x i8>, <32 x i8>* %passthru 887 %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4 888 ret <32 x i8> %5 889} 890 891define <32 x i8> @stack_fold_palignr_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { 892; CHECK-LABEL: stack_fold_palignr_maskz: 893; CHECK: # %bb.0: 894; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 895; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 896; CHECK-NEXT: #APP 897; CHECK-NEXT: nop 898; CHECK-NEXT: #NO_APP 899; CHECK-NEXT: kmovd %edi, %k1 900; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 901; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 902; CHECK-NEXT: # ymm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] 903; CHECK-NEXT: retq 904 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 905 %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48> 906 %3 = bitcast i32 %mask to <32 x i1> 907 %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer 908 ret <32 x i8> %4 909} 910 911define i16 @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) { 912; CHECK-LABEL: stack_fold_pcmpeqb: 913; CHECK: # %bb.0: 914; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 915; CHECK-NEXT: #APP 916; CHECK-NEXT: nop 917; CHECK-NEXT: #NO_APP 918; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload 919; CHECK-NEXT: kmovd %k0, %eax 920; CHECK-NEXT: # kill: def $ax killed $ax killed $eax 921; CHECK-NEXT: retq 922 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 923 %2 = icmp eq <16 x i8> %a0, %a1 924 %3 = bitcast <16 x i1> %2 to i16 925 ret i16 %3 926} 927 928define i8 @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) { 929; CHECK-LABEL: stack_fold_pcmpeqd: 930; CHECK: # %bb.0: 931; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 932; CHECK-NEXT: #APP 933; CHECK-NEXT: nop 934; CHECK-NEXT: #NO_APP 935; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload 936; CHECK-NEXT: kmovd %k0, %eax 937; CHECK-NEXT: # kill: def $al killed $al killed $eax 938; CHECK-NEXT: retq 939 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 940 %2 = icmp eq <4 x i32> %a0, %a1 941 %3 = shufflevector <4 x i1> %2, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 942 %4 = bitcast <8 x i1> %3 to i8 943 ret i8 %4 944} 945 946define i8 @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) { 947; CHECK-LABEL: stack_fold_pcmpeqq: 948; CHECK: # %bb.0: 949; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 950; CHECK-NEXT: #APP 951; CHECK-NEXT: nop 952; CHECK-NEXT: #NO_APP 953; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload 954; CHECK-NEXT: kmovd %k0, %eax 955; CHECK-NEXT: # kill: def $al killed $al killed $eax 956; CHECK-NEXT: retq 957 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 958 %2 = icmp eq <2 x i64> %a0, %a1 959 %3 = shufflevector <2 x i1> %2, <2 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 960 %4 = bitcast <8 x i1> %3 to i8 961 ret i8 %4 962} 963 964define i8 @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) { 965; CHECK-LABEL: stack_fold_pcmpeqw: 966; CHECK: # %bb.0: 967; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 968; CHECK-NEXT: #APP 969; CHECK-NEXT: nop 970; CHECK-NEXT: #NO_APP 971; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload 972; CHECK-NEXT: kmovd %k0, %eax 973; CHECK-NEXT: # kill: def $al killed $al killed $eax 974; CHECK-NEXT: retq 975 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 976 %2 = icmp eq <8 x i16> %a0, %a1 977 %3 = bitcast <8 x i1> %2 to i8 978 ret i8 %3 979} 980 981define <32 x i8> @stack_fold_permbvar(<32 x i8> %a0, <32 x i8> %a1) { 982; CHECK-LABEL: stack_fold_permbvar: 983; CHECK: # %bb.0: 984; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 985; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 986; CHECK-NEXT: #APP 987; CHECK-NEXT: nop 988; CHECK-NEXT: #NO_APP 989; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 990; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 991; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 992; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0 993; CHECK-NEXT: retq 994 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 995 %2 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a1, <32 x i8> %a0) 996 ; add forces execution domain 997 %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 998 ret <32 x i8> %3 999} 1000declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) nounwind readonly 1001 1002define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) { 1003; CHECK-LABEL: stack_fold_permd: 1004; CHECK: # %bb.0: 1005; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1006; CHECK-NEXT: #APP 1007; CHECK-NEXT: nop 1008; CHECK-NEXT: #NO_APP 1009; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1010; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1011; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 1012; CHECK-NEXT: retq 1013 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1014 %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0) 1015 ; add forces execution domain 1016 %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1017 ret <8 x i32> %3 1018} 1019declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly 1020 1021define <16 x i8> @stack_fold_vpermi2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { 1022; CHECK-LABEL: stack_fold_vpermi2b: 1023; CHECK: # %bb.0: 1024; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1025; CHECK-NEXT: #APP 1026; CHECK-NEXT: nop 1027; CHECK-NEXT: #NO_APP 1028; CHECK-NEXT: vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1029; CHECK-NEXT: retq 1030 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1031 %2 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2) 1032 ret <16 x i8> %2 1033} 1034 1035define <32 x i8> @stack_fold_vpermi2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { 1036; CHECK-LABEL: stack_fold_vpermi2b_ymm: 1037; CHECK: # %bb.0: 1038; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1039; CHECK-NEXT: #APP 1040; CHECK-NEXT: nop 1041; CHECK-NEXT: #NO_APP 1042; CHECK-NEXT: vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1043; CHECK-NEXT: retq 1044 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1045 %2 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2) 1046 ret <32 x i8> %2 1047} 1048 1049define <4 x i32> @stack_fold_vpermi2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { 1050; CHECK-LABEL: stack_fold_vpermi2d: 1051; CHECK: # %bb.0: 1052; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1053; CHECK-NEXT: #APP 1054; CHECK-NEXT: nop 1055; CHECK-NEXT: #NO_APP 1056; CHECK-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1057; CHECK-NEXT: retq 1058 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1059 %2 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) 1060 ret <4 x i32> %2 1061} 1062 1063define <8 x i32> @stack_fold_vpermi2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { 1064; CHECK-LABEL: stack_fold_vpermi2d_ymm: 1065; CHECK: # %bb.0: 1066; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1067; CHECK-NEXT: #APP 1068; CHECK-NEXT: nop 1069; CHECK-NEXT: #NO_APP 1070; CHECK-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1071; CHECK-NEXT: retq 1072 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1073 %2 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) 1074 ret <8 x i32> %2 1075} 1076 1077define <2 x i64> @stack_fold_vpermi2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { 1078; CHECK-LABEL: stack_fold_vpermi2q: 1079; CHECK: # %bb.0: 1080; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1081; CHECK-NEXT: #APP 1082; CHECK-NEXT: nop 1083; CHECK-NEXT: #NO_APP 1084; CHECK-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1085; CHECK-NEXT: retq 1086 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1087 %2 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) 1088 ret <2 x i64> %2 1089} 1090 1091define <4 x i64> @stack_fold_vpermi2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { 1092; CHECK-LABEL: stack_fold_vpermi2q_ymm: 1093; CHECK: # %bb.0: 1094; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1095; CHECK-NEXT: #APP 1096; CHECK-NEXT: nop 1097; CHECK-NEXT: #NO_APP 1098; CHECK-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1099; CHECK-NEXT: retq 1100 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1101 %2 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) 1102 ret <4 x i64> %2 1103} 1104 1105define <8 x i16> @stack_fold_vpermi2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { 1106; CHECK-LABEL: stack_fold_vpermi2w: 1107; CHECK: # %bb.0: 1108; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1109; CHECK-NEXT: #APP 1110; CHECK-NEXT: nop 1111; CHECK-NEXT: #NO_APP 1112; CHECK-NEXT: vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1113; CHECK-NEXT: retq 1114 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1115 %2 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) 1116 ret <8 x i16> %2 1117} 1118 1119define <16 x i16> @stack_fold_vpermi2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { 1120; CHECK-LABEL: stack_fold_vpermi2w_ymm: 1121; CHECK: # %bb.0: 1122; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1123; CHECK-NEXT: #APP 1124; CHECK-NEXT: nop 1125; CHECK-NEXT: #NO_APP 1126; CHECK-NEXT: vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1127; CHECK-NEXT: retq 1128 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1129 %2 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) 1130 ret <16 x i16> %2 1131} 1132 1133define <4 x i64> @stack_fold_permq(<4 x i64> %a0) { 1134; CHECK-LABEL: stack_fold_permq: 1135; CHECK: # %bb.0: 1136; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1137; CHECK-NEXT: #APP 1138; CHECK-NEXT: nop 1139; CHECK-NEXT: #NO_APP 1140; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 1141; CHECK-NEXT: # ymm0 = mem[3,2,2,3] 1142; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1143; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0 1144; CHECK-NEXT: retq 1145 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1146 %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3> 1147 ; add forces execution domain 1148 %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1> 1149 ret <4 x i64> %3 1150} 1151 1152define <4 x i64> @stack_fold_permqvar(<4 x i64> %a0, <4 x i64> %a1) { 1153; CHECK-LABEL: stack_fold_permqvar: 1154; CHECK: # %bb.0: 1155; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1156; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1157; CHECK-NEXT: #APP 1158; CHECK-NEXT: nop 1159; CHECK-NEXT: #NO_APP 1160; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1161; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1162; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1163; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0 1164; CHECK-NEXT: retq 1165 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1166 %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a1, <4 x i64> %a0) 1167 ; add forces execution domain 1168 %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1> 1169 ret <4 x i64> %3 1170} 1171declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) nounwind readonly 1172 1173define <16 x i8> @stack_fold_vpermt2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { 1174; CHECK-LABEL: stack_fold_vpermt2b: 1175; CHECK: # %bb.0: 1176; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1177; CHECK-NEXT: #APP 1178; CHECK-NEXT: nop 1179; CHECK-NEXT: #NO_APP 1180; CHECK-NEXT: vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1181; CHECK-NEXT: retq 1182 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1183 %2 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) 1184 ret <16 x i8> %2 1185} 1186declare <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>) 1187 1188define <32 x i8> @stack_fold_vpermt2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { 1189; CHECK-LABEL: stack_fold_vpermt2b_ymm: 1190; CHECK: # %bb.0: 1191; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1192; CHECK-NEXT: #APP 1193; CHECK-NEXT: nop 1194; CHECK-NEXT: #NO_APP 1195; CHECK-NEXT: vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1196; CHECK-NEXT: retq 1197 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1198 %2 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) 1199 ret <32 x i8> %2 1200} 1201declare <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>) 1202 1203define <4 x i32> @stack_fold_vpermt2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { 1204; CHECK-LABEL: stack_fold_vpermt2d: 1205; CHECK: # %bb.0: 1206; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1207; CHECK-NEXT: #APP 1208; CHECK-NEXT: nop 1209; CHECK-NEXT: #NO_APP 1210; CHECK-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1211; CHECK-NEXT: retq 1212 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1213 %2 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) 1214 ret <4 x i32> %2 1215} 1216declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>) 1217 1218define <8 x i32> @stack_fold_vpermt2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { 1219; CHECK-LABEL: stack_fold_vpermt2d_ymm: 1220; CHECK: # %bb.0: 1221; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1222; CHECK-NEXT: #APP 1223; CHECK-NEXT: nop 1224; CHECK-NEXT: #NO_APP 1225; CHECK-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1226; CHECK-NEXT: retq 1227 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1228 %2 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) 1229 ret <8 x i32> %2 1230} 1231declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>) 1232 1233define <2 x i64> @stack_fold_vpermt2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { 1234; CHECK-LABEL: stack_fold_vpermt2q: 1235; CHECK: # %bb.0: 1236; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1237; CHECK-NEXT: #APP 1238; CHECK-NEXT: nop 1239; CHECK-NEXT: #NO_APP 1240; CHECK-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1241; CHECK-NEXT: retq 1242 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1243 %2 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) 1244 ret <2 x i64> %2 1245} 1246declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>) 1247 1248define <4 x i64> @stack_fold_vpermt2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { 1249; CHECK-LABEL: stack_fold_vpermt2q_ymm: 1250; CHECK: # %bb.0: 1251; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1252; CHECK-NEXT: #APP 1253; CHECK-NEXT: nop 1254; CHECK-NEXT: #NO_APP 1255; CHECK-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1256; CHECK-NEXT: retq 1257 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1258 %2 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) 1259 ret <4 x i64> %2 1260} 1261declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>) 1262 1263define <8 x i16> @stack_fold_vpermt2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { 1264; CHECK-LABEL: stack_fold_vpermt2w: 1265; CHECK: # %bb.0: 1266; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1267; CHECK-NEXT: #APP 1268; CHECK-NEXT: nop 1269; CHECK-NEXT: #NO_APP 1270; CHECK-NEXT: vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload 1271; CHECK-NEXT: retq 1272 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1273 %2 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) 1274 ret <8 x i16> %2 1275} 1276declare <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>) 1277 1278define <16 x i16> @stack_fold_vpermt2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { 1279; CHECK-LABEL: stack_fold_vpermt2w_ymm: 1280; CHECK: # %bb.0: 1281; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1282; CHECK-NEXT: #APP 1283; CHECK-NEXT: nop 1284; CHECK-NEXT: #NO_APP 1285; CHECK-NEXT: vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload 1286; CHECK-NEXT: retq 1287 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1288 %2 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) 1289 ret <16 x i16> %2 1290} 1291declare <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>) 1292 1293define <16 x i16> @stack_fold_permwvar(<16 x i16> %a0, <16 x i16> %a1) { 1294; CHECK-LABEL: stack_fold_permwvar: 1295; CHECK: # %bb.0: 1296; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1297; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1298; CHECK-NEXT: #APP 1299; CHECK-NEXT: nop 1300; CHECK-NEXT: #NO_APP 1301; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1302; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1303; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1304; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 1305; CHECK-NEXT: retq 1306 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1307 %2 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a1, <16 x i16> %a0) 1308 ; add forces execution domain 1309 %3 = add <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1310 ret <16 x i16> %3 1311} 1312declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) nounwind readonly 1313 1314define <4 x i32> @stack_fold_vplzcntd(<4 x i32> %a0) { 1315; CHECK-LABEL: stack_fold_vplzcntd: 1316; CHECK: # %bb.0: 1317; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1318; CHECK-NEXT: #APP 1319; CHECK-NEXT: nop 1320; CHECK-NEXT: #NO_APP 1321; CHECK-NEXT: vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1322; CHECK-NEXT: retq 1323 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1324 %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a0, i1 false) 1325 ret <4 x i32> %2 1326} 1327 1328define <8 x i32> @stack_fold_vplzcntd_ymm(<8 x i32> %a0) { 1329; CHECK-LABEL: stack_fold_vplzcntd_ymm: 1330; CHECK: # %bb.0: 1331; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1332; CHECK-NEXT: #APP 1333; CHECK-NEXT: nop 1334; CHECK-NEXT: #NO_APP 1335; CHECK-NEXT: vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 1336; CHECK-NEXT: retq 1337 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1338 %2 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a0, i1 false) 1339 ret <8 x i32> %2 1340} 1341 1342define <2 x i64> @stack_fold_vplzcntq(<2 x i64> %a0) { 1343; CHECK-LABEL: stack_fold_vplzcntq: 1344; CHECK: # %bb.0: 1345; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1346; CHECK-NEXT: #APP 1347; CHECK-NEXT: nop 1348; CHECK-NEXT: #NO_APP 1349; CHECK-NEXT: vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1350; CHECK-NEXT: retq 1351 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1352 %2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a0, i1 false) 1353 ret <2 x i64> %2 1354} 1355 1356define <4 x i64> @stack_fold_vplzcntq_ymm(<4 x i64> %a0) { 1357; CHECK-LABEL: stack_fold_vplzcntq_ymm: 1358; CHECK: # %bb.0: 1359; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1360; CHECK-NEXT: #APP 1361; CHECK-NEXT: nop 1362; CHECK-NEXT: #NO_APP 1363; CHECK-NEXT: vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 1364; CHECK-NEXT: retq 1365 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1366 %2 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a0, i1 false) 1367 ret <4 x i64> %2 1368} 1369 1370define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) { 1371; CHECK-LABEL: stack_fold_pmaddubsw: 1372; CHECK: # %bb.0: 1373; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1374; CHECK-NEXT: #APP 1375; CHECK-NEXT: nop 1376; CHECK-NEXT: #NO_APP 1377; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1378; CHECK-NEXT: retq 1379 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1380 %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) 1381 ret <8 x i16> %2 1382} 1383declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone 1384 1385define <8 x i16> @stack_fold_pmaddubsw_mask(<8 x i16>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i8 %mask) { 1386; CHECK-LABEL: stack_fold_pmaddubsw_mask: 1387; CHECK: # %bb.0: 1388; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1389; CHECK-NEXT: #APP 1390; CHECK-NEXT: nop 1391; CHECK-NEXT: #NO_APP 1392; CHECK-NEXT: kmovd %esi, %k1 1393; CHECK-NEXT: vmovdqa (%rdi), %xmm2 1394; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 1395; CHECK-NEXT: vmovdqa %xmm2, %xmm0 1396; CHECK-NEXT: retq 1397 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1398 %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) 1399 %3 = bitcast i8 %mask to <8 x i1> 1400 ; load needed to keep the operation from being scheduled about the asm block 1401 %4 = load <8 x i16>, <8 x i16>* %passthru 1402 %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4 1403 ret <8 x i16> %5 1404} 1405 1406define <8 x i16> @stack_fold_pmaddubsw_maskz(<16 x i8> %a0, <16 x i8> %a1, i8 %mask) { 1407; CHECK-LABEL: stack_fold_pmaddubsw_maskz: 1408; CHECK: # %bb.0: 1409; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1410; CHECK-NEXT: #APP 1411; CHECK-NEXT: nop 1412; CHECK-NEXT: #NO_APP 1413; CHECK-NEXT: kmovd %edi, %k1 1414; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload 1415; CHECK-NEXT: retq 1416 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1417 %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) 1418 %3 = bitcast i8 %mask to <8 x i1> 1419 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer 1420 ret <8 x i16> %4 1421} 1422 1423define <16 x i16> @stack_fold_pmaddubsw_ymm(<32 x i8> %a0, <32 x i8> %a1) { 1424; CHECK-LABEL: stack_fold_pmaddubsw_ymm: 1425; CHECK: # %bb.0: 1426; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1427; CHECK-NEXT: #APP 1428; CHECK-NEXT: nop 1429; CHECK-NEXT: #NO_APP 1430; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1431; CHECK-NEXT: retq 1432 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1433 %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) 1434 ret <16 x i16> %2 1435} 1436declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone 1437 1438define <16 x i16> @stack_fold_pmaddubsw_ymm_mask(<16 x i16>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i16 %mask) { 1439; CHECK-LABEL: stack_fold_pmaddubsw_ymm_mask: 1440; CHECK: # %bb.0: 1441; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1442; CHECK-NEXT: #APP 1443; CHECK-NEXT: nop 1444; CHECK-NEXT: #NO_APP 1445; CHECK-NEXT: kmovd %esi, %k1 1446; CHECK-NEXT: vmovdqa (%rdi), %ymm2 1447; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1448; CHECK-NEXT: vmovdqa %ymm2, %ymm0 1449; CHECK-NEXT: retq 1450 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1451 %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) 1452 %3 = bitcast i16 %mask to <16 x i1> 1453 ; load needed to keep the operation from being scheduled about the asm block 1454 %4 = load <16 x i16>, <16 x i16>* %passthru 1455 %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4 1456 ret <16 x i16> %5 1457} 1458 1459define <16 x i16> @stack_fold_pmaddubsw_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i16 %mask) { 1460; CHECK-LABEL: stack_fold_pmaddubsw_ymm_maskz: 1461; CHECK: # %bb.0: 1462; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1463; CHECK-NEXT: #APP 1464; CHECK-NEXT: nop 1465; CHECK-NEXT: #NO_APP 1466; CHECK-NEXT: kmovd %edi, %k1 1467; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 1468; CHECK-NEXT: retq 1469 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1470 %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) 1471 %3 = bitcast i16 %mask to <16 x i1> 1472 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer 1473 ret <16 x i16> %4 1474} 1475 1476define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) { 1477; CHECK-LABEL: stack_fold_pmaddwd: 1478; CHECK: # %bb.0: 1479; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1480; CHECK-NEXT: #APP 1481; CHECK-NEXT: nop 1482; CHECK-NEXT: #NO_APP 1483; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1484; CHECK-NEXT: retq 1485 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1486 %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) 1487 ret <4 x i32> %2 1488} 1489declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone 1490 1491define <4 x i32> @stack_fold_pmaddwd_mask(<4 x i32>* %passthru, <8 x i16> %a0, <8 x i16> %a1, i8 %mask) { 1492; CHECK-LABEL: stack_fold_pmaddwd_mask: 1493; CHECK: # %bb.0: 1494; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1495; CHECK-NEXT: #APP 1496; CHECK-NEXT: nop 1497; CHECK-NEXT: #NO_APP 1498; CHECK-NEXT: kmovd %esi, %k1 1499; CHECK-NEXT: vmovdqa (%rdi), %xmm2 1500; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 1501; CHECK-NEXT: vmovdqa %xmm2, %xmm0 1502; CHECK-NEXT: retq 1503 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1504 %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) 1505 %3 = bitcast i8 %mask to <8 x i1> 1506 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1507 ; load needed to keep the operation from being scheduled about the asm block 1508 %5 = load <4 x i32>, <4 x i32>* %passthru 1509 %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> %5 1510 ret <4 x i32> %6 1511} 1512 1513define <4 x i32> @stack_fold_pmaddwd_maskz(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { 1514; CHECK-LABEL: stack_fold_pmaddwd_maskz: 1515; CHECK: # %bb.0: 1516; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1517; CHECK-NEXT: #APP 1518; CHECK-NEXT: nop 1519; CHECK-NEXT: #NO_APP 1520; CHECK-NEXT: kmovd %edi, %k1 1521; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload 1522; CHECK-NEXT: retq 1523 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1524 %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) 1525 %3 = bitcast i8 %mask to <8 x i1> 1526 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1527 %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer 1528 ret <4 x i32> %5 1529} 1530 1531define <8 x i32> @stack_fold_pmaddwd_ymm(<16 x i16> %a0, <16 x i16> %a1) { 1532; CHECK-LABEL: stack_fold_pmaddwd_ymm: 1533; CHECK: # %bb.0: 1534; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1535; CHECK-NEXT: #APP 1536; CHECK-NEXT: nop 1537; CHECK-NEXT: #NO_APP 1538; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1539; CHECK-NEXT: retq 1540 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1541 %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) 1542 ret <8 x i32> %2 1543} 1544declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone 1545 1546define <8 x i32> @stack_fold_pmaddwd_ymm_mask(<8 x i32>* %passthru, <16 x i16> %a0, <16 x i16> %a1, i8 %mask) { 1547; CHECK-LABEL: stack_fold_pmaddwd_ymm_mask: 1548; CHECK: # %bb.0: 1549; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1550; CHECK-NEXT: #APP 1551; CHECK-NEXT: nop 1552; CHECK-NEXT: #NO_APP 1553; CHECK-NEXT: kmovd %esi, %k1 1554; CHECK-NEXT: vmovdqa (%rdi), %ymm2 1555; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1556; CHECK-NEXT: vmovdqa %ymm2, %ymm0 1557; CHECK-NEXT: retq 1558 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1559 %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) 1560 %3 = bitcast i8 %mask to <8 x i1> 1561 ; load needed to keep the operation from being scheduled about the asm block 1562 %4 = load <8 x i32>, <8 x i32>* %passthru 1563 %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4 1564 ret <8 x i32> %5 1565} 1566 1567define <8 x i32> @stack_fold_pmaddwd_ymm_maskz(<16 x i16> %a0, <16 x i16> %a1, i8 %mask) { 1568; CHECK-LABEL: stack_fold_pmaddwd_ymm_maskz: 1569; CHECK: # %bb.0: 1570; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1571; CHECK-NEXT: #APP 1572; CHECK-NEXT: nop 1573; CHECK-NEXT: #NO_APP 1574; CHECK-NEXT: kmovd %edi, %k1 1575; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 1576; CHECK-NEXT: retq 1577 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1578 %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) 1579 %3 = bitcast i8 %mask to <8 x i1> 1580 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer 1581 ret <8 x i32> %4 1582} 1583 1584define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) { 1585; CHECK-LABEL: stack_fold_pmaxsb: 1586; CHECK: # %bb.0: 1587; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1588; CHECK-NEXT: #APP 1589; CHECK-NEXT: nop 1590; CHECK-NEXT: #NO_APP 1591; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1592; CHECK-NEXT: retq 1593 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1594 %2 = icmp sgt <16 x i8> %a0, %a1 1595 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 1596 ret <16 x i8> %3 1597} 1598 1599define <32 x i8> @stack_fold_pmaxsb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 1600; CHECK-LABEL: stack_fold_pmaxsb_ymm: 1601; CHECK: # %bb.0: 1602; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1603; CHECK-NEXT: #APP 1604; CHECK-NEXT: nop 1605; CHECK-NEXT: #NO_APP 1606; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1607; CHECK-NEXT: retq 1608 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1609 %2 = icmp sgt <32 x i8> %a0, %a1 1610 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1 1611 ret <32 x i8> %3 1612} 1613 1614define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) { 1615; CHECK-LABEL: stack_fold_pmaxsd: 1616; CHECK: # %bb.0: 1617; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1618; CHECK-NEXT: #APP 1619; CHECK-NEXT: nop 1620; CHECK-NEXT: #NO_APP 1621; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1622; CHECK-NEXT: retq 1623 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1624 %2 = icmp sgt <4 x i32> %a0, %a1 1625 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 1626 ret <4 x i32> %3 1627} 1628 1629define <8 x i32> @stack_fold_pmaxsd_ymm(<8 x i32> %a0, <8 x i32> %a1) { 1630; CHECK-LABEL: stack_fold_pmaxsd_ymm: 1631; CHECK: # %bb.0: 1632; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1633; CHECK-NEXT: #APP 1634; CHECK-NEXT: nop 1635; CHECK-NEXT: #NO_APP 1636; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1637; CHECK-NEXT: retq 1638 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1639 %2 = icmp sgt <8 x i32> %a0, %a1 1640 %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1 1641 ret <8 x i32> %3 1642} 1643 1644define <2 x i64> @stack_fold_pmaxsq(<2 x i64> %a0, <2 x i64> %a1) { 1645; CHECK-LABEL: stack_fold_pmaxsq: 1646; CHECK: # %bb.0: 1647; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1648; CHECK-NEXT: #APP 1649; CHECK-NEXT: nop 1650; CHECK-NEXT: #NO_APP 1651; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1652; CHECK-NEXT: retq 1653 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1654 %2 = icmp sgt <2 x i64> %a0, %a1 1655 %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1 1656 ret <2 x i64> %3 1657} 1658 1659define <4 x i64> @stack_fold_pmaxsq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 1660; CHECK-LABEL: stack_fold_pmaxsq_ymm: 1661; CHECK: # %bb.0: 1662; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1663; CHECK-NEXT: #APP 1664; CHECK-NEXT: nop 1665; CHECK-NEXT: #NO_APP 1666; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1667; CHECK-NEXT: retq 1668 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1669 %2 = icmp sgt <4 x i64> %a0, %a1 1670 %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1 1671 ret <4 x i64> %3 1672} 1673 1674define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) { 1675; CHECK-LABEL: stack_fold_pmaxsw: 1676; CHECK: # %bb.0: 1677; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1678; CHECK-NEXT: #APP 1679; CHECK-NEXT: nop 1680; CHECK-NEXT: #NO_APP 1681; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1682; CHECK-NEXT: retq 1683 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1684 %2 = icmp sgt <8 x i16> %a0, %a1 1685 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 1686 ret <8 x i16> %3 1687} 1688 1689define <16 x i16> @stack_fold_pmaxsw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 1690; CHECK-LABEL: stack_fold_pmaxsw_ymm: 1691; CHECK: # %bb.0: 1692; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1693; CHECK-NEXT: #APP 1694; CHECK-NEXT: nop 1695; CHECK-NEXT: #NO_APP 1696; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1697; CHECK-NEXT: retq 1698 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1699 %2 = icmp sgt <16 x i16> %a0, %a1 1700 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1 1701 ret <16 x i16> %3 1702} 1703 1704define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) { 1705; CHECK-LABEL: stack_fold_pmaxub: 1706; CHECK: # %bb.0: 1707; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1708; CHECK-NEXT: #APP 1709; CHECK-NEXT: nop 1710; CHECK-NEXT: #NO_APP 1711; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1712; CHECK-NEXT: retq 1713 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1714 %2 = icmp ugt <16 x i8> %a0, %a1 1715 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 1716 ret <16 x i8> %3 1717} 1718 1719define <32 x i8> @stack_fold_pmaxub_ymm(<32 x i8> %a0, <32 x i8> %a1) { 1720; CHECK-LABEL: stack_fold_pmaxub_ymm: 1721; CHECK: # %bb.0: 1722; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1723; CHECK-NEXT: #APP 1724; CHECK-NEXT: nop 1725; CHECK-NEXT: #NO_APP 1726; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1727; CHECK-NEXT: retq 1728 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1729 %2 = icmp ugt <32 x i8> %a0, %a1 1730 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1 1731 ret <32 x i8> %3 1732} 1733 1734define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) { 1735; CHECK-LABEL: stack_fold_pmaxud: 1736; CHECK: # %bb.0: 1737; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1738; CHECK-NEXT: #APP 1739; CHECK-NEXT: nop 1740; CHECK-NEXT: #NO_APP 1741; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1742; CHECK-NEXT: retq 1743 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1744 %2 = icmp ugt <4 x i32> %a0, %a1 1745 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 1746 ret <4 x i32> %3 1747} 1748 1749define <8 x i32> @stack_fold_pmaxud_ymm(<8 x i32> %a0, <8 x i32> %a1) { 1750; CHECK-LABEL: stack_fold_pmaxud_ymm: 1751; CHECK: # %bb.0: 1752; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1753; CHECK-NEXT: #APP 1754; CHECK-NEXT: nop 1755; CHECK-NEXT: #NO_APP 1756; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1757; CHECK-NEXT: retq 1758 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1759 %2 = icmp ugt <8 x i32> %a0, %a1 1760 %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1 1761 ret <8 x i32> %3 1762} 1763 1764define <2 x i64> @stack_fold_pmaxuq(<2 x i64> %a0, <2 x i64> %a1) { 1765; CHECK-LABEL: stack_fold_pmaxuq: 1766; CHECK: # %bb.0: 1767; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1768; CHECK-NEXT: #APP 1769; CHECK-NEXT: nop 1770; CHECK-NEXT: #NO_APP 1771; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1772; CHECK-NEXT: retq 1773 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1774 %2 = icmp ugt <2 x i64> %a0, %a1 1775 %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1 1776 ret <2 x i64> %3 1777} 1778 1779define <2 x i64> @stack_fold_pmaxuq_mask(<2 x i64>* %passthru, <2 x i64> %a0, <2 x i64> %a1, i8 %mask) { 1780; CHECK-LABEL: stack_fold_pmaxuq_mask: 1781; CHECK: # %bb.0: 1782; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1783; CHECK-NEXT: #APP 1784; CHECK-NEXT: nop 1785; CHECK-NEXT: #NO_APP 1786; CHECK-NEXT: vmovdqa (%rdi), %xmm2 1787; CHECK-NEXT: kmovd %esi, %k1 1788; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 1789; CHECK-NEXT: vmovdqa %xmm2, %xmm0 1790; CHECK-NEXT: retq 1791 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1792 %2 = load <2 x i64>, <2 x i64>* %passthru 1793 %3 = icmp ugt <2 x i64> %a0, %a1 1794 %4 = select <2 x i1> %3, <2 x i64> %a0, <2 x i64> %a1 1795 %5 = bitcast i8 %mask to <8 x i1> 1796 %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1> 1797 %6 = select <2 x i1> %extract, <2 x i64> %4, <2 x i64> %2 1798 ret <2 x i64> %6 1799} 1800 1801define <2 x i64> @stack_fold_pmaxuq_maskz(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { 1802; CHECK-LABEL: stack_fold_pmaxuq_maskz: 1803; CHECK: # %bb.0: 1804; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1805; CHECK-NEXT: #APP 1806; CHECK-NEXT: nop 1807; CHECK-NEXT: #NO_APP 1808; CHECK-NEXT: kmovd %edi, %k1 1809; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload 1810; CHECK-NEXT: retq 1811 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1812 %2 = icmp ugt <2 x i64> %a0, %a1 1813 %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1 1814 %4 = bitcast i8 %mask to <8 x i1> 1815 %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <2 x i32> <i32 0, i32 1> 1816 %5 = select <2 x i1> %extract, <2 x i64> %3, <2 x i64> zeroinitializer 1817 ret <2 x i64> %5 1818} 1819 1820define <4 x i64> @stack_fold_pmaxuq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 1821; CHECK-LABEL: stack_fold_pmaxuq_ymm: 1822; CHECK: # %bb.0: 1823; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1824; CHECK-NEXT: #APP 1825; CHECK-NEXT: nop 1826; CHECK-NEXT: #NO_APP 1827; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1828; CHECK-NEXT: retq 1829 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1830 %2 = icmp ugt <4 x i64> %a0, %a1 1831 %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1 1832 ret <4 x i64> %3 1833} 1834 1835define <4 x i64> @stack_fold_pmaxuq_ymm_mask(<4 x i64>* %passthru, <4 x i64> %a0, <4 x i64> %a1, i8 %mask) { 1836; CHECK-LABEL: stack_fold_pmaxuq_ymm_mask: 1837; CHECK: # %bb.0: 1838; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1839; CHECK-NEXT: #APP 1840; CHECK-NEXT: nop 1841; CHECK-NEXT: #NO_APP 1842; CHECK-NEXT: vmovdqa (%rdi), %ymm2 1843; CHECK-NEXT: kmovd %esi, %k1 1844; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 1845; CHECK-NEXT: vmovdqa %ymm2, %ymm0 1846; CHECK-NEXT: retq 1847 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1848 %2 = load <4 x i64>, <4 x i64>* %passthru 1849 %3 = icmp ugt <4 x i64> %a0, %a1 1850 %4 = select <4 x i1> %3, <4 x i64> %a0, <4 x i64> %a1 1851 %5 = bitcast i8 %mask to <8 x i1> 1852 %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1853 %6 = select <4 x i1> %extract, <4 x i64> %4, <4 x i64> %2 1854 ret <4 x i64> %6 1855} 1856 1857define <4 x i64> @stack_fold_pmaxuq_ymm_maskz(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { 1858; CHECK-LABEL: stack_fold_pmaxuq_ymm_maskz: 1859; CHECK: # %bb.0: 1860; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1861; CHECK-NEXT: #APP 1862; CHECK-NEXT: nop 1863; CHECK-NEXT: #NO_APP 1864; CHECK-NEXT: kmovd %edi, %k1 1865; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 1866; CHECK-NEXT: retq 1867 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1868 %2 = icmp ugt <4 x i64> %a0, %a1 1869 %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1 1870 %4 = bitcast i8 %mask to <8 x i1> 1871 %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1872 %5 = select <4 x i1> %extract, <4 x i64> %3, <4 x i64> zeroinitializer 1873 ret <4 x i64> %5 1874} 1875 1876define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) { 1877; CHECK-LABEL: stack_fold_pmaxuw: 1878; CHECK: # %bb.0: 1879; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1880; CHECK-NEXT: #APP 1881; CHECK-NEXT: nop 1882; CHECK-NEXT: #NO_APP 1883; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1884; CHECK-NEXT: retq 1885 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1886 %2 = icmp ugt <8 x i16> %a0, %a1 1887 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 1888 ret <8 x i16> %3 1889} 1890 1891define <16 x i16> @stack_fold_pmaxuw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 1892; CHECK-LABEL: stack_fold_pmaxuw_ymm: 1893; CHECK: # %bb.0: 1894; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1895; CHECK-NEXT: #APP 1896; CHECK-NEXT: nop 1897; CHECK-NEXT: #NO_APP 1898; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1899; CHECK-NEXT: retq 1900 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1901 %2 = icmp ugt <16 x i16> %a0, %a1 1902 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1 1903 ret <16 x i16> %3 1904} 1905declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone 1906 1907define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) { 1908; CHECK-LABEL: stack_fold_pminsb: 1909; CHECK: # %bb.0: 1910; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1911; CHECK-NEXT: #APP 1912; CHECK-NEXT: nop 1913; CHECK-NEXT: #NO_APP 1914; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1915; CHECK-NEXT: retq 1916 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1917 %2 = icmp slt <16 x i8> %a0, %a1 1918 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 1919 ret <16 x i8> %3 1920} 1921 1922define <32 x i8> @stack_fold_pminsb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 1923; CHECK-LABEL: stack_fold_pminsb_ymm: 1924; CHECK: # %bb.0: 1925; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1926; CHECK-NEXT: #APP 1927; CHECK-NEXT: nop 1928; CHECK-NEXT: #NO_APP 1929; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1930; CHECK-NEXT: retq 1931 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1932 %2 = icmp slt <32 x i8> %a0, %a1 1933 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1 1934 ret <32 x i8> %3 1935} 1936 1937define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) { 1938; CHECK-LABEL: stack_fold_pminsd: 1939; CHECK: # %bb.0: 1940; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1941; CHECK-NEXT: #APP 1942; CHECK-NEXT: nop 1943; CHECK-NEXT: #NO_APP 1944; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1945; CHECK-NEXT: retq 1946 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1947 %2 = icmp slt <4 x i32> %a0, %a1 1948 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 1949 ret <4 x i32> %3 1950} 1951 1952define <8 x i32> @stack_fold_pminsd_ymm(<8 x i32> %a0, <8 x i32> %a1) { 1953; CHECK-LABEL: stack_fold_pminsd_ymm: 1954; CHECK: # %bb.0: 1955; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1956; CHECK-NEXT: #APP 1957; CHECK-NEXT: nop 1958; CHECK-NEXT: #NO_APP 1959; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1960; CHECK-NEXT: retq 1961 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1962 %2 = icmp slt <8 x i32> %a0, %a1 1963 %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1 1964 ret <8 x i32> %3 1965} 1966 1967define <2 x i64> @stack_fold_pminsq(<2 x i64> %a0, <2 x i64> %a1) { 1968; CHECK-LABEL: stack_fold_pminsq: 1969; CHECK: # %bb.0: 1970; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1971; CHECK-NEXT: #APP 1972; CHECK-NEXT: nop 1973; CHECK-NEXT: #NO_APP 1974; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 1975; CHECK-NEXT: retq 1976 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1977 %2 = icmp slt <2 x i64> %a0, %a1 1978 %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1 1979 ret <2 x i64> %3 1980} 1981 1982define <4 x i64> @stack_fold_pminsq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 1983; CHECK-LABEL: stack_fold_pminsq_ymm: 1984; CHECK: # %bb.0: 1985; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1986; CHECK-NEXT: #APP 1987; CHECK-NEXT: nop 1988; CHECK-NEXT: #NO_APP 1989; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1990; CHECK-NEXT: retq 1991 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 1992 %2 = icmp slt <4 x i64> %a0, %a1 1993 %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1 1994 ret <4 x i64> %3 1995} 1996 1997define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) { 1998; CHECK-LABEL: stack_fold_pminsw: 1999; CHECK: # %bb.0: 2000; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2001; CHECK-NEXT: #APP 2002; CHECK-NEXT: nop 2003; CHECK-NEXT: #NO_APP 2004; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2005; CHECK-NEXT: retq 2006 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2007 %2 = icmp slt <8 x i16> %a0, %a1 2008 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 2009 ret <8 x i16> %3 2010} 2011 2012define <16 x i16> @stack_fold_pminsw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 2013; CHECK-LABEL: stack_fold_pminsw_ymm: 2014; CHECK: # %bb.0: 2015; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2016; CHECK-NEXT: #APP 2017; CHECK-NEXT: nop 2018; CHECK-NEXT: #NO_APP 2019; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2020; CHECK-NEXT: retq 2021 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2022 %2 = icmp slt <16 x i16> %a0, %a1 2023 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1 2024 ret <16 x i16> %3 2025} 2026 2027define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) { 2028; CHECK-LABEL: stack_fold_pminub: 2029; CHECK: # %bb.0: 2030; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2031; CHECK-NEXT: #APP 2032; CHECK-NEXT: nop 2033; CHECK-NEXT: #NO_APP 2034; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2035; CHECK-NEXT: retq 2036 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2037 %2 = icmp ult <16 x i8> %a0, %a1 2038 %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1 2039 ret <16 x i8> %3 2040} 2041 2042define <32 x i8> @stack_fold_pminub_ymm(<32 x i8> %a0, <32 x i8> %a1) { 2043; CHECK-LABEL: stack_fold_pminub_ymm: 2044; CHECK: # %bb.0: 2045; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2046; CHECK-NEXT: #APP 2047; CHECK-NEXT: nop 2048; CHECK-NEXT: #NO_APP 2049; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2050; CHECK-NEXT: retq 2051 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2052 %2 = icmp ult <32 x i8> %a0, %a1 2053 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1 2054 ret <32 x i8> %3 2055} 2056 2057define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) { 2058; CHECK-LABEL: stack_fold_pminud: 2059; CHECK: # %bb.0: 2060; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2061; CHECK-NEXT: #APP 2062; CHECK-NEXT: nop 2063; CHECK-NEXT: #NO_APP 2064; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2065; CHECK-NEXT: retq 2066 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2067 %2 = icmp ult <4 x i32> %a0, %a1 2068 %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1 2069 ret <4 x i32> %3 2070} 2071 2072define <8 x i32> @stack_fold_pminud_ymm(<8 x i32> %a0, <8 x i32> %a1) { 2073; CHECK-LABEL: stack_fold_pminud_ymm: 2074; CHECK: # %bb.0: 2075; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2076; CHECK-NEXT: #APP 2077; CHECK-NEXT: nop 2078; CHECK-NEXT: #NO_APP 2079; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2080; CHECK-NEXT: retq 2081 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2082 %2 = icmp ult <8 x i32> %a0, %a1 2083 %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1 2084 ret <8 x i32> %3 2085} 2086 2087define <2 x i64> @stack_fold_pminuq(<2 x i64> %a0, <2 x i64> %a1) { 2088; CHECK-LABEL: stack_fold_pminuq: 2089; CHECK: # %bb.0: 2090; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2091; CHECK-NEXT: #APP 2092; CHECK-NEXT: nop 2093; CHECK-NEXT: #NO_APP 2094; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2095; CHECK-NEXT: retq 2096 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2097 %2 = icmp ult <2 x i64> %a0, %a1 2098 %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1 2099 ret <2 x i64> %3 2100} 2101 2102define <4 x i64> @stack_fold_pminuq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 2103; CHECK-LABEL: stack_fold_pminuq_ymm: 2104; CHECK: # %bb.0: 2105; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2106; CHECK-NEXT: #APP 2107; CHECK-NEXT: nop 2108; CHECK-NEXT: #NO_APP 2109; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2110; CHECK-NEXT: retq 2111 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2112 %2 = icmp ult <4 x i64> %a0, %a1 2113 %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1 2114 ret <4 x i64> %3 2115} 2116 2117define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) { 2118; CHECK-LABEL: stack_fold_pminuw: 2119; CHECK: # %bb.0: 2120; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2121; CHECK-NEXT: #APP 2122; CHECK-NEXT: nop 2123; CHECK-NEXT: #NO_APP 2124; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2125; CHECK-NEXT: retq 2126 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2127 %2 = icmp ult <8 x i16> %a0, %a1 2128 %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1 2129 ret <8 x i16> %3 2130} 2131 2132define <16 x i16> @stack_fold_pminuw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 2133; CHECK-LABEL: stack_fold_pminuw_ymm: 2134; CHECK: # %bb.0: 2135; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2136; CHECK-NEXT: #APP 2137; CHECK-NEXT: nop 2138; CHECK-NEXT: #NO_APP 2139; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2140; CHECK-NEXT: retq 2141 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2142 %2 = icmp ult <16 x i16> %a0, %a1 2143 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1 2144 ret <16 x i16> %3 2145} 2146 2147define <8 x i16> @stack_fold_vpmovdw(<8 x i32> %a0) { 2148; CHECK-LABEL: stack_fold_vpmovdw: 2149; CHECK: # %bb.0: 2150; CHECK-NEXT: vpmovdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2151; CHECK-NEXT: #APP 2152; CHECK-NEXT: nop 2153; CHECK-NEXT: #NO_APP 2154; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2155; CHECK-NEXT: vzeroupper 2156; CHECK-NEXT: retq 2157 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1) 2158 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2159 ret <8 x i16> %1 2160} 2161declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8) 2162 2163define <4 x i32> @stack_fold_vpmovqd(<4 x i64> %a0) { 2164; CHECK-LABEL: stack_fold_vpmovqd: 2165; CHECK: # %bb.0: 2166; CHECK-NEXT: vpmovqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2167; CHECK-NEXT: #APP 2168; CHECK-NEXT: nop 2169; CHECK-NEXT: #NO_APP 2170; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2171; CHECK-NEXT: vzeroupper 2172; CHECK-NEXT: retq 2173 %1 = trunc <4 x i64> %a0 to <4 x i32> 2174 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2175 ret <4 x i32> %1 2176} 2177declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8) 2178 2179define <16 x i8> @stack_fold_vpmovwb(<16 x i16> %a0) { 2180; CHECK-LABEL: stack_fold_vpmovwb: 2181; CHECK: # %bb.0: 2182; CHECK-NEXT: vpmovwb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2183; CHECK-NEXT: #APP 2184; CHECK-NEXT: nop 2185; CHECK-NEXT: #NO_APP 2186; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2187; CHECK-NEXT: vzeroupper 2188; CHECK-NEXT: retq 2189 %1 = trunc <16 x i16> %a0 to <16 x i8> 2190 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2191 ret <16 x i8> %1 2192} 2193declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16) 2194 2195define <8 x i16> @stack_fold_vpmovsdw(<8 x i32> %a0) { 2196; CHECK-LABEL: stack_fold_vpmovsdw: 2197; CHECK: # %bb.0: 2198; CHECK-NEXT: vpmovsdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2199; CHECK-NEXT: #APP 2200; CHECK-NEXT: nop 2201; CHECK-NEXT: #NO_APP 2202; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2203; CHECK-NEXT: vzeroupper 2204; CHECK-NEXT: retq 2205 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1) 2206 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2207 ret <8 x i16> %1 2208} 2209declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8) 2210 2211define <4 x i32> @stack_fold_vpmovsqd(<4 x i64> %a0) { 2212; CHECK-LABEL: stack_fold_vpmovsqd: 2213; CHECK: # %bb.0: 2214; CHECK-NEXT: vpmovsqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2215; CHECK-NEXT: #APP 2216; CHECK-NEXT: nop 2217; CHECK-NEXT: #NO_APP 2218; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2219; CHECK-NEXT: vzeroupper 2220; CHECK-NEXT: retq 2221 %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1) 2222 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2223 ret <4 x i32> %1 2224} 2225declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8) 2226 2227define <16 x i8> @stack_fold_vpmovswb(<16 x i16> %a0) { 2228; CHECK-LABEL: stack_fold_vpmovswb: 2229; CHECK: # %bb.0: 2230; CHECK-NEXT: vpmovswb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2231; CHECK-NEXT: #APP 2232; CHECK-NEXT: nop 2233; CHECK-NEXT: #NO_APP 2234; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2235; CHECK-NEXT: vzeroupper 2236; CHECK-NEXT: retq 2237 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1) 2238 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2239 ret <16 x i8> %1 2240} 2241declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16) 2242 2243define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) { 2244; CHECK-LABEL: stack_fold_pmovsxbd: 2245; CHECK: # %bb.0: 2246; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2247; CHECK-NEXT: #APP 2248; CHECK-NEXT: nop 2249; CHECK-NEXT: #NO_APP 2250; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2251; CHECK-NEXT: retq 2252 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2253 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2254 %3 = sext <4 x i8> %2 to <4 x i32> 2255 ret <4 x i32> %3 2256} 2257 2258define <8 x i32> @stack_fold_pmovsxbd_ymm(<16 x i8> %a0) { 2259; CHECK-LABEL: stack_fold_pmovsxbd_ymm: 2260; CHECK: # %bb.0: 2261; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2262; CHECK-NEXT: #APP 2263; CHECK-NEXT: nop 2264; CHECK-NEXT: #NO_APP 2265; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2266; CHECK-NEXT: retq 2267 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2268 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2269 %3 = sext <8 x i8> %2 to <8 x i32> 2270 ret <8 x i32> %3 2271} 2272 2273define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) { 2274; CHECK-LABEL: stack_fold_pmovsxbq: 2275; CHECK: # %bb.0: 2276; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2277; CHECK-NEXT: #APP 2278; CHECK-NEXT: nop 2279; CHECK-NEXT: #NO_APP 2280; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2281; CHECK-NEXT: retq 2282 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2283 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1> 2284 %3 = sext <2 x i8> %2 to <2 x i64> 2285 ret <2 x i64> %3 2286} 2287 2288define <4 x i64> @stack_fold_pmovsxbq_ymm(<16 x i8> %a0) { 2289; CHECK-LABEL: stack_fold_pmovsxbq_ymm: 2290; CHECK: # %bb.0: 2291; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2292; CHECK-NEXT: #APP 2293; CHECK-NEXT: nop 2294; CHECK-NEXT: #NO_APP 2295; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2296; CHECK-NEXT: retq 2297 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2298 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2299 %3 = sext <4 x i8> %2 to <4 x i64> 2300 ret <4 x i64> %3 2301} 2302 2303define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) { 2304; CHECK-LABEL: stack_fold_pmovsxbw: 2305; CHECK: # %bb.0: 2306; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2307; CHECK-NEXT: #APP 2308; CHECK-NEXT: nop 2309; CHECK-NEXT: #NO_APP 2310; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2311; CHECK-NEXT: retq 2312 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2313 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2314 %3 = sext <8 x i8> %2 to <8 x i16> 2315 ret <8 x i16> %3 2316} 2317 2318define <16 x i16> @stack_fold_pmovsxbw_ymm(<16 x i8> %a0) { 2319; CHECK-LABEL: stack_fold_pmovsxbw_ymm: 2320; CHECK: # %bb.0: 2321; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2322; CHECK-NEXT: #APP 2323; CHECK-NEXT: nop 2324; CHECK-NEXT: #NO_APP 2325; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2326; CHECK-NEXT: retq 2327 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2328 %2 = sext <16 x i8> %a0 to <16 x i16> 2329 ret <16 x i16> %2 2330} 2331 2332define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) { 2333; CHECK-LABEL: stack_fold_pmovsxdq: 2334; CHECK: # %bb.0: 2335; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2336; CHECK-NEXT: #APP 2337; CHECK-NEXT: nop 2338; CHECK-NEXT: #NO_APP 2339; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2340; CHECK-NEXT: retq 2341 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2342 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1> 2343 %3 = sext <2 x i32> %2 to <2 x i64> 2344 ret <2 x i64> %3 2345} 2346 2347define <4 x i64> @stack_fold_pmovsxdq_ymm(<4 x i32> %a0) { 2348; CHECK-LABEL: stack_fold_pmovsxdq_ymm: 2349; CHECK: # %bb.0: 2350; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2351; CHECK-NEXT: #APP 2352; CHECK-NEXT: nop 2353; CHECK-NEXT: #NO_APP 2354; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2355; CHECK-NEXT: retq 2356 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2357 %2 = sext <4 x i32> %a0 to <4 x i64> 2358 ret <4 x i64> %2 2359} 2360 2361define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) { 2362; CHECK-LABEL: stack_fold_pmovsxwd: 2363; CHECK: # %bb.0: 2364; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2365; CHECK-NEXT: #APP 2366; CHECK-NEXT: nop 2367; CHECK-NEXT: #NO_APP 2368; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2369; CHECK-NEXT: retq 2370 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2371 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2372 %3 = sext <4 x i16> %2 to <4 x i32> 2373 ret <4 x i32> %3 2374} 2375 2376define <8 x i32> @stack_fold_pmovsxwd_ymm(<8 x i16> %a0) { 2377; CHECK-LABEL: stack_fold_pmovsxwd_ymm: 2378; CHECK: # %bb.0: 2379; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2380; CHECK-NEXT: #APP 2381; CHECK-NEXT: nop 2382; CHECK-NEXT: #NO_APP 2383; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2384; CHECK-NEXT: retq 2385 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2386 %2 = sext <8 x i16> %a0 to <8 x i32> 2387 ret <8 x i32> %2 2388} 2389 2390define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) { 2391; CHECK-LABEL: stack_fold_pmovsxwq: 2392; CHECK: # %bb.0: 2393; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2394; CHECK-NEXT: #APP 2395; CHECK-NEXT: nop 2396; CHECK-NEXT: #NO_APP 2397; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2398; CHECK-NEXT: retq 2399 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2400 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1> 2401 %3 = sext <2 x i16> %2 to <2 x i64> 2402 ret <2 x i64> %3 2403} 2404 2405define <4 x i64> @stack_fold_pmovsxwq_ymm(<8 x i16> %a0) { 2406; CHECK-LABEL: stack_fold_pmovsxwq_ymm: 2407; CHECK: # %bb.0: 2408; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2409; CHECK-NEXT: #APP 2410; CHECK-NEXT: nop 2411; CHECK-NEXT: #NO_APP 2412; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2413; CHECK-NEXT: retq 2414 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2415 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2416 %3 = sext <4 x i16> %2 to <4 x i64> 2417 ret <4 x i64> %3 2418} 2419 2420define <8 x i16> @stack_fold_vpmovusdw(<8 x i32> %a0) { 2421; CHECK-LABEL: stack_fold_vpmovusdw: 2422; CHECK: # %bb.0: 2423; CHECK-NEXT: vpmovusdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2424; CHECK-NEXT: #APP 2425; CHECK-NEXT: nop 2426; CHECK-NEXT: #NO_APP 2427; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2428; CHECK-NEXT: vzeroupper 2429; CHECK-NEXT: retq 2430 %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1) 2431 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2432 ret <8 x i16> %1 2433} 2434declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8) 2435 2436define <4 x i32> @stack_fold_vpmovusqd(<4 x i64> %a0) { 2437; CHECK-LABEL: stack_fold_vpmovusqd: 2438; CHECK: # %bb.0: 2439; CHECK-NEXT: vpmovusqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2440; CHECK-NEXT: #APP 2441; CHECK-NEXT: nop 2442; CHECK-NEXT: #NO_APP 2443; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2444; CHECK-NEXT: vzeroupper 2445; CHECK-NEXT: retq 2446 %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1) 2447 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2448 ret <4 x i32> %1 2449} 2450declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8) 2451 2452define <16 x i8> @stack_fold_vpmovuswb(<16 x i16> %a0) { 2453; CHECK-LABEL: stack_fold_vpmovuswb: 2454; CHECK: # %bb.0: 2455; CHECK-NEXT: vpmovuswb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill 2456; CHECK-NEXT: #APP 2457; CHECK-NEXT: nop 2458; CHECK-NEXT: #NO_APP 2459; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2460; CHECK-NEXT: vzeroupper 2461; CHECK-NEXT: retq 2462 %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1) 2463 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2464 ret <16 x i8> %1 2465} 2466declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16) 2467 2468define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) { 2469; CHECK-LABEL: stack_fold_pmovzxbd: 2470; CHECK: # %bb.0: 2471; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2472; CHECK-NEXT: #APP 2473; CHECK-NEXT: nop 2474; CHECK-NEXT: #NO_APP 2475; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2476; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2477; CHECK-NEXT: retq 2478 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2479 %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 1, i32 19, i32 20, i32 21, i32 2, i32 22, i32 23, i32 24, i32 3, i32 25, i32 26, i32 27> 2480 %3 = bitcast <16 x i8> %2 to <4 x i32> 2481 ret <4 x i32> %3 2482} 2483 2484define <8 x i32> @stack_fold_pmovzxbd_ymm(<16 x i8> %a0) { 2485; CHECK-LABEL: stack_fold_pmovzxbd_ymm: 2486; CHECK: # %bb.0: 2487; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2488; CHECK-NEXT: #APP 2489; CHECK-NEXT: nop 2490; CHECK-NEXT: #NO_APP 2491; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2492; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 2493; CHECK-NEXT: retq 2494 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2495 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2496 %3 = zext <8 x i8> %2 to <8 x i32> 2497 ret <8 x i32> %3 2498} 2499 2500define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) { 2501; CHECK-LABEL: stack_fold_pmovzxbq: 2502; CHECK: # %bb.0: 2503; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2504; CHECK-NEXT: #APP 2505; CHECK-NEXT: nop 2506; CHECK-NEXT: #NO_APP 2507; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2508; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero 2509; CHECK-NEXT: retq 2510 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2511 %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28> 2512 %3 = bitcast <16 x i8> %2 to <2 x i64> 2513 ret <2 x i64> %3 2514} 2515 2516define <4 x i64> @stack_fold_pmovzxbq_ymm(<16 x i8> %a0) { 2517; CHECK-LABEL: stack_fold_pmovzxbq_ymm: 2518; CHECK: # %bb.0: 2519; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2520; CHECK-NEXT: #APP 2521; CHECK-NEXT: nop 2522; CHECK-NEXT: #NO_APP 2523; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2524; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero 2525; CHECK-NEXT: retq 2526 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2527 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2528 %3 = zext <4 x i8> %2 to <4 x i64> 2529 ret <4 x i64> %3 2530} 2531 2532define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) { 2533; CHECK-LABEL: stack_fold_pmovzxbw: 2534; CHECK: # %bb.0: 2535; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2536; CHECK-NEXT: #APP 2537; CHECK-NEXT: nop 2538; CHECK-NEXT: #NO_APP 2539; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2540; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2541; CHECK-NEXT: retq 2542 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2543 %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> 2544 %3 = bitcast <16 x i8> %2 to <8 x i16> 2545 ret <8 x i16> %3 2546} 2547 2548define <16 x i16> @stack_fold_pmovzxbw_ymm(<16 x i8> %a0) { 2549; CHECK-LABEL: stack_fold_pmovzxbw_ymm: 2550; CHECK: # %bb.0: 2551; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2552; CHECK-NEXT: #APP 2553; CHECK-NEXT: nop 2554; CHECK-NEXT: #NO_APP 2555; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2556; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 2557; CHECK-NEXT: retq 2558 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2559 %2 = zext <16 x i8> %a0 to <16 x i16> 2560 ret <16 x i16> %2 2561} 2562 2563define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) { 2564; CHECK-LABEL: stack_fold_pmovzxdq: 2565; CHECK: # %bb.0: 2566; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2567; CHECK-NEXT: #APP 2568; CHECK-NEXT: nop 2569; CHECK-NEXT: #NO_APP 2570; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2571; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero 2572; CHECK-NEXT: retq 2573 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2574 %2 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 2575 %3 = bitcast <4 x i32> %2 to <2 x i64> 2576 ret <2 x i64> %3 2577} 2578 2579define <4 x i64> @stack_fold_pmovzxdq_ymm(<4 x i32> %a0) { 2580; CHECK-LABEL: stack_fold_pmovzxdq_ymm: 2581; CHECK: # %bb.0: 2582; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2583; CHECK-NEXT: #APP 2584; CHECK-NEXT: nop 2585; CHECK-NEXT: #NO_APP 2586; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2587; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2588; CHECK-NEXT: retq 2589 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2590 %2 = zext <4 x i32> %a0 to <4 x i64> 2591 ret <4 x i64> %2 2592} 2593 2594define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) { 2595; CHECK-LABEL: stack_fold_pmovzxwd: 2596; CHECK: # %bb.0: 2597; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2598; CHECK-NEXT: #APP 2599; CHECK-NEXT: nop 2600; CHECK-NEXT: #NO_APP 2601; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2602; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 2603; CHECK-NEXT: retq 2604 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2605 %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> 2606 %3 = bitcast <8 x i16> %2 to <4 x i32> 2607 ret <4 x i32> %3 2608} 2609 2610define <8 x i32> @stack_fold_pmovzxwd_ymm(<8 x i16> %a0) { 2611; CHECK-LABEL: stack_fold_pmovzxwd_ymm: 2612; CHECK: # %bb.0: 2613; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2614; CHECK-NEXT: #APP 2615; CHECK-NEXT: nop 2616; CHECK-NEXT: #NO_APP 2617; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2618; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 2619; CHECK-NEXT: retq 2620 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2621 %2 = zext <8 x i16> %a0 to <8 x i32> 2622 ret <8 x i32> %2 2623} 2624 2625define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) { 2626; CHECK-LABEL: stack_fold_pmovzxwq: 2627; CHECK: # %bb.0: 2628; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2629; CHECK-NEXT: #APP 2630; CHECK-NEXT: nop 2631; CHECK-NEXT: #NO_APP 2632; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2633; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero 2634; CHECK-NEXT: retq 2635 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2636 %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 1, i32 11, i32 12, i32 13> 2637 %3 = bitcast <8 x i16> %2 to <2 x i64> 2638 ret <2 x i64> %3 2639} 2640 2641define <4 x i64> @stack_fold_pmovzxwq_ymm(<8 x i16> %a0) { 2642; CHECK-LABEL: stack_fold_pmovzxwq_ymm: 2643; CHECK: # %bb.0: 2644; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2645; CHECK-NEXT: #APP 2646; CHECK-NEXT: nop 2647; CHECK-NEXT: #NO_APP 2648; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 2649; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2650; CHECK-NEXT: retq 2651 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2652 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2653 %3 = zext <4 x i16> %2 to <4 x i64> 2654 ret <4 x i64> %3 2655} 2656 2657define <4 x i64> @stack_fold_pmovzxwq_maskz_ymm(<8 x i16> %a0, i8 %mask) { 2658; CHECK-LABEL: stack_fold_pmovzxwq_maskz_ymm: 2659; CHECK: # %bb.0: 2660; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2661; CHECK-NEXT: #APP 2662; CHECK-NEXT: nop 2663; CHECK-NEXT: #NO_APP 2664; CHECK-NEXT: kmovd %edi, %k1 2665; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 16-byte Folded Reload 2666; CHECK-NEXT: # ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2667; CHECK-NEXT: retq 2668 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2669 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2670 %3 = zext <4 x i16> %2 to <4 x i64> 2671 %4 = bitcast i8 %mask to <8 x i1> 2672 %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2673 %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> zeroinitializer 2674 ret <4 x i64> %6 2675} 2676 2677define <4 x i64> @stack_fold_pmovzxwq_mask_ymm(<4 x i64> %passthru, <8 x i16> %a0, i8 %mask) { 2678; CHECK-LABEL: stack_fold_pmovzxwq_mask_ymm: 2679; CHECK: # %bb.0: 2680; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2681; CHECK-NEXT: #APP 2682; CHECK-NEXT: nop 2683; CHECK-NEXT: #NO_APP 2684; CHECK-NEXT: kmovd %edi, %k1 2685; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 16-byte Folded Reload 2686; CHECK-NEXT: # ymm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2687; CHECK-NEXT: retq 2688 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2689 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2690 %3 = zext <4 x i16> %2 to <4 x i64> 2691 %4 = bitcast i8 %mask to <8 x i1> 2692 %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2693 %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %passthru 2694 ret <4 x i64> %6 2695} 2696 2697define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) { 2698; CHECK-LABEL: stack_fold_pmuldq: 2699; CHECK: # %bb.0: 2700; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2701; CHECK-NEXT: #APP 2702; CHECK-NEXT: nop 2703; CHECK-NEXT: #NO_APP 2704; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2705; CHECK-NEXT: retq 2706 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2707 %2 = bitcast <4 x i32> %a0 to <2 x i64> 2708 %3 = bitcast <4 x i32> %a1 to <2 x i64> 2709 %4 = shl <2 x i64> %2, <i64 32, i64 32> 2710 %5 = ashr <2 x i64> %4, <i64 32, i64 32> 2711 %6 = shl <2 x i64> %3, <i64 32, i64 32> 2712 %7 = ashr <2 x i64> %6, <i64 32, i64 32> 2713 %8 = mul <2 x i64> %5, %7 2714 ret <2 x i64> %8 2715} 2716 2717define <4 x i64> @stack_fold_pmuldq_ymm(<8 x i32> %a0, <8 x i32> %a1) { 2718; CHECK-LABEL: stack_fold_pmuldq_ymm: 2719; CHECK: # %bb.0: 2720; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2721; CHECK-NEXT: #APP 2722; CHECK-NEXT: nop 2723; CHECK-NEXT: #NO_APP 2724; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2725; CHECK-NEXT: retq 2726 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2727 %2 = bitcast <8 x i32> %a0 to <4 x i64> 2728 %3 = bitcast <8 x i32> %a1 to <4 x i64> 2729 %4 = shl <4 x i64> %2, <i64 32, i64 32, i64 32, i64 32> 2730 %5 = ashr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32> 2731 %6 = shl <4 x i64> %3, <i64 32, i64 32, i64 32, i64 32> 2732 %7 = ashr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32> 2733 %8 = mul <4 x i64> %5, %7 2734 ret <4 x i64> %8 2735} 2736 2737define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) { 2738; CHECK-LABEL: stack_fold_pmuludq: 2739; CHECK: # %bb.0: 2740; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2741; CHECK-NEXT: #APP 2742; CHECK-NEXT: nop 2743; CHECK-NEXT: #NO_APP 2744; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2745; CHECK-NEXT: retq 2746 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2747 %2 = bitcast <4 x i32> %a0 to <2 x i64> 2748 %3 = bitcast <4 x i32> %a1 to <2 x i64> 2749 %4 = and <2 x i64> %2, <i64 4294967295, i64 4294967295> 2750 %5 = and <2 x i64> %3, <i64 4294967295, i64 4294967295> 2751 %6 = mul <2 x i64> %4, %5 2752 ret <2 x i64> %6 2753} 2754 2755define <4 x i64> @stack_fold_pmuludq_ymm(<8 x i32> %a0, <8 x i32> %a1) { 2756; CHECK-LABEL: stack_fold_pmuludq_ymm: 2757; CHECK: # %bb.0: 2758; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2759; CHECK-NEXT: #APP 2760; CHECK-NEXT: nop 2761; CHECK-NEXT: #NO_APP 2762; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2763; CHECK-NEXT: retq 2764 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2765 %2 = bitcast <8 x i32> %a0 to <4 x i64> 2766 %3 = bitcast <8 x i32> %a1 to <4 x i64> 2767 %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 2768 %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 2769 %6 = mul <4 x i64> %4, %5 2770 ret <4 x i64> %6 2771} 2772 2773define <4 x i64> @stack_fold_pmuludq_ymm_mask(<4 x i64>* %passthru, <8 x i32> %a0, <8 x i32> %a1, i8 %mask) { 2774; CHECK-LABEL: stack_fold_pmuludq_ymm_mask: 2775; CHECK: # %bb.0: 2776; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2777; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2778; CHECK-NEXT: #APP 2779; CHECK-NEXT: nop 2780; CHECK-NEXT: #NO_APP 2781; CHECK-NEXT: kmovd %esi, %k1 2782; CHECK-NEXT: vmovdqa (%rdi), %ymm1 2783; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2784; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload 2785; CHECK-NEXT: vmovdqa %ymm1, %ymm0 2786; CHECK-NEXT: retq 2787 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2788 %2 = bitcast <8 x i32> %a0 to <4 x i64> 2789 %3 = bitcast <8 x i32> %a1 to <4 x i64> 2790 %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 2791 %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 2792 %6 = mul <4 x i64> %4, %5 2793 %7 = bitcast i8 %mask to <8 x i1> 2794 %8 = shufflevector <8 x i1> %7, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2795 %9 = load <4 x i64>, <4 x i64>* %passthru 2796 %10 = select <4 x i1> %8, <4 x i64> %6, <4 x i64> %9 2797 ret <4 x i64> %10 2798} 2799 2800define <4 x i64> @stack_fold_pmuludq_ymm_maskz(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { 2801; CHECK-LABEL: stack_fold_pmuludq_ymm_maskz: 2802; CHECK: # %bb.0: 2803; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2804; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2805; CHECK-NEXT: #APP 2806; CHECK-NEXT: nop 2807; CHECK-NEXT: #NO_APP 2808; CHECK-NEXT: kmovd %edi, %k1 2809; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2810; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 2811; CHECK-NEXT: retq 2812 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2813 %2 = bitcast <8 x i32> %a0 to <4 x i64> 2814 %3 = bitcast <8 x i32> %a1 to <4 x i64> 2815 %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 2816 %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 2817 %6 = mul <4 x i64> %4, %5 2818 %7 = bitcast i8 %mask to <8 x i1> 2819 %8 = shufflevector <8 x i1> %7, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2820 %9 = select <4 x i1> %8, <4 x i64> %6, <4 x i64> zeroinitializer 2821 ret <4 x i64> %9 2822} 2823 2824define <4 x i32> @stack_fold_vpopcntd(<4 x i32> %a0) { 2825; CHECK-LABEL: stack_fold_vpopcntd: 2826; CHECK: # %bb.0: 2827; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2828; CHECK-NEXT: #APP 2829; CHECK-NEXT: nop 2830; CHECK-NEXT: #NO_APP 2831; CHECK-NEXT: vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2832; CHECK-NEXT: retq 2833 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2834 %2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a0) 2835 ret <4 x i32> %2 2836} 2837declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readonly 2838 2839define <8 x i32> @stack_fold_vpopcntd_ymm(<8 x i32> %a0) { 2840; CHECK-LABEL: stack_fold_vpopcntd_ymm: 2841; CHECK: # %bb.0: 2842; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2843; CHECK-NEXT: #APP 2844; CHECK-NEXT: nop 2845; CHECK-NEXT: #NO_APP 2846; CHECK-NEXT: vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 2847; CHECK-NEXT: retq 2848 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2849 %2 = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a0) 2850 ret <8 x i32> %2 2851} 2852declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readonly 2853 2854define <2 x i64> @stack_fold_vpopcntq(<2 x i64> %a0) { 2855; CHECK-LABEL: stack_fold_vpopcntq: 2856; CHECK: # %bb.0: 2857; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2858; CHECK-NEXT: #APP 2859; CHECK-NEXT: nop 2860; CHECK-NEXT: #NO_APP 2861; CHECK-NEXT: vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2862; CHECK-NEXT: retq 2863 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2864 %2 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a0) 2865 ret <2 x i64> %2 2866} 2867declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone 2868 2869define <4 x i64> @stack_fold_vpopcntq_ymm(<4 x i64> %a0) { 2870; CHECK-LABEL: stack_fold_vpopcntq_ymm: 2871; CHECK: # %bb.0: 2872; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2873; CHECK-NEXT: #APP 2874; CHECK-NEXT: nop 2875; CHECK-NEXT: #NO_APP 2876; CHECK-NEXT: vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 2877; CHECK-NEXT: retq 2878 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2879 %2 = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a0) 2880 ret <4 x i64> %2 2881} 2882declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone 2883 2884define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) { 2885; CHECK-LABEL: stack_fold_psadbw: 2886; CHECK: # %bb.0: 2887; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2888; CHECK-NEXT: #APP 2889; CHECK-NEXT: nop 2890; CHECK-NEXT: #NO_APP 2891; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2892; CHECK-NEXT: retq 2893 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2894 %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) 2895 ret <2 x i64> %2 2896} 2897declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone 2898 2899define <2 x i64> @stack_fold_psadbw_commute(<16 x i8> %a0, <16 x i8> %a1) { 2900; CHECK-LABEL: stack_fold_psadbw_commute: 2901; CHECK: # %bb.0: 2902; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2903; CHECK-NEXT: #APP 2904; CHECK-NEXT: nop 2905; CHECK-NEXT: #NO_APP 2906; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2907; CHECK-NEXT: retq 2908 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2909 %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a1, <16 x i8> %a0) 2910 ret <2 x i64> %2 2911} 2912 2913define <4 x i64> @stack_fold_psadbw_ymm(<32 x i8> %a0, <32 x i8> %a1) { 2914; CHECK-LABEL: stack_fold_psadbw_ymm: 2915; CHECK: # %bb.0: 2916; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2917; CHECK-NEXT: #APP 2918; CHECK-NEXT: nop 2919; CHECK-NEXT: #NO_APP 2920; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2921; CHECK-NEXT: retq 2922 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2923 %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) 2924 ret <4 x i64> %2 2925} 2926declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone 2927 2928define <4 x i64> @stack_fold_psadbw_ymm_commute(<32 x i8> %a0, <32 x i8> %a1) { 2929; CHECK-LABEL: stack_fold_psadbw_ymm_commute: 2930; CHECK: # %bb.0: 2931; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2932; CHECK-NEXT: #APP 2933; CHECK-NEXT: nop 2934; CHECK-NEXT: #NO_APP 2935; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2936; CHECK-NEXT: retq 2937 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2938 %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a1, <32 x i8> %a0) 2939 ret <4 x i64> %2 2940} 2941 2942define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) { 2943; CHECK-LABEL: stack_fold_pshufb: 2944; CHECK: # %bb.0: 2945; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2946; CHECK-NEXT: #APP 2947; CHECK-NEXT: nop 2948; CHECK-NEXT: #NO_APP 2949; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 2950; CHECK-NEXT: retq 2951 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2952 %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) 2953 ret <16 x i8> %2 2954} 2955declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone 2956 2957define <16 x i8> @stack_fold_pshufb_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) { 2958; CHECK-LABEL: stack_fold_pshufb_mask: 2959; CHECK: # %bb.0: 2960; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2961; CHECK-NEXT: #APP 2962; CHECK-NEXT: nop 2963; CHECK-NEXT: #NO_APP 2964; CHECK-NEXT: vmovdqa (%rdi), %xmm2 2965; CHECK-NEXT: kmovd %esi, %k1 2966; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 2967; CHECK-NEXT: vmovdqa %xmm2, %xmm0 2968; CHECK-NEXT: retq 2969 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2970 %2 = load <16 x i8>, <16 x i8>* %passthru 2971 %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) 2972 %4 = bitcast i16 %mask to <16 x i1> 2973 %5 = select <16 x i1> %4, <16 x i8> %3, <16 x i8> %2 2974 ret <16 x i8> %5 2975} 2976 2977define <16 x i8> @stack_fold_pshufb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { 2978; CHECK-LABEL: stack_fold_pshufb_maskz: 2979; CHECK: # %bb.0: 2980; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2981; CHECK-NEXT: #APP 2982; CHECK-NEXT: nop 2983; CHECK-NEXT: #NO_APP 2984; CHECK-NEXT: kmovd %edi, %k1 2985; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload 2986; CHECK-NEXT: retq 2987 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 2988 %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) 2989 %3 = bitcast i16 %mask to <16 x i1> 2990 %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer 2991 ret <16 x i8> %4 2992} 2993 2994define <32 x i8> @stack_fold_pshufb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 2995; CHECK-LABEL: stack_fold_pshufb_ymm: 2996; CHECK: # %bb.0: 2997; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2998; CHECK-NEXT: #APP 2999; CHECK-NEXT: nop 3000; CHECK-NEXT: #NO_APP 3001; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3002; CHECK-NEXT: retq 3003 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3004 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) 3005 ret <32 x i8> %2 3006} 3007declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) 3008 3009define <32 x i8> @stack_fold_pshufb_ymm_mask(<32 x i8>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) { 3010; CHECK-LABEL: stack_fold_pshufb_ymm_mask: 3011; CHECK: # %bb.0: 3012; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3013; CHECK-NEXT: #APP 3014; CHECK-NEXT: nop 3015; CHECK-NEXT: #NO_APP 3016; CHECK-NEXT: vmovdqa (%rdi), %ymm2 3017; CHECK-NEXT: kmovd %esi, %k1 3018; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 3019; CHECK-NEXT: vmovdqa %ymm2, %ymm0 3020; CHECK-NEXT: retq 3021 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3022 %2 = load <32 x i8>, <32 x i8>* %passthru 3023 %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) 3024 %4 = bitcast i32 %mask to <32 x i1> 3025 %5 = select <32 x i1> %4, <32 x i8> %3, <32 x i8> %2 3026 ret <32 x i8> %5 3027} 3028 3029define <32 x i8> @stack_fold_pshufb_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { 3030; CHECK-LABEL: stack_fold_pshufb_ymm_maskz: 3031; CHECK: # %bb.0: 3032; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3033; CHECK-NEXT: #APP 3034; CHECK-NEXT: nop 3035; CHECK-NEXT: #NO_APP 3036; CHECK-NEXT: kmovd %edi, %k1 3037; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 3038; CHECK-NEXT: retq 3039 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3040 %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) 3041 %3 = bitcast i32 %mask to <32 x i1> 3042 %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer 3043 ret <32 x i8> %4 3044} 3045 3046define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) { 3047; CHECK-LABEL: stack_fold_pshufd: 3048; CHECK: # %bb.0: 3049; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3050; CHECK-NEXT: #APP 3051; CHECK-NEXT: nop 3052; CHECK-NEXT: #NO_APP 3053; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3054; CHECK-NEXT: # xmm0 = mem[3,2,1,0] 3055; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 3056; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 3057; CHECK-NEXT: retq 3058 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3059 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 3060 %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1> 3061 ret <4 x i32> %3 3062} 3063 3064define <4 x i32> @stack_fold_pshufd_mask(<4 x i32> %passthru, <4 x i32> %a0, i8 %mask) { 3065; CHECK-LABEL: stack_fold_pshufd_mask: 3066; CHECK: # %bb.0: 3067; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3068; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3069; CHECK-NEXT: #APP 3070; CHECK-NEXT: nop 3071; CHECK-NEXT: #NO_APP 3072; CHECK-NEXT: kmovd %edi, %k1 3073; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3074; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload 3075; CHECK-NEXT: # xmm0 {%k1} = mem[3,2,1,0] 3076; CHECK-NEXT: retq 3077 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3078 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 3079 %3 = bitcast i8 %mask to <8 x i1> 3080 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3081 %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> %passthru 3082 ret <4 x i32> %5 3083} 3084 3085define <4 x i32> @stack_fold_pshufd_maskz(<4 x i32> %a0, i8 %mask) { 3086; CHECK-LABEL: stack_fold_pshufd_maskz: 3087; CHECK: # %bb.0: 3088; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3089; CHECK-NEXT: #APP 3090; CHECK-NEXT: nop 3091; CHECK-NEXT: #NO_APP 3092; CHECK-NEXT: kmovd %edi, %k1 3093; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload 3094; CHECK-NEXT: # xmm0 {%k1} {z} = mem[3,2,1,0] 3095; CHECK-NEXT: retq 3096 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3097 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 3098 %3 = bitcast i8 %mask to <8 x i1> 3099 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3100 %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer 3101 ret <4 x i32> %5 3102} 3103 3104define <8 x i32> @stack_fold_pshufd_ymm(<8 x i32> %a0) { 3105; CHECK-LABEL: stack_fold_pshufd_ymm: 3106; CHECK: # %bb.0: 3107; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3108; CHECK-NEXT: #APP 3109; CHECK-NEXT: nop 3110; CHECK-NEXT: #NO_APP 3111; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 3112; CHECK-NEXT: # ymm0 = mem[3,2,1,0,7,6,5,4] 3113; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 3114; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 3115; CHECK-NEXT: retq 3116 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3117 %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> 3118 %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 3119 ret <8 x i32> %3 3120} 3121 3122define <8 x i32> @stack_fold_pshufd_ymm_mask(<8 x i32> %passthru, <8 x i32> %a0, i8 %mask) { 3123; CHECK-LABEL: stack_fold_pshufd_ymm_mask: 3124; CHECK: # %bb.0: 3125; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3126; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3127; CHECK-NEXT: #APP 3128; CHECK-NEXT: nop 3129; CHECK-NEXT: #NO_APP 3130; CHECK-NEXT: kmovd %edi, %k1 3131; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3132; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload 3133; CHECK-NEXT: # ymm0 {%k1} = mem[3,2,1,0,7,6,5,4] 3134; CHECK-NEXT: retq 3135 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3136 %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> 3137 %3 = bitcast i8 %mask to <8 x i1> 3138 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %passthru 3139 ret <8 x i32> %4 3140} 3141 3142define <8 x i32> @stack_fold_pshufd_ymm_maskz(<8 x i32> %a0, i8 %mask) { 3143; CHECK-LABEL: stack_fold_pshufd_ymm_maskz: 3144; CHECK: # %bb.0: 3145; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3146; CHECK-NEXT: #APP 3147; CHECK-NEXT: nop 3148; CHECK-NEXT: #NO_APP 3149; CHECK-NEXT: kmovd %edi, %k1 3150; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload 3151; CHECK-NEXT: # ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4] 3152; CHECK-NEXT: retq 3153 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3154 %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> 3155 %3 = bitcast i8 %mask to <8 x i1> 3156 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer 3157 ret <8 x i32> %4 3158} 3159 3160define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) { 3161; CHECK-LABEL: stack_fold_pshufhw: 3162; CHECK: # %bb.0: 3163; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3164; CHECK-NEXT: #APP 3165; CHECK-NEXT: nop 3166; CHECK-NEXT: #NO_APP 3167; CHECK-NEXT: vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3168; CHECK-NEXT: # xmm0 = mem[0,1,2,3,7,6,4,4] 3169; CHECK-NEXT: retq 3170 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3171 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4> 3172 ret <8 x i16> %2 3173} 3174 3175define <8 x i16> @stack_fold_pshufhw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) { 3176; CHECK-LABEL: stack_fold_pshufhw_mask: 3177; CHECK: # %bb.0: 3178; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3179; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3180; CHECK-NEXT: #APP 3181; CHECK-NEXT: nop 3182; CHECK-NEXT: #NO_APP 3183; CHECK-NEXT: kmovd %edi, %k1 3184; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3185; CHECK-NEXT: vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload 3186; CHECK-NEXT: # xmm0 {%k1} = mem[0,1,2,3,7,6,4,4] 3187; CHECK-NEXT: retq 3188 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3189 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4> 3190 %3 = bitcast i8 %mask to <8 x i1> 3191 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru 3192 ret <8 x i16> %4 3193} 3194 3195define <8 x i16> @stack_fold_pshufhw_maskz(<8 x i16> %a0, i8 %mask) { 3196; CHECK-LABEL: stack_fold_pshufhw_maskz: 3197; CHECK: # %bb.0: 3198; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3199; CHECK-NEXT: #APP 3200; CHECK-NEXT: nop 3201; CHECK-NEXT: #NO_APP 3202; CHECK-NEXT: kmovd %edi, %k1 3203; CHECK-NEXT: vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload 3204; CHECK-NEXT: # xmm0 {%k1} {z} = mem[0,1,2,3,7,6,4,4] 3205; CHECK-NEXT: retq 3206 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3207 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4> 3208 %3 = bitcast i8 %mask to <8 x i1> 3209 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer 3210 ret <8 x i16> %4 3211} 3212 3213define <16 x i16> @stack_fold_pshufhw_ymm(<16 x i16> %a0) { 3214; CHECK-LABEL: stack_fold_pshufhw_ymm: 3215; CHECK: # %bb.0: 3216; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3217; CHECK-NEXT: #APP 3218; CHECK-NEXT: nop 3219; CHECK-NEXT: #NO_APP 3220; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 3221; CHECK-NEXT: # ymm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] 3222; CHECK-NEXT: retq 3223 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3224 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12> 3225 ret <16 x i16> %2 3226} 3227 3228define <16 x i16> @stack_fold_pshufhw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) { 3229; CHECK-LABEL: stack_fold_pshufhw_ymm_mask: 3230; CHECK: # %bb.0: 3231; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3232; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3233; CHECK-NEXT: #APP 3234; CHECK-NEXT: nop 3235; CHECK-NEXT: #NO_APP 3236; CHECK-NEXT: kmovd %edi, %k1 3237; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3238; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload 3239; CHECK-NEXT: # ymm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] 3240; CHECK-NEXT: retq 3241 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3242 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12> 3243 %3 = bitcast i16 %mask to <16 x i1> 3244 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %passthru 3245 ret <16 x i16> %4 3246} 3247 3248define <16 x i16> @stack_fold_pshufhw_ymm_maskz(<16 x i16> %a0, i16 %mask) { 3249; CHECK-LABEL: stack_fold_pshufhw_ymm_maskz: 3250; CHECK: # %bb.0: 3251; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3252; CHECK-NEXT: #APP 3253; CHECK-NEXT: nop 3254; CHECK-NEXT: #NO_APP 3255; CHECK-NEXT: kmovd %edi, %k1 3256; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload 3257; CHECK-NEXT: # ymm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] 3258; CHECK-NEXT: retq 3259 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3260 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12> 3261 %3 = bitcast i16 %mask to <16 x i1> 3262 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer 3263 ret <16 x i16> %4 3264} 3265 3266define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) { 3267; CHECK-LABEL: stack_fold_pshuflw: 3268; CHECK: # %bb.0: 3269; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3270; CHECK-NEXT: #APP 3271; CHECK-NEXT: nop 3272; CHECK-NEXT: #NO_APP 3273; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3274; CHECK-NEXT: # xmm0 = mem[3,2,1,0,4,5,6,7] 3275; CHECK-NEXT: retq 3276 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3277 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> 3278 ret <8 x i16> %2 3279} 3280 3281define <8 x i16> @stack_fold_pshuflw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) { 3282; CHECK-LABEL: stack_fold_pshuflw_mask: 3283; CHECK: # %bb.0: 3284; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3285; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3286; CHECK-NEXT: #APP 3287; CHECK-NEXT: nop 3288; CHECK-NEXT: #NO_APP 3289; CHECK-NEXT: kmovd %edi, %k1 3290; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3291; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload 3292; CHECK-NEXT: # xmm0 {%k1} = mem[3,2,1,0,4,5,6,7] 3293; CHECK-NEXT: retq 3294 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3295 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> 3296 %3 = bitcast i8 %mask to <8 x i1> 3297 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru 3298 ret <8 x i16> %4 3299} 3300 3301define <8 x i16> @stack_fold_pshuflw_maskz(<8 x i16> %a0, i8 %mask) { 3302; CHECK-LABEL: stack_fold_pshuflw_maskz: 3303; CHECK: # %bb.0: 3304; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3305; CHECK-NEXT: #APP 3306; CHECK-NEXT: nop 3307; CHECK-NEXT: #NO_APP 3308; CHECK-NEXT: kmovd %edi, %k1 3309; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload 3310; CHECK-NEXT: # xmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7] 3311; CHECK-NEXT: retq 3312 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3313 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> 3314 %3 = bitcast i8 %mask to <8 x i1> 3315 %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer 3316 ret <8 x i16> %4 3317} 3318 3319define <16 x i16> @stack_fold_pshuflw_ymm(<16 x i16> %a0) { 3320; CHECK-LABEL: stack_fold_pshuflw_ymm: 3321; CHECK: # %bb.0: 3322; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3323; CHECK-NEXT: #APP 3324; CHECK-NEXT: nop 3325; CHECK-NEXT: #NO_APP 3326; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 3327; CHECK-NEXT: # ymm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] 3328; CHECK-NEXT: retq 3329 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3330 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15> 3331 ret <16 x i16> %2 3332} 3333 3334define <16 x i16> @stack_fold_pshuflw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) { 3335; CHECK-LABEL: stack_fold_pshuflw_ymm_mask: 3336; CHECK: # %bb.0: 3337; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3338; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3339; CHECK-NEXT: #APP 3340; CHECK-NEXT: nop 3341; CHECK-NEXT: #NO_APP 3342; CHECK-NEXT: kmovd %edi, %k1 3343; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3344; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload 3345; CHECK-NEXT: # ymm0 {%k1} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] 3346; CHECK-NEXT: retq 3347 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3348 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15> 3349 %3 = bitcast i16 %mask to <16 x i1> 3350 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %passthru 3351 ret <16 x i16> %4 3352} 3353 3354define <16 x i16> @stack_fold_pshuflw_ymm_maskz(<16 x i16> %a0, i16 %mask) { 3355; CHECK-LABEL: stack_fold_pshuflw_ymm_maskz: 3356; CHECK: # %bb.0: 3357; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3358; CHECK-NEXT: #APP 3359; CHECK-NEXT: nop 3360; CHECK-NEXT: #NO_APP 3361; CHECK-NEXT: kmovd %edi, %k1 3362; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload 3363; CHECK-NEXT: # ymm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] 3364; CHECK-NEXT: retq 3365 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3366 %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15> 3367 %3 = bitcast i16 %mask to <16 x i1> 3368 %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer 3369 ret <16 x i16> %4 3370} 3371 3372define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) { 3373; CHECK-LABEL: stack_fold_pslld: 3374; CHECK: # %bb.0: 3375; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3376; CHECK-NEXT: #APP 3377; CHECK-NEXT: nop 3378; CHECK-NEXT: #NO_APP 3379; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3380; CHECK-NEXT: retq 3381 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3382 %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) 3383 ret <4 x i32> %2 3384} 3385declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone 3386 3387define <8 x i32> @stack_fold_pslld_ymm(<8 x i32> %a0, <4 x i32> %a1) { 3388; CHECK-LABEL: stack_fold_pslld_ymm: 3389; CHECK: # %bb.0: 3390; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3391; CHECK-NEXT: #APP 3392; CHECK-NEXT: nop 3393; CHECK-NEXT: #NO_APP 3394; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3395; CHECK-NEXT: retq 3396 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3397 %2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1) 3398 ret <8 x i32> %2 3399} 3400declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone 3401 3402define <16 x i8> @stack_fold_pslldq(<16 x i8> %a) { 3403; CHECK-LABEL: stack_fold_pslldq: 3404; CHECK: # %bb.0: 3405; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3406; CHECK-NEXT: #APP 3407; CHECK-NEXT: nop 3408; CHECK-NEXT: #NO_APP 3409; CHECK-NEXT: vpslldq $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3410; CHECK-NEXT: # xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[0,1,2,3] 3411; CHECK-NEXT: retq 3412 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3413 %2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 17, i32 18, i32 19> 3414 ret <16 x i8> %2 3415} 3416 3417define <32 x i8> @stack_fold_pslldq_ymm(<32 x i8> %a) { 3418; CHECK-LABEL: stack_fold_pslldq_ymm: 3419; CHECK: # %bb.0: 3420; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3421; CHECK-NEXT: #APP 3422; CHECK-NEXT: nop 3423; CHECK-NEXT: #NO_APP 3424; CHECK-NEXT: vpslldq $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 3425; CHECK-NEXT: # ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[16] 3426; CHECK-NEXT: retq 3427 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3428 %2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48> 3429 ret <32 x i8> %2 3430} 3431 3432define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) { 3433; CHECK-LABEL: stack_fold_psllq: 3434; CHECK: # %bb.0: 3435; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3436; CHECK-NEXT: #APP 3437; CHECK-NEXT: nop 3438; CHECK-NEXT: #NO_APP 3439; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3440; CHECK-NEXT: retq 3441 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3442 %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) 3443 ret <2 x i64> %2 3444} 3445declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone 3446 3447define <4 x i64> @stack_fold_psllq_ymm(<4 x i64> %a0, <2 x i64> %a1) { 3448; CHECK-LABEL: stack_fold_psllq_ymm: 3449; CHECK: # %bb.0: 3450; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3451; CHECK-NEXT: #APP 3452; CHECK-NEXT: nop 3453; CHECK-NEXT: #NO_APP 3454; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3455; CHECK-NEXT: retq 3456 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3457 %2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) 3458 ret <4 x i64> %2 3459} 3460declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone 3461 3462define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) { 3463; CHECK-LABEL: stack_fold_psllvd: 3464; CHECK: # %bb.0: 3465; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3466; CHECK-NEXT: #APP 3467; CHECK-NEXT: nop 3468; CHECK-NEXT: #NO_APP 3469; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3470; CHECK-NEXT: retq 3471 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3472 %2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1) 3473 ret <4 x i32> %2 3474} 3475declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone 3476 3477define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) { 3478; CHECK-LABEL: stack_fold_psllvd_ymm: 3479; CHECK: # %bb.0: 3480; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3481; CHECK-NEXT: #APP 3482; CHECK-NEXT: nop 3483; CHECK-NEXT: #NO_APP 3484; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3485; CHECK-NEXT: retq 3486 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3487 %2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1) 3488 ret <8 x i32> %2 3489} 3490declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone 3491 3492define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) { 3493; CHECK-LABEL: stack_fold_psllvq: 3494; CHECK: # %bb.0: 3495; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3496; CHECK-NEXT: #APP 3497; CHECK-NEXT: nop 3498; CHECK-NEXT: #NO_APP 3499; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3500; CHECK-NEXT: retq 3501 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3502 %2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) 3503 ret <2 x i64> %2 3504} 3505declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone 3506 3507define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 3508; CHECK-LABEL: stack_fold_psllvq_ymm: 3509; CHECK: # %bb.0: 3510; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3511; CHECK-NEXT: #APP 3512; CHECK-NEXT: nop 3513; CHECK-NEXT: #NO_APP 3514; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3515; CHECK-NEXT: retq 3516 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3517 %2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) 3518 ret <4 x i64> %2 3519} 3520declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone 3521 3522define <8 x i16> @stack_fold_psllvw(<8 x i16> %a0, <8 x i16> %a1) { 3523; CHECK-LABEL: stack_fold_psllvw: 3524; CHECK: # %bb.0: 3525; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3526; CHECK-NEXT: #APP 3527; CHECK-NEXT: nop 3528; CHECK-NEXT: #NO_APP 3529; CHECK-NEXT: vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3530; CHECK-NEXT: retq 3531 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3532 %2 = call <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16> %a0, <8 x i16> %a1) 3533 ret <8 x i16> %2 3534} 3535declare <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16>, <8 x i16>) nounwind readnone 3536 3537define <16 x i16> @stack_fold_psllvw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 3538; CHECK-LABEL: stack_fold_psllvw_ymm: 3539; CHECK: # %bb.0: 3540; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3541; CHECK-NEXT: #APP 3542; CHECK-NEXT: nop 3543; CHECK-NEXT: #NO_APP 3544; CHECK-NEXT: vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3545; CHECK-NEXT: retq 3546 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3547 %2 = call <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16> %a0, <16 x i16> %a1) 3548 ret <16 x i16> %2 3549} 3550declare <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16>, <16 x i16>) nounwind readnone 3551 3552define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) { 3553; CHECK-LABEL: stack_fold_psllw: 3554; CHECK: # %bb.0: 3555; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3556; CHECK-NEXT: #APP 3557; CHECK-NEXT: nop 3558; CHECK-NEXT: #NO_APP 3559; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3560; CHECK-NEXT: retq 3561 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3562 %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) 3563 ret <8 x i16> %2 3564} 3565declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone 3566 3567define <16 x i16> @stack_fold_psllw_ymm(<16 x i16> %a0, <8 x i16> %a1) { 3568; CHECK-LABEL: stack_fold_psllw_ymm: 3569; CHECK: # %bb.0: 3570; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3571; CHECK-NEXT: #APP 3572; CHECK-NEXT: nop 3573; CHECK-NEXT: #NO_APP 3574; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3575; CHECK-NEXT: retq 3576 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3577 %2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1) 3578 ret <16 x i16> %2 3579} 3580declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone 3581 3582define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) { 3583; CHECK-LABEL: stack_fold_psrad: 3584; CHECK: # %bb.0: 3585; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3586; CHECK-NEXT: #APP 3587; CHECK-NEXT: nop 3588; CHECK-NEXT: #NO_APP 3589; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3590; CHECK-NEXT: retq 3591 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3592 %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) 3593 ret <4 x i32> %2 3594} 3595declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone 3596 3597define <8 x i32> @stack_fold_psrad_ymm(<8 x i32> %a0, <4 x i32> %a1) { 3598; CHECK-LABEL: stack_fold_psrad_ymm: 3599; CHECK: # %bb.0: 3600; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3601; CHECK-NEXT: #APP 3602; CHECK-NEXT: nop 3603; CHECK-NEXT: #NO_APP 3604; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3605; CHECK-NEXT: retq 3606 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3607 %2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1) 3608 ret <8 x i32> %2 3609} 3610declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone 3611 3612define <2 x i64> @stack_fold_psraq(<2 x i64> %a0, <2 x i64> %a1) { 3613; CHECK-LABEL: stack_fold_psraq: 3614; CHECK: # %bb.0: 3615; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3616; CHECK-NEXT: #APP 3617; CHECK-NEXT: nop 3618; CHECK-NEXT: #NO_APP 3619; CHECK-NEXT: vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3620; CHECK-NEXT: retq 3621 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3622 %2 = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) 3623 ret <2 x i64> %2 3624} 3625declare <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64>, <2 x i64>) nounwind readnone 3626 3627define <4 x i64> @stack_fold_psraq_ymm(<4 x i64> %a0, <2 x i64> %a1) { 3628; CHECK-LABEL: stack_fold_psraq_ymm: 3629; CHECK: # %bb.0: 3630; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3631; CHECK-NEXT: #APP 3632; CHECK-NEXT: nop 3633; CHECK-NEXT: #NO_APP 3634; CHECK-NEXT: vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3635; CHECK-NEXT: retq 3636 %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3637 %2 = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) 3638 ret <4 x i64> %2 3639} 3640declare <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64>, <2 x i64>) nounwind readnone 3641 3642define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) { 3643; CHECK-LABEL: stack_fold_psravd: 3644; CHECK: # %bb.0: 3645; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3646; CHECK-NEXT: #APP 3647; CHECK-NEXT: nop 3648; CHECK-NEXT: #NO_APP 3649; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3650; CHECK-NEXT: retq 3651 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3652 %2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) 3653 ret <4 x i32> %2 3654} 3655declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone 3656 3657define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) { 3658; CHECK-LABEL: stack_fold_psravd_ymm: 3659; CHECK: # %bb.0: 3660; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3661; CHECK-NEXT: #APP 3662; CHECK-NEXT: nop 3663; CHECK-NEXT: #NO_APP 3664; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3665; CHECK-NEXT: retq 3666 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3667 %2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) 3668 ret <8 x i32> %2 3669} 3670declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone 3671 3672define <2 x i64> @stack_fold_psravq(<2 x i64> %a0, <2 x i64> %a1) { 3673; CHECK-LABEL: stack_fold_psravq: 3674; CHECK: # %bb.0: 3675; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3676; CHECK-NEXT: #APP 3677; CHECK-NEXT: nop 3678; CHECK-NEXT: #NO_APP 3679; CHECK-NEXT: vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3680; CHECK-NEXT: retq 3681 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3682 %2 = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1) 3683 ret <2 x i64> %2 3684} 3685declare <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64>, <2 x i64>) nounwind readnone 3686 3687define <4 x i64> @stack_fold_psravq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 3688; CHECK-LABEL: stack_fold_psravq_ymm: 3689; CHECK: # %bb.0: 3690; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3691; CHECK-NEXT: #APP 3692; CHECK-NEXT: nop 3693; CHECK-NEXT: #NO_APP 3694; CHECK-NEXT: vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3695; CHECK-NEXT: retq 3696 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3697 %2 = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1) 3698 ret <4 x i64> %2 3699} 3700declare <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64>, <4 x i64>) nounwind readnone 3701 3702define <8 x i16> @stack_fold_psravw(<8 x i16> %a0, <8 x i16> %a1) { 3703; CHECK-LABEL: stack_fold_psravw: 3704; CHECK: # %bb.0: 3705; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3706; CHECK-NEXT: #APP 3707; CHECK-NEXT: nop 3708; CHECK-NEXT: #NO_APP 3709; CHECK-NEXT: vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3710; CHECK-NEXT: retq 3711 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3712 %2 = call <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16> %a0, <8 x i16> %a1) 3713 ret <8 x i16> %2 3714} 3715declare <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16>, <8 x i16>) nounwind readnone 3716 3717define <16 x i16> @stack_fold_psravw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 3718; CHECK-LABEL: stack_fold_psravw_ymm: 3719; CHECK: # %bb.0: 3720; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3721; CHECK-NEXT: #APP 3722; CHECK-NEXT: nop 3723; CHECK-NEXT: #NO_APP 3724; CHECK-NEXT: vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3725; CHECK-NEXT: retq 3726 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3727 %2 = call <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16> %a0, <16 x i16> %a1) 3728 ret <16 x i16> %2 3729} 3730declare <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16>, <16 x i16>) nounwind readnone 3731 3732define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) { 3733; CHECK-LABEL: stack_fold_psraw: 3734; CHECK: # %bb.0: 3735; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3736; CHECK-NEXT: #APP 3737; CHECK-NEXT: nop 3738; CHECK-NEXT: #NO_APP 3739; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3740; CHECK-NEXT: retq 3741 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3742 %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) 3743 ret <8 x i16> %2 3744} 3745declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone 3746 3747define <16 x i16> @stack_fold_psraw_ymm(<16 x i16> %a0, <8 x i16> %a1) { 3748; CHECK-LABEL: stack_fold_psraw_ymm: 3749; CHECK: # %bb.0: 3750; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3751; CHECK-NEXT: #APP 3752; CHECK-NEXT: nop 3753; CHECK-NEXT: #NO_APP 3754; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3755; CHECK-NEXT: retq 3756 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3757 %2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1) 3758 ret <16 x i16> %2 3759} 3760declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone 3761 3762define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) { 3763; CHECK-LABEL: stack_fold_psrld: 3764; CHECK: # %bb.0: 3765; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3766; CHECK-NEXT: #APP 3767; CHECK-NEXT: nop 3768; CHECK-NEXT: #NO_APP 3769; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3770; CHECK-NEXT: retq 3771 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3772 %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) 3773 ret <4 x i32> %2 3774} 3775declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone 3776 3777define <8 x i32> @stack_fold_psrld_ymm(<8 x i32> %a0, <4 x i32> %a1) { 3778; CHECK-LABEL: stack_fold_psrld_ymm: 3779; CHECK: # %bb.0: 3780; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3781; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3782; CHECK-NEXT: #APP 3783; CHECK-NEXT: nop 3784; CHECK-NEXT: #NO_APP 3785; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3786; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3787; CHECK-NEXT: retq 3788 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3789 %2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1) 3790 ret <8 x i32> %2 3791} 3792declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone 3793 3794define <16 x i8> @stack_fold_psrldq(<16 x i8> %a) { 3795; CHECK-LABEL: stack_fold_psrldq: 3796; CHECK: # %bb.0: 3797; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3798; CHECK-NEXT: #APP 3799; CHECK-NEXT: nop 3800; CHECK-NEXT: #NO_APP 3801; CHECK-NEXT: vpsrldq $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3802; CHECK-NEXT: # xmm0 = mem[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3803; CHECK-NEXT: retq 3804 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3805 %2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 29, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0> 3806 ret <16 x i8> %2 3807} 3808 3809define <32 x i8> @stack_fold_psrldq_ymm(<32 x i8> %a) { 3810; CHECK-LABEL: stack_fold_psrldq_ymm: 3811; CHECK: # %bb.0: 3812; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3813; CHECK-NEXT: #APP 3814; CHECK-NEXT: nop 3815; CHECK-NEXT: #NO_APP 3816; CHECK-NEXT: vpsrldq $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload 3817; CHECK-NEXT: # ymm0 = mem[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3818; CHECK-NEXT: retq 3819 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3820 %2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 3821 ret <32 x i8> %2 3822} 3823 3824define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) { 3825; CHECK-LABEL: stack_fold_psrlq: 3826; CHECK: # %bb.0: 3827; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3828; CHECK-NEXT: #APP 3829; CHECK-NEXT: nop 3830; CHECK-NEXT: #NO_APP 3831; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3832; CHECK-NEXT: retq 3833 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3834 %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) 3835 ret <2 x i64> %2 3836} 3837declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone 3838 3839define <4 x i64> @stack_fold_psrlq_ymm(<4 x i64> %a0, <2 x i64> %a1) { 3840; CHECK-LABEL: stack_fold_psrlq_ymm: 3841; CHECK: # %bb.0: 3842; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3843; CHECK-NEXT: #APP 3844; CHECK-NEXT: nop 3845; CHECK-NEXT: #NO_APP 3846; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3847; CHECK-NEXT: retq 3848 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3849 %2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) 3850 ret <4 x i64> %2 3851} 3852declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone 3853 3854define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) { 3855; CHECK-LABEL: stack_fold_psrlvd: 3856; CHECK: # %bb.0: 3857; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3858; CHECK-NEXT: #APP 3859; CHECK-NEXT: nop 3860; CHECK-NEXT: #NO_APP 3861; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3862; CHECK-NEXT: retq 3863 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3864 %2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1) 3865 ret <4 x i32> %2 3866} 3867declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone 3868 3869define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) { 3870; CHECK-LABEL: stack_fold_psrlvd_ymm: 3871; CHECK: # %bb.0: 3872; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3873; CHECK-NEXT: #APP 3874; CHECK-NEXT: nop 3875; CHECK-NEXT: #NO_APP 3876; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3877; CHECK-NEXT: retq 3878 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3879 %2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1) 3880 ret <8 x i32> %2 3881} 3882declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone 3883 3884define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) { 3885; CHECK-LABEL: stack_fold_psrlvq: 3886; CHECK: # %bb.0: 3887; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3888; CHECK-NEXT: #APP 3889; CHECK-NEXT: nop 3890; CHECK-NEXT: #NO_APP 3891; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3892; CHECK-NEXT: retq 3893 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3894 %2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) 3895 ret <2 x i64> %2 3896} 3897declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone 3898 3899define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 3900; CHECK-LABEL: stack_fold_psrlvq_ymm: 3901; CHECK: # %bb.0: 3902; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3903; CHECK-NEXT: #APP 3904; CHECK-NEXT: nop 3905; CHECK-NEXT: #NO_APP 3906; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3907; CHECK-NEXT: retq 3908 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3909 %2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) 3910 ret <4 x i64> %2 3911} 3912declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone 3913 3914define <8 x i16> @stack_fold_psrlvw(<8 x i16> %a0, <8 x i16> %a1) { 3915; CHECK-LABEL: stack_fold_psrlvw: 3916; CHECK: # %bb.0: 3917; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3918; CHECK-NEXT: #APP 3919; CHECK-NEXT: nop 3920; CHECK-NEXT: #NO_APP 3921; CHECK-NEXT: vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3922; CHECK-NEXT: retq 3923 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3924 %2 = call <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16> %a0, <8 x i16> %a1) 3925 ret <8 x i16> %2 3926} 3927declare <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16>, <8 x i16>) nounwind readnone 3928 3929define <16 x i16> @stack_fold_psrlvw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 3930; CHECK-LABEL: stack_fold_psrlvw_ymm: 3931; CHECK: # %bb.0: 3932; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3933; CHECK-NEXT: #APP 3934; CHECK-NEXT: nop 3935; CHECK-NEXT: #NO_APP 3936; CHECK-NEXT: vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3937; CHECK-NEXT: retq 3938 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3939 %2 = call <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16> %a0, <16 x i16> %a1) 3940 ret <16 x i16> %2 3941} 3942declare <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16>, <16 x i16>) nounwind readnone 3943 3944define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) { 3945; CHECK-LABEL: stack_fold_psrlw: 3946; CHECK: # %bb.0: 3947; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3948; CHECK-NEXT: #APP 3949; CHECK-NEXT: nop 3950; CHECK-NEXT: #NO_APP 3951; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3952; CHECK-NEXT: retq 3953 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3954 %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) 3955 ret <8 x i16> %2 3956} 3957declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone 3958 3959define <16 x i16> @stack_fold_psrlw_ymm(<16 x i16> %a0, <8 x i16> %a1) { 3960; CHECK-LABEL: stack_fold_psrlw_ymm: 3961; CHECK: # %bb.0: 3962; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3963; CHECK-NEXT: #APP 3964; CHECK-NEXT: nop 3965; CHECK-NEXT: #NO_APP 3966; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload 3967; CHECK-NEXT: retq 3968 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3969 %2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) 3970 ret <16 x i16> %2 3971} 3972declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone 3973 3974define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) { 3975; CHECK-LABEL: stack_fold_psubb: 3976; CHECK: # %bb.0: 3977; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3978; CHECK-NEXT: #APP 3979; CHECK-NEXT: nop 3980; CHECK-NEXT: #NO_APP 3981; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 3982; CHECK-NEXT: retq 3983 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3984 %2 = sub <16 x i8> %a0, %a1 3985 ret <16 x i8> %2 3986} 3987 3988define <32 x i8> @stack_fold_psubb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 3989; CHECK-LABEL: stack_fold_psubb_ymm: 3990; CHECK: # %bb.0: 3991; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3992; CHECK-NEXT: #APP 3993; CHECK-NEXT: nop 3994; CHECK-NEXT: #NO_APP 3995; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3996; CHECK-NEXT: retq 3997 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 3998 %2 = sub <32 x i8> %a0, %a1 3999 ret <32 x i8> %2 4000} 4001 4002define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) { 4003; CHECK-LABEL: stack_fold_psubd: 4004; CHECK: # %bb.0: 4005; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4006; CHECK-NEXT: #APP 4007; CHECK-NEXT: nop 4008; CHECK-NEXT: #NO_APP 4009; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4010; CHECK-NEXT: retq 4011 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4012 %2 = sub <4 x i32> %a0, %a1 4013 ret <4 x i32> %2 4014} 4015 4016define <8 x i32> @stack_fold_psubd_ymm(<8 x i32> %a0, <8 x i32> %a1) { 4017; CHECK-LABEL: stack_fold_psubd_ymm: 4018; CHECK: # %bb.0: 4019; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4020; CHECK-NEXT: #APP 4021; CHECK-NEXT: nop 4022; CHECK-NEXT: #NO_APP 4023; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4024; CHECK-NEXT: retq 4025 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4026 %2 = sub <8 x i32> %a0, %a1 4027 ret <8 x i32> %2 4028} 4029 4030define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) { 4031; CHECK-LABEL: stack_fold_psubq: 4032; CHECK: # %bb.0: 4033; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4034; CHECK-NEXT: #APP 4035; CHECK-NEXT: nop 4036; CHECK-NEXT: #NO_APP 4037; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4038; CHECK-NEXT: retq 4039 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4040 %2 = sub <2 x i64> %a0, %a1 4041 ret <2 x i64> %2 4042} 4043 4044define <4 x i64> @stack_fold_psubq_ymm(<4 x i64> %a0, <4 x i64> %a1) { 4045; CHECK-LABEL: stack_fold_psubq_ymm: 4046; CHECK: # %bb.0: 4047; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4048; CHECK-NEXT: #APP 4049; CHECK-NEXT: nop 4050; CHECK-NEXT: #NO_APP 4051; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4052; CHECK-NEXT: retq 4053 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4054 %2 = sub <4 x i64> %a0, %a1 4055 ret <4 x i64> %2 4056} 4057 4058define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) { 4059; CHECK-LABEL: stack_fold_psubsb: 4060; CHECK: # %bb.0: 4061; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4062; CHECK-NEXT: #APP 4063; CHECK-NEXT: nop 4064; CHECK-NEXT: #NO_APP 4065; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4066; CHECK-NEXT: retq 4067 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4068 %2 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) 4069 ret <16 x i8> %2 4070} 4071declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 4072 4073define <32 x i8> @stack_fold_psubsb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 4074; CHECK-LABEL: stack_fold_psubsb_ymm: 4075; CHECK: # %bb.0: 4076; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4077; CHECK-NEXT: #APP 4078; CHECK-NEXT: nop 4079; CHECK-NEXT: #NO_APP 4080; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4081; CHECK-NEXT: retq 4082 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4083 %2 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) 4084 ret <32 x i8> %2 4085} 4086declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone 4087 4088define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) { 4089; CHECK-LABEL: stack_fold_psubsw: 4090; CHECK: # %bb.0: 4091; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4092; CHECK-NEXT: #APP 4093; CHECK-NEXT: nop 4094; CHECK-NEXT: #NO_APP 4095; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4096; CHECK-NEXT: retq 4097 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4098 %2 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) 4099 ret <8 x i16> %2 4100} 4101declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 4102 4103define <16 x i16> @stack_fold_psubsw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 4104; CHECK-LABEL: stack_fold_psubsw_ymm: 4105; CHECK: # %bb.0: 4106; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4107; CHECK-NEXT: #APP 4108; CHECK-NEXT: nop 4109; CHECK-NEXT: #NO_APP 4110; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4111; CHECK-NEXT: retq 4112 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4113 %2 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) 4114 ret <16 x i16> %2 4115} 4116declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone 4117 4118define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) { 4119; CHECK-LABEL: stack_fold_psubusb: 4120; CHECK: # %bb.0: 4121; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4122; CHECK-NEXT: #APP 4123; CHECK-NEXT: nop 4124; CHECK-NEXT: #NO_APP 4125; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4126; CHECK-NEXT: retq 4127 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4128 %2 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) 4129 ret <16 x i8> %2 4130} 4131declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone 4132 4133define <32 x i8> @stack_fold_psubusb_ymm(<32 x i8> %a0, <32 x i8> %a1) { 4134; CHECK-LABEL: stack_fold_psubusb_ymm: 4135; CHECK: # %bb.0: 4136; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4137; CHECK-NEXT: #APP 4138; CHECK-NEXT: nop 4139; CHECK-NEXT: #NO_APP 4140; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4141; CHECK-NEXT: retq 4142 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4143 %2 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) 4144 ret <32 x i8> %2 4145} 4146declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone 4147 4148define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) { 4149; CHECK-LABEL: stack_fold_psubusw: 4150; CHECK: # %bb.0: 4151; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4152; CHECK-NEXT: #APP 4153; CHECK-NEXT: nop 4154; CHECK-NEXT: #NO_APP 4155; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4156; CHECK-NEXT: retq 4157 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4158 %2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) 4159 ret <8 x i16> %2 4160} 4161declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 4162 4163define <16 x i16> @stack_fold_psubusw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 4164; CHECK-LABEL: stack_fold_psubusw_ymm: 4165; CHECK: # %bb.0: 4166; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4167; CHECK-NEXT: #APP 4168; CHECK-NEXT: nop 4169; CHECK-NEXT: #NO_APP 4170; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4171; CHECK-NEXT: retq 4172 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4173 %2 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) 4174 ret <16 x i16> %2 4175} 4176declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone 4177 4178define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) { 4179; CHECK-LABEL: stack_fold_psubw: 4180; CHECK: # %bb.0: 4181; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4182; CHECK-NEXT: #APP 4183; CHECK-NEXT: nop 4184; CHECK-NEXT: #NO_APP 4185; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4186; CHECK-NEXT: retq 4187 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4188 %2 = sub <8 x i16> %a0, %a1 4189 ret <8 x i16> %2 4190} 4191 4192define <16 x i16> @stack_fold_psubw_ymm(<16 x i16> %a0, <16 x i16> %a1) { 4193; CHECK-LABEL: stack_fold_psubw_ymm: 4194; CHECK: # %bb.0: 4195; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4196; CHECK-NEXT: #APP 4197; CHECK-NEXT: nop 4198; CHECK-NEXT: #NO_APP 4199; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4200; CHECK-NEXT: retq 4201 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4202 %2 = sub <16 x i16> %a0, %a1 4203 ret <16 x i16> %2 4204} 4205 4206define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) { 4207; CHECK-LABEL: stack_fold_punpckhbw: 4208; CHECK: # %bb.0: 4209; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4210; CHECK-NEXT: #APP 4211; CHECK-NEXT: nop 4212; CHECK-NEXT: #NO_APP 4213; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4214; CHECK-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] 4215; CHECK-NEXT: retq 4216 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4217 %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> 4218 ret <16 x i8> %2 4219} 4220 4221define <16 x i8> @stack_fold_punpckhbw_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) { 4222; CHECK-LABEL: stack_fold_punpckhbw_mask: 4223; CHECK: # %bb.0: 4224; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4225; CHECK-NEXT: #APP 4226; CHECK-NEXT: nop 4227; CHECK-NEXT: #NO_APP 4228; CHECK-NEXT: kmovd %esi, %k1 4229; CHECK-NEXT: vmovdqa (%rdi), %xmm2 4230; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload 4231; CHECK-NEXT: # xmm2 {%k1} = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] 4232; CHECK-NEXT: vmovdqa %xmm2, %xmm0 4233; CHECK-NEXT: retq 4234 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4235 %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> 4236 %3 = bitcast i16 %mask to <16 x i1> 4237 ; load needed to keep the operation from being scheduled about the asm block 4238 %4 = load <16 x i8>, <16 x i8>* %passthru 4239 %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4 4240 ret <16 x i8> %5 4241} 4242 4243define <16 x i8> @stack_fold_punpckhbw_maskz(<16 x i8> %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) { 4244; CHECK-LABEL: stack_fold_punpckhbw_maskz: 4245; CHECK: # %bb.0: 4246; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4247; CHECK-NEXT: #APP 4248; CHECK-NEXT: nop 4249; CHECK-NEXT: #NO_APP 4250; CHECK-NEXT: kmovd %edi, %k1 4251; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload 4252; CHECK-NEXT: # xmm0 {%k1} {z} = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] 4253; CHECK-NEXT: retq 4254 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4255 %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> 4256 %3 = bitcast i16 %mask to <16 x i1> 4257 %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer 4258 ret <16 x i8> %4 4259} 4260 4261define <32 x i8> @stack_fold_punpckhbw_ymm(<32 x i8> %a0, <32 x i8> %a1) { 4262; CHECK-LABEL: stack_fold_punpckhbw_ymm: 4263; CHECK: # %bb.0: 4264; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4265; CHECK-NEXT: #APP 4266; CHECK-NEXT: nop 4267; CHECK-NEXT: #NO_APP 4268; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 4269; CHECK-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] 4270; CHECK-NEXT: retq 4271 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4272 %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> 4273 ret <32 x i8> %2 4274} 4275 4276define <32 x i8> @stack_fold_punpckhbw_mask_ymm(<32 x i8>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) { 4277; CHECK-LABEL: stack_fold_punpckhbw_mask_ymm: 4278; CHECK: # %bb.0: 4279; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4280; CHECK-NEXT: #APP 4281; CHECK-NEXT: nop 4282; CHECK-NEXT: #NO_APP 4283; CHECK-NEXT: kmovd %esi, %k1 4284; CHECK-NEXT: vmovdqa (%rdi), %ymm2 4285; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload 4286; CHECK-NEXT: # ymm2 {%k1} = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] 4287; CHECK-NEXT: vmovdqa %ymm2, %ymm0 4288; CHECK-NEXT: retq 4289 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4290 %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> 4291 %3 = bitcast i32 %mask to <32 x i1> 4292 ; load needed to keep the operation from being scheduled about the asm block 4293 %4 = load <32 x i8>, <32 x i8>* %passthru 4294 %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4 4295 ret <32 x i8> %5 4296} 4297 4298define <32 x i8> @stack_fold_punpckhbw_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { 4299; CHECK-LABEL: stack_fold_punpckhbw_maskz_ymm: 4300; CHECK: # %bb.0: 4301; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4302; CHECK-NEXT: #APP 4303; CHECK-NEXT: nop 4304; CHECK-NEXT: #NO_APP 4305; CHECK-NEXT: kmovd %edi, %k1 4306; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 4307; CHECK-NEXT: # ymm0 {%k1} {z} = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] 4308; CHECK-NEXT: retq 4309 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4310 %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> 4311 %3 = bitcast i32 %mask to <32 x i1> 4312 %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer 4313 ret <32 x i8> %4 4314} 4315 4316define <4 x i64> @stack_fold_shufi64x2_maskz(<4 x i64> %a, <4 x i64> %b, i8 %mask) { 4317; CHECK-LABEL: stack_fold_shufi64x2_maskz: 4318; CHECK: # %bb.0: 4319; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4320; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4321; CHECK-NEXT: #APP 4322; CHECK-NEXT: nop 4323; CHECK-NEXT: #NO_APP 4324; CHECK-NEXT: kmovd %edi, %k1 4325; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4326; CHECK-NEXT: vshufi64x2 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 4327; CHECK-NEXT: # ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] 4328; CHECK-NEXT: retq 4329 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4330 %2 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 4331 %3 = bitcast i8 %mask to <8 x i1> 4332 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4333 %5 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer 4334 ret <4 x i64> %5 4335} 4336 4337define <8 x i32> @stack_fold_shufi32x4_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) { 4338; CHECK-LABEL: stack_fold_shufi32x4_maskz: 4339; CHECK: # %bb.0: 4340; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4341; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4342; CHECK-NEXT: #APP 4343; CHECK-NEXT: nop 4344; CHECK-NEXT: #NO_APP 4345; CHECK-NEXT: kmovd %edi, %k1 4346; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 4347; CHECK-NEXT: vshufi32x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload 4348; CHECK-NEXT: # ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] 4349; CHECK-NEXT: retq 4350 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() 4351 %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> 4352 %3 = bitcast i8 %mask to <8 x i1> 4353 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer 4354 ret <8 x i32> %4 4355} 4356 4357declare <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32>) 4358declare <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32>) 4359declare <2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64>) 4360declare <4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64>) 4361declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) 4362declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1) 4363declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) 4364declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) 4365