; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512bf16,+avx512vl < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

define <32 x i16> @stack_fold_cvtne2ps2bf16(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1)
  ret <32 x i16> %2
}
declare <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>)

define <32 x i16> @stack_fold_cvtne2ps2bf16_mask(<16 x float> %a0, <16 x float> %a1, <32 x i16>* %passthru, i32 %U) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1)
  %3 = bitcast i32 %U to <32 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <32 x i16>, <32 x i16>* %passthru
  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
  ret <32 x i16> %5
}

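; The _mask and _maskz tests below follow a common shape: the mask is moved
; into %k1 (kmovd from a GPR argument, or loaded from memory in the vdpbf16ps
; maskz tests), and for merge masking the passthru value is reached through a
; pointer so the explicit load keeps the masked operation from being scheduled
; above the asm block (see the comments inside those functions). For the
; merge-masking tests the expected codegen is a single masked, stack-folded
; instruction that overwrites the register holding the loaded passthru; the
; maskz tests expect the {z} zeroing form with no extra load.
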
define <32 x i16> @stack_fold_cvtne2ps2bf16_maskz(<16 x float> %a0, <16 x float> %a1, i32 %U) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1)
  %3 = bitcast i32 %U to <32 x i1>
  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
  ret <32 x i16> %4
}

define <16 x i16> @stack_fold_cvtneps2bf16(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtneps2bf16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>)

define <16 x i16> @stack_fold_cvtneps2bf16_mask(<16 x float> %a0, <16 x i16>* %passthru, i16 %U) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0)
  %3 = bitcast i16 %U to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x i16>, <16 x i16>* %passthru
  %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4
  ret <16 x i16> %5
}

define <16 x i16> @stack_fold_cvtneps2bf16_maskz(<16 x float> %a0, i16 %U) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0)
  %3 = bitcast i16 %U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
  ret <16 x i16> %4
}

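; In the vdpbf16ps tests that follow, it is the second packed-bf16 source
; (%a2) that gets spilled and then folded as the memory operand, while the
; accumulator stays in a register. In the _mask variants the accumulator is
; itself loaded from a pointer (again to keep the op below the asm block), and
; since the select's passthru is that same accumulator value, merge masking on
; the folded vdpbf16ps is enough to implement the select. The _maskz variants
; load the mask from memory; an i16 mask can use kmovw directly, whereas the
; 8-bit masks in the later ymm/xmm tests go through movzbl + kmovd, presumably
; because the byte form kmovb would need AVX512DQ, which this RUN line does not
; enable.
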
define <16 x float> @stack_fold_vdpbf16ps(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2) {
; CHECK-LABEL: stack_fold_vdpbf16ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <16 x i32>, <16 x i32>)

define <16 x float> @stack_fold_vdpbf16ps_mask(<16 x float>* %a0, <16 x i32> %a1, <16 x i32> %a2, <16 x float>* %passthru, i16 %U) {
; CHECK-LABEL: stack_fold_vdpbf16ps_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %edx, %k1
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load needed to keep the operation from being scheduled above the asm block
  %2 = load <16 x float>, <16 x float>* %a0
  %3 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %2, <16 x i32> %a1, <16 x i32> %a2)
  %4 = bitcast i16 %U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %2
  ret <16 x float> %5
}

define <16 x float> @stack_fold_vdpbf16ps_maskz(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %U) {
; CHECK-LABEL: stack_fold_vdpbf16ps_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2)
  %3 = load i16, i16* %U
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %5
}



define <16 x i16> @stack_fold_cvtne2ps2bf16_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>)

define <16 x i16> @stack_fold_cvtne2ps2bf16_mask_ymm(<8 x float> %a0, <8 x float> %a1, <16 x i16>* %passthru, i16 %U) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
  %3 = bitcast i16 %U to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x i16>, <16 x i16>* %passthru
  %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4
  ret <16 x i16> %5
}

define <16 x i16> @stack_fold_cvtne2ps2bf16_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i16 %U) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
  %3 = bitcast i16 %U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
  ret <16 x i16> %4
}

define <8 x i16> @stack_fold_cvtneps2bf16_ymm(<8 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>)

define <8 x i16> @stack_fold_cvtneps2bf16_mask_ymm(<8 x float> %a0, <8 x i16>* %passthru, i8 %U) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %xmm1
; CHECK-NEXT:    vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0)
  %3 = bitcast i8 %U to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x i16>, <8 x i16>* %passthru
  %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4
  ret <8 x i16> %5
}

define <8 x i16> @stack_fold_cvtneps2bf16_maskz_ymm(<8 x float> %a0, i8 %U) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0)
  %3 = bitcast i8 %U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
  ret <8 x i16> %4
}

define <8 x float> @stack_fold_vdpbf16ps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vdpbf16ps_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <8 x i32>, <8 x i32>)

define <8 x float> @stack_fold_vdpbf16ps_mask_ymm(<8 x float>* %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x float>* %passthru, i8 %U) {
; CHECK-LABEL: stack_fold_vdpbf16ps_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %edx, %k1
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load needed to keep the operation from being scheduled above the asm block
  %2 = load <8 x float>, <8 x float>* %a0
  %3 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %2, <8 x i32> %a1, <8 x i32> %a2)
  %4 = bitcast i8 %U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %2
  ret <8 x float> %5
}

define <8 x float> @stack_fold_vdpbf16ps_maskz_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2, i8* %U) {
; CHECK-LABEL: stack_fold_vdpbf16ps_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2)
  %3 = load i8, i8* %U
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %2, <8 x float> zeroinitializer
  ret <8 x float> %5
}



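; 128-bit (xmm) variants. Two details differ from the wider tests above: the
; memory forms of vcvtneps2bf16 carry an explicit size suffix (vcvtneps2bf16x
; here, vcvtneps2bf16y in the 256-bit tests) since both source widths produce
; an xmm result and the bare mnemonic would be ambiguous for a memory operand;
; and the masked cvtneps2bf16 tests call the
; @llvm.x86.avx512bf16.mask.cvtneps2bf16.128 intrinsic, which takes the
; passthru and a <4 x i1> mask directly rather than using bitcast+select.
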
define <8 x i16> @stack_fold_cvtne2ps2bf16_xmm(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>)

define <8 x i16> @stack_fold_cvtne2ps2bf16_mask_xmm(<4 x float> %a0, <4 x float> %a1, <8 x i16>* %passthru, i8 %U) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1)
  %3 = bitcast i8 %U to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x i16>, <8 x i16>* %passthru
  %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4
  ret <8 x i16> %5
}

define <8 x i16> @stack_fold_cvtne2ps2bf16_maskz_xmm(<4 x float> %a0, <4 x float> %a1, i8 %U) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1)
  %3 = bitcast i8 %U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
  ret <8 x i16> %4
}

define <8 x i16> @stack_fold_cvtneps2bf16_xmm(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x i16>, <4 x i1>)

define <8 x i16> @stack_fold_cvtneps2bf16_mask_xmm(<4 x float> %a0, <8 x i16>* %passthru, i8 %U) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm1
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i16>, <8 x i16>* %passthru
  %3 = bitcast i8 %U to <8 x i1>
  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> %2, <4 x i1> %4)
  ret <8 x i16> %5
}

define <8 x i16> @stack_fold_cvtneps2bf16_maskz_xmm(<4 x float> %a0, i8 %U) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast i8 %U to <8 x i1>
  %3 = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> zeroinitializer, <4 x i1> %3)
  ret <8 x i16> %4
}

define <4 x float> @stack_fold_vdpbf16ps_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vdpbf16ps_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <4 x i32>, <4 x i32>)

define <4 x float> @stack_fold_vdpbf16ps_mask_xmm(<4 x float>* %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x float>* %passthru, i8 %U) {
; CHECK-LABEL: stack_fold_vdpbf16ps_mask_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %edx, %k1
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load needed to keep the operation from being scheduled above the asm block
  %2 = load <4 x float>, <4 x float>* %a0
  %3 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %2, <4 x i32> %a1, <4 x i32> %a2)
  %4 = bitcast i8 %U to <8 x i1>
  %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %6 = select <4 x i1> %5, <4 x float> %3, <4 x float> %2
  ret <4 x float> %6
}

define <4 x float> @stack_fold_vdpbf16ps_maskz_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2, i8* %U) {
; CHECK-LABEL: stack_fold_vdpbf16ps_maskz_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2)
  %3 = load i8, i8* %U
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %6 = select <4 x i1> %5, <4 x float> %2, <4 x float> zeroinitializer
  ret <4 x float> %6
}