; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bf16 -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bf16 -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64

declare <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1

define <2 x i64> @test_mm_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm_cvtne2ps2bf16_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7f,0x08,0x72,0xc1]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
  %1 = bitcast <8 x i16> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B, i8 zeroext %U) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtne2ps2bf16_128:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0x89,0x72,0xc1]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_cvtne2ps2bf16_128:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0x89,0x72,0xc1]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
  %1 = bitcast i8 %U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> zeroinitializer
  %3 = bitcast <8 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A, <4 x float> %B) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtne2ps2bf16_128:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vcvtne2ps2bf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x72,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_cvtne2ps2bf16_128:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vcvtne2ps2bf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x72,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2
  %1 = bitcast <2 x i64> %C to <8 x i16>
  %2 = bitcast i8 %U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i16> %0, <8 x i16> %1
  %4 = bitcast <8 x i16> %3 to <2 x i64>
  ret <2 x i64> %4
}

declare <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3

define <4 x i64> @test_mm256_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B) local_unnamed_addr #1 {
; CHECK-LABEL: test_mm256_cvtne2ps2bf16_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7f,0x28,0x72,0xc1]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
entry:
  %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
  %1 = bitcast <16 x i16> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B, i16 zeroext %U) local_unnamed_addr #1 {
; X86-LABEL: test_mm256_maskz_cvtne2ps2bf16_256:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xa9,0x72,0xc1]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_cvtne2ps2bf16_256:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xa9,0x72,0xc1]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
  %1 = bitcast i16 %U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i16> %0, <16 x i16> zeroinitializer
  %3 = bitcast <16 x i16> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(<4 x i64> %C, i16 zeroext %U, <8 x float> %A, <8 x float> %B) local_unnamed_addr #1 {
; X86-LABEL: test_mm256_mask_cvtne2ps2bf16_256:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT:    vcvtne2ps2bf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x72,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_cvtne2ps2bf16_256:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vcvtne2ps2bf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x72,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4
  %1 = bitcast <4 x i64> %C to <16 x i16>
  %2 = bitcast i16 %U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i16> %0, <16 x i16> %1
  %4 = bitcast <16 x i16> %3 to <4 x i64>
  ret <4 x i64> %4
}

declare <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3

define <2 x i64> @test_mm256_cvtneps2bf16_256(<8 x float> %A) local_unnamed_addr #2 {
; CHECK-LABEL: test_mm256_cvtneps2bf16_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtneps2bf16 %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0xc0]
; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
  %1 = bitcast <8 x i16> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(<8 x float> %A, i8 zeroext %U) local_unnamed_addr #2 {
; X86-LABEL: test_mm256_maskz_cvtneps2bf16_256:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vcvtneps2bf16 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x72,0xc0]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_cvtneps2bf16_256:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vcvtneps2bf16 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x72,0xc0]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
  %1 = bitcast i8 %U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> zeroinitializer
  %3 = bitcast <8 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(<2 x i64> %C, i8 zeroext %U, <8 x float> %A) local_unnamed_addr #2 {
; X86-LABEL: test_mm256_mask_cvtneps2bf16_256:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vcvtneps2bf16 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x72,0xc1]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_cvtneps2bf16_256:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vcvtneps2bf16 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x72,0xc1]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4
  %1 = bitcast <2 x i64> %C to <8 x i16>
  %2 = bitcast i8 %U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i16> %0, <8 x i16> %1
  %4 = bitcast <8 x i16> %3 to <2 x i64>
  ret <2 x i64> %4
}

declare <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x i16>, <4 x i1>) #3

define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr #2 {
; CHECK-LABEL: test_mm128_cvtneps2bf16_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x i16> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
  %1 = bitcast <8 x i16> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %U) local_unnamed_addr #2 {
; X86-LABEL: test_mm128_maskz_cvtneps2bf16_128:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x72,0xc0]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm128_maskz_cvtneps2bf16_128:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vcvtneps2bf16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x72,0xc0]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast i8 %U to <8 x i1>
  %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x i16> zeroinitializer, <4 x i1> %1) #4
  %3 = bitcast <8 x i16> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 {
; X86-LABEL: test_mm128_mask_cvtneps2bf16_128:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vcvtneps2bf16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x72,0xc1]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm128_mask_cvtneps2bf16_128:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vcvtneps2bf16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x72,0xc1]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast i8 %U to <8 x i1>
  %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <2 x i64> %C to <8 x i16>
  %3 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x i16> %2, <4 x i1> %1) #4
  %4 = bitcast <8 x i16> %3 to <2 x i64>
  ret <2 x i64> %4
}

; Make sure we don't fold a select into the 128 bit form of cvtneps2bf16. It
; always writes zeros to bits 127:64 regardless of mask.
define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 {
; X86-LABEL: test_mm128_cvtneps2bf16_128_select:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vcvtneps2bf16 %xmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc9]
; X86-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm128_cvtneps2bf16_128_select:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vcvtneps2bf16 %xmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc9]
; X64-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast i8 %U to <8 x i1>
  %1 = bitcast <2 x i64> %C to <8 x i16>
  %2 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x i16> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4
  %3 = select <8 x i1> %0, <8 x i16> %2, <8 x i16> %1
  %4 = bitcast <8 x i16> %3 to <2 x i64>
  ret <2 x i64> %4
}

declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <8 x i32>, <8 x i32>) #3

define <8 x float> @test_mm256_dpbf16ps_256(<8 x float> %E, <8 x i32> %A, <8 x i32> %B) local_unnamed_addr #2 {
; CHECK-LABEL: test_mm256_dpbf16ps_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x52,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <8 x i32> %A, <8 x i32> %B) #4
  ret <8 x float> %0
}

define <8 x float> @test_mm256_maskz_dpbf16ps_256(<8 x float> %E, <8 x i32> %A, <8 x i32> %B, i8 zeroext %U) local_unnamed_addr #2 {
; X86-LABEL: test_mm256_maskz_dpbf16ps_256:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x52,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_dpbf16ps_256:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x52,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <8 x i32> %A, <8 x i32> %B) #4
  %1 = bitcast i8 %U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}
define <8 x float> @test_mm256_mask_dpbf16ps_256(i8 zeroext %U, <8 x float> %E, <8 x i32> %A, <8 x i32> %B) local_unnamed_addr #2 {
; X86-LABEL: test_mm256_mask_dpbf16ps_256:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x52,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_dpbf16ps_256:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vdpbf16ps %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x52,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <8 x i32> %A, <8 x i32> %B) #4
  %1 = bitcast i8 %U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %E
  ret <8 x float> %2
}

declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <4 x i32>, <4 x i32>) #3

define <4 x float> @test_mm128_dpbf16ps_128(<4 x float> %E, <4 x i32> %A, <4 x i32> %B) local_unnamed_addr #2 {
; CHECK-LABEL: test_mm128_dpbf16ps_128:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x52,0xc2]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <4 x i32> %A, <4 x i32> %B) #4
  ret <4 x float> %0
}

define <4 x float> @test_mm128_maskz_dpbf16ps_128(<4 x float> %E, <4 x i32> %A, <4 x i32> %B, i4 zeroext %U) local_unnamed_addr #2 {
; X86-LABEL: test_mm128_maskz_dpbf16ps_128:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x52,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm128_maskz_dpbf16ps_128:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x52,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <4 x i32> %A, <4 x i32> %B) #4
  %1 = bitcast i4 %U to <4 x i1>
  %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}
define <4 x float> @test_mm128_mask_dpbf16ps_128(i4 zeroext %U, <4 x float> %E, <4 x i32> %A, <4 x i32> %B) local_unnamed_addr #2 {
; X86-LABEL: test_mm128_mask_dpbf16ps_128:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04]
; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
; X86-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x52,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm128_mask_dpbf16ps_128:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT:    vdpbf16ps %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x52,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <4 x i32> %A, <4 x i32> %B) #4
  %1 = bitcast i4 %U to <4 x i1>
  %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> %E
  ret <4 x float> %2
}