1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X86 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X64 4 5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c 6 7define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 8; X86-LABEL: test_mm_mask_fmadd_pd: 9; X86: # %bb.0: # %entry 10; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 11; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 12; X86-NEXT: vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1] 13; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) + xmm2 14; X86-NEXT: retl # encoding: [0xc3] 15; 16; X64-LABEL: test_mm_mask_fmadd_pd: 17; X64: # %bb.0: # %entry 18; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 19; X64-NEXT: vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1] 20; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) + xmm2 21; X64-NEXT: retq # encoding: [0xc3] 22entry: 23 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 24 %1 = bitcast i8 %__U to <8 x i1> 25 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 26 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A 27 ret <2 x double> %2 28} 29 30define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 31; X86-LABEL: test_mm_mask_fmsub_pd: 32; X86: # %bb.0: # %entry 33; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 34; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 35; X86-NEXT: vfmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9a,0xc1] 36; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) - xmm2 37; X86-NEXT: retl # encoding: [0xc3] 38; 39; X64-LABEL: test_mm_mask_fmsub_pd: 40; X64: # %bb.0: # %entry 41; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 42; X64-NEXT: vfmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9a,0xc1] 43; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) - xmm2 44; X64-NEXT: retq # encoding: [0xc3] 45entry: 46 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 47 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 48 %1 = bitcast i8 %__U to <8 x i1> 49 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 50 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A 51 ret <2 x double> %2 52} 53 54define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 55; X86-LABEL: test_mm_mask3_fmadd_pd: 56; X86: # %bb.0: # %entry 57; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 58; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 59; X86-NEXT: vfmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd1] 60; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) + xmm2 61; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] 62; X86-NEXT: retl # encoding: [0xc3] 63; 64; X64-LABEL: test_mm_mask3_fmadd_pd: 65; X64: # %bb.0: # %entry 66; X64-NEXT: kmovw %edi, %k1 
# encoding: [0xc5,0xf8,0x92,0xcf] 67; X64-NEXT: vfmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd1] 68; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) + xmm2 69; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] 70; X64-NEXT: retq # encoding: [0xc3] 71entry: 72 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 73 %1 = bitcast i8 %__U to <8 x i1> 74 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 75 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C 76 ret <2 x double> %2 77} 78 79define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 80; X86-LABEL: test_mm_mask3_fnmadd_pd: 81; X86: # %bb.0: # %entry 82; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 83; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 84; X86-NEXT: vfnmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbc,0xd1] 85; X86-NEXT: # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2 86; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] 87; X86-NEXT: retl # encoding: [0xc3] 88; 89; X64-LABEL: test_mm_mask3_fnmadd_pd: 90; X64: # %bb.0: # %entry 91; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 92; X64-NEXT: vfnmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbc,0xd1] 93; X64-NEXT: # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2 94; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] 95; X64-NEXT: retq # encoding: [0xc3] 96entry: 97 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A 98 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9 99 %1 = bitcast i8 %__U to <8 x i1> 100 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 101 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C 102 ret <2 x double> %2 103} 104 105define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 106; X86-LABEL: test_mm_maskz_fmadd_pd: 107; X86: # %bb.0: # %entry 108; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 109; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 110; X86-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa8,0xc2] 111; X86-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2 112; X86-NEXT: retl # encoding: [0xc3] 113; 114; X64-LABEL: test_mm_maskz_fmadd_pd: 115; X64: # %bb.0: # %entry 116; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 117; X64-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa8,0xc2] 118; X64-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2 119; X64-NEXT: retq # encoding: [0xc3] 120entry: 121 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 122 %1 = bitcast i8 %__U to <8 x i1> 123 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 124 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 125 ret <2 x double> %2 126} 127 128define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 129; X86-LABEL: test_mm_maskz_fmsub_pd: 130; X86: # %bb.0: # %entry 131; X86-NEXT: movzbl 
{{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 132; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 133; X86-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xaa,0xc2] 134; X86-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2 135; X86-NEXT: retl # encoding: [0xc3] 136; 137; X64-LABEL: test_mm_maskz_fmsub_pd: 138; X64: # %bb.0: # %entry 139; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 140; X64-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xaa,0xc2] 141; X64-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2 142; X64-NEXT: retq # encoding: [0xc3] 143entry: 144 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 145 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 146 %1 = bitcast i8 %__U to <8 x i1> 147 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 148 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 149 ret <2 x double> %2 150} 151 152define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 153; X86-LABEL: test_mm_maskz_fnmadd_pd: 154; X86: # %bb.0: # %entry 155; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 156; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 157; X86-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xac,0xc2] 158; X86-NEXT: # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2 159; X86-NEXT: retl # encoding: [0xc3] 160; 161; X64-LABEL: test_mm_maskz_fnmadd_pd: 162; X64: # %bb.0: # %entry 163; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 164; X64-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xac,0xc2] 165; X64-NEXT: # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2 166; X64-NEXT: retq # encoding: [0xc3] 167entry: 168 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A 169 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9 170 %1 = bitcast i8 %__U to <8 x i1> 171 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 172 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 173 ret <2 x double> %2 174} 175 176define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 177; X86-LABEL: test_mm_maskz_fnmsub_pd: 178; X86: # %bb.0: # %entry 179; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 180; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 181; X86-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xae,0xc2] 182; X86-NEXT: # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2 183; X86-NEXT: retl # encoding: [0xc3] 184; 185; X64-LABEL: test_mm_maskz_fnmsub_pd: 186; X64: # %bb.0: # %entry 187; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 188; X64-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xae,0xc2] 189; X64-NEXT: # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2 190; X64-NEXT: retq # encoding: [0xc3] 191entry: 192 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A 193 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 194 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x 
double> %sub1.i) #9 195 %1 = bitcast i8 %__U to <8 x i1> 196 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 197 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 198 ret <2 x double> %2 199} 200 201define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 202; X86-LABEL: test_mm256_mask_fmadd_pd: 203; X86: # %bb.0: # %entry 204; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 205; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 206; X86-NEXT: vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1] 207; X86-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) + ymm2 208; X86-NEXT: retl # encoding: [0xc3] 209; 210; X64-LABEL: test_mm256_mask_fmadd_pd: 211; X64: # %bb.0: # %entry 212; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 213; X64-NEXT: vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1] 214; X64-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) + ymm2 215; X64-NEXT: retq # encoding: [0xc3] 216entry: 217 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 218 %1 = bitcast i8 %__U to <8 x i1> 219 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 220 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 221 ret <4 x double> %2 222} 223 224define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 225; X86-LABEL: test_mm256_mask_fmsub_pd: 226; X86: # %bb.0: # %entry 227; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 228; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 229; X86-NEXT: vfmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9a,0xc1] 230; X86-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) - ymm2 231; X86-NEXT: retl # encoding: [0xc3] 232; 233; X64-LABEL: test_mm256_mask_fmsub_pd: 234; X64: # %bb.0: # %entry 235; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 236; X64-NEXT: vfmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9a,0xc1] 237; X64-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) - ymm2 238; X64-NEXT: retq # encoding: [0xc3] 239entry: 240 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 241 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 242 %1 = bitcast i8 %__U to <8 x i1> 243 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 244 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 245 ret <4 x double> %2 246} 247 248define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 249; X86-LABEL: test_mm256_mask3_fmadd_pd: 250; X86: # %bb.0: # %entry 251; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 252; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 253; X86-NEXT: vfmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd1] 254; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) + ymm2 255; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] 256; X86-NEXT: retl # encoding: [0xc3] 257; 258; X64-LABEL: test_mm256_mask3_fmadd_pd: 259; X64: # %bb.0: # %entry 260; X64-NEXT: kmovw %edi, %k1 
# encoding: [0xc5,0xf8,0x92,0xcf] 261; X64-NEXT: vfmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd1] 262; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) + ymm2 263; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] 264; X64-NEXT: retq # encoding: [0xc3] 265entry: 266 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 267 %1 = bitcast i8 %__U to <8 x i1> 268 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 269 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 270 ret <4 x double> %2 271} 272 273define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 274; X86-LABEL: test_mm256_mask3_fnmadd_pd: 275; X86: # %bb.0: # %entry 276; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 277; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 278; X86-NEXT: vfnmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbc,0xd1] 279; X86-NEXT: # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2 280; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] 281; X86-NEXT: retl # encoding: [0xc3] 282; 283; X64-LABEL: test_mm256_mask3_fnmadd_pd: 284; X64: # %bb.0: # %entry 285; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 286; X64-NEXT: vfnmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbc,0xd1] 287; X64-NEXT: # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2 288; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] 289; X64-NEXT: retq # encoding: [0xc3] 290entry: 291 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 292 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9 293 %1 = bitcast i8 %__U to <8 x i1> 294 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 295 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 296 ret <4 x double> %2 297} 298 299define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 300; X86-LABEL: test_mm256_maskz_fmadd_pd: 301; X86: # %bb.0: # %entry 302; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 303; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 304; X86-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa8,0xc2] 305; X86-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2 306; X86-NEXT: retl # encoding: [0xc3] 307; 308; X64-LABEL: test_mm256_maskz_fmadd_pd: 309; X64: # %bb.0: # %entry 310; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 311; X64-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa8,0xc2] 312; X64-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2 313; X64-NEXT: retq # encoding: [0xc3] 314entry: 315 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 316 %1 = bitcast i8 %__U to <8 x i1> 317 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 318 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 319 ret <4 x double> %2 320} 321 322define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x 
double> %__B, <4 x double> %__C) { 323; X86-LABEL: test_mm256_maskz_fmsub_pd: 324; X86: # %bb.0: # %entry 325; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 326; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 327; X86-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xaa,0xc2] 328; X86-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2 329; X86-NEXT: retl # encoding: [0xc3] 330; 331; X64-LABEL: test_mm256_maskz_fmsub_pd: 332; X64: # %bb.0: # %entry 333; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 334; X64-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xaa,0xc2] 335; X64-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2 336; X64-NEXT: retq # encoding: [0xc3] 337entry: 338 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 339 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 340 %1 = bitcast i8 %__U to <8 x i1> 341 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 342 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 343 ret <4 x double> %2 344} 345 346define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 347; X86-LABEL: test_mm256_maskz_fnmadd_pd: 348; X86: # %bb.0: # %entry 349; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 350; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 351; X86-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xac,0xc2] 352; X86-NEXT: # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2 353; X86-NEXT: retl # encoding: [0xc3] 354; 355; X64-LABEL: test_mm256_maskz_fnmadd_pd: 356; X64: # %bb.0: # %entry 357; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 358; X64-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xac,0xc2] 359; X64-NEXT: # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2 360; X64-NEXT: retq # encoding: [0xc3] 361entry: 362 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 363 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9 364 %1 = bitcast i8 %__U to <8 x i1> 365 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 366 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 367 ret <4 x double> %2 368} 369 370define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 371; X86-LABEL: test_mm256_maskz_fnmsub_pd: 372; X86: # %bb.0: # %entry 373; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 374; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 375; X86-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xae,0xc2] 376; X86-NEXT: # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2 377; X86-NEXT: retl # encoding: [0xc3] 378; 379; X64-LABEL: test_mm256_maskz_fnmsub_pd: 380; X64: # %bb.0: # %entry 381; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 382; X64-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xae,0xc2] 383; X64-NEXT: # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2 384; X64-NEXT: retq # encoding: [0xc3] 
385entry: 386 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 387 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 388 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9 389 %1 = bitcast i8 %__U to <8 x i1> 390 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 391 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 392 ret <4 x double> %2 393} 394 395define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 396; X86-LABEL: test_mm_mask_fmadd_ps: 397; X86: # %bb.0: # %entry 398; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 399; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 400; X86-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1] 401; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) + xmm2 402; X86-NEXT: retl # encoding: [0xc3] 403; 404; X64-LABEL: test_mm_mask_fmadd_ps: 405; X64: # %bb.0: # %entry 406; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 407; X64-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1] 408; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) + xmm2 409; X64-NEXT: retq # encoding: [0xc3] 410entry: 411 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 412 %1 = bitcast i8 %__U to <8 x i1> 413 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 414 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A 415 ret <4 x float> %2 416} 417 418define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 419; X86-LABEL: test_mm_mask_fmsub_ps: 420; X86: # %bb.0: # %entry 421; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 422; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 423; X86-NEXT: vfmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9a,0xc1] 424; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) - xmm2 425; X86-NEXT: retl # encoding: [0xc3] 426; 427; X64-LABEL: test_mm_mask_fmsub_ps: 428; X64: # %bb.0: # %entry 429; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 430; X64-NEXT: vfmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9a,0xc1] 431; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) - xmm2 432; X64-NEXT: retq # encoding: [0xc3] 433entry: 434 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 435 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 436 %1 = bitcast i8 %__U to <8 x i1> 437 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 438 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A 439 ret <4 x float> %2 440} 441 442define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 443; X86-LABEL: test_mm_mask3_fmadd_ps: 444; X86: # %bb.0: # %entry 445; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 446; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 447; X86-NEXT: vfmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: 
[0x62,0xf2,0x7d,0x09,0xb8,0xd1] 448; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) + xmm2 449; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] 450; X86-NEXT: retl # encoding: [0xc3] 451; 452; X64-LABEL: test_mm_mask3_fmadd_ps: 453; X64: # %bb.0: # %entry 454; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 455; X64-NEXT: vfmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd1] 456; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) + xmm2 457; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] 458; X64-NEXT: retq # encoding: [0xc3] 459entry: 460 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 461 %1 = bitcast i8 %__U to <8 x i1> 462 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 463 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C 464 ret <4 x float> %2 465} 466 467define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 468; X86-LABEL: test_mm_mask3_fnmadd_ps: 469; X86: # %bb.0: # %entry 470; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 471; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 472; X86-NEXT: vfnmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbc,0xd1] 473; X86-NEXT: # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2 474; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] 475; X86-NEXT: retl # encoding: [0xc3] 476; 477; X64-LABEL: test_mm_mask3_fnmadd_ps: 478; X64: # %bb.0: # %entry 479; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 480; X64-NEXT: vfnmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbc,0xd1] 481; X64-NEXT: # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2 482; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] 483; X64-NEXT: retq # encoding: [0xc3] 484entry: 485 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 486 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9 487 %1 = bitcast i8 %__U to <8 x i1> 488 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 489 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C 490 ret <4 x float> %2 491} 492 493define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 494; X86-LABEL: test_mm_maskz_fmadd_ps: 495; X86: # %bb.0: # %entry 496; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 497; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 498; X86-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa8,0xc2] 499; X86-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2 500; X86-NEXT: retl # encoding: [0xc3] 501; 502; X64-LABEL: test_mm_maskz_fmadd_ps: 503; X64: # %bb.0: # %entry 504; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 505; X64-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa8,0xc2] 506; X64-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2 507; X64-NEXT: retq # encoding: [0xc3] 508entry: 509 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 510 %1 = bitcast i8 %__U to <8 x i1> 511 %extract.i = shufflevector <8 
x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 512 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 513 ret <4 x float> %2 514} 515 516define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 517; X86-LABEL: test_mm_maskz_fmsub_ps: 518; X86: # %bb.0: # %entry 519; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 520; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 521; X86-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xaa,0xc2] 522; X86-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2 523; X86-NEXT: retl # encoding: [0xc3] 524; 525; X64-LABEL: test_mm_maskz_fmsub_ps: 526; X64: # %bb.0: # %entry 527; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 528; X64-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xaa,0xc2] 529; X64-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2 530; X64-NEXT: retq # encoding: [0xc3] 531entry: 532 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 533 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 534 %1 = bitcast i8 %__U to <8 x i1> 535 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 536 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 537 ret <4 x float> %2 538} 539 540define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 541; X86-LABEL: test_mm_maskz_fnmadd_ps: 542; X86: # %bb.0: # %entry 543; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 544; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 545; X86-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xac,0xc2] 546; X86-NEXT: # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2 547; X86-NEXT: retl # encoding: [0xc3] 548; 549; X64-LABEL: test_mm_maskz_fnmadd_ps: 550; X64: # %bb.0: # %entry 551; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 552; X64-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xac,0xc2] 553; X64-NEXT: # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2 554; X64-NEXT: retq # encoding: [0xc3] 555entry: 556 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 557 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9 558 %1 = bitcast i8 %__U to <8 x i1> 559 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 560 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 561 ret <4 x float> %2 562} 563 564define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 565; X86-LABEL: test_mm_maskz_fnmsub_ps: 566; X86: # %bb.0: # %entry 567; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 568; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 569; X86-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xae,0xc2] 570; X86-NEXT: # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2 571; X86-NEXT: retl # encoding: [0xc3] 572; 573; X64-LABEL: test_mm_maskz_fnmsub_ps: 574; X64: # %bb.0: # %entry 575; X64-NEXT: kmovw %edi, %k1 # encoding: 
[0xc5,0xf8,0x92,0xcf] 576; X64-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xae,0xc2] 577; X64-NEXT: # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2 578; X64-NEXT: retq # encoding: [0xc3] 579entry: 580 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 581 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 582 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %sub1.i) #9 583 %1 = bitcast i8 %__U to <8 x i1> 584 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 585 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 586 ret <4 x float> %2 587} 588 589define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 590; X86-LABEL: test_mm256_mask_fmadd_ps: 591; X86: # %bb.0: # %entry 592; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 593; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 594; X86-NEXT: vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1] 595; X86-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) + ymm2 596; X86-NEXT: retl # encoding: [0xc3] 597; 598; X64-LABEL: test_mm256_mask_fmadd_ps: 599; X64: # %bb.0: # %entry 600; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 601; X64-NEXT: vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1] 602; X64-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) + ymm2 603; X64-NEXT: retq # encoding: [0xc3] 604entry: 605 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 606 %1 = bitcast i8 %__U to <8 x i1> 607 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A 608 ret <8 x float> %2 609} 610 611define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 612; X86-LABEL: test_mm256_mask_fmsub_ps: 613; X86: # %bb.0: # %entry 614; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 615; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 616; X86-NEXT: vfmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9a,0xc1] 617; X86-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) - ymm2 618; X86-NEXT: retl # encoding: [0xc3] 619; 620; X64-LABEL: test_mm256_mask_fmsub_ps: 621; X64: # %bb.0: # %entry 622; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 623; X64-NEXT: vfmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9a,0xc1] 624; X64-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) - ymm2 625; X64-NEXT: retq # encoding: [0xc3] 626entry: 627 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 628 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 629 %1 = bitcast i8 %__U to <8 x i1> 630 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A 631 ret <8 x float> %2 632} 633 634define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 635; X86-LABEL: test_mm256_mask3_fmadd_ps: 636; X86: # %bb.0: # %entry 637; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 638; X86-NEXT: kmovw %eax, %k1 # encoding: 
[0xc5,0xf8,0x92,0xc8] 639; X86-NEXT: vfmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd1] 640; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) + ymm2 641; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] 642; X86-NEXT: retl # encoding: [0xc3] 643; 644; X64-LABEL: test_mm256_mask3_fmadd_ps: 645; X64: # %bb.0: # %entry 646; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 647; X64-NEXT: vfmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd1] 648; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) + ymm2 649; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] 650; X64-NEXT: retq # encoding: [0xc3] 651entry: 652 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 653 %1 = bitcast i8 %__U to <8 x i1> 654 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C 655 ret <8 x float> %2 656} 657 658define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 659; X86-LABEL: test_mm256_mask3_fnmadd_ps: 660; X86: # %bb.0: # %entry 661; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 662; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 663; X86-NEXT: vfnmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbc,0xd1] 664; X86-NEXT: # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2 665; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] 666; X86-NEXT: retl # encoding: [0xc3] 667; 668; X64-LABEL: test_mm256_mask3_fnmadd_ps: 669; X64: # %bb.0: # %entry 670; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 671; X64-NEXT: vfnmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbc,0xd1] 672; X64-NEXT: # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2 673; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] 674; X64-NEXT: retq # encoding: [0xc3] 675entry: 676 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 677 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9 678 %1 = bitcast i8 %__U to <8 x i1> 679 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C 680 ret <8 x float> %2 681} 682 683define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 684; X86-LABEL: test_mm256_maskz_fmadd_ps: 685; X86: # %bb.0: # %entry 686; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 687; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 688; X86-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa8,0xc2] 689; X86-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2 690; X86-NEXT: retl # encoding: [0xc3] 691; 692; X64-LABEL: test_mm256_maskz_fmadd_ps: 693; X64: # %bb.0: # %entry 694; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 695; X64-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa8,0xc2] 696; X64-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2 697; X64-NEXT: retq # encoding: [0xc3] 698entry: 699 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 700 %1 = bitcast i8 %__U to <8 x i1> 701 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> 
zeroinitializer 702 ret <8 x float> %2 703} 704 705define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 706; X86-LABEL: test_mm256_maskz_fmsub_ps: 707; X86: # %bb.0: # %entry 708; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 709; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 710; X86-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xaa,0xc2] 711; X86-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2 712; X86-NEXT: retl # encoding: [0xc3] 713; 714; X64-LABEL: test_mm256_maskz_fmsub_ps: 715; X64: # %bb.0: # %entry 716; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 717; X64-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xaa,0xc2] 718; X64-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2 719; X64-NEXT: retq # encoding: [0xc3] 720entry: 721 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 722 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 723 %1 = bitcast i8 %__U to <8 x i1> 724 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 725 ret <8 x float> %2 726} 727 728define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 729; X86-LABEL: test_mm256_maskz_fnmadd_ps: 730; X86: # %bb.0: # %entry 731; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 732; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 733; X86-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xac,0xc2] 734; X86-NEXT: # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2 735; X86-NEXT: retl # encoding: [0xc3] 736; 737; X64-LABEL: test_mm256_maskz_fnmadd_ps: 738; X64: # %bb.0: # %entry 739; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 740; X64-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xac,0xc2] 741; X64-NEXT: # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2 742; X64-NEXT: retq # encoding: [0xc3] 743entry: 744 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 745 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9 746 %1 = bitcast i8 %__U to <8 x i1> 747 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 748 ret <8 x float> %2 749} 750 751define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 752; X86-LABEL: test_mm256_maskz_fnmsub_ps: 753; X86: # %bb.0: # %entry 754; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 755; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 756; X86-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xae,0xc2] 757; X86-NEXT: # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2 758; X86-NEXT: retl # encoding: [0xc3] 759; 760; X64-LABEL: test_mm256_maskz_fnmsub_ps: 761; X64: # %bb.0: # %entry 762; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 763; X64-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xae,0xc2] 764; X64-NEXT: # ymm0 {%k1} {z} = -(ymm1 * 
ymm0) - ymm2 765; X64-NEXT: retq # encoding: [0xc3] 766entry: 767 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 768 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 769 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %sub1.i) #9 770 %1 = bitcast i8 %__U to <8 x i1> 771 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 772 ret <8 x float> %2 773} 774 775define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 776; X86-LABEL: test_mm_mask_fmaddsub_pd: 777; X86: # %bb.0: # %entry 778; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 779; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 780; X86-NEXT: vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1] 781; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2 782; X86-NEXT: retl # encoding: [0xc3] 783; 784; X64-LABEL: test_mm_mask_fmaddsub_pd: 785; X64: # %bb.0: # %entry 786; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 787; X64-NEXT: vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1] 788; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2 789; X64-NEXT: retq # encoding: [0xc3] 790entry: 791 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 792 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 793 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9 794 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3> 795 %4 = bitcast i8 %__U to <8 x i1> 796 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 797 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A 798 ret <2 x double> %5 799} 800 801define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 802; X86-LABEL: test_mm_mask_fmsubadd_pd: 803; X86: # %bb.0: # %entry 804; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 805; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 806; X86-NEXT: vfmsubadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x97,0xc1] 807; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2 808; X86-NEXT: retl # encoding: [0xc3] 809; 810; X64-LABEL: test_mm_mask_fmsubadd_pd: 811; X64: # %bb.0: # %entry 812; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 813; X64-NEXT: vfmsubadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x97,0xc1] 814; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2 815; X64-NEXT: retq # encoding: [0xc3] 816entry: 817 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 818 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 819 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 820 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3> 821 %3 = bitcast i8 %__U to <8 x i1> 822 %extract.i = shufflevector <8 x i1> %3, <8 x i1> 
undef, <2 x i32> <i32 0, i32 1> 823 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A 824 ret <2 x double> %4 825} 826 827define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 828; X86-LABEL: test_mm_mask3_fmaddsub_pd: 829; X86: # %bb.0: # %entry 830; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 831; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 832; X86-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd1] 833; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2 834; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] 835; X86-NEXT: retl # encoding: [0xc3] 836; 837; X64-LABEL: test_mm_mask3_fmaddsub_pd: 838; X64: # %bb.0: # %entry 839; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 840; X64-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd1] 841; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2 842; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] 843; X64-NEXT: retq # encoding: [0xc3] 844entry: 845 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 846 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 847 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9 848 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3> 849 %4 = bitcast i8 %__U to <8 x i1> 850 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 851 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C 852 ret <2 x double> %5 853} 854 855define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 856; X86-LABEL: test_mm_maskz_fmaddsub_pd: 857; X86: # %bb.0: # %entry 858; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 859; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 860; X86-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa6,0xc2] 861; X86-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2 862; X86-NEXT: retl # encoding: [0xc3] 863; 864; X64-LABEL: test_mm_maskz_fmaddsub_pd: 865; X64: # %bb.0: # %entry 866; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 867; X64-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa6,0xc2] 868; X64-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2 869; X64-NEXT: retq # encoding: [0xc3] 870entry: 871 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 872 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 873 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9 874 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3> 875 %4 = bitcast i8 %__U to <8 x i1> 876 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 877 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer 878 ret <2 x double> %5 879} 880 881define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 882; X86-LABEL: test_mm_maskz_fmsubadd_pd: 883; X86: # %bb.0: # %entry 884; X86-NEXT: movzbl {{[0-9]+}}(%esp), 
%eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 885; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 886; X86-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa7,0xc2] 887; X86-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2 888; X86-NEXT: retl # encoding: [0xc3] 889; 890; X64-LABEL: test_mm_maskz_fmsubadd_pd: 891; X64: # %bb.0: # %entry 892; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 893; X64-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa7,0xc2] 894; X64-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2 895; X64-NEXT: retq # encoding: [0xc3] 896entry: 897 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 898 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 899 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 900 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3> 901 %3 = bitcast i8 %__U to <8 x i1> 902 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 903 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> zeroinitializer 904 ret <2 x double> %4 905} 906 907define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 908; X86-LABEL: test_mm256_mask_fmaddsub_pd: 909; X86: # %bb.0: # %entry 910; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 911; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 912; X86-NEXT: vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1] 913; X86-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2 914; X86-NEXT: retl # encoding: [0xc3] 915; 916; X64-LABEL: test_mm256_mask_fmaddsub_pd: 917; X64: # %bb.0: # %entry 918; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 919; X64-NEXT: vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1] 920; X64-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2 921; X64-NEXT: retq # encoding: [0xc3] 922entry: 923 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 924 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 925 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9 926 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 927 %4 = bitcast i8 %__U to <8 x i1> 928 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 929 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A 930 ret <4 x double> %5 931} 932 933define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 934; X86-LABEL: test_mm256_mask_fmsubadd_pd: 935; X86: # %bb.0: # %entry 936; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 937; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 938; X86-NEXT: vfmsubadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x97,0xc1] 939; X86-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2 940; X86-NEXT: retl # encoding: [0xc3] 941; 942; X64-LABEL: test_mm256_mask_fmsubadd_pd: 943; X64: # %bb.0: # %entry 944; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 945; X64-NEXT: vfmsubadd132pd 
%ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x97,0xc1] 946; X64-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2 947; X64-NEXT: retq # encoding: [0xc3] 948entry: 949 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 950 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 951 %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 952 %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 953 %3 = bitcast i8 %__U to <8 x i1> 954 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 955 %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A 956 ret <4 x double> %4 957} 958 959define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 960; X86-LABEL: test_mm256_mask3_fmaddsub_pd: 961; X86: # %bb.0: # %entry 962; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 963; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 964; X86-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd1] 965; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2 966; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] 967; X86-NEXT: retl # encoding: [0xc3] 968; 969; X64-LABEL: test_mm256_mask3_fmaddsub_pd: 970; X64: # %bb.0: # %entry 971; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 972; X64-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd1] 973; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2 974; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] 975; X64-NEXT: retq # encoding: [0xc3] 976entry: 977 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 978 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 979 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9 980 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 981 %4 = bitcast i8 %__U to <8 x i1> 982 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 983 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C 984 ret <4 x double> %5 985} 986 987define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 988; X86-LABEL: test_mm256_maskz_fmaddsub_pd: 989; X86: # %bb.0: # %entry 990; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 991; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 992; X86-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa6,0xc2] 993; X86-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2 994; X86-NEXT: retl # encoding: [0xc3] 995; 996; X64-LABEL: test_mm256_maskz_fmaddsub_pd: 997; X64: # %bb.0: # %entry 998; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 999; X64-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa6,0xc2] 1000; X64-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2 1001; X64-NEXT: retq # encoding: [0xc3] 1002entry: 1003 %0 = tail call <4 x double> 
@llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 1004 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 1005 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9 1006 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1007 %4 = bitcast i8 %__U to <8 x i1> 1008 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1009 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer 1010 ret <4 x double> %5 1011} 1012 1013define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 1014; X86-LABEL: test_mm256_maskz_fmsubadd_pd: 1015; X86: # %bb.0: # %entry 1016; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 1017; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 1018; X86-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa7,0xc2] 1019; X86-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2 1020; X86-NEXT: retl # encoding: [0xc3] 1021; 1022; X64-LABEL: test_mm256_maskz_fmsubadd_pd: 1023; X64: # %bb.0: # %entry 1024; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 1025; X64-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa7,0xc2] 1026; X64-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2 1027; X64-NEXT: retq # encoding: [0xc3] 1028entry: 1029 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 1030 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 1031 %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 1032 %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1033 %3 = bitcast i8 %__U to <8 x i1> 1034 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1035 %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer 1036 ret <4 x double> %4 1037} 1038 1039define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 1040; X86-LABEL: test_mm_mask_fmaddsub_ps: 1041; X86: # %bb.0: # %entry 1042; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] 1043; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] 1044; X86-NEXT: vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1] 1045; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2 1046; X86-NEXT: retl # encoding: [0xc3] 1047; 1048; X64-LABEL: test_mm_mask_fmaddsub_ps: 1049; X64: # %bb.0: # %entry 1050; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] 1051; X64-NEXT: vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1] 1052; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2 1053; X64-NEXT: retq # encoding: [0xc3] 1054entry: 1055 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 1056 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 1057 %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9 1058 %3 = shufflevector <4 x float> %2, <4 x float> 

define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A
  ret <4 x float> %5
}

define <4 x float> @test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x97,0xc1]
; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x97,0xc1]
; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A
  ret <4 x float> %4
}

define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd1]
; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd1]
; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C
  ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa6,0xc2]
; X86-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa6,0xc2]
; X64-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer
  ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa7,0xc2]
; X86-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa7,0xc2]
; X64-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer
  ret <4 x float> %4
}

define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
; X86-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
; X64-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A
  ret <8 x float> %5
}

define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x97,0xc1]
; X86-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x97,0xc1]
; X64-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A
  ret <8 x float> %4
}

define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd1]
; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd1]
; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C
  ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa6,0xc2]
; X86-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa6,0xc2]
; X64-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer
  ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa7,0xc2]
; X86-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa7,0xc2]
; X64-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer
  ret <8 x float> %4
}

define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd1]
; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd1]
; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd1]
; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd1]
; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd1]
; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd1]
; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd1]
; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd1]
; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd1]
; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd1]
; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C
  ret <2 x double> %4
}

define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd1]
; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd1]
; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C
  ret <4 x double> %4
}

define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd1]
; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd1]
; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C
  ret <4 x float> %4
}

define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd1]
; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd1]
; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C
  ret <8 x float> %4
}
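
; Illustrative usage sketch (an assumption for context, not part of the
; autogenerated checks): the mask3 variants above merge into the addend, so
; masked-off lanes keep %__C, and the mask is the last C-level argument
; (hypothetical helper, C):
;
;   #include <immintrin.h>
;   __m128d mask3_fmsubadd(__m128d a, __m128d b, __m128d c, __mmask8 u) {
;     /* lanes with a clear bit in u keep c, the addend operand */
;     return _mm_mask3_fmsubadd_pd(a, b, c, u);
;   }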

define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
; X86-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
; X64-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
; X86-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
; X64-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
; X86-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
; X64-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
; X86-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
; X64-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
; X86-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
; X64-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd1]
; X86-NEXT: # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd1]
; X64-NEXT: # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
; X86-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
; X64-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd1]
; X86-NEXT: # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd1]
; X64-NEXT: # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
; X86-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
; X64-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd1]
; X86-NEXT: # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd1]
; X64-NEXT: # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
; X86-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
; X64-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd1]
; X86-NEXT: # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd1]
; X64-NEXT: # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8
declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8
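
; Illustrative usage sketch (an assumption for context, not part of the
; autogenerated checks): the fnmadd/fnmsub tests negate the multiplier (and,
; for fnmsub, also the addend) by an fsub from -0.0 before calling @llvm.fma,
; which corresponds to C-level code such as this hypothetical helper:
;
;   #include <immintrin.h>
;   __m128d mask_fnmadd(__m128d a, __mmask8 u, __m128d b, __m128d c) {
;     /* lanes with a set bit in u get -(a*b) + c; the rest keep a */
;     return _mm_mask_fnmadd_pd(a, u, b, c);
;   }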